EuroEval 15.15.0__py3-none-any.whl → 16.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: the registry has flagged this version of EuroEval as possibly problematic.
- euroeval/__init__.py +3 -7
- euroeval/benchmark_config_factory.py +3 -7
- euroeval/benchmark_modules/base.py +35 -19
- euroeval/benchmark_modules/fresh.py +24 -19
- euroeval/benchmark_modules/hf.py +136 -154
- euroeval/benchmark_modules/litellm.py +323 -193
- euroeval/benchmark_modules/vllm.py +166 -112
- euroeval/benchmarker.py +59 -33
- euroeval/cli.py +3 -3
- euroeval/constants.py +13 -15
- euroeval/data_loading.py +33 -28
- euroeval/data_models.py +53 -7
- euroeval/dataset_configs/__init__.py +2 -0
- euroeval/dataset_configs/danish.py +38 -1
- euroeval/dataset_configs/dutch.py +38 -1
- euroeval/dataset_configs/english.py +38 -1
- euroeval/dataset_configs/estonian.py +95 -0
- euroeval/dataset_configs/faroese.py +38 -0
- euroeval/dataset_configs/finnish.py +39 -1
- euroeval/dataset_configs/french.py +38 -1
- euroeval/dataset_configs/german.py +38 -1
- euroeval/dataset_configs/icelandic.py +39 -1
- euroeval/dataset_configs/italian.py +38 -1
- euroeval/dataset_configs/latvian.py +81 -0
- euroeval/dataset_configs/norwegian.py +38 -1
- euroeval/dataset_configs/portuguese.py +38 -1
- euroeval/dataset_configs/spanish.py +38 -1
- euroeval/dataset_configs/swedish.py +38 -1
- euroeval/enums.py +0 -6
- euroeval/finetuning.py +8 -7
- euroeval/generation.py +25 -14
- euroeval/generation_utils.py +46 -14
- euroeval/languages.py +947 -187
- euroeval/metrics/__init__.py +6 -0
- euroeval/metrics/base.py +76 -0
- euroeval/metrics/huggingface.py +192 -0
- euroeval/metrics/llm_as_a_judge.py +257 -0
- euroeval/metrics/pipeline.py +234 -0
- euroeval/metrics/speed.py +51 -0
- euroeval/prompt_templates/linguistic_acceptability.py +40 -2
- euroeval/prompt_templates/multiple_choice.py +23 -2
- euroeval/prompt_templates/named_entity_recognition.py +65 -2
- euroeval/prompt_templates/reading_comprehension.py +42 -2
- euroeval/prompt_templates/sentiment_classification.py +46 -2
- euroeval/prompt_templates/summarization.py +24 -4
- euroeval/scores.py +7 -2
- euroeval/speed_benchmark.py +6 -6
- euroeval/task_group_utils/multiple_choice_classification.py +17 -6
- euroeval/task_group_utils/question_answering.py +35 -28
- euroeval/task_group_utils/sequence_classification.py +96 -23
- euroeval/task_group_utils/text_to_text.py +7 -3
- euroeval/task_group_utils/token_classification.py +47 -75
- euroeval/tasks.py +31 -6
- euroeval/tokenization_utils.py +295 -207
- euroeval/utils.py +118 -34
- {euroeval-15.15.0.dist-info → euroeval-16.0.0.dist-info}/METADATA +12 -14
- euroeval-16.0.0.dist-info/RECORD +69 -0
- {euroeval-15.15.0.dist-info → euroeval-16.0.0.dist-info}/entry_points.txt +0 -1
- euroeval/human_evaluation.py +0 -738
- euroeval/metrics.py +0 -468
- euroeval-15.15.0.dist-info/RECORD +0 -63
- {euroeval-15.15.0.dist-info → euroeval-16.0.0.dist-info}/WHEEL +0 -0
- {euroeval-15.15.0.dist-info → euroeval-16.0.0.dist-info}/licenses/LICENSE +0 -0
euroeval/benchmark_modules/litellm.py

@@ -2,11 +2,11 @@
 
 import asyncio
 import collections.abc as c
+import json
 import logging
-import os
 import re
 import typing as t
-from functools import cached_property, partial
+from functools import cache, cached_property, partial
 from time import sleep
 
 import litellm
@@ -27,6 +27,7 @@ from litellm.exceptions import (
     RateLimitError,
     ServiceUnavailableError,
     Timeout,
+    UnsupportedParamsError,
 )
 from litellm.llms.vertex_ai.common_utils import VertexAIError
 from litellm.router import Router
@@ -37,7 +38,12 @@ from requests.exceptions import RequestException
 from tqdm.asyncio import tqdm as tqdm_async
 from tqdm.auto import tqdm
 
-from ..constants import
+from ..constants import (
+    JSON_STRIP_CHARACTERS,
+    LITELLM_CLASSIFICATION_OUTPUT_KEY,
+    MAX_LITELLM_LOGPROBS,
+    REASONING_MAX_TOKENS,
+)
 from ..data_models import (
     BenchmarkConfig,
     DatasetConfig,
@@ -66,16 +72,18 @@ from ..task_group_utils import (
     text_to_text,
     token_classification,
 )
+from ..tasks import NER
 from ..tokenization_utils import get_first_label_token_mapping
 from ..types import ExtractLabelsFunction
 from ..utils import (
     add_semaphore_and_catch_exception,
     create_model_cache_dir,
+    get_hf_token,
     log_once,
     safe_run,
 )
 from .base import BenchmarkModule
-from .hf import HuggingFaceEncoderModel, load_hf_model_config,
+from .hf import HuggingFaceEncoderModel, load_hf_model_config, load_tokeniser
 
 if t.TYPE_CHECKING:
     from datasets import DatasetDict
@@ -87,6 +95,7 @@ logger = logging.getLogger("euroeval")
 
 VOCAB_SIZE_MAPPING = {
     # OpenAI models
+    r"gpt-5-.*": 100_256,
     r"gpt-4-(32k)?(-[0-9]{4})?": 100_256,
     r"gpt-4-[0-9]{4}-preview": 100_256,
     r"gpt-4-turbo(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": 100_256,
@@ -105,6 +114,7 @@ VOCAB_SIZE_MAPPING = {
 
 MODEL_MAX_LENGTH_MAPPING = {
     # OpenAI models
+    r"gpt-5-.*": 272_000,
     r"gpt-4(-[0-9]{4})?": 8_191,
     r"gpt-4-32k(-[0-9]{4})?": 32_767,
     r"gpt-4-[0-9]{4}-preview": 128_000,
@@ -129,6 +139,7 @@ MODEL_MAX_LENGTH_MAPPING = {
 
 NUM_PARAMS_MAPPING = {
     # OpenAI models
+    r"gpt-5-.*": -1,
     r"gpt-4.*": -1,
     r"o[1-9](-mini|-preview)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": -1,
     # Anthropic models
@@ -144,13 +155,14 @@ NUM_PARAMS_MAPPING = {
 
 ALLOWED_PARAMS = {
     # OpenAI models
+    r"gpt-5-.*": ["minimal", "low", "medium", "high"],
     r"o[1-9](-mini|-preview)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": ["low", "medium", "high"],
     # Anthropic models
     r"(anthropic/)?claude-3-7-sonnet.*": ["no-thinking", "thinking"],
     r"(anthropic/)?claude-(sonnet|opus)-4.*": ["no-thinking", "thinking"],
     # Gemini models
     r"(gemini/)?gemini-2.5-flash-lite.*": ["no-thinking", "thinking"],
-    r"(gemini/)?gemini-2.5-flash
+    r"(gemini/)?gemini-2.5-flash.*": ["no-thinking", "thinking"],
     # xAI models
     r"(xai/)?grok-3-mini(-fast)?(-beta)?": ["low", "medium", "high"],
 }
@@ -176,6 +188,8 @@ class LiteLLMModel(BenchmarkModule):
         model_config: ModelConfig,
         dataset_config: DatasetConfig,
         benchmark_config: BenchmarkConfig,
+        log_metadata: bool = True,
+        **generation_kwargs: dict[str, t.Any],
     ) -> None:
         """Initialise the model.
 
@@ -186,6 +200,11 @@ class LiteLLMModel(BenchmarkModule):
                 The dataset configuration.
             benchmark_config:
                 The benchmark configuration.
+            log_metadata:
+                Whether to log the model metadata.
+            generation_kwargs:
+                The generation kwargs to pass to the model. If None, default values will
+                be used.
         """
         # Detect whether the model is an Ollama model, as we need to extract metadata
         # differently for these models
@@ -204,13 +223,16 @@ class LiteLLMModel(BenchmarkModule):
             model_config=model_config,
             dataset_config=dataset_config,
             benchmark_config=benchmark_config,
+            log_metadata=log_metadata,
         )
 
+        self.generation_kwargs = generation_kwargs
         self.buffer["first_label_token_mapping"] = get_first_label_token_mapping(
             dataset_config=self.dataset_config,
             model_config=self.model_config,
-
+            tokeniser=None,
             generative_type=self.generative_type,
+            log_metadata=self.log_metadata,
         )
 
     @property
@@ -240,11 +262,12 @@ class LiteLLMModel(BenchmarkModule):
         else:
             type_ = GenerativeType.INSTRUCTION_TUNED
 
-
-
-
-
-
+        if self.log_metadata:
+            log_once(
+                f"Detected generative type {type_.name!r} for model "
+                f"{self.model_config.model_id!r}",
+                level=logging.DEBUG,
+            )
         return type_
 
     def generate(self, inputs: dict) -> GenerativeModelOutput:
@@ -265,132 +288,11 @@ class LiteLLMModel(BenchmarkModule):
         self.buffer["first_label_token_mapping"] = get_first_label_token_mapping(
             dataset_config=self.dataset_config,
             model_config=self.model_config,
-
+            tokeniser=None,
             generative_type=self.generative_type,
+            log_metadata=self.log_metadata,
         )
 
-        # Set the core generation arguments
-        generation_kwargs: dict[str, t.Any] = dict(
-            model=self.model_config.model_id,
-            max_completion_tokens=(
-                REASONING_MAX_TOKENS
-                if self.generative_type == GenerativeType.REASONING
-                else self.dataset_config.max_generated_tokens
-            ),
-            stop=[],
-            temperature=0.0,
-            seed=4242,
-            api_key=self.benchmark_config.api_key,
-            api_base=self.benchmark_config.api_base,
-            api_version=self.benchmark_config.api_version,
-            max_retries=3,
-        )
-
-        # Set up the `response_format` generation argument if we are dealing with a task
-        # using structured generation
-        if self.dataset_config.task in TASKS_USING_JSON:
-            # Sanity check that "JSON" is included in the prompt, as some models require
-            # this
-            for conversation in conversations:
-                if not conversation:
-                    raise InvalidBenchmark(
-                        "Encountered an empty conversation in 'messages'."
-                    )
-                last_message = conversation[-1]
-                assert isinstance(last_message, dict), (
-                    f"Expected dict message, got {type(last_message)}"
-                )
-                assert "content" in last_message, (
-                    "Expected 'content' key in the last message of the conversation."
-                )
-                assert isinstance(last_message["content"], str), (
-                    "Expected 'content' to be a string."
-                )
-                assert "json" in last_message["content"].lower(), (
-                    "Prompt must contain 'json' for JSON tasks."
-                )
-
-            if self.generative_type == GenerativeType.REASONING:
-                log_once(
-                    f"The model {self.model_config.model_id!r} is a reasoning model "
-                    "and thus does not support structured generation, so we do not "
-                    "enable it.",
-                    level=logging.DEBUG,
-                )
-            elif supports_response_schema(model=self.model_config.model_id):
-                ner_tag_names = list(self.dataset_config.prompt_label_mapping.values())
-                keys_and_their_types: dict[str, t.Any] = {
-                    tag_name: (conlist(str, max_length=5), ...)
-                    for tag_name in ner_tag_names
-                }
-                pydantic_class = create_model("AnswerFormat", **keys_and_their_types)
-                generation_kwargs["response_format"] = pydantic_class
-                log_once(
-                    "Enabling structured generation for model "
-                    f"{self.model_config.model_id!r} with the JSON schema "
-                    f"{pydantic_class.model_json_schema()}",
-                    level=logging.DEBUG,
-                )
-            else:
-                generation_kwargs["response_format"] = dict(type="json_object")
-                log_once(
-                    "Enabling structured JSON generation for model "
-                    f"{self.model_config.model_id!r} with no custom JSON schema, as "
-                    "the model does not support schemas.",
-                    level=logging.DEBUG,
-                )
-
-        # If the model is an Ollama reasoning model, we ensure that thinking is enabled
-        if self.is_ollama and self.generative_type == GenerativeType.REASONING:
-            generation_kwargs["think"] = True
-            log_once(
-                "Enabling thinking mode for Ollama model "
-                f"{self.model_config.model_id!r}",
-                level=logging.DEBUG,
-            )
-
-        # Handle manually set parameters
-        if self.buffer["first_label_token_mapping"]:
-            generation_kwargs["logprobs"] = True
-            generation_kwargs["top_logprobs"] = MAX_LOGPROBS
-        if self.model_config.revision == "thinking":
-            generation_kwargs["thinking"] = dict(
-                type="enabled", budget_tokens=REASONING_MAX_TOKENS - 1
-            )
-            log_once(
-                f"Enabling thinking mode for model {self.model_config.model_id!r}",
-                level=logging.DEBUG,
-            )
-        elif self.model_config.revision == "no-thinking":
-            generation_kwargs["thinking"] = dict(budget_tokens=0)
-            log_once(
-                f"Disabling thinking mode for model {self.model_config.model_id!r}",
-                level=logging.DEBUG,
-            )
-        elif self.model_config.revision in {"low", "medium", "high"}:
-            generation_kwargs["reasoning_effort"] = self.model_config.revision
-            log_once(
-                f"Enabling reasoning effort {self.model_config.revision!r} for model "
-                f"{self.model_config.model_id!r}",
-                level=logging.DEBUG,
-            )
-
-        # Drop generation kwargs that are not supported by the model
-        litellm.drop_params = True
-
-        # First attempt is a test run with a single conversation to handle errors
-        # quickly
-        test_conversation = conversations[0]
-        _, failures = safe_run(
-            self._generate_async(
-                model_id=self.model_config.model_id,
-                conversations=[test_conversation],
-                **generation_kwargs,
-            )
-        )
-        for _, error in failures:
-            self._handle_exception(error=error, generation_kwargs=generation_kwargs)
-
         all_responses: dict[int, "ModelResponse"] = {}
         conversations_to_run: list[tuple[int, list[litellm.AllMessageValues]]] = list(
             enumerate(conversations)
@@ -399,6 +301,10 @@ class LiteLLMModel(BenchmarkModule):
             if not conversations_to_run:
                 break
 
+            generation_kwargs = self.generation_kwargs or self.get_generation_kwargs(
+                dataset_config=self.dataset_config
+            )
+
             batch_indices, batch_conversations = zip(*conversations_to_run)
             successes, failures = safe_run(
                 self._generate_async(
@@ -431,7 +337,9 @@ class LiteLLMModel(BenchmarkModule):
             # Attempt to handle the exceptions, to improve the chance of getting
             # successful generations next time around
             for _, error in failures:
-                self._handle_exception(
+                generation_kwargs = self._handle_exception(
+                    error=error, **generation_kwargs
+                )
 
             # Sleep for a second to avoid pinging the API server too quickly
             sleep(1)
@@ -454,9 +362,7 @@ class LiteLLMModel(BenchmarkModule):
 
         return model_output
 
-    def _handle_exception(
-        self, error: Exception, generation_kwargs: dict[str, t.Any]
-    ) -> None:
+    def _handle_exception(self, error: Exception, **generation_kwargs) -> dict:
         """Handle an exception from the model.
 
         Args:
@@ -464,6 +370,9 @@ class LiteLLMModel(BenchmarkModule):
                 The exception to handle.
             generation_kwargs:
                 The generation kwargs to pass to the model.
+
+        Returns:
+            The updated generation kwargs to pass to the model.
         """
         error_msg = str(error).lower()
         model_id = self.model_config.model_id
@@ -476,6 +385,9 @@ class LiteLLMModel(BenchmarkModule):
             "logprobs is not supported",
            "logprobs is not enabled",
         ]
+        logprobs_pattern = re.compile(
+            r"does not support parameters: \[.*'top_logprobs'.*\]"
+        )
         temperature_messages = [
             "'temperature' is not supported with this model.",
             "temperature is not supported with this model",
@@ -484,6 +396,7 @@ class LiteLLMModel(BenchmarkModule):
             "`temperature` may only be set to 1",
             "'temperature' does not support 0.0 with this model. Only the default "
             "(1) value is supported",
+            "Only temperature=1 is supported",
         ]
         max_items_messages = ["'maxItems' is not permitted."]
         no_json_schema_messages = ["Property keys should match pattern"]
@@ -492,6 +405,7 @@ class LiteLLMModel(BenchmarkModule):
             r"[0-9]+ and ([0-9]+)\."
         )
         requires_thinking_disabled_messages = ["thinking.type: Field required"]
+        seed_pattern = re.compile(r"does not support parameters: \[.*'seed'.*\]")
 
         if any(msg.lower() in error_msg for msg in stop_messages):
             log_once(
@@ -500,9 +414,10 @@ class LiteLLMModel(BenchmarkModule):
                 level=logging.DEBUG,
             )
             generation_kwargs["stop"] = None
-            return
+            return generation_kwargs
         elif (
             any(msg.lower() in error_msg for msg in logprobs_messages)
+            or logprobs_pattern.search(string=error_msg)
             # Special case for Vertex AI models, since they have strict rate
             # limits on using logprobs. They also have a cap of 5 logprobs, but
            # we ignore this since the rate limiting makes it unusable anyway.
@@ -514,7 +429,7 @@ class LiteLLMModel(BenchmarkModule):
             )
             generation_kwargs.pop("logprobs", None)
             generation_kwargs.pop("top_logprobs", None)
-            return
+            return generation_kwargs
         elif any(msg.lower() in error_msg for msg in temperature_messages):
             log_once(
                 f"The model {model_id!r} does not support "
@@ -522,7 +437,7 @@ class LiteLLMModel(BenchmarkModule):
                 level=logging.DEBUG,
             )
             generation_kwargs.pop("temperature", None)
-            return
+            return generation_kwargs
         elif any(msg.lower() in error_msg for msg in temperature_must_be_one_messages):
             log_once(
                 f"The model {model_id!r} requires "
@@ -530,8 +445,11 @@ class LiteLLMModel(BenchmarkModule):
                 level=logging.DEBUG,
             )
             generation_kwargs["temperature"] = 1.0
-            return
-        elif
+            return generation_kwargs
+        elif (
+            any(msg.lower() in error_msg for msg in max_items_messages)
+            and self.dataset_config.task == NER
+        ):
             log_once(
                 f"The model {model_id!r} does not support "
                 "maxItems in the JSON schema, so disabling it.",
@@ -543,7 +461,7 @@ class LiteLLMModel(BenchmarkModule):
             }
             pydantic_class = create_model("AnswerFormat", **keys_and_their_types)
             generation_kwargs["response_format"] = pydantic_class
-            return
+            return generation_kwargs
         elif any(msg.lower() in error_msg for msg in no_json_schema_messages):
             log_once(
                 f"The model {self.model_config.model_id!r} does not support "
@@ -551,7 +469,7 @@ class LiteLLMModel(BenchmarkModule):
                 level=logging.DEBUG,
             )
             generation_kwargs["response_format"] = dict(type="json_object")
-            return
+            return generation_kwargs
         elif thinking_match := thinking_budget_pattern.search(string=error_msg):
             thinking_budget = int(thinking_match.group(1))
             if thinking_budget >= REASONING_MAX_TOKENS:
@@ -560,7 +478,7 @@ class LiteLLMModel(BenchmarkModule):
                     f"{thinking_budget:,} tokens, which is within the limit of "
                     f"{REASONING_MAX_TOKENS:,} tokens. This should not happen. The "
                     f"error message was: {error_msg}."
-                )
+                ) from error
             log_once(
                 f"The model {model_id!r} can at most use {thinking_budget:,} tokens "
                 "for reasoning, which is less than the default of "
@@ -571,7 +489,7 @@ class LiteLLMModel(BenchmarkModule):
             generation_kwargs["thinking"] = dict(
                 type="enabled", budget_tokens=thinking_budget - 1
             )
-            return
+            return generation_kwargs
         elif (
             any(msg.lower() in error_msg for msg in requires_thinking_disabled_messages)
             and self.generative_type != GenerativeType.REASONING
@@ -583,45 +501,73 @@ class LiteLLMModel(BenchmarkModule):
                 level=logging.DEBUG,
             )
             generation_kwargs["thinking"] = dict(type="disabled")
-            return
+            return generation_kwargs
+        elif re.search(pattern=seed_pattern, string=error_msg):
+            log_once(
+                f"The model {model_id!r} does not support the `seed` parameter, so "
+                "disabling it.",
+                level=logging.DEBUG,
+            )
+            generation_kwargs.pop("seed", None)
+            return generation_kwargs
+        # If there are too many I/O connections, we increase the number of allowed file
+        # descriptors
+        elif "too many open files" in error_msg:
+            raise InvalidBenchmark(
+                "There are too many file descriptors running. See the current "
+                "value by running `ulimit -n`. Try increasing it by running "
+                "`ulimit -n <new-value>` and try again."
+            ) from error
        elif isinstance(
             error, (Timeout, ServiceUnavailableError, InternalServerError, SystemError)
         ):
             logger.debug(
                 f"Service temporarily unavailable. The error message was: {error}. "
-
+                "Retrying in 10 seconds..."
             )
-            sleep(
-            return
+            sleep(10)
+            return generation_kwargs
+        elif isinstance(error, UnsupportedParamsError):
+            unsupported_param_match = re.search(
+                pattern=r"(?<=does not support parameters\: \[')([^ ']+)(?='\])",
+                string=error.message,
+            )
+            if unsupported_param_match is None:
+                raise InvalidModel(error.message) from error
+            else:
+                unsupported_param = unsupported_param_match.group(0)
+                raise InvalidModel(
+                    f"The model {model_id!r} does not support the parameter "
+                    f"{unsupported_param!r}. Try again without this parameter. "
                    "Skipping this model."
                ) from error
         elif isinstance(error, (APIConnectionError, OSError)):
-            # If there are too many I/O connections, we increase the number of allowed
-            # file descriptors
-            if "too many open files" in error_msg:
-                raise InvalidBenchmark(
-                    "There are too many file descriptors running. See the current "
-                    "value by running `ulimit -n`. Try increasing it by running "
-                    "`ulimit -n <new-value>` and try again."
-                )
             raise InvalidBenchmark(
                 f"Encountered {type(error)} during generation: {error}."
-            )
+            ) from error
+
+        if isinstance(error, NotFoundError):
+            raise InvalidModel(
+                f"The model {model_id!r} was not found. Please check the model ID "
+                "and try again."
+            ) from error
 
         if isinstance(error, RateLimitError):
             raise InvalidModel(
                 f"You have encountered your rate limit for model {model_id!r}. "
                 "Skipping."
-            )
+            ) from error
 
         if isinstance(error, AuthenticationError):
             raise NeedsAdditionalArgument(
                 cli_argument="--api-key",
                 script_argument="api_key=<your-api-key>",
                 run_with_cli=self.benchmark_config.run_with_cli,
-            )
+            ) from error
 
         raise InvalidBenchmark(
             f"Failed to generate text. The error message was: {error}"
-        )
+        ) from error
 
     async def _generate_async(
         self,
@@ -648,9 +594,9 @@ class LiteLLMModel(BenchmarkModule):
         # for all the requests, preventing "too many open files" errors
         router = Router(
             model_list=[
-
+                litellm.DeploymentTypedDict(
                     model_name=self.model_config.model_id,
-                    litellm_params=
+                    litellm_params=litellm.LiteLLMParamsTypedDict(model=model_id),
                 )
             ]
         )
@@ -660,7 +606,9 @@ class LiteLLMModel(BenchmarkModule):
         semaphore = asyncio.Semaphore(max_concurrent_calls)
         requests = [
             add_semaphore_and_catch_exception(
-                router.acompletion(
+                router.acompletion(
+                    model=model_id, messages=conversation, **generation_kwargs
+                ),
                 semaphore=semaphore,
             )
             for conversation in conversations
@@ -720,6 +668,23 @@ class LiteLLMModel(BenchmarkModule):
             generation_output = generated_message.content or ""
             generation_output = generation_output.strip()
 
+            # In the case where we're dealing with a classification task, the model is
+            # outputting a JSON dictionary, so we will extract the generated text from
+            # within the dictionary
+            generation_dct: dict[str, t.Any] | None = None
+            if LITELLM_CLASSIFICATION_OUTPUT_KEY in generation_output:
+                try:
+                    generation_dct = json.loads(generation_output)
+                    assert isinstance(generation_dct, dict)
+                    if set(generation_dct.keys()) == {
+                        LITELLM_CLASSIFICATION_OUTPUT_KEY
+                    }:
+                        generation_output = str(
+                            generation_dct[LITELLM_CLASSIFICATION_OUTPUT_KEY]
+                        ).strip()
+                except json.JSONDecodeError:
+                    pass
+
             # Structure the model output as a GenerativeModelOutput object
             sequences.append(generation_output)
             if hasattr(model_response_choices, "logprobs"):
@@ -732,6 +697,23 @@ class LiteLLMModel(BenchmarkModule):
                     ]
                     for content in model_response_choices.logprobs.content or list()
                 ]
+
+                # If the model outputted a JSON dictionary, we need to find the
+                # token index of the value within the dictionary, rather than the
+                # first token of the entire output
+                if generation_dct:
+                    key_name = next(iter(generation_dct.keys()))
+                    logprobs_list = [
+                        lst
+                        for lst in logprobs_list
+                        if (
+                            lst
+                            and lst[0]
+                            and (token := lst[0][0].strip(JSON_STRIP_CHARACTERS))
+                            and not key_name.startswith(token)
+                        )
+                    ]
+
                 scores.append(logprobs_list)
             else:
                 log_once(
@@ -805,9 +787,7 @@ class LiteLLMModel(BenchmarkModule):
             repo_info = hf_api.model_info(
                 repo_id=model_id,
                 revision="main",
-                token=
-                or self.benchmark_config.api_key
-                or True,
+                token=get_hf_token(api_key=self.benchmark_config.api_key),
             )
         except (
             RepositoryNotFoundError,
@@ -864,7 +844,7 @@ class LiteLLMModel(BenchmarkModule):
                 run_with_cli=self.benchmark_config.run_with_cli,
             )
 
-
+        tokeniser = load_tokeniser(
            model=None,
            model_id=model_id,
            trust_remote_code=self.benchmark_config.trust_remote_code,
@@ -876,10 +856,10 @@ class LiteLLMModel(BenchmarkModule):
         ):
             vocab_size = hf_config.vocab_size
         elif (
-            hasattr(
-            and
+            hasattr(tokeniser, "vocab_size")
+            and tokeniser.vocab_size is not None
         ):
-            vocab_size =
+            vocab_size = tokeniser.vocab_size
         else:
             vocab_size = -1
         return vocab_size
@@ -910,13 +890,15 @@ class LiteLLMModel(BenchmarkModule):
         if context_length_keys:
             context_length = model_info[context_length_keys[0]]
             if context_length is not None:
-
-
-
-
-
+                if self.log_metadata:
+                    log_once(
+                        f"Detected context length key "
+                        f"{context_length_keys[0]!r} for Ollama model "
+                        f"{ollama_model_id!r}",
+                        level=logging.DEBUG,
+                    )
                 return int(context_length)
-
+        elif self.log_metadata:
            log_once(
                f"Tried to get the maximum length of the Ollama model "
                f"{ollama_model_id!r}, but could not find a context length. "
@@ -944,7 +926,7 @@ class LiteLLMModel(BenchmarkModule):
                 run_with_cli=self.benchmark_config.run_with_cli,
             )
 
-
+        tokeniser = load_tokeniser(
            model=None,
            model_id=model_id,
            trust_remote_code=self.benchmark_config.trust_remote_code,
@@ -952,18 +934,18 @@ class LiteLLMModel(BenchmarkModule):
 
         all_max_lengths: list[int] = list()
 
-        # Add the registered max length of the
+        # Add the registered max length of the tokeniser
         if hasattr(
-
-        ) and
-            all_max_lengths.append(
+            tokeniser, "model_max_length"
+        ) and tokeniser.model_max_length < int(1e30):
+            all_max_lengths.append(tokeniser.model_max_length)
 
         # Add the max length derived from the model's input sizes
-        if hasattr(
+        if hasattr(tokeniser, "max_model_input_sizes"):
             all_max_lengths.extend(
                 [
                     size
-                    for size in
+                    for size in tokeniser.max_model_input_sizes.values()
                     if size is not None
                 ]
             )
@@ -1101,7 +1083,7 @@ class LiteLLMModel(BenchmarkModule):
                 f"Service temporarily unavailable. The error message was: {e}. "
                 "Retrying in 10 seconds..."
             )
-            sleep(
+            sleep(10)
         except APIError as e:
             if "'503 Service Unavailable" not in str(e):
                 raise e
@@ -1211,7 +1193,10 @@ class LiteLLMModel(BenchmarkModule):
 
         if self.benchmark_config.few_shot:
             few_shot_examples = extract_few_shot_examples(
-                dataset=dataset,
+                dataset=dataset,
+                dataset_config=self.dataset_config,
+                benchmark_config=self.benchmark_config,
+                itr_idx=itr_idx,
             )
         else:
             few_shot_examples = list()
@@ -1224,7 +1209,7 @@ class LiteLLMModel(BenchmarkModule):
                 dataset_config=self.dataset_config,
                 instruction_model=True,
                 always_populate_text_field=False,
-
+                tokeniser=None,
             ),
             batched=True,
             load_from_cache_file=False,
@@ -1233,6 +1218,146 @@ class LiteLLMModel(BenchmarkModule):
 
         return dataset
 
+    @cache
+    def get_generation_kwargs(self, dataset_config: DatasetConfig) -> dict[str, t.Any]:
+        """Get the generation arguments for the model.
+
+        Args:
+            dataset_config:
+                The dataset configuration, which is used to determine the generative
+                type of the model. We use this as an argument here rather than using
+                `self.dataset_config` to ensure that that the cache is updated when the
+                dataset configuration changes.
+
+        Returns:
+            The generation arguments for the model.
+        """
+        # Set the core generation arguments
+        generation_kwargs: dict[str, t.Any] = dict(
+            max_completion_tokens=(
+                REASONING_MAX_TOKENS
+                if self.generative_type == GenerativeType.REASONING
+                else dataset_config.max_generated_tokens
+            ),
+            stop=[],
+            temperature=0.0,
+            seed=4242,
+            api_key=self.benchmark_config.api_key,
+            api_base=self.benchmark_config.api_base,
+            api_version=self.benchmark_config.api_version,
+            max_retries=3,
+        )
+
+        # Set up the `response_format` generation argument if we are dealing with a task
+        # using structured generation
+        if dataset_config.task.uses_structured_output:
+            if self.generative_type == GenerativeType.REASONING:
+                log_once(
+                    f"The model {self.model_config.model_id!r} is a reasoning model "
+                    "and thus does not support structured generation, so we do not "
+                    "enable it.",
+                    level=logging.DEBUG,
+                )
+            elif supports_response_schema(model=self.model_config.model_id):
+                if dataset_config.task == NER:
+                    ner_tag_names = list(dataset_config.prompt_label_mapping.values())
+                    keys_and_their_types: dict[str, t.Any] = {
+                        tag_name: (conlist(str, max_length=5), ...)
+                        for tag_name in ner_tag_names
+                    }
+                    pydantic_class = create_model(
+                        "AnswerFormat", **keys_and_their_types
+                    )
+                else:
+                    raise InvalidBenchmark(
+                        "This task requires structured generation, but it has not "
+                        "been implemented for this task yet. Please open an issue "
+                        "at https://github.com/EuroEval/EuroEval/issues."
+                    )
+                generation_kwargs["response_format"] = pydantic_class
+                log_once(
+                    "Enabling structured generation for model "
+                    f"{self.model_config.model_id!r} with the JSON schema "
+                    f"{pydantic_class.model_json_schema()}",
+                    level=logging.DEBUG,
+                )
+            else:
+                generation_kwargs["response_format"] = dict(type="json_object")
+                log_once(
+                    "Enabling structured JSON generation for model "
+                    f"{self.model_config.model_id!r} with no custom JSON schema, as "
+                    "the model does not support schemas.",
+                    level=logging.DEBUG,
+                )
+        elif self.dataset_config.task.uses_logprobs and self.dataset_config.labels:
+            localised_labels = [
+                self.dataset_config.prompt_label_mapping[label]
+                for label in self.dataset_config.labels
+            ]
+            keys_and_their_types = {
+                LITELLM_CLASSIFICATION_OUTPUT_KEY: (t.Literal[*localised_labels], ...)
+            }
+            pydantic_class = create_model("AnswerFormat", **keys_and_their_types)
+            generation_kwargs["response_format"] = pydantic_class
+
+        # If the model is an Ollama reasoning model, we ensure that thinking is enabled
+        if self.is_ollama and self.generative_type == GenerativeType.REASONING:
+            generation_kwargs["think"] = True
+            log_once(
+                "Enabling thinking mode for Ollama model "
+                f"{self.model_config.model_id!r}",
+                level=logging.DEBUG,
+            )
+
+        # Handle manually set parameters
+        if self.buffer["first_label_token_mapping"]:
+            generation_kwargs["logprobs"] = True
+            generation_kwargs["top_logprobs"] = MAX_LITELLM_LOGPROBS
+        if self.model_config.revision == "thinking":
+            generation_kwargs["thinking"] = dict(
+                type="enabled", budget_tokens=REASONING_MAX_TOKENS - 1
+            )
+            log_once(
+                f"Enabling thinking mode for model {self.model_config.model_id!r}",
+                level=logging.DEBUG,
+            )
+        elif self.model_config.revision == "no-thinking":
+            generation_kwargs["thinking"] = dict(budget_tokens=0)
+            log_once(
+                f"Disabling thinking mode for model {self.model_config.model_id!r}",
+                level=logging.DEBUG,
+            )
+        elif self.model_config.revision in {"minimal", "low", "medium", "high"}:
+            generation_kwargs["reasoning_effort"] = self.model_config.revision
+            log_once(
+                f"Enabling reasoning effort {self.model_config.revision!r} for model "
+                f"{self.model_config.model_id!r}",
+                level=logging.DEBUG,
+            )
+
+        # First attempt is a test run with a single conversation to handle errors
+        # quickly. We repeat this multiple times to deal with different types of
+        # errors, and stop if we get a successful response.
+        test_conversation: list[litellm.AllMessageValues] = [
+            litellm.ChatCompletionUserMessage(role="user", content="Test message")
+        ]
+        for _ in range(5):
+            _, failures = safe_run(
+                self._generate_async(
+                    model_id=self.model_config.model_id,
+                    conversations=[test_conversation],
+                    **generation_kwargs,
+                )
+            )
+            if not failures:
+                break
+            for _, error in failures:
+                generation_kwargs = self._handle_exception(
+                    error=error, **generation_kwargs
+                )
+
+        return generation_kwargs
+
 
 def raise_if_wrong_params(
     model_config: ModelConfig, allowed_params: dict[str, list[str]]
@@ -1264,6 +1389,11 @@ def raise_if_wrong_params(
                 msg += " No parameters are allowed."
             raise InvalidModel(msg)
         return
+    else:
+        raise InvalidModel(
+            f"The parameter {param!r} is not supported for the model "
+            f"{model_config.model_id!r}."
+        )
 
 
 def try_download_ollama_model(model_id: str) -> bool:
@@ -1300,11 +1430,11 @@ def try_download_ollama_model(model_id: str) -> bool:
            for model_obj in ollama.list().models
            if model_obj.model is not None
        ]
-    except ConnectionError:
+    except ConnectionError as e:
        raise InvalidModel(
            "Ollama does not seem to be running, so we cannot evaluate the model "
            f"{model_id!r}. Please make sure that Ollama is running and try again."
-        )
+        ) from e
 
    ollama_model_id = "/".join(model_id.split("/")[1:])
    if ollama_model_id not in downloaded_ollama_models:
@@ -1334,12 +1464,12 @@ def try_download_ollama_model(model_id: str) -> bool:
                raise InvalidModel(
                    f"Failed to download Ollama model {ollama_model_id}. "
                    f"The error message was: {inner_e}"
-                )
+                ) from inner_e
            else:
                raise InvalidModel(
                    f"Failed to download Ollama model {ollama_model_id}. "
                    f"The error message was: {e}"
-                )
+                ) from e
 
        # Download the model
        with tqdm(
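The hunks above change the error-handling contract in the LiteLLM module: `_handle_exception` now returns an updated kwargs dict instead of mutating its argument and returning `None`, and callers feed that dict back into the next attempt. The snippet below is a standalone sketch of that pattern, not EuroEval code; all names (`handle_exception`, `generate_with_retries`, `call`) and the specific error-message checks are illustrative assumptions.

import typing as t


def handle_exception(error: Exception, **generation_kwargs: t.Any) -> dict[str, t.Any]:
    """Drop parameters the provider rejected, then return the updated kwargs."""
    message = str(error).lower()
    if "logprobs" in message:
        generation_kwargs.pop("logprobs", None)
        generation_kwargs.pop("top_logprobs", None)
    elif "temperature" in message:
        generation_kwargs.pop("temperature", None)
    elif "seed" in message:
        generation_kwargs.pop("seed", None)
    return generation_kwargs


def generate_with_retries(
    call: t.Callable[..., str], max_attempts: int = 5, **generation_kwargs: t.Any
) -> str:
    """Retry a generation call, letting the handler rewrite the kwargs each time."""
    last_error: Exception | None = None
    for _ in range(max_attempts):
        try:
            return call(**generation_kwargs)
        except Exception as error:  # sketch only: real code catches specific errors
            last_error = error
            generation_kwargs = handle_exception(error, **generation_kwargs)
    raise RuntimeError("Generation failed after repeated attempts.") from last_error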