EuroEval: euroeval-16.0.1-py3-none-any.whl → euroeval-16.1.0-py3-none-any.whl
- euroeval/benchmark_config_factory.py +6 -1
- euroeval/benchmark_modules/base.py +2 -0
- euroeval/benchmark_modules/fresh.py +7 -1
- euroeval/benchmark_modules/hf.py +26 -21
- euroeval/benchmark_modules/litellm.py +258 -131
- euroeval/benchmark_modules/vllm.py +79 -40
- euroeval/benchmarker.py +11 -2
- euroeval/cli.py +14 -1
- euroeval/constants.py +1 -1
- euroeval/data_models.py +77 -6
- euroeval/dataset_configs/__init__.py +1 -0
- euroeval/dataset_configs/danish.py +14 -0
- euroeval/dataset_configs/dutch.py +14 -0
- euroeval/dataset_configs/english.py +22 -0
- euroeval/dataset_configs/estonian.py +15 -7
- euroeval/dataset_configs/finnish.py +14 -0
- euroeval/dataset_configs/french.py +14 -0
- euroeval/dataset_configs/german.py +23 -0
- euroeval/dataset_configs/italian.py +14 -0
- euroeval/dataset_configs/latvian.py +14 -0
- euroeval/dataset_configs/norwegian.py +14 -0
- euroeval/dataset_configs/polish.py +126 -0
- euroeval/dataset_configs/portuguese.py +14 -0
- euroeval/dataset_configs/spanish.py +14 -0
- euroeval/dataset_configs/swedish.py +25 -0
- euroeval/enums.py +12 -0
- euroeval/generation.py +17 -8
- euroeval/generation_utils.py +58 -10
- euroeval/metrics/pipeline.py +1 -1
- euroeval/prompt_templates/linguistic_acceptability.py +9 -0
- euroeval/prompt_templates/multiple_choice.py +27 -1
- euroeval/prompt_templates/named_entity_recognition.py +20 -0
- euroeval/prompt_templates/reading_comprehension.py +11 -0
- euroeval/prompt_templates/sentiment_classification.py +15 -0
- euroeval/prompt_templates/summarization.py +27 -1
- euroeval/scores.py +5 -0
- euroeval/task_group_utils/question_answering.py +29 -29
- euroeval/task_group_utils/sequence_classification.py +10 -33
- euroeval/task_group_utils/token_classification.py +3 -3
- euroeval/tasks.py +4 -4
- euroeval/{tokenization_utils.py → tokenisation_utils.py} +40 -23
- euroeval/utils.py +36 -3
- {euroeval-16.0.1.dist-info → euroeval-16.1.0.dist-info}/METADATA +1 -1
- euroeval-16.1.0.dist-info/RECORD +70 -0
- euroeval-16.0.1.dist-info/RECORD +0 -69
- {euroeval-16.0.1.dist-info → euroeval-16.1.0.dist-info}/WHEEL +0 -0
- {euroeval-16.0.1.dist-info → euroeval-16.1.0.dist-info}/entry_points.txt +0 -0
- {euroeval-16.0.1.dist-info → euroeval-16.1.0.dist-info}/licenses/LICENSE +0 -0
--- a/euroeval/benchmark_modules/litellm.py
+++ b/euroeval/benchmark_modules/litellm.py
@@ -31,7 +31,7 @@ from litellm.exceptions import (
 )
 from litellm.llms.vertex_ai.common_utils import VertexAIError
 from litellm.router import Router
-from litellm.types.utils import ChoiceLogprobs
+from litellm.types.utils import ChoiceLogprobs, Logprobs
 from litellm.utils import supports_reasoning, supports_response_schema
 from pydantic import conlist, create_model
 from requests.exceptions import RequestException
@@ -65,7 +65,11 @@ from ..exceptions import (
     NeedsEnvironmentVariable,
     NeedsExtraInstalled,
 )
-from ..generation_utils import …
+from ..generation_utils import (
+    apply_prompt,
+    extract_few_shot_examples,
+    raise_if_wrong_params,
+)
 from ..task_group_utils import (
     question_answering,
     sequence_classification,
@@ -73,7 +77,7 @@ from ..task_group_utils import (
     token_classification,
 )
 from ..tasks import NER
-from ..…
+from ..tokenisation_utils import get_first_label_token_mapping
 from ..types import ExtractLabelsFunction
 from ..utils import (
     add_semaphore_and_catch_exception,
@@ -81,6 +85,7 @@ from ..utils import (
     get_hf_token,
     log_once,
     safe_run,
+    split_model_id,
 )
 from .base import BenchmarkModule
 from .hf import HuggingFaceEncoderModel, load_hf_model_config, load_tokeniser
@@ -153,21 +158,6 @@ NUM_PARAMS_MAPPING = {
 }
 
 
-ALLOWED_PARAMS = {
-    # OpenAI models
-    r"gpt-5-.*": ["minimal", "low", "medium", "high"],
-    r"o[1-9](-mini|-preview)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?": ["low", "medium", "high"],
-    # Anthropic models
-    r"(anthropic/)?claude-3-7-sonnet.*": ["no-thinking", "thinking"],
-    r"(anthropic/)?claude-(sonnet|opus)-4.*": ["no-thinking", "thinking"],
-    # Gemini models
-    r"(gemini/)?gemini-2.5-flash-lite.*": ["no-thinking", "thinking"],
-    r"(gemini/)?gemini-2.5-flash.*": ["no-thinking", "thinking"],
-    # xAI models
-    r"(xai/)?grok-3-mini(-fast)?(-beta)?": ["low", "medium", "high"],
-}
-
-
 REASONING_MODELS = [
     r"o[1-9](-mini|-preview)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?",
     r"(gemini/)?gemini.*thinking.*",
@@ -175,6 +165,15 @@ REASONING_MODELS = [
     r"(xai/)?grok-3-mini.*",
 ]
 
+BASE_DECODER_MODELS = [
+    r"gpt-3.5-turbo-instruct.*",
+    r"ada-[0-9]{3}",
+    r"babbage-[0-9]{3}",
+    r"curie-[0-9]{3}",
+    r"davinci-[0-9]{3}",
+    r"text-davinci-[0-9]{3}",
+]
+
 
 class LiteLLMModel(BenchmarkModule):
     """A generative model from LiteLLM."""
@@ -182,6 +181,26 @@ class LiteLLMModel(BenchmarkModule):
     fresh_model = False
     batching_preference = BatchingPreference.ALL_AT_ONCE
     high_priority = False
+    allowed_params = {
+        # OpenAI models
+        re.compile(r"gpt-5-.*"): ["minimal", "low", "medium", "high"],
+        re.compile(r"o[1-9](-mini|-preview)?(-[0-9]{4}-[0-9]{2}-[0-9]{2})?"): [
+            "low",
+            "medium",
+            "high",
+        ],
+        # Anthropic models
+        re.compile(r"(anthropic/)?claude-3-7-sonnet.*"): ["no-thinking", "thinking"],
+        re.compile(r"(anthropic/)?claude-(sonnet|opus)-4.*"): [
+            "no-thinking",
+            "thinking",
+        ],
+        # Gemini models
+        re.compile(r"(gemini/)?gemini-2.5-flash-lite.*"): ["no-thinking", "thinking"],
+        re.compile(r"(gemini/)?gemini-2.5-flash.*"): ["no-thinking", "thinking"],
+        # xAI models
+        re.compile(r"(xai/)?grok-3-mini(-fast)?(-beta)?"): ["low", "medium", "high"],
+    }
 
     def __init__(
         self,
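The new `allowed_params` class attribute maps compiled model-ID patterns to the parameter values each model family accepts, and the `raise_if_wrong_params` helper (now imported from `..generation_utils`) validates the configured parameter against it. A minimal standalone sketch of that kind of regex-keyed validation, using hypothetical `ALLOWED_PARAMS` and `check_param` names rather than EuroEval's own code:

# Illustrative only: a regex-keyed parameter check in the spirit of
# `allowed_params` + `raise_if_wrong_params`; the names below are hypothetical.
import re

ALLOWED_PARAMS: dict[re.Pattern[str], list[str]] = {
    re.compile(r"gpt-5-.*"): ["minimal", "low", "medium", "high"],
    re.compile(r"(anthropic/)?claude-(sonnet|opus)-4.*"): ["no-thinking", "thinking"],
}


def check_param(model_id: str, param: str | None) -> None:
    """Raise ValueError if `param` is not valid for `model_id`."""
    if param is None:
        return
    for pattern, allowed in ALLOWED_PARAMS.items():
        if pattern.fullmatch(model_id):
            if param not in allowed:
                raise ValueError(
                    f"Invalid parameter {param!r} for {model_id!r}; "
                    f"allowed values: {', '.join(allowed)}."
                )
            return
    raise ValueError(f"The model {model_id!r} does not accept any parameters.")


check_param("gpt-5-mini", "low")  # passes silently
check_param("anthropic/claude-sonnet-4-20250514", "thinking")  # passes silently

Keying the table on compiled patterns keeps the lookup a simple linear scan while still matching dated model aliases.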
@@ -206,6 +225,10 @@ class LiteLLMModel(BenchmarkModule):
                 The generation kwargs to pass to the model. If None, default values will
                 be used.
         """
+        raise_if_wrong_params(
+            model_config=model_config, allowed_params=self.allowed_params
+        )
+
         # Detect whether the model is an Ollama model, as we need to extract metadata
         # differently for these models
         self.is_ollama = model_config.model_id.startswith(
@@ -217,8 +240,6 @@ class LiteLLMModel(BenchmarkModule):
             else ollama.ShowResponse(model_info=None)
         )
 
-        raise_if_wrong_params(model_config=model_config, allowed_params=ALLOWED_PARAMS)
-
         super().__init__(
             model_config=model_config,
             dataset_config=dataset_config,
@@ -242,21 +263,27 @@ class LiteLLMModel(BenchmarkModule):
         Returns:
             The generative type of the model, or None if it has not been set yet.
         """
-        if self.…
+        if self.benchmark_config.generative_type is not None:
+            type_ = self.benchmark_config.generative_type
+        elif self.is_ollama:
             reasoning_model = "thinking" in (self._ollama_show.capabilities or [])
             type_ = (
                 GenerativeType.REASONING
                 if reasoning_model
                 else GenerativeType.INSTRUCTION_TUNED
             )
-        elif self.model_config.…
+        elif self.model_config.param in {"thinking"}:
             type_ = GenerativeType.REASONING
-        elif self.model_config.…
+        elif self.model_config.param in {"no-thinking"}:
             type_ = GenerativeType.INSTRUCTION_TUNED
         elif re.fullmatch(
             pattern="|".join(REASONING_MODELS), string=self.model_config.model_id
         ):
             type_ = GenerativeType.REASONING
+        elif re.fullmatch(
+            pattern="|".join(BASE_DECODER_MODELS), string=self.model_config.model_id
+        ):
+            type_ = GenerativeType.BASE
         elif supports_reasoning(model=self.model_config.model_id):
             type_ = GenerativeType.REASONING
         else:
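In the `generative_type` hunk above, the model type is resolved in order of precedence: an explicit benchmark-config override, Ollama capabilities, the `thinking`/`no-thinking` parameter, the `REASONING_MODELS` and new `BASE_DECODER_MODELS` regex lists, and finally LiteLLM's `supports_reasoning`. A rough, self-contained sketch of the regex-list step (pattern lists abbreviated and the `classify` helper invented for illustration):

# Sketch of matching a model ID against pattern lists, as the generative_type
# property does with REASONING_MODELS and BASE_DECODER_MODELS above.
import re

REASONING_MODELS = [r"o[1-9](-mini|-preview)?", r"(xai/)?grok-3-mini.*"]
BASE_DECODER_MODELS = [r"gpt-3.5-turbo-instruct.*", r"davinci-[0-9]{3}"]


def classify(model_id: str) -> str:
    # re.fullmatch against the alternation of all patterns in a list
    if re.fullmatch("|".join(REASONING_MODELS), model_id):
        return "reasoning"
    if re.fullmatch("|".join(BASE_DECODER_MODELS), model_id):
        return "base"
    return "instruction-tuned"


print(classify("o3-mini"))                 # reasoning
print(classify("gpt-3.5-turbo-instruct"))  # base
print(classify("gpt-4o"))                  # instruction-tuned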
@@ -279,9 +306,20 @@ class LiteLLMModel(BenchmarkModule):
 
         Returns:
             The generated model outputs.
+
+        Raises:
+            InvalidBenchmark:
+                If the inputs do not contain either 'messages' or 'text' keys.
         """
-        …
-        …
+        model_inputs: list[list[litellm.AllMessageValues] | str]
+        if "messages" in inputs:
+            model_inputs = inputs["messages"]
+        elif "text" in inputs:
+            model_inputs = inputs["text"]
+        else:
+            raise InvalidBenchmark(
+                "The inputs must contain either 'messages' or 'text' keys."
+            )
 
         # Get the mapping from labels to the first token in the label. We call this each
         # time we generate a new dataset since the dataset config can change
@@ -294,22 +332,22 @@ class LiteLLMModel(BenchmarkModule):
         )
 
         all_responses: dict[int, "ModelResponse"] = {}
-        …
-            enumerate(…
+        inputs_to_run: list[tuple[int, list[litellm.AllMessageValues] | str]] = list(
+            enumerate(model_inputs)
         )
         for attempt in range(num_attempts := 10):
-            if not …
+            if not inputs_to_run:
                 break
 
             generation_kwargs = self.generation_kwargs or self.get_generation_kwargs(
                 dataset_config=self.dataset_config
             )
 
-            batch_indices, …
+            batch_indices, batch_inputs = zip(*inputs_to_run)
             successes, failures = safe_run(
                 self._generate_async(
                     model_id=self.model_config.model_id,
-                    …
+                    inputs=list(batch_inputs),
                     **generation_kwargs,
                 )
             )
@@ -321,17 +359,17 @@ class LiteLLMModel(BenchmarkModule):
 
             # If all requests were successful, break
             if not failures:
-                …
+                inputs_to_run = []
                 break
 
             # Put the failed requests back in the queue to try again
-            …
-                (batch_indices[idx], …
+            inputs_to_run = [
+                (batch_indices[idx], model_inputs[batch_indices[idx]])
                 for idx, _ in failures
             ]
             logger.debug(
                 f"Attempt {attempt + 1:,}/{num_attempts:,}: retrying "
-                f"{len(…
+                f"{len(inputs_to_run):,} failed message(s)"
             )
 
             # Attempt to handle the exceptions, to improve the chance of getting
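The two hunks above implement the batch retry loop: inputs are tracked as (index, input) pairs, successful responses are stored by index, and only the failed indices are re-queued for up to ten attempts. A small synchronous sketch of the same bookkeeping (the `flaky_call` stand-in replaces the real async LiteLLM request):

# Illustrative retry loop over indexed inputs; only failed items are re-queued,
# successes are stored by index and re-ordered at the end.
import random


def flaky_call(text: str) -> str:
    if random.random() < 0.3:
        raise RuntimeError("transient failure")
    return text.upper()


inputs = ["a", "b", "c", "d"]
results: dict[int, str] = {}
to_run = list(enumerate(inputs))

for attempt in range(num_attempts := 10):
    if not to_run:
        break
    failures = []
    for idx, item in to_run:
        try:
            results[idx] = flaky_call(item)
        except RuntimeError:
            failures.append((idx, item))
    to_run = failures  # retry only what failed

ordered = [results[i] for i in range(len(inputs))]
print(ordered)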
@@ -349,14 +387,14 @@ class LiteLLMModel(BenchmarkModule):
         )
 
         # Extract the generations from the model output
-        ordered_responses = [all_responses[i] for i in range(len(…
+        ordered_responses = [all_responses[i] for i in range(len(model_inputs))]
         model_output = self._create_model_output(
             model_responses=ordered_responses, model_id=self.model_config.model_id
         )
 
-        if len(…
+        if len(model_inputs) != len(model_output.sequences):
             raise InvalidBenchmark(
-                f"Number of model inputs ({len(…
+                f"Number of model inputs ({len(model_inputs):,}) does not match the "
                 f"number of model outputs ({len(model_output.sequences):,})."
             )
 
@@ -378,16 +416,24 @@ class LiteLLMModel(BenchmarkModule):
         model_id = self.model_config.model_id
 
         # Error messages that we want to catch and handle
-        stop_messages = […
+        stop_messages = [
+            "stop_sequences",
+            "'stop' is not supported with this model",
+            "'$.stop' is invalid",
+        ]
         logprobs_messages = [
             "you are not allowed to request logprobs",
             "you've reached the maximum number of requests with logprobs",
             "logprobs is not supported",
             "logprobs is not enabled",
         ]
+        top_logprobs_messages = ["got an unexpected keyword argument 'top_logprobs'"]
         logprobs_pattern = re.compile(
             r"does not support parameters: \[.*'top_logprobs'.*\]"
         )
+        max_completion_tokens_pattern = re.compile(
+            r"does not support parameters: \[.*'max_completion_tokens'.*\]"
+        )
         temperature_messages = [
             "'temperature' is not supported with this model.",
             "temperature is not supported with this model",
@@ -406,6 +452,10 @@ class LiteLLMModel(BenchmarkModule):
         )
         requires_thinking_disabled_messages = ["thinking.type: Field required"]
         seed_pattern = re.compile(r"does not support parameters: \[.*'seed'.*\]")
+        response_format_messages = [
+            "got an unexpected keyword argument 'response_format'",
+            "The model outputs empty dictionaries.",
+        ]
 
         if any(msg.lower() in error_msg for msg in stop_messages):
             log_once(
@@ -430,6 +480,24 @@ class LiteLLMModel(BenchmarkModule):
             generation_kwargs.pop("logprobs", None)
             generation_kwargs.pop("top_logprobs", None)
             return generation_kwargs
+        elif any(msg.lower() in error_msg for msg in top_logprobs_messages):
+            log_once(
+                f"The model {model_id!r} does not support the `top_logprobs` argument, "
+                "so moving the value to `logprobs`.",
+                level=logging.DEBUG,
+            )
+            generation_kwargs["logprobs"] = generation_kwargs.pop("top_logprobs", None)
+            return generation_kwargs
+        elif max_completion_tokens_pattern.search(string=error_msg):
+            log_once(
+                f"The model {model_id!r} does not support max_completion_tokens, so "
+                "disabling it.",
+                level=logging.DEBUG,
+            )
+            generation_kwargs["max_tokens"] = generation_kwargs.pop(
+                "max_completion_tokens", None
+            )
+            return generation_kwargs
         elif any(msg.lower() in error_msg for msg in temperature_messages):
             log_once(
                 f"The model {model_id!r} does not support "
@@ -510,6 +578,14 @@ class LiteLLMModel(BenchmarkModule):
             )
             generation_kwargs.pop("seed", None)
             return generation_kwargs
+        elif any(msg.lower() in error_msg for msg in response_format_messages):
+            log_once(
+                f"The model {model_id!r} does not support the `response_format` "
+                "parameter, so disabling it.",
+                level=logging.DEBUG,
+            )
+            generation_kwargs.pop("response_format", None)
+            return generation_kwargs
         # If there are too many I/O connections, we increase the number of allowed file
         # descriptors
         elif "too many open files" in error_msg:
@@ -572,7 +648,7 @@ class LiteLLMModel(BenchmarkModule):
     async def _generate_async(
         self,
         model_id: str,
-        …
+        inputs: list[list[litellm.AllMessageValues] | str],
         **generation_kwargs,
     ) -> tuple[list[tuple[int, "ModelResponse"]], list[tuple[int, Exception]]]:
         """Generate outputs from the model asynchronously.
@@ -580,8 +656,8 @@ class LiteLLMModel(BenchmarkModule):
         Args:
             model_id:
                 The ID of the model to use for generation.
-            …
-                The …
+            inputs:
+                The inputs to pass to the model.
             **generation_kwargs:
                 Additional generation arguments to pass to the model.
 
@@ -604,17 +680,51 @@ class LiteLLMModel(BenchmarkModule):
         # Get the LLM generations asynchronously
         max_concurrent_calls = 20
         semaphore = asyncio.Semaphore(max_concurrent_calls)
-        …
-        …
-        …
-        …
-        )…
-        …
-        …
-        …
-        …
+        if self.generative_type == GenerativeType.BASE:
+            if not all(isinstance(input_, str) for input_ in inputs):
+                raise InvalidBenchmark(
+                    "For base generative models, all inputs must be strings."
+                )
+            requests = [
+                add_semaphore_and_catch_exception(
+                    router.atext_completion(
+                        model=model_id, prompt=input_, **generation_kwargs
+                    ),
+                    semaphore=semaphore,
+                )
+                for input_ in inputs
+                if isinstance(input_, str)
+            ]
+        else:
+            if not all(isinstance(input_, list) for input_ in inputs):
+                raise InvalidBenchmark(
+                    "For instruction-tuned and reasoning generative models, all "
+                    "inputs must be lists of messages."
+                )
+            requests = [
+                add_semaphore_and_catch_exception(
+                    router.acompletion(
+                        model=model_id, messages=input_, **generation_kwargs
+                    ),
+                    semaphore=semaphore,
+                )
+                for input_ in inputs
+                if isinstance(input_, list)
+            ]
         responses = await tqdm_async.gather(*requests, leave=False)
 
+        # If we are performing structured generation and the model just outputs an empty
+        # dictionary, then we convert those to exceptions, to disable structured
+        # generation
+        if "response_format" in generation_kwargs:
+            responses = [
+                RuntimeError("The model outputs empty dictionaries.")
+                if not isinstance(response, Exception)
+                and any(choice.message.content == "{}" for choice in response.choices)
+                else response
+                for response in responses
+            ]
+
         # Separate the successful responses from the failed ones
         successes = [
             (idx, response)
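`_generate_async` now dispatches `router.atext_completion` for base models (plain string prompts) and `router.acompletion` for chat models (message lists), wrapping every call in a semaphore via `add_semaphore_and_catch_exception` so that at most 20 requests are in flight and a single failure does not abort the batch. A standalone sketch of that concurrency pattern, with a `fake_completion` placeholder instead of LiteLLM:

# Sketch of the semaphore-plus-gather pattern: cap in-flight calls, catch
# exceptions per request, then split successes from failures by index.
# `fake_completion` and `with_semaphore` are illustrative stand-ins.
import asyncio


async def fake_completion(prompt: str) -> str:
    await asyncio.sleep(0.01)
    if "bad" in prompt:
        raise RuntimeError("simulated API error")
    return prompt[::-1]


async def with_semaphore(coro, semaphore: asyncio.Semaphore):
    async with semaphore:
        try:
            return await coro
        except Exception as exc:  # returned, not raised, so the batch keeps going
            return exc


async def main() -> None:
    semaphore = asyncio.Semaphore(20)  # max_concurrent_calls
    prompts = ["hello", "bad prompt", "world"]
    requests = [with_semaphore(fake_completion(p), semaphore) for p in prompts]
    responses = await asyncio.gather(*requests)
    successes = [(i, r) for i, r in enumerate(responses) if not isinstance(r, Exception)]
    failures = [(i, r) for i, r in enumerate(responses) if isinstance(r, Exception)]
    print(successes, failures)


asyncio.run(main())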
@@ -630,7 +740,10 @@ class LiteLLMModel(BenchmarkModule):
         # Close connections
         for request in requests:
             if hasattr(request, "close"):
-                …
+                try:
+                    request.close()
+                except RuntimeError as e:
+                    logger.debug(f"RuntimeError during request.close(): {e}")
 
         return successes, failures
 
@@ -663,10 +776,18 @@ class LiteLLMModel(BenchmarkModule):
                 continue
 
             model_response_choices = model_response.choices[0]
-            …
-            …
-            …
-            …
+
+            if isinstance(model_response_choices, litellm.Choices):
+                generated_message: litellm.Message = model_response_choices.message
+                generation_output = generated_message.content or ""
+                generation_output = generation_output.strip()
+            elif isinstance(model_response_choices, litellm.litellm.TextChoices):
+                generation_output = model_response_choices.text or ""
+            else:
+                raise InvalidBenchmark(
+                    "The model response choices must be of type Choices or "
+                    f"TextChoices. Got {type(model_response_choices)}."
+                )
 
             # In the case where we're dealing with a classification task, the model is
             # outputting a JSON dictionary, so we will extract the generated text from
@@ -687,40 +808,55 @@ class LiteLLMModel(BenchmarkModule):
 
             # Structure the model output as a GenerativeModelOutput object
             sequences.append(generation_output)
-            if …
+            if (
+                hasattr(model_response_choices, "logprobs")
+                and model_response_choices.logprobs is not None
+            ):
                 logprobs_obj = model_response_choices.logprobs
+
+                if not isinstance(logprobs_obj, (Logprobs, ChoiceLogprobs)):
+                    log_once(
+                        "The logprobs object is malformed, so we won't use logprobs to "
+                        "determine the labels.",
+                        level=logging.WARNING,
+                    )
+                    continue
+
+                logprobs_list: list[list[tuple[str, float]]]
                 if isinstance(logprobs_obj, ChoiceLogprobs):
-                    logprobs_list …
+                    logprobs_list = [
                         [
                             (top_logprob.token, top_logprob.logprob)
                             for top_logprob in content.top_logprobs
                         ]
-                        for content in …
+                        for content in logprobs_obj.content or list()
+                    ]
+                else:
+                    logprobs_list = [
+                        [
+                            (token, logprob)
+                            for token, logprob in (top_logprobs_dct or dict()).items()
+                        ]
+                        for top_logprobs_dct in logprobs_obj.top_logprobs or list()
                     ]
 
-                    …
-                    …
-                    …
-                    …
-                    …
-                    …
+                # If the model outputted a JSON dictionary, we need to find the
+                # token index of the value within the dictionary, rather than the
+                # first token of the entire output
+                if generation_dct:
+                    key_name = next(iter(generation_dct.keys()))
+                    logprobs_list = [
+                        lst
+                        for lst in logprobs_list
+                        if (
                             lst
-                            …
-                            …
-                            …
-                            …
-                            …
-                            and not key_name.startswith(token)
-                        )
-                    ]
+                            and lst[0]
+                            and (token := lst[0][0].strip(JSON_STRIP_CHARACTERS))
+                            and not key_name.startswith(token)
+                        )
+                    ]
 
-                …
-                else:
-                    log_once(
-                        "The logprobs object is malformed, so we won't use logprobs to "
-                        "determine the labels.",
-                        level=logging.WARNING,
-                    )
+                scores.append(logprobs_list)
 
         if not sequences:
             logger.warning(
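The `_create_model_output` changes above accept two logprob layouts: chat-style `ChoiceLogprobs`, where each content entry carries top-logprob objects with `.token`/`.logprob` attributes, and text-completion-style `Logprobs`, where `top_logprobs` is a list of token-to-logprob dictionaries. Both are normalised to `list[list[tuple[str, float]]]`. A simplified sketch with stand-in dataclasses (not litellm's real types):

# Normalising the two logprob layouts into list[list[tuple[str, float]]].
# TopLogprob and ChatContent are simplified stand-ins for litellm's types.
from dataclasses import dataclass


@dataclass
class TopLogprob:
    token: str
    logprob: float


@dataclass
class ChatContent:
    top_logprobs: list[TopLogprob]


def from_chat(contents: list[ChatContent]) -> list[list[tuple[str, float]]]:
    return [[(t.token, t.logprob) for t in c.top_logprobs] for c in contents]


def from_text(top_logprobs: list[dict[str, float] | None]) -> list[list[tuple[str, float]]]:
    return [list((d or {}).items()) for d in top_logprobs]


chat = [ChatContent([TopLogprob("yes", -0.1), TopLogprob("no", -2.3)])]
text = [{"yes": -0.1, "no": -2.3}, None]
print(from_chat(chat))  # [[('yes', -0.1), ('no', -2.3)]]
print(from_text(text))  # [[('yes', -0.1), ('no', -2.3)], []]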
@@ -1047,7 +1183,7 @@ class LiteLLMModel(BenchmarkModule):
             Whether the model exists, or an error describing why we cannot check
             whether the model exists.
         """
-        model_id …
+        model_id = split_model_id(model_id=model_id).model_id
         if model_id in litellm.model_list:
             return True
 
@@ -1135,10 +1271,29 @@ class LiteLLMModel(BenchmarkModule):
         Returns:
             The model configuration.
         """
-        …
+        model_id_components = split_model_id(model_id=model_id)
+
+        # Backwards compatibility: If the revision is set but not the parameter, we
+        # assume that the revision is actually the parameter and log this as a warning.
+        if model_id_components.revision != "main" and model_id_components.param is None:
+            proper_model_id = (
+                f"{model_id_components.model_id}#{model_id_components.revision}"
+            )
+            log_once(
+                f"The model ID {model_id!r} specifies a revision "
+                f"{model_id_components.revision!r} but not a parameter. We assume "
+                "that the revision is actually the parameter and set the revision "
+                "to 'main'. In the future, use the new '#' syntax to specify the "
+                f"parameter (in this case, this would be {proper_model_id!r}), as this "
+                "will be an error in future versions of EuroEval."
+            )
+            model_id_components.param = model_id_components.revision
+            model_id_components.revision = "main"
+
         return ModelConfig(
-            model_id=model_id,
-            revision=revision,
+            model_id=model_id_components.model_id,
+            revision=model_id_components.revision,
+            param=model_id_components.param,
             task="text-generation",
             languages=list(),
             merge=False,
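The hunk above routes model IDs through `split_model_id` and, for backwards compatibility, reinterprets a bare revision as the parameter while warning that the new '#' syntax should be used. A rough illustration of that parsing rule, assuming '@' marks the revision and '#' marks the parameter (the actual separators and return type live in EuroEval's `split_model_id`, not here):

# Hypothetical parser for IDs like "gpt-5-mini#low" or "some/model@main#thinking",
# mirroring the backwards-compatibility rule in the diff above: if only a
# revision is given, treat it as the parameter and reset the revision to "main".
from dataclasses import dataclass


@dataclass
class ModelIdComponents:
    model_id: str
    revision: str = "main"
    param: str | None = None


def parse_model_id(raw: str) -> ModelIdComponents:
    base, _, param = raw.partition("#")
    model_id, _, revision = base.partition("@")
    components = ModelIdComponents(model_id, revision or "main", param or None)
    if components.revision != "main" and components.param is None:
        # Old syntax: the "revision" was really the parameter.
        components.param = components.revision
        components.revision = "main"
    return components


print(parse_model_id("gpt-5-mini#low"))
print(parse_model_id("o3-mini@high"))  # old style, re-interpreted as param="high"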
@@ -1207,7 +1362,7 @@ class LiteLLMModel(BenchmarkModule):
                 few_shot_examples=few_shot_examples,
                 model_config=self.model_config,
                 dataset_config=self.dataset_config,
-                …
+                generative_type=self.generative_type,
                 always_populate_text_field=False,
                 tokeniser=None,
             ),
@@ -1313,7 +1468,7 @@ class LiteLLMModel(BenchmarkModule):
         if self.buffer["first_label_token_mapping"]:
             generation_kwargs["logprobs"] = True
             generation_kwargs["top_logprobs"] = MAX_LITELLM_LOGPROBS
-        if self.model_config.…
+        if self.model_config.param == "thinking":
             generation_kwargs["thinking"] = dict(
                 type="enabled", budget_tokens=REASONING_MAX_TOKENS - 1
             )
@@ -1321,16 +1476,16 @@ class LiteLLMModel(BenchmarkModule):
                 f"Enabling thinking mode for model {self.model_config.model_id!r}",
                 level=logging.DEBUG,
             )
-        elif self.model_config.…
+        elif self.model_config.param == "no-thinking":
             generation_kwargs["thinking"] = dict(budget_tokens=0)
             log_once(
                 f"Disabling thinking mode for model {self.model_config.model_id!r}",
                 level=logging.DEBUG,
             )
-        elif self.model_config.…
-            generation_kwargs["reasoning_effort"] = self.model_config.…
+        elif self.model_config.param in {"minimal", "low", "medium", "high"}:
+            generation_kwargs["reasoning_effort"] = self.model_config.param
             log_once(
-                f"Enabling reasoning effort {self.model_config.…
+                f"Enabling reasoning effort {self.model_config.param!r} for model "
                 f"{self.model_config.model_id!r}",
                 level=logging.DEBUG,
             )
@@ -1338,14 +1493,18 @@ class LiteLLMModel(BenchmarkModule):
         # First attempt is a test run with a single conversation to handle errors
         # quickly. We repeat this multiple times to deal with different types of
         # errors, and stop if we get a successful response.
-        …
-        …
-        …
-        …
+        test_input: list[litellm.AllMessageValues] | str
+        if self.generative_type == GenerativeType.BASE:
+            test_input = "Test message"
+        else:
+            test_input = [
+                litellm.ChatCompletionUserMessage(role="user", content="Test message")
+            ]
+        for _ in range(num_attempts := 10):
             _, failures = safe_run(
                 self._generate_async(
                     model_id=self.model_config.model_id,
-                    …
+                    inputs=[test_input],
                     **generation_kwargs,
                 )
             )
@@ -1355,47 +1514,15 @@ class LiteLLMModel(BenchmarkModule):
                 generation_kwargs = self._handle_exception(
                     error=error, **generation_kwargs
                 )
+        else:
+            raise InvalidModel(
+                "Failed to get a successful response from the model "
+                f"{self.model_config.model_id!r} after {num_attempts} attempts."
+            )
 
         return generation_kwargs
 
 
-def raise_if_wrong_params(
-    model_config: ModelConfig, allowed_params: dict[str, list[str]]
-) -> None:
-    """Raise an error if the model configuration has invalid parameters.
-
-    Args:
-        model_config:
-            The model configuration.
-        allowed_params:
-            The allowed parameters for the model.
-
-    Raises:
-        InvalidModel:
-            If the model configuration has invalid parameters.
-    """
-    param = model_config.revision
-    if param == "":
-        return
-    for model_regex, allowed_params_list in allowed_params.items():
-        if re.fullmatch(pattern=model_regex, string=model_config.model_id):
-            if param not in allowed_params_list:
-                msg = (
-                    f"Invalid parameter {param!r} for model {model_config.model_id!r}."
-                )
-                if allowed_params_list:
-                    msg += f" Allowed parameters are: {', '.join(allowed_params_list)}."
-                else:
-                    msg += " No parameters are allowed."
-                raise InvalidModel(msg)
-            return
-        else:
-            raise InvalidModel(
-                f"The parameter {param!r} is not supported for the model "
-                f"{model_config.model_id!r}."
-            )
-
-
 def try_download_ollama_model(model_id: str) -> bool:
     """Try to download an Ollama model.
 