EuroEval 15.16.0-py3-none-any.whl → 16.0.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
This release has been flagged as potentially problematic.
- euroeval/__init__.py +3 -7
- euroeval/benchmark_config_factory.py +3 -7
- euroeval/benchmark_modules/base.py +35 -19
- euroeval/benchmark_modules/fresh.py +24 -19
- euroeval/benchmark_modules/hf.py +136 -154
- euroeval/benchmark_modules/litellm.py +190 -110
- euroeval/benchmark_modules/vllm.py +161 -114
- euroeval/benchmarker.py +49 -22
- euroeval/cli.py +3 -3
- euroeval/constants.py +13 -15
- euroeval/data_loading.py +33 -28
- euroeval/data_models.py +53 -7
- euroeval/dataset_configs/__init__.py +2 -0
- euroeval/dataset_configs/danish.py +38 -1
- euroeval/dataset_configs/dutch.py +38 -1
- euroeval/dataset_configs/english.py +38 -1
- euroeval/dataset_configs/estonian.py +95 -0
- euroeval/dataset_configs/faroese.py +38 -0
- euroeval/dataset_configs/finnish.py +39 -1
- euroeval/dataset_configs/french.py +38 -1
- euroeval/dataset_configs/german.py +38 -1
- euroeval/dataset_configs/icelandic.py +39 -1
- euroeval/dataset_configs/italian.py +38 -1
- euroeval/dataset_configs/latvian.py +81 -0
- euroeval/dataset_configs/norwegian.py +38 -1
- euroeval/dataset_configs/portuguese.py +38 -1
- euroeval/dataset_configs/spanish.py +38 -1
- euroeval/dataset_configs/swedish.py +38 -1
- euroeval/enums.py +0 -6
- euroeval/finetuning.py +6 -6
- euroeval/generation.py +25 -14
- euroeval/generation_utils.py +46 -14
- euroeval/languages.py +947 -187
- euroeval/metrics/__init__.py +6 -0
- euroeval/metrics/base.py +76 -0
- euroeval/metrics/huggingface.py +192 -0
- euroeval/metrics/llm_as_a_judge.py +257 -0
- euroeval/metrics/pipeline.py +234 -0
- euroeval/metrics/speed.py +51 -0
- euroeval/prompt_templates/linguistic_acceptability.py +40 -2
- euroeval/prompt_templates/multiple_choice.py +23 -2
- euroeval/prompt_templates/named_entity_recognition.py +65 -2
- euroeval/prompt_templates/reading_comprehension.py +42 -2
- euroeval/prompt_templates/sentiment_classification.py +46 -2
- euroeval/prompt_templates/summarization.py +24 -4
- euroeval/scores.py +7 -2
- euroeval/speed_benchmark.py +6 -6
- euroeval/task_group_utils/multiple_choice_classification.py +17 -6
- euroeval/task_group_utils/question_answering.py +35 -28
- euroeval/task_group_utils/sequence_classification.py +96 -23
- euroeval/task_group_utils/text_to_text.py +7 -3
- euroeval/task_group_utils/token_classification.py +47 -75
- euroeval/tasks.py +31 -6
- euroeval/tokenization_utils.py +295 -207
- euroeval/utils.py +118 -34
- {euroeval-15.16.0.dist-info → euroeval-16.0.0.dist-info}/METADATA +11 -14
- euroeval-16.0.0.dist-info/RECORD +69 -0
- {euroeval-15.16.0.dist-info → euroeval-16.0.0.dist-info}/entry_points.txt +0 -1
- euroeval/human_evaluation.py +0 -738
- euroeval/metrics.py +0 -470
- euroeval-15.16.0.dist-info/RECORD +0 -63
- {euroeval-15.16.0.dist-info → euroeval-16.0.0.dist-info}/WHEEL +0 -0
- {euroeval-15.16.0.dist-info → euroeval-16.0.0.dist-info}/licenses/LICENSE +0 -0
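The largest single change above is in euroeval/benchmark_modules/litellm.py (+190 -110), shown below. A recurring edit in that file is that `_handle_exception` now takes the generation arguments as `**generation_kwargs` and returns the updated dictionary instead of mutating shared state, with callers reassigning the result before retrying. The following is a minimal, self-contained sketch of that retry pattern, not EuroEval's actual code; the helper names (`handle_exception`, `generate_with_retries`, `call`) and the error strings are illustrative assumptions.

# Illustrative sketch only: mirrors the "handler returns updated kwargs" retry
# pattern visible in the litellm.py diff below. All names here are hypothetical.
import typing as t


def handle_exception(error: Exception, **generation_kwargs: t.Any) -> dict[str, t.Any]:
    """Drop or adjust parameters the provider rejected and return the new kwargs."""
    error_msg = str(error).lower()
    if "logprobs" in error_msg:
        generation_kwargs.pop("logprobs", None)
        generation_kwargs.pop("top_logprobs", None)
    elif "temperature" in error_msg:
        generation_kwargs.pop("temperature", None)
    return generation_kwargs


def generate_with_retries(
    call: t.Callable[..., str], **generation_kwargs: t.Any
) -> str:
    """Call the model, adapting the generation kwargs after each failure."""
    for _ in range(5):
        try:
            return call(**generation_kwargs)
        except Exception as error:  # broad catch is for illustration only
            generation_kwargs = handle_exception(error, **generation_kwargs)
    raise RuntimeError("Failed to generate text after retries.")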
--- a/euroeval/benchmark_modules/litellm.py
+++ b/euroeval/benchmark_modules/litellm.py
@@ -2,8 +2,8 @@
 
 import asyncio
 import collections.abc as c
+import json
 import logging
-import os
 import re
 import typing as t
 from functools import cache, cached_property, partial
@@ -38,7 +38,12 @@ from requests.exceptions import RequestException
 from tqdm.asyncio import tqdm as tqdm_async
 from tqdm.auto import tqdm
 
-from ..constants import
+from ..constants import (
+    JSON_STRIP_CHARACTERS,
+    LITELLM_CLASSIFICATION_OUTPUT_KEY,
+    MAX_LITELLM_LOGPROBS,
+    REASONING_MAX_TOKENS,
+)
 from ..data_models import (
     BenchmarkConfig,
     DatasetConfig,
@@ -67,16 +72,18 @@ from ..task_group_utils import (
     text_to_text,
     token_classification,
 )
+from ..tasks import NER
 from ..tokenization_utils import get_first_label_token_mapping
 from ..types import ExtractLabelsFunction
 from ..utils import (
     add_semaphore_and_catch_exception,
     create_model_cache_dir,
+    get_hf_token,
     log_once,
     safe_run,
 )
 from .base import BenchmarkModule
-from .hf import HuggingFaceEncoderModel, load_hf_model_config,
+from .hf import HuggingFaceEncoderModel, load_hf_model_config, load_tokeniser
 
 if t.TYPE_CHECKING:
     from datasets import DatasetDict
@@ -155,7 +162,7 @@ ALLOWED_PARAMS = {
     r"(anthropic/)?claude-(sonnet|opus)-4.*": ["no-thinking", "thinking"],
     # Gemini models
     r"(gemini/)?gemini-2.5-flash-lite.*": ["no-thinking", "thinking"],
-    r"(gemini/)?gemini-2.5-flash
+    r"(gemini/)?gemini-2.5-flash.*": ["no-thinking", "thinking"],
     # xAI models
     r"(xai/)?grok-3-mini(-fast)?(-beta)?": ["low", "medium", "high"],
 }
@@ -181,6 +188,8 @@ class LiteLLMModel(BenchmarkModule):
         model_config: ModelConfig,
         dataset_config: DatasetConfig,
         benchmark_config: BenchmarkConfig,
+        log_metadata: bool = True,
+        **generation_kwargs: dict[str, t.Any],
     ) -> None:
         """Initialise the model.
 
@@ -191,6 +200,11 @@ class LiteLLMModel(BenchmarkModule):
                 The dataset configuration.
             benchmark_config:
                 The benchmark configuration.
+            log_metadata:
+                Whether to log the model metadata.
+            generation_kwargs:
+                The generation kwargs to pass to the model. If None, default values will
+                be used.
         """
         # Detect whether the model is an Ollama model, as we need to extract metadata
         # differently for these models
@@ -209,13 +223,16 @@ class LiteLLMModel(BenchmarkModule):
             model_config=model_config,
             dataset_config=dataset_config,
             benchmark_config=benchmark_config,
+            log_metadata=log_metadata,
         )
 
+        self.generation_kwargs = generation_kwargs
         self.buffer["first_label_token_mapping"] = get_first_label_token_mapping(
             dataset_config=self.dataset_config,
             model_config=self.model_config,
-
+            tokeniser=None,
             generative_type=self.generative_type,
+            log_metadata=self.log_metadata,
         )
 
     @property
@@ -245,11 +262,12 @@ class LiteLLMModel(BenchmarkModule):
         else:
             type_ = GenerativeType.INSTRUCTION_TUNED
 
-
-
-
-
-
+        if self.log_metadata:
+            log_once(
+                f"Detected generative type {type_.name!r} for model "
+                f"{self.model_config.model_id!r}",
+                level=logging.DEBUG,
+            )
         return type_
 
     def generate(self, inputs: dict) -> GenerativeModelOutput:
@@ -270,32 +288,11 @@ class LiteLLMModel(BenchmarkModule):
         self.buffer["first_label_token_mapping"] = get_first_label_token_mapping(
             dataset_config=self.dataset_config,
             model_config=self.model_config,
-
+            tokeniser=None,
             generative_type=self.generative_type,
+            log_metadata=self.log_metadata,
         )
 
-        # Sanity check that "JSON" is included in the prompt, as some models require
-        # this
-        if self.dataset_config.task in TASKS_USING_JSON:
-            for conversation in conversations:
-                if not conversation:
-                    raise InvalidBenchmark(
-                        "Encountered an empty conversation in 'messages'."
-                    )
-                last_message = conversation[-1]
-                assert isinstance(last_message, dict), (
-                    f"Expected dict message, got {type(last_message)}"
-                )
-                assert "content" in last_message, (
-                    "Expected 'content' key in the last message of the conversation."
-                )
-                assert isinstance(last_message["content"], str), (
-                    "Expected 'content' to be a string."
-                )
-                assert "json" in last_message["content"].lower(), (
-                    "Prompt must contain 'json' for JSON tasks."
-                )
-
         all_responses: dict[int, "ModelResponse"] = {}
         conversations_to_run: list[tuple[int, list[litellm.AllMessageValues]]] = list(
             enumerate(conversations)
@@ -304,12 +301,16 @@ class LiteLLMModel(BenchmarkModule):
             if not conversations_to_run:
                 break
 
+            generation_kwargs = self.generation_kwargs or self.get_generation_kwargs(
+                dataset_config=self.dataset_config
+            )
+
            batch_indices, batch_conversations = zip(*conversations_to_run)
            successes, failures = safe_run(
                self._generate_async(
                    model_id=self.model_config.model_id,
                    conversations=list(batch_conversations),
-                    **
+                    **generation_kwargs,
                )
            )
 
@@ -336,11 +337,8 @@ class LiteLLMModel(BenchmarkModule):
             # Attempt to handle the exceptions, to improve the chance of getting
             # successful generations next time around
             for _, error in failures:
-                self._handle_exception(
-                    error=error,
-                    generation_kwargs=self.get_generation_kwargs(
-                        dataset_config=self.dataset_config
-                    ),
+                generation_kwargs = self._handle_exception(
+                    error=error, **generation_kwargs
                 )
 
             # Sleep for a second to avoid pinging the API server too quickly
@@ -364,9 +362,7 @@ class LiteLLMModel(BenchmarkModule):
 
         return model_output
 
-    def _handle_exception(
-        self, error: Exception, generation_kwargs: dict[str, t.Any]
-    ) -> None:
+    def _handle_exception(self, error: Exception, **generation_kwargs) -> dict:
        """Handle an exception from the model.
 
         Args:
@@ -374,6 +370,9 @@ class LiteLLMModel(BenchmarkModule):
                 The exception to handle.
             generation_kwargs:
                 The generation kwargs to pass to the model.
+
+        Returns:
+            The updated generation kwargs to pass to the model.
         """
         error_msg = str(error).lower()
         model_id = self.model_config.model_id
@@ -386,6 +385,9 @@ class LiteLLMModel(BenchmarkModule):
            "logprobs is not supported",
            "logprobs is not enabled",
        ]
+        logprobs_pattern = re.compile(
+            r"does not support parameters: \[.*'top_logprobs'.*\]"
+        )
        temperature_messages = [
            "'temperature' is not supported with this model.",
            "temperature is not supported with this model",
@@ -403,6 +405,7 @@ class LiteLLMModel(BenchmarkModule):
            r"[0-9]+ and ([0-9]+)\."
        )
        requires_thinking_disabled_messages = ["thinking.type: Field required"]
+        seed_pattern = re.compile(r"does not support parameters: \[.*'seed'.*\]")
 
        if any(msg.lower() in error_msg for msg in stop_messages):
            log_once(
@@ -411,9 +414,10 @@ class LiteLLMModel(BenchmarkModule):
                level=logging.DEBUG,
            )
            generation_kwargs["stop"] = None
-            return
+            return generation_kwargs
        elif (
            any(msg.lower() in error_msg for msg in logprobs_messages)
+            or logprobs_pattern.search(string=error_msg)
            # Special case for Vertex AI models, since they have strict rate
            # limits on using logprobs. They also have a cap of 5 logprobs, but
            # we ignore this since the rate limiting makes it unusable anyway.
@@ -425,7 +429,7 @@ class LiteLLMModel(BenchmarkModule):
            )
            generation_kwargs.pop("logprobs", None)
            generation_kwargs.pop("top_logprobs", None)
-            return
+            return generation_kwargs
        elif any(msg.lower() in error_msg for msg in temperature_messages):
            log_once(
                f"The model {model_id!r} does not support "
@@ -433,7 +437,7 @@ class LiteLLMModel(BenchmarkModule):
                level=logging.DEBUG,
            )
            generation_kwargs.pop("temperature", None)
-            return
+            return generation_kwargs
        elif any(msg.lower() in error_msg for msg in temperature_must_be_one_messages):
            log_once(
                f"The model {model_id!r} requires "
@@ -441,8 +445,11 @@ class LiteLLMModel(BenchmarkModule):
                level=logging.DEBUG,
            )
            generation_kwargs["temperature"] = 1.0
-            return
-        elif
+            return generation_kwargs
+        elif (
+            any(msg.lower() in error_msg for msg in max_items_messages)
+            and self.dataset_config.task == NER
+        ):
            log_once(
                f"The model {model_id!r} does not support "
                "maxItems in the JSON schema, so disabling it.",
@@ -454,7 +461,7 @@ class LiteLLMModel(BenchmarkModule):
            }
            pydantic_class = create_model("AnswerFormat", **keys_and_their_types)
            generation_kwargs["response_format"] = pydantic_class
-            return
+            return generation_kwargs
        elif any(msg.lower() in error_msg for msg in no_json_schema_messages):
            log_once(
                f"The model {self.model_config.model_id!r} does not support "
@@ -462,7 +469,7 @@ class LiteLLMModel(BenchmarkModule):
                level=logging.DEBUG,
            )
            generation_kwargs["response_format"] = dict(type="json_object")
-            return
+            return generation_kwargs
        elif thinking_match := thinking_budget_pattern.search(string=error_msg):
            thinking_budget = int(thinking_match.group(1))
            if thinking_budget >= REASONING_MAX_TOKENS:
@@ -471,7 +478,7 @@ class LiteLLMModel(BenchmarkModule):
                    f"{thinking_budget:,} tokens, which is within the limit of "
                    f"{REASONING_MAX_TOKENS:,} tokens. This should not happen. The "
                    f"error message was: {error_msg}."
-                )
+                ) from error
            log_once(
                f"The model {model_id!r} can at most use {thinking_budget:,} tokens "
                "for reasoning, which is less than the default of "
@@ -482,7 +489,7 @@ class LiteLLMModel(BenchmarkModule):
            generation_kwargs["thinking"] = dict(
                type="enabled", budget_tokens=thinking_budget - 1
            )
-            return
+            return generation_kwargs
        elif (
            any(msg.lower() in error_msg for msg in requires_thinking_disabled_messages)
            and self.generative_type != GenerativeType.REASONING
@@ -494,59 +501,73 @@ class LiteLLMModel(BenchmarkModule):
                level=logging.DEBUG,
            )
            generation_kwargs["thinking"] = dict(type="disabled")
-            return
+            return generation_kwargs
+        elif re.search(pattern=seed_pattern, string=error_msg):
+            log_once(
+                f"The model {model_id!r} does not support the `seed` parameter, so "
+                "disabling it.",
+                level=logging.DEBUG,
+            )
+            generation_kwargs.pop("seed", None)
+            return generation_kwargs
+        # If there are too many I/O connections, we increase the number of allowed file
+        # descriptors
+        elif "too many open files" in error_msg:
+            raise InvalidBenchmark(
+                "There are too many file descriptors running. See the current "
+                "value by running `ulimit -n`. Try increasing it by running "
+                "`ulimit -n <new-value>` and try again."
+            ) from error
        elif isinstance(
            error, (Timeout, ServiceUnavailableError, InternalServerError, SystemError)
        ):
            logger.debug(
                f"Service temporarily unavailable. The error message was: {error}. "
-
+                "Retrying in 10 seconds..."
            )
-            sleep(
-            return
+            sleep(10)
+            return generation_kwargs
        elif isinstance(error, UnsupportedParamsError):
            unsupported_param_match = re.search(
                pattern=r"(?<=does not support parameters\: \[')([^ ']+)(?='\])",
                string=error.message,
            )
            if unsupported_param_match is None:
-                raise InvalidModel(error.message)
+                raise InvalidModel(error.message) from error
            else:
                unsupported_param = unsupported_param_match.group(0)
                raise InvalidModel(
                    f"The model {model_id!r} does not support the parameter "
                    f"{unsupported_param!r}. Try again without this parameter. "
                    "Skipping this model."
-                )
+                ) from error
        elif isinstance(error, (APIConnectionError, OSError)):
-            # If there are too many I/O connections, we increase the number of allowed
-            # file descriptors
-            if "too many open files" in error_msg:
-                raise InvalidBenchmark(
-                    "There are too many file descriptors running. See the current "
-                    "value by running `ulimit -n`. Try increasing it by running "
-                    "`ulimit -n <new-value>` and try again."
-                )
            raise InvalidBenchmark(
                f"Encountered {type(error)} during generation: {error}."
-            )
+            ) from error
+
+        if isinstance(error, NotFoundError):
+            raise InvalidModel(
+                f"The model {model_id!r} was not found. Please check the model ID "
+                "and try again."
+            ) from error
 
        if isinstance(error, RateLimitError):
            raise InvalidModel(
                f"You have encountered your rate limit for model {model_id!r}. "
                "Skipping."
-            )
+            ) from error
 
        if isinstance(error, AuthenticationError):
            raise NeedsAdditionalArgument(
                cli_argument="--api-key",
                script_argument="api_key=<your-api-key>",
                run_with_cli=self.benchmark_config.run_with_cli,
-            )
+            ) from error
 
        raise InvalidBenchmark(
            f"Failed to generate text. The error message was: {error}"
-        )
+        ) from error
 
    async def _generate_async(
        self,
@@ -573,9 +594,9 @@ class LiteLLMModel(BenchmarkModule):
        # for all the requests, preventing "too many open files" errors
        router = Router(
            model_list=[
-
+                litellm.DeploymentTypedDict(
                    model_name=self.model_config.model_id,
-                    litellm_params=
+                    litellm_params=litellm.LiteLLMParamsTypedDict(model=model_id),
                )
            ]
        )
@@ -585,7 +606,9 @@ class LiteLLMModel(BenchmarkModule):
        semaphore = asyncio.Semaphore(max_concurrent_calls)
        requests = [
            add_semaphore_and_catch_exception(
-                router.acompletion(
+                router.acompletion(
+                    model=model_id, messages=conversation, **generation_kwargs
+                ),
                semaphore=semaphore,
            )
            for conversation in conversations
@@ -645,6 +668,23 @@ class LiteLLMModel(BenchmarkModule):
            generation_output = generated_message.content or ""
            generation_output = generation_output.strip()
 
+            # In the case where we're dealing with a classification task, the model is
+            # outputting a JSON dictionary, so we will extract the generated text from
+            # within the dictionary
+            generation_dct: dict[str, t.Any] | None = None
+            if LITELLM_CLASSIFICATION_OUTPUT_KEY in generation_output:
+                try:
+                    generation_dct = json.loads(generation_output)
+                    assert isinstance(generation_dct, dict)
+                    if set(generation_dct.keys()) == {
+                        LITELLM_CLASSIFICATION_OUTPUT_KEY
+                    }:
+                        generation_output = str(
+                            generation_dct[LITELLM_CLASSIFICATION_OUTPUT_KEY]
+                        ).strip()
+                except json.JSONDecodeError:
+                    pass
+
            # Structure the model output as a GenerativeModelOutput object
            sequences.append(generation_output)
            if hasattr(model_response_choices, "logprobs"):
@@ -657,6 +697,23 @@ class LiteLLMModel(BenchmarkModule):
                    ]
                    for content in model_response_choices.logprobs.content or list()
                ]
+
+                # If the model outputted a JSON dictionary, we need to find the
+                # token index of the value within the dictionary, rather than the
+                # first token of the entire output
+                if generation_dct:
+                    key_name = next(iter(generation_dct.keys()))
+                    logprobs_list = [
+                        lst
+                        for lst in logprobs_list
+                        if (
+                            lst
+                            and lst[0]
+                            and (token := lst[0][0].strip(JSON_STRIP_CHARACTERS))
+                            and not key_name.startswith(token)
+                        )
+                    ]
+
                scores.append(logprobs_list)
            else:
                log_once(
@@ -730,9 +787,7 @@ class LiteLLMModel(BenchmarkModule):
            repo_info = hf_api.model_info(
                repo_id=model_id,
                revision="main",
-                token=
-                or self.benchmark_config.api_key
-                or True,
+                token=get_hf_token(api_key=self.benchmark_config.api_key),
            )
        except (
            RepositoryNotFoundError,
@@ -789,7 +844,7 @@ class LiteLLMModel(BenchmarkModule):
                run_with_cli=self.benchmark_config.run_with_cli,
            )
 
-
+        tokeniser = load_tokeniser(
            model=None,
            model_id=model_id,
            trust_remote_code=self.benchmark_config.trust_remote_code,
@@ -801,10 +856,10 @@ class LiteLLMModel(BenchmarkModule):
        ):
            vocab_size = hf_config.vocab_size
        elif (
-            hasattr(
-            and
+            hasattr(tokeniser, "vocab_size")
+            and tokeniser.vocab_size is not None
        ):
-            vocab_size =
+            vocab_size = tokeniser.vocab_size
        else:
            vocab_size = -1
        return vocab_size
@@ -835,13 +890,15 @@ class LiteLLMModel(BenchmarkModule):
        if context_length_keys:
            context_length = model_info[context_length_keys[0]]
            if context_length is not None:
-
-
-
-
-
+                if self.log_metadata:
+                    log_once(
+                        f"Detected context length key "
+                        f"{context_length_keys[0]!r} for Ollama model "
+                        f"{ollama_model_id!r}",
+                        level=logging.DEBUG,
+                    )
                return int(context_length)
-
+        elif self.log_metadata:
            log_once(
                f"Tried to get the maximum length of the Ollama model "
                f"{ollama_model_id!r}, but could not find a context length. "
@@ -869,7 +926,7 @@ class LiteLLMModel(BenchmarkModule):
                run_with_cli=self.benchmark_config.run_with_cli,
            )
 
-
+        tokeniser = load_tokeniser(
            model=None,
            model_id=model_id,
            trust_remote_code=self.benchmark_config.trust_remote_code,
@@ -877,18 +934,18 @@ class LiteLLMModel(BenchmarkModule):
 
        all_max_lengths: list[int] = list()
 
-        # Add the registered max length of the
+        # Add the registered max length of the tokeniser
        if hasattr(
-
-        ) and
-            all_max_lengths.append(
+            tokeniser, "model_max_length"
+        ) and tokeniser.model_max_length < int(1e30):
+            all_max_lengths.append(tokeniser.model_max_length)
 
        # Add the max length derived from the model's input sizes
-        if hasattr(
+        if hasattr(tokeniser, "max_model_input_sizes"):
            all_max_lengths.extend(
                [
                    size
-                    for size in
+                    for size in tokeniser.max_model_input_sizes.values()
                    if size is not None
                ]
            )
@@ -1026,7 +1083,7 @@ class LiteLLMModel(BenchmarkModule):
                f"Service temporarily unavailable. The error message was: {e}. "
                "Retrying in 10 seconds..."
            )
-            sleep(
+            sleep(10)
        except APIError as e:
            if "'503 Service Unavailable" not in str(e):
                raise e
@@ -1136,7 +1193,10 @@ class LiteLLMModel(BenchmarkModule):
 
        if self.benchmark_config.few_shot:
            few_shot_examples = extract_few_shot_examples(
-                dataset=dataset,
+                dataset=dataset,
+                dataset_config=self.dataset_config,
+                benchmark_config=self.benchmark_config,
+                itr_idx=itr_idx,
            )
        else:
            few_shot_examples = list()
@@ -1149,7 +1209,7 @@ class LiteLLMModel(BenchmarkModule):
                dataset_config=self.dataset_config,
                instruction_model=True,
                always_populate_text_field=False,
-
+                tokeniser=None,
            ),
            batched=True,
            load_from_cache_file=False,
@@ -1174,7 +1234,6 @@ class LiteLLMModel(BenchmarkModule):
        """
        # Set the core generation arguments
        generation_kwargs: dict[str, t.Any] = dict(
-            model=self.model_config.model_id,
            max_completion_tokens=(
                REASONING_MAX_TOKENS
                if self.generative_type == GenerativeType.REASONING
@@ -1191,7 +1250,7 @@ class LiteLLMModel(BenchmarkModule):
 
        # Set up the `response_format` generation argument if we are dealing with a task
        # using structured generation
-        if dataset_config.task
+        if dataset_config.task.uses_structured_output:
            if self.generative_type == GenerativeType.REASONING:
                log_once(
                    f"The model {self.model_config.model_id!r} is a reasoning model "
@@ -1200,12 +1259,21 @@ class LiteLLMModel(BenchmarkModule):
                    level=logging.DEBUG,
                )
            elif supports_response_schema(model=self.model_config.model_id):
-
-
-
-
-
-
+                if dataset_config.task == NER:
+                    ner_tag_names = list(dataset_config.prompt_label_mapping.values())
+                    keys_and_their_types: dict[str, t.Any] = {
+                        tag_name: (conlist(str, max_length=5), ...)
+                        for tag_name in ner_tag_names
+                    }
+                    pydantic_class = create_model(
+                        "AnswerFormat", **keys_and_their_types
+                    )
+                else:
+                    raise InvalidBenchmark(
+                        "This task requires structured generation, but it has not "
+                        "been implemented for this task yet. Please open an issue "
+                        "at https://github.com/EuroEval/EuroEval/issues."
+                    )
                generation_kwargs["response_format"] = pydantic_class
                log_once(
                    "Enabling structured generation for model "
@@ -1221,6 +1289,16 @@ class LiteLLMModel(BenchmarkModule):
                    "the model does not support schemas.",
                    level=logging.DEBUG,
                )
+        elif self.dataset_config.task.uses_logprobs and self.dataset_config.labels:
+            localised_labels = [
+                self.dataset_config.prompt_label_mapping[label]
+                for label in self.dataset_config.labels
+            ]
+            keys_and_their_types = {
+                LITELLM_CLASSIFICATION_OUTPUT_KEY: (t.Literal[*localised_labels], ...)
+            }
+            pydantic_class = create_model("AnswerFormat", **keys_and_their_types)
+            generation_kwargs["response_format"] = pydantic_class
 
        # If the model is an Ollama reasoning model, we ensure that thinking is enabled
        if self.is_ollama and self.generative_type == GenerativeType.REASONING:
@@ -1234,7 +1312,7 @@ class LiteLLMModel(BenchmarkModule):
        # Handle manually set parameters
        if self.buffer["first_label_token_mapping"]:
            generation_kwargs["logprobs"] = True
-            generation_kwargs["top_logprobs"] =
+            generation_kwargs["top_logprobs"] = MAX_LITELLM_LOGPROBS
        if self.model_config.revision == "thinking":
            generation_kwargs["thinking"] = dict(
                type="enabled", budget_tokens=REASONING_MAX_TOKENS - 1
@@ -1260,7 +1338,7 @@ class LiteLLMModel(BenchmarkModule):
        # First attempt is a test run with a single conversation to handle errors
        # quickly. We repeat this multiple times to deal with different types of
        # errors, and stop if we get a successful response.
-        test_conversation = [
+        test_conversation: list[litellm.AllMessageValues] = [
            litellm.ChatCompletionUserMessage(role="user", content="Test message")
        ]
        for _ in range(5):
@@ -1274,7 +1352,9 @@ class LiteLLMModel(BenchmarkModule):
            if not failures:
                break
            for _, error in failures:
-                self._handle_exception(
+                generation_kwargs = self._handle_exception(
+                    error=error, **generation_kwargs
+                )
 
        return generation_kwargs
 
@@ -1350,11 +1430,11 @@ def try_download_ollama_model(model_id: str) -> bool:
            for model_obj in ollama.list().models
            if model_obj.model is not None
        ]
-    except ConnectionError:
+    except ConnectionError as e:
        raise InvalidModel(
            "Ollama does not seem to be running, so we cannot evaluate the model "
            f"{model_id!r}. Please make sure that Ollama is running and try again."
-        )
+        ) from e
 
    ollama_model_id = "/".join(model_id.split("/")[1:])
    if ollama_model_id not in downloaded_ollama_models:
@@ -1384,12 +1464,12 @@ def try_download_ollama_model(model_id: str) -> bool:
                raise InvalidModel(
                    f"Failed to download Ollama model {ollama_model_id}. "
                    f"The error message was: {inner_e}"
-                )
+                ) from inner_e
            else:
                raise InvalidModel(
                    f"Failed to download Ollama model {ollama_model_id}. "
                    f"The error message was: {e}"
-                )
+                ) from e
 
        # Download the model
        with tqdm(