EuroEval 16.3.0-py3-none-any.whl → 16.5.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of EuroEval might be problematic.
- euroeval/__init__.py +9 -2
- euroeval/benchmark_config_factory.py +51 -50
- euroeval/benchmark_modules/base.py +9 -21
- euroeval/benchmark_modules/fresh.py +2 -1
- euroeval/benchmark_modules/hf.py +101 -71
- euroeval/benchmark_modules/litellm.py +115 -53
- euroeval/benchmark_modules/vllm.py +107 -92
- euroeval/benchmarker.py +144 -121
- euroeval/caching_utils.py +79 -0
- euroeval/callbacks.py +5 -7
- euroeval/cli.py +86 -8
- euroeval/constants.py +9 -0
- euroeval/data_loading.py +80 -29
- euroeval/data_models.py +338 -330
- euroeval/dataset_configs/__init__.py +12 -3
- euroeval/dataset_configs/bulgarian.py +56 -0
- euroeval/dataset_configs/czech.py +75 -0
- euroeval/dataset_configs/danish.py +55 -93
- euroeval/dataset_configs/dutch.py +48 -87
- euroeval/dataset_configs/english.py +45 -77
- euroeval/dataset_configs/estonian.py +42 -34
- euroeval/dataset_configs/faroese.py +19 -60
- euroeval/dataset_configs/finnish.py +36 -69
- euroeval/dataset_configs/french.py +39 -75
- euroeval/dataset_configs/german.py +45 -82
- euroeval/dataset_configs/greek.py +64 -0
- euroeval/dataset_configs/icelandic.py +54 -91
- euroeval/dataset_configs/italian.py +42 -79
- euroeval/dataset_configs/latvian.py +28 -35
- euroeval/dataset_configs/lithuanian.py +28 -26
- euroeval/dataset_configs/norwegian.py +72 -115
- euroeval/dataset_configs/polish.py +33 -61
- euroeval/dataset_configs/portuguese.py +33 -66
- euroeval/dataset_configs/serbian.py +64 -0
- euroeval/dataset_configs/slovak.py +55 -0
- euroeval/dataset_configs/spanish.py +42 -77
- euroeval/dataset_configs/swedish.py +52 -90
- euroeval/dataset_configs/ukrainian.py +64 -0
- euroeval/exceptions.py +1 -1
- euroeval/finetuning.py +24 -17
- euroeval/generation.py +15 -14
- euroeval/generation_utils.py +8 -8
- euroeval/languages.py +395 -323
- euroeval/logging_utils.py +250 -0
- euroeval/metrics/base.py +0 -3
- euroeval/metrics/huggingface.py +21 -6
- euroeval/metrics/llm_as_a_judge.py +6 -4
- euroeval/metrics/pipeline.py +17 -9
- euroeval/metrics/speed.py +0 -3
- euroeval/model_cache.py +17 -19
- euroeval/model_config.py +4 -5
- euroeval/model_loading.py +3 -0
- euroeval/prompt_templates/__init__.py +2 -0
- euroeval/prompt_templates/classification.py +206 -0
- euroeval/prompt_templates/linguistic_acceptability.py +99 -42
- euroeval/prompt_templates/multiple_choice.py +102 -38
- euroeval/prompt_templates/named_entity_recognition.py +172 -51
- euroeval/prompt_templates/reading_comprehension.py +119 -42
- euroeval/prompt_templates/sentiment_classification.py +110 -40
- euroeval/prompt_templates/summarization.py +85 -40
- euroeval/prompt_templates/token_classification.py +279 -0
- euroeval/scores.py +11 -10
- euroeval/speed_benchmark.py +5 -6
- euroeval/task_group_utils/multiple_choice_classification.py +2 -4
- euroeval/task_group_utils/question_answering.py +24 -16
- euroeval/task_group_utils/sequence_classification.py +48 -35
- euroeval/task_group_utils/text_to_text.py +19 -9
- euroeval/task_group_utils/token_classification.py +21 -17
- euroeval/tasks.py +44 -1
- euroeval/tokenisation_utils.py +33 -22
- euroeval/types.py +10 -9
- euroeval/utils.py +35 -149
- {euroeval-16.3.0.dist-info → euroeval-16.5.0.dist-info}/METADATA +196 -39
- euroeval-16.5.0.dist-info/RECORD +81 -0
- euroeval-16.3.0.dist-info/RECORD +0 -71
- {euroeval-16.3.0.dist-info → euroeval-16.5.0.dist-info}/WHEEL +0 -0
- {euroeval-16.3.0.dist-info → euroeval-16.5.0.dist-info}/entry_points.txt +0 -0
- {euroeval-16.3.0.dist-info → euroeval-16.5.0.dist-info}/licenses/LICENSE +0 -0
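
Two of the new modules in the listing above, euroeval/caching_utils.py and euroeval/logging_utils.py, are not shown in full here, but the euroeval/benchmark_modules/litellm.py diff below imports cache_arguments, log, log_once and get_pbar from them. The following is only a rough sketch, inferred from those call sites (log(msg, level=...), log_once(msg, level=...), get_pbar(desc=..., unit_scale=..., unit=...)), of how the logging helpers could plausibly be shaped; the actual implementations may differ.

# Hypothetical sketch of euroeval/logging_utils.py, inferred from the call sites
# in the diff below. The real module contents are not included in this diff.
import logging

from tqdm.auto import tqdm

logger = logging.getLogger("euroeval")
_seen_messages: set[str] = set()


def log(message: str, level: int = logging.INFO) -> None:
    """Log `message` on the shared 'euroeval' logger at the given level."""
    logger.log(level, message)


def log_once(message: str, level: int = logging.INFO) -> None:
    """Log `message` only the first time it is encountered."""
    if message not in _seen_messages:
        _seen_messages.add(message)
        logger.log(level, message)


def get_pbar(**tqdm_kwargs) -> tqdm:
    """Return a tqdm progress bar with shared default settings."""
    tqdm_kwargs.setdefault("leave", False)
    return tqdm(**tqdm_kwargs)

The point of such helpers is to route all output through the shared "euroeval" logger with per-call levels and to deduplicate repeated warnings, which is consistent with the diff below replacing the module-level logger with log(..., level=...) and log_once(...) calls.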
--- euroeval/benchmark_modules/litellm.py (16.3.0)
+++ euroeval/benchmark_modules/litellm.py (16.5.0)
@@ -6,7 +6,7 @@ import json
 import logging
 import re
 import typing as t
-from functools import
+from functools import cached_property, partial
 from time import sleep

 import litellm
@@ -36,8 +36,8 @@ from litellm.utils import supports_reasoning, supports_response_schema
 from pydantic import conlist, create_model
 from requests.exceptions import RequestException
 from tqdm.asyncio import tqdm as tqdm_async
-from tqdm.auto import tqdm

+from ..caching_utils import cache_arguments
 from ..constants import (
     JSON_STRIP_CHARACTERS,
     LITELLM_CLASSIFICATION_OUTPUT_KEY,
@@ -70,6 +70,7 @@ from ..generation_utils import (
     extract_few_shot_examples,
     raise_if_wrong_params,
 )
+from ..logging_utils import get_pbar, log, log_once
 from ..task_group_utils import (
     question_answering,
     sequence_classification,
@@ -83,7 +84,6 @@ from ..utils import (
     add_semaphore_and_catch_exception,
     create_model_cache_dir,
     get_hf_token,
-    log_once,
     safe_run,
     split_model_id,
 )
@@ -95,8 +95,6 @@ if t.TYPE_CHECKING:
     from litellm.types.utils import ModelResponse
     from transformers.trainer import Trainer

-logger = logging.getLogger("euroeval")
-

 VOCAB_SIZE_MAPPING = {
     # OpenAI models
@@ -133,6 +131,7 @@ MODEL_MAX_LENGTH_MAPPING = {
     r"gpt-4.1.*": 1_047_576,
     # Anthropic models
     r"(anthropic/)?claude-[1-9](-[1-9])?-(opus|sonnet|haiku)-[0-9]{8}": 200_000,
+    r"(anthropic/)?claude-(opus|sonnet|haiku)-[1-9](-[1-9])?-[0-9]{8}": 200_000,
     # Gemini models
     r"(gemini/)?gemini-1\.5-flash.*": 1_048_576,
     r"(gemini/)?gemini-1\.5-pro.*": 2_097_152,
@@ -311,7 +310,7 @@ class LiteLLMModel(BenchmarkModule):
             InvalidBenchmark:
                 If the inputs do not contain either 'messages' or 'text' keys.
         """
-        model_inputs:
+        model_inputs: c.Sequence[c.Sequence[litellm.AllMessageValues] | str]
         if "messages" in inputs:
             model_inputs = inputs["messages"]
         elif "text" in inputs:
@@ -332,9 +331,9 @@ class LiteLLMModel(BenchmarkModule):
         )

         all_responses: dict[int, "ModelResponse"] = {}
-        inputs_to_run:
-
-        )
+        inputs_to_run: c.Sequence[
+            tuple[int, c.Sequence[litellm.AllMessageValues] | str]
+        ] = list(enumerate(model_inputs))
         for attempt in range(num_attempts := 10):
             if not inputs_to_run:
                 break
@@ -367,10 +366,11 @@ class LiteLLMModel(BenchmarkModule):
                 (batch_indices[idx], model_inputs[batch_indices[idx]])
                 for idx, _ in failures
             ]
-
+            log(
                 f"Attempt {attempt + 1:,}/{num_attempts:,}: retrying "
                 f"{len(inputs_to_run):,} failed message(s). Here is the first error: "
-                f"{failures[0][1]}."
+                f"{failures[0][1]}.",
+                level=logging.DEBUG,
             )

             # Attempt to handle the exceptions, to improve the chance of getting
@@ -422,14 +422,19 @@ class LiteLLMModel(BenchmarkModule):
             "'stop' is not supported with this model",
             "'$.stop' is invalid",
         ]
+        stop_pattern = re.compile(r"does not support parameters: \[.*'stop'.*\]")
         logprobs_messages = [
             "you are not allowed to request logprobs",
             "you've reached the maximum number of requests with logprobs",
             "logprobs is not supported",
             "logprobs is not enabled",
+            "Invalid value at 'generation_config.response_logprobs' (TYPE_BOOL)",
         ]
-        top_logprobs_messages = ["got an unexpected keyword argument 'top_logprobs'"]
         logprobs_pattern = re.compile(
+            r"does not support parameters: \[.*'logprobs'.*\]"
+        )
+        top_logprobs_messages = ["got an unexpected keyword argument 'top_logprobs'"]
+        top_logprobs_pattern = re.compile(
             r"does not support parameters: \[.*'top_logprobs'.*\]"
         )
         max_completion_tokens_pattern = re.compile(
@@ -438,6 +443,7 @@ class LiteLLMModel(BenchmarkModule):
         temperature_messages = [
             "'temperature' is not supported with this model.",
             "temperature is not supported with this model",
+            r"does not support parameters: \[.*'temperature'.*\]",
         ]
         temperature_must_be_one_messages = [
             "`temperature` may only be set to 1",
@@ -454,10 +460,14 @@ class LiteLLMModel(BenchmarkModule):
         requires_thinking_disabled_messages = ["thinking.type: Field required"]
         seed_pattern = re.compile(r"does not support parameters: \[.*'seed'.*\]")
         response_format_messages = [
-            "got an unexpected keyword argument 'response_format'"
+            "got an unexpected keyword argument 'response_format'",
+            "the model returned empty outputs",
         ]

-        if
+        if (
+            any(msg.lower() in error_msg for msg in stop_messages)
+            or stop_pattern.search(string=error_msg) is not None
+        ):
             log_once(
                 f"The model {model_id!r} does not support "
                 "stop sequences, so disabling them.",
@@ -467,7 +477,7 @@ class LiteLLMModel(BenchmarkModule):
             return generation_kwargs
         elif (
             any(msg.lower() in error_msg for msg in logprobs_messages)
-            or logprobs_pattern.search(string=error_msg)
+            or logprobs_pattern.search(string=error_msg) is not None
            # Special case for Vertex AI models, since they have strict rate
            # limits on using logprobs. They also have a cap of 5 logprobs, but
            # we ignore this since the rate limiting makes it unusable anyway.
@@ -477,10 +487,15 @@ class LiteLLMModel(BenchmarkModule):
                 f"The model {model_id!r} does not support logprobs, so disabling it.",
                 level=logging.DEBUG,
             )
+            self.buffer["first_label_token_mapping"] = False
             generation_kwargs.pop("logprobs", None)
             generation_kwargs.pop("top_logprobs", None)
+            generation_kwargs.pop("response_format", None)
             return generation_kwargs
-        elif
+        elif (
+            any(msg.lower() in error_msg for msg in top_logprobs_messages)
+            or top_logprobs_pattern.search(string=error_msg) is not None
+        ):
             log_once(
                 f"The model {model_id!r} does not support the `top_logprobs` argument, "
                 "so moving the value to `logprobs`.",
@@ -525,7 +540,7 @@ class LiteLLMModel(BenchmarkModule):
             )
             ner_tag_names = list(self.dataset_config.prompt_label_mapping.values())
             keys_and_their_types = {
-                tag_name: (
+                tag_name: (c.Sequence[str], ...) for tag_name in ner_tag_names
             }
             pydantic_class = create_model("AnswerFormat", **keys_and_their_types)
             generation_kwargs["response_format"] = pydantic_class
@@ -597,9 +612,10 @@ class LiteLLMModel(BenchmarkModule):
         elif isinstance(
             error, (Timeout, ServiceUnavailableError, InternalServerError, SystemError)
         ):
-
+            log(
                 f"Service temporarily unavailable. The error message was: {error}. "
-                "Retrying in 10 seconds..."
+                "Retrying in 10 seconds...",
+                level=logging.DEBUG,
             )
             sleep(10)
             return generation_kwargs
@@ -629,10 +645,32 @@ class LiteLLMModel(BenchmarkModule):
             ) from error

         if isinstance(error, RateLimitError):
-
+            log(
                 f"You have encountered your rate limit for model {model_id!r}. "
-                "
-
+                "Retrying in 10 seconds...",
+                level=logging.DEBUG,
+            )
+            sleep(10)
+            return generation_kwargs
+
+        if (
+            isinstance(error, BadRequestError)
+            and (
+                retry_match := re.search(
+                    pattern=r"\bretry in ([0-9]+(.[0-9]+)?) ?(s|seconds)\b",
+                    string=error_msg,
+                )
+            )
+            is not None
+        ):
+            retry_seconds = float(retry_match.group(1))
+            log(
+                f"Bad request error encountered. Retrying in {retry_seconds:.1f} "
+                "seconds...",
+                level=logging.DEBUG,
+            )
+            sleep(retry_seconds)
+            return generation_kwargs

         if isinstance(error, AuthenticationError):
             raise NeedsAdditionalArgument(
@@ -648,9 +686,11 @@ class LiteLLMModel(BenchmarkModule):
     async def _generate_async(
         self,
         model_id: str,
-        inputs:
+        inputs: c.Sequence[c.Sequence[litellm.AllMessageValues] | str],
         **generation_kwargs,
-    ) -> tuple[
+    ) -> tuple[
+        c.Sequence[tuple[int, "ModelResponse"]], c.Sequence[tuple[int, Exception]]
+    ]:
         """Generate outputs from the model asynchronously.

         Args:
@@ -711,7 +751,19 @@ class LiteLLMModel(BenchmarkModule):
             for input_ in inputs
             if isinstance(input_, list)
         ]
-        responses = await tqdm_async.gather(
+        responses = await tqdm_async.gather(
+            *requests, colour="yellow", ascii="—▰", leave=False
+        )
+
+        # If the outputs are empty, convert them to exceptions
+        if all(
+            not isinstance(response, Exception)
+            and response.choices[0].message.content == "{}"
+            for response in responses
+        ):
+            responses = [ValueError("The model returned empty outputs.")] * len(
+                responses
+            )

         # Separate the successful responses from the failed ones
         successes = [
@@ -731,13 +783,15 @@ class LiteLLMModel(BenchmarkModule):
             try:
                 request.close()
             except RuntimeError as e:
-
+                log(
+                    f"RuntimeError during request.close(): {e}", level=logging.DEBUG
+                )

         return successes, failures

     @staticmethod
     def _create_model_output(
-        model_responses:
+        model_responses: c.Sequence["ModelResponse"], model_id: str
     ) -> GenerativeModelOutput:
         """Create a GenerativeModelOutput object from a list of ModelResponse objects.

@@ -756,10 +810,11 @@ class LiteLLMModel(BenchmarkModule):
         for model_response in model_responses:
             if not model_response.choices:
                 sequences.append("")
-
+                log(
                     f"The model {model_id!r} did not end up "
                     "generating any text. This is likely because the model ran "
-                    "out of tokens while reasoning. Returning an empty string."
+                    "out of tokens while reasoning. Returning an empty string.",
+                    level=logging.WARNING,
                 )
                 continue

@@ -810,7 +865,7 @@ class LiteLLMModel(BenchmarkModule):
                 )
                 continue

-            logprobs_list:
+            logprobs_list: c.Sequence[c.Sequence[tuple[str, float]]]
             if isinstance(logprobs_obj, ChoiceLogprobs):
                 logprobs_list = [
                     [
@@ -847,11 +902,12 @@ class LiteLLMModel(BenchmarkModule):
             scores.append(logprobs_list)

         if not sequences:
-
+            log(
                 "No sequences were generated by the model "
                 f"{model_id!r}. This may be due to the "
                 "model running out of tokens or an issue with the input data. "
-                "Returning an empty GenerativeModelOutput."
+                "Returning an empty GenerativeModelOutput.",
+                level=logging.WARNING,
             )
             return GenerativeModelOutput(sequences=[], scores=None)

@@ -1105,7 +1161,7 @@ class LiteLLMModel(BenchmarkModule):
         return -1

     @property
-    def data_collator(self) -> c.Callable[[
+    def data_collator(self) -> c.Callable[[c.Sequence[t.Any]], dict[str, t.Any]]:
         """The data collator used to prepare samples during finetuning.

         Returns:
@@ -1130,6 +1186,7 @@ class LiteLLMModel(BenchmarkModule):
                 return partial(
                     sequence_classification.extract_labels_from_generation,
                     dataset_config=self.dataset_config,
+                    model_config=self.model_config,
                     first_label_token_mapping=self.buffer["first_label_token_mapping"],
                 )
             case TaskGroup.TEXT_TO_TEXT:
@@ -1205,17 +1262,19 @@ class LiteLLMModel(BenchmarkModule):
                 ServiceUnavailableError,
                 InternalServerError,
             ) as e:
-
+                log(
                     f"Service temporarily unavailable. The error message was: {e}. "
-                    "Retrying in 10 seconds..."
+                    "Retrying in 10 seconds...",
+                    level=logging.DEBUG,
                 )
                 sleep(10)
             except APIError as e:
                 if "'503 Service Unavailable" not in str(e):
                     raise e
-
+                log(
                     f"Failed to check if model {model_id!r} exists. Retrying in 10 "
-                    "seconds..."
+                    "seconds...",
+                    level=logging.WARNING,
                 )
                 sleep(10)
             except (BadRequestError, NotFoundError):
@@ -1228,21 +1287,25 @@ class LiteLLMModel(BenchmarkModule):
                     case 0:
                         pass
                     case 1:
-
+                        log(
                             f"Could not find the model ID {model_id!r}. Did you mean "
-                            f"{candidate_models[0]!r}?"
+                            f"{candidate_models[0]!r}?",
+                            level=logging.WARNING,
                         )
                     case _:
                         candidate_models_str = "', '".join(candidate_models)
-
+                        log(
                             f"Could not find the model ID {model_id!r}. Did you mean "
-
+                            "any of the following model IDs: "
+                            f"'{candidate_models_str}'?",
+                            level=logging.WARNING,
                         )
                 return False
         else:
-
+            log(
                 f"Failed to check if model {model_id!r} exists after {num_attempts} "
-                "attempts. Assuming it does not exist."
+                "attempts. Assuming it does not exist.",
+                level=logging.ERROR,
             )
             return False

@@ -1275,7 +1338,8 @@ class LiteLLMModel(BenchmarkModule):
                 "that the revision is actually the parameter and set the revision "
                 "to 'main'. In the future, use the new '#' syntax to specify the "
                 f"parameter (in this case, this would be {proper_model_id!r}), as this "
-                "will be an error in future versions of EuroEval."
+                "will be an error in future versions of EuroEval.",
+                level=logging.WARNING,
             )
             model_id_components.param = model_id_components.revision
             model_id_components.revision = "main"
@@ -1363,7 +1427,7 @@ class LiteLLMModel(BenchmarkModule):

         return dataset

-    @
+    @cache_arguments()
     def get_generation_kwargs(self, dataset_config: DatasetConfig) -> dict[str, t.Any]:
         """Get the generation arguments for the model.

@@ -1483,7 +1547,7 @@ class LiteLLMModel(BenchmarkModule):
         # First attempt is a test run with a single conversation to handle errors
         # quickly. We repeat this multiple times to deal with different types of
         # errors, and stop if we get a successful response.
-        test_input:
+        test_input: c.Sequence[litellm.AllMessageValues] | str
        if self.generative_type == GenerativeType.BASE:
            test_input = "Test message"
        else:
@@ -1542,7 +1606,7 @@ def try_download_ollama_model(model_id: str) -> bool:
         )

     try:
-        downloaded_ollama_models:
+        downloaded_ollama_models: c.Sequence[str] = [
             model_obj.model
             for model_obj in ollama.list().models
             if model_obj.model is not None
@@ -1571,7 +1635,8 @@ def try_download_ollama_model(model_id: str) -> bool:
                 f"The model {model_id!r} cannot be found on Ollama, but the "
                 f"model {model_id_with_prefix} *was* found, so we would "
                 "recommend you cancelling this run and trying the evaluation "
-                "with that model ID instead."
+                "with that model ID instead.",
+                level=logging.WARNING,
             )
             return False
         except ollama.ResponseError as inner_e:
@@ -1589,11 +1654,8 @@ def try_download_ollama_model(model_id: str) -> bool:
         ) from e

     # Download the model
-    with
-        desc=f"Downloading {ollama_model_id}",
-        unit_scale=True,
-        unit="B",
-        leave=False,
+    with get_pbar(
+        desc=f"Downloading {ollama_model_id}", unit_scale=True, unit="B"
     ) as pbar:
         for status in response:
             if status.total is not None: