EuroEval 16.3.0-py3-none-any.whl → 16.4.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
This version of EuroEval has been flagged as potentially problematic.
- euroeval/__init__.py +3 -2
- euroeval/benchmark_config_factory.py +0 -4
- euroeval/benchmark_modules/base.py +3 -16
- euroeval/benchmark_modules/fresh.py +2 -1
- euroeval/benchmark_modules/hf.py +99 -62
- euroeval/benchmark_modules/litellm.py +101 -41
- euroeval/benchmark_modules/vllm.py +91 -83
- euroeval/benchmarker.py +84 -78
- euroeval/caching_utils.py +79 -0
- euroeval/callbacks.py +5 -7
- euroeval/constants.py +6 -0
- euroeval/data_loading.py +14 -11
- euroeval/data_models.py +12 -4
- euroeval/dataset_configs/__init__.py +2 -0
- euroeval/dataset_configs/czech.py +79 -0
- euroeval/dataset_configs/danish.py +10 -11
- euroeval/dataset_configs/dutch.py +0 -1
- euroeval/dataset_configs/english.py +0 -1
- euroeval/dataset_configs/estonian.py +11 -1
- euroeval/dataset_configs/finnish.py +0 -1
- euroeval/dataset_configs/french.py +0 -1
- euroeval/dataset_configs/german.py +0 -1
- euroeval/dataset_configs/italian.py +0 -1
- euroeval/dataset_configs/latvian.py +0 -1
- euroeval/dataset_configs/lithuanian.py +9 -3
- euroeval/dataset_configs/norwegian.py +0 -1
- euroeval/dataset_configs/polish.py +0 -1
- euroeval/dataset_configs/portuguese.py +0 -1
- euroeval/dataset_configs/slovak.py +60 -0
- euroeval/dataset_configs/spanish.py +0 -1
- euroeval/dataset_configs/swedish.py +10 -12
- euroeval/finetuning.py +21 -15
- euroeval/generation.py +10 -10
- euroeval/generation_utils.py +2 -3
- euroeval/logging_utils.py +250 -0
- euroeval/metrics/base.py +0 -3
- euroeval/metrics/huggingface.py +9 -5
- euroeval/metrics/llm_as_a_judge.py +5 -3
- euroeval/metrics/pipeline.py +17 -9
- euroeval/metrics/speed.py +0 -3
- euroeval/model_cache.py +11 -14
- euroeval/model_config.py +4 -5
- euroeval/model_loading.py +3 -0
- euroeval/prompt_templates/linguistic_acceptability.py +21 -3
- euroeval/prompt_templates/multiple_choice.py +25 -1
- euroeval/prompt_templates/named_entity_recognition.py +51 -11
- euroeval/prompt_templates/reading_comprehension.py +31 -3
- euroeval/prompt_templates/sentiment_classification.py +23 -1
- euroeval/prompt_templates/summarization.py +26 -6
- euroeval/scores.py +7 -7
- euroeval/speed_benchmark.py +3 -5
- euroeval/task_group_utils/multiple_choice_classification.py +0 -3
- euroeval/task_group_utils/question_answering.py +0 -3
- euroeval/task_group_utils/sequence_classification.py +43 -31
- euroeval/task_group_utils/text_to_text.py +17 -8
- euroeval/task_group_utils/token_classification.py +10 -9
- euroeval/tokenisation_utils.py +14 -12
- euroeval/utils.py +29 -146
- {euroeval-16.3.0.dist-info → euroeval-16.4.0.dist-info}/METADATA +4 -4
- euroeval-16.4.0.dist-info/RECORD +75 -0
- euroeval-16.3.0.dist-info/RECORD +0 -71
- {euroeval-16.3.0.dist-info → euroeval-16.4.0.dist-info}/WHEEL +0 -0
- {euroeval-16.3.0.dist-info → euroeval-16.4.0.dist-info}/entry_points.txt +0 -0
- {euroeval-16.3.0.dist-info → euroeval-16.4.0.dist-info}/licenses/LICENSE +0 -0
euroeval/benchmark_modules/litellm.py

@@ -6,7 +6,7 @@ import json
 import logging
 import re
 import typing as t
-from functools import
+from functools import cached_property, partial
 from time import sleep
 
 import litellm
@@ -36,8 +36,8 @@ from litellm.utils import supports_reasoning, supports_response_schema
 from pydantic import conlist, create_model
 from requests.exceptions import RequestException
 from tqdm.asyncio import tqdm as tqdm_async
-from tqdm.auto import tqdm
 
+from ..caching_utils import cache_arguments
 from ..constants import (
     JSON_STRIP_CHARACTERS,
     LITELLM_CLASSIFICATION_OUTPUT_KEY,
@@ -70,6 +70,7 @@ from ..generation_utils import (
     extract_few_shot_examples,
     raise_if_wrong_params,
 )
+from ..logging_utils import get_pbar, log, log_once
 from ..task_group_utils import (
     question_answering,
     sequence_classification,
@@ -83,7 +84,6 @@ from ..utils import (
     add_semaphore_and_catch_exception,
     create_model_cache_dir,
     get_hf_token,
-    log_once,
     safe_run,
     split_model_id,
 )
@@ -95,8 +95,6 @@ if t.TYPE_CHECKING:
     from litellm.types.utils import ModelResponse
     from transformers.trainer import Trainer
 
-logger = logging.getLogger("euroeval")
-
 
 VOCAB_SIZE_MAPPING = {
     # OpenAI models
@@ -133,6 +131,7 @@ MODEL_MAX_LENGTH_MAPPING = {
     r"gpt-4.1.*": 1_047_576,
     # Anthropic models
     r"(anthropic/)?claude-[1-9](-[1-9])?-(opus|sonnet|haiku)-[0-9]{8}": 200_000,
+    r"(anthropic/)?claude-(opus|sonnet|haiku)-[1-9](-[1-9])?-[0-9]{8}": 200_000,
     # Gemini models
     r"(gemini/)?gemini-1\.5-flash.*": 1_048_576,
     r"(gemini/)?gemini-1\.5-pro.*": 2_097_152,
@@ -367,10 +366,11 @@ class LiteLLMModel(BenchmarkModule):
                 (batch_indices[idx], model_inputs[batch_indices[idx]])
                 for idx, _ in failures
             ]
-
+            log(
                 f"Attempt {attempt + 1:,}/{num_attempts:,}: retrying "
                 f"{len(inputs_to_run):,} failed message(s). Here is the first error: "
-                f"{failures[0][1]}."
+                f"{failures[0][1]}.",
+                level=logging.DEBUG,
             )
 
             # Attempt to handle the exceptions, to improve the chance of getting
@@ -422,14 +422,19 @@ class LiteLLMModel(BenchmarkModule):
             "'stop' is not supported with this model",
             "'$.stop' is invalid",
         ]
+        stop_pattern = re.compile(r"does not support parameters: \[.*'stop'.*\]")
         logprobs_messages = [
             "you are not allowed to request logprobs",
             "you've reached the maximum number of requests with logprobs",
             "logprobs is not supported",
             "logprobs is not enabled",
+            "Invalid value at 'generation_config.response_logprobs' (TYPE_BOOL)",
         ]
-        top_logprobs_messages = ["got an unexpected keyword argument 'top_logprobs'"]
         logprobs_pattern = re.compile(
+            r"does not support parameters: \[.*'logprobs'.*\]"
+        )
+        top_logprobs_messages = ["got an unexpected keyword argument 'top_logprobs'"]
+        top_logprobs_pattern = re.compile(
             r"does not support parameters: \[.*'top_logprobs'.*\]"
         )
         max_completion_tokens_pattern = re.compile(
@@ -438,6 +443,7 @@ class LiteLLMModel(BenchmarkModule):
         temperature_messages = [
             "'temperature' is not supported with this model.",
             "temperature is not supported with this model",
+            r"does not support parameters: \[.*'temperature'.*\]",
         ]
         temperature_must_be_one_messages = [
             "`temperature` may only be set to 1",
@@ -454,10 +460,14 @@ class LiteLLMModel(BenchmarkModule):
         requires_thinking_disabled_messages = ["thinking.type: Field required"]
         seed_pattern = re.compile(r"does not support parameters: \[.*'seed'.*\]")
         response_format_messages = [
-            "got an unexpected keyword argument 'response_format'"
+            "got an unexpected keyword argument 'response_format'",
+            "the model returned empty outputs",
        ]
 
-        if
+        if (
+            any(msg.lower() in error_msg for msg in stop_messages)
+            or stop_pattern.search(string=error_msg) is not None
+        ):
             log_once(
                 f"The model {model_id!r} does not support "
                 "stop sequences, so disabling them.",
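The newly added `stop_pattern`, `logprobs_pattern`, `top_logprobs_pattern` and temperature checks all share one regex shape: the provider error text "does not support parameters: [...]" with the offending keyword quoted somewhere inside the bracketed list. A minimal standalone sketch of that matching step, using an invented error message (real provider messages vary by backend):

```python
import re

# Same pattern shape as the new stop/logprobs/top_logprobs/temperature checks.
stop_pattern = re.compile(r"does not support parameters: \[.*'stop'.*\]")

# Invented provider error message, purely for illustration.
error_msg = "this model does not support parameters: ['stop', 'logprobs']"

if stop_pattern.search(string=error_msg) is not None:
    # Mirrors the behaviour in the diff: drop the unsupported argument and
    # retry the request with the remaining generation kwargs.
    print("Model does not support stop sequences; removing 'stop'.")
```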
@@ -467,7 +477,7 @@ class LiteLLMModel(BenchmarkModule):
             return generation_kwargs
         elif (
             any(msg.lower() in error_msg for msg in logprobs_messages)
-            or logprobs_pattern.search(string=error_msg)
+            or logprobs_pattern.search(string=error_msg) is not None
             # Special case for Vertex AI models, since they have strict rate
             # limits on using logprobs. They also have a cap of 5 logprobs, but
             # we ignore this since the rate limiting makes it unusable anyway.
@@ -477,10 +487,15 @@ class LiteLLMModel(BenchmarkModule):
                 f"The model {model_id!r} does not support logprobs, so disabling it.",
                 level=logging.DEBUG,
             )
+            self.buffer["first_label_token_mapping"] = False
             generation_kwargs.pop("logprobs", None)
             generation_kwargs.pop("top_logprobs", None)
+            generation_kwargs.pop("response_format", None)
             return generation_kwargs
-        elif
+        elif (
+            any(msg.lower() in error_msg for msg in top_logprobs_messages)
+            or top_logprobs_pattern.search(string=error_msg) is not None
+        ):
             log_once(
                 f"The model {model_id!r} does not support the `top_logprobs` argument, "
                 "so moving the value to `logprobs`.",
@@ -597,9 +612,10 @@ class LiteLLMModel(BenchmarkModule):
         elif isinstance(
             error, (Timeout, ServiceUnavailableError, InternalServerError, SystemError)
         ):
-
+            log(
                 f"Service temporarily unavailable. The error message was: {error}. "
-                "Retrying in 10 seconds..."
+                "Retrying in 10 seconds...",
+                level=logging.DEBUG,
             )
             sleep(10)
             return generation_kwargs
@@ -629,10 +645,32 @@ class LiteLLMModel(BenchmarkModule):
             ) from error
 
         if isinstance(error, RateLimitError):
-
+            log(
                 f"You have encountered your rate limit for model {model_id!r}. "
-                "
-
+                "Retrying in 10 seconds...",
+                level=logging.DEBUG,
+            )
+            sleep(10)
+            return generation_kwargs
+
+        if (
+            isinstance(error, BadRequestError)
+            and (
+                retry_match := re.search(
+                    pattern=r"\bretry in ([0-9]+(.[0-9]+)?) ?(s|seconds)\b",
+                    string=error_msg,
+                )
+            )
+            is not None
+        ):
+            retry_seconds = float(retry_match.group(1))
+            log(
+                f"Bad request error encountered. Retrying in {retry_seconds:.1f} "
+                "seconds...",
+                level=logging.DEBUG,
+            )
+            sleep(retry_seconds)
+            return generation_kwargs
 
         if isinstance(error, AuthenticationError):
             raise NeedsAdditionalArgument(
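The new `BadRequestError` branch pulls the wait time straight out of the provider's error text instead of using a fixed delay. A standalone sketch of that parsing step, with an invented error message; group 1 of the regex is the number of seconds:

```python
import re
import time

# Invented provider error message containing a retry hint; real messages vary.
error_msg = "rate limit exceeded for this model, please retry in 7.5 seconds"

# Same pattern as in the diff: group 1 captures the number of seconds to wait.
if (
    retry_match := re.search(
        pattern=r"\bretry in ([0-9]+(.[0-9]+)?) ?(s|seconds)\b", string=error_msg
    )
) is not None:
    retry_seconds = float(retry_match.group(1))  # 7.5
    time.sleep(retry_seconds)  # then retry with the unchanged generation kwargs
```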
@@ -711,7 +749,19 @@ class LiteLLMModel(BenchmarkModule):
             for input_ in inputs
             if isinstance(input_, list)
         ]
-        responses = await tqdm_async.gather(
+        responses = await tqdm_async.gather(
+            *requests, colour="yellow", ascii="—▰", leave=False
+        )
+
+        # If the outputs are empty, convert them to exceptions
+        if all(
+            not isinstance(response, Exception)
+            and response.choices[0].message.content == "{}"
+            for response in responses
+        ):
+            responses = [ValueError("The model returned empty outputs.")] * len(
+                responses
+            )
 
         # Separate the successful responses from the failed ones
         successes = [
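This guard works together with the "the model returned empty outputs" entry added to `response_format_messages` earlier in the diff: a batch in which every reply is literally the empty JSON object `"{}"` is converted into failures, which the error handler can then treat like the other `response_format` problems and retry without structured output. A condensed sketch of the check itself, using stand-in response types rather than litellm's `ModelResponse`:

```python
from dataclasses import dataclass


# Stand-ins for litellm's response objects, purely for illustration.
@dataclass
class Message:
    content: str


@dataclass
class Choice:
    message: Message


@dataclass
class Response:
    choices: list[Choice]


responses = [Response(choices=[Choice(message=Message(content="{}"))])]

if all(
    not isinstance(response, Exception)
    and response.choices[0].message.content == "{}"
    for response in responses
):
    # Every reply is an empty JSON object, so treat the whole batch as failed;
    # the retry machinery can then drop `response_format` and try again.
    responses = [ValueError("The model returned empty outputs.")] * len(responses)
```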
@@ -731,7 +781,9 @@ class LiteLLMModel(BenchmarkModule):
             try:
                 request.close()
             except RuntimeError as e:
-
+                log(
+                    f"RuntimeError during request.close(): {e}", level=logging.DEBUG
+                )
 
         return successes, failures
 
@@ -756,10 +808,11 @@ class LiteLLMModel(BenchmarkModule):
         for model_response in model_responses:
             if not model_response.choices:
                 sequences.append("")
-
+                log(
                     f"The model {model_id!r} did not end up "
                     "generating any text. This is likely because the model ran "
-                    "out of tokens while reasoning. Returning an empty string."
+                    "out of tokens while reasoning. Returning an empty string.",
+                    level=logging.WARNING,
                 )
                 continue
 
@@ -847,11 +900,12 @@ class LiteLLMModel(BenchmarkModule):
                 scores.append(logprobs_list)
 
         if not sequences:
-
+            log(
                 "No sequences were generated by the model "
                 f"{model_id!r}. This may be due to the "
                 "model running out of tokens or an issue with the input data. "
-                "Returning an empty GenerativeModelOutput."
+                "Returning an empty GenerativeModelOutput.",
+                level=logging.WARNING,
             )
             return GenerativeModelOutput(sequences=[], scores=None)
 
@@ -1130,6 +1184,7 @@ class LiteLLMModel(BenchmarkModule):
                 return partial(
                     sequence_classification.extract_labels_from_generation,
                     dataset_config=self.dataset_config,
+                    model_config=self.model_config,
                     first_label_token_mapping=self.buffer["first_label_token_mapping"],
                 )
             case TaskGroup.TEXT_TO_TEXT:
@@ -1205,17 +1260,19 @@ class LiteLLMModel(BenchmarkModule):
                 ServiceUnavailableError,
                 InternalServerError,
             ) as e:
-
+                log(
                     f"Service temporarily unavailable. The error message was: {e}. "
-                    "Retrying in 10 seconds..."
+                    "Retrying in 10 seconds...",
+                    level=logging.DEBUG,
                 )
                 sleep(10)
             except APIError as e:
                 if "'503 Service Unavailable" not in str(e):
                     raise e
-
+                log(
                     f"Failed to check if model {model_id!r} exists. Retrying in 10 "
-                    "seconds..."
+                    "seconds...",
+                    level=logging.WARNING,
                 )
                 sleep(10)
             except (BadRequestError, NotFoundError):
@@ -1228,21 +1285,25 @@ class LiteLLMModel(BenchmarkModule):
            case 0:
                pass
            case 1:
-
+                log(
                    f"Could not find the model ID {model_id!r}. Did you mean "
-                    f"{candidate_models[0]!r}?"
+                    f"{candidate_models[0]!r}?",
+                    level=logging.WARNING,
                )
            case _:
                candidate_models_str = "', '".join(candidate_models)
-
+                log(
                    f"Could not find the model ID {model_id!r}. Did you mean "
-
+                    "any of the following model IDs: "
+                    f"'{candidate_models_str}'?",
+                    level=logging.WARNING,
                )
            return False
    else:
-
+        log(
            f"Failed to check if model {model_id!r} exists after {num_attempts} "
-            "attempts. Assuming it does not exist."
+            "attempts. Assuming it does not exist.",
+            level=logging.ERROR,
        )
        return False
 
@@ -1275,7 +1336,8 @@ class LiteLLMModel(BenchmarkModule):
                 "that the revision is actually the parameter and set the revision "
                 "to 'main'. In the future, use the new '#' syntax to specify the "
                 f"parameter (in this case, this would be {proper_model_id!r}), as this "
-                "will be an error in future versions of EuroEval."
+                "will be an error in future versions of EuroEval.",
+                level=logging.WARNING,
             )
             model_id_components.param = model_id_components.revision
             model_id_components.revision = "main"
@@ -1363,7 +1425,7 @@ class LiteLLMModel(BenchmarkModule):
 
         return dataset
 
-    @
+    @cache_arguments()
     def get_generation_kwargs(self, dataset_config: DatasetConfig) -> dict[str, t.Any]:
         """Get the generation arguments for the model.
 
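The `@cache_arguments()` decorator comes from the new `euroeval/caching_utils.py` module, which is not shown in this view. Judging only from its call site on `get_generation_kwargs`, it appears to memoise a method per set of arguments. A hypothetical sketch of that general pattern, not the actual EuroEval implementation:

```python
import functools
import typing as t


def cache_arguments() -> t.Callable:
    """Hypothetical sketch of an argument-keyed cache decorator."""

    def decorator(func: t.Callable) -> t.Callable:
        cache: dict[tuple, t.Any] = {}

        @functools.wraps(func)
        def wrapper(*args: t.Any, **kwargs: t.Any) -> t.Any:
            # Key on the repr of the arguments, so unhashable objects such as
            # dataclass configs can still be used as cache keys.
            key = (repr(args), repr(sorted(kwargs.items())))
            if key not in cache:
                cache[key] = func(*args, **kwargs)
            return cache[key]

        return wrapper

    return decorator
```

Under that reading, repeated calls to `get_generation_kwargs` with the same `dataset_config` would return the cached dictionary instead of rebuilding it.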
@@ -1571,7 +1633,8 @@ def try_download_ollama_model(model_id: str) -> bool:
                 f"The model {model_id!r} cannot be found on Ollama, but the "
                 f"model {model_id_with_prefix} *was* found, so we would "
                 "recommend you cancelling this run and trying the evaluation "
-                "with that model ID instead."
+                "with that model ID instead.",
+                level=logging.WARNING,
             )
             return False
         except ollama.ResponseError as inner_e:
@@ -1589,11 +1652,8 @@ def try_download_ollama_model(model_id: str) -> bool:
            ) from e
 
    # Download the model
-    with
-        desc=f"Downloading {ollama_model_id}",
-        unit_scale=True,
-        unit="B",
-        leave=False,
+    with get_pbar(
+        desc=f"Downloading {ollama_model_id}", unit_scale=True, unit="B"
    ) as pbar:
        for status in response:
            if status.total is not None: