EuroEval 16.2.2__py3-none-any.whl → 16.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of EuroEval might be problematic.
- euroeval/__init__.py +4 -2
- euroeval/benchmark_modules/fresh.py +3 -1
- euroeval/benchmark_modules/hf.py +8 -4
- euroeval/benchmark_modules/litellm.py +5 -17
- euroeval/benchmark_modules/vllm.py +88 -23
- euroeval/benchmarker.py +110 -61
- euroeval/cli.py +1 -1
- euroeval/constants.py +3 -0
- euroeval/dataset_configs/__init__.py +1 -0
- euroeval/dataset_configs/danish.py +0 -2
- euroeval/dataset_configs/dutch.py +0 -2
- euroeval/dataset_configs/english.py +0 -2
- euroeval/dataset_configs/finnish.py +0 -2
- euroeval/dataset_configs/french.py +0 -2
- euroeval/dataset_configs/german.py +0 -2
- euroeval/dataset_configs/italian.py +0 -2
- euroeval/dataset_configs/latvian.py +2 -3
- euroeval/dataset_configs/lithuanian.py +62 -0
- euroeval/dataset_configs/norwegian.py +0 -2
- euroeval/dataset_configs/polish.py +0 -2
- euroeval/dataset_configs/portuguese.py +0 -2
- euroeval/dataset_configs/spanish.py +0 -2
- euroeval/dataset_configs/swedish.py +0 -3
- euroeval/metrics/huggingface.py +1 -1
- euroeval/metrics/pipeline.py +5 -0
- euroeval/prompt_templates/linguistic_acceptability.py +9 -0
- euroeval/prompt_templates/multiple_choice.py +9 -0
- euroeval/prompt_templates/named_entity_recognition.py +20 -0
- euroeval/prompt_templates/reading_comprehension.py +10 -0
- euroeval/prompt_templates/sentiment_classification.py +11 -0
- euroeval/tokenisation_utils.py +8 -8
- euroeval/utils.py +1 -1
- {euroeval-16.2.2.dist-info → euroeval-16.3.0.dist-info}/METADATA +181 -60
- euroeval-16.3.0.dist-info/RECORD +71 -0
- euroeval-16.2.2.dist-info/RECORD +0 -70
- {euroeval-16.2.2.dist-info → euroeval-16.3.0.dist-info}/WHEEL +0 -0
- {euroeval-16.2.2.dist-info → euroeval-16.3.0.dist-info}/entry_points.txt +0 -0
- {euroeval-16.2.2.dist-info → euroeval-16.3.0.dist-info}/licenses/LICENSE +0 -0
euroeval/__init__.py
CHANGED
@@ -103,8 +103,10 @@ os.environ["DISABLE_AIOHTTP_TRANSPORT"] = "True"
 os.environ["VLLM_USE_V1"] = "1"
 
 
-# Use the FlashInfer flash-attention backend for vLLM
-os.environ["VLLM_ATTENTION_BACKEND"] = "FLASHINFER"
+# Use the FlashInfer flash-attention backend for vLLM, unless the user has already
+# specified a different backend.
+if os.getenv("VLLM_ATTENTION_BACKEND") is None:
+    os.environ["VLLM_ATTENTION_BACKEND"] = "FLASHINFER"
 
 
 # Set the HF_TOKEN env var to copy the HUGGINGFACE_API_KEY env var, as vLLM uses the
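In practice this means a backend exported by the user before importing EuroEval now wins over the FlashInfer default. A minimal sketch, assuming `FLASH_ATTN` is the alternative vLLM backend you want to keep (the backend name is an assumption, not something this diff prescribes):

import os

# Choose a backend before importing euroeval; the new conditional in
# euroeval/__init__.py only applies the FLASHINFER default when this variable
# is still unset, so the value below is preserved.
os.environ["VLLM_ATTENTION_BACKEND"] = "FLASH_ATTN"  # assumed backend name

import euroeval  # noqa: F401  # importing the package runs the __init__ code above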
euroeval/benchmark_modules/fresh.py
CHANGED
@@ -1,5 +1,6 @@
 """Freshly initialised encoder models."""
 
+import re
 import typing as t
 from functools import cached_property
 from json import JSONDecodeError
@@ -45,6 +46,7 @@ class FreshEncoderModel(HuggingFaceEncoderModel):
     """A freshly initialised encoder model."""
 
     fresh_model = True
+    allowed_params = {re.compile(r".*"): ["slow-tokenizer"]}
 
     def __init__(
         self,
@@ -294,7 +296,7 @@ def load_model_and_tokeniser(
         token=get_hf_token(api_key=benchmark_config.api_key),
         add_prefix_space=prefix,
         cache_dir=model_config.model_cache_dir,
-        use_fast=True,
+        use_fast=False if model_config.param == "slow-tokenizer" else True,
         verbose=False,
         trust_remote_code=benchmark_config.trust_remote_code,
     )
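The new `allowed_params` attribute maps model-ID patterns to the extra parameters those models accept, and the catch-all pattern means every fresh encoder model may now be run with the `slow-tokenizer` parameter. A sketch of how such a mapping can be interpreted; the lookup helper and the use of `fullmatch` are illustrative assumptions, not EuroEval's own resolution code:

import re

allowed_params = {re.compile(r".*"): ["slow-tokenizer"]}  # taken from the hunk above

def params_allowed_for(model_id: str) -> list[str]:
    # Collect the parameters whose model-ID pattern matches the given model.
    allowed: list[str] = []
    for pattern, params in allowed_params.items():
        if pattern.fullmatch(model_id):
            allowed.extend(params)
    return allowed

print(params_allowed_for("bert-base-cased"))  # ['slow-tokenizer']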
euroeval/benchmark_modules/hf.py
CHANGED
@@ -2,6 +2,7 @@
 
 import collections.abc as c
 import logging
+import re
 import typing as t
 from functools import cached_property, partial
 from json import JSONDecodeError
@@ -93,6 +94,7 @@ class HuggingFaceEncoderModel(BenchmarkModule):
     fresh_model = False
     batching_preference = BatchingPreference.NO_PREFERENCE
     high_priority = True
+    allowed_params = {re.compile(r".*"): ["slow-tokenizer"]}
 
     def __init__(
         self,
@@ -690,7 +692,7 @@ def load_model_and_tokeniser(
         model=model,
         model_id=model_id,
         trust_remote_code=benchmark_config.trust_remote_code,
-
+        model_config=model_config,
     )
 
     return model, tokeniser
@@ -880,7 +882,7 @@ def load_tokeniser(
     model: "PreTrainedModel | None",
     model_id: str,
     trust_remote_code: bool,
-
+    model_config: "ModelConfig",
     token: str | bool,
 ) -> "PreTrainedTokenizer":
     """Load the tokeniser.
 
@@ -892,17 +894,19 @@ def load_tokeniser(
            The model identifier. Used for logging.
        trust_remote_code:
            Whether to trust remote code.
+       model_config:
+           The model configuration.
 
    Returns:
        The loaded tokeniser.
    """
    loading_kwargs: dict[str, bool | str] = dict(
-        use_fast=True,
+        use_fast=False if model_config.param == "slow-tokenizer" else True,
        verbose=False,
        trust_remote_code=trust_remote_code,
        padding_side="right",
        truncation_side="right",
-        cache_dir=model_cache_dir,
+        cache_dir=model_config.model_cache_dir,
    )
 
    # If the model is a subclass of a certain model types then we have to add a prefix
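The recurring `use_fast=False if model_config.param == "slow-tokenizer" else True` change is the core of the new `slow-tokenizer` parameter: it switches tokeniser loading from the fast (Rust-backed) implementation to the slow (pure-Python) one. A minimal self-contained sketch of the same pattern, with `param` standing in for `model_config.param` and an illustrative function name:

from transformers import AutoTokenizer, PreTrainedTokenizerBase

def load_tokeniser_sketch(model_id: str, param: str | None) -> PreTrainedTokenizerBase:
    # "slow-tokenizer" forces the slow tokeniser; anything else keeps the fast one.
    return AutoTokenizer.from_pretrained(
        model_id,
        use_fast=False if param == "slow-tokenizer" else True,
    )

tokeniser = load_tokeniser_sketch("bert-base-cased", param="slow-tokenizer")
print(tokeniser.is_fast)  # False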
euroeval/benchmark_modules/litellm.py
CHANGED
@@ -369,7 +369,8 @@ class LiteLLMModel(BenchmarkModule):
            ]
            logger.debug(
                f"Attempt {attempt + 1:,}/{num_attempts:,}: retrying "
-                f"{len(inputs_to_run):,} failed message(s)"
+                f"{len(inputs_to_run):,} failed message(s). Here is the first error: "
+                f"{failures[0][1]}."
            )
 
            # Attempt to handle the exceptions, to improve the chance of getting
@@ -453,8 +454,7 @@ class LiteLLMModel(BenchmarkModule):
        requires_thinking_disabled_messages = ["thinking.type: Field required"]
        seed_pattern = re.compile(r"does not support parameters: \[.*'seed'.*\]")
        response_format_messages = [
-            "got an unexpected keyword argument 'response_format'"
-            "The model outputs empty dictionaries.",
+            "got an unexpected keyword argument 'response_format'"
        ]
 
        if any(msg.lower() in error_msg for msg in stop_messages):
@@ -713,18 +713,6 @@ class LiteLLMModel(BenchmarkModule):
        ]
        responses = await tqdm_async.gather(*requests, leave=False)
 
-        # If we are performing structured generation and the model just outputs an empty
-        # dictionary, then we convert those to exceptions, to disable structured
-        # generation
-        if "response_format" in generation_kwargs:
-            responses = [
-                RuntimeError("The model outputs empty dictionaries.")
-                if not isinstance(response, Exception)
-                and any(choice.message.content == "{}" for choice in response.choices)
-                else response
-                for response in responses
-            ]
-
        # Separate the successful responses from the failed ones
        successes = [
            (idx, response)
@@ -984,7 +972,7 @@ class LiteLLMModel(BenchmarkModule):
            model=None,
            model_id=model_id,
            trust_remote_code=self.benchmark_config.trust_remote_code,
-
+            model_config=self.model_config,
        )
 
        if (
@@ -1067,7 +1055,7 @@ class LiteLLMModel(BenchmarkModule):
            model=None,
            model_id=model_id,
            trust_remote_code=self.benchmark_config.trust_remote_code,
-
+            model_config=self.model_config,
        )
 
        all_max_lengths: list[int] = list()
euroeval/benchmark_modules/vllm.py
CHANGED
@@ -104,7 +104,7 @@ class VLLMModel(HuggingFaceEncoderModel):
    fresh_model = False
    batching_preference = BatchingPreference.ALL_AT_ONCE
    high_priority = True
-    allowed_params = {re.compile(r".*"): ["thinking", "no-thinking"]}
+    allowed_params = {re.compile(r".*"): ["thinking", "no-thinking", "slow-tokenizer"]}
 
    def __init__(
        self,
@@ -559,11 +559,34 @@ class VLLMModel(HuggingFaceEncoderModel):
                torch.LongTensor(completion_id) for completion_id in completion_ids
            ]
        )
-        if
-
-
-
-
+        if (
+            self.end_of_reasoning_token is not None
+            and self.generative_type == GenerativeType.REASONING
+        ):
+            for idx in range(len(completions)):
+                if self.end_of_reasoning_token in completions[idx]:
+                    completions[idx] = completions[idx].split(
+                        self.end_of_reasoning_token
+                    )[-1]
+                elif self.benchmark_config.verbose:
+                    logger.warning(
+                        f"The model {self.model_config.model_id!r} is a reasoning "
+                        "model, but the generated output does not contain the end of "
+                        f"reasoning token ({self.end_of_reasoning_token!r}). Using "
+                        "an empty string as the prediction instead."
+                    )
+                    completions[idx] = ""
+                else:
+                    log_once(
+                        f"The model {self.model_config.model_id!r} is a reasoning "
+                        "model, but the generated output does not contain the end of "
+                        f"reasoning token ({self.end_of_reasoning_token!r}). Using "
+                        "an empty string as the prediction instead. Only showing "
+                        "this warning once - see all occurrences if you run with the "
+                        "`verbose` flag.",
+                        level=logging.WARNING,
+                    )
+                    completions[idx] = ""
        stop_token_pattern = re.compile(
            "|".join(re.escape(stop_token) for stop_token in stop_tokens)
        )
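For reasoning models, the added block keeps only the text after the end-of-reasoning token and falls back to an empty prediction when that token never appears in the generation. A standalone sketch of the same post-processing; the function name is illustrative, and `</think>` is one of the reasoning tokens listed in euroeval/constants.py:

def strip_reasoning(completion: str, end_of_reasoning_token: str) -> str:
    # Keep only what follows the end-of-reasoning token; if the model never
    # closed its reasoning block, treat the prediction as empty.
    if end_of_reasoning_token in completion:
        return completion.split(end_of_reasoning_token)[-1]
    return ""

print(strip_reasoning("<think>some deliberation</think>positive", "</think>"))  # positive
print(strip_reasoning("<think>ran out of budget mid-thought", "</think>"))      # '' (empty)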
@@ -830,9 +853,12 @@ def load_model_and_tokeniser(
        adapter_base_model_id=model_config.adapter_base_model_id,
        trust_remote_code=benchmark_config.trust_remote_code,
        model_max_length=true_max_model_len,
-
+        model_config=model_config,
        token=get_hf_token(api_key=benchmark_config.api_key),
    )
+    vllm_tokenisation_params = get_vllm_tokenisation_params(
+        tokeniser=tokeniser, model_config=model_config
+    )
 
    clear_vllm()
 
@@ -865,16 +891,7 @@ def load_model_and_tokeniser(
            enable_prefix_caching=False,
            enable_lora=model_config.adapter_base_model_id is not None,
            max_lora_rank=256,
-
-            tokenizer_mode="mistral"
-            if isinstance(tokeniser, MistralCommonTokenizer)
-            else "auto",
-            config_format="mistral"
-            if isinstance(tokeniser, MistralCommonTokenizer)
-            else "auto",
-            load_format="mistral"
-            if isinstance(tokeniser, MistralCommonTokenizer)
-            else "auto",
+            **vllm_tokenisation_params,
        )
    except (RuntimeError, ValueError, OSError) as e:
        if "awaiting a review from the repo authors" in str(e):
@@ -903,7 +920,7 @@ def load_tokeniser(
    adapter_base_model_id: str | None,
    trust_remote_code: bool,
    model_max_length: int,
-
+    model_config: "ModelConfig",
    token: str | bool,
 ) -> "PreTrainedTokenizer":
    """Load the tokeniser.
@@ -920,8 +937,8 @@ def load_tokeniser(
            Whether to trust remote code.
        model_max_length:
            The maximum length of the model.
-
-            The
+        model_config:
+            The model configuration.
        token:
            The Hugging Face API token.
 
@@ -932,7 +949,7 @@ def load_tokeniser(
    config = AutoConfig.from_pretrained(
        adapter_base_model_id or model_id,
        revision=revision,
-        cache_dir=model_cache_dir,
+        cache_dir=model_config.model_cache_dir,
        token=token,
        trust_remote_code=trust_remote_code,
        local_files_only=not internet_connection_available(),
@@ -940,15 +957,25 @@ def load_tokeniser(
    num_retries = 5
    for _ in range(num_retries):
        try:
+            # Mistral instruction-tuned models need a custom tokeniser
+            if model_id.startswith("mistralai/") and "base" not in model_id.lower():
+                tokeniser = MistralCommonTokenizer.from_pretrained(
+                    model_id,
+                    padding_side="left",
+                    truncation_side="left",
+                    model_max_length=model_max_length,
+                    token=token,
+                )
+                break
            tokeniser = AutoTokenizer.from_pretrained(
                model_id,
-                use_fast=True,
+                use_fast=False if model_config.param == "slow-tokenizer" else True,
                verbose=False,
                trust_remote_code=trust_remote_code,
                padding_side="left",
                truncation_side="left",
                model_max_length=model_max_length,
-                cache_dir=model_cache_dir,
+                cache_dir=model_config.model_cache_dir,
                config=config,
                token=token,
                local_files_only=not internet_connection_available(),
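The new branch routes Mistral instruction-tuned models through `MistralCommonTokenizer` before the regular `AutoTokenizer` path. A small sketch of just the gating condition as it appears in the hunk; the helper name and the second model ID are illustrative:

def uses_mistral_tokeniser(model_id: str) -> bool:
    # Any "mistralai/" model whose ID does not contain "base" is treated as
    # instruction-tuned and gets the MistralCommonTokenizer.
    return model_id.startswith("mistralai/") and "base" not in model_id.lower()

print(uses_mistral_tokeniser("mistralai/Mistral-7B-Instruct-v0.3"))  # True
print(uses_mistral_tokeniser("mistralai/hypothetical-base-model"))   # False ("base" in the ID)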
@@ -1189,3 +1216,41 @@ def get_pbar_without_leave(*tqdm_args, **tqdm_kwargs) -> tqdm:
    """
    tqdm_kwargs.pop("leave", None)  # Remove the 'leave' key if it exists
    return tqdm(*tqdm_args, leave=False, **tqdm_kwargs)
+
+
+def get_vllm_tokenisation_params(
+    tokeniser: "PreTrainedTokenizer", model_config: "ModelConfig"
+) -> dict[str, t.Any]:
+    """Get the tokenisation parameters for vLLM.
+
+    Args:
+        tokeniser:
+            The tokeniser.
+        model_config:
+            The model configuration.
+
+    Returns:
+        A dictionary of tokenisation parameters to pass to vLLM.
+    """
+    if isinstance(tokeniser, MistralCommonTokenizer):
+        tokeniser_mode = "mistral"
+    elif model_config.param == "slow-tokenizer":
+        tokeniser_mode = "slow"
+    else:
+        tokeniser_mode = "auto"
+
+    if isinstance(tokeniser, MistralCommonTokenizer):
+        config_format = "mistral"
+    else:
+        config_format = "auto"
+
+    if isinstance(tokeniser, MistralCommonTokenizer):
+        load_format = "mistral"
+    else:
+        load_format = "auto"
+
+    return dict(
+        tokenizer_mode=tokeniser_mode,
+        config_format=config_format,
+        load_format=load_format,
+    )
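Condensed, the helper resolves vLLM's three loading knobs from two facts: whether the tokeniser is a `MistralCommonTokenizer` and whether the `slow-tokenizer` parameter was requested. A toy restatement of that decision table (not the shipped implementation):

def tokenisation_params_sketch(is_mistral_tokeniser: bool, param: str | None) -> dict[str, str]:
    # Mirrors the branching in get_vllm_tokenisation_params above.
    if is_mistral_tokeniser:
        mode = fmt = "mistral"
    else:
        mode = "slow" if param == "slow-tokenizer" else "auto"
        fmt = "auto"
    return dict(tokenizer_mode=mode, config_format=fmt, load_format=fmt)

assert tokenisation_params_sketch(False, "slow-tokenizer") == {
    "tokenizer_mode": "slow", "config_format": "auto", "load_format": "auto"
}
assert tokenisation_params_sketch(True, None)["tokenizer_mode"] == "mistral"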
euroeval/benchmarker.py
CHANGED
@@ -12,6 +12,7 @@ from time import sleep
 
 from huggingface_hub.constants import HF_HUB_ENABLE_HF_TRANSFER
 from torch.distributed import destroy_process_group
+from tqdm.auto import tqdm
 
 from .benchmark_config_factory import build_benchmark_config
 from .constants import GENERATIVE_PIPELINE_TAGS
@@ -32,6 +33,7 @@ from .utils import (
    get_package_version,
    internet_connection_available,
    log_once,
+    split_model_id,
 )
 
 if t.TYPE_CHECKING:
@@ -82,7 +84,7 @@ class Benchmarker:
        num_iterations: int = 10,
        api_base: str | None = None,
        api_version: str | None = None,
-        gpu_memory_utilization: float = 0.
+        gpu_memory_utilization: float = 0.8,
        generative_type: GenerativeType | None = None,
        debug: bool = False,
        run_with_cli: bool = False,
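The default GPU memory utilisation for vLLM is now 0.8 in both the `Benchmarker` constructor and the corresponding CLI option. A minimal sketch of overriding it from Python, assuming the `Benchmarker` entry point exported by the package:

from euroeval import Benchmarker

# Reserve less GPU memory for vLLM than the 0.8 default, e.g. on a shared GPU.
benchmarker = Benchmarker(gpu_memory_utilization=0.6)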
@@ -607,46 +609,90 @@ class Benchmarker:
            dataset_names=benchmark_config.datasets
        )
 
-
-
-
-
-
-
+        # Get all the model configs
+        model_configs: list[ModelConfig] = list()
+        for model_id in tqdm(
+            iterable=model_ids,
+            desc="Fetching model configurations",
+            disable=not benchmark_config.verbose or not benchmark_config.progress_bar,
+        ):
            try:
                model_config = get_model_config(
                    model_id=model_id, benchmark_config=benchmark_config
                )
+                model_configs.append(model_config)
            except InvalidModel as e:
                logger.info(e.message)
-
+
+        # Create a dictionary that takes each model config to the dataset configs that
+        # we need to benchmark the model on. Here we remove the datasets that the model
+        # has already been benchmarked on, or datasets that the model cannot be
+        # benchmarked on.
+        model_config_to_dataset_configs: dict[ModelConfig, list[DatasetConfig]] = {
+            model_config: [
+                dataset_config
+                for dataset_config in dataset_configs
+                if (
+                    benchmark_config.force
+                    or not model_has_been_benchmarked(
+                        model_config=model_config,
+                        dataset_config=dataset_config,
+                        benchmark_config=benchmark_config,
+                        benchmark_results=self.benchmark_results,
+                    )
+                )
+                and model_config.model_type in dataset_config.allowed_model_types
+            ]
+            for model_config in model_configs
+        }
+
+        total_benchmarks = sum(
+            len(dataset_configs)
+            for dataset_configs in model_config_to_dataset_configs.values()
+        )
+        if total_benchmarks == 0:
+            logger.info(
+                "No benchmarks to run, as all the selected models have already been "
+                "benchmarked on all the selected datasets."
+            )
+            return list()
+
+        logger.info(f"Initiated evaluation of {total_benchmarks:,} benchmarks.")
+
+        num_finished_benchmarks = 0
+        current_benchmark_results: list[BenchmarkResult] = list()
+        for model_config in model_configs:
+            if not model_config_to_dataset_configs[model_config]:
+                logger.debug(
+                    f"Skipping model {model_config.model_id!r} because it has "
+                    "already been benchmarked on all valid datasets."
+                )
                continue
 
            if model_config.adapter_base_model_id:
                open_issue_msg = (
-                    "If offline support is important to you, please "
-                    "
+                    "If offline support is important to you, please consider opening "
+                    "an issue at https://github.com/EuroEval/EuroEval/issues."
                )
                if not internet_connection_available():
                    raise InvalidModel(
                        "Offline benchmarking of models with adapters is not currently "
-                        "supported. "
-
+                        "supported. An active internet connection is required. "
+                        "{open_issue_msg}"
                    )
                elif benchmark_config.download_only:
                    log_once(
                        "You are using download only mode with a model that includes "
-                        "an adapter. "
-                        "
-                        "
-                        "An internet connection will be required during evaluation. "
+                        "an adapter. Please note that offline benchmarking of "
+                        "adapter models is not currently supported - an internet "
+                        "connection will be required during evaluation in this case. "
                        f"{open_issue_msg}",
                        level=logging.WARNING,
                    )
 
            loaded_model: BenchmarkModule | None = None
            benchmark_params_to_revert: dict[str, t.Any] = dict()
-            for dataset_config in
+            for dataset_config in model_config_to_dataset_configs[model_config]:
                # Revert any changes to the benchmark configuration made for the
                # previous dataset
                for param, value in benchmark_params_to_revert.items():
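The net effect of this rewrite is that the work is planned up front: every model is paired with the datasets it still needs, and the run exits early when nothing is left to do. A toy restatement with plain strings instead of EuroEval's config objects; all names and data below are illustrative:

models = ["model-a", "model-b"]
datasets = ["angry-tweets", "scala-da"]
already_done = {("model-a", "angry-tweets")}  # pretend this pair has a stored result

pending = {
    model: [ds for ds in datasets if (model, ds) not in already_done]
    for model in models
}
total_benchmarks = sum(len(ds_list) for ds_list in pending.values())
print(total_benchmarks)  # 3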
@@ -674,34 +720,6 @@ class Benchmarker:
                    benchmark_params_to_revert["few_shot"] = True
                    benchmark_config.few_shot = False
 
-                # Skip if we have already benchmarked this model on this dataset and
-                # we are not forcing the benchmark
-                if not benchmark_config.force and model_has_been_benchmarked(
-                    model_id=model_id,
-                    dataset=dataset_config.name,
-                    few_shot=benchmark_config.few_shot,
-                    validation_split=not benchmark_config.evaluate_test_split,
-                    benchmark_results=self.benchmark_results,
-                ):
-                    logger.debug(
-                        f"Skipping benchmarking {model_id} on "
-                        f"{dataset_config.pretty_name}, as it has already been "
-                        "benchmarked."
-                    )
-                    num_finished_benchmarks += 1
-                    continue
-
-                # Skip if the model type should not be benchmarked on this dataset
-                model_type = model_config.model_type
-                allowed_model_types = dataset_config.allowed_model_types
-                if model_type not in allowed_model_types:
-                    logger.debug(
-                        f"Skipping benchmarking {model_id} on "
-                        f"{dataset_config.pretty_name}, as it is of type {model_type}, "
-                        f"and the only allowed model types are {allowed_model_types}."
-                    )
-                    continue
-
                # We do not re-initialise generative models as their architecture is not
                # customised to specific datasets
                if model_config.model_type == ModelType.GENERATIVE:
@@ -735,6 +753,22 @@ class Benchmarker:
                else:
                    loaded_model.dataset_config = dataset_config
 
+                # Skip the benchmark if the model is not of the correct
+                # generative type
+                if (
+                    loaded_model.generative_type
+                    not in dataset_config.allowed_generative_types
+                ):
+                    logger.debug(
+                        f"Skipping the benchmark of model "
+                        f"{model_config.model_id!r}on dataset "
+                        f"{dataset_config.name!r} because the model has generative "
+                        f"type {loaded_model.generative_type} and the dataset "
+                        f"only allows {dataset_config.allowed_generative_types}."
+                    )
+                    num_finished_benchmarks += 1
+                    continue
+
                # Benchmark a single model on a single dataset
                benchmark_output_or_err = self._benchmark_single(
                    model=loaded_model,
@@ -969,23 +1003,20 @@
 
 
 def model_has_been_benchmarked(
-
-
-
-    validation_split: bool,
+    model_config: "ModelConfig",
+    dataset_config: "DatasetConfig",
+    benchmark_config: "BenchmarkConfig",
    benchmark_results: list[BenchmarkResult],
 ) -> bool:
    """Checks whether a model has already been benchmarked on a dataset.
 
    Args:
-
-            The model
-
-            The dataset.
-
-
-        validation_split:
-            Whether the model was evaluated on the validation split.
+        model_config:
+            The configuration of the model we are evaluating.
+        dataset_config:
+            The configuration of the dataset we are evaluating on.
+        benchmark_config:
+            The general benchmark configuration.
        benchmark_results:
            The benchmark results.
 
@@ -993,10 +1024,28 @@ def model_has_been_benchmarked(
        Whether the model has already been evaluated on the dataset.
    """
    for record in benchmark_results:
-
-
-
-
+        model_id_components = split_model_id(model_id=record.model)
+        same_model_id = model_id_components.model_id == model_config.model_id
+        same_revision = model_id_components.revision == model_config.revision
+        same_param = model_id_components.param == model_config.param
+        same_dataset = record.dataset == dataset_config.name
+        same_split = (
+            record.validation_split != benchmark_config.evaluate_test_split
+            or "val" not in dataset_config.splits
+        )
+        same_num_shots = (
+            record.few_shot == benchmark_config.few_shot
+            or not record.generative
+            or dataset_config.task.requires_zero_shot
+        )
+        if (
+            same_model_id
+            and same_revision
+            and same_param
+            and same_dataset
+            and same_split
+            and same_num_shots
+        ):
            return True
    return False
 
euroeval/cli.py
CHANGED
@@ -188,7 +188,7 @@ from .tasks import get_all_tasks
 )
 @click.option(
    "--gpu-memory-utilization",
-    default=0.
+    default=0.8,
    show_default=True,
    help="The GPU memory utilization to use for vLLM. A larger value will result in "
    "faster evaluation, but at the risk of running out of GPU memory. Only reduce this "
euroeval/constants.py
CHANGED
@@ -50,9 +50,11 @@ METRIC_ATTRIBUTES_TAKING_UP_MEMORY = ["cached_bertscorer"]
 # Hugging Face Hub tags used to classify models as merge models
 MERGE_TAGS = ["merge", "mergekit"]
 
+
 # The minimum required CUDA compute capability for using bfloat16 in vLLM
 VLLM_BF16_MIN_CUDA_COMPUTE_CAPABILITY = 8.0
 
+
 # Used to detect whether a model is a reasoning model
 REASONING_TOKENS = [
     ("<think>", "</think>"),
@@ -60,6 +62,7 @@ REASONING_TOKENS = [
     ("<reasoning>", "</reasoning>"),
 ]
 
+
 # These tokens are sometimes used by models to indicate the end of a generated
 # response, but they do not use them as a proper EOS token, so we have to deal with them
 # manually. We only use them as stop tokens if they actually appear in the model's
euroeval/dataset_configs/__init__.py
CHANGED
@@ -14,6 +14,7 @@ from .german import *  # noqa: F403
 from .icelandic import *  # noqa: F403
 from .italian import *  # noqa: F403
 from .latvian import *  # noqa: F403
+from .lithuanian import *  # noqa: F403
 from .norwegian import *  # noqa: F403
 from .polish import *  # noqa: F403
 from .portuguese import *  # noqa: F403
euroeval/dataset_configs/danish.py
CHANGED
@@ -1,7 +1,6 @@
 """All Danish dataset configurations used in EuroEval."""
 
 from ..data_models import DatasetConfig
-from ..enums import ModelType
 from ..languages import DA
 from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
 
@@ -159,7 +158,6 @@ WINOGRANDE_DA_CONFIG = DatasetConfig(
    languages=[DA],
    splits=["train", "test"],
    _labels=["a", "b"],
-    _allowed_model_types=[ModelType.GENERATIVE],
    unofficial=True,
 )
 
euroeval/dataset_configs/dutch.py
CHANGED
@@ -1,7 +1,6 @@
 """All Dutch dataset configurations used in EuroEval."""
 
 from ..data_models import DatasetConfig
-from ..enums import ModelType
 from ..languages import NL
 from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
 
@@ -152,7 +151,6 @@ WINOGRANDE_NL_CONFIG = DatasetConfig(
    languages=[NL],
    splits=["train", "test"],
    _labels=["a", "b"],
-    _allowed_model_types=[ModelType.GENERATIVE],
    unofficial=True,
 )
 
euroeval/dataset_configs/english.py
CHANGED
@@ -1,7 +1,6 @@
 """All English dataset configurations used in EuroEval."""
 
 from ..data_models import DatasetConfig
-from ..enums import ModelType
 from ..languages import EN
 from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
 
@@ -135,7 +134,6 @@ WINOGRANDE_CONFIG = DatasetConfig(
    languages=[EN],
    splits=["train", "test"],
    _labels=["a", "b"],
-    _allowed_model_types=[ModelType.GENERATIVE],
    unofficial=True,
 )
 