EuroEval 16.2.1-py3-none-any.whl → 16.3.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- euroeval/__init__.py +4 -2
- euroeval/benchmark_modules/fresh.py +3 -1
- euroeval/benchmark_modules/hf.py +8 -4
- euroeval/benchmark_modules/litellm.py +5 -17
- euroeval/benchmark_modules/vllm.py +98 -30
- euroeval/benchmarker.py +291 -405
- euroeval/cli.py +1 -1
- euroeval/constants.py +3 -0
- euroeval/data_models.py +35 -35
- euroeval/dataset_configs/__init__.py +1 -0
- euroeval/dataset_configs/danish.py +0 -2
- euroeval/dataset_configs/dutch.py +0 -2
- euroeval/dataset_configs/english.py +0 -2
- euroeval/dataset_configs/finnish.py +0 -2
- euroeval/dataset_configs/french.py +0 -2
- euroeval/dataset_configs/german.py +0 -2
- euroeval/dataset_configs/italian.py +0 -2
- euroeval/dataset_configs/latvian.py +2 -3
- euroeval/dataset_configs/lithuanian.py +62 -0
- euroeval/dataset_configs/norwegian.py +0 -2
- euroeval/dataset_configs/polish.py +0 -2
- euroeval/dataset_configs/portuguese.py +0 -2
- euroeval/dataset_configs/spanish.py +0 -2
- euroeval/dataset_configs/swedish.py +0 -3
- euroeval/metrics/huggingface.py +1 -1
- euroeval/metrics/pipeline.py +5 -0
- euroeval/prompt_templates/linguistic_acceptability.py +9 -0
- euroeval/prompt_templates/multiple_choice.py +9 -0
- euroeval/prompt_templates/named_entity_recognition.py +20 -0
- euroeval/prompt_templates/reading_comprehension.py +10 -0
- euroeval/prompt_templates/sentiment_classification.py +11 -0
- euroeval/tokenisation_utils.py +8 -8
- euroeval/utils.py +10 -5
- {euroeval-16.2.1.dist-info → euroeval-16.3.0.dist-info}/METADATA +181 -60
- euroeval-16.3.0.dist-info/RECORD +71 -0
- euroeval-16.2.1.dist-info/RECORD +0 -70
- {euroeval-16.2.1.dist-info → euroeval-16.3.0.dist-info}/WHEEL +0 -0
- {euroeval-16.2.1.dist-info → euroeval-16.3.0.dist-info}/entry_points.txt +0 -0
- {euroeval-16.2.1.dist-info → euroeval-16.3.0.dist-info}/licenses/LICENSE +0 -0
euroeval/__init__.py
CHANGED

@@ -103,8 +103,10 @@ os.environ["DISABLE_AIOHTTP_TRANSPORT"] = "True"
 os.environ["VLLM_USE_V1"] = "1"


-# Use the FlashInfer flash-attention backend for vLLM
-
+# Use the FlashInfer flash-attention backend for vLLM, unless the user has already
+# specified a different backend.
+if os.getenv("VLLM_ATTENTION_BACKEND") is None:
+    os.environ["VLLM_ATTENTION_BACKEND"] = "FLASHINFER"


 # Set the HF_TOKEN env var to copy the HUGGINGFACE_API_KEY env var, as vLLM uses the
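The practical effect of this change is that a backend chosen by the user via the environment is no longer overwritten at import time. A minimal Python sketch of that behaviour from the user's side (the backend name "FLASH_ATTN" is only an example of a vLLM backend value, not something this diff prescribes):

import os

# Choose a vLLM attention backend before euroeval is imported. With 16.3.0 the
# import-time guard only sets VLLM_ATTENTION_BACKEND when it is still unset, so
# this value survives; in 16.2.1 it was overwritten with "FLASHINFER".
os.environ["VLLM_ATTENTION_BACKEND"] = "FLASH_ATTN"  # example value

import euroeval  # noqa: E402  - the pre-set backend is left untouched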
euroeval/benchmark_modules/fresh.py
CHANGED

@@ -1,5 +1,6 @@
 """Freshly initialised encoder models."""

+import re
 import typing as t
 from functools import cached_property
 from json import JSONDecodeError

@@ -45,6 +46,7 @@ class FreshEncoderModel(HuggingFaceEncoderModel):
     """A freshly initialised encoder model."""

     fresh_model = True
+    allowed_params = {re.compile(r".*"): ["slow-tokenizer"]}

     def __init__(
         self,

@@ -294,7 +296,7 @@ def load_model_and_tokeniser(
         token=get_hf_token(api_key=benchmark_config.api_key),
         add_prefix_space=prefix,
         cache_dir=model_config.model_cache_dir,
-        use_fast=True,
+        use_fast=False if model_config.param == "slow-tokenizer" else True,
         verbose=False,
         trust_remote_code=benchmark_config.trust_remote_code,
     )
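For context, the new `allowed_params` attribute maps a model-id regex to the model parameters accepted by that module. A rough illustration of how such a mapping can be queried (this helper is not EuroEval code, just a sketch of what the data structure expresses; the model id is arbitrary):

import re

# {model-id pattern: parameters accepted for matching models}. The catch-all
# pattern below means "slow-tokenizer" is accepted for every model id.
allowed_params = {re.compile(r".*"): ["slow-tokenizer"]}

def param_is_allowed(model_id: str, param: str) -> bool:
    """Check whether `param` is accepted for `model_id` (illustrative helper)."""
    return any(
        param in params
        for pattern, params in allowed_params.items()
        if pattern.fullmatch(model_id)
    )

print(param_is_allowed("intfloat/multilingual-e5-base", "slow-tokenizer"))  # True
print(param_is_allowed("intfloat/multilingual-e5-base", "thinking"))        # False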
euroeval/benchmark_modules/hf.py
CHANGED

@@ -2,6 +2,7 @@

 import collections.abc as c
 import logging
+import re
 import typing as t
 from functools import cached_property, partial
 from json import JSONDecodeError

@@ -93,6 +94,7 @@ class HuggingFaceEncoderModel(BenchmarkModule):
     fresh_model = False
     batching_preference = BatchingPreference.NO_PREFERENCE
     high_priority = True
+    allowed_params = {re.compile(r".*"): ["slow-tokenizer"]}

     def __init__(
         self,

@@ -690,7 +692,7 @@ def load_model_and_tokeniser(
         model=model,
         model_id=model_id,
         trust_remote_code=benchmark_config.trust_remote_code,
-
+        model_config=model_config,
     )

     return model, tokeniser

@@ -880,7 +882,7 @@ def load_tokeniser(
     model: "PreTrainedModel | None",
     model_id: str,
     trust_remote_code: bool,
-
+    model_config: "ModelConfig",
 ) -> "PreTrainedTokenizer":
     """Load the tokeniser.


@@ -892,17 +894,19 @@
             The model identifier. Used for logging.
         trust_remote_code:
             Whether to trust remote code.
+        model_config:
+            The model configuration.

     Returns:
         The loaded tokeniser.
     """
     loading_kwargs: dict[str, bool | str] = dict(
-        use_fast=True,
+        use_fast=False if model_config.param == "slow-tokenizer" else True,
         verbose=False,
         trust_remote_code=trust_remote_code,
         padding_side="right",
         truncation_side="right",
-        cache_dir=model_cache_dir,
+        cache_dir=model_config.model_cache_dir,
     )

     # If the model is a subclass of a certain model types then we have to add a prefix
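The `use_fast` toggle above is what the new `slow-tokenizer` parameter ultimately controls. A small sketch of the underlying `transformers` behaviour, using an arbitrary example model id:

from transformers import AutoTokenizer

# use_fast=True returns the Rust-backed "fast" tokenizer when one exists;
# use_fast=False forces the pure-Python "slow" implementation, which some
# models need in order to tokenise correctly.
fast = AutoTokenizer.from_pretrained("bert-base-cased", use_fast=True)
slow = AutoTokenizer.from_pretrained("bert-base-cased", use_fast=False)
print(type(fast).__name__)  # BertTokenizerFast
print(type(slow).__name__)  # BertTokenizer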
euroeval/benchmark_modules/litellm.py
CHANGED

@@ -369,7 +369,8 @@ class LiteLLMModel(BenchmarkModule):
             ]
             logger.debug(
                 f"Attempt {attempt + 1:,}/{num_attempts:,}: retrying "
-                f"{len(inputs_to_run):,} failed message(s)"
+                f"{len(inputs_to_run):,} failed message(s). Here is the first error: "
+                f"{failures[0][1]}."
             )

             # Attempt to handle the exceptions, to improve the chance of getting

@@ -453,8 +454,7 @@
         requires_thinking_disabled_messages = ["thinking.type: Field required"]
         seed_pattern = re.compile(r"does not support parameters: \[.*'seed'.*\]")
         response_format_messages = [
-            "got an unexpected keyword argument 'response_format'"
-            "The model outputs empty dictionaries.",
+            "got an unexpected keyword argument 'response_format'"
         ]

         if any(msg.lower() in error_msg for msg in stop_messages):

@@ -713,18 +713,6 @@
         ]
         responses = await tqdm_async.gather(*requests, leave=False)

-        # If we are performing structured generation and the model just outputs an empty
-        # dictionary, then we convert those to exceptions, to disable structured
-        # generation
-        if "response_format" in generation_kwargs:
-            responses = [
-                RuntimeError("The model outputs empty dictionaries.")
-                if not isinstance(response, Exception)
-                and any(choice.message.content == "{}" for choice in response.choices)
-                else response
-                for response in responses
-            ]
-
         # Separate the successful responses from the failed ones
         successes = [
             (idx, response)

@@ -984,7 +972,7 @@
             model=None,
             model_id=model_id,
             trust_remote_code=self.benchmark_config.trust_remote_code,
-
+            model_config=self.model_config,
         )

         if (

@@ -1067,7 +1055,7 @@
             model=None,
             model_id=model_id,
             trust_remote_code=self.benchmark_config.trust_remote_code,
-
+            model_config=self.model_config,
         )

         all_max_lengths: list[int] = list()
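The retry loop these hunks touch follows a gather-then-partition pattern: fire all requests concurrently, split the results into successes and failures, log the first failure's exception, and retry only the failed inputs. A simplified, self-contained sketch of that pattern (not LiteLLM or EuroEval code; the fake request function is a stand-in for the real API call):

import asyncio

async def run_with_retries(inputs: list[str], num_attempts: int = 3) -> dict[int, str]:
    async def fake_request(text: str) -> str:
        # Stand-in for an API call: anything containing "fail" raises.
        if "fail" in text:
            raise RuntimeError(f"could not process {text!r}")
        return text.upper()

    results: dict[int, str] = {}
    inputs_to_run = list(enumerate(inputs))
    for attempt in range(num_attempts):
        # Run every pending request concurrently, capturing exceptions as values
        responses = await asyncio.gather(
            *(fake_request(text) for _, text in inputs_to_run),
            return_exceptions=True,
        )
        failures = [
            (idx, response)
            for (idx, _), response in zip(inputs_to_run, responses)
            if isinstance(response, Exception)
        ]
        results.update(
            (idx, response)
            for (idx, _), response in zip(inputs_to_run, responses)
            if not isinstance(response, Exception)
        )
        if not failures:
            break
        # Same style of message as the new logger.debug call above
        print(
            f"Attempt {attempt + 1:,}/{num_attempts:,}: retrying "
            f"{len(failures):,} failed message(s). Here is the first error: "
            f"{failures[0][1]}."
        )
        inputs_to_run = [(idx, inputs[idx]) for idx, _ in failures]
    return results

print(asyncio.run(run_with_retries(["hello", "this will fail"])))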
euroeval/benchmark_modules/vllm.py
CHANGED

@@ -104,7 +104,7 @@ class VLLMModel(HuggingFaceEncoderModel):
     fresh_model = False
     batching_preference = BatchingPreference.ALL_AT_ONCE
     high_priority = True
-    allowed_params = {re.compile(r".*"): ["thinking", "no-thinking"]}
+    allowed_params = {re.compile(r".*"): ["thinking", "no-thinking", "slow-tokenizer"]}

     def __init__(
         self,

@@ -559,11 +559,34 @@
                 torch.LongTensor(completion_id) for completion_id in completion_ids
             ]
         )
-        if
-
-
-
-
+        if (
+            self.end_of_reasoning_token is not None
+            and self.generative_type == GenerativeType.REASONING
+        ):
+            for idx in range(len(completions)):
+                if self.end_of_reasoning_token in completions[idx]:
+                    completions[idx] = completions[idx].split(
+                        self.end_of_reasoning_token
+                    )[-1]
+                elif self.benchmark_config.verbose:
+                    logger.warning(
+                        f"The model {self.model_config.model_id!r} is a reasoning "
+                        "model, but the generated output does not contain the end of "
+                        f"reasoning token ({self.end_of_reasoning_token!r}). Using "
+                        "an empty string as the prediction instead."
+                    )
+                    completions[idx] = ""
+                else:
+                    log_once(
+                        f"The model {self.model_config.model_id!r} is a reasoning "
+                        "model, but the generated output does not contain the end of "
+                        f"reasoning token ({self.end_of_reasoning_token!r}). Using "
+                        "an empty string as the prediction instead. Only showing "
+                        "this warning once - see all occurrences if you run with the "
+                        "`verbose` flag.",
+                        level=logging.WARNING,
+                    )
+                    completions[idx] = ""
         stop_token_pattern = re.compile(
             "|".join(re.escape(stop_token) for stop_token in stop_tokens)
         )

@@ -830,21 +853,27 @@ def load_model_and_tokeniser(
         adapter_base_model_id=model_config.adapter_base_model_id,
         trust_remote_code=benchmark_config.trust_remote_code,
         model_max_length=true_max_model_len,
-
+        model_config=model_config,
         token=get_hf_token(api_key=benchmark_config.api_key),
     )
+    vllm_tokenisation_params = get_vllm_tokenisation_params(
+        tokeniser=tokeniser, model_config=model_config
+    )

     clear_vllm()

-    # if we do not have an internet connection we need to give the path to the folder
-    # that contains the model weights and config files, otherwise vLLM will try to
-    # download them regardless if they are already present in the download_dir
-    model_path = resolve_model_path(download_dir)
-
     try:
         model = LLM(
-            model=
-
+            model=(
+                model_id
+                if internet_connection_available()
+                else resolve_model_path(download_dir=download_dir)
+            ),
+            tokenizer=(
+                model_id
+                if internet_connection_available()
+                else resolve_model_path(download_dir=download_dir)
+            ),
             gpu_memory_utilization=benchmark_config.gpu_memory_utilization,
             max_model_len=min(true_max_model_len, MAX_CONTEXT_LENGTH),
             download_dir=download_dir,

@@ -862,16 +891,7 @@ def load_model_and_tokeniser(
             enable_prefix_caching=False,
             enable_lora=model_config.adapter_base_model_id is not None,
             max_lora_rank=256,
-
-            tokenizer_mode="mistral"
-            if isinstance(tokeniser, MistralCommonTokenizer)
-            else "auto",
-            config_format="mistral"
-            if isinstance(tokeniser, MistralCommonTokenizer)
-            else "auto",
-            load_format="mistral"
-            if isinstance(tokeniser, MistralCommonTokenizer)
-            else "auto",
+            **vllm_tokenisation_params,
         )
     except (RuntimeError, ValueError, OSError) as e:
         if "awaiting a review from the repo authors" in str(e):

@@ -900,7 +920,7 @@ def load_tokeniser(
     adapter_base_model_id: str | None,
     trust_remote_code: bool,
     model_max_length: int,
-
+    model_config: "ModelConfig",
     token: str | bool,
 ) -> "PreTrainedTokenizer":
     """Load the tokeniser.

@@ -917,8 +937,8 @@
             Whether to trust remote code.
         model_max_length:
             The maximum length of the model.
-
-            The
+        model_config:
+            The model configuration.
         token:
             The Hugging Face API token.


@@ -929,7 +949,7 @@
     config = AutoConfig.from_pretrained(
         adapter_base_model_id or model_id,
         revision=revision,
-        cache_dir=model_cache_dir,
+        cache_dir=model_config.model_cache_dir,
         token=token,
         trust_remote_code=trust_remote_code,
         local_files_only=not internet_connection_available(),

@@ -937,15 +957,25 @@
     num_retries = 5
     for _ in range(num_retries):
         try:
+            # Mistral instruction-tuned models need a custom tokeniser
+            if model_id.startswith("mistralai/") and "base" not in model_id.lower():
+                tokeniser = MistralCommonTokenizer.from_pretrained(
+                    model_id,
+                    padding_side="left",
+                    truncation_side="left",
+                    model_max_length=model_max_length,
+                    token=token,
+                )
+                break
             tokeniser = AutoTokenizer.from_pretrained(
                 model_id,
-                use_fast=True,
+                use_fast=False if model_config.param == "slow-tokenizer" else True,
                 verbose=False,
                 trust_remote_code=trust_remote_code,
                 padding_side="left",
                 truncation_side="left",
                 model_max_length=model_max_length,
-                cache_dir=model_cache_dir,
+                cache_dir=model_config.model_cache_dir,
                 config=config,
                 token=token,
                 local_files_only=not internet_connection_available(),

@@ -1186,3 +1216,41 @@ def get_pbar_without_leave(*tqdm_args, **tqdm_kwargs) -> tqdm:
     """
     tqdm_kwargs.pop("leave", None)  # Remove the 'leave' key if it exists
     return tqdm(*tqdm_args, leave=False, **tqdm_kwargs)
+
+
+def get_vllm_tokenisation_params(
+    tokeniser: "PreTrainedTokenizer", model_config: "ModelConfig"
+) -> dict[str, t.Any]:
+    """Get the tokenisation parameters for vLLM.
+
+    Args:
+        tokeniser:
+            The tokeniser.
+        model_config:
+            The model configuration.
+
+    Returns:
+        A dictionary of tokenisation parameters to pass to vLLM.
+    """
+    if isinstance(tokeniser, MistralCommonTokenizer):
+        tokeniser_mode = "mistral"
+    elif model_config.param == "slow-tokenizer":
+        tokeniser_mode = "slow"
+    else:
+        tokeniser_mode = "auto"
+
+    if isinstance(tokeniser, MistralCommonTokenizer):
+        config_format = "mistral"
+    else:
+        config_format = "auto"
+
+    if isinstance(tokeniser, MistralCommonTokenizer):
+        load_format = "mistral"
+    else:
+        load_format = "auto"
+
+    return dict(
+        tokenizer_mode=tokeniser_mode,
+        config_format=config_format,
+        load_format=load_format,
+    )