EuroEval 16.1.0__py3-none-any.whl → 16.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- euroeval/__init__.py +7 -6
- euroeval/benchmark_config_factory.py +4 -0
- euroeval/benchmark_modules/hf.py +31 -16
- euroeval/benchmark_modules/litellm.py +2 -0
- euroeval/benchmark_modules/vllm.py +24 -9
- euroeval/benchmarker.py +127 -14
- euroeval/cli.py +8 -0
- euroeval/data_models.py +4 -0
- euroeval/generation.py +3 -1
- euroeval/generation_utils.py +10 -4
- euroeval/metrics/base.py +12 -0
- euroeval/metrics/huggingface.py +23 -2
- euroeval/prompt_templates/linguistic_acceptability.py +6 -5
- euroeval/prompt_templates/named_entity_recognition.py +3 -3
- euroeval/prompt_templates/sentiment_classification.py +5 -5
- euroeval/task_group_utils/sequence_classification.py +1 -1
- euroeval/tasks.py +3 -0
- euroeval/tokenisation_utils.py +12 -13
- euroeval/types.py +2 -2
- euroeval/utils.py +77 -5
- {euroeval-16.1.0.dist-info → euroeval-16.2.0.dist-info}/METADATA +31 -7
- {euroeval-16.1.0.dist-info → euroeval-16.2.0.dist-info}/RECORD +25 -25
- {euroeval-16.1.0.dist-info → euroeval-16.2.0.dist-info}/WHEEL +0 -0
- {euroeval-16.1.0.dist-info → euroeval-16.2.0.dist-info}/entry_points.txt +0 -0
- {euroeval-16.1.0.dist-info → euroeval-16.2.0.dist-info}/licenses/LICENSE +0 -0
euroeval/__init__.py
CHANGED
@@ -12,12 +12,13 @@ import warnings
 from termcolor import colored
 
 # Block specific warnings before importing anything else, as they can be noisy
-
-warnings.filterwarnings("ignore", category=
-
-logging.getLogger("
-logging.getLogger("
-
+if os.getenv("FULL_LOG") != "1":
+    warnings.filterwarnings("ignore", category=UserWarning)
+    warnings.filterwarnings("ignore", category=FutureWarning)
+    logging.getLogger("httpx").setLevel(logging.CRITICAL)
+    logging.getLogger("datasets").setLevel(logging.CRITICAL)
+    logging.getLogger("vllm").setLevel(logging.CRITICAL)
+    os.environ["VLLM_CONFIGURE_LOGGING"] = "0"
 
 # Set up logging
 fmt = colored("%(asctime)s", "light_blue") + " ⋅ " + colored("%(message)s", "green")

euroeval/benchmark_config_factory.py
CHANGED

@@ -47,6 +47,7 @@ def build_benchmark_config(
     debug: bool,
     run_with_cli: bool,
     requires_safetensors: bool,
+    download_only: bool,
 ) -> BenchmarkConfig:
     """Create a benchmark configuration.
 
@@ -117,6 +118,8 @@ def build_benchmark_config(
             Whether the benchmark is being run with the CLI.
         requires_safetensors:
            Whether to only allow evaluations of models stored as safetensors.
+        download_only:
+            Whether to only download the requested model weights and datasets.
 
     Returns:
         The benchmark configuration.
@@ -165,6 +168,7 @@ def build_benchmark_config(
         debug=debug,
         run_with_cli=run_with_cli,
         requires_safetensors=requires_safetensors,
+        download_only=download_only,
     )
 
 
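The `FULL_LOG` switch added above is a plain environment-variable check, so a minimal usage sketch (an assumption about intended use, not something stated in this diff) is to set it before `euroeval` is imported; the warning filters and logger silencing in `__init__.py`, and `block_terminal_output()` further down, are then skipped:

```
import os

# Hypothetical opt-out: with FULL_LOG=1, euroeval's import-time warning filters
# and logger silencing are skipped, per the `if os.getenv("FULL_LOG") != "1"`
# guard shown in the diff above.
os.environ["FULL_LOG"] = "1"

import euroeval  # noqa: E402  (imported after setting the environment variable)
```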
euroeval/benchmark_modules/hf.py
CHANGED
@@ -146,21 +146,25 @@ class HuggingFaceEncoderModel(BenchmarkModule):
         Returns:
             The number of parameters in the model.
         """
-
-
-        try:
-            repo_info = hf_api.model_info(
-                repo_id=self.model_config.adapter_base_model_id
-                or self.model_config.model_id,
-                revision=self.model_config.revision,
-            )
-        except (
-            RepositoryNotFoundError,
-            RevisionNotFoundError,
-            RequestException,
-            HFValidationError,
-        ):
+        # No need to try to use the API if we have no internet.
+        if not internet_connection_available():
             repo_info = None
+        else:
+            token = get_hf_token(api_key=self.benchmark_config.api_key)
+            hf_api = HfApi(token=token)
+            try:
+                repo_info = hf_api.model_info(
+                    repo_id=self.model_config.adapter_base_model_id
+                    or self.model_config.model_id,
+                    revision=self.model_config.revision,
+                )
+            except (
+                RepositoryNotFoundError,
+                RevisionNotFoundError,
+                RequestException,
+                HFValidationError,
+            ):
+                repo_info = None
 
         if (
             repo_info is not None
@@ -558,7 +562,7 @@ def load_model_and_tokeniser(
         The benchmark configuration
 
     Returns:
-
+        A pair (model, tokeniser), with the loaded model and tokeniser
     """
     config: "PretrainedConfig"
     block_terminal_output()
@@ -686,6 +690,7 @@ def load_model_and_tokeniser(
         model=model,
         model_id=model_id,
         trust_remote_code=benchmark_config.trust_remote_code,
+        model_cache_dir=model_config.model_cache_dir,
     )
 
     return model, tokeniser
@@ -722,6 +727,11 @@ def get_model_repo_info(
     ):
         model_info = HfApiModelInfo(id=model_id, tags=None, pipeline_tag=None)
 
+    # If we have not internet, and the model_id is not a directory for a local model
+    # we also just create a dummy model info object.
+    elif not internet_connection_available():
+        model_info = HfApiModelInfo(id=model_id, tags=None, pipeline_tag=None)
+
     # If the model does not exist locally, then we get the model info from the Hugging
     # Face Hub, if possible
     if model_info is None:
@@ -867,7 +877,10 @@ def get_model_repo_info(
 
 
 def load_tokeniser(
-    model: "PreTrainedModel | None",
+    model: "PreTrainedModel | None",
+    model_id: str,
+    trust_remote_code: bool,
+    model_cache_dir: str,
 ) -> "PreTrainedTokenizer":
     """Load the tokeniser.
 
@@ -889,6 +902,7 @@ def load_tokeniser(
         trust_remote_code=trust_remote_code,
         padding_side="right",
         truncation_side="right",
+        cache_dir=model_cache_dir,
     )
 
     # If the model is a subclass of a certain model types then we have to add a prefix
@@ -999,6 +1013,7 @@ def load_hf_model_config(
         token=get_hf_token(api_key=api_key),
         trust_remote_code=trust_remote_code,
         cache_dir=model_cache_dir,
+        local_files_only=not internet_connection_available(),
     )
     if config.eos_token_id is not None and config.pad_token_id is None:
         if isinstance(config.eos_token_id, list):
euroeval/benchmark_modules/litellm.py
CHANGED

@@ -984,6 +984,7 @@ class LiteLLMModel(BenchmarkModule):
             model=None,
             model_id=model_id,
             trust_remote_code=self.benchmark_config.trust_remote_code,
+            model_cache_dir=self.model_config.model_cache_dir,
         )
 
         if (
@@ -1066,6 +1067,7 @@ class LiteLLMModel(BenchmarkModule):
             model=None,
             model_id=model_id,
             trust_remote_code=self.benchmark_config.trust_remote_code,
+            model_cache_dir=self.model_config.model_cache_dir,
         )
 
         all_max_lengths: list[int] = list()
euroeval/benchmark_modules/vllm.py
CHANGED

@@ -72,7 +72,9 @@ from ..utils import (
     create_model_cache_dir,
     get_hf_token,
     get_min_cuda_compute_capability,
+    internet_connection_available,
     log_once,
+    resolve_model_path,
     split_model_id,
 )
 from .hf import HuggingFaceEncoderModel, get_model_repo_info, load_hf_model_config
@@ -146,7 +148,7 @@ class VLLMModel(HuggingFaceEncoderModel):
         )
 
         self.end_of_reasoning_token = get_end_of_reasoning_token(
-            model=self._model, tokeniser=self._tokeniser,
+            model=self._model, tokeniser=self._tokeniser, model_config=model_config
         )
         self.end_of_chat_token_ids = get_end_of_chat_token_ids(
             tokeniser=self._tokeniser, generative_type=self.generative_type
@@ -834,10 +836,15 @@ def load_model_and_tokeniser(
 
     clear_vllm()
 
+    # if we do not have an internet connection we need to give the path to the folder
+    # that contains the model weights and config files, otherwise vLLM will try to
+    # download them regardless if they are already present in the download_dir
+    model_path = resolve_model_path(download_dir)
+
     try:
         model = LLM(
-            model=model_id,
-            tokenizer=model_id,
+            model=model_id if internet_connection_available() else model_path,
+            tokenizer=model_id if internet_connection_available() else model_path,
             gpu_memory_utilization=benchmark_config.gpu_memory_utilization,
             max_model_len=min(true_max_model_len, MAX_CONTEXT_LENGTH),
             download_dir=download_dir,
@@ -925,6 +932,7 @@ def load_tokeniser(
         cache_dir=model_cache_dir,
         token=token,
         trust_remote_code=trust_remote_code,
+        local_files_only=not internet_connection_available(),
     )
     num_retries = 5
     for _ in range(num_retries):
@@ -937,8 +945,10 @@ def load_tokeniser(
                 padding_side="left",
                 truncation_side="left",
                 model_max_length=model_max_length,
+                cache_dir=model_cache_dir,
                 config=config,
                 token=token,
+                local_files_only=not internet_connection_available(),
             )
             break
         except (json.JSONDecodeError, OSError, TypeError) as e:
@@ -996,7 +1006,7 @@ def clear_vllm() -> None:
 
 
 def get_end_of_reasoning_token(
-    model: "LLM", tokeniser: "PreTrainedTokenizer",
+    model: "LLM", tokeniser: "PreTrainedTokenizer", model_config: "ModelConfig"
 ) -> str | None:
     """Get the end-of-reasoning token for a generative model.
 
@@ -1005,21 +1015,26 @@ def get_end_of_reasoning_token(
             The vLLM model.
         tokeniser:
             The tokeniser.
-
-            The model
+        model_config:
+            The model configuration.
 
     Returns:
         The end of reasoning token, or None if it could not be found.
     """
+    model_id = model_config.model_id
+
     # Create a prompt to check if the model uses the reasoning tokens
     prompt = "What is your name?"
     if has_chat_template(tokeniser=tokeniser):
+        extra_kwargs = dict()
+        if model_config.param in {"thinking", "no-thinking"}:
+            extra_kwargs["enable_thinking"] = model_config.param == "thinking"
         templated_prompt = apply_chat_template(
             conversation=[dict(role="user", content=prompt)],
             tokeniser=tokeniser,
            tokenise=False,
            add_generation_prompt=True,
-
+            **extra_kwargs,
        )
        assert isinstance(templated_prompt, str)
        prompt = templated_prompt
@@ -1042,8 +1057,8 @@ def get_end_of_reasoning_token(
     if not bor_reasoning_matches:
         log_once(
             f"The model {model_id!r} did not generate any beginning-of-reasoning "
-            "tokens in the prompt or the completion. Assuming the model is not "
-            "
+            "tokens in the prompt or the completion. Assuming the model is not a "
+            "reasoning model.",
             level=logging.DEBUG,
         )
         return None
euroeval/benchmarker.py
CHANGED
@@ -16,7 +16,7 @@ from torch.distributed import destroy_process_group
 
 from .benchmark_config_factory import build_benchmark_config
 from .constants import GENERATIVE_PIPELINE_TAGS
-from .data_loading import load_data
+from .data_loading import load_data, load_raw_data
 from .data_models import BenchmarkConfigParams, BenchmarkResult
 from .dataset_configs import get_all_dataset_configs
 from .enums import Device, GenerativeType, ModelType
@@ -28,7 +28,12 @@ from .model_loading import load_model
 from .scores import log_scores
 from .speed_benchmark import benchmark_speed
 from .tasks import SPEED
-from .utils import
+from .utils import (
+    enforce_reproducibility,
+    get_package_version,
+    internet_connection_available,
+    log_once,
+)
 
 if t.TYPE_CHECKING:
     from .benchmark_modules import BenchmarkModule
@@ -83,6 +88,7 @@ class Benchmarker:
         debug: bool = False,
         run_with_cli: bool = False,
         requires_safetensors: bool = False,
+        download_only: bool = False,
     ) -> None:
         """Initialise the benchmarker.
 
@@ -164,14 +170,26 @@ class Benchmarker:
             requires_safetensors:
                 Whether to only allow models that use the safetensors format. Defaults
                 to False.
+            download_only:
+                Whether to only download models and datasets without performing any
+                benchmarking. Defaults to False.
 
         Raises:
             ValueError:
-                If both `task` and `dataset` are specified
+                If both `task` and `dataset` are specified, or if `download_only`
+                is True and we have no internet connection.
         """
         if task is not None and dataset is not None:
             raise ValueError("Only one of `task` and `dataset` can be specified.")
 
+        if not internet_connection_available() and download_only:
+            msg = "It appears you do not have an internet connection, but "
+            if run_with_cli:
+                msg += "the --download-only flag was set."
+            else:
+                msg += "the argument `download_only` was set to True."
+            raise ValueError(msg)
+
         # Bail early if hf_transfer is enabled but not installed.
         if HF_HUB_ENABLE_HF_TRANSFER and get_package_version("hf_transfer") is None:
             raise ImportError(
@@ -222,17 +240,82 @@ class Benchmarker:
 
     @property
     def benchmark_results(self) -> list[BenchmarkResult]:
-        """The benchmark results.
+        """The benchmark results.
+
+        Returns:
+            A list of benchmark results.
+
+        Raises:
+            ValueError:
+                If there is an error decoding a line in the results file.
+        """
         if self.results_path.exists():
+            benchmark_results: list[BenchmarkResult] = list()
             with self.results_path.open() as f:
-
-
-
-
-
+                for line in f:
+                    if line.strip():
+                        try:
+                            result_dict = json.loads(line.strip())
+                        except json.JSONDecodeError as e:
+                            raise ValueError(
+                                f"Error decoding JSON line: {line.strip()}"
+                            ) from e
+
+                        # Fix for older records
+                        has_old_raw_results = (
+                            "results" in result_dict
+                            and isinstance(result_dict["results"], dict)
+                            and "raw" in result_dict["results"]
+                            and isinstance(result_dict["results"]["raw"], dict)
+                            and "test" in result_dict["results"]["raw"]
+                        )
+                        if has_old_raw_results:
+                            result_dict["results"]["raw"] = result_dict["results"][
+                                "raw"
+                            ]["test"]
+
+                        result = BenchmarkResult.from_dict(result_dict)
+                        benchmark_results.append(result)
+            return benchmark_results
         else:
             return list()
 
+    def _download(
+        self,
+        dataset_config: "DatasetConfig",
+        model_config: "ModelConfig",
+        benchmark_config: "BenchmarkConfig",
+    ) -> None:
+        """Download data, metrics, and model for the given dataset, and model.
+
+        Args:
+            dataset_config: The configuration for the dataset.
+            model_config: The configuration for the model.
+            benchmark_config: The configuration for the benchmark.
+        """
+        log_once(f"Loading data for {dataset_config.pretty_name}", level=logging.INFO)
+        dataset = load_raw_data(
+            dataset_config=dataset_config, cache_dir=benchmark_config.cache_dir
+        )
+        del dataset
+
+        log_once(f"Loading model {model_config.model_id}", level=logging.INFO)
+        model = load_model(
+            model_config=model_config,
+            dataset_config=dataset_config,
+            benchmark_config=benchmark_config,
+        )
+        del model
+
+        log_once(
+            f"Loading metrics for the '{dataset_config.task.name}' task",
+            level=logging.INFO,
+        )
+        for metric_name in dataset_config.task.metrics:
+            log_once(f"Loading metric {metric_name.name}", level=logging.DEBUG)
+            metric = metric_name.download(cache_dir=benchmark_config.cache_dir)
+            del metric
+
     def benchmark(
         self,
         model: list[str] | str,
@@ -336,6 +419,9 @@ class Benchmarker:
             requires_safetensors:
                 Whether to only allow models that use the safetensors format. Defaults
                 to the value specified when initialising the benchmarker.
+            download_only:
+                Whether to only download the models without evaluating them. Defaults
+                to the value specified when initialising the benchmarker.
 
         Returns:
             A list of benchmark results.
@@ -395,6 +481,28 @@ class Benchmarker:
                 num_finished_benchmarks += len(dataset_configs)
                 continue
 
+            if model_config.adapter_base_model_id:
+                open_issue_msg = (
+                    "If offline support is important to you, please "
+                    "consider opening an issue at https://github.com/EuroEval/EuroEval/issues."
+                )
+                if not internet_connection_available():
+                    raise InvalidModel(
+                        "Offline benchmarking of models with adapters is not currently "
+                        "supported. "
+                        f"An active internet connection is required. {open_issue_msg}"
+                    )
+                elif benchmark_config.download_only:
+                    log_once(
+                        "You are using download only mode with a model that includes "
+                        "an adapter. "
+                        "Please note: Offline benchmarking of adapter models is not "
+                        "currently supported. "
+                        "An internet connection will be required during evaluation. "
+                        f"{open_issue_msg}",
+                        level=logging.WARNING,
+                    )
+
             loaded_model: BenchmarkModule | None = None
             benchmark_params_to_revert: dict[str, t.Any] = dict()
             for dataset_config in dataset_configs:
@@ -645,6 +753,9 @@ class Benchmarker:
             requires_safetensors:
                 Whether to only allow models that use the safetensors format. If None,
                 then this value will not be updated.
+            download_only:
+                Whether to only download the models without evaluating them. If None,
+                then this value will not be updated.
 
         Returns:
             The updated benchmark configuration.
@@ -813,17 +924,19 @@ class Benchmarker:
             model_param=model_config.param,
         )
 
+        model_id_to_be_stored = model_config.model_id
+        if model_config.revision != "main":
+            model_id_to_be_stored += f"@{model_config.revision}"
+        if model_config.param is not None:
+            model_id_to_be_stored += f"#{model_config.param}"
+
         record = BenchmarkResult(
             dataset=dataset_config.name,
             task=dataset_config.task.name,
             dataset_languages=[
                 language.code for language in dataset_config.languages
             ],
-            model=(
-                f"{model_config.model_id}@{model_config.revision}"
-                if model_config.revision and model_config.revision != "main"
-                else model_config.model_id
-            ),
+            model=model_id_to_be_stored,
             results=results,
             num_model_parameters=model.num_params,
             max_sequence_length=model.model_max_length,
euroeval/cli.py
CHANGED
@@ -216,6 +216,12 @@ from .tasks import get_all_tasks
     help="The type of generative model. Only relevant if the model is generative. If "
     "not specified, the type will be inferred automatically.",
 )
+@click.option(
+    "--download-only",
+    is_flag=True,
+    help="Only download the requested model weights and datasets, and exit.",
+    default=False,
+)
 def benchmark(
     model: tuple[str],
     dataset: tuple[str],
@@ -243,6 +249,7 @@ def benchmark(
     debug: bool,
     requires_safetensors: bool,
     generative_type: str | None,
+    download_only: bool,
 ) -> None:
     """Benchmark pretrained language models on language tasks."""
     models = list(model)
@@ -284,6 +291,7 @@ def benchmark(
         debug=debug,
         run_with_cli=True,
         requires_safetensors=requires_safetensors,
+        download_only=download_only,
     )
 
     # Perform the benchmark evaluation
euroeval/data_models.py
CHANGED
@@ -228,6 +228,9 @@ class BenchmarkConfig:
         generative_type:
             The type of generative model to benchmark. Only relevant if the model is
             generative.
+        download_only:
+            Whether to only download the models, metrics and datasets without
+            evaluating.
     """
 
     model_languages: list[Language]
@@ -255,6 +258,7 @@ class BenchmarkConfig:
     run_with_cli: bool
     requires_safetensors: bool
     generative_type: GenerativeType | None
+    download_only: bool
 
 
 class BenchmarkConfigParams(pydantic.BaseModel):
euroeval/generation.py
CHANGED
@@ -243,7 +243,9 @@ def generate_single_iteration(
         ground_truth = []
 
     itr_scores: dict[str, float] = model.compute_metrics(
-        model_outputs_and_labels=(all_preds, ground_truth),
+        model_outputs_and_labels=(all_preds, ground_truth),
+        dataset=dataset,
+        benchmark_config=benchmark_config,
     )
 
     return itr_scores
euroeval/generation_utils.py
CHANGED
@@ -202,7 +202,7 @@ def apply_prompt(
     """
     # Sanity check
     if (
-        generative_type
+        generative_type in {GenerativeType.INSTRUCTION_TUNED, GenerativeType.REASONING}
         and always_populate_text_field
         and tokeniser is None
     ):
@@ -229,7 +229,10 @@ def apply_prompt(
         )
         label_mapping = dataset_config.prompt_label_mapping
         label = label_mapping.get(label, label)
-        if generative_type
+        if generative_type in {
+            GenerativeType.INSTRUCTION_TUNED,
+            GenerativeType.REASONING,
+        }:
             prompt = dataset_config.instruction_prompt.format(**kwargs)
             return prompt, label
         else:
@@ -355,7 +358,7 @@ def apply_prompt(
             f"Unsupported task group: {dataset_config.task.task_group}."
         )
 
-    if generative_type
+    if generative_type in {GenerativeType.INSTRUCTION_TUNED, GenerativeType.REASONING}:
         few_shot_messages = [
             dict(role=role, content=content)
             for prompt, label in few_shot_sections
@@ -408,7 +411,10 @@ def apply_prompt(
     else:
         prompt_prefix = ""
         if dataset_config.prompt_prefix:
-
+            labels_str = dataset_config.get_labels_str()
+            prompt_prefix = (
+                dataset_config.prompt_prefix.format(labels_str=labels_str) + "\n\n"
+            )
 
         few_shot_prompt = "\n\n".join([prompt for prompt, _ in few_shot_sections])
         if few_shot_prompt:
euroeval/metrics/base.py
CHANGED
@@ -42,6 +42,18 @@ class Metric(abc.ABC):
             else lambda x: (100 * x, f"{x:.2%}")
         )
 
+    def download(self, cache_dir: str) -> "Metric":
+        """Initiates the download of the metric if needed.
+
+        Args:
+            cache_dir:
+                The directory where the metric will be downloaded to.
+
+        Returns:
+            The metric object itself.
+        """
+        return self
+
     @abc.abstractmethod
     def __call__(
         self,
euroeval/metrics/huggingface.py
CHANGED
@@ -3,9 +3,11 @@
 import collections.abc as c
 import logging
 import typing as t
+from pathlib import Path
 
 import evaluate
 import numpy as np
+from datasets import DownloadConfig
 
 from ..utils import HiddenPrints
 from .base import Metric
@@ -76,6 +78,23 @@ class HuggingFaceMetric(Metric):
         )
         self.metric: "EvaluationModule | None" = None
 
+    def download(self, cache_dir: str) -> "HuggingFaceMetric":
+        """Initiates the download of the metric if needed.
+
+        Args:
+            cache_dir:
+                The directory where the metric will be downloaded to.
+
+        Returns:
+            The metric object itself.
+        """
+        # Annoying but needed to make the metric download to a different cache dir
+        download_config = DownloadConfig(cache_dir=Path(cache_dir, "evaluate"))
+        self.metric = evaluate.load(
+            path=self.huggingface_id, download_config=download_config
+        )
+        return self
+
     def __call__(
         self,
         predictions: c.Sequence,
@@ -103,7 +122,9 @@ class HuggingFaceMetric(Metric):
             The calculated metric score, or None if the score should be ignored.
         """
         if self.metric is None:
-            self.
+            self.download(cache_dir=benchmark_config.cache_dir)
+
+        assert self.metric is not None
 
         with HiddenPrints():
             results = self.metric.compute(
@@ -176,7 +197,7 @@ bert_score_metric = HuggingFaceMetric(
     huggingface_id="bertscore",
     results_key="f1",
     compute_kwargs=dict(
-        model_type="microsoft/mdeberta-v3-base", device="
+        model_type="microsoft/mdeberta-v3-base", device="cpu", batch_size=16
     ),
 )
 
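The new `download()` hook can also be called directly; here is a small sketch (the cache directory is a made-up path, and calling the hook by hand is an assumption about usage rather than something this diff documents):

```
from euroeval.metrics.huggingface import bert_score_metric

# Pre-fetch the underlying `evaluate` module into <cache_dir>/evaluate so that a
# later offline run can compute the metric without hitting the network.
bert_score_metric.download(cache_dir=".euroeval_cache")
```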
euroeval/prompt_templates/linguistic_acceptability.py
CHANGED

@@ -97,7 +97,7 @@ LA_TEMPLATES: dict["Language", PromptConfig] = {
         default_prompt_prefix="Hetta eru nakrir setningar og um teir eru mállæruliga "
         "rættir.",
         default_prompt_template="Setningur: {text}\nMállæruliga rættur: {label}",
-        default_instruction_prompt="Setningur: {text}\n\
+        default_instruction_prompt="Setningur: {text}\n\nGreindu hvort setningurin er "
         "mállæruliga rættur ella ikki. Svara við {labels_str}, og einki annað.",
     ),
     FR: PromptConfig(
@@ -111,11 +111,12 @@ LA_TEMPLATES: dict["Language", PromptConfig] = {
     ),
     IS: PromptConfig(
         default_prompt_label_mapping=dict(correct="já", incorrect="nei"),
-        default_prompt_prefix="
-        "málfræðilega réttar.",
+        default_prompt_prefix="Hér fyrir neðan eru setningar ásamt mati á því hvort "
+        "þær eru málfræðilega réttar.",
         default_prompt_template="Setning: {text}\nMálfræðilega rétt: {label}",
-        default_instruction_prompt="Setning: {text}\n\
-        "málfræðilega rétt
+        default_instruction_prompt="Setning: {text}\n\nGreindu hvort setningin er "
+        "málfræðilega rétt. Svaraðu með 'já' ef setningin er rétt og 'nei' ef hún "
+        "er það ekki.",
     ),
     IT: PromptConfig(
         default_prompt_label_mapping=dict(correct="si", incorrect="no"),
euroeval/prompt_templates/named_entity_recognition.py
CHANGED

@@ -176,7 +176,7 @@ NER_TEMPLATES: dict["Language", PromptConfig] = {
         default_prompt_prefix="Her eru nakrir setningar og nakrar JSON orðabøkur við "
         "nevndar eindir, sum eru í setningunum.",
         default_prompt_template="Setningur: {text}\nNevndar eindir: {label}",
-        default_instruction_prompt="Setningur: {text}\n\
+        default_instruction_prompt="Setningur: {text}\n\nGreindu nevndu einingarnar í "
         "setningunni. Þú ættir að skila þessu sem JSON orðabók með lyklunum "
         "{labels_str}. Gildin ættu að vera listi yfir nevndu einingarnar af "
         "þeirri gerð, nákvæmlega eins og þær koma fram í setningunni.",
@@ -215,8 +215,8 @@ NER_TEMPLATES: dict["Language", PromptConfig] = {
         },
         default_prompt_prefix="Eftirfarandi eru setningar ásamt JSON lyklum með "
         "nefndum einingum sem koma fyrir í setningunum.",
-        default_prompt_template="Setning: {text}\
-        default_instruction_prompt="Setning: {text}\n\
+        default_prompt_template="Setning: {text}\nNafneiningar: {label}",
+        default_instruction_prompt="Setning: {text}\n\nGreindu nefndu einingarnar í "
         "setningunni. Þú ættir að skila þessu sem JSON orðabók með lyklunum "
         "{labels_str}. Gildin ættu að vera listi yfir nefndu "
         "einingarnar af þeirri gerð, nákvæmlega eins og þær koma fram í "
euroeval/prompt_templates/sentiment_classification.py
CHANGED

@@ -137,11 +137,11 @@ SENT_TEMPLATES: dict["Language", PromptConfig] = {
         default_prompt_label_mapping=dict(
             positive="jákvætt", neutral="hlutlaust", negative="neikvætt"
         ),
-        default_prompt_prefix="
-        "verið
-        default_prompt_template="
-        default_instruction_prompt="
-        "Svaraðu með {labels_str}, og ekkert annað.",
+        default_prompt_prefix="Hér fyrir neðan eru textabrot ásamt lyndisgildi þeirra "
+        "sem getur verið 'jákvætt', 'hlutlaust' eða 'neikvætt'.",
+        default_prompt_template="Textabrot: {text}\nViðhorf: {label}",
+        default_instruction_prompt="Textabrot: {text}\n\nGreindu lyndið í "
+        "textabrotinu. Svaraðu með {labels_str}, og ekkert annað.",
     ),
     IT: PromptConfig(
         default_prompt_label_mapping=dict(
euroeval/task_group_utils/sequence_classification.py
CHANGED

@@ -198,7 +198,7 @@ def extract_labels_from_generation(
     # If no candidate labels were found, we either pick the label with the smallest
     # word edit distance to the predicted label (if invalid model outputs are
     # allowed), or we raise an error
-    if min(edit_distances)
+    if min(edit_distances) >= 1000:
         if dataset_config.allow_invalid_model_outputs:
             logger.warning(
                 "No candidate labels found for the predicted label "
euroeval/tasks.py
CHANGED
@@ -100,6 +100,7 @@ KNOW = Task(
     default_num_few_shot_examples=5,
     default_max_generated_tokens=NUM_GENERATION_TOKENS_FOR_CLASSIFICATION,
     default_labels=["a", "b", "c", "d"],
+    default_allowed_model_types=[ModelType.GENERATIVE],
     uses_logprobs=True,
 )
 
@@ -112,6 +113,7 @@ MCRC = Task(
     default_num_few_shot_examples=5,
     default_max_generated_tokens=NUM_GENERATION_TOKENS_FOR_CLASSIFICATION,
     default_labels=["a", "b", "c", "d"],
+    default_allowed_model_types=[ModelType.GENERATIVE],
     uses_logprobs=True,
 )
 
@@ -124,6 +126,7 @@ COMMON_SENSE = Task(
     default_num_few_shot_examples=5,
     default_max_generated_tokens=NUM_GENERATION_TOKENS_FOR_CLASSIFICATION,
     default_labels=["a", "b", "c", "d"],
+    default_allowed_model_types=[ModelType.GENERATIVE],
     uses_logprobs=True,
 )
 
euroeval/tokenisation_utils.py
CHANGED
@@ -339,13 +339,18 @@ def get_end_of_chat_token_ids(
         return None
 
     user_message: dict[str, str] = dict(role="user", content="X")
-
-
-
-
-
-
-
+    try:
+        token_ids = apply_chat_template(
+            conversation=[user_message],
+            tokeniser=tokeniser,
+            tokenise=True,
+            add_generation_prompt=False,
+            enable_thinking=generative_type == GenerativeType.REASONING,
+        )
+    except InvalidModel as e:
+        if "does not have a chat template" in str(e):
+            return None
+        raise e
     assert isinstance(token_ids, list)
 
     for idx, token in enumerate(tokeniser.convert_ids_to_tokens(token_ids)):
@@ -546,7 +551,6 @@ def apply_chat_template(
     tokeniser: "PreTrainedTokenizer",
     tokenise: bool,
     add_generation_prompt: bool,
-    enable_thinking: bool,
     **extra_kwargs,
 ) -> str | list[int]:
     """Apply the chat template to a prompt.
@@ -563,10 +567,6 @@ def apply_chat_template(
             Whether to add a generation prompt at the end of the conversation. This is
             only relevant for regular Hugging Face tokenisers, as Mistral tokenisers
             always add a generation prompt.
-        enable_thinking:
-            Whether to enable special handling for reasoning models, such as adding
-            special tokens for thinking. This is only relevant for regular Hugging
-            Face tokenisers, as Mistral tokenisers always handle reasoning models.
         **extra_kwargs:
             Extra keyword arguments to pass to the tokeniser's `apply_chat_template`
             method. Only relevant for regular Hugging Face tokenisers.
@@ -596,7 +596,6 @@ def apply_chat_template(
         conversation=conversation,
         add_generation_prompt=add_generation_prompt,
         tokenize=tokenise,
-        enable_thinking=enable_thinking,
         **extra_kwargs,
     )
     return templated_prompt
euroeval/types.py
CHANGED
@@ -8,8 +8,7 @@ if t.TYPE_CHECKING:
     from datasets.arrow_dataset import Dataset
     from numpy.typing import NDArray
 
-    from .data_models import GenerativeModelOutput
-
+    from .data_models import BenchmarkConfig, GenerativeModelOutput
 
 ScoreDict: t.TypeAlias = dict[str, dict[str, float] | list[dict[str, float]]]
 Predictions: t.TypeAlias = "NDArray | list[str] | list[list[str]]"
@@ -27,6 +26,7 @@ class ComputeMetricsFunction(t.Protocol):
             "NDArray | list[str] | list[list[str]]",
         ],
         dataset: "Dataset",
+        benchmark_config: "BenchmarkConfig",
     ) -> dict[str, float]:
         """Compute the metrics.
 
euroeval/utils.py
CHANGED
@@ -8,6 +8,7 @@ import logging
 import os
 import random
 import re
+import socket
 import sys
 import typing as t
 import warnings
@@ -18,10 +19,8 @@ import demjson3
 import huggingface_hub as hf_hub
 import litellm
 import numpy as np
-import requests
 import torch
 from datasets.utils import disable_progress_bar
-from requests.exceptions import RequestException
 from transformers import logging as tf_logging
 
 from .exceptions import InvalidBenchmark, InvalidModel, NaNValueInModelOutput
@@ -54,6 +53,68 @@ def create_model_cache_dir(cache_dir: str, model_id: str) -> str:
     return str(cache_dir_path)
 
 
+def resolve_model_path(download_dir: str) -> str:
+    """Resolve the path to the directory containing the model config files and weights.
+
+    Args:
+        download_dir:
+            The download directory
+
+    Returns:
+        The path to the model.
+    """
+    model_path = Path(download_dir)
+    # Get the 'path safe' version of the model id, which is the last dir in the path
+    model_id_path = model_path.name
+    # Hf hub `cache_dir` puts the files in models--`model_id_path`/snapshots
+    model_path = model_path / f"models--{model_id_path}" / "snapshots"
+    if not model_path.exists():
+        raise InvalidModel(
+            f"Attempted to load models from the {model_path} directory, "
+            "but it does not exist."
+        )
+
+    # Get all files in the model path
+    found_files = [
+        found_file for found_file in model_path.rglob("*") if found_file.is_file()
+    ]
+    if not found_files:
+        raise InvalidModel(f"No model files found at {model_path}")
+
+    # Make sure that there arent multiples of the files found
+    if len(found_files) == len(set(found_files)):
+        raise InvalidModel(
+            f"Found multiple model config files for {model_id_path.strip('models--')}"
+            f"at {model_path}"
+        )
+
+    # Check that found_files contains at least a 'config.json'
+    config_file = next(
+        (file for file in found_files if file.name == "config.json"), None
+    )
+    if config_file is None:
+        raise InvalidModel(
+            f"Missing required file 'config.json' for {model_id_path.strip('models--')}"
+            f"at {model_path}"
+        )
+    model_path = config_file.parent
+
+    # As a precaution we also check that all of the files are in the same directory
+    # if not we create a new dir with symlinks to all of the files from all snapshots
+    # this is especially useful for vllm where we can only specify one folder and e.g.,
+    # the safetensors version of the weights was added in an unmerged PR
+    if not all(
+        [found_file.parent == found_files[0].parent for found_file in found_files]
+    ):
+        new_model_path = model_path.parent / "model_files"
+        new_model_path.mkdir(exist_ok=True)
+        for found_file in found_files:
+            Path(new_model_path / found_file.name).symlink_to(found_file)
+        model_path = new_model_path
+
+    return str(model_path)
+
+
 def clear_memory() -> None:
     """Clears the memory of unused items."""
     for gc_generation in range(3):
@@ -91,6 +152,9 @@ def block_terminal_output() -> None:
     libraries, disabled tokeniser progress bars when using Hugging Face tokenisers, and
     disables most of the logging from the `transformers` library.
     """
+    if os.getenv("FULL_LOG") == "1":
+        return
+
     # Ignore miscellaneous warnings
     warnings.filterwarnings("ignore", category=UserWarning)
     warnings.filterwarnings("ignore", category=FutureWarning)
@@ -196,6 +260,7 @@ def get_min_cuda_compute_capability() -> float | None:
     return float(f"{major}.{minor}")
 
 
+@cache
 def internet_connection_available() -> bool:
     """Checks if internet connection is available by pinging google.com.
 
@@ -203,10 +268,17 @@ def internet_connection_available() -> bool:
         Whether or not internet connection is available.
     """
     try:
-
+        s = socket.create_connection(("1.1.1.1", 80))
+        s.close()
         return True
-
-
+    # a bit ugly but we dont want to actually import the pytest-socket exceptions
+    # we catch all exceptions and check if the name matches any known errors
+    except Exception as e:
+        pytest_socket_errors = ["SocketConnectBlockedError", "SocketBlockedError"]
+        if type(e).__name__ in pytest_socket_errors or isinstance(e, OSError):
+            return False
+        else:
+            raise e
 
 
 class HiddenPrints:
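For orientation, the directory layout that `resolve_model_path` walks is the per-model download directory laid out in the Hugging Face Hub cache convention; the concrete names below are illustrative assumptions, not paths taken from this release:

```
from euroeval.utils import resolve_model_path

# Assumed layout under the per-model download directory (its last path component
# is the path-safe model id), following the Hub cache convention used above:
#
#   <download_dir>/models--<path-safe-model-id>/snapshots/<revision>/config.json
#   <download_dir>/models--<path-safe-model-id>/snapshots/<revision>/*.safetensors
#
# Given such a directory, the helper returns the snapshot folder that contains
# config.json (or a symlinked folder when files span several snapshots).
local_model_dir = resolve_model_path("/data/euroeval/model_cache_dir/org--model")
```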
{euroeval-16.1.0.dist-info → euroeval-16.2.0.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: EuroEval
-Version: 16.1.0
+Version: 16.2.0
 Summary: The robust European language model benchmark.
 Project-URL: Repository, https://github.com/EuroEval/EuroEval
 Project-URL: Issues, https://github.com/EuroEval/EuroEval/issues
@@ -61,13 +61,13 @@ Requires-Dist: transformers[mistral-common]>=4.56.0
 Provides-Extra: all
 Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == 'all'
 Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'all'
-Requires-Dist:
-Requires-Dist: vllm>=0.10.1; (platform_system == 'Linux') and extra == 'all'
+Requires-Dist: timm>=1.0.19; extra == 'all'
+Requires-Dist: vllm[flashinfer]>=0.10.1; (platform_system == 'Linux') and extra == 'all'
 Provides-Extra: generative
 Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == 'generative'
 Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'generative'
-Requires-Dist:
-Requires-Dist: vllm>=0.10.1; (platform_system == 'Linux') and extra == 'generative'
+Requires-Dist: timm>=1.0.19; extra == 'generative'
+Requires-Dist: vllm[flashinfer]>=0.10.1; (platform_system == 'Linux') and extra == 'generative'
 Description-Content-Type: text/markdown
 
 <div align='center'>
@@ -152,13 +152,13 @@ model:
 ```
 >>> from euroeval import Benchmarker
 >>> benchmark = Benchmarker()
->>> benchmark(model="<model>")
+>>> benchmark(model="<model-id>")
 ```
 
 To benchmark on a specific task and/or language, you simply specify the `task` or
 `language` arguments, shown here with same example as above:
 ```
->>> benchmark(model="<model>", task="sentiment-classification", language="da")
+>>> benchmark(model="<model-id>", task="sentiment-classification", language="da")
 ```
 
 If you want to benchmark a subset of all the models on the Hugging Face Hub, you can
@@ -168,6 +168,30 @@ models on the Danish sentiment classification task:
 >>> benchmark(task="sentiment-classification", language="da")
 ```
 
+### Benchmarking in an Offline Environment
+If you need to benchmark in an offline environment, you need to download the models,
+datasets and metrics beforehand. This can be done by adding the `--download-only`
+argument, from the command line, or the `download_only` argument, if benchmarking from a
+script. For example to download the model you want and all of the Danish sentiment
+classification datasets:
+```
+$ euroeval --model <model-id> --task sentiment-classification --language da --download-only
+```
+
+Or from a script:
+```
+>>> benchmark(
+...     model="<model-id>",
+...     task="sentiment-classification",
+...     language="da",
+...     download_only=True,
+... )
+```
+
+Please note: Offline benchmarking of adapter models is not currently supported. An
+internet connection will be required during evaluation. If offline support is important
+to you, please consider [opening an issue](https://github.com/EuroEval/EuroEval/issues).
+
 ### Benchmarking from Docker
 A Dockerfile is provided in the repo, which can be downloaded and run, without needing
 to clone the repo and installing from source. This can be fetched programmatically by
{euroeval-16.1.0.dist-info → euroeval-16.2.0.dist-info}/RECORD
CHANGED

@@ -1,32 +1,32 @@
-euroeval/__init__.py,sha256=
-euroeval/benchmark_config_factory.py,sha256=
-euroeval/benchmarker.py,sha256=
+euroeval/__init__.py,sha256=mXTjuGrEE-1fIS9x28oJKg-gNGt4q7y2E74l330KEmY,3787
+euroeval/benchmark_config_factory.py,sha256=NcdxQkGrstsprdz1QW3XrgS8B65uEP5SqxFJoL8zEEk,11831
+euroeval/benchmarker.py,sha256=I82iVGwlRJ9BQ02u_bt5ngN-ZzWEJT2ReCrqXgh6lx4,55285
 euroeval/callbacks.py,sha256=5BTlDvBJ60xRvj01EpXZSZu3MFdKa3LgVuhxoLb3i3E,2565
-euroeval/cli.py,sha256=
+euroeval/cli.py,sha256=GOAWzdtasJfOvTuVQszu-T1T9GfQ_un-blOICO-y7g4,9316
 euroeval/constants.py,sha256=NN7kcwQdlDyyGFSrLjsL_qKVRyoRqZ9sKO5SjlgtRwA,2741
 euroeval/data_loading.py,sha256=F3fHyR7FoS_a1dx_DyqtcxdB-jxWwE3RCNRvWcp5z1c,4527
-euroeval/data_models.py,sha256=
+euroeval/data_models.py,sha256=LNioJFW231RSSKZx7WIs46Xxs0KWgb7ElRyyULHSEzQ,27742
 euroeval/enums.py,sha256=SeFek-Lre2Q5sxbP5svqjDZFZR2vlJhg9dkRH4JvU1g,3436
 euroeval/exceptions.py,sha256=5kQ-YvHyFO3aaA-zfOTaS07LRFH8xlSqlOiATvnIObY,5116
 euroeval/finetuning.py,sha256=G86pxxjOAgtcEWpyYDwYOV9pM7WG2Uu9fu7GdDso8dI,11426
-euroeval/generation.py,sha256=
-euroeval/generation_utils.py,sha256=
+euroeval/generation.py,sha256=Va3EOmFzOMBNfI4fh3nW5qhhrM3CBT8_4MaLwVtsF_E,12528
+euroeval/generation_utils.py,sha256=d2_vylWXIeH4xIXgbsI5rN6dMt0zKp0zXExD6aOKWaA,18299
 euroeval/languages.py,sha256=G2cJI8lDT7eOFHxNR9opJ6zWjdxFDwm8P8HY_4WKFI4,33815
 euroeval/model_cache.py,sha256=h61cL_fy2Sd1sqYZis5lAWqvQIfQXXt_v8QZeftKNkg,9226
 euroeval/model_config.py,sha256=64KKHPTrpsFhFAANtBnAKkOs7PWZ50GXkXeDl4jICgs,2748
 euroeval/model_loading.py,sha256=B6dyjYO0Dg7NOcUXls8Sjwe6W0c2UqJ1OGw-RkzoSSQ,2239
 euroeval/scores.py,sha256=HQQqyjdgm853FZ_ifIdnSltKfBhsY7pOITov6F3Et5o,3165
 euroeval/speed_benchmark.py,sha256=3iz_bfJgAoJ9K2HNjufyrBMjHVT8PAjuY_NocBGwKe0,4044
-euroeval/tasks.py,sha256=
-euroeval/tokenisation_utils.py,sha256=
-euroeval/types.py,sha256=
-euroeval/utils.py,sha256=
+euroeval/tasks.py,sha256=EzEWFDo_0ffabBFiRu-mw80jENUioE8D_VEn_Dsv-F8,4703
+euroeval/tokenisation_utils.py,sha256=nLeF2cdZSm5PZiAcDTtxY82nUJ-or8VU8YxYLa167EM,21158
+euroeval/types.py,sha256=_iVy-RwiCGu9TNX2sfyJTdCvXy1akNGTCywAo-YpBqU,2815
+euroeval/utils.py,sha256=DRJW6wtmNpRtuHt03diWo3S5m3rdxoPEQpd-KWi7aGY,19255
 euroeval/benchmark_modules/__init__.py,sha256=TNO-sNDwlXE-LMFXfwwqjQqUy55gywSmwRBcoPUFuaU,236
 euroeval/benchmark_modules/base.py,sha256=mHF8XS6GGUXV-sJtxmI5WJBWPLMHuh-4Z4OWjC25x9Y,11566
 euroeval/benchmark_modules/fresh.py,sha256=TveSQiFBi3xXgCEQBdHwkUQ685PDkKW0y3G5Yt5rkeM,10655
-euroeval/benchmark_modules/hf.py,sha256=
-euroeval/benchmark_modules/litellm.py,sha256=
-euroeval/benchmark_modules/vllm.py,sha256=
+euroeval/benchmark_modules/hf.py,sha256=XmkoDFzaJqnd_5mmUkqCaOgAdRPFs3KZKZZ0cr83TlM,44742
+euroeval/benchmark_modules/litellm.py,sha256=F3udd6NmhQOe3go_7rAcWg7mgZrNQpWWvLe-5U4E2RQ,64771
+euroeval/benchmark_modules/vllm.py,sha256=yLy8TCTnodu4NdTiO7XSdxuHX60AJ1-7p6J3e5h7-iA,43994
 euroeval/dataset_configs/__init__.py,sha256=uuIZmElpJV8iupo5oDj3TeQhBDRANdWpLKYFASLirHA,2046
 euroeval/dataset_configs/danish.py,sha256=QABfgI7m-0-5AimDXegp5ssDSLcM2VrAI_RWsinSZP4,5631
 euroeval/dataset_configs/dutch.py,sha256=63Ro2yFym5MuIDXf5953vUYenw9B0kZSCmZbXjdy4Rs,5517
@@ -45,26 +45,26 @@ euroeval/dataset_configs/portuguese.py,sha256=gQ054SdLQ5fkm4IAP6Mdh5RcPDJPDITcuy
 euroeval/dataset_configs/spanish.py,sha256=DvJlMK6OQg4qmxKzQA2IficlBMB7BafvxqIVuTKiZyw,4902
 euroeval/dataset_configs/swedish.py,sha256=YWHp7hbJ25o36csSg9uXaQCEJK1BPb7u2RQZiCe0lNs,5445
 euroeval/metrics/__init__.py,sha256=qkELjrnBkuO9WzeQJZQRyXpZg_WclUByHswAc6Il7Ns,199
-euroeval/metrics/base.py,sha256=
-euroeval/metrics/huggingface.py,sha256=
+euroeval/metrics/base.py,sha256=HST2XeZrUQZV_vTiieePiaznEov3CIGzuVNIITtLsQc,2596
+euroeval/metrics/huggingface.py,sha256=iHKJnvOXRc_e8sxB2ff3WkfK64jXyn5KEnIxPyfD2fM,6522
 euroeval/metrics/llm_as_a_judge.py,sha256=YCUHWK3_bkMEYvL7Q79ZAK3V0M1m5rq5zJYdtMxa4fs,9686
 euroeval/metrics/pipeline.py,sha256=Wcan3eDWV7t4WRXMPWCCe_JsA-fZnIfZU2ESinbbL2I,10284
 euroeval/metrics/speed.py,sha256=tLna031y0SVzAv6lvXBxf8IOSiw9dvLlonky2zM3MnE,1369
 euroeval/prompt_templates/__init__.py,sha256=HWMZpybxs2xHPnVeJ43893conARahIVLWNXeRhXEGZw,357
-euroeval/prompt_templates/linguistic_acceptability.py,sha256=
+euroeval/prompt_templates/linguistic_acceptability.py,sha256=m23LrckohdnToQDsexdsW_5YyBfGTf5DTjiMI643F9A,8717
 euroeval/prompt_templates/multiple_choice.py,sha256=Q-8-ETqG-RZeLzR8v8WUBIN7djiNSfNpmYnZRUWcd84,6905
-euroeval/prompt_templates/named_entity_recognition.py,sha256=
+euroeval/prompt_templates/named_entity_recognition.py,sha256=HIX9EBkSIBl5JXceFtiZTdvzWr9YHM9-55D6bcjIyQ4,16436
 euroeval/prompt_templates/reading_comprehension.py,sha256=ogzmhiSZO6egrdxxQiWz6a0XMdC0vws-lg5yRKQoYV0,8730
-euroeval/prompt_templates/sentiment_classification.py,sha256=
+euroeval/prompt_templates/sentiment_classification.py,sha256=b3TvH26M77vwFfn577NlGVW881qfV7YSm-Xba_w98Fc,9504
 euroeval/prompt_templates/summarization.py,sha256=4Sqwj6C7yNfqj4FFFCseJMLDoSZ13aIOgY0SjIzzsNo,6593
 euroeval/task_group_utils/__init__.py,sha256=CorGVkixkoEDOQuDsrOGlTmF1zmM0wnGHs8psWTfD28,72
 euroeval/task_group_utils/multiple_choice_classification.py,sha256=i5sidJGAXnENRoB6pOelyaUeGP1qoxwPSzD-F9RLwWk,7106
 euroeval/task_group_utils/question_answering.py,sha256=eUczZntrC9lhCUQlwNQB49i-5Ei12cdRnrfq4pE-T7Y,27750
-euroeval/task_group_utils/sequence_classification.py,sha256=
+euroeval/task_group_utils/sequence_classification.py,sha256=TAqZCoMQ9I-HFhMH35_J1mY2SQg95HUbXcgrBIyhgk0,16082
 euroeval/task_group_utils/text_to_text.py,sha256=7f4hGAs5WNJ9PmW1mLhjDMrPxrYAvw5axXsneiJop1w,4993
 euroeval/task_group_utils/token_classification.py,sha256=Yjai937ia1nZBMOWySqCXr_dA6WiVLGvmb4Hm_TU0Bg,17118
-euroeval-16.1.0.dist-info/METADATA,sha256=
-euroeval-16.1.0.dist-info/WHEEL,sha256=
-euroeval-16.1.0.dist-info/entry_points.txt,sha256=
-euroeval-16.1.0.dist-info/licenses/LICENSE,sha256=
-euroeval-16.1.0.dist-info/RECORD,,
+euroeval-16.2.0.dist-info/METADATA,sha256=GQ1C9avsX8wl0Hcj3wmXvziveGDFWUT2aUrhhjIDzwc,14590
+euroeval-16.2.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+euroeval-16.2.0.dist-info/entry_points.txt,sha256=-mtBu-10bFWeZ2bS32gVK6-s-LNCQLxvnNUPBLd5ud4,87
+euroeval-16.2.0.dist-info/licenses/LICENSE,sha256=guvz_zBHgkQSY_QiUU0Bkc1k-L_PFZuLjIPfuKne2OY,1080
+euroeval-16.2.0.dist-info/RECORD,,
File without changes
|
|
File without changes
|
|
File without changes
|