EuroEval 16.2.0__py3-none-any.whl → 16.2.2__py3-none-any.whl
This diff shows the content of the publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
Potentially problematic release.
This version of EuroEval might be problematic.
- euroeval/benchmark_config_factory.py +41 -129
- euroeval/benchmark_modules/vllm.py +10 -7
- euroeval/benchmarker.py +183 -337
- euroeval/data_models.py +35 -34
- euroeval/utils.py +9 -4
- {euroeval-16.2.0.dist-info → euroeval-16.2.2.dist-info}/METADATA +1 -1
- {euroeval-16.2.0.dist-info → euroeval-16.2.2.dist-info}/RECORD +10 -10
- {euroeval-16.2.0.dist-info → euroeval-16.2.2.dist-info}/WHEEL +0 -0
- {euroeval-16.2.0.dist-info → euroeval-16.2.2.dist-info}/entry_points.txt +0 -0
- {euroeval-16.2.0.dist-info → euroeval-16.2.2.dist-info}/licenses/LICENSE +0 -0
euroeval/benchmark_config_factory.py
CHANGED
@@ -6,9 +6,9 @@ import typing as t
 
 import torch
 
-from .data_models import BenchmarkConfig
+from .data_models import BenchmarkConfig, BenchmarkConfigParams
 from .dataset_configs import get_all_dataset_configs
-from .enums import Device
+from .enums import Device
 from .exceptions import InvalidBenchmark
 from .languages import get_all_languages
 from .tasks import SPEED, get_all_tasks
@@ -21,154 +21,66 @@ logger = logging.getLogger("euroeval")
 
 
 def build_benchmark_config(
-
-    save_results: bool,
-    task: str | list[str] | None,
-    dataset: str | list[str] | None,
-    language: str | list[str],
-    model_language: str | list[str] | None,
-    dataset_language: str | list[str] | None,
-    device: Device | None,
-    batch_size: int,
-    raise_errors: bool,
-    cache_dir: str,
-    api_key: str | None,
-    force: bool,
-    verbose: bool,
-    trust_remote_code: bool,
-    clear_model_cache: bool,
-    evaluate_test_split: bool,
-    few_shot: bool,
-    num_iterations: int,
-    api_base: str | None,
-    api_version: str | None,
-    gpu_memory_utilization: float,
-    generative_type: GenerativeType | None,
-    debug: bool,
-    run_with_cli: bool,
-    requires_safetensors: bool,
-    download_only: bool,
+    benchmark_config_params: BenchmarkConfigParams,
 ) -> BenchmarkConfig:
     """Create a benchmark configuration.
 
     Args:
-
-
-        save_results:
-            Whether to save the benchmark results to a file.
-        task:
-            The tasks to include for dataset. If None then datasets will not be
-            filtered based on their task.
-        dataset:
-            The datasets to include for task. If None then all datasets will be
-            included, limited by the `task` parameter.
-        language:
-            The language codes of the languages to include, both for models and
-            datasets. Here 'no' means both Bokmål (nb) and Nynorsk (nn). Set this
-            to 'all' if all languages should be considered.
-        model_language:
-            The language codes of the languages to include for models. If None then
-            the `language` parameter will be used.
-        dataset_language:
-            The language codes of the languages to include for datasets. If None then
-            the `language` parameter will be used.
-        device:
-            The device to use for running the models. If None then the device will be
-            set automatically.
-        batch_size:
-            The batch size to use for running the models.
-        raise_errors:
-            Whether to raise errors when running the benchmark.
-        cache_dir:
-            The directory to use for caching the models.
-        api_key:
-            The API key to use for a given inference server.
-        force:
-            Whether to force the benchmark to run even if the results are already
-            cached.
-        verbose:
-            Whether to print verbose output when running the benchmark. This is
-            automatically set if `debug` is True.
-        trust_remote_code:
-            Whether to trust remote code when running the benchmark.
-        clear_model_cache:
-            Whether to clear the model cache before running the benchmark.
-        evaluate_test_split:
-            Whether to use the test split for the datasets.
-        few_shot:
-            Whether to use few-shot learning for the models.
-        num_iterations:
-            The number of iterations each model should be evaluated for.
-        api_base:
-            The base URL for a given inference API. Only relevant if `model` refers to a
-            model on an inference API.
-        api_version:
-            The version of the API to use for a given inference API.
-        gpu_memory_utilization:
-            The GPU memory utilization to use for vLLM. A larger value will result in
-            faster evaluation, but at the risk of running out of GPU memory. Only reduce
-            this if you are running out of GPU memory. Only relevant if the model is
-            generative.
-        generative_type:
-            The type of generative model. Only relevant if the model is generative. If
-            not specified, the type will be inferred automatically.
-        debug:
-            Whether to run the benchmark in debug mode.
-        run_with_cli:
-            Whether the benchmark is being run with the CLI.
-        requires_safetensors:
-            Whether to only allow evaluations of models stored as safetensors.
-        download_only:
-            Whether to only download the requested model weights and datasets.
+        benchmark_config_params:
+            The parameters for creating the benchmark configuration.
 
     Returns:
         The benchmark configuration.
     """
-    language_codes = get_correct_language_codes(
+    language_codes = get_correct_language_codes(
+        language_codes=benchmark_config_params.language
+    )
     model_languages = prepare_languages(
-        language_codes=model_language,
+        language_codes=benchmark_config_params.model_language,
+        default_language_codes=language_codes,
     )
     dataset_languages = prepare_languages(
-        language_codes=dataset_language,
+        language_codes=benchmark_config_params.dataset_language,
+        default_language_codes=language_codes,
     )
 
     tasks, datasets = prepare_tasks_and_datasets(
-        task=task,
+        task=benchmark_config_params.task,
+        dataset=benchmark_config_params.dataset,
+        dataset_languages=dataset_languages,
     )
 
-    torch_device = prepare_device(device=device)
-
-    # Set variable with number of iterations
-    if hasattr(sys, "_called_from_test"):
-        num_iterations = 1
-
     return BenchmarkConfig(
         model_languages=model_languages,
         dataset_languages=dataset_languages,
         tasks=tasks,
         datasets=datasets,
-        batch_size=batch_size,
-        raise_errors=raise_errors,
-        cache_dir=cache_dir,
-        api_key=api_key,
-        force=force,
-        progress_bar=progress_bar,
-        save_results=save_results,
-        verbose=verbose or debug,
-        device=
-        trust_remote_code=trust_remote_code,
-        clear_model_cache=clear_model_cache,
-        evaluate_test_split=evaluate_test_split,
-        few_shot=few_shot,
-        num_iterations=
-
-
-
-
-
-
-
-
+        batch_size=benchmark_config_params.batch_size,
+        raise_errors=benchmark_config_params.raise_errors,
+        cache_dir=benchmark_config_params.cache_dir,
+        api_key=benchmark_config_params.api_key,
+        force=benchmark_config_params.force,
+        progress_bar=benchmark_config_params.progress_bar,
+        save_results=benchmark_config_params.save_results,
+        verbose=benchmark_config_params.verbose or benchmark_config_params.debug,
+        device=prepare_device(device=benchmark_config_params.device),
+        trust_remote_code=benchmark_config_params.trust_remote_code,
+        clear_model_cache=benchmark_config_params.clear_model_cache,
+        evaluate_test_split=benchmark_config_params.evaluate_test_split,
+        few_shot=benchmark_config_params.few_shot,
+        num_iterations=(
+            1
+            if hasattr(sys, "_called_from_test")
+            else benchmark_config_params.num_iterations
+        ),
+        api_base=benchmark_config_params.api_base,
+        api_version=benchmark_config_params.api_version,
+        gpu_memory_utilization=benchmark_config_params.gpu_memory_utilization,
+        generative_type=benchmark_config_params.generative_type,
+        debug=benchmark_config_params.debug,
+        run_with_cli=benchmark_config_params.run_with_cli,
+        requires_safetensors=benchmark_config_params.requires_safetensors,
+        download_only=benchmark_config_params.download_only,
     )
 
 
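The change above collapses the long keyword-argument signature of `build_benchmark_config` into a single `benchmark_config_params` argument, so the factory is now driven entirely by the `BenchmarkConfigParams` model defined in euroeval/data_models.py. Below is a minimal, self-contained sketch of the same pattern; the class and field names are simplified stand-ins, not the real EuroEval API.

```python
import pydantic


class ConfigParams(pydantic.BaseModel):
    """Simplified stand-in for EuroEval's BenchmarkConfigParams."""

    task: str | list[str] | None = None
    language: str | list[str] = "all"
    num_iterations: int = 10


def build_config(benchmark_config_params: ConfigParams) -> dict:
    # The factory receives one validated object instead of ~25 separate keyword
    # arguments, so adding a new option only touches the model definition and
    # the places that actually read the new field.
    return {
        "task": benchmark_config_params.task,
        "language": benchmark_config_params.language,
        "num_iterations": benchmark_config_params.num_iterations,
    }


print(build_config(benchmark_config_params=ConfigParams(task="speed", language="da")))
```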
euroeval/benchmark_modules/vllm.py
CHANGED
@@ -836,15 +836,18 @@ def load_model_and_tokeniser(
 
     clear_vllm()
 
-    # if we do not have an internet connection we need to give the path to the folder
-    # that contains the model weights and config files, otherwise vLLM will try to
-    # download them regardless if they are already present in the download_dir
-    model_path = resolve_model_path(download_dir)
-
     try:
         model = LLM(
-            model=
-
+            model=(
+                model_id
+                if internet_connection_available()
+                else resolve_model_path(download_dir=download_dir)
+            ),
+            tokenizer=(
+                model_id
+                if internet_connection_available()
+                else resolve_model_path(download_dir=download_dir)
+            ),
             gpu_memory_utilization=benchmark_config.gpu_memory_utilization,
             max_model_len=min(true_max_model_len, MAX_CONTEXT_LENGTH),
             download_dir=download_dir,
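The vLLM change swaps a precomputed `model_path` for an inline conditional: when an internet connection is available the Hub model id is passed straight to `LLM(...)`, and otherwise the already-downloaded weights in `download_dir` are used, so vLLM does not try to re-download files that are present locally. A rough, self-contained sketch of that fallback logic, with the EuroEval helpers stubbed out (the connectivity probe mirrors the one in euroeval/utils.py; the model id and paths are illustrative):

```python
import socket
from pathlib import Path


def internet_connection_available(host: str = "1.1.1.1", port: int = 80) -> bool:
    # Connectivity probe: try to open and immediately close a TCP connection.
    try:
        socket.create_connection((host, port), timeout=3).close()
        return True
    except OSError:
        return False


def resolve_model_path(download_dir: str) -> str:
    # Stand-in for euroeval.utils.resolve_model_path: point at the cached weights.
    return str(Path(download_dir).expanduser())


def model_source(model_id: str, download_dir: str) -> str:
    # Online: let vLLM resolve the Hub id itself; offline: hand it the local folder.
    return model_id if internet_connection_available() else resolve_model_path(download_dir)


print(model_source("mistralai/Mistral-7B-v0.1", "~/.cache/euroeval/model_weights"))
```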
euroeval/benchmarker.py
CHANGED
@@ -6,7 +6,6 @@ import logging
 import re
 import sys
 import typing as t
-from copy import deepcopy
 from pathlib import Path
 from shutil import rmtree
 from time import sleep
@@ -200,10 +199,10 @@ class Benchmarker:
         )
 
         self.benchmark_config_default_params = BenchmarkConfigParams(
-            progress_bar=progress_bar,
-            save_results=save_results,
             task=task,
             dataset=dataset,
+            progress_bar=progress_bar,
+            save_results=save_results,
             language=language,
             model_language=model_language,
             dataset_language=dataset_language,
@@ -212,24 +211,25 @@ class Benchmarker:
             raise_errors=raise_errors,
             cache_dir=cache_dir,
             api_key=api_key,
-
-
+            api_base=api_base,
+            api_version=api_version,
             trust_remote_code=trust_remote_code,
             clear_model_cache=clear_model_cache,
             evaluate_test_split=evaluate_test_split,
             few_shot=few_shot,
             num_iterations=num_iterations,
-
-
+            requires_safetensors=requires_safetensors,
+            download_only=download_only,
             gpu_memory_utilization=gpu_memory_utilization,
             generative_type=generative_type,
+            verbose=verbose,
+            force=force,
             debug=debug,
             run_with_cli=run_with_cli,
-            requires_safetensors=requires_safetensors,
         )
 
         self.benchmark_config = build_benchmark_config(
-
+            benchmark_config_params=self.benchmark_config_default_params
         )
 
         # Initialise variable storing model lists, so we only have to fetch it once
@@ -331,14 +331,20 @@ class Benchmarker:
         raise_errors: bool | None = None,
         cache_dir: str | None = None,
         api_key: str | None = None,
-
-
+        api_base: str | None = None,
+        api_version: str | None = None,
         trust_remote_code: bool | None = None,
         clear_model_cache: bool | None = None,
         evaluate_test_split: bool | None = None,
         few_shot: bool | None = None,
         num_iterations: int | None = None,
         requires_safetensors: bool | None = None,
+        download_only: bool | None = None,
+        gpu_memory_utilization: float | None = None,
+        generative_type: GenerativeType | None = None,
+        force: bool | None = None,
+        verbose: bool | None = None,
+        debug: bool | None = None,
     ) -> list[BenchmarkResult]:
         """Benchmarks models on datasets.
 
@@ -391,13 +397,13 @@ class Benchmarker:
             api_key:
                 The API key to use for a given inference server. Defaults to the value
                 specified when initialising the benchmarker.
-
-
-
-                the benchmarker.
-
-
-
+            api_base:
+                The base URL for a given inference API. Only relevant if `model` refers
+                to a model on an inference API. Defaults to the value specified when
+                initialising the benchmarker.
+            api_version:
+                The version of the API to use. Defaults to the value specified when
+                initialising the benchmarker.
             trust_remote_code:
                 Whether to trust remote code when loading models. Defaults to the value
                 specified when initialising the benchmarker.
@@ -422,6 +428,27 @@ class Benchmarker:
             download_only:
                 Whether to only download the models without evaluating them. Defaults
                 to the value specified when initialising the benchmarker.
+            gpu_memory_utilization:
+                The GPU memory utilization to use for vLLM. Only relevant if the model
+                is generative. A larger value will result in faster evaluation, but at
+                the risk of running out of GPU memory. Only reduce this if you are
+                running out of GPU memory. Defaults to the value specified when
+                initialising the benchmarker.
+            generative_type:
+                The type of generative model to benchmark. Only relevant if the model is
+                generative. If not specified, then the type will be inferred based on
+                the tags of the model. Defaults to the value specified when initialising
+                the benchmarker.
+            force:
+                Whether to force evaluations of models, even if they have been
+                benchmarked already. Defaults to the value specified when initialising
+                the benchmarker.
+            verbose:
+                Whether to output additional output. Defaults to the value specified
+                when initialising the benchmarker.
+            debug:
+                Whether to output debug information. Defaults to the value specified
+                when initialising the benchmarker.
 
         Returns:
             A list of benchmark results.
@@ -433,27 +460,141 @@ class Benchmarker:
         if task is not None and dataset is not None:
             raise ValueError("Only one of `task` and `dataset` can be specified.")
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        # Get a new updated benchmark configuration, based on any changes to the
+        # parameters
+        benchmark_config_params = BenchmarkConfigParams(
+            task=(
+                task if task is not None else self.benchmark_config_default_params.task
+            ),
+            dataset=(
+                dataset
+                if dataset is not None
+                else self.benchmark_config_default_params.dataset
+            ),
+            progress_bar=(
+                progress_bar
+                if progress_bar is not None
+                else self.benchmark_config_default_params.progress_bar
+            ),
+            save_results=(
+                save_results
+                if save_results is not None
+                else self.benchmark_config_default_params.save_results
+            ),
+            language=(
+                language
+                if language is not None
+                else self.benchmark_config_default_params.language
+            ),
+            model_language=(
+                model_language
+                if model_language is not None
+                else self.benchmark_config_default_params.model_language
+            ),
+            dataset_language=(
+                dataset_language
+                if dataset_language is not None
+                else self.benchmark_config_default_params.dataset_language
+            ),
+            device=(
+                device
+                if device is not None
+                else self.benchmark_config_default_params.device
+            ),
+            batch_size=(
+                batch_size
+                if batch_size is not None
+                else self.benchmark_config_default_params.batch_size
+            ),
+            raise_errors=(
+                raise_errors
+                if raise_errors is not None
+                else self.benchmark_config_default_params.raise_errors
+            ),
+            cache_dir=(
+                cache_dir
+                if cache_dir is not None
+                else self.benchmark_config_default_params.cache_dir
+            ),
+            api_key=(
+                api_key
+                if api_key is not None
+                else self.benchmark_config_default_params.api_key
+            ),
+            api_base=(
+                api_base
+                if api_base is not None
+                else self.benchmark_config_default_params.api_base
+            ),
+            api_version=(
+                api_version
+                if api_version is not None
+                else self.benchmark_config_default_params.api_version
+            ),
+            trust_remote_code=(
+                trust_remote_code
+                if trust_remote_code is not None
+                else self.benchmark_config_default_params.trust_remote_code
+            ),
+            clear_model_cache=(
+                clear_model_cache
+                if clear_model_cache is not None
+                else self.benchmark_config_default_params.clear_model_cache
+            ),
+            evaluate_test_split=(
+                evaluate_test_split
+                if evaluate_test_split is not None
+                else self.benchmark_config_default_params.evaluate_test_split
+            ),
+            few_shot=(
+                few_shot
+                if few_shot is not None
+                else self.benchmark_config_default_params.few_shot
+            ),
+            num_iterations=(
+                num_iterations
+                if num_iterations is not None
+                else self.benchmark_config_default_params.num_iterations
+            ),
+            requires_safetensors=(
+                requires_safetensors
+                if requires_safetensors is not None
+                else self.benchmark_config_default_params.requires_safetensors
+            ),
+            download_only=(
+                download_only
+                if download_only is not None
+                else self.benchmark_config_default_params.download_only
+            ),
+            gpu_memory_utilization=(
+                gpu_memory_utilization
+                if gpu_memory_utilization is not None
+                else self.benchmark_config_default_params.gpu_memory_utilization
+            ),
+            generative_type=(
+                generative_type
+                if generative_type is not None
+                else self.benchmark_config_default_params.generative_type
+            ),
+            force=(
+                force
+                if force is not None
+                else self.benchmark_config_default_params.force
+            ),
+            verbose=(
+                verbose
+                if verbose is not None
+                else self.benchmark_config_default_params.verbose
+            ),
+            debug=(
+                debug
+                if debug is not None
+                else self.benchmark_config_default_params.debug
+            ),
+            run_with_cli=self.benchmark_config_default_params.run_with_cli,
+        )
+        benchmark_config = build_benchmark_config(
+            benchmark_config_params=benchmark_config_params
         )
 
         adjust_logging_level(verbose=benchmark_config.verbose)
@@ -651,170 +792,6 @@ class Benchmarker:
             destroy_process_group()
         return current_benchmark_results
 
-    def _get_updated_benchmark_config(
-        self,
-        progress_bar: bool | None = None,
-        save_results: bool | None = None,
-        task: str | list[str] | None | None = None,
-        dataset: str | list[str] | None | None = None,
-        language: str | list[str] | None = None,
-        model_language: str | list[str] | None | None = None,
-        dataset_language: str | list[str] | None | None = None,
-        device: Device | None | None = None,
-        batch_size: int | None = None,
-        raise_errors: bool | None = None,
-        cache_dir: str | None = None,
-        api_key: str | None | None = None,
-        force: bool | None = None,
-        verbose: bool | None = None,
-        trust_remote_code: bool | None = None,
-        clear_model_cache: bool | None = None,
-        evaluate_test_split: bool | None = None,
-        few_shot: bool | None = None,
-        num_iterations: int | None = None,
-        api_base: str | None | None = None,
-        api_version: str | None | None = None,
-        debug: bool | None = None,
-        run_with_cli: bool | None = None,
-        requires_safetensors: bool | None = None,
-    ) -> "BenchmarkConfig":
-        """Get an updated benchmark configuration.
-
-        Args:
-            progress_bar:
-                Whether progress bars should be shown. If None, then this value will not
-                be updated.
-            save_results:
-                Whether to save the benchmark results to
-                'euroeval_benchmark_results.jsonl'. If None, then this value will not
-                be updated.
-            task:
-                The tasks benchmark the model(s) on. If None, then this value will not
-                be updated.
-            dataset:
-                The datasets to benchmark on. If None, then this value will not be
-                updated.
-            language:
-                The language codes of the languages to include, both for models and
-                datasets. If None, then this value will not be updated.
-            model_language:
-                The language codes of the languages to include for models. If None, then
-                this value will not be updated.
-            dataset_language:
-                The language codes of the languages to include for datasets. If None,
-                then this value will not be updated.
-            device:
-                The device to use for benchmarking. If None, then this value will not be
-                updated.
-            batch_size:
-                The batch size to use. If None, then this value will not be updated.
-            raise_errors:
-                Whether to raise errors instead of skipping the model evaluation. If
-                None, then this value will not be updated.
-            cache_dir:
-                Directory to store cached models. If None, then this value will not be
-                updated.
-            api_key:
-                The API key to use for a given inference server. If None, then this
-                value will not be updated.
-            force:
-                Whether to force evaluations of models, even if they have been
-                benchmarked already. If None, then this value will not be updated.
-            verbose:
-                Whether to output additional output. If None, then this value will not
-                be updated.
-            trust_remote_code:
-                Whether to trust remote code when loading models. If None, then this
-                value will not be updated.
-            clear_model_cache:
-                Whether to clear the model cache after benchmarking each model. If None,
-                then this value will not be updated.
-            evaluate_test_split:
-                Whether to evaluate the test split of the datasets. If None, then this
-                value will not be updated.
-            few_shot:
-                Whether to only evaluate the model using few-shot evaluation. If None,
-                then this value will not be updated.
-            num_iterations:
-                The number of times each model should be evaluated. If None, then this
-                value will not be updated.
-            api_base:
-                The base URL for a given inference API. If None, then this value will
-                not be updated.
-            api_version:
-                The version of the API to use. If None, then this value will not be
-                updated.
-            debug:
-                Whether to output debug information. If None, then this value will not
-                be updated.
-            run_with_cli:
-                Whether the benchmarker is being run from the command-line interface.
-                If None, then this value will not be updated.
-            requires_safetensors:
-                Whether to only allow models that use the safetensors format. If None,
-                then this value will not be updated.
-            download_only:
-                Whether to only download the models without evaluating them. If None,
-                then this value will not be updated.
-
-        Returns:
-            The updated benchmark configuration.
-        """
-        benchmark_config_params = deepcopy(self.benchmark_config_default_params)
-
-        if progress_bar is not None:
-            benchmark_config_params.progress_bar = progress_bar
-        if save_results is not None:
-            benchmark_config_params.save_results = save_results
-        if task is not None:
-            benchmark_config_params.task = task
-            benchmark_config_params.dataset = None
-        if dataset is not None:
-            benchmark_config_params.dataset = dataset
-            benchmark_config_params.task = None
-        if language is not None:
-            benchmark_config_params.language = language
-        if model_language is not None:
-            benchmark_config_params.model_language = model_language
-        if dataset_language is not None:
-            benchmark_config_params.dataset_language = dataset_language
-        if device is not None:
-            benchmark_config_params.device = device
-        if batch_size is not None:
-            benchmark_config_params.batch_size = batch_size
-        if raise_errors is not None:
-            benchmark_config_params.raise_errors = raise_errors
-        if cache_dir is not None:
-            benchmark_config_params.cache_dir = cache_dir
-        if api_key is not None:
-            benchmark_config_params.api_key = api_key
-        if force is not None:
-            benchmark_config_params.force = force
-        if verbose is not None:
-            benchmark_config_params.verbose = verbose
-        if trust_remote_code is not None:
-            benchmark_config_params.trust_remote_code = trust_remote_code
-        if clear_model_cache is not None:
-            benchmark_config_params.clear_model_cache = clear_model_cache
-        if evaluate_test_split is not None:
-            benchmark_config_params.evaluate_test_split = evaluate_test_split
-        if few_shot is not None:
-            benchmark_config_params.few_shot = few_shot
-        if num_iterations is not None:
-            benchmark_config_params.num_iterations = num_iterations
-        if api_base is not None:
-            benchmark_config_params.api_base = api_base
-        if api_version is not None:
-            benchmark_config_params.api_version = api_version
-        if debug is not None:
-            benchmark_config_params.debug = debug
-        if run_with_cli is not None:
-            benchmark_config_params.run_with_cli = run_with_cli
-        if requires_safetensors is not None:
-            benchmark_config_params.requires_safetensors = requires_safetensors
-
-        return build_benchmark_config(**benchmark_config_params.model_dump())
-
     def _prepare_model_ids(self, model_id: list[str] | str) -> list[str]:
         """Prepare the model ID(s) to be benchmarked.
 
@@ -982,144 +959,13 @@ class Benchmarker:
                 raise e
             return e
 
-    def __call__(
-        self,
-        model: list[str] | str,
-        task: str | list[str] | None = None,
-        dataset: list[str] | str | None = None,
-        progress_bar: bool | None = None,
-        save_results: bool | None = None,
-        language: str | list[str] | None = None,
-        model_language: str | list[str] | None = None,
-        dataset_language: str | list[str] | None = None,
-        device: Device | None = None,
-        batch_size: int | None = None,
-        raise_errors: bool | None = None,
-        cache_dir: str | None = None,
-        api_key: str | None = None,
-        force: bool | None = None,
-        verbose: bool | None = None,
-        trust_remote_code: bool | None = None,
-        clear_model_cache: bool | None = None,
-        evaluate_test_split: bool | None = None,
-        few_shot: bool | None = None,
-        num_iterations: int | None = None,
-        requires_safetensors: bool | None = None,
-    ) -> list[BenchmarkResult]:
-        """Benchmarks models on datasets.
-
-        Args:
-            model:
-                The full Hugging Face Hub path(s) to the pretrained transformer model.
-                The specific model version to use can be added after the suffix '@':
-                "model@v1.0.0". It can be a branch name, a tag name, or a commit id,
-                and defaults to the latest version if not specified.
-            task:
-                The tasks benchmark the model(s) on. Mutually exclusive with `dataset`.
-                If both `task` and `dataset` are None then all datasets will be
-                benchmarked. Defaults to None.
-            dataset:
-                The datasets to benchmark on. Mutually exclusive with `task`. If both
-                `task` and `dataset` are None then all datasets will be benchmarked.
-                Defaults to None.
-            progress_bar:
-                Whether progress bars should be shown. Defaults to the value specified
-                when initialising the benchmarker.
-            save_results:
-                Whether to save the benchmark results to
-                'euroeval_benchmark_results.jsonl'. Defaults to the value specified
-                when initialising the benchmarker.
-            language:
-                The language codes of the languages to include, both for models and
-                datasets. Here 'no' means both Bokmål (nb) and Nynorsk (nn). Set this to
-                'all' if all languages should be considered. Defaults to the value
-                specified when initialising the benchmarker.
-            model_language:
-                The language codes of the languages to include for models. If specified
-                then this overrides the `language` parameter for model languages.
-                Defaults to the value specified when initialising the benchmarker.
-            dataset_language:
-                The language codes of the languages to include for datasets. If
-                specified then this overrides the `language` parameter for dataset
-                languages. Defaults to the value specified when initialising the
-                benchmarker.
-            device:
-                The device to use for benchmarking. Defaults to the value specified when
-                initialising the benchmarker.
-            batch_size:
-                The batch size to use. Defaults to the value specified when initialising
-                the benchmarker.
-            raise_errors:
-                Whether to raise errors instead of skipping the model evaluation.
-            cache_dir:
-                Directory to store cached models. Defaults to the value specified when
-                initialising the benchmarker.
-            api_key:
-                The API key to use for a given inference server. Defaults to the value
-                specified when initialising the benchmarker.
-            force:
-                Whether to force evaluations of models, even if they have been
-                benchmarked already. Defaults to the value specified when initialising
-                the benchmarker.
-            verbose:
-                Whether to output additional output. Defaults to the value specified
-                when initialising the benchmarker.
-            trust_remote_code:
-                Whether to trust remote code when loading models. Defaults to the value
-                specified when initialising the benchmarker.
-            clear_model_cache:
-                Whether to clear the model cache after benchmarking each model. Defaults
-                to the value specified when initialising the benchmarker.
-            evaluate_test_split:
-                Whether to evaluate the test split of the datasets. Defaults to the
-                value specified when initialising the benchmarker.
-            few_shot:
-                Whether to only evaluate the model using few-shot evaluation. Only
-                relevant if the model is generative. Defaults to the value specified
-                when initialising the benchmarker.
-            num_iterations:
-                The number of times each model should be evaluated. This is only meant
-                to be used for power users, and scores will not be allowed on the
-                leaderboards if this is changed. Defaults to the value specified when
-                initialising the benchmarker.
-            requires_safetensors:
-                Whether to only allow models that use the safetensors format. Defaults
-                to the value specified when initialising the benchmarker.
-
-        Returns:
-            A list of benchmark results.
-
-        Raises:
-            ValueError:
-                If both `task` and `dataset` are specified.
-        """
+    def __call__(self, *args: t.Any, **kwds: t.Any) -> t.Any:  # noqa: ANN401
+        """Alias for `self.benchmark()`."""
         logger.warning(
             "Calling the `Benchmarker` class directly is deprecated. Please use the "
             "`benchmark` function instead. This will be removed in a future version."
         )
-        return self.benchmark(
-            model=model,
-            task=task,
-            dataset=dataset,
-            progress_bar=progress_bar,
-            save_results=save_results,
-            language=language,
-            model_language=model_language,
-            dataset_language=dataset_language,
-            device=device,
-            batch_size=batch_size,
-            raise_errors=raise_errors,
-            cache_dir=cache_dir,
-            api_key=api_key,
-            force=force,
-            verbose=verbose,
-            trust_remote_code=trust_remote_code,
-            clear_model_cache=clear_model_cache,
-            evaluate_test_split=evaluate_test_split,
-            few_shot=few_shot,
-            num_iterations=num_iterations,
-            requires_safetensors=requires_safetensors,
-        )
+        return self.benchmark(*args, **kwds)
 
 
     def model_has_been_benchmarked(
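Taken together, the benchmarker.py changes replace the `_get_updated_benchmark_config` helper and the long `__call__` wrapper with a `benchmark()` method that accepts a per-call override for every constructor option (including the newly exposed `api_base`, `api_version`, `download_only`, `gpu_memory_utilization`, `generative_type`, `force`, `verbose` and `debug`), while calling the instance directly is kept only as a deprecated alias. A hedged usage sketch, assuming euroeval 16.2.2 is installed; the model and dataset identifiers are illustrative:

```python
from euroeval import Benchmarker

benchmarker = Benchmarker(language="da", progress_bar=True)

# Any option set at construction time can now be overridden for a single call.
results = benchmarker.benchmark(
    model="mistralai/Mistral-7B-v0.1",
    dataset="angry-tweets",
    gpu_memory_utilization=0.8,
    verbose=True,
)

# Deprecated: calling the instance forwards *args/**kwargs to `benchmark()`
# and logs a deprecation warning.
same_results = benchmarker(model="mistralai/Mistral-7B-v0.1", dataset="angry-tweets")
```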
euroeval/data_models.py
CHANGED
@@ -170,14 +170,16 @@ class BenchmarkConfig:
     """General benchmarking configuration, across datasets and models.
 
     Attributes:
-        model_languages:
-            The languages of the models to benchmark.
-        dataset_languages:
-            The languages of the datasets in the benchmark.
         tasks:
             The tasks benchmark the model(s) on.
         datasets:
             The datasets to benchmark on.
+        model_languages:
+            The languages of the models to benchmark.
+        dataset_languages:
+            The languages of the datasets in the benchmark.
+        device:
+            The device to use for benchmarking.
         batch_size:
             The batch size to use.
         raise_errors:
@@ -186,17 +188,16 @@ class BenchmarkConfig:
             Directory to store cached models and datasets.
         api_key:
             The API key to use for a given inference API.
-
-
-
+        api_base:
+            The base URL for a given inference API. Only relevant if `model` refers to a
+            model on an inference API.
+        api_version:
+            The version of the API to use. Only relevant if `model` refers to a model on
+            an inference API.
         progress_bar:
             Whether to show a progress bar.
         save_results:
             Whether to save the benchmark results to 'euroeval_benchmark_results.json'.
-        device:
-            The device to use for benchmarking.
-        verbose:
-            Whether to print verbose output.
         trust_remote_code:
             Whether to trust remote code when loading models from the Hugging Face Hub.
         clear_model_cache:
@@ -208,21 +209,11 @@ class BenchmarkConfig:
             if the model is generative.
         num_iterations:
             The number of iterations each model should be evaluated for.
-        api_base:
-            The base URL for a given inference API. Only relevant if `model` refers to a
-            model on an inference API.
-        api_version:
-            The version of the API to use. Only relevant if `model` refers to a model on
-            an inference API.
         gpu_memory_utilization:
             The GPU memory utilization to use for vLLM. A larger value will result in
             faster evaluation, but at the risk of running out of GPU memory. Only reduce
             this if you are running out of GPU memory. Only relevant if the model is
             generative.
-        debug:
-            Whether to run the benchmark in debug mode.
-        run_with_cli:
-            Whether the benchmark is being run with the CLI.
         requires_safetensors:
             Whether to only allow models that use the safetensors format.
         generative_type:
@@ -231,6 +222,15 @@ class BenchmarkConfig:
         download_only:
             Whether to only download the models, metrics and datasets without
             evaluating.
+        force:
+            Whether to force the benchmark to run even if the results are already
+            cached.
+        verbose:
+            Whether to print verbose output.
+        debug:
+            Whether to run the benchmark in debug mode.
+        run_with_cli:
+            Whether the benchmark is being run with the CLI.
     """
 
     model_languages: list[Language]
@@ -241,24 +241,24 @@ class BenchmarkConfig:
     raise_errors: bool
     cache_dir: str
    api_key: str | None
-
+    api_base: str | None
+    api_version: str | None
     progress_bar: bool
     save_results: bool
     device: torch.device
-    verbose: bool
     trust_remote_code: bool
     clear_model_cache: bool
     evaluate_test_split: bool
     few_shot: bool
     num_iterations: int
-    api_base: str | None
-    api_version: str | None
     gpu_memory_utilization: float
-    debug: bool
-    run_with_cli: bool
     requires_safetensors: bool
     generative_type: GenerativeType | None
     download_only: bool
+    force: bool
+    verbose: bool
+    debug: bool
+    run_with_cli: bool
 
 
 class BenchmarkConfigParams(pydantic.BaseModel):
@@ -266,10 +266,10 @@ class BenchmarkConfigParams(pydantic.BaseModel):
 
     model_config = pydantic.ConfigDict(protected_namespaces=())
 
-    progress_bar: bool
-    save_results: bool
     task: str | list[str] | None
     dataset: str | list[str] | None
+    progress_bar: bool
+    save_results: bool
     language: str | list[str]
     model_language: str | list[str] | None
     dataset_language: str | list[str] | None
@@ -278,20 +278,21 @@ class BenchmarkConfigParams(pydantic.BaseModel):
     raise_errors: bool
     cache_dir: str
     api_key: str | None
-
-
+    api_base: str | None
+    api_version: str | None
     trust_remote_code: bool
     clear_model_cache: bool
     evaluate_test_split: bool
     few_shot: bool
     num_iterations: int
-
-
+    requires_safetensors: bool
+    download_only: bool
     gpu_memory_utilization: float
     generative_type: GenerativeType | None
+    force: bool
+    verbose: bool
     debug: bool
     run_with_cli: bool
-    requires_safetensors: bool
 
 
 class BenchmarkResult(pydantic.BaseModel):
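A small but easy-to-miss detail in `BenchmarkConfigParams` is the `model_config = pydantic.ConfigDict(protected_namespaces=())` line kept at the top of the class: several fields, such as `model_language`, start with the `model_` prefix that pydantic v2 reserves by default, and clearing the protected namespaces silences the resulting warning. A minimal sketch of the effect (the field name matches the real model; everything else is illustrative):

```python
import pydantic


class Params(pydantic.BaseModel):
    # Without this line, pydantic v2 warns that `model_language` conflicts
    # with the protected `model_` namespace used for its own methods.
    model_config = pydantic.ConfigDict(protected_namespaces=())

    model_language: str | list[str] | None = None


print(Params(model_language="da"))
```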
euroeval/utils.py
CHANGED
@@ -62,6 +62,10 @@ def resolve_model_path(download_dir: str) -> str:
 
     Returns:
         The path to the model.
+
+    Raises:
+        InvalidModel:
+            If the model path is not valid, or if required files are missing.
     """
     model_path = Path(download_dir)
     # Get the 'path safe' version of the model id, which is the last dir in the path
@@ -271,14 +275,15 @@ def internet_connection_available() -> bool:
         s = socket.create_connection(("1.1.1.1", 80))
         s.close()
         return True
-
-    #
+
+    # We want to only catch exceptions related to socket connections, but as we cannot
+    # import these here as they're developer dependencies, we check the exception name
+    # instead. If the exception is not related to socket connections, we reraise it.
     except Exception as e:
         pytest_socket_errors = ["SocketConnectBlockedError", "SocketBlockedError"]
         if type(e).__name__ in pytest_socket_errors or isinstance(e, OSError):
             return False
-
-        raise e
+        raise e
 
 
 class HiddenPrints:
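The new comment in `internet_connection_available` documents why the except block matches exception class *names* rather than types: the `pytest-socket` exceptions (`SocketConnectBlockedError`, `SocketBlockedError`) come from a development-only dependency that cannot be imported in library code. A small self-contained sketch of that pattern; the locally defined exception below merely stands in for the pytest-socket one:

```python
def looks_like_blocked_socket(exc: Exception) -> bool:
    # Compare by class name so the check works even when the defining package
    # (a dev-only dependency) is not importable at runtime.
    blocked_names = ("SocketConnectBlockedError", "SocketBlockedError")
    return type(exc).__name__ in blocked_names or isinstance(exc, OSError)


class SocketBlockedError(Exception):
    """Stand-in for pytest-socket's exception of the same name."""


try:
    raise SocketBlockedError("sockets are disabled in this test run")
except Exception as exc:
    if looks_like_blocked_socket(exc):
        print("treated as: no internet connection")
    else:
        raise
```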
{euroeval-16.2.0.dist-info → euroeval-16.2.2.dist-info}/RECORD
CHANGED
@@ -1,11 +1,11 @@
 euroeval/__init__.py,sha256=mXTjuGrEE-1fIS9x28oJKg-gNGt4q7y2E74l330KEmY,3787
-euroeval/benchmark_config_factory.py,sha256=
-euroeval/benchmarker.py,sha256=
+euroeval/benchmark_config_factory.py,sha256=eOQsd9F4cJy8I7a3_lIKDZ5b5ukipIUqk0GZ3pyytwQ,8596
+euroeval/benchmarker.py,sha256=fjEqAkUC92fYUarWleigxfSrw0siYWW4YI_KlwyDUF4,46992
 euroeval/callbacks.py,sha256=5BTlDvBJ60xRvj01EpXZSZu3MFdKa3LgVuhxoLb3i3E,2565
 euroeval/cli.py,sha256=GOAWzdtasJfOvTuVQszu-T1T9GfQ_un-blOICO-y7g4,9316
 euroeval/constants.py,sha256=NN7kcwQdlDyyGFSrLjsL_qKVRyoRqZ9sKO5SjlgtRwA,2741
 euroeval/data_loading.py,sha256=F3fHyR7FoS_a1dx_DyqtcxdB-jxWwE3RCNRvWcp5z1c,4527
-euroeval/data_models.py,sha256=
+euroeval/data_models.py,sha256=X4zAdR1K2MPb4f4Vc7gPYfolzFxxsz5WplnsmsiMYY8,27766
 euroeval/enums.py,sha256=SeFek-Lre2Q5sxbP5svqjDZFZR2vlJhg9dkRH4JvU1g,3436
 euroeval/exceptions.py,sha256=5kQ-YvHyFO3aaA-zfOTaS07LRFH8xlSqlOiATvnIObY,5116
 euroeval/finetuning.py,sha256=G86pxxjOAgtcEWpyYDwYOV9pM7WG2Uu9fu7GdDso8dI,11426
@@ -20,13 +20,13 @@ euroeval/speed_benchmark.py,sha256=3iz_bfJgAoJ9K2HNjufyrBMjHVT8PAjuY_NocBGwKe0,4
 euroeval/tasks.py,sha256=EzEWFDo_0ffabBFiRu-mw80jENUioE8D_VEn_Dsv-F8,4703
 euroeval/tokenisation_utils.py,sha256=nLeF2cdZSm5PZiAcDTtxY82nUJ-or8VU8YxYLa167EM,21158
 euroeval/types.py,sha256=_iVy-RwiCGu9TNX2sfyJTdCvXy1akNGTCywAo-YpBqU,2815
-euroeval/utils.py,sha256=
+euroeval/utils.py,sha256=AyUWGh-G5j14jXZ6ccS1LyTXml2JgbOzOt_e-rr5mag,19451
 euroeval/benchmark_modules/__init__.py,sha256=TNO-sNDwlXE-LMFXfwwqjQqUy55gywSmwRBcoPUFuaU,236
 euroeval/benchmark_modules/base.py,sha256=mHF8XS6GGUXV-sJtxmI5WJBWPLMHuh-4Z4OWjC25x9Y,11566
 euroeval/benchmark_modules/fresh.py,sha256=TveSQiFBi3xXgCEQBdHwkUQ685PDkKW0y3G5Yt5rkeM,10655
 euroeval/benchmark_modules/hf.py,sha256=XmkoDFzaJqnd_5mmUkqCaOgAdRPFs3KZKZZ0cr83TlM,44742
 euroeval/benchmark_modules/litellm.py,sha256=F3udd6NmhQOe3go_7rAcWg7mgZrNQpWWvLe-5U4E2RQ,64771
-euroeval/benchmark_modules/vllm.py,sha256=
+euroeval/benchmark_modules/vllm.py,sha256=bo5XaKlHEKhdEFPNJxsnJFq4RWOC9VoOH4Hqw_6dbMQ,43893
 euroeval/dataset_configs/__init__.py,sha256=uuIZmElpJV8iupo5oDj3TeQhBDRANdWpLKYFASLirHA,2046
 euroeval/dataset_configs/danish.py,sha256=QABfgI7m-0-5AimDXegp5ssDSLcM2VrAI_RWsinSZP4,5631
 euroeval/dataset_configs/dutch.py,sha256=63Ro2yFym5MuIDXf5953vUYenw9B0kZSCmZbXjdy4Rs,5517
@@ -63,8 +63,8 @@ euroeval/task_group_utils/question_answering.py,sha256=eUczZntrC9lhCUQlwNQB49i-5
 euroeval/task_group_utils/sequence_classification.py,sha256=TAqZCoMQ9I-HFhMH35_J1mY2SQg95HUbXcgrBIyhgk0,16082
 euroeval/task_group_utils/text_to_text.py,sha256=7f4hGAs5WNJ9PmW1mLhjDMrPxrYAvw5axXsneiJop1w,4993
 euroeval/task_group_utils/token_classification.py,sha256=Yjai937ia1nZBMOWySqCXr_dA6WiVLGvmb4Hm_TU0Bg,17118
-euroeval-16.2.
-euroeval-16.2.
-euroeval-16.2.
-euroeval-16.2.
-euroeval-16.2.
+euroeval-16.2.2.dist-info/METADATA,sha256=jGGv76AqT4vGKREN8jD3bBHi19vVyIKUNvlk6FNhRN8,14590
+euroeval-16.2.2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+euroeval-16.2.2.dist-info/entry_points.txt,sha256=-mtBu-10bFWeZ2bS32gVK6-s-LNCQLxvnNUPBLd5ud4,87
+euroeval-16.2.2.dist-info/licenses/LICENSE,sha256=guvz_zBHgkQSY_QiUU0Bkc1k-L_PFZuLjIPfuKne2OY,1080
+euroeval-16.2.2.dist-info/RECORD,,
{euroeval-16.2.0.dist-info → euroeval-16.2.2.dist-info}/WHEEL: file without changes
{euroeval-16.2.0.dist-info → euroeval-16.2.2.dist-info}/entry_points.txt: file without changes
{euroeval-16.2.0.dist-info → euroeval-16.2.2.dist-info}/licenses/LICENSE: file without changes