euroeval-15.2.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of EuroEval might be problematic.
- euroeval/__init__.py +72 -0
- euroeval/benchmark_config_factory.py +358 -0
- euroeval/benchmark_modules/__init__.py +7 -0
- euroeval/benchmark_modules/base.py +354 -0
- euroeval/benchmark_modules/fresh.py +286 -0
- euroeval/benchmark_modules/hf.py +1185 -0
- euroeval/benchmark_modules/litellm.py +905 -0
- euroeval/benchmark_modules/vllm.py +1171 -0
- euroeval/benchmarker.py +1074 -0
- euroeval/callbacks.py +72 -0
- euroeval/cli.py +281 -0
- euroeval/constants.py +50 -0
- euroeval/data_loading.py +96 -0
- euroeval/data_models.py +474 -0
- euroeval/dataset_configs.py +2001 -0
- euroeval/enums.py +144 -0
- euroeval/exceptions.py +191 -0
- euroeval/finetuning.py +324 -0
- euroeval/generation.py +296 -0
- euroeval/human_evaluation.py +737 -0
- euroeval/languages.py +200 -0
- euroeval/model_cache.py +253 -0
- euroeval/model_config.py +77 -0
- euroeval/model_loading.py +78 -0
- euroeval/scores.py +90 -0
- euroeval/speed_benchmark.py +124 -0
- euroeval/task_utils/__init__.py +1 -0
- euroeval/task_utils/multiple_choice_classification.py +176 -0
- euroeval/task_utils/question_answering.py +698 -0
- euroeval/task_utils/sequence_classification.py +237 -0
- euroeval/task_utils/text_to_text.py +150 -0
- euroeval/task_utils/token_classification.py +464 -0
- euroeval/tasks.py +202 -0
- euroeval/types.py +97 -0
- euroeval/utils.py +574 -0
- euroeval-15.2.0.dist-info/METADATA +234 -0
- euroeval-15.2.0.dist-info/RECORD +40 -0
- euroeval-15.2.0.dist-info/WHEEL +4 -0
- euroeval-15.2.0.dist-info/entry_points.txt +4 -0
- euroeval-15.2.0.dist-info/licenses/LICENSE +21 -0
euroeval/__init__.py
ADDED
@@ -0,0 +1,72 @@
"""EuroEval - A benchmarking framework for language models."""

### STAGE 1 ###
### Block unwanted terminal output that happens on importing external modules ###

import logging
import sys
import warnings

from termcolor import colored

# Block specific warnings before importing anything else, as they can be noisy
warnings.filterwarnings("ignore", category=UserWarning)
logging.getLogger("httpx").setLevel(logging.CRITICAL)
logging.getLogger("datasets").setLevel(logging.CRITICAL)
logging.getLogger("vllm").setLevel(logging.CRITICAL)

# Set up logging
fmt = colored("%(asctime)s", "light_blue") + " ⋅ " + colored("%(message)s", "green")
logging.basicConfig(
    level=logging.CRITICAL if hasattr(sys, "_called_from_test") else logging.INFO,
    format=fmt,
    datefmt="%Y-%m-%d %H:%M:%S",
)


### STAGE 2 ###
### Set the rest up ###

import importlib.metadata  # noqa: E402
import os  # noqa: E402

from dotenv import load_dotenv  # noqa: E402

from .benchmarker import Benchmarker  # noqa: E402
from .utils import block_terminal_output  # noqa: E402

# Block unwanted terminal outputs. This blocks way more than the above, but since it
# relies on importing from the `utils` module, external modules are already imported
# before this is run, necessitating the above block as well
block_terminal_output()


# Fetches the version of the package as defined in pyproject.toml
__version__ = importlib.metadata.version("euroeval")


# Loads environment variables
load_dotenv()


# Disable parallelisation when tokenizing, as that can lead to errors
os.environ["TOKENIZERS_PARALLELISM"] = "false"


# Enable MPS fallback
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"


# Set amount of threads per GPU - this is the default and is only set to prevent a
# warning from showing
os.environ["OMP_NUM_THREADS"] = "1"


# Disable a warning from Ray regarding the detection of the number of CPUs
os.environ["RAY_DISABLE_DOCKER_CPU_WARNING"] = "1"


# Set the HF_TOKEN env var to copy the HUGGINGFACE_API_KEY env var, as vLLM uses the
# former and LiteLLM uses the latter
if os.getenv("HUGGINGFACE_API_KEY"):
    os.environ["HF_TOKEN"] = os.environ["HUGGINGFACE_API_KEY"]
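A minimal usage sketch (not part of the diff itself): the only `Benchmarker` keyword confirmed by this release's own code is `use_flash_attention`, named in a log message inside `build_benchmark_config` below; the rest of the call is illustrative and its full signature lives in euroeval/benchmarker.py, which this diff also adds.

import euroeval

# The package resolves its version via importlib.metadata at import time
print(euroeval.__version__)  # "15.2.0"

# Hedged sketch: `use_flash_attention=False` is the keyword referenced in the
# flash-attention log message in benchmark_config_factory.py; any other
# constructor arguments are assumptions, not confirmed by this diff.
benchmarker = euroeval.Benchmarker(use_flash_attention=False)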
euroeval/benchmark_config_factory.py
ADDED
@@ -0,0 +1,358 @@
"""Factory class for creating dataset configurations."""

import importlib.util
import logging
import sys
import typing as t

import torch

from .data_models import BenchmarkConfig
from .dataset_configs import get_all_dataset_configs
from .enums import Device
from .exceptions import InvalidBenchmark
from .languages import get_all_languages
from .tasks import get_all_tasks
from .utils import log_once

if t.TYPE_CHECKING:
    from .data_models import Language, Task


logger = logging.getLogger("euroeval")


def build_benchmark_config(
    progress_bar: bool,
    save_results: bool,
    task: str | list[str] | None,
    dataset: str | list[str] | None,
    language: str | list[str],
    model_language: str | list[str] | None,
    dataset_language: str | list[str] | None,
    device: Device | None,
    batch_size: int,
    raise_errors: bool,
    cache_dir: str,
    api_key: str | None,
    force: bool,
    verbose: bool,
    trust_remote_code: bool,
    use_flash_attention: bool | None,
    clear_model_cache: bool,
    evaluate_test_split: bool,
    few_shot: bool,
    num_iterations: int,
    api_base: str | None,
    api_version: str | None,
    debug: bool,
    run_with_cli: bool,
    only_allow_safetensors: bool,
    first_time: bool = False,
) -> BenchmarkConfig:
    """Create a benchmark configuration.

    Args:
        progress_bar:
            Whether to show a progress bar when running the benchmark.
        save_results:
            Whether to save the benchmark results to a file.
        task:
            The tasks to include for dataset. If None then datasets will not be
            filtered based on their task.
        dataset:
            The datasets to include for task. If None then all datasets will be
            included, limited by the `task` parameter.
        language:
            The language codes of the languages to include, both for models and
            datasets. Here 'no' means both Bokmål (nb) and Nynorsk (nn). Set this
            to 'all' if all languages should be considered.
        model_language:
            The language codes of the languages to include for models. If None then
            the `language` parameter will be used.
        dataset_language:
            The language codes of the languages to include for datasets. If None then
            the `language` parameter will be used.
        device:
            The device to use for running the models. If None then the device will be
            set automatically.
        batch_size:
            The batch size to use for running the models.
        raise_errors:
            Whether to raise errors when running the benchmark.
        cache_dir:
            The directory to use for caching the models.
        api_key:
            The API key to use for a given inference server.
        force:
            Whether to force the benchmark to run even if the results are already
            cached.
        verbose:
            Whether to print verbose output when running the benchmark. This is
            automatically set if `debug` is True.
        trust_remote_code:
            Whether to trust remote code when running the benchmark.
        use_flash_attention:
            Whether to use Flash Attention for the models. If None then it will be used
            if it is available.
        clear_model_cache:
            Whether to clear the model cache before running the benchmark.
        evaluate_test_split:
            Whether to use the test split for the datasets.
        few_shot:
            Whether to use few-shot learning for the models.
        num_iterations:
            The number of iterations each model should be evaluated for.
        api_base:
            The base URL for a given inference API. Only relevant if `model` refers to a
            model on an inference API.
        api_version:
            The version of the API to use for a given inference API.
        debug:
            Whether to run the benchmark in debug mode.
        run_with_cli:
            Whether the benchmark is being run with the CLI.
        only_allow_safetensors:
            Whether to only allow evaluations of models stored as safetensors.
        first_time:
            Whether this is the first time the benchmark configuration is being created.
            Defaults to False.

    Returns:
        The benchmark configuration.
    """
    language_codes = get_correct_language_codes(language_codes=language)
    model_languages = prepare_languages(
        language_codes=model_language, default_language_codes=language_codes
    )
    dataset_languages = prepare_languages(
        language_codes=dataset_language, default_language_codes=language_codes
    )

    tasks, datasets = prepare_tasks_and_datasets(
        task=task, dataset=dataset, dataset_languages=dataset_languages
    )

    torch_device = prepare_device(device=device)

    if use_flash_attention is None:
        if torch_device.type != "cuda":
            use_flash_attention = False
        elif (
            importlib.util.find_spec("flash_attn") is None
            and importlib.util.find_spec("vllm_flash_attn") is None
        ):
            use_flash_attention = False
            if first_time and torch_device.type == "cuda":
                message = (
                    "Flash attention has not been installed, so this will not be used. "
                    "To install it, run `pip install -U wheel && "
                    "FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE pip install flash-attn "
                    "--no-build-isolation`. Alternatively, you can disable this "
                    "message by setting "
                )
                if run_with_cli:
                    message += "the flag `--no-use-flash-attention`."
                else:
                    message += (
                        "the argument `use_flash_attention=False` in the `Benchmarker`."
                    )
                log_once(message=message, level=logging.INFO)

    # Set variable with number of iterations
    if hasattr(sys, "_called_from_test"):
        num_iterations = 1

    return BenchmarkConfig(
        model_languages=model_languages,
        dataset_languages=dataset_languages,
        tasks=tasks,
        datasets=datasets,
        batch_size=batch_size,
        raise_errors=raise_errors,
        cache_dir=cache_dir,
        api_key=api_key,
        force=force,
        progress_bar=progress_bar,
        save_results=save_results,
        verbose=verbose or debug,
        device=torch_device,
        trust_remote_code=trust_remote_code,
        use_flash_attention=use_flash_attention,
        clear_model_cache=clear_model_cache,
        evaluate_test_split=evaluate_test_split,
        few_shot=few_shot,
        num_iterations=num_iterations,
        api_base=api_base,
        api_version=api_version,
        debug=debug,
        run_with_cli=run_with_cli,
        only_allow_safetensors=only_allow_safetensors,
    )

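To make the wide signature above concrete, here is a hypothetical call that spells out every parameter; the dataset name and all values are illustrative, not defaults shipped by the package.

from euroeval.benchmark_config_factory import build_benchmark_config

config = build_benchmark_config(
    progress_bar=True,
    save_results=True,
    task=None,                 # no task filter: all tasks considered
    dataset="angry-tweets",    # hypothetical dataset name
    language=["da", "no"],     # 'no' is expanded to include 'nb' and 'nn'
    model_language=None,       # falls back to `language`
    dataset_language=None,     # falls back to `language`
    device=None,               # auto-detected: cuda, then mps, then cpu
    batch_size=32,
    raise_errors=False,
    cache_dir=".euroeval_cache",
    api_key=None,
    force=False,
    verbose=False,
    trust_remote_code=False,
    use_flash_attention=None,  # resolved from device and installed packages
    clear_model_cache=False,
    evaluate_test_split=False,
    few_shot=True,
    num_iterations=10,
    api_base=None,
    api_version=None,
    debug=False,
    run_with_cli=False,
    only_allow_safetensors=False,
)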
def get_correct_language_codes(language_codes: str | list[str]) -> list[str]:
    """Get correct language code(s).

    Args:
        language_codes:
            The language codes of the languages to include, both for models and
            datasets. Here 'no' means both Bokmål (nb) and Nynorsk (nn). Set this
            to 'all' if all languages should be considered.

    Returns:
        The correct language codes.
    """
    # Create a dictionary that maps languages to their associated language objects
    language_mapping = get_all_languages()

    # Create the list `languages`
    if "all" in language_codes:
        languages = list(language_mapping.keys())
    elif isinstance(language_codes, str):
        languages = [language_codes]
    else:
        languages = language_codes

    # If `languages` contains 'no' then also include 'nb' and 'nn'. Conversely, if
    # either 'nb' or 'nn' are specified then also include 'no'.
    if "no" in languages:
        languages = list(set(languages) | {"nb", "nn"})
    elif "nb" in languages or "nn" in languages:
        languages = list(set(languages) | {"no"})

    return languages

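The Norwegian expansion is the subtle part of this helper. A small behaviour sketch, derived directly from the code above (return order is unspecified because the implementation round-trips through a set):

get_correct_language_codes(language_codes=["da", "no"])
# -> some ordering of ["da", "no", "nb", "nn"]

get_correct_language_codes(language_codes="nb")
# -> some ordering of ["nb", "no"]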
def prepare_languages(
    language_codes: str | list[str] | None, default_language_codes: list[str]
) -> list["Language"]:
    """Prepare language(s) for benchmarking.

    Args:
        language_codes:
            The language codes of the languages to include for models or datasets.
            If specified then this overrides the `language` parameter for model or
            dataset languages.
        default_language_codes:
            The default language codes of the languages to include.

    Returns:
        The prepared model or dataset languages.
    """
    # Create a dictionary that maps languages to their associated language objects
    language_mapping = get_all_languages()

    # Create the list `languages_str` of language codes to use for models or datasets
    languages_str: list[str]
    if language_codes is None:
        languages_str = default_language_codes
    elif isinstance(language_codes, str):
        languages_str = [language_codes]
    else:
        languages_str = language_codes

    # Convert the model languages to language objects
    if "all" in languages_str:
        prepared_languages = list(language_mapping.values())
    else:
        prepared_languages = [language_mapping[language] for language in languages_str]

    return prepared_languages

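Two edge cases are worth noting: a None `language_codes` falls back to the defaults, and an unrecognised code raises a bare KeyError here, unlike the task and dataset lookups below, which wrap failures in InvalidBenchmark. A sketch:

prepare_languages(language_codes=None, default_language_codes=["da"])
# -> [<Language object for 'da'>], taken from the defaults

prepare_languages(language_codes="all", default_language_codes=["da"])
# -> every Language object known to get_all_languages()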
def prepare_tasks_and_datasets(
    task: str | list[str] | None,
    dataset_languages: list["Language"],
    dataset: str | list[str] | None,
) -> tuple[list["Task"], list[str]]:
    """Prepare task(s) and dataset(s) for benchmarking.

    Args:
        task:
            The tasks to include for dataset. If None then datasets will not be
            filtered based on their task.
        dataset_languages:
            The languages of the datasets in the benchmark.
        dataset:
            The datasets to include for task. If None then all datasets will be
            included, limited by the `task` and `dataset_languages` parameters.

    Returns:
        The prepared tasks and datasets.

    Raises:
        InvalidBenchmark:
            If the task or dataset is not found in the benchmark tasks or datasets.
    """
    # Create a dictionary that maps benchmark tasks to their associated benchmark
    # task objects, and a dictionary that maps dataset names to their associated
    # dataset configuration objects
    task_mapping = get_all_tasks()
    all_dataset_configs = get_all_dataset_configs()

    # Create the list of dataset tasks
    try:
        if task is None:
            tasks = list(task_mapping.values())
        elif isinstance(task, str):
            tasks = [task_mapping[task]]
        else:
            tasks = [task_mapping[t] for t in task]
    except KeyError as e:
        raise InvalidBenchmark(f"Task {e} not found in the benchmark tasks.") from e

    all_official_datasets = [
        dataset_name
        for dataset_name, dataset_config in all_dataset_configs.items()
        if not dataset_config.unofficial
    ]
    if dataset is None:
        dataset = all_official_datasets
    elif isinstance(dataset, str):
        dataset = [dataset]

    all_datasets = list(all_dataset_configs.keys())
    invalid_datasets = set(dataset) - set(all_datasets)
    if invalid_datasets:
        raise InvalidBenchmark(
            f"Dataset(s) {', '.join(invalid_datasets)} not found in the benchmark "
            "datasets."
        )

    datasets = [
        dataset_name
        for dataset_name, dataset_config in all_dataset_configs.items()
        if dataset_name in dataset
        and dataset_config.task in tasks
        and set(dataset_config.languages).intersection(dataset_languages)
    ]

    return tasks, datasets

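A hypothetical call showing how the three filters compose; the task name is illustrative and must exist in get_all_tasks(), otherwise an InvalidBenchmark is raised:

danish = prepare_languages(language_codes=["da"], default_language_codes=["da"])
tasks, datasets = prepare_tasks_and_datasets(
    task="sentiment-classification",  # hypothetical task name
    dataset=None,                     # None selects all official datasets
    dataset_languages=danish,
)
# `datasets` now holds only the dataset names whose config matches both the
# requested task and at least one of the requested dataset languages.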
def prepare_device(device: Device | None) -> torch.device:
    """Prepare device for benchmarking.

    Args:
        device:
            The device to use for running the models. If None then the device will be
            set automatically.

    Returns:
        The prepared device.
    """
    device_mapping = {
        Device.CPU: torch.device("cpu"),
        Device.CUDA: torch.device("cuda"),
        Device.MPS: torch.device("mps"),
    }
    if isinstance(device, Device):
        return device_mapping[device]

    if torch.cuda.is_available():
        return torch.device("cuda")
    elif torch.backends.mps.is_available():
        return torch.device("mps")
    else:
        return torch.device("cpu")
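Behaviour sketch for the fallback chain above: an explicit `Device` enum value (from euroeval/enums.py) always wins, otherwise CUDA is preferred over MPS over CPU:

prepare_device(device=Device.CPU)  # torch.device("cpu"), regardless of hardware
prepare_device(device=None)        # cuda if available, else mps, else cpu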