EuroEval 16.2.2-py3-none-any.whl → 16.4.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of EuroEval might be problematic.
- euroeval/__init__.py +7 -4
- euroeval/benchmark_config_factory.py +0 -4
- euroeval/benchmark_modules/base.py +3 -16
- euroeval/benchmark_modules/fresh.py +5 -2
- euroeval/benchmark_modules/hf.py +107 -66
- euroeval/benchmark_modules/litellm.py +103 -55
- euroeval/benchmark_modules/vllm.py +155 -82
- euroeval/benchmarker.py +184 -129
- euroeval/caching_utils.py +79 -0
- euroeval/callbacks.py +5 -7
- euroeval/cli.py +1 -1
- euroeval/constants.py +9 -0
- euroeval/data_loading.py +14 -11
- euroeval/data_models.py +12 -4
- euroeval/dataset_configs/__init__.py +3 -0
- euroeval/dataset_configs/czech.py +79 -0
- euroeval/dataset_configs/danish.py +10 -13
- euroeval/dataset_configs/dutch.py +0 -3
- euroeval/dataset_configs/english.py +0 -3
- euroeval/dataset_configs/estonian.py +11 -1
- euroeval/dataset_configs/finnish.py +0 -3
- euroeval/dataset_configs/french.py +0 -3
- euroeval/dataset_configs/german.py +0 -3
- euroeval/dataset_configs/italian.py +0 -3
- euroeval/dataset_configs/latvian.py +2 -4
- euroeval/dataset_configs/lithuanian.py +68 -0
- euroeval/dataset_configs/norwegian.py +0 -3
- euroeval/dataset_configs/polish.py +0 -3
- euroeval/dataset_configs/portuguese.py +0 -3
- euroeval/dataset_configs/slovak.py +60 -0
- euroeval/dataset_configs/spanish.py +0 -3
- euroeval/dataset_configs/swedish.py +10 -15
- euroeval/finetuning.py +21 -15
- euroeval/generation.py +10 -10
- euroeval/generation_utils.py +2 -3
- euroeval/logging_utils.py +250 -0
- euroeval/metrics/base.py +0 -3
- euroeval/metrics/huggingface.py +10 -6
- euroeval/metrics/llm_as_a_judge.py +5 -3
- euroeval/metrics/pipeline.py +22 -9
- euroeval/metrics/speed.py +0 -3
- euroeval/model_cache.py +11 -14
- euroeval/model_config.py +4 -5
- euroeval/model_loading.py +3 -0
- euroeval/prompt_templates/linguistic_acceptability.py +30 -3
- euroeval/prompt_templates/multiple_choice.py +34 -1
- euroeval/prompt_templates/named_entity_recognition.py +71 -11
- euroeval/prompt_templates/reading_comprehension.py +41 -3
- euroeval/prompt_templates/sentiment_classification.py +34 -1
- euroeval/prompt_templates/summarization.py +26 -6
- euroeval/scores.py +7 -7
- euroeval/speed_benchmark.py +3 -5
- euroeval/task_group_utils/multiple_choice_classification.py +0 -3
- euroeval/task_group_utils/question_answering.py +0 -3
- euroeval/task_group_utils/sequence_classification.py +43 -31
- euroeval/task_group_utils/text_to_text.py +17 -8
- euroeval/task_group_utils/token_classification.py +10 -9
- euroeval/tokenisation_utils.py +22 -20
- euroeval/utils.py +30 -147
- {euroeval-16.2.2.dist-info → euroeval-16.4.0.dist-info}/METADATA +182 -61
- euroeval-16.4.0.dist-info/RECORD +75 -0
- euroeval-16.2.2.dist-info/RECORD +0 -70
- {euroeval-16.2.2.dist-info → euroeval-16.4.0.dist-info}/WHEEL +0 -0
- {euroeval-16.2.2.dist-info → euroeval-16.4.0.dist-info}/entry_points.txt +0 -0
- {euroeval-16.2.2.dist-info → euroeval-16.4.0.dist-info}/licenses/LICENSE +0 -0
euroeval/utils.py CHANGED

```diff
@@ -11,30 +11,23 @@ import re
 import socket
 import sys
 import typing as t
-import warnings
-from functools import cache
 from pathlib import Path
 
 import demjson3
 import huggingface_hub as hf_hub
-import litellm
 import numpy as np
 import torch
-from datasets.utils import disable_progress_bar
-from transformers import logging as tf_logging
 
+from .caching_utils import cache_arguments
+from .constants import T
 from .exceptions import InvalidBenchmark, InvalidModel, NaNValueInModelOutput
+from .logging_utils import log, log_once
 
 if t.TYPE_CHECKING:
-    from types import TracebackType
-
     from .data_models import ModelIdComponents
     from .types import Predictions
 
 
-logger = logging.getLogger("euroeval")
-
-
 def create_model_cache_dir(cache_dir: str, model_id: str) -> str:
     """Create cache directory for a model.
 
@@ -149,68 +142,6 @@ def enforce_reproducibility(seed: int = 4242) -> np.random.Generator:
     return rng
 
 
-def block_terminal_output() -> None:
-    """Blocks libraries from writing output to the terminal.
-
-    This filters warnings from some libraries, sets the logging level to ERROR for some
-    libraries, disabled tokeniser progress bars when using Hugging Face tokenisers, and
-    disables most of the logging from the `transformers` library.
-    """
-    if os.getenv("FULL_LOG") == "1":
-        return
-
-    # Ignore miscellaneous warnings
-    warnings.filterwarnings("ignore", category=UserWarning)
-    warnings.filterwarnings("ignore", category=FutureWarning)
-    logging.getLogger("absl").setLevel(logging.CRITICAL)
-
-    # Disable matplotlib logging
-    logging.getLogger("matplotlib.font_manager").setLevel(logging.CRITICAL)
-
-    # Disable PyTorch logging
-    logging.getLogger("torch.utils.cpp_extension").setLevel(logging.CRITICAL)
-    warnings.filterwarnings(action="ignore", module="torch*")
-    os.environ["TORCH_LOGS"] = "-all"
-
-    # Disable huggingface_hub logging
-    logging.getLogger("huggingface_hub").setLevel(logging.CRITICAL)
-
-    # Disable LiteLLM logging
-    logging.getLogger("LiteLLM").setLevel(logging.CRITICAL)
-    logging.getLogger("LiteLLM Router").setLevel(logging.CRITICAL)
-    logging.getLogger("LiteLLM Proxy").setLevel(logging.CRITICAL)
-    logging.getLogger("openai").setLevel(logging.CRITICAL)
-    logging.getLogger("httpx").setLevel(logging.CRITICAL)
-    litellm.suppress_debug_info = True
-
-    # Disable vLLM logging
-    logging.getLogger("vllm").setLevel(logging.CRITICAL)
-    logging.getLogger("vllm.engine.llm_engine").setLevel(logging.CRITICAL)
-    logging.getLogger("vllm.transformers_utils.tokenizer").setLevel(logging.CRITICAL)
-    logging.getLogger("vllm.core.scheduler").setLevel(logging.CRITICAL)
-    logging.getLogger("vllm.model_executor.weight_utils").setLevel(logging.CRITICAL)
-    logging.getLogger("vllm.platforms").setLevel(logging.CRITICAL)
-    logging.getLogger("mistral_common.tokens.tokenizers.tekken").setLevel(
-        logging.CRITICAL
-    )
-    os.environ["LOG_LEVEL"] = "CRITICAL"
-    os.environ["VLLM_CONFIGURE_LOGGING"] = "0"
-
-    # Disable datasets logging
-    logging.getLogger("datasets").setLevel(logging.CRITICAL)
-    logging.getLogger("filelock").setLevel(logging.CRITICAL)
-    disable_progress_bar()
-
-    # Disable evaluate logging
-    warnings.filterwarnings("ignore", module="seqeval*")
-
-    # Disable most of the `transformers` logging
-    tf_logging._default_log_level = logging.CRITICAL
-    tf_logging.set_verbosity(logging.CRITICAL)
-    logging.getLogger("transformers.trainer").setLevel(logging.CRITICAL)
-    logging.getLogger("accelerate").setLevel(logging.CRITICAL)
-
-
 def get_class_by_name(class_name: str | list[str], module_name: str) -> t.Type | None:
     """Get a class by its name.
 
@@ -240,9 +171,10 @@ def get_class_by_name(class_name: str | list[str], module_name: str) -> t.Type |
 
     if error_messages:
         errors = "\n- " + "\n- ".join(error_messages)
-        logger.debug(
+        log(
             f"Could not find the class with the name(s) {', '.join(class_name)}. The "
-            f"following error messages were raised: {errors}"
+            f"following error messages were raised: {errors}",
+            level=logging.DEBUG,
         )
 
     # If the class could not be found, return None
```
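In the new code, the old module-level `logger.debug(...)` calls are routed through the `log` helper imported from the new `euroeval/logging_utils.py` module (+250 lines), which is not included in this diff. Below is a rough sketch of a compatible helper, inferred only from the call sites in this file; the body is an assumption, not the actual implementation:

```python
import logging

logger = logging.getLogger("euroeval")


def log(message: str, level: int = logging.INFO) -> None:
    """Log a message at the given level (signature inferred from call sites)."""
    # Assumption: the real helper in euroeval/logging_utils.py likely does more
    # (e.g. deduplication for `log_once` and nicer formatting); this sketch just
    # dispatches to the package logger.
    logger.log(level, message)
```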
```diff
@@ -264,49 +196,27 @@ def get_min_cuda_compute_capability() -> float | None:
     return float(f"{major}.{minor}")
 
 
-@cache
+@cache_arguments(disable_condition=lambda: hasattr(sys, "_called_from_test"))
 def internet_connection_available() -> bool:
     """Checks if internet connection is available by pinging google.com.
 
     Returns:
         Whether or not internet connection is available.
     """
+    internet_available: bool = False
+
     try:
         s = socket.create_connection(("1.1.1.1", 80))
         s.close()
-
-
-
-        # import these here as they're developer dependencies, we check the exception name
-        # instead. If the exception is not related to socket connections, we reraise it.
+        internet_available = True
+    except OSError:
+        pass
     except Exception as e:
         pytest_socket_errors = ["SocketConnectBlockedError", "SocketBlockedError"]
-        if type(e).__name__ in pytest_socket_errors
-
-
-
-
-class HiddenPrints:
-    """Context manager which removes all terminal output."""
-
-    def __enter__(self) -> None:
-        """Enter the context manager."""
-        self._original_stdout = sys.stdout
-        self._original_stderr = sys.stderr
-        sys.stdout = open(os.devnull, "w")
-        sys.stderr = open(os.devnull, "w")
-
-    def __exit__(
-        self,
-        exc_type: t.Type[BaseException],
-        exc_val: BaseException,
-        exc_tb: "TracebackType",
-    ) -> None:
-        """Exit the context manager."""
-        sys.stdout.close()
-        sys.stderr.close()
-        sys.stdout = self._original_stdout
-        sys.stderr = self._original_stderr
+        if type(e).__name__ not in pytest_socket_errors:
+            raise e
+
+    return internet_available
 
 
 def raise_if_model_output_contains_nan_values(model_output: "Predictions") -> None:
```
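Here the stdlib `functools.cache` decorator is swapped for `cache_arguments` from the new `euroeval/caching_utils.py` (+79 lines), whose `disable_condition` callable lets caching be bypassed, in this case whenever pytest is running. That module is not part of this diff, so the following is only a minimal sketch of how such a decorator could be built, assuming nothing beyond the usage shown above:

```python
import functools
import typing as t

T = t.TypeVar("T")


def cache_arguments(
    disable_condition: t.Callable[[], bool] = lambda: False,
) -> t.Callable[[t.Callable[..., T]], t.Callable[..., T]]:
    """Cache a function's results unless the disable condition holds (sketch)."""

    def decorator(func: t.Callable[..., T]) -> t.Callable[..., T]:
        cached_func = functools.cache(func)

        @functools.wraps(func)
        def wrapper(*args: t.Any, **kwargs: t.Any) -> T:
            # Bypass the cache when the condition holds, e.g.
            # `lambda: hasattr(sys, "_called_from_test")` during test runs
            if disable_condition():
                return func(*args, **kwargs)
            return cached_func(*args, **kwargs)

        return wrapper

    return decorator
```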
```diff
@@ -364,34 +274,6 @@ def unscramble(scrambled_text: str) -> str:
     return unscrambled
 
 
-@cache
-def log_once(message: str, level: int = logging.INFO) -> None:
-    """Log a message once.
-
-    This is ensured by caching the input/output pairs of this function, using the
-    `functools.cache` decorator.
-
-    Args:
-        message:
-            The message to log.
-        level:
-            The logging level. Defaults to logging.INFO.
-    """
-    match level:
-        case logging.DEBUG:
-            logger.debug(message)
-        case logging.INFO:
-            logger.info(message)
-        case logging.WARNING:
-            logger.warning(message)
-        case logging.ERROR:
-            logger.error(message)
-        case logging.CRITICAL:
-            logger.critical(message)
-        case _:
-            raise ValueError(f"Invalid logging level: {level}")
-
-
 def get_package_version(package_name: str) -> str | None:
     """Get the version of a package.
 
@@ -408,9 +290,6 @@ def get_package_version(package_name: str) -> str | None:
         return None
 
 
-T = t.TypeVar("T", bound=object)
-
-
 def safe_run(coroutine: t.Coroutine[t.Any, t.Any, T]) -> T:
     """Run a coroutine, ensuring that the event loop is always closed when we're done.
 
@@ -462,39 +341,43 @@ def extract_json_dict_from_string(s: str) -> dict | None:
     Returns:
         The extracted JSON dictionary, or None if no JSON dictionary could be found.
     """
-    json_regex = r"\{[^{}]
+    json_regex = r"\{[^{}]*?\}"
     if (json_match := re.search(pattern=json_regex, string=s, flags=re.DOTALL)) is None:
-        logger.debug(
+        log(
             "The model output does not contain any JSON dictionary, so cannot parse "
-            f"it. Skipping. Here is the output: {s!r}"
+            f"it. Skipping. Here is the output: {s!r}",
+            level=logging.DEBUG,
         )
         return None
     json_string = json_match.group()
     try:
         json_output = demjson3.decode(txt=json_string)
     except demjson3.JSONDecodeError:
-        logger.debug(
+        log(
             "The model output is not valid JSON, so cannot parse it. Skipping. "
-            f"Here is the output: {json_string!r}"
+            f"Here is the output: {json_string!r}",
+            level=logging.DEBUG,
        )
         return None
     if not isinstance(json_output, dict):
-        logger.debug(
+        log(
             "The model output is not a JSON dictionary, so cannot parse "
-            f"it. Skipping. Here is the output: {json_string!r}"
+            f"it. Skipping. Here is the output: {json_string!r}",
+            level=logging.DEBUG,
         )
         return None
     elif not all(isinstance(key, str) for key in json_output.keys()):
-        logger.debug(
+        log(
             "The model output is not a JSON dictionary with string keys, "
             "so cannot parse it. Skipping. Here is the output: "
-            f"{json_string!r}"
+            f"{json_string!r}",
+            level=logging.DEBUG,
         )
         return None
     return json_output
 
 
-@cache
+@cache_arguments()
 def get_hf_token(api_key: str | None) -> str | bool:
     """Get the Hugging Face token.
 
```
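The tightened pattern `r"\{[^{}]*?\}"` matches the first brace-delimited span non-greedily (it still cannot span nested braces), and `demjson3` stays in place because it tolerates the lenient JSON that model output tends to contain. A small illustration of the combined behaviour, with a made-up model output string and assuming demjson3's default non-strict mode:

```python
import re

import demjson3

# Same pattern as the new EuroEval code: first flat (non-nested) {...} span
json_regex = r"\{[^{}]*?\}"

model_output = "Sure! The answer is {'label': 'positive',} - hope that helps."
match = re.search(pattern=json_regex, string=model_output, flags=re.DOTALL)
assert match is not None

# demjson3's lenient decoding accepts the single quotes and trailing comma
# that the stdlib `json` module would reject
print(demjson3.decode(txt=match.group()))  # {'label': 'positive'}
```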
{euroeval-16.2.2.dist-info → euroeval-16.4.0.dist-info}/METADATA CHANGED

````diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: EuroEval
-Version: 16.2.2
+Version: 16.4.0
 Summary: The robust European language model benchmark.
 Project-URL: Repository, https://github.com/EuroEval/EuroEval
 Project-URL: Issues, https://github.com/EuroEval/EuroEval/issues
@@ -62,21 +62,28 @@ Provides-Extra: all
 Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == 'all'
 Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'all'
 Requires-Dist: timm>=1.0.19; extra == 'all'
-Requires-Dist: vllm[flashinfer]>=0.
+Requires-Dist: vllm[flashinfer]>=0.11.0; (platform_system == 'Linux') and extra == 'all'
 Provides-Extra: generative
 Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == 'generative'
 Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'generative'
 Requires-Dist: timm>=1.0.19; extra == 'generative'
-Requires-Dist: vllm[flashinfer]>=0.
+Requires-Dist: vllm[flashinfer]>=0.11.0; (platform_system == 'Linux') and extra == 'generative'
 Description-Content-Type: text/markdown
 
+<!-- This disables the requirement that the first line is a top-level heading -->
+<!-- markdownlint-configure-file { "MD041": false } -->
+
 <div align='center'>
-<img
+<img
+  src="https://raw.githubusercontent.com/EuroEval/EuroEval/main/gfx/euroeval.png"
+  height="500"
+  width="372"
+>
 </div>
 
-### The robust European language model benchmark
+### The robust European language model benchmark
 
-
+(formerly known as ScandEval)
 
 ______________________________________________________________________
 [](https://euroeval.com)
@@ -85,19 +92,19 @@ ______________________________________________________________________
 [](https://arxiv.org/abs/2406.13469)
 [](https://github.com/EuroEval/EuroEval/blob/main/LICENSE)
 [](https://github.com/EuroEval/EuroEval/commits/main)
-[](https://github.com/EuroEval/EuroEval/tree/main/tests)
 [](https://github.com/EuroEval/EuroEval/blob/main/CODE_OF_CONDUCT.md)
 
-
 ## Maintainer
 
-- Dan Saattrup Smart ([@saattrupdan](https://github.com/saattrupdan), dan.smart@alexandra.dk)
-
+- Dan Saattrup Smart ([@saattrupdan](https://github.com/saattrupdan), <dan.smart@alexandra.dk>)
 
 ## Installation
+
 To install the package simply write the following command in your favorite terminal:
-
-
+
+```bash
+pip install euroeval[all]
 ```
 
 This will install the EuroEval package with all extras. You can also install the
@@ -105,51 +112,61 @@ minimal version by leaving out the `[all]`, in which case the package will let y
 when an evaluation requires a certain extra dependency, and how you install it.
 
 ## Quickstart
+
 ### Benchmarking from the Command Line
+
 The easiest way to benchmark pretrained models is via the command line interface. After
 having installed the package, you can benchmark your favorite model like so:
-
-
+
+```bash
+euroeval --model <model-id>
 ```
 
 Here `model` is the HuggingFace model ID, which can be found on the [HuggingFace
 Hub](https://huggingface.co/models). By default this will benchmark the model on all
 the tasks available. If you want to benchmark on a particular task, then use the
 `--task` argument:
-
-
+
+```bash
+euroeval --model <model-id> --task sentiment-classification
 ```
 
 We can also narrow down which languages we would like to benchmark on. This can be done
 by setting the `--language` argument. Here we thus benchmark the model on the Danish
 sentiment classification task:
-
-
+
+```bash
+euroeval --model <model-id> --task sentiment-classification --language da
 ```
 
 Multiple models, datasets and/or languages can be specified by just attaching multiple
 arguments. Here is an example with two models:
-
-
+
+```bash
+euroeval --model <model-id1> --model <model-id2>
 ```
 
 The specific model version/revision to use can also be added after the suffix '@':
-
-
+
+```bash
+euroeval --model <model-id>@<commit>
 ```
 
 This can be a branch name, a tag name, or a commit id. It defaults to 'main' for latest.
 
 See all the arguments and options available for the `euroeval` command by typing
-
-
+
+```bash
+euroeval --help
 ```
 
 ### Benchmarking from a Script
+
 In a script, the syntax is similar to the command line interface. You simply initialise
 an object of the `Benchmarker` class, and call this benchmark object with your favorite
 model:
-
+
+```python
 >>> from euroeval import Benchmarker
 >>> benchmark = Benchmarker()
 >>> benchmark(model="<model-id>")
@@ -157,29 +174,34 @@ model:
 
 To benchmark on a specific task and/or language, you simply specify the `task` or
 `language` arguments, shown here with same example as above:
-
+
+```python
 >>> benchmark(model="<model-id>", task="sentiment-classification", language="da")
 ```
 
 If you want to benchmark a subset of all the models on the Hugging Face Hub, you can
 simply leave out the `model` argument. In this example, we're benchmarking all Danish
 models on the Danish sentiment classification task:
-
+
+```python
 >>> benchmark(task="sentiment-classification", language="da")
 ```
 
 ### Benchmarking in an Offline Environment
+
 If you need to benchmark in an offline environment, you need to download the models,
 datasets and metrics beforehand. This can be done by adding the `--download-only`
 argument, from the command line, or the `download_only` argument, if benchmarking from a
 script. For example to download the model you want and all of the Danish sentiment
 classification datasets:
-
-
+
+```bash
+euroeval --model <model-id> --task sentiment-classification --language da --download-only
 ```
 
 Or from a script:
-
+
+```python
 >>> benchmark(
 ...     model="<model-id>",
 ...     task="sentiment-classification",
@@ -193,11 +215,13 @@ internet connection will be required during evaluation. If offline support is im
 to you, please consider [opening an issue](https://github.com/EuroEval/EuroEval/issues).
 
 ### Benchmarking from Docker
+
 A Dockerfile is provided in the repo, which can be downloaded and run, without needing
 to clone the repo and installing from source. This can be fetched programmatically by
 running the following:
-
-
+
+```bash
+wget https://raw.githubusercontent.com/EuroEval/EuroEval/main/Dockerfile.cuda
 ```
 
 Next, to be able to build the Docker image, first ensure that the NVIDIA Container
@@ -208,56 +232,153 @@ and
 Ensure that the the CUDA version stated at the top of the Dockerfile matches the CUDA
 version installed (which you can check using `nvidia-smi`). After that, we build the
 image as follows:
-
-
+
+```bash
+docker build --pull -t euroeval -f Dockerfile.cuda .
 ```
 
 With the Docker image built, we can now evaluate any model as follows:
-
-
+
+```bash
+docker run -e args="<euroeval-arguments>" --gpus 1 --name euroeval --rm euroeval
 ```
 
 Here `<euroeval-arguments>` consists of the arguments added to the `euroeval` CLI
 argument. This could for instance be `--model <model-id> --task
 sentiment-classification`.
 
-
 ### Reproducing the datasets
+
 All datasets used in this project are generated using the scripts located in the
 [src/scripts](src/scripts) folder. To reproduce a dataset, run the corresponding script
 with the following command
 
-```
-
+```bash
+uv run src/scripts/<name-of-script>.py
 ```
 
 Replace <name-of-script> with the specific script you wish to execute, e.g.,
 
-```
-
+```bash
+uv run src/scripts/create_allocine.py
 ```
 
 ## Contributors :pray:
 
 A huge thank you to all the contributors who have helped make this project a success!
 
-<a href="https://github.com/peter-sk"
-<
-
-
-
-
-<a href="https://github.com/
-<
-
-
-
-
-<a href="https://github.com/
-<
-
+<a href="https://github.com/peter-sk">
+  <img
+    src="https://avatars.githubusercontent.com/u/6168908"
+    width=50
+    alt="Contributor avatar for peter-sk"
+  />
+</a>
+<a href="https://github.com/AJDERS">
+  <img
+    src="https://avatars.githubusercontent.com/u/38854604"
+    width=50
+    alt="Contributor avatar for AJDERS"
+  />
+</a>
+<a href="https://github.com/oliverkinch">
+  <img
+    src="https://avatars.githubusercontent.com/u/71556498"
+    width=50
+    alt="Contributor avatar for oliverkinch"
+  />
+</a>
+<a href="https://github.com/versae">
+  <img
+    src="https://avatars.githubusercontent.com/u/173537"
+    width=50
+    alt="Contributor avatar for versae"
+  />
+</a>
+<a href="https://github.com/KennethEnevoldsen">
+  <img
+    src="https://avatars.githubusercontent.com/u/23721977"
+    width=50
+    alt="Contributor avatar for KennethEnevoldsen"
+  />
+</a>
+<a href="https://github.com/viggo-gascou">
+  <img
+    src="https://avatars.githubusercontent.com/u/94069687"
+    width=50
+    alt="Contributor avatar for viggo-gascou"
+  />
+</a>
+<a href="https://github.com/mathiasesn">
+  <img
+    src="https://avatars.githubusercontent.com/u/27091759"
+    width=50
+    alt="Contributor avatar for mathiasesn"
+  />
+</a>
+<a href="https://github.com/Alkarex">
+  <img
+    src="https://avatars.githubusercontent.com/u/1008324"
+    width=50
+    alt="Contributor avatar for Alkarex"
+  />
+</a>
+<a href="https://github.com/marksverdhei">
+  <img
+    src="https://avatars.githubusercontent.com/u/46672778"
+    width=50
+    alt="Contributor avatar for marksverdhei"
+  />
+</a>
+<a href="https://github.com/Mikeriess">
+  <img
+    src="https://avatars.githubusercontent.com/u/19728563"
+    width=50
+    alt="Contributor avatar for Mikeriess"
+  />
+</a>
+<a href="https://github.com/ThomasKluiters">
+  <img
+    src="https://avatars.githubusercontent.com/u/8137941"
+    width=50
+    alt="Contributor avatar for ThomasKluiters"
+  />
+</a>
+<a href="https://github.com/BramVanroy">
+  <img
+    src="https://avatars.githubusercontent.com/u/2779410"
+    width=50
+    alt="Contributor avatar for BramVanroy"
+  />
+</a>
+<a href="https://github.com/peregilk">
+  <img
+    src="https://avatars.githubusercontent.com/u/9079808"
+    width=50
+    alt="Contributor avatar for peregilk"
+  />
+</a>
+<a href="https://github.com/Rijgersberg">
+  <img
+    src="https://avatars.githubusercontent.com/u/8604946"
+    width=50
+    alt="Contributor avatar for Rijgersberg"
+  />
+</a>
+<a href="https://github.com/duarteocarmo">
+  <img
+    src="https://avatars.githubusercontent.com/u/26342344"
+    width=50
+    alt="Contributor avatar for duarteocarmo"
+  />
+</a>
+<a href="https://github.com/slowwavesleep">
  <img
+    src="https://avatars.githubusercontent.com/u/44175589"
+    width=50
+    alt="Contributor avatar for slowwavesleep"
+  />
+</a>
 
 ### Contribute to EuroEval
 
@@ -269,8 +390,8 @@ contributing new datasets, your help makes this project better for everyone.
 - **Adding datasets**: If you're interested in adding a new dataset to EuroEval, we have
   a [dedicated guide](NEW_DATASET_GUIDE.md) with step-by-step instructions.
 
-
 ### Special Thanks
+
 - Thanks to [Google](https://google.com/) for sponsoring Gemini credits as part of their
   [Google Cloud for Researchers Program](https://cloud.google.com/edu/researchers).
 - Thanks [@Mikeriess](https://github.com/Mikeriess) for evaluating many of the larger
@@ -285,11 +406,11 @@ contributing new datasets, your help makes this project better for everyone.
 - Thanks to [CHC](https://chc.au.dk/) for sponsoring the OpenAI credits used to
   evaluate GPT-4-turbo in German.
 
-
 ## Citing EuroEval
+
 If you want to cite the framework then feel free to use this:
 
-```
+```bibtex
 @article{smart2024encoder,
   title={Encoder vs Decoder: Comparative Analysis of Encoder and Decoder Language Models on Multilingual NLU Tasks},
   author={Smart, Dan Saattrup and Enevoldsen, Kenneth and Schneider-Kamp, Peter},
````