EuroEval 16.2.2-py3-none-any.whl → 16.4.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of EuroEval has been flagged by the registry.

Files changed (65)
  1. euroeval/__init__.py +7 -4
  2. euroeval/benchmark_config_factory.py +0 -4
  3. euroeval/benchmark_modules/base.py +3 -16
  4. euroeval/benchmark_modules/fresh.py +5 -2
  5. euroeval/benchmark_modules/hf.py +107 -66
  6. euroeval/benchmark_modules/litellm.py +103 -55
  7. euroeval/benchmark_modules/vllm.py +155 -82
  8. euroeval/benchmarker.py +184 -129
  9. euroeval/caching_utils.py +79 -0
  10. euroeval/callbacks.py +5 -7
  11. euroeval/cli.py +1 -1
  12. euroeval/constants.py +9 -0
  13. euroeval/data_loading.py +14 -11
  14. euroeval/data_models.py +12 -4
  15. euroeval/dataset_configs/__init__.py +3 -0
  16. euroeval/dataset_configs/czech.py +79 -0
  17. euroeval/dataset_configs/danish.py +10 -13
  18. euroeval/dataset_configs/dutch.py +0 -3
  19. euroeval/dataset_configs/english.py +0 -3
  20. euroeval/dataset_configs/estonian.py +11 -1
  21. euroeval/dataset_configs/finnish.py +0 -3
  22. euroeval/dataset_configs/french.py +0 -3
  23. euroeval/dataset_configs/german.py +0 -3
  24. euroeval/dataset_configs/italian.py +0 -3
  25. euroeval/dataset_configs/latvian.py +2 -4
  26. euroeval/dataset_configs/lithuanian.py +68 -0
  27. euroeval/dataset_configs/norwegian.py +0 -3
  28. euroeval/dataset_configs/polish.py +0 -3
  29. euroeval/dataset_configs/portuguese.py +0 -3
  30. euroeval/dataset_configs/slovak.py +60 -0
  31. euroeval/dataset_configs/spanish.py +0 -3
  32. euroeval/dataset_configs/swedish.py +10 -15
  33. euroeval/finetuning.py +21 -15
  34. euroeval/generation.py +10 -10
  35. euroeval/generation_utils.py +2 -3
  36. euroeval/logging_utils.py +250 -0
  37. euroeval/metrics/base.py +0 -3
  38. euroeval/metrics/huggingface.py +10 -6
  39. euroeval/metrics/llm_as_a_judge.py +5 -3
  40. euroeval/metrics/pipeline.py +22 -9
  41. euroeval/metrics/speed.py +0 -3
  42. euroeval/model_cache.py +11 -14
  43. euroeval/model_config.py +4 -5
  44. euroeval/model_loading.py +3 -0
  45. euroeval/prompt_templates/linguistic_acceptability.py +30 -3
  46. euroeval/prompt_templates/multiple_choice.py +34 -1
  47. euroeval/prompt_templates/named_entity_recognition.py +71 -11
  48. euroeval/prompt_templates/reading_comprehension.py +41 -3
  49. euroeval/prompt_templates/sentiment_classification.py +34 -1
  50. euroeval/prompt_templates/summarization.py +26 -6
  51. euroeval/scores.py +7 -7
  52. euroeval/speed_benchmark.py +3 -5
  53. euroeval/task_group_utils/multiple_choice_classification.py +0 -3
  54. euroeval/task_group_utils/question_answering.py +0 -3
  55. euroeval/task_group_utils/sequence_classification.py +43 -31
  56. euroeval/task_group_utils/text_to_text.py +17 -8
  57. euroeval/task_group_utils/token_classification.py +10 -9
  58. euroeval/tokenisation_utils.py +22 -20
  59. euroeval/utils.py +30 -147
  60. {euroeval-16.2.2.dist-info → euroeval-16.4.0.dist-info}/METADATA +182 -61
  61. euroeval-16.4.0.dist-info/RECORD +75 -0
  62. euroeval-16.2.2.dist-info/RECORD +0 -70
  63. {euroeval-16.2.2.dist-info → euroeval-16.4.0.dist-info}/WHEEL +0 -0
  64. {euroeval-16.2.2.dist-info → euroeval-16.4.0.dist-info}/entry_points.txt +0 -0
  65. {euroeval-16.2.2.dist-info → euroeval-16.4.0.dist-info}/licenses/LICENSE +0 -0
euroeval/utils.py CHANGED
@@ -11,30 +11,23 @@ import re
 import socket
 import sys
 import typing as t
-import warnings
-from functools import cache
 from pathlib import Path
 
 import demjson3
 import huggingface_hub as hf_hub
-import litellm
 import numpy as np
 import torch
-from datasets.utils import disable_progress_bar
-from transformers import logging as tf_logging
 
+from .caching_utils import cache_arguments
+from .constants import T
 from .exceptions import InvalidBenchmark, InvalidModel, NaNValueInModelOutput
+from .logging_utils import log, log_once
 
 if t.TYPE_CHECKING:
-    from types import TracebackType
-
     from .data_models import ModelIdComponents
     from .types import Predictions
 
 
-logger = logging.getLogger("euroeval")
-
-
 def create_model_cache_dir(cache_dir: str, model_id: str) -> str:
     """Create cache directory for a model.
 
@@ -149,68 +142,6 @@ def enforce_reproducibility(seed: int = 4242) -> np.random.Generator:
     return rng
 
 
-def block_terminal_output() -> None:
-    """Blocks libraries from writing output to the terminal.
-
-    This filters warnings from some libraries, sets the logging level to ERROR for some
-    libraries, disabled tokeniser progress bars when using Hugging Face tokenisers, and
-    disables most of the logging from the `transformers` library.
-    """
-    if os.getenv("FULL_LOG") == "1":
-        return
-
-    # Ignore miscellaneous warnings
-    warnings.filterwarnings("ignore", category=UserWarning)
-    warnings.filterwarnings("ignore", category=FutureWarning)
-    logging.getLogger("absl").setLevel(logging.CRITICAL)
-
-    # Disable matplotlib logging
-    logging.getLogger("matplotlib.font_manager").setLevel(logging.CRITICAL)
-
-    # Disable PyTorch logging
-    logging.getLogger("torch.utils.cpp_extension").setLevel(logging.CRITICAL)
-    warnings.filterwarnings(action="ignore", module="torch*")
-    os.environ["TORCH_LOGS"] = "-all"
-
-    # Disable huggingface_hub logging
-    logging.getLogger("huggingface_hub").setLevel(logging.CRITICAL)
-
-    # Disable LiteLLM logging
-    logging.getLogger("LiteLLM").setLevel(logging.CRITICAL)
-    logging.getLogger("LiteLLM Router").setLevel(logging.CRITICAL)
-    logging.getLogger("LiteLLM Proxy").setLevel(logging.CRITICAL)
-    logging.getLogger("openai").setLevel(logging.CRITICAL)
-    logging.getLogger("httpx").setLevel(logging.CRITICAL)
-    litellm.suppress_debug_info = True
-
-    # Disable vLLM logging
-    logging.getLogger("vllm").setLevel(logging.CRITICAL)
-    logging.getLogger("vllm.engine.llm_engine").setLevel(logging.CRITICAL)
-    logging.getLogger("vllm.transformers_utils.tokenizer").setLevel(logging.CRITICAL)
-    logging.getLogger("vllm.core.scheduler").setLevel(logging.CRITICAL)
-    logging.getLogger("vllm.model_executor.weight_utils").setLevel(logging.CRITICAL)
-    logging.getLogger("vllm.platforms").setLevel(logging.CRITICAL)
-    logging.getLogger("mistral_common.tokens.tokenizers.tekken").setLevel(
-        logging.CRITICAL
-    )
-    os.environ["LOG_LEVEL"] = "CRITICAL"
-    os.environ["VLLM_CONFIGURE_LOGGING"] = "0"
-
-    # Disable datasets logging
-    logging.getLogger("datasets").setLevel(logging.CRITICAL)
-    logging.getLogger("filelock").setLevel(logging.CRITICAL)
-    disable_progress_bar()
-
-    # Disable evaluate logging
-    warnings.filterwarnings("ignore", module="seqeval*")
-
-    # Disable most of the `transformers` logging
-    tf_logging._default_log_level = logging.CRITICAL
-    tf_logging.set_verbosity(logging.CRITICAL)
-    logging.getLogger("transformers.trainer").setLevel(logging.CRITICAL)
-    logging.getLogger("accelerate").setLevel(logging.CRITICAL)
-
-
 def get_class_by_name(class_name: str | list[str], module_name: str) -> t.Type | None:
     """Get a class by its name.
 
@@ -240,9 +171,10 @@ def get_class_by_name(class_name: str | list[str], module_name: str) -> t.Type |
 
     if error_messages:
         errors = "\n- " + "\n- ".join(error_messages)
-        logger.debug(
+        log(
             f"Could not find the class with the name(s) {', '.join(class_name)}. The "
-            f"following error messages were raised: {errors}"
+            f"following error messages were raised: {errors}",
+            level=logging.DEBUG,
         )
 
     # If the class could not be found, return None
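
The module-level `logger` is gone from `utils.py`; logging now goes through `log` and `log_once` from the new `euroeval/logging_utils.py` module (+250 lines, not included in this diff). Judging from the call sites above and the `log_once` implementation removed further down, a minimal sketch of what these helpers could look like:

```python
# Minimal sketch of the relocated helpers, assuming euroeval/logging_utils.py
# keeps the old semantics; the real module is not shown in this diff.
import functools
import logging

logger = logging.getLogger("euroeval")


def log(message: str, level: int = logging.INFO) -> None:
    """Log `message` at `level` on the shared 'euroeval' logger."""
    logger.log(level=level, msg=message)


@functools.cache
def log_once(message: str, level: int = logging.INFO) -> None:
    """Log `message` only the first time this exact call is seen."""
    log(message=message, level=level)
```

Routing everything through one helper keeps the level dispatch in a single place instead of scattering `logger.debug(...)` calls across modules.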
@@ -264,49 +196,27 @@ def get_min_cuda_compute_capability() -> float | None:
     return float(f"{major}.{minor}")
 
 
-@cache
+@cache_arguments(disable_condition=lambda: hasattr(sys, "_called_from_test"))
 def internet_connection_available() -> bool:
     """Checks if internet connection is available by pinging google.com.
 
     Returns:
         Whether or not internet connection is available.
     """
+    internet_available: bool = False
+
     try:
         s = socket.create_connection(("1.1.1.1", 80))
         s.close()
-        return True
-
-    # We want to only catch exceptions related to socket connections, but as we cannot
-    # import these here as they're developer dependencies, we check the exception name
-    # instead. If the exception is not related to socket connections, we reraise it.
+        internet_available = True
+    except OSError:
+        pass
     except Exception as e:
         pytest_socket_errors = ["SocketConnectBlockedError", "SocketBlockedError"]
-        if type(e).__name__ in pytest_socket_errors or isinstance(e, OSError):
-            return False
-        raise e
-
-
-class HiddenPrints:
-    """Context manager which removes all terminal output."""
-
-    def __enter__(self) -> None:
-        """Enter the context manager."""
-        self._original_stdout = sys.stdout
-        self._original_stderr = sys.stderr
-        sys.stdout = open(os.devnull, "w")
-        sys.stderr = open(os.devnull, "w")
-
-    def __exit__(
-        self,
-        exc_type: t.Type[BaseException],
-        exc_val: BaseException,
-        exc_tb: "TracebackType",
-    ) -> None:
-        """Exit the context manager."""
-        sys.stdout.close()
-        sys.stderr.close()
-        sys.stdout = self._original_stdout
-        sys.stderr = self._original_stderr
+        if type(e).__name__ not in pytest_socket_errors:
+            raise e
+
+    return internet_available
 
 
 def raise_if_model_output_contains_nan_values(model_output: "Predictions") -> None:
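
Both `internet_connection_available` here and `get_hf_token` further down swap `functools.cache` for `cache_arguments` from the new `euroeval/caching_utils.py` module (+79 lines, not included in this diff). The `disable_condition` hook lets callers switch the cache off at runtime, here whenever pytest has set `sys._called_from_test`. A minimal sketch, assuming the decorator simply wraps `functools.cache`:

```python
# Minimal sketch of a cache_arguments decorator with a disable_condition hook;
# the actual euroeval/caching_utils.py implementation is not shown here.
import functools
import typing as t


def cache_arguments(
    disable_condition: t.Callable[[], bool] = lambda: False,
) -> t.Callable[[t.Callable], t.Callable]:
    """Like `functools.cache`, but bypassed whenever `disable_condition()` is true."""

    def decorator(func: t.Callable) -> t.Callable:
        cached_func = functools.cache(func)

        @functools.wraps(func)
        def wrapper(*args: t.Any, **kwargs: t.Any) -> t.Any:
            # Skip the cache entirely when the condition holds, e.g. during
            # test runs, so every call hits the undecorated function.
            if disable_condition():
                return func(*args, **kwargs)
            return cached_func(*args, **kwargs)

        return wrapper

    return decorator
```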
@@ -364,34 +274,6 @@ def unscramble(scrambled_text: str) -> str:
     return unscrambled
 
 
-@cache
-def log_once(message: str, level: int = logging.INFO) -> None:
-    """Log a message once.
-
-    This is ensured by caching the input/output pairs of this function, using the
-    `functools.cache` decorator.
-
-    Args:
-        message:
-            The message to log.
-        level:
-            The logging level. Defaults to logging.INFO.
-    """
-    match level:
-        case logging.DEBUG:
-            logger.debug(message)
-        case logging.INFO:
-            logger.info(message)
-        case logging.WARNING:
-            logger.warning(message)
-        case logging.ERROR:
-            logger.error(message)
-        case logging.CRITICAL:
-            logger.critical(message)
-        case _:
-            raise ValueError(f"Invalid logging level: {level}")
-
-
 def get_package_version(package_name: str) -> str | None:
     """Get the version of a package.
 
@@ -408,9 +290,6 @@ def get_package_version(package_name: str) -> str | None:
     return None
 
 
-T = t.TypeVar("T", bound=object)
-
-
def safe_run(coroutine: t.Coroutine[t.Any, t.Any, T]) -> T:
     """Run a coroutine, ensuring that the event loop is always closed when we're done.
 
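
The shared type variable `T` used by `safe_run` now comes from `euroeval/constants.py` (hence the new `from .constants import T` import in the first hunk). Presumably it is defined there just as it was removed here:

```python
# Presumed counterpart in euroeval/constants.py (not shown in this diff).
import typing as t

T = t.TypeVar("T", bound=object)
```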
@@ -462,39 +341,43 @@ def extract_json_dict_from_string(s: str) -> dict | None:
     Returns:
         The extracted JSON dictionary, or None if no JSON dictionary could be found.
     """
-    json_regex = r"\{[^{}]+?\}"
+    json_regex = r"\{[^{}]*?\}"
     if (json_match := re.search(pattern=json_regex, string=s, flags=re.DOTALL)) is None:
-        logger.debug(
+        log(
             "The model output does not contain any JSON dictionary, so cannot parse "
-            f"it. Skipping. Here is the output: {s!r}"
+            f"it. Skipping. Here is the output: {s!r}",
+            level=logging.DEBUG,
         )
         return None
     json_string = json_match.group()
     try:
         json_output = demjson3.decode(txt=json_string)
     except demjson3.JSONDecodeError:
-        logger.debug(
+        log(
             "The model output is not valid JSON, so cannot parse it. Skipping. "
-            f"Here is the output: {json_string!r}"
+            f"Here is the output: {json_string!r}",
+            level=logging.DEBUG,
         )
         return None
     if not isinstance(json_output, dict):
-        logger.debug(
+        log(
             "The model output is not a JSON dictionary, so cannot parse "
-            f"it. Skipping. Here is the output: {json_string!r}"
+            f"it. Skipping. Here is the output: {json_string!r}",
+            level=logging.DEBUG,
         )
         return None
     elif not all(isinstance(key, str) for key in json_output.keys()):
-        logger.debug(
+        log(
             "The model output is not a JSON dictionary with string keys, "
             "so cannot parse it. Skipping. Here is the output: "
-            f"{json_string!r}"
+            f"{json_string!r}",
+            level=logging.DEBUG,
        )
         return None
     return json_output
 
 
-@cache
+@cache_arguments()
 def get_hf_token(api_key: str | None) -> str | bool:
     """Get the Hugging Face token.
 
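
One behavioural change in `extract_json_dict_from_string` is easy to miss: loosening the regex quantifier from `+?` to `*?` means an empty JSON dictionary now counts as a match. A quick standalone illustration (not part of the package):

```python
# The old pattern required at least one character between the braces; the new
# one also accepts the empty dictionary "{}".
import re

old_pattern, new_pattern = r"\{[^{}]+?\}", r"\{[^{}]*?\}"

print(re.search(old_pattern, "Answer: {}"))          # None
print(re.search(new_pattern, "Answer: {}").group())  # {}
print(re.search(new_pattern, 'Answer: {"label": "positive"}').group())
# {"label": "positive"}
```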
{euroeval-16.2.2.dist-info → euroeval-16.4.0.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: EuroEval
-Version: 16.2.2
+Version: 16.4.0
 Summary: The robust European language model benchmark.
 Project-URL: Repository, https://github.com/EuroEval/EuroEval
 Project-URL: Issues, https://github.com/EuroEval/EuroEval/issues
@@ -62,21 +62,28 @@ Provides-Extra: all
 Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == 'all'
 Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'all'
 Requires-Dist: timm>=1.0.19; extra == 'all'
-Requires-Dist: vllm[flashinfer]>=0.10.1; (platform_system == 'Linux') and extra == 'all'
+Requires-Dist: vllm[flashinfer]>=0.11.0; (platform_system == 'Linux') and extra == 'all'
 Provides-Extra: generative
 Requires-Dist: bitsandbytes>=0.43.1; (platform_system == 'Linux') and extra == 'generative'
 Requires-Dist: fbgemm-gpu>=1.0.0; (platform_system == 'Linux') and extra == 'generative'
 Requires-Dist: timm>=1.0.19; extra == 'generative'
-Requires-Dist: vllm[flashinfer]>=0.10.1; (platform_system == 'Linux') and extra == 'generative'
+Requires-Dist: vllm[flashinfer]>=0.11.0; (platform_system == 'Linux') and extra == 'generative'
 Description-Content-Type: text/markdown
 
+<!-- This disables the requirement that the first line is a top-level heading -->
+<!-- markdownlint-configure-file { "MD041": false } -->
+
 <div align='center'>
-  <img src="https://raw.githubusercontent.com/EuroEval/EuroEval/main/gfx/euroeval.png" height="500" width="372">
+  <img
+    src="https://raw.githubusercontent.com/EuroEval/EuroEval/main/gfx/euroeval.png"
+    height="500"
+    width="372"
+  >
 </div>
 
-### The robust European language model benchmark.
+### The robust European language model benchmark
 
-_(formerly known as ScandEval)_
+(formerly known as ScandEval)
 
 ______________________________________________________________________
 [![Documentation](https://img.shields.io/badge/docs-passing-green)](https://euroeval.com)
@@ -85,19 +92,19 @@ ______________________________________________________________________
 [![Second paper](https://img.shields.io/badge/arXiv-2406.13469-b31b1b.svg)](https://arxiv.org/abs/2406.13469)
 [![License](https://img.shields.io/github/license/EuroEval/EuroEval)](https://github.com/EuroEval/EuroEval/blob/main/LICENSE)
 [![LastCommit](https://img.shields.io/github/last-commit/EuroEval/EuroEval)](https://github.com/EuroEval/EuroEval/commits/main)
-[![Code Coverage](https://img.shields.io/badge/Coverage-67%25-yellow.svg)](https://github.com/EuroEval/EuroEval/tree/main/tests)
+[![Code Coverage](https://img.shields.io/badge/Coverage-70%25-yellow.svg)](https://github.com/EuroEval/EuroEval/tree/main/tests)
 [![Contributor Covenant](https://img.shields.io/badge/Contributor%20Covenant-2.0-4baaaa.svg)](https://github.com/EuroEval/EuroEval/blob/main/CODE_OF_CONDUCT.md)
 
-
 ## Maintainer
 
-- Dan Saattrup Smart ([@saattrupdan](https://github.com/saattrupdan), dan.smart@alexandra.dk)
-
+- Dan Saattrup Smart ([@saattrupdan](https://github.com/saattrupdan), <dan.smart@alexandra.dk>)
 
 ## Installation
+
 To install the package simply write the following command in your favorite terminal:
-```
-$ pip install euroeval[all]
+
+```bash
+pip install euroeval[all]
 ```
 
 This will install the EuroEval package with all extras. You can also install the
@@ -105,51 +112,61 @@ minimal version by leaving out the `[all]`, in which case the package will let y
 when an evaluation requires a certain extra dependency, and how you install it.
 
 ## Quickstart
+
 ### Benchmarking from the Command Line
+
 The easiest way to benchmark pretrained models is via the command line interface. After
 having installed the package, you can benchmark your favorite model like so:
-```
-$ euroeval --model <model-id>
+
+```bash
+euroeval --model <model-id>
 ```
 
 Here `model` is the HuggingFace model ID, which can be found on the [HuggingFace
 Hub](https://huggingface.co/models). By default this will benchmark the model on all
 the tasks available. If you want to benchmark on a particular task, then use the
 `--task` argument:
-```
-$ euroeval --model <model-id> --task sentiment-classification
+
+```bash
+euroeval --model <model-id> --task sentiment-classification
 ```
 
 We can also narrow down which languages we would like to benchmark on. This can be done
 by setting the `--language` argument. Here we thus benchmark the model on the Danish
 sentiment classification task:
-```
-$ euroeval --model <model-id> --task sentiment-classification --language da
+
+```bash
+euroeval --model <model-id> --task sentiment-classification --language da
 ```
 
 Multiple models, datasets and/or languages can be specified by just attaching multiple
 arguments. Here is an example with two models:
-```
-$ euroeval --model <model-id1> --model <model-id2>
+
+```bash
+euroeval --model <model-id1> --model <model-id2>
 ```
 
 The specific model version/revision to use can also be added after the suffix '@':
-```
-$ euroeval --model <model-id>@<commit>
+
+```bash
+euroeval --model <model-id>@<commit>
 ```
 
 This can be a branch name, a tag name, or a commit id. It defaults to 'main' for latest.
 
 See all the arguments and options available for the `euroeval` command by typing
-```
-$ euroeval --help
+
+```bash
+euroeval --help
 ```
 
 ### Benchmarking from a Script
+
 In a script, the syntax is similar to the command line interface. You simply initialise
 an object of the `Benchmarker` class, and call this benchmark object with your favorite
 model:
-```
+
+```python
 >>> from euroeval import Benchmarker
 >>> benchmark = Benchmarker()
 >>> benchmark(model="<model-id>")
@@ -157,29 +174,34 @@ model:
 
 To benchmark on a specific task and/or language, you simply specify the `task` or
 `language` arguments, shown here with same example as above:
-```
+
+```python
 >>> benchmark(model="<model-id>", task="sentiment-classification", language="da")
 ```
 
 If you want to benchmark a subset of all the models on the Hugging Face Hub, you can
 simply leave out the `model` argument. In this example, we're benchmarking all Danish
 models on the Danish sentiment classification task:
-```
+
+```python
 >>> benchmark(task="sentiment-classification", language="da")
 ```
 
 ### Benchmarking in an Offline Environment
+
 If you need to benchmark in an offline environment, you need to download the models,
 datasets and metrics beforehand. This can be done by adding the `--download-only`
 argument, from the command line, or the `download_only` argument, if benchmarking from a
 script. For example to download the model you want and all of the Danish sentiment
 classification datasets:
-```
-$ euroeval --model <model-id> --task sentiment-classification --language da --download-only
+
+```bash
+euroeval --model <model-id> --task sentiment-classification --language da --download-only
 ```
 
 Or from a script:
-```
+
+```python
 >>> benchmark(
 ...     model="<model-id>",
 ...     task="sentiment-classification",
@@ -193,11 +215,13 @@ internet connection will be required during evaluation. If offline support is im
 to you, please consider [opening an issue](https://github.com/EuroEval/EuroEval/issues).
 
 ### Benchmarking from Docker
+
 A Dockerfile is provided in the repo, which can be downloaded and run, without needing
 to clone the repo and installing from source. This can be fetched programmatically by
 running the following:
-```
-$ wget https://raw.githubusercontent.com/EuroEval/EuroEval/main/Dockerfile.cuda
+
+```bash
+wget https://raw.githubusercontent.com/EuroEval/EuroEval/main/Dockerfile.cuda
 ```
 
 Next, to be able to build the Docker image, first ensure that the NVIDIA Container
@@ -208,56 +232,153 @@ and
 Ensure that the the CUDA version stated at the top of the Dockerfile matches the CUDA
 version installed (which you can check using `nvidia-smi`). After that, we build the
 image as follows:
-```
-$ docker build --pull -t euroeval -f Dockerfile.cuda .
+
+```bash
+docker build --pull -t euroeval -f Dockerfile.cuda .
 ```
 
 With the Docker image built, we can now evaluate any model as follows:
-```
-$ docker run -e args="<euroeval-arguments>" --gpus 1 --name euroeval --rm euroeval
+
+```bash
+docker run -e args="<euroeval-arguments>" --gpus 1 --name euroeval --rm euroeval
 ```
 
 Here `<euroeval-arguments>` consists of the arguments added to the `euroeval` CLI
 argument. This could for instance be `--model <model-id> --task
 sentiment-classification`.
 
-
 ### Reproducing the datasets
+
 All datasets used in this project are generated using the scripts located in the
 [src/scripts](src/scripts) folder. To reproduce a dataset, run the corresponding script
 with the following command
 
-```shell
-$ uv run src/scripts/<name-of-script>.py
+```bash
+uv run src/scripts/<name-of-script>.py
 ```
 
 Replace <name-of-script> with the specific script you wish to execute, e.g.,
 
-```shell
-$ uv run src/scripts/create_allocine.py
+```bash
+uv run src/scripts/create_allocine.py
 ```
 
 ## Contributors :pray:
 
 A huge thank you to all the contributors who have helped make this project a success!
 
-<a href="https://github.com/peter-sk"><img src="https://avatars.githubusercontent.com/u/6168908" width=50 alt="Contributor avatar for peter-sk"/></a>
-<a href="https://github.com/AJDERS"><img src="https://avatars.githubusercontent.com/u/38854604" width=50 alt="Contributor avatar for AJDERS"/></a>
-<a href="https://github.com/oliverkinch"><img src="https://avatars.githubusercontent.com/u/71556498" width=50 alt="Contributor avatar for oliverkinch"/></a>
-<a href="https://github.com/versae"><img src="https://avatars.githubusercontent.com/u/173537" width=50 alt="Contributor avatar for versae"/></a>
-<a href="https://github.com/KennethEnevoldsen"><img src="https://avatars.githubusercontent.com/u/23721977" width=50 alt="Contributor avatar for KennethEnevoldsen"/></a>
-<a href="https://github.com/viggo-gascou"><img src="https://avatars.githubusercontent.com/u/94069687" width=50 alt="Contributor avatar for viggo-gascou"/></a>
-<a href="https://github.com/mathiasesn"><img src="https://avatars.githubusercontent.com/u/27091759" width=50 alt="Contributor avatar for mathiasesn"/></a>
-<a href="https://github.com/Alkarex"><img src="https://avatars.githubusercontent.com/u/1008324" width=50 alt="Contributor avatar for Alkarex"/></a>
-<a href="https://github.com/marksverdhei"><img src="https://avatars.githubusercontent.com/u/46672778" width=50 alt="Contributor avatar for marksverdhei"/></a>
-<a href="https://github.com/Mikeriess"><img src="https://avatars.githubusercontent.com/u/19728563" width=50 alt="Contributor avatar for Mikeriess"/></a>
-<a href="https://github.com/ThomasKluiters"><img src="https://avatars.githubusercontent.com/u/8137941" width=50 alt="Contributor avatar for ThomasKluiters"/></a>
-<a href="https://github.com/BramVanroy"><img src="https://avatars.githubusercontent.com/u/2779410" width=50 alt="Contributor avatar for BramVanroy"/></a>
-<a href="https://github.com/peregilk"><img src="https://avatars.githubusercontent.com/u/9079808" width=50 alt="Contributor avatar for peregilk"/></a>
-<a href="https://github.com/Rijgersberg"><img src="https://avatars.githubusercontent.com/u/8604946" width=50 alt="Contributor avatar for Rijgersberg"/></a>
-<a href="https://github.com/duarteocarmo"><img src="https://avatars.githubusercontent.com/u/26342344" width=50 alt="Contributor avatar for duarteocarmo"/></a>
-<a href="https://github.com/slowwavesleep"><img src="https://avatars.githubusercontent.com/u/44175589" width=50 alt="Contributor avatar for slowwavesleep"/></a>
-
+<a href="https://github.com/peter-sk">
+  <img
+    src="https://avatars.githubusercontent.com/u/6168908"
+    width=50
+    alt="Contributor avatar for peter-sk"
+  />
+</a>
+<a href="https://github.com/AJDERS">
+  <img
+    src="https://avatars.githubusercontent.com/u/38854604"
+    width=50
+    alt="Contributor avatar for AJDERS"
+  />
+</a>
+<a href="https://github.com/oliverkinch">
+  <img
+    src="https://avatars.githubusercontent.com/u/71556498"
+    width=50
+    alt="Contributor avatar for oliverkinch"
+  />
+</a>
+<a href="https://github.com/versae">
+  <img
+    src="https://avatars.githubusercontent.com/u/173537"
+    width=50
+    alt="Contributor avatar for versae"
+  />
+</a>
+<a href="https://github.com/KennethEnevoldsen">
+  <img
+    src="https://avatars.githubusercontent.com/u/23721977"
+    width=50
+    alt="Contributor avatar for KennethEnevoldsen"
+  />
+</a>
+<a href="https://github.com/viggo-gascou">
+  <img
+    src="https://avatars.githubusercontent.com/u/94069687"
+    width=50
+    alt="Contributor avatar for viggo-gascou"
+  />
+</a>
+<a href="https://github.com/mathiasesn">
+  <img
+    src="https://avatars.githubusercontent.com/u/27091759"
+    width=50
+    alt="Contributor avatar for mathiasesn"
+  />
+</a>
+<a href="https://github.com/Alkarex">
+  <img
+    src="https://avatars.githubusercontent.com/u/1008324"
+    width=50
+    alt="Contributor avatar for Alkarex"
+  />
+</a>
+<a href="https://github.com/marksverdhei">
+  <img
+    src="https://avatars.githubusercontent.com/u/46672778"
+    width=50
+    alt="Contributor avatar for marksverdhei"
+  />
+</a>
+<a href="https://github.com/Mikeriess">
+  <img
+    src="https://avatars.githubusercontent.com/u/19728563"
+    width=50
+    alt="Contributor avatar for Mikeriess"
+  />
+</a>
+<a href="https://github.com/ThomasKluiters">
+  <img
+    src="https://avatars.githubusercontent.com/u/8137941"
+    width=50
+    alt="Contributor avatar for ThomasKluiters"
+  />
+</a>
+<a href="https://github.com/BramVanroy">
+  <img
+    src="https://avatars.githubusercontent.com/u/2779410"
+    width=50
+    alt="Contributor avatar for BramVanroy"
+  />
+</a>
+<a href="https://github.com/peregilk">
+  <img
+    src="https://avatars.githubusercontent.com/u/9079808"
+    width=50
+    alt="Contributor avatar for peregilk"
+  />
+</a>
+<a href="https://github.com/Rijgersberg">
+  <img
+    src="https://avatars.githubusercontent.com/u/8604946"
+    width=50
+    alt="Contributor avatar for Rijgersberg"
+  />
+</a>
+<a href="https://github.com/duarteocarmo">
+  <img
+    src="https://avatars.githubusercontent.com/u/26342344"
+    width=50
+    alt="Contributor avatar for duarteocarmo"
+  />
+</a>
+<a href="https://github.com/slowwavesleep">
+  <img
+    src="https://avatars.githubusercontent.com/u/44175589"
+    width=50
+    alt="Contributor avatar for slowwavesleep"
+  />
+</a>
 
 ### Contribute to EuroEval
 
@@ -269,8 +390,8 @@ contributing new datasets, your help makes this project better for everyone.
 - **Adding datasets**: If you're interested in adding a new dataset to EuroEval, we have
   a [dedicated guide](NEW_DATASET_GUIDE.md) with step-by-step instructions.
 
-
 ### Special Thanks
+
 - Thanks to [Google](https://google.com/) for sponsoring Gemini credits as part of their
   [Google Cloud for Researchers Program](https://cloud.google.com/edu/researchers).
 - Thanks [@Mikeriess](https://github.com/Mikeriess) for evaluating many of the larger
@@ -285,11 +406,11 @@ contributing new datasets, your help makes this project better for everyone.
 - Thanks to [CHC](https://chc.au.dk/) for sponsoring the OpenAI credits used to
   evaluate GPT-4-turbo in German.
 
-
 ## Citing EuroEval
+
 If you want to cite the framework then feel free to use this:
 
-```
+```bibtex
 @article{smart2024encoder,
   title={Encoder vs Decoder: Comparative Analysis of Encoder and Decoder Language Models on Multilingual NLU Tasks},
   author={Smart, Dan Saattrup and Enevoldsen, Kenneth and Schneider-Kamp, Peter},