EuroEval 15.12.0-py3-none-any.whl → 16.7.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87)
  1. euroeval/__init__.py +32 -14
  2. euroeval/benchmark_config_factory.py +92 -180
  3. euroeval/benchmark_modules/base.py +49 -39
  4. euroeval/benchmark_modules/fresh.py +35 -21
  5. euroeval/benchmark_modules/hf.py +280 -244
  6. euroeval/benchmark_modules/litellm.py +752 -312
  7. euroeval/benchmark_modules/vllm.py +570 -268
  8. euroeval/benchmarker.py +651 -528
  9. euroeval/caching_utils.py +79 -0
  10. euroeval/callbacks.py +5 -7
  11. euroeval/cli.py +49 -38
  12. euroeval/constants.py +44 -25
  13. euroeval/data_loading.py +111 -55
  14. euroeval/data_models.py +490 -323
  15. euroeval/dataset_configs/__init__.py +26 -4
  16. euroeval/dataset_configs/bosnian.py +39 -0
  17. euroeval/dataset_configs/bulgarian.py +56 -0
  18. euroeval/dataset_configs/croatian.py +56 -0
  19. euroeval/dataset_configs/czech.py +75 -0
  20. euroeval/dataset_configs/danish.py +78 -50
  21. euroeval/dataset_configs/dutch.py +74 -44
  22. euroeval/dataset_configs/english.py +71 -36
  23. euroeval/dataset_configs/estonian.py +111 -0
  24. euroeval/dataset_configs/faroese.py +25 -18
  25. euroeval/dataset_configs/finnish.py +63 -26
  26. euroeval/dataset_configs/french.py +65 -32
  27. euroeval/dataset_configs/german.py +77 -36
  28. euroeval/dataset_configs/greek.py +64 -0
  29. euroeval/dataset_configs/icelandic.py +68 -57
  30. euroeval/dataset_configs/italian.py +68 -36
  31. euroeval/dataset_configs/latvian.py +87 -0
  32. euroeval/dataset_configs/lithuanian.py +64 -0
  33. euroeval/dataset_configs/norwegian.py +98 -72
  34. euroeval/dataset_configs/polish.py +96 -0
  35. euroeval/dataset_configs/portuguese.py +63 -40
  36. euroeval/dataset_configs/serbian.py +64 -0
  37. euroeval/dataset_configs/slovak.py +55 -0
  38. euroeval/dataset_configs/slovene.py +56 -0
  39. euroeval/dataset_configs/spanish.py +68 -34
  40. euroeval/dataset_configs/swedish.py +82 -41
  41. euroeval/dataset_configs/ukrainian.py +64 -0
  42. euroeval/enums.py +12 -6
  43. euroeval/exceptions.py +21 -1
  44. euroeval/finetuning.py +34 -26
  45. euroeval/generation.py +76 -41
  46. euroeval/generation_utils.py +169 -34
  47. euroeval/languages.py +1020 -188
  48. euroeval/logging_utils.py +268 -0
  49. euroeval/metrics/__init__.py +6 -0
  50. euroeval/metrics/base.py +85 -0
  51. euroeval/metrics/huggingface.py +216 -0
  52. euroeval/metrics/llm_as_a_judge.py +260 -0
  53. euroeval/metrics/pipeline.py +289 -0
  54. euroeval/metrics/speed.py +48 -0
  55. euroeval/model_cache.py +40 -21
  56. euroeval/model_config.py +4 -5
  57. euroeval/model_loading.py +3 -0
  58. euroeval/prompt_templates/__init__.py +2 -0
  59. euroeval/prompt_templates/classification.py +206 -0
  60. euroeval/prompt_templates/linguistic_acceptability.py +157 -22
  61. euroeval/prompt_templates/multiple_choice.py +159 -17
  62. euroeval/prompt_templates/named_entity_recognition.py +318 -21
  63. euroeval/prompt_templates/reading_comprehension.py +207 -16
  64. euroeval/prompt_templates/sentiment_classification.py +205 -22
  65. euroeval/prompt_templates/summarization.py +122 -22
  66. euroeval/prompt_templates/token_classification.py +279 -0
  67. euroeval/scores.py +20 -9
  68. euroeval/speed_benchmark.py +11 -12
  69. euroeval/task_group_utils/multiple_choice_classification.py +21 -12
  70. euroeval/task_group_utils/question_answering.py +101 -73
  71. euroeval/task_group_utils/sequence_classification.py +144 -61
  72. euroeval/task_group_utils/text_to_text.py +33 -12
  73. euroeval/task_group_utils/token_classification.py +86 -89
  74. euroeval/tasks.py +75 -16
  75. euroeval/tokenisation_utils.py +603 -0
  76. euroeval/types.py +17 -11
  77. euroeval/utils.py +332 -137
  78. euroeval-16.7.1.dist-info/METADATA +623 -0
  79. euroeval-16.7.1.dist-info/RECORD +84 -0
  80. {euroeval-15.12.0.dist-info → euroeval-16.7.1.dist-info}/entry_points.txt +0 -1
  81. euroeval/human_evaluation.py +0 -737
  82. euroeval/metrics.py +0 -452
  83. euroeval/tokenization_utils.py +0 -498
  84. euroeval-15.12.0.dist-info/METADATA +0 -285
  85. euroeval-15.12.0.dist-info/RECORD +0 -63
  86. {euroeval-15.12.0.dist-info → euroeval-16.7.1.dist-info}/WHEEL +0 -0
  87. {euroeval-15.12.0.dist-info → euroeval-16.7.1.dist-info}/licenses/LICENSE +0 -0
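
The bulk of the API changes in this release land in `euroeval/benchmarker.py` (diffed below): `batch_size` is renamed to `finetuning_batch_size`, `only_allow_safetensors` to `requires_safetensors`, `model_language`/`dataset_language` are deprecated in favour of `language`, the default `gpu_memory_utilization` drops from 0.9 to 0.8, and new `download_only` and `generative_type` options appear. A minimal sketch of the 16.x constructor, assuming the top-level `Benchmarker` export is unchanged; the model ID and argument values are placeholders, not recommendations:

```python
from euroeval import Benchmarker  # assumes the 15.x top-level export is unchanged

benchmarker = Benchmarker(
    language="da",               # replaces the deprecated model_language/dataset_language
    finetuning_batch_size=32,    # renamed from batch_size
    requires_safetensors=False,  # renamed from only_allow_safetensors
    download_only=False,         # new: fetch models/data/metrics without evaluating
    gpu_memory_utilization=0.8,  # new default (was 0.9)
)

# Placeholder model ID; any Hugging Face Hub model path works here.
results = benchmarker.benchmark(model="some-org/some-model")
```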
euroeval/benchmarker.py CHANGED
@@ -1,12 +1,13 @@
1
1
  """Class that benchmarks language models."""
2
2
 
3
+ import collections.abc as c
3
4
  import contextlib
5
+ import datetime as dt
4
6
  import json
5
7
  import logging
8
+ import os
6
9
  import re
7
- import sys
8
10
  import typing as t
9
- from copy import deepcopy
10
11
  from pathlib import Path
11
12
  from shutil import rmtree
12
13
  from time import sleep
@@ -15,27 +16,30 @@ from huggingface_hub.constants import HF_HUB_ENABLE_HF_TRANSFER
15
16
  from torch.distributed import destroy_process_group
16
17
 
17
18
  from .benchmark_config_factory import build_benchmark_config
18
- from .constants import GENERATIVE_DATASET_TASK_GROUPS, GENERATIVE_PIPELINE_TAGS
19
- from .data_loading import load_data
19
+ from .constants import GENERATIVE_PIPELINE_TAGS
20
+ from .data_loading import load_data, load_raw_data
20
21
  from .data_models import BenchmarkConfigParams, BenchmarkResult
21
22
  from .dataset_configs import get_all_dataset_configs
22
- from .enums import Device, ModelType
23
+ from .enums import Device, GenerativeType, ModelType
23
24
  from .exceptions import HuggingFaceHubDown, InvalidBenchmark, InvalidModel
24
25
  from .finetuning import finetune
25
26
  from .generation import generate
27
+ from .logging_utils import adjust_logging_level, get_pbar, log, log_once
26
28
  from .model_config import get_model_config
27
29
  from .model_loading import load_model
28
30
  from .scores import log_scores
29
31
  from .speed_benchmark import benchmark_speed
30
32
  from .tasks import SPEED
31
- from .utils import enforce_reproducibility, get_package_version
33
+ from .utils import (
34
+ enforce_reproducibility,
35
+ get_package_version,
36
+ internet_connection_available,
37
+ split_model_id,
38
+ )
32
39
 
33
40
  if t.TYPE_CHECKING:
34
41
  from .benchmark_modules import BenchmarkModule
35
- from .data_models import BenchmarkConfig, DatasetConfig, ModelConfig
36
-
37
-
38
- logger = logging.getLogger("euroeval")
42
+ from .data_models import BenchmarkConfig, DatasetConfig, ModelConfig, Task
39
43
 
40
44
 
41
45
  class Benchmarker:
@@ -59,13 +63,11 @@ class Benchmarker:
59
63
  self,
60
64
  progress_bar: bool = True,
61
65
  save_results: bool = True,
62
- task: str | list[str] | None = None,
63
- dataset: list[str] | str | None = None,
64
- language: str | list[str] = "all",
65
- model_language: str | list[str] | None = None,
66
- dataset_language: str | list[str] | None = None,
66
+ task: "str | Task | c.Sequence[str | Task] | None" = None,
67
+ dataset: "str | DatasetConfig | c.Sequence[str | DatasetConfig] | None" = None,
68
+ language: str | c.Sequence[str] = "all",
67
69
  device: Device | None = None,
68
- batch_size: int = 32,
70
+ finetuning_batch_size: int = 32,
69
71
  raise_errors: bool = False,
70
72
  cache_dir: str = ".euroeval_cache",
71
73
  api_key: str | None = None,
@@ -78,10 +80,15 @@ class Benchmarker:
78
80
  num_iterations: int = 10,
79
81
  api_base: str | None = None,
80
82
  api_version: str | None = None,
81
- gpu_memory_utilization: float = 0.9,
83
+ gpu_memory_utilization: float = 0.8,
84
+ generative_type: GenerativeType | None = None,
82
85
  debug: bool = False,
83
86
  run_with_cli: bool = False,
84
- only_allow_safetensors: bool = False,
87
+ requires_safetensors: bool = False,
88
+ download_only: bool = False,
89
+ model_language: str | c.Sequence[str] | None = None,
90
+ dataset_language: str | c.Sequence[str] | None = None,
91
+ batch_size: int | None = None,
85
92
  ) -> None:
86
93
  """Initialise the benchmarker.
87
94
 
@@ -102,18 +109,10 @@ class Benchmarker:
102
109
  The language codes of the languages to include, both for models and
103
110
  datasets. Set this to 'all' if all languages should be considered.
104
111
  Defaults to "all".
105
- model_language:
106
- The language codes of the languages to include for models. If specified
107
- then this overrides the `language` parameter for model languages.
108
- Defaults to None.
109
- dataset_language:
110
- The language codes of the languages to include for datasets. If
111
- specified then this overrides the `language` parameter for dataset
112
- languages. Defaults to None.
113
112
  device:
114
113
  The device to use for benchmarking. Defaults to None.
115
- batch_size:
116
- The batch size to use. Defaults to 32.
114
+ finetuning_batch_size:
115
+ The batch size to use when finetuning. Defaults to 32.
117
116
  raise_errors:
118
117
  Whether to raise errors instead of skipping the model evaluation.
119
118
  Defaults to False.
@@ -151,22 +150,46 @@ class Benchmarker:
151
150
  is generative. A larger value will result in faster evaluation, but at
152
151
  the risk of running out of GPU memory. Only reduce this if you are
153
152
  running out of GPU memory. Defaults to 0.9.
153
+ generative_type:
154
+ The type of generative model to benchmark. Only relevant if the model is
155
+ generative. If not specified, then the type will be inferred based on
156
+ the tags of the model. Defaults to None.
154
157
  debug:
155
158
  Whether to output debug information. Defaults to False.
156
159
  run_with_cli:
157
160
  Whether the benchmarker is being run from the command-line interface.
158
161
  Defaults to False.
159
- only_allow_safetensors:
162
+ requires_safetensors:
160
163
  Whether to only allow models that use the safetensors format. Defaults
161
164
  to False.
165
+ download_only:
166
+ Whether to only download models and datasets without performing any
167
+ benchmarking. Defaults to False.
168
+ model_language:
169
+ Deprecated argument. Please use `language` instead.
170
+ dataset_language:
171
+ Deprecated argument. Please use `language` instead.
172
+ batch_size:
173
+ Deprecated argument. Please use `finetuning_batch_size` instead.
162
174
 
163
175
  Raises:
164
176
  ValueError:
165
- If both `task` and `dataset` are specified.
177
+ If both `task` and `dataset` are specified, or if `download_only`
178
+ is True and we have no internet connection.
179
+ ImportError:
180
+ If `hf_transfer` is enabled but not installed.
166
181
  """
167
182
  if task is not None and dataset is not None:
168
183
  raise ValueError("Only one of `task` and `dataset` can be specified.")
169
184
 
185
+ if not internet_connection_available() and download_only:
186
+ msg = "It appears you do not have an internet connection, but "
187
+ if run_with_cli:
188
+ msg += "the --download-only flag was set."
189
+ else:
190
+ msg += "the argument `download_only` was set to True."
191
+ raise ValueError(msg)
192
+
170
193
  # Bail early if hf_transfer is enabled but not installed.
171
194
  if HF_HUB_ENABLE_HF_TRANSFER and get_package_version("hf_transfer") is None:
172
195
  raise ImportError(
@@ -176,81 +199,205 @@ class Benchmarker:
176
199
  "Try installing it with `pip install hf_transfer`."
177
200
  )
178
201
 
202
+ # Deprecation warnings
203
+ if batch_size is not None:
204
+ if run_with_cli:
205
+ msg = (
206
+ "The --batch-size option is deprecated and will be removed in a "
207
+ "future version. Please use --finetuning-batch-size instead. "
208
+ "Overwriting --finetuning-batch-size with the value from "
209
+ "--batch-size."
210
+ )
211
+ else:
212
+ msg = (
213
+ "The `batch_size` argument is deprecated and will be removed in a "
214
+ "future version. Please use `finetuning_batch_size` instead. "
215
+ "Overwriting `finetuning_batch_size` with the value from "
216
+ "`batch_size`."
217
+ )
218
+ log(msg, level=logging.WARNING)
219
+ finetuning_batch_size = batch_size
220
+ if model_language is not None:
221
+ if run_with_cli:
222
+ msg = (
223
+ "The --model-language option is deprecated and will be removed in "
224
+ "a future version. Please use --language instead. Ignoring the "
225
+ "--model-language value."
226
+ )
227
+ else:
228
+ msg = (
229
+ "The `model_language` argument is deprecated and will be removed "
230
+ "in a future version. Please use `language` instead. Ignoring the "
231
+ "`model_language` value."
232
+ )
233
+ log(msg, level=logging.WARNING)
234
+ if dataset_language is not None:
235
+ if run_with_cli:
236
+ msg = (
237
+ "The --dataset-language option is deprecated and will be removed "
238
+ "in a future version. Please use --language instead. Ignoring the "
239
+ "--dataset-language value."
240
+ )
241
+ else:
242
+ msg = (
243
+ "The `dataset_language` argument is deprecated and will be removed "
244
+ "in a future version. Please use `language` instead. Ignoring the "
245
+ "`dataset_language` value."
246
+ )
247
+ log(msg, level=logging.WARNING)
248
+
249
+ # If FULL_LOG has been set, then force verbose mode
250
+ if os.getenv("FULL_LOG", "0") == "1":
251
+ verbose = True
252
+
179
253
  self.benchmark_config_default_params = BenchmarkConfigParams(
180
- progress_bar=progress_bar,
181
- save_results=save_results,
182
254
  task=task,
183
255
  dataset=dataset,
256
+ progress_bar=progress_bar,
257
+ save_results=save_results,
184
258
  language=language,
185
- model_language=model_language,
186
- dataset_language=dataset_language,
187
259
  device=device,
188
- batch_size=batch_size,
260
+ finetuning_batch_size=finetuning_batch_size,
189
261
  raise_errors=raise_errors,
190
262
  cache_dir=cache_dir,
191
263
  api_key=api_key,
192
- force=force,
193
- verbose=verbose,
264
+ api_base=api_base,
265
+ api_version=api_version,
194
266
  trust_remote_code=trust_remote_code,
195
267
  clear_model_cache=clear_model_cache,
196
268
  evaluate_test_split=evaluate_test_split,
197
269
  few_shot=few_shot,
198
270
  num_iterations=num_iterations,
199
- api_base=api_base,
200
- api_version=api_version,
271
+ requires_safetensors=requires_safetensors,
272
+ download_only=download_only,
201
273
  gpu_memory_utilization=gpu_memory_utilization,
274
+ generative_type=generative_type,
275
+ verbose=verbose,
276
+ force=force,
202
277
  debug=debug,
203
278
  run_with_cli=run_with_cli,
204
- only_allow_safetensors=only_allow_safetensors,
205
279
  )
206
280
 
207
281
  self.benchmark_config = build_benchmark_config(
208
- first_time=True, **self.benchmark_config_default_params.model_dump()
282
+ benchmark_config_params=self.benchmark_config_default_params
209
283
  )
210
284
 
211
285
  # Initialise variable storing model lists, so we only have to fetch it once
212
- self._model_lists: dict[str, list[str]] | None = None
286
+ self._model_lists: dict[str, c.Sequence[str]] | None = None
213
287
 
214
288
  self.results_path = Path.cwd() / "euroeval_benchmark_results.jsonl"
215
289
  adjust_logging_level(verbose=self.benchmark_config.verbose)
216
290
 
217
291
  @property
218
- def benchmark_results(self) -> list[BenchmarkResult]:
219
- """The benchmark results."""
292
+ def benchmark_results(self) -> c.Sequence[BenchmarkResult]:
293
+ """The benchmark results.
294
+
295
+ Returns:
296
+ A list of benchmark results.
297
+
298
+ Raises:
299
+ ValueError:
300
+ If there is an error decoding a line in the results file.
301
+ """
220
302
  if self.results_path.exists():
303
+ benchmark_results: list[BenchmarkResult] = list()
221
304
  with self.results_path.open() as f:
222
- return [
223
- BenchmarkResult.from_dict(json.loads(line))
224
- for line in f
225
- if line.strip()
226
- ]
305
+ for line in f:
306
+ if line.strip():
307
+ try:
308
+ result_dict = json.loads(line.strip())
309
+ except json.JSONDecodeError as e:
310
+ raise ValueError(
311
+ f"Error decoding JSON line: {line.strip()}"
312
+ ) from e
313
+
314
+ # Fix for older records
315
+ has_old_raw_results = (
316
+ "results" in result_dict
317
+ and isinstance(result_dict["results"], dict)
318
+ and "raw" in result_dict["results"]
319
+ and isinstance(result_dict["results"]["raw"], dict)
320
+ and "test" in result_dict["results"]["raw"]
321
+ )
322
+ if has_old_raw_results:
323
+ result_dict["results"]["raw"] = result_dict["results"][
324
+ "raw"
325
+ ]["test"]
326
+
327
+ result = BenchmarkResult.from_dict(result_dict)
328
+ benchmark_results.append(result)
329
+ return benchmark_results
227
330
  else:
228
331
  return list()
229
332
 
333
+ def _download(
334
+ self,
335
+ dataset_config: "DatasetConfig",
336
+ model_config: "ModelConfig",
337
+ benchmark_config: "BenchmarkConfig",
338
+ ) -> None:
339
+ """Download data, metrics, and model for the given dataset, and model.
340
+
341
+ Args:
342
+ dataset_config: The configuration for the dataset.
343
+ model_config: The configuration for the model.
344
+ benchmark_config: The configuration for the benchmark.
345
+ """
346
+ log_once(
347
+ f"Loading data for {dataset_config.logging_string}", level=logging.INFO
348
+ )
349
+ dataset = load_raw_data(
350
+ dataset_config=dataset_config, cache_dir=benchmark_config.cache_dir
351
+ )
352
+ del dataset
353
+
354
+ model = load_model(
355
+ model_config=model_config,
356
+ dataset_config=dataset_config,
357
+ benchmark_config=benchmark_config,
358
+ )
359
+ del model
360
+
361
+ log_once(
362
+ f"Loading metrics for the '{dataset_config.task.name}' task",
363
+ level=logging.INFO,
364
+ )
365
+ for metric_name in dataset_config.task.metrics:
366
+ log_once(f"Loading metric {metric_name.name}", level=logging.DEBUG)
367
+ metric = metric_name.download(cache_dir=benchmark_config.cache_dir)
368
+ del metric
369
+
230
370
  def benchmark(
231
371
  self,
232
- model: list[str] | str,
233
- task: str | list[str] | None = None,
234
- dataset: list[str] | str | None = None,
372
+ model: c.Sequence[str] | str,
373
+ task: "str | Task | c.Sequence[str | Task] | None" = None,
374
+ dataset: "str | DatasetConfig | c.Sequence[str | DatasetConfig] | None" = None,
235
375
  progress_bar: bool | None = None,
236
376
  save_results: bool | None = None,
237
- language: str | list[str] | None = None,
238
- model_language: str | list[str] | None = None,
239
- dataset_language: str | list[str] | None = None,
377
+ language: str | c.Sequence[str] | None = None,
240
378
  device: Device | None = None,
241
- batch_size: int | None = None,
379
+ finetuning_batch_size: int | None = None,
242
380
  raise_errors: bool | None = None,
243
381
  cache_dir: str | None = None,
244
382
  api_key: str | None = None,
245
- force: bool | None = None,
246
- verbose: bool | None = None,
383
+ api_base: str | None = None,
384
+ api_version: str | None = None,
247
385
  trust_remote_code: bool | None = None,
248
386
  clear_model_cache: bool | None = None,
249
387
  evaluate_test_split: bool | None = None,
250
388
  few_shot: bool | None = None,
251
389
  num_iterations: int | None = None,
252
- only_allow_safetensors: bool | None = None,
253
- ) -> list[BenchmarkResult]:
390
+ requires_safetensors: bool | None = None,
391
+ download_only: bool | None = None,
392
+ gpu_memory_utilization: float | None = None,
393
+ generative_type: GenerativeType | None = None,
394
+ force: bool | None = None,
395
+ verbose: bool | None = None,
396
+ debug: bool | None = None,
397
+ model_language: str | c.Sequence[str] | None = None,
398
+ dataset_language: str | c.Sequence[str] | None = None,
399
+ batch_size: int | None = None,
400
+ ) -> c.Sequence[BenchmarkResult]:
254
401
  """Benchmarks models on datasets.
255
402
 
256
403
  Args:
@@ -279,21 +426,12 @@ class Benchmarker:
279
426
  datasets. Here 'no' means both Bokmål (nb) and Nynorsk (nn). Set this to
280
427
  'all' if all languages should be considered. Defaults to the value
281
428
  specified when initialising the benchmarker.
282
- model_language:
283
- The language codes of the languages to include for models. If specified
284
- then this overrides the `language` parameter for model languages.
285
- Defaults to the value specified when initialising the benchmarker.
286
- dataset_language:
287
- The language codes of the languages to include for datasets. If
288
- specified then this overrides the `language` parameter for dataset
289
- languages. Defaults to the value specified when initialising the
290
- benchmarker.
291
429
  device:
292
430
  The device to use for benchmarking. Defaults to the value specified when
293
431
  initialising the benchmarker.
294
- batch_size:
295
- The batch size to use. Defaults to the value specified when initialising
296
- the benchmarker.
432
+ finetuning_batch_size:
433
+ The batch size to use for finetuning. Defaults to the value specified
434
+ when initialising the benchmarker.
297
435
  raise_errors:
298
436
  Whether to raise errors instead of skipping the model evaluation.
299
437
  cache_dir:
@@ -302,13 +440,13 @@ class Benchmarker:
302
440
  api_key:
303
441
  The API key to use for a given inference server. Defaults to the value
304
442
  specified when initialising the benchmarker.
305
- force:
306
- Whether to force evaluations of models, even if they have been
307
- benchmarked already. Defaults to the value specified when initialising
308
- the benchmarker.
309
- verbose:
310
- Whether to output additional output. Defaults to the value specified
311
- when initialising the benchmarker.
443
+ api_base:
444
+ The base URL for a given inference API. Only relevant if `model` refers
445
+ to a model on an inference API. Defaults to the value specified when
446
+ initialising the benchmarker.
447
+ api_version:
448
+ The version of the API to use. Defaults to the value specified when
449
+ initialising the benchmarker.
312
450
  trust_remote_code:
313
451
  Whether to trust remote code when loading models. Defaults to the value
314
452
  specified when initialising the benchmarker.
@@ -327,9 +465,39 @@ class Benchmarker:
327
465
  to be used for power users, and scores will not be allowed on the
328
466
  leaderboards if this is changed. Defaults to the value specified when
329
467
  initialising the benchmarker.
330
- only_allow_safetensors:
468
+ requires_safetensors:
331
469
  Whether to only allow models that use the safetensors format. Defaults
332
470
  to the value specified when initialising the benchmarker.
471
+ download_only:
472
+ Whether to only download the models without evaluating them. Defaults
473
+ to the value specified when initialising the benchmarker.
474
+ gpu_memory_utilization:
475
+ The GPU memory utilization to use for vLLM. Only relevant if the model
476
+ is generative. A larger value will result in faster evaluation, but at
477
+ the risk of running out of GPU memory. Only reduce this if you are
478
+ running out of GPU memory. Defaults to the value specified when
479
+ initialising the benchmarker.
480
+ generative_type:
481
+ The type of generative model to benchmark. Only relevant if the model is
482
+ generative. If not specified, then the type will be inferred based on
483
+ the tags of the model. Defaults to the value specified when initialising
484
+ the benchmarker.
485
+ force:
486
+ Whether to force evaluations of models, even if they have been
487
+ benchmarked already. Defaults to the value specified when initialising
488
+ the benchmarker.
489
+ verbose:
490
+ Whether to output additional output. Defaults to the value specified
491
+ when initialising the benchmarker.
492
+ debug:
493
+ Whether to output debug information. Defaults to the value specified
494
+ when initialising the benchmarker.
495
+ model_language:
496
+ Deprecated argument. Please use `language` instead.
497
+ dataset_language:
498
+ Deprecated argument. Please use `language` instead.
499
+ batch_size:
500
+ Deprecated argument. Please use `finetuning_batch_size` instead.
333
501
 
334
502
  Returns:
335
503
  A list of benchmark results.
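
Both the constructor and `benchmark()` keep the deprecated arguments as shims, as the next hunk shows: `batch_size` is remapped onto `finetuning_batch_size` with a warning, while `model_language` and `dataset_language` are ignored with a warning. An old-style call therefore still runs; a hedged sketch with a placeholder model ID:

```python
from euroeval import Benchmarker

# Old-style keyword arguments, still accepted in 16.x:
#  - batch_size=16 logs a deprecation warning and overwrites finetuning_batch_size
#  - model_language="da" logs a deprecation warning and is ignored; use language="da"
benchmarker = Benchmarker(batch_size=16, model_language="da")
results = benchmarker.benchmark(model="some-org/some-model", batch_size=16)
```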
@@ -341,27 +509,156 @@ class Benchmarker:
341
509
  if task is not None and dataset is not None:
342
510
  raise ValueError("Only one of `task` and `dataset` can be specified.")
343
511
 
344
- benchmark_config = self._get_updated_benchmark_config(
345
- task=task,
346
- dataset=dataset,
347
- progress_bar=progress_bar,
348
- save_results=save_results,
349
- language=language,
350
- model_language=model_language,
351
- dataset_language=dataset_language,
352
- device=device,
353
- batch_size=batch_size,
354
- raise_errors=raise_errors,
355
- cache_dir=cache_dir,
356
- api_key=api_key,
357
- force=force,
358
- verbose=verbose,
359
- trust_remote_code=trust_remote_code,
360
- clear_model_cache=clear_model_cache,
361
- evaluate_test_split=evaluate_test_split,
362
- few_shot=few_shot,
363
- num_iterations=num_iterations,
364
- only_allow_safetensors=only_allow_safetensors,
512
+ # Deprecation warnings
513
+ if batch_size is not None:
514
+ log(
515
+ "The `batch_size` argument is deprecated and will be removed in a "
516
+ "future version. Please use `finetuning_batch_size` instead. "
517
+ "Overwriting `finetuning_batch_size` with the value from "
518
+ "`batch_size`.",
519
+ level=logging.WARNING,
520
+ )
521
+ finetuning_batch_size = batch_size
522
+ if model_language is not None:
523
+ log(
524
+ "The `model_language` argument is deprecated and will be removed "
525
+ "in a future version. Please use `language` instead. Ignoring the "
526
+ "`model_language` value.",
527
+ level=logging.WARNING,
528
+ )
529
+ if dataset_language is not None:
530
+ log(
531
+ "The `dataset_language` argument is deprecated and will be removed "
532
+ "in a future version. Please use `language` instead. Ignoring the "
533
+ "`dataset_language` value.",
534
+ level=logging.WARNING,
535
+ )
536
+
537
+ # Get a new updated benchmark configuration, based on any changes to the
538
+ # parameters
539
+ benchmark_config_params = BenchmarkConfigParams(
540
+ task=(
541
+ task if task is not None else self.benchmark_config_default_params.task
542
+ ),
543
+ dataset=(
544
+ dataset
545
+ if dataset is not None
546
+ else self.benchmark_config_default_params.dataset
547
+ ),
548
+ progress_bar=(
549
+ progress_bar
550
+ if progress_bar is not None
551
+ else self.benchmark_config_default_params.progress_bar
552
+ ),
553
+ save_results=(
554
+ save_results
555
+ if save_results is not None
556
+ else self.benchmark_config_default_params.save_results
557
+ ),
558
+ language=(
559
+ language
560
+ if language is not None
561
+ else self.benchmark_config_default_params.language
562
+ ),
563
+ device=(
564
+ device
565
+ if device is not None
566
+ else self.benchmark_config_default_params.device
567
+ ),
568
+ finetuning_batch_size=(
569
+ finetuning_batch_size
570
+ if finetuning_batch_size is not None
571
+ else self.benchmark_config_default_params.finetuning_batch_size
572
+ ),
573
+ raise_errors=(
574
+ raise_errors
575
+ if raise_errors is not None
576
+ else self.benchmark_config_default_params.raise_errors
577
+ ),
578
+ cache_dir=(
579
+ cache_dir
580
+ if cache_dir is not None
581
+ else self.benchmark_config_default_params.cache_dir
582
+ ),
583
+ api_key=(
584
+ api_key
585
+ if api_key is not None
586
+ else self.benchmark_config_default_params.api_key
587
+ ),
588
+ api_base=(
589
+ api_base
590
+ if api_base is not None
591
+ else self.benchmark_config_default_params.api_base
592
+ ),
593
+ api_version=(
594
+ api_version
595
+ if api_version is not None
596
+ else self.benchmark_config_default_params.api_version
597
+ ),
598
+ trust_remote_code=(
599
+ trust_remote_code
600
+ if trust_remote_code is not None
601
+ else self.benchmark_config_default_params.trust_remote_code
602
+ ),
603
+ clear_model_cache=(
604
+ clear_model_cache
605
+ if clear_model_cache is not None
606
+ else self.benchmark_config_default_params.clear_model_cache
607
+ ),
608
+ evaluate_test_split=(
609
+ evaluate_test_split
610
+ if evaluate_test_split is not None
611
+ else self.benchmark_config_default_params.evaluate_test_split
612
+ ),
613
+ few_shot=(
614
+ few_shot
615
+ if few_shot is not None
616
+ else self.benchmark_config_default_params.few_shot
617
+ ),
618
+ num_iterations=(
619
+ num_iterations
620
+ if num_iterations is not None
621
+ else self.benchmark_config_default_params.num_iterations
622
+ ),
623
+ requires_safetensors=(
624
+ requires_safetensors
625
+ if requires_safetensors is not None
626
+ else self.benchmark_config_default_params.requires_safetensors
627
+ ),
628
+ download_only=(
629
+ download_only
630
+ if download_only is not None
631
+ else self.benchmark_config_default_params.download_only
632
+ ),
633
+ gpu_memory_utilization=(
634
+ gpu_memory_utilization
635
+ if gpu_memory_utilization is not None
636
+ else self.benchmark_config_default_params.gpu_memory_utilization
637
+ ),
638
+ generative_type=(
639
+ generative_type
640
+ if generative_type is not None
641
+ else self.benchmark_config_default_params.generative_type
642
+ ),
643
+ force=(
644
+ force
645
+ if force is not None
646
+ else self.benchmark_config_default_params.force
647
+ ),
648
+ verbose=(
649
+ verbose
650
+ if verbose is not None
651
+ else self.benchmark_config_default_params.verbose
652
+ ),
653
+ debug=(
654
+ debug
655
+ if debug is not None
656
+ else self.benchmark_config_default_params.debug
657
+ ),
658
+ run_with_cli=self.benchmark_config_default_params.run_with_cli,
659
+ )
660
+ benchmark_config = build_benchmark_config(
661
+ benchmark_config_params=benchmark_config_params
365
662
  )
366
663
 
367
664
  adjust_logging_level(verbose=benchmark_config.verbose)
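
The hunk above builds a fresh `BenchmarkConfigParams` on every `benchmark()` call, in place of the old `_get_updated_benchmark_config` deep-copy helper (removed in a later hunk): each argument left as `None` falls back to the value captured when the `Benchmarker` was constructed. In practice, per-call overrides look like this; the dataset name is illustrative, so check the EuroEval documentation for valid names:

```python
from euroeval import Benchmarker

benchmarker = Benchmarker(language="da", few_shot=True)

# Only the arguments passed here are overridden for this call; everything else
# (language, cache_dir, etc.) falls back to the constructor defaults above.
results = benchmarker.benchmark(
    model="some-org/some-model",  # placeholder model ID
    dataset="angry-tweets",       # illustrative dataset name
    few_shot=False,
)
```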
@@ -370,67 +667,137 @@ class Benchmarker:
370
667
  clear_model_cache_fn(cache_dir=benchmark_config.cache_dir)
371
668
 
372
669
  model_ids = self._prepare_model_ids(model_id=model)
373
- dataset_configs = prepare_dataset_configs(
374
- dataset_names=benchmark_config.datasets
670
+ dataset_configs = benchmark_config.datasets
671
+
672
+ # Get all the model configs
673
+ model_configs: list[ModelConfig] = list()
674
+ for model_id in get_pbar(
675
+ iterable=model_ids,
676
+ desc="Fetching model configurations",
677
+ disable=not benchmark_config.verbose or not benchmark_config.progress_bar,
678
+ ):
679
+ try:
680
+ model_config = get_model_config(
681
+ model_id=model_id, benchmark_config=benchmark_config
682
+ )
683
+ model_configs.append(model_config)
684
+ except InvalidModel as e:
685
+ log(e.message, level=logging.ERROR)
686
+
687
+ # Create a dictionary that takes each model config to the dataset configs that
688
+ # we need to benchmark the model on. We initially include all the relevant
689
+ # datasets for each model.
690
+ model_config_to_dataset_configs: dict[
691
+ ModelConfig, c.Sequence[DatasetConfig]
692
+ ] = {
693
+ model_config: [
694
+ dataset_config
695
+ for dataset_config in dataset_configs
696
+ if model_config.model_type in dataset_config.allowed_model_types
697
+ ]
698
+ for model_config in model_configs
699
+ }
700
+
701
+ # Initialise the current benchmark results with all the ones that we have cached
702
+ # on disk already (can be none), and remove those datasets from the mapping
703
+ current_benchmark_results: list[BenchmarkResult] = list()
704
+ for (
705
+ model_config,
706
+ model_dataset_configs,
707
+ ) in model_config_to_dataset_configs.items():
708
+ new_model_dataset_configs: list[DatasetConfig] = list()
709
+ for dataset_config in model_dataset_configs:
710
+ benchmark_record = get_record(
711
+ model_config=model_config,
712
+ dataset_config=dataset_config,
713
+ benchmark_config=benchmark_config,
714
+ benchmark_results=self.benchmark_results,
715
+ )
716
+ if benchmark_record is not None and not benchmark_config.force:
717
+ current_benchmark_results.append(benchmark_record)
718
+ else:
719
+ new_model_dataset_configs.append(dataset_config)
720
+ model_config_to_dataset_configs[model_config] = new_model_dataset_configs
721
+
722
+ total_benchmarks = sum(
723
+ len(dataset_configs)
724
+ for dataset_configs in model_config_to_dataset_configs.values()
375
725
  )
726
+ if total_benchmarks == 0:
727
+ log(
728
+ "No benchmarks to run, as all the selected models have already been "
729
+ "benchmarked on all the selected datasets.",
730
+ level=logging.INFO,
731
+ )
732
+ return current_benchmark_results
376
733
 
377
- total_benchmarks = len(model_ids) * len(dataset_configs)
378
734
  num_finished_benchmarks = 0
735
+ benchmark_params_to_revert: dict[str, t.Any] = dict()
736
+ for model_config in model_configs:
737
+ if not model_config_to_dataset_configs[model_config]:
738
+ log(
739
+ f"Skipping model {model_config.model_id!r} because it has "
740
+ "already been benchmarked on all valid datasets.",
741
+ level=logging.DEBUG,
742
+ )
743
+ continue
744
+
745
+ if model_config.adapter_base_model_id:
746
+ open_issue_msg = (
747
+ "If offline support is important to you, please consider opening "
748
+ "an issue at https://github.com/EuroEval/EuroEval/issues."
749
+ )
750
+ if not internet_connection_available():
751
+ raise InvalidModel(
752
+ "Offline benchmarking of models with adapters is not currently "
753
+ "supported. An active internet connection is required. "
754
+ "{open_issue_msg}"
755
+ )
756
+ elif benchmark_config.download_only:
757
+ log_once(
758
+ "You are using download only mode with a model that includes "
759
+ "an adapter. Please note that offline benchmarking of "
760
+ "adapter models is not currently supported - an internet "
761
+ "connection will be required during evaluation in this case. "
762
+ f"{open_issue_msg}",
763
+ level=logging.WARNING,
764
+ )
379
765
 
380
- current_benchmark_results: list[BenchmarkResult] = list()
381
- for model_id in model_ids:
382
- model_config: ModelConfig | None = None
383
766
  loaded_model: BenchmarkModule | None = None
384
- for dataset_config in dataset_configs:
385
- # Skip if we have already benchmarked this model on this dataset and
386
- # we are not forcing the benchmark
387
- if not benchmark_config.force and model_has_been_benchmarked(
388
- model_id=model_id,
389
- dataset=dataset_config.name,
390
- few_shot=benchmark_config.few_shot,
391
- validation_split=not benchmark_config.evaluate_test_split,
392
- benchmark_results=self.benchmark_results,
767
+ for dataset_config in model_config_to_dataset_configs[model_config]:
768
+ # Revert any changes to the benchmark configuration made for the
769
+ # previous dataset
770
+ for param, value in benchmark_params_to_revert.items():
771
+ setattr(benchmark_config, param, value)
772
+ benchmark_params_to_revert = dict()
773
+
774
+ # Update the benchmark config if the dataset requires it
775
+ if (
776
+ "val" not in dataset_config.splits
777
+ and not benchmark_config.evaluate_test_split
393
778
  ):
394
- logger.debug(
395
- f"Skipping benchmarking {model_id} on "
396
- f"{dataset_config.pretty_name}, as it has already been "
397
- "benchmarked."
779
+ log(
780
+ "The dataset does not have a validation split, so even though "
781
+ "you requested evaluating the validation split (the default), "
782
+ "we will evaluate on the test split.",
783
+ level=logging.DEBUG,
398
784
  )
399
- num_finished_benchmarks += 1
400
- continue
401
-
402
- if model_config is None:
403
- try:
404
- model_config = get_model_config(
405
- model_id=model_id, benchmark_config=benchmark_config
406
- )
407
- except InvalidModel as e:
408
- logger.info(e.message)
409
- num_finished_benchmarks += len(dataset_configs)
410
- continue
411
-
412
- # Skip if the model is an encoder model and the task is generative
413
- task_is_generative = (
414
- dataset_config.task.task_group in GENERATIVE_DATASET_TASK_GROUPS
415
- )
416
- if model_config.model_type == ModelType.ENCODER and task_is_generative:
417
- logger.debug(
418
- f"Skipping benchmarking {model_id} on "
419
- f"{dataset_config.pretty_name}, as it is an encoder model and "
420
- "the task is generative."
785
+ benchmark_params_to_revert["evaluate_test_split"] = False
786
+ benchmark_config.evaluate_test_split = True
787
+ if dataset_config.task.requires_zero_shot and benchmark_config.few_shot:
788
+ log(
789
+ "The task requires zero-shot evaluation, so even though you "
790
+ "requested few-shot evaluation (the default), we will evaluate "
791
+ "zero-shot.",
792
+ level=logging.DEBUG,
421
793
  )
422
- continue
794
+ benchmark_params_to_revert["few_shot"] = True
795
+ benchmark_config.few_shot = False
423
796
 
424
797
  # We do not re-initialise generative models as their architecture is not
425
798
  # customised to specific datasets
426
799
  if model_config.model_type == ModelType.GENERATIVE:
427
- initial_logging(
428
- model_config=model_config,
429
- dataset_config=dataset_config,
430
- benchmark_config=benchmark_config,
431
- )
432
800
  if loaded_model is None:
433
- logger.info("Loading model...")
434
801
  try:
435
802
  loaded_model = load_model(
436
803
  model_config=model_config,
@@ -440,7 +807,7 @@ class Benchmarker:
440
807
  except InvalidModel as e:
441
808
  if benchmark_config.raise_errors:
442
809
  raise e
443
- logger.info(e.message)
810
+ log(e.message, level=logging.ERROR)
444
811
 
445
812
  # Add the remaining number of benchmarks for the model to
446
813
  # our benchmark counter, since we're skipping the rest of
@@ -454,12 +821,31 @@ class Benchmarker:
454
821
  else:
455
822
  loaded_model.dataset_config = dataset_config
456
823
 
824
+ # Skip the benchmark if the model is not of the correct
825
+ # generative type
826
+ if (
827
+ loaded_model.generative_type
828
+ not in dataset_config.allowed_generative_types
829
+ ):
830
+ log(
831
+ f"Skipping the benchmark of model "
832
+ f"{model_config.model_id!r}on dataset "
833
+ f"{dataset_config.name!r} because the model has generative "
834
+ f"type {loaded_model.generative_type} and the dataset "
835
+ f"only allows {dataset_config.allowed_generative_types}.",
836
+ level=logging.DEBUG,
837
+ )
838
+ num_finished_benchmarks += 1
839
+ continue
840
+
457
841
  # Benchmark a single model on a single dataset
458
842
  benchmark_output_or_err = self._benchmark_single(
459
843
  model=loaded_model,
460
844
  model_config=model_config,
461
845
  dataset_config=dataset_config,
462
846
  benchmark_config=benchmark_config,
847
+ num_finished_benchmarks=num_finished_benchmarks,
848
+ num_total_benchmarks=total_benchmarks,
463
849
  )
464
850
 
465
851
  if (
@@ -469,12 +855,12 @@ class Benchmarker:
469
855
  raise benchmark_output_or_err
470
856
 
471
857
  elif isinstance(benchmark_output_or_err, InvalidBenchmark):
472
- logger.info(benchmark_output_or_err.message)
858
+ log(benchmark_output_or_err.message, level=logging.WARNING)
473
859
  num_finished_benchmarks += 1
474
860
  continue
475
861
 
476
862
  elif isinstance(benchmark_output_or_err, InvalidModel):
477
- logger.info(benchmark_output_or_err.message)
863
+ log(benchmark_output_or_err.message, level=logging.WARNING)
478
864
 
479
865
  # Add the remaining number of benchmarks for the model to our
480
866
  # benchmark counter, since we're skipping the rest of them
@@ -490,15 +876,15 @@ class Benchmarker:
490
876
  record.append_to_results(results_path=self.results_path)
491
877
 
492
878
  num_finished_benchmarks += 1
493
- logger.info(
494
- f"Finished {num_finished_benchmarks} out of "
495
- f"{total_benchmarks} benchmarks."
496
- )
497
879
 
498
880
  del loaded_model
499
881
  if benchmark_config.clear_model_cache:
500
882
  clear_model_cache_fn(cache_dir=benchmark_config.cache_dir)
501
883
 
884
+ log(
885
+ f"\nCompleted {num_finished_benchmarks:,} benchmarks.\n", level=logging.INFO
886
+ )
887
+
502
888
  # This avoids the following warning at the end of the benchmarking:
503
889
  # Warning: WARNING: process group has NOT been destroyed before we destruct
504
890
  # ProcessGroupNCCL. On normal program exit, the application should call
@@ -511,168 +897,7 @@ class Benchmarker:
511
897
  destroy_process_group()
512
898
  return current_benchmark_results
513
899
 
514
- def _get_updated_benchmark_config(
515
- self,
516
- progress_bar: bool | None = None,
517
- save_results: bool | None = None,
518
- task: str | list[str] | None | None = None,
519
- dataset: str | list[str] | None | None = None,
520
- language: str | list[str] | None = None,
521
- model_language: str | list[str] | None | None = None,
522
- dataset_language: str | list[str] | None | None = None,
523
- device: Device | None | None = None,
524
- batch_size: int | None = None,
525
- raise_errors: bool | None = None,
526
- cache_dir: str | None = None,
527
- api_key: str | None | None = None,
528
- force: bool | None = None,
529
- verbose: bool | None = None,
530
- trust_remote_code: bool | None = None,
531
- clear_model_cache: bool | None = None,
532
- evaluate_test_split: bool | None = None,
533
- few_shot: bool | None = None,
534
- num_iterations: int | None = None,
535
- api_base: str | None | None = None,
536
- api_version: str | None | None = None,
537
- debug: bool | None = None,
538
- run_with_cli: bool | None = None,
539
- only_allow_safetensors: bool | None = None,
540
- ) -> "BenchmarkConfig":
541
- """Get an updated benchmark configuration.
542
-
543
- Args:
544
- progress_bar:
545
- Whether progress bars should be shown. If None, then this value will not
546
- be updated.
547
- save_results:
548
- Whether to save the benchmark results to
549
- 'euroeval_benchmark_results.jsonl'. If None, then this value will not
550
- be updated.
551
- task:
552
- The tasks benchmark the model(s) on. If None, then this value will not
553
- be updated.
554
- dataset:
555
- The datasets to benchmark on. If None, then this value will not be
556
- updated.
557
- language:
558
- The language codes of the languages to include, both for models and
559
- datasets. If None, then this value will not be updated.
560
- model_language:
561
- The language codes of the languages to include for models. If None, then
562
- this value will not be updated.
563
- dataset_language:
564
- The language codes of the languages to include for datasets. If None,
565
- then this value will not be updated.
566
- device:
567
- The device to use for benchmarking. If None, then this value will not be
568
- updated.
569
- batch_size:
570
- The batch size to use. If None, then this value will not be updated.
571
- raise_errors:
572
- Whether to raise errors instead of skipping the model evaluation. If
573
- None, then this value will not be updated.
574
- cache_dir:
575
- Directory to store cached models. If None, then this value will not be
576
- updated.
577
- api_key:
578
- The API key to use for a given inference server. If None, then this
579
- value will not be updated.
580
- force:
581
- Whether to force evaluations of models, even if they have been
582
- benchmarked already. If None, then this value will not be updated.
583
- verbose:
584
- Whether to output additional output. If None, then this value will not
585
- be updated.
586
- trust_remote_code:
587
- Whether to trust remote code when loading models. If None, then this
588
- value will not be updated.
589
- clear_model_cache:
590
- Whether to clear the model cache after benchmarking each model. If None,
591
- then this value will not be updated.
592
- evaluate_test_split:
593
- Whether to evaluate the test split of the datasets. If None, then this
594
- value will not be updated.
595
- few_shot:
596
- Whether to only evaluate the model using few-shot evaluation. If None,
597
- then this value will not be updated.
598
- num_iterations:
599
- The number of times each model should be evaluated. If None, then this
600
- value will not be updated.
601
- api_base:
602
- The base URL for a given inference API. If None, then this value will
603
- not be updated.
604
- api_version:
605
- The version of the API to use. If None, then this value will not be
606
- updated.
607
- debug:
608
- Whether to output debug information. If None, then this value will not
609
- be updated.
610
- run_with_cli:
611
- Whether the benchmarker is being run from the command-line interface.
612
- If None, then this value will not be updated.
613
- only_allow_safetensors:
614
- Whether to only allow models that use the safetensors format. If None,
615
- then this value will not be updated.
616
-
617
- Returns:
618
- The updated benchmark configuration.
619
- """
620
- benchmark_config_params = deepcopy(self.benchmark_config_default_params)
621
-
622
- if progress_bar is not None:
623
- benchmark_config_params.progress_bar = progress_bar
624
- if save_results is not None:
625
- benchmark_config_params.save_results = save_results
626
- if task is not None:
627
- benchmark_config_params.task = task
628
- benchmark_config_params.dataset = None
629
- if dataset is not None:
630
- benchmark_config_params.dataset = dataset
631
- benchmark_config_params.task = None
632
- if language is not None:
633
- benchmark_config_params.language = language
634
- if model_language is not None:
635
- benchmark_config_params.model_language = model_language
636
- if dataset_language is not None:
637
- benchmark_config_params.dataset_language = dataset_language
638
- if device is not None:
639
- benchmark_config_params.device = device
640
- if batch_size is not None:
641
- benchmark_config_params.batch_size = batch_size
642
- if raise_errors is not None:
643
- benchmark_config_params.raise_errors = raise_errors
644
- if cache_dir is not None:
645
- benchmark_config_params.cache_dir = cache_dir
646
- if api_key is not None:
647
- benchmark_config_params.api_key = api_key
648
- if force is not None:
649
- benchmark_config_params.force = force
650
- if verbose is not None:
651
- benchmark_config_params.verbose = verbose
652
- if trust_remote_code is not None:
653
- benchmark_config_params.trust_remote_code = trust_remote_code
654
- if clear_model_cache is not None:
655
- benchmark_config_params.clear_model_cache = clear_model_cache
656
- if evaluate_test_split is not None:
657
- benchmark_config_params.evaluate_test_split = evaluate_test_split
658
- if few_shot is not None:
659
- benchmark_config_params.few_shot = few_shot
660
- if num_iterations is not None:
661
- benchmark_config_params.num_iterations = num_iterations
662
- if api_base is not None:
663
- benchmark_config_params.api_base = api_base
664
- if api_version is not None:
665
- benchmark_config_params.api_version = api_version
666
- if debug is not None:
667
- benchmark_config_params.debug = debug
668
- if run_with_cli is not None:
669
- benchmark_config_params.run_with_cli = run_with_cli
670
- if only_allow_safetensors is not None:
671
- benchmark_config_params.only_allow_safetensors = only_allow_safetensors
672
-
673
- return build_benchmark_config(**benchmark_config_params.model_dump())
674
-
675
- def _prepare_model_ids(self, model_id: list[str] | str) -> list[str]:
900
+ def _prepare_model_ids(self, model_id: c.Sequence[str] | str) -> c.Sequence[str]:
676
901
  """Prepare the model ID(s) to be benchmarked.
677
902
 
678
903
  Args:
@@ -703,6 +928,8 @@ class Benchmarker:
703
928
  model_config: "ModelConfig",
704
929
  dataset_config: "DatasetConfig",
705
930
  benchmark_config: "BenchmarkConfig",
931
+ num_finished_benchmarks: int,
932
+ num_total_benchmarks: int,
706
933
  ) -> BenchmarkResult | InvalidBenchmark | InvalidModel:
707
934
  """Benchmark a single model on a single dataset.
708
935
 
@@ -715,25 +942,29 @@ class Benchmarker:
715
942
  The configuration of the dataset we are evaluating on.
716
943
  benchmark_config:
717
944
  The general benchmark configuration.
945
+ num_finished_benchmarks:
946
+ The number of benchmarks that have already been completed.
947
+ num_total_benchmarks:
948
+ The total number of benchmarks to be completed.
718
949
 
719
950
  Returns:
720
951
  The benchmark result, or an error if the benchmark was unsuccessful.
721
- """
722
- if model is None:
723
- initial_logging(
724
- model_config=model_config,
725
- dataset_config=dataset_config,
726
- benchmark_config=benchmark_config,
727
- )
728
952
 
729
- while True:
953
+ Raises:
954
+ RuntimeError:
955
+ If the MPS fallback is not enabled when required.
956
+ InvalidBenchmark:
957
+ If the benchmark was unsuccessful.
958
+ InvalidModel:
959
+ If the model is invalid.
960
+ """
961
+ for _ in range(num_attempts := 5):
730
962
  try:
731
963
  # Set random seeds to enforce reproducibility of the randomly
732
964
  # initialised weights
733
965
  rng = enforce_reproducibility()
734
966
 
735
967
  if model is None or model_config.model_type != ModelType.GENERATIVE:
736
- logger.info("Loading model...")
737
968
  model = load_model(
738
969
  model_config=model_config,
739
970
  dataset_config=dataset_config,
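
`_benchmark_single` now retries failures in a bounded loop, `for _ in range(num_attempts := 5)`, instead of the old `while True`, and the loop's `else` branch added in the final hunk turns exhausted retries into an `InvalidBenchmark`. A standalone sketch of the same pattern, using a hypothetical `flaky_call` stand-in:

```python
import time


def flaky_call() -> str:
    """Stand-in for a call that can fail transiently (hypothetical)."""
    raise ConnectionError("service temporarily unavailable")


def run_with_retries() -> str:
    for _ in range(num_attempts := 5):
        try:
            return flaky_call()
        except ConnectionError:
            # Transient failure (cf. the HuggingFaceHubDown branch): back off and retry.
            time.sleep(1)
            continue
    else:
        # Reached only when every attempt fell through without returning.
        raise RuntimeError(f"Giving up after {num_attempts} attempts.")
```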
@@ -741,6 +972,14 @@ class Benchmarker:
741
972
  )
742
973
  assert model is not None
743
974
 
975
+ initial_logging(
976
+ model_config=model_config,
977
+ dataset_config=dataset_config,
978
+ benchmark_config=benchmark_config,
979
+ num_finished_benchmarks=num_finished_benchmarks,
980
+ num_total_benchmarks=num_total_benchmarks,
981
+ )
982
+
744
983
  if dataset_config.task == SPEED:
745
984
  scores = benchmark_speed(
746
985
  model=model, benchmark_config=benchmark_config
@@ -773,24 +1012,25 @@ class Benchmarker:
773
1012
  )
774
1013
 
775
1014
  results = log_scores(
776
- dataset_name=dataset_config.pretty_name,
1015
+ dataset_name=dataset_config.logging_string,
777
1016
  metrics=dataset_config.task.metrics,
778
1017
  scores=scores,
779
1018
  model_id=model_config.model_id,
780
1019
  model_revision=model_config.revision,
1020
+ model_param=model_config.param,
781
1021
  )
782
1022
 
1023
+ model_id_to_be_stored = model_config.model_id
1024
+ if model_config.revision != "main":
1025
+ model_id_to_be_stored += f"@{model_config.revision}"
1026
+ if model_config.param is not None:
1027
+ model_id_to_be_stored += f"#{model_config.param}"
1028
+
783
1029
  record = BenchmarkResult(
784
1030
  dataset=dataset_config.name,
785
1031
  task=dataset_config.task.name,
786
- dataset_languages=[
787
- language.code for language in dataset_config.languages
788
- ],
789
- model=(
790
- f"{model_config.model_id}@{model_config.revision}"
791
- if model_config.revision and model_config.revision != "main"
792
- else model_config.model_id
793
- ),
1032
+ languages=[language.code for language in dataset_config.languages],
1033
+ model=model_id_to_be_stored,
794
1034
  results=results,
795
1035
  num_model_parameters=model.num_params,
796
1036
  max_sequence_length=model.model_max_length,
@@ -805,14 +1045,15 @@ class Benchmarker:
805
1045
  few_shot=benchmark_config.few_shot,
806
1046
  validation_split=not benchmark_config.evaluate_test_split,
807
1047
  )
808
- logger.debug(f"Results:\n{results}")
1048
+ log(f"Results:\n{results}", level=logging.DEBUG)
809
1049
  return record
810
1050
 
811
1051
  except HuggingFaceHubDown:
812
1052
  wait_time = 30
813
- logger.debug(
1053
+ log(
814
1054
  f"The Hugging Face Hub seems to be down. Retrying in {wait_time} "
815
- "seconds."
1055
+ "seconds.",
1056
+ level=logging.DEBUG,
816
1057
  )
817
1058
  sleep(wait_time)
818
1059
  continue
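
Stored results now use a composite model identifier: `@<revision>` is appended when the revision is not `main` and `#<param>` when `model_config.param` is set (see the `model_id_to_be_stored` construction in an earlier hunk), and the new `get_record` helper in the final hunk parses it back via `split_model_id`. That helper's implementation is not part of this diff; a hypothetical parser with the same observable behaviour could look like:

```python
from dataclasses import dataclass


@dataclass
class ModelIdComponents:
    """Hypothetical container mirroring what split_model_id appears to return."""

    model_id: str
    revision: str
    param: str | None


def parse_stored_model_id(stored: str) -> ModelIdComponents:
    """Parse 'model_id[@revision][#param]' as written to the results file."""
    rest, _, param = stored.partition("#")
    model_id, _, revision = rest.partition("@")
    return ModelIdComponents(
        model_id=model_id, revision=revision or "main", param=param or None
    )


# parse_stored_model_id("org/model@v1.0.0#some-param")
# -> ModelIdComponents(model_id="org/model", revision="v1.0.0", param="some-param")
```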
@@ -835,200 +1076,68 @@ class Benchmarker:
835
1076
  elif benchmark_config.raise_errors:
836
1077
  raise e
837
1078
  return e
1079
+ else:
1080
+ return InvalidBenchmark(
1081
+ f"Failed to benchmark model {model_config.model_id!r} on dataset "
1082
+ f"{dataset_config.name!r} after {num_attempts} attempts."
1083
+ )
838
1084
 
839
- def __call__(
840
- self,
841
- model: list[str] | str,
842
- task: str | list[str] | None = None,
843
- dataset: list[str] | str | None = None,
844
- progress_bar: bool | None = None,
845
- save_results: bool | None = None,
846
- language: str | list[str] | None = None,
847
- model_language: str | list[str] | None = None,
848
- dataset_language: str | list[str] | None = None,
849
- device: Device | None = None,
850
- batch_size: int | None = None,
851
- raise_errors: bool | None = None,
852
- cache_dir: str | None = None,
853
- api_key: str | None = None,
854
- force: bool | None = None,
855
- verbose: bool | None = None,
856
- trust_remote_code: bool | None = None,
857
- clear_model_cache: bool | None = None,
858
- evaluate_test_split: bool | None = None,
859
- few_shot: bool | None = None,
860
- num_iterations: int | None = None,
861
- only_allow_safetensors: bool | None = None,
862
- ) -> list[BenchmarkResult]:
863
- """Benchmarks models on datasets.
864
-
865
- Args:
866
- model:
867
- The full Hugging Face Hub path(s) to the pretrained transformer model.
868
- The specific model version to use can be added after the suffix '@':
869
- "model@v1.0.0". It can be a branch name, a tag name, or a commit id,
870
- and defaults to the latest version if not specified.
871
- task:
872
- The tasks benchmark the model(s) on. Mutually exclusive with `dataset`.
873
- If both `task` and `dataset` are None then all datasets will be
874
- benchmarked. Defaults to None.
875
- dataset:
876
- The datasets to benchmark on. Mutually exclusive with `task`. If both
877
- `task` and `dataset` are None then all datasets will be benchmarked.
878
- Defaults to None.
879
- progress_bar:
880
- Whether progress bars should be shown. Defaults to the value specified
881
-                when initialising the benchmarker.
-            save_results:
-                Whether to save the benchmark results to
-                'euroeval_benchmark_results.jsonl'. Defaults to the value specified
-                when initialising the benchmarker.
-            language:
-                The language codes of the languages to include, both for models and
-                datasets. Here 'no' means both Bokmål (nb) and Nynorsk (nn). Set this to
-                'all' if all languages should be considered. Defaults to the value
-                specified when initialising the benchmarker.
-            model_language:
-                The language codes of the languages to include for models. If specified
-                then this overrides the `language` parameter for model languages.
-                Defaults to the value specified when initialising the benchmarker.
-            dataset_language:
-                The language codes of the languages to include for datasets. If
-                specified then this overrides the `language` parameter for dataset
-                languages. Defaults to the value specified when initialising the
-                benchmarker.
-            device:
-                The device to use for benchmarking. Defaults to the value specified when
-                initialising the benchmarker.
-            batch_size:
-                The batch size to use. Defaults to the value specified when initialising
-                the benchmarker.
-            raise_errors:
-                Whether to raise errors instead of skipping the model evaluation.
-            cache_dir:
-                Directory to store cached models. Defaults to the value specified when
-                initialising the benchmarker.
-            api_key:
-                The API key to use for a given inference server. Defaults to the value
-                specified when initialising the benchmarker.
-            force:
-                Whether to force evaluations of models, even if they have been
-                benchmarked already. Defaults to the value specified when initialising
-                the benchmarker.
-            verbose:
-                Whether to output additional output. Defaults to the value specified
-                when initialising the benchmarker.
-            trust_remote_code:
-                Whether to trust remote code when loading models. Defaults to the value
-                specified when initialising the benchmarker.
-            clear_model_cache:
-                Whether to clear the model cache after benchmarking each model. Defaults
-                to the value specified when initialising the benchmarker.
-            evaluate_test_split:
-                Whether to evaluate the test split of the datasets. Defaults to the
-                value specified when initialising the benchmarker.
-            few_shot:
-                Whether to only evaluate the model using few-shot evaluation. Only
-                relevant if the model is generative. Defaults to the value specified
-                when initialising the benchmarker.
-            num_iterations:
-                The number of times each model should be evaluated. This is only meant
-                to be used for power users, and scores will not be allowed on the
-                leaderboards if this is changed. Defaults to the value specified when
-                initialising the benchmarker.
-            only_allow_safetensors:
-                Whether to only allow models that use the safetensors format. Defaults
-                to the value specified when initialising the benchmarker.
-
-        Returns:
-            A list of benchmark results.
-
-        Raises:
-            ValueError:
-                If both `task` and `dataset` are specified.
-        """
-        logger.warning(
+    def __call__(self, *args: t.Any, **kwds: t.Any) -> t.Any:  # noqa: ANN401
+        """Alias for `self.benchmark()`."""
+        log(
             "Calling the `Benchmarker` class directly is deprecated. Please use the "
-            "`benchmark` function instead. This will be removed in a future version."
-        )
-        return self.benchmark(
-            model=model,
-            task=task,
-            dataset=dataset,
-            progress_bar=progress_bar,
-            save_results=save_results,
-            language=language,
-            model_language=model_language,
-            dataset_language=dataset_language,
-            device=device,
-            batch_size=batch_size,
-            raise_errors=raise_errors,
-            cache_dir=cache_dir,
-            api_key=api_key,
-            force=force,
-            verbose=verbose,
-            trust_remote_code=trust_remote_code,
-            clear_model_cache=clear_model_cache,
-            evaluate_test_split=evaluate_test_split,
-            few_shot=few_shot,
-            num_iterations=num_iterations,
-            only_allow_safetensors=only_allow_safetensors,
+            "`benchmark` function instead. This will be removed in a future version.",
+            level=logging.WARNING,
         )
+        return self.benchmark(*args, **kwds)


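The old `__call__` duplicated the entire `benchmark()` signature only to forward it; the new version forwards `*args`/`**kwds` wholesale and logs the deprecation at warning level. A minimal usage sketch, assuming `benchmark()` still accepts the model as its first argument (the model and dataset names are purely illustrative):

```python
from euroeval import Benchmarker

benchmarker = Benchmarker()

# Deprecated: calling the instance directly now just logs a warning and
# forwards all arguments to `benchmark()`.
results_old = benchmarker("example-org/example-model", dataset="angry-tweets")

# Preferred: call the method (or the top-level `benchmark` function) directly.
results_new = benchmarker.benchmark(
    model="example-org/example-model", dataset="angry-tweets"
)
```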
-def model_has_been_benchmarked(
-    model_id: str,
-    dataset: str,
-    few_shot: bool,
-    validation_split: bool,
-    benchmark_results: list[BenchmarkResult],
-) -> bool:
-    """Checks whether a model has already been benchmarked on a dataset.
+def get_record(
+    model_config: "ModelConfig",
+    dataset_config: "DatasetConfig",
+    benchmark_config: "BenchmarkConfig",
+    benchmark_results: c.Sequence[BenchmarkResult],
+) -> BenchmarkResult | None:
+    """Get the benchmark record for a given model and dataset.
 
     Args:
-        model_id:
-            The model ID.
-        dataset:
-            The dataset.
-        few_shot:
-            Whether the model was evaluated using few-shot evaluation.
-        validation_split:
-            Whether the model was evaluated on the validation split.
+        model_config:
+            The configuration of the model we are evaluating.
+        dataset_config:
+            The configuration of the dataset we are evaluating on.
+        benchmark_config:
+            The general benchmark configuration.
         benchmark_results:
             The benchmark results.
 
     Returns:
-        Whether the model has already been evaluated on the dataset.
+        The benchmark record, or None if no such record exists.
     """
     for record in benchmark_results:
-        same_evaluation = record.model == model_id and record.dataset == dataset
-        same_validation_split_setting = record.validation_split == validation_split
-        same_few_shot_setting = record.few_shot == few_shot or not record.generative
-        if same_evaluation and same_validation_split_setting and same_few_shot_setting:
-            return True
-    return False
-
-
-def adjust_logging_level(verbose: bool, ignore_testing: bool = False) -> int:
-    """Adjust the logging level based on verbosity.
-
-    Args:
-        verbose:
-            Whether to output additional output.
-        ignore_testing:
-            Whether to ignore the testing flag.
-
-    Returns:
-        The logging level that was set.
-    """
-    if hasattr(sys, "_called_from_test") and not ignore_testing:
-        logging_level = logging.CRITICAL
-    elif verbose:
-        logging_level = logging.DEBUG
-    else:
-        logging_level = logging.INFO
-    logger.setLevel(logging_level)
-    return logging_level
+        model_id_components = split_model_id(model_id=record.model)
+        same_model_id = model_id_components.model_id == model_config.model_id
+        same_revision = model_id_components.revision == model_config.revision
+        same_param = model_id_components.param == model_config.param
+        same_dataset = record.dataset == dataset_config.name
+        same_split = (
+            record.validation_split != benchmark_config.evaluate_test_split
+            or "val" not in dataset_config.splits
+        )
+        same_num_shots = (
+            record.few_shot == benchmark_config.few_shot
+            or not record.generative
+            or dataset_config.task.requires_zero_shot
+        )
+        if (
+            same_model_id
+            and same_revision
+            and same_param
+            and same_dataset
+            and same_split
+            and same_num_shots
+        ):
+            return record
+    return None
 
 
 def clear_model_cache_fn(cache_dir: str) -> None:
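`get_record` replaces the boolean `model_has_been_benchmarked`: it returns the matching cached record (or `None`) so the caller can reuse its scores, and it now also requires the revision and parameter variant encoded in the stored model ID to match, via `split_model_id`. That helper is not part of this hunk; the sketch below is only a guess at the ID format it handles, based on the `@revision` and `#param` suffixes that `initial_logging` appends further down (all names here are illustrative):

```python
import typing as t


class ModelIdComponents(t.NamedTuple):
    """Hypothetical stand-in for the value returned by `split_model_id`."""

    model_id: str
    revision: str
    param: str | None


def split_model_id_sketch(model_id: str) -> ModelIdComponents:
    """Parse an ID of the assumed form 'org/model@revision#param'."""
    base, _, param = model_id.partition("#")
    base, _, revision = base.partition("@")
    return ModelIdComponents(
        model_id=base, revision=revision or "main", param=param or None
    )


print(split_model_id_sketch("example-org/example-model@v1.2#reasoning"))
# ModelIdComponents(model_id='example-org/example-model', revision='v1.2', param='reasoning')
```

Whatever the real parsing looks like, the reuse check now has to agree on six things at once: model ID, revision, parameter variant, dataset, split, and few-shot setting.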
@@ -1049,7 +1158,9 @@ def clear_model_cache_fn(cache_dir: str) -> None:
             rmtree(sub_model_dir)
 
 
-def prepare_dataset_configs(dataset_names: list[str]) -> list["DatasetConfig"]:
+def prepare_dataset_configs(
+    dataset_names: c.Sequence[str],
+) -> c.Sequence["DatasetConfig"]:
     """Prepare the dataset configuration(s) to be benchmarked.
 
     Args:
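The signature now accepts any `Sequence` (the `c` alias used in the diff presumably refers to `collections.abc`) instead of requiring a concrete `list`, so callers may pass tuples or other sequences. A tiny self-contained illustration of the same typing pattern; the function body here is a placeholder, not the real implementation:

```python
import collections.abc as c


def normalise_dataset_names(dataset_names: c.Sequence[str]) -> list[str]:
    """Placeholder showing why `Sequence` is a friendlier input hint than `list`."""
    return [name.strip().lower() for name in dataset_names]


# Both calls type-check and run, whereas a strict `list[str]` parameter would
# flag the tuple under a type checker:
normalise_dataset_names(["AngryTweets", "ScaLA-da"])
normalise_dataset_names(("AngryTweets", "ScaLA-da"))
```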
@@ -1068,6 +1179,8 @@ def initial_logging(
     model_config: "ModelConfig",
     dataset_config: "DatasetConfig",
     benchmark_config: "BenchmarkConfig",
+    num_finished_benchmarks: int,
+    num_total_benchmarks: int,
 ) -> None:
     """Initial logging at the start of the benchmarking process.
 
@@ -1078,10 +1191,16 @@ def initial_logging(
             The configuration of the dataset we are evaluating on.
         benchmark_config:
             The general benchmark configuration.
+        num_finished_benchmarks:
+            The number of benchmarks that have already been finished.
+        num_total_benchmarks:
+            The total number of benchmarks to be run.
     """
     model_id = model_config.model_id
     if model_config.revision and model_config.revision != "main":
         model_id += f"@{model_config.revision}"
+    if model_config.param is not None:
+        model_id += f"#{model_config.param}"
 
     split_type = "validation" if not benchmark_config.evaluate_test_split else "test"
     if model_config.task in GENERATIVE_PIPELINE_TAGS:
@@ -1092,21 +1211,25 @@ def initial_logging(
     else:
         eval_type = "Benchmarking"
 
-    logger.info(
-        f"{eval_type} {model_id} on the {split_type} split of "
-        f"{dataset_config.pretty_name}"
+    log_once(
+        f"\n{eval_type} {model_id} on the {split_type} split of "
+        f"{dataset_config.logging_string} ({num_finished_benchmarks + 1}/"
+        f"{num_total_benchmarks} benchmarks)...",
+        prefix=f"\n[{dt.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}]",
     )
 
     if dataset_config.unofficial:
-        logger.info(
+        log_once(
             f"Note that the {dataset_config.name!r} dataset is unofficial, "
             "meaning that the resulting evaluation will not be included in the "
-            "official leaderboard."
+            "official leaderboard.",
+            level=logging.WARNING,
         )
 
     if benchmark_config.debug:
-        logger.info(
+        log_once(
             "Running in debug mode. This will output additional information, as "
             "well as store the model outputs in the current directory after each "
-            "batch. For this reason, evaluation will be slower."
+            "batch. For this reason, evaluation will be slower.",
+            level=logging.WARNING,
         )
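Throughout `initial_logging`, the module-level `logger` calls are replaced by `log_once` (and `log` earlier in the file), which take an explicit `level` and, for the progress message, a timestamp `prefix`; the message now also carries a `(finished + 1)/total` progress counter. The helper itself lives outside this hunk, presumably in the new `euroeval/logging_utils.py`; the sketch below is only a guess at its behaviour, inferred from its name and the keyword arguments used above:

```python
import logging

logger = logging.getLogger("euroeval")
_seen_messages: set[str] = set()


def log_once_sketch(message: str, level: int = logging.INFO, prefix: str = "") -> None:
    """Hypothetical stand-in for `log_once`: emit each distinct message only once."""
    if message in _seen_messages:
        return
    _seen_messages.add(message)
    logger.log(level, prefix + message)


# Under this assumption, a recurring notice such as the "unofficial dataset"
# warning would be printed once per process rather than once per benchmark run.
log_once_sketch("Running in debug mode...", level=logging.WARNING)
log_once_sketch("Running in debug mode...", level=logging.WARNING)  # suppressed
```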