euroeval-15.2.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of EuroEval might be problematic; see the registry page for more details.

Files changed (40)
  1. euroeval/__init__.py +72 -0
  2. euroeval/benchmark_config_factory.py +358 -0
  3. euroeval/benchmark_modules/__init__.py +7 -0
  4. euroeval/benchmark_modules/base.py +354 -0
  5. euroeval/benchmark_modules/fresh.py +286 -0
  6. euroeval/benchmark_modules/hf.py +1185 -0
  7. euroeval/benchmark_modules/litellm.py +905 -0
  8. euroeval/benchmark_modules/vllm.py +1171 -0
  9. euroeval/benchmarker.py +1074 -0
  10. euroeval/callbacks.py +72 -0
  11. euroeval/cli.py +281 -0
  12. euroeval/constants.py +50 -0
  13. euroeval/data_loading.py +96 -0
  14. euroeval/data_models.py +474 -0
  15. euroeval/dataset_configs.py +2001 -0
  16. euroeval/enums.py +144 -0
  17. euroeval/exceptions.py +191 -0
  18. euroeval/finetuning.py +324 -0
  19. euroeval/generation.py +296 -0
  20. euroeval/human_evaluation.py +737 -0
  21. euroeval/languages.py +200 -0
  22. euroeval/model_cache.py +253 -0
  23. euroeval/model_config.py +77 -0
  24. euroeval/model_loading.py +78 -0
  25. euroeval/scores.py +90 -0
  26. euroeval/speed_benchmark.py +124 -0
  27. euroeval/task_utils/__init__.py +1 -0
  28. euroeval/task_utils/multiple_choice_classification.py +176 -0
  29. euroeval/task_utils/question_answering.py +698 -0
  30. euroeval/task_utils/sequence_classification.py +237 -0
  31. euroeval/task_utils/text_to_text.py +150 -0
  32. euroeval/task_utils/token_classification.py +464 -0
  33. euroeval/tasks.py +202 -0
  34. euroeval/types.py +97 -0
  35. euroeval/utils.py +574 -0
  36. euroeval-15.2.0.dist-info/METADATA +234 -0
  37. euroeval-15.2.0.dist-info/RECORD +40 -0
  38. euroeval-15.2.0.dist-info/WHEEL +4 -0
  39. euroeval-15.2.0.dist-info/entry_points.txt +4 -0
  40. euroeval-15.2.0.dist-info/licenses/LICENSE +21 -0
euroeval/benchmarker.py
@@ -0,0 +1,1074 @@
"""Class that benchmarks language models."""

import json
import logging
import re
import sys
import typing as t
from copy import deepcopy
from pathlib import Path
from shutil import rmtree
from time import sleep

from torch.distributed import destroy_process_group

from .benchmark_config_factory import build_benchmark_config
from .constants import GENERATIVE_PIPELINE_TAGS
from .data_loading import load_data
from .data_models import BenchmarkConfigParams, BenchmarkResult
from .dataset_configs import get_all_dataset_configs
from .enums import Device, ModelType
from .exceptions import InvalidBenchmark, InvalidModel
from .finetuning import finetune
from .generation import generate
from .model_config import get_model_config
from .model_loading import load_model
from .scores import log_scores
from .speed_benchmark import benchmark_speed
from .tasks import SPEED
from .utils import enforce_reproducibility

if t.TYPE_CHECKING:
    from .benchmark_modules import BenchmarkModule
    from .data_models import BenchmarkConfig, DatasetConfig, ModelConfig


logger = logging.getLogger("euroeval")


class Benchmarker:
    """Benchmarking all the language models.

    Attributes:
        benchmark_config_default_params:
            The default parameters for the benchmark configuration.
        benchmark_config:
            The benchmark configuration.
        force:
            Whether to force evaluations of models, even if they have been benchmarked
            already.
        results_path:
            The path to the results file.
        benchmark_results:
            The benchmark results.
    """

    def __init__(
        self,
        progress_bar: bool = True,
        save_results: bool = True,
        task: str | list[str] | None = None,
        dataset: list[str] | str | None = None,
        language: str | list[str] = "all",
        model_language: str | list[str] | None = None,
        dataset_language: str | list[str] | None = None,
        device: Device | None = None,
        batch_size: int = 32,
        raise_errors: bool = False,
        cache_dir: str = ".euroeval_cache",
        api_key: str | None = None,
        force: bool = False,
        verbose: bool = False,
        trust_remote_code: bool = False,
        use_flash_attention: bool | None = None,
        clear_model_cache: bool = False,
        evaluate_test_split: bool = False,
        few_shot: bool = True,
        num_iterations: int = 10,
        api_base: str | None = None,
        api_version: str | None = None,
        debug: bool = False,
        run_with_cli: bool = False,
        only_allow_safetensors: bool = False,
    ) -> None:
        """Initialise the benchmarker.

        Args:
            progress_bar:
                Whether progress bars should be shown. Defaults to True.
            save_results:
                Whether to save the benchmark results to
                'euroeval_benchmark_results.jsonl'. Defaults to True.
            task:
                The tasks to benchmark the model(s) on. Mutually exclusive with
                `dataset`. If both `task` and `dataset` are None then all datasets
                will be benchmarked.
            dataset:
                The datasets to benchmark on. Mutually exclusive with `task`. If both
                `task` and `dataset` are None then all datasets will be benchmarked.
            language:
                The language codes of the languages to include, both for models and
                datasets. Set this to 'all' if all languages should be considered.
                Defaults to "all".
            model_language:
                The language codes of the languages to include for models. If specified
                then this overrides the `language` parameter for model languages.
                Defaults to None.
            dataset_language:
                The language codes of the languages to include for datasets. If
                specified then this overrides the `language` parameter for dataset
                languages. Defaults to None.
            device:
                The device to use for benchmarking. Defaults to None.
            batch_size:
                The batch size to use. Defaults to 32.
            raise_errors:
                Whether to raise errors instead of skipping the model evaluation.
                Defaults to False.
            cache_dir:
                Directory to store cached models. Defaults to '.euroeval_cache'.
            api_key:
                The API key to use for a given inference API.
            force:
                Whether to force evaluations of models, even if they have been
                benchmarked already. Defaults to False.
            verbose:
                Whether to output additional output. This is automatically set if
                `debug` is True. Defaults to False.
            trust_remote_code:
                Whether to trust remote code when loading models. Defaults to False.
            use_flash_attention:
                Whether to use Flash Attention. If None then it will be used if it is
                installed and the model is a decoder model. Defaults to None.
            clear_model_cache:
                Whether to clear the model cache after benchmarking each model.
                Defaults to False.
            evaluate_test_split:
                Whether to evaluate the test split of the datasets. Defaults to False.
            few_shot:
                Whether to only evaluate the model using few-shot evaluation. Only
                relevant if the model is generative. Defaults to True.
            num_iterations:
                The number of times each model should be evaluated. This is only meant
                to be used for power users, and scores will not be allowed on the
                leaderboards if this is changed. Defaults to 10.
            api_base:
                The base URL for a given inference API. Only relevant if `model` refers
                to a model on an inference API. Defaults to None.
            api_version:
                The version of the API to use. Defaults to None.
            debug:
                Whether to output debug information. Defaults to False.
            run_with_cli:
                Whether the benchmarker is being run from the command-line interface.
                Defaults to False.
            only_allow_safetensors:
                Whether to only allow models that use the safetensors format. Defaults
                to False.

        Raises:
            ValueError:
                If both `task` and `dataset` are specified.
        """
        if task is not None and dataset is not None:
            raise ValueError("Only one of `task` and `dataset` can be specified.")

        self.benchmark_config_default_params = BenchmarkConfigParams(
            progress_bar=progress_bar,
            save_results=save_results,
            task=task,
            dataset=dataset,
            language=language,
            model_language=model_language,
            dataset_language=dataset_language,
            device=device,
            batch_size=batch_size,
            raise_errors=raise_errors,
            cache_dir=cache_dir,
            api_key=api_key,
            force=force,
            verbose=verbose,
            trust_remote_code=trust_remote_code,
            use_flash_attention=use_flash_attention,
            clear_model_cache=clear_model_cache,
            evaluate_test_split=evaluate_test_split,
            few_shot=few_shot,
            num_iterations=num_iterations,
            api_base=api_base,
            api_version=api_version,
            debug=debug,
            run_with_cli=run_with_cli,
            only_allow_safetensors=only_allow_safetensors,
        )

        self.benchmark_config = build_benchmark_config(
            first_time=True, **self.benchmark_config_default_params.model_dump()
        )

        # Initialise variable storing model lists, so we only have to fetch it once
        self._model_lists: dict[str, list[str]] | None = None

        self.results_path = Path.cwd() / "euroeval_benchmark_results.jsonl"
        adjust_logging_level(verbose=self.benchmark_config.verbose)

    @property
    def benchmark_results(self) -> list[BenchmarkResult]:
        """The benchmark results."""
        if self.results_path.exists():
            with self.results_path.open() as f:
                return [
                    BenchmarkResult.from_dict(json.loads(line))
                    for line in f
                    if line.strip()
                ]
        else:
            return list()

    def benchmark(
        self,
        model: list[str] | str,
        task: str | list[str] | None = None,
        dataset: list[str] | str | None = None,
        progress_bar: bool | None = None,
        save_results: bool | None = None,
        language: str | list[str] | None = None,
        model_language: str | list[str] | None = None,
        dataset_language: str | list[str] | None = None,
        device: Device | None = None,
        batch_size: int | None = None,
        raise_errors: bool | None = None,
        cache_dir: str | None = None,
        api_key: str | None = None,
        force: bool | None = None,
        verbose: bool | None = None,
        trust_remote_code: bool | None = None,
        use_flash_attention: bool | None = None,
        clear_model_cache: bool | None = None,
        evaluate_test_split: bool | None = None,
        few_shot: bool | None = None,
        num_iterations: int | None = None,
        only_allow_safetensors: bool | None = None,
    ) -> list[BenchmarkResult]:
        """Benchmarks models on datasets.

        Args:
            model:
                The full Hugging Face Hub path(s) to the pretrained transformer model.
                The specific model version to use can be added after the suffix '@':
                "model@v1.0.0". It can be a branch name, a tag name, or a commit id,
                and defaults to the latest version if not specified.
            task:
                The tasks to benchmark the model(s) on. Mutually exclusive with
                `dataset`. If both `task` and `dataset` are None then all datasets
                will be benchmarked. Defaults to None.
            dataset:
                The datasets to benchmark on. Mutually exclusive with `task`. If both
                `task` and `dataset` are None then all datasets will be benchmarked.
                Defaults to None.
            progress_bar:
                Whether progress bars should be shown. Defaults to the value specified
                when initialising the benchmarker.
            save_results:
                Whether to save the benchmark results to
                'euroeval_benchmark_results.jsonl'. Defaults to the value specified
                when initialising the benchmarker.
            language:
                The language codes of the languages to include, both for models and
                datasets. Here 'no' means both Bokmål (nb) and Nynorsk (nn). Set this to
                'all' if all languages should be considered. Defaults to the value
                specified when initialising the benchmarker.
            model_language:
                The language codes of the languages to include for models. If specified
                then this overrides the `language` parameter for model languages.
                Defaults to the value specified when initialising the benchmarker.
            dataset_language:
                The language codes of the languages to include for datasets. If
                specified then this overrides the `language` parameter for dataset
                languages. Defaults to the value specified when initialising the
                benchmarker.
            device:
                The device to use for benchmarking. Defaults to the value specified when
                initialising the benchmarker.
            batch_size:
                The batch size to use. Defaults to the value specified when initialising
                the benchmarker.
            raise_errors:
                Whether to raise errors instead of skipping the model evaluation.
            cache_dir:
                Directory to store cached models. Defaults to the value specified when
                initialising the benchmarker.
            api_key:
                The API key to use for a given inference server. Defaults to the value
                specified when initialising the benchmarker.
            force:
                Whether to force evaluations of models, even if they have been
                benchmarked already. Defaults to the value specified when initialising
                the benchmarker.
            verbose:
                Whether to output additional output. Defaults to the value specified
                when initialising the benchmarker.
            trust_remote_code:
                Whether to trust remote code when loading models. Defaults to the value
                specified when initialising the benchmarker.
            use_flash_attention:
                Whether to use Flash Attention. Defaults to the value specified when
                initialising the benchmarker.
            clear_model_cache:
                Whether to clear the model cache after benchmarking each model. Defaults
                to the value specified when initialising the benchmarker.
            evaluate_test_split:
                Whether to evaluate the test split of the datasets. Defaults to the
                value specified when initialising the benchmarker.
            few_shot:
                Whether to only evaluate the model using few-shot evaluation. Only
                relevant if the model is generative. Defaults to the value specified
                when initialising the benchmarker.
            num_iterations:
                The number of times each model should be evaluated. This is only meant
                to be used for power users, and scores will not be allowed on the
                leaderboards if this is changed. Defaults to the value specified when
                initialising the benchmarker.
            only_allow_safetensors:
                Whether to only allow models that use the safetensors format. Defaults
                to the value specified when initialising the benchmarker.

        Returns:
            A list of benchmark results.

        Raises:
            ValueError:
                If both `task` and `dataset` are specified.
        """
        if task is not None and dataset is not None:
            raise ValueError("Only one of `task` and `dataset` can be specified.")

        benchmark_config = self._get_updated_benchmark_config(
            task=task,
            dataset=dataset,
            progress_bar=progress_bar,
            save_results=save_results,
            language=language,
            model_language=model_language,
            dataset_language=dataset_language,
            device=device,
            batch_size=batch_size,
            raise_errors=raise_errors,
            cache_dir=cache_dir,
            api_key=api_key,
            force=force,
            verbose=verbose,
            trust_remote_code=trust_remote_code,
            use_flash_attention=use_flash_attention,
            clear_model_cache=clear_model_cache,
            evaluate_test_split=evaluate_test_split,
            few_shot=few_shot,
            num_iterations=num_iterations,
            only_allow_safetensors=only_allow_safetensors,
        )

        adjust_logging_level(verbose=benchmark_config.verbose)

        if benchmark_config.clear_model_cache:
            clear_model_cache_fn(cache_dir=benchmark_config.cache_dir)

        model_ids = self._prepare_model_ids(model_id=model)
        dataset_configs = prepare_dataset_configs(
            dataset_names=benchmark_config.datasets
        )

        current_benchmark_results: list[BenchmarkResult] = list()
        for m_id in model_ids:
            try:
                model_config = get_model_config(
                    model_id=m_id, benchmark_config=benchmark_config
                )
            except InvalidModel as e:
                logger.info(e.message)
                continue

            loaded_model: BenchmarkModule | None = None
            for dataset_config in dataset_configs:
                # Skip if we have already benchmarked this model on this dataset and
                # we are not forcing the benchmark
                if not benchmark_config.force and model_has_been_benchmarked(
                    model_id=m_id,
                    dataset=dataset_config.name,
                    few_shot=benchmark_config.few_shot,
                    validation_split=not benchmark_config.evaluate_test_split,
                    benchmark_results=self.benchmark_results,
                ):
                    logger.debug(
                        f"Skipping benchmarking {m_id} on {dataset_config.pretty_name},"
                        " as it has already been benchmarked."
                    )
                    continue

                # We do not re-initialise generative models as their architecture is not
                # customised to specific datasets
                if model_config.task in GENERATIVE_PIPELINE_TAGS:
                    initial_logging(
                        model_config=model_config,
                        dataset_config=dataset_config,
                        benchmark_config=benchmark_config,
                    )
                    if loaded_model is None:
                        logger.info("Loading model...")
                        try:
                            loaded_model = load_model(
                                model_config=model_config,
                                dataset_config=dataset_config,
                                benchmark_config=benchmark_config,
                            )
                        except InvalidModel as e:
                            if benchmark_config.raise_errors:
                                raise e
                            logger.info(e.message)
                            break
                    else:
                        loaded_model.dataset_config = dataset_config

                # Benchmark a single model on a single dataset
                benchmark_output_or_err = self._benchmark_single(
                    model=loaded_model,
                    model_config=model_config,
                    dataset_config=dataset_config,
                    benchmark_config=benchmark_config,
                )

                if (
                    isinstance(benchmark_output_or_err, Exception)
                    and benchmark_config.raise_errors
                ):
                    raise benchmark_output_or_err

                elif isinstance(benchmark_output_or_err, InvalidBenchmark):
                    if benchmark_config.raise_errors:
                        raise benchmark_output_or_err
                    logger.info(
                        f"{m_id} could not be benchmarked on "
                        f"{dataset_config.pretty_name}. Skipping. The error message "
                        f"raised was {benchmark_output_or_err.message!r}."
                    )
                    continue

                elif isinstance(benchmark_output_or_err, InvalidModel):
                    if benchmark_config.raise_errors:
                        raise benchmark_output_or_err
                    logger.info(benchmark_output_or_err.message)
                    break

                else:
                    record = benchmark_output_or_err
                    current_benchmark_results.append(record)
                    if benchmark_config.save_results:
                        record.append_to_results(results_path=self.results_path)

            if benchmark_config.clear_model_cache:
                clear_model_cache_fn(cache_dir=benchmark_config.cache_dir)

        # This avoids the following warning at the end of the benchmarking:
        # Warning: WARNING: process group has NOT been destroyed before we destruct
        # ProcessGroupNCCL. On normal program exit, the application should call
        # destroy_process_group to ensure that any pending NCCL operations have
        # finished in this process. In rare cases this process can exit before this
        # point and block the progress of another member of the process group. This
        # constraint has always been present, but this warning has only been added
        # since PyTorch 2.4 (function operator())
        try:
            destroy_process_group()
        except AssertionError:
            pass

        return current_benchmark_results

    def _get_updated_benchmark_config(
        self,
        progress_bar: bool | None = None,
        save_results: bool | None = None,
        task: str | list[str] | None = None,
        dataset: str | list[str] | None = None,
        language: str | list[str] | None = None,
        model_language: str | list[str] | None = None,
        dataset_language: str | list[str] | None = None,
        device: Device | None = None,
        batch_size: int | None = None,
        raise_errors: bool | None = None,
        cache_dir: str | None = None,
        api_key: str | None = None,
        force: bool | None = None,
        verbose: bool | None = None,
        trust_remote_code: bool | None = None,
        use_flash_attention: bool | None = None,
        clear_model_cache: bool | None = None,
        evaluate_test_split: bool | None = None,
        few_shot: bool | None = None,
        num_iterations: int | None = None,
        api_base: str | None = None,
        api_version: str | None = None,
        debug: bool | None = None,
        run_with_cli: bool | None = None,
        only_allow_safetensors: bool | None = None,
    ) -> "BenchmarkConfig":
        """Get an updated benchmark configuration.

        Args:
            progress_bar:
                Whether progress bars should be shown. If None, then this value will not
                be updated.
            save_results:
                Whether to save the benchmark results to
                'euroeval_benchmark_results.jsonl'. If None, then this value will not
                be updated.
            task:
                The tasks to benchmark the model(s) on. If None, then this value will
                not be updated.
            dataset:
                The datasets to benchmark on. If None, then this value will not be
                updated.
            language:
                The language codes of the languages to include, both for models and
                datasets. If None, then this value will not be updated.
            model_language:
                The language codes of the languages to include for models. If None, then
                this value will not be updated.
            dataset_language:
                The language codes of the languages to include for datasets. If None,
                then this value will not be updated.
            device:
                The device to use for benchmarking. If None, then this value will not be
                updated.
            batch_size:
                The batch size to use. If None, then this value will not be updated.
            raise_errors:
                Whether to raise errors instead of skipping the model evaluation. If
                None, then this value will not be updated.
            cache_dir:
                Directory to store cached models. If None, then this value will not be
                updated.
            api_key:
                The API key to use for a given inference server. If None, then this
                value will not be updated.
            force:
                Whether to force evaluations of models, even if they have been
                benchmarked already. If None, then this value will not be updated.
            verbose:
                Whether to output additional output. If None, then this value will not
                be updated.
            trust_remote_code:
                Whether to trust remote code when loading models. If None, then this
                value will not be updated.
            use_flash_attention:
                Whether to use Flash Attention. If None, then this value will not be
                updated.
            clear_model_cache:
                Whether to clear the model cache after benchmarking each model. If None,
                then this value will not be updated.
            evaluate_test_split:
                Whether to evaluate the test split of the datasets. If None, then this
                value will not be updated.
            few_shot:
                Whether to only evaluate the model using few-shot evaluation. If None,
                then this value will not be updated.
            num_iterations:
                The number of times each model should be evaluated. If None, then this
                value will not be updated.
            api_base:
                The base URL for a given inference API. If None, then this value will
                not be updated.
            api_version:
                The version of the API to use. If None, then this value will not be
                updated.
            debug:
                Whether to output debug information. If None, then this value will not
                be updated.
            run_with_cli:
                Whether the benchmarker is being run from the command-line interface.
                If None, then this value will not be updated.
            only_allow_safetensors:
                Whether to only allow models that use the safetensors format. If None,
                then this value will not be updated.

        Returns:
            The updated benchmark configuration.
        """
        benchmark_config_params = deepcopy(self.benchmark_config_default_params)

        if progress_bar is not None:
            benchmark_config_params.progress_bar = progress_bar
        if save_results is not None:
            benchmark_config_params.save_results = save_results
        if task is not None:
            benchmark_config_params.task = task
            benchmark_config_params.dataset = None
        if dataset is not None:
            benchmark_config_params.dataset = dataset
            benchmark_config_params.task = None
        if language is not None:
            benchmark_config_params.language = language
        if model_language is not None:
            benchmark_config_params.model_language = model_language
        if dataset_language is not None:
            benchmark_config_params.dataset_language = dataset_language
        if device is not None:
            benchmark_config_params.device = device
        if batch_size is not None:
            benchmark_config_params.batch_size = batch_size
        if raise_errors is not None:
            benchmark_config_params.raise_errors = raise_errors
        if cache_dir is not None:
            benchmark_config_params.cache_dir = cache_dir
        if api_key is not None:
            benchmark_config_params.api_key = api_key
        if force is not None:
            benchmark_config_params.force = force
        if verbose is not None:
            benchmark_config_params.verbose = verbose
        if trust_remote_code is not None:
            benchmark_config_params.trust_remote_code = trust_remote_code
        if use_flash_attention is not None:
            benchmark_config_params.use_flash_attention = use_flash_attention
        if clear_model_cache is not None:
            benchmark_config_params.clear_model_cache = clear_model_cache
        if evaluate_test_split is not None:
            benchmark_config_params.evaluate_test_split = evaluate_test_split
        if few_shot is not None:
            benchmark_config_params.few_shot = few_shot
        if num_iterations is not None:
            benchmark_config_params.num_iterations = num_iterations
        if api_base is not None:
            benchmark_config_params.api_base = api_base
        if api_version is not None:
            benchmark_config_params.api_version = api_version
        if debug is not None:
            benchmark_config_params.debug = debug
        if run_with_cli is not None:
            benchmark_config_params.run_with_cli = run_with_cli
        if only_allow_safetensors is not None:
            benchmark_config_params.only_allow_safetensors = only_allow_safetensors

        return build_benchmark_config(**benchmark_config_params.model_dump())

    def _prepare_model_ids(self, model_id: list[str] | str) -> list[str]:
        """Prepare the model ID(s) to be benchmarked.

        Args:
            model_id:
                The model ID(s) of the models to benchmark.

        Returns:
            The prepared list of model IDs.
        """
        model_ids = [model_id] if isinstance(model_id, str) else model_id

        # Reorder the `model_ids` list to include the ones present in the benchmark
        # results first
        benchmarked_model_ids = [
            re.sub(r"\(.+\)", "", record.model).strip()
            for record in self.benchmark_results
        ]
        model_ids_sorted = [m_id for m_id in model_ids if m_id in benchmarked_model_ids]
        model_ids_sorted += [
            m_id for m_id in model_ids if m_id not in benchmarked_model_ids
        ]

        return [m_id.rstrip(" /") for m_id in model_ids_sorted]

    def _benchmark_single(
        self,
        model: "BenchmarkModule | None",
        model_config: "ModelConfig",
        dataset_config: "DatasetConfig",
        benchmark_config: "BenchmarkConfig",
    ) -> BenchmarkResult | InvalidBenchmark | InvalidModel:
        """Benchmark a single model on a single dataset.

        Args:
            model:
                The model to benchmark.
            model_config:
                The configuration of the model we are evaluating.
            dataset_config:
                The configuration of the dataset we are evaluating on.
            benchmark_config:
                The general benchmark configuration.

        Returns:
            The benchmark result, or an error if the benchmark was unsuccessful.
        """
        if model is None:
            initial_logging(
                model_config=model_config,
                dataset_config=dataset_config,
                benchmark_config=benchmark_config,
            )

        while True:
            try:
                # Set random seeds to enforce reproducibility of the randomly
                # initialised weights
                rng = enforce_reproducibility()

                if model is None or model_config.model_type != ModelType.GENERATIVE:
                    logger.info("Loading model...")
                    model = load_model(
                        model_config=model_config,
                        dataset_config=dataset_config,
                        benchmark_config=benchmark_config,
                    )
                assert model is not None

                if dataset_config.task == SPEED:
                    scores = benchmark_speed(
                        model=model, benchmark_config=self.benchmark_config
                    )

                else:
                    bootstrapped_datasets = load_data(
                        rng=rng,
                        dataset_config=dataset_config,
                        benchmark_config=benchmark_config,
                    )
                    prepared_datasets = model.prepare_datasets(
                        datasets=bootstrapped_datasets, task=dataset_config.task
                    )
                    if model_config.model_type == ModelType.GENERATIVE:
                        scores = generate(
                            model=model,
                            datasets=prepared_datasets,
                            model_config=model_config,
                            dataset_config=dataset_config,
                            benchmark_config=self.benchmark_config,
                        )
                    else:
                        scores = finetune(
                            model=model,
                            datasets=prepared_datasets,
                            model_config=model_config,
                            dataset_config=dataset_config,
                            benchmark_config=benchmark_config,
                        )

                results = log_scores(
                    dataset_name=dataset_config.pretty_name,
                    metric_configs=dataset_config.task.metrics,
                    scores=scores,
                    model_id=model_config.model_id,
                )

                record = BenchmarkResult(
                    dataset=dataset_config.name,
                    task=dataset_config.task.name,
                    dataset_languages=[
                        language.code for language in dataset_config.languages
                    ],
                    model=model_config.model_id,
                    results=results,
                    num_model_parameters=model.num_params,
                    max_sequence_length=model.model_max_length,
                    vocabulary_size=model.vocab_size,
                    merge=model_config.merge,
                    generative=model_config.model_type == ModelType.GENERATIVE,
                    generative_type=(
                        model.generative_type.value
                        if model.generative_type is not None
                        else None
                    ),
                    few_shot=benchmark_config.few_shot,
                    validation_split=not benchmark_config.evaluate_test_split,
                )
                logger.debug(f"Results:\n{results}")
                return record

            except (InvalidBenchmark, InvalidModel) as e:
                # If the model ID is not valid then raise an error
                model_err_msg = "does not exist on the Hugging Face Hub"
                if benchmark_config.raise_errors and model_err_msg in str(e):
                    raise e

                # Otherwise, if the error is due to Hugging Face Hub being down, then
                # wait a bit and try again
                elif "The Hugging Face Hub seems to be down." in str(e):
                    wait_time = 30
                    logger.debug(
                        "The Hugging Face Hub seems to be down. Retrying in "
                        f"{wait_time} seconds."
                    )
                    sleep(wait_time)
                    continue

                # Otherwise, if the error is due to the MPS fallback not being enabled,
                # then raise an error asking the user to enable it
                elif "PYTORCH_ENABLE_MPS_FALLBACK" in str(e):
                    raise RuntimeError(
                        "The benchmark failed because the environment variable "
                        "`PYTORCH_ENABLE_MPS_FALLBACK` is not set. Please set this "
                        "environment variable to `1` and try again."
                    )

                elif benchmark_config.raise_errors:
                    raise e
                return e

    def __call__(
        self,
        model: list[str] | str,
        task: str | list[str] | None = None,
        dataset: list[str] | str | None = None,
        progress_bar: bool | None = None,
        save_results: bool | None = None,
        language: str | list[str] | None = None,
        model_language: str | list[str] | None = None,
        dataset_language: str | list[str] | None = None,
        device: Device | None = None,
        batch_size: int | None = None,
        raise_errors: bool | None = None,
        cache_dir: str | None = None,
        api_key: str | None = None,
        force: bool | None = None,
        verbose: bool | None = None,
        trust_remote_code: bool | None = None,
        use_flash_attention: bool | None = None,
        clear_model_cache: bool | None = None,
        evaluate_test_split: bool | None = None,
        few_shot: bool | None = None,
        num_iterations: int | None = None,
        only_allow_safetensors: bool | None = None,
    ) -> list[BenchmarkResult]:
        """Benchmarks models on datasets.

        Args:
            model:
                The full Hugging Face Hub path(s) to the pretrained transformer model.
                The specific model version to use can be added after the suffix '@':
                "model@v1.0.0". It can be a branch name, a tag name, or a commit id,
                and defaults to the latest version if not specified.
            task:
                The tasks to benchmark the model(s) on. Mutually exclusive with
                `dataset`. If both `task` and `dataset` are None then all datasets
                will be benchmarked. Defaults to None.
            dataset:
                The datasets to benchmark on. Mutually exclusive with `task`. If both
                `task` and `dataset` are None then all datasets will be benchmarked.
                Defaults to None.
            progress_bar:
                Whether progress bars should be shown. Defaults to the value specified
                when initialising the benchmarker.
            save_results:
                Whether to save the benchmark results to
                'euroeval_benchmark_results.jsonl'. Defaults to the value specified
                when initialising the benchmarker.
            language:
                The language codes of the languages to include, both for models and
                datasets. Here 'no' means both Bokmål (nb) and Nynorsk (nn). Set this to
                'all' if all languages should be considered. Defaults to the value
                specified when initialising the benchmarker.
            model_language:
                The language codes of the languages to include for models. If specified
                then this overrides the `language` parameter for model languages.
                Defaults to the value specified when initialising the benchmarker.
            dataset_language:
                The language codes of the languages to include for datasets. If
                specified then this overrides the `language` parameter for dataset
                languages. Defaults to the value specified when initialising the
                benchmarker.
            device:
                The device to use for benchmarking. Defaults to the value specified when
                initialising the benchmarker.
            batch_size:
                The batch size to use. Defaults to the value specified when initialising
                the benchmarker.
            raise_errors:
                Whether to raise errors instead of skipping the model evaluation.
            cache_dir:
                Directory to store cached models. Defaults to the value specified when
                initialising the benchmarker.
            api_key:
                The API key to use for a given inference server. Defaults to the value
                specified when initialising the benchmarker.
            force:
                Whether to force evaluations of models, even if they have been
                benchmarked already. Defaults to the value specified when initialising
                the benchmarker.
            verbose:
                Whether to output additional output. Defaults to the value specified
                when initialising the benchmarker.
            trust_remote_code:
                Whether to trust remote code when loading models. Defaults to the value
                specified when initialising the benchmarker.
            use_flash_attention:
                Whether to use Flash Attention. Defaults to the value specified when
                initialising the benchmarker.
            clear_model_cache:
                Whether to clear the model cache after benchmarking each model. Defaults
                to the value specified when initialising the benchmarker.
            evaluate_test_split:
                Whether to evaluate the test split of the datasets. Defaults to the
                value specified when initialising the benchmarker.
            few_shot:
                Whether to only evaluate the model using few-shot evaluation. Only
                relevant if the model is generative. Defaults to the value specified
                when initialising the benchmarker.
            num_iterations:
                The number of times each model should be evaluated. This is only meant
                to be used for power users, and scores will not be allowed on the
                leaderboards if this is changed. Defaults to the value specified when
                initialising the benchmarker.
            only_allow_safetensors:
                Whether to only allow models that use the safetensors format. Defaults
                to the value specified when initialising the benchmarker.

        Returns:
            A list of benchmark results.

        Raises:
            ValueError:
                If both `task` and `dataset` are specified.
        """
        logger.warning(
            "Calling the `Benchmarker` class directly is deprecated. Please use the "
            "`benchmark` function instead. This will be removed in a future version."
        )
        return self.benchmark(
            model=model,
            task=task,
            dataset=dataset,
            progress_bar=progress_bar,
            save_results=save_results,
            language=language,
            model_language=model_language,
            dataset_language=dataset_language,
            device=device,
            batch_size=batch_size,
            raise_errors=raise_errors,
            cache_dir=cache_dir,
            api_key=api_key,
            force=force,
            verbose=verbose,
            trust_remote_code=trust_remote_code,
            use_flash_attention=use_flash_attention,
            clear_model_cache=clear_model_cache,
            evaluate_test_split=evaluate_test_split,
            few_shot=few_shot,
            num_iterations=num_iterations,
            only_allow_safetensors=only_allow_safetensors,
        )


def model_has_been_benchmarked(
    model_id: str,
    dataset: str,
    few_shot: bool,
    validation_split: bool,
    benchmark_results: list[BenchmarkResult],
) -> bool:
    """Checks whether a model has already been benchmarked on a dataset.

    Args:
        model_id:
            The model ID.
        dataset:
            The dataset.
        few_shot:
            Whether the model was evaluated using few-shot evaluation.
        validation_split:
            Whether the model was evaluated on the validation split.
        benchmark_results:
            The benchmark results.

    Returns:
        Whether the model has already been evaluated on the dataset.
    """
    for record in benchmark_results:
        same_evaluation = record.model == model_id and record.dataset == dataset
        same_validation_split_setting = record.validation_split == validation_split
        same_few_shot_setting = record.few_shot == few_shot or not record.generative
        if same_evaluation and same_validation_split_setting and same_few_shot_setting:
            return True
    return False


def adjust_logging_level(verbose: bool, ignore_testing: bool = False) -> int:
    """Adjust the logging level based on verbosity.

    Args:
        verbose:
            Whether to output additional output.
        ignore_testing:
            Whether to ignore the testing flag.

    Returns:
        The logging level that was set.
    """
    if hasattr(sys, "_called_from_test") and not ignore_testing:
        logging_level = logging.CRITICAL
    elif verbose:
        logging_level = logging.DEBUG
    else:
        logging_level = logging.INFO
    logger.setLevel(logging_level)
    return logging_level


def clear_model_cache_fn(cache_dir: str) -> None:
    """Clear the model cache.

    Note that this will not remove the stored completions.

    Args:
        cache_dir:
            The path to the cache directory.
    """
    model_cache_path = Path(cache_dir) / "model_cache"
    model_cache_path.mkdir(parents=True, exist_ok=True)
    for model_dir in model_cache_path.iterdir():
        if model_dir.is_dir():
            for sub_model_dir in model_dir.iterdir():
                if sub_model_dir.is_dir():
                    rmtree(sub_model_dir)


def prepare_dataset_configs(dataset_names: list[str]) -> list["DatasetConfig"]:
    """Prepare the dataset configuration(s) to be benchmarked.

    Args:
        dataset_names:
            The dataset names to benchmark.

    Returns:
        The prepared list of dataset configurations.
    """
    return [
        cfg for cfg in get_all_dataset_configs().values() if cfg.name in dataset_names
    ]


def initial_logging(
    model_config: "ModelConfig",
    dataset_config: "DatasetConfig",
    benchmark_config: "BenchmarkConfig",
) -> None:
    """Initial logging at the start of the benchmarking process.

    Args:
        model_config:
            The configuration of the model we are evaluating.
        dataset_config:
            The configuration of the dataset we are evaluating on.
        benchmark_config:
            The general benchmark configuration.
    """
    split_type = "validation" if not benchmark_config.evaluate_test_split else "test"
    if model_config.task in GENERATIVE_PIPELINE_TAGS:
        if benchmark_config.few_shot:
            eval_type = "Few-shot benchmarking"
        else:
            eval_type = "Zero-shot benchmarking"
    else:
        eval_type = "Benchmarking"
    logger.info(
        f"{eval_type} {model_config.model_id} on the {split_type} split of "
        f"{dataset_config.pretty_name}"
    )

    if dataset_config.unofficial:
        logger.info(
            f"Note that the {dataset_config.name!r} dataset is unofficial, "
            "meaning that the resulting evaluation will not be included in the "
            "official leaderboard."
        )
    if benchmark_config.debug:
        logger.info(
            "Running in debug mode. This will output additional information, as "
            "well as store the model outputs in the current directory after each "
            "batch. For this reason, evaluation will be slower."
        )
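
For orientation, below is a minimal usage sketch of the `Benchmarker` class defined in this file, based solely on the signatures shown in the diff above. The model and dataset identifiers are illustrative placeholders, not values taken from the package, and the import path assumes the module layout shown in the file list.

# Minimal usage sketch based on the signatures above; the model and dataset
# identifiers below are placeholders chosen for illustration.
from euroeval.benchmarker import Benchmarker

# Configure defaults once; per-call arguments to `benchmark` override them.
benchmarker = Benchmarker(
    progress_bar=True,
    save_results=True,            # appends records to euroeval_benchmark_results.jsonl
    cache_dir=".euroeval_cache",
    raise_errors=False,           # log and skip failing models instead of raising
)

# Benchmark a (hypothetical) Hugging Face Hub model on a named dataset.
results = benchmarker.benchmark(
    model="some-org/some-model",  # placeholder model ID
    dataset="some-dataset",       # placeholder dataset name
)

# Each record is a BenchmarkResult with the fields constructed in _benchmark_single.
for record in results:
    print(record.dataset, record.model, record.results)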