EuroEval 16.2.0 → 16.2.2 (py3-none-any.whl)

This diff shows the changes between publicly available package versions as they appear in their respective public registries, and is provided for informational purposes only.

Potentially problematic release: this version of EuroEval might be problematic.

euroeval/benchmark_config_factory.py CHANGED
@@ -6,9 +6,9 @@ import typing as t

  import torch

- from .data_models import BenchmarkConfig
+ from .data_models import BenchmarkConfig, BenchmarkConfigParams
  from .dataset_configs import get_all_dataset_configs
- from .enums import Device, GenerativeType
+ from .enums import Device
  from .exceptions import InvalidBenchmark
  from .languages import get_all_languages
  from .tasks import SPEED, get_all_tasks
@@ -21,154 +21,66 @@ logger = logging.getLogger("euroeval")


  def build_benchmark_config(
- progress_bar: bool,
- save_results: bool,
- task: str | list[str] | None,
- dataset: str | list[str] | None,
- language: str | list[str],
- model_language: str | list[str] | None,
- dataset_language: str | list[str] | None,
- device: Device | None,
- batch_size: int,
- raise_errors: bool,
- cache_dir: str,
- api_key: str | None,
- force: bool,
- verbose: bool,
- trust_remote_code: bool,
- clear_model_cache: bool,
- evaluate_test_split: bool,
- few_shot: bool,
- num_iterations: int,
- api_base: str | None,
- api_version: str | None,
- gpu_memory_utilization: float,
- generative_type: GenerativeType | None,
- debug: bool,
- run_with_cli: bool,
- requires_safetensors: bool,
- download_only: bool,
+ benchmark_config_params: BenchmarkConfigParams,
  ) -> BenchmarkConfig:
  """Create a benchmark configuration.

  Args:
- progress_bar:
- Whether to show a progress bar when running the benchmark.
- save_results:
- Whether to save the benchmark results to a file.
- task:
- The tasks to include for dataset. If None then datasets will not be
- filtered based on their task.
- dataset:
- The datasets to include for task. If None then all datasets will be
- included, limited by the `task` parameter.
- language:
- The language codes of the languages to include, both for models and
- datasets. Here 'no' means both Bokmål (nb) and Nynorsk (nn). Set this
- to 'all' if all languages should be considered.
- model_language:
- The language codes of the languages to include for models. If None then
- the `language` parameter will be used.
- dataset_language:
- The language codes of the languages to include for datasets. If None then
- the `language` parameter will be used.
- device:
- The device to use for running the models. If None then the device will be
- set automatically.
- batch_size:
- The batch size to use for running the models.
- raise_errors:
- Whether to raise errors when running the benchmark.
- cache_dir:
- The directory to use for caching the models.
- api_key:
- The API key to use for a given inference server.
- force:
- Whether to force the benchmark to run even if the results are already
- cached.
- verbose:
- Whether to print verbose output when running the benchmark. This is
- automatically set if `debug` is True.
- trust_remote_code:
- Whether to trust remote code when running the benchmark.
- clear_model_cache:
- Whether to clear the model cache before running the benchmark.
- evaluate_test_split:
- Whether to use the test split for the datasets.
- few_shot:
- Whether to use few-shot learning for the models.
- num_iterations:
- The number of iterations each model should be evaluated for.
- api_base:
- The base URL for a given inference API. Only relevant if `model` refers to a
- model on an inference API.
- api_version:
- The version of the API to use for a given inference API.
- gpu_memory_utilization:
- The GPU memory utilization to use for vLLM. A larger value will result in
- faster evaluation, but at the risk of running out of GPU memory. Only reduce
- this if you are running out of GPU memory. Only relevant if the model is
- generative.
- generative_type:
- The type of generative model. Only relevant if the model is generative. If
- not specified, the type will be inferred automatically.
- debug:
- Whether to run the benchmark in debug mode.
- run_with_cli:
- Whether the benchmark is being run with the CLI.
- requires_safetensors:
- Whether to only allow evaluations of models stored as safetensors.
- download_only:
- Whether to only download the requested model weights and datasets.
+ benchmark_config_params:
+ The parameters for creating the benchmark configuration.

  Returns:
  The benchmark configuration.
  """
- language_codes = get_correct_language_codes(language_codes=language)
+ language_codes = get_correct_language_codes(
+ language_codes=benchmark_config_params.language
+ )
  model_languages = prepare_languages(
- language_codes=model_language, default_language_codes=language_codes
+ language_codes=benchmark_config_params.model_language,
+ default_language_codes=language_codes,
  )
  dataset_languages = prepare_languages(
- language_codes=dataset_language, default_language_codes=language_codes
+ language_codes=benchmark_config_params.dataset_language,
+ default_language_codes=language_codes,
  )

  tasks, datasets = prepare_tasks_and_datasets(
- task=task, dataset=dataset, dataset_languages=dataset_languages
+ task=benchmark_config_params.task,
+ dataset=benchmark_config_params.dataset,
+ dataset_languages=dataset_languages,
  )

- torch_device = prepare_device(device=device)
-
- # Set variable with number of iterations
- if hasattr(sys, "_called_from_test"):
- num_iterations = 1
-
  return BenchmarkConfig(
  model_languages=model_languages,
  dataset_languages=dataset_languages,
  tasks=tasks,
  datasets=datasets,
- batch_size=batch_size,
- raise_errors=raise_errors,
- cache_dir=cache_dir,
- api_key=api_key,
- force=force,
- progress_bar=progress_bar,
- save_results=save_results,
- verbose=verbose or debug,
- device=torch_device,
- trust_remote_code=trust_remote_code,
- clear_model_cache=clear_model_cache,
- evaluate_test_split=evaluate_test_split,
- few_shot=few_shot,
- num_iterations=num_iterations,
- api_base=api_base,
- api_version=api_version,
- gpu_memory_utilization=gpu_memory_utilization,
- generative_type=generative_type,
- debug=debug,
- run_with_cli=run_with_cli,
- requires_safetensors=requires_safetensors,
- download_only=download_only,
+ batch_size=benchmark_config_params.batch_size,
+ raise_errors=benchmark_config_params.raise_errors,
+ cache_dir=benchmark_config_params.cache_dir,
+ api_key=benchmark_config_params.api_key,
+ force=benchmark_config_params.force,
+ progress_bar=benchmark_config_params.progress_bar,
+ save_results=benchmark_config_params.save_results,
+ verbose=benchmark_config_params.verbose or benchmark_config_params.debug,
+ device=prepare_device(device=benchmark_config_params.device),
+ trust_remote_code=benchmark_config_params.trust_remote_code,
+ clear_model_cache=benchmark_config_params.clear_model_cache,
+ evaluate_test_split=benchmark_config_params.evaluate_test_split,
+ few_shot=benchmark_config_params.few_shot,
+ num_iterations=(
+ 1
+ if hasattr(sys, "_called_from_test")
+ else benchmark_config_params.num_iterations
+ ),
+ api_base=benchmark_config_params.api_base,
+ api_version=benchmark_config_params.api_version,
+ gpu_memory_utilization=benchmark_config_params.gpu_memory_utilization,
+ generative_type=benchmark_config_params.generative_type,
+ debug=benchmark_config_params.debug,
+ run_with_cli=benchmark_config_params.run_with_cli,
+ requires_safetensors=benchmark_config_params.requires_safetensors,
+ download_only=benchmark_config_params.download_only,
  )

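The refactor above collapses the roughly twenty-five keyword arguments of build_benchmark_config into a single BenchmarkConfigParams object. A minimal sketch of that pattern follows; ConfigParams and build_config are illustrative stand-ins with only a handful of fields, not the actual EuroEval classes.

import pydantic


class ConfigParams(pydantic.BaseModel):
    """Illustrative stand-in for BenchmarkConfigParams, with only a few of its fields."""

    language: str | list[str]
    batch_size: int
    num_iterations: int
    verbose: bool
    debug: bool


def build_config(params: ConfigParams) -> dict:
    """Illustrative stand-in for build_benchmark_config(benchmark_config_params=...)."""
    return dict(
        language=params.language,
        batch_size=params.batch_size,
        num_iterations=params.num_iterations,
        # Mirrors the `verbose or debug` expression used in the real function.
        verbose=params.verbose or params.debug,
    )


config = build_config(
    params=ConfigParams(
        language="da", batch_size=32, num_iterations=10, verbose=False, debug=True
    )
)
assert config["verbose"] is True

Bundling the parameters into one validated model means a new option only has to be added in one place instead of in every function signature that forwards it.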
euroeval/benchmark_modules/vllm.py CHANGED
@@ -836,15 +836,18 @@ def load_model_and_tokeniser(

  clear_vllm()

- # if we do not have an internet connection we need to give the path to the folder
- # that contains the model weights and config files, otherwise vLLM will try to
- # download them regardless if they are already present in the download_dir
- model_path = resolve_model_path(download_dir)
-
  try:
  model = LLM(
- model=model_id if internet_connection_available() else model_path,
- tokenizer=model_id if internet_connection_available() else model_path,
+ model=(
+ model_id
+ if internet_connection_available()
+ else resolve_model_path(download_dir=download_dir)
+ ),
+ tokenizer=(
+ model_id
+ if internet_connection_available()
+ else resolve_model_path(download_dir=download_dir)
+ ),
  gpu_memory_utilization=benchmark_config.gpu_memory_utilization,
  max_model_len=min(true_max_model_len, MAX_CONTEXT_LENGTH),
  download_dir=download_dir,
euroeval/benchmarker.py CHANGED
@@ -6,7 +6,6 @@ import logging
  import re
  import sys
  import typing as t
- from copy import deepcopy
  from pathlib import Path
  from shutil import rmtree
  from time import sleep
@@ -200,10 +199,10 @@ class Benchmarker:
  )

  self.benchmark_config_default_params = BenchmarkConfigParams(
- progress_bar=progress_bar,
- save_results=save_results,
  task=task,
  dataset=dataset,
+ progress_bar=progress_bar,
+ save_results=save_results,
  language=language,
  model_language=model_language,
  dataset_language=dataset_language,
@@ -212,24 +211,25 @@ class Benchmarker:
  raise_errors=raise_errors,
  cache_dir=cache_dir,
  api_key=api_key,
- force=force,
- verbose=verbose,
+ api_base=api_base,
+ api_version=api_version,
  trust_remote_code=trust_remote_code,
  clear_model_cache=clear_model_cache,
  evaluate_test_split=evaluate_test_split,
  few_shot=few_shot,
  num_iterations=num_iterations,
- api_base=api_base,
- api_version=api_version,
+ requires_safetensors=requires_safetensors,
+ download_only=download_only,
  gpu_memory_utilization=gpu_memory_utilization,
  generative_type=generative_type,
+ verbose=verbose,
+ force=force,
  debug=debug,
  run_with_cli=run_with_cli,
- requires_safetensors=requires_safetensors,
  )

  self.benchmark_config = build_benchmark_config(
- **self.benchmark_config_default_params.model_dump()
+ benchmark_config_params=self.benchmark_config_default_params
  )

  # Initialise variable storing model lists, so we only have to fetch it once
@@ -331,14 +331,20 @@ class Benchmarker:
  raise_errors: bool | None = None,
  cache_dir: str | None = None,
  api_key: str | None = None,
- force: bool | None = None,
- verbose: bool | None = None,
+ api_base: str | None = None,
+ api_version: str | None = None,
  trust_remote_code: bool | None = None,
  clear_model_cache: bool | None = None,
  evaluate_test_split: bool | None = None,
  few_shot: bool | None = None,
  num_iterations: int | None = None,
  requires_safetensors: bool | None = None,
+ download_only: bool | None = None,
+ gpu_memory_utilization: float | None = None,
+ generative_type: GenerativeType | None = None,
+ force: bool | None = None,
+ verbose: bool | None = None,
+ debug: bool | None = None,
  ) -> list[BenchmarkResult]:
  """Benchmarks models on datasets.

@@ -391,13 +397,13 @@ class Benchmarker:
  api_key:
  The API key to use for a given inference server. Defaults to the value
  specified when initialising the benchmarker.
- force:
- Whether to force evaluations of models, even if they have been
- benchmarked already. Defaults to the value specified when initialising
- the benchmarker.
- verbose:
- Whether to output additional output. Defaults to the value specified
- when initialising the benchmarker.
+ api_base:
+ The base URL for a given inference API. Only relevant if `model` refers
+ to a model on an inference API. Defaults to the value specified when
+ initialising the benchmarker.
+ api_version:
+ The version of the API to use. Defaults to the value specified when
+ initialising the benchmarker.
  trust_remote_code:
  Whether to trust remote code when loading models. Defaults to the value
  specified when initialising the benchmarker.
@@ -422,6 +428,27 @@ class Benchmarker:
  download_only:
  Whether to only download the models without evaluating them. Defaults
  to the value specified when initialising the benchmarker.
+ gpu_memory_utilization:
+ The GPU memory utilization to use for vLLM. Only relevant if the model
+ is generative. A larger value will result in faster evaluation, but at
+ the risk of running out of GPU memory. Only reduce this if you are
+ running out of GPU memory. Defaults to the value specified when
+ initialising the benchmarker.
+ generative_type:
+ The type of generative model to benchmark. Only relevant if the model is
+ generative. If not specified, then the type will be inferred based on
+ the tags of the model. Defaults to the value specified when initialising
+ the benchmarker.
+ force:
+ Whether to force evaluations of models, even if they have been
+ benchmarked already. Defaults to the value specified when initialising
+ the benchmarker.
+ verbose:
+ Whether to output additional output. Defaults to the value specified
+ when initialising the benchmarker.
+ debug:
+ Whether to output debug information. Defaults to the value specified
+ when initialising the benchmarker.

  Returns:
  A list of benchmark results.
@@ -433,27 +460,141 @@ class Benchmarker:
  if task is not None and dataset is not None:
  raise ValueError("Only one of `task` and `dataset` can be specified.")

- benchmark_config = self._get_updated_benchmark_config(
- task=task,
- dataset=dataset,
- progress_bar=progress_bar,
- save_results=save_results,
- language=language,
- model_language=model_language,
- dataset_language=dataset_language,
- device=device,
- batch_size=batch_size,
- raise_errors=raise_errors,
- cache_dir=cache_dir,
- api_key=api_key,
- force=force,
- verbose=verbose,
- trust_remote_code=trust_remote_code,
- clear_model_cache=clear_model_cache,
- evaluate_test_split=evaluate_test_split,
- few_shot=few_shot,
- num_iterations=num_iterations,
- requires_safetensors=requires_safetensors,
+ # Get a new updated benchmark configuration, based on any changes to the
+ # parameters
+ benchmark_config_params = BenchmarkConfigParams(
+ task=(
+ task if task is not None else self.benchmark_config_default_params.task
+ ),
+ dataset=(
+ dataset
+ if dataset is not None
+ else self.benchmark_config_default_params.dataset
+ ),
+ progress_bar=(
+ progress_bar
+ if progress_bar is not None
+ else self.benchmark_config_default_params.progress_bar
+ ),
+ save_results=(
+ save_results
+ if save_results is not None
+ else self.benchmark_config_default_params.save_results
+ ),
+ language=(
+ language
+ if language is not None
+ else self.benchmark_config_default_params.language
+ ),
+ model_language=(
+ model_language
+ if model_language is not None
+ else self.benchmark_config_default_params.model_language
+ ),
+ dataset_language=(
+ dataset_language
+ if dataset_language is not None
+ else self.benchmark_config_default_params.dataset_language
+ ),
+ device=(
+ device
+ if device is not None
+ else self.benchmark_config_default_params.device
+ ),
+ batch_size=(
+ batch_size
+ if batch_size is not None
+ else self.benchmark_config_default_params.batch_size
+ ),
+ raise_errors=(
+ raise_errors
+ if raise_errors is not None
+ else self.benchmark_config_default_params.raise_errors
+ ),
+ cache_dir=(
+ cache_dir
+ if cache_dir is not None
+ else self.benchmark_config_default_params.cache_dir
+ ),
+ api_key=(
+ api_key
+ if api_key is not None
+ else self.benchmark_config_default_params.api_key
+ ),
+ api_base=(
+ api_base
+ if api_base is not None
+ else self.benchmark_config_default_params.api_base
+ ),
+ api_version=(
+ api_version
+ if api_version is not None
+ else self.benchmark_config_default_params.api_version
+ ),
+ trust_remote_code=(
+ trust_remote_code
+ if trust_remote_code is not None
+ else self.benchmark_config_default_params.trust_remote_code
+ ),
+ clear_model_cache=(
+ clear_model_cache
+ if clear_model_cache is not None
+ else self.benchmark_config_default_params.clear_model_cache
+ ),
+ evaluate_test_split=(
+ evaluate_test_split
+ if evaluate_test_split is not None
+ else self.benchmark_config_default_params.evaluate_test_split
+ ),
+ few_shot=(
+ few_shot
+ if few_shot is not None
+ else self.benchmark_config_default_params.few_shot
+ ),
+ num_iterations=(
+ num_iterations
+ if num_iterations is not None
+ else self.benchmark_config_default_params.num_iterations
+ ),
+ requires_safetensors=(
+ requires_safetensors
+ if requires_safetensors is not None
+ else self.benchmark_config_default_params.requires_safetensors
+ ),
+ download_only=(
+ download_only
+ if download_only is not None
+ else self.benchmark_config_default_params.download_only
+ ),
+ gpu_memory_utilization=(
+ gpu_memory_utilization
+ if gpu_memory_utilization is not None
+ else self.benchmark_config_default_params.gpu_memory_utilization
+ ),
+ generative_type=(
+ generative_type
+ if generative_type is not None
+ else self.benchmark_config_default_params.generative_type
+ ),
+ force=(
+ force
+ if force is not None
+ else self.benchmark_config_default_params.force
+ ),
+ verbose=(
+ verbose
+ if verbose is not None
+ else self.benchmark_config_default_params.verbose
+ ),
+ debug=(
+ debug
+ if debug is not None
+ else self.benchmark_config_default_params.debug
+ ),
+ run_with_cli=self.benchmark_config_default_params.run_with_cli,
+ )
+ benchmark_config = build_benchmark_config(
+ benchmark_config_params=benchmark_config_params
  )

  adjust_logging_level(verbose=benchmark_config.verbose)
@@ -651,170 +792,6 @@ class Benchmarker:
  destroy_process_group()
  return current_benchmark_results

- def _get_updated_benchmark_config(
- self,
- progress_bar: bool | None = None,
- save_results: bool | None = None,
- task: str | list[str] | None | None = None,
- dataset: str | list[str] | None | None = None,
- language: str | list[str] | None = None,
- model_language: str | list[str] | None | None = None,
- dataset_language: str | list[str] | None | None = None,
- device: Device | None | None = None,
- batch_size: int | None = None,
- raise_errors: bool | None = None,
- cache_dir: str | None = None,
- api_key: str | None | None = None,
- force: bool | None = None,
- verbose: bool | None = None,
- trust_remote_code: bool | None = None,
- clear_model_cache: bool | None = None,
- evaluate_test_split: bool | None = None,
- few_shot: bool | None = None,
- num_iterations: int | None = None,
- api_base: str | None | None = None,
- api_version: str | None | None = None,
- debug: bool | None = None,
- run_with_cli: bool | None = None,
- requires_safetensors: bool | None = None,
- ) -> "BenchmarkConfig":
- """Get an updated benchmark configuration.
-
- Args:
- progress_bar:
- Whether progress bars should be shown. If None, then this value will not
- be updated.
- save_results:
- Whether to save the benchmark results to
- 'euroeval_benchmark_results.jsonl'. If None, then this value will not
- be updated.
- task:
- The tasks benchmark the model(s) on. If None, then this value will not
- be updated.
- dataset:
- The datasets to benchmark on. If None, then this value will not be
- updated.
- language:
- The language codes of the languages to include, both for models and
- datasets. If None, then this value will not be updated.
- model_language:
- The language codes of the languages to include for models. If None, then
- this value will not be updated.
- dataset_language:
- The language codes of the languages to include for datasets. If None,
- then this value will not be updated.
- device:
- The device to use for benchmarking. If None, then this value will not be
- updated.
- batch_size:
- The batch size to use. If None, then this value will not be updated.
- raise_errors:
- Whether to raise errors instead of skipping the model evaluation. If
- None, then this value will not be updated.
- cache_dir:
- Directory to store cached models. If None, then this value will not be
- updated.
- api_key:
- The API key to use for a given inference server. If None, then this
- value will not be updated.
- force:
- Whether to force evaluations of models, even if they have been
- benchmarked already. If None, then this value will not be updated.
- verbose:
- Whether to output additional output. If None, then this value will not
- be updated.
- trust_remote_code:
- Whether to trust remote code when loading models. If None, then this
- value will not be updated.
- clear_model_cache:
- Whether to clear the model cache after benchmarking each model. If None,
- then this value will not be updated.
- evaluate_test_split:
- Whether to evaluate the test split of the datasets. If None, then this
- value will not be updated.
- few_shot:
- Whether to only evaluate the model using few-shot evaluation. If None,
- then this value will not be updated.
- num_iterations:
- The number of times each model should be evaluated. If None, then this
- value will not be updated.
- api_base:
- The base URL for a given inference API. If None, then this value will
- not be updated.
- api_version:
- The version of the API to use. If None, then this value will not be
- updated.
- debug:
- Whether to output debug information. If None, then this value will not
- be updated.
- run_with_cli:
- Whether the benchmarker is being run from the command-line interface.
- If None, then this value will not be updated.
- requires_safetensors:
- Whether to only allow models that use the safetensors format. If None,
- then this value will not be updated.
- download_only:
- Whether to only download the models without evaluating them. If None,
- then this value will not be updated.
-
- Returns:
- The updated benchmark configuration.
- """
- benchmark_config_params = deepcopy(self.benchmark_config_default_params)
-
- if progress_bar is not None:
- benchmark_config_params.progress_bar = progress_bar
- if save_results is not None:
- benchmark_config_params.save_results = save_results
- if task is not None:
- benchmark_config_params.task = task
- benchmark_config_params.dataset = None
- if dataset is not None:
- benchmark_config_params.dataset = dataset
- benchmark_config_params.task = None
- if language is not None:
- benchmark_config_params.language = language
- if model_language is not None:
- benchmark_config_params.model_language = model_language
- if dataset_language is not None:
- benchmark_config_params.dataset_language = dataset_language
- if device is not None:
- benchmark_config_params.device = device
- if batch_size is not None:
- benchmark_config_params.batch_size = batch_size
- if raise_errors is not None:
- benchmark_config_params.raise_errors = raise_errors
- if cache_dir is not None:
- benchmark_config_params.cache_dir = cache_dir
- if api_key is not None:
- benchmark_config_params.api_key = api_key
- if force is not None:
- benchmark_config_params.force = force
- if verbose is not None:
- benchmark_config_params.verbose = verbose
- if trust_remote_code is not None:
- benchmark_config_params.trust_remote_code = trust_remote_code
- if clear_model_cache is not None:
- benchmark_config_params.clear_model_cache = clear_model_cache
- if evaluate_test_split is not None:
- benchmark_config_params.evaluate_test_split = evaluate_test_split
- if few_shot is not None:
- benchmark_config_params.few_shot = few_shot
- if num_iterations is not None:
- benchmark_config_params.num_iterations = num_iterations
- if api_base is not None:
- benchmark_config_params.api_base = api_base
- if api_version is not None:
- benchmark_config_params.api_version = api_version
- if debug is not None:
- benchmark_config_params.debug = debug
- if run_with_cli is not None:
- benchmark_config_params.run_with_cli = run_with_cli
- if requires_safetensors is not None:
- benchmark_config_params.requires_safetensors = requires_safetensors
-
- return build_benchmark_config(**benchmark_config_params.model_dump())
-
  def _prepare_model_ids(self, model_id: list[str] | str) -> list[str]:
  """Prepare the model ID(s) to be benchmarked.

@@ -982,144 +959,13 @@ class Benchmarker:
  raise e
  return e

- def __call__(
- self,
- model: list[str] | str,
- task: str | list[str] | None = None,
- dataset: list[str] | str | None = None,
- progress_bar: bool | None = None,
- save_results: bool | None = None,
- language: str | list[str] | None = None,
- model_language: str | list[str] | None = None,
- dataset_language: str | list[str] | None = None,
- device: Device | None = None,
- batch_size: int | None = None,
- raise_errors: bool | None = None,
- cache_dir: str | None = None,
- api_key: str | None = None,
- force: bool | None = None,
- verbose: bool | None = None,
- trust_remote_code: bool | None = None,
- clear_model_cache: bool | None = None,
- evaluate_test_split: bool | None = None,
- few_shot: bool | None = None,
- num_iterations: int | None = None,
- requires_safetensors: bool | None = None,
- ) -> list[BenchmarkResult]:
- """Benchmarks models on datasets.
-
- Args:
- model:
- The full Hugging Face Hub path(s) to the pretrained transformer model.
- The specific model version to use can be added after the suffix '@':
- "model@v1.0.0". It can be a branch name, a tag name, or a commit id,
- and defaults to the latest version if not specified.
- task:
- The tasks benchmark the model(s) on. Mutually exclusive with `dataset`.
- If both `task` and `dataset` are None then all datasets will be
- benchmarked. Defaults to None.
- dataset:
- The datasets to benchmark on. Mutually exclusive with `task`. If both
- `task` and `dataset` are None then all datasets will be benchmarked.
- Defaults to None.
- progress_bar:
- Whether progress bars should be shown. Defaults to the value specified
- when initialising the benchmarker.
- save_results:
- Whether to save the benchmark results to
- 'euroeval_benchmark_results.jsonl'. Defaults to the value specified
- when initialising the benchmarker.
- language:
- The language codes of the languages to include, both for models and
- datasets. Here 'no' means both Bokmål (nb) and Nynorsk (nn). Set this to
- 'all' if all languages should be considered. Defaults to the value
- specified when initialising the benchmarker.
- model_language:
- The language codes of the languages to include for models. If specified
- then this overrides the `language` parameter for model languages.
- Defaults to the value specified when initialising the benchmarker.
- dataset_language:
- The language codes of the languages to include for datasets. If
- specified then this overrides the `language` parameter for dataset
- languages. Defaults to the value specified when initialising the
- benchmarker.
- device:
- The device to use for benchmarking. Defaults to the value specified when
- initialising the benchmarker.
- batch_size:
- The batch size to use. Defaults to the value specified when initialising
- the benchmarker.
- raise_errors:
- Whether to raise errors instead of skipping the model evaluation.
- cache_dir:
- Directory to store cached models. Defaults to the value specified when
- initialising the benchmarker.
- api_key:
- The API key to use for a given inference server. Defaults to the value
- specified when initialising the benchmarker.
- force:
- Whether to force evaluations of models, even if they have been
- benchmarked already. Defaults to the value specified when initialising
- the benchmarker.
- verbose:
- Whether to output additional output. Defaults to the value specified
- when initialising the benchmarker.
- trust_remote_code:
- Whether to trust remote code when loading models. Defaults to the value
- specified when initialising the benchmarker.
- clear_model_cache:
- Whether to clear the model cache after benchmarking each model. Defaults
- to the value specified when initialising the benchmarker.
- evaluate_test_split:
- Whether to evaluate the test split of the datasets. Defaults to the
- value specified when initialising the benchmarker.
- few_shot:
- Whether to only evaluate the model using few-shot evaluation. Only
- relevant if the model is generative. Defaults to the value specified
- when initialising the benchmarker.
- num_iterations:
- The number of times each model should be evaluated. This is only meant
- to be used for power users, and scores will not be allowed on the
- leaderboards if this is changed. Defaults to the value specified when
- initialising the benchmarker.
- requires_safetensors:
- Whether to only allow models that use the safetensors format. Defaults
- to the value specified when initialising the benchmarker.
-
- Returns:
- A list of benchmark results.
-
- Raises:
- ValueError:
- If both `task` and `dataset` are specified.
- """
+ def __call__(self, *args: t.Any, **kwds: t.Any) -> t.Any: # noqa: ANN401
+ """Alias for `self.benchmark()`."""
  logger.warning(
  "Calling the `Benchmarker` class directly is deprecated. Please use the "
  "`benchmark` function instead. This will be removed in a future version."
  )
- return self.benchmark(
- model=model,
- task=task,
- dataset=dataset,
- progress_bar=progress_bar,
- save_results=save_results,
- language=language,
- model_language=model_language,
- dataset_language=dataset_language,
- device=device,
- batch_size=batch_size,
- raise_errors=raise_errors,
- cache_dir=cache_dir,
- api_key=api_key,
- force=force,
- verbose=verbose,
- trust_remote_code=trust_remote_code,
- clear_model_cache=clear_model_cache,
- evaluate_test_split=evaluate_test_split,
- few_shot=few_shot,
- num_iterations=num_iterations,
- requires_safetensors=requires_safetensors,
- )
+ return self.benchmark(*args, **kwds)


  def model_has_been_benchmarked(
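The __call__ rewrite above drops the duplicated parameter list and docstring and simply forwards everything to benchmark(). A self-contained sketch of the same delegation pattern, with Runner standing in for the real Benchmarker class:

import logging
import typing as t

logger = logging.getLogger(__name__)


class Runner:
    """Illustrative stand-in for the Benchmarker class."""

    def benchmark(self, model: str, **kwargs: t.Any) -> str:
        return f"benchmarked {model}"

    def __call__(self, *args: t.Any, **kwds: t.Any) -> t.Any:
        """Alias for `self.benchmark()`, kept for backwards compatibility."""
        logger.warning("Calling the instance directly is deprecated; use .benchmark().")
        return self.benchmark(*args, **kwds)


runner = Runner()
assert runner("my-model") == runner.benchmark("my-model")

Both runner(...) and runner.benchmark(...) now hit the same code path, so the alias can no longer drift out of sync with the method it wraps.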
euroeval/data_models.py CHANGED
@@ -170,14 +170,16 @@ class BenchmarkConfig:
  """General benchmarking configuration, across datasets and models.

  Attributes:
- model_languages:
- The languages of the models to benchmark.
- dataset_languages:
- The languages of the datasets in the benchmark.
  tasks:
  The tasks benchmark the model(s) on.
  datasets:
  The datasets to benchmark on.
+ model_languages:
+ The languages of the models to benchmark.
+ dataset_languages:
+ The languages of the datasets in the benchmark.
+ device:
+ The device to use for benchmarking.
  batch_size:
  The batch size to use.
  raise_errors:
@@ -186,17 +188,16 @@ class BenchmarkConfig:
  Directory to store cached models and datasets.
  api_key:
  The API key to use for a given inference API.
- force:
- Whether to force the benchmark to run even if the results are already
- cached.
+ api_base:
+ The base URL for a given inference API. Only relevant if `model` refers to a
+ model on an inference API.
+ api_version:
+ The version of the API to use. Only relevant if `model` refers to a model on
+ an inference API.
  progress_bar:
  Whether to show a progress bar.
  save_results:
  Whether to save the benchmark results to 'euroeval_benchmark_results.json'.
- device:
- The device to use for benchmarking.
- verbose:
- Whether to print verbose output.
  trust_remote_code:
  Whether to trust remote code when loading models from the Hugging Face Hub.
  clear_model_cache:
@@ -208,21 +209,11 @@ class BenchmarkConfig:
  if the model is generative.
  num_iterations:
  The number of iterations each model should be evaluated for.
- api_base:
- The base URL for a given inference API. Only relevant if `model` refers to a
- model on an inference API.
- api_version:
- The version of the API to use. Only relevant if `model` refers to a model on
- an inference API.
  gpu_memory_utilization:
  The GPU memory utilization to use for vLLM. A larger value will result in
  faster evaluation, but at the risk of running out of GPU memory. Only reduce
  this if you are running out of GPU memory. Only relevant if the model is
  generative.
- debug:
- Whether to run the benchmark in debug mode.
- run_with_cli:
- Whether the benchmark is being run with the CLI.
  requires_safetensors:
  Whether to only allow models that use the safetensors format.
  generative_type:
@@ -231,6 +222,15 @@ class BenchmarkConfig:
  download_only:
  Whether to only download the models, metrics and datasets without
  evaluating.
+ force:
+ Whether to force the benchmark to run even if the results are already
+ cached.
+ verbose:
+ Whether to print verbose output.
+ debug:
+ Whether to run the benchmark in debug mode.
+ run_with_cli:
+ Whether the benchmark is being run with the CLI.
  """

  model_languages: list[Language]
@@ -241,24 +241,24 @@ class BenchmarkConfig:
  raise_errors: bool
  cache_dir: str
  api_key: str | None
- force: bool
+ api_base: str | None
+ api_version: str | None
  progress_bar: bool
  save_results: bool
  device: torch.device
- verbose: bool
  trust_remote_code: bool
  clear_model_cache: bool
  evaluate_test_split: bool
  few_shot: bool
  num_iterations: int
- api_base: str | None
- api_version: str | None
  gpu_memory_utilization: float
- debug: bool
- run_with_cli: bool
  requires_safetensors: bool
  generative_type: GenerativeType | None
  download_only: bool
+ force: bool
+ verbose: bool
+ debug: bool
+ run_with_cli: bool


  class BenchmarkConfigParams(pydantic.BaseModel):
@@ -266,10 +266,10 @@ class BenchmarkConfigParams(pydantic.BaseModel):

  model_config = pydantic.ConfigDict(protected_namespaces=())

- progress_bar: bool
- save_results: bool
  task: str | list[str] | None
  dataset: str | list[str] | None
+ progress_bar: bool
+ save_results: bool
  language: str | list[str]
  model_language: str | list[str] | None
  dataset_language: str | list[str] | None
@@ -278,20 +278,21 @@ class BenchmarkConfigParams(pydantic.BaseModel):
  raise_errors: bool
  cache_dir: str
  api_key: str | None
- force: bool
- verbose: bool
+ api_base: str | None
+ api_version: str | None
  trust_remote_code: bool
  clear_model_cache: bool
  evaluate_test_split: bool
  few_shot: bool
  num_iterations: int
- api_base: str | None
- api_version: str | None
+ requires_safetensors: bool
+ download_only: bool
  gpu_memory_utilization: float
  generative_type: GenerativeType | None
+ force: bool
+ verbose: bool
  debug: bool
  run_with_cli: bool
- requires_safetensors: bool


  class BenchmarkResult(pydantic.BaseModel):
euroeval/utils.py CHANGED
@@ -62,6 +62,10 @@ def resolve_model_path(download_dir: str) -> str:

  Returns:
  The path to the model.
+
+ Raises:
+ InvalidModel:
+ If the model path is not valid, or if required files are missing.
  """
  model_path = Path(download_dir)
  # Get the 'path safe' version of the model id, which is the last dir in the path
@@ -271,14 +275,15 @@ def internet_connection_available() -> bool:
  s = socket.create_connection(("1.1.1.1", 80))
  s.close()
  return True
- # a bit ugly but we dont want to actually import the pytest-socket exceptions
- # we catch all exceptions and check if the name matches any known errors
+
+ # We want to only catch exceptions related to socket connections, but as we cannot
+ # import these here as they're developer dependencies, we check the exception name
+ # instead. If the exception is not related to socket connections, we reraise it.
  except Exception as e:
  pytest_socket_errors = ["SocketConnectBlockedError", "SocketBlockedError"]
  if type(e).__name__ in pytest_socket_errors or isinstance(e, OSError):
  return False
- else:
- raise e
+ raise e


  class HiddenPrints:
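The rewritten comments in internet_connection_available explain the exception handling: the pytest-socket exception classes are development-only dependencies, so they are matched by class name rather than imported. Below is a self-contained sketch of the resulting function, assembled from the hunk above; it is a simplified reading of the change, not a copy of the module.

import socket


def internet_connection_available() -> bool:
    """Return True if an outbound connection can be opened, otherwise False."""
    try:
        socket.create_connection(("1.1.1.1", 80)).close()
        return True
    except Exception as e:
        # pytest-socket blocks connections by raising SocketConnectBlockedError or
        # SocketBlockedError; they are matched by name because they cannot be
        # imported outside of a development environment.
        pytest_socket_errors = ["SocketConnectBlockedError", "SocketBlockedError"]
        if type(e).__name__ in pytest_socket_errors or isinstance(e, OSError):
            return False
        raise e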
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: EuroEval
- Version: 16.2.0
+ Version: 16.2.2
  Summary: The robust European language model benchmark.
  Project-URL: Repository, https://github.com/EuroEval/EuroEval
  Project-URL: Issues, https://github.com/EuroEval/EuroEval/issues
@@ -1,11 +1,11 @@
  euroeval/__init__.py,sha256=mXTjuGrEE-1fIS9x28oJKg-gNGt4q7y2E74l330KEmY,3787
- euroeval/benchmark_config_factory.py,sha256=NcdxQkGrstsprdz1QW3XrgS8B65uEP5SqxFJoL8zEEk,11831
- euroeval/benchmarker.py,sha256=I82iVGwlRJ9BQ02u_bt5ngN-ZzWEJT2ReCrqXgh6lx4,55285
+ euroeval/benchmark_config_factory.py,sha256=eOQsd9F4cJy8I7a3_lIKDZ5b5ukipIUqk0GZ3pyytwQ,8596
+ euroeval/benchmarker.py,sha256=fjEqAkUC92fYUarWleigxfSrw0siYWW4YI_KlwyDUF4,46992
  euroeval/callbacks.py,sha256=5BTlDvBJ60xRvj01EpXZSZu3MFdKa3LgVuhxoLb3i3E,2565
  euroeval/cli.py,sha256=GOAWzdtasJfOvTuVQszu-T1T9GfQ_un-blOICO-y7g4,9316
  euroeval/constants.py,sha256=NN7kcwQdlDyyGFSrLjsL_qKVRyoRqZ9sKO5SjlgtRwA,2741
  euroeval/data_loading.py,sha256=F3fHyR7FoS_a1dx_DyqtcxdB-jxWwE3RCNRvWcp5z1c,4527
- euroeval/data_models.py,sha256=LNioJFW231RSSKZx7WIs46Xxs0KWgb7ElRyyULHSEzQ,27742
+ euroeval/data_models.py,sha256=X4zAdR1K2MPb4f4Vc7gPYfolzFxxsz5WplnsmsiMYY8,27766
  euroeval/enums.py,sha256=SeFek-Lre2Q5sxbP5svqjDZFZR2vlJhg9dkRH4JvU1g,3436
  euroeval/exceptions.py,sha256=5kQ-YvHyFO3aaA-zfOTaS07LRFH8xlSqlOiATvnIObY,5116
  euroeval/finetuning.py,sha256=G86pxxjOAgtcEWpyYDwYOV9pM7WG2Uu9fu7GdDso8dI,11426
@@ -20,13 +20,13 @@ euroeval/speed_benchmark.py,sha256=3iz_bfJgAoJ9K2HNjufyrBMjHVT8PAjuY_NocBGwKe0,4
  euroeval/tasks.py,sha256=EzEWFDo_0ffabBFiRu-mw80jENUioE8D_VEn_Dsv-F8,4703
  euroeval/tokenisation_utils.py,sha256=nLeF2cdZSm5PZiAcDTtxY82nUJ-or8VU8YxYLa167EM,21158
  euroeval/types.py,sha256=_iVy-RwiCGu9TNX2sfyJTdCvXy1akNGTCywAo-YpBqU,2815
- euroeval/utils.py,sha256=DRJW6wtmNpRtuHt03diWo3S5m3rdxoPEQpd-KWi7aGY,19255
+ euroeval/utils.py,sha256=AyUWGh-G5j14jXZ6ccS1LyTXml2JgbOzOt_e-rr5mag,19451
  euroeval/benchmark_modules/__init__.py,sha256=TNO-sNDwlXE-LMFXfwwqjQqUy55gywSmwRBcoPUFuaU,236
  euroeval/benchmark_modules/base.py,sha256=mHF8XS6GGUXV-sJtxmI5WJBWPLMHuh-4Z4OWjC25x9Y,11566
  euroeval/benchmark_modules/fresh.py,sha256=TveSQiFBi3xXgCEQBdHwkUQ685PDkKW0y3G5Yt5rkeM,10655
  euroeval/benchmark_modules/hf.py,sha256=XmkoDFzaJqnd_5mmUkqCaOgAdRPFs3KZKZZ0cr83TlM,44742
  euroeval/benchmark_modules/litellm.py,sha256=F3udd6NmhQOe3go_7rAcWg7mgZrNQpWWvLe-5U4E2RQ,64771
- euroeval/benchmark_modules/vllm.py,sha256=yLy8TCTnodu4NdTiO7XSdxuHX60AJ1-7p6J3e5h7-iA,43994
+ euroeval/benchmark_modules/vllm.py,sha256=bo5XaKlHEKhdEFPNJxsnJFq4RWOC9VoOH4Hqw_6dbMQ,43893
  euroeval/dataset_configs/__init__.py,sha256=uuIZmElpJV8iupo5oDj3TeQhBDRANdWpLKYFASLirHA,2046
  euroeval/dataset_configs/danish.py,sha256=QABfgI7m-0-5AimDXegp5ssDSLcM2VrAI_RWsinSZP4,5631
  euroeval/dataset_configs/dutch.py,sha256=63Ro2yFym5MuIDXf5953vUYenw9B0kZSCmZbXjdy4Rs,5517
@@ -63,8 +63,8 @@ euroeval/task_group_utils/question_answering.py,sha256=eUczZntrC9lhCUQlwNQB49i-5
  euroeval/task_group_utils/sequence_classification.py,sha256=TAqZCoMQ9I-HFhMH35_J1mY2SQg95HUbXcgrBIyhgk0,16082
  euroeval/task_group_utils/text_to_text.py,sha256=7f4hGAs5WNJ9PmW1mLhjDMrPxrYAvw5axXsneiJop1w,4993
  euroeval/task_group_utils/token_classification.py,sha256=Yjai937ia1nZBMOWySqCXr_dA6WiVLGvmb4Hm_TU0Bg,17118
- euroeval-16.2.0.dist-info/METADATA,sha256=GQ1C9avsX8wl0Hcj3wmXvziveGDFWUT2aUrhhjIDzwc,14590
- euroeval-16.2.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
- euroeval-16.2.0.dist-info/entry_points.txt,sha256=-mtBu-10bFWeZ2bS32gVK6-s-LNCQLxvnNUPBLd5ud4,87
- euroeval-16.2.0.dist-info/licenses/LICENSE,sha256=guvz_zBHgkQSY_QiUU0Bkc1k-L_PFZuLjIPfuKne2OY,1080
- euroeval-16.2.0.dist-info/RECORD,,
+ euroeval-16.2.2.dist-info/METADATA,sha256=jGGv76AqT4vGKREN8jD3bBHi19vVyIKUNvlk6FNhRN8,14590
+ euroeval-16.2.2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+ euroeval-16.2.2.dist-info/entry_points.txt,sha256=-mtBu-10bFWeZ2bS32gVK6-s-LNCQLxvnNUPBLd5ud4,87
+ euroeval-16.2.2.dist-info/licenses/LICENSE,sha256=guvz_zBHgkQSY_QiUU0Bkc1k-L_PFZuLjIPfuKne2OY,1080
+ euroeval-16.2.2.dist-info/RECORD,,