EuroEval 16.2.1-py3-none-any.whl → 16.2.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of EuroEval might be problematic.

euroeval/benchmark_modules/vllm.py CHANGED
@@ -836,15 +836,18 @@ def load_model_and_tokeniser(
 
     clear_vllm()
 
-    # if we do not have an internet connection we need to give the path to the folder
-    # that contains the model weights and config files, otherwise vLLM will try to
-    # download them regardless if they are already present in the download_dir
-    model_path = resolve_model_path(download_dir)
-
     try:
         model = LLM(
-            model=model_id if internet_connection_available() else model_path,
-            tokenizer=model_id if internet_connection_available() else model_path,
+            model=(
+                model_id
+                if internet_connection_available()
+                else resolve_model_path(download_dir=download_dir)
+            ),
+            tokenizer=(
+                model_id
+                if internet_connection_available()
+                else resolve_model_path(download_dir=download_dir)
+            ),
             gpu_memory_utilization=benchmark_config.gpu_memory_utilization,
             max_model_len=min(true_max_model_len, MAX_CONTEXT_LENGTH),
             download_dir=download_dir,
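The behavioural fix here: in 16.2.1, resolve_model_path(download_dir) ran unconditionally before the try block, so it could fail on a machine that was online but had not yet downloaded the model; in 16.2.2 the local path is only resolved when no internet connection is available. A minimal runnable sketch of the new pattern, using a hypothetical stand-in for the real helper:

from pathlib import Path


def resolve_model_path_stub(download_dir: str) -> str:
    """Hypothetical stand-in for euroeval.utils.resolve_model_path, which
    raises when no weights have been downloaded yet."""
    path = Path(download_dir)
    if not path.exists():
        raise FileNotFoundError(f"no local model weights under {download_dir}")
    return str(path)


def pick_model_source(model_id: str, download_dir: str, online: bool) -> str:
    # 16.2.2 behaviour: the local path is resolved lazily, only when offline,
    # so a fresh online run never touches the (possibly empty) download dir.
    return model_id if online else resolve_model_path_stub(download_dir)


# An online run with nothing downloaded yet now succeeds:
assert pick_model_source("org/model", "/nonexistent", online=True) == "org/model"

One trade-off visible in the diff: offline runs now call resolve_model_path twice, once for the model and once for the tokenizer.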
euroeval/benchmarker.py CHANGED
@@ -6,7 +6,6 @@ import logging
 import re
 import sys
 import typing as t
-from copy import deepcopy
 from pathlib import Path
 from shutil import rmtree
 from time import sleep
@@ -200,10 +199,10 @@ class Benchmarker:
        )

        self.benchmark_config_default_params = BenchmarkConfigParams(
-            progress_bar=progress_bar,
-            save_results=save_results,
            task=task,
            dataset=dataset,
+            progress_bar=progress_bar,
+            save_results=save_results,
            language=language,
            model_language=model_language,
            dataset_language=dataset_language,
@@ -212,21 +211,21 @@ class Benchmarker:
            raise_errors=raise_errors,
            cache_dir=cache_dir,
            api_key=api_key,
-            force=force,
-            verbose=verbose,
+            api_base=api_base,
+            api_version=api_version,
            trust_remote_code=trust_remote_code,
            clear_model_cache=clear_model_cache,
            evaluate_test_split=evaluate_test_split,
            few_shot=few_shot,
            num_iterations=num_iterations,
-            api_base=api_base,
-            api_version=api_version,
+            requires_safetensors=requires_safetensors,
+            download_only=download_only,
            gpu_memory_utilization=gpu_memory_utilization,
            generative_type=generative_type,
-            download_only=download_only,
+            verbose=verbose,
+            force=force,
            debug=debug,
            run_with_cli=run_with_cli,
-            requires_safetensors=requires_safetensors,
        )

        self.benchmark_config = build_benchmark_config(
@@ -332,8 +331,8 @@ class Benchmarker:
        raise_errors: bool | None = None,
        cache_dir: str | None = None,
        api_key: str | None = None,
-        force: bool | None = None,
-        verbose: bool | None = None,
+        api_base: str | None = None,
+        api_version: str | None = None,
        trust_remote_code: bool | None = None,
        clear_model_cache: bool | None = None,
        evaluate_test_split: bool | None = None,
@@ -341,6 +340,11 @@ class Benchmarker:
        num_iterations: int | None = None,
        requires_safetensors: bool | None = None,
        download_only: bool | None = None,
+        gpu_memory_utilization: float | None = None,
+        generative_type: GenerativeType | None = None,
+        force: bool | None = None,
+        verbose: bool | None = None,
+        debug: bool | None = None,
    ) -> list[BenchmarkResult]:
        """Benchmarks models on datasets.

@@ -393,13 +397,13 @@ class Benchmarker:
            api_key:
                The API key to use for a given inference server. Defaults to the value
                specified when initialising the benchmarker.
-            force:
-                Whether to force evaluations of models, even if they have been
-                benchmarked already. Defaults to the value specified when initialising
-                the benchmarker.
-            verbose:
-                Whether to output additional output. Defaults to the value specified
-                when initialising the benchmarker.
+            api_base:
+                The base URL for a given inference API. Only relevant if `model` refers
+                to a model on an inference API. Defaults to the value specified when
+                initialising the benchmarker.
+            api_version:
+                The version of the API to use. Defaults to the value specified when
+                initialising the benchmarker.
            trust_remote_code:
                Whether to trust remote code when loading models. Defaults to the value
                specified when initialising the benchmarker.
@@ -424,6 +428,27 @@ class Benchmarker:
            download_only:
                Whether to only download the models without evaluating them. Defaults
                to the value specified when initialising the benchmarker.
+            gpu_memory_utilization:
+                The GPU memory utilization to use for vLLM. Only relevant if the model
+                is generative. A larger value will result in faster evaluation, but at
+                the risk of running out of GPU memory. Only reduce this if you are
+                running out of GPU memory. Defaults to the value specified when
+                initialising the benchmarker.
+            generative_type:
+                The type of generative model to benchmark. Only relevant if the model is
+                generative. If not specified, then the type will be inferred based on
+                the tags of the model. Defaults to the value specified when initialising
+                the benchmarker.
+            force:
+                Whether to force evaluations of models, even if they have been
+                benchmarked already. Defaults to the value specified when initialising
+                the benchmarker.
+            verbose:
+                Whether to output additional output. Defaults to the value specified
+                when initialising the benchmarker.
+            debug:
+                Whether to output debug information. Defaults to the value specified
+                when initialising the benchmarker.

        Returns:
            A list of benchmark results.
@@ -435,28 +460,141 @@ class Benchmarker:
        if task is not None and dataset is not None:
            raise ValueError("Only one of `task` and `dataset` can be specified.")

-        benchmark_config = self._get_updated_benchmark_config(
-            task=task,
-            dataset=dataset,
-            progress_bar=progress_bar,
-            save_results=save_results,
-            language=language,
-            model_language=model_language,
-            dataset_language=dataset_language,
-            device=device,
-            batch_size=batch_size,
-            raise_errors=raise_errors,
-            cache_dir=cache_dir,
-            api_key=api_key,
-            force=force,
-            verbose=verbose,
-            trust_remote_code=trust_remote_code,
-            clear_model_cache=clear_model_cache,
-            evaluate_test_split=evaluate_test_split,
-            few_shot=few_shot,
-            num_iterations=num_iterations,
-            requires_safetensors=requires_safetensors,
-            download_only=download_only,
+        # Get a new updated benchmark configuration, based on any changes to the
+        # parameters
+        benchmark_config_params = BenchmarkConfigParams(
+            task=(
+                task if task is not None else self.benchmark_config_default_params.task
+            ),
+            dataset=(
+                dataset
+                if dataset is not None
+                else self.benchmark_config_default_params.dataset
+            ),
+            progress_bar=(
+                progress_bar
+                if progress_bar is not None
+                else self.benchmark_config_default_params.progress_bar
+            ),
+            save_results=(
+                save_results
+                if save_results is not None
+                else self.benchmark_config_default_params.save_results
+            ),
+            language=(
+                language
+                if language is not None
+                else self.benchmark_config_default_params.language
+            ),
+            model_language=(
+                model_language
+                if model_language is not None
+                else self.benchmark_config_default_params.model_language
+            ),
+            dataset_language=(
+                dataset_language
+                if dataset_language is not None
+                else self.benchmark_config_default_params.dataset_language
+            ),
+            device=(
+                device
+                if device is not None
+                else self.benchmark_config_default_params.device
+            ),
+            batch_size=(
+                batch_size
+                if batch_size is not None
+                else self.benchmark_config_default_params.batch_size
+            ),
+            raise_errors=(
+                raise_errors
+                if raise_errors is not None
+                else self.benchmark_config_default_params.raise_errors
+            ),
+            cache_dir=(
+                cache_dir
+                if cache_dir is not None
+                else self.benchmark_config_default_params.cache_dir
+            ),
+            api_key=(
+                api_key
+                if api_key is not None
+                else self.benchmark_config_default_params.api_key
+            ),
+            api_base=(
+                api_base
+                if api_base is not None
+                else self.benchmark_config_default_params.api_base
+            ),
+            api_version=(
+                api_version
+                if api_version is not None
+                else self.benchmark_config_default_params.api_version
+            ),
+            trust_remote_code=(
+                trust_remote_code
+                if trust_remote_code is not None
+                else self.benchmark_config_default_params.trust_remote_code
+            ),
+            clear_model_cache=(
+                clear_model_cache
+                if clear_model_cache is not None
+                else self.benchmark_config_default_params.clear_model_cache
+            ),
+            evaluate_test_split=(
+                evaluate_test_split
+                if evaluate_test_split is not None
+                else self.benchmark_config_default_params.evaluate_test_split
+            ),
+            few_shot=(
+                few_shot
+                if few_shot is not None
+                else self.benchmark_config_default_params.few_shot
+            ),
+            num_iterations=(
+                num_iterations
+                if num_iterations is not None
+                else self.benchmark_config_default_params.num_iterations
+            ),
+            requires_safetensors=(
+                requires_safetensors
+                if requires_safetensors is not None
+                else self.benchmark_config_default_params.requires_safetensors
+            ),
+            download_only=(
+                download_only
+                if download_only is not None
+                else self.benchmark_config_default_params.download_only
+            ),
+            gpu_memory_utilization=(
+                gpu_memory_utilization
+                if gpu_memory_utilization is not None
+                else self.benchmark_config_default_params.gpu_memory_utilization
+            ),
+            generative_type=(
+                generative_type
+                if generative_type is not None
+                else self.benchmark_config_default_params.generative_type
+            ),
+            force=(
+                force
+                if force is not None
+                else self.benchmark_config_default_params.force
+            ),
+            verbose=(
+                verbose
+                if verbose is not None
+                else self.benchmark_config_default_params.verbose
+            ),
+            debug=(
+                debug
+                if debug is not None
+                else self.benchmark_config_default_params.debug
+            ),
+            run_with_cli=self.benchmark_config_default_params.run_with_cli,
+        )
+        benchmark_config = build_benchmark_config(
+            benchmark_config_params=benchmark_config_params
        )

        adjust_logging_level(verbose=benchmark_config.verbose)
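The rewritten benchmark() merges per-call overrides with the defaults captured at construction time: any argument left as None falls back to self.benchmark_config_default_params. A minimal sketch of that merge pattern, with hypothetical fields:

from dataclasses import dataclass, fields


@dataclass
class Params:
    batch_size: int = 32
    verbose: bool = False


def merge(defaults: Params, **overrides: object) -> Params:
    # Mirrors the `x if x is not None else defaults.x` chains above: a None
    # (or omitted) override keeps the stored default.
    merged = {
        f.name: (
            overrides[f.name]
            if overrides.get(f.name) is not None
            else getattr(defaults, f.name)
        )
        for f in fields(Params)
    }
    return Params(**merged)


assert merge(Params(), batch_size=8) == Params(batch_size=8, verbose=False)
assert merge(Params(), batch_size=None) == Params()

Note a subtle behavioural difference visible in the diff: the removed helper below cleared `dataset` whenever `task` was overridden (and vice versa), whereas the new construction falls back to the stored default for the other parameter and relies on the ValueError guard for mutual exclusion.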
@@ -654,176 +792,6 @@ class Benchmarker:
            destroy_process_group()
        return current_benchmark_results

-    def _get_updated_benchmark_config(
-        self,
-        progress_bar: bool | None = None,
-        save_results: bool | None = None,
-        task: str | list[str] | None | None = None,
-        dataset: str | list[str] | None | None = None,
-        language: str | list[str] | None = None,
-        model_language: str | list[str] | None | None = None,
-        dataset_language: str | list[str] | None | None = None,
-        device: Device | None | None = None,
-        batch_size: int | None = None,
-        raise_errors: bool | None = None,
-        cache_dir: str | None = None,
-        api_key: str | None | None = None,
-        force: bool | None = None,
-        verbose: bool | None = None,
-        trust_remote_code: bool | None = None,
-        clear_model_cache: bool | None = None,
-        evaluate_test_split: bool | None = None,
-        few_shot: bool | None = None,
-        num_iterations: int | None = None,
-        api_base: str | None | None = None,
-        api_version: str | None | None = None,
-        debug: bool | None = None,
-        run_with_cli: bool | None = None,
-        requires_safetensors: bool | None = None,
-        download_only: bool | None = None,
-    ) -> "BenchmarkConfig":
-        """Get an updated benchmark configuration.
-
-        Args:
-            progress_bar:
-                Whether progress bars should be shown. If None, then this value will not
-                be updated.
-            save_results:
-                Whether to save the benchmark results to
-                'euroeval_benchmark_results.jsonl'. If None, then this value will not
-                be updated.
-            task:
-                The tasks benchmark the model(s) on. If None, then this value will not
-                be updated.
-            dataset:
-                The datasets to benchmark on. If None, then this value will not be
-                updated.
-            language:
-                The language codes of the languages to include, both for models and
-                datasets. If None, then this value will not be updated.
-            model_language:
-                The language codes of the languages to include for models. If None, then
-                this value will not be updated.
-            dataset_language:
-                The language codes of the languages to include for datasets. If None,
-                then this value will not be updated.
-            device:
-                The device to use for benchmarking. If None, then this value will not be
-                updated.
-            batch_size:
-                The batch size to use. If None, then this value will not be updated.
-            raise_errors:
-                Whether to raise errors instead of skipping the model evaluation. If
-                None, then this value will not be updated.
-            cache_dir:
-                Directory to store cached models. If None, then this value will not be
-                updated.
-            api_key:
-                The API key to use for a given inference server. If None, then this
-                value will not be updated.
-            force:
-                Whether to force evaluations of models, even if they have been
-                benchmarked already. If None, then this value will not be updated.
-            verbose:
-                Whether to output additional output. If None, then this value will not
-                be updated.
-            trust_remote_code:
-                Whether to trust remote code when loading models. If None, then this
-                value will not be updated.
-            clear_model_cache:
-                Whether to clear the model cache after benchmarking each model. If None,
-                then this value will not be updated.
-            evaluate_test_split:
-                Whether to evaluate the test split of the datasets. If None, then this
-                value will not be updated.
-            few_shot:
-                Whether to only evaluate the model using few-shot evaluation. If None,
-                then this value will not be updated.
-            num_iterations:
-                The number of times each model should be evaluated. If None, then this
-                value will not be updated.
-            api_base:
-                The base URL for a given inference API. If None, then this value will
-                not be updated.
-            api_version:
-                The version of the API to use. If None, then this value will not be
-                updated.
-            debug:
-                Whether to output debug information. If None, then this value will not
-                be updated.
-            run_with_cli:
-                Whether the benchmarker is being run from the command-line interface.
-                If None, then this value will not be updated.
-            requires_safetensors:
-                Whether to only allow models that use the safetensors format. If None,
-                then this value will not be updated.
-            download_only:
-                Whether to only download the models without evaluating them. If None,
-                then this value will not be updated.
-            download_only:
-                Whether to only download models and datasets without performing any
-                benchmarking. If None, then this value will not be updated.
-
-        Returns:
-            The updated benchmark configuration.
-        """
-        benchmark_config_params = deepcopy(self.benchmark_config_default_params)
-
-        if progress_bar is not None:
-            benchmark_config_params.progress_bar = progress_bar
-        if save_results is not None:
-            benchmark_config_params.save_results = save_results
-        if task is not None:
-            benchmark_config_params.task = task
-            benchmark_config_params.dataset = None
-        if dataset is not None:
-            benchmark_config_params.dataset = dataset
-            benchmark_config_params.task = None
-        if language is not None:
-            benchmark_config_params.language = language
-        if model_language is not None:
-            benchmark_config_params.model_language = model_language
-        if dataset_language is not None:
-            benchmark_config_params.dataset_language = dataset_language
-        if device is not None:
-            benchmark_config_params.device = device
-        if batch_size is not None:
-            benchmark_config_params.batch_size = batch_size
-        if raise_errors is not None:
-            benchmark_config_params.raise_errors = raise_errors
-        if cache_dir is not None:
-            benchmark_config_params.cache_dir = cache_dir
-        if api_key is not None:
-            benchmark_config_params.api_key = api_key
-        if force is not None:
-            benchmark_config_params.force = force
-        if verbose is not None:
-            benchmark_config_params.verbose = verbose
-        if trust_remote_code is not None:
-            benchmark_config_params.trust_remote_code = trust_remote_code
-        if clear_model_cache is not None:
-            benchmark_config_params.clear_model_cache = clear_model_cache
-        if evaluate_test_split is not None:
-            benchmark_config_params.evaluate_test_split = evaluate_test_split
-        if few_shot is not None:
-            benchmark_config_params.few_shot = few_shot
-        if num_iterations is not None:
-            benchmark_config_params.num_iterations = num_iterations
-        if api_base is not None:
-            benchmark_config_params.api_base = api_base
-        if api_version is not None:
-            benchmark_config_params.api_version = api_version
-        if debug is not None:
-            benchmark_config_params.debug = debug
-        if run_with_cli is not None:
-            benchmark_config_params.run_with_cli = run_with_cli
-        if requires_safetensors is not None:
-            benchmark_config_params.requires_safetensors = requires_safetensors
-        if download_only is not None:
-            benchmark_config_params.download_only = download_only
-
-        return build_benchmark_config(benchmark_config_params=benchmark_config_params)
-
    def _prepare_model_ids(self, model_id: list[str] | str) -> list[str]:
        """Prepare the model ID(s) to be benchmarked.

@@ -991,144 +959,13 @@ class Benchmarker:
            raise e
        return e

-    def __call__(
-        self,
-        model: list[str] | str,
-        task: str | list[str] | None = None,
-        dataset: list[str] | str | None = None,
-        progress_bar: bool | None = None,
-        save_results: bool | None = None,
-        language: str | list[str] | None = None,
-        model_language: str | list[str] | None = None,
-        dataset_language: str | list[str] | None = None,
-        device: Device | None = None,
-        batch_size: int | None = None,
-        raise_errors: bool | None = None,
-        cache_dir: str | None = None,
-        api_key: str | None = None,
-        force: bool | None = None,
-        verbose: bool | None = None,
-        trust_remote_code: bool | None = None,
-        clear_model_cache: bool | None = None,
-        evaluate_test_split: bool | None = None,
-        few_shot: bool | None = None,
-        num_iterations: int | None = None,
-        requires_safetensors: bool | None = None,
-    ) -> list[BenchmarkResult]:
-        """Benchmarks models on datasets.
-
-        Args:
-            model:
-                The full Hugging Face Hub path(s) to the pretrained transformer model.
-                The specific model version to use can be added after the suffix '@':
-                "model@v1.0.0". It can be a branch name, a tag name, or a commit id,
-                and defaults to the latest version if not specified.
-            task:
-                The tasks benchmark the model(s) on. Mutually exclusive with `dataset`.
-                If both `task` and `dataset` are None then all datasets will be
-                benchmarked. Defaults to None.
-            dataset:
-                The datasets to benchmark on. Mutually exclusive with `task`. If both
-                `task` and `dataset` are None then all datasets will be benchmarked.
-                Defaults to None.
-            progress_bar:
-                Whether progress bars should be shown. Defaults to the value specified
-                when initialising the benchmarker.
-            save_results:
-                Whether to save the benchmark results to
-                'euroeval_benchmark_results.jsonl'. Defaults to the value specified
-                when initialising the benchmarker.
-            language:
-                The language codes of the languages to include, both for models and
-                datasets. Here 'no' means both Bokmål (nb) and Nynorsk (nn). Set this to
-                'all' if all languages should be considered. Defaults to the value
-                specified when initialising the benchmarker.
-            model_language:
-                The language codes of the languages to include for models. If specified
-                then this overrides the `language` parameter for model languages.
-                Defaults to the value specified when initialising the benchmarker.
-            dataset_language:
-                The language codes of the languages to include for datasets. If
-                specified then this overrides the `language` parameter for dataset
-                languages. Defaults to the value specified when initialising the
-                benchmarker.
-            device:
-                The device to use for benchmarking. Defaults to the value specified when
-                initialising the benchmarker.
-            batch_size:
-                The batch size to use. Defaults to the value specified when initialising
-                the benchmarker.
-            raise_errors:
-                Whether to raise errors instead of skipping the model evaluation.
-            cache_dir:
-                Directory to store cached models. Defaults to the value specified when
-                initialising the benchmarker.
-            api_key:
-                The API key to use for a given inference server. Defaults to the value
-                specified when initialising the benchmarker.
-            force:
-                Whether to force evaluations of models, even if they have been
-                benchmarked already. Defaults to the value specified when initialising
-                the benchmarker.
-            verbose:
-                Whether to output additional output. Defaults to the value specified
-                when initialising the benchmarker.
-            trust_remote_code:
-                Whether to trust remote code when loading models. Defaults to the value
-                specified when initialising the benchmarker.
-            clear_model_cache:
-                Whether to clear the model cache after benchmarking each model. Defaults
-                to the value specified when initialising the benchmarker.
-            evaluate_test_split:
-                Whether to evaluate the test split of the datasets. Defaults to the
-                value specified when initialising the benchmarker.
-            few_shot:
-                Whether to only evaluate the model using few-shot evaluation. Only
-                relevant if the model is generative. Defaults to the value specified
-                when initialising the benchmarker.
-            num_iterations:
-                The number of times each model should be evaluated. This is only meant
-                to be used for power users, and scores will not be allowed on the
-                leaderboards if this is changed. Defaults to the value specified when
-                initialising the benchmarker.
-            requires_safetensors:
-                Whether to only allow models that use the safetensors format. Defaults
-                to the value specified when initialising the benchmarker.
-
-        Returns:
-            A list of benchmark results.
-
-        Raises:
-            ValueError:
-                If both `task` and `dataset` are specified.
-        """
+    def __call__(self, *args: t.Any, **kwds: t.Any) -> t.Any:  # noqa: ANN401
+        """Alias for `self.benchmark()`."""
        logger.warning(
            "Calling the `Benchmarker` class directly is deprecated. Please use the "
            "`benchmark` function instead. This will be removed in a future version."
        )
-        return self.benchmark(
-            model=model,
-            task=task,
-            dataset=dataset,
-            progress_bar=progress_bar,
-            save_results=save_results,
-            language=language,
-            model_language=model_language,
-            dataset_language=dataset_language,
-            device=device,
-            batch_size=batch_size,
-            raise_errors=raise_errors,
-            cache_dir=cache_dir,
-            api_key=api_key,
-            force=force,
-            verbose=verbose,
-            trust_remote_code=trust_remote_code,
-            clear_model_cache=clear_model_cache,
-            evaluate_test_split=evaluate_test_split,
-            few_shot=few_shot,
-            num_iterations=num_iterations,
-            requires_safetensors=requires_safetensors,
-        )
+        return self.benchmark(*args, **kwds)


    def model_has_been_benchmarked(
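The 16.2.1 __call__ re-declared benchmark()'s parameter list by hand and had drifted out of sync with it (it was missing download_only, api_base and others); forwarding *args/**kwds keeps the deprecated alias from drifting again. A generic sketch of the pattern, using the stdlib warnings module rather than the package's logger:

import typing as t
import warnings


class Runner:
    """Hypothetical class demonstrating a deprecated __call__ alias."""

    def run(self, job: str, retries: int = 0) -> str:
        return f"ran {job} with {retries} retries"

    def __call__(self, *args: t.Any, **kwds: t.Any) -> t.Any:
        # Forward everything so the alias can never fall out of sync with
        # the real signature, as a hand-copied parameter list can.
        warnings.warn(
            "Calling `Runner` directly is deprecated; use `run` instead.",
            DeprecationWarning,
            stacklevel=2,
        )
        return self.run(*args, **kwds)


assert Runner()("job", retries=1) == "ran job with 1 retries"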
euroeval/data_models.py CHANGED
@@ -170,14 +170,16 @@ class BenchmarkConfig:
    """General benchmarking configuration, across datasets and models.

    Attributes:
-        model_languages:
-            The languages of the models to benchmark.
-        dataset_languages:
-            The languages of the datasets in the benchmark.
        tasks:
            The tasks benchmark the model(s) on.
        datasets:
            The datasets to benchmark on.
+        model_languages:
+            The languages of the models to benchmark.
+        dataset_languages:
+            The languages of the datasets in the benchmark.
+        device:
+            The device to use for benchmarking.
        batch_size:
            The batch size to use.
        raise_errors:
@@ -186,17 +188,16 @@ class BenchmarkConfig:
            Directory to store cached models and datasets.
        api_key:
            The API key to use for a given inference API.
-        force:
-            Whether to force the benchmark to run even if the results are already
-            cached.
+        api_base:
+            The base URL for a given inference API. Only relevant if `model` refers to a
+            model on an inference API.
+        api_version:
+            The version of the API to use. Only relevant if `model` refers to a model on
+            an inference API.
        progress_bar:
            Whether to show a progress bar.
        save_results:
            Whether to save the benchmark results to 'euroeval_benchmark_results.json'.
-        device:
-            The device to use for benchmarking.
-        verbose:
-            Whether to print verbose output.
        trust_remote_code:
            Whether to trust remote code when loading models from the Hugging Face Hub.
        clear_model_cache:
@@ -208,21 +209,11 @@ class BenchmarkConfig:
            if the model is generative.
        num_iterations:
            The number of iterations each model should be evaluated for.
-        api_base:
-            The base URL for a given inference API. Only relevant if `model` refers to a
-            model on an inference API.
-        api_version:
-            The version of the API to use. Only relevant if `model` refers to a model on
-            an inference API.
        gpu_memory_utilization:
            The GPU memory utilization to use for vLLM. A larger value will result in
            faster evaluation, but at the risk of running out of GPU memory. Only reduce
            this if you are running out of GPU memory. Only relevant if the model is
            generative.
-        debug:
-            Whether to run the benchmark in debug mode.
-        run_with_cli:
-            Whether the benchmark is being run with the CLI.
        requires_safetensors:
            Whether to only allow models that use the safetensors format.
        generative_type:
@@ -231,6 +222,15 @@ class BenchmarkConfig:
        download_only:
            Whether to only download the models, metrics and datasets without
            evaluating.
+        force:
+            Whether to force the benchmark to run even if the results are already
+            cached.
+        verbose:
+            Whether to print verbose output.
+        debug:
+            Whether to run the benchmark in debug mode.
+        run_with_cli:
+            Whether the benchmark is being run with the CLI.
    """

    model_languages: list[Language]
@@ -241,24 +241,24 @@ class BenchmarkConfig:
    raise_errors: bool
    cache_dir: str
    api_key: str | None
-    force: bool
+    api_base: str | None
+    api_version: str | None
    progress_bar: bool
    save_results: bool
    device: torch.device
-    verbose: bool
    trust_remote_code: bool
    clear_model_cache: bool
    evaluate_test_split: bool
    few_shot: bool
    num_iterations: int
-    api_base: str | None
-    api_version: str | None
    gpu_memory_utilization: float
-    debug: bool
-    run_with_cli: bool
    requires_safetensors: bool
    generative_type: GenerativeType | None
    download_only: bool
+    force: bool
+    verbose: bool
+    debug: bool
+    run_with_cli: bool


class BenchmarkConfigParams(pydantic.BaseModel):
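Worth noting for downstream users: if BenchmarkConfig is a plain dataclass (its pydantic sibling below is keyword-only by default), reordering its fields changes the positional __init__ signature, so only keyword construction is safe across 16.2.1 → 16.2.2. A tiny illustration with hypothetical fields:

from dataclasses import dataclass


@dataclass
class Old:
    force: bool
    progress_bar: bool


@dataclass
class New:  # same fields, reordered as in the diff
    progress_bar: bool
    force: bool


# Positional construction silently swaps meaning across the reorder:
assert Old(True, False).force != New(True, False).force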
@@ -266,10 +266,10 @@ class BenchmarkConfigParams(pydantic.BaseModel):

    model_config = pydantic.ConfigDict(protected_namespaces=())

-    progress_bar: bool
-    save_results: bool
    task: str | list[str] | None
    dataset: str | list[str] | None
+    progress_bar: bool
+    save_results: bool
    language: str | list[str]
    model_language: str | list[str] | None
    dataset_language: str | list[str] | None
@@ -278,21 +278,21 @@ class BenchmarkConfigParams(pydantic.BaseModel):
    raise_errors: bool
    cache_dir: str
    api_key: str | None
-    force: bool
-    verbose: bool
+    api_base: str | None
+    api_version: str | None
    trust_remote_code: bool
    clear_model_cache: bool
    evaluate_test_split: bool
    few_shot: bool
    num_iterations: int
-    api_base: str | None
-    api_version: str | None
+    requires_safetensors: bool
+    download_only: bool
    gpu_memory_utilization: float
    generative_type: GenerativeType | None
-    download_only: bool
+    force: bool
+    verbose: bool
    debug: bool
    run_with_cli: bool
-    requires_safetensors: bool


class BenchmarkResult(pydantic.BaseModel):
euroeval/utils.py CHANGED
@@ -62,6 +62,10 @@ def resolve_model_path(download_dir: str) -> str:

    Returns:
        The path to the model.
+
+    Raises:
+        InvalidModel:
+            If the model path is not valid, or if required files are missing.
    """
    model_path = Path(download_dir)
    # Get the 'path safe' version of the model id, which is the last dir in the path
@@ -271,14 +275,15 @@ def internet_connection_available() -> bool:
        s = socket.create_connection(("1.1.1.1", 80))
        s.close()
        return True
-    # a bit ugly but we dont want to actually import the pytest-socket exceptions
-    # we catch all exceptions and check if the name matches any known errors
+
+    # We want to only catch exceptions related to socket connections, but as we cannot
+    # import these here as they're developer dependencies, we check the exception name
+    # instead. If the exception is not related to socket connections, we reraise it.
    except Exception as e:
        pytest_socket_errors = ["SocketConnectBlockedError", "SocketBlockedError"]
        if type(e).__name__ in pytest_socket_errors or isinstance(e, OSError):
            return False
-        else:
-            raise e
+        raise e


class HiddenPrints:
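The rewritten comment explains the trick: pytest-socket's exception classes are dev-only dependencies, so they are matched by class name rather than imported. A self-contained sketch of the same pattern (with a timeout added here so the probe cannot hang, which the diffed code does not set):

import socket


def online() -> bool:
    """Sketch of the connectivity probe, matching blocked-socket errors
    from pytest-socket by name instead of importing them."""
    try:
        socket.create_connection(("1.1.1.1", 80), timeout=5).close()
        return True
    except Exception as e:
        blocked = {"SocketConnectBlockedError", "SocketBlockedError"}
        if type(e).__name__ in blocked or isinstance(e, OSError):
            return False
        raise  # unrelated exceptions propagate, as in the 16.2.2 code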
euroeval-16.2.2.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: EuroEval
-Version: 16.2.1
+Version: 16.2.2
 Summary: The robust European language model benchmark.
 Project-URL: Repository, https://github.com/EuroEval/EuroEval
 Project-URL: Issues, https://github.com/EuroEval/EuroEval/issues
euroeval-16.2.2.dist-info/RECORD CHANGED
@@ -1,11 +1,11 @@
 euroeval/__init__.py,sha256=mXTjuGrEE-1fIS9x28oJKg-gNGt4q7y2E74l330KEmY,3787
 euroeval/benchmark_config_factory.py,sha256=eOQsd9F4cJy8I7a3_lIKDZ5b5ukipIUqk0GZ3pyytwQ,8596
-euroeval/benchmarker.py,sha256=5l4p1ncq4VJX_bDjv2f8oBq2GETPtJmduGOnLAbWjF8,55762
+euroeval/benchmarker.py,sha256=fjEqAkUC92fYUarWleigxfSrw0siYWW4YI_KlwyDUF4,46992
 euroeval/callbacks.py,sha256=5BTlDvBJ60xRvj01EpXZSZu3MFdKa3LgVuhxoLb3i3E,2565
 euroeval/cli.py,sha256=GOAWzdtasJfOvTuVQszu-T1T9GfQ_un-blOICO-y7g4,9316
 euroeval/constants.py,sha256=NN7kcwQdlDyyGFSrLjsL_qKVRyoRqZ9sKO5SjlgtRwA,2741
 euroeval/data_loading.py,sha256=F3fHyR7FoS_a1dx_DyqtcxdB-jxWwE3RCNRvWcp5z1c,4527
-euroeval/data_models.py,sha256=9Sgrq6Ktg1ETXRJ0v4VA_amAPowGuB7fZtL-8RlDQn0,27766
+euroeval/data_models.py,sha256=X4zAdR1K2MPb4f4Vc7gPYfolzFxxsz5WplnsmsiMYY8,27766
 euroeval/enums.py,sha256=SeFek-Lre2Q5sxbP5svqjDZFZR2vlJhg9dkRH4JvU1g,3436
 euroeval/exceptions.py,sha256=5kQ-YvHyFO3aaA-zfOTaS07LRFH8xlSqlOiATvnIObY,5116
 euroeval/finetuning.py,sha256=G86pxxjOAgtcEWpyYDwYOV9pM7WG2Uu9fu7GdDso8dI,11426
@@ -20,13 +20,13 @@ euroeval/speed_benchmark.py,sha256=3iz_bfJgAoJ9K2HNjufyrBMjHVT8PAjuY_NocBGwKe0,4
 euroeval/tasks.py,sha256=EzEWFDo_0ffabBFiRu-mw80jENUioE8D_VEn_Dsv-F8,4703
 euroeval/tokenisation_utils.py,sha256=nLeF2cdZSm5PZiAcDTtxY82nUJ-or8VU8YxYLa167EM,21158
 euroeval/types.py,sha256=_iVy-RwiCGu9TNX2sfyJTdCvXy1akNGTCywAo-YpBqU,2815
-euroeval/utils.py,sha256=DRJW6wtmNpRtuHt03diWo3S5m3rdxoPEQpd-KWi7aGY,19255
+euroeval/utils.py,sha256=AyUWGh-G5j14jXZ6ccS1LyTXml2JgbOzOt_e-rr5mag,19451
 euroeval/benchmark_modules/__init__.py,sha256=TNO-sNDwlXE-LMFXfwwqjQqUy55gywSmwRBcoPUFuaU,236
 euroeval/benchmark_modules/base.py,sha256=mHF8XS6GGUXV-sJtxmI5WJBWPLMHuh-4Z4OWjC25x9Y,11566
 euroeval/benchmark_modules/fresh.py,sha256=TveSQiFBi3xXgCEQBdHwkUQ685PDkKW0y3G5Yt5rkeM,10655
 euroeval/benchmark_modules/hf.py,sha256=XmkoDFzaJqnd_5mmUkqCaOgAdRPFs3KZKZZ0cr83TlM,44742
 euroeval/benchmark_modules/litellm.py,sha256=F3udd6NmhQOe3go_7rAcWg7mgZrNQpWWvLe-5U4E2RQ,64771
-euroeval/benchmark_modules/vllm.py,sha256=yLy8TCTnodu4NdTiO7XSdxuHX60AJ1-7p6J3e5h7-iA,43994
+euroeval/benchmark_modules/vllm.py,sha256=bo5XaKlHEKhdEFPNJxsnJFq4RWOC9VoOH4Hqw_6dbMQ,43893
 euroeval/dataset_configs/__init__.py,sha256=uuIZmElpJV8iupo5oDj3TeQhBDRANdWpLKYFASLirHA,2046
 euroeval/dataset_configs/danish.py,sha256=QABfgI7m-0-5AimDXegp5ssDSLcM2VrAI_RWsinSZP4,5631
 euroeval/dataset_configs/dutch.py,sha256=63Ro2yFym5MuIDXf5953vUYenw9B0kZSCmZbXjdy4Rs,5517
@@ -63,8 +63,8 @@ euroeval/task_group_utils/question_answering.py,sha256=eUczZntrC9lhCUQlwNQB49i-5
 euroeval/task_group_utils/sequence_classification.py,sha256=TAqZCoMQ9I-HFhMH35_J1mY2SQg95HUbXcgrBIyhgk0,16082
 euroeval/task_group_utils/text_to_text.py,sha256=7f4hGAs5WNJ9PmW1mLhjDMrPxrYAvw5axXsneiJop1w,4993
 euroeval/task_group_utils/token_classification.py,sha256=Yjai937ia1nZBMOWySqCXr_dA6WiVLGvmb4Hm_TU0Bg,17118
-euroeval-16.2.1.dist-info/METADATA,sha256=brIXZ3x3MUf-ggNpKKC_4Lvrqem0MfKPrJ8DZJ5T3Iw,14590
-euroeval-16.2.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-euroeval-16.2.1.dist-info/entry_points.txt,sha256=-mtBu-10bFWeZ2bS32gVK6-s-LNCQLxvnNUPBLd5ud4,87
-euroeval-16.2.1.dist-info/licenses/LICENSE,sha256=guvz_zBHgkQSY_QiUU0Bkc1k-L_PFZuLjIPfuKne2OY,1080
-euroeval-16.2.1.dist-info/RECORD,,
+euroeval-16.2.2.dist-info/METADATA,sha256=jGGv76AqT4vGKREN8jD3bBHi19vVyIKUNvlk6FNhRN8,14590
+euroeval-16.2.2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+euroeval-16.2.2.dist-info/entry_points.txt,sha256=-mtBu-10bFWeZ2bS32gVK6-s-LNCQLxvnNUPBLd5ud4,87
+euroeval-16.2.2.dist-info/licenses/LICENSE,sha256=guvz_zBHgkQSY_QiUU0Bkc1k-L_PFZuLjIPfuKne2OY,1080
+euroeval-16.2.2.dist-info/RECORD,,