EuroEval 16.2.1-py3-none-any.whl → 16.3.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of EuroEval might be problematic.

Files changed (39)
  1. euroeval/__init__.py +4 -2
  2. euroeval/benchmark_modules/fresh.py +3 -1
  3. euroeval/benchmark_modules/hf.py +8 -4
  4. euroeval/benchmark_modules/litellm.py +5 -17
  5. euroeval/benchmark_modules/vllm.py +98 -30
  6. euroeval/benchmarker.py +291 -405
  7. euroeval/cli.py +1 -1
  8. euroeval/constants.py +3 -0
  9. euroeval/data_models.py +35 -35
  10. euroeval/dataset_configs/__init__.py +1 -0
  11. euroeval/dataset_configs/danish.py +0 -2
  12. euroeval/dataset_configs/dutch.py +0 -2
  13. euroeval/dataset_configs/english.py +0 -2
  14. euroeval/dataset_configs/finnish.py +0 -2
  15. euroeval/dataset_configs/french.py +0 -2
  16. euroeval/dataset_configs/german.py +0 -2
  17. euroeval/dataset_configs/italian.py +0 -2
  18. euroeval/dataset_configs/latvian.py +2 -3
  19. euroeval/dataset_configs/lithuanian.py +62 -0
  20. euroeval/dataset_configs/norwegian.py +0 -2
  21. euroeval/dataset_configs/polish.py +0 -2
  22. euroeval/dataset_configs/portuguese.py +0 -2
  23. euroeval/dataset_configs/spanish.py +0 -2
  24. euroeval/dataset_configs/swedish.py +0 -3
  25. euroeval/metrics/huggingface.py +1 -1
  26. euroeval/metrics/pipeline.py +5 -0
  27. euroeval/prompt_templates/linguistic_acceptability.py +9 -0
  28. euroeval/prompt_templates/multiple_choice.py +9 -0
  29. euroeval/prompt_templates/named_entity_recognition.py +20 -0
  30. euroeval/prompt_templates/reading_comprehension.py +10 -0
  31. euroeval/prompt_templates/sentiment_classification.py +11 -0
  32. euroeval/tokenisation_utils.py +8 -8
  33. euroeval/utils.py +10 -5
  34. {euroeval-16.2.1.dist-info → euroeval-16.3.0.dist-info}/METADATA +181 -60
  35. euroeval-16.3.0.dist-info/RECORD +71 -0
  36. euroeval-16.2.1.dist-info/RECORD +0 -70
  37. {euroeval-16.2.1.dist-info → euroeval-16.3.0.dist-info}/WHEEL +0 -0
  38. {euroeval-16.2.1.dist-info → euroeval-16.3.0.dist-info}/entry_points.txt +0 -0
  39. {euroeval-16.2.1.dist-info → euroeval-16.3.0.dist-info}/licenses/LICENSE +0 -0
euroeval/benchmarker.py CHANGED
@@ -6,13 +6,13 @@ import logging
 import re
 import sys
 import typing as t
-from copy import deepcopy
 from pathlib import Path
 from shutil import rmtree
 from time import sleep

 from huggingface_hub.constants import HF_HUB_ENABLE_HF_TRANSFER
 from torch.distributed import destroy_process_group
+from tqdm.auto import tqdm

 from .benchmark_config_factory import build_benchmark_config
 from .constants import GENERATIVE_PIPELINE_TAGS
@@ -33,6 +33,7 @@ from .utils import (
     get_package_version,
     internet_connection_available,
     log_once,
+    split_model_id,
 )

 if t.TYPE_CHECKING:
@@ -83,7 +84,7 @@ class Benchmarker:
         num_iterations: int = 10,
         api_base: str | None = None,
         api_version: str | None = None,
-        gpu_memory_utilization: float = 0.9,
+        gpu_memory_utilization: float = 0.8,
         generative_type: GenerativeType | None = None,
         debug: bool = False,
         run_with_cli: bool = False,
@@ -200,10 +201,10 @@ class Benchmarker:
         )

         self.benchmark_config_default_params = BenchmarkConfigParams(
-            progress_bar=progress_bar,
-            save_results=save_results,
             task=task,
             dataset=dataset,
+            progress_bar=progress_bar,
+            save_results=save_results,
             language=language,
             model_language=model_language,
             dataset_language=dataset_language,
@@ -212,21 +213,21 @@ class Benchmarker:
             raise_errors=raise_errors,
             cache_dir=cache_dir,
             api_key=api_key,
-            force=force,
-            verbose=verbose,
+            api_base=api_base,
+            api_version=api_version,
             trust_remote_code=trust_remote_code,
             clear_model_cache=clear_model_cache,
             evaluate_test_split=evaluate_test_split,
             few_shot=few_shot,
             num_iterations=num_iterations,
-            api_base=api_base,
-            api_version=api_version,
+            requires_safetensors=requires_safetensors,
+            download_only=download_only,
             gpu_memory_utilization=gpu_memory_utilization,
             generative_type=generative_type,
-            download_only=download_only,
+            verbose=verbose,
+            force=force,
             debug=debug,
             run_with_cli=run_with_cli,
-            requires_safetensors=requires_safetensors,
         )

         self.benchmark_config = build_benchmark_config(
@@ -332,8 +333,8 @@ class Benchmarker:
         raise_errors: bool | None = None,
         cache_dir: str | None = None,
         api_key: str | None = None,
-        force: bool | None = None,
-        verbose: bool | None = None,
+        api_base: str | None = None,
+        api_version: str | None = None,
         trust_remote_code: bool | None = None,
         clear_model_cache: bool | None = None,
         evaluate_test_split: bool | None = None,
@@ -341,6 +342,11 @@
         num_iterations: int | None = None,
         requires_safetensors: bool | None = None,
         download_only: bool | None = None,
+        gpu_memory_utilization: float | None = None,
+        generative_type: GenerativeType | None = None,
+        force: bool | None = None,
+        verbose: bool | None = None,
+        debug: bool | None = None,
     ) -> list[BenchmarkResult]:
         """Benchmarks models on datasets.

@@ -393,13 +399,13 @@
             api_key:
                 The API key to use for a given inference server. Defaults to the value
                 specified when initialising the benchmarker.
-            force:
-                Whether to force evaluations of models, even if they have been
-                benchmarked already. Defaults to the value specified when initialising
-                the benchmarker.
-            verbose:
-                Whether to output additional output. Defaults to the value specified
-                when initialising the benchmarker.
+            api_base:
+                The base URL for a given inference API. Only relevant if `model` refers
+                to a model on an inference API. Defaults to the value specified when
+                initialising the benchmarker.
+            api_version:
+                The version of the API to use. Defaults to the value specified when
+                initialising the benchmarker.
             trust_remote_code:
                 Whether to trust remote code when loading models. Defaults to the value
                 specified when initialising the benchmarker.
@@ -424,6 +430,27 @@
             download_only:
                 Whether to only download the models without evaluating them. Defaults
                 to the value specified when initialising the benchmarker.
+            gpu_memory_utilization:
+                The GPU memory utilization to use for vLLM. Only relevant if the model
+                is generative. A larger value will result in faster evaluation, but at
+                the risk of running out of GPU memory. Only reduce this if you are
+                running out of GPU memory. Defaults to the value specified when
+                initialising the benchmarker.
+            generative_type:
+                The type of generative model to benchmark. Only relevant if the model is
+                generative. If not specified, then the type will be inferred based on
+                the tags of the model. Defaults to the value specified when initialising
+                the benchmarker.
+            force:
+                Whether to force evaluations of models, even if they have been
+                benchmarked already. Defaults to the value specified when initialising
+                the benchmarker.
+            verbose:
+                Whether to output additional output. Defaults to the value specified
+                when initialising the benchmarker.
+            debug:
+                Whether to output debug information. Defaults to the value specified
+                when initialising the benchmarker.

         Returns:
             A list of benchmark results.
@@ -435,28 +462,141 @@
         if task is not None and dataset is not None:
             raise ValueError("Only one of `task` and `dataset` can be specified.")

-        benchmark_config = self._get_updated_benchmark_config(
-            task=task,
-            dataset=dataset,
-            progress_bar=progress_bar,
-            save_results=save_results,
-            language=language,
-            model_language=model_language,
-            dataset_language=dataset_language,
-            device=device,
-            batch_size=batch_size,
-            raise_errors=raise_errors,
-            cache_dir=cache_dir,
-            api_key=api_key,
-            force=force,
-            verbose=verbose,
-            trust_remote_code=trust_remote_code,
-            clear_model_cache=clear_model_cache,
-            evaluate_test_split=evaluate_test_split,
-            few_shot=few_shot,
-            num_iterations=num_iterations,
-            requires_safetensors=requires_safetensors,
-            download_only=download_only,
+        # Get a new updated benchmark configuration, based on any changes to the
+        # parameters
+        benchmark_config_params = BenchmarkConfigParams(
+            task=(
+                task if task is not None else self.benchmark_config_default_params.task
+            ),
+            dataset=(
+                dataset
+                if dataset is not None
+                else self.benchmark_config_default_params.dataset
+            ),
+            progress_bar=(
+                progress_bar
+                if progress_bar is not None
+                else self.benchmark_config_default_params.progress_bar
+            ),
+            save_results=(
+                save_results
+                if save_results is not None
+                else self.benchmark_config_default_params.save_results
+            ),
+            language=(
+                language
+                if language is not None
+                else self.benchmark_config_default_params.language
+            ),
+            model_language=(
+                model_language
+                if model_language is not None
+                else self.benchmark_config_default_params.model_language
+            ),
+            dataset_language=(
+                dataset_language
+                if dataset_language is not None
+                else self.benchmark_config_default_params.dataset_language
+            ),
+            device=(
+                device
+                if device is not None
+                else self.benchmark_config_default_params.device
+            ),
+            batch_size=(
+                batch_size
+                if batch_size is not None
+                else self.benchmark_config_default_params.batch_size
+            ),
+            raise_errors=(
+                raise_errors
+                if raise_errors is not None
+                else self.benchmark_config_default_params.raise_errors
+            ),
+            cache_dir=(
+                cache_dir
+                if cache_dir is not None
+                else self.benchmark_config_default_params.cache_dir
+            ),
+            api_key=(
+                api_key
+                if api_key is not None
+                else self.benchmark_config_default_params.api_key
+            ),
+            api_base=(
+                api_base
+                if api_base is not None
+                else self.benchmark_config_default_params.api_base
+            ),
+            api_version=(
+                api_version
+                if api_version is not None
+                else self.benchmark_config_default_params.api_version
+            ),
+            trust_remote_code=(
+                trust_remote_code
+                if trust_remote_code is not None
+                else self.benchmark_config_default_params.trust_remote_code
+            ),
+            clear_model_cache=(
+                clear_model_cache
+                if clear_model_cache is not None
+                else self.benchmark_config_default_params.clear_model_cache
+            ),
+            evaluate_test_split=(
+                evaluate_test_split
+                if evaluate_test_split is not None
+                else self.benchmark_config_default_params.evaluate_test_split
+            ),
+            few_shot=(
+                few_shot
+                if few_shot is not None
+                else self.benchmark_config_default_params.few_shot
+            ),
+            num_iterations=(
+                num_iterations
+                if num_iterations is not None
+                else self.benchmark_config_default_params.num_iterations
+            ),
+            requires_safetensors=(
+                requires_safetensors
+                if requires_safetensors is not None
+                else self.benchmark_config_default_params.requires_safetensors
+            ),
+            download_only=(
+                download_only
+                if download_only is not None
+                else self.benchmark_config_default_params.download_only
+            ),
+            gpu_memory_utilization=(
+                gpu_memory_utilization
+                if gpu_memory_utilization is not None
+                else self.benchmark_config_default_params.gpu_memory_utilization
+            ),
+            generative_type=(
+                generative_type
+                if generative_type is not None
+                else self.benchmark_config_default_params.generative_type
+            ),
+            force=(
+                force
+                if force is not None
+                else self.benchmark_config_default_params.force
+            ),
+            verbose=(
+                verbose
+                if verbose is not None
+                else self.benchmark_config_default_params.verbose
+            ),
+            debug=(
+                debug
+                if debug is not None
+                else self.benchmark_config_default_params.debug
+            ),
+            run_with_cli=self.benchmark_config_default_params.run_with_cli,
+        )
+        benchmark_config = build_benchmark_config(
+            benchmark_config_params=benchmark_config_params
         )

         adjust_logging_level(verbose=benchmark_config.verbose)
@@ -469,46 +609,90 @@
             dataset_names=benchmark_config.datasets
         )

-        total_benchmarks = len(model_ids) * len(dataset_configs)
-        num_finished_benchmarks = 0
-
-        current_benchmark_results: list[BenchmarkResult] = list()
-        for model_id in model_ids:
-            # Load the model configuration, or skip the model if it is invalid
+        # Get all the model configs
+        model_configs: list[ModelConfig] = list()
+        for model_id in tqdm(
+            iterable=model_ids,
+            desc="Fetching model configurations",
+            disable=not benchmark_config.verbose or not benchmark_config.progress_bar,
+        ):
             try:
                 model_config = get_model_config(
                     model_id=model_id, benchmark_config=benchmark_config
                 )
+                model_configs.append(model_config)
             except InvalidModel as e:
                 logger.info(e.message)
-                num_finished_benchmarks += len(dataset_configs)
+
+        # Create a dictionary that takes each model config to the dataset configs that
+        # we need to benchmark the model on. Here we remove the datasets that the model
+        # has already been benchmarked on, or datasets that the model cannot be
+        # benchmarked on.
+        model_config_to_dataset_configs: dict[ModelConfig, list[DatasetConfig]] = {
+            model_config: [
+                dataset_config
+                for dataset_config in dataset_configs
+                if (
+                    benchmark_config.force
+                    or not model_has_been_benchmarked(
+                        model_config=model_config,
+                        dataset_config=dataset_config,
+                        benchmark_config=benchmark_config,
+                        benchmark_results=self.benchmark_results,
+                    )
+                )
+                and model_config.model_type in dataset_config.allowed_model_types
+            ]
+            for model_config in model_configs
+        }
+
+        total_benchmarks = sum(
+            len(dataset_configs)
+            for dataset_configs in model_config_to_dataset_configs.values()
+        )
+        if total_benchmarks == 0:
+            logger.info(
+                "No benchmarks to run, as all the selected models have already been "
+                "benchmarked on all the selected datasets."
+            )
+            return list()
+
+        logger.info(f"Initiated evaluation of {total_benchmarks:,} benchmarks.")
+
+        num_finished_benchmarks = 0
+        current_benchmark_results: list[BenchmarkResult] = list()
+        for model_config in model_configs:
+            if not model_config_to_dataset_configs[model_config]:
+                logger.debug(
+                    f"Skipping model {model_config.model_id!r} because it has "
+                    "already been benchmarked on all valid datasets."
+                )
                 continue

             if model_config.adapter_base_model_id:
                 open_issue_msg = (
-                    "If offline support is important to you, please "
-                    "consider opening an issue at https://github.com/EuroEval/EuroEval/issues."
+                    "If offline support is important to you, please consider opening "
+                    "an issue at https://github.com/EuroEval/EuroEval/issues."
                 )
                 if not internet_connection_available():
                     raise InvalidModel(
                         "Offline benchmarking of models with adapters is not currently "
-                        "supported. "
-                        f"An active internet connection is required. {open_issue_msg}"
+                        "supported. An active internet connection is required. "
+                        "{open_issue_msg}"
                     )
                 elif benchmark_config.download_only:
                     log_once(
                         "You are using download only mode with a model that includes "
-                        "an adapter. "
-                        "Please note: Offline benchmarking of adapter models is not "
-                        "currently supported. "
-                        "An internet connection will be required during evaluation. "
+                        "an adapter. Please note that offline benchmarking of "
+                        "adapter models is not currently supported - an internet "
+                        "connection will be required during evaluation in this case. "
                         f"{open_issue_msg}",
                         level=logging.WARNING,
                     )

             loaded_model: BenchmarkModule | None = None
             benchmark_params_to_revert: dict[str, t.Any] = dict()
-            for dataset_config in dataset_configs:
+            for dataset_config in model_config_to_dataset_configs[model_config]:
                 # Revert any changes to the benchmark configuration made for the
                 # previous dataset
                 for param, value in benchmark_params_to_revert.items():
@@ -536,34 +720,6 @@
                     benchmark_params_to_revert["few_shot"] = True
                     benchmark_config.few_shot = False

-                # Skip if we have already benchmarked this model on this dataset and
-                # we are not forcing the benchmark
-                if not benchmark_config.force and model_has_been_benchmarked(
-                    model_id=model_id,
-                    dataset=dataset_config.name,
-                    few_shot=benchmark_config.few_shot,
-                    validation_split=not benchmark_config.evaluate_test_split,
-                    benchmark_results=self.benchmark_results,
-                ):
-                    logger.debug(
-                        f"Skipping benchmarking {model_id} on "
-                        f"{dataset_config.pretty_name}, as it has already been "
-                        "benchmarked."
-                    )
-                    num_finished_benchmarks += 1
-                    continue
-
-                # Skip if the model type should not be benchmarked on this dataset
-                model_type = model_config.model_type
-                allowed_model_types = dataset_config.allowed_model_types
-                if model_type not in allowed_model_types:
-                    logger.debug(
-                        f"Skipping benchmarking {model_id} on "
-                        f"{dataset_config.pretty_name}, as it is of type {model_type}, "
-                        f"and the only allowed model types are {allowed_model_types}."
-                    )
-                    continue
-
                 # We do not re-initialise generative models as their architecture is not
                 # customised to specific datasets
                 if model_config.model_type == ModelType.GENERATIVE:
@@ -597,6 +753,22 @@
                 else:
                     loaded_model.dataset_config = dataset_config

+                # Skip the benchmark if the model is not of the correct
+                # generative type
+                if (
+                    loaded_model.generative_type
+                    not in dataset_config.allowed_generative_types
+                ):
+                    logger.debug(
+                        f"Skipping the benchmark of model "
+                        f"{model_config.model_id!r}on dataset "
+                        f"{dataset_config.name!r} because the model has generative "
+                        f"type {loaded_model.generative_type} and the dataset "
+                        f"only allows {dataset_config.allowed_generative_types}."
+                    )
+                    num_finished_benchmarks += 1
+                    continue
+
                 # Benchmark a single model on a single dataset
                 benchmark_output_or_err = self._benchmark_single(
                     model=loaded_model,
@@ -654,176 +826,6 @@
             destroy_process_group()
         return current_benchmark_results

-    def _get_updated_benchmark_config(
-        self,
-        progress_bar: bool | None = None,
-        save_results: bool | None = None,
-        task: str | list[str] | None | None = None,
-        dataset: str | list[str] | None | None = None,
-        language: str | list[str] | None = None,
-        model_language: str | list[str] | None | None = None,
-        dataset_language: str | list[str] | None | None = None,
-        device: Device | None | None = None,
-        batch_size: int | None = None,
-        raise_errors: bool | None = None,
-        cache_dir: str | None = None,
-        api_key: str | None | None = None,
-        force: bool | None = None,
-        verbose: bool | None = None,
-        trust_remote_code: bool | None = None,
-        clear_model_cache: bool | None = None,
-        evaluate_test_split: bool | None = None,
-        few_shot: bool | None = None,
-        num_iterations: int | None = None,
-        api_base: str | None | None = None,
-        api_version: str | None | None = None,
-        debug: bool | None = None,
-        run_with_cli: bool | None = None,
-        requires_safetensors: bool | None = None,
-        download_only: bool | None = None,
-    ) -> "BenchmarkConfig":
-        """Get an updated benchmark configuration.
-
-        Args:
-            progress_bar:
-                Whether progress bars should be shown. If None, then this value will not
-                be updated.
-            save_results:
-                Whether to save the benchmark results to
-                'euroeval_benchmark_results.jsonl'. If None, then this value will not
-                be updated.
-            task:
-                The tasks benchmark the model(s) on. If None, then this value will not
-                be updated.
-            dataset:
-                The datasets to benchmark on. If None, then this value will not be
-                updated.
-            language:
-                The language codes of the languages to include, both for models and
-                datasets. If None, then this value will not be updated.
-            model_language:
-                The language codes of the languages to include for models. If None, then
-                this value will not be updated.
-            dataset_language:
-                The language codes of the languages to include for datasets. If None,
-                then this value will not be updated.
-            device:
-                The device to use for benchmarking. If None, then this value will not be
-                updated.
-            batch_size:
-                The batch size to use. If None, then this value will not be updated.
-            raise_errors:
-                Whether to raise errors instead of skipping the model evaluation. If
-                None, then this value will not be updated.
-            cache_dir:
-                Directory to store cached models. If None, then this value will not be
-                updated.
-            api_key:
-                The API key to use for a given inference server. If None, then this
-                value will not be updated.
-            force:
-                Whether to force evaluations of models, even if they have been
-                benchmarked already. If None, then this value will not be updated.
-            verbose:
-                Whether to output additional output. If None, then this value will not
-                be updated.
-            trust_remote_code:
-                Whether to trust remote code when loading models. If None, then this
-                value will not be updated.
-            clear_model_cache:
-                Whether to clear the model cache after benchmarking each model. If None,
-                then this value will not be updated.
-            evaluate_test_split:
-                Whether to evaluate the test split of the datasets. If None, then this
-                value will not be updated.
-            few_shot:
-                Whether to only evaluate the model using few-shot evaluation. If None,
-                then this value will not be updated.
-            num_iterations:
-                The number of times each model should be evaluated. If None, then this
-                value will not be updated.
-            api_base:
-                The base URL for a given inference API. If None, then this value will
-                not be updated.
-            api_version:
-                The version of the API to use. If None, then this value will not be
-                updated.
-            debug:
-                Whether to output debug information. If None, then this value will not
-                be updated.
-            run_with_cli:
-                Whether the benchmarker is being run from the command-line interface.
-                If None, then this value will not be updated.
-            requires_safetensors:
-                Whether to only allow models that use the safetensors format. If None,
-                then this value will not be updated.
-            download_only:
-                Whether to only download the models without evaluating them. If None,
-                then this value will not be updated.
-            download_only:
-                Whether to only download models and datasets without performing any
-                benchmarking. If None, then this value will not be updated.
-
-        Returns:
-            The updated benchmark configuration.
-        """
-        benchmark_config_params = deepcopy(self.benchmark_config_default_params)
-
-        if progress_bar is not None:
-            benchmark_config_params.progress_bar = progress_bar
-        if save_results is not None:
-            benchmark_config_params.save_results = save_results
-        if task is not None:
-            benchmark_config_params.task = task
-            benchmark_config_params.dataset = None
-        if dataset is not None:
-            benchmark_config_params.dataset = dataset
-            benchmark_config_params.task = None
-        if language is not None:
-            benchmark_config_params.language = language
-        if model_language is not None:
-            benchmark_config_params.model_language = model_language
-        if dataset_language is not None:
-            benchmark_config_params.dataset_language = dataset_language
-        if device is not None:
-            benchmark_config_params.device = device
-        if batch_size is not None:
-            benchmark_config_params.batch_size = batch_size
-        if raise_errors is not None:
-            benchmark_config_params.raise_errors = raise_errors
-        if cache_dir is not None:
-            benchmark_config_params.cache_dir = cache_dir
-        if api_key is not None:
-            benchmark_config_params.api_key = api_key
-        if force is not None:
-            benchmark_config_params.force = force
-        if verbose is not None:
-            benchmark_config_params.verbose = verbose
-        if trust_remote_code is not None:
-            benchmark_config_params.trust_remote_code = trust_remote_code
-        if clear_model_cache is not None:
-            benchmark_config_params.clear_model_cache = clear_model_cache
-        if evaluate_test_split is not None:
-            benchmark_config_params.evaluate_test_split = evaluate_test_split
-        if few_shot is not None:
-            benchmark_config_params.few_shot = few_shot
-        if num_iterations is not None:
-            benchmark_config_params.num_iterations = num_iterations
-        if api_base is not None:
-            benchmark_config_params.api_base = api_base
-        if api_version is not None:
-            benchmark_config_params.api_version = api_version
-        if debug is not None:
-            benchmark_config_params.debug = debug
-        if run_with_cli is not None:
-            benchmark_config_params.run_with_cli = run_with_cli
-        if requires_safetensors is not None:
-            benchmark_config_params.requires_safetensors = requires_safetensors
-        if download_only is not None:
-            benchmark_config_params.download_only = download_only
-
-        return build_benchmark_config(benchmark_config_params=benchmark_config_params)
-
     def _prepare_model_ids(self, model_id: list[str] | str) -> list[str]:
         """Prepare the model ID(s) to be benchmarked.

@@ -991,164 +993,30 @@
                 raise e
             return e

-    def __call__(
-        self,
-        model: list[str] | str,
-        task: str | list[str] | None = None,
-        dataset: list[str] | str | None = None,
-        progress_bar: bool | None = None,
-        save_results: bool | None = None,
-        language: str | list[str] | None = None,
-        model_language: str | list[str] | None = None,
-        dataset_language: str | list[str] | None = None,
-        device: Device | None = None,
-        batch_size: int | None = None,
-        raise_errors: bool | None = None,
-        cache_dir: str | None = None,
-        api_key: str | None = None,
-        force: bool | None = None,
-        verbose: bool | None = None,
-        trust_remote_code: bool | None = None,
-        clear_model_cache: bool | None = None,
-        evaluate_test_split: bool | None = None,
-        few_shot: bool | None = None,
-        num_iterations: int | None = None,
-        requires_safetensors: bool | None = None,
-    ) -> list[BenchmarkResult]:
-        """Benchmarks models on datasets.
-
-        Args:
-            model:
-                The full Hugging Face Hub path(s) to the pretrained transformer model.
-                The specific model version to use can be added after the suffix '@':
-                "model@v1.0.0". It can be a branch name, a tag name, or a commit id,
-                and defaults to the latest version if not specified.
-            task:
-                The tasks benchmark the model(s) on. Mutually exclusive with `dataset`.
-                If both `task` and `dataset` are None then all datasets will be
-                benchmarked. Defaults to None.
-            dataset:
-                The datasets to benchmark on. Mutually exclusive with `task`. If both
-                `task` and `dataset` are None then all datasets will be benchmarked.
-                Defaults to None.
-            progress_bar:
-                Whether progress bars should be shown. Defaults to the value specified
-                when initialising the benchmarker.
-            save_results:
-                Whether to save the benchmark results to
-                'euroeval_benchmark_results.jsonl'. Defaults to the value specified
-                when initialising the benchmarker.
-            language:
-                The language codes of the languages to include, both for models and
-                datasets. Here 'no' means both Bokmål (nb) and Nynorsk (nn). Set this to
-                'all' if all languages should be considered. Defaults to the value
-                specified when initialising the benchmarker.
-            model_language:
-                The language codes of the languages to include for models. If specified
-                then this overrides the `language` parameter for model languages.
-                Defaults to the value specified when initialising the benchmarker.
-            dataset_language:
-                The language codes of the languages to include for datasets. If
-                specified then this overrides the `language` parameter for dataset
-                languages. Defaults to the value specified when initialising the
-                benchmarker.
-            device:
-                The device to use for benchmarking. Defaults to the value specified when
-                initialising the benchmarker.
-            batch_size:
-                The batch size to use. Defaults to the value specified when initialising
-                the benchmarker.
-            raise_errors:
-                Whether to raise errors instead of skipping the model evaluation.
-            cache_dir:
-                Directory to store cached models. Defaults to the value specified when
-                initialising the benchmarker.
-            api_key:
-                The API key to use for a given inference server. Defaults to the value
-                specified when initialising the benchmarker.
-            force:
-                Whether to force evaluations of models, even if they have been
-                benchmarked already. Defaults to the value specified when initialising
-                the benchmarker.
-            verbose:
-                Whether to output additional output. Defaults to the value specified
-                when initialising the benchmarker.
-            trust_remote_code:
-                Whether to trust remote code when loading models. Defaults to the value
-                specified when initialising the benchmarker.
-            clear_model_cache:
-                Whether to clear the model cache after benchmarking each model. Defaults
-                to the value specified when initialising the benchmarker.
-            evaluate_test_split:
-                Whether to evaluate the test split of the datasets. Defaults to the
-                value specified when initialising the benchmarker.
-            few_shot:
-                Whether to only evaluate the model using few-shot evaluation. Only
-                relevant if the model is generative. Defaults to the value specified
-                when initialising the benchmarker.
-            num_iterations:
-                The number of times each model should be evaluated. This is only meant
-                to be used for power users, and scores will not be allowed on the
-                leaderboards if this is changed. Defaults to the value specified when
-                initialising the benchmarker.
-            requires_safetensors:
-                Whether to only allow models that use the safetensors format. Defaults
-                to the value specified when initialising the benchmarker.
-
-        Returns:
-            A list of benchmark results.
-
-        Raises:
-            ValueError:
-                If both `task` and `dataset` are specified.
-        """
+    def __call__(self, *args: t.Any, **kwds: t.Any) -> t.Any:  # noqa: ANN401
+        """Alias for `self.benchmark()`."""
         logger.warning(
             "Calling the `Benchmarker` class directly is deprecated. Please use the "
             "`benchmark` function instead. This will be removed in a future version."
         )
-        return self.benchmark(
-            model=model,
-            task=task,
-            dataset=dataset,
-            progress_bar=progress_bar,
-            save_results=save_results,
-            language=language,
-            model_language=model_language,
-            dataset_language=dataset_language,
-            device=device,
-            batch_size=batch_size,
-            raise_errors=raise_errors,
-            cache_dir=cache_dir,
-            api_key=api_key,
-            force=force,
-            verbose=verbose,
-            trust_remote_code=trust_remote_code,
-            clear_model_cache=clear_model_cache,
-            evaluate_test_split=evaluate_test_split,
-            few_shot=few_shot,
-            num_iterations=num_iterations,
-            requires_safetensors=requires_safetensors,
-        )
+        return self.benchmark(*args, **kwds)


 def model_has_been_benchmarked(
-    model_id: str,
-    dataset: str,
-    few_shot: bool,
-    validation_split: bool,
+    model_config: "ModelConfig",
+    dataset_config: "DatasetConfig",
+    benchmark_config: "BenchmarkConfig",
     benchmark_results: list[BenchmarkResult],
 ) -> bool:
     """Checks whether a model has already been benchmarked on a dataset.

     Args:
-        model_id:
-            The model ID.
-        dataset:
-            The dataset.
-        few_shot:
-            Whether the model was evaluated using few-shot evaluation.
-        validation_split:
-            Whether the model was evaluated on the validation split.
+        model_config:
+            The configuration of the model we are evaluating.
+        dataset_config:
+            The configuration of the dataset we are evaluating on.
+        benchmark_config:
+            The general benchmark configuration.
         benchmark_results:
             The benchmark results.

@@ -1156,10 +1024,28 @@ def model_has_been_benchmarked(
         Whether the model has already been evaluated on the dataset.
     """
     for record in benchmark_results:
-        same_evaluation = record.model == model_id and record.dataset == dataset
-        same_validation_split_setting = record.validation_split == validation_split
-        same_few_shot_setting = record.few_shot == few_shot or not record.generative
-        if same_evaluation and same_validation_split_setting and same_few_shot_setting:
+        model_id_components = split_model_id(model_id=record.model)
+        same_model_id = model_id_components.model_id == model_config.model_id
+        same_revision = model_id_components.revision == model_config.revision
+        same_param = model_id_components.param == model_config.param
+        same_dataset = record.dataset == dataset_config.name
+        same_split = (
+            record.validation_split != benchmark_config.evaluate_test_split
+            or "val" not in dataset_config.splits
+        )
+        same_num_shots = (
+            record.few_shot == benchmark_config.few_shot
+            or not record.generative
+            or dataset_config.task.requires_zero_shot
+        )
+        if (
+            same_model_id
+            and same_revision
+            and same_param
+            and same_dataset
+            and same_split
+            and same_num_shots
+        ):
            return True
    return False
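Note on the API change shown above: in 16.3.0 the `Benchmarker.benchmark()` method accepts per-call overrides for `api_base`, `api_version`, `gpu_memory_utilization`, `generative_type`, `force`, `verbose` and `debug`, the default vLLM `gpu_memory_utilization` drops from 0.9 to 0.8, deduplication of already-benchmarked runs compares full model configurations via `split_model_id`, and `__call__` becomes a thin deprecated alias that forwards to `benchmark()`. The sketch below illustrates the updated call signature; the model and dataset identifiers are placeholders chosen for illustration, so treat this as a usage sketch inferred from the diff rather than official documentation.

from euroeval import Benchmarker

# Defaults are fixed at construction time; gpu_memory_utilization now defaults to 0.8.
benchmarker = Benchmarker(progress_bar=True, save_results=True)

# As of 16.3.0 these settings can also be overridden per call. The model and
# dataset names below are placeholders, not recommendations.
results = benchmarker.benchmark(
    model="example-org/example-model",
    dataset="angry-tweets",
    gpu_memory_utilization=0.7,  # per-call override of the vLLM memory fraction
    force=True,  # re-run even if a matching result is already recorded
    verbose=True,
)

# Calling the instance directly still works, but it now only logs a deprecation
# warning and forwards *args/**kwargs to benchmark().
results = benchmarker(model="example-org/example-model", dataset="angry-tweets")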