EuroEval 16.3.0-py3-none-any.whl → 16.5.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of EuroEval has been flagged as a potentially problematic release.

Files changed (78)
  1. euroeval/__init__.py +9 -2
  2. euroeval/benchmark_config_factory.py +51 -50
  3. euroeval/benchmark_modules/base.py +9 -21
  4. euroeval/benchmark_modules/fresh.py +2 -1
  5. euroeval/benchmark_modules/hf.py +101 -71
  6. euroeval/benchmark_modules/litellm.py +115 -53
  7. euroeval/benchmark_modules/vllm.py +107 -92
  8. euroeval/benchmarker.py +144 -121
  9. euroeval/caching_utils.py +79 -0
  10. euroeval/callbacks.py +5 -7
  11. euroeval/cli.py +86 -8
  12. euroeval/constants.py +9 -0
  13. euroeval/data_loading.py +80 -29
  14. euroeval/data_models.py +338 -330
  15. euroeval/dataset_configs/__init__.py +12 -3
  16. euroeval/dataset_configs/bulgarian.py +56 -0
  17. euroeval/dataset_configs/czech.py +75 -0
  18. euroeval/dataset_configs/danish.py +55 -93
  19. euroeval/dataset_configs/dutch.py +48 -87
  20. euroeval/dataset_configs/english.py +45 -77
  21. euroeval/dataset_configs/estonian.py +42 -34
  22. euroeval/dataset_configs/faroese.py +19 -60
  23. euroeval/dataset_configs/finnish.py +36 -69
  24. euroeval/dataset_configs/french.py +39 -75
  25. euroeval/dataset_configs/german.py +45 -82
  26. euroeval/dataset_configs/greek.py +64 -0
  27. euroeval/dataset_configs/icelandic.py +54 -91
  28. euroeval/dataset_configs/italian.py +42 -79
  29. euroeval/dataset_configs/latvian.py +28 -35
  30. euroeval/dataset_configs/lithuanian.py +28 -26
  31. euroeval/dataset_configs/norwegian.py +72 -115
  32. euroeval/dataset_configs/polish.py +33 -61
  33. euroeval/dataset_configs/portuguese.py +33 -66
  34. euroeval/dataset_configs/serbian.py +64 -0
  35. euroeval/dataset_configs/slovak.py +55 -0
  36. euroeval/dataset_configs/spanish.py +42 -77
  37. euroeval/dataset_configs/swedish.py +52 -90
  38. euroeval/dataset_configs/ukrainian.py +64 -0
  39. euroeval/exceptions.py +1 -1
  40. euroeval/finetuning.py +24 -17
  41. euroeval/generation.py +15 -14
  42. euroeval/generation_utils.py +8 -8
  43. euroeval/languages.py +395 -323
  44. euroeval/logging_utils.py +250 -0
  45. euroeval/metrics/base.py +0 -3
  46. euroeval/metrics/huggingface.py +21 -6
  47. euroeval/metrics/llm_as_a_judge.py +6 -4
  48. euroeval/metrics/pipeline.py +17 -9
  49. euroeval/metrics/speed.py +0 -3
  50. euroeval/model_cache.py +17 -19
  51. euroeval/model_config.py +4 -5
  52. euroeval/model_loading.py +3 -0
  53. euroeval/prompt_templates/__init__.py +2 -0
  54. euroeval/prompt_templates/classification.py +206 -0
  55. euroeval/prompt_templates/linguistic_acceptability.py +99 -42
  56. euroeval/prompt_templates/multiple_choice.py +102 -38
  57. euroeval/prompt_templates/named_entity_recognition.py +172 -51
  58. euroeval/prompt_templates/reading_comprehension.py +119 -42
  59. euroeval/prompt_templates/sentiment_classification.py +110 -40
  60. euroeval/prompt_templates/summarization.py +85 -40
  61. euroeval/prompt_templates/token_classification.py +279 -0
  62. euroeval/scores.py +11 -10
  63. euroeval/speed_benchmark.py +5 -6
  64. euroeval/task_group_utils/multiple_choice_classification.py +2 -4
  65. euroeval/task_group_utils/question_answering.py +24 -16
  66. euroeval/task_group_utils/sequence_classification.py +48 -35
  67. euroeval/task_group_utils/text_to_text.py +19 -9
  68. euroeval/task_group_utils/token_classification.py +21 -17
  69. euroeval/tasks.py +44 -1
  70. euroeval/tokenisation_utils.py +33 -22
  71. euroeval/types.py +10 -9
  72. euroeval/utils.py +35 -149
  73. {euroeval-16.3.0.dist-info → euroeval-16.5.0.dist-info}/METADATA +196 -39
  74. euroeval-16.5.0.dist-info/RECORD +81 -0
  75. euroeval-16.3.0.dist-info/RECORD +0 -71
  76. {euroeval-16.3.0.dist-info → euroeval-16.5.0.dist-info}/WHEEL +0 -0
  77. {euroeval-16.3.0.dist-info → euroeval-16.5.0.dist-info}/entry_points.txt +0 -0
  78. {euroeval-16.3.0.dist-info → euroeval-16.5.0.dist-info}/licenses/LICENSE +0 -0
euroeval/benchmarker.py CHANGED
@@ -1,10 +1,12 @@
  """Class that benchmarks language models."""

+ import collections.abc as c
  import contextlib
+ import datetime as dt
  import json
  import logging
+ import os
  import re
- import sys
  import typing as t
  from pathlib import Path
  from shutil import rmtree
@@ -12,7 +14,6 @@ from time import sleep

  from huggingface_hub.constants import HF_HUB_ENABLE_HF_TRANSFER
  from torch.distributed import destroy_process_group
- from tqdm.auto import tqdm

  from .benchmark_config_factory import build_benchmark_config
  from .constants import GENERATIVE_PIPELINE_TAGS
@@ -23,6 +24,7 @@ from .enums import Device, GenerativeType, ModelType
  from .exceptions import HuggingFaceHubDown, InvalidBenchmark, InvalidModel
  from .finetuning import finetune
  from .generation import generate
+ from .logging_utils import adjust_logging_level, get_pbar, log, log_once
  from .model_config import get_model_config
  from .model_loading import load_model
  from .scores import log_scores
@@ -32,16 +34,12 @@ from .utils import (
  enforce_reproducibility,
  get_package_version,
  internet_connection_available,
- log_once,
  split_model_id,
  )

  if t.TYPE_CHECKING:
  from .benchmark_modules import BenchmarkModule
- from .data_models import BenchmarkConfig, DatasetConfig, ModelConfig
-
-
- logger = logging.getLogger("euroeval")
+ from .data_models import BenchmarkConfig, DatasetConfig, ModelConfig, Task


  class Benchmarker:
@@ -65,11 +63,11 @@ class Benchmarker:
  self,
  progress_bar: bool = True,
  save_results: bool = True,
- task: str | list[str] | None = None,
- dataset: list[str] | str | None = None,
- language: str | list[str] = "all",
- model_language: str | list[str] | None = None,
- dataset_language: str | list[str] | None = None,
+ task: "str | Task | c.Sequence[str | Task] | None" = None,
+ dataset: "str | DatasetConfig | c.Sequence[str | DatasetConfig] | None" = None,
+ language: str | c.Sequence[str] = "all",
+ model_language: str | c.Sequence[str] | None = None,
+ dataset_language: str | c.Sequence[str] | None = None,
  device: Device | None = None,
  batch_size: int = 32,
  raise_errors: bool = False,
@@ -179,6 +177,8 @@ class Benchmarker:
  ValueError:
  If both `task` and `dataset` are specified, or if `download_only`
  is True and we have no internet connection.
+ ImportError:
+ If `hf_transfer` is enabled but not installed.
  """
  if task is not None and dataset is not None:
  raise ValueError("Only one of `task` and `dataset` can be specified.")
@@ -200,6 +200,10 @@ class Benchmarker:
  "Try installing it with `pip install hf_transfer`."
  )

+ # If FULL_LOG has been set, then force verbose mode
+ if os.getenv("FULL_LOG", "0") == "1":
+ verbose = True
+
  self.benchmark_config_default_params = BenchmarkConfigParams(
  task=task,
  dataset=dataset,
@@ -235,13 +239,13 @@ class Benchmarker:
  )

  # Initialise variable storing model lists, so we only have to fetch it once
- self._model_lists: dict[str, list[str]] | None = None
+ self._model_lists: dict[str, c.Sequence[str]] | None = None

  self.results_path = Path.cwd() / "euroeval_benchmark_results.jsonl"
  adjust_logging_level(verbose=self.benchmark_config.verbose)

  @property
- def benchmark_results(self) -> list[BenchmarkResult]:
+ def benchmark_results(self) -> c.Sequence[BenchmarkResult]:
  """The benchmark results.

  Returns:
@@ -301,7 +305,6 @@ class Benchmarker:
  )
  del dataset

- log_once(f"Loading model {model_config.model_id}", level=logging.INFO)
  model = load_model(
  model_config=model_config,
  dataset_config=dataset_config,
@@ -320,14 +323,14 @@ class Benchmarker:

  def benchmark(
  self,
- model: list[str] | str,
- task: str | list[str] | None = None,
- dataset: list[str] | str | None = None,
+ model: c.Sequence[str] | str,
+ task: "str | Task | c.Sequence[str | Task] | None" = None,
+ dataset: "str | DatasetConfig | c.Sequence[str | DatasetConfig] | None" = None,
  progress_bar: bool | None = None,
  save_results: bool | None = None,
- language: str | list[str] | None = None,
- model_language: str | list[str] | None = None,
- dataset_language: str | list[str] | None = None,
+ language: str | c.Sequence[str] | None = None,
+ model_language: str | c.Sequence[str] | None = None,
+ dataset_language: str | c.Sequence[str] | None = None,
  device: Device | None = None,
  batch_size: int | None = None,
  raise_errors: bool | None = None,
@@ -347,7 +350,7 @@ class Benchmarker:
  force: bool | None = None,
  verbose: bool | None = None,
  debug: bool | None = None,
- ) -> list[BenchmarkResult]:
+ ) -> c.Sequence[BenchmarkResult]:
  """Benchmarks models on datasets.

  Args:
@@ -605,13 +608,11 @@ class Benchmarker:
  clear_model_cache_fn(cache_dir=benchmark_config.cache_dir)

  model_ids = self._prepare_model_ids(model_id=model)
- dataset_configs = prepare_dataset_configs(
- dataset_names=benchmark_config.datasets
- )
+ dataset_configs = benchmark_config.datasets

  # Get all the model configs
  model_configs: list[ModelConfig] = list()
- for model_id in tqdm(
+ for model_id in get_pbar(
  iterable=model_ids,
  desc="Fetching model configurations",
  disable=not benchmark_config.verbose or not benchmark_config.progress_bar,
@@ -622,50 +623,63 @@ class Benchmarker:
  )
  model_configs.append(model_config)
  except InvalidModel as e:
- logger.info(e.message)
+ log(e.message, level=logging.ERROR)

  # Create a dictionary that takes each model config to the dataset configs that
- # we need to benchmark the model on. Here we remove the datasets that the model
- # has already been benchmarked on, or datasets that the model cannot be
- # benchmarked on.
- model_config_to_dataset_configs: dict[ModelConfig, list[DatasetConfig]] = {
+ # we need to benchmark the model on. We initially include all the relevant
+ # datasets for each model.
+ model_config_to_dataset_configs: dict[
+ ModelConfig, c.Sequence[DatasetConfig]
+ ] = {
  model_config: [
  dataset_config
  for dataset_config in dataset_configs
- if (
- benchmark_config.force
- or not model_has_been_benchmarked(
- model_config=model_config,
- dataset_config=dataset_config,
- benchmark_config=benchmark_config,
- benchmark_results=self.benchmark_results,
- )
- )
- and model_config.model_type in dataset_config.allowed_model_types
+ if model_config.model_type in dataset_config.allowed_model_types
  ]
  for model_config in model_configs
  }

+ # Initialise the current benchmark results with all the ones that we have cached
+ # on disk already (can be none), and remove those datasets from the mapping
+ current_benchmark_results: list[BenchmarkResult] = list()
+ for (
+ model_config,
+ model_dataset_configs,
+ ) in model_config_to_dataset_configs.items():
+ new_model_dataset_configs: list[DatasetConfig] = list()
+ for dataset_config in model_dataset_configs:
+ benchmark_record = get_record(
+ model_config=model_config,
+ dataset_config=dataset_config,
+ benchmark_config=benchmark_config,
+ benchmark_results=self.benchmark_results,
+ )
+ if benchmark_record is not None and not benchmark_config.force:
+ current_benchmark_results.append(benchmark_record)
+ else:
+ new_model_dataset_configs.append(dataset_config)
+ model_config_to_dataset_configs[model_config] = new_model_dataset_configs
+
  total_benchmarks = sum(
  len(dataset_configs)
  for dataset_configs in model_config_to_dataset_configs.values()
  )
  if total_benchmarks == 0:
- logger.info(
+ log(
  "No benchmarks to run, as all the selected models have already been "
- "benchmarked on all the selected datasets."
+ "benchmarked on all the selected datasets.",
+ level=logging.INFO,
  )
- return list()
-
- logger.info(f"Initiated evaluation of {total_benchmarks:,} benchmarks.")
+ return current_benchmark_results

  num_finished_benchmarks = 0
- current_benchmark_results: list[BenchmarkResult] = list()
+ benchmark_params_to_revert: dict[str, t.Any] = dict()
  for model_config in model_configs:
  if not model_config_to_dataset_configs[model_config]:
- logger.debug(
+ log(
  f"Skipping model {model_config.model_id!r} because it has "
- "already been benchmarked on all valid datasets."
+ "already been benchmarked on all valid datasets.",
+ level=logging.DEBUG,
  )
  continue

@@ -691,7 +705,6 @@ class Benchmarker:
  )

  loaded_model: BenchmarkModule | None = None
- benchmark_params_to_revert: dict[str, t.Any] = dict()
  for dataset_config in model_config_to_dataset_configs[model_config]:
  # Revert any changes to the benchmark configuration made for the
  # previous dataset
@@ -704,18 +717,20 @@ class Benchmarker:
  "val" not in dataset_config.splits
  and not benchmark_config.evaluate_test_split
  ):
- logger.debug(
+ log(
  "The dataset does not have a validation split, so even though "
  "you requested evaluating the validation split (the default), "
- "we will evaluate on the test split."
+ "we will evaluate on the test split.",
+ level=logging.DEBUG,
  )
  benchmark_params_to_revert["evaluate_test_split"] = False
  benchmark_config.evaluate_test_split = True
  if dataset_config.task.requires_zero_shot and benchmark_config.few_shot:
- logger.debug(
+ log(
  "The task requires zero-shot evaluation, so even though you "
  "requested few-shot evaluation (the default), we will evaluate "
- "zero-shot."
+ "zero-shot.",
+ level=logging.DEBUG,
  )
  benchmark_params_to_revert["few_shot"] = True
  benchmark_config.few_shot = False
@@ -723,13 +738,7 @@ class Benchmarker:
  # We do not re-initialise generative models as their architecture is not
  # customised to specific datasets
  if model_config.model_type == ModelType.GENERATIVE:
- initial_logging(
- model_config=model_config,
- dataset_config=dataset_config,
- benchmark_config=benchmark_config,
- )
  if loaded_model is None:
- logger.info("Loading model...")
  try:
  loaded_model = load_model(
  model_config=model_config,
@@ -739,7 +748,7 @@ class Benchmarker:
  except InvalidModel as e:
  if benchmark_config.raise_errors:
  raise e
- logger.info(e.message)
+ log(e.message, level=logging.ERROR)

  # Add the remaining number of benchmarks for the model to
  # our benchmark counter, since we're skipping the rest of
@@ -759,12 +768,13 @@ class Benchmarker:
  loaded_model.generative_type
  not in dataset_config.allowed_generative_types
  ):
- logger.debug(
+ log(
  f"Skipping the benchmark of model "
  f"{model_config.model_id!r}on dataset "
  f"{dataset_config.name!r} because the model has generative "
  f"type {loaded_model.generative_type} and the dataset "
- f"only allows {dataset_config.allowed_generative_types}."
+ f"only allows {dataset_config.allowed_generative_types}.",
+ level=logging.DEBUG,
  )
  num_finished_benchmarks += 1
  continue
@@ -775,6 +785,8 @@ class Benchmarker:
  model_config=model_config,
  dataset_config=dataset_config,
  benchmark_config=benchmark_config,
+ num_finished_benchmarks=num_finished_benchmarks,
+ num_total_benchmarks=total_benchmarks,
  )

  if (
@@ -784,12 +796,12 @@ class Benchmarker:
  raise benchmark_output_or_err

  elif isinstance(benchmark_output_or_err, InvalidBenchmark):
- logger.info(benchmark_output_or_err.message)
+ log(benchmark_output_or_err.message, level=logging.WARNING)
  num_finished_benchmarks += 1
  continue

  elif isinstance(benchmark_output_or_err, InvalidModel):
- logger.info(benchmark_output_or_err.message)
+ log(benchmark_output_or_err.message, level=logging.WARNING)

  # Add the remaining number of benchmarks for the model to our
  # benchmark counter, since we're skipping the rest of them
@@ -805,15 +817,15 @@ class Benchmarker:
  record.append_to_results(results_path=self.results_path)

  num_finished_benchmarks += 1
- logger.info(
- f"Finished {num_finished_benchmarks} out of "
- f"{total_benchmarks} benchmarks."
- )

  del loaded_model
  if benchmark_config.clear_model_cache:
  clear_model_cache_fn(cache_dir=benchmark_config.cache_dir)

+ log(
+ f"\nCompleted {num_finished_benchmarks:,} benchmarks.\n", level=logging.INFO
+ )
+
  # This avoids the following warning at the end of the benchmarking:
  # Warning: WARNING: process group has NOT been destroyed before we destruct
  # ProcessGroupNCCL. On normal program exit, the application should call
@@ -826,7 +838,7 @@ class Benchmarker:
  destroy_process_group()
  return current_benchmark_results

- def _prepare_model_ids(self, model_id: list[str] | str) -> list[str]:
+ def _prepare_model_ids(self, model_id: c.Sequence[str] | str) -> c.Sequence[str]:
  """Prepare the model ID(s) to be benchmarked.

  Args:
@@ -857,6 +869,8 @@ class Benchmarker:
  model_config: "ModelConfig",
  dataset_config: "DatasetConfig",
  benchmark_config: "BenchmarkConfig",
+ num_finished_benchmarks: int,
+ num_total_benchmarks: int,
  ) -> BenchmarkResult | InvalidBenchmark | InvalidModel:
  """Benchmark a single model on a single dataset.

@@ -869,25 +883,29 @@
  The configuration of the dataset we are evaluating on.
  benchmark_config:
  The general benchmark configuration.
+ num_finished_benchmarks:
+ The number of benchmarks that have already been completed.
+ num_total_benchmarks:
+ The total number of benchmarks to be completed.

  Returns:
  The benchmark result, or an error if the benchmark was unsuccessful.
- """
- if model is None:
- initial_logging(
- model_config=model_config,
- dataset_config=dataset_config,
- benchmark_config=benchmark_config,
- )

- while True:
+ Raises:
+ RuntimeError:
+ If the MPS fallback is not enabled when required.
+ InvalidBenchmark:
+ If the benchmark was unsuccessful.
+ InvalidModel:
+ If the model is invalid.
+ """
+ for _ in range(num_attempts := 5):
  try:
  # Set random seeds to enforce reproducibility of the randomly
  # initialised weights
  rng = enforce_reproducibility()

  if model is None or model_config.model_type != ModelType.GENERATIVE:
- logger.info("Loading model...")
  model = load_model(
  model_config=model_config,
  dataset_config=dataset_config,
@@ -895,6 +913,14 @@ class Benchmarker:
  )
  assert model is not None

+ initial_logging(
+ model_config=model_config,
+ dataset_config=dataset_config,
+ benchmark_config=benchmark_config,
+ num_finished_benchmarks=num_finished_benchmarks,
+ num_total_benchmarks=num_total_benchmarks,
+ )
+
  if dataset_config.task == SPEED:
  scores = benchmark_speed(
  model=model, benchmark_config=benchmark_config
@@ -962,14 +988,15 @@ class Benchmarker:
  few_shot=benchmark_config.few_shot,
  validation_split=not benchmark_config.evaluate_test_split,
  )
- logger.debug(f"Results:\n{results}")
+ log(f"Results:\n{results}", level=logging.DEBUG)
  return record

  except HuggingFaceHubDown:
  wait_time = 30
- logger.debug(
+ log(
  f"The Hugging Face Hub seems to be down. Retrying in {wait_time} "
- "seconds."
+ "seconds.",
+ level=logging.DEBUG,
  )
  sleep(wait_time)
  continue
@@ -992,23 +1019,29 @@ class Benchmarker:
  elif benchmark_config.raise_errors:
  raise e
  return e
+ else:
+ return InvalidBenchmark(
+ f"Failed to benchmark model {model_config.model_id!r} on dataset "
+ f"{dataset_config.name!r} after {num_attempts} attempts."
+ )

  def __call__(self, *args: t.Any, **kwds: t.Any) -> t.Any: # noqa: ANN401
  """Alias for `self.benchmark()`."""
- logger.warning(
+ log(
  "Calling the `Benchmarker` class directly is deprecated. Please use the "
- "`benchmark` function instead. This will be removed in a future version."
+ "`benchmark` function instead. This will be removed in a future version.",
+ level=logging.WARNING,
  )
  return self.benchmark(*args, **kwds)


- def model_has_been_benchmarked(
+ def get_record(
  model_config: "ModelConfig",
  dataset_config: "DatasetConfig",
  benchmark_config: "BenchmarkConfig",
- benchmark_results: list[BenchmarkResult],
- ) -> bool:
- """Checks whether a model has already been benchmarked on a dataset.
+ benchmark_results: c.Sequence[BenchmarkResult],
+ ) -> BenchmarkResult | None:
+ """Get the benchmark record for a given model and dataset.

  Args:
  model_config:
@@ -1021,7 +1054,7 @@ def model_has_been_benchmarked(
  The benchmark results.

  Returns:
- Whether the model has already been evaluated on the dataset.
+ The benchmark record, or None if no such record exists.
  """
  for record in benchmark_results:
  model_id_components = split_model_id(model_id=record.model)
@@ -1046,30 +1079,8 @@ def model_has_been_benchmarked(
  and same_split
  and same_num_shots
  ):
- return True
- return False
-
-
- def adjust_logging_level(verbose: bool, ignore_testing: bool = False) -> int:
- """Adjust the logging level based on verbosity.
-
- Args:
- verbose:
- Whether to output additional output.
- ignore_testing:
- Whether to ignore the testing flag.
-
- Returns:
- The logging level that was set.
- """
- if hasattr(sys, "_called_from_test") and not ignore_testing:
- logging_level = logging.CRITICAL
- elif verbose:
- logging_level = logging.DEBUG
- else:
- logging_level = logging.INFO
- logger.setLevel(logging_level)
- return logging_level
+ return record
+ return None


  def clear_model_cache_fn(cache_dir: str) -> None:
@@ -1090,7 +1101,9 @@ def clear_model_cache_fn(cache_dir: str) -> None:
  rmtree(sub_model_dir)


- def prepare_dataset_configs(dataset_names: list[str]) -> list["DatasetConfig"]:
+ def prepare_dataset_configs(
+ dataset_names: c.Sequence[str],
+ ) -> c.Sequence["DatasetConfig"]:
  """Prepare the dataset configuration(s) to be benchmarked.

  Args:
@@ -1109,6 +1122,8 @@ def initial_logging(
  model_config: "ModelConfig",
  dataset_config: "DatasetConfig",
  benchmark_config: "BenchmarkConfig",
+ num_finished_benchmarks: int,
+ num_total_benchmarks: int,
  ) -> None:
  """Initial logging at the start of the benchmarking process.

@@ -1119,6 +1134,10 @@ def initial_logging(
  The configuration of the dataset we are evaluating on.
  benchmark_config:
  The general benchmark configuration.
+ num_finished_benchmarks:
+ The number of benchmarks that have already been finished.
+ num_total_benchmarks:
+ The total number of benchmarks to be run.
  """
  model_id = model_config.model_id
  if model_config.revision and model_config.revision != "main":
@@ -1135,21 +1154,25 @@ def initial_logging(
  else:
  eval_type = "Benchmarking"

- logger.info(
- f"{eval_type} {model_id} on the {split_type} split of "
- f"{dataset_config.pretty_name}"
+ log_once(
+ f"\n{eval_type} {model_id} on the {split_type} split of "
+ f"{dataset_config.pretty_name} ({num_finished_benchmarks + 1}/"
+ f"{num_total_benchmarks} benchmarks)...",
+ prefix=f"\n[{dt.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}]",
  )

  if dataset_config.unofficial:
- logger.info(
+ log_once(
  f"Note that the {dataset_config.name!r} dataset is unofficial, "
  "meaning that the resulting evaluation will not be included in the "
- "official leaderboard."
+ "official leaderboard.",
+ level=logging.WARNING,
  )

  if benchmark_config.debug:
- logger.info(
+ log_once(
  "Running in debug mode. This will output additional information, as "
  "well as store the model outputs in the current directory after each "
- "batch. For this reason, evaluation will be slower.",
+ "batch. For this reason, evaluation will be slower.",
+ level=logging.WARNING,
  )
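
One notable change in the hunks above is that `_benchmark_single` no longer retries forever: the old `while True` loop becomes `for _ in range(num_attempts := 5)` with an `else` clause that returns an `InvalidBenchmark` once all attempts are exhausted, so the `HuggingFaceHubDown` retry path cannot spin indefinitely. A minimal standalone sketch of that `for`/`else` retry idiom, using hypothetical `flaky_call` and `TransientError` names rather than EuroEval's actual code:

import random


class TransientError(Exception):
    """Raised by the flaky operation when it temporarily fails."""


def flaky_call() -> str:
    """Hypothetical stand-in for a single benchmark attempt."""
    if random.random() < 0.7:
        raise TransientError("temporary failure")
    return "success"


def run_with_retries() -> str:
    """Retry up to five times; the `else` branch runs only if every attempt failed."""
    for _ in range(num_attempts := 5):
        try:
            return flaky_call()
        except TransientError:
            continue  # try again on the next iteration
    else:
        return f"gave up after {num_attempts} attempts"


print(run_with_retries())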
euroeval/caching_utils.py ADDED
@@ -0,0 +1,79 @@
+ """Caching utility functions."""
+
+ import typing as t
+ from functools import wraps
+
+ from .constants import T
+
+
+ def cache_arguments(
+ *arguments: str, disable_condition: t.Callable[[], bool] = lambda: False
+ ) -> t.Callable[[t.Callable[..., T]], t.Callable[..., T]]:
+ """Cache specified arguments of a function.
+
+ Args:
+ arguments:
+ The list of argument names to cache. If empty, all arguments are cached.
+ disable_condition:
+ A function that checks if cache should be disabled.
+
+ Returns:
+ A decorator that caches the specified arguments of a function.
+ """
+
+ def caching_decorator(func: t.Callable[..., T]) -> t.Callable[..., T]:
+ """Decorator that caches the specified arguments of a function.
+
+ Args:
+ func:
+ The function to decorate.
+
+ Returns:
+ The decorated function.
+ """
+ cache: dict[tuple, T] = dict()
+
+ @wraps(func)
+ def wrapper(*args, **kwargs) -> T:
+ """Wrapper function that caches the specified arguments.
+
+ Args:
+ *args:
+ The positional arguments to the function.
+ **kwargs:
+ The keyword arguments to the function.
+
+ Returns:
+ The result of the function.
+
+ Raises:
+ ValueError:
+ If an argument name is not found in the function parameters.
+ """
+ if not arguments:
+ key = args + tuple(kwargs[k] for k in sorted(kwargs.keys()))
+ else:
+ func_params = func.__code__.co_varnames
+ key_items: list[t.Any] = list()
+ for arg_name in arguments:
+ if arg_name in kwargs:
+ key_items.append(kwargs[arg_name])
+ else:
+ try:
+ arg_index = func_params.index(arg_name)
+ key_items.append(args[arg_index])
+ except (ValueError, IndexError):
+ raise ValueError(
+ f"Argument {arg_name} not found in function "
+ f"{func.__name__} parameters."
+ )
+ key = tuple(key_items)
+
+ # Do not cache if the condition is met
+ if key not in cache or disable_condition():
+ cache[key] = func(*args, **kwargs)
+ return cache[key]
+
+ return wrapper
+
+ return caching_decorator
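
The new `cache_arguments` decorator memoises a function on a chosen subset of its arguments (or on all of them when none are named) and recomputes whenever `disable_condition()` returns true. A simplified, self-contained sketch of the same pattern with a hypothetical `expensive_lookup` example (EuroEval's version above additionally imports its `T` type variable from `euroeval.constants` and re-raises a `ValueError` for unknown argument names, which is omitted here):

import typing as t
from functools import wraps

T = t.TypeVar("T")


def cache_arguments(
    *arguments: str, disable_condition: t.Callable[[], bool] = lambda: False
) -> t.Callable[[t.Callable[..., T]], t.Callable[..., T]]:
    """Cache a function on the named arguments (or on all arguments if none are given)."""

    def decorator(func: t.Callable[..., T]) -> t.Callable[..., T]:
        cache: dict[tuple, T] = {}

        @wraps(func)
        def wrapper(*args: t.Any, **kwargs: t.Any) -> T:
            if not arguments:
                key = args + tuple(kwargs[k] for k in sorted(kwargs))
            else:
                # Parameter names, used to map cached argument names to positions
                params = func.__code__.co_varnames
                key = tuple(
                    kwargs[name] if name in kwargs else args[params.index(name)]
                    for name in arguments
                )
            # Recompute on a cache miss or whenever caching is disabled
            if key not in cache or disable_condition():
                cache[key] = func(*args, **kwargs)
            return cache[key]

        return wrapper

    return decorator


@cache_arguments("model_id")  # only `model_id` is part of the cache key
def expensive_lookup(model_id: str, verbose: bool = False) -> str:
    print(f"fetching metadata for {model_id}")
    return f"metadata::{model_id}"


expensive_lookup("my-org/my-model")                # computes and caches
expensive_lookup("my-org/my-model", verbose=True)  # cache hit; `verbose` does not affect the key

Keying on a subset of arguments is useful when some parameters (such as a verbosity flag) do not affect the result and therefore should not fragment the cache.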