EuroEval 16.2.2-py3-none-any.whl → 16.4.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of EuroEval might be problematic.

Files changed (65)
  1. euroeval/__init__.py +7 -4
  2. euroeval/benchmark_config_factory.py +0 -4
  3. euroeval/benchmark_modules/base.py +3 -16
  4. euroeval/benchmark_modules/fresh.py +5 -2
  5. euroeval/benchmark_modules/hf.py +107 -66
  6. euroeval/benchmark_modules/litellm.py +103 -55
  7. euroeval/benchmark_modules/vllm.py +155 -82
  8. euroeval/benchmarker.py +184 -129
  9. euroeval/caching_utils.py +79 -0
  10. euroeval/callbacks.py +5 -7
  11. euroeval/cli.py +1 -1
  12. euroeval/constants.py +9 -0
  13. euroeval/data_loading.py +14 -11
  14. euroeval/data_models.py +12 -4
  15. euroeval/dataset_configs/__init__.py +3 -0
  16. euroeval/dataset_configs/czech.py +79 -0
  17. euroeval/dataset_configs/danish.py +10 -13
  18. euroeval/dataset_configs/dutch.py +0 -3
  19. euroeval/dataset_configs/english.py +0 -3
  20. euroeval/dataset_configs/estonian.py +11 -1
  21. euroeval/dataset_configs/finnish.py +0 -3
  22. euroeval/dataset_configs/french.py +0 -3
  23. euroeval/dataset_configs/german.py +0 -3
  24. euroeval/dataset_configs/italian.py +0 -3
  25. euroeval/dataset_configs/latvian.py +2 -4
  26. euroeval/dataset_configs/lithuanian.py +68 -0
  27. euroeval/dataset_configs/norwegian.py +0 -3
  28. euroeval/dataset_configs/polish.py +0 -3
  29. euroeval/dataset_configs/portuguese.py +0 -3
  30. euroeval/dataset_configs/slovak.py +60 -0
  31. euroeval/dataset_configs/spanish.py +0 -3
  32. euroeval/dataset_configs/swedish.py +10 -15
  33. euroeval/finetuning.py +21 -15
  34. euroeval/generation.py +10 -10
  35. euroeval/generation_utils.py +2 -3
  36. euroeval/logging_utils.py +250 -0
  37. euroeval/metrics/base.py +0 -3
  38. euroeval/metrics/huggingface.py +10 -6
  39. euroeval/metrics/llm_as_a_judge.py +5 -3
  40. euroeval/metrics/pipeline.py +22 -9
  41. euroeval/metrics/speed.py +0 -3
  42. euroeval/model_cache.py +11 -14
  43. euroeval/model_config.py +4 -5
  44. euroeval/model_loading.py +3 -0
  45. euroeval/prompt_templates/linguistic_acceptability.py +30 -3
  46. euroeval/prompt_templates/multiple_choice.py +34 -1
  47. euroeval/prompt_templates/named_entity_recognition.py +71 -11
  48. euroeval/prompt_templates/reading_comprehension.py +41 -3
  49. euroeval/prompt_templates/sentiment_classification.py +34 -1
  50. euroeval/prompt_templates/summarization.py +26 -6
  51. euroeval/scores.py +7 -7
  52. euroeval/speed_benchmark.py +3 -5
  53. euroeval/task_group_utils/multiple_choice_classification.py +0 -3
  54. euroeval/task_group_utils/question_answering.py +0 -3
  55. euroeval/task_group_utils/sequence_classification.py +43 -31
  56. euroeval/task_group_utils/text_to_text.py +17 -8
  57. euroeval/task_group_utils/token_classification.py +10 -9
  58. euroeval/tokenisation_utils.py +22 -20
  59. euroeval/utils.py +30 -147
  60. {euroeval-16.2.2.dist-info → euroeval-16.4.0.dist-info}/METADATA +182 -61
  61. euroeval-16.4.0.dist-info/RECORD +75 -0
  62. euroeval-16.2.2.dist-info/RECORD +0 -70
  63. {euroeval-16.2.2.dist-info → euroeval-16.4.0.dist-info}/WHEEL +0 -0
  64. {euroeval-16.2.2.dist-info → euroeval-16.4.0.dist-info}/entry_points.txt +0 -0
  65. {euroeval-16.2.2.dist-info → euroeval-16.4.0.dist-info}/licenses/LICENSE +0 -0
euroeval/benchmarker.py CHANGED
@@ -1,10 +1,11 @@
  """Class that benchmarks language models."""

  import contextlib
+ import datetime as dt
  import json
  import logging
+ import os
  import re
- import sys
  import typing as t
  from pathlib import Path
  from shutil import rmtree
@@ -22,6 +23,7 @@ from .enums import Device, GenerativeType, ModelType
  from .exceptions import HuggingFaceHubDown, InvalidBenchmark, InvalidModel
  from .finetuning import finetune
  from .generation import generate
+ from .logging_utils import adjust_logging_level, get_pbar, log, log_once
  from .model_config import get_model_config
  from .model_loading import load_model
  from .scores import log_scores
@@ -31,7 +33,7 @@ from .utils import (
  enforce_reproducibility,
  get_package_version,
  internet_connection_available,
- log_once,
+ split_model_id,
  )

  if t.TYPE_CHECKING:
@@ -39,9 +41,6 @@ if t.TYPE_CHECKING:
  from .data_models import BenchmarkConfig, DatasetConfig, ModelConfig


- logger = logging.getLogger("euroeval")
-
-
  class Benchmarker:
  """Benchmarking all the language models.

@@ -82,7 +81,7 @@
  num_iterations: int = 10,
  api_base: str | None = None,
  api_version: str | None = None,
- gpu_memory_utilization: float = 0.9,
+ gpu_memory_utilization: float = 0.8,
  generative_type: GenerativeType | None = None,
  debug: bool = False,
  run_with_cli: bool = False,
@@ -198,6 +197,10 @@
  "Try installing it with `pip install hf_transfer`."
  )

+ # If FULL_LOG has been set, then force verbose mode
+ if os.getenv("FULL_LOG", "0") == "1":
+ verbose = True
+
  self.benchmark_config_default_params = BenchmarkConfigParams(
  task=task,
  dataset=dataset,
@@ -299,7 +302,6 @@
  )
  del dataset

- log_once(f"Loading model {model_config.model_id}", level=logging.INFO)
  model = load_model(
  model_config=model_config,
  dataset_config=dataset_config,
@@ -607,46 +609,90 @@
  dataset_names=benchmark_config.datasets
  )

- total_benchmarks = len(model_ids) * len(dataset_configs)
- num_finished_benchmarks = 0
-
- current_benchmark_results: list[BenchmarkResult] = list()
- for model_id in model_ids:
- # Load the model configuration, or skip the model if it is invalid
+ # Get all the model configs
+ model_configs: list[ModelConfig] = list()
+ for model_id in get_pbar(
+ iterable=model_ids,
+ desc="Fetching model configurations",
+ disable=not benchmark_config.verbose or not benchmark_config.progress_bar,
+ ):
  try:
  model_config = get_model_config(
  model_id=model_id, benchmark_config=benchmark_config
  )
+ model_configs.append(model_config)
  except InvalidModel as e:
- logger.info(e.message)
- num_finished_benchmarks += len(dataset_configs)
+ log(e.message, level=logging.ERROR)
+
+ # Create a dictionary that takes each model config to the dataset configs that
+ # we need to benchmark the model on. Here we remove the datasets that the model
+ # has already been benchmarked on, or datasets that the model cannot be
+ # benchmarked on.
+ model_config_to_dataset_configs: dict[ModelConfig, list[DatasetConfig]] = {
+ model_config: [
+ dataset_config
+ for dataset_config in dataset_configs
+ if (
+ benchmark_config.force
+ or not model_has_been_benchmarked(
+ model_config=model_config,
+ dataset_config=dataset_config,
+ benchmark_config=benchmark_config,
+ benchmark_results=self.benchmark_results,
+ )
+ )
+ and model_config.model_type in dataset_config.allowed_model_types
+ ]
+ for model_config in model_configs
+ }
+
+ total_benchmarks = sum(
+ len(dataset_configs)
+ for dataset_configs in model_config_to_dataset_configs.values()
+ )
+ if total_benchmarks == 0:
+ log(
+ "No benchmarks to run, as all the selected models have already been "
+ "benchmarked on all the selected datasets.",
+ level=logging.INFO,
+ )
+ return list()
+
+ num_finished_benchmarks = 0
+ current_benchmark_results: list[BenchmarkResult] = list()
+ benchmark_params_to_revert: dict[str, t.Any] = dict()
+ for model_config in model_configs:
+ if not model_config_to_dataset_configs[model_config]:
+ log(
+ f"Skipping model {model_config.model_id!r} because it has "
+ "already been benchmarked on all valid datasets.",
+ level=logging.DEBUG,
+ )
  continue

  if model_config.adapter_base_model_id:
  open_issue_msg = (
- "If offline support is important to you, please "
- "consider opening an issue at https://github.com/EuroEval/EuroEval/issues."
+ "If offline support is important to you, please consider opening "
+ "an issue at https://github.com/EuroEval/EuroEval/issues."
  )
  if not internet_connection_available():
  raise InvalidModel(
  "Offline benchmarking of models with adapters is not currently "
- "supported. "
- f"An active internet connection is required. {open_issue_msg}"
+ "supported. An active internet connection is required. "
+ "{open_issue_msg}"
  )
  elif benchmark_config.download_only:
  log_once(
  "You are using download only mode with a model that includes "
- "an adapter. "
- "Please note: Offline benchmarking of adapter models is not "
- "currently supported. "
- "An internet connection will be required during evaluation. "
+ "an adapter. Please note that offline benchmarking of "
+ "adapter models is not currently supported - an internet "
+ "connection will be required during evaluation in this case. "
  f"{open_issue_msg}",
  level=logging.WARNING,
  )

  loaded_model: BenchmarkModule | None = None
- benchmark_params_to_revert: dict[str, t.Any] = dict()
- for dataset_config in dataset_configs:
+ for dataset_config in model_config_to_dataset_configs[model_config]:
  # Revert any changes to the benchmark configuration made for the
  # previous dataset
  for param, value in benchmark_params_to_revert.items():
@@ -658,60 +704,28 @@
  "val" not in dataset_config.splits
  and not benchmark_config.evaluate_test_split
  ):
- logger.debug(
+ log(
  "The dataset does not have a validation split, so even though "
  "you requested evaluating the validation split (the default), "
- "we will evaluate on the test split."
+ "we will evaluate on the test split.",
+ level=logging.DEBUG,
  )
  benchmark_params_to_revert["evaluate_test_split"] = False
  benchmark_config.evaluate_test_split = True
  if dataset_config.task.requires_zero_shot and benchmark_config.few_shot:
- logger.debug(
+ log(
  "The task requires zero-shot evaluation, so even though you "
  "requested few-shot evaluation (the default), we will evaluate "
- "zero-shot."
+ "zero-shot.",
+ level=logging.DEBUG,
  )
  benchmark_params_to_revert["few_shot"] = True
  benchmark_config.few_shot = False

- # Skip if we have already benchmarked this model on this dataset and
- # we are not forcing the benchmark
- if not benchmark_config.force and model_has_been_benchmarked(
- model_id=model_id,
- dataset=dataset_config.name,
- few_shot=benchmark_config.few_shot,
- validation_split=not benchmark_config.evaluate_test_split,
- benchmark_results=self.benchmark_results,
- ):
- logger.debug(
- f"Skipping benchmarking {model_id} on "
- f"{dataset_config.pretty_name}, as it has already been "
- "benchmarked."
- )
- num_finished_benchmarks += 1
- continue
-
- # Skip if the model type should not be benchmarked on this dataset
- model_type = model_config.model_type
- allowed_model_types = dataset_config.allowed_model_types
- if model_type not in allowed_model_types:
- logger.debug(
- f"Skipping benchmarking {model_id} on "
- f"{dataset_config.pretty_name}, as it is of type {model_type}, "
- f"and the only allowed model types are {allowed_model_types}."
- )
- continue
-
  # We do not re-initialise generative models as their architecture is not
  # customised to specific datasets
  if model_config.model_type == ModelType.GENERATIVE:
- initial_logging(
- model_config=model_config,
- dataset_config=dataset_config,
- benchmark_config=benchmark_config,
- )
  if loaded_model is None:
- logger.info("Loading model...")
  try:
  loaded_model = load_model(
  model_config=model_config,
@@ -721,7 +735,7 @@
  except InvalidModel as e:
  if benchmark_config.raise_errors:
  raise e
- logger.info(e.message)
+ log(e.message, level=logging.ERROR)

  # Add the remaining number of benchmarks for the model to
  # our benchmark counter, since we're skipping the rest of
@@ -735,12 +749,31 @@
  else:
  loaded_model.dataset_config = dataset_config

+ # Skip the benchmark if the model is not of the correct
+ # generative type
+ if (
+ loaded_model.generative_type
+ not in dataset_config.allowed_generative_types
+ ):
+ log(
+ f"Skipping the benchmark of model "
+ f"{model_config.model_id!r}on dataset "
+ f"{dataset_config.name!r} because the model has generative "
+ f"type {loaded_model.generative_type} and the dataset "
+ f"only allows {dataset_config.allowed_generative_types}.",
+ level=logging.DEBUG,
+ )
+ num_finished_benchmarks += 1
+ continue
+
  # Benchmark a single model on a single dataset
  benchmark_output_or_err = self._benchmark_single(
  model=loaded_model,
  model_config=model_config,
  dataset_config=dataset_config,
  benchmark_config=benchmark_config,
+ num_finished_benchmarks=num_finished_benchmarks,
+ num_total_benchmarks=total_benchmarks,
  )

  if (
@@ -750,12 +783,12 @@
  raise benchmark_output_or_err

  elif isinstance(benchmark_output_or_err, InvalidBenchmark):
- logger.info(benchmark_output_or_err.message)
+ log(benchmark_output_or_err.message, level=logging.WARNING)
  num_finished_benchmarks += 1
  continue

  elif isinstance(benchmark_output_or_err, InvalidModel):
- logger.info(benchmark_output_or_err.message)
+ log(benchmark_output_or_err.message, level=logging.WARNING)

  # Add the remaining number of benchmarks for the model to our
  # benchmark counter, since we're skipping the rest of them
@@ -771,15 +804,13 @@
  record.append_to_results(results_path=self.results_path)

  num_finished_benchmarks += 1
- logger.info(
- f"Finished {num_finished_benchmarks} out of "
- f"{total_benchmarks} benchmarks."
- )

  del loaded_model
  if benchmark_config.clear_model_cache:
  clear_model_cache_fn(cache_dir=benchmark_config.cache_dir)

+ log(f"Completed {num_finished_benchmarks:,} benchmarks.\n", level=logging.INFO)
+
  # This avoids the following warning at the end of the benchmarking:
  # Warning: WARNING: process group has NOT been destroyed before we destruct
  # ProcessGroupNCCL. On normal program exit, the application should call
@@ -823,6 +854,8 @@
  model_config: "ModelConfig",
  dataset_config: "DatasetConfig",
  benchmark_config: "BenchmarkConfig",
+ num_finished_benchmarks: int,
+ num_total_benchmarks: int,
  ) -> BenchmarkResult | InvalidBenchmark | InvalidModel:
  """Benchmark a single model on a single dataset.

@@ -835,25 +868,29 @@
  The configuration of the dataset we are evaluating on.
  benchmark_config:
  The general benchmark configuration.
+ num_finished_benchmarks:
+ The number of benchmarks that have already been completed.
+ num_total_benchmarks:
+ The total number of benchmarks to be completed.

  Returns:
  The benchmark result, or an error if the benchmark was unsuccessful.
- """
- if model is None:
- initial_logging(
- model_config=model_config,
- dataset_config=dataset_config,
- benchmark_config=benchmark_config,
- )

- while True:
+ Raises:
+ RuntimeError:
+ If the MPS fallback is not enabled when required.
+ InvalidBenchmark:
+ If the benchmark was unsuccessful.
+ InvalidModel:
+ If the model is invalid.
+ """
+ for _ in range(num_attempts := 5):
  try:
  # Set random seeds to enforce reproducibility of the randomly
  # initialised weights
  rng = enforce_reproducibility()

  if model is None or model_config.model_type != ModelType.GENERATIVE:
- logger.info("Loading model...")
  model = load_model(
  model_config=model_config,
  dataset_config=dataset_config,
@@ -861,6 +898,14 @@
  )
  assert model is not None

+ initial_logging(
+ model_config=model_config,
+ dataset_config=dataset_config,
+ benchmark_config=benchmark_config,
+ num_finished_benchmarks=num_finished_benchmarks,
+ num_total_benchmarks=num_total_benchmarks,
+ )
+
  if dataset_config.task == SPEED:
  scores = benchmark_speed(
  model=model, benchmark_config=benchmark_config
@@ -928,14 +973,15 @@
  few_shot=benchmark_config.few_shot,
  validation_split=not benchmark_config.evaluate_test_split,
  )
- logger.debug(f"Results:\n{results}")
+ log(f"Results:\n{results}", level=logging.DEBUG)
  return record

  except HuggingFaceHubDown:
  wait_time = 30
- logger.debug(
+ log(
  f"The Hugging Face Hub seems to be down. Retrying in {wait_time} "
- "seconds."
+ "seconds.",
+ level=logging.DEBUG,
  )
  sleep(wait_time)
  continue
@@ -958,34 +1004,37 @@
  elif benchmark_config.raise_errors:
  raise e
  return e
+ else:
+ return InvalidBenchmark(
+ f"Failed to benchmark model {model_config.model_id!r} on dataset "
+ f"{dataset_config.name!r} after {num_attempts} attempts."
+ )

  def __call__(self, *args: t.Any, **kwds: t.Any) -> t.Any: # noqa: ANN401
  """Alias for `self.benchmark()`."""
- logger.warning(
+ log(
  "Calling the `Benchmarker` class directly is deprecated. Please use the "
- "`benchmark` function instead. This will be removed in a future version."
+ "`benchmark` function instead. This will be removed in a future version.",
+ level=logging.WARNING,
  )
  return self.benchmark(*args, **kwds)


  def model_has_been_benchmarked(
- model_id: str,
- dataset: str,
- few_shot: bool,
- validation_split: bool,
+ model_config: "ModelConfig",
+ dataset_config: "DatasetConfig",
+ benchmark_config: "BenchmarkConfig",
  benchmark_results: list[BenchmarkResult],
  ) -> bool:
  """Checks whether a model has already been benchmarked on a dataset.

  Args:
- model_id:
- The model ID.
- dataset:
- The dataset.
- few_shot:
- Whether the model was evaluated using few-shot evaluation.
- validation_split:
- Whether the model was evaluated on the validation split.
+ model_config:
+ The configuration of the model we are evaluating.
+ dataset_config:
+ The configuration of the dataset we are evaluating on.
+ benchmark_config:
+ The general benchmark configuration.
  benchmark_results:
  The benchmark results.

@@ -993,36 +1042,32 @@ def model_has_been_benchmarked(
  Whether the model has already been evaluated on the dataset.
  """
  for record in benchmark_results:
- same_evaluation = record.model == model_id and record.dataset == dataset
- same_validation_split_setting = record.validation_split == validation_split
- same_few_shot_setting = record.few_shot == few_shot or not record.generative
- if same_evaluation and same_validation_split_setting and same_few_shot_setting:
+ model_id_components = split_model_id(model_id=record.model)
+ same_model_id = model_id_components.model_id == model_config.model_id
+ same_revision = model_id_components.revision == model_config.revision
+ same_param = model_id_components.param == model_config.param
+ same_dataset = record.dataset == dataset_config.name
+ same_split = (
+ record.validation_split != benchmark_config.evaluate_test_split
+ or "val" not in dataset_config.splits
+ )
+ same_num_shots = (
+ record.few_shot == benchmark_config.few_shot
+ or not record.generative
+ or dataset_config.task.requires_zero_shot
+ )
+ if (
+ same_model_id
+ and same_revision
+ and same_param
+ and same_dataset
+ and same_split
+ and same_num_shots
+ ):
  return True
  return False


- def adjust_logging_level(verbose: bool, ignore_testing: bool = False) -> int:
- """Adjust the logging level based on verbosity.
-
- Args:
- verbose:
- Whether to output additional output.
- ignore_testing:
- Whether to ignore the testing flag.
-
- Returns:
- The logging level that was set.
- """
- if hasattr(sys, "_called_from_test") and not ignore_testing:
- logging_level = logging.CRITICAL
- elif verbose:
- logging_level = logging.DEBUG
- else:
- logging_level = logging.INFO
- logger.setLevel(logging_level)
- return logging_level
-
-
  def clear_model_cache_fn(cache_dir: str) -> None:
  """Clear the model cache.

@@ -1060,6 +1105,8 @@ def initial_logging(
  model_config: "ModelConfig",
  dataset_config: "DatasetConfig",
  benchmark_config: "BenchmarkConfig",
+ num_finished_benchmarks: int,
+ num_total_benchmarks: int,
  ) -> None:
  """Initial logging at the start of the benchmarking process.

@@ -1070,6 +1117,10 @@
  The configuration of the dataset we are evaluating on.
  benchmark_config:
  The general benchmark configuration.
+ num_finished_benchmarks:
+ The number of benchmarks that have already been finished.
+ num_total_benchmarks:
+ The total number of benchmarks to be run.
  """
  model_id = model_config.model_id
  if model_config.revision and model_config.revision != "main":
@@ -1086,21 +1137,25 @@
  else:
  eval_type = "Benchmarking"

- logger.info(
- f"{eval_type} {model_id} on the {split_type} split of "
- f"{dataset_config.pretty_name}"
+ log_once(
+ f"\n{eval_type} {model_id} on the {split_type} split of "
+ f"{dataset_config.pretty_name} ({num_finished_benchmarks + 1}/"
+ f"{num_total_benchmarks} benchmarks)...",
+ prefix=f"\n[{dt.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}]",
  )

  if dataset_config.unofficial:
- logger.info(
+ log_once(
  f"Note that the {dataset_config.name!r} dataset is unofficial, "
  "meaning that the resulting evaluation will not be included in the "
- "official leaderboard."
+ "official leaderboard.",
+ level=logging.WARNING,
  )

  if benchmark_config.debug:
- logger.info(
+ log_once(
  "Running in debug mode. This will output additional information, as "
  "well as store the model outputs in the current directory after each "
- "batch. For this reason, evaluation will be slower."
+ "batch. For this reason, evaluation will be slower.",
+ level=logging.WARNING,
  )
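
One user-visible addition in this file is the FULL_LOG environment variable check in Benchmarker.__init__, which forces verbose logging. A minimal sketch of how that behaves, assuming only the public Benchmarker constructor and its verbose flag:

import os

from euroeval import Benchmarker

# FULL_LOG is read inside __init__, so set it before constructing the Benchmarker.
os.environ["FULL_LOG"] = "1"

# Even with verbose=False, the FULL_LOG check shown in the diff flips verbose mode on.
benchmarker = Benchmarker(verbose=False)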
euroeval/caching_utils.py ADDED
@@ -0,0 +1,79 @@
+ """Caching utility functions."""
+
+ import typing as t
+ from functools import wraps
+
+ from .constants import T
+
+
+ def cache_arguments(
+ *arguments: str, disable_condition: t.Callable[[], bool] = lambda: False
+ ) -> t.Callable[[t.Callable[..., T]], t.Callable[..., T]]:
+ """Cache specified arguments of a function.
+
+ Args:
+ arguments:
+ The list of argument names to cache. If empty, all arguments are cached.
+ disable_condition:
+ A function that checks if cache should be disabled.
+
+ Returns:
+ A decorator that caches the specified arguments of a function.
+ """
+
+ def caching_decorator(func: t.Callable[..., T]) -> t.Callable[..., T]:
+ """Decorator that caches the specified arguments of a function.
+
+ Args:
+ func:
+ The function to decorate.
+
+ Returns:
+ The decorated function.
+ """
+ cache: dict[tuple, T] = dict()
+
+ @wraps(func)
+ def wrapper(*args, **kwargs) -> T:
+ """Wrapper function that caches the specified arguments.
+
+ Args:
+ *args:
+ The positional arguments to the function.
+ **kwargs:
+ The keyword arguments to the function.
+
+ Returns:
+ The result of the function.
+
+ Raises:
+ ValueError:
+ If an argument name is not found in the function parameters.
+ """
+ if not arguments:
+ key = args + tuple(kwargs[k] for k in sorted(kwargs.keys()))
+ else:
+ func_params = func.__code__.co_varnames
+ key_items: list[t.Any] = []
+ for arg_name in arguments:
+ if arg_name in kwargs:
+ key_items.append(kwargs[arg_name])
+ else:
+ try:
+ arg_index = func_params.index(arg_name)
+ key_items.append(args[arg_index])
+ except (ValueError, IndexError):
+ raise ValueError(
+ f"Argument {arg_name} not found in function "
+ f"{func.__name__} parameters."
+ )
+ key = tuple(key_items)
+
+ # Do not cache if the condition is met
+ if key not in cache or disable_condition():
+ cache[key] = func(*args, **kwargs)
+ return cache[key]
+
+ return wrapper
+
+ return caching_decorator
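
A usage sketch for the new cache_arguments decorator; the decorated function below is hypothetical and only illustrates that the cache key is built from the named arguments, so calls that differ in other arguments still hit the cache:

from euroeval.caching_utils import cache_arguments

@cache_arguments("model_id")
def fetch_metadata(model_id: str, verbose: bool = False) -> dict:
    # Imagine an expensive lookup here; it only runs once per model_id.
    return {"model_id": model_id}

first = fetch_metadata("org/model", verbose=True)
second = fetch_metadata("org/model")  # cached, since verbose is not part of the key
assert first is second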
euroeval/callbacks.py CHANGED
@@ -7,6 +7,8 @@ from collections.abc import Sized
  from tqdm.auto import tqdm
  from transformers.trainer_callback import ProgressCallback

+ from .logging_utils import get_pbar
+
  if t.TYPE_CHECKING:
  from torch.utils.data import DataLoader
  from transformers.trainer_callback import TrainerControl, TrainerState
@@ -32,11 +34,8 @@ class NeverLeaveProgressCallback(ProgressCallback):
  """Callback actions when training begins."""
  if state.is_local_process_zero:
  desc = "Finetuning model"
- self.training_bar = tqdm(
- total=None,
- leave=False,
- desc=desc,
- disable=hasattr(sys, "_called_from_test"),
+ self.training_bar = get_pbar(
+ total=None, desc=desc, disable=hasattr(sys, "_called_from_test")
  )
  self.current_step = 0

@@ -67,9 +66,8 @@ class NeverLeaveProgressCallback(ProgressCallback):
  if state.is_local_process_zero and correct_dtype:
  if self.prediction_bar is None:
  desc = "Evaluating model"
- self.prediction_bar = tqdm(
+ self.prediction_bar = get_pbar(
  total=len(eval_dataloader),
- leave=False,
  desc=desc,
  disable=hasattr(sys, "_called_from_test"),
  )
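
The tqdm bars above are now created via get_pbar from the new euroeval.logging_utils module. A minimal usage sketch mirroring the keyword arguments visible in these diffs (iterable, desc, disable); the loop body is purely illustrative:

from euroeval.logging_utils import get_pbar

for batch in get_pbar(iterable=range(3), desc="Evaluating model", disable=False):
    ...  # process each batch under the shared progress-bar configuration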
euroeval/cli.py CHANGED
@@ -188,7 +188,7 @@ from .tasks import get_all_tasks
  )
  @click.option(
  "--gpu-memory-utilization",
- default=0.9,
+ default=0.8,
  show_default=True,
  help="The GPU memory utilization to use for vLLM. A larger value will result in "
  "faster evaluation, but at the risk of running out of GPU memory. Only reduce this "