EuroEval 16.2.2__py3-none-any.whl → 16.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.
Files changed (38)
  1. euroeval/__init__.py +4 -2
  2. euroeval/benchmark_modules/fresh.py +3 -1
  3. euroeval/benchmark_modules/hf.py +8 -4
  4. euroeval/benchmark_modules/litellm.py +5 -17
  5. euroeval/benchmark_modules/vllm.py +88 -23
  6. euroeval/benchmarker.py +110 -61
  7. euroeval/cli.py +1 -1
  8. euroeval/constants.py +3 -0
  9. euroeval/dataset_configs/__init__.py +1 -0
  10. euroeval/dataset_configs/danish.py +0 -2
  11. euroeval/dataset_configs/dutch.py +0 -2
  12. euroeval/dataset_configs/english.py +0 -2
  13. euroeval/dataset_configs/finnish.py +0 -2
  14. euroeval/dataset_configs/french.py +0 -2
  15. euroeval/dataset_configs/german.py +0 -2
  16. euroeval/dataset_configs/italian.py +0 -2
  17. euroeval/dataset_configs/latvian.py +2 -3
  18. euroeval/dataset_configs/lithuanian.py +62 -0
  19. euroeval/dataset_configs/norwegian.py +0 -2
  20. euroeval/dataset_configs/polish.py +0 -2
  21. euroeval/dataset_configs/portuguese.py +0 -2
  22. euroeval/dataset_configs/spanish.py +0 -2
  23. euroeval/dataset_configs/swedish.py +0 -3
  24. euroeval/metrics/huggingface.py +1 -1
  25. euroeval/metrics/pipeline.py +5 -0
  26. euroeval/prompt_templates/linguistic_acceptability.py +9 -0
  27. euroeval/prompt_templates/multiple_choice.py +9 -0
  28. euroeval/prompt_templates/named_entity_recognition.py +20 -0
  29. euroeval/prompt_templates/reading_comprehension.py +10 -0
  30. euroeval/prompt_templates/sentiment_classification.py +11 -0
  31. euroeval/tokenisation_utils.py +8 -8
  32. euroeval/utils.py +1 -1
  33. {euroeval-16.2.2.dist-info → euroeval-16.3.0.dist-info}/METADATA +181 -60
  34. euroeval-16.3.0.dist-info/RECORD +71 -0
  35. euroeval-16.2.2.dist-info/RECORD +0 -70
  36. {euroeval-16.2.2.dist-info → euroeval-16.3.0.dist-info}/WHEEL +0 -0
  37. {euroeval-16.2.2.dist-info → euroeval-16.3.0.dist-info}/entry_points.txt +0 -0
  38. {euroeval-16.2.2.dist-info → euroeval-16.3.0.dist-info}/licenses/LICENSE +0 -0
euroeval/__init__.py CHANGED
@@ -103,8 +103,10 @@ os.environ["DISABLE_AIOHTTP_TRANSPORT"] = "True"
 os.environ["VLLM_USE_V1"] = "1"
 
 
-# Use the FlashInfer flash-attention backend for vLLM
-os.environ["VLLM_ATTENTION_BACKEND"] = "FLASHINFER"
+# Use the FlashInfer flash-attention backend for vLLM, unless the user has already
+# specified a different backend.
+if os.getenv("VLLM_ATTENTION_BACKEND") is None:
+    os.environ["VLLM_ATTENTION_BACKEND"] = "FLASHINFER"
 
 
 # Set the HF_TOKEN env var to copy the HUGGINGFACE_API_KEY env var, as vLLM uses the
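As a quick illustration of the new guard (an editorial sketch, not code from the package): setting the variable before importing euroeval now takes precedence, since 16.3.0 only falls back to FlashInfer when nothing is set. The backend name below is just an example of a value vLLM accepts.

import os

# Hypothetical usage: pick a different vLLM attention backend up front.
os.environ["VLLM_ATTENTION_BACKEND"] = "FLASH_ATTN"

import euroeval  # noqa: E402  (the pre-set value is no longer overwritten)

print(os.environ["VLLM_ATTENTION_BACKEND"])  # prints "FLASH_ATTN"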
euroeval/benchmark_modules/fresh.py CHANGED
@@ -1,5 +1,6 @@
 """Freshly initialised encoder models."""
 
+import re
 import typing as t
 from functools import cached_property
 from json import JSONDecodeError
@@ -45,6 +46,7 @@ class FreshEncoderModel(HuggingFaceEncoderModel):
     """A freshly initialised encoder model."""
 
     fresh_model = True
+    allowed_params = {re.compile(r".*"): ["slow-tokenizer"]}
 
     def __init__(
         self,
@@ -294,7 +296,7 @@ def load_model_and_tokeniser(
         token=get_hf_token(api_key=benchmark_config.api_key),
         add_prefix_space=prefix,
         cache_dir=model_config.model_cache_dir,
-        use_fast=True,
+        use_fast=False if model_config.param == "slow-tokenizer" else True,
         verbose=False,
         trust_remote_code=benchmark_config.trust_remote_code,
     )
euroeval/benchmark_modules/hf.py CHANGED
@@ -2,6 +2,7 @@
 
 import collections.abc as c
 import logging
+import re
 import typing as t
 from functools import cached_property, partial
 from json import JSONDecodeError
@@ -93,6 +94,7 @@ class HuggingFaceEncoderModel(BenchmarkModule):
     fresh_model = False
     batching_preference = BatchingPreference.NO_PREFERENCE
     high_priority = True
+    allowed_params = {re.compile(r".*"): ["slow-tokenizer"]}
 
     def __init__(
         self,
@@ -690,7 +692,7 @@ def load_model_and_tokeniser(
         model=model,
         model_id=model_id,
         trust_remote_code=benchmark_config.trust_remote_code,
-        model_cache_dir=model_config.model_cache_dir,
+        model_config=model_config,
     )
 
     return model, tokeniser
@@ -880,7 +882,7 @@ def load_tokeniser(
     model: "PreTrainedModel | None",
     model_id: str,
     trust_remote_code: bool,
-    model_cache_dir: str,
+    model_config: "ModelConfig",
 ) -> "PreTrainedTokenizer":
     """Load the tokeniser.
 
@@ -892,17 +894,19 @@
             The model identifier. Used for logging.
         trust_remote_code:
            Whether to trust remote code.
+        model_config:
+            The model configuration.
 
     Returns:
         The loaded tokeniser.
     """
     loading_kwargs: dict[str, bool | str] = dict(
-        use_fast=True,
+        use_fast=False if model_config.param == "slow-tokenizer" else True,
         verbose=False,
         trust_remote_code=trust_remote_code,
        padding_side="right",
         truncation_side="right",
-        cache_dir=model_cache_dir,
+        cache_dir=model_config.model_cache_dir,
     )
 
     # If the model is a subclass of a certain model types then we have to add a prefix
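As context for the new slow-tokenizer parameter in fresh.py, hf.py and vllm.py: it simply flips the use_fast flag when the tokeniser is loaded. A minimal sketch of that behaviour with plain transformers (the model name is an arbitrary example) looks like this:

from transformers import AutoTokenizer

# Hypothetical stand-in for `model_config.param == "slow-tokenizer"`
use_slow_tokeniser = True

tokeniser = AutoTokenizer.from_pretrained(
    "bert-base-cased",  # arbitrary example model
    use_fast=not use_slow_tokeniser,  # False selects the Python ("slow") tokeniser
)
print(type(tokeniser).__name__)  # BertTokenizer rather than BertTokenizerFast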
euroeval/benchmark_modules/litellm.py CHANGED
@@ -369,7 +369,8 @@ class LiteLLMModel(BenchmarkModule):
            ]
            logger.debug(
                f"Attempt {attempt + 1:,}/{num_attempts:,}: retrying "
-                f"{len(inputs_to_run):,} failed message(s)"
+                f"{len(inputs_to_run):,} failed message(s). Here is the first error: "
+                f"{failures[0][1]}."
            )
 
            # Attempt to handle the exceptions, to improve the chance of getting
@@ -453,8 +454,7 @@ class LiteLLMModel(BenchmarkModule):
        requires_thinking_disabled_messages = ["thinking.type: Field required"]
        seed_pattern = re.compile(r"does not support parameters: \[.*'seed'.*\]")
        response_format_messages = [
-            "got an unexpected keyword argument 'response_format'",
-            "The model outputs empty dictionaries.",
+            "got an unexpected keyword argument 'response_format'"
        ]
 
        if any(msg.lower() in error_msg for msg in stop_messages):
@@ -713,18 +713,6 @@ class LiteLLMModel(BenchmarkModule):
        ]
        responses = await tqdm_async.gather(*requests, leave=False)
 
-        # If we are performing structured generation and the model just outputs an empty
-        # dictionary, then we convert those to exceptions, to disable structured
-        # generation
-        if "response_format" in generation_kwargs:
-            responses = [
-                RuntimeError("The model outputs empty dictionaries.")
-                if not isinstance(response, Exception)
-                and any(choice.message.content == "{}" for choice in response.choices)
-                else response
-                for response in responses
-            ]
-
        # Separate the successful responses from the failed ones
        successes = [
            (idx, response)
@@ -984,7 +972,7 @@ class LiteLLMModel(BenchmarkModule):
            model=None,
            model_id=model_id,
            trust_remote_code=self.benchmark_config.trust_remote_code,
-            model_cache_dir=self.model_config.model_cache_dir,
+            model_config=self.model_config,
        )
 
        if (
@@ -1067,7 +1055,7 @@ class LiteLLMModel(BenchmarkModule):
            model=None,
            model_id=model_id,
            trust_remote_code=self.benchmark_config.trust_remote_code,
-            model_cache_dir=self.model_config.model_cache_dir,
+            model_config=self.model_config,
        )
 
        all_max_lengths: list[int] = list()
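The first litellm.py hunk above now logs the first captured exception when failed requests are retried. A standalone sketch of that gather-and-retry pattern (illustrative only, not EuroEval's actual implementation; all names are made up) could look like:

import asyncio
import logging

logger = logging.getLogger(__name__)


async def gather_with_retries(request_factories, num_attempts=3):
    """Await all requests, retrying failures and logging the first error seen."""
    results: dict[int, object] = {}
    pending = list(enumerate(request_factories))
    for attempt in range(num_attempts):
        responses = await asyncio.gather(
            *(factory() for _, factory in pending), return_exceptions=True
        )
        failures = []
        for (idx, factory), response in zip(pending, responses):
            if isinstance(response, Exception):
                failures.append((idx, factory, response))
            else:
                results[idx] = response
        if not failures:
            break
        logger.debug(
            f"Attempt {attempt + 1:,}/{num_attempts:,}: retrying "
            f"{len(failures):,} failed message(s). Here is the first error: "
            f"{failures[0][2]}."
        )
        pending = [(idx, factory) for idx, factory, _ in failures]
    return results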
euroeval/benchmark_modules/vllm.py CHANGED
@@ -104,7 +104,7 @@ class VLLMModel(HuggingFaceEncoderModel):
     fresh_model = False
     batching_preference = BatchingPreference.ALL_AT_ONCE
     high_priority = True
-    allowed_params = {re.compile(r".*"): ["thinking", "no-thinking"]}
+    allowed_params = {re.compile(r".*"): ["thinking", "no-thinking", "slow-tokenizer"]}
 
     def __init__(
         self,
@@ -559,11 +559,34 @@ class VLLMModel(HuggingFaceEncoderModel):
                torch.LongTensor(completion_id) for completion_id in completion_ids
            ]
        )
-        if self.end_of_reasoning_token is not None:
-            completions = [
-                completion.split(self.end_of_reasoning_token)[-1]
-                for completion in completions
-            ]
+        if (
+            self.end_of_reasoning_token is not None
+            and self.generative_type == GenerativeType.REASONING
+        ):
+            for idx in range(len(completions)):
+                if self.end_of_reasoning_token in completions[idx]:
+                    completions[idx] = completions[idx].split(
+                        self.end_of_reasoning_token
+                    )[-1]
+                elif self.benchmark_config.verbose:
+                    logger.warning(
+                        f"The model {self.model_config.model_id!r} is a reasoning "
+                        "model, but the generated output does not contain the end of "
+                        f"reasoning token ({self.end_of_reasoning_token!r}). Using "
+                        "an empty string as the prediction instead."
+                    )
+                    completions[idx] = ""
+                else:
+                    log_once(
+                        f"The model {self.model_config.model_id!r} is a reasoning "
+                        "model, but the generated output does not contain the end of "
+                        f"reasoning token ({self.end_of_reasoning_token!r}). Using "
+                        "an empty string as the prediction instead. Only showing "
+                        "this warning once - see all occurrences if you run with the "
+                        "`verbose` flag.",
+                        level=logging.WARNING,
+                    )
+                    completions[idx] = ""
        stop_token_pattern = re.compile(
            "|".join(re.escape(stop_token) for stop_token in stop_tokens)
        )
@@ -830,9 +853,12 @@ def load_model_and_tokeniser(
         adapter_base_model_id=model_config.adapter_base_model_id,
         trust_remote_code=benchmark_config.trust_remote_code,
         model_max_length=true_max_model_len,
-        model_cache_dir=model_config.model_cache_dir,
+        model_config=model_config,
         token=get_hf_token(api_key=benchmark_config.api_key),
     )
+    vllm_tokenisation_params = get_vllm_tokenisation_params(
+        tokeniser=tokeniser, model_config=model_config
+    )
 
     clear_vllm()
 
@@ -865,16 +891,7 @@
            enable_prefix_caching=False,
            enable_lora=model_config.adapter_base_model_id is not None,
            max_lora_rank=256,
-            # Special arguments in case we are dealing with a Mistral model
-            tokenizer_mode="mistral"
-            if isinstance(tokeniser, MistralCommonTokenizer)
-            else "auto",
-            config_format="mistral"
-            if isinstance(tokeniser, MistralCommonTokenizer)
-            else "auto",
-            load_format="mistral"
-            if isinstance(tokeniser, MistralCommonTokenizer)
-            else "auto",
+            **vllm_tokenisation_params,
        )
    except (RuntimeError, ValueError, OSError) as e:
        if "awaiting a review from the repo authors" in str(e):
@@ -903,7 +920,7 @@ def load_tokeniser(
    adapter_base_model_id: str | None,
    trust_remote_code: bool,
    model_max_length: int,
-    model_cache_dir: str,
+    model_config: "ModelConfig",
    token: str | bool,
 ) -> "PreTrainedTokenizer":
    """Load the tokeniser.
@@ -920,8 +937,8 @@
            Whether to trust remote code.
        model_max_length:
            The maximum length of the model.
-        model_cache_dir:
-            The cache directory for the model.
+        model_config:
+            The model configuration.
        token:
            The Hugging Face API token.
 
@@ -932,7 +949,7 @@
    config = AutoConfig.from_pretrained(
        adapter_base_model_id or model_id,
        revision=revision,
-        cache_dir=model_cache_dir,
+        cache_dir=model_config.model_cache_dir,
        token=token,
        trust_remote_code=trust_remote_code,
        local_files_only=not internet_connection_available(),
@@ -940,15 +957,25 @@
    num_retries = 5
    for _ in range(num_retries):
        try:
+            # Mistral instruction-tuned models need a custom tokeniser
+            if model_id.startswith("mistralai/") and "base" not in model_id.lower():
+                tokeniser = MistralCommonTokenizer.from_pretrained(
+                    model_id,
+                    padding_side="left",
+                    truncation_side="left",
+                    model_max_length=model_max_length,
+                    token=token,
+                )
+                break
            tokeniser = AutoTokenizer.from_pretrained(
                model_id,
-                use_fast=True,
+                use_fast=False if model_config.param == "slow-tokenizer" else True,
                verbose=False,
                trust_remote_code=trust_remote_code,
                padding_side="left",
                truncation_side="left",
                model_max_length=model_max_length,
-                cache_dir=model_cache_dir,
+                cache_dir=model_config.model_cache_dir,
                config=config,
                token=token,
                local_files_only=not internet_connection_available(),
@@ -1189,3 +1216,41 @@ def get_pbar_without_leave(*tqdm_args, **tqdm_kwargs) -> tqdm:
     """
     tqdm_kwargs.pop("leave", None)  # Remove the 'leave' key if it exists
     return tqdm(*tqdm_args, leave=False, **tqdm_kwargs)
+
+
+def get_vllm_tokenisation_params(
+    tokeniser: "PreTrainedTokenizer", model_config: "ModelConfig"
+) -> dict[str, t.Any]:
+    """Get the tokenisation parameters for vLLM.
+
+    Args:
+        tokeniser:
+            The tokeniser.
+        model_config:
+            The model configuration.
+
+    Returns:
+        A dictionary of tokenisation parameters to pass to vLLM.
+    """
+    if isinstance(tokeniser, MistralCommonTokenizer):
+        tokeniser_mode = "mistral"
+    elif model_config.param == "slow-tokenizer":
+        tokeniser_mode = "slow"
+    else:
+        tokeniser_mode = "auto"
+
+    if isinstance(tokeniser, MistralCommonTokenizer):
+        config_format = "mistral"
+    else:
+        config_format = "auto"
+
+    if isinstance(tokeniser, MistralCommonTokenizer):
+        load_format = "mistral"
+    else:
+        load_format = "auto"
+
+    return dict(
+        tokenizer_mode=tokeniser_mode,
+        config_format=config_format,
+        load_format=load_format,
+    )
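The dictionary returned by the new get_vllm_tokenisation_params helper is unpacked straight into vLLM's LLM constructor (see the **vllm_tokenisation_params line above). A hedged sketch of such a call site, with placeholder values, might look like:

from vllm import LLM

# Illustrative values only; EuroEval derives these from the loaded tokeniser and
# the model configuration.
vllm_tokenisation_params = dict(
    tokenizer_mode="slow",  # "mistral" for MistralCommonTokenizer, otherwise "auto"
    config_format="auto",
    load_format="auto",
)

llm = LLM(
    model="example-org/example-model",  # placeholder model ID
    gpu_memory_utilization=0.8,  # the new default in 16.3.0
    **vllm_tokenisation_params,
)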
euroeval/benchmarker.py CHANGED
@@ -12,6 +12,7 @@ from time import sleep
 
 from huggingface_hub.constants import HF_HUB_ENABLE_HF_TRANSFER
 from torch.distributed import destroy_process_group
+from tqdm.auto import tqdm
 
 from .benchmark_config_factory import build_benchmark_config
 from .constants import GENERATIVE_PIPELINE_TAGS
@@ -32,6 +33,7 @@ from .utils import (
     get_package_version,
     internet_connection_available,
     log_once,
+    split_model_id,
 )
 
 if t.TYPE_CHECKING:
@@ -82,7 +84,7 @@ class Benchmarker:
         num_iterations: int = 10,
         api_base: str | None = None,
         api_version: str | None = None,
-        gpu_memory_utilization: float = 0.9,
+        gpu_memory_utilization: float = 0.8,
         generative_type: GenerativeType | None = None,
         debug: bool = False,
         run_with_cli: bool = False,
@@ -607,46 +609,90 @@ class Benchmarker:
            dataset_names=benchmark_config.datasets
        )
 
-        total_benchmarks = len(model_ids) * len(dataset_configs)
-        num_finished_benchmarks = 0
-
-        current_benchmark_results: list[BenchmarkResult] = list()
-        for model_id in model_ids:
-            # Load the model configuration, or skip the model if it is invalid
+        # Get all the model configs
+        model_configs: list[ModelConfig] = list()
+        for model_id in tqdm(
+            iterable=model_ids,
+            desc="Fetching model configurations",
+            disable=not benchmark_config.verbose or not benchmark_config.progress_bar,
+        ):
            try:
                model_config = get_model_config(
                    model_id=model_id, benchmark_config=benchmark_config
                )
+                model_configs.append(model_config)
            except InvalidModel as e:
                logger.info(e.message)
-                num_finished_benchmarks += len(dataset_configs)
+
+        # Create a dictionary that takes each model config to the dataset configs that
+        # we need to benchmark the model on. Here we remove the datasets that the model
+        # has already been benchmarked on, or datasets that the model cannot be
+        # benchmarked on.
+        model_config_to_dataset_configs: dict[ModelConfig, list[DatasetConfig]] = {
+            model_config: [
+                dataset_config
+                for dataset_config in dataset_configs
+                if (
+                    benchmark_config.force
+                    or not model_has_been_benchmarked(
+                        model_config=model_config,
+                        dataset_config=dataset_config,
+                        benchmark_config=benchmark_config,
+                        benchmark_results=self.benchmark_results,
+                    )
+                )
+                and model_config.model_type in dataset_config.allowed_model_types
+            ]
+            for model_config in model_configs
+        }
+
+        total_benchmarks = sum(
+            len(dataset_configs)
+            for dataset_configs in model_config_to_dataset_configs.values()
+        )
+        if total_benchmarks == 0:
+            logger.info(
+                "No benchmarks to run, as all the selected models have already been "
+                "benchmarked on all the selected datasets."
+            )
+            return list()
+
+        logger.info(f"Initiated evaluation of {total_benchmarks:,} benchmarks.")
+
+        num_finished_benchmarks = 0
+        current_benchmark_results: list[BenchmarkResult] = list()
+        for model_config in model_configs:
+            if not model_config_to_dataset_configs[model_config]:
+                logger.debug(
+                    f"Skipping model {model_config.model_id!r} because it has "
+                    "already been benchmarked on all valid datasets."
+                )
                continue
 
            if model_config.adapter_base_model_id:
                open_issue_msg = (
-                    "If offline support is important to you, please "
-                    "consider opening an issue at https://github.com/EuroEval/EuroEval/issues."
+                    "If offline support is important to you, please consider opening "
+                    "an issue at https://github.com/EuroEval/EuroEval/issues."
                )
                if not internet_connection_available():
                    raise InvalidModel(
                        "Offline benchmarking of models with adapters is not currently "
-                        "supported. "
-                        f"An active internet connection is required. {open_issue_msg}"
+                        "supported. An active internet connection is required. "
+                        "{open_issue_msg}"
                    )
                elif benchmark_config.download_only:
                    log_once(
                        "You are using download only mode with a model that includes "
-                        "an adapter. "
-                        "Please note: Offline benchmarking of adapter models is not "
-                        "currently supported. "
-                        "An internet connection will be required during evaluation. "
+                        "an adapter. Please note that offline benchmarking of "
+                        "adapter models is not currently supported - an internet "
+                        "connection will be required during evaluation in this case. "
                        f"{open_issue_msg}",
                        level=logging.WARNING,
                    )
 
            loaded_model: BenchmarkModule | None = None
            benchmark_params_to_revert: dict[str, t.Any] = dict()
-            for dataset_config in dataset_configs:
+            for dataset_config in model_config_to_dataset_configs[model_config]:
                # Revert any changes to the benchmark configuration made for the
                # previous dataset
                for param, value in benchmark_params_to_revert.items():
@@ -674,34 +720,6 @@ class Benchmarker:
                    benchmark_params_to_revert["few_shot"] = True
                    benchmark_config.few_shot = False
 
-                # Skip if we have already benchmarked this model on this dataset and
-                # we are not forcing the benchmark
-                if not benchmark_config.force and model_has_been_benchmarked(
-                    model_id=model_id,
-                    dataset=dataset_config.name,
-                    few_shot=benchmark_config.few_shot,
-                    validation_split=not benchmark_config.evaluate_test_split,
-                    benchmark_results=self.benchmark_results,
-                ):
-                    logger.debug(
-                        f"Skipping benchmarking {model_id} on "
-                        f"{dataset_config.pretty_name}, as it has already been "
-                        "benchmarked."
-                    )
-                    num_finished_benchmarks += 1
-                    continue
-
-                # Skip if the model type should not be benchmarked on this dataset
-                model_type = model_config.model_type
-                allowed_model_types = dataset_config.allowed_model_types
-                if model_type not in allowed_model_types:
-                    logger.debug(
-                        f"Skipping benchmarking {model_id} on "
-                        f"{dataset_config.pretty_name}, as it is of type {model_type}, "
-                        f"and the only allowed model types are {allowed_model_types}."
-                    )
-                    continue
-
                # We do not re-initialise generative models as their architecture is not
                # customised to specific datasets
                if model_config.model_type == ModelType.GENERATIVE:
@@ -735,6 +753,22 @@ class Benchmarker:
                else:
                    loaded_model.dataset_config = dataset_config
 
+                # Skip the benchmark if the model is not of the correct
+                # generative type
+                if (
+                    loaded_model.generative_type
+                    not in dataset_config.allowed_generative_types
+                ):
+                    logger.debug(
+                        f"Skipping the benchmark of model "
+                        f"{model_config.model_id!r}on dataset "
+                        f"{dataset_config.name!r} because the model has generative "
+                        f"type {loaded_model.generative_type} and the dataset "
+                        f"only allows {dataset_config.allowed_generative_types}."
+                    )
+                    num_finished_benchmarks += 1
+                    continue
+
                # Benchmark a single model on a single dataset
                benchmark_output_or_err = self._benchmark_single(
                    model=loaded_model,
@@ -969,23 +1003,20 @@
 
 
 def model_has_been_benchmarked(
-    model_id: str,
-    dataset: str,
-    few_shot: bool,
-    validation_split: bool,
+    model_config: "ModelConfig",
+    dataset_config: "DatasetConfig",
+    benchmark_config: "BenchmarkConfig",
     benchmark_results: list[BenchmarkResult],
 ) -> bool:
     """Checks whether a model has already been benchmarked on a dataset.
 
     Args:
-        model_id:
-            The model ID.
-        dataset:
-            The dataset.
-        few_shot:
-            Whether the model was evaluated using few-shot evaluation.
-        validation_split:
-            Whether the model was evaluated on the validation split.
+        model_config:
+            The configuration of the model we are evaluating.
+        dataset_config:
+            The configuration of the dataset we are evaluating on.
+        benchmark_config:
+            The general benchmark configuration.
         benchmark_results:
            The benchmark results.
 
@@ -993,10 +1024,28 @@ def model_has_been_benchmarked(
        Whether the model has already been evaluated on the dataset.
     """
     for record in benchmark_results:
-        same_evaluation = record.model == model_id and record.dataset == dataset
-        same_validation_split_setting = record.validation_split == validation_split
-        same_few_shot_setting = record.few_shot == few_shot or not record.generative
-        if same_evaluation and same_validation_split_setting and same_few_shot_setting:
+        model_id_components = split_model_id(model_id=record.model)
+        same_model_id = model_id_components.model_id == model_config.model_id
+        same_revision = model_id_components.revision == model_config.revision
+        same_param = model_id_components.param == model_config.param
+        same_dataset = record.dataset == dataset_config.name
+        same_split = (
+            record.validation_split != benchmark_config.evaluate_test_split
+            or "val" not in dataset_config.splits
+        )
+        same_num_shots = (
+            record.few_shot == benchmark_config.few_shot
+            or not record.generative
+            or dataset_config.task.requires_zero_shot
+        )
+        if (
+            same_model_id
+            and same_revision
+            and same_param
+            and same_dataset
+            and same_split
+            and same_num_shots
+        ):
            return True
     return False
 
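The refactored model_has_been_benchmarked above matches cached results on the model ID, revision and parameter separately (via split_model_id) rather than on the raw record string. The standalone sketch below illustrates that idea with a made-up record format; it is not EuroEval's parser.

from dataclasses import dataclass


@dataclass(frozen=True)
class ModelIdComponents:
    """Hypothetical stand-in for the result of split_model_id."""

    model_id: str
    revision: str
    param: str | None


def parse_model_id(raw: str) -> ModelIdComponents:
    """Toy parser assuming a '<id>@<revision>#<param>' format."""
    raw, _, param = raw.partition("#")
    model_id, _, revision = raw.partition("@")
    return ModelIdComponents(model_id, revision or "main", param or None)


def already_benchmarked(record_model: str, target: ModelIdComponents) -> bool:
    """Match on every component instead of comparing raw model strings."""
    record = parse_model_id(record_model)
    return (
        record.model_id == target.model_id
        and record.revision == target.revision
        and record.param == target.param
    )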
euroeval/cli.py CHANGED
@@ -188,7 +188,7 @@ from .tasks import get_all_tasks
 )
 @click.option(
     "--gpu-memory-utilization",
-    default=0.9,
+    default=0.8,
     show_default=True,
     help="The GPU memory utilization to use for vLLM. A larger value will result in "
     "faster evaluation, but at the risk of running out of GPU memory. Only reduce this "
euroeval/constants.py CHANGED
@@ -50,9 +50,11 @@ METRIC_ATTRIBUTES_TAKING_UP_MEMORY = ["cached_bertscorer"]
 # Hugging Face Hub tags used to classify models as merge models
 MERGE_TAGS = ["merge", "mergekit"]
 
+
 # The minimum required CUDA compute capability for using bfloat16 in vLLM
 VLLM_BF16_MIN_CUDA_COMPUTE_CAPABILITY = 8.0
 
+
 # Used to detect whether a model is a reasoning model
 REASONING_TOKENS = [
     ("<think>", "</think>"),
@@ -60,6 +62,7 @@ REASONING_TOKENS = [
     ("<reasoning>", "</reasoning>"),
 ]
 
+
 # These tokens are sometimes used by models to indicate the end of a generated
 # response, but they do not use them as a proper EOS token, so we have to deal with them
 # manually. We only use them as stop tokens if they actually appear in the model's
euroeval/dataset_configs/__init__.py CHANGED
@@ -14,6 +14,7 @@ from .german import *  # noqa: F403
 from .icelandic import *  # noqa: F403
 from .italian import *  # noqa: F403
 from .latvian import *  # noqa: F403
+from .lithuanian import *  # noqa: F403
 from .norwegian import *  # noqa: F403
 from .polish import *  # noqa: F403
 from .portuguese import *  # noqa: F403
euroeval/dataset_configs/danish.py CHANGED
@@ -1,7 +1,6 @@
 """All Danish dataset configurations used in EuroEval."""
 
 from ..data_models import DatasetConfig
-from ..enums import ModelType
 from ..languages import DA
 from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
 
@@ -159,7 +158,6 @@ WINOGRANDE_DA_CONFIG = DatasetConfig(
     languages=[DA],
     splits=["train", "test"],
     _labels=["a", "b"],
-    _allowed_model_types=[ModelType.GENERATIVE],
     unofficial=True,
 )
 
euroeval/dataset_configs/dutch.py CHANGED
@@ -1,7 +1,6 @@
 """All Dutch dataset configurations used in EuroEval."""
 
 from ..data_models import DatasetConfig
-from ..enums import ModelType
 from ..languages import NL
 from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
 
@@ -152,7 +151,6 @@ WINOGRANDE_NL_CONFIG = DatasetConfig(
     languages=[NL],
     splits=["train", "test"],
     _labels=["a", "b"],
-    _allowed_model_types=[ModelType.GENERATIVE],
     unofficial=True,
 )
 
euroeval/dataset_configs/english.py CHANGED
@@ -1,7 +1,6 @@
 """All English dataset configurations used in EuroEval."""
 
 from ..data_models import DatasetConfig
-from ..enums import ModelType
 from ..languages import EN
 from ..tasks import COMMON_SENSE, EUROPEAN_VALUES, KNOW, LA, MCRC, NER, RC, SENT, SUMM
 
@@ -135,7 +134,6 @@ WINOGRANDE_CONFIG = DatasetConfig(
     languages=[EN],
     splits=["train", "test"],
     _labels=["a", "b"],
-    _allowed_model_types=[ModelType.GENERATIVE],
     unofficial=True,
 )
 