EuroEval 16.2.1-py3-none-any.whl → 16.3.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (39)
  1. euroeval/__init__.py +4 -2
  2. euroeval/benchmark_modules/fresh.py +3 -1
  3. euroeval/benchmark_modules/hf.py +8 -4
  4. euroeval/benchmark_modules/litellm.py +5 -17
  5. euroeval/benchmark_modules/vllm.py +98 -30
  6. euroeval/benchmarker.py +291 -405
  7. euroeval/cli.py +1 -1
  8. euroeval/constants.py +3 -0
  9. euroeval/data_models.py +35 -35
  10. euroeval/dataset_configs/__init__.py +1 -0
  11. euroeval/dataset_configs/danish.py +0 -2
  12. euroeval/dataset_configs/dutch.py +0 -2
  13. euroeval/dataset_configs/english.py +0 -2
  14. euroeval/dataset_configs/finnish.py +0 -2
  15. euroeval/dataset_configs/french.py +0 -2
  16. euroeval/dataset_configs/german.py +0 -2
  17. euroeval/dataset_configs/italian.py +0 -2
  18. euroeval/dataset_configs/latvian.py +2 -3
  19. euroeval/dataset_configs/lithuanian.py +62 -0
  20. euroeval/dataset_configs/norwegian.py +0 -2
  21. euroeval/dataset_configs/polish.py +0 -2
  22. euroeval/dataset_configs/portuguese.py +0 -2
  23. euroeval/dataset_configs/spanish.py +0 -2
  24. euroeval/dataset_configs/swedish.py +0 -3
  25. euroeval/metrics/huggingface.py +1 -1
  26. euroeval/metrics/pipeline.py +5 -0
  27. euroeval/prompt_templates/linguistic_acceptability.py +9 -0
  28. euroeval/prompt_templates/multiple_choice.py +9 -0
  29. euroeval/prompt_templates/named_entity_recognition.py +20 -0
  30. euroeval/prompt_templates/reading_comprehension.py +10 -0
  31. euroeval/prompt_templates/sentiment_classification.py +11 -0
  32. euroeval/tokenisation_utils.py +8 -8
  33. euroeval/utils.py +10 -5
  34. {euroeval-16.2.1.dist-info → euroeval-16.3.0.dist-info}/METADATA +181 -60
  35. euroeval-16.3.0.dist-info/RECORD +71 -0
  36. euroeval-16.2.1.dist-info/RECORD +0 -70
  37. {euroeval-16.2.1.dist-info → euroeval-16.3.0.dist-info}/WHEEL +0 -0
  38. {euroeval-16.2.1.dist-info → euroeval-16.3.0.dist-info}/entry_points.txt +0 -0
  39. {euroeval-16.2.1.dist-info → euroeval-16.3.0.dist-info}/licenses/LICENSE +0 -0
euroeval/__init__.py CHANGED
@@ -103,8 +103,10 @@ os.environ["DISABLE_AIOHTTP_TRANSPORT"] = "True"
  os.environ["VLLM_USE_V1"] = "1"


- # Use the FlashInfer flash-attention backend for vLLM
- os.environ["VLLM_ATTENTION_BACKEND"] = "FLASHINFER"
+ # Use the FlashInfer flash-attention backend for vLLM, unless the user has already
+ # specified a different backend.
+ if os.getenv("VLLM_ATTENTION_BACKEND") is None:
+     os.environ["VLLM_ATTENTION_BACKEND"] = "FLASHINFER"


  # Set the HF_TOKEN env var to copy the HUGGINGFACE_API_KEY env var, as vLLM uses the
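
Note that a backend chosen by the user now takes precedence: EuroEval only falls back to FlashInfer when the environment variable is unset. A minimal sketch of the intended usage, where the FLASH_ATTN value is only an illustrative choice and not something the package requires:

import os

# Pick a different vLLM attention backend before importing euroeval; as of 16.3.0 the
# package leaves an existing value untouched and only defaults to FLASHINFER otherwise.
os.environ["VLLM_ATTENTION_BACKEND"] = "FLASH_ATTN"  # illustrative value

import euroeval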
euroeval/benchmark_modules/fresh.py CHANGED
@@ -1,5 +1,6 @@
  """Freshly initialised encoder models."""

+ import re
  import typing as t
  from functools import cached_property
  from json import JSONDecodeError
@@ -45,6 +46,7 @@ class FreshEncoderModel(HuggingFaceEncoderModel):
      """A freshly initialised encoder model."""

      fresh_model = True
+     allowed_params = {re.compile(r".*"): ["slow-tokenizer"]}

      def __init__(
          self,
@@ -294,7 +296,7 @@ def load_model_and_tokeniser(
          token=get_hf_token(api_key=benchmark_config.api_key),
          add_prefix_space=prefix,
          cache_dir=model_config.model_cache_dir,
-         use_fast=True,
+         use_fast=False if model_config.param == "slow-tokenizer" else True,
          verbose=False,
          trust_remote_code=benchmark_config.trust_remote_code,
      )
euroeval/benchmark_modules/hf.py CHANGED
@@ -2,6 +2,7 @@

  import collections.abc as c
  import logging
+ import re
  import typing as t
  from functools import cached_property, partial
  from json import JSONDecodeError
@@ -93,6 +94,7 @@ class HuggingFaceEncoderModel(BenchmarkModule):
      fresh_model = False
      batching_preference = BatchingPreference.NO_PREFERENCE
      high_priority = True
+     allowed_params = {re.compile(r".*"): ["slow-tokenizer"]}

      def __init__(
          self,
@@ -690,7 +692,7 @@ def load_model_and_tokeniser(
          model=model,
          model_id=model_id,
          trust_remote_code=benchmark_config.trust_remote_code,
-         model_cache_dir=model_config.model_cache_dir,
+         model_config=model_config,
      )

      return model, tokeniser
@@ -880,7 +882,7 @@ def load_tokeniser(
      model: "PreTrainedModel | None",
      model_id: str,
      trust_remote_code: bool,
-     model_cache_dir: str,
+     model_config: "ModelConfig",
  ) -> "PreTrainedTokenizer":
      """Load the tokeniser.

@@ -892,17 +894,19 @@
          The model identifier. Used for logging.
      trust_remote_code:
          Whether to trust remote code.
+     model_config:
+         The model configuration.

      Returns:
          The loaded tokeniser.
      """
      loading_kwargs: dict[str, bool | str] = dict(
-         use_fast=True,
+         use_fast=False if model_config.param == "slow-tokenizer" else True,
          verbose=False,
          trust_remote_code=trust_remote_code,
          padding_side="right",
          truncation_side="right",
-         cache_dir=model_cache_dir,
+         cache_dir=model_config.model_cache_dir,
      )

      # If the model is a subclass of a certain model types then we have to add a prefix
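
The use_fast toggle above is what the new slow-tokenizer model parameter controls. As a minimal sketch of the underlying transformers behaviour (the model id below is purely illustrative and not part of EuroEval):

from transformers import AutoTokenizer

model_id = "bert-base-cased"  # illustrative model

# use_fast=True (the previous hard-coded value) prefers the Rust-backed "fast"
# tokeniser when one exists; use_fast=False forces the pure-Python "slow"
# implementation, which is what the `slow-tokenizer` parameter now selects.
fast_tok = AutoTokenizer.from_pretrained(model_id, use_fast=True)
slow_tok = AutoTokenizer.from_pretrained(model_id, use_fast=False)

print(type(fast_tok).__name__)  # BertTokenizerFast
print(type(slow_tok).__name__)  # BertTokenizer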
euroeval/benchmark_modules/litellm.py CHANGED
@@ -369,7 +369,8 @@ class LiteLLMModel(BenchmarkModule):
      ]
      logger.debug(
          f"Attempt {attempt + 1:,}/{num_attempts:,}: retrying "
-         f"{len(inputs_to_run):,} failed message(s)"
+         f"{len(inputs_to_run):,} failed message(s). Here is the first error: "
+         f"{failures[0][1]}."
      )

      # Attempt to handle the exceptions, to improve the chance of getting
@@ -453,8 +454,7 @@ class LiteLLMModel(BenchmarkModule):
      requires_thinking_disabled_messages = ["thinking.type: Field required"]
      seed_pattern = re.compile(r"does not support parameters: \[.*'seed'.*\]")
      response_format_messages = [
-         "got an unexpected keyword argument 'response_format'",
-         "The model outputs empty dictionaries.",
+         "got an unexpected keyword argument 'response_format'"
      ]

      if any(msg.lower() in error_msg for msg in stop_messages):
@@ -713,18 +713,6 @@ class LiteLLMModel(BenchmarkModule):
      ]
      responses = await tqdm_async.gather(*requests, leave=False)

-     # If we are performing structured generation and the model just outputs an empty
-     # dictionary, then we convert those to exceptions, to disable structured
-     # generation
-     if "response_format" in generation_kwargs:
-         responses = [
-             RuntimeError("The model outputs empty dictionaries.")
-             if not isinstance(response, Exception)
-             and any(choice.message.content == "{}" for choice in response.choices)
-             else response
-             for response in responses
-         ]
-
      # Separate the successful responses from the failed ones
      successes = [
          (idx, response)
@@ -984,7 +972,7 @@ class LiteLLMModel(BenchmarkModule):
          model=None,
          model_id=model_id,
          trust_remote_code=self.benchmark_config.trust_remote_code,
-         model_cache_dir=self.model_config.model_cache_dir,
+         model_config=self.model_config,
      )

      if (
@@ -1067,7 +1055,7 @@ class LiteLLMModel(BenchmarkModule):
          model=None,
          model_id=model_id,
          trust_remote_code=self.benchmark_config.trust_remote_code,
-         model_cache_dir=self.model_config.model_cache_dir,
+         model_config=self.model_config,
      )

      all_max_lengths: list[int] = list()
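
The retry loop above now logs the first captured failure instead of only the count. A rough, self-contained sketch of that logging pattern with made-up placeholder failures (the (index, exception) pairs mirror the structure used in the diff, but the values are invented):

import logging

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger("euroeval-sketch")

attempt, num_attempts = 0, 3
failures = [(4, RuntimeError("rate limit exceeded")), (9, TimeoutError("request timed out"))]
inputs_to_run = [idx for idx, _ in failures]

logger.debug(
    f"Attempt {attempt + 1:,}/{num_attempts:,}: retrying "
    f"{len(inputs_to_run):,} failed message(s). Here is the first error: "
    f"{failures[0][1]}."
)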
euroeval/benchmark_modules/vllm.py CHANGED
@@ -104,7 +104,7 @@ class VLLMModel(HuggingFaceEncoderModel):
      fresh_model = False
      batching_preference = BatchingPreference.ALL_AT_ONCE
      high_priority = True
-     allowed_params = {re.compile(r".*"): ["thinking", "no-thinking"]}
+     allowed_params = {re.compile(r".*"): ["thinking", "no-thinking", "slow-tokenizer"]}

      def __init__(
          self,
@@ -559,11 +559,34 @@ class VLLMModel(HuggingFaceEncoderModel):
              torch.LongTensor(completion_id) for completion_id in completion_ids
          ]
      )
-     if self.end_of_reasoning_token is not None:
-         completions = [
-             completion.split(self.end_of_reasoning_token)[-1]
-             for completion in completions
-         ]
+     if (
+         self.end_of_reasoning_token is not None
+         and self.generative_type == GenerativeType.REASONING
+     ):
+         for idx in range(len(completions)):
+             if self.end_of_reasoning_token in completions[idx]:
+                 completions[idx] = completions[idx].split(
+                     self.end_of_reasoning_token
+                 )[-1]
+             elif self.benchmark_config.verbose:
+                 logger.warning(
+                     f"The model {self.model_config.model_id!r} is a reasoning "
+                     "model, but the generated output does not contain the end of "
+                     f"reasoning token ({self.end_of_reasoning_token!r}). Using "
+                     "an empty string as the prediction instead."
+                 )
+                 completions[idx] = ""
+             else:
+                 log_once(
+                     f"The model {self.model_config.model_id!r} is a reasoning "
+                     "model, but the generated output does not contain the end of "
+                     f"reasoning token ({self.end_of_reasoning_token!r}). Using "
+                     "an empty string as the prediction instead. Only showing "
+                     "this warning once - see all occurrences if you run with the "
+                     "`verbose` flag.",
+                     level=logging.WARNING,
+                 )
+                 completions[idx] = ""
      stop_token_pattern = re.compile(
          "|".join(re.escape(stop_token) for stop_token in stop_tokens)
      )
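
The reworked block above only strips reasoning traces for models whose generative type is REASONING, and falls back to an empty prediction when the end-of-reasoning token never appears in the output. A standalone sketch of that behaviour, using </think> as an example token (EuroEval derives the actual token from the model, so the value here is only illustrative):

def strip_reasoning(completion: str, end_of_reasoning_token: str = "</think>") -> str:
    """Keep only the text after the reasoning trace, or nothing if the trace never ends."""
    if end_of_reasoning_token in completion:
        return completion.split(end_of_reasoning_token)[-1]
    # Mirrors the new fallback: an unterminated reasoning trace yields an empty prediction.
    return ""

print(strip_reasoning("<think>chain of thought</think> positive"))  # ' positive'
print(strip_reasoning("<think>still reasoning when generation stopped"))  # ''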
@@ -830,21 +853,27 @@ def load_model_and_tokeniser(
          adapter_base_model_id=model_config.adapter_base_model_id,
          trust_remote_code=benchmark_config.trust_remote_code,
          model_max_length=true_max_model_len,
-         model_cache_dir=model_config.model_cache_dir,
+         model_config=model_config,
          token=get_hf_token(api_key=benchmark_config.api_key),
      )
+     vllm_tokenisation_params = get_vllm_tokenisation_params(
+         tokeniser=tokeniser, model_config=model_config
+     )

      clear_vllm()

-     # if we do not have an internet connection we need to give the path to the folder
-     # that contains the model weights and config files, otherwise vLLM will try to
-     # download them regardless if they are already present in the download_dir
-     model_path = resolve_model_path(download_dir)
-
      try:
          model = LLM(
-             model=model_id if internet_connection_available() else model_path,
-             tokenizer=model_id if internet_connection_available() else model_path,
+             model=(
+                 model_id
+                 if internet_connection_available()
+                 else resolve_model_path(download_dir=download_dir)
+             ),
+             tokenizer=(
+                 model_id
+                 if internet_connection_available()
+                 else resolve_model_path(download_dir=download_dir)
+             ),
              gpu_memory_utilization=benchmark_config.gpu_memory_utilization,
              max_model_len=min(true_max_model_len, MAX_CONTEXT_LENGTH),
              download_dir=download_dir,
@@ -862,16 +891,7 @@
              enable_prefix_caching=False,
              enable_lora=model_config.adapter_base_model_id is not None,
              max_lora_rank=256,
-             # Special arguments in case we are dealing with a Mistral model
-             tokenizer_mode="mistral"
-             if isinstance(tokeniser, MistralCommonTokenizer)
-             else "auto",
-             config_format="mistral"
-             if isinstance(tokeniser, MistralCommonTokenizer)
-             else "auto",
-             load_format="mistral"
-             if isinstance(tokeniser, MistralCommonTokenizer)
-             else "auto",
+             **vllm_tokenisation_params,
          )
      except (RuntimeError, ValueError, OSError) as e:
          if "awaiting a review from the repo authors" in str(e):
@@ -900,7 +920,7 @@ def load_tokeniser(
      adapter_base_model_id: str | None,
      trust_remote_code: bool,
      model_max_length: int,
-     model_cache_dir: str,
+     model_config: "ModelConfig",
      token: str | bool,
  ) -> "PreTrainedTokenizer":
      """Load the tokeniser.
@@ -917,8 +937,8 @@
          Whether to trust remote code.
      model_max_length:
          The maximum length of the model.
-     model_cache_dir:
-         The cache directory for the model.
+     model_config:
+         The model configuration.
      token:
          The Hugging Face API token.

@@ -929,7 +949,7 @@
      config = AutoConfig.from_pretrained(
          adapter_base_model_id or model_id,
          revision=revision,
-         cache_dir=model_cache_dir,
+         cache_dir=model_config.model_cache_dir,
          token=token,
          trust_remote_code=trust_remote_code,
          local_files_only=not internet_connection_available(),
@@ -937,15 +957,25 @@
      num_retries = 5
      for _ in range(num_retries):
          try:
+             # Mistral instruction-tuned models need a custom tokeniser
+             if model_id.startswith("mistralai/") and "base" not in model_id.lower():
+                 tokeniser = MistralCommonTokenizer.from_pretrained(
+                     model_id,
+                     padding_side="left",
+                     truncation_side="left",
+                     model_max_length=model_max_length,
+                     token=token,
+                 )
+                 break
              tokeniser = AutoTokenizer.from_pretrained(
                  model_id,
-                 use_fast=True,
+                 use_fast=False if model_config.param == "slow-tokenizer" else True,
                  verbose=False,
                  trust_remote_code=trust_remote_code,
                  padding_side="left",
                  truncation_side="left",
                  model_max_length=model_max_length,
-                 cache_dir=model_cache_dir,
+                 cache_dir=model_config.model_cache_dir,
                  config=config,
                  token=token,
                  local_files_only=not internet_connection_available(),
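
The Mistral branch above is selected purely from the model id. A tiny sketch of that predicate (the helper name is made up for illustration; the condition itself is copied from the diff, and the second model id is hypothetical):

def needs_mistral_tokeniser(model_id: str) -> bool:
    # Instruction-tuned mistralai models get MistralCommonTokenizer, while ids
    # containing "base" keep the regular AutoTokenizer path.
    return model_id.startswith("mistralai/") and "base" not in model_id.lower()

print(needs_mistral_tokeniser("mistralai/Mistral-7B-Instruct-v0.3"))  # True
print(needs_mistral_tokeniser("mistralai/Mistral-7B-Base"))  # False (hypothetical id)
print(needs_mistral_tokeniser("google/gemma-2-9b-it"))  # False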
@@ -1186,3 +1216,41 @@ def get_pbar_without_leave(*tqdm_args, **tqdm_kwargs) -> tqdm:
      """
      tqdm_kwargs.pop("leave", None) # Remove the 'leave' key if it exists
      return tqdm(*tqdm_args, leave=False, **tqdm_kwargs)
+
+
+ def get_vllm_tokenisation_params(
+     tokeniser: "PreTrainedTokenizer", model_config: "ModelConfig"
+ ) -> dict[str, t.Any]:
+     """Get the tokenisation parameters for vLLM.
+
+     Args:
+         tokeniser:
+             The tokeniser.
+         model_config:
+             The model configuration.
+
+     Returns:
+         A dictionary of tokenisation parameters to pass to vLLM.
+     """
+     if isinstance(tokeniser, MistralCommonTokenizer):
+         tokeniser_mode = "mistral"
+     elif model_config.param == "slow-tokenizer":
+         tokeniser_mode = "slow"
+     else:
+         tokeniser_mode = "auto"
+
+     if isinstance(tokeniser, MistralCommonTokenizer):
+         config_format = "mistral"
+     else:
+         config_format = "auto"
+
+     if isinstance(tokeniser, MistralCommonTokenizer):
+         load_format = "mistral"
+     else:
+         load_format = "auto"
+
+     return dict(
+         tokenizer_mode=tokeniser_mode,
+         config_format=config_format,
+         load_format=load_format,
+     )
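
The dictionary returned by get_vllm_tokenisation_params is splatted into the LLM(...) call shown earlier. A rough usage sketch with the possible return values written out by hand (the model id is illustrative, and actually constructing an LLM requires a GPU):

from vllm import LLM

# Shapes of the dictionaries the helper can return, per the branches above.
mistral_params = {"tokenizer_mode": "mistral", "config_format": "mistral", "load_format": "mistral"}
slow_params = {"tokenizer_mode": "slow", "config_format": "auto", "load_format": "auto"}
default_params = {"tokenizer_mode": "auto", "config_format": "auto", "load_format": "auto"}

# The chosen dictionary is passed straight through to the vLLM constructor.
model = LLM(model="mistralai/Mistral-7B-Instruct-v0.3", **mistral_params)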