EuroEval 15.15.0__py3-none-any.whl → 16.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of EuroEval might be problematic.

Files changed (63)
  1. euroeval/__init__.py +3 -7
  2. euroeval/benchmark_config_factory.py +3 -7
  3. euroeval/benchmark_modules/base.py +35 -19
  4. euroeval/benchmark_modules/fresh.py +24 -19
  5. euroeval/benchmark_modules/hf.py +136 -154
  6. euroeval/benchmark_modules/litellm.py +323 -193
  7. euroeval/benchmark_modules/vllm.py +166 -112
  8. euroeval/benchmarker.py +59 -33
  9. euroeval/cli.py +3 -3
  10. euroeval/constants.py +13 -15
  11. euroeval/data_loading.py +33 -28
  12. euroeval/data_models.py +53 -7
  13. euroeval/dataset_configs/__init__.py +2 -0
  14. euroeval/dataset_configs/danish.py +38 -1
  15. euroeval/dataset_configs/dutch.py +38 -1
  16. euroeval/dataset_configs/english.py +38 -1
  17. euroeval/dataset_configs/estonian.py +95 -0
  18. euroeval/dataset_configs/faroese.py +38 -0
  19. euroeval/dataset_configs/finnish.py +39 -1
  20. euroeval/dataset_configs/french.py +38 -1
  21. euroeval/dataset_configs/german.py +38 -1
  22. euroeval/dataset_configs/icelandic.py +39 -1
  23. euroeval/dataset_configs/italian.py +38 -1
  24. euroeval/dataset_configs/latvian.py +81 -0
  25. euroeval/dataset_configs/norwegian.py +38 -1
  26. euroeval/dataset_configs/portuguese.py +38 -1
  27. euroeval/dataset_configs/spanish.py +38 -1
  28. euroeval/dataset_configs/swedish.py +38 -1
  29. euroeval/enums.py +0 -6
  30. euroeval/finetuning.py +8 -7
  31. euroeval/generation.py +25 -14
  32. euroeval/generation_utils.py +46 -14
  33. euroeval/languages.py +947 -187
  34. euroeval/metrics/__init__.py +6 -0
  35. euroeval/metrics/base.py +76 -0
  36. euroeval/metrics/huggingface.py +192 -0
  37. euroeval/metrics/llm_as_a_judge.py +257 -0
  38. euroeval/metrics/pipeline.py +234 -0
  39. euroeval/metrics/speed.py +51 -0
  40. euroeval/prompt_templates/linguistic_acceptability.py +40 -2
  41. euroeval/prompt_templates/multiple_choice.py +23 -2
  42. euroeval/prompt_templates/named_entity_recognition.py +65 -2
  43. euroeval/prompt_templates/reading_comprehension.py +42 -2
  44. euroeval/prompt_templates/sentiment_classification.py +46 -2
  45. euroeval/prompt_templates/summarization.py +24 -4
  46. euroeval/scores.py +7 -2
  47. euroeval/speed_benchmark.py +6 -6
  48. euroeval/task_group_utils/multiple_choice_classification.py +17 -6
  49. euroeval/task_group_utils/question_answering.py +35 -28
  50. euroeval/task_group_utils/sequence_classification.py +96 -23
  51. euroeval/task_group_utils/text_to_text.py +7 -3
  52. euroeval/task_group_utils/token_classification.py +47 -75
  53. euroeval/tasks.py +31 -6
  54. euroeval/tokenization_utils.py +295 -207
  55. euroeval/utils.py +118 -34
  56. {euroeval-15.15.0.dist-info → euroeval-16.0.0.dist-info}/METADATA +12 -14
  57. euroeval-16.0.0.dist-info/RECORD +69 -0
  58. {euroeval-15.15.0.dist-info → euroeval-16.0.0.dist-info}/entry_points.txt +0 -1
  59. euroeval/human_evaluation.py +0 -738
  60. euroeval/metrics.py +0 -468
  61. euroeval-15.15.0.dist-info/RECORD +0 -63
  62. {euroeval-15.15.0.dist-info → euroeval-16.0.0.dist-info}/WHEEL +0 -0
  63. {euroeval-15.15.0.dist-info → euroeval-16.0.0.dist-info}/licenses/LICENSE +0 -0
euroeval/benchmark_modules/vllm.py

@@ -5,7 +5,6 @@ import contextlib
 import importlib.util
 import json
 import logging
-import os
 import re
 import typing as t
 from functools import partial
@@ -16,6 +15,7 @@ import torch
 from huggingface_hub import snapshot_download
 from pydantic import conlist, create_model
 from tqdm.auto import tqdm
+from transformers import MistralCommonTokenizer
 from transformers.models.auto.configuration_auto import AutoConfig
 from transformers.models.auto.tokenization_auto import AutoTokenizer
 from urllib3.exceptions import RequestError
@@ -24,11 +24,10 @@ from ..constants import (
     CUSTOM_STOP_TOKENS,
     GENERATIVE_PIPELINE_TAGS,
     MAX_CONTEXT_LENGTH,
-    MAX_LOGPROBS,
+    MAX_VLLM_LOGPROBS,
     MERGE_TAGS,
     REASONING_MAX_TOKENS,
     REASONING_TOKENS,
-    TASKS_USING_JSON,
     VLLM_BF16_MIN_CUDA_COMPUTE_CAPABILITY,
 )
 from ..data_models import GenerativeModelOutput, ModelConfig
@@ -54,17 +53,20 @@ from ..task_group_utils import (
     token_classification,
 )
 from ..tokenization_utils import (
+    apply_chat_template,
     get_bos_token,
     get_end_of_chat_token_ids,
     get_eos_token,
     get_first_label_token_mapping,
     get_pad_token,
+    has_chat_template,
     should_prompts_be_stripped,
 )
 from ..types import ExtractLabelsFunction
 from ..utils import (
     clear_memory,
     create_model_cache_dir,
+    get_hf_token,
     get_min_cuda_compute_capability,
     log_once,
 )
@@ -79,9 +81,6 @@ if t.TYPE_CHECKING or importlib.util.find_spec("vllm") is not None:
     from vllm.lora.request import LoRARequest
     from vllm.sampling_params import GuidedDecodingParams

-if t.TYPE_CHECKING or importlib.util.find_spec("ray") is not None:
-    import ray
-
 if t.TYPE_CHECKING:
     from datasets import DatasetDict
     from transformers.tokenization_utils import PreTrainedTokenizer
@@ -104,6 +103,7 @@ class VLLMModel(HuggingFaceEncoderModel):
         model_config: "ModelConfig",
         dataset_config: "DatasetConfig",
         benchmark_config: "BenchmarkConfig",
+        log_metadata: bool = True,
     ) -> None:
         """Initialise the vLLM model.

@@ -114,27 +114,26 @@ class VLLMModel(HuggingFaceEncoderModel):
                 The dataset configuration.
             benchmark_config:
                 The benchmark configuration.
+            log_metadata:
+                Whether to log the model and dataset metadata.
         """
-        if (
-            importlib.util.find_spec("vllm") is None
-            or importlib.util.find_spec("ray") is None
-        ):
+        if importlib.util.find_spec("vllm") is None:
             raise NeedsExtraInstalled(extra="generative")

-        model, tokenizer = load_model_and_tokenizer(
+        model, tokeniser = load_model_and_tokeniser(
             model_config=model_config, benchmark_config=benchmark_config
         )
         self._model: "LLM" = model
-        self._tokenizer: "PreTrainedTokenizer" = tokenizer
+        self._tokeniser: "PreTrainedTokenizer" = tokeniser
         self.end_of_reasoning_token = get_end_of_reasoning_token(
-            model=self._model, tokenizer=self._tokenizer, model_id=model_config.model_id
+            model=self._model, tokeniser=self._tokeniser, model_id=model_config.model_id
         )
         self.end_of_chat_token_ids = get_end_of_chat_token_ids(
-            tokenizer=self._tokenizer
+            tokeniser=self._tokeniser
         )
         self.custom_stop_tokens = get_custom_stop_tokens(
             model=self._model,
-            tokenizer=self._tokenizer,
+            tokeniser=self._tokeniser,
             model_id=model_config.model_id,
             is_reasoning_model=self.end_of_reasoning_token is not None,
         )
@@ -145,15 +144,17 @@ class VLLMModel(HuggingFaceEncoderModel):
             model_config=model_config,
             dataset_config=dataset_config,
             benchmark_config=benchmark_config,
+            log_metadata=log_metadata,
         )

         self.buffer |= dict(
-            instruction_model=self._tokenizer.chat_template is not None,
+            instruction_model=has_chat_template(tokeniser=self._tokeniser),
             first_label_token_mapping=get_first_label_token_mapping(
                 dataset_config=self.dataset_config,
                 model_config=self.model_config,
-                tokenizer=self._tokenizer,
+                tokeniser=self._tokeniser,
                 generative_type=self.generative_type,
+                log_metadata=self.log_metadata,
             ),
         )
         if self.model_config.adapter_base_model_id is not None:
@@ -167,12 +168,16 @@ class VLLMModel(HuggingFaceEncoderModel):
             )

     def __del__(self) -> None:
-        """Clean up the model and tokenizer."""
-        clear_vllm()
+        """Clean up the model and tokeniser."""
+        try:
+            if importlib.util.find_spec("vllm") is not None:
+                clear_vllm()
+        except ImportError:
+            pass
         if hasattr(self, "_model"):
             del self._model
-        if hasattr(self, "_tokenizer"):
-            del self._tokenizer
+        if hasattr(self, "_tokeniser"):
+            del self._tokeniser

     @property
     def generative_type(self) -> GenerativeType | None:
@@ -181,12 +186,12 @@ class VLLMModel(HuggingFaceEncoderModel):
         Returns:
             The generative type of the model, or None if it has not been set yet.
         """
-        if not hasattr(self, "_tokenizer"):
+        if not hasattr(self, "_tokeniser"):
             return None
         elif self.end_of_reasoning_token is not None:
             return GenerativeType.REASONING
         elif (
-            self._tokenizer.chat_template is not None
+            has_chat_template(tokeniser=self._tokeniser)
             or "instruct" in self.model_config.model_id.lower()
         ):
             return GenerativeType.INSTRUCTION_TUNED
@@ -266,7 +271,10 @@ class VLLMModel(HuggingFaceEncoderModel):

         if self.benchmark_config.few_shot:
             few_shot_examples = extract_few_shot_examples(
-                dataset=dataset, dataset_config=self.dataset_config, itr_idx=itr_idx
+                dataset=dataset,
+                dataset_config=self.dataset_config,
+                benchmark_config=self.benchmark_config,
+                itr_idx=itr_idx,
             )
         else:
             few_shot_examples = list()
@@ -279,7 +287,7 @@ class VLLMModel(HuggingFaceEncoderModel):
                 dataset_config=self.dataset_config,
                 instruction_model=self.buffer["instruction_model"],
                 always_populate_text_field=True,
-                tokenizer=self._tokenizer,
+                tokeniser=self._tokeniser,
             ),
             batched=True,
             load_from_cache_file=False,
@@ -297,35 +305,40 @@ class VLLMModel(HuggingFaceEncoderModel):

         Returns:
             The generated model outputs.
+
+        Raises:
+            InvalidBenchmark:
+                If the dataset requires logprobs, but we could not get the first token
+                of each label in the dataset.
         """
         # Get stopping tokens
         stop_tokens: list[str] = self.custom_stop_tokens.copy()
         if self.buffer["instruction_model"] is False:
             stop_tokens.append("\n\n")
-        if self._tokenizer.pad_token_id is not None:
-            assert isinstance(self._tokenizer.pad_token, str), (
+        if self._tokeniser.pad_token_id is not None:
+            assert isinstance(self._tokeniser.pad_token, str), (
                 f"The pad token for the model {self.model_config.model_id!r} "
-                f"is not a string, which is unexpected: {self._tokenizer.pad_token!r}."
+                f"is not a string, which is unexpected: {self._tokeniser.pad_token!r}."
             )
-            stop_tokens.append(self._tokenizer.pad_token)
-        if self._tokenizer.eos_token_id is not None:
-            assert isinstance(self._tokenizer.eos_token, str), (
+            stop_tokens.append(self._tokeniser.pad_token)
+        if self._tokeniser.eos_token_id is not None:
+            assert isinstance(self._tokeniser.eos_token, str), (
                 f"The EOS token for the model {self.model_config.model_id!r} "
-                f"is not a string, which is unexpected: {self._tokenizer.eos_token!r}."
+                f"is not a string, which is unexpected: {self._tokeniser.eos_token!r}."
             )
-            stop_tokens.append(self._tokenizer.eos_token)
-            if self._tokenizer.pad_token_id is None:
-                self._tokenizer.pad_token_id = self._tokenizer.eos_token_id
-                self._tokenizer.pad_token = self._tokenizer.eos_token
+            stop_tokens.append(self._tokeniser.eos_token)
+            if self._tokeniser.pad_token_id is None:
+                self._tokeniser.pad_token_id = self._tokeniser.eos_token_id
+                self._tokeniser.pad_token = self._tokeniser.eos_token
         if self.end_of_chat_token_ids is not None:
-            end_of_chat_token = self._tokenizer.decode(
+            end_of_chat_token = self._tokeniser.decode(
                 self.end_of_chat_token_ids
             ).strip()
             if end_of_chat_token:
                 stop_tokens.append(end_of_chat_token)

         structured_generation_schema = None
-        if self.dataset_config.task in TASKS_USING_JSON:
+        if self.dataset_config.task.uses_structured_output:
             if self.generative_type == GenerativeType.REASONING:
                 log_once(
                     f"The model {self.model_config.model_id!r} is a reasoning model "
@@ -354,9 +367,33 @@ class VLLMModel(HuggingFaceEncoderModel):
         self.buffer["first_label_token_mapping"] = get_first_label_token_mapping(
             dataset_config=self.dataset_config,
             model_config=self.model_config,
-            tokenizer=self._tokenizer,
+            tokeniser=self._tokeniser,
             generative_type=self.generative_type,
+            log_metadata=self.log_metadata,
         )
+        if (
+            not self.buffer["first_label_token_mapping"]
+            and self.dataset_config.task.requires_logprobs
+        ):
+            raise InvalidBenchmark(
+                "The dataset requires logprobs, but we encountered an error when "
+                "trying to get the first token of each label in the dataset. You can "
+                "try running this benchmark with the --verbose flag to see what the "
+                "error was. Skipping this evaluation."
+            )
+
+        # Define the guided decoding that we will use for structured generation
+        if structured_generation_schema is not None:
+            guided_decoding = GuidedDecodingParams(json=structured_generation_schema)
+        elif self.dataset_config.task.uses_logprobs and self.dataset_config.labels:
+            guided_decoding = GuidedDecodingParams(
+                choice=[
+                    self.dataset_config.prompt_label_mapping[label]
+                    for label in self.dataset_config.labels
+                ]
+            )
+        else:
+            guided_decoding = None

         # Define the parameters used for vLLM generation
         max_tokens: int = (
@@ -366,14 +403,12 @@
         )
         sampling_params = SamplingParams(
             max_tokens=max_tokens,
-            logprobs=MAX_LOGPROBS if self.buffer["first_label_token_mapping"] else None,
+            logprobs=MAX_VLLM_LOGPROBS
+            if self.buffer["first_label_token_mapping"]
+            else None,
             temperature=0.0,
             stop=[stop_token for stop_token in stop_tokens if stop_token],
-            guided_decoding=(
-                GuidedDecodingParams(json=structured_generation_schema)
-                if structured_generation_schema
-                else None
-            ),
+            guided_decoding=guided_decoding,
         )

         # If any of the prompts are empty then we need to replace them with a BOS token
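
As context for the two hunks above: vLLM's guided decoding can constrain generation either to a JSON schema or to a fixed set of choices, and the choice-based path is what the new `guided_decoding` variable selects for classification-style tasks. A minimal standalone sketch follows; the model ID, labels and sampling values are illustrative placeholders rather than EuroEval defaults, and a recent vLLM release is assumed.

from vllm import LLM, SamplingParams
from vllm.sampling_params import GuidedDecodingParams

# Constrain the completion to one of the allowed label names.
labels = ["positive", "neutral", "negative"]
llm = LLM(model="Qwen/Qwen2.5-0.5B-Instruct")  # placeholder model ID
sampling_params = SamplingParams(
    max_tokens=8,
    temperature=0.0,
    logprobs=10,
    guided_decoding=GuidedDecodingParams(choice=labels),
)
outputs = llm.generate(["Sentiment of 'great product': "], sampling_params)
print(outputs[0].outputs[0].text)  # prints one of the three labels
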
@@ -382,7 +417,7 @@
         if any(len(prompt) == 0 for prompt in prompts):
             logger.debug("Found empty prompts, replacing with BOS token.")
             prompts = [
-                prompt if len(prompt) > 0 else str(self._tokenizer.bos_token)
+                prompt if len(prompt) > 0 else str(self._tokeniser.bos_token)
                 for prompt in prompts
             ]

@@ -393,7 +428,7 @@
         if not self.buffer.get(
             "instruction_model", False
         ) and should_prompts_be_stripped(
-            labels_to_be_generated=labels_to_be_generated, tokenizer=self._tokenizer
+            labels_to_be_generated=labels_to_be_generated, tokeniser=self._tokeniser
         ):
             log_once(
                 f"Stripping prompts for model {self.model_config.model_id!r}.",
@@ -431,22 +466,22 @@
                         "Prompts are too long, so truncating them and trying again..."
                     )
                     logger.debug(f"The error message was: {str(e)}")
-                    tokenized_prompts = self._tokenizer(
+                    tokenized_prompts = self._tokeniser(
                        text=prompts,
                        truncation=True,
                        max_length=max(
-                            min(self._tokenizer.model_max_length, MAX_CONTEXT_LENGTH)
+                            min(self._tokeniser.model_max_length, MAX_CONTEXT_LENGTH)
                            - max_tokens,
                            0,
                        ),
                    )
-                    prompts = self._tokenizer.batch_decode(
+                    prompts = self._tokeniser.batch_decode(
                        sequences=tokenized_prompts.input_ids, skip_special_tokens=True
                    )
                else:
                    raise InvalidBenchmark(
                        f"An error occurred during vLLM generation: {str(e)}"
-                    )
+                    ) from e
        else:
            raise InvalidBenchmark(
                f"Could not generate sequences after {num_attempts} attempts."
@@ -476,7 +511,7 @@ class VLLMModel(HuggingFaceEncoderModel):
         completion_ids: list[list[int]] = [
             output.outputs[0].token_ids for output in raw_outputs
         ]
-        completions = self._tokenizer.batch_decode(
+        completions = self._tokeniser.batch_decode(
             sequences=[
                 torch.LongTensor(completion_id) for completion_id in completion_ids
             ]
@@ -624,10 +659,10 @@
         )


-def load_model_and_tokenizer(
+def load_model_and_tokeniser(
     model_config: "ModelConfig", benchmark_config: "BenchmarkConfig"
 ) -> tuple["LLM", "PreTrainedTokenizer"]:
-    """Load the model and tokenizer.
+    """Load the model and tokeniser.

     Args:
         model_config:
@@ -636,7 +671,7 @@ def load_model_and_tokenizer(
             The benchmark configuration.

     Returns:
-        A pair (model, tokenizer), with the loaded model and tokenizer
+        A pair (model, tokeniser), with the loaded model and tokeniser
     """
     # Prefer base model ID if the model is an adapter - the adapter will be added on
     # during inference in this case
@@ -674,7 +709,7 @@
     dtype: str | torch.dtype = "auto"

     # Choose bf16 over fp16 if the model is a fp32 model and the GPU supports it
-    if hf_model_config.torch_dtype == torch.float32:
+    if hf_model_config.dtype == torch.float32:
         if torch.cuda.is_bf16_supported():
             logger.info(
                 "You are loading a model with dtype FP32, which we will convert to "
@@ -690,29 +725,33 @@
             )
             dtype = torch.float16

-    # If the model is a quantized model, we need to set the dtype to float16
-    if quantization is not None and hf_model_config.torch_dtype != torch.float16:
+    # If the model is a quantized model, we might need to change the dtype
+    if quantization == "mxfp4" and hf_model_config.dtype is None:
+        dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
+        logger.debug(
+            "You are loading a quantized model where `dtype` has not been set. "
+            f"Setting dtype to {dtype!r}."
+        )
+    elif quantization is not None and hf_model_config.dtype != torch.float16:
         logger.info(
             "You are loading a quantized model with dtype "
-            f"{hf_model_config.torch_dtype}, which vLLM does not support. Setting "
+            f"{hf_model_config.dtype}, which vLLM does not support. Setting "
             "dtype to float16 instead."
         )
         dtype = torch.float16

     # If the model is a bf16 model, we need to check the CUDA compute capability
-    if hf_model_config.torch_dtype == torch.bfloat16:
+    if hf_model_config.dtype == torch.bfloat16:
         min_cuda_compute_capability = get_min_cuda_compute_capability()
         required_capability = VLLM_BF16_MIN_CUDA_COMPUTE_CAPABILITY

         if min_cuda_compute_capability is not None:
             if min_cuda_compute_capability < required_capability:
                 logger.info(
-                    "You are loading a model with "
-                    f"dtype {hf_model_config.torch_dtype}, "
-                    "which vLLM only supports for CUDA devices with"
-                    f"CUDA compute capability >={required_capability}. "
-                    "You are using one or more devices with "
-                    f"compute capability {min_cuda_compute_capability}. "
+                    f"You are loading a model with dtype {hf_model_config.dtype}, "
+                    "which vLLM only supports for CUDA devices with CUDA compute "
+                    f"capability >={required_capability}. You are using one or more "
+                    f"devices with compute capability {min_cuda_compute_capability}. "
                     "Setting dtype to float16 instead."
                 )
                 dtype = torch.float16
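
The dtype selection above hinges on whether every visible GPU supports bfloat16 and reaches a minimum CUDA compute capability. A minimal sketch of such a check with plain PyTorch follows; the threshold is a placeholder, not EuroEval's constant, and the helper name is invented for illustration.

import torch

REQUIRED_CAPABILITY = 8.0  # placeholder threshold

def min_compute_capability() -> float | None:
    # Return the lowest compute capability across all visible CUDA devices.
    if not torch.cuda.is_available():
        return None
    return min(
        float("{}.{}".format(*torch.cuda.get_device_capability(idx)))
        for idx in range(torch.cuda.device_count())
    )

capability = min_compute_capability()
use_bf16 = (
    torch.cuda.is_available()
    and torch.cuda.is_bf16_supported()
    and capability is not None
    and capability >= REQUIRED_CAPABILITY
)
dtype = torch.bfloat16 if use_bf16 else torch.float16
print(dtype)
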
@@ -740,14 +779,14 @@
     else:
         true_max_model_len = MAX_CONTEXT_LENGTH

-    tokenizer = load_tokenizer(
+    tokeniser = load_tokeniser(
         model_id=model_config.model_id,
         revision=model_config.revision,
         adapter_base_model_id=model_config.adapter_base_model_id,
         trust_remote_code=benchmark_config.trust_remote_code,
         model_max_length=true_max_model_len,
         model_cache_dir=model_config.model_cache_dir,
-        token=benchmark_config.api_key or os.getenv("HUGGINGFACE_API_KEY") or True,
+        token=get_hf_token(api_key=benchmark_config.api_key),
     )

     clear_vllm()
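
The `token=` argument now goes through `get_hf_token` from euroeval/utils.py, whose body is not shown in this file's diff. A hypothetical sketch of what such a resolution might look like, mirroring the old inline expression it replaces, could be:

import os

def resolve_hf_token(api_key: str | None = None) -> str | bool:
    # Hypothetical stand-in for get_hf_token; the real implementation may differ.
    token = api_key or os.getenv("HUGGINGFACE_API_KEY")
    # Returning True lets huggingface_hub fall back to any cached local login.
    return token if token else True

print(resolve_hf_token())
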
@@ -762,9 +801,7 @@
             trust_remote_code=benchmark_config.trust_remote_code,
             revision=revision,
             seed=4242,
-            distributed_executor_backend=(
-                "ray" if torch.cuda.device_count() > 1 else "mp"
-            ),
+            distributed_executor_backend="mp",
             tensor_parallel_size=torch.cuda.device_count(),
             disable_custom_all_reduce=True,
             quantization=quantization,
@@ -775,29 +812,39 @@
             enable_prefix_caching=False,
             enable_lora=model_config.adapter_base_model_id is not None,
             max_lora_rank=256,
+            # Special arguments in case we are dealing with a Mistral model
+            tokenizer_mode="mistral"
+            if isinstance(tokeniser, MistralCommonTokenizer)
+            else "auto",
+            config_format="mistral"
+            if isinstance(tokeniser, MistralCommonTokenizer)
+            else "auto",
+            load_format="mistral"
+            if isinstance(tokeniser, MistralCommonTokenizer)
+            else "auto",
         )
     except (RuntimeError, ValueError, OSError) as e:
         if "awaiting a review from the repo authors" in str(e):
             raise InvalidModel(
                 f"The model {model_id!r} is awaiting a review from the repository "
                 "authors. Please try again later."
-            )
+            ) from e
         elif "trust_remote_code" in str(e):
             raise InvalidModel(
                 f"Loading the model {model_id!r} needs to trust remote code. "
                 "If you trust the suppliers of this model, then you can enable "
                 "this by setting the `--trust-remote-code` flag."
-            )
+            ) from e
         raise InvalidModel(
             f"The model {model_id!r} could not be loaded. The error was {e!r}."
-        )
+        ) from e

     model.config = hf_model_config

-    return model, tokenizer
+    return model, tokeniser


-def load_tokenizer(
+def load_tokeniser(
     model_id: str,
     revision: str,
     adapter_base_model_id: str | None,
@@ -806,7 +853,7 @@
     model_cache_dir: str,
     token: str | bool,
 ) -> "PreTrainedTokenizer":
-    """Load the tokenizer.
+    """Load the tokeniser.

     Args:
         model_id:
@@ -826,7 +873,7 @@
             The Hugging Face API token.

     Returns:
-        The loaded tokenizer.
+        The loaded tokeniser.
     """
     revision = revision if adapter_base_model_id is None else "main"
     config = AutoConfig.from_pretrained(
@@ -839,7 +886,7 @@
     num_retries = 5
     for _ in range(num_retries):
         try:
-            tokenizer = AutoTokenizer.from_pretrained(
+            tokeniser = AutoTokenizer.from_pretrained(
                 model_id,
                 use_fast=True,
                 verbose=False,
@@ -854,30 +901,45 @@
         except (json.JSONDecodeError, OSError, TypeError) as e:
             if adapter_base_model_id is None or model_id == adapter_base_model_id:
                 raise InvalidModel(
-                    f"Could not load tokenizer for model {model_id!r}. The error was "
+                    f"Could not load tokeniser for model {model_id!r}. The error was "
                     f"{str(e)}."
-                )
+                ) from e
             logger.debug(
-                f"Could not load tokenizer for {model_id!r}. Falling back to "
+                f"Could not load tokeniser for {model_id!r}. Falling back to "
                 f"{adapter_base_model_id!r}."
             )
             model_id = adapter_base_model_id
         except (TimeoutError, RequestError):
-            logger.info(f"Couldn't load tokenizer for {model_id!r}. Retrying.")
+            logger.info(f"Couldn't load tokeniser for {model_id!r}. Retrying.")
             sleep(5)
             continue
+        except (KeyError, ValueError) as e:
+            if "mistral" in str(e).lower():
+                tokeniser = MistralCommonTokenizer.from_pretrained(
+                    model_id,
+                    padding_side="left",
+                    truncation_side="left",
+                    model_max_length=model_max_length,
+                    token=token,
+                )
+                break
+            raise InvalidModel(
+                f"Could not load tokeniser for model {model_id!r}. The error was "
+                f"{str(e)}."
+            ) from e
     else:
         raise InvalidModel(
-            f"Could not load tokenizer for model {model_id!r} after {num_retries} "
+            f"Could not load tokeniser for model {model_id!r} after {num_retries} "
             "attempts."
         )

     # Ensure that BOS, EOS and PAD tokens are set
-    tokenizer.bos_token, tokenizer.bos_token_id = get_bos_token(tokenizer=tokenizer)
-    tokenizer.eos_token, tokenizer.eos_token_id = get_eos_token(tokenizer=tokenizer)
-    tokenizer.pad_token, tokenizer.pad_token_id = get_pad_token(tokenizer=tokenizer)
+    if not isinstance(tokeniser, MistralCommonTokenizer):
+        tokeniser.bos_token, tokeniser.bos_token_id = get_bos_token(tokeniser=tokeniser)
+        tokeniser.eos_token, tokeniser.eos_token_id = get_eos_token(tokeniser=tokeniser)
+        tokeniser.pad_token, tokeniser.pad_token_id = get_pad_token(tokeniser=tokeniser)

-    return tokenizer
+    return tokeniser


 def clear_vllm() -> None:
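
The new `except (KeyError, ValueError)` branch covers Mistral models that can only be loaded with the mistral-common tokeniser. Outside EuroEval, the same fallback pattern looks roughly like the sketch below; the model ID is a placeholder, and a transformers version that exports MistralCommonTokenizer is assumed.

from transformers import AutoTokenizer, MistralCommonTokenizer

model_id = "mistralai/Magistral-Small-2506"  # placeholder model ID

try:
    tokeniser = AutoTokenizer.from_pretrained(model_id, use_fast=True)
except (KeyError, ValueError) as e:
    if "mistral" not in str(e).lower():
        raise
    # AutoTokenizer cannot load mistral-common tokenisers, so use the wrapper.
    tokeniser = MistralCommonTokenizer.from_pretrained(
        model_id, padding_side="left", truncation_side="left"
    )

print(type(tokeniser).__name__)
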
@@ -885,25 +947,21 @@
     with contextlib.suppress(ValueError):
         destroy_model_parallel()
         destroy_distributed_environment()
-        if ray.is_initialized():
-            ray.shutdown()
     with contextlib.suppress(AssertionError):
         torch.distributed.destroy_process_group()
-        if ray.is_initialized():
-            ray.shutdown()
     clear_memory()


 def get_end_of_reasoning_token(
-    model: "LLM", tokenizer: "PreTrainedTokenizer", model_id: str
+    model: "LLM", tokeniser: "PreTrainedTokenizer", model_id: str
 ) -> str | None:
     """Get the end-of-reasoning token for a generative model.

     Args:
         model:
             The vLLM model.
-        tokenizer:
-            The tokenizer.
+        tokeniser:
+            The tokeniser.
         model_id:
             The model ID.

@@ -912,11 +970,9 @@
     """
     # Create a prompt to check if the model uses the reasoning tokens
     prompt = "What is your name?"
-    if tokenizer.chat_template is not None:
-        templated_prompt = tokenizer.apply_chat_template(
-            conversation=[dict(role="user", content=prompt)],
-            add_generation_prompt=True,
-            tokenize=False,
+    if has_chat_template(tokeniser=tokeniser):
+        templated_prompt = apply_chat_template(
+            conversation=[dict(role="user", content=prompt)], tokeniser=tokeniser
         )
         assert isinstance(templated_prompt, str)
         prompt = templated_prompt
@@ -941,7 +997,7 @@
             f"The model {model_id!r} did not generate any beginning-of-reasoning "
             "tokens in the prompt or the completion. Assuming the model is not "
             "a reasoning model.",
-            level=logging.INFO,
+            level=logging.DEBUG,
         )
         return None

@@ -967,7 +1023,7 @@
             "the beginning-of-reasoning tokens "
             f"{[bor_token for bor_token, _ in bor_reasoning_matches]!r}. "
             "This is probably not correct, so please report this issue.",
-            level=logging.INFO,
+            level=logging.WARNING,
         )
         return None

@@ -977,14 +1033,14 @@
             f"model {model_id!r}. Using {eor_reasoning_matches[0]!r} as "
             "the reasoning token. If this is not the correct reasoning token, "
             "please report this issue.",
-            level=logging.INFO,
+            level=logging.WARNING,
         )

     bor_token, eor_token = eor_reasoning_matches[0]
     log_once(
         f"Detected beginning-of-reasoning token {bor_token!r} and end-of-reasoning "
         f"token {eor_token!r} for model {model_id!r}.",
-        level=logging.INFO,
+        level=logging.DEBUG,
     )

     return eor_token
@@ -992,7 +1048,7 @@

 def get_custom_stop_tokens(
     model: "LLM",
-    tokenizer: "PreTrainedTokenizer",
+    tokeniser: "PreTrainedTokenizer",
     model_id: str,
     is_reasoning_model: bool,
 ) -> list[str]:
@@ -1001,8 +1057,8 @@
     Args:
         model:
             The vLLM model.
-        tokenizer:
-            The tokenizer.
+        tokeniser:
+            The tokeniser.
         model_id:
             The model ID.
         is_reasoning_model:
@@ -1015,11 +1071,9 @@
     candidate_stop_tokens = CUSTOM_STOP_TOKENS

     prompt = "Hello"
-    if tokenizer.chat_template is not None:
-        templated_prompt = tokenizer.apply_chat_template(
-            conversation=[dict(role="user", content=prompt)],
-            add_generation_prompt=True,
-            tokenize=False,
+    if has_chat_template(tokeniser=tokeniser):
+        templated_prompt = apply_chat_template(
+            conversation=[dict(role="user", content=prompt)], tokeniser=tokeniser
        )
        assert isinstance(templated_prompt, str)
        prompt = templated_prompt
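
For reference, the inline logic removed in this hunk and in the earlier get_end_of_reasoning_token hunk corresponds to the plain Hugging Face chat-template API, which the new has_chat_template / apply_chat_template helpers in euroeval/tokenization_utils.py replace. A standalone sketch with a placeholder model ID:

from transformers import AutoTokenizer

tokeniser = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")  # placeholder
prompt = "Hello"
if tokeniser.chat_template is not None:
    # Render the single-turn conversation into the model's chat format.
    prompt = tokeniser.apply_chat_template(
        conversation=[dict(role="user", content=prompt)],
        add_generation_prompt=True,
        tokenize=False,
    )
print(prompt)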