EuroEval: euroeval-15.16.0-py3-none-any.whl → euroeval-16.0.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (63)
  1. euroeval/__init__.py +3 -7
  2. euroeval/benchmark_config_factory.py +3 -7
  3. euroeval/benchmark_modules/base.py +35 -19
  4. euroeval/benchmark_modules/fresh.py +24 -19
  5. euroeval/benchmark_modules/hf.py +136 -154
  6. euroeval/benchmark_modules/litellm.py +190 -110
  7. euroeval/benchmark_modules/vllm.py +161 -114
  8. euroeval/benchmarker.py +49 -22
  9. euroeval/cli.py +3 -3
  10. euroeval/constants.py +13 -15
  11. euroeval/data_loading.py +33 -28
  12. euroeval/data_models.py +53 -7
  13. euroeval/dataset_configs/__init__.py +2 -0
  14. euroeval/dataset_configs/danish.py +38 -1
  15. euroeval/dataset_configs/dutch.py +38 -1
  16. euroeval/dataset_configs/english.py +38 -1
  17. euroeval/dataset_configs/estonian.py +95 -0
  18. euroeval/dataset_configs/faroese.py +38 -0
  19. euroeval/dataset_configs/finnish.py +39 -1
  20. euroeval/dataset_configs/french.py +38 -1
  21. euroeval/dataset_configs/german.py +38 -1
  22. euroeval/dataset_configs/icelandic.py +39 -1
  23. euroeval/dataset_configs/italian.py +38 -1
  24. euroeval/dataset_configs/latvian.py +81 -0
  25. euroeval/dataset_configs/norwegian.py +38 -1
  26. euroeval/dataset_configs/portuguese.py +38 -1
  27. euroeval/dataset_configs/spanish.py +38 -1
  28. euroeval/dataset_configs/swedish.py +38 -1
  29. euroeval/enums.py +0 -6
  30. euroeval/finetuning.py +6 -6
  31. euroeval/generation.py +25 -14
  32. euroeval/generation_utils.py +46 -14
  33. euroeval/languages.py +947 -187
  34. euroeval/metrics/__init__.py +6 -0
  35. euroeval/metrics/base.py +76 -0
  36. euroeval/metrics/huggingface.py +192 -0
  37. euroeval/metrics/llm_as_a_judge.py +257 -0
  38. euroeval/metrics/pipeline.py +234 -0
  39. euroeval/metrics/speed.py +51 -0
  40. euroeval/prompt_templates/linguistic_acceptability.py +40 -2
  41. euroeval/prompt_templates/multiple_choice.py +23 -2
  42. euroeval/prompt_templates/named_entity_recognition.py +65 -2
  43. euroeval/prompt_templates/reading_comprehension.py +42 -2
  44. euroeval/prompt_templates/sentiment_classification.py +46 -2
  45. euroeval/prompt_templates/summarization.py +24 -4
  46. euroeval/scores.py +7 -2
  47. euroeval/speed_benchmark.py +6 -6
  48. euroeval/task_group_utils/multiple_choice_classification.py +17 -6
  49. euroeval/task_group_utils/question_answering.py +35 -28
  50. euroeval/task_group_utils/sequence_classification.py +96 -23
  51. euroeval/task_group_utils/text_to_text.py +7 -3
  52. euroeval/task_group_utils/token_classification.py +47 -75
  53. euroeval/tasks.py +31 -6
  54. euroeval/tokenization_utils.py +295 -207
  55. euroeval/utils.py +118 -34
  56. {euroeval-15.16.0.dist-info → euroeval-16.0.0.dist-info}/METADATA +11 -14
  57. euroeval-16.0.0.dist-info/RECORD +69 -0
  58. {euroeval-15.16.0.dist-info → euroeval-16.0.0.dist-info}/entry_points.txt +0 -1
  59. euroeval/human_evaluation.py +0 -738
  60. euroeval/metrics.py +0 -470
  61. euroeval-15.16.0.dist-info/RECORD +0 -63
  62. {euroeval-15.16.0.dist-info → euroeval-16.0.0.dist-info}/WHEEL +0 -0
  63. {euroeval-15.16.0.dist-info → euroeval-16.0.0.dist-info}/licenses/LICENSE +0 -0
@@ -5,7 +5,6 @@ import contextlib
 import importlib.util
 import json
 import logging
-import os
 import re
 import typing as t
 from functools import partial
@@ -16,6 +15,7 @@ import torch
 from huggingface_hub import snapshot_download
 from pydantic import conlist, create_model
 from tqdm.auto import tqdm
+from transformers import MistralCommonTokenizer
 from transformers.models.auto.configuration_auto import AutoConfig
 from transformers.models.auto.tokenization_auto import AutoTokenizer
 from urllib3.exceptions import RequestError
@@ -24,11 +24,10 @@ from ..constants import (
     CUSTOM_STOP_TOKENS,
     GENERATIVE_PIPELINE_TAGS,
     MAX_CONTEXT_LENGTH,
-    MAX_LOGPROBS,
+    MAX_VLLM_LOGPROBS,
     MERGE_TAGS,
     REASONING_MAX_TOKENS,
     REASONING_TOKENS,
-    TASKS_USING_JSON,
     VLLM_BF16_MIN_CUDA_COMPUTE_CAPABILITY,
 )
 from ..data_models import GenerativeModelOutput, ModelConfig
@@ -54,17 +53,20 @@ from ..task_group_utils import (
     token_classification,
 )
 from ..tokenization_utils import (
+    apply_chat_template,
     get_bos_token,
     get_end_of_chat_token_ids,
     get_eos_token,
     get_first_label_token_mapping,
     get_pad_token,
+    has_chat_template,
     should_prompts_be_stripped,
 )
 from ..types import ExtractLabelsFunction
 from ..utils import (
     clear_memory,
     create_model_cache_dir,
+    get_hf_token,
     get_min_cuda_compute_capability,
     log_once,
 )
@@ -79,9 +81,6 @@ if t.TYPE_CHECKING or importlib.util.find_spec("vllm") is not None:
     from vllm.lora.request import LoRARequest
     from vllm.sampling_params import GuidedDecodingParams
 
-    if t.TYPE_CHECKING or importlib.util.find_spec("ray") is not None:
-        import ray
-
 if t.TYPE_CHECKING:
     from datasets import DatasetDict
     from transformers.tokenization_utils import PreTrainedTokenizer
@@ -104,6 +103,7 @@ class VLLMModel(HuggingFaceEncoderModel):
         model_config: "ModelConfig",
         dataset_config: "DatasetConfig",
         benchmark_config: "BenchmarkConfig",
+        log_metadata: bool = True,
     ) -> None:
         """Initialise the vLLM model.
 
@@ -114,27 +114,26 @@ class VLLMModel(HuggingFaceEncoderModel):
                 The dataset configuration.
             benchmark_config:
                 The benchmark configuration.
+            log_metadata:
+                Whether to log the model and dataset metadata.
         """
-        if (
-            importlib.util.find_spec("vllm") is None
-            or importlib.util.find_spec("ray") is None
-        ):
+        if importlib.util.find_spec("vllm") is None:
             raise NeedsExtraInstalled(extra="generative")
 
-        model, tokenizer = load_model_and_tokenizer(
+        model, tokeniser = load_model_and_tokeniser(
             model_config=model_config, benchmark_config=benchmark_config
         )
         self._model: "LLM" = model
-        self._tokenizer: "PreTrainedTokenizer" = tokenizer
+        self._tokeniser: "PreTrainedTokenizer" = tokeniser
         self.end_of_reasoning_token = get_end_of_reasoning_token(
-            model=self._model, tokenizer=self._tokenizer, model_id=model_config.model_id
+            model=self._model, tokeniser=self._tokeniser, model_id=model_config.model_id
         )
         self.end_of_chat_token_ids = get_end_of_chat_token_ids(
-            tokenizer=self._tokenizer
+            tokeniser=self._tokeniser
         )
         self.custom_stop_tokens = get_custom_stop_tokens(
             model=self._model,
-            tokenizer=self._tokenizer,
+            tokeniser=self._tokeniser,
             model_id=model_config.model_id,
             is_reasoning_model=self.end_of_reasoning_token is not None,
         )
@@ -145,15 +144,17 @@ class VLLMModel(HuggingFaceEncoderModel):
             model_config=model_config,
             dataset_config=dataset_config,
             benchmark_config=benchmark_config,
+            log_metadata=log_metadata,
         )
 
         self.buffer |= dict(
-            instruction_model=self._tokenizer.chat_template is not None,
+            instruction_model=has_chat_template(tokeniser=self._tokeniser),
             first_label_token_mapping=get_first_label_token_mapping(
                 dataset_config=self.dataset_config,
                 model_config=self.model_config,
-                tokenizer=self._tokenizer,
+                tokeniser=self._tokeniser,
                 generative_type=self.generative_type,
+                log_metadata=self.log_metadata,
             ),
         )
         if self.model_config.adapter_base_model_id is not None:
@@ -167,13 +168,16 @@ class VLLMModel(HuggingFaceEncoderModel):
             )
 
     def __del__(self) -> None:
-        """Clean up the model and tokenizer."""
-        if importlib.util.find_spec("vllm") is not None:
-            clear_vllm()
+        """Clean up the model and tokeniser."""
+        try:
+            if importlib.util.find_spec("vllm") is not None:
+                clear_vllm()
+        except ImportError:
+            pass
         if hasattr(self, "_model"):
             del self._model
-        if hasattr(self, "_tokenizer"):
-            del self._tokenizer
+        if hasattr(self, "_tokeniser"):
+            del self._tokeniser
 
     @property
     def generative_type(self) -> GenerativeType | None:
@@ -182,12 +186,12 @@ class VLLMModel(HuggingFaceEncoderModel):
         Returns:
             The generative type of the model, or None if it has not been set yet.
         """
-        if not hasattr(self, "_tokenizer"):
+        if not hasattr(self, "_tokeniser"):
             return None
         elif self.end_of_reasoning_token is not None:
             return GenerativeType.REASONING
         elif (
-            self._tokenizer.chat_template is not None
+            has_chat_template(tokeniser=self._tokeniser)
             or "instruct" in self.model_config.model_id.lower()
         ):
             return GenerativeType.INSTRUCTION_TUNED
@@ -267,7 +271,10 @@ class VLLMModel(HuggingFaceEncoderModel):
 
         if self.benchmark_config.few_shot:
             few_shot_examples = extract_few_shot_examples(
-                dataset=dataset, dataset_config=self.dataset_config, itr_idx=itr_idx
+                dataset=dataset,
+                dataset_config=self.dataset_config,
+                benchmark_config=self.benchmark_config,
+                itr_idx=itr_idx,
             )
         else:
             few_shot_examples = list()
@@ -280,7 +287,7 @@ class VLLMModel(HuggingFaceEncoderModel):
                 dataset_config=self.dataset_config,
                 instruction_model=self.buffer["instruction_model"],
                 always_populate_text_field=True,
-                tokenizer=self._tokenizer,
+                tokeniser=self._tokeniser,
             ),
             batched=True,
             load_from_cache_file=False,
@@ -298,35 +305,40 @@ class VLLMModel(HuggingFaceEncoderModel):
 
         Returns:
             The generated model outputs.
+
+        Raises:
+            InvalidBenchmark:
+                If the dataset requires logprobs, but we could not get the first token
+                of each label in the dataset.
         """
         # Get stopping tokens
         stop_tokens: list[str] = self.custom_stop_tokens.copy()
         if self.buffer["instruction_model"] is False:
             stop_tokens.append("\n\n")
-        if self._tokenizer.pad_token_id is not None:
-            assert isinstance(self._tokenizer.pad_token, str), (
+        if self._tokeniser.pad_token_id is not None:
+            assert isinstance(self._tokeniser.pad_token, str), (
                 f"The pad token for the model {self.model_config.model_id!r} "
-                f"is not a string, which is unexpected: {self._tokenizer.pad_token!r}."
+                f"is not a string, which is unexpected: {self._tokeniser.pad_token!r}."
             )
-            stop_tokens.append(self._tokenizer.pad_token)
-        if self._tokenizer.eos_token_id is not None:
-            assert isinstance(self._tokenizer.eos_token, str), (
+            stop_tokens.append(self._tokeniser.pad_token)
+        if self._tokeniser.eos_token_id is not None:
+            assert isinstance(self._tokeniser.eos_token, str), (
                 f"The EOS token for the model {self.model_config.model_id!r} "
-                f"is not a string, which is unexpected: {self._tokenizer.eos_token!r}."
+                f"is not a string, which is unexpected: {self._tokeniser.eos_token!r}."
             )
-            stop_tokens.append(self._tokenizer.eos_token)
-            if self._tokenizer.pad_token_id is None:
-                self._tokenizer.pad_token_id = self._tokenizer.eos_token_id
-                self._tokenizer.pad_token = self._tokenizer.eos_token
+            stop_tokens.append(self._tokeniser.eos_token)
+            if self._tokeniser.pad_token_id is None:
+                self._tokeniser.pad_token_id = self._tokeniser.eos_token_id
+                self._tokeniser.pad_token = self._tokeniser.eos_token
         if self.end_of_chat_token_ids is not None:
-            end_of_chat_token = self._tokenizer.decode(
+            end_of_chat_token = self._tokeniser.decode(
                 self.end_of_chat_token_ids
             ).strip()
             if end_of_chat_token:
                 stop_tokens.append(end_of_chat_token)
 
         structured_generation_schema = None
-        if self.dataset_config.task in TASKS_USING_JSON:
+        if self.dataset_config.task.uses_structured_output:
             if self.generative_type == GenerativeType.REASONING:
                 log_once(
                     f"The model {self.model_config.model_id!r} is a reasoning model "
@@ -355,9 +367,33 @@ class VLLMModel(HuggingFaceEncoderModel):
         self.buffer["first_label_token_mapping"] = get_first_label_token_mapping(
             dataset_config=self.dataset_config,
             model_config=self.model_config,
-            tokenizer=self._tokenizer,
+            tokeniser=self._tokeniser,
             generative_type=self.generative_type,
+            log_metadata=self.log_metadata,
         )
+        if (
+            not self.buffer["first_label_token_mapping"]
+            and self.dataset_config.task.requires_logprobs
+        ):
+            raise InvalidBenchmark(
+                "The dataset requires logprobs, but we encountered an error when "
+                "trying to get the first token of each label in the dataset. You can "
+                "try running this benchmark with the --verbose flag to see what the "
+                "error was. Skipping this evaluation."
+            )
+
+        # Define the guided decoding that we will use for structured generation
+        if structured_generation_schema is not None:
+            guided_decoding = GuidedDecodingParams(json=structured_generation_schema)
+        elif self.dataset_config.task.uses_logprobs and self.dataset_config.labels:
+            guided_decoding = GuidedDecodingParams(
+                choice=[
+                    self.dataset_config.prompt_label_mapping[label]
+                    for label in self.dataset_config.labels
+                ]
+            )
+        else:
+            guided_decoding = None
 
         # Define the parameters used for vLLM generation
         max_tokens: int = (
@@ -367,14 +403,12 @@ class VLLMModel(HuggingFaceEncoderModel):
         )
         sampling_params = SamplingParams(
             max_tokens=max_tokens,
-            logprobs=MAX_LOGPROBS if self.buffer["first_label_token_mapping"] else None,
+            logprobs=MAX_VLLM_LOGPROBS
+            if self.buffer["first_label_token_mapping"]
+            else None,
             temperature=0.0,
             stop=[stop_token for stop_token in stop_tokens if stop_token],
-            guided_decoding=(
-                GuidedDecodingParams(json=structured_generation_schema)
-                if structured_generation_schema
-                else None
-            ),
+            guided_decoding=guided_decoding,
         )
 
         # If any of the prompts are empty then we need to replace them with a BOS token
@@ -383,7 +417,7 @@ class VLLMModel(HuggingFaceEncoderModel):
         if any(len(prompt) == 0 for prompt in prompts):
             logger.debug("Found empty prompts, replacing with BOS token.")
             prompts = [
-                prompt if len(prompt) > 0 else str(self._tokenizer.bos_token)
+                prompt if len(prompt) > 0 else str(self._tokeniser.bos_token)
                 for prompt in prompts
             ]
 
@@ -394,7 +428,7 @@ class VLLMModel(HuggingFaceEncoderModel):
         if not self.buffer.get(
             "instruction_model", False
         ) and should_prompts_be_stripped(
-            labels_to_be_generated=labels_to_be_generated, tokenizer=self._tokenizer
+            labels_to_be_generated=labels_to_be_generated, tokeniser=self._tokeniser
         ):
             log_once(
                 f"Stripping prompts for model {self.model_config.model_id!r}.",
@@ -432,22 +466,22 @@ class VLLMModel(HuggingFaceEncoderModel):
                         "Prompts are too long, so truncating them and trying again..."
                     )
                     logger.debug(f"The error message was: {str(e)}")
-                    tokenized_prompts = self._tokenizer(
+                    tokenized_prompts = self._tokeniser(
                         text=prompts,
                         truncation=True,
                         max_length=max(
-                            min(self._tokenizer.model_max_length, MAX_CONTEXT_LENGTH)
+                            min(self._tokeniser.model_max_length, MAX_CONTEXT_LENGTH)
                             - max_tokens,
                             0,
                         ),
                     )
-                    prompts = self._tokenizer.batch_decode(
+                    prompts = self._tokeniser.batch_decode(
                         sequences=tokenized_prompts.input_ids, skip_special_tokens=True
                     )
                 else:
                     raise InvalidBenchmark(
                         f"An error occurred during vLLM generation: {str(e)}"
-                    )
+                    ) from e
         else:
             raise InvalidBenchmark(
                 f"Could not generate sequences after {num_attempts} attempts."
@@ -477,7 +511,7 @@ class VLLMModel(HuggingFaceEncoderModel):
         completion_ids: list[list[int]] = [
             output.outputs[0].token_ids for output in raw_outputs
         ]
-        completions = self._tokenizer.batch_decode(
+        completions = self._tokeniser.batch_decode(
             sequences=[
                 torch.LongTensor(completion_id) for completion_id in completion_ids
             ]
@@ -625,10 +659,10 @@ class VLLMModel(HuggingFaceEncoderModel):
         )
 
 
-def load_model_and_tokenizer(
+def load_model_and_tokeniser(
     model_config: "ModelConfig", benchmark_config: "BenchmarkConfig"
 ) -> tuple["LLM", "PreTrainedTokenizer"]:
-    """Load the model and tokenizer.
+    """Load the model and tokeniser.
 
     Args:
         model_config:
@@ -637,7 +671,7 @@ def load_model_and_tokenizer(
             The benchmark configuration.
 
     Returns:
-        A pair (model, tokenizer), with the loaded model and tokenizer
+        A pair (model, tokeniser), with the loaded model and tokeniser
     """
     # Prefer base model ID if the model is an adapter - the adapter will be added on
    # during inference in this case
@@ -675,7 +709,7 @@ def load_model_and_tokenizer(
     dtype: str | torch.dtype = "auto"
 
     # Choose bf16 over fp16 if the model is a fp32 model and the GPU supports it
-    if hf_model_config.torch_dtype == torch.float32:
+    if hf_model_config.dtype == torch.float32:
         if torch.cuda.is_bf16_supported():
             logger.info(
                 "You are loading a model with dtype FP32, which we will convert to "
@@ -692,34 +726,32 @@ def load_model_and_tokenizer(
             dtype = torch.float16
 
     # If the model is a quantized model, we might need to change the dtype
-    if quantization == "mxfp4" and hf_model_config.torch_dtype is None:
+    if quantization == "mxfp4" and hf_model_config.dtype is None:
         dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
         logger.debug(
-            "You are loading a quantized model where `torch_dtype` has not been set. "
+            "You are loading a quantized model where `dtype` has not been set. "
            f"Setting dtype to {dtype!r}."
         )
-    elif quantization is not None and hf_model_config.torch_dtype != torch.float16:
+    elif quantization is not None and hf_model_config.dtype != torch.float16:
         logger.info(
             "You are loading a quantized model with dtype "
-            f"{hf_model_config.torch_dtype}, which vLLM does not support. Setting "
+            f"{hf_model_config.dtype}, which vLLM does not support. Setting "
             "dtype to float16 instead."
         )
         dtype = torch.float16
 
     # If the model is a bf16 model, we need to check the CUDA compute capability
-    if hf_model_config.torch_dtype == torch.bfloat16:
+    if hf_model_config.dtype == torch.bfloat16:
         min_cuda_compute_capability = get_min_cuda_compute_capability()
         required_capability = VLLM_BF16_MIN_CUDA_COMPUTE_CAPABILITY
 
         if min_cuda_compute_capability is not None:
             if min_cuda_compute_capability < required_capability:
                 logger.info(
-                    "You are loading a model with "
-                    f"dtype {hf_model_config.torch_dtype}, "
-                    "which vLLM only supports for CUDA devices with"
-                    f"CUDA compute capability >={required_capability}. "
-                    "You are using one or more devices with "
-                    f"compute capability {min_cuda_compute_capability}. "
+                    f"You are loading a model with dtype {hf_model_config.dtype}, "
+                    "which vLLM only supports for CUDA devices with CUDA compute "
+                    f"capability >={required_capability}. You are using one or more "
+                    f"devices with compute capability {min_cuda_compute_capability}. "
                     "Setting dtype to float16 instead."
                 )
                 dtype = torch.float16
@@ -747,14 +779,14 @@ def load_model_and_tokenizer(
     else:
         true_max_model_len = MAX_CONTEXT_LENGTH
 
-    tokenizer = load_tokenizer(
+    tokeniser = load_tokeniser(
         model_id=model_config.model_id,
         revision=model_config.revision,
         adapter_base_model_id=model_config.adapter_base_model_id,
         trust_remote_code=benchmark_config.trust_remote_code,
         model_max_length=true_max_model_len,
         model_cache_dir=model_config.model_cache_dir,
-        token=benchmark_config.api_key or os.getenv("HUGGINGFACE_API_KEY") or True,
+        token=get_hf_token(api_key=benchmark_config.api_key),
     )
 
     clear_vllm()
@@ -769,9 +801,7 @@ def load_model_and_tokenizer(
             trust_remote_code=benchmark_config.trust_remote_code,
             revision=revision,
             seed=4242,
-            distributed_executor_backend=(
-                "ray" if torch.cuda.device_count() > 1 else "mp"
-            ),
+            distributed_executor_backend="mp",
             tensor_parallel_size=torch.cuda.device_count(),
             disable_custom_all_reduce=True,
             quantization=quantization,
@@ -782,29 +812,39 @@ def load_model_and_tokenizer(
             enable_prefix_caching=False,
             enable_lora=model_config.adapter_base_model_id is not None,
             max_lora_rank=256,
+            # Special arguments in case we are dealing with a Mistral model
+            tokenizer_mode="mistral"
+            if isinstance(tokeniser, MistralCommonTokenizer)
+            else "auto",
+            config_format="mistral"
+            if isinstance(tokeniser, MistralCommonTokenizer)
+            else "auto",
+            load_format="mistral"
+            if isinstance(tokeniser, MistralCommonTokenizer)
+            else "auto",
         )
     except (RuntimeError, ValueError, OSError) as e:
         if "awaiting a review from the repo authors" in str(e):
             raise InvalidModel(
                 f"The model {model_id!r} is awaiting a review from the repository "
                 "authors. Please try again later."
-            )
+            ) from e
         elif "trust_remote_code" in str(e):
             raise InvalidModel(
                 f"Loading the model {model_id!r} needs to trust remote code. "
                 "If you trust the suppliers of this model, then you can enable "
                 "this by setting the `--trust-remote-code` flag."
-            )
+            ) from e
         raise InvalidModel(
             f"The model {model_id!r} could not be loaded. The error was {e!r}."
-        )
+        ) from e
 
     model.config = hf_model_config
 
-    return model, tokenizer
+    return model, tokeniser
 
 
-def load_tokenizer(
+def load_tokeniser(
     model_id: str,
     revision: str,
     adapter_base_model_id: str | None,
@@ -813,7 +853,7 @@ def load_tokenizer(
     model_cache_dir: str,
     token: str | bool,
 ) -> "PreTrainedTokenizer":
-    """Load the tokenizer.
+    """Load the tokeniser.
 
     Args:
         model_id:
@@ -833,7 +873,7 @@ def load_tokenizer(
             The Hugging Face API token.
 
     Returns:
-        The loaded tokenizer.
+        The loaded tokeniser.
     """
     revision = revision if adapter_base_model_id is None else "main"
     config = AutoConfig.from_pretrained(
@@ -846,7 +886,7 @@ def load_tokenizer(
     num_retries = 5
     for _ in range(num_retries):
         try:
-            tokenizer = AutoTokenizer.from_pretrained(
+            tokeniser = AutoTokenizer.from_pretrained(
                 model_id,
                 use_fast=True,
                 verbose=False,
@@ -861,30 +901,45 @@ def load_tokenizer(
         except (json.JSONDecodeError, OSError, TypeError) as e:
             if adapter_base_model_id is None or model_id == adapter_base_model_id:
                 raise InvalidModel(
-                    f"Could not load tokenizer for model {model_id!r}. The error was "
+                    f"Could not load tokeniser for model {model_id!r}. The error was "
                     f"{str(e)}."
-                )
+                ) from e
             logger.debug(
-                f"Could not load tokenizer for {model_id!r}. Falling back to "
+                f"Could not load tokeniser for {model_id!r}. Falling back to "
                 f"{adapter_base_model_id!r}."
             )
             model_id = adapter_base_model_id
         except (TimeoutError, RequestError):
-            logger.info(f"Couldn't load tokenizer for {model_id!r}. Retrying.")
+            logger.info(f"Couldn't load tokeniser for {model_id!r}. Retrying.")
             sleep(5)
             continue
+        except (KeyError, ValueError) as e:
+            if "mistral" in str(e).lower():
+                tokeniser = MistralCommonTokenizer.from_pretrained(
+                    model_id,
+                    padding_side="left",
+                    truncation_side="left",
+                    model_max_length=model_max_length,
+                    token=token,
+                )
+                break
+            raise InvalidModel(
+                f"Could not load tokeniser for model {model_id!r}. The error was "
+                f"{str(e)}."
+            ) from e
     else:
         raise InvalidModel(
-            f"Could not load tokenizer for model {model_id!r} after {num_retries} "
+            f"Could not load tokeniser for model {model_id!r} after {num_retries} "
             "attempts."
         )
 
     # Ensure that BOS, EOS and PAD tokens are set
-    tokenizer.bos_token, tokenizer.bos_token_id = get_bos_token(tokenizer=tokenizer)
-    tokenizer.eos_token, tokenizer.eos_token_id = get_eos_token(tokenizer=tokenizer)
-    tokenizer.pad_token, tokenizer.pad_token_id = get_pad_token(tokenizer=tokenizer)
+    if not isinstance(tokeniser, MistralCommonTokenizer):
+        tokeniser.bos_token, tokeniser.bos_token_id = get_bos_token(tokeniser=tokeniser)
+        tokeniser.eos_token, tokeniser.eos_token_id = get_eos_token(tokeniser=tokeniser)
+        tokeniser.pad_token, tokeniser.pad_token_id = get_pad_token(tokeniser=tokeniser)
 
-    return tokenizer
+    return tokeniser
 
 
 def clear_vllm() -> None:
@@ -892,25 +947,21 @@ def clear_vllm() -> None:
     with contextlib.suppress(ValueError):
         destroy_model_parallel()
         destroy_distributed_environment()
-        if ray.is_initialized():
-            ray.shutdown()
     with contextlib.suppress(AssertionError):
         torch.distributed.destroy_process_group()
-        if ray.is_initialized():
-            ray.shutdown()
     clear_memory()
 
 
 def get_end_of_reasoning_token(
-    model: "LLM", tokenizer: "PreTrainedTokenizer", model_id: str
+    model: "LLM", tokeniser: "PreTrainedTokenizer", model_id: str
 ) -> str | None:
     """Get the end-of-reasoning token for a generative model.
 
     Args:
         model:
             The vLLM model.
-        tokenizer:
-            The tokenizer.
+        tokeniser:
+            The tokeniser.
         model_id:
             The model ID.
 
@@ -919,11 +970,9 @@ def get_end_of_reasoning_token(
     """
     # Create a prompt to check if the model uses the reasoning tokens
     prompt = "What is your name?"
-    if tokenizer.chat_template is not None:
-        templated_prompt = tokenizer.apply_chat_template(
-            conversation=[dict(role="user", content=prompt)],
-            add_generation_prompt=True,
-            tokenize=False,
+    if has_chat_template(tokeniser=tokeniser):
+        templated_prompt = apply_chat_template(
+            conversation=[dict(role="user", content=prompt)], tokeniser=tokeniser
         )
         assert isinstance(templated_prompt, str)
         prompt = templated_prompt
@@ -948,7 +997,7 @@ def get_end_of_reasoning_token(
             f"The model {model_id!r} did not generate any beginning-of-reasoning "
             "tokens in the prompt or the completion. Assuming the model is not "
             "a reasoning model.",
-            level=logging.INFO,
+            level=logging.DEBUG,
         )
         return None
 
@@ -974,7 +1023,7 @@ def get_end_of_reasoning_token(
             "the beginning-of-reasoning tokens "
             f"{[bor_token for bor_token, _ in bor_reasoning_matches]!r}. "
             "This is probably not correct, so please report this issue.",
-            level=logging.INFO,
+            level=logging.WARNING,
         )
         return None
 
@@ -984,14 +1033,14 @@ def get_end_of_reasoning_token(
             f"model {model_id!r}. Using {eor_reasoning_matches[0]!r} as "
             "the reasoning token. If this is not the correct reasoning token, "
             "please report this issue.",
-            level=logging.INFO,
+            level=logging.WARNING,
         )
 
     bor_token, eor_token = eor_reasoning_matches[0]
     log_once(
         f"Detected beginning-of-reasoning token {bor_token!r} and end-of-reasoning "
         f"token {eor_token!r} for model {model_id!r}.",
-        level=logging.INFO,
+        level=logging.DEBUG,
     )
 
     return eor_token
@@ -999,7 +1048,7 @@ def get_end_of_reasoning_token(
 
 def get_custom_stop_tokens(
     model: "LLM",
-    tokenizer: "PreTrainedTokenizer",
+    tokeniser: "PreTrainedTokenizer",
     model_id: str,
     is_reasoning_model: bool,
 ) -> list[str]:
@@ -1008,8 +1057,8 @@ def get_custom_stop_tokens(
     Args:
         model:
             The vLLM model.
-        tokenizer:
-            The tokenizer.
+        tokeniser:
+            The tokeniser.
         model_id:
             The model ID.
         is_reasoning_model:
@@ -1022,11 +1071,9 @@ def get_custom_stop_tokens(
     candidate_stop_tokens = CUSTOM_STOP_TOKENS
 
     prompt = "Hello"
-    if tokenizer.chat_template is not None:
-        templated_prompt = tokenizer.apply_chat_template(
-            conversation=[dict(role="user", content=prompt)],
-            add_generation_prompt=True,
-            tokenize=False,
+    if has_chat_template(tokeniser=tokeniser):
+        templated_prompt = apply_chat_template(
+            conversation=[dict(role="user", content=prompt)], tokeniser=tokeniser
        )
        assert isinstance(templated_prompt, str)
        prompt = templated_prompt