EuroEval 15.16.0-py3-none-any.whl → 16.0.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (64)
  1. euroeval/__init__.py +8 -7
  2. euroeval/benchmark_config_factory.py +3 -7
  3. euroeval/benchmark_modules/base.py +35 -19
  4. euroeval/benchmark_modules/fresh.py +24 -19
  5. euroeval/benchmark_modules/hf.py +136 -154
  6. euroeval/benchmark_modules/litellm.py +190 -110
  7. euroeval/benchmark_modules/vllm.py +199 -139
  8. euroeval/benchmarker.py +49 -22
  9. euroeval/cli.py +3 -3
  10. euroeval/constants.py +19 -15
  11. euroeval/data_loading.py +33 -28
  12. euroeval/data_models.py +73 -23
  13. euroeval/dataset_configs/__init__.py +2 -0
  14. euroeval/dataset_configs/danish.py +35 -1
  15. euroeval/dataset_configs/dutch.py +38 -1
  16. euroeval/dataset_configs/english.py +38 -1
  17. euroeval/dataset_configs/estonian.py +95 -0
  18. euroeval/dataset_configs/faroese.py +38 -0
  19. euroeval/dataset_configs/finnish.py +39 -1
  20. euroeval/dataset_configs/french.py +38 -1
  21. euroeval/dataset_configs/german.py +38 -1
  22. euroeval/dataset_configs/icelandic.py +39 -1
  23. euroeval/dataset_configs/italian.py +38 -1
  24. euroeval/dataset_configs/latvian.py +81 -0
  25. euroeval/dataset_configs/norwegian.py +38 -1
  26. euroeval/dataset_configs/portuguese.py +38 -1
  27. euroeval/dataset_configs/spanish.py +38 -1
  28. euroeval/dataset_configs/swedish.py +38 -1
  29. euroeval/enums.py +0 -6
  30. euroeval/finetuning.py +6 -6
  31. euroeval/generation.py +25 -14
  32. euroeval/generation_utils.py +90 -20
  33. euroeval/languages.py +947 -187
  34. euroeval/metrics/__init__.py +6 -0
  35. euroeval/metrics/base.py +76 -0
  36. euroeval/metrics/huggingface.py +192 -0
  37. euroeval/metrics/llm_as_a_judge.py +257 -0
  38. euroeval/metrics/pipeline.py +276 -0
  39. euroeval/metrics/speed.py +51 -0
  40. euroeval/model_cache.py +13 -1
  41. euroeval/prompt_templates/linguistic_acceptability.py +40 -2
  42. euroeval/prompt_templates/multiple_choice.py +23 -2
  43. euroeval/prompt_templates/named_entity_recognition.py +65 -2
  44. euroeval/prompt_templates/reading_comprehension.py +42 -2
  45. euroeval/prompt_templates/sentiment_classification.py +46 -2
  46. euroeval/prompt_templates/summarization.py +24 -4
  47. euroeval/scores.py +7 -2
  48. euroeval/speed_benchmark.py +6 -6
  49. euroeval/task_group_utils/multiple_choice_classification.py +19 -8
  50. euroeval/task_group_utils/question_answering.py +35 -28
  51. euroeval/task_group_utils/sequence_classification.py +128 -42
  52. euroeval/task_group_utils/text_to_text.py +7 -3
  53. euroeval/task_group_utils/token_classification.py +59 -73
  54. euroeval/tasks.py +33 -6
  55. euroeval/tokenization_utils.py +294 -207
  56. euroeval/utils.py +150 -35
  57. {euroeval-15.16.0.dist-info → euroeval-16.0.1.dist-info}/METADATA +13 -14
  58. euroeval-16.0.1.dist-info/RECORD +69 -0
  59. {euroeval-15.16.0.dist-info → euroeval-16.0.1.dist-info}/entry_points.txt +0 -1
  60. euroeval/human_evaluation.py +0 -738
  61. euroeval/metrics.py +0 -470
  62. euroeval-15.16.0.dist-info/RECORD +0 -63
  63. {euroeval-15.16.0.dist-info → euroeval-16.0.1.dist-info}/WHEEL +0 -0
  64. {euroeval-15.16.0.dist-info → euroeval-16.0.1.dist-info}/licenses/LICENSE +0 -0
@@ -5,7 +5,6 @@ import contextlib
  import importlib.util
  import json
  import logging
- import os
  import re
  import typing as t
  from functools import partial
@@ -16,6 +15,7 @@ import torch
  from huggingface_hub import snapshot_download
  from pydantic import conlist, create_model
  from tqdm.auto import tqdm
+ from transformers import MistralCommonTokenizer
  from transformers.models.auto.configuration_auto import AutoConfig
  from transformers.models.auto.tokenization_auto import AutoTokenizer
  from urllib3.exceptions import RequestError
@@ -24,11 +24,10 @@ from ..constants import (
  CUSTOM_STOP_TOKENS,
  GENERATIVE_PIPELINE_TAGS,
  MAX_CONTEXT_LENGTH,
- MAX_LOGPROBS,
+ MAX_VLLM_LOGPROBS,
  MERGE_TAGS,
  REASONING_MAX_TOKENS,
  REASONING_TOKENS,
- TASKS_USING_JSON,
  VLLM_BF16_MIN_CUDA_COMPUTE_CAPABILITY,
  )
  from ..data_models import GenerativeModelOutput, ModelConfig
@@ -54,17 +53,20 @@ from ..task_group_utils import (
  token_classification,
  )
  from ..tokenization_utils import (
+ apply_chat_template,
  get_bos_token,
  get_end_of_chat_token_ids,
  get_eos_token,
  get_first_label_token_mapping,
  get_pad_token,
+ has_chat_template,
  should_prompts_be_stripped,
  )
  from ..types import ExtractLabelsFunction
  from ..utils import (
  clear_memory,
  create_model_cache_dir,
+ get_hf_token,
  get_min_cuda_compute_capability,
  log_once,
  )
@@ -79,9 +81,6 @@ if t.TYPE_CHECKING or importlib.util.find_spec("vllm") is not None:
  from vllm.lora.request import LoRARequest
  from vllm.sampling_params import GuidedDecodingParams
 
- if t.TYPE_CHECKING or importlib.util.find_spec("ray") is not None:
- import ray
-
  if t.TYPE_CHECKING:
  from datasets import DatasetDict
  from transformers.tokenization_utils import PreTrainedTokenizer
@@ -104,6 +103,7 @@ class VLLMModel(HuggingFaceEncoderModel):
  model_config: "ModelConfig",
  dataset_config: "DatasetConfig",
  benchmark_config: "BenchmarkConfig",
+ log_metadata: bool = True,
  ) -> None:
  """Initialise the vLLM model.
 
@@ -114,27 +114,26 @@ class VLLMModel(HuggingFaceEncoderModel):
  The dataset configuration.
  benchmark_config:
  The benchmark configuration.
+ log_metadata:
+ Whether to log the model and dataset metadata.
  """
- if (
- importlib.util.find_spec("vllm") is None
- or importlib.util.find_spec("ray") is None
- ):
+ if importlib.util.find_spec("vllm") is None:
  raise NeedsExtraInstalled(extra="generative")
 
- model, tokenizer = load_model_and_tokenizer(
+ model, tokeniser = load_model_and_tokeniser(
  model_config=model_config, benchmark_config=benchmark_config
  )
  self._model: "LLM" = model
- self._tokenizer: "PreTrainedTokenizer" = tokenizer
+ self._tokeniser: "PreTrainedTokenizer" = tokeniser
  self.end_of_reasoning_token = get_end_of_reasoning_token(
- model=self._model, tokenizer=self._tokenizer, model_id=model_config.model_id
+ model=self._model, tokeniser=self._tokeniser, model_id=model_config.model_id
  )
  self.end_of_chat_token_ids = get_end_of_chat_token_ids(
- tokenizer=self._tokenizer
+ tokeniser=self._tokeniser
  )
  self.custom_stop_tokens = get_custom_stop_tokens(
  model=self._model,
- tokenizer=self._tokenizer,
+ tokeniser=self._tokeniser,
  model_id=model_config.model_id,
  is_reasoning_model=self.end_of_reasoning_token is not None,
  )
@@ -145,15 +144,17 @@ class VLLMModel(HuggingFaceEncoderModel):
  model_config=model_config,
  dataset_config=dataset_config,
  benchmark_config=benchmark_config,
+ log_metadata=log_metadata,
  )
 
  self.buffer |= dict(
- instruction_model=self._tokenizer.chat_template is not None,
+ instruction_model=has_chat_template(tokeniser=self._tokeniser),
  first_label_token_mapping=get_first_label_token_mapping(
  dataset_config=self.dataset_config,
  model_config=self.model_config,
- tokenizer=self._tokenizer,
+ tokeniser=self._tokeniser,
  generative_type=self.generative_type,
+ log_metadata=self.log_metadata,
  ),
  )
  if self.model_config.adapter_base_model_id is not None:
@@ -167,13 +168,16 @@ class VLLMModel(HuggingFaceEncoderModel):
  )
 
  def __del__(self) -> None:
- """Clean up the model and tokenizer."""
- if importlib.util.find_spec("vllm") is not None:
- clear_vllm()
+ """Clean up the model and tokeniser."""
+ try:
+ if importlib.util.find_spec("vllm") is not None:
+ clear_vllm()
+ except ImportError:
+ pass
  if hasattr(self, "_model"):
  del self._model
- if hasattr(self, "_tokenizer"):
- del self._tokenizer
+ if hasattr(self, "_tokeniser"):
+ del self._tokeniser
 
  @property
  def generative_type(self) -> GenerativeType | None:
@@ -182,12 +186,12 @@ class VLLMModel(HuggingFaceEncoderModel):
  Returns:
  The generative type of the model, or None if it has not been set yet.
  """
- if not hasattr(self, "_tokenizer"):
+ if not hasattr(self, "_tokeniser"):
  return None
  elif self.end_of_reasoning_token is not None:
  return GenerativeType.REASONING
  elif (
- self._tokenizer.chat_template is not None
+ has_chat_template(tokeniser=self._tokeniser)
  or "instruct" in self.model_config.model_id.lower()
  ):
  return GenerativeType.INSTRUCTION_TUNED
@@ -267,7 +271,10 @@ class VLLMModel(HuggingFaceEncoderModel):
 
  if self.benchmark_config.few_shot:
  few_shot_examples = extract_few_shot_examples(
- dataset=dataset, dataset_config=self.dataset_config, itr_idx=itr_idx
+ dataset=dataset,
+ dataset_config=self.dataset_config,
+ benchmark_config=self.benchmark_config,
+ itr_idx=itr_idx,
  )
  else:
  few_shot_examples = list()
@@ -280,7 +287,7 @@ class VLLMModel(HuggingFaceEncoderModel):
  dataset_config=self.dataset_config,
  instruction_model=self.buffer["instruction_model"],
  always_populate_text_field=True,
- tokenizer=self._tokenizer,
+ tokeniser=self._tokeniser,
  ),
  batched=True,
  load_from_cache_file=False,
@@ -298,66 +305,100 @@ class VLLMModel(HuggingFaceEncoderModel):
 
  Returns:
  The generated model outputs.
+
+ Raises:
+ InvalidBenchmark:
+ If the dataset requires logprobs, but we could not get the first token
+ of each label in the dataset.
  """
  # Get stopping tokens
  stop_tokens: list[str] = self.custom_stop_tokens.copy()
  if self.buffer["instruction_model"] is False:
  stop_tokens.append("\n\n")
- if self._tokenizer.pad_token_id is not None:
- assert isinstance(self._tokenizer.pad_token, str), (
+ if self._tokeniser.pad_token_id is not None:
+ assert isinstance(self._tokeniser.pad_token, str), (
  f"The pad token for the model {self.model_config.model_id!r} "
- f"is not a string, which is unexpected: {self._tokenizer.pad_token!r}."
+ f"is not a string, which is unexpected: {self._tokeniser.pad_token!r}."
  )
- stop_tokens.append(self._tokenizer.pad_token)
- if self._tokenizer.eos_token_id is not None:
- assert isinstance(self._tokenizer.eos_token, str), (
+ stop_tokens.append(self._tokeniser.pad_token)
+ if self._tokeniser.eos_token_id is not None:
+ assert isinstance(self._tokeniser.eos_token, str), (
  f"The EOS token for the model {self.model_config.model_id!r} "
- f"is not a string, which is unexpected: {self._tokenizer.eos_token!r}."
+ f"is not a string, which is unexpected: {self._tokeniser.eos_token!r}."
  )
- stop_tokens.append(self._tokenizer.eos_token)
- if self._tokenizer.pad_token_id is None:
- self._tokenizer.pad_token_id = self._tokenizer.eos_token_id
- self._tokenizer.pad_token = self._tokenizer.eos_token
+ stop_tokens.append(self._tokeniser.eos_token)
+ if self._tokeniser.pad_token_id is None:
+ self._tokeniser.pad_token_id = self._tokeniser.eos_token_id
+ self._tokeniser.pad_token = self._tokeniser.eos_token
  if self.end_of_chat_token_ids is not None:
- end_of_chat_token = self._tokenizer.decode(
+ end_of_chat_token = self._tokeniser.decode(
  self.end_of_chat_token_ids
  ).strip()
  if end_of_chat_token:
  stop_tokens.append(end_of_chat_token)
 
- structured_generation_schema = None
- if self.dataset_config.task in TASKS_USING_JSON:
- if self.generative_type == GenerativeType.REASONING:
- log_once(
- f"The model {self.model_config.model_id!r} is a reasoning model "
- "and thus does not support structured generation, so we do not "
- "enable it.",
- level=logging.DEBUG,
- )
- else:
- ner_tag_names = list(self.dataset_config.prompt_label_mapping.values())
- keys_and_their_types: dict[str, t.Any] = {
- tag_name: (conlist(str, max_length=5), ...)
- for tag_name in ner_tag_names
- }
- answer_format_class = create_model(
- "AnswerFormat", **keys_and_their_types
- )
- structured_generation_schema = answer_format_class.model_json_schema()
- log_once(
- "Using structured generation with the JSON schema "
- f"{structured_generation_schema}",
- level=logging.DEBUG,
- )
-
  # Get the mapping from labels to the first token in the label. We call this each
  # time we generate a new dataset since the dataset config can change
  self.buffer["first_label_token_mapping"] = get_first_label_token_mapping(
  dataset_config=self.dataset_config,
  model_config=self.model_config,
- tokenizer=self._tokenizer,
+ tokeniser=self._tokeniser,
  generative_type=self.generative_type,
+ log_metadata=self.log_metadata,
  )
+ if (
+ not self.buffer["first_label_token_mapping"]
+ and self.dataset_config.task.requires_logprobs
+ ):
+ raise InvalidBenchmark(
+ "The dataset requires logprobs, but we encountered an error when "
+ "trying to get the first token of each label in the dataset. You can "
+ "try running this benchmark with the --verbose flag to see what the "
+ "error was. Skipping this evaluation."
+ )
+
+ structured_generation_schema = None
+ if (
+ self.dataset_config.task.uses_structured_output
+ or (self.dataset_config.task.uses_logprobs and self.dataset_config.labels)
+ ) and self.generative_type == GenerativeType.REASONING:
+ guided_decoding = None
+ logger.debug(
+ "The dataset uses structured output, but we are not using it as the "
+ "model is a reasoning model."
+ )
+ elif self.dataset_config.task.uses_structured_output:
+ ner_tag_names = list(self.dataset_config.prompt_label_mapping.values())
+ keys_and_their_types: dict[str, t.Any] = {
+ tag_name: (conlist(str, max_length=5), ...)
+ for tag_name in ner_tag_names
+ }
+ answer_format_class = create_model("AnswerFormat", **keys_and_their_types)
+ structured_generation_schema = answer_format_class.model_json_schema()
+ log_once(
+ "Using structured generation with the JSON schema: "
+ f"{json.dumps(structured_generation_schema)}",
+ level=logging.DEBUG,
+ )
+ guided_decoding = GuidedDecodingParams(json=structured_generation_schema)
+ elif self.dataset_config.task.uses_logprobs and self.dataset_config.labels:
+ guided_decoding = GuidedDecodingParams(
+ choice=[
+ self.dataset_config.prompt_label_mapping[label]
+ for label in self.dataset_config.labels
+ ]
+ )
+ log_once(
+ "Using structured generation with the choices: "
+ f"{guided_decoding.choice!r}.",
+ level=logging.DEBUG,
+ )
+ else:
+ guided_decoding = None
+ log_once(
+ "Not using structured generation as the dataset does not require it.",
+ level=logging.DEBUG,
+ )
 
  # Define the parameters used for vLLM generation
  max_tokens: int = (
@@ -367,14 +408,12 @@ class VLLMModel(HuggingFaceEncoderModel):
  )
  sampling_params = SamplingParams(
  max_tokens=max_tokens,
- logprobs=MAX_LOGPROBS if self.buffer["first_label_token_mapping"] else None,
+ logprobs=MAX_VLLM_LOGPROBS
+ if self.buffer["first_label_token_mapping"]
+ else None,
  temperature=0.0,
  stop=[stop_token for stop_token in stop_tokens if stop_token],
- guided_decoding=(
- GuidedDecodingParams(json=structured_generation_schema)
- if structured_generation_schema
- else None
- ),
+ guided_decoding=guided_decoding,
  )
 
  # If any of the prompts are empty then we need to replace them with a BOS token
@@ -383,7 +422,7 @@ class VLLMModel(HuggingFaceEncoderModel):
  if any(len(prompt) == 0 for prompt in prompts):
  logger.debug("Found empty prompts, replacing with BOS token.")
  prompts = [
- prompt if len(prompt) > 0 else str(self._tokenizer.bos_token)
+ prompt if len(prompt) > 0 else str(self._tokeniser.bos_token)
  for prompt in prompts
  ]
 
@@ -394,7 +433,7 @@ class VLLMModel(HuggingFaceEncoderModel):
  if not self.buffer.get(
  "instruction_model", False
  ) and should_prompts_be_stripped(
- labels_to_be_generated=labels_to_be_generated, tokenizer=self._tokenizer
+ labels_to_be_generated=labels_to_be_generated, tokeniser=self._tokeniser
  ):
  log_once(
  f"Stripping prompts for model {self.model_config.model_id!r}.",
@@ -405,6 +444,7 @@ class VLLMModel(HuggingFaceEncoderModel):
  # Generate sequences using vLLM
  input_is_a_test = len(prompts) == 1 and len(set(prompts[0])) == 1
  num_attempts = 3
+ truncation_attempts = 0
  for _ in range(num_attempts):
  try:
  raw_outputs = self._model.generate(
@@ -432,22 +472,29 @@ class VLLMModel(HuggingFaceEncoderModel):
  "Prompts are too long, so truncating them and trying again..."
  )
  logger.debug(f"The error message was: {str(e)}")
- tokenized_prompts = self._tokenizer(
+
+ # If we have already tried truncating the prompts a few times, then
+ # we truncate a bit more aggressively
+ extra_truncation = 50 * truncation_attempts
+ truncation_attempts += 1
+
+ tokenized_prompts = self._tokeniser(
  text=prompts,
  truncation=True,
  max_length=max(
- min(self._tokenizer.model_max_length, MAX_CONTEXT_LENGTH)
- - max_tokens,
+ min(self._tokeniser.model_max_length, MAX_CONTEXT_LENGTH)
+ - max_tokens
+ - extra_truncation,
  0,
  ),
  )
- prompts = self._tokenizer.batch_decode(
+ prompts = self._tokeniser.batch_decode(
  sequences=tokenized_prompts.input_ids, skip_special_tokens=True
  )
  else:
  raise InvalidBenchmark(
  f"An error occurred during vLLM generation: {str(e)}"
- )
+ ) from e
  else:
  raise InvalidBenchmark(
  f"Could not generate sequences after {num_attempts} attempts."
@@ -477,7 +524,7 @@ class VLLMModel(HuggingFaceEncoderModel):
  completion_ids: list[list[int]] = [
  output.outputs[0].token_ids for output in raw_outputs
  ]
- completions = self._tokenizer.batch_decode(
+ completions = self._tokeniser.batch_decode(
  sequences=[
  torch.LongTensor(completion_id) for completion_id in completion_ids
  ]
@@ -625,10 +672,10 @@ class VLLMModel(HuggingFaceEncoderModel):
  )
 
 
- def load_model_and_tokenizer(
+ def load_model_and_tokeniser(
  model_config: "ModelConfig", benchmark_config: "BenchmarkConfig"
  ) -> tuple["LLM", "PreTrainedTokenizer"]:
- """Load the model and tokenizer.
+ """Load the model and tokeniser.
 
  Args:
  model_config:
@@ -637,7 +684,7 @@ def load_model_and_tokenizer(
  The benchmark configuration.
 
  Returns:
- A pair (model, tokenizer), with the loaded model and tokenizer
+ A pair (model, tokeniser), with the loaded model and tokeniser
  """
  # Prefer base model ID if the model is an adapter - the adapter will be added on
  # during inference in this case
@@ -675,7 +722,7 @@ def load_model_and_tokenizer(
  dtype: str | torch.dtype = "auto"
 
  # Choose bf16 over fp16 if the model is a fp32 model and the GPU supports it
- if hf_model_config.torch_dtype == torch.float32:
+ if hf_model_config.dtype == torch.float32:
  if torch.cuda.is_bf16_supported():
  logger.info(
  "You are loading a model with dtype FP32, which we will convert to "
@@ -692,34 +739,32 @@ def load_model_and_tokenizer(
  dtype = torch.float16
 
  # If the model is a quantized model, we might need to change the dtype
- if quantization == "mxfp4" and hf_model_config.torch_dtype is None:
+ if quantization == "mxfp4" and hf_model_config.dtype is None:
  dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
  logger.debug(
- "You are loading a quantized model where `torch_dtype` has not been set. "
+ "You are loading a quantized model where `dtype` has not been set. "
  f"Setting dtype to {dtype!r}."
  )
- elif quantization is not None and hf_model_config.torch_dtype != torch.float16:
+ elif quantization is not None and hf_model_config.dtype != torch.float16:
  logger.info(
  "You are loading a quantized model with dtype "
- f"{hf_model_config.torch_dtype}, which vLLM does not support. Setting "
+ f"{hf_model_config.dtype}, which vLLM does not support. Setting "
  "dtype to float16 instead."
  )
  dtype = torch.float16
 
  # If the model is a bf16 model, we need to check the CUDA compute capability
- if hf_model_config.torch_dtype == torch.bfloat16:
+ if hf_model_config.dtype == torch.bfloat16:
  min_cuda_compute_capability = get_min_cuda_compute_capability()
  required_capability = VLLM_BF16_MIN_CUDA_COMPUTE_CAPABILITY
 
  if min_cuda_compute_capability is not None:
  if min_cuda_compute_capability < required_capability:
  logger.info(
- "You are loading a model with "
- f"dtype {hf_model_config.torch_dtype}, "
- "which vLLM only supports for CUDA devices with"
- f"CUDA compute capability >={required_capability}. "
- "You are using one or more devices with "
- f"compute capability {min_cuda_compute_capability}. "
+ f"You are loading a model with dtype {hf_model_config.dtype}, "
+ "which vLLM only supports for CUDA devices with CUDA compute "
+ f"capability >={required_capability}. You are using one or more "
+ f"devices with compute capability {min_cuda_compute_capability}. "
  "Setting dtype to float16 instead."
  )
  dtype = torch.float16
@@ -747,14 +792,14 @@ def load_model_and_tokenizer(
  else:
  true_max_model_len = MAX_CONTEXT_LENGTH
 
- tokenizer = load_tokenizer(
+ tokeniser = load_tokeniser(
  model_id=model_config.model_id,
  revision=model_config.revision,
  adapter_base_model_id=model_config.adapter_base_model_id,
  trust_remote_code=benchmark_config.trust_remote_code,
  model_max_length=true_max_model_len,
  model_cache_dir=model_config.model_cache_dir,
- token=benchmark_config.api_key or os.getenv("HUGGINGFACE_API_KEY") or True,
+ token=get_hf_token(api_key=benchmark_config.api_key),
  )
 
  clear_vllm()
@@ -769,9 +814,7 @@ def load_model_and_tokenizer(
  trust_remote_code=benchmark_config.trust_remote_code,
  revision=revision,
  seed=4242,
- distributed_executor_backend=(
- "ray" if torch.cuda.device_count() > 1 else "mp"
- ),
+ distributed_executor_backend="mp",
  tensor_parallel_size=torch.cuda.device_count(),
  disable_custom_all_reduce=True,
  quantization=quantization,
@@ -782,29 +825,39 @@ def load_model_and_tokenizer(
  enable_prefix_caching=False,
  enable_lora=model_config.adapter_base_model_id is not None,
  max_lora_rank=256,
+ # Special arguments in case we are dealing with a Mistral model
+ tokenizer_mode="mistral"
+ if isinstance(tokeniser, MistralCommonTokenizer)
+ else "auto",
+ config_format="mistral"
+ if isinstance(tokeniser, MistralCommonTokenizer)
+ else "auto",
+ load_format="mistral"
+ if isinstance(tokeniser, MistralCommonTokenizer)
+ else "auto",
  )
  except (RuntimeError, ValueError, OSError) as e:
  if "awaiting a review from the repo authors" in str(e):
  raise InvalidModel(
  f"The model {model_id!r} is awaiting a review from the repository "
  "authors. Please try again later."
- )
+ ) from e
  elif "trust_remote_code" in str(e):
  raise InvalidModel(
  f"Loading the model {model_id!r} needs to trust remote code. "
  "If you trust the suppliers of this model, then you can enable "
  "this by setting the `--trust-remote-code` flag."
- )
+ ) from e
  raise InvalidModel(
  f"The model {model_id!r} could not be loaded. The error was {e!r}."
- )
+ ) from e
 
  model.config = hf_model_config
 
- return model, tokenizer
+ return model, tokeniser
 
 
- def load_tokenizer(
+ def load_tokeniser(
  model_id: str,
  revision: str,
  adapter_base_model_id: str | None,
@@ -813,7 +866,7 @@ def load_tokenizer(
  model_cache_dir: str,
  token: str | bool,
  ) -> "PreTrainedTokenizer":
- """Load the tokenizer.
+ """Load the tokeniser.
 
  Args:
  model_id:
@@ -833,7 +886,7 @@ def load_tokenizer(
  The Hugging Face API token.
 
  Returns:
- The loaded tokenizer.
+ The loaded tokeniser.
  """
  revision = revision if adapter_base_model_id is None else "main"
  config = AutoConfig.from_pretrained(
@@ -846,7 +899,7 @@ def load_tokenizer(
  num_retries = 5
  for _ in range(num_retries):
  try:
- tokenizer = AutoTokenizer.from_pretrained(
+ tokeniser = AutoTokenizer.from_pretrained(
  model_id,
  use_fast=True,
  verbose=False,
@@ -861,30 +914,45 @@ def load_tokenizer(
  except (json.JSONDecodeError, OSError, TypeError) as e:
  if adapter_base_model_id is None or model_id == adapter_base_model_id:
  raise InvalidModel(
- f"Could not load tokenizer for model {model_id!r}. The error was "
+ f"Could not load tokeniser for model {model_id!r}. The error was "
  f"{str(e)}."
- )
+ ) from e
  logger.debug(
- f"Could not load tokenizer for {model_id!r}. Falling back to "
+ f"Could not load tokeniser for {model_id!r}. Falling back to "
  f"{adapter_base_model_id!r}."
  )
  model_id = adapter_base_model_id
  except (TimeoutError, RequestError):
- logger.info(f"Couldn't load tokenizer for {model_id!r}. Retrying.")
+ logger.info(f"Couldn't load tokeniser for {model_id!r}. Retrying.")
  sleep(5)
  continue
+ except (KeyError, ValueError) as e:
+ if "mistral" in str(e).lower():
+ tokeniser = MistralCommonTokenizer.from_pretrained(
+ model_id,
+ padding_side="left",
+ truncation_side="left",
+ model_max_length=model_max_length,
+ token=token,
+ )
+ break
+ raise InvalidModel(
+ f"Could not load tokeniser for model {model_id!r}. The error was "
+ f"{str(e)}."
+ ) from e
  else:
  raise InvalidModel(
- f"Could not load tokenizer for model {model_id!r} after {num_retries} "
+ f"Could not load tokeniser for model {model_id!r} after {num_retries} "
  "attempts."
  )
 
  # Ensure that BOS, EOS and PAD tokens are set
- tokenizer.bos_token, tokenizer.bos_token_id = get_bos_token(tokenizer=tokenizer)
- tokenizer.eos_token, tokenizer.eos_token_id = get_eos_token(tokenizer=tokenizer)
- tokenizer.pad_token, tokenizer.pad_token_id = get_pad_token(tokenizer=tokenizer)
+ if not isinstance(tokeniser, MistralCommonTokenizer):
+ tokeniser.bos_token, tokeniser.bos_token_id = get_bos_token(tokeniser=tokeniser)
+ tokeniser.eos_token, tokeniser.eos_token_id = get_eos_token(tokeniser=tokeniser)
+ tokeniser.pad_token, tokeniser.pad_token_id = get_pad_token(tokeniser=tokeniser)
 
- return tokenizer
+ return tokeniser
 
 
  def clear_vllm() -> None:
@@ -892,25 +960,21 @@ def clear_vllm() -> None:
  with contextlib.suppress(ValueError):
  destroy_model_parallel()
  destroy_distributed_environment()
- if ray.is_initialized():
- ray.shutdown()
  with contextlib.suppress(AssertionError):
  torch.distributed.destroy_process_group()
- if ray.is_initialized():
- ray.shutdown()
  clear_memory()
 
 
  def get_end_of_reasoning_token(
- model: "LLM", tokenizer: "PreTrainedTokenizer", model_id: str
+ model: "LLM", tokeniser: "PreTrainedTokenizer", model_id: str
  ) -> str | None:
  """Get the end-of-reasoning token for a generative model.
 
  Args:
  model:
  The vLLM model.
- tokenizer:
- The tokenizer.
+ tokeniser:
+ The tokeniser.
  model_id:
  The model ID.
 
@@ -919,11 +983,9 @@ def get_end_of_reasoning_token(
  """
  # Create a prompt to check if the model uses the reasoning tokens
  prompt = "What is your name?"
- if tokenizer.chat_template is not None:
- templated_prompt = tokenizer.apply_chat_template(
- conversation=[dict(role="user", content=prompt)],
- add_generation_prompt=True,
- tokenize=False,
+ if has_chat_template(tokeniser=tokeniser):
+ templated_prompt = apply_chat_template(
+ conversation=[dict(role="user", content=prompt)], tokeniser=tokeniser
  )
  assert isinstance(templated_prompt, str)
  prompt = templated_prompt
@@ -948,7 +1010,7 @@ def get_end_of_reasoning_token(
  f"The model {model_id!r} did not generate any beginning-of-reasoning "
  "tokens in the prompt or the completion. Assuming the model is not "
  "a reasoning model.",
- level=logging.INFO,
+ level=logging.DEBUG,
  )
  return None
 
@@ -974,7 +1036,7 @@ def get_end_of_reasoning_token(
  "the beginning-of-reasoning tokens "
  f"{[bor_token for bor_token, _ in bor_reasoning_matches]!r}. "
  "This is probably not correct, so please report this issue.",
- level=logging.INFO,
+ level=logging.WARNING,
  )
  return None
 
@@ -984,14 +1046,14 @@ def get_end_of_reasoning_token(
  f"model {model_id!r}. Using {eor_reasoning_matches[0]!r} as "
  "the reasoning token. If this is not the correct reasoning token, "
  "please report this issue.",
- level=logging.INFO,
+ level=logging.WARNING,
  )
 
  bor_token, eor_token = eor_reasoning_matches[0]
  log_once(
  f"Detected beginning-of-reasoning token {bor_token!r} and end-of-reasoning "
  f"token {eor_token!r} for model {model_id!r}.",
- level=logging.INFO,
+ level=logging.DEBUG,
  )
 
  return eor_token
@@ -999,7 +1061,7 @@ def get_end_of_reasoning_token(
 
  def get_custom_stop_tokens(
  model: "LLM",
- tokenizer: "PreTrainedTokenizer",
+ tokeniser: "PreTrainedTokenizer",
  model_id: str,
  is_reasoning_model: bool,
  ) -> list[str]:
@@ -1008,8 +1070,8 @@ def get_custom_stop_tokens(
  Args:
  model:
  The vLLM model.
- tokenizer:
- The tokenizer.
+ tokeniser:
+ The tokeniser.
  model_id:
  The model ID.
  is_reasoning_model:
@@ -1022,11 +1084,9 @@ def get_custom_stop_tokens(
  candidate_stop_tokens = CUSTOM_STOP_TOKENS
 
  prompt = "Hello"
- if tokenizer.chat_template is not None:
- templated_prompt = tokenizer.apply_chat_template(
- conversation=[dict(role="user", content=prompt)],
- add_generation_prompt=True,
- tokenize=False,
+ if has_chat_template(tokeniser=tokeniser):
+ templated_prompt = apply_chat_template(
+ conversation=[dict(role="user", content=prompt)], tokeniser=tokeniser
  )
  assert isinstance(templated_prompt, str)
  prompt = templated_prompt