EuroEval 15.15.0-py3-none-any.whl → 16.0.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of EuroEval might be problematic.
- euroeval/__init__.py +3 -7
- euroeval/benchmark_config_factory.py +3 -7
- euroeval/benchmark_modules/base.py +35 -19
- euroeval/benchmark_modules/fresh.py +24 -19
- euroeval/benchmark_modules/hf.py +136 -154
- euroeval/benchmark_modules/litellm.py +323 -193
- euroeval/benchmark_modules/vllm.py +166 -112
- euroeval/benchmarker.py +59 -33
- euroeval/cli.py +3 -3
- euroeval/constants.py +13 -15
- euroeval/data_loading.py +33 -28
- euroeval/data_models.py +53 -7
- euroeval/dataset_configs/__init__.py +2 -0
- euroeval/dataset_configs/danish.py +38 -1
- euroeval/dataset_configs/dutch.py +38 -1
- euroeval/dataset_configs/english.py +38 -1
- euroeval/dataset_configs/estonian.py +95 -0
- euroeval/dataset_configs/faroese.py +38 -0
- euroeval/dataset_configs/finnish.py +39 -1
- euroeval/dataset_configs/french.py +38 -1
- euroeval/dataset_configs/german.py +38 -1
- euroeval/dataset_configs/icelandic.py +39 -1
- euroeval/dataset_configs/italian.py +38 -1
- euroeval/dataset_configs/latvian.py +81 -0
- euroeval/dataset_configs/norwegian.py +38 -1
- euroeval/dataset_configs/portuguese.py +38 -1
- euroeval/dataset_configs/spanish.py +38 -1
- euroeval/dataset_configs/swedish.py +38 -1
- euroeval/enums.py +0 -6
- euroeval/finetuning.py +8 -7
- euroeval/generation.py +25 -14
- euroeval/generation_utils.py +46 -14
- euroeval/languages.py +947 -187
- euroeval/metrics/__init__.py +6 -0
- euroeval/metrics/base.py +76 -0
- euroeval/metrics/huggingface.py +192 -0
- euroeval/metrics/llm_as_a_judge.py +257 -0
- euroeval/metrics/pipeline.py +234 -0
- euroeval/metrics/speed.py +51 -0
- euroeval/prompt_templates/linguistic_acceptability.py +40 -2
- euroeval/prompt_templates/multiple_choice.py +23 -2
- euroeval/prompt_templates/named_entity_recognition.py +65 -2
- euroeval/prompt_templates/reading_comprehension.py +42 -2
- euroeval/prompt_templates/sentiment_classification.py +46 -2
- euroeval/prompt_templates/summarization.py +24 -4
- euroeval/scores.py +7 -2
- euroeval/speed_benchmark.py +6 -6
- euroeval/task_group_utils/multiple_choice_classification.py +17 -6
- euroeval/task_group_utils/question_answering.py +35 -28
- euroeval/task_group_utils/sequence_classification.py +96 -23
- euroeval/task_group_utils/text_to_text.py +7 -3
- euroeval/task_group_utils/token_classification.py +47 -75
- euroeval/tasks.py +31 -6
- euroeval/tokenization_utils.py +295 -207
- euroeval/utils.py +118 -34
- {euroeval-15.15.0.dist-info → euroeval-16.0.0.dist-info}/METADATA +12 -14
- euroeval-16.0.0.dist-info/RECORD +69 -0
- {euroeval-15.15.0.dist-info → euroeval-16.0.0.dist-info}/entry_points.txt +0 -1
- euroeval/human_evaluation.py +0 -738
- euroeval/metrics.py +0 -468
- euroeval-15.15.0.dist-info/RECORD +0 -63
- {euroeval-15.15.0.dist-info → euroeval-16.0.0.dist-info}/WHEEL +0 -0
- {euroeval-15.15.0.dist-info → euroeval-16.0.0.dist-info}/licenses/LICENSE +0 -0
euroeval/benchmark_modules/vllm.py

@@ -5,7 +5,6 @@ import contextlib
 import importlib.util
 import json
 import logging
-import os
 import re
 import typing as t
 from functools import partial
@@ -16,6 +15,7 @@ import torch
 from huggingface_hub import snapshot_download
 from pydantic import conlist, create_model
 from tqdm.auto import tqdm
+from transformers import MistralCommonTokenizer
 from transformers.models.auto.configuration_auto import AutoConfig
 from transformers.models.auto.tokenization_auto import AutoTokenizer
 from urllib3.exceptions import RequestError
@@ -24,11 +24,10 @@ from ..constants import (
     CUSTOM_STOP_TOKENS,
     GENERATIVE_PIPELINE_TAGS,
     MAX_CONTEXT_LENGTH,
-
+    MAX_VLLM_LOGPROBS,
     MERGE_TAGS,
     REASONING_MAX_TOKENS,
     REASONING_TOKENS,
-    TASKS_USING_JSON,
     VLLM_BF16_MIN_CUDA_COMPUTE_CAPABILITY,
 )
 from ..data_models import GenerativeModelOutput, ModelConfig
@@ -54,17 +53,20 @@ from ..task_group_utils import (
     token_classification,
 )
 from ..tokenization_utils import (
+    apply_chat_template,
     get_bos_token,
     get_end_of_chat_token_ids,
     get_eos_token,
     get_first_label_token_mapping,
     get_pad_token,
+    has_chat_template,
     should_prompts_be_stripped,
 )
 from ..types import ExtractLabelsFunction
 from ..utils import (
     clear_memory,
     create_model_cache_dir,
+    get_hf_token,
     get_min_cuda_compute_capability,
     log_once,
 )
@@ -79,9 +81,6 @@ if t.TYPE_CHECKING or importlib.util.find_spec("vllm") is not None:
     from vllm.lora.request import LoRARequest
     from vllm.sampling_params import GuidedDecodingParams

-if t.TYPE_CHECKING or importlib.util.find_spec("ray") is not None:
-    import ray
-
 if t.TYPE_CHECKING:
     from datasets import DatasetDict
     from transformers.tokenization_utils import PreTrainedTokenizer
@@ -104,6 +103,7 @@ class VLLMModel(HuggingFaceEncoderModel):
         model_config: "ModelConfig",
         dataset_config: "DatasetConfig",
         benchmark_config: "BenchmarkConfig",
+        log_metadata: bool = True,
     ) -> None:
         """Initialise the vLLM model.

@@ -114,27 +114,26 @@ class VLLMModel(HuggingFaceEncoderModel):
                 The dataset configuration.
             benchmark_config:
                 The benchmark configuration.
+            log_metadata:
+                Whether to log the model and dataset metadata.
         """
-        if (
-            importlib.util.find_spec("vllm") is None
-            or importlib.util.find_spec("ray") is None
-        ):
+        if importlib.util.find_spec("vllm") is None:
             raise NeedsExtraInstalled(extra="generative")

-        model,
+        model, tokeniser = load_model_and_tokeniser(
             model_config=model_config, benchmark_config=benchmark_config
         )
         self._model: "LLM" = model
-        self.
+        self._tokeniser: "PreTrainedTokenizer" = tokeniser
         self.end_of_reasoning_token = get_end_of_reasoning_token(
-            model=self._model,
+            model=self._model, tokeniser=self._tokeniser, model_id=model_config.model_id
         )
         self.end_of_chat_token_ids = get_end_of_chat_token_ids(
-
+            tokeniser=self._tokeniser
         )
         self.custom_stop_tokens = get_custom_stop_tokens(
             model=self._model,
-
+            tokeniser=self._tokeniser,
             model_id=model_config.model_id,
             is_reasoning_model=self.end_of_reasoning_token is not None,
         )
@@ -145,15 +144,17 @@ class VLLMModel(HuggingFaceEncoderModel):
             model_config=model_config,
             dataset_config=dataset_config,
             benchmark_config=benchmark_config,
+            log_metadata=log_metadata,
         )

         self.buffer |= dict(
-            instruction_model=self.
+            instruction_model=has_chat_template(tokeniser=self._tokeniser),
             first_label_token_mapping=get_first_label_token_mapping(
                 dataset_config=self.dataset_config,
                 model_config=self.model_config,
-
+                tokeniser=self._tokeniser,
                 generative_type=self.generative_type,
+                log_metadata=self.log_metadata,
             ),
         )
         if self.model_config.adapter_base_model_id is not None:
@@ -167,12 +168,16 @@ class VLLMModel(HuggingFaceEncoderModel):
             )

     def __del__(self) -> None:
-        """Clean up the model and
-
+        """Clean up the model and tokeniser."""
+        try:
+            if importlib.util.find_spec("vllm") is not None:
+                clear_vllm()
+        except ImportError:
+            pass
         if hasattr(self, "_model"):
             del self._model
-        if hasattr(self, "
-            del self.
+        if hasattr(self, "_tokeniser"):
+            del self._tokeniser

     @property
     def generative_type(self) -> GenerativeType | None:
@@ -181,12 +186,12 @@ class VLLMModel(HuggingFaceEncoderModel):
         Returns:
             The generative type of the model, or None if it has not been set yet.
         """
-        if not hasattr(self, "
+        if not hasattr(self, "_tokeniser"):
             return None
         elif self.end_of_reasoning_token is not None:
             return GenerativeType.REASONING
         elif (
-            self.
+            has_chat_template(tokeniser=self._tokeniser)
             or "instruct" in self.model_config.model_id.lower()
         ):
             return GenerativeType.INSTRUCTION_TUNED
@@ -266,7 +271,10 @@ class VLLMModel(HuggingFaceEncoderModel):

         if self.benchmark_config.few_shot:
             few_shot_examples = extract_few_shot_examples(
-                dataset=dataset,
+                dataset=dataset,
+                dataset_config=self.dataset_config,
+                benchmark_config=self.benchmark_config,
+                itr_idx=itr_idx,
             )
         else:
             few_shot_examples = list()
@@ -279,7 +287,7 @@ class VLLMModel(HuggingFaceEncoderModel):
                 dataset_config=self.dataset_config,
                 instruction_model=self.buffer["instruction_model"],
                 always_populate_text_field=True,
-
+                tokeniser=self._tokeniser,
             ),
             batched=True,
             load_from_cache_file=False,
@@ -297,35 +305,40 @@ class VLLMModel(HuggingFaceEncoderModel):

         Returns:
             The generated model outputs.
+
+        Raises:
+            InvalidBenchmark:
+                If the dataset requires logprobs, but we could not get the first token
+                of each label in the dataset.
         """
         # Get stopping tokens
         stop_tokens: list[str] = self.custom_stop_tokens.copy()
         if self.buffer["instruction_model"] is False:
             stop_tokens.append("\n\n")
-        if self.
-            assert isinstance(self.
+        if self._tokeniser.pad_token_id is not None:
+            assert isinstance(self._tokeniser.pad_token, str), (
                 f"The pad token for the model {self.model_config.model_id!r} "
-                f"is not a string, which is unexpected: {self.
+                f"is not a string, which is unexpected: {self._tokeniser.pad_token!r}."
             )
-            stop_tokens.append(self.
-        if self.
-            assert isinstance(self.
+            stop_tokens.append(self._tokeniser.pad_token)
+        if self._tokeniser.eos_token_id is not None:
+            assert isinstance(self._tokeniser.eos_token, str), (
                 f"The EOS token for the model {self.model_config.model_id!r} "
-                f"is not a string, which is unexpected: {self.
+                f"is not a string, which is unexpected: {self._tokeniser.eos_token!r}."
             )
-            stop_tokens.append(self.
-        if self.
-            self.
-            self.
+            stop_tokens.append(self._tokeniser.eos_token)
+        if self._tokeniser.pad_token_id is None:
+            self._tokeniser.pad_token_id = self._tokeniser.eos_token_id
+            self._tokeniser.pad_token = self._tokeniser.eos_token
         if self.end_of_chat_token_ids is not None:
-            end_of_chat_token = self.
+            end_of_chat_token = self._tokeniser.decode(
                 self.end_of_chat_token_ids
             ).strip()
             if end_of_chat_token:
                 stop_tokens.append(end_of_chat_token)

         structured_generation_schema = None
-        if self.dataset_config.task
+        if self.dataset_config.task.uses_structured_output:
             if self.generative_type == GenerativeType.REASONING:
                 log_once(
                     f"The model {self.model_config.model_id!r} is a reasoning model "
@@ -354,9 +367,33 @@ class VLLMModel(HuggingFaceEncoderModel):
         self.buffer["first_label_token_mapping"] = get_first_label_token_mapping(
             dataset_config=self.dataset_config,
             model_config=self.model_config,
-
+            tokeniser=self._tokeniser,
             generative_type=self.generative_type,
+            log_metadata=self.log_metadata,
         )
+        if (
+            not self.buffer["first_label_token_mapping"]
+            and self.dataset_config.task.requires_logprobs
+        ):
+            raise InvalidBenchmark(
+                "The dataset requires logprobs, but we encountered an error when "
+                "trying to get the first token of each label in the dataset. You can "
+                "try running this benchmark with the --verbose flag to see what the "
+                "error was. Skipping this evaluation."
+            )
+
+        # Define the guided decoding that we will use for structured generation
+        if structured_generation_schema is not None:
+            guided_decoding = GuidedDecodingParams(json=structured_generation_schema)
+        elif self.dataset_config.task.uses_logprobs and self.dataset_config.labels:
+            guided_decoding = GuidedDecodingParams(
+                choice=[
+                    self.dataset_config.prompt_label_mapping[label]
+                    for label in self.dataset_config.labels
+                ]
+            )
+        else:
+            guided_decoding = None

         # Define the parameters used for vLLM generation
         max_tokens: int = (
@@ -366,14 +403,12 @@ class VLLMModel(HuggingFaceEncoderModel):
         )
         sampling_params = SamplingParams(
             max_tokens=max_tokens,
-            logprobs=
+            logprobs=MAX_VLLM_LOGPROBS
+            if self.buffer["first_label_token_mapping"]
+            else None,
             temperature=0.0,
             stop=[stop_token for stop_token in stop_tokens if stop_token],
-            guided_decoding=
-                GuidedDecodingParams(json=structured_generation_schema)
-                if structured_generation_schema
-                else None
-            ),
+            guided_decoding=guided_decoding,
         )

         # If any of the prompts are empty then we need to replace them with a BOS token
@@ -382,7 +417,7 @@ class VLLMModel(HuggingFaceEncoderModel):
         if any(len(prompt) == 0 for prompt in prompts):
             logger.debug("Found empty prompts, replacing with BOS token.")
             prompts = [
-                prompt if len(prompt) > 0 else str(self.
+                prompt if len(prompt) > 0 else str(self._tokeniser.bos_token)
                 for prompt in prompts
             ]

@@ -393,7 +428,7 @@ class VLLMModel(HuggingFaceEncoderModel):
         if not self.buffer.get(
             "instruction_model", False
         ) and should_prompts_be_stripped(
-            labels_to_be_generated=labels_to_be_generated,
+            labels_to_be_generated=labels_to_be_generated, tokeniser=self._tokeniser
         ):
             log_once(
                 f"Stripping prompts for model {self.model_config.model_id!r}.",
@@ -431,22 +466,22 @@ class VLLMModel(HuggingFaceEncoderModel):
                     "Prompts are too long, so truncating them and trying again..."
                 )
                 logger.debug(f"The error message was: {str(e)}")
-                tokenized_prompts = self.
+                tokenized_prompts = self._tokeniser(
                     text=prompts,
                     truncation=True,
                     max_length=max(
-                        min(self.
+                        min(self._tokeniser.model_max_length, MAX_CONTEXT_LENGTH)
                         - max_tokens,
                         0,
                     ),
                 )
-                prompts = self.
+                prompts = self._tokeniser.batch_decode(
                     sequences=tokenized_prompts.input_ids, skip_special_tokens=True
                 )
             else:
                 raise InvalidBenchmark(
                     f"An error occurred during vLLM generation: {str(e)}"
-                )
+                ) from e
         else:
             raise InvalidBenchmark(
                 f"Could not generate sequences after {num_attempts} attempts."
@@ -476,7 +511,7 @@ class VLLMModel(HuggingFaceEncoderModel):
         completion_ids: list[list[int]] = [
             output.outputs[0].token_ids for output in raw_outputs
         ]
-        completions = self.
+        completions = self._tokeniser.batch_decode(
             sequences=[
                 torch.LongTensor(completion_id) for completion_id in completion_ids
             ]
@@ -624,10 +659,10 @@ class VLLMModel(HuggingFaceEncoderModel):
         )


-def
+def load_model_and_tokeniser(
     model_config: "ModelConfig", benchmark_config: "BenchmarkConfig"
 ) -> tuple["LLM", "PreTrainedTokenizer"]:
-    """Load the model and
+    """Load the model and tokeniser.

     Args:
         model_config:
@@ -636,7 +671,7 @@ def load_model_and_tokenizer(
             The benchmark configuration.

     Returns:
-        A pair (model,
+        A pair (model, tokeniser), with the loaded model and tokeniser
     """
     # Prefer base model ID if the model is an adapter - the adapter will be added on
     # during inference in this case
@@ -674,7 +709,7 @@ def load_model_and_tokenizer(
     dtype: str | torch.dtype = "auto"

     # Choose bf16 over fp16 if the model is a fp32 model and the GPU supports it
-    if hf_model_config.
+    if hf_model_config.dtype == torch.float32:
         if torch.cuda.is_bf16_supported():
             logger.info(
                 "You are loading a model with dtype FP32, which we will convert to "
@@ -690,29 +725,33 @@ def load_model_and_tokenizer(
             )
             dtype = torch.float16

-    # If the model is a quantized model, we need to
-    if quantization
+    # If the model is a quantized model, we might need to change the dtype
+    if quantization == "mxfp4" and hf_model_config.dtype is None:
+        dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
+        logger.debug(
+            "You are loading a quantized model where `dtype` has not been set. "
+            f"Setting dtype to {dtype!r}."
+        )
+    elif quantization is not None and hf_model_config.dtype != torch.float16:
         logger.info(
             "You are loading a quantized model with dtype "
-            f"{hf_model_config.
+            f"{hf_model_config.dtype}, which vLLM does not support. Setting "
             "dtype to float16 instead."
         )
         dtype = torch.float16

     # If the model is a bf16 model, we need to check the CUDA compute capability
-    if hf_model_config.
+    if hf_model_config.dtype == torch.bfloat16:
         min_cuda_compute_capability = get_min_cuda_compute_capability()
         required_capability = VLLM_BF16_MIN_CUDA_COMPUTE_CAPABILITY

         if min_cuda_compute_capability is not None:
             if min_cuda_compute_capability < required_capability:
                 logger.info(
-                    "You are loading a model with "
-
-                    "
-                    f"
-                    "You are using one or more devices with "
-                    f"compute capability {min_cuda_compute_capability}. "
+                    f"You are loading a model with dtype {hf_model_config.dtype}, "
+                    "which vLLM only supports for CUDA devices with CUDA compute "
+                    f"capability >={required_capability}. You are using one or more "
+                    f"devices with compute capability {min_cuda_compute_capability}. "
                     "Setting dtype to float16 instead."
                 )
                 dtype = torch.float16
@@ -740,14 +779,14 @@ def load_model_and_tokenizer(
     else:
         true_max_model_len = MAX_CONTEXT_LENGTH

-
+    tokeniser = load_tokeniser(
         model_id=model_config.model_id,
         revision=model_config.revision,
         adapter_base_model_id=model_config.adapter_base_model_id,
         trust_remote_code=benchmark_config.trust_remote_code,
         model_max_length=true_max_model_len,
         model_cache_dir=model_config.model_cache_dir,
-        token=benchmark_config.api_key
+        token=get_hf_token(api_key=benchmark_config.api_key),
     )

     clear_vllm()
@@ -762,9 +801,7 @@ def load_model_and_tokenizer(
             trust_remote_code=benchmark_config.trust_remote_code,
             revision=revision,
             seed=4242,
-            distributed_executor_backend=
-                "ray" if torch.cuda.device_count() > 1 else "mp"
-            ),
+            distributed_executor_backend="mp",
             tensor_parallel_size=torch.cuda.device_count(),
             disable_custom_all_reduce=True,
             quantization=quantization,
@@ -775,29 +812,39 @@ def load_model_and_tokenizer(
             enable_prefix_caching=False,
             enable_lora=model_config.adapter_base_model_id is not None,
             max_lora_rank=256,
+            # Special arguments in case we are dealing with a Mistral model
+            tokenizer_mode="mistral"
+            if isinstance(tokeniser, MistralCommonTokenizer)
+            else "auto",
+            config_format="mistral"
+            if isinstance(tokeniser, MistralCommonTokenizer)
+            else "auto",
+            load_format="mistral"
+            if isinstance(tokeniser, MistralCommonTokenizer)
+            else "auto",
         )
     except (RuntimeError, ValueError, OSError) as e:
         if "awaiting a review from the repo authors" in str(e):
             raise InvalidModel(
                 f"The model {model_id!r} is awaiting a review from the repository "
                 "authors. Please try again later."
-            )
+            ) from e
         elif "trust_remote_code" in str(e):
             raise InvalidModel(
                 f"Loading the model {model_id!r} needs to trust remote code. "
                 "If you trust the suppliers of this model, then you can enable "
                 "this by setting the `--trust-remote-code` flag."
-            )
+            ) from e
         raise InvalidModel(
             f"The model {model_id!r} could not be loaded. The error was {e!r}."
-        )
+        ) from e

     model.config = hf_model_config

-    return model,
+    return model, tokeniser


-def
+def load_tokeniser(
     model_id: str,
     revision: str,
     adapter_base_model_id: str | None,
@@ -806,7 +853,7 @@ def load_tokenizer(
     model_cache_dir: str,
     token: str | bool,
 ) -> "PreTrainedTokenizer":
-    """Load the
+    """Load the tokeniser.

     Args:
         model_id:
@@ -826,7 +873,7 @@ def load_tokenizer(
             The Hugging Face API token.

     Returns:
-        The loaded
+        The loaded tokeniser.
     """
     revision = revision if adapter_base_model_id is None else "main"
     config = AutoConfig.from_pretrained(
@@ -839,7 +886,7 @@ def load_tokenizer(
     num_retries = 5
     for _ in range(num_retries):
         try:
-
+            tokeniser = AutoTokenizer.from_pretrained(
                 model_id,
                 use_fast=True,
                 verbose=False,
@@ -854,30 +901,45 @@ def load_tokenizer(
         except (json.JSONDecodeError, OSError, TypeError) as e:
             if adapter_base_model_id is None or model_id == adapter_base_model_id:
                 raise InvalidModel(
-                    f"Could not load
+                    f"Could not load tokeniser for model {model_id!r}. The error was "
                     f"{str(e)}."
-                )
+                ) from e
             logger.debug(
-                f"Could not load
+                f"Could not load tokeniser for {model_id!r}. Falling back to "
                 f"{adapter_base_model_id!r}."
             )
             model_id = adapter_base_model_id
         except (TimeoutError, RequestError):
-            logger.info(f"Couldn't load
+            logger.info(f"Couldn't load tokeniser for {model_id!r}. Retrying.")
            sleep(5)
            continue
+        except (KeyError, ValueError) as e:
+            if "mistral" in str(e).lower():
+                tokeniser = MistralCommonTokenizer.from_pretrained(
+                    model_id,
+                    padding_side="left",
+                    truncation_side="left",
+                    model_max_length=model_max_length,
+                    token=token,
+                )
+                break
+            raise InvalidModel(
+                f"Could not load tokeniser for model {model_id!r}. The error was "
+                f"{str(e)}."
+            ) from e
     else:
         raise InvalidModel(
-            f"Could not load
+            f"Could not load tokeniser for model {model_id!r} after {num_retries} "
             "attempts."
         )

     # Ensure that BOS, EOS and PAD tokens are set
-
-
-
+    if not isinstance(tokeniser, MistralCommonTokenizer):
+        tokeniser.bos_token, tokeniser.bos_token_id = get_bos_token(tokeniser=tokeniser)
+        tokeniser.eos_token, tokeniser.eos_token_id = get_eos_token(tokeniser=tokeniser)
+        tokeniser.pad_token, tokeniser.pad_token_id = get_pad_token(tokeniser=tokeniser)

-    return
+    return tokeniser


 def clear_vllm() -> None:
@@ -885,25 +947,21 @@ def clear_vllm() -> None:
     with contextlib.suppress(ValueError):
         destroy_model_parallel()
         destroy_distributed_environment()
-        if ray.is_initialized():
-            ray.shutdown()
     with contextlib.suppress(AssertionError):
         torch.distributed.destroy_process_group()
-        if ray.is_initialized():
-            ray.shutdown()
     clear_memory()


 def get_end_of_reasoning_token(
-    model: "LLM",
+    model: "LLM", tokeniser: "PreTrainedTokenizer", model_id: str
 ) -> str | None:
     """Get the end-of-reasoning token for a generative model.

     Args:
         model:
             The vLLM model.
-
-            The
+        tokeniser:
+            The tokeniser.
         model_id:
             The model ID.

@@ -912,11 +970,9 @@ def get_end_of_reasoning_token(
     """
     # Create a prompt to check if the model uses the reasoning tokens
     prompt = "What is your name?"
-    if
-        templated_prompt =
-            conversation=[dict(role="user", content=prompt)],
-            add_generation_prompt=True,
-            tokenize=False,
+    if has_chat_template(tokeniser=tokeniser):
+        templated_prompt = apply_chat_template(
+            conversation=[dict(role="user", content=prompt)], tokeniser=tokeniser
         )
         assert isinstance(templated_prompt, str)
         prompt = templated_prompt
@@ -941,7 +997,7 @@ def get_end_of_reasoning_token(
             f"The model {model_id!r} did not generate any beginning-of-reasoning "
             "tokens in the prompt or the completion. Assuming the model is not "
             "a reasoning model.",
-            level=logging.
+            level=logging.DEBUG,
         )
         return None

@@ -967,7 +1023,7 @@ def get_end_of_reasoning_token(
             "the beginning-of-reasoning tokens "
             f"{[bor_token for bor_token, _ in bor_reasoning_matches]!r}. "
             "This is probably not correct, so please report this issue.",
-            level=logging.
+            level=logging.WARNING,
         )
         return None

@@ -977,14 +1033,14 @@ def get_end_of_reasoning_token(
             f"model {model_id!r}. Using {eor_reasoning_matches[0]!r} as "
             "the reasoning token. If this is not the correct reasoning token, "
             "please report this issue.",
-            level=logging.
+            level=logging.WARNING,
         )

     bor_token, eor_token = eor_reasoning_matches[0]
     log_once(
         f"Detected beginning-of-reasoning token {bor_token!r} and end-of-reasoning "
         f"token {eor_token!r} for model {model_id!r}.",
-        level=logging.
+        level=logging.DEBUG,
     )

     return eor_token
@@ -992,7 +1048,7 @@ def get_end_of_reasoning_token(

 def get_custom_stop_tokens(
     model: "LLM",
-
+    tokeniser: "PreTrainedTokenizer",
     model_id: str,
     is_reasoning_model: bool,
 ) -> list[str]:
@@ -1001,8 +1057,8 @@ def get_custom_stop_tokens(
     Args:
         model:
             The vLLM model.
-
-            The
+        tokeniser:
+            The tokeniser.
         model_id:
             The model ID.
         is_reasoning_model:
@@ -1015,11 +1071,9 @@ def get_custom_stop_tokens(
     candidate_stop_tokens = CUSTOM_STOP_TOKENS

     prompt = "Hello"
-    if
-        templated_prompt =
-            conversation=[dict(role="user", content=prompt)],
-            add_generation_prompt=True,
-            tokenize=False,
+    if has_chat_template(tokeniser=tokeniser):
+        templated_prompt = apply_chat_template(
+            conversation=[dict(role="user", content=prompt)], tokeniser=tokeniser
         )
         assert isinstance(templated_prompt, str)
         prompt = templated_prompt