EuroEval 15.16.0-py3-none-any.whl → 16.0.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- euroeval/__init__.py +3 -7
- euroeval/benchmark_config_factory.py +3 -7
- euroeval/benchmark_modules/base.py +35 -19
- euroeval/benchmark_modules/fresh.py +24 -19
- euroeval/benchmark_modules/hf.py +136 -154
- euroeval/benchmark_modules/litellm.py +190 -110
- euroeval/benchmark_modules/vllm.py +161 -114
- euroeval/benchmarker.py +49 -22
- euroeval/cli.py +3 -3
- euroeval/constants.py +13 -15
- euroeval/data_loading.py +33 -28
- euroeval/data_models.py +53 -7
- euroeval/dataset_configs/__init__.py +2 -0
- euroeval/dataset_configs/danish.py +38 -1
- euroeval/dataset_configs/dutch.py +38 -1
- euroeval/dataset_configs/english.py +38 -1
- euroeval/dataset_configs/estonian.py +95 -0
- euroeval/dataset_configs/faroese.py +38 -0
- euroeval/dataset_configs/finnish.py +39 -1
- euroeval/dataset_configs/french.py +38 -1
- euroeval/dataset_configs/german.py +38 -1
- euroeval/dataset_configs/icelandic.py +39 -1
- euroeval/dataset_configs/italian.py +38 -1
- euroeval/dataset_configs/latvian.py +81 -0
- euroeval/dataset_configs/norwegian.py +38 -1
- euroeval/dataset_configs/portuguese.py +38 -1
- euroeval/dataset_configs/spanish.py +38 -1
- euroeval/dataset_configs/swedish.py +38 -1
- euroeval/enums.py +0 -6
- euroeval/finetuning.py +6 -6
- euroeval/generation.py +25 -14
- euroeval/generation_utils.py +46 -14
- euroeval/languages.py +947 -187
- euroeval/metrics/__init__.py +6 -0
- euroeval/metrics/base.py +76 -0
- euroeval/metrics/huggingface.py +192 -0
- euroeval/metrics/llm_as_a_judge.py +257 -0
- euroeval/metrics/pipeline.py +234 -0
- euroeval/metrics/speed.py +51 -0
- euroeval/prompt_templates/linguistic_acceptability.py +40 -2
- euroeval/prompt_templates/multiple_choice.py +23 -2
- euroeval/prompt_templates/named_entity_recognition.py +65 -2
- euroeval/prompt_templates/reading_comprehension.py +42 -2
- euroeval/prompt_templates/sentiment_classification.py +46 -2
- euroeval/prompt_templates/summarization.py +24 -4
- euroeval/scores.py +7 -2
- euroeval/speed_benchmark.py +6 -6
- euroeval/task_group_utils/multiple_choice_classification.py +17 -6
- euroeval/task_group_utils/question_answering.py +35 -28
- euroeval/task_group_utils/sequence_classification.py +96 -23
- euroeval/task_group_utils/text_to_text.py +7 -3
- euroeval/task_group_utils/token_classification.py +47 -75
- euroeval/tasks.py +31 -6
- euroeval/tokenization_utils.py +295 -207
- euroeval/utils.py +118 -34
- {euroeval-15.16.0.dist-info → euroeval-16.0.0.dist-info}/METADATA +11 -14
- euroeval-16.0.0.dist-info/RECORD +69 -0
- {euroeval-15.16.0.dist-info → euroeval-16.0.0.dist-info}/entry_points.txt +0 -1
- euroeval/human_evaluation.py +0 -738
- euroeval/metrics.py +0 -470
- euroeval-15.16.0.dist-info/RECORD +0 -63
- {euroeval-15.16.0.dist-info → euroeval-16.0.0.dist-info}/WHEEL +0 -0
- {euroeval-15.16.0.dist-info → euroeval-16.0.0.dist-info}/licenses/LICENSE +0 -0
euroeval/benchmark_modules/vllm.py

@@ -5,7 +5,6 @@ import contextlib
 import importlib.util
 import json
 import logging
-import os
 import re
 import typing as t
 from functools import partial
@@ -16,6 +15,7 @@ import torch
 from huggingface_hub import snapshot_download
 from pydantic import conlist, create_model
 from tqdm.auto import tqdm
+from transformers import MistralCommonTokenizer
 from transformers.models.auto.configuration_auto import AutoConfig
 from transformers.models.auto.tokenization_auto import AutoTokenizer
 from urllib3.exceptions import RequestError
@@ -24,11 +24,10 @@ from ..constants import (
     CUSTOM_STOP_TOKENS,
     GENERATIVE_PIPELINE_TAGS,
     MAX_CONTEXT_LENGTH,
-    …
+    MAX_VLLM_LOGPROBS,
     MERGE_TAGS,
     REASONING_MAX_TOKENS,
     REASONING_TOKENS,
-    TASKS_USING_JSON,
     VLLM_BF16_MIN_CUDA_COMPUTE_CAPABILITY,
 )
 from ..data_models import GenerativeModelOutput, ModelConfig
@@ -54,17 +53,20 @@ from ..task_group_utils import (
     token_classification,
 )
 from ..tokenization_utils import (
+    apply_chat_template,
     get_bos_token,
     get_end_of_chat_token_ids,
     get_eos_token,
     get_first_label_token_mapping,
     get_pad_token,
+    has_chat_template,
     should_prompts_be_stripped,
 )
 from ..types import ExtractLabelsFunction
 from ..utils import (
     clear_memory,
     create_model_cache_dir,
+    get_hf_token,
     get_min_cuda_compute_capability,
     log_once,
 )
@@ -79,9 +81,6 @@ if t.TYPE_CHECKING or importlib.util.find_spec("vllm") is not None:
     from vllm.lora.request import LoRARequest
     from vllm.sampling_params import GuidedDecodingParams

-if t.TYPE_CHECKING or importlib.util.find_spec("ray") is not None:
-    import ray
-
 if t.TYPE_CHECKING:
     from datasets import DatasetDict
     from transformers.tokenization_utils import PreTrainedTokenizer
@@ -104,6 +103,7 @@ class VLLMModel(HuggingFaceEncoderModel):
         model_config: "ModelConfig",
         dataset_config: "DatasetConfig",
         benchmark_config: "BenchmarkConfig",
+        log_metadata: bool = True,
     ) -> None:
         """Initialise the vLLM model.

@@ -114,27 +114,26 @@ class VLLMModel(HuggingFaceEncoderModel):
                 The dataset configuration.
             benchmark_config:
                 The benchmark configuration.
+            log_metadata:
+                Whether to log the model and dataset metadata.
         """
-        if (
-            importlib.util.find_spec("vllm") is None
-            or importlib.util.find_spec("ray") is None
-        ):
+        if importlib.util.find_spec("vllm") is None:
             raise NeedsExtraInstalled(extra="generative")

-        model, tokenizer = load_model_and_tokenizer(
+        model, tokeniser = load_model_and_tokeniser(
             model_config=model_config, benchmark_config=benchmark_config
         )
         self._model: "LLM" = model
-        self._tokenizer: "PreTrainedTokenizer" = tokenizer
+        self._tokeniser: "PreTrainedTokenizer" = tokeniser
         self.end_of_reasoning_token = get_end_of_reasoning_token(
-            model=self._model, tokenizer=self._tokenizer, model_id=model_config.model_id
+            model=self._model, tokeniser=self._tokeniser, model_id=model_config.model_id
         )
         self.end_of_chat_token_ids = get_end_of_chat_token_ids(
-            tokenizer=self._tokenizer
+            tokeniser=self._tokeniser
         )
         self.custom_stop_tokens = get_custom_stop_tokens(
             model=self._model,
-            tokenizer=self._tokenizer,
+            tokeniser=self._tokeniser,
             model_id=model_config.model_id,
             is_reasoning_model=self.end_of_reasoning_token is not None,
         )
@@ -145,15 +144,17 @@ class VLLMModel(HuggingFaceEncoderModel):
             model_config=model_config,
             dataset_config=dataset_config,
             benchmark_config=benchmark_config,
+            log_metadata=log_metadata,
         )

         self.buffer |= dict(
-            instruction_model=self.…
+            instruction_model=has_chat_template(tokeniser=self._tokeniser),
             first_label_token_mapping=get_first_label_token_mapping(
                 dataset_config=self.dataset_config,
                 model_config=self.model_config,
-                tokenizer=self._tokenizer,
+                tokeniser=self._tokeniser,
                 generative_type=self.generative_type,
+                log_metadata=self.log_metadata,
             ),
         )
         if self.model_config.adapter_base_model_id is not None:
@@ -167,13 +168,16 @@ class VLLMModel(HuggingFaceEncoderModel):
         )

     def __del__(self) -> None:
-        """Clean up the model and tokenizer."""
-        …
-        …
+        """Clean up the model and tokeniser."""
+        try:
+            if importlib.util.find_spec("vllm") is not None:
+                clear_vllm()
+        except ImportError:
+            pass
         if hasattr(self, "_model"):
             del self._model
-        if hasattr(self, "_tokenizer"):
-            del self._tokenizer
+        if hasattr(self, "_tokeniser"):
+            del self._tokeniser

     @property
     def generative_type(self) -> GenerativeType | None:
@@ -182,12 +186,12 @@ class VLLMModel(HuggingFaceEncoderModel):
         Returns:
             The generative type of the model, or None if it has not been set yet.
         """
-        if not hasattr(self, "_tokenizer"):
+        if not hasattr(self, "_tokeniser"):
             return None
         elif self.end_of_reasoning_token is not None:
             return GenerativeType.REASONING
         elif (
-            self.…
+            has_chat_template(tokeniser=self._tokeniser)
             or "instruct" in self.model_config.model_id.lower()
         ):
             return GenerativeType.INSTRUCTION_TUNED
@@ -267,7 +271,10 @@ class VLLMModel(HuggingFaceEncoderModel):

         if self.benchmark_config.few_shot:
             few_shot_examples = extract_few_shot_examples(
-                dataset=dataset, …
+                dataset=dataset,
+                dataset_config=self.dataset_config,
+                benchmark_config=self.benchmark_config,
+                itr_idx=itr_idx,
             )
         else:
             few_shot_examples = list()
@@ -280,7 +287,7 @@ class VLLMModel(HuggingFaceEncoderModel):
                 dataset_config=self.dataset_config,
                 instruction_model=self.buffer["instruction_model"],
                 always_populate_text_field=True,
-                tokenizer=self._tokenizer,
+                tokeniser=self._tokeniser,
             ),
             batched=True,
             load_from_cache_file=False,
@@ -298,35 +305,40 @@ class VLLMModel(HuggingFaceEncoderModel):

         Returns:
             The generated model outputs.
+
+        Raises:
+            InvalidBenchmark:
+                If the dataset requires logprobs, but we could not get the first token
+                of each label in the dataset.
         """
         # Get stopping tokens
         stop_tokens: list[str] = self.custom_stop_tokens.copy()
         if self.buffer["instruction_model"] is False:
             stop_tokens.append("\n\n")
-        if self._tokenizer.pad_token_id is not None:
-            assert isinstance(self._tokenizer.pad_token, str), (
+        if self._tokeniser.pad_token_id is not None:
+            assert isinstance(self._tokeniser.pad_token, str), (
                 f"The pad token for the model {self.model_config.model_id!r} "
-                f"is not a string, which is unexpected: {self._tokenizer.pad_token!r}."
+                f"is not a string, which is unexpected: {self._tokeniser.pad_token!r}."
             )
-            stop_tokens.append(self._tokenizer.pad_token)
-        if self._tokenizer.eos_token_id is not None:
-            assert isinstance(self._tokenizer.eos_token, str), (
+            stop_tokens.append(self._tokeniser.pad_token)
+        if self._tokeniser.eos_token_id is not None:
+            assert isinstance(self._tokeniser.eos_token, str), (
                 f"The EOS token for the model {self.model_config.model_id!r} "
-                f"is not a string, which is unexpected: {self._tokenizer.eos_token!r}."
+                f"is not a string, which is unexpected: {self._tokeniser.eos_token!r}."
             )
-            stop_tokens.append(self._tokenizer.eos_token)
-            if self._tokenizer.pad_token_id is None:
-                self._tokenizer.pad_token_id = self._tokenizer.eos_token_id
-                self._tokenizer.pad_token = self._tokenizer.eos_token
+            stop_tokens.append(self._tokeniser.eos_token)
+            if self._tokeniser.pad_token_id is None:
+                self._tokeniser.pad_token_id = self._tokeniser.eos_token_id
+                self._tokeniser.pad_token = self._tokeniser.eos_token
         if self.end_of_chat_token_ids is not None:
-            end_of_chat_token = self._tokenizer.decode(
+            end_of_chat_token = self._tokeniser.decode(
                 self.end_of_chat_token_ids
             ).strip()
             if end_of_chat_token:
                 stop_tokens.append(end_of_chat_token)

         structured_generation_schema = None
-        if self.dataset_config.task…
+        if self.dataset_config.task.uses_structured_output:
             if self.generative_type == GenerativeType.REASONING:
                 log_once(
                     f"The model {self.model_config.model_id!r} is a reasoning model "
@@ -355,9 +367,33 @@ class VLLMModel(HuggingFaceEncoderModel):
             self.buffer["first_label_token_mapping"] = get_first_label_token_mapping(
                 dataset_config=self.dataset_config,
                 model_config=self.model_config,
-                tokenizer=self._tokenizer,
+                tokeniser=self._tokeniser,
                 generative_type=self.generative_type,
+                log_metadata=self.log_metadata,
             )
+            if (
+                not self.buffer["first_label_token_mapping"]
+                and self.dataset_config.task.requires_logprobs
+            ):
+                raise InvalidBenchmark(
+                    "The dataset requires logprobs, but we encountered an error when "
+                    "trying to get the first token of each label in the dataset. You can "
+                    "try running this benchmark with the --verbose flag to see what the "
+                    "error was. Skipping this evaluation."
+                )
+
+        # Define the guided decoding that we will use for structured generation
+        if structured_generation_schema is not None:
+            guided_decoding = GuidedDecodingParams(json=structured_generation_schema)
+        elif self.dataset_config.task.uses_logprobs and self.dataset_config.labels:
+            guided_decoding = GuidedDecodingParams(
+                choice=[
+                    self.dataset_config.prompt_label_mapping[label]
+                    for label in self.dataset_config.labels
+                ]
+            )
+        else:
+            guided_decoding = None

         # Define the parameters used for vLLM generation
         max_tokens: int = (
@@ -367,14 +403,12 @@ class VLLMModel(HuggingFaceEncoderModel):
         )
         sampling_params = SamplingParams(
             max_tokens=max_tokens,
-            logprobs=…
+            logprobs=MAX_VLLM_LOGPROBS
+            if self.buffer["first_label_token_mapping"]
+            else None,
             temperature=0.0,
             stop=[stop_token for stop_token in stop_tokens if stop_token],
-            guided_decoding=(
-                GuidedDecodingParams(json=structured_generation_schema)
-                if structured_generation_schema
-                else None
-            ),
+            guided_decoding=guided_decoding,
         )

         # If any of the prompts are empty then we need to replace them with a BOS token
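The new `guided_decoding` branch above constrains label-style tasks to the exact rendered label strings instead of only supporting a JSON schema. A minimal, hedged sketch of the same idea against vLLM's public API; the model ID and label set are placeholders, not taken from EuroEval:

```python
from vllm import LLM, SamplingParams
from vllm.sampling_params import GuidedDecodingParams

# Placeholder model and label set, for illustration only.
llm = LLM(model="some-org/some-model")
labels = ["positive", "neutral", "negative"]

# Restrict decoding to one of the label strings (mirrors the `choice=` branch above);
# a JSON schema could be passed via `GuidedDecodingParams(json=...)` instead.
sampling_params = SamplingParams(
    max_tokens=8,
    temperature=0.0,
    logprobs=10,
    guided_decoding=GuidedDecodingParams(choice=labels),
)
outputs = llm.generate(["Classify the sentiment: 'Great product!'"], sampling_params)
print(outputs[0].outputs[0].text)
```

Constraining generation to the rendered labels keeps completions compatible with the `first_label_token_mapping` used for logprob scoring.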
@@ -383,7 +417,7 @@ class VLLMModel(HuggingFaceEncoderModel):
         if any(len(prompt) == 0 for prompt in prompts):
             logger.debug("Found empty prompts, replacing with BOS token.")
             prompts = [
-                prompt if len(prompt) > 0 else str(self._tokenizer.bos_token)
+                prompt if len(prompt) > 0 else str(self._tokeniser.bos_token)
                 for prompt in prompts
             ]

@@ -394,7 +428,7 @@ class VLLMModel(HuggingFaceEncoderModel):
         if not self.buffer.get(
             "instruction_model", False
         ) and should_prompts_be_stripped(
-            labels_to_be_generated=labels_to_be_generated, tokenizer=self._tokenizer
+            labels_to_be_generated=labels_to_be_generated, tokeniser=self._tokeniser
         ):
             log_once(
                 f"Stripping prompts for model {self.model_config.model_id!r}.",
@@ -432,22 +466,22 @@ class VLLMModel(HuggingFaceEncoderModel):
                     "Prompts are too long, so truncating them and trying again..."
                 )
                 logger.debug(f"The error message was: {str(e)}")
-                tokenized_prompts = self._tokenizer(
+                tokenized_prompts = self._tokeniser(
                     text=prompts,
                     truncation=True,
                     max_length=max(
-                        min(self._tokenizer.model_max_length, MAX_CONTEXT_LENGTH)
+                        min(self._tokeniser.model_max_length, MAX_CONTEXT_LENGTH)
                         - max_tokens,
                         0,
                     ),
                 )
-                prompts = self._tokenizer.batch_decode(
+                prompts = self._tokeniser.batch_decode(
                     sequences=tokenized_prompts.input_ids, skip_special_tokens=True
                 )
             else:
                 raise InvalidBenchmark(
                     f"An error occurred during vLLM generation: {str(e)}"
-                )
+                ) from e
         else:
             raise InvalidBenchmark(
                 f"Could not generate sequences after {num_attempts} attempts."
@@ -477,7 +511,7 @@ class VLLMModel(HuggingFaceEncoderModel):
         completion_ids: list[list[int]] = [
             output.outputs[0].token_ids for output in raw_outputs
         ]
-        completions = self._tokenizer.batch_decode(
+        completions = self._tokeniser.batch_decode(
             sequences=[
                 torch.LongTensor(completion_id) for completion_id in completion_ids
             ]
@@ -625,10 +659,10 @@ class VLLMModel(HuggingFaceEncoderModel):
     )


-def load_model_and_tokenizer(
+def load_model_and_tokeniser(
     model_config: "ModelConfig", benchmark_config: "BenchmarkConfig"
 ) -> tuple["LLM", "PreTrainedTokenizer"]:
-    """Load the model and tokenizer.
+    """Load the model and tokeniser.

     Args:
         model_config:
@@ -637,7 +671,7 @@ def load_model_and_tokenizer(
             The benchmark configuration.

     Returns:
-        A pair (model, tokenizer), with the loaded model and tokenizer
+        A pair (model, tokeniser), with the loaded model and tokeniser
     """
     # Prefer base model ID if the model is an adapter - the adapter will be added on
     # during inference in this case
@@ -675,7 +709,7 @@ def load_model_and_tokenizer(
     dtype: str | torch.dtype = "auto"

     # Choose bf16 over fp16 if the model is a fp32 model and the GPU supports it
-    if hf_model_config.torch_dtype == torch.float32:
+    if hf_model_config.dtype == torch.float32:
         if torch.cuda.is_bf16_supported():
             logger.info(
                 "You are loading a model with dtype FP32, which we will convert to "
@@ -692,34 +726,32 @@ def load_model_and_tokenizer(
             dtype = torch.float16

     # If the model is a quantized model, we might need to change the dtype
-    if quantization == "mxfp4" and hf_model_config.torch_dtype is None:
+    if quantization == "mxfp4" and hf_model_config.dtype is None:
         dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
         logger.debug(
-            "You are loading a quantized model where `torch_dtype` has not been set. "
+            "You are loading a quantized model where `dtype` has not been set. "
             f"Setting dtype to {dtype!r}."
         )
-    elif quantization is not None and hf_model_config.torch_dtype != torch.float16:
+    elif quantization is not None and hf_model_config.dtype != torch.float16:
         logger.info(
             "You are loading a quantized model with dtype "
-            f"{hf_model_config.torch_dtype}, which vLLM does not support. Setting "
+            f"{hf_model_config.dtype}, which vLLM does not support. Setting "
             "dtype to float16 instead."
         )
         dtype = torch.float16

     # If the model is a bf16 model, we need to check the CUDA compute capability
-    if hf_model_config.torch_dtype == torch.bfloat16:
+    if hf_model_config.dtype == torch.bfloat16:
         min_cuda_compute_capability = get_min_cuda_compute_capability()
         required_capability = VLLM_BF16_MIN_CUDA_COMPUTE_CAPABILITY

         if min_cuda_compute_capability is not None:
             if min_cuda_compute_capability < required_capability:
                 logger.info(
-                    "You are loading a model with "
-                    …
-                    "…
-                    f"…
-                    "You are using one or more devices with "
-                    f"compute capability {min_cuda_compute_capability}. "
+                    f"You are loading a model with dtype {hf_model_config.dtype}, "
+                    "which vLLM only supports for CUDA devices with CUDA compute "
+                    f"capability >={required_capability}. You are using one or more "
+                    f"devices with compute capability {min_cuda_compute_capability}. "
                     "Setting dtype to float16 instead."
                 )
                 dtype = torch.float16
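The dtype handling above reduces to a hardware check: prefer bf16 when the config asks for fp32 and the GPU supports it, and drop bf16 models to fp16 on GPUs below the required compute capability. A standalone sketch of that decision; the 8.0 threshold stands in for `VLLM_BF16_MIN_CUDA_COMPUTE_CAPABILITY` and is an assumption here:

```python
import torch


def pick_dtype(config_dtype: torch.dtype | None) -> torch.dtype | str:
    """Choose a safe dtype for vLLM, assuming a bf16 compute-capability floor of 8.0."""
    if not torch.cuda.is_available():
        return "auto"
    # Minimum compute capability across all visible devices
    min_capability = min(
        float("{}.{}".format(*torch.cuda.get_device_capability(idx)))
        for idx in range(torch.cuda.device_count())
    )
    if config_dtype == torch.float32:
        # fp32 models are down-cast; prefer bf16 when the hardware supports it
        return torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
    if config_dtype == torch.bfloat16 and min_capability < 8.0:
        # bf16 needs a sufficiently new GPU in vLLM; fall back to fp16
        return torch.float16
    return "auto"
```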
@@ -747,14 +779,14 @@ def load_model_and_tokenizer(
     else:
         true_max_model_len = MAX_CONTEXT_LENGTH

-    tokenizer = load_tokenizer(
+    tokeniser = load_tokeniser(
         model_id=model_config.model_id,
         revision=model_config.revision,
         adapter_base_model_id=model_config.adapter_base_model_id,
         trust_remote_code=benchmark_config.trust_remote_code,
         model_max_length=true_max_model_len,
         model_cache_dir=model_config.model_cache_dir,
-        token=benchmark_config.api_key…
+        token=get_hf_token(api_key=benchmark_config.api_key),
     )

     clear_vllm()
@@ -769,9 +801,7 @@ def load_model_and_tokenizer(
             trust_remote_code=benchmark_config.trust_remote_code,
             revision=revision,
             seed=4242,
-            distributed_executor_backend=(
-                "ray" if torch.cuda.device_count() > 1 else "mp"
-            ),
+            distributed_executor_backend="mp",
             tensor_parallel_size=torch.cuda.device_count(),
             disable_custom_all_reduce=True,
             quantization=quantization,
@@ -782,29 +812,39 @@ def load_model_and_tokenizer(
             enable_prefix_caching=False,
             enable_lora=model_config.adapter_base_model_id is not None,
             max_lora_rank=256,
+            # Special arguments in case we are dealing with a Mistral model
+            tokenizer_mode="mistral"
+            if isinstance(tokeniser, MistralCommonTokenizer)
+            else "auto",
+            config_format="mistral"
+            if isinstance(tokeniser, MistralCommonTokenizer)
+            else "auto",
+            load_format="mistral"
+            if isinstance(tokeniser, MistralCommonTokenizer)
+            else "auto",
         )
     except (RuntimeError, ValueError, OSError) as e:
         if "awaiting a review from the repo authors" in str(e):
             raise InvalidModel(
                 f"The model {model_id!r} is awaiting a review from the repository "
                 "authors. Please try again later."
-            )
+            ) from e
         elif "trust_remote_code" in str(e):
             raise InvalidModel(
                 f"Loading the model {model_id!r} needs to trust remote code. "
                 "If you trust the suppliers of this model, then you can enable "
                 "this by setting the `--trust-remote-code` flag."
-            )
+            ) from e
         raise InvalidModel(
             f"The model {model_id!r} could not be loaded. The error was {e!r}."
-        )
+        ) from e

     model.config = hf_model_config

-    return model, tokenizer
+    return model, tokeniser


-def load_tokenizer(
+def load_tokeniser(
     model_id: str,
     revision: str,
     adapter_base_model_id: str | None,
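For checkpoints that only ship Mistral's own tokeniser and config files, the constructor above flips vLLM into its Mistral loading mode. A hedged, standalone sketch of those arguments; the model ID is a placeholder:

```python
from vllm import LLM

# Hypothetical Mistral-format checkpoint; vLLM can read the weights, config and
# tokeniser in Mistral's native format instead of the Hugging Face layout.
llm = LLM(
    model="mistralai/SomeModel",  # placeholder model ID
    tokenizer_mode="mistral",
    config_format="mistral",
    load_format="mistral",
)
```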
@@ -813,7 +853,7 @@ def load_tokenizer(
     model_cache_dir: str,
     token: str | bool,
 ) -> "PreTrainedTokenizer":
-    """Load the tokenizer.
+    """Load the tokeniser.

     Args:
         model_id:
@@ -833,7 +873,7 @@ def load_tokenizer(
             The Hugging Face API token.

     Returns:
-        The loaded tokenizer.
+        The loaded tokeniser.
     """
     revision = revision if adapter_base_model_id is None else "main"
     config = AutoConfig.from_pretrained(
@@ -846,7 +886,7 @@ def load_tokenizer(
     num_retries = 5
     for _ in range(num_retries):
         try:
-            tokenizer = AutoTokenizer.from_pretrained(
+            tokeniser = AutoTokenizer.from_pretrained(
                 model_id,
                 use_fast=True,
                 verbose=False,
@@ -861,30 +901,45 @@ def load_tokenizer(
         except (json.JSONDecodeError, OSError, TypeError) as e:
             if adapter_base_model_id is None or model_id == adapter_base_model_id:
                 raise InvalidModel(
-                    f"Could not load tokenizer for model {model_id!r}. The error was "
+                    f"Could not load tokeniser for model {model_id!r}. The error was "
                     f"{str(e)}."
-                )
+                ) from e
             logger.debug(
-                f"Could not load tokenizer for {model_id!r}. Falling back to "
+                f"Could not load tokeniser for {model_id!r}. Falling back to "
                 f"{adapter_base_model_id!r}."
             )
             model_id = adapter_base_model_id
         except (TimeoutError, RequestError):
-            logger.info(f"Couldn't load tokenizer for {model_id!r}. Retrying.")
+            logger.info(f"Couldn't load tokeniser for {model_id!r}. Retrying.")
             sleep(5)
             continue
+        except (KeyError, ValueError) as e:
+            if "mistral" in str(e).lower():
+                tokeniser = MistralCommonTokenizer.from_pretrained(
+                    model_id,
+                    padding_side="left",
+                    truncation_side="left",
+                    model_max_length=model_max_length,
+                    token=token,
+                )
+                break
+            raise InvalidModel(
+                f"Could not load tokeniser for model {model_id!r}. The error was "
+                f"{str(e)}."
+            ) from e
     else:
         raise InvalidModel(
-            f"Could not load tokenizer for model {model_id!r} after {num_retries} "
+            f"Could not load tokeniser for model {model_id!r} after {num_retries} "
             "attempts."
         )

     # Ensure that BOS, EOS and PAD tokens are set
-    tokenizer.bos_token, tokenizer.bos_token_id = get_bos_token(tokenizer=tokenizer)
-    tokenizer.eos_token, tokenizer.eos_token_id = get_eos_token(tokenizer=tokenizer)
-    tokenizer.pad_token, tokenizer.pad_token_id = get_pad_token(tokenizer=tokenizer)
+    if not isinstance(tokeniser, MistralCommonTokenizer):
+        tokeniser.bos_token, tokeniser.bos_token_id = get_bos_token(tokeniser=tokeniser)
+        tokeniser.eos_token, tokeniser.eos_token_id = get_eos_token(tokeniser=tokeniser)
+        tokeniser.pad_token, tokeniser.pad_token_id = get_pad_token(tokeniser=tokeniser)

-    return tokenizer
+    return tokeniser


 def clear_vllm() -> None:
@@ -892,25 +947,21 @@ def clear_vllm() -> None:
     with contextlib.suppress(ValueError):
         destroy_model_parallel()
         destroy_distributed_environment()
-        if ray.is_initialized():
-            ray.shutdown()
     with contextlib.suppress(AssertionError):
         torch.distributed.destroy_process_group()
-        if ray.is_initialized():
-            ray.shutdown()
     clear_memory()


 def get_end_of_reasoning_token(
-    model: "LLM", tokenizer: "PreTrainedTokenizer", model_id: str
+    model: "LLM", tokeniser: "PreTrainedTokenizer", model_id: str
 ) -> str | None:
     """Get the end-of-reasoning token for a generative model.

     Args:
         model:
             The vLLM model.
-        tokenizer:
-            The tokenizer.
+        tokeniser:
+            The tokeniser.
         model_id:
             The model ID.

@@ -919,11 +970,9 @@ def get_end_of_reasoning_token(
     """
     # Create a prompt to check if the model uses the reasoning tokens
    prompt = "What is your name?"
-    if …
-        templated_prompt = …
-            conversation=[dict(role="user", content=prompt)],
-            add_generation_prompt=True,
-            tokenize=False,
+    if has_chat_template(tokeniser=tokeniser):
+        templated_prompt = apply_chat_template(
+            conversation=[dict(role="user", content=prompt)], tokeniser=tokeniser
         )
         assert isinstance(templated_prompt, str)
         prompt = templated_prompt
@@ -948,7 +997,7 @@ def get_end_of_reasoning_token(
             f"The model {model_id!r} did not generate any beginning-of-reasoning "
             "tokens in the prompt or the completion. Assuming the model is not "
             "a reasoning model.",
-            level=logging.…
+            level=logging.DEBUG,
         )
         return None

@@ -974,7 +1023,7 @@ def get_end_of_reasoning_token(
             "the beginning-of-reasoning tokens "
             f"{[bor_token for bor_token, _ in bor_reasoning_matches]!r}. "
             "This is probably not correct, so please report this issue.",
-            level=logging.…
+            level=logging.WARNING,
         )
         return None

@@ -984,14 +1033,14 @@ def get_end_of_reasoning_token(
             f"model {model_id!r}. Using {eor_reasoning_matches[0]!r} as "
             "the reasoning token. If this is not the correct reasoning token, "
             "please report this issue.",
-            level=logging.…
+            level=logging.WARNING,
         )

     bor_token, eor_token = eor_reasoning_matches[0]
     log_once(
         f"Detected beginning-of-reasoning token {bor_token!r} and end-of-reasoning "
         f"token {eor_token!r} for model {model_id!r}.",
-        level=logging.…
+        level=logging.DEBUG,
     )

     return eor_token
@@ -999,7 +1048,7 @@ def get_end_of_reasoning_token(

 def get_custom_stop_tokens(
     model: "LLM",
-    tokenizer: "PreTrainedTokenizer",
+    tokeniser: "PreTrainedTokenizer",
     model_id: str,
     is_reasoning_model: bool,
 ) -> list[str]:
@@ -1008,8 +1057,8 @@ def get_custom_stop_tokens(
     Args:
         model:
             The vLLM model.
-        tokenizer:
-            The tokenizer.
+        tokeniser:
+            The tokeniser.
         model_id:
             The model ID.
         is_reasoning_model:
@@ -1022,11 +1071,9 @@ def get_custom_stop_tokens(
     candidate_stop_tokens = CUSTOM_STOP_TOKENS

     prompt = "Hello"
-    if …
-        templated_prompt = …
-            conversation=[dict(role="user", content=prompt)],
-            add_generation_prompt=True,
-            tokenize=False,
+    if has_chat_template(tokeniser=tokeniser):
+        templated_prompt = apply_chat_template(
+            conversation=[dict(role="user", content=prompt)], tokeniser=tokeniser
         )
         assert isinstance(templated_prompt, str)
         prompt = templated_prompt
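The last two hunks swap direct `apply_chat_template` calls for the new `has_chat_template`/`apply_chat_template` helpers from `euroeval.tokenization_utils`. A rough sketch of the plain Hugging Face equivalent of that probe-and-template step; the helper internals are not shown in this diff, so treat this as an approximation, and the model ID is a placeholder:

```python
from transformers import AutoTokenizer

# Plain Hugging Face analogue of has_chat_template/apply_chat_template as used above.
tokenizer = AutoTokenizer.from_pretrained("some-org/some-instruct-model")

prompt = "What is your name?"
if tokenizer.chat_template is not None:  # roughly what has_chat_template checks
    prompt = tokenizer.apply_chat_template(
        conversation=[dict(role="user", content=prompt)],
        add_generation_prompt=True,
        tokenize=False,
    )
print(prompt)
```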