EuroEval 15.16.0-py3-none-any.whl → 16.0.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of EuroEval might be problematic.
- euroeval/__init__.py +8 -7
- euroeval/benchmark_config_factory.py +3 -7
- euroeval/benchmark_modules/base.py +35 -19
- euroeval/benchmark_modules/fresh.py +24 -19
- euroeval/benchmark_modules/hf.py +136 -154
- euroeval/benchmark_modules/litellm.py +190 -110
- euroeval/benchmark_modules/vllm.py +199 -139
- euroeval/benchmarker.py +49 -22
- euroeval/cli.py +3 -3
- euroeval/constants.py +19 -15
- euroeval/data_loading.py +33 -28
- euroeval/data_models.py +73 -23
- euroeval/dataset_configs/__init__.py +2 -0
- euroeval/dataset_configs/danish.py +35 -1
- euroeval/dataset_configs/dutch.py +38 -1
- euroeval/dataset_configs/english.py +38 -1
- euroeval/dataset_configs/estonian.py +95 -0
- euroeval/dataset_configs/faroese.py +38 -0
- euroeval/dataset_configs/finnish.py +39 -1
- euroeval/dataset_configs/french.py +38 -1
- euroeval/dataset_configs/german.py +38 -1
- euroeval/dataset_configs/icelandic.py +39 -1
- euroeval/dataset_configs/italian.py +38 -1
- euroeval/dataset_configs/latvian.py +81 -0
- euroeval/dataset_configs/norwegian.py +38 -1
- euroeval/dataset_configs/portuguese.py +38 -1
- euroeval/dataset_configs/spanish.py +38 -1
- euroeval/dataset_configs/swedish.py +38 -1
- euroeval/enums.py +0 -6
- euroeval/finetuning.py +6 -6
- euroeval/generation.py +25 -14
- euroeval/generation_utils.py +90 -20
- euroeval/languages.py +947 -187
- euroeval/metrics/__init__.py +6 -0
- euroeval/metrics/base.py +76 -0
- euroeval/metrics/huggingface.py +192 -0
- euroeval/metrics/llm_as_a_judge.py +257 -0
- euroeval/metrics/pipeline.py +276 -0
- euroeval/metrics/speed.py +51 -0
- euroeval/model_cache.py +13 -1
- euroeval/prompt_templates/linguistic_acceptability.py +40 -2
- euroeval/prompt_templates/multiple_choice.py +23 -2
- euroeval/prompt_templates/named_entity_recognition.py +65 -2
- euroeval/prompt_templates/reading_comprehension.py +42 -2
- euroeval/prompt_templates/sentiment_classification.py +46 -2
- euroeval/prompt_templates/summarization.py +24 -4
- euroeval/scores.py +7 -2
- euroeval/speed_benchmark.py +6 -6
- euroeval/task_group_utils/multiple_choice_classification.py +19 -8
- euroeval/task_group_utils/question_answering.py +35 -28
- euroeval/task_group_utils/sequence_classification.py +128 -42
- euroeval/task_group_utils/text_to_text.py +7 -3
- euroeval/task_group_utils/token_classification.py +59 -73
- euroeval/tasks.py +33 -6
- euroeval/tokenization_utils.py +294 -207
- euroeval/utils.py +150 -35
- {euroeval-15.16.0.dist-info → euroeval-16.0.1.dist-info}/METADATA +13 -14
- euroeval-16.0.1.dist-info/RECORD +69 -0
- {euroeval-15.16.0.dist-info → euroeval-16.0.1.dist-info}/entry_points.txt +0 -1
- euroeval/human_evaluation.py +0 -738
- euroeval/metrics.py +0 -470
- euroeval-15.16.0.dist-info/RECORD +0 -63
- {euroeval-15.16.0.dist-info → euroeval-16.0.1.dist-info}/WHEEL +0 -0
- {euroeval-15.16.0.dist-info → euroeval-16.0.1.dist-info}/licenses/LICENSE +0 -0
euroeval/benchmark_modules/vllm.py

@@ -5,7 +5,6 @@ import contextlib
 import importlib.util
 import json
 import logging
-import os
 import re
 import typing as t
 from functools import partial
@@ -16,6 +15,7 @@ import torch
 from huggingface_hub import snapshot_download
 from pydantic import conlist, create_model
 from tqdm.auto import tqdm
+from transformers import MistralCommonTokenizer
 from transformers.models.auto.configuration_auto import AutoConfig
 from transformers.models.auto.tokenization_auto import AutoTokenizer
 from urllib3.exceptions import RequestError
@@ -24,11 +24,10 @@ from ..constants import (
     CUSTOM_STOP_TOKENS,
     GENERATIVE_PIPELINE_TAGS,
     MAX_CONTEXT_LENGTH,
-
+    MAX_VLLM_LOGPROBS,
     MERGE_TAGS,
     REASONING_MAX_TOKENS,
     REASONING_TOKENS,
-    TASKS_USING_JSON,
     VLLM_BF16_MIN_CUDA_COMPUTE_CAPABILITY,
 )
 from ..data_models import GenerativeModelOutput, ModelConfig
@@ -54,17 +53,20 @@ from ..task_group_utils import (
     token_classification,
 )
 from ..tokenization_utils import (
+    apply_chat_template,
     get_bos_token,
     get_end_of_chat_token_ids,
     get_eos_token,
     get_first_label_token_mapping,
     get_pad_token,
+    has_chat_template,
     should_prompts_be_stripped,
 )
 from ..types import ExtractLabelsFunction
 from ..utils import (
     clear_memory,
     create_model_cache_dir,
+    get_hf_token,
     get_min_cuda_compute_capability,
     log_once,
 )
@@ -79,9 +81,6 @@ if t.TYPE_CHECKING or importlib.util.find_spec("vllm") is not None:
     from vllm.lora.request import LoRARequest
     from vllm.sampling_params import GuidedDecodingParams

-if t.TYPE_CHECKING or importlib.util.find_spec("ray") is not None:
-    import ray
-
 if t.TYPE_CHECKING:
     from datasets import DatasetDict
     from transformers.tokenization_utils import PreTrainedTokenizer
@@ -104,6 +103,7 @@ class VLLMModel(HuggingFaceEncoderModel):
         model_config: "ModelConfig",
         dataset_config: "DatasetConfig",
         benchmark_config: "BenchmarkConfig",
+        log_metadata: bool = True,
     ) -> None:
         """Initialise the vLLM model.

@@ -114,27 +114,26 @@ class VLLMModel(HuggingFaceEncoderModel):
                 The dataset configuration.
             benchmark_config:
                 The benchmark configuration.
+            log_metadata:
+                Whether to log the model and dataset metadata.
         """
-        if (
-            importlib.util.find_spec("vllm") is None
-            or importlib.util.find_spec("ray") is None
-        ):
+        if importlib.util.find_spec("vllm") is None:
             raise NeedsExtraInstalled(extra="generative")

-        model,
+        model, tokeniser = load_model_and_tokeniser(
             model_config=model_config, benchmark_config=benchmark_config
         )
         self._model: "LLM" = model
-        self.
+        self._tokeniser: "PreTrainedTokenizer" = tokeniser
         self.end_of_reasoning_token = get_end_of_reasoning_token(
-            model=self._model,
+            model=self._model, tokeniser=self._tokeniser, model_id=model_config.model_id
         )
         self.end_of_chat_token_ids = get_end_of_chat_token_ids(
-
+            tokeniser=self._tokeniser
         )
         self.custom_stop_tokens = get_custom_stop_tokens(
             model=self._model,
-
+            tokeniser=self._tokeniser,
             model_id=model_config.model_id,
             is_reasoning_model=self.end_of_reasoning_token is not None,
         )
@@ -145,15 +144,17 @@ class VLLMModel(HuggingFaceEncoderModel):
             model_config=model_config,
             dataset_config=dataset_config,
             benchmark_config=benchmark_config,
+            log_metadata=log_metadata,
         )

         self.buffer |= dict(
-            instruction_model=self.
+            instruction_model=has_chat_template(tokeniser=self._tokeniser),
             first_label_token_mapping=get_first_label_token_mapping(
                 dataset_config=self.dataset_config,
                 model_config=self.model_config,
-
+                tokeniser=self._tokeniser,
                 generative_type=self.generative_type,
+                log_metadata=self.log_metadata,
             ),
         )
         if self.model_config.adapter_base_model_id is not None:
@@ -167,13 +168,16 @@ class VLLMModel(HuggingFaceEncoderModel):
             )

     def __del__(self) -> None:
-        """Clean up the model and
-
-
+        """Clean up the model and tokeniser."""
+        try:
+            if importlib.util.find_spec("vllm") is not None:
+                clear_vllm()
+        except ImportError:
+            pass
         if hasattr(self, "_model"):
             del self._model
-        if hasattr(self, "
-            del self.
+        if hasattr(self, "_tokeniser"):
+            del self._tokeniser

     @property
     def generative_type(self) -> GenerativeType | None:
@@ -182,12 +186,12 @@ class VLLMModel(HuggingFaceEncoderModel):
         Returns:
             The generative type of the model, or None if it has not been set yet.
         """
-        if not hasattr(self, "
+        if not hasattr(self, "_tokeniser"):
             return None
         elif self.end_of_reasoning_token is not None:
             return GenerativeType.REASONING
         elif (
-            self.
+            has_chat_template(tokeniser=self._tokeniser)
             or "instruct" in self.model_config.model_id.lower()
         ):
             return GenerativeType.INSTRUCTION_TUNED
@@ -267,7 +271,10 @@ class VLLMModel(HuggingFaceEncoderModel):

         if self.benchmark_config.few_shot:
             few_shot_examples = extract_few_shot_examples(
-                dataset=dataset,
+                dataset=dataset,
+                dataset_config=self.dataset_config,
+                benchmark_config=self.benchmark_config,
+                itr_idx=itr_idx,
             )
         else:
             few_shot_examples = list()
@@ -280,7 +287,7 @@ class VLLMModel(HuggingFaceEncoderModel):
                 dataset_config=self.dataset_config,
                 instruction_model=self.buffer["instruction_model"],
                 always_populate_text_field=True,
-
+                tokeniser=self._tokeniser,
             ),
             batched=True,
             load_from_cache_file=False,
@@ -298,66 +305,100 @@ class VLLMModel(HuggingFaceEncoderModel):

         Returns:
             The generated model outputs.
+
+        Raises:
+            InvalidBenchmark:
+                If the dataset requires logprobs, but we could not get the first token
+                of each label in the dataset.
         """
         # Get stopping tokens
         stop_tokens: list[str] = self.custom_stop_tokens.copy()
         if self.buffer["instruction_model"] is False:
             stop_tokens.append("\n\n")
-        if self.
-            assert isinstance(self.
+        if self._tokeniser.pad_token_id is not None:
+            assert isinstance(self._tokeniser.pad_token, str), (
                 f"The pad token for the model {self.model_config.model_id!r} "
-                f"is not a string, which is unexpected: {self.
+                f"is not a string, which is unexpected: {self._tokeniser.pad_token!r}."
             )
-            stop_tokens.append(self.
-        if self.
-            assert isinstance(self.
+            stop_tokens.append(self._tokeniser.pad_token)
+        if self._tokeniser.eos_token_id is not None:
+            assert isinstance(self._tokeniser.eos_token, str), (
                 f"The EOS token for the model {self.model_config.model_id!r} "
-                f"is not a string, which is unexpected: {self.
+                f"is not a string, which is unexpected: {self._tokeniser.eos_token!r}."
             )
-            stop_tokens.append(self.
-        if self.
-            self.
-            self.
+            stop_tokens.append(self._tokeniser.eos_token)
+        if self._tokeniser.pad_token_id is None:
+            self._tokeniser.pad_token_id = self._tokeniser.eos_token_id
+            self._tokeniser.pad_token = self._tokeniser.eos_token
         if self.end_of_chat_token_ids is not None:
-            end_of_chat_token = self.
+            end_of_chat_token = self._tokeniser.decode(
                 self.end_of_chat_token_ids
             ).strip()
             if end_of_chat_token:
                 stop_tokens.append(end_of_chat_token)

-        structured_generation_schema = None
-        if self.dataset_config.task in TASKS_USING_JSON:
-            if self.generative_type == GenerativeType.REASONING:
-                log_once(
-                    f"The model {self.model_config.model_id!r} is a reasoning model "
-                    "and thus does not support structured generation, so we do not "
-                    "enable it.",
-                    level=logging.DEBUG,
-                )
-            else:
-                ner_tag_names = list(self.dataset_config.prompt_label_mapping.values())
-                keys_and_their_types: dict[str, t.Any] = {
-                    tag_name: (conlist(str, max_length=5), ...)
-                    for tag_name in ner_tag_names
-                }
-                answer_format_class = create_model(
-                    "AnswerFormat", **keys_and_their_types
-                )
-                structured_generation_schema = answer_format_class.model_json_schema()
-                log_once(
-                    "Using structured generation with the JSON schema "
-                    f"{structured_generation_schema}",
-                    level=logging.DEBUG,
-                )
-
         # Get the mapping from labels to the first token in the label. We call this each
         # time we generate a new dataset since the dataset config can change
         self.buffer["first_label_token_mapping"] = get_first_label_token_mapping(
             dataset_config=self.dataset_config,
             model_config=self.model_config,
-
+            tokeniser=self._tokeniser,
             generative_type=self.generative_type,
+            log_metadata=self.log_metadata,
         )
+        if (
+            not self.buffer["first_label_token_mapping"]
+            and self.dataset_config.task.requires_logprobs
+        ):
+            raise InvalidBenchmark(
+                "The dataset requires logprobs, but we encountered an error when "
+                "trying to get the first token of each label in the dataset. You can "
+                "try running this benchmark with the --verbose flag to see what the "
+                "error was. Skipping this evaluation."
+            )
+
+        structured_generation_schema = None
+        if (
+            self.dataset_config.task.uses_structured_output
+            or (self.dataset_config.task.uses_logprobs and self.dataset_config.labels)
+        ) and self.generative_type == GenerativeType.REASONING:
+            guided_decoding = None
+            logger.debug(
+                "The dataset uses structured output, but we are not using it as the "
+                "model is a reasoning model."
+            )
+        elif self.dataset_config.task.uses_structured_output:
+            ner_tag_names = list(self.dataset_config.prompt_label_mapping.values())
+            keys_and_their_types: dict[str, t.Any] = {
+                tag_name: (conlist(str, max_length=5), ...)
+                for tag_name in ner_tag_names
+            }
+            answer_format_class = create_model("AnswerFormat", **keys_and_their_types)
+            structured_generation_schema = answer_format_class.model_json_schema()
+            log_once(
+                "Using structured generation with the JSON schema: "
+                f"{json.dumps(structured_generation_schema)}",
+                level=logging.DEBUG,
+            )
+            guided_decoding = GuidedDecodingParams(json=structured_generation_schema)
+        elif self.dataset_config.task.uses_logprobs and self.dataset_config.labels:
+            guided_decoding = GuidedDecodingParams(
+                choice=[
+                    self.dataset_config.prompt_label_mapping[label]
+                    for label in self.dataset_config.labels
+                ]
+            )
+            log_once(
+                "Using structured generation with the choices: "
+                f"{guided_decoding.choice!r}.",
+                level=logging.DEBUG,
+            )
+        else:
+            guided_decoding = None
+            log_once(
+                "Not using structured generation as the dataset does not require it.",
+                level=logging.DEBUG,
+            )

         # Define the parameters used for vLLM generation
         max_tokens: int = (
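The NER branch above builds its JSON schema with Pydantic before handing it to guided decoding. A minimal standalone sketch of that pattern, with invented tag names (EuroEval takes them from the dataset's prompt_label_mapping):

import json
import typing as t

from pydantic import conlist, create_model

# Hypothetical NER tag names; in EuroEval these come from the dataset config.
ner_tag_names = ["person", "location", "organization", "miscellaneous"]

# One required field per tag, each a list of strings capped at five entries,
# mirroring the `conlist(str, max_length=5)` fields in the diff above.
keys_and_their_types: dict[str, t.Any] = {
    tag_name: (conlist(str, max_length=5), ...) for tag_name in ner_tag_names
}
AnswerFormat = create_model("AnswerFormat", **keys_and_their_types)

# This JSON schema is what gets passed to vLLM as the guided-decoding target.
print(json.dumps(AnswerFormat.model_json_schema(), indent=2))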
@@ -367,14 +408,12 @@ class VLLMModel(HuggingFaceEncoderModel):
         )
         sampling_params = SamplingParams(
             max_tokens=max_tokens,
-            logprobs=
+            logprobs=MAX_VLLM_LOGPROBS
+            if self.buffer["first_label_token_mapping"]
+            else None,
             temperature=0.0,
             stop=[stop_token for stop_token in stop_tokens if stop_token],
-            guided_decoding=
-                GuidedDecodingParams(json=structured_generation_schema)
-                if structured_generation_schema
-                else None
-            ),
+            guided_decoding=guided_decoding,
         )

         # If any of the prompts are empty then we need to replace them with a BOS token
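Outside of EuroEval, the same guided-decoding setup can be reproduced with vLLM directly. A rough sketch, assuming a recent vLLM release that ships GuidedDecodingParams (the label strings are placeholders):

from vllm.sampling_params import GuidedDecodingParams, SamplingParams

# Restrict generation to a fixed set of label strings, as the classification
# branch above does via the dataset's prompt_label_mapping.
guided_decoding = GuidedDecodingParams(choice=["positive", "neutral", "negative"])

sampling_params = SamplingParams(
    max_tokens=8,
    temperature=0.0,
    logprobs=10,  # only needed when a first-label-token mapping is in play
    stop=["\n\n"],
    guided_decoding=guided_decoding,
)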
@@ -383,7 +422,7 @@ class VLLMModel(HuggingFaceEncoderModel):
         if any(len(prompt) == 0 for prompt in prompts):
             logger.debug("Found empty prompts, replacing with BOS token.")
             prompts = [
-                prompt if len(prompt) > 0 else str(self.
+                prompt if len(prompt) > 0 else str(self._tokeniser.bos_token)
                 for prompt in prompts
             ]

@@ -394,7 +433,7 @@ class VLLMModel(HuggingFaceEncoderModel):
         if not self.buffer.get(
             "instruction_model", False
         ) and should_prompts_be_stripped(
-            labels_to_be_generated=labels_to_be_generated,
+            labels_to_be_generated=labels_to_be_generated, tokeniser=self._tokeniser
         ):
             log_once(
                 f"Stripping prompts for model {self.model_config.model_id!r}.",
@@ -405,6 +444,7 @@ class VLLMModel(HuggingFaceEncoderModel):
         # Generate sequences using vLLM
         input_is_a_test = len(prompts) == 1 and len(set(prompts[0])) == 1
         num_attempts = 3
+        truncation_attempts = 0
         for _ in range(num_attempts):
             try:
                 raw_outputs = self._model.generate(
@@ -432,22 +472,29 @@ class VLLMModel(HuggingFaceEncoderModel):
                         "Prompts are too long, so truncating them and trying again..."
                     )
                     logger.debug(f"The error message was: {str(e)}")
-
+
+                    # If we have already tried truncating the prompts a few times, then
+                    # we truncate a bit more aggressively
+                    extra_truncation = 50 * truncation_attempts
+                    truncation_attempts += 1
+
+                    tokenized_prompts = self._tokeniser(
                         text=prompts,
                         truncation=True,
                         max_length=max(
-                            min(self.
-                            - max_tokens
+                            min(self._tokeniser.model_max_length, MAX_CONTEXT_LENGTH)
+                            - max_tokens
+                            - extra_truncation,
                             0,
                         ),
                     )
-                    prompts = self.
+                    prompts = self._tokeniser.batch_decode(
                         sequences=tokenized_prompts.input_ids, skip_special_tokens=True
                     )
                 else:
                     raise InvalidBenchmark(
                         f"An error occurred during vLLM generation: {str(e)}"
-                    )
+                    ) from e
         else:
             raise InvalidBenchmark(
                 f"Could not generate sequences after {num_attempts} attempts."
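The retry loop above trims 50 extra tokens per failed attempt so that repeated "prompts too long" errors converge. A small illustrative helper showing the same arithmetic (the function name and the 10,000-token cap are assumptions, not EuroEval's constants):

def truncated_max_length(
    model_max_length: int, max_new_tokens: int, attempt: int, hard_cap: int = 10_000
) -> int:
    # Leave room for the completion, then trim 50 more tokens per retry.
    extra_truncation = 50 * attempt
    return max(min(model_max_length, hard_cap) - max_new_tokens - extra_truncation, 0)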
@@ -477,7 +524,7 @@ class VLLMModel(HuggingFaceEncoderModel):
         completion_ids: list[list[int]] = [
             output.outputs[0].token_ids for output in raw_outputs
         ]
-        completions = self.
+        completions = self._tokeniser.batch_decode(
             sequences=[
                 torch.LongTensor(completion_id) for completion_id in completion_ids
             ]
@@ -625,10 +672,10 @@ class VLLMModel(HuggingFaceEncoderModel):
         )


-def
+def load_model_and_tokeniser(
     model_config: "ModelConfig", benchmark_config: "BenchmarkConfig"
 ) -> tuple["LLM", "PreTrainedTokenizer"]:
-    """Load the model and
+    """Load the model and tokeniser.

     Args:
         model_config:
@@ -637,7 +684,7 @@ def load_model_and_tokenizer(
             The benchmark configuration.

     Returns:
-        A pair (model,
+        A pair (model, tokeniser), with the loaded model and tokeniser
     """
     # Prefer base model ID if the model is an adapter - the adapter will be added on
     # during inference in this case
@@ -675,7 +722,7 @@ def load_model_and_tokenizer(
     dtype: str | torch.dtype = "auto"

     # Choose bf16 over fp16 if the model is a fp32 model and the GPU supports it
-    if hf_model_config.
+    if hf_model_config.dtype == torch.float32:
         if torch.cuda.is_bf16_supported():
             logger.info(
                 "You are loading a model with dtype FP32, which we will convert to "
@@ -692,34 +739,32 @@ def load_model_and_tokenizer(
             dtype = torch.float16

     # If the model is a quantized model, we might need to change the dtype
-    if quantization == "mxfp4" and hf_model_config.
+    if quantization == "mxfp4" and hf_model_config.dtype is None:
         dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
         logger.debug(
-            "You are loading a quantized model where `
+            "You are loading a quantized model where `dtype` has not been set. "
             f"Setting dtype to {dtype!r}."
         )
-    elif quantization is not None and hf_model_config.
+    elif quantization is not None and hf_model_config.dtype != torch.float16:
         logger.info(
             "You are loading a quantized model with dtype "
-            f"{hf_model_config.
+            f"{hf_model_config.dtype}, which vLLM does not support. Setting "
             "dtype to float16 instead."
         )
         dtype = torch.float16

     # If the model is a bf16 model, we need to check the CUDA compute capability
-    if hf_model_config.
+    if hf_model_config.dtype == torch.bfloat16:
         min_cuda_compute_capability = get_min_cuda_compute_capability()
         required_capability = VLLM_BF16_MIN_CUDA_COMPUTE_CAPABILITY

         if min_cuda_compute_capability is not None:
             if min_cuda_compute_capability < required_capability:
                 logger.info(
-                    "You are loading a model with "
-
-                    "
-                    f"
-                    "You are using one or more devices with "
-                    f"compute capability {min_cuda_compute_capability}. "
+                    f"You are loading a model with dtype {hf_model_config.dtype}, "
+                    "which vLLM only supports for CUDA devices with CUDA compute "
+                    f"capability >={required_capability}. You are using one or more "
+                    f"devices with compute capability {min_cuda_compute_capability}. "
                     "Setting dtype to float16 instead."
                 )
                 dtype = torch.float16
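The bf16 downgrade above hinges on the CUDA compute capability of the attached GPUs. A rough standalone version of that check, assuming the 8.0 (Ampere) threshold behind VLLM_BF16_MIN_CUDA_COMPUTE_CAPABILITY:

import torch

def pick_dtype(config_dtype: "torch.dtype | None") -> "torch.dtype | str":
    # Illustrative dtype selection, loosely following the logic in the diff above.
    if config_dtype == torch.float32:
        return torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
    if config_dtype == torch.bfloat16:
        capabilities = [
            torch.cuda.get_device_capability(idx)
            for idx in range(torch.cuda.device_count())
        ]
        # bf16 is assumed to need compute capability 8.0 or newer under vLLM.
        if capabilities and min(capabilities) < (8, 0):
            return torch.float16
    return "auto"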
@@ -747,14 +792,14 @@ def load_model_and_tokenizer(
     else:
         true_max_model_len = MAX_CONTEXT_LENGTH

-
+    tokeniser = load_tokeniser(
         model_id=model_config.model_id,
         revision=model_config.revision,
         adapter_base_model_id=model_config.adapter_base_model_id,
         trust_remote_code=benchmark_config.trust_remote_code,
         model_max_length=true_max_model_len,
         model_cache_dir=model_config.model_cache_dir,
-        token=benchmark_config.api_key
+        token=get_hf_token(api_key=benchmark_config.api_key),
     )

     clear_vllm()
@@ -769,9 +814,7 @@ def load_model_and_tokenizer(
             trust_remote_code=benchmark_config.trust_remote_code,
             revision=revision,
             seed=4242,
-            distributed_executor_backend=
-                "ray" if torch.cuda.device_count() > 1 else "mp"
-            ),
+            distributed_executor_backend="mp",
             tensor_parallel_size=torch.cuda.device_count(),
             disable_custom_all_reduce=True,
             quantization=quantization,
@@ -782,29 +825,39 @@ def load_model_and_tokenizer(
             enable_prefix_caching=False,
             enable_lora=model_config.adapter_base_model_id is not None,
             max_lora_rank=256,
+            # Special arguments in case we are dealing with a Mistral model
+            tokenizer_mode="mistral"
+            if isinstance(tokeniser, MistralCommonTokenizer)
+            else "auto",
+            config_format="mistral"
+            if isinstance(tokeniser, MistralCommonTokenizer)
+            else "auto",
+            load_format="mistral"
+            if isinstance(tokeniser, MistralCommonTokenizer)
+            else "auto",
         )
     except (RuntimeError, ValueError, OSError) as e:
         if "awaiting a review from the repo authors" in str(e):
             raise InvalidModel(
                 f"The model {model_id!r} is awaiting a review from the repository "
                 "authors. Please try again later."
-            )
+            ) from e
         elif "trust_remote_code" in str(e):
             raise InvalidModel(
                 f"Loading the model {model_id!r} needs to trust remote code. "
                 "If you trust the suppliers of this model, then you can enable "
                 "this by setting the `--trust-remote-code` flag."
-            )
+            ) from e
         raise InvalidModel(
             f"The model {model_id!r} could not be loaded. The error was {e!r}."
-        )
+        ) from e

     model.config = hf_model_config

-    return model,
+    return model, tokeniser


-def
+def load_tokeniser(
     model_id: str,
     revision: str,
     adapter_base_model_id: str | None,
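In isolation, the Mistral-specific engine arguments added above amount to switching vLLM's tokeniser, config and weight formats together. A sketch under the assumption that a MistralCommonTokenizer was loaded (the model ID is a placeholder):

from transformers import MistralCommonTokenizer
from vllm import LLM

tokeniser = ...  # whatever load_tokeniser() returned for the model
is_mistral = isinstance(tokeniser, MistralCommonTokenizer)

llm = LLM(
    model="mistralai/Mistral-7B-Instruct-v0.3",  # placeholder model ID
    tokenizer_mode="mistral" if is_mistral else "auto",
    config_format="mistral" if is_mistral else "auto",
    load_format="mistral" if is_mistral else "auto",
)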
@@ -813,7 +866,7 @@ def load_tokenizer(
     model_cache_dir: str,
     token: str | bool,
 ) -> "PreTrainedTokenizer":
-    """Load the
+    """Load the tokeniser.

     Args:
         model_id:
@@ -833,7 +886,7 @@ def load_tokenizer(
             The Hugging Face API token.

     Returns:
-        The loaded
+        The loaded tokeniser.
     """
     revision = revision if adapter_base_model_id is None else "main"
     config = AutoConfig.from_pretrained(
@@ -846,7 +899,7 @@ def load_tokenizer(
     num_retries = 5
     for _ in range(num_retries):
         try:
-
+            tokeniser = AutoTokenizer.from_pretrained(
                 model_id,
                 use_fast=True,
                 verbose=False,
@@ -861,30 +914,45 @@ def load_tokenizer(
         except (json.JSONDecodeError, OSError, TypeError) as e:
             if adapter_base_model_id is None or model_id == adapter_base_model_id:
                 raise InvalidModel(
-                    f"Could not load
+                    f"Could not load tokeniser for model {model_id!r}. The error was "
                     f"{str(e)}."
-                )
+                ) from e
             logger.debug(
-                f"Could not load
+                f"Could not load tokeniser for {model_id!r}. Falling back to "
                 f"{adapter_base_model_id!r}."
             )
             model_id = adapter_base_model_id
         except (TimeoutError, RequestError):
-            logger.info(f"Couldn't load
+            logger.info(f"Couldn't load tokeniser for {model_id!r}. Retrying.")
             sleep(5)
             continue
+        except (KeyError, ValueError) as e:
+            if "mistral" in str(e).lower():
+                tokeniser = MistralCommonTokenizer.from_pretrained(
+                    model_id,
+                    padding_side="left",
+                    truncation_side="left",
+                    model_max_length=model_max_length,
+                    token=token,
+                )
+                break
+            raise InvalidModel(
+                f"Could not load tokeniser for model {model_id!r}. The error was "
+                f"{str(e)}."
+            ) from e
     else:
         raise InvalidModel(
-            f"Could not load
+            f"Could not load tokeniser for model {model_id!r} after {num_retries} "
             "attempts."
         )

     # Ensure that BOS, EOS and PAD tokens are set
-
-
-
+    if not isinstance(tokeniser, MistralCommonTokenizer):
+        tokeniser.bos_token, tokeniser.bos_token_id = get_bos_token(tokeniser=tokeniser)
+        tokeniser.eos_token, tokeniser.eos_token_id = get_eos_token(tokeniser=tokeniser)
+        tokeniser.pad_token, tokeniser.pad_token_id = get_pad_token(tokeniser=tokeniser)

-    return
+    return tokeniser


 def clear_vllm() -> None:
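Stripped of EuroEval's retry bookkeeping, the new KeyError/ValueError branch boils down to trying the standard tokeniser first and only falling back to Mistral's own format when that fails. Roughly (the helper name is illustrative):

from transformers import AutoTokenizer, MistralCommonTokenizer

def load_any_tokeniser(model_id: str, model_max_length: int):
    # Illustrative fallback: standard tokeniser first, Mistral format second.
    try:
        return AutoTokenizer.from_pretrained(
            model_id, use_fast=True, model_max_length=model_max_length
        )
    except (KeyError, ValueError) as e:
        if "mistral" not in str(e).lower():
            raise
        return MistralCommonTokenizer.from_pretrained(
            model_id,
            padding_side="left",
            truncation_side="left",
            model_max_length=model_max_length,
        )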
@@ -892,25 +960,21 @@ def clear_vllm() -> None:
     with contextlib.suppress(ValueError):
         destroy_model_parallel()
         destroy_distributed_environment()
-        if ray.is_initialized():
-            ray.shutdown()
     with contextlib.suppress(AssertionError):
         torch.distributed.destroy_process_group()
-        if ray.is_initialized():
-            ray.shutdown()
     clear_memory()


 def get_end_of_reasoning_token(
-    model: "LLM",
+    model: "LLM", tokeniser: "PreTrainedTokenizer", model_id: str
 ) -> str | None:
     """Get the end-of-reasoning token for a generative model.

     Args:
         model:
             The vLLM model.
-
-            The
+        tokeniser:
+            The tokeniser.
         model_id:
             The model ID.

@@ -919,11 +983,9 @@ def get_end_of_reasoning_token(
     """
     # Create a prompt to check if the model uses the reasoning tokens
     prompt = "What is your name?"
-    if
-        templated_prompt =
-            conversation=[dict(role="user", content=prompt)],
-            add_generation_prompt=True,
-            tokenize=False,
+    if has_chat_template(tokeniser=tokeniser):
+        templated_prompt = apply_chat_template(
+            conversation=[dict(role="user", content=prompt)], tokeniser=tokeniser
         )
         assert isinstance(templated_prompt, str)
         prompt = templated_prompt
@@ -948,7 +1010,7 @@ def get_end_of_reasoning_token(
             f"The model {model_id!r} did not generate any beginning-of-reasoning "
             "tokens in the prompt or the completion. Assuming the model is not "
             "a reasoning model.",
-            level=logging.
+            level=logging.DEBUG,
         )
         return None

@@ -974,7 +1036,7 @@ def get_end_of_reasoning_token(
             "the beginning-of-reasoning tokens "
             f"{[bor_token for bor_token, _ in bor_reasoning_matches]!r}. "
             "This is probably not correct, so please report this issue.",
-            level=logging.
+            level=logging.WARNING,
         )
         return None

@@ -984,14 +1046,14 @@ def get_end_of_reasoning_token(
         f"model {model_id!r}. Using {eor_reasoning_matches[0]!r} as "
         "the reasoning token. If this is not the correct reasoning token, "
         "please report this issue.",
-        level=logging.
+        level=logging.WARNING,
     )

     bor_token, eor_token = eor_reasoning_matches[0]
     log_once(
         f"Detected beginning-of-reasoning token {bor_token!r} and end-of-reasoning "
         f"token {eor_token!r} for model {model_id!r}.",
-        level=logging.
+        level=logging.DEBUG,
     )

     return eor_token
@@ -999,7 +1061,7 @@ def get_end_of_reasoning_token(

 def get_custom_stop_tokens(
     model: "LLM",
-
+    tokeniser: "PreTrainedTokenizer",
     model_id: str,
     is_reasoning_model: bool,
 ) -> list[str]:
@@ -1008,8 +1070,8 @@ def get_custom_stop_tokens(
     Args:
         model:
             The vLLM model.
-
-            The
+        tokeniser:
+            The tokeniser.
         model_id:
             The model ID.
         is_reasoning_model:
@@ -1022,11 +1084,9 @@
     candidate_stop_tokens = CUSTOM_STOP_TOKENS

     prompt = "Hello"
-    if
-        templated_prompt =
-            conversation=[dict(role="user", content=prompt)],
-            add_generation_prompt=True,
-            tokenize=False,
+    if has_chat_template(tokeniser=tokeniser):
+        templated_prompt = apply_chat_template(
+            conversation=[dict(role="user", content=prompt)], tokeniser=tokeniser
         )
         assert isinstance(templated_prompt, str)
         prompt = templated_prompt