vllm_npu-0.4.2-py3-none-any.whl
- vllm/__init__.py +23 -0
- vllm/_custom_ops.py +251 -0
- vllm/attention/__init__.py +13 -0
- vllm/attention/backends/__init__.py +0 -0
- vllm/attention/backends/abstract.py +127 -0
- vllm/attention/backends/flash_attn.py +271 -0
- vllm/attention/backends/flashinfer.py +220 -0
- vllm/attention/backends/rocm_flash_attn.py +374 -0
- vllm/attention/backends/torch_sdpa.py +250 -0
- vllm/attention/backends/xformers.py +393 -0
- vllm/attention/layer.py +56 -0
- vllm/attention/ops/__init__.py +0 -0
- vllm/attention/ops/paged_attn.py +216 -0
- vllm/attention/ops/prefix_prefill.py +792 -0
- vllm/attention/ops/triton_flash_attention.py +810 -0
- vllm/attention/selector.py +91 -0
- vllm/block.py +84 -0
- vllm/config.py +1225 -0
- vllm/core/__init__.py +0 -0
- vllm/core/block/__init__.py +0 -0
- vllm/core/block/block_table.py +295 -0
- vllm/core/block/common.py +199 -0
- vllm/core/block/cpu_gpu_block_allocator.py +228 -0
- vllm/core/block/interfaces.py +205 -0
- vllm/core/block/naive_block.py +318 -0
- vllm/core/block/prefix_caching_block.py +606 -0
- vllm/core/block_manager_v1.py +625 -0
- vllm/core/block_manager_v2.py +258 -0
- vllm/core/evictor_v1.py +105 -0
- vllm/core/evictor_v2.py +127 -0
- vllm/core/interfaces.py +113 -0
- vllm/core/policy.py +45 -0
- vllm/core/scheduler.py +1163 -0
- vllm/distributed/__init__.py +3 -0
- vllm/distributed/communication_op.py +237 -0
- vllm/distributed/device_communicators/__init__.py +0 -0
- vllm/distributed/device_communicators/custom_all_reduce.py +274 -0
- vllm/distributed/device_communicators/pynccl.py +287 -0
- vllm/distributed/device_communicators/pynccl_utils.py +66 -0
- vllm/distributed/parallel_state.py +339 -0
- vllm/distributed/utils.py +136 -0
- vllm/engine/__init__.py +0 -0
- vllm/engine/arg_utils.py +649 -0
- vllm/engine/async_llm_engine.py +737 -0
- vllm/engine/llm_engine.py +784 -0
- vllm/engine/metrics.py +368 -0
- vllm/engine/output_processor/__init__.py +0 -0
- vllm/engine/output_processor/interfaces.py +76 -0
- vllm/engine/output_processor/multi_step.py +142 -0
- vllm/engine/output_processor/single_step.py +284 -0
- vllm/engine/output_processor/stop_checker.py +101 -0
- vllm/engine/output_processor/util.py +19 -0
- vllm/entrypoints/__init__.py +0 -0
- vllm/entrypoints/api_server.py +119 -0
- vllm/entrypoints/llm.py +259 -0
- vllm/entrypoints/openai/__init__.py +0 -0
- vllm/entrypoints/openai/api_server.py +186 -0
- vllm/entrypoints/openai/cli_args.py +115 -0
- vllm/entrypoints/openai/protocol.py +460 -0
- vllm/entrypoints/openai/serving_chat.py +392 -0
- vllm/entrypoints/openai/serving_completion.py +347 -0
- vllm/entrypoints/openai/serving_engine.py +234 -0
- vllm/envs.py +217 -0
- vllm/executor/__init__.py +0 -0
- vllm/executor/cpu_executor.py +152 -0
- vllm/executor/distributed_gpu_executor.py +115 -0
- vllm/executor/executor_base.py +115 -0
- vllm/executor/gpu_executor.py +150 -0
- vllm/executor/multiproc_worker_utils.py +263 -0
- vllm/executor/neuron_executor.py +91 -0
- vllm/executor/ray_gpu_executor.py +327 -0
- vllm/executor/ray_utils.py +119 -0
- vllm/logger.py +153 -0
- vllm/logging/__init__.py +5 -0
- vllm/logging/formatter.py +15 -0
- vllm/lora/__init__.py +0 -0
- vllm/lora/fully_sharded_layers.py +262 -0
- vllm/lora/layers.py +1181 -0
- vllm/lora/lora.py +167 -0
- vllm/lora/models.py +645 -0
- vllm/lora/punica.py +213 -0
- vllm/lora/request.py +32 -0
- vllm/lora/utils.py +98 -0
- vllm/lora/worker_manager.py +251 -0
- vllm/model_executor/__init__.py +7 -0
- vllm/model_executor/guided_decoding/__init__.py +25 -0
- vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py +70 -0
- vllm/model_executor/guided_decoding/outlines_decoding.py +130 -0
- vllm/model_executor/guided_decoding/outlines_logits_processors.py +184 -0
- vllm/model_executor/layers/__init__.py +0 -0
- vllm/model_executor/layers/activation.py +173 -0
- vllm/model_executor/layers/fused_moe/__init__.py +7 -0
- vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json +140 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- vllm/model_executor/layers/fused_moe/fused_moe.py +479 -0
- vllm/model_executor/layers/layernorm.py +71 -0
- vllm/model_executor/layers/linear.py +709 -0
- vllm/model_executor/layers/logits_processor.py +115 -0
- vllm/model_executor/layers/ops/__init__.py +0 -0
- vllm/model_executor/layers/ops/rand.py +157 -0
- vllm/model_executor/layers/ops/sample.py +406 -0
- vllm/model_executor/layers/quantization/__init__.py +35 -0
- vllm/model_executor/layers/quantization/aqlm.py +376 -0
- vllm/model_executor/layers/quantization/awq.py +175 -0
- vllm/model_executor/layers/quantization/base_config.py +97 -0
- vllm/model_executor/layers/quantization/fp8.py +265 -0
- vllm/model_executor/layers/quantization/gptq.py +224 -0
- vllm/model_executor/layers/quantization/gptq_marlin.py +438 -0
- vllm/model_executor/layers/quantization/marlin.py +227 -0
- vllm/model_executor/layers/quantization/schema.py +84 -0
- vllm/model_executor/layers/quantization/squeezellm.py +137 -0
- vllm/model_executor/layers/rejection_sampler.py +405 -0
- vllm/model_executor/layers/rotary_embedding.py +525 -0
- vllm/model_executor/layers/sampler.py +1051 -0
- vllm/model_executor/layers/vocab_parallel_embedding.py +155 -0
- vllm/model_executor/model_loader/__init__.py +30 -0
- vllm/model_executor/model_loader/loader.py +362 -0
- vllm/model_executor/model_loader/neuron.py +136 -0
- vllm/model_executor/model_loader/tensorizer.py +368 -0
- vllm/model_executor/model_loader/utils.py +41 -0
- vllm/model_executor/model_loader/weight_utils.py +372 -0
- vllm/model_executor/models/__init__.py +119 -0
- vllm/model_executor/models/baichuan.py +410 -0
- vllm/model_executor/models/bloom.py +327 -0
- vllm/model_executor/models/chatglm.py +386 -0
- vllm/model_executor/models/commandr.py +373 -0
- vllm/model_executor/models/dbrx.py +413 -0
- vllm/model_executor/models/decilm.py +122 -0
- vllm/model_executor/models/deepseek.py +438 -0
- vllm/model_executor/models/falcon.py +444 -0
- vllm/model_executor/models/gemma.py +393 -0
- vllm/model_executor/models/gpt2.py +266 -0
- vllm/model_executor/models/gpt_bigcode.py +274 -0
- vllm/model_executor/models/gpt_j.py +281 -0
- vllm/model_executor/models/gpt_neox.py +295 -0
- vllm/model_executor/models/internlm2.py +323 -0
- vllm/model_executor/models/jais.py +333 -0
- vllm/model_executor/models/llama.py +442 -0
- vllm/model_executor/models/llava.py +239 -0
- vllm/model_executor/models/minicpm.py +531 -0
- vllm/model_executor/models/mixtral.py +583 -0
- vllm/model_executor/models/mixtral_quant.py +404 -0
- vllm/model_executor/models/mpt.py +295 -0
- vllm/model_executor/models/olmo.py +356 -0
- vllm/model_executor/models/opt.py +349 -0
- vllm/model_executor/models/orion.py +319 -0
- vllm/model_executor/models/phi.py +300 -0
- vllm/model_executor/models/qwen.py +284 -0
- vllm/model_executor/models/qwen2.py +367 -0
- vllm/model_executor/models/qwen2_moe.py +447 -0
- vllm/model_executor/models/stablelm.py +301 -0
- vllm/model_executor/models/starcoder2.py +302 -0
- vllm/model_executor/models/xverse.py +366 -0
- vllm/model_executor/sampling_metadata.py +588 -0
- vllm/model_executor/utils.py +35 -0
- vllm/outputs.py +150 -0
- vllm/py.typed +2 -0
- vllm/sampling_params.py +340 -0
- vllm/sequence.py +766 -0
- vllm/spec_decode/__init__.py +0 -0
- vllm/spec_decode/batch_expansion.py +397 -0
- vllm/spec_decode/interfaces.py +73 -0
- vllm/spec_decode/metrics.py +191 -0
- vllm/spec_decode/multi_step_worker.py +203 -0
- vllm/spec_decode/ngram_worker.py +176 -0
- vllm/spec_decode/spec_decode_worker.py +472 -0
- vllm/spec_decode/top1_proposer.py +200 -0
- vllm/spec_decode/util.py +228 -0
- vllm/test_utils.py +41 -0
- vllm/transformers_utils/__init__.py +0 -0
- vllm/transformers_utils/config.py +58 -0
- vllm/transformers_utils/configs/__init__.py +16 -0
- vllm/transformers_utils/configs/chatglm.py +68 -0
- vllm/transformers_utils/configs/dbrx.py +278 -0
- vllm/transformers_utils/configs/falcon.py +87 -0
- vllm/transformers_utils/configs/jais.py +236 -0
- vllm/transformers_utils/configs/mpt.py +178 -0
- vllm/transformers_utils/detokenizer.py +313 -0
- vllm/transformers_utils/tokenizer.py +149 -0
- vllm/transformers_utils/tokenizer_group/__init__.py +33 -0
- vllm/transformers_utils/tokenizer_group/base_tokenizer_group.py +55 -0
- vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py +169 -0
- vllm/transformers_utils/tokenizer_group/tokenizer_group.py +78 -0
- vllm/transformers_utils/tokenizers/__init__.py +5 -0
- vllm/transformers_utils/tokenizers/baichuan.py +255 -0
- vllm/usage/__init__.py +0 -0
- vllm/usage/usage_lib.py +209 -0
- vllm/utils.py +677 -0
- vllm/worker/__init__.py +0 -0
- vllm/worker/cache_engine.py +105 -0
- vllm/worker/cpu_model_runner.py +346 -0
- vllm/worker/cpu_worker.py +321 -0
- vllm/worker/model_runner.py +1168 -0
- vllm/worker/neuron_model_runner.py +196 -0
- vllm/worker/neuron_worker.py +98 -0
- vllm/worker/worker.py +345 -0
- vllm/worker/worker_base.py +146 -0
- vllm_npu-0.4.2.dist-info/LICENSE +201 -0
- vllm_npu-0.4.2.dist-info/METADATA +173 -0
- vllm_npu-0.4.2.dist-info/RECORD +219 -0
- vllm_npu-0.4.2.dist-info/WHEEL +5 -0
- vllm_npu-0.4.2.dist-info/top_level.txt +1 -0
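
The package layout above mirrors upstream vLLM 0.4.2 with NPU-oriented packaging. As a quick orientation before the per-file diffs, the sketch below shows how such a wheel is typically exercised through the standard vLLM entrypoints; it assumes the public `LLM`/`SamplingParams` API from vllm/entrypoints/llm.py and vllm/sampling_params.py is unchanged in this build, and the model name is only a placeholder.

```python
# Hypothetical smoke test; assumes the standard vLLM 0.4.2 entrypoints are
# preserved in this build and that "facebook/opt-125m" (a placeholder) is
# available locally or downloadable.
from vllm import LLM, SamplingParams

llm = LLM(model="facebook/opt-125m")
params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=32)

outputs = llm.generate(["The capital of France is"], params)
for output in outputs:
    # Each RequestOutput carries the prompt and its generated completions.
    print(output.prompt, output.outputs[0].text)
```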
vllm/transformers_utils/detokenizer.py
@@ -0,0 +1,313 @@
+from typing import Dict, List, Optional, Tuple, Union
+
+from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast
+
+from vllm.sequence import Logprob, SamplingParams, Sequence, SequenceGroup
+from vllm.transformers_utils.tokenizer_group.base_tokenizer_group import (
+    BaseTokenizerGroup)
+
+# Used e.g. for marking rejected tokens in spec decoding.
+INVALID_TOKEN_ID = -1
+
+
+class Detokenizer:
+    """Provides methods to decode the output of a model into text."""
+
+    def __init__(self, tokenizer_group: BaseTokenizerGroup):
+        self.tokenizer_group = tokenizer_group
+
+    def get_tokenizer_for_seq(self,
+                              sequence: Sequence) -> "PreTrainedTokenizer":
+        """Returns the HF tokenizer to use for a given sequence."""
+        return self.tokenizer_group.get_lora_tokenizer(sequence.lora_request)
+
+    def decode_prompt_logprobs_inplace(
+            self, seq_group: SequenceGroup,
+            prompt_logprobs: List[Optional[Dict[int, Logprob]]]) -> None:
+        """Decodes the logprobs for the prompt of a sequence group.
+
+        Args:
+            seq_group: The sequence group to decode.
+            prompt_logprobs: The logprobs to decode.
+
+        Returns:
+            The prompt logprobs with the decoded tokens.
+        """
+        prms = seq_group.sampling_params
+        # We can pick any sequence for the prompt.
+        seq = next(iter(seq_group.seqs_dict.values()))
+        # Only prompt, without the generated token.
+        all_token_ids = seq.get_token_ids()
+        prompt_token_ids = all_token_ids[:-1]
+        tokenizer = self.get_tokenizer_for_seq(seq)
+        prefix_offset = 0
+        read_offset = 0
+        next_iter_prefix_offset = 0
+        next_iter_read_offset = 0
+        next_iter_tokens = []
+        prev_tokens = None
+
+        for token_position, prompt_logprobs_for_token in enumerate(
+                prompt_logprobs):
+            if not prompt_logprobs_for_token:
+                continue
+            for token_id, sample_logprob in prompt_logprobs_for_token.items():
+                if (sample_logprob.decoded_token is None
+                        and token_id != INVALID_TOKEN_ID):
+                    prompt_token_ids_with_token = (
+                        prompt_token_ids[:token_position] + [token_id])
+                    (new_tokens, new_text, new_prefix_offset,
+                     new_read_offset) = detokenize_incrementally(
+                         tokenizer=tokenizer,
+                         all_input_ids=prompt_token_ids_with_token,
+                         prev_tokens=prev_tokens,
+                         prefix_offset=prefix_offset,
+                         read_offset=read_offset,
+                         skip_special_tokens=prms.skip_special_tokens,
+                         spaces_between_special_tokens=prms.
+                         spaces_between_special_tokens,
+                     )
+
+                    sample_logprob.decoded_token = new_text
+
+                    # Use the offsets & prev tokens corresponding to
+                    # real tokens to ensure detokenization is consistent
+                    # with the actual prompt.
+                    if token_id == all_token_ids[token_position]:
+                        next_iter_prefix_offset = new_prefix_offset
+                        next_iter_read_offset = new_read_offset
+                        next_iter_tokens = new_tokens
+
+            # Advance to the next token position.
+            prefix_offset = next_iter_prefix_offset
+            read_offset = next_iter_read_offset
+            if prev_tokens is None:
+                prev_tokens = next_iter_tokens
+            else:
+                prev_tokens.extend(next_iter_tokens)
+
+    def decode_sequence_inplace(self, seq: Sequence,
+                                prms: SamplingParams) -> int:
+        """Decodes the new token for a sequence. In-place operation.
+
+        Args:
+            seq: The sequence to decode.
+            prms: The sampling parameters used to generate the sequence.
+
+        Returns:
+            The number of characters added to the output text.
+        """
+        all_input_ids = seq.get_token_ids()
+        token_id_generated_this_iteration = all_input_ids[-1]
+        tokenizer = self.get_tokenizer_for_seq(seq)
+
+        # Convert prompt token IDs to tokens if necessary.
+        # Do it here so that we don't have to repeat this
+        # computation for each logprob.
+        if seq.tokens is None:
+            (seq.tokens, seq.prefix_offset,
+             seq.read_offset) = convert_prompt_ids_to_tokens(
+                 tokenizer=tokenizer,
+                 prompt_ids=all_input_ids[:-1],
+                 skip_special_tokens=prms.skip_special_tokens,
+             )
+
+        (new_tokens, new_decoded_token_text, prefix_offset,
+         read_offset) = detokenize_incrementally(
+             tokenizer=tokenizer,
+             all_input_ids=all_input_ids,
+             prev_tokens=seq.tokens,
+             prefix_offset=seq.prefix_offset,
+             read_offset=seq.read_offset,
+             skip_special_tokens=prms.skip_special_tokens,
+             spaces_between_special_tokens=prms.spaces_between_special_tokens,
+         )
+
+        # Decode logprobs
+        logprobs = seq.output_logprobs[-1]
+        if logprobs:
+            previous_tokens = all_input_ids[:-1]
+            for token_id, sample_logprob in logprobs.items():
+                # If the token was generated this iteration,
+                # use the provided text.
+                if token_id == token_id_generated_this_iteration:
+                    sample_logprob.decoded_token = new_decoded_token_text
+                    continue
+
+                if (sample_logprob.decoded_token is None
+                        and token_id != INVALID_TOKEN_ID):
+                    all_input_ids_with_logprob = previous_tokens + [token_id]
+                    (_, new_text, _, _) = detokenize_incrementally(
+                        tokenizer=tokenizer,
+                        all_input_ids=all_input_ids_with_logprob,
+                        prev_tokens=seq.tokens,
+                        prefix_offset=seq.prefix_offset,
+                        read_offset=seq.read_offset,
+                        skip_special_tokens=prms.skip_special_tokens,
+                        spaces_between_special_tokens=prms.
+                        spaces_between_special_tokens,
+                    )
+                    sample_logprob.decoded_token = new_text
+
+        seq.tokens.extend(new_tokens)
+        seq.prefix_offset = prefix_offset
+        seq.read_offset = read_offset
+        seq.output_text += new_decoded_token_text
+
+        return len(new_decoded_token_text)
+
+
+def _convert_tokens_to_string_with_added_encoders(
+    tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
+    output_tokens: List[str],
+    skip_special_tokens: bool,
+    spaces_between_special_tokens: bool,
+) -> str:
+    # Adapted from
+    # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/tokenization_utils.py#L921
+    # NOTE(woosuk): The following code is slow because it runs a for loop over
+    # the output_tokens. In Python, running a for loop over a list can be slow
+    # even when the loop body is very simple.
+    sub_texts: List[str] = []
+    current_sub_text: List[str] = []
+    all_special_tokens = set(tokenizer.all_special_tokens)
+    for token in output_tokens:
+        if skip_special_tokens and token in all_special_tokens:
+            continue
+        if token in tokenizer.get_added_vocab():
+            if current_sub_text:
+                sub_text = tokenizer.convert_tokens_to_string(current_sub_text)
+                sub_texts.append(sub_text)
+                current_sub_text = []
+            sub_texts.append(token)
+        else:
+            current_sub_text.append(token)
+    if current_sub_text:
+        sub_text = tokenizer.convert_tokens_to_string(current_sub_text)
+        sub_texts.append(sub_text)
+    if spaces_between_special_tokens:
+        return " ".join(sub_texts)
+    else:
+        return "".join(sub_texts)
+
+
+# 5 is an arbitrary value that should work for all
+# tokenizers (bigger = more conservative).
+INITIAL_INCREMENTAL_DETOKENIZATION_OFFSET = 5
+
+
+def convert_prompt_ids_to_tokens(
+    tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
+    prompt_ids: List[int],
+    skip_special_tokens: bool = False,
+) -> Tuple[List[str], int, int]:
+    """Converts the prompt ids to tokens and returns the tokens and offsets
+    for incremental detokenization.
+
+    Note that not all tokens are converted to strings. Only the tokens that
+    are necessary for incremental detokenization are converted to strings.
+    """
+    # We do not need to convert the whole prompt to tokens.
+    # Offset a little more in case we have special tokens.
+    new_tokens = tokenizer.convert_ids_to_tokens(
+        prompt_ids[-INITIAL_INCREMENTAL_DETOKENIZATION_OFFSET - 2:],
+        skip_special_tokens=skip_special_tokens)
+    read_offset = len(new_tokens)
+    prefix_offset = max(
+        read_offset - INITIAL_INCREMENTAL_DETOKENIZATION_OFFSET, 0)
+    return new_tokens, prefix_offset, read_offset
+
+
+# Based on
+# https://github.com/huggingface/text-generation-inference/blob/v0.9.4/server/text_generation_server/models/model.py#L62C9-L62C15
+# under Apache 2.0 license
+def detokenize_incrementally(
+    tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
+    all_input_ids: List[int],
+    prev_tokens: Optional[List[str]],
+    prefix_offset: int,
+    read_offset: int,
+    skip_special_tokens: bool = False,
+    spaces_between_special_tokens: bool = True,
+) -> Tuple[List[str], str, int, int]:
+    """Detokenizes the input ids incrementally and returns the new tokens
+    and the new text.
+
+    If `prev_tokens` is None, this function will convert the input ids to
+    tokens and return the tokens and the new text. Otherwise, it will return
+    only the new tokens and the new text.
+
+    This function will also return the new prefix offset and the new read
+    offset to be used in the next iteration.
+
+    The offsets are necessary to defeat cleanup algorithms in the decode which
+    decide to add a space or not depending on the surrounding ids.
+
+    Args:
+        tokenizer: The tokenizer to use.
+        all_input_ids: The input ids. The last id is the new token id.
+        prev_tokens: The previous tokens. If None, this function will convert
+            the input ids to tokens and return the tokens and the new text.
+        prefix_offset: The prefix offset.
+        read_offset: The read offset.
+        skip_special_tokens: Whether to skip special tokens.
+        spaces_between_special_tokens: Whether to add spaces between special
+            tokens.
+    """
+    new_token_id = all_input_ids[-1]
+    # This is the first iteration for this sequence
+    is_first_iter = prev_tokens is None
+    if is_first_iter:
+        (prev_tokens, prefix_offset,
+         read_offset) = convert_prompt_ids_to_tokens(
+             tokenizer,
+             all_input_ids[:-1],
+             skip_special_tokens=skip_special_tokens)
+    assert prev_tokens is not None
+
+    # If the new token id is out of bounds, return an empty string.
+    if new_token_id >= len(tokenizer):
+        new_tokens = [""]
+    else:
+        # Put new_token_id in a list so skip_special_tokens is respected
+        new_tokens = tokenizer.convert_ids_to_tokens(
+            [new_token_id], skip_special_tokens=skip_special_tokens)
+        if isinstance(new_tokens, str):
+            new_tokens = [new_tokens]
+    output_tokens = prev_tokens + new_tokens
+
+    # If this is the first iteration, return all tokens.
+    if is_first_iter:
+        new_tokens = output_tokens
+
+    # The prefix text is necessary only to defeat cleanup algorithms in
+    # the decode which decide to add a space or not depending on the
+    # surrounding ids.
+    if tokenizer.is_fast or not tokenizer.get_added_vocab():
+        prefix_text = tokenizer.convert_tokens_to_string(
+            output_tokens[prefix_offset:read_offset])
+        new_text = tokenizer.convert_tokens_to_string(
+            output_tokens[prefix_offset:])
+    else:
+        prefix_text = _convert_tokens_to_string_with_added_encoders(
+            tokenizer,
+            output_tokens[prefix_offset:read_offset],
+            skip_special_tokens=skip_special_tokens,
+            spaces_between_special_tokens=spaces_between_special_tokens,
+        )
+        new_text = _convert_tokens_to_string_with_added_encoders(
+            tokenizer,
+            output_tokens[prefix_offset:],
+            skip_special_tokens=skip_special_tokens,
+            spaces_between_special_tokens=spaces_between_special_tokens,
+        )
+
+    if len(new_text) <= len(prefix_text) or new_text.endswith("�"):
+        # utf-8 char at the end means it's a potential unfinished byte sequence
+        # from byte fallback tokenization.
+        # If it's in the middle, it's probably a real invalid id generated
+        # by the model
+        return new_tokens, "", prefix_offset, read_offset
+
+    new_text = new_text[len(prefix_text):]
+    return new_tokens, new_text, read_offset, len(output_tokens)
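
For readers outside the engine, the incremental-detokenization helpers in vllm/transformers_utils/detokenizer.py can also be driven directly. The following is a minimal sketch, assuming a locally available HuggingFace tokenizer ("gpt2" and the sample strings are placeholders): it seeds the token window with convert_prompt_ids_to_tokens and then, one token id at a time, appends only the newly completed text, which is exactly what the prefix/read offsets exist for.

```python
# Hypothetical usage sketch; "gpt2" and the sample text are illustrative only.
from transformers import AutoTokenizer

from vllm.transformers_utils.detokenizer import (
    convert_prompt_ids_to_tokens, detokenize_incrementally)

tokenizer = AutoTokenizer.from_pretrained("gpt2")
prompt_ids = tokenizer.encode("Incremental decoding keeps partial")
generated_ids = tokenizer.encode(" text stable across steps.")

# Seed the token window and offsets from the prompt alone.
prev_tokens, prefix_offset, read_offset = convert_prompt_ids_to_tokens(
    tokenizer, prompt_ids)

all_ids = list(prompt_ids)
output_text = ""
for token_id in generated_ids:
    all_ids.append(token_id)
    new_tokens, new_text, prefix_offset, read_offset = (
        detokenize_incrementally(
            tokenizer,
            all_input_ids=all_ids,
            prev_tokens=prev_tokens,
            prefix_offset=prefix_offset,
            read_offset=read_offset))
    prev_tokens.extend(new_tokens)
    output_text += new_text  # only the newly completed characters

print(output_text)
```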
vllm/transformers_utils/tokenizer.py
@@ -0,0 +1,149 @@
+import os
+from typing import Optional, Union
+
+import huggingface_hub
+from transformers import (AutoTokenizer, PreTrainedTokenizer,
+                          PreTrainedTokenizerFast)
+
+from vllm.envs import VLLM_USE_MODELSCOPE
+from vllm.logger import init_logger
+from vllm.lora.request import LoRARequest
+from vllm.transformers_utils.tokenizers import BaichuanTokenizer
+from vllm.utils import make_async
+
+logger = init_logger(__name__)
+
+
+def get_cached_tokenizer(
+    tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast]
+) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
+    """Get tokenizer with cached properties.
+
+    This will patch the tokenizer object in place.
+
+    By default, transformers will recompute multiple tokenizer properties
+    each time they are called, leading to a significant slowdown. This
+    function caches these properties for faster access."""
+
+    tokenizer_all_special_ids = set(tokenizer.all_special_ids)
+    tokenizer_all_special_tokens_extended = (
+        tokenizer.all_special_tokens_extended)
+    tokenizer_all_special_tokens = set(tokenizer.all_special_tokens)
+    tokenizer_len = len(tokenizer)
+
+    class CachedTokenizer(tokenizer.__class__):  # type: ignore
+
+        @property
+        def all_special_ids(self):
+            return tokenizer_all_special_ids
+
+        @property
+        def all_special_tokens(self):
+            return tokenizer_all_special_tokens
+
+        @property
+        def all_special_tokens_extended(self):
+            return tokenizer_all_special_tokens_extended
+
+        def __len__(self):
+            return tokenizer_len
+
+    CachedTokenizer.__name__ = f"Cached{tokenizer.__class__.__name__}"
+
+    tokenizer.__class__ = CachedTokenizer
+    return tokenizer
+
+
+def get_tokenizer(
+    tokenizer_name: str,
+    *args,
+    tokenizer_mode: str = "auto",
+    trust_remote_code: bool = False,
+    revision: Optional[str] = None,
+    download_dir: Optional[str] = None,
+    **kwargs,
+) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
+    """Gets a tokenizer for the given model name via HuggingFace or ModelScope.
+    """
+    if VLLM_USE_MODELSCOPE:
+        # download model from ModelScope hub,
+        # lazy import so that modelscope is not required for normal use.
+        # pylint: disable=C.
+        from modelscope.hub.snapshot_download import snapshot_download
+
+        # Only set the tokenizer here, model will be downloaded on the workers.
+        if not os.path.exists(tokenizer_name):
+            tokenizer_path = snapshot_download(
+                model_id=tokenizer_name,
+                cache_dir=download_dir,
+                revision=revision,
+                local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
+                # Ignore weights - we only need the tokenizer.
+                ignore_file_pattern=[".*.pt", ".*.safetensors", ".*.bin"])
+            tokenizer_name = tokenizer_path
+
+    if tokenizer_mode == "slow":
+        if kwargs.get("use_fast", False):
+            raise ValueError(
+                "Cannot use the fast tokenizer in slow tokenizer mode.")
+        kwargs["use_fast"] = False
+
+    try:
+        tokenizer = AutoTokenizer.from_pretrained(
+            tokenizer_name,
+            *args,
+            trust_remote_code=trust_remote_code,
+            revision=revision,
+            **kwargs)
+    except ValueError as e:
+        # If the error pertains to the tokenizer class not existing or not
+        # currently being imported, suggest using the --trust-remote-code flag.
+        if (not trust_remote_code and
+                ("does not exist or is not currently imported." in str(e)
+                 or "requires you to execute the tokenizer file" in str(e))):
+            err_msg = (
+                "Failed to load the tokenizer. If the tokenizer is a custom "
+                "tokenizer not yet available in the HuggingFace transformers "
+                "library, consider setting `trust_remote_code=True` in LLM "
+                "or using the `--trust-remote-code` flag in the CLI.")
+            raise RuntimeError(err_msg) from e
+        else:
+            raise e
+    except AttributeError as e:
+        if "BaichuanTokenizer" in str(e):
+            # This is for the error "'BaichuanTokenizer' object has no
+            # attribute 'sp_model'".
+            tokenizer = BaichuanTokenizer.from_pretrained(
+                tokenizer_name,
+                *args,
+                trust_remote_code=trust_remote_code,
+                revision=revision,
+                **kwargs)
+        else:
+            raise e
+
+    if not isinstance(tokenizer, PreTrainedTokenizerFast):
+        logger.warning(
+            "Using a slow tokenizer. This might cause a significant "
+            "slowdown. Consider using a fast tokenizer instead.")
+    return get_cached_tokenizer(tokenizer)
+
+
+def get_lora_tokenizer(lora_request: LoRARequest, *args,
+                       **kwargs) -> Optional[PreTrainedTokenizer]:
+    if lora_request is None:
+        return None
+    try:
+        tokenizer = get_tokenizer(lora_request.lora_local_path, *args,
+                                  **kwargs)
+    except OSError as e:
+        # No tokenizer was found in the LoRA folder,
+        # use base model tokenizer
+        logger.warning(
+            "No tokenizer found in %s, using base model tokenizer instead. "
+            "(Exception: %s)", lora_request.lora_local_path, e)
+        tokenizer = None
+    return tokenizer
+
+
+get_lora_tokenizer_async = make_async(get_lora_tokenizer)
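
get_cached_tokenizer patches the tokenizer in place: its class is swapped for a generated Cached<...> subclass whose special-token properties and __len__ return precomputed values instead of being rebuilt on every access. A small sketch of calling get_tokenizer from this module; the model name is a placeholder and the printed class name is indicative only.

```python
# Illustrative only; "facebook/opt-125m" is a stand-in model name.
from vllm.transformers_utils.tokenizer import get_tokenizer

tok = get_tokenizer("facebook/opt-125m", tokenizer_mode="auto",
                    trust_remote_code=False)

# The returned object is the original HF tokenizer patched in place:
# its class is now Cached<OriginalClass>, so these lookups hit the
# precomputed values rather than being recomputed on every call.
print(type(tok).__name__)        # e.g. "CachedGPT2TokenizerFast"
special_ids = tok.all_special_ids  # cached set
vocab_size = len(tok)              # cached length
ids = tok.encode("Hello, world")
```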
vllm/transformers_utils/tokenizer_group/__init__.py
@@ -0,0 +1,33 @@
+from typing import Optional
+
+from vllm.config import TokenizerPoolConfig
+from vllm.executor.ray_utils import ray
+from vllm.transformers_utils.tokenizer_group.base_tokenizer_group import (
+    BaseTokenizerGroup)
+from vllm.transformers_utils.tokenizer_group.tokenizer_group import (
+    TokenizerGroup)
+
+if ray:
+    from vllm.transformers_utils.tokenizer_group.ray_tokenizer_group import (
+        RayTokenizerGroupPool)
+else:
+    RayTokenizerGroupPool = None  # type: ignore
+
+
+def get_tokenizer_group(tokenizer_pool_config: Optional[TokenizerPoolConfig],
+                        **init_kwargs) -> BaseTokenizerGroup:
+    if tokenizer_pool_config is None:
+        return TokenizerGroup(**init_kwargs)
+    if tokenizer_pool_config.pool_type == "ray":
+        if RayTokenizerGroupPool is None:
+            raise ImportError(
+                "RayTokenizerGroupPool is not available. Please install "
+                "the ray package to use the Ray tokenizer group pool.")
+        return RayTokenizerGroupPool.from_config(tokenizer_pool_config,
+                                                 **init_kwargs)
+    else:
+        raise ValueError(
+            f"Unknown pool type: {tokenizer_pool_config.pool_type}")
+
+
+__all__ = ["get_tokenizer_group", "BaseTokenizerGroup"]
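
get_tokenizer_group is the factory the engine uses to pick between the in-process TokenizerGroup and the Ray-backed pool. A sketch of the no-pool path follows; the keyword arguments are an assumption about what TokenizerGroup.__init__ accepts in this wheel (tokenizer_id, enable_lora, max_num_seqs, max_input_length) and the values are illustrative, while the encode signature comes from the abstract base class shown in the next file.

```python
# Sketch of the selection logic from the caller's side; the constructor
# keywords are assumed, not confirmed by this diff.
from vllm.transformers_utils.tokenizer_group import get_tokenizer_group

# With no pool config, a plain in-process TokenizerGroup is returned.
group = get_tokenizer_group(
    tokenizer_pool_config=None,
    tokenizer_id="facebook/opt-125m",  # placeholder model name
    enable_lora=False,
    max_num_seqs=16,
    max_input_length=None,
)
token_ids = group.encode(prompt="Hello, world", request_id="req-0")
```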
vllm/transformers_utils/tokenizer_group/base_tokenizer_group.py
@@ -0,0 +1,55 @@
+from abc import ABC, abstractmethod
+from typing import List, Optional
+
+from transformers import PreTrainedTokenizer
+
+from vllm.lora.request import LoRARequest
+
+
+class BaseTokenizerGroup(ABC):
+    """A group of tokenizers that can be used for LoRA adapters."""
+
+    @abstractmethod
+    def ping(self) -> bool:
+        """Check if the tokenizer group is alive."""
+        pass
+
+    @abstractmethod
+    def get_max_input_len(self,
+                          lora_request: Optional[LoRARequest] = None
+                          ) -> Optional[int]:
+        """Get the maximum input length for the LoRA request."""
+        pass
+
+    @abstractmethod
+    def encode(self,
+               prompt: str,
+               request_id: Optional[str] = None,
+               lora_request: Optional[LoRARequest] = None) -> List[int]:
+        """Encode a prompt using the tokenizer group."""
+        pass
+
+    @abstractmethod
+    async def encode_async(
+            self,
+            prompt: str,
+            request_id: Optional[str] = None,
+            lora_request: Optional[LoRARequest] = None) -> List[int]:
+        """Encode a prompt using the tokenizer group."""
+        pass
+
+    @abstractmethod
+    def get_lora_tokenizer(
+            self,
+            lora_request: Optional[LoRARequest] = None
+    ) -> "PreTrainedTokenizer":
+        """Get a tokenizer for a LoRA request."""
+        pass
+
+    @abstractmethod
+    async def get_lora_tokenizer_async(
+            self,
+            lora_request: Optional[LoRARequest] = None
+    ) -> "PreTrainedTokenizer":
+        """Get a tokenizer for a LoRA request."""
+        pass