vllm_npu-0.4.2-py3-none-any.whl

Files changed (219)
  1. vllm/__init__.py +23 -0
  2. vllm/_custom_ops.py +251 -0
  3. vllm/attention/__init__.py +13 -0
  4. vllm/attention/backends/__init__.py +0 -0
  5. vllm/attention/backends/abstract.py +127 -0
  6. vllm/attention/backends/flash_attn.py +271 -0
  7. vllm/attention/backends/flashinfer.py +220 -0
  8. vllm/attention/backends/rocm_flash_attn.py +374 -0
  9. vllm/attention/backends/torch_sdpa.py +250 -0
  10. vllm/attention/backends/xformers.py +393 -0
  11. vllm/attention/layer.py +56 -0
  12. vllm/attention/ops/__init__.py +0 -0
  13. vllm/attention/ops/paged_attn.py +216 -0
  14. vllm/attention/ops/prefix_prefill.py +792 -0
  15. vllm/attention/ops/triton_flash_attention.py +810 -0
  16. vllm/attention/selector.py +91 -0
  17. vllm/block.py +84 -0
  18. vllm/config.py +1225 -0
  19. vllm/core/__init__.py +0 -0
  20. vllm/core/block/__init__.py +0 -0
  21. vllm/core/block/block_table.py +295 -0
  22. vllm/core/block/common.py +199 -0
  23. vllm/core/block/cpu_gpu_block_allocator.py +228 -0
  24. vllm/core/block/interfaces.py +205 -0
  25. vllm/core/block/naive_block.py +318 -0
  26. vllm/core/block/prefix_caching_block.py +606 -0
  27. vllm/core/block_manager_v1.py +625 -0
  28. vllm/core/block_manager_v2.py +258 -0
  29. vllm/core/evictor_v1.py +105 -0
  30. vllm/core/evictor_v2.py +127 -0
  31. vllm/core/interfaces.py +113 -0
  32. vllm/core/policy.py +45 -0
  33. vllm/core/scheduler.py +1163 -0
  34. vllm/distributed/__init__.py +3 -0
  35. vllm/distributed/communication_op.py +237 -0
  36. vllm/distributed/device_communicators/__init__.py +0 -0
  37. vllm/distributed/device_communicators/custom_all_reduce.py +274 -0
  38. vllm/distributed/device_communicators/pynccl.py +287 -0
  39. vllm/distributed/device_communicators/pynccl_utils.py +66 -0
  40. vllm/distributed/parallel_state.py +339 -0
  41. vllm/distributed/utils.py +136 -0
  42. vllm/engine/__init__.py +0 -0
  43. vllm/engine/arg_utils.py +649 -0
  44. vllm/engine/async_llm_engine.py +737 -0
  45. vllm/engine/llm_engine.py +784 -0
  46. vllm/engine/metrics.py +368 -0
  47. vllm/engine/output_processor/__init__.py +0 -0
  48. vllm/engine/output_processor/interfaces.py +76 -0
  49. vllm/engine/output_processor/multi_step.py +142 -0
  50. vllm/engine/output_processor/single_step.py +284 -0
  51. vllm/engine/output_processor/stop_checker.py +101 -0
  52. vllm/engine/output_processor/util.py +19 -0
  53. vllm/entrypoints/__init__.py +0 -0
  54. vllm/entrypoints/api_server.py +119 -0
  55. vllm/entrypoints/llm.py +259 -0
  56. vllm/entrypoints/openai/__init__.py +0 -0
  57. vllm/entrypoints/openai/api_server.py +186 -0
  58. vllm/entrypoints/openai/cli_args.py +115 -0
  59. vllm/entrypoints/openai/protocol.py +460 -0
  60. vllm/entrypoints/openai/serving_chat.py +392 -0
  61. vllm/entrypoints/openai/serving_completion.py +347 -0
  62. vllm/entrypoints/openai/serving_engine.py +234 -0
  63. vllm/envs.py +217 -0
  64. vllm/executor/__init__.py +0 -0
  65. vllm/executor/cpu_executor.py +152 -0
  66. vllm/executor/distributed_gpu_executor.py +115 -0
  67. vllm/executor/executor_base.py +115 -0
  68. vllm/executor/gpu_executor.py +150 -0
  69. vllm/executor/multiproc_worker_utils.py +263 -0
  70. vllm/executor/neuron_executor.py +91 -0
  71. vllm/executor/ray_gpu_executor.py +327 -0
  72. vllm/executor/ray_utils.py +119 -0
  73. vllm/logger.py +153 -0
  74. vllm/logging/__init__.py +5 -0
  75. vllm/logging/formatter.py +15 -0
  76. vllm/lora/__init__.py +0 -0
  77. vllm/lora/fully_sharded_layers.py +262 -0
  78. vllm/lora/layers.py +1181 -0
  79. vllm/lora/lora.py +167 -0
  80. vllm/lora/models.py +645 -0
  81. vllm/lora/punica.py +213 -0
  82. vllm/lora/request.py +32 -0
  83. vllm/lora/utils.py +98 -0
  84. vllm/lora/worker_manager.py +251 -0
  85. vllm/model_executor/__init__.py +7 -0
  86. vllm/model_executor/guided_decoding/__init__.py +25 -0
  87. vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py +70 -0
  88. vllm/model_executor/guided_decoding/outlines_decoding.py +130 -0
  89. vllm/model_executor/guided_decoding/outlines_logits_processors.py +184 -0
  90. vllm/model_executor/layers/__init__.py +0 -0
  91. vllm/model_executor/layers/activation.py +173 -0
  92. vllm/model_executor/layers/fused_moe/__init__.py +7 -0
  93. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  94. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  95. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  96. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  97. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  98. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  99. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  100. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  101. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  102. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  103. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  104. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  105. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json +140 -0
  106. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  107. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  108. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  109. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  110. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json +146 -0
  111. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  112. vllm/model_executor/layers/fused_moe/fused_moe.py +479 -0
  113. vllm/model_executor/layers/layernorm.py +71 -0
  114. vllm/model_executor/layers/linear.py +709 -0
  115. vllm/model_executor/layers/logits_processor.py +115 -0
  116. vllm/model_executor/layers/ops/__init__.py +0 -0
  117. vllm/model_executor/layers/ops/rand.py +157 -0
  118. vllm/model_executor/layers/ops/sample.py +406 -0
  119. vllm/model_executor/layers/quantization/__init__.py +35 -0
  120. vllm/model_executor/layers/quantization/aqlm.py +376 -0
  121. vllm/model_executor/layers/quantization/awq.py +175 -0
  122. vllm/model_executor/layers/quantization/base_config.py +97 -0
  123. vllm/model_executor/layers/quantization/fp8.py +265 -0
  124. vllm/model_executor/layers/quantization/gptq.py +224 -0
  125. vllm/model_executor/layers/quantization/gptq_marlin.py +438 -0
  126. vllm/model_executor/layers/quantization/marlin.py +227 -0
  127. vllm/model_executor/layers/quantization/schema.py +84 -0
  128. vllm/model_executor/layers/quantization/squeezellm.py +137 -0
  129. vllm/model_executor/layers/rejection_sampler.py +405 -0
  130. vllm/model_executor/layers/rotary_embedding.py +525 -0
  131. vllm/model_executor/layers/sampler.py +1051 -0
  132. vllm/model_executor/layers/vocab_parallel_embedding.py +155 -0
  133. vllm/model_executor/model_loader/__init__.py +30 -0
  134. vllm/model_executor/model_loader/loader.py +362 -0
  135. vllm/model_executor/model_loader/neuron.py +136 -0
  136. vllm/model_executor/model_loader/tensorizer.py +368 -0
  137. vllm/model_executor/model_loader/utils.py +41 -0
  138. vllm/model_executor/model_loader/weight_utils.py +372 -0
  139. vllm/model_executor/models/__init__.py +119 -0
  140. vllm/model_executor/models/baichuan.py +410 -0
  141. vllm/model_executor/models/bloom.py +327 -0
  142. vllm/model_executor/models/chatglm.py +386 -0
  143. vllm/model_executor/models/commandr.py +373 -0
  144. vllm/model_executor/models/dbrx.py +413 -0
  145. vllm/model_executor/models/decilm.py +122 -0
  146. vllm/model_executor/models/deepseek.py +438 -0
  147. vllm/model_executor/models/falcon.py +444 -0
  148. vllm/model_executor/models/gemma.py +393 -0
  149. vllm/model_executor/models/gpt2.py +266 -0
  150. vllm/model_executor/models/gpt_bigcode.py +274 -0
  151. vllm/model_executor/models/gpt_j.py +281 -0
  152. vllm/model_executor/models/gpt_neox.py +295 -0
  153. vllm/model_executor/models/internlm2.py +323 -0
  154. vllm/model_executor/models/jais.py +333 -0
  155. vllm/model_executor/models/llama.py +442 -0
  156. vllm/model_executor/models/llava.py +239 -0
  157. vllm/model_executor/models/minicpm.py +531 -0
  158. vllm/model_executor/models/mixtral.py +583 -0
  159. vllm/model_executor/models/mixtral_quant.py +404 -0
  160. vllm/model_executor/models/mpt.py +295 -0
  161. vllm/model_executor/models/olmo.py +356 -0
  162. vllm/model_executor/models/opt.py +349 -0
  163. vllm/model_executor/models/orion.py +319 -0
  164. vllm/model_executor/models/phi.py +300 -0
  165. vllm/model_executor/models/qwen.py +284 -0
  166. vllm/model_executor/models/qwen2.py +367 -0
  167. vllm/model_executor/models/qwen2_moe.py +447 -0
  168. vllm/model_executor/models/stablelm.py +301 -0
  169. vllm/model_executor/models/starcoder2.py +302 -0
  170. vllm/model_executor/models/xverse.py +366 -0
  171. vllm/model_executor/sampling_metadata.py +588 -0
  172. vllm/model_executor/utils.py +35 -0
  173. vllm/outputs.py +150 -0
  174. vllm/py.typed +2 -0
  175. vllm/sampling_params.py +340 -0
  176. vllm/sequence.py +766 -0
  177. vllm/spec_decode/__init__.py +0 -0
  178. vllm/spec_decode/batch_expansion.py +397 -0
  179. vllm/spec_decode/interfaces.py +73 -0
  180. vllm/spec_decode/metrics.py +191 -0
  181. vllm/spec_decode/multi_step_worker.py +203 -0
  182. vllm/spec_decode/ngram_worker.py +176 -0
  183. vllm/spec_decode/spec_decode_worker.py +472 -0
  184. vllm/spec_decode/top1_proposer.py +200 -0
  185. vllm/spec_decode/util.py +228 -0
  186. vllm/test_utils.py +41 -0
  187. vllm/transformers_utils/__init__.py +0 -0
  188. vllm/transformers_utils/config.py +58 -0
  189. vllm/transformers_utils/configs/__init__.py +16 -0
  190. vllm/transformers_utils/configs/chatglm.py +68 -0
  191. vllm/transformers_utils/configs/dbrx.py +278 -0
  192. vllm/transformers_utils/configs/falcon.py +87 -0
  193. vllm/transformers_utils/configs/jais.py +236 -0
  194. vllm/transformers_utils/configs/mpt.py +178 -0
  195. vllm/transformers_utils/detokenizer.py +313 -0
  196. vllm/transformers_utils/tokenizer.py +149 -0
  197. vllm/transformers_utils/tokenizer_group/__init__.py +33 -0
  198. vllm/transformers_utils/tokenizer_group/base_tokenizer_group.py +55 -0
  199. vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py +169 -0
  200. vllm/transformers_utils/tokenizer_group/tokenizer_group.py +78 -0
  201. vllm/transformers_utils/tokenizers/__init__.py +5 -0
  202. vllm/transformers_utils/tokenizers/baichuan.py +255 -0
  203. vllm/usage/__init__.py +0 -0
  204. vllm/usage/usage_lib.py +209 -0
  205. vllm/utils.py +677 -0
  206. vllm/worker/__init__.py +0 -0
  207. vllm/worker/cache_engine.py +105 -0
  208. vllm/worker/cpu_model_runner.py +346 -0
  209. vllm/worker/cpu_worker.py +321 -0
  210. vllm/worker/model_runner.py +1168 -0
  211. vllm/worker/neuron_model_runner.py +196 -0
  212. vllm/worker/neuron_worker.py +98 -0
  213. vllm/worker/worker.py +345 -0
  214. vllm/worker/worker_base.py +146 -0
  215. vllm_npu-0.4.2.dist-info/LICENSE +201 -0
  216. vllm_npu-0.4.2.dist-info/METADATA +173 -0
  217. vllm_npu-0.4.2.dist-info/RECORD +219 -0
  218. vllm_npu-0.4.2.dist-info/WHEEL +5 -0
  219. vllm_npu-0.4.2.dist-info/top_level.txt +1 -0
vllm/transformers_utils/detokenizer.py
@@ -0,0 +1,313 @@
+ from typing import Dict, List, Optional, Tuple, Union
+
+ from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast
+
+ from vllm.sequence import Logprob, SamplingParams, Sequence, SequenceGroup
+ from vllm.transformers_utils.tokenizer_group.base_tokenizer_group import (
+     BaseTokenizerGroup)
+
+ # Used e.g. for marking rejected tokens in spec decoding.
+ INVALID_TOKEN_ID = -1
+
+
+ class Detokenizer:
+     """Provides methods to decode the output of a model into text."""
+
+     def __init__(self, tokenizer_group: BaseTokenizerGroup):
+         self.tokenizer_group = tokenizer_group
+
+     def get_tokenizer_for_seq(self,
+                               sequence: Sequence) -> "PreTrainedTokenizer":
+         """Returns the HF tokenizer to use for a given sequence."""
+         return self.tokenizer_group.get_lora_tokenizer(sequence.lora_request)
+
+     def decode_prompt_logprobs_inplace(
+             self, seq_group: SequenceGroup,
+             prompt_logprobs: List[Optional[Dict[int, Logprob]]]) -> None:
+         """Decodes the logprobs for the prompt of a sequence group.
+
+         Args:
+             seq_group: The sequence group to decode.
+             prompt_logprobs: The logprobs to decode.
+
+         Returns:
+             None; prompt logprobs are updated in place with decoded tokens.
+         """
+         prms = seq_group.sampling_params
+         # We can pick any sequence for the prompt.
+         seq = next(iter(seq_group.seqs_dict.values()))
+         # Only prompt, without the generated token.
+         all_token_ids = seq.get_token_ids()
+         prompt_token_ids = all_token_ids[:-1]
+         tokenizer = self.get_tokenizer_for_seq(seq)
+         prefix_offset = 0
+         read_offset = 0
+         next_iter_prefix_offset = 0
+         next_iter_read_offset = 0
+         next_iter_tokens = []
+         prev_tokens = None
+
+         for token_position, prompt_logprobs_for_token in enumerate(
+                 prompt_logprobs):
+             if not prompt_logprobs_for_token:
+                 continue
+             for token_id, sample_logprob in prompt_logprobs_for_token.items():
+                 if (sample_logprob.decoded_token is None
+                         and token_id != INVALID_TOKEN_ID):
+                     prompt_token_ids_with_token = (
+                         prompt_token_ids[:token_position] + [token_id])
+                     (new_tokens, new_text, new_prefix_offset,
+                      new_read_offset) = detokenize_incrementally(
+                          tokenizer=tokenizer,
+                          all_input_ids=prompt_token_ids_with_token,
+                          prev_tokens=prev_tokens,
+                          prefix_offset=prefix_offset,
+                          read_offset=read_offset,
+                          skip_special_tokens=prms.skip_special_tokens,
+                          spaces_between_special_tokens=prms.
+                          spaces_between_special_tokens,
+                      )
+
+                     sample_logprob.decoded_token = new_text
+
+                     # Use the offsets & prev tokens corresponding to
+                     # real tokens to ensure detokenization is consistent
+                     # with the actual prompt.
+                     if token_id == all_token_ids[token_position]:
+                         next_iter_prefix_offset = new_prefix_offset
+                         next_iter_read_offset = new_read_offset
+                         next_iter_tokens = new_tokens
+
+             # Advance to the next token position.
+             prefix_offset = next_iter_prefix_offset
+             read_offset = next_iter_read_offset
+             if prev_tokens is None:
+                 prev_tokens = next_iter_tokens
+             else:
+                 prev_tokens.extend(next_iter_tokens)
+
+     def decode_sequence_inplace(self, seq: Sequence,
+                                 prms: SamplingParams) -> int:
+         """Decodes the new token for a sequence. In-place operation.
+
+         Args:
+             seq: The sequence to decode.
+             prms: The sampling parameters used to generate the sequence.
+
+         Returns:
+             The number of characters added to the output text.
+         """
+         all_input_ids = seq.get_token_ids()
+         token_id_generated_this_iteration = all_input_ids[-1]
+         tokenizer = self.get_tokenizer_for_seq(seq)
+
+         # Convert prompt token IDs to tokens if necessary.
+         # Do it here so that we don't have to repeat this
+         # computation for each logprob.
+         if seq.tokens is None:
+             (seq.tokens, seq.prefix_offset,
+              seq.read_offset) = convert_prompt_ids_to_tokens(
+                  tokenizer=tokenizer,
+                  prompt_ids=all_input_ids[:-1],
+                  skip_special_tokens=prms.skip_special_tokens,
+              )
+
+         (new_tokens, new_decoded_token_text, prefix_offset,
+          read_offset) = detokenize_incrementally(
+              tokenizer=tokenizer,
+              all_input_ids=all_input_ids,
+              prev_tokens=seq.tokens,
+              prefix_offset=seq.prefix_offset,
+              read_offset=seq.read_offset,
+              skip_special_tokens=prms.skip_special_tokens,
+              spaces_between_special_tokens=prms.spaces_between_special_tokens,
+          )
+
+         # Decode logprobs
+         logprobs = seq.output_logprobs[-1]
+         if logprobs:
+             previous_tokens = all_input_ids[:-1]
+             for token_id, sample_logprob in logprobs.items():
+                 # If the token was generated this iteration,
+                 # use the provided text.
+                 if token_id == token_id_generated_this_iteration:
+                     sample_logprob.decoded_token = new_decoded_token_text
+                     continue
+
+                 if (sample_logprob.decoded_token is None
+                         and token_id != INVALID_TOKEN_ID):
+                     all_input_ids_with_logprob = previous_tokens + [token_id]
+                     (_, new_text, _, _) = detokenize_incrementally(
+                         tokenizer=tokenizer,
+                         all_input_ids=all_input_ids_with_logprob,
+                         prev_tokens=seq.tokens,
+                         prefix_offset=seq.prefix_offset,
+                         read_offset=seq.read_offset,
+                         skip_special_tokens=prms.skip_special_tokens,
+                         spaces_between_special_tokens=prms.
+                         spaces_between_special_tokens,
+                     )
+                     sample_logprob.decoded_token = new_text
+
+         seq.tokens.extend(new_tokens)
+         seq.prefix_offset = prefix_offset
+         seq.read_offset = read_offset
+         seq.output_text += new_decoded_token_text
+
+         return len(new_decoded_token_text)
+
+
+ def _convert_tokens_to_string_with_added_encoders(
+     tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
+     output_tokens: List[str],
+     skip_special_tokens: bool,
+     spaces_between_special_tokens: bool,
+ ) -> str:
+     # Adapted from
+     # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/tokenization_utils.py#L921
+     # NOTE(woosuk): The following code is slow because it runs a for loop over
+     # the output_tokens. In Python, running a for loop over a list can be slow
+     # even when the loop body is very simple.
+     sub_texts: List[str] = []
+     current_sub_text: List[str] = []
+     all_special_tokens = set(tokenizer.all_special_tokens)
+     for token in output_tokens:
+         if skip_special_tokens and token in all_special_tokens:
+             continue
+         if token in tokenizer.get_added_vocab():
+             if current_sub_text:
+                 sub_text = tokenizer.convert_tokens_to_string(current_sub_text)
+                 sub_texts.append(sub_text)
+                 current_sub_text = []
+             sub_texts.append(token)
+         else:
+             current_sub_text.append(token)
+     if current_sub_text:
+         sub_text = tokenizer.convert_tokens_to_string(current_sub_text)
+         sub_texts.append(sub_text)
+     if spaces_between_special_tokens:
+         return " ".join(sub_texts)
+     else:
+         return "".join(sub_texts)
+
+
+ # 5 is an arbitrary value that should work for all
+ # tokenizers (bigger = more conservative).
+ INITIAL_INCREMENTAL_DETOKENIZATION_OFFSET = 5
+
+
+ def convert_prompt_ids_to_tokens(
+     tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
+     prompt_ids: List[int],
+     skip_special_tokens: bool = False,
+ ) -> Tuple[List[str], int, int]:
+     """Converts the prompt ids to tokens and returns the tokens and offsets
+     for incremental detokenization.
+
+     Note that not all tokens are converted to strings. Only the tokens that
+     are necessary for incremental detokenization are converted to strings.
+     """
+     # We do not need to convert the whole prompt to tokens.
+     # Offset a little more in case we have special tokens.
+     new_tokens = tokenizer.convert_ids_to_tokens(
+         prompt_ids[-INITIAL_INCREMENTAL_DETOKENIZATION_OFFSET - 2:],
+         skip_special_tokens=skip_special_tokens)
+     read_offset = len(new_tokens)
+     prefix_offset = max(
+         read_offset - INITIAL_INCREMENTAL_DETOKENIZATION_OFFSET, 0)
+     return new_tokens, prefix_offset, read_offset
+
+
+ # Based on
+ # https://github.com/huggingface/text-generation-inference/blob/v0.9.4/server/text_generation_server/models/model.py#L62C9-L62C15
+ # under Apache 2.0 license
+ def detokenize_incrementally(
+     tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
+     all_input_ids: List[int],
+     prev_tokens: Optional[List[str]],
+     prefix_offset: int,
+     read_offset: int,
+     skip_special_tokens: bool = False,
+     spaces_between_special_tokens: bool = True,
+ ) -> Tuple[List[str], str, int, int]:
+     """Detokenizes the input ids incrementally and returns the new tokens
+     and the new text.
+
+     If `prev_tokens` is None, this function will convert the input ids to
+     tokens and return the tokens and the new text. Otherwise, it will return
+     only the new tokens and the new text.
+
+     This function will also return the new prefix offset and the new read
+     offset to be used in the next iteration.
+
+     The offsets are necessary to defeat cleanup algorithms in the decode which
+     decide to add a space or not depending on the surrounding ids.
+
+     Args:
+         tokenizer: The tokenizer to use.
+         all_input_ids: The input ids. The last id is the new token id.
+         prev_tokens: The previous tokens. If None, this function will convert
+             the input ids to tokens and return the tokens and the new text.
+         prefix_offset: The prefix offset.
+         read_offset: The read offset.
+         skip_special_tokens: Whether to skip special tokens.
+         spaces_between_special_tokens: Whether to add spaces between special
+             tokens.
+     """
+     new_token_id = all_input_ids[-1]
+     # This is the first iteration for this sequence.
+     is_first_iter = prev_tokens is None
+     if is_first_iter:
+         (prev_tokens, prefix_offset,
+          read_offset) = convert_prompt_ids_to_tokens(
+              tokenizer,
+              all_input_ids[:-1],
+              skip_special_tokens=skip_special_tokens)
+     assert prev_tokens is not None
+
+     # If the new token id is out of bounds, return an empty string.
+     if new_token_id >= len(tokenizer):
+         new_tokens = [""]
+     else:
+         # Put new_token_id in a list so skip_special_tokens is respected.
+         new_tokens = tokenizer.convert_ids_to_tokens(
+             [new_token_id], skip_special_tokens=skip_special_tokens)
+         if isinstance(new_tokens, str):
+             new_tokens = [new_tokens]
+     output_tokens = prev_tokens + new_tokens
+
+     # If this is the first iteration, return all tokens.
+     if is_first_iter:
+         new_tokens = output_tokens
+
+     # The prefix text is necessary only to defeat cleanup algorithms in
+     # the decode which decide to add a space or not depending on the
+     # surrounding ids.
+     if tokenizer.is_fast or not tokenizer.get_added_vocab():
+         prefix_text = tokenizer.convert_tokens_to_string(
+             output_tokens[prefix_offset:read_offset])
+         new_text = tokenizer.convert_tokens_to_string(
+             output_tokens[prefix_offset:])
+     else:
+         prefix_text = _convert_tokens_to_string_with_added_encoders(
+             tokenizer,
+             output_tokens[prefix_offset:read_offset],
+             skip_special_tokens=skip_special_tokens,
+             spaces_between_special_tokens=spaces_between_special_tokens,
+         )
+         new_text = _convert_tokens_to_string_with_added_encoders(
+             tokenizer,
+             output_tokens[prefix_offset:],
+             skip_special_tokens=skip_special_tokens,
+             spaces_between_special_tokens=spaces_between_special_tokens,
+         )
+
+     if len(new_text) <= len(prefix_text) or new_text.endswith("�"):
+         # A utf-8 replacement char at the end means a potentially unfinished
+         # byte sequence from byte fallback tokenization.
+         # If it's in the middle, it's probably a real invalid id generated
+         # by the model.
+         return new_tokens, "", prefix_offset, read_offset
+
+     new_text = new_text[len(prefix_text):]
+     return new_tokens, new_text, read_offset, len(output_tokens)
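
The module above implements incremental detokenization: instead of re-decoding the whole sequence on every step, it keeps a small window of previously seen tokens (tracked by prefix_offset and read_offset) and converts only the delta to text. A minimal driving loop is sketched below; it is not part of the wheel, it assumes this hunk is vllm/transformers_utils/detokenizer.py (matching the +313 entry in the file list), and it uses the gpt2 tokenizer purely as an example, with generated_ids standing in for ids produced by a sampler.

from transformers import AutoTokenizer

from vllm.transformers_utils.detokenizer import (convert_prompt_ids_to_tokens,
                                                 detokenize_incrementally)

tok = AutoTokenizer.from_pretrained("gpt2")
prompt_ids = tok.encode("Hello, my name is")
# Stand-in for ids the sampler would produce one step at a time.
generated_ids = tok.encode(" Alice and I live in Paris")

# Seed the incremental state from the prompt only.
prev_tokens, prefix_offset, read_offset = convert_prompt_ids_to_tokens(
    tok, prompt_ids)

all_ids = list(prompt_ids)
text = ""
for new_id in generated_ids:
    all_ids.append(new_id)
    # Decode only the newly appended id; the offsets carry state across steps.
    new_tokens, new_text, prefix_offset, read_offset = detokenize_incrementally(
        tok,
        all_input_ids=all_ids,
        prev_tokens=prev_tokens,
        prefix_offset=prefix_offset,
        read_offset=read_offset)
    prev_tokens.extend(new_tokens)
    text += new_text

print(text)  # roughly " Alice and I live in Paris"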
vllm/transformers_utils/tokenizer.py
@@ -0,0 +1,149 @@
+ import os
+ from typing import Optional, Union
+
+ import huggingface_hub
+ from transformers import (AutoTokenizer, PreTrainedTokenizer,
+                           PreTrainedTokenizerFast)
+
+ from vllm.envs import VLLM_USE_MODELSCOPE
+ from vllm.logger import init_logger
+ from vllm.lora.request import LoRARequest
+ from vllm.transformers_utils.tokenizers import BaichuanTokenizer
+ from vllm.utils import make_async
+
+ logger = init_logger(__name__)
+
+
+ def get_cached_tokenizer(
+     tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast]
+ ) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
+     """Get tokenizer with cached properties.
+
+     This will patch the tokenizer object in place.
+
+     By default, transformers will recompute multiple tokenizer properties
+     each time they are called, leading to a significant slowdown. This
+     function caches these properties for faster access."""
+
+     tokenizer_all_special_ids = set(tokenizer.all_special_ids)
+     tokenizer_all_special_tokens_extended = (
+         tokenizer.all_special_tokens_extended)
+     tokenizer_all_special_tokens = set(tokenizer.all_special_tokens)
+     tokenizer_len = len(tokenizer)
+
+     class CachedTokenizer(tokenizer.__class__):  # type: ignore
+
+         @property
+         def all_special_ids(self):
+             return tokenizer_all_special_ids
+
+         @property
+         def all_special_tokens(self):
+             return tokenizer_all_special_tokens
+
+         @property
+         def all_special_tokens_extended(self):
+             return tokenizer_all_special_tokens_extended
+
+         def __len__(self):
+             return tokenizer_len
+
+     CachedTokenizer.__name__ = f"Cached{tokenizer.__class__.__name__}"
+
+     tokenizer.__class__ = CachedTokenizer
+     return tokenizer
+
+
+ def get_tokenizer(
+     tokenizer_name: str,
+     *args,
+     tokenizer_mode: str = "auto",
+     trust_remote_code: bool = False,
+     revision: Optional[str] = None,
+     download_dir: Optional[str] = None,
+     **kwargs,
+ ) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
+     """Gets a tokenizer for the given model name via HuggingFace or ModelScope.
+     """
+     if VLLM_USE_MODELSCOPE:
+         # Download model from ModelScope hub.
+         # Lazy import so that modelscope is not required for normal use.
+         # pylint: disable=C.
+         from modelscope.hub.snapshot_download import snapshot_download
+
+         # Only set the tokenizer here, model will be downloaded on the workers.
+         if not os.path.exists(tokenizer_name):
+             tokenizer_path = snapshot_download(
+                 model_id=tokenizer_name,
+                 cache_dir=download_dir,
+                 revision=revision,
+                 local_files_only=huggingface_hub.constants.HF_HUB_OFFLINE,
+                 # Ignore weights - we only need the tokenizer.
+                 ignore_file_pattern=[".*.pt", ".*.safetensors", ".*.bin"])
+             tokenizer_name = tokenizer_path
+
+     if tokenizer_mode == "slow":
+         if kwargs.get("use_fast", False):
+             raise ValueError(
+                 "Cannot use the fast tokenizer in slow tokenizer mode.")
+         kwargs["use_fast"] = False
+
+     try:
+         tokenizer = AutoTokenizer.from_pretrained(
+             tokenizer_name,
+             *args,
+             trust_remote_code=trust_remote_code,
+             revision=revision,
+             **kwargs)
+     except ValueError as e:
+         # If the error pertains to the tokenizer class not existing or not
+         # currently being imported, suggest using the --trust-remote-code flag.
+         if (not trust_remote_code and
+             ("does not exist or is not currently imported." in str(e)
+              or "requires you to execute the tokenizer file" in str(e))):
+             err_msg = (
+                 "Failed to load the tokenizer. If the tokenizer is a custom "
+                 "tokenizer not yet available in the HuggingFace transformers "
+                 "library, consider setting `trust_remote_code=True` in LLM "
+                 "or using the `--trust-remote-code` flag in the CLI.")
+             raise RuntimeError(err_msg) from e
+         else:
+             raise e
+     except AttributeError as e:
+         if "BaichuanTokenizer" in str(e):
+             # This is for the error "'BaichuanTokenizer' object has no
+             # attribute 'sp_model'".
+             tokenizer = BaichuanTokenizer.from_pretrained(
+                 tokenizer_name,
+                 *args,
+                 trust_remote_code=trust_remote_code,
+                 revision=revision,
+                 **kwargs)
+         else:
+             raise e
+
+     if not isinstance(tokenizer, PreTrainedTokenizerFast):
+         logger.warning(
+             "Using a slow tokenizer. This might cause a significant "
+             "slowdown. Consider using a fast tokenizer instead.")
+     return get_cached_tokenizer(tokenizer)
+
+
+ def get_lora_tokenizer(lora_request: LoRARequest, *args,
+                        **kwargs) -> Optional[PreTrainedTokenizer]:
+     if lora_request is None:
+         return None
+     try:
+         tokenizer = get_tokenizer(lora_request.lora_local_path, *args,
+                                   **kwargs)
+     except OSError as e:
+         # No tokenizer was found in the LoRA folder,
+         # use base model tokenizer.
+         logger.warning(
+             "No tokenizer found in %s, using base model tokenizer instead. "
+             "(Exception: %s)", lora_request.lora_local_path, e)
+         tokenizer = None
+     return tokenizer
+
+
+ get_lora_tokenizer_async = make_async(get_lora_tokenizer)
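
get_tokenizer wraps AutoTokenizer.from_pretrained with ModelScope support, slow-tokenizer handling, a Baichuan workaround, and property caching. A minimal sketch of typical calls follows; it is not part of the wheel, the model name is only an example, and the import path assumes this hunk is vllm/transformers_utils/tokenizer.py.

from vllm.transformers_utils.tokenizer import get_tokenizer

# Default mode: prefer the fast (Rust) tokenizer; frequently used properties
# such as all_special_ids and len() are cached by get_cached_tokenizer().
tok = get_tokenizer("facebook/opt-125m")
ids = tok.encode("Hello vLLM")

# Slow mode forces use_fast=False; passing use_fast=True together with
# tokenizer_mode="slow" raises a ValueError.
slow_tok = get_tokenizer("facebook/opt-125m", tokenizer_mode="slow")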
vllm/transformers_utils/tokenizer_group/__init__.py
@@ -0,0 +1,33 @@
+ from typing import Optional
+
+ from vllm.config import TokenizerPoolConfig
+ from vllm.executor.ray_utils import ray
+ from vllm.transformers_utils.tokenizer_group.base_tokenizer_group import (
+     BaseTokenizerGroup)
+ from vllm.transformers_utils.tokenizer_group.tokenizer_group import (
+     TokenizerGroup)
+
+ if ray:
+     from vllm.transformers_utils.tokenizer_group.ray_tokenizer_group import (
+         RayTokenizerGroupPool)
+ else:
+     RayTokenizerGroupPool = None  # type: ignore
+
+
+ def get_tokenizer_group(tokenizer_pool_config: Optional[TokenizerPoolConfig],
+                         **init_kwargs) -> BaseTokenizerGroup:
+     if tokenizer_pool_config is None:
+         return TokenizerGroup(**init_kwargs)
+     if tokenizer_pool_config.pool_type == "ray":
+         if RayTokenizerGroupPool is None:
+             raise ImportError(
+                 "RayTokenizerGroupPool is not available. Please install "
+                 "the ray package to use the Ray tokenizer group pool.")
+         return RayTokenizerGroupPool.from_config(tokenizer_pool_config,
+                                                  **init_kwargs)
+     else:
+         raise ValueError(
+             f"Unknown pool type: {tokenizer_pool_config.pool_type}")
+
+
+ __all__ = ["get_tokenizer_group", "BaseTokenizerGroup"]
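
get_tokenizer_group is the factory the engine uses: with no pool config it returns a plain in-process TokenizerGroup, and with a Ray pool config it returns a RayTokenizerGroupPool (raising ImportError when ray is unavailable). A hedged sketch follows; the keyword arguments are assumptions about what TokenizerGroup accepts in vLLM 0.4.2 and are not shown in this diff.

from vllm.transformers_utils.tokenizer_group import get_tokenizer_group

# No pool config -> plain TokenizerGroup; init kwargs are forwarded verbatim.
tokenizer_group = get_tokenizer_group(
    None,                              # tokenizer_pool_config
    tokenizer_id="facebook/opt-125m",  # assumed kwarg name
    enable_lora=False,                 # assumed kwarg name
    max_num_seqs=256,                  # assumed kwarg name
    max_input_length=None,             # assumed kwarg name
)
prompt_ids = tokenizer_group.encode("Hello vLLM")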
vllm/transformers_utils/tokenizer_group/base_tokenizer_group.py
@@ -0,0 +1,55 @@
+ from abc import ABC, abstractmethod
+ from typing import List, Optional
+
+ from transformers import PreTrainedTokenizer
+
+ from vllm.lora.request import LoRARequest
+
+
+ class BaseTokenizerGroup(ABC):
+     """A group of tokenizers that can be used for LoRA adapters."""
+
+     @abstractmethod
+     def ping(self) -> bool:
+         """Check if the tokenizer group is alive."""
+         pass
+
+     @abstractmethod
+     def get_max_input_len(self,
+                           lora_request: Optional[LoRARequest] = None
+                           ) -> Optional[int]:
+         """Get the maximum input length for the LoRA request."""
+         pass
+
+     @abstractmethod
+     def encode(self,
+                prompt: str,
+                request_id: Optional[str] = None,
+                lora_request: Optional[LoRARequest] = None) -> List[int]:
+         """Encode a prompt using the tokenizer group."""
+         pass
+
+     @abstractmethod
+     async def encode_async(
+             self,
+             prompt: str,
+             request_id: Optional[str] = None,
+             lora_request: Optional[LoRARequest] = None) -> List[int]:
+         """Encode a prompt using the tokenizer group."""
+         pass
+
+     @abstractmethod
+     def get_lora_tokenizer(
+             self,
+             lora_request: Optional[LoRARequest] = None
+     ) -> "PreTrainedTokenizer":
+         """Get a tokenizer for a LoRA request."""
+         pass
+
+     @abstractmethod
+     async def get_lora_tokenizer_async(
+             self,
+             lora_request: Optional[LoRARequest] = None
+     ) -> "PreTrainedTokenizer":
+         """Get a tokenizer for a LoRA request."""
+         pass
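
BaseTokenizerGroup defines the contract that TokenizerGroup and RayTokenizerGroupPool implement. As an illustration of that contract only, a minimal single-process implementation could look like the hypothetical class below; it is not part of the wheel and its fallback behavior is an assumption.

from typing import List, Optional

from transformers import PreTrainedTokenizer

from vllm.transformers_utils.tokenizer import get_lora_tokenizer, get_tokenizer
from vllm.transformers_utils.tokenizer_group.base_tokenizer_group import (
    BaseTokenizerGroup)


class InProcessTokenizerGroup(BaseTokenizerGroup):
    """Hypothetical minimal tokenizer group, for illustration only."""

    def __init__(self, tokenizer_id: str, **tokenizer_kwargs):
        self.tokenizer = get_tokenizer(tokenizer_id, **tokenizer_kwargs)

    def ping(self) -> bool:
        return True  # always alive in-process

    def get_max_input_len(self, lora_request=None) -> Optional[int]:
        return None  # no per-request length limit in this sketch

    def encode(self, prompt, request_id=None, lora_request=None) -> List[int]:
        return self.get_lora_tokenizer(lora_request).encode(prompt)

    async def encode_async(self, prompt, request_id=None,
                           lora_request=None) -> List[int]:
        return self.encode(prompt, request_id, lora_request)

    def get_lora_tokenizer(self, lora_request=None) -> PreTrainedTokenizer:
        if lora_request is None:
            return self.tokenizer
        # Fall back to the base tokenizer if the adapter ships no tokenizer.
        return get_lora_tokenizer(lora_request) or self.tokenizer

    async def get_lora_tokenizer_async(
            self, lora_request=None) -> PreTrainedTokenizer:
        return self.get_lora_tokenizer(lora_request)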