vllm_npu-0.4.2-py3-none-any.whl

Files changed (219)
  1. vllm/__init__.py +23 -0
  2. vllm/_custom_ops.py +251 -0
  3. vllm/attention/__init__.py +13 -0
  4. vllm/attention/backends/__init__.py +0 -0
  5. vllm/attention/backends/abstract.py +127 -0
  6. vllm/attention/backends/flash_attn.py +271 -0
  7. vllm/attention/backends/flashinfer.py +220 -0
  8. vllm/attention/backends/rocm_flash_attn.py +374 -0
  9. vllm/attention/backends/torch_sdpa.py +250 -0
  10. vllm/attention/backends/xformers.py +393 -0
  11. vllm/attention/layer.py +56 -0
  12. vllm/attention/ops/__init__.py +0 -0
  13. vllm/attention/ops/paged_attn.py +216 -0
  14. vllm/attention/ops/prefix_prefill.py +792 -0
  15. vllm/attention/ops/triton_flash_attention.py +810 -0
  16. vllm/attention/selector.py +91 -0
  17. vllm/block.py +84 -0
  18. vllm/config.py +1225 -0
  19. vllm/core/__init__.py +0 -0
  20. vllm/core/block/__init__.py +0 -0
  21. vllm/core/block/block_table.py +295 -0
  22. vllm/core/block/common.py +199 -0
  23. vllm/core/block/cpu_gpu_block_allocator.py +228 -0
  24. vllm/core/block/interfaces.py +205 -0
  25. vllm/core/block/naive_block.py +318 -0
  26. vllm/core/block/prefix_caching_block.py +606 -0
  27. vllm/core/block_manager_v1.py +625 -0
  28. vllm/core/block_manager_v2.py +258 -0
  29. vllm/core/evictor_v1.py +105 -0
  30. vllm/core/evictor_v2.py +127 -0
  31. vllm/core/interfaces.py +113 -0
  32. vllm/core/policy.py +45 -0
  33. vllm/core/scheduler.py +1163 -0
  34. vllm/distributed/__init__.py +3 -0
  35. vllm/distributed/communication_op.py +237 -0
  36. vllm/distributed/device_communicators/__init__.py +0 -0
  37. vllm/distributed/device_communicators/custom_all_reduce.py +274 -0
  38. vllm/distributed/device_communicators/pynccl.py +287 -0
  39. vllm/distributed/device_communicators/pynccl_utils.py +66 -0
  40. vllm/distributed/parallel_state.py +339 -0
  41. vllm/distributed/utils.py +136 -0
  42. vllm/engine/__init__.py +0 -0
  43. vllm/engine/arg_utils.py +649 -0
  44. vllm/engine/async_llm_engine.py +737 -0
  45. vllm/engine/llm_engine.py +784 -0
  46. vllm/engine/metrics.py +368 -0
  47. vllm/engine/output_processor/__init__.py +0 -0
  48. vllm/engine/output_processor/interfaces.py +76 -0
  49. vllm/engine/output_processor/multi_step.py +142 -0
  50. vllm/engine/output_processor/single_step.py +284 -0
  51. vllm/engine/output_processor/stop_checker.py +101 -0
  52. vllm/engine/output_processor/util.py +19 -0
  53. vllm/entrypoints/__init__.py +0 -0
  54. vllm/entrypoints/api_server.py +119 -0
  55. vllm/entrypoints/llm.py +259 -0
  56. vllm/entrypoints/openai/__init__.py +0 -0
  57. vllm/entrypoints/openai/api_server.py +186 -0
  58. vllm/entrypoints/openai/cli_args.py +115 -0
  59. vllm/entrypoints/openai/protocol.py +460 -0
  60. vllm/entrypoints/openai/serving_chat.py +392 -0
  61. vllm/entrypoints/openai/serving_completion.py +347 -0
  62. vllm/entrypoints/openai/serving_engine.py +234 -0
  63. vllm/envs.py +217 -0
  64. vllm/executor/__init__.py +0 -0
  65. vllm/executor/cpu_executor.py +152 -0
  66. vllm/executor/distributed_gpu_executor.py +115 -0
  67. vllm/executor/executor_base.py +115 -0
  68. vllm/executor/gpu_executor.py +150 -0
  69. vllm/executor/multiproc_worker_utils.py +263 -0
  70. vllm/executor/neuron_executor.py +91 -0
  71. vllm/executor/ray_gpu_executor.py +327 -0
  72. vllm/executor/ray_utils.py +119 -0
  73. vllm/logger.py +153 -0
  74. vllm/logging/__init__.py +5 -0
  75. vllm/logging/formatter.py +15 -0
  76. vllm/lora/__init__.py +0 -0
  77. vllm/lora/fully_sharded_layers.py +262 -0
  78. vllm/lora/layers.py +1181 -0
  79. vllm/lora/lora.py +167 -0
  80. vllm/lora/models.py +645 -0
  81. vllm/lora/punica.py +213 -0
  82. vllm/lora/request.py +32 -0
  83. vllm/lora/utils.py +98 -0
  84. vllm/lora/worker_manager.py +251 -0
  85. vllm/model_executor/__init__.py +7 -0
  86. vllm/model_executor/guided_decoding/__init__.py +25 -0
  87. vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py +70 -0
  88. vllm/model_executor/guided_decoding/outlines_decoding.py +130 -0
  89. vllm/model_executor/guided_decoding/outlines_logits_processors.py +184 -0
  90. vllm/model_executor/layers/__init__.py +0 -0
  91. vllm/model_executor/layers/activation.py +173 -0
  92. vllm/model_executor/layers/fused_moe/__init__.py +7 -0
  93. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  94. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  95. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  96. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  97. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  98. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  99. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  100. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  101. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  102. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  103. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  104. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  105. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json +140 -0
  106. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  107. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  108. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  109. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  110. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json +146 -0
  111. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  112. vllm/model_executor/layers/fused_moe/fused_moe.py +479 -0
  113. vllm/model_executor/layers/layernorm.py +71 -0
  114. vllm/model_executor/layers/linear.py +709 -0
  115. vllm/model_executor/layers/logits_processor.py +115 -0
  116. vllm/model_executor/layers/ops/__init__.py +0 -0
  117. vllm/model_executor/layers/ops/rand.py +157 -0
  118. vllm/model_executor/layers/ops/sample.py +406 -0
  119. vllm/model_executor/layers/quantization/__init__.py +35 -0
  120. vllm/model_executor/layers/quantization/aqlm.py +376 -0
  121. vllm/model_executor/layers/quantization/awq.py +175 -0
  122. vllm/model_executor/layers/quantization/base_config.py +97 -0
  123. vllm/model_executor/layers/quantization/fp8.py +265 -0
  124. vllm/model_executor/layers/quantization/gptq.py +224 -0
  125. vllm/model_executor/layers/quantization/gptq_marlin.py +438 -0
  126. vllm/model_executor/layers/quantization/marlin.py +227 -0
  127. vllm/model_executor/layers/quantization/schema.py +84 -0
  128. vllm/model_executor/layers/quantization/squeezellm.py +137 -0
  129. vllm/model_executor/layers/rejection_sampler.py +405 -0
  130. vllm/model_executor/layers/rotary_embedding.py +525 -0
  131. vllm/model_executor/layers/sampler.py +1051 -0
  132. vllm/model_executor/layers/vocab_parallel_embedding.py +155 -0
  133. vllm/model_executor/model_loader/__init__.py +30 -0
  134. vllm/model_executor/model_loader/loader.py +362 -0
  135. vllm/model_executor/model_loader/neuron.py +136 -0
  136. vllm/model_executor/model_loader/tensorizer.py +368 -0
  137. vllm/model_executor/model_loader/utils.py +41 -0
  138. vllm/model_executor/model_loader/weight_utils.py +372 -0
  139. vllm/model_executor/models/__init__.py +119 -0
  140. vllm/model_executor/models/baichuan.py +410 -0
  141. vllm/model_executor/models/bloom.py +327 -0
  142. vllm/model_executor/models/chatglm.py +386 -0
  143. vllm/model_executor/models/commandr.py +373 -0
  144. vllm/model_executor/models/dbrx.py +413 -0
  145. vllm/model_executor/models/decilm.py +122 -0
  146. vllm/model_executor/models/deepseek.py +438 -0
  147. vllm/model_executor/models/falcon.py +444 -0
  148. vllm/model_executor/models/gemma.py +393 -0
  149. vllm/model_executor/models/gpt2.py +266 -0
  150. vllm/model_executor/models/gpt_bigcode.py +274 -0
  151. vllm/model_executor/models/gpt_j.py +281 -0
  152. vllm/model_executor/models/gpt_neox.py +295 -0
  153. vllm/model_executor/models/internlm2.py +323 -0
  154. vllm/model_executor/models/jais.py +333 -0
  155. vllm/model_executor/models/llama.py +442 -0
  156. vllm/model_executor/models/llava.py +239 -0
  157. vllm/model_executor/models/minicpm.py +531 -0
  158. vllm/model_executor/models/mixtral.py +583 -0
  159. vllm/model_executor/models/mixtral_quant.py +404 -0
  160. vllm/model_executor/models/mpt.py +295 -0
  161. vllm/model_executor/models/olmo.py +356 -0
  162. vllm/model_executor/models/opt.py +349 -0
  163. vllm/model_executor/models/orion.py +319 -0
  164. vllm/model_executor/models/phi.py +300 -0
  165. vllm/model_executor/models/qwen.py +284 -0
  166. vllm/model_executor/models/qwen2.py +367 -0
  167. vllm/model_executor/models/qwen2_moe.py +447 -0
  168. vllm/model_executor/models/stablelm.py +301 -0
  169. vllm/model_executor/models/starcoder2.py +302 -0
  170. vllm/model_executor/models/xverse.py +366 -0
  171. vllm/model_executor/sampling_metadata.py +588 -0
  172. vllm/model_executor/utils.py +35 -0
  173. vllm/outputs.py +150 -0
  174. vllm/py.typed +2 -0
  175. vllm/sampling_params.py +340 -0
  176. vllm/sequence.py +766 -0
  177. vllm/spec_decode/__init__.py +0 -0
  178. vllm/spec_decode/batch_expansion.py +397 -0
  179. vllm/spec_decode/interfaces.py +73 -0
  180. vllm/spec_decode/metrics.py +191 -0
  181. vllm/spec_decode/multi_step_worker.py +203 -0
  182. vllm/spec_decode/ngram_worker.py +176 -0
  183. vllm/spec_decode/spec_decode_worker.py +472 -0
  184. vllm/spec_decode/top1_proposer.py +200 -0
  185. vllm/spec_decode/util.py +228 -0
  186. vllm/test_utils.py +41 -0
  187. vllm/transformers_utils/__init__.py +0 -0
  188. vllm/transformers_utils/config.py +58 -0
  189. vllm/transformers_utils/configs/__init__.py +16 -0
  190. vllm/transformers_utils/configs/chatglm.py +68 -0
  191. vllm/transformers_utils/configs/dbrx.py +278 -0
  192. vllm/transformers_utils/configs/falcon.py +87 -0
  193. vllm/transformers_utils/configs/jais.py +236 -0
  194. vllm/transformers_utils/configs/mpt.py +178 -0
  195. vllm/transformers_utils/detokenizer.py +313 -0
  196. vllm/transformers_utils/tokenizer.py +149 -0
  197. vllm/transformers_utils/tokenizer_group/__init__.py +33 -0
  198. vllm/transformers_utils/tokenizer_group/base_tokenizer_group.py +55 -0
  199. vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py +169 -0
  200. vllm/transformers_utils/tokenizer_group/tokenizer_group.py +78 -0
  201. vllm/transformers_utils/tokenizers/__init__.py +5 -0
  202. vllm/transformers_utils/tokenizers/baichuan.py +255 -0
  203. vllm/usage/__init__.py +0 -0
  204. vllm/usage/usage_lib.py +209 -0
  205. vllm/utils.py +677 -0
  206. vllm/worker/__init__.py +0 -0
  207. vllm/worker/cache_engine.py +105 -0
  208. vllm/worker/cpu_model_runner.py +346 -0
  209. vllm/worker/cpu_worker.py +321 -0
  210. vllm/worker/model_runner.py +1168 -0
  211. vllm/worker/neuron_model_runner.py +196 -0
  212. vllm/worker/neuron_worker.py +98 -0
  213. vllm/worker/worker.py +345 -0
  214. vllm/worker/worker_base.py +146 -0
  215. vllm_npu-0.4.2.dist-info/LICENSE +201 -0
  216. vllm_npu-0.4.2.dist-info/METADATA +173 -0
  217. vllm_npu-0.4.2.dist-info/RECORD +219 -0
  218. vllm_npu-0.4.2.dist-info/WHEEL +5 -0
  219. vllm_npu-0.4.2.dist-info/top_level.txt +1 -0
vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py ADDED
@@ -0,0 +1,169 @@
+ import asyncio
+ import os
+ from typing import List, Optional
+
+ from ray.util.scheduling_strategies import NodeAffinitySchedulingStrategy
+ from transformers import PreTrainedTokenizer
+
+ from vllm.config import TokenizerPoolConfig
+ from vllm.executor.ray_utils import ray
+ from vllm.lora.request import LoRARequest
+ from vllm.transformers_utils.tokenizer_group.base_tokenizer_group import (
+     BaseTokenizerGroup)
+ from vllm.transformers_utils.tokenizer_group.tokenizer_group import (
+     TokenizerGroup)
+
+
+ class RayTokenizerGroupPool(BaseTokenizerGroup):
+     """A Ray-based pool of TokenizerGroups for async tokenization."""
+
+     # Class to use for workers making up the pool.
+     _worker_cls = TokenizerGroup
+
+     @classmethod
+     def from_config(cls, tokenizer_pool_config: TokenizerPoolConfig,
+                     **init_kwargs) -> "RayTokenizerGroupPool":
+         ray_actor_options = (tokenizer_pool_config.extra_config or {
+             "num_cpus": 0
+         })
+         ray_actor_options.setdefault(
+             "scheduling_strategy",
+             NodeAffinitySchedulingStrategy(
+                 node_id=ray.get_runtime_context().get_node_id(), soft=True))
+
+         # Carry over the env vars to the actors.
+         # This is necessary for API keys and such.
+         ray_actor_options.setdefault("runtime_env", {})
+         _carry_over_env_vars_to_runtime_env(ray_actor_options["runtime_env"])
+
+         init_kwargs["num_actors"] = tokenizer_pool_config.pool_size
+         init_kwargs["ray_actor_options"] = ray_actor_options
+
+         return cls(**init_kwargs)
+
+     def __init__(self, tokenizer_id: str, enable_lora: bool, max_num_seqs: int,
+                  max_input_length: Optional[int], num_actors: int,
+                  ray_actor_options: dict, **tokenizer_config):
+         # Store a local copy of the TokenizerGroup for quick access
+         # to underlying HF tokenizers.
+         self._local_tokenizer_group = self._worker_cls(
+             tokenizer_id=tokenizer_id,
+             enable_lora=enable_lora,
+             max_num_seqs=max_num_seqs,
+             max_input_length=max_input_length,
+             **tokenizer_config,
+         )
+
+         ray_tokenizer_group_cls = ray.remote(
+             self._worker_cls).options(**ray_actor_options)
+         self.tokenizer_actors = [
+             ray_tokenizer_group_cls.remote(tokenizer_id, enable_lora,
+                                            max_num_seqs, max_input_length,
+                                            **tokenizer_config)
+             for _ in range(num_actors)
+         ]
+         self._idle_actors: Optional[asyncio.Queue] = None
+
+     @property
+     def pool_size(self) -> int:
+         return len(self.tokenizer_actors)
+
+     def ping(self):
+         return ray.get(
+             [actor.ping.remote() for actor in self.tokenizer_actors])
+
+     def _ensure_queue_initialized(self):
+         if self._idle_actors is None:
+             self._idle_actors = asyncio.Queue()
+             for actor in self.tokenizer_actors:
+                 self._idle_actors.put_nowait(actor)
+
+     def encode(self,
+                prompt: str,
+                request_id: Optional[str] = None,
+                lora_request: Optional[LoRARequest] = None) -> List[int]:
+         """Encode a prompt using the tokenizer group.
+
+         We pick an idle actor and use it to encode the prompt.
+         The actor is then put back in the queue for future use.
+         This is blocking.
+         """
+         self._ensure_queue_initialized()
+         assert self._idle_actors is not None
+
+         if self._idle_actors.empty():
+             raise RuntimeError("No idle actors available.")
+         actor = self._idle_actors.get_nowait()
+         try:
+             ret = ray.get(
+                 actor.encode.remote(request_id=request_id,
+                                     prompt=prompt,
+                                     lora_request=lora_request))
+         finally:
+             # Put the actor back in the queue.
+             # This is done in a finally block to ensure that the actor is
+             # always put back in the queue, even if an exception/cancellation
+             # is raised.
+             self._idle_actors.put_nowait(actor)
+         return ret
+
+     async def encode_async(
+             self,
+             prompt: str,
+             request_id: Optional[str] = None,
+             lora_request: Optional[LoRARequest] = None) -> List[int]:
+         """Encode a prompt using the tokenizer group.
+
+         We pick an idle actor and use it to encode the prompt.
+         If there are no idle actors, we wait until one becomes
+         available.
+         The actor is then put back in the queue for future use.
+         This is non-blocking.
+         """
+         self._ensure_queue_initialized()
+         assert self._idle_actors is not None
+
+         actor = await self._idle_actors.get()
+         try:
+             ret = await actor.encode.remote(request_id=request_id,
+                                             prompt=prompt,
+                                             lora_request=lora_request)
+         finally:
+             # Put the actor back in the queue.
+             # This is done in a finally block to ensure that the actor is
+             # always put back in the queue, even if an exception/cancellation
+             # is raised.
+             self._idle_actors.put_nowait(actor)
+         return ret
+
+     def get_max_input_len(self,
+                           lora_request: Optional[LoRARequest] = None
+                           ) -> Optional[int]:
+         """Get the maximum input length for the LoRA request."""
+         return self._local_tokenizer_group.get_max_input_len(lora_request)
+
+     def get_lora_tokenizer(
+             self,
+             lora_request: Optional[LoRARequest] = None
+     ) -> "PreTrainedTokenizer":
+         return self._local_tokenizer_group.get_lora_tokenizer(lora_request)
+
+     async def get_lora_tokenizer_async(
+             self,
+             lora_request: Optional[LoRARequest] = None
+     ) -> "PreTrainedTokenizer":
+         return await self._local_tokenizer_group.get_lora_tokenizer_async(
+             lora_request)
+
+
+ def _carry_over_env_vars_to_runtime_env(runtime_env: dict) -> None:
+     """Copy over all current process environment variables to the runtime_env.
+
+     The variables in runtime_env will take precedence over the current process
+     environment variables.
+
+     runtime_env will be modified in place."""
+     env_vars = os.environ.copy()
+     runtime_env.setdefault("env_vars", {})
+     env_vars.update(runtime_env["env_vars"])
+     runtime_env["env_vars"] = env_vars
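
A minimal usage sketch of the pool above (illustrative only, not shipped in the wheel): the constructor arguments follow RayTokenizerGroupPool.__init__ as added in this diff, but the model id, actor count, and Ray options are placeholders, and a local Ray runtime is assumed to be acceptable.

    import ray

    from vllm.transformers_utils.tokenizer_group.ray_tokenizer_group import (
        RayTokenizerGroupPool)

    ray.init()  # assumption: a local Ray runtime is fine for this sketch

    # Arguments mirror RayTokenizerGroupPool.__init__ above; the model id
    # and sizes are placeholders, not values from this package.
    pool = RayTokenizerGroupPool(
        tokenizer_id="facebook/opt-125m",
        enable_lora=False,
        max_num_seqs=256,
        max_input_length=None,
        num_actors=2,
        ray_actor_options={"num_cpus": 0},
    )

    print(pool.pool_size)  # 2 actors, each wrapping a TokenizerGroup

    # Blocking path: takes an idle actor or raises RuntimeError if none is free.
    ids = pool.encode(prompt="Hello, world!")

    # Inside an already-running event loop, `await pool.encode_async(...)`
    # instead waits until an actor becomes idle.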
vllm/transformers_utils/tokenizer_group/tokenizer_group.py ADDED
@@ -0,0 +1,78 @@
+ from typing import List, Optional
+
+ from transformers import PreTrainedTokenizer
+
+ from vllm.lora.request import LoRARequest
+ from vllm.transformers_utils.tokenizer import (get_lora_tokenizer,
+                                                get_lora_tokenizer_async,
+                                                get_tokenizer)
+ from vllm.transformers_utils.tokenizer_group.base_tokenizer_group import (
+     BaseTokenizerGroup)
+ from vllm.utils import LRUCache
+
+
+ class TokenizerGroup(BaseTokenizerGroup):
+     """A group of tokenizers that can be used for LoRA adapters."""
+
+     def __init__(self, tokenizer_id: str, enable_lora: bool, max_num_seqs: int,
+                  max_input_length: Optional[int], **tokenizer_config):
+         self.tokenizer_id = tokenizer_id
+         self.tokenizer_config = tokenizer_config
+         self.enable_lora = enable_lora
+         self.max_input_length = max_input_length
+         self.tokenizer = get_tokenizer(self.tokenizer_id, **tokenizer_config)
+         self.lora_tokenizers = LRUCache[PreTrainedTokenizer](
+             capacity=max_num_seqs) if enable_lora else None
+
+     def ping(self) -> bool:
+         """Check if the tokenizer group is alive."""
+         return True
+
+     def get_max_input_len(self,
+                           lora_request: Optional[LoRARequest] = None
+                           ) -> Optional[int]:
+         """Get the maximum input length for the LoRA request."""
+         return self.max_input_length
+
+     def encode(self,
+                prompt: str,
+                request_id: Optional[str] = None,
+                lora_request: Optional[LoRARequest] = None) -> List[int]:
+         tokenizer = self.get_lora_tokenizer(lora_request)
+         return tokenizer.encode(prompt)
+
+     async def encode_async(
+             self,
+             prompt: str,
+             request_id: Optional[str] = None,
+             lora_request: Optional[LoRARequest] = None) -> List[int]:
+         tokenizer = await self.get_lora_tokenizer_async(lora_request)
+         return tokenizer.encode(prompt)
+
+     def get_lora_tokenizer(
+             self,
+             lora_request: Optional[LoRARequest] = None
+     ) -> "PreTrainedTokenizer":
+         if not lora_request or not self.enable_lora:
+             return self.tokenizer
+         if lora_request.lora_int_id not in self.lora_tokenizers:
+             tokenizer = (get_lora_tokenizer(
+                 lora_request, **self.tokenizer_config) or self.tokenizer)
+             self.lora_tokenizers.put(lora_request.lora_int_id, tokenizer)
+             return tokenizer
+         else:
+             return self.lora_tokenizers.get(lora_request.lora_int_id)
+
+     async def get_lora_tokenizer_async(
+             self,
+             lora_request: Optional[LoRARequest] = None
+     ) -> "PreTrainedTokenizer":
+         if not lora_request or not self.enable_lora:
+             return self.tokenizer
+         if lora_request.lora_int_id not in self.lora_tokenizers:
+             tokenizer = (await get_lora_tokenizer_async(
+                 lora_request, **self.tokenizer_config) or self.tokenizer)
+             self.lora_tokenizers.put(lora_request.lora_int_id, tokenizer)
+             return tokenizer
+         else:
+             return self.lora_tokenizers.get(lora_request.lora_int_id)
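
For comparison, a short sketch of driving the single-process TokenizerGroup above directly (the model id is a placeholder; get_tokenizer resolves it like any HF tokenizer name or local path):

    from vllm.transformers_utils.tokenizer_group.tokenizer_group import (
        TokenizerGroup)

    group = TokenizerGroup(
        tokenizer_id="facebook/opt-125m",  # placeholder model id
        enable_lora=False,
        max_num_seqs=256,  # also sizes the per-LoRA tokenizer LRU cache
        max_input_length=None,
    )
    assert group.ping()
    ids = group.encode(prompt="Hello, world!")
    # With enable_lora=True, passing lora_request=... resolves a per-adapter
    # tokenizer via get_lora_tokenizer and caches it in the LRUCache.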
vllm/transformers_utils/tokenizers/__init__.py ADDED
@@ -0,0 +1,5 @@
+ from vllm.transformers_utils.tokenizers.baichuan import BaichuanTokenizer
+
+ __all__ = [
+     "BaichuanTokenizer",
+ ]
vllm/transformers_utils/tokenizers/baichuan.py ADDED
@@ -0,0 +1,255 @@
+ # Adapted from
+ # https://huggingface.co/baichuan-inc/Baichuan2-13B-Chat/blob/8f6e343d545c503b91429582231d1d354dac2740/tokenization_baichuan.py
+ # This includes a fix suggested in
+ # https://github.com/vllm-project/vllm/issues/1403#issuecomment-1767503058
+ # Copyright (c) 2023, Baichuan Intelligent Technology. All rights reserved.
+
+ import os
+ from shutil import copyfile
+ from typing import Any, Dict, List, Optional, Tuple
+
+ import sentencepiece as spm
+ from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer
+ from transformers.utils import logging
+
+ logger = logging.get_logger(__name__)
+
+ VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model"}
+
+ PRETRAINED_VOCAB_FILES_MAP = {  # type: ignore
+     "vocab_file": {},
+     "tokenizer_file": {},
+ }
+ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {}  # type: ignore
+
+
+ class BaichuanTokenizer(PreTrainedTokenizer):
+     """
+     Construct a Baichuan tokenizer. Based on byte-level Byte-Pair-Encoding.
+
+     Args:
+         vocab_file (`str`):
+             Path to the vocabulary file.
+     """
+
+     vocab_files_names = VOCAB_FILES_NAMES
+     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+     model_input_names = ["input_ids", "attention_mask"]
+
+     def __init__(
+         self,
+         vocab_file,
+         unk_token="<unk>",
+         bos_token="<s>",
+         eos_token="</s>",
+         pad_token=None,
+         sp_model_kwargs: Optional[Dict[str, Any]] = None,
+         add_bos_token=True,
+         add_eos_token=False,
+         clean_up_tokenization_spaces=False,
+         **kwargs,
+     ):
+         self.sp_model_kwargs = ({} if sp_model_kwargs is None else
+                                 sp_model_kwargs)
+         bos_token = (AddedToken(bos_token, lstrip=False, rstrip=False)
+                      if isinstance(bos_token, str) else bos_token)
+         eos_token = (AddedToken(eos_token, lstrip=False, rstrip=False)
+                      if isinstance(eos_token, str) else eos_token)
+         unk_token = (AddedToken(unk_token, lstrip=False, rstrip=False)
+                      if isinstance(unk_token, str) else unk_token)
+         pad_token = (AddedToken(pad_token, lstrip=False, rstrip=False)
+                      if isinstance(pad_token, str) else pad_token)
+         self.vocab_file = vocab_file
+         self.add_bos_token = add_bos_token
+         self.add_eos_token = add_eos_token
+         self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
+         self.sp_model.Load(vocab_file)
+         super().__init__(
+             bos_token=bos_token,
+             eos_token=eos_token,
+             unk_token=unk_token,
+             pad_token=pad_token,
+             add_bos_token=add_bos_token,
+             add_eos_token=add_eos_token,
+             sp_model_kwargs=self.sp_model_kwargs,
+             clean_up_tokenization_spaces=clean_up_tokenization_spaces,
+             **kwargs,
+         )
+
+     def __getstate__(self):
+         state = self.__dict__.copy()
+         state["sp_model"] = None
+         return state
+
+     def __setstate__(self, d):
+         self.__dict__ = d
+         self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
+         self.sp_model.Load(self.vocab_file)
+
+     @property
+     def vocab_size(self):
+         """Returns vocab size"""
+         return self.sp_model.get_piece_size()
+
+     def get_vocab(self):
+         """Returns vocab as a dict"""
+         vocab = {
+             self.convert_ids_to_tokens(i): i
+             for i in range(self.vocab_size)
+         }
+         vocab.update(self.added_tokens_encoder)
+         return vocab
+
+     def _tokenize(self, text):
+         """Returns a tokenized string."""
+         return self.sp_model.encode(text, out_type=str)
+
+     def _convert_token_to_id(self, token):
+         """Converts a token (str) in an id using the vocab."""
+         return self.sp_model.piece_to_id(token)
+
+     def _convert_id_to_token(self, index):
+         """Converts an index (integer) in a token (str) using the vocab."""
+         token = self.sp_model.IdToPiece(index)
+         return token
+
+     def convert_tokens_to_string(self, tokens: List[str]):
+         """Converts a sequence of tokens (string) in a single string."""
+         current_sub_tokens: List[str] = []
+         out_string = ""
+         prev_is_special = False
+         for i, token in enumerate(tokens):
+             # make sure that special tokens are not decoded using
+             # sentencepiece model
+             if token in self.all_special_tokens:
+                 if not prev_is_special and i != 0:
+                     out_string += " "
+                 out_string += self.sp_model.decode(current_sub_tokens) + token
+                 prev_is_special = True
+                 current_sub_tokens = []
+             else:
+                 current_sub_tokens.append(token)
+                 prev_is_special = False
+         out_string += self.sp_model.decode(current_sub_tokens)
+         return out_string
+
+     def save_vocabulary(self,
+                         save_directory,
+                         filename_prefix: Optional[str] = None) -> Tuple[str]:
+         """
+         Save the vocabulary and special tokens file to a directory.
+
+         Args:
+             save_directory (`str`):
+                 The directory in which to save the vocabulary.
+
+         Returns:
+             `Tuple(str)`: Paths to the files saved.
+         """
+         if not os.path.isdir(save_directory):
+             raise ValueError(f"Vocabulary path ({save_directory}) "
+                              "should be a directory")
+
+         out_vocab_file = os.path.join(
+             save_directory,
+             (filename_prefix + "-" if filename_prefix else "") +
+             VOCAB_FILES_NAMES["vocab_file"],
+         )
+
+         if os.path.abspath(self.vocab_file) != os.path.abspath(
+                 out_vocab_file) and os.path.isfile(self.vocab_file):
+             copyfile(self.vocab_file, out_vocab_file)
+         elif not os.path.isfile(self.vocab_file):
+             with open(out_vocab_file, "wb") as fi:
+                 content_spiece_model = self.sp_model.serialized_model_proto()
+                 fi.write(content_spiece_model)
+
+         return (out_vocab_file, )
+
+     def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
+         bos_token_id = [self.bos_token_id] if self.add_bos_token else []
+         eos_token_id = [self.eos_token_id] if self.add_eos_token else []
+
+         output = bos_token_id + token_ids_0 + eos_token_id
+
+         if token_ids_1 is not None:
+             output = output + bos_token_id + token_ids_1 + eos_token_id
+
+         return output
+
+     def get_special_tokens_mask(
+         self,
+         token_ids_0: List[int],
+         token_ids_1: Optional[List[int]] = None,
+         already_has_special_tokens: bool = False,
+     ) -> List[int]:
+         """
+         Retrieve sequence ids from a token list that has no special tokens
+         added. This method is called when adding
+         special tokens using the tokenizer `prepare_for_model` method.
+
+         Args:
+             token_ids_0 (`List[int]`):
+                 List of IDs.
+             token_ids_1 (`List[int]`, *optional*):
+                 Optional second list of IDs for sequence pairs.
+             already_has_special_tokens (`bool`, *optional*, defaults to
+                 `False`):
+                 Whether or not the token list is already formatted with
+                 special tokens for the model.
+
+         Returns:
+             `List[int]`: A list of integers in the range [0, 1]:
+             1 for a special token, 0 for a sequence token.
+         """
+         if already_has_special_tokens:
+             return super().get_special_tokens_mask(
+                 token_ids_0=token_ids_0,
+                 token_ids_1=token_ids_1,
+                 already_has_special_tokens=True,
+             )
+
+         bos_token_id = [1] if self.add_bos_token else []
+         eos_token_id = [1] if self.add_eos_token else []
+
+         if token_ids_1 is None:
+             return bos_token_id + ([0] * len(token_ids_0)) + eos_token_id
+         return (bos_token_id + ([0] * len(token_ids_0)) + eos_token_id +
+                 bos_token_id + ([0] * len(token_ids_1)) + eos_token_id)
+
+     def create_token_type_ids_from_sequences(
+             self,
+             token_ids_0: List[int],
+             token_ids_1: Optional[List[int]] = None) -> List[int]:
+         """
+         Creates a mask from the two sequences passed to be used in a
+         sequence-pair classification task. An ALBERT
+         sequence pair mask has the following format:
+
+         ```
+         0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
+         | first sequence    | second sequence |
+         ```
+
+         if token_ids_1 is None, only returns the first portion of the mask (0s).
+
+         Args:
+             token_ids_0 (`List[int]`):
+                 List of ids.
+             token_ids_1 (`List[int]`, *optional*):
+                 Optional second list of IDs for sequence pairs.
+
+         Returns:
+             `List[int]`: List of [token type IDs](../glossary#token-type-ids)
+             according to the given sequence(s).
+         """
+         bos_token_id = [self.bos_token_id] if self.add_bos_token else []
+         eos_token_id = [self.eos_token_id] if self.add_eos_token else []
+
+         output = [0] * len(bos_token_id + token_ids_0 + eos_token_id)
+
+         if token_ids_1 is not None:
+             output += [1] * len(bos_token_id + token_ids_1 + eos_token_id)
+
+         return output
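
A hedged sketch of using the tokenizer above with a local SentencePiece model file ("path/to/tokenizer.model" is a placeholder; only the standard PreTrainedTokenizer API is exercised):

    from vllm.transformers_utils.tokenizers import BaichuanTokenizer

    tok = BaichuanTokenizer(vocab_file="path/to/tokenizer.model")
    ids = tok.encode("hello")  # BOS is prepended: add_bos_token defaults to True
    text = tok.decode(ids)
    # Mask special tokens in an already-formatted sequence.
    mask = tok.get_special_tokens_mask(ids, already_has_special_tokens=True)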
vllm/usage/__init__.py ADDED
File without changes