vllm_npu-0.4.2-py3-none-any.whl

Files changed (219)
  1. vllm/__init__.py +23 -0
  2. vllm/_custom_ops.py +251 -0
  3. vllm/attention/__init__.py +13 -0
  4. vllm/attention/backends/__init__.py +0 -0
  5. vllm/attention/backends/abstract.py +127 -0
  6. vllm/attention/backends/flash_attn.py +271 -0
  7. vllm/attention/backends/flashinfer.py +220 -0
  8. vllm/attention/backends/rocm_flash_attn.py +374 -0
  9. vllm/attention/backends/torch_sdpa.py +250 -0
  10. vllm/attention/backends/xformers.py +393 -0
  11. vllm/attention/layer.py +56 -0
  12. vllm/attention/ops/__init__.py +0 -0
  13. vllm/attention/ops/paged_attn.py +216 -0
  14. vllm/attention/ops/prefix_prefill.py +792 -0
  15. vllm/attention/ops/triton_flash_attention.py +810 -0
  16. vllm/attention/selector.py +91 -0
  17. vllm/block.py +84 -0
  18. vllm/config.py +1225 -0
  19. vllm/core/__init__.py +0 -0
  20. vllm/core/block/__init__.py +0 -0
  21. vllm/core/block/block_table.py +295 -0
  22. vllm/core/block/common.py +199 -0
  23. vllm/core/block/cpu_gpu_block_allocator.py +228 -0
  24. vllm/core/block/interfaces.py +205 -0
  25. vllm/core/block/naive_block.py +318 -0
  26. vllm/core/block/prefix_caching_block.py +606 -0
  27. vllm/core/block_manager_v1.py +625 -0
  28. vllm/core/block_manager_v2.py +258 -0
  29. vllm/core/evictor_v1.py +105 -0
  30. vllm/core/evictor_v2.py +127 -0
  31. vllm/core/interfaces.py +113 -0
  32. vllm/core/policy.py +45 -0
  33. vllm/core/scheduler.py +1163 -0
  34. vllm/distributed/__init__.py +3 -0
  35. vllm/distributed/communication_op.py +237 -0
  36. vllm/distributed/device_communicators/__init__.py +0 -0
  37. vllm/distributed/device_communicators/custom_all_reduce.py +274 -0
  38. vllm/distributed/device_communicators/pynccl.py +287 -0
  39. vllm/distributed/device_communicators/pynccl_utils.py +66 -0
  40. vllm/distributed/parallel_state.py +339 -0
  41. vllm/distributed/utils.py +136 -0
  42. vllm/engine/__init__.py +0 -0
  43. vllm/engine/arg_utils.py +649 -0
  44. vllm/engine/async_llm_engine.py +737 -0
  45. vllm/engine/llm_engine.py +784 -0
  46. vllm/engine/metrics.py +368 -0
  47. vllm/engine/output_processor/__init__.py +0 -0
  48. vllm/engine/output_processor/interfaces.py +76 -0
  49. vllm/engine/output_processor/multi_step.py +142 -0
  50. vllm/engine/output_processor/single_step.py +284 -0
  51. vllm/engine/output_processor/stop_checker.py +101 -0
  52. vllm/engine/output_processor/util.py +19 -0
  53. vllm/entrypoints/__init__.py +0 -0
  54. vllm/entrypoints/api_server.py +119 -0
  55. vllm/entrypoints/llm.py +259 -0
  56. vllm/entrypoints/openai/__init__.py +0 -0
  57. vllm/entrypoints/openai/api_server.py +186 -0
  58. vllm/entrypoints/openai/cli_args.py +115 -0
  59. vllm/entrypoints/openai/protocol.py +460 -0
  60. vllm/entrypoints/openai/serving_chat.py +392 -0
  61. vllm/entrypoints/openai/serving_completion.py +347 -0
  62. vllm/entrypoints/openai/serving_engine.py +234 -0
  63. vllm/envs.py +217 -0
  64. vllm/executor/__init__.py +0 -0
  65. vllm/executor/cpu_executor.py +152 -0
  66. vllm/executor/distributed_gpu_executor.py +115 -0
  67. vllm/executor/executor_base.py +115 -0
  68. vllm/executor/gpu_executor.py +150 -0
  69. vllm/executor/multiproc_worker_utils.py +263 -0
  70. vllm/executor/neuron_executor.py +91 -0
  71. vllm/executor/ray_gpu_executor.py +327 -0
  72. vllm/executor/ray_utils.py +119 -0
  73. vllm/logger.py +153 -0
  74. vllm/logging/__init__.py +5 -0
  75. vllm/logging/formatter.py +15 -0
  76. vllm/lora/__init__.py +0 -0
  77. vllm/lora/fully_sharded_layers.py +262 -0
  78. vllm/lora/layers.py +1181 -0
  79. vllm/lora/lora.py +167 -0
  80. vllm/lora/models.py +645 -0
  81. vllm/lora/punica.py +213 -0
  82. vllm/lora/request.py +32 -0
  83. vllm/lora/utils.py +98 -0
  84. vllm/lora/worker_manager.py +251 -0
  85. vllm/model_executor/__init__.py +7 -0
  86. vllm/model_executor/guided_decoding/__init__.py +25 -0
  87. vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py +70 -0
  88. vllm/model_executor/guided_decoding/outlines_decoding.py +130 -0
  89. vllm/model_executor/guided_decoding/outlines_logits_processors.py +184 -0
  90. vllm/model_executor/layers/__init__.py +0 -0
  91. vllm/model_executor/layers/activation.py +173 -0
  92. vllm/model_executor/layers/fused_moe/__init__.py +7 -0
  93. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  94. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  95. vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  96. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  97. vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  98. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  99. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  100. vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  101. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  102. vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  103. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
  104. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  105. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json +140 -0
  106. vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  107. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  108. vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  109. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  110. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json +146 -0
  111. vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  112. vllm/model_executor/layers/fused_moe/fused_moe.py +479 -0
  113. vllm/model_executor/layers/layernorm.py +71 -0
  114. vllm/model_executor/layers/linear.py +709 -0
  115. vllm/model_executor/layers/logits_processor.py +115 -0
  116. vllm/model_executor/layers/ops/__init__.py +0 -0
  117. vllm/model_executor/layers/ops/rand.py +157 -0
  118. vllm/model_executor/layers/ops/sample.py +406 -0
  119. vllm/model_executor/layers/quantization/__init__.py +35 -0
  120. vllm/model_executor/layers/quantization/aqlm.py +376 -0
  121. vllm/model_executor/layers/quantization/awq.py +175 -0
  122. vllm/model_executor/layers/quantization/base_config.py +97 -0
  123. vllm/model_executor/layers/quantization/fp8.py +265 -0
  124. vllm/model_executor/layers/quantization/gptq.py +224 -0
  125. vllm/model_executor/layers/quantization/gptq_marlin.py +438 -0
  126. vllm/model_executor/layers/quantization/marlin.py +227 -0
  127. vllm/model_executor/layers/quantization/schema.py +84 -0
  128. vllm/model_executor/layers/quantization/squeezellm.py +137 -0
  129. vllm/model_executor/layers/rejection_sampler.py +405 -0
  130. vllm/model_executor/layers/rotary_embedding.py +525 -0
  131. vllm/model_executor/layers/sampler.py +1051 -0
  132. vllm/model_executor/layers/vocab_parallel_embedding.py +155 -0
  133. vllm/model_executor/model_loader/__init__.py +30 -0
  134. vllm/model_executor/model_loader/loader.py +362 -0
  135. vllm/model_executor/model_loader/neuron.py +136 -0
  136. vllm/model_executor/model_loader/tensorizer.py +368 -0
  137. vllm/model_executor/model_loader/utils.py +41 -0
  138. vllm/model_executor/model_loader/weight_utils.py +372 -0
  139. vllm/model_executor/models/__init__.py +119 -0
  140. vllm/model_executor/models/baichuan.py +410 -0
  141. vllm/model_executor/models/bloom.py +327 -0
  142. vllm/model_executor/models/chatglm.py +386 -0
  143. vllm/model_executor/models/commandr.py +373 -0
  144. vllm/model_executor/models/dbrx.py +413 -0
  145. vllm/model_executor/models/decilm.py +122 -0
  146. vllm/model_executor/models/deepseek.py +438 -0
  147. vllm/model_executor/models/falcon.py +444 -0
  148. vllm/model_executor/models/gemma.py +393 -0
  149. vllm/model_executor/models/gpt2.py +266 -0
  150. vllm/model_executor/models/gpt_bigcode.py +274 -0
  151. vllm/model_executor/models/gpt_j.py +281 -0
  152. vllm/model_executor/models/gpt_neox.py +295 -0
  153. vllm/model_executor/models/internlm2.py +323 -0
  154. vllm/model_executor/models/jais.py +333 -0
  155. vllm/model_executor/models/llama.py +442 -0
  156. vllm/model_executor/models/llava.py +239 -0
  157. vllm/model_executor/models/minicpm.py +531 -0
  158. vllm/model_executor/models/mixtral.py +583 -0
  159. vllm/model_executor/models/mixtral_quant.py +404 -0
  160. vllm/model_executor/models/mpt.py +295 -0
  161. vllm/model_executor/models/olmo.py +356 -0
  162. vllm/model_executor/models/opt.py +349 -0
  163. vllm/model_executor/models/orion.py +319 -0
  164. vllm/model_executor/models/phi.py +300 -0
  165. vllm/model_executor/models/qwen.py +284 -0
  166. vllm/model_executor/models/qwen2.py +367 -0
  167. vllm/model_executor/models/qwen2_moe.py +447 -0
  168. vllm/model_executor/models/stablelm.py +301 -0
  169. vllm/model_executor/models/starcoder2.py +302 -0
  170. vllm/model_executor/models/xverse.py +366 -0
  171. vllm/model_executor/sampling_metadata.py +588 -0
  172. vllm/model_executor/utils.py +35 -0
  173. vllm/outputs.py +150 -0
  174. vllm/py.typed +2 -0
  175. vllm/sampling_params.py +340 -0
  176. vllm/sequence.py +766 -0
  177. vllm/spec_decode/__init__.py +0 -0
  178. vllm/spec_decode/batch_expansion.py +397 -0
  179. vllm/spec_decode/interfaces.py +73 -0
  180. vllm/spec_decode/metrics.py +191 -0
  181. vllm/spec_decode/multi_step_worker.py +203 -0
  182. vllm/spec_decode/ngram_worker.py +176 -0
  183. vllm/spec_decode/spec_decode_worker.py +472 -0
  184. vllm/spec_decode/top1_proposer.py +200 -0
  185. vllm/spec_decode/util.py +228 -0
  186. vllm/test_utils.py +41 -0
  187. vllm/transformers_utils/__init__.py +0 -0
  188. vllm/transformers_utils/config.py +58 -0
  189. vllm/transformers_utils/configs/__init__.py +16 -0
  190. vllm/transformers_utils/configs/chatglm.py +68 -0
  191. vllm/transformers_utils/configs/dbrx.py +278 -0
  192. vllm/transformers_utils/configs/falcon.py +87 -0
  193. vllm/transformers_utils/configs/jais.py +236 -0
  194. vllm/transformers_utils/configs/mpt.py +178 -0
  195. vllm/transformers_utils/detokenizer.py +313 -0
  196. vllm/transformers_utils/tokenizer.py +149 -0
  197. vllm/transformers_utils/tokenizer_group/__init__.py +33 -0
  198. vllm/transformers_utils/tokenizer_group/base_tokenizer_group.py +55 -0
  199. vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py +169 -0
  200. vllm/transformers_utils/tokenizer_group/tokenizer_group.py +78 -0
  201. vllm/transformers_utils/tokenizers/__init__.py +5 -0
  202. vllm/transformers_utils/tokenizers/baichuan.py +255 -0
  203. vllm/usage/__init__.py +0 -0
  204. vllm/usage/usage_lib.py +209 -0
  205. vllm/utils.py +677 -0
  206. vllm/worker/__init__.py +0 -0
  207. vllm/worker/cache_engine.py +105 -0
  208. vllm/worker/cpu_model_runner.py +346 -0
  209. vllm/worker/cpu_worker.py +321 -0
  210. vllm/worker/model_runner.py +1168 -0
  211. vllm/worker/neuron_model_runner.py +196 -0
  212. vllm/worker/neuron_worker.py +98 -0
  213. vllm/worker/worker.py +345 -0
  214. vllm/worker/worker_base.py +146 -0
  215. vllm_npu-0.4.2.dist-info/LICENSE +201 -0
  216. vllm_npu-0.4.2.dist-info/METADATA +173 -0
  217. vllm_npu-0.4.2.dist-info/RECORD +219 -0
  218. vllm_npu-0.4.2.dist-info/WHEEL +5 -0
  219. vllm_npu-0.4.2.dist-info/top_level.txt +1 -0
vllm/spec_decode/batch_expansion.py
@@ -0,0 +1,397 @@
from itertools import chain, count
from typing import Iterator, List, Tuple

import torch

from vllm.sequence import (ExecuteModelRequest, SamplerOutput, SequenceData,
                           SequenceGroupMetadata)
from vllm.spec_decode.interfaces import (SpeculativeProposals,
                                         SpeculativeScorer, SpeculativeScores)
from vllm.spec_decode.util import (get_all_seq_ids, nvtx_range,
                                   sampler_output_to_torch,
                                   split_batch_by_proposal_len)
from vllm.worker.worker_base import WorkerBase

SeqId = int
TargetSeqId = int
TokenId = int


class BatchExpansionTop1Scorer(SpeculativeScorer):
    """Implements a speculative scorer that uses batch expansion to get
    probabilities of speculative tokens according to the scoring model.

    Batch expansion converts a list of sequences and multiple query positions
    to a new batch of sequences, each with a single query position. This allows
    for MQA-like scoring in speculative decoding without requiring an MQA
    kernel.

    It is strictly less efficient than MQA scoring.

    It only supports scoring the top1 proposal tokens of the proposer, instead
    of topk/tree.
    """

    def __init__(self, scorer_worker: WorkerBase, device: str,
                 vocab_size: int):
        self._scorer_worker = scorer_worker
        self._device = device
        self._vocab_size = vocab_size

    @nvtx_range("BatchExpansionTop1Scorer.score_proposals")
    def score_proposals(
        self,
        execute_model_req: ExecuteModelRequest,
        proposals: SpeculativeProposals,
    ) -> SpeculativeScores:
        """Score the proposed tokens via the scorer model.

        This converts each input sequence to a set of k+1 target sequences. The
        target sequences have the unique continuations to be scored and a
        unique sequence ID that is different from all input sequence ids.

        If a speculative sequence length would exceed the max model length, then
        no speculation is produced for that sequence.

        Args:
            execute_model_req: The execution request.
            proposals: The speculative proposals to score.
        Returns:
            SpeculativeScores: The scores of each speculative token, along with
                which sequences were ignored during scoring.
        """

        # TODO(cade) perform this on GPU to remove blocking call.
        proposal_lens_list = proposals.proposal_lens.tolist()
        proposal_token_ids_list = proposals.proposal_token_ids.tolist()

        # Filter the list to ignore -1 proposals.
        proposal_token_ids_list_without_skips = [
            proposals for proposals in proposal_token_ids_list
            if -1 not in proposals
        ]

        (spec_indices, non_spec_indices, target_seq_group_metadata_list,
         num_scoring_tokens) = self._expand_batch(
             seq_group_metadata_list=execute_model_req.seq_group_metadata_list,
             proposal_token_ids_list=proposal_token_ids_list_without_skips,
             proposal_lens_list=proposal_lens_list,
         )

        target_sampler_output = self._scorer_worker.execute_model(
            execute_model_req=execute_model_req.clone(
                seq_group_metadata_list=target_seq_group_metadata_list, ))
        assert len(target_sampler_output) == 1, "expected single-step output"
        target_sampler_output = target_sampler_output[0]

        all_tokens, all_probs, spec_logprobs = self._contract_batch(
            contracted_bs=len(execute_model_req.seq_group_metadata_list),
            target_sampler_output=target_sampler_output,
            proposals=proposals,
            num_scoring_tokens=num_scoring_tokens,
            non_spec_indices=non_spec_indices,
            spec_indices=spec_indices,
            k=execute_model_req.num_lookahead_slots,
        )

        return SpeculativeScores(
            probs=all_probs,
            token_ids=all_tokens,
            logprobs=spec_logprobs,
        )

    def _expand_batch(
        self,
        seq_group_metadata_list: List[SequenceGroupMetadata],
        proposal_token_ids_list: List[List[TokenId]],
        proposal_lens_list: List[int],
    ) -> Tuple[List[int], List[int], List[SequenceGroupMetadata], int]:
        """Given the input sequences and potentially multiple corresponding
        proposal tokens, create a new batch where each sequence has a single
        query token.
        """

        # vLLM currently only supports proposal lens equal to zero or the batch
        # proposal len. This adds some complexity (splitting the batch into spec
        # and non spec sequences) and should be removed in the future. It can be
        # done by supporting per-sequence proposal lens.
        spec_seqs, spec_indices = split_batch_by_proposal_len(
            seq_group_metadata_list,
            proposal_lens_list,
            select_proposal_len_zero=False)
        non_spec_seqs, non_spec_indices = split_batch_by_proposal_len(
            seq_group_metadata_list,
            proposal_lens_list,
            select_proposal_len_zero=True)

        target_seq_group_metadata_list = self._create_scoring_model_input(
            seq_group_metadata_list=spec_seqs,
            proposal_token_ids=proposal_token_ids_list,
            # NOTE: We determine the seq ids in the expanded batch using the
            # full seq_group_metadata_list, instead of only spec_seqs.
            target_seq_ids_iter=self._create_target_seq_id_iterator(
                seq_ids=get_all_seq_ids(seq_group_metadata_list)),
        )

        num_scoring_tokens = len(target_seq_group_metadata_list)
        target_seq_group_metadata_list.extend(non_spec_seqs)

        return (spec_indices, non_spec_indices, target_seq_group_metadata_list,
                num_scoring_tokens)

    def _contract_batch(
            self, contracted_bs: int,
            target_sampler_output: List[SamplerOutput],
            proposals: SpeculativeProposals, num_scoring_tokens: int,
            non_spec_indices: List[int], spec_indices: List[int],
            k: int) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Contract the expanded batch back into its original size.
        This maps the scores of speculative tokens back to their original
        sequences.

        contracted_bs is the original batch size, and the batch size that the
        target_sampler_output will be contracted to.
        """
        (target_token_ids, target_probs, target_logprobs,
         non_spec_target_token_ids, non_spec_target_probs,
         non_spec_target_logprobs) = self._split_scoring_output(
             target_sampler_output, num_scoring_tokens)

        # Map distinct sequences used to score each token
        # of shape [batch_size * k + 1] back to [batch_size, k + 1].
        expanded_batch_size, k = proposals.proposal_token_ids.shape

        # The number of tokens in the expanded batch used for speculation is
        # equal to the total expanded batch size minus the number of samples for
        # non-speculative sequences.
        non_spec_expanded_bs, _ = non_spec_target_token_ids.shape
        spec_expanded_bs = expanded_batch_size - non_spec_expanded_bs

        target_token_ids = target_token_ids.squeeze().reshape(
            spec_expanded_bs, k + 1)
        target_probs = target_probs.squeeze().reshape(spec_expanded_bs, k + 1,
                                                      self._vocab_size)
        target_logprobs = target_logprobs.squeeze().reshape(
            spec_expanded_bs, k + 1, self._vocab_size)

        all_tokens = torch.full(size=(contracted_bs, k + 1),
                                fill_value=-1,
                                device=self._device,
                                dtype=torch.long)
        all_probs = torch.zeros(contracted_bs,
                                k + 1,
                                self._vocab_size,
                                device=self._device,
                                dtype=torch.float32)
        all_logprobs = torch.full(size=(
            contracted_bs,
            k + 1,
            self._vocab_size,
        ),
                                  fill_value=-float("inf"),
                                  device=self._device,
                                  dtype=torch.float32)

        if non_spec_indices:
            all_tokens[non_spec_indices, :1] = non_spec_target_token_ids
            all_probs[non_spec_indices, :1, :] = non_spec_target_probs
            all_logprobs[non_spec_indices, :1, :] = non_spec_target_logprobs

        if spec_indices:
            all_tokens[spec_indices] = target_token_ids
            all_probs[spec_indices] = target_probs
            all_logprobs[spec_indices] = target_logprobs

        return all_tokens, all_probs, all_logprobs

    def _create_scoring_model_input(
        self,
        seq_group_metadata_list: List[SequenceGroupMetadata],
        proposal_token_ids: List[List[TokenId]],  # shape: [batch_size, k]
        target_seq_ids_iter: Iterator[TargetSeqId],
    ) -> List[SequenceGroupMetadata]:
        """Given the original input sequences and proposed tokens from the draft
        model, create a list of target sequences that can be used for scoring.

        target_seq_ids_iter provides sequence ids for the expanded batch,
        fulfilling the requirement that no seq id in the expanded batch is equal
        to the seq id in the original batch.
        """

        if not seq_group_metadata_list:
            return []

        target_seq_group_metadata = list(
            chain.from_iterable(
                self._create_target_seq_group_metadata(
                    seq_group_metadata,
                    proposal_token_ids,
                    i,
                    target_seq_ids_iter,
                ) for i, seq_group_metadata in enumerate(
                    seq_group_metadata_list)))

        return target_seq_group_metadata

    def _create_target_seq_group_metadata(
        self,
        input_seq_group_metadata: SequenceGroupMetadata,
        proposal_token_ids: List[List[TokenId]],  # shape: [batch_size, k]
        batch_index: int,
        target_seq_ids_iter: Iterator[TargetSeqId],
    ) -> List[SequenceGroupMetadata]:
        """Given an input sequence group metadata and a list of draft tokens,
        create a list of target SequenceGroupMetadata, one for each
        token id that needs to be scored.

        Naive speculative decoding requires K target model scores, one for each
        draft model token. However one can add a bonus token such that if each
        token is accepted, then a final token may be sampled from the model.
        This function creates K+1 target SequenceGroupMetadata to take
        advantage of the bonus token.
        """
        assert not input_seq_group_metadata.is_prompt, (
            "Speculating on "
            "prompts not yet supported")
        assert len(input_seq_group_metadata.seq_data) == 1, (
            "Beam search "
            "not supported in speculative decoding")
        input_seq_id = next(iter(input_seq_group_metadata.seq_data.keys()))

        token_ids_to_score = self._get_token_ids_to_score(
            proposal_token_ids[batch_index])

        target_seq_group_metadata_list: List[SequenceGroupMetadata] = []
        for token_ids in token_ids_to_score:
            target_seq_group_metadata_list.append(
                self._create_single_target_seq_group_metadata(
                    input_seq_group_metadata,
                    input_seq_id,
                    next(target_seq_ids_iter),
                    token_ids,
                ))

        return target_seq_group_metadata_list

    def _create_single_target_seq_group_metadata(
        self,
        seq_group_metadata: SequenceGroupMetadata,
        seq_id: SeqId,
        target_seq_id: TargetSeqId,
        token_ids: List[TokenId],
    ) -> SequenceGroupMetadata:
        """Create a single target SequenceGroupMetadata.

        Args:
            seq_group_metadata: The metadata for the input sequence.
            seq_id: The input sequence ID.
            target_seq_id: The corresponding target sequence ID.
            token_ids: The list of token ids that are to be appended to the
                input sequence.
        """
        seq_data = seq_group_metadata.seq_data[seq_id]
        prompt_token_ids = seq_data.get_prompt_token_ids()
        new_output_token_ids = [*seq_data.get_output_token_ids(), *token_ids]

        return SequenceGroupMetadata(
            request_id=seq_group_metadata.request_id,
            is_prompt=seq_group_metadata.is_prompt,
            seq_data={
                target_seq_id:
                SequenceData(
                    prompt_token_ids=prompt_token_ids,
                    output_token_ids=new_output_token_ids,
                ),
            },
            sampling_params=seq_group_metadata.sampling_params,
            block_tables={
                target_seq_id: seq_group_metadata.block_tables[seq_id],
            },
            lora_request=None,
        )

    def _split_scoring_output(
        self, sampler_output: SamplerOutput, num_scoring_tokens: int
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor,
               torch.Tensor, torch.Tensor]:
        """Split the target model output into speculative and non-speculative
        output.
        """

        # vLLM currently only supports proposal lens equal to zero or the batch
        # proposal len. This adds some complexity (splitting the batch into spec
        # and non spec sequences) and should be removed in the future. It can be
        # done by supporting per-sequence proposal lens.
        #
        # First samples are from speculative scoring, latter samples are non-
        # speculative samples.
        split_sizes = [
            num_scoring_tokens,
            sampler_output.sampled_token_ids.numel() - num_scoring_tokens
        ]
        (spec_probs, non_spec_probs
         ) = sampler_output.sampled_token_probs.split(split_sizes)
        (spec_sampled_tokens, non_spec_sampled_tokens
         ) = sampler_output.sampled_token_ids.flatten().split(split_sizes)
        (
            spec_logprobs,
            non_spec_logprobs,
        ) = sampler_output.logprobs.split(split_sizes)

        # Convert scores to tensors.
        sampler_output.sampled_token_probs = spec_probs
        sampler_output.sampled_token_ids = spec_sampled_tokens
        sampler_output.logprobs = spec_logprobs
        (target_token_ids, target_probs,
         target_logprobs) = sampler_output_to_torch([sampler_output], True)

        # Convert non-speculative output tokens to tensors.
        sampler_output.sampled_token_probs = non_spec_probs
        sampler_output.sampled_token_ids = non_spec_sampled_tokens
        sampler_output.logprobs = non_spec_logprobs
        (non_spec_target_token_ids, non_spec_target_probs,
         non_spec_target_logprobs) = sampler_output_to_torch([sampler_output],
                                                             True)

        return (target_token_ids, target_probs, target_logprobs,
                non_spec_target_token_ids, non_spec_target_probs,
                non_spec_target_logprobs)

    def _create_target_seq_id_iterator(
            self, seq_ids: List[SeqId]) -> Iterator[TargetSeqId]:
        """Create an iterator for creating target sequence ids.
        Target sequence ids are distinct from sequence ids because we create a
        distinct target sequence id for each proposal token to be scored.

        This implementation increments a counter starting at 1 + max of all
        provided input sequence ids.
        """
        return count(start=max(seq_ids) + 1)

    def _get_token_ids_to_score(
            self,
            full_spec_token_ids: List[TokenId]  # shape: [k]
    ) -> List[List[TokenId]]:
        """Given an int tensor of proposal token ids, return a list of
        token ids that should be scored.

        Returns k+1 output lists. The additional one is used for generating the
        bonus token.

        Example:
            Input: [0, 1, 2, 3] (k=4)
            Output: (k+1 lists)
                []
                [0]
                [0, 1]
                [0, 1, 2]
                [0, 1, 2, 3]
        """
        empty_token_ids: List[TokenId] = []

        token_ids_to_score = [empty_token_ids]
        token_ids_to_score.extend([
            full_spec_token_ids[:i + 1]
            for i in range(len(full_spec_token_ids))
        ])
        return token_ids_to_score
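The core of batch expansion is the prefix expansion performed by _get_token_ids_to_score: each sequence with k draft tokens becomes k+1 single-query target sequences, one per proposal prefix plus an empty prefix for the bonus token. Below is a minimal standalone sketch of that expansion, mirroring the docstring example above; the function name is illustrative and it does not use any vLLM classes.

# Minimal standalone sketch of the prefix expansion behind batch expansion.
# expand_proposal_prefixes is an illustrative name, not a vLLM API.
from typing import List


def expand_proposal_prefixes(proposal_token_ids: List[int]) -> List[List[int]]:
    """Return the k+1 continuations to score for one sequence: the empty
    prefix (used for the bonus token) plus every non-empty prefix of the
    proposal."""
    prefixes: List[List[int]] = [[]]
    prefixes.extend(proposal_token_ids[:i + 1]
                    for i in range(len(proposal_token_ids)))
    return prefixes


if __name__ == "__main__":
    # k = 4 draft tokens expand to k + 1 = 5 target sequences, matching the
    # docstring example: [], [0], [0, 1], [0, 1, 2], [0, 1, 2, 3].
    print(expand_proposal_prefixes([0, 1, 2, 3]))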
vllm/spec_decode/interfaces.py
@@ -0,0 +1,73 @@
from abc import ABC, abstractmethod
from dataclasses import dataclass

import torch

from vllm.sequence import ExecuteModelRequest


@dataclass
class SpeculativeProposals:
    """Datastructure used to represent proposal tokens from some proposer. It
    also tracks how many speculative tokens each sequence has.
    """

    # Speculative proposal tokens.
    proposal_token_ids: torch.Tensor

    # Probabilities of the proposal tokens according to the proposer.
    proposal_probs: torch.Tensor

    # The valid length of each proposal; can be zero.
    proposal_lens: torch.Tensor

    def __repr__(self):
        return (f"SpeculativeProposals("
                f"proposal_token_ids={self.proposal_token_ids}, "
                f"proposal_probs={self.proposal_probs.shape}, "
                f"proposal_lens={self.proposal_lens})")


@dataclass
class SpeculativeScores:
    """Datastructure used to represent the scores of speculative tokens
    according to the scoring model.
    """

    # Probabilities of the speculative tokens according to the scoring model.
    probs: torch.Tensor

    # Log-probabilities of the speculative tokens according to the scoring
    # model. These values can be used to generate Logprob objects that are
    # returned to the user.
    logprobs: torch.Tensor

    # Token ids sampled from the scoring model. Used for speculative bonus
    # tokens and also non-speculative normal decoding.
    token_ids: torch.Tensor

    def __repr__(self):
        return (f"SpeculativeScores("
                f"probs={self.probs.shape}, "
                f"token_ids={self.token_ids.shape})")


class SpeculativeProposer(ABC):

    @abstractmethod
    def get_proposals(
        self,
        execute_model_req: ExecuteModelRequest,
    ) -> SpeculativeProposals:
        raise NotImplementedError


class SpeculativeScorer(ABC):

    @abstractmethod
    def score_proposals(
        self,
        execute_model_req: ExecuteModelRequest,
        proposals: SpeculativeProposals,
    ) -> SpeculativeScores:
        raise NotImplementedError
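These two ABCs define the proposer/scorer split in speculative decoding: a proposer drafts tokens for each sequence, and a scorer (such as BatchExpansionTop1Scorer above) evaluates them with the target model. The sketch below shows one way the two abstractions compose in a single decoding step; propose_then_score is a hypothetical helper for illustration, not part of the vLLM worker code, and only the imported names come from the modules shown here.

# Hypothetical sketch of composing a proposer and a scorer for one step.
from vllm.sequence import ExecuteModelRequest
from vllm.spec_decode.interfaces import (SpeculativeProposer,
                                         SpeculativeScorer,
                                         SpeculativeScores)


def propose_then_score(proposer: SpeculativeProposer,
                       scorer: SpeculativeScorer,
                       execute_model_req: ExecuteModelRequest
                       ) -> SpeculativeScores:
    # 1. The proposer drafts up to k tokens per sequence; proposal_lens may
    #    be zero for sequences it chose to skip.
    proposals = proposer.get_proposals(execute_model_req)
    # 2. The scorer runs the scoring model over the proposals and returns
    #    per-token probs/logprobs plus sampled token ids, which downstream
    #    verification (e.g. rejection sampling) consumes.
    return scorer.score_proposals(execute_model_req, proposals)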
vllm/spec_decode/metrics.py
@@ -0,0 +1,191 @@
import time
from dataclasses import dataclass
from typing import Callable, Optional

import torch

from vllm.model_executor.layers.rejection_sampler import RejectionSampler
from vllm.utils import is_pin_memory_available


@dataclass
class SpecDecodeWorkerMetrics:
    """Dataclass holding metrics emitted from the spec decode worker.
    """

    # The empirical acceptance rate of the proposal method on a per-token basis.
    # This is useful for evaluating how well the proposal method aligns with the
    # scoring method.
    draft_acceptance_rate: float

    # The empirical efficiency, measured as the number of tokens emitted by the
    # system divided by the number of tokens that could be emitted by the system
    # if the proposal method were perfect.
    system_efficiency: float

    # The number of speculative tokens produced by the proposal method.
    draft_tokens: int

    # The number of tokens emitted by the entire system.
    emitted_tokens: int

    # The number of tokens accepted by the scoring model and verification
    # routine, e.g. Llama2-70B and lossless rejection sampling.
    #
    # NOTE: Any token accepted by the verification routine is considered
    # accepted (regardless of if the speculative prefix is also accepted). The
    # user will usually see less accepted tokens. This metric is helpful when
    # evaluating alignment of the proposal method with the scoring model.
    accepted_tokens: int

    # The number of speculative tokens per sequence.
    num_spec_tokens: int


Timer = Callable[[], float]


class AsyncMetricsCollector:
    """Class which copies rejection sampler metrics from the device to CPU on a
    non-default Torch stream.
    """

    def __init__(self,
                 rejection_sampler: RejectionSampler,
                 timer: Optional[Timer] = None,
                 collect_interval_s: float = 5.0):
        self._rejection_sampler = rejection_sampler
        self._timer = time.time if timer is None else timer

        self._rank: Optional[int] = None

        # We don't have a device set yet.
        self._copy_stream: Optional[torch.cuda.Stream] = None

        self._in_flight_copy: Optional[torch.cuda.Event] = None

        pin_memory = is_pin_memory_available()
        self._aggregate_num_accepted_tokens = torch.tensor(
            0, dtype=torch.long, device="cpu", pin_memory=pin_memory)
        self._aggregate_num_emitted_tokens = torch.tensor(
            0, dtype=torch.long, device="cpu", pin_memory=pin_memory)
        self._aggregate_num_draft_tokens = 0

        self._rejsample_metrics_collect_interval_s = collect_interval_s
        self._last_metrics_collect_time = self._timer()

    def init_gpu_tensors(self, rank: int) -> None:
        self._rank = rank
        self._copy_stream = torch.cuda.Stream()

    def maybe_collect_rejsample_metrics(
            self, k: int) -> Optional[SpecDecodeWorkerMetrics]:

        # If a copy was initiated in the previous call, collect and return.
        if self._in_flight_copy is not None:
            ready_event = self._in_flight_copy
            self._in_flight_copy = None
            return self._collect_rejsample_metrics(k, ready_event)

        # Otherwise, check if we should start a new copy.
        if self._should_collect_rejsample_metrics(self._timer()):
            assert self._in_flight_copy is None
            self._in_flight_copy = self._copy_rejsample_metrics_async()

        return None

    def _should_collect_rejsample_metrics(self, now: float) -> bool:
        """Return whether or not this iteration should print rejection sampling
        metrics.
        """
        if self._rank != 0:
            return False

        if (now - self._last_metrics_collect_time <
                self._rejsample_metrics_collect_interval_s):
            return False
        return True

    def _copy_rejsample_metrics_async(self) -> torch.cuda.Event:
        """Copy rejection sampling metrics (number of accepted tokens, etc) to
        CPU asynchronously.

        Returns a CUDA event recording when the copy is complete.
        """
        assert self._copy_stream is not None
        self._copy_stream.wait_stream(torch.cuda.current_stream())

        with torch.cuda.stream(self._copy_stream):
            self._aggregate_num_accepted_tokens.copy_(
                self._rejection_sampler.num_accepted_tokens, non_blocking=True)
            self._aggregate_num_emitted_tokens.copy_(
                self._rejection_sampler.num_emitted_tokens, non_blocking=True)
            # Number of draft tokens is calculated on CPU, so no copy is
            # required.
            self._aggregate_num_draft_tokens = (
                self._rejection_sampler.num_draft_tokens)

        aggregate_metrics_ready = torch.cuda.Event()
        aggregate_metrics_ready.record(self._copy_stream)

        return aggregate_metrics_ready

    def _collect_rejsample_metrics(
            self, k: int,
            ready_event: torch.cuda.Event) -> SpecDecodeWorkerMetrics:
        """Create metrics object from statistics copied asynchronously.

        Args:
            k: int. The number of speculative tokens; used to determine system
                efficiency.
            ready_event: torch.cuda.Event. The CUDA event recording when the
                async GPU->CPU copy is complete.
        """

        ready_event.synchronize()
        accepted_tokens = self._aggregate_num_accepted_tokens.item()
        emitted_tokens = self._aggregate_num_emitted_tokens.item()
        draft_tokens = self._aggregate_num_draft_tokens

        max_num_emitted_tokens = self.get_max_num_emitted_tokens(
            draft_tokens, k)

        if draft_tokens > 0:
            draft_acceptance_rate = accepted_tokens / draft_tokens
        else:
            draft_acceptance_rate = float("nan")

        if max_num_emitted_tokens > 0:
            system_efficiency = emitted_tokens / max_num_emitted_tokens
        else:
            system_efficiency = float("nan")

        return SpecDecodeWorkerMetrics(
            num_spec_tokens=k,
            draft_acceptance_rate=draft_acceptance_rate,
            system_efficiency=system_efficiency,
            accepted_tokens=accepted_tokens,
            draft_tokens=draft_tokens,
            emitted_tokens=emitted_tokens,
        )

    @staticmethod
    def get_max_num_emitted_tokens(draft_tokens: int, k: int) -> int:
        """Calculate the number of emitted tokens, assuming all tokens are
        accepted.

        This is equal to the number of sequences that have been speculated on,
        times (speculation len + 1). The +1 comes from the bonus token.
        """
        # Determine the number of sequences that have been speculated on. Since
        # the batch size can be variable, we divide by k.
        assert draft_tokens % k == 0
        total_num_spec_seqs = draft_tokens // k

        # A single sequence may emit k accepted tokens and one bonus token in
        # the best case.
        num_emitted_per_seq_if_all_accepted = k + 1

        # The max num of emitted tokens is the number of speculated sequences
        # times the max emitted per seq.
        return total_num_spec_seqs * num_emitted_per_seq_if_all_accepted
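For reference, here is a worked example of the metric arithmetic in _collect_rejsample_metrics and get_max_num_emitted_tokens. The counter values are made up purely for illustration, not measurements from any model.

# Worked example of the metric formulas above, with illustrative counters.
# Suppose 10 sequences were speculated on with k = 4 draft tokens each.
k = 4
draft_tokens = 40        # 10 sequences * k draft tokens
accepted_tokens = 28     # tokens accepted by the verification routine
emitted_tokens = 33      # tokens actually emitted (accepted + bonus/recovered)

# get_max_num_emitted_tokens: (draft_tokens // k) * (k + 1)
max_num_emitted_tokens = (draft_tokens // k) * (k + 1)       # 10 * 5 = 50

draft_acceptance_rate = accepted_tokens / draft_tokens       # 28 / 40 = 0.70
system_efficiency = emitted_tokens / max_num_emitted_tokens  # 33 / 50 = 0.66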