vllm_npu-0.4.2-py3-none-any.whl
- vllm/__init__.py +23 -0
- vllm/_custom_ops.py +251 -0
- vllm/attention/__init__.py +13 -0
- vllm/attention/backends/__init__.py +0 -0
- vllm/attention/backends/abstract.py +127 -0
- vllm/attention/backends/flash_attn.py +271 -0
- vllm/attention/backends/flashinfer.py +220 -0
- vllm/attention/backends/rocm_flash_attn.py +374 -0
- vllm/attention/backends/torch_sdpa.py +250 -0
- vllm/attention/backends/xformers.py +393 -0
- vllm/attention/layer.py +56 -0
- vllm/attention/ops/__init__.py +0 -0
- vllm/attention/ops/paged_attn.py +216 -0
- vllm/attention/ops/prefix_prefill.py +792 -0
- vllm/attention/ops/triton_flash_attention.py +810 -0
- vllm/attention/selector.py +91 -0
- vllm/block.py +84 -0
- vllm/config.py +1225 -0
- vllm/core/__init__.py +0 -0
- vllm/core/block/__init__.py +0 -0
- vllm/core/block/block_table.py +295 -0
- vllm/core/block/common.py +199 -0
- vllm/core/block/cpu_gpu_block_allocator.py +228 -0
- vllm/core/block/interfaces.py +205 -0
- vllm/core/block/naive_block.py +318 -0
- vllm/core/block/prefix_caching_block.py +606 -0
- vllm/core/block_manager_v1.py +625 -0
- vllm/core/block_manager_v2.py +258 -0
- vllm/core/evictor_v1.py +105 -0
- vllm/core/evictor_v2.py +127 -0
- vllm/core/interfaces.py +113 -0
- vllm/core/policy.py +45 -0
- vllm/core/scheduler.py +1163 -0
- vllm/distributed/__init__.py +3 -0
- vllm/distributed/communication_op.py +237 -0
- vllm/distributed/device_communicators/__init__.py +0 -0
- vllm/distributed/device_communicators/custom_all_reduce.py +274 -0
- vllm/distributed/device_communicators/pynccl.py +287 -0
- vllm/distributed/device_communicators/pynccl_utils.py +66 -0
- vllm/distributed/parallel_state.py +339 -0
- vllm/distributed/utils.py +136 -0
- vllm/engine/__init__.py +0 -0
- vllm/engine/arg_utils.py +649 -0
- vllm/engine/async_llm_engine.py +737 -0
- vllm/engine/llm_engine.py +784 -0
- vllm/engine/metrics.py +368 -0
- vllm/engine/output_processor/__init__.py +0 -0
- vllm/engine/output_processor/interfaces.py +76 -0
- vllm/engine/output_processor/multi_step.py +142 -0
- vllm/engine/output_processor/single_step.py +284 -0
- vllm/engine/output_processor/stop_checker.py +101 -0
- vllm/engine/output_processor/util.py +19 -0
- vllm/entrypoints/__init__.py +0 -0
- vllm/entrypoints/api_server.py +119 -0
- vllm/entrypoints/llm.py +259 -0
- vllm/entrypoints/openai/__init__.py +0 -0
- vllm/entrypoints/openai/api_server.py +186 -0
- vllm/entrypoints/openai/cli_args.py +115 -0
- vllm/entrypoints/openai/protocol.py +460 -0
- vllm/entrypoints/openai/serving_chat.py +392 -0
- vllm/entrypoints/openai/serving_completion.py +347 -0
- vllm/entrypoints/openai/serving_engine.py +234 -0
- vllm/envs.py +217 -0
- vllm/executor/__init__.py +0 -0
- vllm/executor/cpu_executor.py +152 -0
- vllm/executor/distributed_gpu_executor.py +115 -0
- vllm/executor/executor_base.py +115 -0
- vllm/executor/gpu_executor.py +150 -0
- vllm/executor/multiproc_worker_utils.py +263 -0
- vllm/executor/neuron_executor.py +91 -0
- vllm/executor/ray_gpu_executor.py +327 -0
- vllm/executor/ray_utils.py +119 -0
- vllm/logger.py +153 -0
- vllm/logging/__init__.py +5 -0
- vllm/logging/formatter.py +15 -0
- vllm/lora/__init__.py +0 -0
- vllm/lora/fully_sharded_layers.py +262 -0
- vllm/lora/layers.py +1181 -0
- vllm/lora/lora.py +167 -0
- vllm/lora/models.py +645 -0
- vllm/lora/punica.py +213 -0
- vllm/lora/request.py +32 -0
- vllm/lora/utils.py +98 -0
- vllm/lora/worker_manager.py +251 -0
- vllm/model_executor/__init__.py +7 -0
- vllm/model_executor/guided_decoding/__init__.py +25 -0
- vllm/model_executor/guided_decoding/lm_format_enforcer_decoding.py +70 -0
- vllm/model_executor/guided_decoding/outlines_decoding.py +130 -0
- vllm/model_executor/guided_decoding/outlines_logits_processors.py +184 -0
- vllm/model_executor/layers/__init__.py +0 -0
- vllm/model_executor/layers/activation.py +173 -0
- vllm/model_executor/layers/fused_moe/__init__.py +7 -0
- vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json +140 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json +146 -0
- vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- vllm/model_executor/layers/fused_moe/fused_moe.py +479 -0
- vllm/model_executor/layers/layernorm.py +71 -0
- vllm/model_executor/layers/linear.py +709 -0
- vllm/model_executor/layers/logits_processor.py +115 -0
- vllm/model_executor/layers/ops/__init__.py +0 -0
- vllm/model_executor/layers/ops/rand.py +157 -0
- vllm/model_executor/layers/ops/sample.py +406 -0
- vllm/model_executor/layers/quantization/__init__.py +35 -0
- vllm/model_executor/layers/quantization/aqlm.py +376 -0
- vllm/model_executor/layers/quantization/awq.py +175 -0
- vllm/model_executor/layers/quantization/base_config.py +97 -0
- vllm/model_executor/layers/quantization/fp8.py +265 -0
- vllm/model_executor/layers/quantization/gptq.py +224 -0
- vllm/model_executor/layers/quantization/gptq_marlin.py +438 -0
- vllm/model_executor/layers/quantization/marlin.py +227 -0
- vllm/model_executor/layers/quantization/schema.py +84 -0
- vllm/model_executor/layers/quantization/squeezellm.py +137 -0
- vllm/model_executor/layers/rejection_sampler.py +405 -0
- vllm/model_executor/layers/rotary_embedding.py +525 -0
- vllm/model_executor/layers/sampler.py +1051 -0
- vllm/model_executor/layers/vocab_parallel_embedding.py +155 -0
- vllm/model_executor/model_loader/__init__.py +30 -0
- vllm/model_executor/model_loader/loader.py +362 -0
- vllm/model_executor/model_loader/neuron.py +136 -0
- vllm/model_executor/model_loader/tensorizer.py +368 -0
- vllm/model_executor/model_loader/utils.py +41 -0
- vllm/model_executor/model_loader/weight_utils.py +372 -0
- vllm/model_executor/models/__init__.py +119 -0
- vllm/model_executor/models/baichuan.py +410 -0
- vllm/model_executor/models/bloom.py +327 -0
- vllm/model_executor/models/chatglm.py +386 -0
- vllm/model_executor/models/commandr.py +373 -0
- vllm/model_executor/models/dbrx.py +413 -0
- vllm/model_executor/models/decilm.py +122 -0
- vllm/model_executor/models/deepseek.py +438 -0
- vllm/model_executor/models/falcon.py +444 -0
- vllm/model_executor/models/gemma.py +393 -0
- vllm/model_executor/models/gpt2.py +266 -0
- vllm/model_executor/models/gpt_bigcode.py +274 -0
- vllm/model_executor/models/gpt_j.py +281 -0
- vllm/model_executor/models/gpt_neox.py +295 -0
- vllm/model_executor/models/internlm2.py +323 -0
- vllm/model_executor/models/jais.py +333 -0
- vllm/model_executor/models/llama.py +442 -0
- vllm/model_executor/models/llava.py +239 -0
- vllm/model_executor/models/minicpm.py +531 -0
- vllm/model_executor/models/mixtral.py +583 -0
- vllm/model_executor/models/mixtral_quant.py +404 -0
- vllm/model_executor/models/mpt.py +295 -0
- vllm/model_executor/models/olmo.py +356 -0
- vllm/model_executor/models/opt.py +349 -0
- vllm/model_executor/models/orion.py +319 -0
- vllm/model_executor/models/phi.py +300 -0
- vllm/model_executor/models/qwen.py +284 -0
- vllm/model_executor/models/qwen2.py +367 -0
- vllm/model_executor/models/qwen2_moe.py +447 -0
- vllm/model_executor/models/stablelm.py +301 -0
- vllm/model_executor/models/starcoder2.py +302 -0
- vllm/model_executor/models/xverse.py +366 -0
- vllm/model_executor/sampling_metadata.py +588 -0
- vllm/model_executor/utils.py +35 -0
- vllm/outputs.py +150 -0
- vllm/py.typed +2 -0
- vllm/sampling_params.py +340 -0
- vllm/sequence.py +766 -0
- vllm/spec_decode/__init__.py +0 -0
- vllm/spec_decode/batch_expansion.py +397 -0
- vllm/spec_decode/interfaces.py +73 -0
- vllm/spec_decode/metrics.py +191 -0
- vllm/spec_decode/multi_step_worker.py +203 -0
- vllm/spec_decode/ngram_worker.py +176 -0
- vllm/spec_decode/spec_decode_worker.py +472 -0
- vllm/spec_decode/top1_proposer.py +200 -0
- vllm/spec_decode/util.py +228 -0
- vllm/test_utils.py +41 -0
- vllm/transformers_utils/__init__.py +0 -0
- vllm/transformers_utils/config.py +58 -0
- vllm/transformers_utils/configs/__init__.py +16 -0
- vllm/transformers_utils/configs/chatglm.py +68 -0
- vllm/transformers_utils/configs/dbrx.py +278 -0
- vllm/transformers_utils/configs/falcon.py +87 -0
- vllm/transformers_utils/configs/jais.py +236 -0
- vllm/transformers_utils/configs/mpt.py +178 -0
- vllm/transformers_utils/detokenizer.py +313 -0
- vllm/transformers_utils/tokenizer.py +149 -0
- vllm/transformers_utils/tokenizer_group/__init__.py +33 -0
- vllm/transformers_utils/tokenizer_group/base_tokenizer_group.py +55 -0
- vllm/transformers_utils/tokenizer_group/ray_tokenizer_group.py +169 -0
- vllm/transformers_utils/tokenizer_group/tokenizer_group.py +78 -0
- vllm/transformers_utils/tokenizers/__init__.py +5 -0
- vllm/transformers_utils/tokenizers/baichuan.py +255 -0
- vllm/usage/__init__.py +0 -0
- vllm/usage/usage_lib.py +209 -0
- vllm/utils.py +677 -0
- vllm/worker/__init__.py +0 -0
- vllm/worker/cache_engine.py +105 -0
- vllm/worker/cpu_model_runner.py +346 -0
- vllm/worker/cpu_worker.py +321 -0
- vllm/worker/model_runner.py +1168 -0
- vllm/worker/neuron_model_runner.py +196 -0
- vllm/worker/neuron_worker.py +98 -0
- vllm/worker/worker.py +345 -0
- vllm/worker/worker_base.py +146 -0
- vllm_npu-0.4.2.dist-info/LICENSE +201 -0
- vllm_npu-0.4.2.dist-info/METADATA +173 -0
- vllm_npu-0.4.2.dist-info/RECORD +219 -0
- vllm_npu-0.4.2.dist-info/WHEEL +5 -0
- vllm_npu-0.4.2.dist-info/top_level.txt +1 -0
vllm/spec_decode/batch_expansion.py
@@ -0,0 +1,397 @@
from itertools import chain, count
from typing import Iterator, List, Tuple

import torch

from vllm.sequence import (ExecuteModelRequest, SamplerOutput, SequenceData,
                           SequenceGroupMetadata)
from vllm.spec_decode.interfaces import (SpeculativeProposals,
                                         SpeculativeScorer, SpeculativeScores)
from vllm.spec_decode.util import (get_all_seq_ids, nvtx_range,
                                   sampler_output_to_torch,
                                   split_batch_by_proposal_len)
from vllm.worker.worker_base import WorkerBase

SeqId = int
TargetSeqId = int
TokenId = int


class BatchExpansionTop1Scorer(SpeculativeScorer):
    """Implements a speculative scorer that uses batch expansion to get
    probabilities of speculative tokens according to the scoring model.

    Batch expansion converts a list of sequences and multiple query positions
    to a new batch of sequences, each with a single query position. This allows
    for MQA-like scoring in speculative decoding without requiring an MQA
    kernel.

    It is strictly less efficient than MQA scoring.

    It only supports scoring the top1 proposal tokens of the proposer, instead
    of topk/tree.
    """

    def __init__(self, scorer_worker: WorkerBase, device: str,
                 vocab_size: int):
        self._scorer_worker = scorer_worker
        self._device = device
        self._vocab_size = vocab_size

    @nvtx_range("BatchExpansionTop1Scorer.score_proposals")
    def score_proposals(
        self,
        execute_model_req: ExecuteModelRequest,
        proposals: SpeculativeProposals,
    ) -> SpeculativeScores:
        """Score the proposed tokens via the scorer model.

        This converts each input sequence to a set of k+1 target sequences. The
        target sequences have the unique continuations to be scored and a
        unique sequence ID that is different from all input sequence ids.

        If a speculative sequence length would exceed the max model length, then
        no speculation is produced for that sequence.

        Args:
            execute_model_req: The execution request.
            proposals: The speculative proposals to score.
        Returns:
            SpeculativeScores: The scores of each speculative token, along with
                which sequences were ignored during scoring.
        """

        # TODO(cade) perform this on GPU to remove blocking call.
        proposal_lens_list = proposals.proposal_lens.tolist()
        proposal_token_ids_list = proposals.proposal_token_ids.tolist()

        # Filter the list to ignore -1 proposals.
        proposal_token_ids_list_without_skips = [
            proposals for proposals in proposal_token_ids_list
            if -1 not in proposals
        ]

        (spec_indices, non_spec_indices, target_seq_group_metadata_list,
         num_scoring_tokens) = self._expand_batch(
             seq_group_metadata_list=execute_model_req.seq_group_metadata_list,
             proposal_token_ids_list=proposal_token_ids_list_without_skips,
             proposal_lens_list=proposal_lens_list,
         )

        target_sampler_output = self._scorer_worker.execute_model(
            execute_model_req=execute_model_req.clone(
                seq_group_metadata_list=target_seq_group_metadata_list, ))
        assert len(target_sampler_output) == 1, "expected single-step output"
        target_sampler_output = target_sampler_output[0]

        all_tokens, all_probs, spec_logprobs = self._contract_batch(
            contracted_bs=len(execute_model_req.seq_group_metadata_list),
            target_sampler_output=target_sampler_output,
            proposals=proposals,
            num_scoring_tokens=num_scoring_tokens,
            non_spec_indices=non_spec_indices,
            spec_indices=spec_indices,
            k=execute_model_req.num_lookahead_slots,
        )

        return SpeculativeScores(
            probs=all_probs,
            token_ids=all_tokens,
            logprobs=spec_logprobs,
        )

    def _expand_batch(
        self,
        seq_group_metadata_list: List[SequenceGroupMetadata],
        proposal_token_ids_list: List[List[TokenId]],
        proposal_lens_list: List[int],
    ) -> Tuple[List[int], List[int], List[SequenceGroupMetadata], int]:
        """Given the input sequences and potentially multiple corresponding
        proposal tokens, create a new batch where each sequence has a single
        query token.
        """

        # vLLM currently only supports proposal lens equal to zero or the batch
        # proposal len. This adds some complexity (splitting the batch into spec
        # and non spec sequences) and should be removed in the future. It can be
        # done by supporting per-sequence proposal lens.
        spec_seqs, spec_indices = split_batch_by_proposal_len(
            seq_group_metadata_list,
            proposal_lens_list,
            select_proposal_len_zero=False)
        non_spec_seqs, non_spec_indices = split_batch_by_proposal_len(
            seq_group_metadata_list,
            proposal_lens_list,
            select_proposal_len_zero=True)

        target_seq_group_metadata_list = self._create_scoring_model_input(
            seq_group_metadata_list=spec_seqs,
            proposal_token_ids=proposal_token_ids_list,
            # NOTE: We determine the seq ids in the expanded batch using the
            # full seq_group_metadata_list, instead of only spec_seqs.
            target_seq_ids_iter=self._create_target_seq_id_iterator(
                seq_ids=get_all_seq_ids(seq_group_metadata_list)),
        )

        num_scoring_tokens = len(target_seq_group_metadata_list)
        target_seq_group_metadata_list.extend(non_spec_seqs)

        return (spec_indices, non_spec_indices, target_seq_group_metadata_list,
                num_scoring_tokens)

    def _contract_batch(
            self, contracted_bs: int,
            target_sampler_output: List[SamplerOutput],
            proposals: SpeculativeProposals, num_scoring_tokens: int,
            non_spec_indices: List[int], spec_indices: List[int],
            k: int) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Contract the expanded batch back into its original size.
        This maps the scores of speculative tokens back to their original
        sequences.

        contracted_bs is the original batch size, and the batch size that the
        target_sampler_output will be contracted to.
        """
        (target_token_ids, target_probs, target_logprobs,
         non_spec_target_token_ids, non_spec_target_probs,
         non_spec_target_logprobs) = self._split_scoring_output(
             target_sampler_output, num_scoring_tokens)

        # Map distinct sequences used to score each token
        # of shape [batch_size * k + 1] back to [batch_size, k + 1].
        expanded_batch_size, k = proposals.proposal_token_ids.shape

        # The number of tokens in the expanded batch used for speculation is
        # equal to the total expanded batch size minus the number of samples for
        # non-speculative sequences.
        non_spec_expanded_bs, _ = non_spec_target_token_ids.shape
        spec_expanded_bs = expanded_batch_size - non_spec_expanded_bs

        target_token_ids = target_token_ids.squeeze().reshape(
            spec_expanded_bs, k + 1)
        target_probs = target_probs.squeeze().reshape(spec_expanded_bs, k + 1,
                                                      self._vocab_size)
        target_logprobs = target_logprobs.squeeze().reshape(
            spec_expanded_bs, k + 1, self._vocab_size)

        all_tokens = torch.full(size=(contracted_bs, k + 1),
                                fill_value=-1,
                                device=self._device,
                                dtype=torch.long)
        all_probs = torch.zeros(contracted_bs,
                                k + 1,
                                self._vocab_size,
                                device=self._device,
                                dtype=torch.float32)
        all_logprobs = torch.full(size=(
            contracted_bs,
            k + 1,
            self._vocab_size,
        ),
                                  fill_value=-float("inf"),
                                  device=self._device,
                                  dtype=torch.float32)

        if non_spec_indices:
            all_tokens[non_spec_indices, :1] = non_spec_target_token_ids
            all_probs[non_spec_indices, :1, :] = non_spec_target_probs
            all_logprobs[non_spec_indices, :1, :] = non_spec_target_logprobs

        if spec_indices:
            all_tokens[spec_indices] = target_token_ids
            all_probs[spec_indices] = target_probs
            all_logprobs[spec_indices] = target_logprobs

        return all_tokens, all_probs, all_logprobs

    def _create_scoring_model_input(
        self,
        seq_group_metadata_list: List[SequenceGroupMetadata],
        proposal_token_ids: List[List[TokenId]],  # shape: [batch_size, k]
        target_seq_ids_iter: Iterator[TargetSeqId],
    ) -> List[SequenceGroupMetadata]:
        """Given the original input sequences and proposed tokens from the draft
        model, create a list of target sequences that can be used for scoring.

        target_seq_ids_iter provides sequence ids for the expanded batch,
        fulfilling the requirement that no seq id in the expanded batch is equal
        to the seq id in the original batch.
        """

        if not seq_group_metadata_list:
            return []

        target_seq_group_metadata = list(
            chain.from_iterable(
                self._create_target_seq_group_metadata(
                    seq_group_metadata,
                    proposal_token_ids,
                    i,
                    target_seq_ids_iter,
                ) for i, seq_group_metadata in enumerate(
                    seq_group_metadata_list)))

        return target_seq_group_metadata

    def _create_target_seq_group_metadata(
        self,
        input_seq_group_metadata: SequenceGroupMetadata,
        proposal_token_ids: List[List[TokenId]],  # shape: [batch_size, k]
        batch_index: int,
        target_seq_ids_iter: Iterator[TargetSeqId],
    ) -> List[SequenceGroupMetadata]:
        """Given an input sequence group metadata and a list of draft tokens,
        create a list of target SequenceGroupMetadata, one for each
        token id that needs to be scored.

        Naive speculative decoding requires K target model scores, one for each
        draft model token. However one can add a bonus token such that if each
        token is accepted, then a final token may be sampled from the model.
        This function creates K+1 target SequenceGroupMetadata to take
        advantage of the bonus token.
        """
        assert not input_seq_group_metadata.is_prompt, (
            "Speculating on "
            "prompts not yet supported")
        assert len(input_seq_group_metadata.seq_data) == 1, (
            "Beam search "
            "not supported in speculative decoding")
        input_seq_id = next(iter(input_seq_group_metadata.seq_data.keys()))

        token_ids_to_score = self._get_token_ids_to_score(
            proposal_token_ids[batch_index])

        target_seq_group_metadata_list: List[SequenceGroupMetadata] = []
        for token_ids in token_ids_to_score:
            target_seq_group_metadata_list.append(
                self._create_single_target_seq_group_metadata(
                    input_seq_group_metadata,
                    input_seq_id,
                    next(target_seq_ids_iter),
                    token_ids,
                ))

        return target_seq_group_metadata_list

    def _create_single_target_seq_group_metadata(
        self,
        seq_group_metadata: SequenceGroupMetadata,
        seq_id: SeqId,
        target_seq_id: TargetSeqId,
        token_ids: List[TokenId],
    ) -> SequenceGroupMetadata:
        """Create a single target SequenceGroupMetadata.

        Args:
            seq_group_metadata: The metadata for the input sequence.
            seq_id: The input sequence ID.
            target_seq_id: The corresponding target sequence ID.
            token_ids: The list of token ids that are to be appended to the
                input sequence.
        """
        seq_data = seq_group_metadata.seq_data[seq_id]
        prompt_token_ids = seq_data.get_prompt_token_ids()
        new_output_token_ids = [*seq_data.get_output_token_ids(), *token_ids]

        return SequenceGroupMetadata(
            request_id=seq_group_metadata.request_id,
            is_prompt=seq_group_metadata.is_prompt,
            seq_data={
                target_seq_id:
                SequenceData(
                    prompt_token_ids=prompt_token_ids,
                    output_token_ids=new_output_token_ids,
                ),
            },
            sampling_params=seq_group_metadata.sampling_params,
            block_tables={
                target_seq_id: seq_group_metadata.block_tables[seq_id],
            },
            lora_request=None,
        )

    def _split_scoring_output(
        self, sampler_output: SamplerOutput, num_scoring_tokens: int
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor,
               torch.Tensor, torch.Tensor]:
        """Split the target model output into speculative and non-speculative
        output.
        """

        # vLLM currently only supports proposal lens equal to zero or the batch
        # proposal len. This adds some complexity (splitting the batch into spec
        # and non spec sequences) and should be removed in the future. It can be
        # done by supporting per-sequence proposal lens.
        #
        # First samples are from speculative scoring, latter samples are non-
        # speculative samples.
        split_sizes = [
            num_scoring_tokens,
            sampler_output.sampled_token_ids.numel() - num_scoring_tokens
        ]
        (spec_probs, non_spec_probs
         ) = sampler_output.sampled_token_probs.split(split_sizes)
        (spec_sampled_tokens, non_spec_sampled_tokens
         ) = sampler_output.sampled_token_ids.flatten().split(split_sizes)
        (
            spec_logprobs,
            non_spec_logprobs,
        ) = sampler_output.logprobs.split(split_sizes)

        # Convert scores to tensors.
        sampler_output.sampled_token_probs = spec_probs
        sampler_output.sampled_token_ids = spec_sampled_tokens
        sampler_output.logprobs = spec_logprobs
        (target_token_ids, target_probs,
         target_logprobs) = sampler_output_to_torch([sampler_output], True)

        # Convert non-speculative output tokens to tensors.
        sampler_output.sampled_token_probs = non_spec_probs
        sampler_output.sampled_token_ids = non_spec_sampled_tokens
        sampler_output.logprobs = non_spec_logprobs
        (non_spec_target_token_ids, non_spec_target_probs,
         non_spec_target_logprobs) = sampler_output_to_torch([sampler_output],
                                                             True)

        return (target_token_ids, target_probs, target_logprobs,
                non_spec_target_token_ids, non_spec_target_probs,
                non_spec_target_logprobs)

    def _create_target_seq_id_iterator(
            self, seq_ids: List[SeqId]) -> Iterator[TargetSeqId]:
        """Create an iterator for creating target sequence ids.
        Target sequence ids are distinct from sequence ids because we create a
        distinct target sequence id for each proposal token to be scored.

        This implementation increments a counter starting at 1 + max of all
        provided input sequence ids.
        """
        return count(start=max(seq_ids) + 1)

    def _get_token_ids_to_score(
        self,
        full_spec_token_ids: List[TokenId]  # shape: [k]
    ) -> List[List[TokenId]]:
        """Given an int tensor of proposal token ids, return a list of
        token ids that should be scored.

        Returns k+1 output lists. The additional one is used for generating the
        bonus token.

        Example:
            Input: [0, 1, 2, 3] (k=4)
            Output: (k+1 lists)
                []
                [0]
                [0, 1]
                [0, 1, 2]
                [0, 1, 2, 3]
        """
        empty_token_ids: List[TokenId] = []

        token_ids_to_score = [empty_token_ids]
        token_ids_to_score.extend([
            full_spec_token_ids[:i + 1]
            for i in range(len(full_spec_token_ids))
        ])
        return token_ids_to_score
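The expansion step above is easiest to see on a toy input. The following standalone sketch (plain Python, no vLLM imports; the helper name prefix_lists is hypothetical and not part of this wheel) mirrors the _get_token_ids_to_score docstring example: a k-token proposal yields k+1 prefixes, and each prefix becomes its own single-query scoring sequence, with the full-length prefix providing the position for the bonus token.

    from typing import List

    def prefix_lists(proposal: List[int]) -> List[List[int]]:
        # Same shape of output as BatchExpansionTop1Scorer._get_token_ids_to_score:
        # the k+1 prefixes of the proposal, shortest (empty) first.
        return [proposal[:i] for i in range(len(proposal) + 1)]

    if __name__ == "__main__":
        proposal = [0, 1, 2, 3]  # k = 4 draft tokens
        expanded = prefix_lists(proposal)
        assert len(expanded) == len(proposal) + 1
        print(expanded)  # [[], [0], [0, 1], [0, 1, 2], [0, 1, 2, 3]]

Each of these k+1 expanded sequences is scored by the target model, and _contract_batch then folds the resulting [batch_size * (k + 1)] scores back into [batch_size, k + 1].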
vllm/spec_decode/interfaces.py
@@ -0,0 +1,73 @@
from abc import ABC, abstractmethod
from dataclasses import dataclass

import torch

from vllm.sequence import ExecuteModelRequest


@dataclass
class SpeculativeProposals:
    """Datastructure used to represent proposal tokens from some proposer. It
    also tracks how many speculative tokens each sequence has.
    """

    # Speculative proposal tokens.
    proposal_token_ids: torch.Tensor

    # Probabilities of the proposal tokens according to the proposer.
    proposal_probs: torch.Tensor

    # The valid length of each proposal; can be zero.
    proposal_lens: torch.Tensor

    def __repr__(self):
        return (f"SpeculativeProposals("
                f"proposal_token_ids={self.proposal_token_ids}, "
                f"proposal_probs={self.proposal_probs.shape}, "
                f"proposal_lens={self.proposal_lens})")


@dataclass
class SpeculativeScores:
    """Datastructure used to represent the scores of speculative tokens
    according to the scoring model.
    """

    # Probabilities of the speculative tokens according to the scoring model.
    probs: torch.Tensor

    # Log-probabilities of the speculative tokens according to the scoring
    # model. These values can be used to generate Logprob objects that are
    # returned to the user.
    logprobs: torch.Tensor

    # Token ids sampled from the scoring model. Used for speculative bonus
    # tokens and also non-speculative normal decoding.
    token_ids: torch.Tensor

    def __repr__(self):
        return (f"SpeculativeScores("
                f"probs={self.probs.shape}, "
                f"token_ids={self.token_ids.shape})")


class SpeculativeProposer(ABC):

    @abstractmethod
    def get_proposals(
        self,
        execute_model_req: ExecuteModelRequest,
    ) -> SpeculativeProposals:
        raise NotImplementedError


class SpeculativeScorer(ABC):

    @abstractmethod
    def score_proposals(
        self,
        execute_model_req: ExecuteModelRequest,
        proposals: SpeculativeProposals,
    ) -> SpeculativeScores:
        raise NotImplementedError
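A minimal sketch of how these dataclasses are populated, assuming the vllm package from this wheel is importable; the shapes (batch_size=2, k=3, vocab_size=8) and random tensors are illustrative only, not values any proposer or scorer in the package would produce.

    import torch
    from vllm.spec_decode.interfaces import SpeculativeProposals, SpeculativeScores

    batch_size, k, vocab_size = 2, 3, 8

    # A proposer fills these from its draft model; dummy tensors stand in here.
    proposals = SpeculativeProposals(
        proposal_token_ids=torch.randint(vocab_size, (batch_size, k)),
        proposal_probs=torch.rand(batch_size, k, vocab_size),
        proposal_lens=torch.tensor([k, k]),
    )

    # A scorer returns k + 1 positions per sequence (the extra one is the bonus token).
    scores = SpeculativeScores(
        probs=torch.rand(batch_size, k + 1, vocab_size),
        logprobs=torch.rand(batch_size, k + 1, vocab_size).log(),
        token_ids=torch.randint(vocab_size, (batch_size, k + 1)),
    )

    print(proposals)  # __repr__ prints the token ids and the probs/lens shapes
    print(scores)     # __repr__ prints the probs and token_ids shapes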
vllm/spec_decode/metrics.py
@@ -0,0 +1,191 @@
import time
from dataclasses import dataclass
from typing import Callable, Optional

import torch

from vllm.model_executor.layers.rejection_sampler import RejectionSampler
from vllm.utils import is_pin_memory_available


@dataclass
class SpecDecodeWorkerMetrics:
    """Dataclass holding metrics emitted from the spec decode worker.
    """

    # The empirical acceptance rate of the proposal method on a per-token basis.
    # This is useful for evaluating how well the proposal method aligns with the
    # scoring method.
    draft_acceptance_rate: float

    # The empirical efficiency, measured as the number of tokens emitted by the
    # system divided by the number of tokens that could be emitted by the system
    # if the proposal method were perfect.
    system_efficiency: float

    # The number of speculative tokens produced by the proposal method.
    draft_tokens: int

    # The number of tokens emitted by the entire system.
    emitted_tokens: int

    # The number of tokens accepted by the scoring model and verification
    # routine, e.g. Llama2-70B and lossless rejection sampling.
    #
    # NOTE: Any token accepted by the verification routine is considered
    # accepted (regardless of if the speculative prefix is also accepted). The
    # user will usually see less accepted tokens. This metric is helpful when
    # evaluating alignment of the proposal method with the scoring model.
    accepted_tokens: int

    # The number of speculative tokens per sequence.
    num_spec_tokens: int


Timer = Callable[[], float]


class AsyncMetricsCollector:
    """Class which copies rejection sampler metrics from the device to CPU on a
    non-default Torch stream.
    """

    def __init__(self,
                 rejection_sampler: RejectionSampler,
                 timer: Optional[Timer] = None,
                 collect_interval_s: float = 5.0):
        self._rejection_sampler = rejection_sampler
        self._timer = time.time if timer is None else timer

        self._rank: Optional[int] = None

        # We don't have a device set yet.
        self._copy_stream: Optional[torch.cuda.Stream] = None

        self._in_flight_copy: Optional[torch.cuda.Event] = None

        pin_memory = is_pin_memory_available()
        self._aggregate_num_accepted_tokens = torch.tensor(
            0, dtype=torch.long, device="cpu", pin_memory=pin_memory)
        self._aggregate_num_emitted_tokens = torch.tensor(
            0, dtype=torch.long, device="cpu", pin_memory=pin_memory)
        self._aggregate_num_draft_tokens = 0

        self._rejsample_metrics_collect_interval_s = collect_interval_s
        self._last_metrics_collect_time = self._timer()

    def init_gpu_tensors(self, rank: int) -> None:
        self._rank = rank
        self._copy_stream = torch.cuda.Stream()

    def maybe_collect_rejsample_metrics(
            self, k: int) -> Optional[SpecDecodeWorkerMetrics]:

        # If a copy was initiated in the previous call, collect and return.
        if self._in_flight_copy is not None:
            ready_event = self._in_flight_copy
            self._in_flight_copy = None
            return self._collect_rejsample_metrics(k, ready_event)

        # Otherwise, check if we should start a new copy.
        if self._should_collect_rejsample_metrics(self._timer()):
            assert self._in_flight_copy is None
            self._in_flight_copy = self._copy_rejsample_metrics_async()

        return None

    def _should_collect_rejsample_metrics(self, now: float) -> bool:
        """Return whether or not this iteration should print rejection sampling
        metrics.
        """
        if self._rank != 0:
            return False

        if (now - self._last_metrics_collect_time <
                self._rejsample_metrics_collect_interval_s):
            return False
        return True

    def _copy_rejsample_metrics_async(self) -> torch.cuda.Event:
        """Copy rejection sampling metrics (number of accepted tokens, etc) to
        CPU asynchronously.

        Returns a CUDA event recording when the copy is complete.
        """
        assert self._copy_stream is not None
        self._copy_stream.wait_stream(torch.cuda.current_stream())

        with torch.cuda.stream(self._copy_stream):
            self._aggregate_num_accepted_tokens.copy_(
                self._rejection_sampler.num_accepted_tokens, non_blocking=True)
            self._aggregate_num_emitted_tokens.copy_(
                self._rejection_sampler.num_emitted_tokens, non_blocking=True)
            # Number of draft tokens is calculated on CPU, so no copy is
            # required.
            self._aggregate_num_draft_tokens = (
                self._rejection_sampler.num_draft_tokens)

        aggregate_metrics_ready = torch.cuda.Event()
        aggregate_metrics_ready.record(self._copy_stream)

        return aggregate_metrics_ready

    def _collect_rejsample_metrics(
            self, k: int,
            ready_event: torch.cuda.Event) -> SpecDecodeWorkerMetrics:
        """Create metrics object from statistics copied asynchronously.

        Args:
            k: int. The number of speculative tokens; used to determine system
                efficiency.
            ready_event: torch.cuda.Event. The CUDA event recording when the
                async GPU->CPU copy is complete.
        """

        ready_event.synchronize()
        accepted_tokens = self._aggregate_num_accepted_tokens.item()
        emitted_tokens = self._aggregate_num_emitted_tokens.item()
        draft_tokens = self._aggregate_num_draft_tokens

        max_num_emitted_tokens = self.get_max_num_emitted_tokens(
            draft_tokens, k)

        if draft_tokens > 0:
            draft_acceptance_rate = accepted_tokens / draft_tokens
        else:
            draft_acceptance_rate = float("nan")

        if max_num_emitted_tokens > 0:
            system_efficiency = emitted_tokens / max_num_emitted_tokens
        else:
            system_efficiency = float("nan")

        return SpecDecodeWorkerMetrics(
            num_spec_tokens=k,
            draft_acceptance_rate=draft_acceptance_rate,
            system_efficiency=system_efficiency,
            accepted_tokens=accepted_tokens,
            draft_tokens=draft_tokens,
            emitted_tokens=emitted_tokens,
        )

    @staticmethod
    def get_max_num_emitted_tokens(draft_tokens: int, k: int) -> int:
        """Calculate the number of emitted tokens, assuming all tokens are
        accepted.

        This is equal to the number of sequences that have been speculated on,
        times (speculation len + 1). The +1 comes from the bonus token.
        """
        # Determine the number of sequences that have been speculated on. Since
        # the batch size can be variable, we divide by k.
        assert draft_tokens % k == 0
        total_num_spec_seqs = draft_tokens // k

        # A single sequence may emit k accepted tokens and one bonus token in
        # the best case.
        num_emitted_per_seq_if_all_accepted = k + 1

        # The max num of emitted tokens is the number of speculated sequences
        # times the max emitted per seq.
        return total_num_spec_seqs * num_emitted_per_seq_if_all_accepted
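The two derived metrics reduce to simple arithmetic; a worked example assuming draft_tokens=20 and k=5 (so four sequences were speculated on), with the accepted and emitted counts chosen purely for illustration:

    # Mirrors the formulas in AsyncMetricsCollector; pure arithmetic, no vLLM needed.
    k = 5                  # speculative tokens per sequence
    draft_tokens = 20      # 4 sequences * 5 draft tokens each
    accepted_tokens = 15   # tokens accepted by the verification routine
    emitted_tokens = 18    # tokens actually emitted by the system

    # get_max_num_emitted_tokens: sequences speculated on, times (k + 1).
    num_spec_seqs = draft_tokens // k               # 4
    max_num_emitted = num_spec_seqs * (k + 1)       # 24

    draft_acceptance_rate = accepted_tokens / draft_tokens   # 15 / 20 = 0.75
    system_efficiency = emitted_tokens / max_num_emitted     # 18 / 24 = 0.75

    print(draft_acceptance_rate, system_efficiency)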