PyPI - vllm-cpu-amxbf16 - Versions diffs - 0.9.1__cp312-cp312-manylinux_2_17_x86_64.whl - Mend

vllm-cpu-amxbf16 0.9.1__cp312-cp312-manylinux_2_17_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (1197) hide show

vllm/spec_decode/target_model_runner.py ADDED Viewed

@@ -0,0 +1,45 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from typing import List, Optional
+from vllm.sequence import SequenceGroupMetadata
+from vllm.worker.model_runner_base import (ModelRunnerBase,
+                                           ModelRunnerInputBase,
+                                           ModelRunnerWrapperBase)
+class TargetModelRunner(ModelRunnerWrapperBase):
+    """Specialized model runner for speculative decoding target model.
+    In speculative decoding, the log probabilities selected finally may not
+    be the same ones as selected by the target model sampling. This means
+    that the time spent in the log probability calculation of the target model
+    is time wasted, since we calculate log probabilities after deciding which
+    tokens are accepted. For this reason disabling log probabilities in the
+    target model will make decode faster. The model runner sets the
+    SamplingMetadata parameters according to whether log probabilities are
+    requested or not.
+    """
+    def __init__(self, model_runner: ModelRunnerBase):
+        # An internal boolean member variable to indicate if token log
+        # probabilities are needed or not.
+        super().__init__(model_runner)
+        self.disable_logprobs = True
+    def prepare_model_input(
+        self,
+        seq_group_metadata_list: List[SequenceGroupMetadata],
+        virtual_engine: int = 0,
+        finished_requests_ids: Optional[List[str]] = None,
+    ) -> ModelRunnerInputBase:
+        model_input: ModelRunnerInputBase =\
+            self.model_runner.prepare_model_input(
+            seq_group_metadata_list, virtual_engine, finished_requests_ids)
+        # If token log probabilities is disabled then skip generating sampler
+        # CPU output. We directly serialize the GPU sampled_token_id tensors
+        # as needed. If log probabilities is enabled then synchronize all the
+        # sampling related tensors which includes the logprobs tensors.
+        model_input.sampling_metadata.skip_sampler_cpu_output = (
+            self.disable_logprobs)
+        return model_input

vllm/spec_decode/top1_proposer.py ADDED Viewed

@@ -0,0 +1,275 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from typing import List, Optional, Set, Tuple
+import torch
+from vllm.model_executor.layers.sampler import SamplerOutput
+from vllm.sequence import ExecuteModelRequest, SequenceGroupMetadata
+from vllm.spec_decode.interfaces import (SpeculativeProposals,
+                                         SpeculativeProposer)
+from vllm.spec_decode.proposer_worker_base import ProposerWorkerBase
+from vllm.spec_decode.util import sampler_output_to_torch
+class Top1Proposer(SpeculativeProposer):
+    """Helper class which separates out sequences which would exceed the max
+    model length when speculated upon.
+    This allows combinations of models such as JackFram/llama-68m draft with
+    meta-llama/Llama2-13b-chat-hf, as llama-68m has max_position_embeddings of
+    2048 while Llama2-13b has max_position_embeddings of 4096.
+    We treat the sequences which exceed the proposal draft model length as
+    "non-spec sequences". Essentially they skip the draft model and go through
+    normal decoding in the target model.
+    Currently, only proposal_lens of 0 and k are supported, where k is a global
+    batch proposal length. In the future vLLM should support per-sequence
+    proposal lengths.
+    """
+    def __init__(
+        self,
+        worker: ProposerWorkerBase,
+        device: str,
+        vocab_size: int,
+        max_proposal_len: Optional[int] = None,
+    ):
+        """Store the draft worker and the settings used to build proposals.
+        Args:
+            worker: Draft worker whose sampler_output() produces the
+                speculative tokens.
+            device: Device for the tensors this class allocates directly
+                (the empty-proposal placeholders and the proposal-length
+                tensor).
+            vocab_size: Size of the last dimension of the placeholder
+                probability tensor returned when there are no proposals.
+            max_proposal_len: Optional bound; a sequence is speculated on
+                only while seq_len + proposal_len is strictly below it.
+                None disables the check.
+        """
+        self._worker = worker
+        self._device = device
+        self.max_proposal_len = max_proposal_len
+        self._vocab_size = vocab_size
+    def get_spec_proposals(
+        self,
+        execute_model_req: ExecuteModelRequest,
+        seq_ids_with_bonus_token_in_last_step: Set[int],
+    ) -> SpeculativeProposals:
+        """Get speculative proposals given the input batch.
+        Sequences which would exceed the max model length are skipped during
+        speculation.
+        Args:
+            execute_model_req: Batch to speculate on; its num_lookahead_slots
+                is used as the global proposal length.
+            seq_ids_with_bonus_token_in_last_step: Forwarded unchanged to the
+                draft worker's sampler_output().
+        Returns:
+            SpeculativeProposals covering every sequence group in the batch;
+            no_proposals is True when no sampler output is available.
+        """
+        proposal_len = execute_model_req.num_lookahead_slots
+        seq_group_metadata_list = execute_model_req.seq_group_metadata_list
+        # Split speculative- and non-speculative- sequences.
+        (
+            proposal_lens,
+            nonzero_proposal_len_seqs,
+            nonzero_proposal_len_indices,
+        ) = self._split_by_proposal_len(seq_group_metadata_list, proposal_len)
+        if nonzero_proposal_len_seqs:
+            # Speculate tokens using the draft worker for the speculative
+            # sequences.
+            # If sampler_transposed is true, then maybe_sampler_output's
+            # token_ids is like [batch] format in proposal_len size list,
+            # while if it is false, the format would be [proposal_len]
+            # in batch size list
+            hidden_states = execute_model_req.previous_hidden_states
+            if hidden_states is not None:
+                # NOTE(review): prune() is called only for its side effect on
+                # hidden_states; presumably it keeps just the entries of the
+                # speculative sequences - confirm against its implementation.
+                hidden_states.prune(nonzero_proposal_len_seqs)
+            nonzero_execute_model_req = ExecuteModelRequest(
+                seq_group_metadata_list=nonzero_proposal_len_seqs,
+                num_lookahead_slots=proposal_len,
+                previous_hidden_states=hidden_states,
+            )
+            maybe_sampler_output, transposed = self._worker.sampler_output(
+                execute_model_req=nonzero_execute_model_req,
+                sample_len=proposal_len,
+                seq_ids_with_bonus_token_in_last_step=\
+                    seq_ids_with_bonus_token_in_last_step,
+            )
+            # Drop sequences for which the draft worker returned no proposal
+            # so they are treated like non-speculative sequences below.
+            (
+                proposal_lens,
+                maybe_sampler_output,
+                nonzero_proposal_len_indices,
+            ) = self._remove_no_proposal_seqs(proposal_lens,
+                                              maybe_sampler_output,
+                                              nonzero_proposal_len_indices,
+                                              transposed)
+        else:
+            # If no sequences can be speculated, set sampler output to None.
+            maybe_sampler_output = None
+            transposed = False
+        # Combine speculative- and non-speculative sequences into the same
+        # representation.
+        proposal_tokens, proposal_probs, proposal_lens = self._merge_outputs(
+            batch_size=len(seq_group_metadata_list),
+            proposal_len=proposal_len,
+            maybe_sampler_output=maybe_sampler_output,
+            proposal_lens=proposal_lens,
+            nonzero_proposal_len_indices=nonzero_proposal_len_indices,
+            sampler_transposed=transposed,
+        )
+        proposals = SpeculativeProposals(proposal_token_ids=proposal_tokens,
+                                         proposal_probs=proposal_probs,
+                                         proposal_lens=proposal_lens,
+                                         no_proposals=maybe_sampler_output
+                                         is None)
+        return proposals
+    def _split_by_proposal_len(
+        self,
+        seq_group_metadata_list: List[SequenceGroupMetadata],
+        proposal_len: int,
+    ) -> Tuple[List[int], List[SequenceGroupMetadata], List[int]]:
+        """Split sequences by two groups:
+        1. Sequences with non-zero proposal length.
+        2. Sequences with zero proposal length (due to disabled speculation
+        or exceed the maximum model length).
+        Returns:
+            A tuple of (the proposal length chosen for every sequence group,
+            the sequence groups with non-zero proposal length, the indices
+            of those groups in seq_group_metadata_list).
+        """
+        proposal_lens: List[int] = []
+        nonzero_proposal_len_seqs: List[SequenceGroupMetadata] = []
+        nonzero_proposal_len_indices: List[int] = []
+        for i, seq_group_metadata in enumerate(seq_group_metadata_list):
+            # The speculative decoding for this request has either been disabled
+            # (e.g. due to high traffic) or this is a prompt request.
+            if (seq_group_metadata.is_prompt
+                    or seq_group_metadata.num_speculative_tokens == 0):
+                proposal_lens.append(0)
+                continue
+            # Only the first sequence of the group is inspected.
+            seq_data = next(iter(seq_group_metadata.seq_data.values()))
+            seq_len = seq_data.get_len()
+            # Currently only proposal lens of 0 or the global batch proposal len
+            # are supported.
+            # If max_proposal_len is defined, then we shall not exceed this
+            # quota for nonzero_proposal
+            new_k = 0
+            if (self.max_proposal_len is None
+                    or seq_len + proposal_len < self.max_proposal_len):
+                new_k = proposal_len
+                nonzero_proposal_len_seqs.append(seq_group_metadata)
+                nonzero_proposal_len_indices.append(i)
+            proposal_lens.append(new_k)
+            # Record the chosen length on the metadata object itself; this
+            # mutates the caller's SequenceGroupMetadata.
+            seq_group_metadata.num_speculative_tokens = new_k
+        return (
+            proposal_lens,
+            nonzero_proposal_len_seqs,
+            nonzero_proposal_len_indices,
+        )
+    @staticmethod
+    def _remove_no_proposal_seqs(proposal_lens, maybe_sampler_output,
+                                 nonzero_proposal_len_indices, transposed):
+        """Remove sequences from nonzero_proposal_len_indices and reset
+        their proposal_len to 0 if the draft worker does not provide a
+        proposal for them (their entry in maybe_sampler_output is None).
+        This can avoid scoring overheads.
+        Returns:
+            A tuple of (proposal_lens, maybe_sampler_output,
+            nonzero_proposal_len_indices). The inputs are returned as-is
+            when maybe_sampler_output is None or transposed is true;
+            otherwise new filtered lists are returned.
+        """
+        # If maybe_sampler_output is None, then the draft worker did not
+        # provide a proposal for any sequence and thus no action needed.
+        # Also we do not support transposed maybe_sampler_output for now
+        # because it seems not straightforward for draft workers outputting
+        # transposed sampler outputs to handle the case of no proposal.
+        if maybe_sampler_output is None or transposed:
+            return (proposal_lens, maybe_sampler_output,
+                    nonzero_proposal_len_indices)
+        new_proposal_lens: List[int] = []
+        new_nonzero_proposal_len_indices: List[int] = []
+        new_maybe_sampler_output: List[SamplerOutput] = []
+        # Two cursors: seq_idx walks every sequence in the batch, while
+        # nonzero_proposal_len_idx_ptr walks the speculative subset (and the
+        # sampler outputs, which are indexed by position in that subset).
+        nonzero_proposal_len_idx_ptr = 0
+        seq_idx = 0
+        while seq_idx < len(
+                proposal_lens) and nonzero_proposal_len_idx_ptr < len(
+                    nonzero_proposal_len_indices):
+            if seq_idx < nonzero_proposal_len_indices[
+                    nonzero_proposal_len_idx_ptr]:
+                # Sequence is not in the original nonzero_proposal_len_indices,
+                # meaning that it has a proposal length of 0 before sending to
+                # the draft worker.
+                assert proposal_lens[seq_idx] == 0
+                new_proposal_lens.append(0)
+            else:
+                # Sequence is in the original nonzero_proposal_len_indices
+                if maybe_sampler_output[nonzero_proposal_len_idx_ptr] is None:
+                    # but does not have a proposal from the draft worker.
+                    new_proposal_lens.append(0)
+                else:
+                    # and has a proposal from the draft worker. Add it to the
+                    # new nonzero proposal list and keep the sampler output.
+                    new_proposal_lens.append(proposal_lens[seq_idx])
+                    new_nonzero_proposal_len_indices.append(seq_idx)
+                    new_maybe_sampler_output.append(
+                        maybe_sampler_output[nonzero_proposal_len_idx_ptr])
+                nonzero_proposal_len_idx_ptr += 1
+            seq_idx += 1
+        # The remaining sequences should have proposal length of 0.
+        new_proposal_lens.extend(proposal_lens[seq_idx:])
+        # We assume sampler_output will not be a list of all Nones.
+        # In this case this function should not be called.
+        assert new_maybe_sampler_output
+        return (new_proposal_lens, new_maybe_sampler_output,
+                new_nonzero_proposal_len_indices)
+    def _merge_outputs(
+        self,
+        batch_size: int,
+        proposal_len: int,
+        maybe_sampler_output: Optional[List[SamplerOutput]],
+        proposal_lens: List[int],
+        nonzero_proposal_len_indices: List[int],
+        sampler_transposed: bool,
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        """After speculations are produced, merge the speculation results with
+        the skipped sequences.
+        Returns:
+            A tuple of (proposal token ids, proposal probabilities, proposal
+            lengths) whose first dimension covers the whole batch. Rows of
+            sequences without a proposal hold -1 token ids, zero
+            probabilities and a proposal length of 0.
+        """
+        if maybe_sampler_output is None:
+            # If no speculative tokens, the sampler output will be None.
+            # In this case we return empty proposals.
+            # expand() returns broadcast views of a single scalar, so the
+            # full-size placeholders are not materialized in memory.
+            proposal_tokens = torch.tensor(-1,
+                                           dtype=torch.long,
+                                           device=self._device).expand(
+                                               batch_size, proposal_len)
+            proposal_probs = torch.tensor(0,
+                                          dtype=torch.float32,
+                                          device=self._device).expand(
+                                              batch_size, proposal_len,
+                                              self._vocab_size)
+            proposal_lens_tensor = torch.tensor(0,
+                                                dtype=torch.long,
+                                                device=self._device).expand(
+                                                    len(proposal_lens))
+            return proposal_tokens, proposal_probs, proposal_lens_tensor
+        sampler_output = maybe_sampler_output
+        # Only the token ids and probabilities are needed here; the remaining
+        # values returned by sampler_output_to_torch are discarded.
+        proposal_tokens, proposal_probs, *_ = sampler_output_to_torch(
+            sampler_output, sampler_transposed)
+        # Now, reformat the output GPU tensors such that each sequence has
+        # a proposal. the proposal can be empty, e.g. [-1, -1, -1]
+        entire_proposal_tokens = proposal_tokens.new_full(
+            size=(batch_size, *proposal_tokens.shape[1:]),
+            fill_value=-1,
+        )
+        # Scatter the speculative rows to their original batch positions; all
+        # other rows keep the fill value.
+        entire_proposal_tokens[nonzero_proposal_len_indices] = proposal_tokens
+        entire_proposal_probs = proposal_probs.new_zeros(
+            batch_size,
+            *proposal_probs.shape[1:],
+        )
+        entire_proposal_probs[nonzero_proposal_len_indices] = proposal_probs
+        proposal_tokens, proposal_probs = (
+            entire_proposal_tokens,
+            entire_proposal_probs,
+        )
+        proposal_lens_tensor = torch.zeros(batch_size,
+                                           dtype=torch.long,
+                                           device=self._device)
+        proposal_lens_tensor[nonzero_proposal_len_indices] = proposal_len
+        return proposal_tokens, proposal_probs, proposal_lens_tensor

vllm/spec_decode/util.py ADDED Viewed

@@ -0,0 +1,277 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import time
+from contextlib import contextmanager
+from typing import Dict, List, Optional, Sequence, Tuple
+import torch
+from vllm.model_executor.layers.sampler import SamplerOutput
+from vllm.platforms import current_platform
+from vllm.sequence import (CompletionSequenceGroupOutput, Logprob,
+                           PromptLogprobs, SequenceGroupMetadata,
+                           SequenceOutput)
+SeqId = int
+def get_all_num_logprobs(
+        seq_group_metadata_list: List[SequenceGroupMetadata]) -> List[int]:
+    """Given a list of SequenceGroupMetadata, create a list of all num_logprobs.
+    If the sampling params do not call for any logprobs, return 0 for that
+    sequence.
+    """
+    all_num_logprobs: List[int] = []
+    for seq_group_metadata in seq_group_metadata_list:
+        num_logprobs = seq_group_metadata.sampling_params.logprobs
+        if num_logprobs is None:
+            num_logprobs = 0
+        all_num_logprobs.append(num_logprobs)
+    return all_num_logprobs
+def get_sampled_token_logprobs(
+        # shape [num_steps, batch_size, vocab_size]
+        logprob_tensor: torch.Tensor,
+        sampled_token_ids: torch.Tensor,  # shape [num_steps, batch_size]
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """Get the logprobs for the sampled tokens. Returns the ranks and logprobs.
+    """
+    num_steps, batch_size, vocab_size = logprob_tensor.shape
+    selected_logprobs = logprob_tensor[
+        torch.arange(num_steps).unsqueeze(1),
+        torch.arange(batch_size),
+        sampled_token_ids,
+    ]
+    expanded_selected_logprobs = selected_logprobs.unsqueeze(-1).expand(
+        -1, -1, vocab_size)
+    sampled_token_ids_ranks = (logprob_tensor
+                               > expanded_selected_logprobs).sum(-1).add_(1)
+    return sampled_token_ids_ranks, selected_logprobs
+def create_logprobs_output(
+    token_id: int,
+    token_id_logprob_rank: int,
+    token_id_logprob: float,
+    topk_token_ids: List[Optional[int]],
+    topk_logprobs: List[Optional[float]],
+) -> Dict[int, Logprob]:
+    """Create a Logprob Dict for a token given the sampling results.
+    Args:
+        token_id (int): The sampled token for the sequence.
+        token_id_logprob_rank (int): The logprob rank of the sampled token.
+        token_id_logprob (float): The logprob value of the sampled token.
+        topk_token_ids (List[Optional[int]]): The list of top-k token ids.
+        topk_logprobs (List[Optional[float]]): The list of top-k logprobs.
+    """
+    # vLLM logprobs always include the sampled token. In addition, the user may
+    # request topk-logprobs (where top-k varies per user up to max_logprobs).
+    logprobs: Dict[int, Logprob] = {
+        token_id: Logprob(
+            logprob=token_id_logprob,
+            rank=token_id_logprob_rank,
+        ),
+    }
+    logprobs.update({
+        topk_token_id: Logprob(
+            logprob=topk_logprob if topk_logprob is not None else 0.0,
+            rank=topk_index + 1,
+        )
+        for topk_index, (topk_token_id, topk_logprob) \
+            in enumerate(zip(topk_token_ids, topk_logprobs)) \
+        if topk_token_id is not None
+    })
+    return logprobs
+def create_sequence_group_output(
+        token_id: int,
+        token_id_logprob_rank: int,
+        token_id_logprob: float,
+        seq_id: SeqId,
+        topk_token_ids: List[Optional[int]],
+        topk_logprobs: List[Optional[float]],
+        prompt_logprobs: Optional[PromptLogprobs] = None,
+        step_index: Optional[int] = 0) -> CompletionSequenceGroupOutput:
+    """Create a SequenceGroupOutput given the sampling results.
+    Args:
+        token_id (int): The sampled token for the sequence.
+        token_id_logprob_rank (int): The logprob rank of the sampled token.
+        token_id_logprob (float): The logprob value of the sampled token.
+        seq_id (int): The sequence id.
+        topk_token_ids (List[Optional[int]]): The list of top-k token ids.
+        topk_logprobs (List[Optional[float]]): The list of top-k logprobs.
+        step_index: (Optional[int]): The index of the speculative token.
+    """
+    logprobs = create_logprobs_output(
+        token_id,
+        token_id_logprob_rank,
+        token_id_logprob,
+        topk_token_ids,
+        topk_logprobs,
+    )
+    return CompletionSequenceGroupOutput(samples=[
+        SequenceOutput(parent_seq_id=seq_id,
+                       output_token=token_id,
+                       logprobs=logprobs)
+    ],
+                                         prompt_logprobs=prompt_logprobs,
+                                         step_index=step_index)
+def split_batch_by_proposal_len(
+    seq_group_metadata_list: List[SequenceGroupMetadata],
+    proposal_lens: List[int],
+) -> Tuple[Tuple[List[SequenceGroupMetadata], List[int]], Tuple[
+        List[SequenceGroupMetadata], List[int]]]:
+    """Utility function that splits a batch based on whether the proposal len is
+    zero or not. We should remove this once vLLM supports per-sequence proposal
+    lens in a batch.
+    """
+    nonzero_lists: Tuple[List[SequenceGroupMetadata], List[int]] = ([], [])
+    zero_lists: Tuple[List[SequenceGroupMetadata], List[int]] = ([], [])
+    for i, (seq_group, proposal_len) in enumerate(
+            zip(seq_group_metadata_list, proposal_lens)):
+        seq_groups, indices = nonzero_lists if proposal_len else zero_lists
+        seq_groups.append(seq_group)
+        indices.append(i)
+    return nonzero_lists, zero_lists
+def sampler_output_to_torch(
+    sampler_output_list: Sequence[SamplerOutput], sampler_transposed: bool
+) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
+    """Utility function which converts a list of SamplerOutput to tensors.
+        sampler_transposed here is used as the indicator for whether
+        we need do additional tensor transpose logic here.
+        Returns:
+            sampled_token_ids: torch.Tensor
+                shape: [batch_size, len(sampler_output_list)]
+            sampled_token_probs: torch.Tensor
+                shape: [batch_size, len(sampler_output_list), vocab_size]
+        """
+    # shape: [batch_size, num_sampler_output, vocab_size]
+    sampled_token_probs = torch.stack(
+        [
+            sampler_output.sampled_token_probs
+            for sampler_output in sampler_output_list
+        ],
+        dim=0,
+    )
+    # shape: [batch_size, num_sampler_output, vocab_size]
+    sampled_token_logprobs = torch.stack(
+        [sampler_output.logprobs for sampler_output in sampler_output_list],
+        dim=0,
+    )
+    # shape: [batch_size, num_sampler_output]
+    sampled_token_ids = torch.stack(
+        [
+            sampler_output.sampled_token_ids.flatten()
+            for sampler_output in sampler_output_list
+        ],
+        dim=0,
+    )
+    if sampler_transposed:
+        sampled_token_probs = sampled_token_probs.transpose(0, 1)
+        sampled_token_logprobs = sampled_token_logprobs.transpose(0, 1)
+        sampled_token_ids = sampled_token_ids.transpose(0, 1)
+    if sampler_output_list[0].hidden_states is not None:
+        # shape: [batch_size, num_sampler_output, hidden_dim]
+        sampled_hidden_states = torch.stack(
+            [
+                sampler_output.hidden_states
+                for sampler_output in sampler_output_list
+            ],
+            dim=0,
+        )
+        if sampler_transposed:
+            sampled_hidden_states = sampled_hidden_states.transpose(0, 1)
+    else:
+        sampled_hidden_states = None
+    return (sampled_token_ids, sampled_token_probs, sampled_token_logprobs,
+            sampled_hidden_states)
+def maybe_mock_device_tensors(sampler_output: SamplerOutput, batch_size: int,
+                              vocab_size: int, device: str) -> None:
+    """Helper method which mocks out the GPU tensors in SamplerOutput with dummy
+    values. This will be removed in PR 7/9.
+    https://docs.google.com/document/d/1rE4pr3IdspRw97XbImY4fS9IWYuJJ3HGtL7AdIKGrw8/edit#heading=h.qijw1sdidrer
+    """
+    values = [
+        sampler_output.sampled_token_probs, sampler_output.sampled_token_ids
+    ]
+    assert all(v is None for v in values) or not any(v is None for v in values)
+    if not any(v is None for v in values):
+        # Do nothing if the tensors are already created (usually in unit tests).
+        return
+    # Softmax to ensure valid probs.
+    sampler_output.sampled_token_probs = torch.nn.functional.softmax(
+        torch.rand(batch_size, vocab_size, dtype=torch.float32, device=device),
+        dim=-1)
+    sampler_output.sampled_token_ids = torch.randint(low=10,
+                                                     high=100,
+                                                     size=(batch_size, ),
+                                                     dtype=torch.long,
+                                                     device=device)
+@contextmanager
+def nvtx_range(msg, *args, **kwargs):
+    """
+    Context manager / decorator that pushes an NVTX range at the beginning
+    of its scope, and pops it at the end. If extra arguments are given,
+    they are passed as arguments to msg.format().
+    If running with cuda graphs, you must enable nsys cuda graph profiling.
+    Arguments:
+        msg (string): message to associate with the range
+    """
+    if current_platform.is_cuda_alike():
+        torch.cuda.nvtx.range_push(msg.format(*args, **kwargs))
+        try:
+            yield
+        finally:
+            torch.cuda.nvtx.range_pop()
+    else:
+        yield
+class Timer:
+    """Basic timer context manager for measuring CPU time.
+    """
+    def __enter__(self):
+        self.start_time = time.time()
+        return self
+    def __exit__(self, exc_type, exc_value, traceback):
+        self.end_time = time.time()
+        self.elapsed_time_s = self.end_time - self.start_time
+        self.elapsed_time_ms = self.elapsed_time_s * 1000