PyPI - vllm-cpu-avx512bf16 - Versions diffs - 0.9.0.post2__cp310-cp310-manylinux_2_17_x86_64.whl - Mend

vllm-cpu-avx512bf16 0.9.0.post2__cp310-cp310-manylinux_2_17_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (1175) hide show

vllm/v1/spec_decode/metrics.py ADDED Viewed

@@ -0,0 +1,177 @@
+# SPDX-License-Identifier: Apache-2.0
+from dataclasses import dataclass, field
+from typing import Optional
+import numpy as np
+import prometheus_client
+from vllm.config import SpeculativeConfig
+from vllm.logger import init_logger
+logger = init_logger(__name__)
+@dataclass
+class SpecDecodingStats:
+    """Per-step iteration decoding stats from scheduler.
+    Each scheduler step, statistics on spec decoding performance are
+    aggregated across requests by the scheduler and returned to the
+    frontend in EngineCoreOutputs->SchedulerStats.
+    """
+    num_spec_tokens: int
+    num_drafts: int = 0
+    num_draft_tokens: int = 0
+    num_accepted_tokens: int = 0
+    num_accepted_tokens_per_pos: list[int] = field(default_factory=list)
+    @classmethod
+    def new(cls, num_spec_tokens: int) -> "SpecDecodingStats":
+        return cls(num_spec_tokens=num_spec_tokens,
+                   num_accepted_tokens_per_pos=[0] * num_spec_tokens)
+    def observe_draft(self, num_draft_tokens: int, num_accepted_tokens: int):
+        self.num_drafts += 1
+        self.num_draft_tokens += num_draft_tokens
+        self.num_accepted_tokens += num_accepted_tokens
+        assert num_accepted_tokens <= self.num_spec_tokens
+        for i in range(num_accepted_tokens):
+            self.num_accepted_tokens_per_pos[i] += 1
+class SpecDecodingLogging:
+    """Aggregate and log spec decoding metrics.
+    LoggingStatLogger aggregates per-iteration metrics over a set
+    time interval using observe() and then logs them using log()
+    before resetting to zero.
+    """
+    def __init__(self):
+        self.reset()
+    def reset(self):
+        self.num_drafts: list[int] = []
+        self.num_draft_tokens: list[int] = []
+        self.num_accepted_tokens: list[int] = []
+        self.accepted_tokens_per_pos_lists: list[list[int]] = []
+    def observe(self, spec_decoding_stats: SpecDecodingStats):
+        self.num_drafts.append(spec_decoding_stats.num_drafts)
+        self.num_draft_tokens.append(spec_decoding_stats.num_draft_tokens)
+        self.num_accepted_tokens.append(
+            spec_decoding_stats.num_accepted_tokens)
+        self.accepted_tokens_per_pos_lists.append(
+            spec_decoding_stats.num_accepted_tokens_per_pos)
+    def log(self, log_fn=logger.info):
+        if not self.num_drafts:
+            return
+        num_drafts = np.sum(self.num_drafts)
+        num_draft_tokens = np.sum(self.num_draft_tokens)
+        num_accepted_tokens = np.sum(self.num_accepted_tokens)
+        draft_acceptance_rate = (num_accepted_tokens / num_draft_tokens *
+                                 100 if num_draft_tokens > 0 else float("nan"))
+        # Conventionally, mean acceptance length includes the bonus token
+        mean_acceptance_length = 1 + (num_accepted_tokens / num_drafts)
+        pos_matrix = np.array(self.accepted_tokens_per_pos_lists)
+        acceptance_rates = np.sum(pos_matrix, axis=0) / num_drafts
+        rates_str = ", ".join(f"{p:.3f}" for p in acceptance_rates)
+        log_fn(
+            "SpecDecoding metrics: "
+            "Draft acceptance rate: %.1f%%, "
+            "Mean acceptance length: %.2f, "
+            "Accepted: %d tokens, "
+            "Drafted: %d tokens, "
+            "Per-position acceptance rate: %s",
+            draft_acceptance_rate,
+            mean_acceptance_length,
+            num_accepted_tokens,
+            num_draft_tokens,
+            rates_str,
+        )
+        self.reset()
+class SpecDecodingProm:
+    """Record spec decoding metrics in Prometheus.
+    The acceptance rate can be calculated using a PromQL query:
+      rate(vllm:spec_decode_num_accepted_tokens_total[$interval]) /
+      rate(vllm:spec_decode_num_draft_tokens_total[$interval])
+    The mean acceptance length (conventionally including bonus tokens)
+    can be calculated using:
+      1 + (
+      rate(vllm:spec_decode_num_accepted_tokens_total[$interval]) /
+      rate(vllm:spec_decode_num_drafts[$interval]))
+    A per-position acceptance rate vector can be computed using
+      vllm:spec_decode_num_accepted_tokens_per_pos[$interval] /
+      vllm:spec_decode_num_drafts[$interval]
+    """
+    _counter_cls = prometheus_client.Counter
+    def __init__(
+        self,
+        speculative_config: Optional[SpeculativeConfig],
+        labelnames: list[str],
+        labelvalues: list[str],
+    ):
+        self.spec_decoding_enabled = speculative_config is not None
+        if not self.spec_decoding_enabled:
+            return
+        self.counter_spec_decode_num_drafts = \
+            self._counter_cls(
+                name="vllm:spec_decode_num_drafts",
+                documentation="Number of spec decoding drafts.",
+                labelnames=labelnames).labels(*labelvalues)
+        self.counter_spec_decode_num_draft_tokens = \
+            self._counter_cls(
+                name="vllm:spec_decode_num_draft_tokens",
+                documentation="Number of draft tokens.",
+                labelnames=labelnames,).labels(*labelvalues)
+        self.counter_spec_decode_num_accepted_tokens = \
+            self._counter_cls(
+                name="vllm:spec_decode_num_accepted_tokens",
+                documentation="Number of accepted tokens.",
+                labelnames=labelnames).labels(*labelvalues)
+        assert speculative_config is not None
+        num_spec_tokens = (speculative_config.num_speculative_tokens
+                           if self.spec_decoding_enabled else 0)
+        pos_labelnames = labelnames + ["position"]
+        base_counter = self._counter_cls(
+            name="vllm:spec_decode_num_accepted_tokens_per_pos",
+            documentation="Accepted tokens per draft position.",
+            labelnames=pos_labelnames,
+        )
+        self.counter_spec_decode_num_accepted_tokens_per_pos: list[
+            prometheus_client.Counter] = []
+        for pos in range(num_spec_tokens):
+            pos_labelvalues = labelvalues + [str(pos)]
+            self.counter_spec_decode_num_accepted_tokens_per_pos.append(
+                base_counter.labels(*pos_labelvalues))
+    def observe(self, spec_decoding_stats: SpecDecodingStats):
+        if not self.spec_decoding_enabled:
+            return
+        self.counter_spec_decode_num_drafts.inc(spec_decoding_stats.num_drafts)
+        self.counter_spec_decode_num_draft_tokens.inc(
+            spec_decoding_stats.num_draft_tokens)
+        self.counter_spec_decode_num_accepted_tokens.inc(
+            spec_decoding_stats.num_accepted_tokens)
+        for pos, counter in enumerate(
+                self.counter_spec_decode_num_accepted_tokens_per_pos):
+            counter.inc(spec_decoding_stats.num_accepted_tokens_per_pos[pos])

vllm/v1/spec_decode/ngram_proposer.py ADDED Viewed

@@ -0,0 +1,131 @@
+# SPDX-License-Identifier: Apache-2.0
+from typing import Optional
+import numpy as np
+from numba import jit
+from vllm.config import VllmConfig
+class NgramProposer:
+    def __init__(self, vllm_config: VllmConfig):
+        # Minimum length of the n-gram to match.
+        self.min_n = vllm_config.speculative_config.prompt_lookup_min
+        # Maximum length of the n-gram to match.
+        self.max_n = vllm_config.speculative_config.prompt_lookup_max
+        # Number of tokens follow the match. If there are less than k
+        # tokens follow the match, we will return the maximum amount of
+        # tokens until the end.
+        self.k = vllm_config.speculative_config.num_speculative_tokens
+        # Maximum length of the model.
+        self.max_model_len = vllm_config.model_config.max_model_len
+        # Trigger Numba JIT compilation for N-gram proposer.
+        # This usually takes less than 1 second.
+        self.propose(np.zeros(1024, dtype=np.int32))
+    def propose(
+        self,
+        context_token_ids: np.ndarray,
+    ) -> Optional[np.ndarray]:
+        """Proposes the next sequence of tokens based on n-gram pattern
+        matching in the context. The function finds matches of the last n
+        tokens in the previous context, and returns k tokens that followed
+        that match.
+        Args:
+            context_token_ids: Numpy array of token IDs representing the
+                               context sequence.
+        Returns:
+            np.ndarray: The sequence of tokens that followed
+                        the matched n-gram in the context.
+            None: If no matching n-gram pattern is found.
+        Example:
+            If context_token_ids = [1,2,3,4,2,3], min_n = 2, max_n = 3, and
+            k = 4:
+            - The last 3 (= max_n) tokens [4,2,3] cannot find a match.
+            - The last 2 tokens [2,3] will be matched against the previous
+              4 tokens [1,2,3,4].
+            - Finding a match of [2,3] would return the tokens that
+              followed that pattern. Here we will return [4,2,3] because
+              we only have three tokens after the match.
+        """
+        # Do not generate draft tokens beyond the max model length.
+        k = min(self.k, self.max_model_len - context_token_ids.shape[0])
+        if k <= 0:
+            return None
+        # TODO(woosuk): Optimize this.
+        for n in range(self.max_n, self.min_n - 1, -1):
+            result = _find_subarray_kmp(context_token_ids, n, k)
+            if result is not None:
+                return result
+        return None
+    def load_model(self, *args, **kwargs):
+        # No model to load.
+        pass
+@jit(nopython=True)
+def _kmp_lps_array(pattern: np.ndarray) -> np.ndarray:
+    """
+    Build the lps (longest proper prefix which is also suffix)
+    array for the pattern.
+    Entry i holds the length of the longest proper prefix of
+    pattern[:i + 1] that is also a suffix of it. The result is an int32
+    array with one entry per pattern element; entry 0 is always 0.
+    """
+    lps = np.zeros(len(pattern), dtype=np.int32)
+    prev_lps = 0  # length of the previous longest prefix suffix
+    i = 1
+    while i < len(pattern):
+        if pattern[i] == pattern[prev_lps]:
+            # The current prefix-suffix can be extended by one element.
+            prev_lps += 1
+            lps[i] = prev_lps
+            i += 1
+        else:
+            if prev_lps != 0:
+                # Fall back to the next shorter candidate and retry the
+                # same i (i is deliberately not advanced here).
+                prev_lps = lps[prev_lps - 1]
+            else:
+                # No prefix matches a suffix ending at i.
+                lps[i] = 0
+                i += 1
+    return lps
+@jit(nopython=True)
+def _find_subarray_kmp(
+    context_token_ids: np.ndarray,
+    n: int,
+    k: int,
+) -> Optional[np.ndarray]:
+    """Find the context's last n tokens earlier in the context (KMP).
+    The pattern is the trailing n tokens; only the part of the context
+    before that trailing n-gram is scanned. On the first match, the slice
+    of up to k tokens starting right after the match is returned (it is
+    shorter than k when fewer tokens remain). Returns None when there is
+    no match.
+    """
+    context_len = context_token_ids.shape[0]
+    assert n > 0
+    pattern = context_token_ids[-n:]
+    # Precompute the lps array for the pattern
+    lps = _kmp_lps_array(pattern)
+    i = 0  # read position in the context
+    j = 0  # number of pattern elements matched so far
+    # -n because the last n tokens are used as pattern
+    while i < context_len - n:
+        if context_token_ids[i] == pattern[j]:
+            i += 1
+            j += 1
+            # If we have matched the entire pattern
+            if j == n:
+                # Found pattern in context, gather the next K elements
+                return context_token_ids[i:i + k]
+        else:
+            # Mismatch
+            if j != 0:
+                # Use the lps array to avoid re-checking elements
+                j = lps[j - 1]
+            else:
+                i += 1
+    # Pattern not found
+    return None

vllm/v1/spec_decode/utils.py ADDED Viewed

@@ -0,0 +1,45 @@
+# SPDX-License-Identifier: Apache-2.0
+from vllm.triton_utils import tl, triton
+from vllm.v1.worker.gpu_input_batch import InputBatch
+def is_spec_decode_supported(req_id: str, input_batch: InputBatch) -> bool:
+    if req_id in input_batch.min_p_reqs:
+        # Spec decode doesn't support min_p sampling.
+        return False
+    elif (req_id in input_batch.frequency_penalties_reqs
+          or req_id in input_batch.presence_penalties_reqs
+          or req_id in input_batch.repetition_penalties_reqs):
+        # Spec decode doesn't support penalties.
+        return False
+    elif req_id in input_batch.num_logprobs:
+        # Spec decode doesn't support logprobs.
+        return False
+    return True
+@triton.jit
+def prepare_eagle_input_kernel(
+    out_ptr,
+    cu_query_lens_ptr,
+    cu_num_tokens_ptr,
+    BLOCK_SIZE: tl.constexpr,
+):
+    # Each program handles one entry `pid`: it writes the consecutive
+    # values index_start, index_start + 1, ... into
+    # out_ptr[start_pos:end_pos].
+    # NOTE(review): the names suggest both pointer arguments are
+    # cumulative-sum arrays with one entry per request — confirm at the
+    # call site.
+    pid = tl.program_id(0)
+    # [start_pos, end_pos)
+    start_pos = tl.load(cu_num_tokens_ptr + pid)
+    end_pos = tl.load(cu_num_tokens_ptr + pid + 1)
+    num_tokens = end_pos - start_pos
+    # First value to write for this program.
+    index_start = tl.load(cu_query_lens_ptr + pid)
+    # Cover the output range in chunks of BLOCK_SIZE elements.
+    num_blocks = tl.cdiv(num_tokens, BLOCK_SIZE)
+    for i in tl.range(num_blocks):
+        offset = i * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
+        tl.store(
+            out_ptr + start_pos + offset,
+            index_start + offset,
+            # Mask off lanes past the end of this program's range.
+            mask=offset < num_tokens,
+        )

vllm/v1/structured_output/__init__.py ADDED Viewed

@@ -0,0 +1,215 @@
+# SPDX-License-Identifier: Apache-2.0
+from __future__ import annotations
+import multiprocessing
+from concurrent.futures import ThreadPoolExecutor
+from typing import TYPE_CHECKING, Optional
+from vllm.config import VllmConfig
+from vllm.logger import init_logger
+from vllm.reasoning import ReasoningParserManager
+from vllm.transformers_utils.tokenizer_group import init_tokenizer_from_configs
+from vllm.utils import LazyLoader
+from vllm.v1.structured_output.backend_guidance import GuidanceBackend
+from vllm.v1.structured_output.backend_types import (StructuredOutputBackend,
+                                                     StructuredOutputGrammar)
+from vllm.v1.structured_output.backend_xgrammar import XgrammarBackend
+if TYPE_CHECKING:
+    import numpy as np
+    import numpy.typing as npt
+    import torch
+    from vllm.reasoning import ReasoningParser
+    from vllm.v1.request import Request
+else:
+    torch = LazyLoader("torch", globals(), "torch")
+logger = init_logger(__name__)
+class StructuredOutputManager:
+    """Engine-level manager for structured output requests."""
+    def __init__(self, vllm_config: VllmConfig):
+        self.backend: Optional[StructuredOutputBackend] = None
+        self.reasoner: Optional[ReasoningParser] = None
+        self.vllm_config = vllm_config
+        self._grammar_bitmask: Optional[torch.Tensor] = None
+        self._full_mask = torch.tensor(-1, dtype=torch.int32)
+        # The default max_workers if not specified is the number of CPUs * 5,
+        # which is way too high since these tasks are CPU-bound, not I/O bound.
+        # We also know we would never dominate CPU usage with just grammar
+        # compilation, so we set it to half the number of CPUs.
+        max_workers = max(1, (multiprocessing.cpu_count() + 1) // 2)
+        self.executor = ThreadPoolExecutor(max_workers=max_workers)
+        self.tokenizer = init_tokenizer_from_configs(
+            model_config=self.vllm_config.model_config,
+            scheduler_config=self.vllm_config.scheduler_config,
+            lora_config=self.vllm_config.lora_config,
+        ).get_lora_tokenizer(None)
+        reasoning_backend = vllm_config.decoding_config.reasoning_backend
+        if reasoning_backend:
+            reasoner_cls = ReasoningParserManager.get_reasoning_parser(
+                reasoning_backend)
+            self.reasoner = reasoner_cls(tokenizer=self.tokenizer)
+    def grammar_init(self, request: Request) -> None:
+        if request.structured_output_request is None:
+            return
+        if TYPE_CHECKING:
+            assert request.sampling_params.guided_decoding is not None
+        # Initialize the backend the first time it is needed.
+        #
+        # NOTE: We only support a single backend. We do NOT support different
+        # backends on a per-request basis in V1 (for now, anyway...).
+        if self.backend is None:
+            backend = request.sampling_params.guided_decoding.backend
+            vocab_size = self.vllm_config.model_config.get_vocab_size()
+            if backend == "xgrammar":
+                self.backend = XgrammarBackend(
+                    self.vllm_config,
+                    tokenizer=self.tokenizer,
+                    vocab_size=vocab_size,
+                )
+            elif backend == "guidance":
+                self.backend = GuidanceBackend(
+                    self.vllm_config,
+                    tokenizer=self.tokenizer,
+                    vocab_size=vocab_size,
+                )
+            else:
+                raise ValueError(
+                    f"Unsupported structured output backend: {backend}")
+        grammar = self.executor.submit(self._async_create_grammar, request)
+        request.structured_output_request.grammar = grammar  # type: ignore[assignment]
+    def _async_create_grammar(
+        self,
+        request: Request,
+    ) -> StructuredOutputGrammar:
+        key = request.structured_output_request.structured_output_key  # type: ignore[union-attr]
+        # Note that the request was validated in the engine core client,
+        # so at this point we know it is a supported type of request.
+        #
+        # TODO: we still need to handle xgrammar compilation failures,
+        # though it should be unlikely as we test that up front as well.
+        request_type, grammar_spec = key
+        assert self.backend is not None
+        return self.backend.compile_grammar(request_type, grammar_spec)
+    def grammar_bitmask(
+        self,
+        requests: dict[str, Request],
+        structured_output_request_ids: dict[str, int],
+        scheduled_spec_decode_tokens: dict[str, list[int]],
+    ) -> Optional[npt.NDArray[np.int32]]:
+        # Prepare the structured output bitmask for this batch.
+        if not structured_output_request_ids:
+            return None
+        max_num_spec_tokens = 0
+        if self.vllm_config.speculative_config is not None:
+            max_num_spec_tokens = \
+                self.vllm_config.speculative_config.num_speculative_tokens
+        if self._grammar_bitmask is None:
+            assert self.backend is not None
+            max_batch_size = self.vllm_config.scheduler_config.max_num_seqs
+            # Allocate a bitmask for each token needing to be checked:
+            # one for each speculative position, and one more for the
+            # bonus token / non-speculative token.
+            self._grammar_bitmask = \
+                self.backend.allocate_token_bitmask(
+                    max_batch_size * (1 + max_num_spec_tokens))
+        bitmask_tensor = self._grammar_bitmask
+        # Generate a batched bitmask for all structured output requests.
+        # When speculative decoding is enabled, we need to include multiple
+        # masks for each request, one for each possible bonus token position.
+        # These are stored inline in the tensor and unpacked by the gpu runner.
+        cumulative_index = 0
+        ordered_seq = sorted(structured_output_request_ids.items(),
+                             key=lambda x: x[1])
+        # Note that for thinking support, we will need to
+        # reset the relevant part of the bitmask for consequent
+        # request here.
+        bitmask_tensor[:(len(ordered_seq) * (1 + max_num_spec_tokens))].fill_(
+            self._full_mask)
+        # NOTE: This outer loop can likely be parallelized to improve
+        # performance of bitmask generation for large batches.
+        for req_id, _ in ordered_seq:
+            request = requests[req_id].structured_output_request
+            if TYPE_CHECKING:
+                assert request is not None
+                assert request.grammar is not None
+            apply_bitmask = (
+                request.reasoning_ended if self.reasoner is not None else True
+            )  # noqa: E501
+            state_advancements = 0
+            req_tokens = scheduled_spec_decode_tokens.get(req_id, []) + [None]
+            for i, token in enumerate(req_tokens):
+                if apply_bitmask and not request.grammar.is_terminated():
+                    request.grammar.fill_bitmask(bitmask_tensor,
+                                                 cumulative_index)
+                    if token is not None:
+                        # In order to generate the correct bitmask for each
+                        # position in the speculative sequence, we advance
+                        # the FSM state for each speculative token and rollback
+                        # to restore the previous state when we are finished.
+                        assert request.grammar.accept_tokens(req_id, [token])
+                        state_advancements += 1
+                cumulative_index += 1
+            if state_advancements > 0:
+                request.grammar.rollback(state_advancements)
+        if cumulative_index < bitmask_tensor.shape[0]:
+            bitmask_tensor = bitmask_tensor[:cumulative_index]
+        # After finishing with the xgrammar operations, we convert to
+        # np.ndarray, because that is much more efficient for serialization
+        # and deserialization when sending this to the GPU workers.
+        return bitmask_tensor.numpy()
+    def should_advance(self, request: Request) -> bool:
+        if not request.use_structured_output:
+            return False
+        # To determine whether we can advance the FSM.
+        # Supports thinking usage where we skip the reasoning components.
+        if TYPE_CHECKING:
+            assert request.structured_output_request is not None
+            assert request.structured_output_request.grammar is not None
+        # by default, we should always advance
+        # for cases that doesn't uses thinking mode.
+        if self.reasoner is not None:
+            structured_req = request.structured_output_request
+            if structured_req.reasoning_ended:
+                return True
+            # Check if reasoning ends in *this* step
+            if self.reasoner.is_reasoning_end(request.all_token_ids):
+                # Reasoning just ended, so we shouldn't advanced til
+                # next pass
+                structured_req.reasoning_ended = True
+            return False
+        else:
+            return True
+    def clear_backend(self) -> None:
+        """Call destroy() on the backend, if one has been created."""
+        if self.backend is not None:
+            self.backend.destroy()