PyPI - vllm-cpu - Versions diffs - 0.8.5.post2__cp310-cp310-manylinux_2_17_x86_64.whl - Mend

vllm-cpu 0.8.5.post2__cp310-cp310-manylinux_2_17_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of vllm-cpu might be problematic. Click here for more details.

Files changed (1103) hide show

vllm/v1/spec_decode/metrics.py ADDED Viewed

@@ -0,0 +1,164 @@
+# SPDX-License-Identifier: Apache-2.0
+from dataclasses import dataclass, field
+from typing import Optional
+import numpy as np
+import prometheus_client
+from vllm.config import SpeculativeConfig
+from vllm.logger import init_logger
+# Module-level logger, created through init_logger and named after this
+# module. SpecDecodingLogging.log() uses logger.info as its default sink.
+logger = init_logger(__name__)
+@dataclass
+class SpecDecodingStats:
+    """Per-step iteration decoding stats from scheduler.
+    Each scheduler step, statistics on spec decoding performance are
+    aggregated across requests by the scheduler and returned to the
+    frontend in EngineCoreOutputs->SchedulerStats.
+    Attributes:
+        num_spec_tokens: Upper bound on the accepted tokens of one draft;
+            ``new()`` also uses it as the length of
+            ``num_accepted_tokens_per_pos``.
+        num_drafts: Number of drafts recorded via ``observe_draft()``.
+        num_draft_tokens: Sum of the draft token counts recorded so far.
+        num_accepted_tokens: Sum of the accepted token counts recorded so far.
+        num_accepted_tokens_per_pos: Slot ``i`` counts the recorded drafts
+            that had more than ``i`` accepted tokens.
+    """
+    num_spec_tokens: int
+    num_drafts: int = 0
+    num_draft_tokens: int = 0
+    num_accepted_tokens: int = 0
+    num_accepted_tokens_per_pos: list[int] = field(default_factory=list)
+    @classmethod
+    def new(cls, num_spec_tokens: int) -> "SpecDecodingStats":
+        """Return zeroed stats with one per-position slot per spec token."""
+        return cls(num_spec_tokens=num_spec_tokens,
+                   num_accepted_tokens_per_pos=[0] * num_spec_tokens)
+    def observe_draft(self, num_draft_tokens: int, num_accepted_tokens: int):
+        """Record a single draft and how many of its tokens were accepted.
+        Args:
+            num_draft_tokens: Number of tokens proposed in this draft.
+            num_accepted_tokens: Number of those tokens that were accepted;
+                must not exceed ``num_spec_tokens``.
+        Raises:
+            AssertionError: If ``num_accepted_tokens`` exceeds
+                ``num_spec_tokens``. No counter is modified in that case.
+        """
+        # Check the invariant before touching any counter so that a rejected
+        # observation cannot leave the totals partially updated.
+        assert num_accepted_tokens <= self.num_spec_tokens
+        self.num_drafts += 1
+        self.num_draft_tokens += num_draft_tokens
+        self.num_accepted_tokens += num_accepted_tokens
+        # Every position below the accepted count saw an accepted token.
+        for i in range(num_accepted_tokens):
+            self.num_accepted_tokens_per_pos[i] += 1
+class SpecDecodingLogging:
+    """Aggregate and log spec decoding metrics.
+    LoggingStatLogger aggregates per-iteration metrics over a set
+    time interval using observe() and then logs them using log()
+    before resetting to zero.
+    """
+    def __init__(self):
+        self.reset()
+    def reset(self):
+        """Discard every buffered per-iteration sample."""
+        self.num_drafts: list[int] = []
+        self.num_draft_tokens: list[int] = []
+        self.num_accepted_tokens: list[int] = []
+        self.accepted_tokens_per_pos_lists: list[list[int]] = []
+    def observe(self, spec_decoding_stats: SpecDecodingStats):
+        """Buffer the counters of one ``SpecDecodingStats`` sample.
+        The per-position list is stored by reference, not copied.
+        """
+        self.num_drafts.append(spec_decoding_stats.num_drafts)
+        self.num_draft_tokens.append(spec_decoding_stats.num_draft_tokens)
+        self.num_accepted_tokens.append(
+            spec_decoding_stats.num_accepted_tokens)
+        self.accepted_tokens_per_pos_lists.append(
+            spec_decoding_stats.num_accepted_tokens_per_pos)
+    def log(self, log_fn=logger.info):
+        """Emit one summary line for the buffered samples, then reset.
+        Does nothing when no sample was observed since the last reset.
+        Args:
+            log_fn: Callable taking a %-style format string followed by its
+                arguments. Defaults to ``logger.info``.
+        """
+        if not self.num_drafts:
+            # With no samples the per-position sum below collapses to a
+            # scalar and iterating over it raises TypeError; there is also
+            # nothing meaningful to report, so skip the log line.
+            return
+        num_drafts = np.sum(self.num_drafts)
+        num_draft_tokens = np.sum(self.num_draft_tokens)
+        num_accepted_tokens = np.sum(self.num_accepted_tokens)
+        draft_acceptance_rate = (num_accepted_tokens / num_draft_tokens *
+                                 100 if num_draft_tokens > 0 else float("nan"))
+        # One row per sample, one column per draft position.
+        pos_matrix = np.array(self.accepted_tokens_per_pos_lists)
+        accepted_per_pos = np.sum(pos_matrix, axis=0)
+        if num_drafts > 0:
+            mean_acceptance_length = num_accepted_tokens / num_drafts
+            acceptance_rates = accepted_per_pos / num_drafts
+        else:
+            # Samples exist but none recorded a draft: report NaN explicitly
+            # (same rendered output) instead of dividing by zero.
+            mean_acceptance_length = float("nan")
+            acceptance_rates = accepted_per_pos * float("nan")
+        rates_str = ", ".join(f"{p:.3f}" for p in acceptance_rates)
+        log_fn(
+            "SpecDecoding metrics: "
+            "Draft acceptance rate: %.1f%%, "
+            "Mean acceptance length: %.2f, "
+            "Accepted: %d tokens, "
+            "Drafted: %d tokens, "
+            "Per-position acceptance rate: %s",
+            draft_acceptance_rate,
+            mean_acceptance_length,
+            num_accepted_tokens,
+            num_draft_tokens,
+            rates_str,
+        )
+        self.reset()
+class SpecDecodingProm:
+    """Record spec decoding metrics in Prometheus.
+    The queries below use the counter names as registered in ``__init__``;
+    prometheus_client exposes counters with a ``_total`` suffix.
+    The acceptance rate can be calculated using a PromQL query:
+      rate(vllm:spec_decode_num_accepted_tokens_total[$interval]) /
+      rate(vllm:spec_decode_num_draft_tokens_total[$interval])
+    The mean acceptance length can be calculated using:
+      rate(vllm:spec_decode_num_accepted_tokens_total[$interval]) /
+      rate(vllm:spec_decode_num_drafts_total[$interval])
+    A per-position acceptance rate vector can be computed using
+      rate(vllm:spec_decode_num_accepted_tokens_per_pos_total[$interval])
+        / ignoring(position) group_left
+      rate(vllm:spec_decode_num_drafts_total[$interval])
+    """
+    def __init__(self, speculative_config: Optional[SpeculativeConfig],
+                 labelnames: list[str], labelvalues: list[str]):
+        """Register the spec decoding counters.
+        Args:
+            speculative_config: Speculative decoding configuration, or None
+                when spec decoding is disabled; in that case no counter is
+                created and ``observe()`` becomes a no-op.
+            labelnames: Label names attached to every counter.
+            labelvalues: Values for ``labelnames``, in the same order.
+        """
+        self.spec_decoding_enabled = speculative_config is not None
+        if not self.spec_decoding_enabled:
+            return
+        self.counter_spec_decode_num_drafts = \
+            prometheus_client.Counter(
+                name="vllm:spec_decode_num_drafts_total",
+                documentation="Number of spec decoding drafts.",
+                labelnames=labelnames).labels(*labelvalues)
+        self.counter_spec_decode_num_draft_tokens = \
+            prometheus_client.Counter(
+                name="vllm:spec_decode_num_draft_tokens_total",
+                documentation="Number of draft tokens.",
+                labelnames=labelnames).labels(*labelvalues)
+        self.counter_spec_decode_num_accepted_tokens = \
+            prometheus_client.Counter(
+                name="vllm:spec_decode_num_accepted_tokens_total",
+                documentation="Number of accepted tokens.",
+                labelnames=labelnames).labels(*labelvalues)
+        # Always true after the early return above; kept so that static
+        # type checkers narrow the Optional.
+        assert speculative_config is not None
+        num_spec_tokens = speculative_config.num_speculative_tokens
+        # The per-position counter carries one extra "position" label; one
+        # labelled child is pre-created for each draft position.
+        pos_labelnames = labelnames + ["position"]
+        base_counter = prometheus_client.Counter(
+            name="vllm:spec_decode_num_accepted_tokens_per_pos",
+            documentation="Accepted tokens per draft position.",
+            labelnames=pos_labelnames)
+        self.counter_spec_decode_num_accepted_tokens_per_pos: \
+            list[prometheus_client.Counter] = []
+        for pos in range(num_spec_tokens):
+            pos_labelvalues = labelvalues + [str(pos)]
+            self.counter_spec_decode_num_accepted_tokens_per_pos.append(
+                base_counter.labels(*pos_labelvalues))
+    def observe(self, spec_decoding_stats: SpecDecodingStats):
+        """Add one stats sample to the counters.
+        Does nothing when spec decoding is disabled.
+        """
+        if not self.spec_decoding_enabled:
+            return
+        self.counter_spec_decode_num_drafts.inc(spec_decoding_stats.num_drafts)
+        self.counter_spec_decode_num_draft_tokens.inc(
+            spec_decoding_stats.num_draft_tokens)
+        self.counter_spec_decode_num_accepted_tokens.inc(
+            spec_decoding_stats.num_accepted_tokens)
+        # The sample is indexed by counter position, so it must provide at
+        # least one entry per pre-created position counter.
+        for pos, counter in enumerate(
+                self.counter_spec_decode_num_accepted_tokens_per_pos):
+            counter.inc(spec_decoding_stats.num_accepted_tokens_per_pos[pos])

vllm/v1/spec_decode/ngram_proposer.py ADDED Viewed

@@ -0,0 +1,131 @@
+# SPDX-License-Identifier: Apache-2.0
+from typing import Optional
+import numpy as np
+from numba import jit
+from vllm.config import VllmConfig
+class NgramProposer:
+    """Propose draft tokens by matching the context's trailing n-gram
+    against an earlier occurrence in the same context.
+    """
+    def __init__(self, vllm_config: VllmConfig):
+        """Read the lookup settings from ``vllm_config`` and warm up the
+        JIT-compiled search kernel.
+        """
+        # Minimum length of the n-gram to match.
+        self.min_n = vllm_config.speculative_config.prompt_lookup_min
+        # Maximum length of the n-gram to match.
+        self.max_n = vllm_config.speculative_config.prompt_lookup_max
+        # Number of tokens that follow the match. If fewer than k tokens
+        # follow the match, we return as many tokens as there are until
+        # the end.
+        self.k = vllm_config.speculative_config.num_speculative_tokens
+        # Maximum length of the model.
+        self.max_model_len = vllm_config.model_config.max_model_len
+        # Trigger Numba JIT compilation for N-gram proposer.
+        # This usually takes less than 1 second.
+        # The warm-up context must be shorter than max_model_len, otherwise
+        # propose() returns early (k <= 0) without ever calling the kernel.
+        warmup_len = max(1, min(1024, self.max_model_len - 1))
+        self.propose(np.zeros(warmup_len, dtype=np.int32))
+    def propose(
+        self,
+        context_token_ids: np.ndarray,
+    ) -> Optional[np.ndarray]:
+        """Proposes the next sequence of tokens based on n-gram pattern
+        matching in the context. The function finds matches of the last n
+        tokens in the previous context, and returns k tokens that followed
+        that match.
+        Args:
+            context_token_ids: Numpy array of token IDs representing the
+                               context sequence.
+        Returns:
+            np.ndarray: The sequence of tokens that followed
+                        the matched n-gram in the context.
+            None: If no matching n-gram pattern is found, or if the context
+                  already fills the maximum model length.
+        Example:
+            If context_token_ids = [1,2,3,4,2,3], min_n = 2, max_n = 3, and
+            k = 4:
+            - The last 3 (= max_n) tokens [4,2,3] cannot find a match.
+            - The last 2 tokens [2,3] will be matched against the previous
+              4 tokens [1,2,3,4].
+            - Finding a match of [2,3] would return the tokens that
+              followed that pattern. Here we will return [4,2,3] because
+              we only have three tokens after the match.
+        """
+        # Do not generate draft tokens beyond the max model length.
+        k = min(self.k, self.max_model_len - context_token_ids.shape[0])
+        if k <= 0:
+            return None
+        # TODO(woosuk): Optimize this.
+        # Try the longest n-gram first and fall back to shorter ones.
+        for n in range(self.max_n, self.min_n - 1, -1):
+            result = _find_subarray_kmp(context_token_ids, n, k)
+            if result is not None:
+                return result
+        return None
+    def load_model(self, *args, **kwargs):
+        """Accept and ignore any arguments; there is no model to load."""
+        # No model to load.
+        pass
+@jit(nopython=True)
+def _kmp_lps_array(pattern: np.ndarray) -> np.ndarray:
+    """
+    Compute the KMP failure table of ``pattern``: entry ``i`` is the length
+    of the longest proper prefix of ``pattern[:i + 1]`` that is also a
+    suffix of it.
+    """
+    size = len(pattern)
+    table = np.zeros(size, dtype=np.int32)
+    border = 0  # length of the prefix/suffix overlap being extended
+    idx = 1
+    while idx < size:
+        if pattern[idx] == pattern[border]:
+            border += 1
+            table[idx] = border
+            idx += 1
+        elif border != 0:
+            # Fall back to the next shorter overlap and retry this index.
+            border = table[border - 1]
+        else:
+            # No overlap ends here; the entry keeps its initial zero.
+            idx += 1
+    return table
+@jit(nopython=True)
+def _find_subarray_kmp(
+    context_token_ids: np.ndarray,
+    n: int,
+    k: int,
+) -> Optional[np.ndarray]:
+    """
+    Search for the last ``n`` tokens of the context earlier in the context
+    and return up to ``k`` tokens that follow the first occurrence found,
+    or None when there is no such occurrence.
+    """
+    assert n > 0
+    total = context_token_ids.shape[0]
+    suffix = context_token_ids[-n:]
+    # Failure table of the suffix, used to resume after a mismatch.
+    fallback = _kmp_lps_array(suffix)
+    # The last n tokens are the pattern itself, so scanning stops there.
+    scan_end = total - n
+    pos = 0
+    matched = 0
+    while pos < scan_end:
+        if context_token_ids[pos] != suffix[matched]:
+            if matched == 0:
+                pos += 1
+            else:
+                # Reuse the already matched prefix instead of rescanning.
+                matched = fallback[matched - 1]
+            continue
+        pos += 1
+        matched += 1
+        if matched == n:
+            # Whole suffix matched: hand back the tokens right after it.
+            return context_token_ids[pos:pos + k]
+    # No earlier occurrence of the suffix.
+    return None

vllm/v1/spec_decode/utils.py ADDED Viewed

@@ -0,0 +1,18 @@
+# SPDX-License-Identifier: Apache-2.0
+from vllm.v1.worker.gpu_input_batch import InputBatch
+def is_spec_decode_supported(req_id: str, input_batch: InputBatch) -> bool:
+    """Return False when ``req_id`` is registered in ``input_batch`` for
+    min_p sampling, any penalty, or logprobs; True otherwise.
+    """
+    unsupported = (
+        # Spec decode doesn't support min_p sampling.
+        req_id in input_batch.min_p_reqs
+        # Spec decode doesn't support penalties.
+        or req_id in input_batch.frequency_penalties_reqs
+        or req_id in input_batch.presence_penalties_reqs
+        or req_id in input_batch.repetition_penalties_reqs
+        # Spec decode doesn't support logprobs.
+        or req_id in input_batch.num_logprobs)
+    return not unsupported

vllm/v1/stats/__init__.py ADDED Viewed

File without changes