PyPI - vllm-cpu-amxbf16 - Versions diffs - 0.11.2.post2__cp310-cp310-manylinux_2_17_x86_64.whl - Mend

vllm-cpu-amxbf16 0.11.2.post2__cp310-cp310-manylinux_2_17_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (1536) hide show

vllm/v1/sample/logits_processor/builtin.py ADDED Viewed

@@ -0,0 +1,274 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from collections.abc import Callable, Sequence
+from typing import TYPE_CHECKING, TypeVar
+import torch
+from vllm import SamplingParams
+from vllm.v1.sample.logits_processor.interface import (
+    BatchUpdate,
+    LogitsProcessor,
+    MoveDirectionality,
+)
+if TYPE_CHECKING:
+    from vllm.config import VllmConfig
+T = TypeVar("T")
+class MinPLogitsProcessor(LogitsProcessor):
+    """Built-in logits processor that applies min-p filtering.
+    For each batch row, every token whose softmax probability is lower than
+    `min_p * (largest probability in that row)` has its logit set to -inf.
+    Per-slot `min_p` values are kept in a CPU tensor (`min_p_cpu_tensor`,
+    also accessed through the NumPy array `min_p_cpu`). When the target
+    device is not a CPU device, a second pre-allocated tensor on that device
+    receives a copy. `min_p_count` is the number of slots with a non-zero
+    `min_p`; while it is zero `apply()` returns the logits untouched.
+    """
+    def __init__(
+        self, vllm_config: "VllmConfig", device: torch.device, is_pin_memory: bool
+    ):
+        """Allocate the per-slot min-p buffers.
+        Args:
+            vllm_config: Supplies `scheduler_config.max_num_seqs`, which is
+                used as the buffer length.
+            device: Target device; a separate buffer is allocated on it unless
+                its type is "cpu".
+            is_pin_memory: Forwarded as `pin_memory` for the CPU tensor.
+        """
+        max_num_reqs = vllm_config.scheduler_config.max_num_seqs
+        # Number of slots whose min_p is currently non-zero.
+        self.min_p_count: int = 0
+        self.min_p_cpu_tensor = torch.zeros(
+            (max_num_reqs,), dtype=torch.float32, device="cpu", pin_memory=is_pin_memory
+        )
+        # NumPy array over the CPU tensor, used for per-index reads/writes.
+        self.min_p_cpu = self.min_p_cpu_tensor.numpy()
+        # A second (device-side) buffer is only needed off-CPU.
+        self.use_double_tensor = torch.device(device).type != "cpu"
+        if self.use_double_tensor:
+            # Pre-allocated device tensor
+            self.min_p_device: torch.Tensor = torch.empty(
+                (max_num_reqs,), dtype=torch.float32, device=device
+            )
+        else:
+            # On CPU the "device" tensor is the CPU tensor itself.
+            self.min_p_device = self.min_p_cpu_tensor
+        # Current slice of the device tensor
+        self.min_p: torch.Tensor = self.min_p_device[:0]
+    def is_argmax_invariant(self) -> bool:
+        """Min-p never impacts greedy sampling"""
+        return True
+    def get_min_p_by_index(self, index: int) -> float:
+        """Return the min-p value currently stored for batch slot `index`."""
+        return float(self.min_p_cpu[index])
+    def update_state(self, batch_update: BatchUpdate | None):
+        """Fold a batch update into the stored min-p values.
+        Added requests are always processed. Removed and moved requests are
+        only processed while at least one slot holds a non-zero min-p. The
+        `min_p` slice used by `apply()` is rebuilt when a value changed or the
+        batch size differs from the current slice length.
+        Args:
+            batch_update: Batch changes to apply; nothing happens when falsy.
+        """
+        if not batch_update:
+            return
+        needs_update = False
+        # Process added requests.
+        for index, params, _, _ in batch_update.added:
+            min_p = params.min_p
+            min_p_before = self.min_p_cpu[index]
+            if min_p_before != min_p:
+                needs_update = True
+                self.min_p_cpu[index] = min_p
+                # Keep the non-zero counter in sync with the overwritten slot.
+                if min_p and not min_p_before:
+                    self.min_p_count += 1
+                elif not min_p and min_p_before:
+                    self.min_p_count -= 1
+        if self.min_p_count:
+            # Process removed requests.
+            if batch_update.removed:
+                needs_update = True
+                for index in batch_update.removed:
+                    if self.min_p_cpu[index]:
+                        self.min_p_cpu[index] = 0
+                        self.min_p_count -= 1
+            # Process moved requests, unidirectional (a->b) and swap (a<->b).
+            for adx, bdx, direct in batch_update.moved:
+                min_p_a, min_p_b = self.min_p_cpu[adx], self.min_p_cpu[bdx]
+                if min_p_a != min_p_b:
+                    needs_update = True
+                    # b receives a's value; on a swap a also receives b's.
+                    self.min_p_cpu[bdx] = min_p_a
+                    if direct == MoveDirectionality.SWAP:
+                        self.min_p_cpu[adx] = min_p_b
+                if direct == MoveDirectionality.UNIDIRECTIONAL:
+                    # One-way move: slot a is cleared, and b's previous value
+                    # was overwritten, so a non-zero b no longer counts.
+                    if min_p_a:
+                        self.min_p_cpu[adx] = 0
+                    if min_p_b:
+                        self.min_p_count -= 1
+        # Update tensors if needed.
+        size = batch_update.batch_size
+        if self.min_p_count and (needs_update or self.min_p.shape[0] != size):
+            self.min_p = self.min_p_device[:size]
+            if self.use_double_tensor:
+                self.min_p.copy_(self.min_p_cpu_tensor[:size], non_blocking=True)
+            # Add a trailing dim so the slice lines up with the keepdim=True
+            # row maxima computed in apply().
+            self.min_p.unsqueeze_(1)
+    def apply(self, logits: torch.Tensor) -> torch.Tensor:
+        """Mask tokens below the min-p threshold; `logits` is modified in place.
+        Returns the same tensor. Rows whose min-p is 0 are never masked because
+        no probability is below a threshold of 0.
+        """
+        if not self.min_p_count:
+            return logits
+        # Convert logits to probability distribution
+        probability_values = torch.nn.functional.softmax(logits, dim=-1)
+        # Calculate maximum probabilities per sequence
+        max_probabilities = torch.amax(probability_values, dim=-1, keepdim=True)
+        # Adjust min_p
+        # (in-place multiply: max_probabilities is reused as the threshold)
+        adjusted_min_p = max_probabilities.mul_(self.min_p)
+        # Identify valid tokens using threshold comparison
+        invalid_token_mask = probability_values < adjusted_min_p
+        # Apply mask using boolean indexing
+        logits[invalid_token_mask] = -float("inf")
+        return logits
+class LogitBiasLogitsProcessor(LogitsProcessor):
+    def __init__(self, _, device: torch.device, is_pin_memory: bool):
+        self.device = device
+        self.pin_memory = is_pin_memory
+        self.biases: dict[int, dict[int, float]] = {}
+        self.bias_tensor: torch.Tensor = torch.tensor(())
+        self.logits_slice = (
+            self._device_tensor([], torch.int32),
+            self._device_tensor([], torch.int32),
+        )
+    def is_argmax_invariant(self) -> bool:
+        """Logit bias can rebalance token probabilities and change the
+        outcome of argmax in greedy sampling."""
+        return False
+    def update_state(self, batch_update: BatchUpdate | None):
+        needs_update = process_dict_updates(
+            self.biases, batch_update, lambda params, _, __: params.logit_bias or None
+        )
+        # Update tensors if needed.
+        if needs_update:
+            reqs: list[int] = []
+            tok_ids: list[int] = []
+            biases: list[float] = []
+            for req, lb in self.biases.items():
+                reqs.extend([req] * len(lb))
+                tok_ids.extend(lb.keys())
+                biases.extend(lb.values())
+            self.bias_tensor = self._device_tensor(biases, torch.float32)
+            self.logits_slice = (
+                self._device_tensor(reqs, torch.int32),
+                self._device_tensor(tok_ids, torch.int32),
+            )
+    def _device_tensor(self, data: list, dtype: torch.dtype) -> torch.Tensor:
+        return torch.tensor(
+            data, device="cpu", dtype=dtype, pin_memory=self.pin_memory
+        ).to(device=self.device, non_blocking=True)
+    def apply(self, logits: torch.Tensor) -> torch.Tensor:
+        if self.biases:
+            logits[self.logits_slice] += self.bias_tensor
+        return logits
+class MinTokensLogitsProcessor(LogitsProcessor):
+    def __init__(
+        self, vllm_config: "VllmConfig", device: torch.device, is_pin_memory: bool
+    ):
+        # index -> (min_toks, output_token_ids, stop_token_ids)
+        self.device = device
+        self.pin_memory = is_pin_memory
+        self.min_toks: dict[int, tuple[int, Sequence[int], set[int]]] = {}
+        # (req_idx_tensor,eos_tok_id_tensor)
+        self.logits_slice: tuple[torch.Tensor, torch.Tensor] = (
+            self._device_tensor([], torch.int32),
+            self._device_tensor([], torch.int32),
+        )
+    def is_argmax_invariant(self) -> bool:
+        """By censoring stop tokens, min-tokens can change the outcome
+        of the argmax operation in greedy sampling."""
+        return False
+    @staticmethod
+    def add_request(
+        params: SamplingParams, _: list[int] | None, output_tok_ids: list[int]
+    ) -> tuple[int, Sequence[int], set[int]] | None:
+        min_tokens = params.min_tokens
+        if not min_tokens or len(output_tok_ids) >= min_tokens:
+            return None
+        return min_tokens, output_tok_ids, params.all_stop_token_ids
+    def update_state(self, batch_update: BatchUpdate | None):
+        needs_update = process_dict_updates(
+            self.min_toks, batch_update, self.add_request
+        )
+        if self.min_toks:
+            # Check for any requests that have attained their min tokens.
+            to_remove = tuple(
+                index
+                for index, (min_toks, out_tok_ids, _) in self.min_toks.items()
+                if len(out_tok_ids) >= min_toks
+            )
+            if to_remove:
+                needs_update = True
+                for index in to_remove:
+                    del self.min_toks[index]
+        # Update tensors if needed.
+        if needs_update:
+            reqs: list[int] = []
+            tok_ids: list[int] = []
+            for req, (_, _, stop_tok_ids) in self.min_toks.items():
+                reqs.extend([req] * len(stop_tok_ids))
+                tok_ids.extend(stop_tok_ids)
+            self.logits_slice = (
+                self._device_tensor(reqs, torch.int32),
+                self._device_tensor(tok_ids, torch.int32),
+            )
+    def _device_tensor(self, data: list, dtype: torch.dtype) -> torch.Tensor:
+        return torch.tensor(
+            data, device="cpu", dtype=dtype, pin_memory=self.pin_memory
+        ).to(device=self.device, non_blocking=True)
+    def apply(self, logits: torch.Tensor) -> torch.Tensor:
+        if self.min_toks:
+            # Inhibit EOS token for requests which have not reached min length
+            logits[self.logits_slice] = -float("inf")
+        return logits
+def process_dict_updates(
+    req_entries: dict[int, T],
+    batch_update: BatchUpdate | None,
+    new_state: Callable[[SamplingParams, list[int] | None, list[int]], T | None],
+) -> bool:
+    """Utility function to update dict state for sparse LogitsProcessors."""
+    if not batch_update:
+        # Nothing to do.
+        return False
+    updated = False
+    for index, params, prompt_tok_ids, output_tok_ids in batch_update.added:
+        if (state := new_state(params, prompt_tok_ids, output_tok_ids)) is not None:
+            req_entries[index] = state
+            updated = True
+        elif req_entries.pop(index, None) is not None:
+            updated = True
+    if req_entries:
+        # Process removed requests.
+        for index in batch_update.removed:
+            if req_entries.pop(index, None):
+                updated = True
+        # Process moved requests, unidirectional (a->b) and
+        # swapped (a<->b)
+        for a_index, b_index, direct in batch_update.moved:
+            a_entry = req_entries.pop(a_index, None)
+            b_entry = req_entries.pop(b_index, None)
+            if a_entry is not None:
+                req_entries[b_index] = a_entry
+                updated = True
+            if b_entry is not None:
+                updated = True
+                if direct == MoveDirectionality.SWAP:
+                    req_entries[a_index] = b_entry
+    return updated

vllm/v1/sample/logits_processor/interface.py ADDED Viewed

@@ -0,0 +1,106 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from abc import ABC, abstractmethod
+from collections.abc import Sequence
+from dataclasses import dataclass
+from enum import Enum, auto
+from typing import TYPE_CHECKING, Optional
+import torch
+from vllm import SamplingParams
+if TYPE_CHECKING:
+    from vllm.config import VllmConfig
+class MoveDirectionality(Enum):
+    """Kind of relocation recorded for a request inside the batch."""
+    # One-way i1->i2 req move within batch
+    UNIDIRECTIONAL = auto()
+    # Two-way i1<->i2 req swap within batch
+    SWAP = auto()
+# Batch index of a single request removed from the batch.
+RemovedRequest = int
+# (index, params, prompt_tok_ids, output_tok_ids) tuple describing one new
+# request added to the batch; prompt_tok_ids may be None.
+AddedRequest = tuple[int, SamplingParams, list[int] | None, list[int]]
+# (index 1, index 2, directionality) tuple representing a one-way move
+# (index 1 -> index 2) or a two-way swap of requests within the batch.
+MovedRequest = tuple[int, int, MoveDirectionality]
+@dataclass(frozen=True)
+class BatchUpdate:
+    """Persistent batch state change info for logitsprocs
+    Immutable record (frozen dataclass) of the requests removed from, added
+    to and moved within the batch, together with the current batch size.
+    """
+    batch_size: int  # Current num reqs in batch
+    # Metadata for requests added to, removed from, and moved
+    # within the persistent batch.
+    #
+    # Key assumption: the `output_tok_ids` list (which is an element of each
+    # tuple in `added`) is a reference to the request's running output tokens
+    # list; via this reference, the logits processors always see the latest
+    # list of generated output tokens.
+    #
+    # NOTE:
+    # * Added or moved requests may replace existing requests with the same
+    #   index.
+    # * Operations should be processed in the following order:
+    #   - removed, added, moved
+    # Batch indices of removed requests.
+    removed: Sequence[RemovedRequest]
+    # (index, params, prompt_tok_ids, output_tok_ids) per added request.
+    added: Sequence[AddedRequest]
+    # (index 1, index 2, directionality) per moved or swapped request.
+    moved: Sequence[MovedRequest]
+class LogitsProcessor(ABC):
+    """Abstract interface for batch-level logits processors.
+    Subclasses must implement `__init__`, `apply`, `is_argmax_invariant` and
+    `update_state`; `validate_params` is an optional hook.
+    """
+    @classmethod
+    def validate_params(cls, sampling_params: SamplingParams):
+        """Validate sampling params for this logits processor.
+        Raise ValueError for invalid ones.
+        The default implementation accepts everything and returns None.
+        """
+        return None
+    @abstractmethod
+    def __init__(
+        self, vllm_config: "VllmConfig", device: torch.device, is_pin_memory: bool
+    ) -> None:
+        """Constructor signature that subclasses are expected to provide.
+        Args:
+            vllm_config: Engine configuration object.
+            device: Device the processor's tensors should target.
+            is_pin_memory: Whether CPU staging tensors should be pinned.
+        """
+        raise NotImplementedError
+    @abstractmethod
+    def apply(self, logits: torch.Tensor) -> torch.Tensor:
+        """Apply LogitsProcessor to batch logits tensor.
+        The updated tensor must be returned but may be
+        modified in-place.
+        """
+        raise NotImplementedError
+    @abstractmethod
+    def is_argmax_invariant(self) -> bool:
+        """True if logits processor has no impact on the
+        argmax computation in greedy sampling.
+        NOTE: may or may not have the same value for all
+        instances of a given LogitsProcessor subclass,
+        depending on subclass implementation.
+        """
+        raise NotImplementedError
+    @abstractmethod
+    def update_state(
+        self,
+        batch_update: Optional["BatchUpdate"],
+    ) -> None:
+        """Called when there are new output tokens, prior
+        to each forward pass.
+        Args:
+            batch_update: Non-None iff there have been changes
+                to the batch makeup.
+        """
+        raise NotImplementedError

vllm/v1/sample/logits_processor/state.py ADDED Viewed

@@ -0,0 +1,165 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from collections.abc import Iterator
+from itertools import chain
+from typing import TYPE_CHECKING
+from vllm.v1.sample.logits_processor.interface import (
+    AddedRequest,
+    BatchUpdate,
+    MovedRequest,
+    RemovedRequest,
+)
+if TYPE_CHECKING:
+    from vllm.v1.sample.logits_processor.interface import LogitsProcessor
+class BatchUpdateBuilder:
+    """Helps track persistent batch state changes and build
+    a batch update data structure for logitsprocs
+    Assumptions:
+    * All information about requests removed from persistent batch
+      during a step is aggregated in self._removed through calls to
+      self.removed_append() at the beginning of a step. This must happen
+      before the first time that self.removed, self.pop_removed()
+      or self.peek_removed() are invoked in a given step
+    * After the first time that self.removed, self.pop_removed()
+      or self.peek_removed() are read in a step, no new removals
+      are registered using self.removed_append()
+    * Elements of self._removed are never directly modified, added or
+      removed (i.e. modification is only via self.removed_append() and
+      self.pop_removed())
+    Guarantees under above assumptions:
+    * self.removed is always sorted in descending order
+    * self.pop_removed() and self.peek_removed() both return
+      the lowest removed request index in the current step
+    """
+    _removed: list[RemovedRequest]
+    _is_removed_sorted: bool
+    added: list[AddedRequest]
+    moved: list[MovedRequest]
+    def __init__(
+        self,
+        removed: list[RemovedRequest] | None = None,
+        added: list[AddedRequest] | None = None,
+        moved: list[MovedRequest] | None = None,
+    ) -> None:
+        self._removed = removed or []
+        self.added = added or []
+        self.moved = moved or []
+        self._is_removed_sorted = False
+        # Used to track changes in the pooling case
+        # where we don't populate the added list.
+        self.batch_changed = False
+    def _ensure_removed_sorted(self) -> None:
+        """Sort removed request indices in
+        descending order.
+        Idempotent after first call in a
+        given step, until reset.
+        """
+        if not self._is_removed_sorted:
+            self._removed.sort(reverse=True)
+            self._is_removed_sorted = True
+    @property
+    def removed(self) -> list[RemovedRequest]:
+        """Removed request indices sorted in
+        descending order"""
+        self._ensure_removed_sorted()
+        return self._removed
+    def removed_append(self, index: int) -> None:
+        """Register the removal of a request from the persistent batch.
+        Must not be called after the first time self.removed,
+        self.pop_removed() or self.peek_removed() are invoked.
+        Args:
+          index: request index
+        """
+        if self._is_removed_sorted:
+            raise RuntimeError(
+                "Cannot register new removed request after self.removed has been read."
+            )
+        self._removed.append(index)
+        self.batch_changed = True
+    def has_removed(self) -> bool:
+        return bool(self._removed)
+    def peek_removed(self) -> int | None:
+        """Return lowest removed request index"""
+        if self.has_removed():
+            self._ensure_removed_sorted()
+            return self._removed[-1]
+        return None
+    def pop_removed(self) -> int | None:
+        """Pop lowest removed request index"""
+        if self.has_removed():
+            self._ensure_removed_sorted()
+            return self._removed.pop()
+        return None
+    def reset(self) -> bool:
+        """Returns True if there were any changes to the batch."""
+        self._is_removed_sorted = False
+        self._removed.clear()
+        self.added.clear()
+        self.moved.clear()
+        batch_changed = self.batch_changed
+        self.batch_changed = False
+        return batch_changed
+    def get_and_reset(self, batch_size: int) -> BatchUpdate | None:
+        """Generate a logitsprocs batch update data structure and reset
+        internal batch update builder state.
+        Args:
+          batch_size: current persistent batch size
+        Returns:
+          Frozen logitsprocs batch update instance; `None` if no updates
+        """
+        # Reset removal-sorting logic
+        self._is_removed_sorted = False
+        self.batch_changed = False
+        if not any((self._removed, self.moved, self.added)):
+            # No update; short-circuit
+            return None
+        # Build batch state update
+        batch_update = BatchUpdate(
+            batch_size=batch_size,
+            removed=self._removed,
+            moved=self.moved,
+            added=self.added,
+        )
+        self._removed = []
+        self.moved = []
+        self.added = []
+        return batch_update
+class LogitsProcessors:
+    """Encapsulates initialized logitsproc objects."""
+    def __init__(self, logitsprocs: Iterator["LogitsProcessor"] | None = None) -> None:
+        self.argmax_invariant: list[LogitsProcessor] = []
+        self.non_argmax_invariant: list[LogitsProcessor] = []
+        if logitsprocs:
+            for logitproc in logitsprocs:
+                (
+                    self.argmax_invariant
+                    if logitproc.is_argmax_invariant()
+                    else self.non_argmax_invariant
+                ).append(logitproc)
+    @property
+    def all(self) -> Iterator["LogitsProcessor"]:
+        """Iterator over all logits processors."""
+        return chain(self.argmax_invariant, self.non_argmax_invariant)

vllm/v1/sample/metadata.py ADDED Viewed

@@ -0,0 +1,44 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from dataclasses import dataclass
+import torch
+from vllm.v1.sample.logits_processor import LogitsProcessors
+@dataclass
+class SamplingMetadata:
+    """Bundle of sampling-related inputs carried as a single object.
+    NOTE(review): the per-field notes marked "presumably" are inferred from
+    field names and types only; confirm against the code that builds and
+    consumes this object.
+    """
+    # presumably per-request temperatures; None is allowed by the type
+    temperature: torch.Tensor | None
+    # presumably: every request uses greedy / random sampling, respectively
+    all_greedy: bool
+    all_random: bool
+    top_p: torch.Tensor | None
+    top_k: torch.Tensor | None
+    # Random generators keyed by an int (presumably the request index)
+    generators: dict[int, torch.Generator]
+    # None means no logprobs, 0 means sampled token logprobs only
+    max_num_logprobs: int | None
+    # presumably True when no request uses any of the penalties below
+    no_penalties: bool
+    prompt_token_ids: torch.Tensor | None
+    frequency_penalties: torch.Tensor
+    presence_penalties: torch.Tensor
+    repetition_penalties: torch.Tensor
+    # One list of output token ids per entry (presumably per request)
+    output_token_ids: list[list[int]]
+    # `allowed_token_ids_mask` is a 2D bool tensor of shape (max batch size,
+    # vocab size).
+    allowed_token_ids_mask: torch.Tensor | None
+    # req_index -> bad_words_token_ids
+    bad_words_token_ids: dict[int, list[list[int]]]
+    # Loaded logits processors
+    logitsprocs: LogitsProcessors
+    # Speculative token ids
+    spec_token_ids: list[list[int]] | None = None

vllm/v1/sample/ops/__init__.py ADDED Viewed

File without changes

vllm/v1/sample/ops/bad_words.py ADDED Viewed

@@ -0,0 +1,52 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import torch
+_SMALLEST_LOGIT = float("-inf")
+def _apply_bad_words_single_batch(
+    logits: torch.Tensor,
+    bad_words_token_ids: list[list[int]],
+    past_tokens_ids: list[int],
+) -> None:
+    for bad_word_ids in bad_words_token_ids:
+        if len(bad_word_ids) > len(past_tokens_ids) + 1:
+            continue
+        prefix_length = len(bad_word_ids) - 1
+        last_token_id = bad_word_ids[-1]
+        actual_prefix = past_tokens_ids[-prefix_length:] if prefix_length > 0 else []
+        expected_prefix = bad_word_ids[:prefix_length]
+        assert len(actual_prefix) == len(expected_prefix)
+        if actual_prefix == expected_prefix:
+            logits[last_token_id] = _SMALLEST_LOGIT
+def apply_bad_words(
+    logits: torch.Tensor,
+    bad_words_token_ids: dict[int, list[list[int]]],
+    past_tokens_ids: list[list[int]],
+) -> None:
+    for i, bad_words_ids in bad_words_token_ids.items():
+        _apply_bad_words_single_batch(logits[i], bad_words_ids, past_tokens_ids[i])
+def apply_bad_words_with_drafts(
+    logits: torch.Tensor,
+    bad_words_token_ids: dict[int, list[list[int]]],
+    past_tokens_ids: list[list[int]],
+    num_draft_tokens: list[int],
+) -> None:
+    start_idx = 0
+    for i, bad_words_ids in bad_words_token_ids.items():
+        for draft_idx in range(num_draft_tokens[i]):
+            _apply_bad_words_single_batch(
+                logits[start_idx + draft_idx],
+                bad_words_ids,
+                past_tokens_ids[start_idx + draft_idx],
+            )
+        start_idx += num_draft_tokens[i]

vllm/v1/sample/ops/logprobs.py ADDED Viewed

@@ -0,0 +1,25 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Some utilities for logprobs, including logits."""
+import torch
+from vllm.platforms import current_platform
+@torch.compile(dynamic=True, backend=current_platform.simple_compile_backend)
+def batched_count_greater_than(x: torch.Tensor, values: torch.Tensor) -> torch.Tensor:
+    """
+    Counts elements in each row of x that are greater than or equal to the
+    corresponding value in values (the comparison is `>=`, despite the
+    function name).  Use torch.compile to generate an optimized kernel for
+    this function; otherwise, it will create additional copies of the input
+    tensors and cause memory issues.
+    Args:
+        x (torch.Tensor): A 2D tensor of shape (batch_size, n_elements).
+        values (torch.Tensor): A 2D tensor of shape (batch_size, 1).
+    Returns:
+        torch.Tensor: A 1D tensor of shape (batch_size,) with the counts.
+    """
+    # Broadcast the per-row threshold across the row, then sum the boolean
+    # mask over the last dimension.
+    return (x >= values).sum(-1)