PyPI - vllm-cpu - Versions diffs - 0.8.5.post2__cp310-cp310-manylinux_2_17_x86_64.whl - Mend

vllm-cpu 0.8.5.post2__cp310-cp310-manylinux_2_17_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of vllm-cpu might be problematic. Click here for more details.

Files changed (1103) hide show

vllm/lora/punica_wrapper/punica_hpu.py ADDED Viewed

@@ -0,0 +1,144 @@
+# SPDX-License-Identifier: Apache-2.0
+from typing import TYPE_CHECKING, List, Optional, Tuple, Union, final
+import torch
+from vllm_hpu_extension.ops import (dispatch_bgmv_embedding,
+                                    dispatch_bgmv_linear)
+from .punica_base import PunicaWrapperBase
+from .utils import convert_mapping
+if TYPE_CHECKING:
+    # avoid circuit import
+    from vllm.lora.layers import LoRAMapping
+    from vllm.lora.models import LongContextLoRAContext
+@final
+class PunicaWrapperHPU(PunicaWrapperBase):
+    def __init__(self, max_num_batched_tokens: int, max_batches: int,
+                 device: Union[torch.device, str], **kwargs):
+        # Increasing max_num_batched_tokens by 3x to handle increase in
+        # tensor size due to padding.
+        PunicaWrapperBase.__init__(self, 3 * max_num_batched_tokens,
+                                   max_batches, device)
+    def _update_base_metadata(
+        self,
+        mapping: "LoRAMapping",
+        lora_index_to_id: List[Optional[int]],
+        max_loras: int,
+        vocab_size: int,
+        extra_vocab_size: int,
+        long_lora_context: Optional["LongContextLoRAContext"] = None,
+    ):
+        (
+            base_indices,
+            sampler_indices,
+            sampler_indices_padded,
+            embeddings_indices,
+            long_lora_offsets_tensor,
+            indices_len,
+        ) = convert_mapping(mapping, lora_index_to_id, max_loras, vocab_size,
+                            extra_vocab_size, self.device, None)
+        # Updating each element in `long_lora_offsets` with `lora_offset` slows
+        # down perf in HPU due to a series of `strided_insert` ops during lazy
+        # graph accumulation. Hence HPU appends `lora_offset` to a list and
+        # converts it to a tensor only after it is ready.
+        if long_lora_context:
+            index_mapping_indices: List[int] = list(
+                mapping.index_mapping).copy()
+            long_lora_offsets: List[int] = []
+            for i in range(len(index_mapping_indices)):
+                lora_offset: int = long_lora_context.offsets_by_lora_id.get(
+                    index_mapping_indices[i], 0)
+                long_lora_offsets.append(lora_offset)
+            long_lora_offsets_tensor = torch.tensor(long_lora_offsets,
+                                                    device=self.device,
+                                                    dtype=torch.long)
+            indices_len[-1] = long_lora_offsets_tensor.shape[-1]
+        self._token_lora_indices[:base_indices.shape[0]].copy_(base_indices)
+        self._sampler_indices[:sampler_indices.shape[0]].copy_(sampler_indices)
+        self._sampler_indices_padded[:sampler_indices_padded.shape[0]].copy_(
+            sampler_indices_padded)
+        self._embeddings_indices[:embeddings_indices.
+                                 shape[0], :embeddings_indices.shape[1]].copy_(
+                                     embeddings_indices)
+        if long_lora_offsets_tensor is not None:
+            self._long_lora_indices[:long_lora_offsets_tensor.shape[0]].copy_(
+                long_lora_offsets_tensor)
+        else:
+            self._long_lora_indices.zero_()
+        self.indices_len[:] = indices_len
+    def add_lora_embedding(self,
+                           y: torch.Tensor,
+                           x: torch.Tensor,
+                           lora_b_stacked: torch.Tensor,
+                           add_inputs: bool = True,
+                           **kwargs) -> None:
+        dispatch_bgmv_embedding(y, x, lora_b_stacked, 0)
+    def add_lora_linear(self,
+                        y: torch.Tensor,
+                        x: torch.Tensor,
+                        lora_a_stacked: Tuple[torch.Tensor, ...],
+                        lora_b_stacked: Tuple[torch.Tensor, ...],
+                        lora_bias_stacked: Optional[Tuple[torch.Tensor, ...]],
+                        scale: float,
+                        output_slices: Tuple[int, ...],
+                        *,
+                        buffer: Optional[Tuple[torch.Tensor, ...]] = None,
+                        **kwargs) -> None:
+        y_org = y
+        x = x.view(-1, x.shape[-1])
+        y = y.view(-1, y.shape[-1])
+        offset_left = 0
+        for slice_idx in range(len(output_slices)):
+            dispatch_bgmv_linear(
+                y[:, offset_left:offset_left + output_slices[slice_idx]], x,
+                lora_a_stacked[slice_idx], lora_b_stacked[slice_idx], 0, scale)
+            offset_left += output_slices[slice_idx]
+        y = y.view_as(y_org)
+    def add_lora_logits(self,
+                        y: torch.Tensor,
+                        x: torch.Tensor,
+                        lora_a_stacked: torch.Tensor,
+                        lora_b_stacked: torch.Tensor,
+                        scale,
+                        *,
+                        buffer: Optional[torch.Tensor] = None,
+                        **kwargs) -> None:
+        y_org = y
+        y = y.view(-1, y.shape[-1])
+        x = x.view(-1, x.shape[-1])
+        dispatch_bgmv_linear(y, x, lora_a_stacked, lora_b_stacked, 0, scale)
+        y = y.view_as(y_org)
+    def add_shrink(
+        self,
+        y: Union[Tuple[torch.Tensor, ...], torch.Tensor],
+        x: torch.Tensor,
+        lora_a_stacked: Tuple[torch.Tensor, ...],
+        scale: float,
+        **kwargs,
+    ) -> None:
+        raise NotImplementedError
+    def add_expand(
+        self,
+        y: torch.Tensor,
+        x: Union[Tuple[torch.Tensor, ...], torch.Tensor],
+        lora_b_stacked: Tuple[torch.Tensor, ...],
+        lora_bias_stacked: Optional[Tuple[torch.Tensor, ...]],
+        output_slices: Tuple[int, ...],
+        offset_start: int = 0,
+        add_inputs=True,
+        **kwargs,
+    ) -> None:
+        raise NotImplementedError

vllm/lora/punica_wrapper/punica_selector.py ADDED Viewed

@@ -0,0 +1,20 @@
+# SPDX-License-Identifier: Apache-2.0
+from vllm.logger import init_logger
+from vllm.platforms import current_platform
+from vllm.utils import resolve_obj_by_qualname
+from .punica_base import PunicaWrapperBase
+logger = init_logger(__name__)
+def get_punica_wrapper(*args, **kwargs) -> PunicaWrapperBase:
+    punica_wrapper_qualname = current_platform.get_punica_wrapper()
+    punica_wrapper_cls = resolve_obj_by_qualname(punica_wrapper_qualname)
+    punica_wrapper = punica_wrapper_cls(*args, **kwargs)
+    assert punica_wrapper is not None, \
+        "the punica_wrapper_qualname(" + punica_wrapper_qualname + ") is wrong."
+    logger.info_once("Using " + punica_wrapper_qualname.rsplit(".", 1)[1] +
+                     ".")
+    return punica_wrapper

vllm/lora/punica_wrapper/utils.py ADDED Viewed

@@ -0,0 +1,161 @@
+# SPDX-License-Identifier: Apache-2.0
+from typing import TYPE_CHECKING, List, Optional, Tuple, Union
+import torch
+if TYPE_CHECKING:
+    # avoid circuit import
+    from vllm.lora.layers import LoRAMapping
+    from vllm.lora.models import LongContextLoRAContext
+def compute_meta(
+    token_lora_tensor: torch.Tensor
+) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, int, int, int, bool]:
+    """
+    Get the information required for the sgmv kernel. With the  features:
+    1. If consecutive requests in the batch use the same LoRA, this function
+    will combine them into a single request, improving sgmv kernel inference
+    performance.
+    2. At the beginning of each prefill stage inference, recalculations are
+    needed based on the input, but only once.
+    """
+    lora_indices_tensor, seq_length_tensor = torch.unique_consecutive(
+        token_lora_tensor, return_counts=True)
+    cum_result = torch.cumsum(seq_length_tensor, dim=0)
+    b_seq_start_tensor = torch.zeros_like(seq_length_tensor)
+    b_seq_start_tensor[1:].copy_(cum_result[:-1])
+    max_length = seq_length_tensor.max().item()
+    token_nums = seq_length_tensor.sum().item()
+    batch_size = lora_indices_tensor.size(0)
+    no_lora = False
+    # -1 means no lora should be applied. Use `no_lora` to determine whether
+    # the current step requires LoRA. If LoRA is not needed, the prefill stage
+    # does not need to launch the triton kernel, which can improve performance
+    if batch_size == 1 and lora_indices_tensor == -1:
+        no_lora = True
+    return (b_seq_start_tensor, seq_length_tensor, lora_indices_tensor,
+            batch_size, max_length, token_nums, no_lora)
+# TODO see if this can be vectorized
+def convert_mapping(
+    mapping: "LoRAMapping",
+    lora_index_to_id: List[Optional[int]],
+    max_loras: int,
+    vocab_size: int,
+    extra_vocab_size: int,
+    device: torch.device,
+    long_lora_context: Optional["LongContextLoRAContext"] = None,
+) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor,
+           Optional[torch.Tensor], List[int]]:
+    """Converts LoRAMapping to index tensors.
+    Args:
+        mapping: LoRAMapping mapping rows in a batch to LoRA ids.
+        lora_index_to_id: List mapping LoRA ids to LoRA indices.
+        max_loras: Maximum number of LoRAs.
+        vocab_size: Model vocab size.
+        extra_vocab_size: Extra vocab size each LoRA can have.
+        long_lora_context: Passed if there are long context lora in a batch.
+    Returns:
+        A tuple of tensors:
+            base_indices: Tensor of shape [batch_size] mapping batch rows to
+                LoRA indices.
+            sampler_indices: Tensor of shape [batch_size] mapping requests to
+                LoRA indices for sampler. For generation, this will be the
+                same as base_indicies. For prefill, this will map requests
+                to LoRA indices.
+            sampler_indices_padded: Tensor of shape [batch_size] mapping
+                requests to LoRA indices for sampler with padding.
+                Same as sampler_indicies, but -1 is replaced with
+                max_loras.
+            embeddings_indices: Tensor of shape [2, batch_size] mapping
+                requests to embedding indices. First row is for embeddings
+                added by the LoRAs, second row is for the LoRA.lora_a
+                embeddings.
+            long_lora_indices: Tensor of shape [batch_size] mapping
+                requests to RoPE offsets and rot dims for long LoRAs.
+                None if long context lora doesn't exist.
+            indices_len: List of lengths of the above tensors. It contains
+                (base_indices, sampler_indices, sampler_indices_padded,
+                embeddings_indices, long_lora_indices).
+    """
+    index_mapping_indices: List[int] = list(mapping.index_mapping).copy()
+    embedding_indices = index_mapping_indices.copy()
+    lora_indices = index_mapping_indices.copy()
+    long_lora_offsets: Optional[torch.Tensor] = None
+    if long_lora_context:
+        long_lora_offsets = torch.zeros(len(index_mapping_indices),
+                                        device=device,
+                                        dtype=torch.long)
+    prompt_mapping: List[int] = [
+        lora_index_to_id.index(x) if x > 0 else -1
+        for x in mapping.prompt_mapping
+    ]
+    lora_idx = None
+    for i in range(len(index_mapping_indices)):
+        # TODO index can be slow. optimize
+        lora_idx = (lora_index_to_id.index(index_mapping_indices[i])
+                    if index_mapping_indices[i] > 0 else -1)
+        embedding_indices[i] = lora_idx if index_mapping_indices[i] > 0 else 0
+        lora_indices[i] = lora_idx
+        if long_lora_context:
+            assert long_lora_offsets is not None
+            lora_offset: int = long_lora_context.offsets_by_lora_id.get(
+                index_mapping_indices[i], 0)
+            long_lora_offsets[i] = lora_offset
+    indices_list: List[Union[List[int], torch.Tensor]] = [
+        index_mapping_indices,
+        lora_indices,
+        embedding_indices,
+    ]
+    if long_lora_context:
+        assert long_lora_offsets is not None
+        indices_list.append(long_lora_offsets)
+    indices = torch.tensor(indices_list, dtype=torch.long, device=device)
+    prompt_mapping_tensor = torch.tensor(prompt_mapping,
+                                         dtype=torch.long,
+                                         device=device)
+    embeddings_indices = torch.stack([
+        indices[2] * extra_vocab_size,
+        indices[2] * (vocab_size + extra_vocab_size),
+    ])
+    embeddings_indices[embeddings_indices == -1] = max_loras - 1
+    base_indices = indices[1]
+    sampler_indices = prompt_mapping_tensor
+    sampler_indices_padded = sampler_indices.clone()
+    sampler_indices_padded[sampler_indices_padded == -1] = max_loras - 1
+    sampler_indices_padded = torch.arange(
+        0, len(sampler_indices_padded), device=device, dtype=torch.long) + (
+            sampler_indices_padded * len(sampler_indices_padded))
+    long_lora_indices = None
+    long_lora_indices_len: Optional[int] = None
+    if long_lora_context:
+        long_lora_indices = indices[3]
+        long_lora_indices_len = long_lora_indices.shape[-1]
+    # Contain length of indices tensors. Used to index into each tensor.
+    indices_len = [
+        base_indices.shape[-1],
+        sampler_indices.shape[-1],
+        sampler_indices_padded.shape[-1],
+        embeddings_indices.shape[-1],
+    ]
+    if long_lora_indices_len is not None:
+        indices_len.append(long_lora_indices_len)
+    else:
+        # If long_lora doesn't exist,append None
+        indices_len.append(None)
+    return (
+        base_indices,
+        sampler_indices,
+        sampler_indices_padded,
+        embeddings_indices,
+        long_lora_indices,
+        indices_len,
+    )

vllm/lora/request.py ADDED Viewed

@@ -0,0 +1,97 @@
+# SPDX-License-Identifier: Apache-2.0
+import warnings
+from typing import Optional
+import msgspec
+from vllm.adapter_commons.request import AdapterRequest
+class LoRARequest(
+        msgspec.Struct,
+        omit_defaults=True,  # type: ignore[call-arg]
+        array_like=True):  # type: ignore[call-arg]
+    """
+    Request for a LoRA adapter.
+    Note that this class should be used internally. For online
+    serving, it is recommended to not allow users to use this class but
+    instead provide another layer of abstraction to prevent users from
+    accessing unauthorized LoRA adapters.
+    lora_int_id must be globally unique for a given adapter.
+    This is currently not enforced in vLLM.
+    """
+    __metaclass__ = AdapterRequest
+    lora_name: str
+    lora_int_id: int
+    lora_path: str = ""
+    lora_local_path: Optional[str] = msgspec.field(default=None)
+    long_lora_max_len: Optional[int] = None
+    base_model_name: Optional[str] = msgspec.field(default=None)
+    def __post_init__(self):
+        if self.lora_local_path:
+            warnings.warn(
+                "The 'lora_local_path' attribute is deprecated "
+                "and will be removed in a future version. "
+                "Please use 'lora_path' instead.",
+                DeprecationWarning,
+                stacklevel=2)
+            if not self.lora_path:
+                self.lora_path = self.lora_local_path or ""
+        # Ensure lora_path is not empty
+        assert self.lora_path, "lora_path cannot be empty"
+    @property
+    def adapter_id(self):
+        return self.lora_int_id
+    @property
+    def name(self):
+        return self.lora_name
+    @property
+    def path(self):
+        return self.lora_path
+    @property
+    def local_path(self):
+        warnings.warn(
+            "The 'local_path' attribute is deprecated "
+            "and will be removed in a future version. "
+            "Please use 'path' instead.",
+            DeprecationWarning,
+            stacklevel=2)
+        return self.lora_path
+    @local_path.setter
+    def local_path(self, value):
+        warnings.warn(
+            "The 'local_path' attribute is deprecated "
+            "and will be removed in a future version. "
+            "Please use 'path' instead.",
+            DeprecationWarning,
+            stacklevel=2)
+        self.lora_path = value
+    def __eq__(self, value: object) -> bool:
+        """
+        Overrides the equality method to compare LoRARequest
+        instances based on lora_name. This allows for identification
+        and comparison lora adapter across engines.
+        """
+        return isinstance(value,
+                          self.__class__) and self.lora_name == value.lora_name
+    def __hash__(self) -> int:
+        """
+        Overrides the hash method to hash LoRARequest instances
+        based on lora_name. This ensures that LoRARequest instances
+        can be used in hash-based collections such as sets and dictionaries,
+        identified by their names across engines.
+        """
+        return hash(self.lora_name)

vllm/lora/resolver.py ADDED Viewed

@@ -0,0 +1,83 @@
+# SPDX-License-Identifier: Apache-2.0
+from abc import ABC, abstractmethod
+from dataclasses import dataclass, field
+from typing import AbstractSet, Dict, Optional
+from vllm.logger import init_logger
+from vllm.lora.request import LoRARequest
+logger = init_logger(__name__)
+class LoRAResolver(ABC):
+    """Base class for LoRA adapter resolvers.
+    This class defines the interface for resolving and fetching LoRA adapters.
+    Implementations of this class should handle the logic for locating and
+    downloading LoRA adapters from various sources (e.g. S3, cloud storage,
+    etc.).
+    """
+    @abstractmethod
+    async def resolve_lora(self, base_model_name: str,
+                           lora_name: str) -> Optional[LoRARequest]:
+        """Abstract method to resolve and fetch a LoRA model adapter.
+        Implements logic to locate and download LoRA adapter based on the name.
+        Implementations might fetch from a blob storage or other sources.
+        Args:
+            base_model_name: The name/identifier of the base model to resolve.
+            lora_name: The name/identifier of the LoRA model to resolve.
+        Returns:
+            Optional[LoRARequest]: The resolved LoRA model information, or None
+            if the LoRA model cannot be found.
+        """
+        pass
+@dataclass
+class _LoRAResolverRegistry:
+    resolvers: Dict[str, LoRAResolver] = field(default_factory=dict)
+    def get_supported_resolvers(self) -> AbstractSet[str]:
+        """Get all registered resolver names."""
+        return self.resolvers.keys()
+    def register_resolver(
+        self,
+        resolver_name: str,
+        resolver: LoRAResolver,
+    ) -> None:
+        """Register a LoRA resolver.
+        Args:
+            resolver_name: Name to register the resolver under.
+            resolver: The LoRA resolver instance to register.
+        """
+        if resolver_name in self.resolvers:
+            logger.warning(
+                "LoRA resolver %s is already registered, and will be "
+                "overwritten by the new resolver instance %s.", resolver_name,
+                resolver)
+        self.resolvers[resolver_name] = resolver
+    def get_resolver(self, resolver_name: str) -> LoRAResolver:
+        """Get a registered resolver instance by name.
+        Args:
+            resolver_name: Name of the resolver to get.
+        Returns:
+            The resolver instance.
+        Raises:
+            KeyError: If the resolver is not found in the registry.
+        """
+        if resolver_name not in self.resolvers:
+            raise KeyError(
+                f"LoRA resolver '{resolver_name}' not found. "
+                f"Available resolvers: {list(self.resolvers.keys())}")
+        return self.resolvers[resolver_name]
+LoRAResolverRegistry = _LoRAResolverRegistry()