PyPI - vllm-cpu - Versions diffs - 0.11.0.post2__cp312-cp312-manylinux_2_17_x86_64.whl - Mend

vllm-cpu 0.11.0.post2__cp312-cp312-manylinux_2_17_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (1398) hide show

vllm/lora/punica_wrapper/utils.py ADDED Viewed

@@ -0,0 +1,136 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from typing import TYPE_CHECKING, Optional, Union
+import torch
+if TYPE_CHECKING:
+    # avoid circuit import
+    from vllm.lora.layers import LoRAMapping
+def compute_meta(
+    token_lora_tensor: torch.Tensor
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, int, int, int, bool]:
+    """
+    Get the information required for the sgmv kernel. With the  features:
+    1. If consecutive requests in the batch use the same LoRA, this function
+    will combine them into a single request, improving sgmv kernel inference
+    performance.
+    2. At the beginning of each prefill stage inference, recalculations are
+    needed based on the input, but only once.
+    """
+    lora_indices_tensor, seq_length_tensor = torch.unique_consecutive(
+        token_lora_tensor, return_counts=True)
+    cum_result = torch.cumsum(seq_length_tensor, dim=0)
+    b_seq_start_tensor = torch.zeros_like(seq_length_tensor)
+    b_seq_start_tensor[1:].copy_(cum_result[:-1])
+    max_length = seq_length_tensor.max().item()
+    token_nums = seq_length_tensor.sum().item()
+    batch_size = lora_indices_tensor.size(0)
+    no_lora = False
+    # -1 means no lora should be applied. Use `no_lora` to determine whether
+    # the current step requires LoRA. If LoRA is not needed, the prefill stage
+    # does not need to launch the triton kernel, which can improve performance
+    if batch_size == 1 and lora_indices_tensor == -1:
+        no_lora = True
+    return (b_seq_start_tensor, seq_length_tensor, lora_indices_tensor,
+            batch_size, max_length, token_nums, no_lora)
+# TODO see if this can be vectorized
+def convert_mapping(
+    mapping: "LoRAMapping",
+    lora_index_to_id: list[Optional[int]],
+    max_loras: int,
+    vocab_size: int,
+    extra_vocab_size: int,
+    device: torch.device,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, list[int]]:
+    """Converts LoRAMapping to index tensors.
+    Args:
+        mapping: LoRAMapping mapping rows in a batch to LoRA ids.
+        lora_index_to_id: List mapping LoRA ids to LoRA indices.
+        max_loras: Maximum number of LoRAs.
+        vocab_size: Model vocab size.
+        extra_vocab_size: Extra vocab size each LoRA can have.
+    Returns:
+        A tuple of tensors:
+            base_indices: Tensor of shape [batch_size] mapping batch rows to
+                LoRA indices.
+            sampler_indices: Tensor of shape [batch_size] mapping requests to
+                LoRA indices for sampler. For generation, this will be the
+                same as base_indices. For prefill, this will map requests
+                to LoRA indices.
+            sampler_indices_padded: Tensor of shape [batch_size] mapping
+                requests to LoRA indices for sampler with padding.
+                Same as sampler_indices, but -1 is replaced with
+                max_loras.
+            embeddings_indices: Tensor of shape [2, batch_size] mapping
+                requests to embedding indices. First row is for embeddings
+                added by the LoRAs, second row is for the LoRA.lora_a
+                embeddings.
+            indices_len: List of lengths of the above tensors. It contains
+                (base_indices, sampler_indices, sampler_indices_padded,
+                embeddings_indices).
+    """
+    index_mapping_indices: list[int] = list(mapping.index_mapping).copy()
+    embedding_indices = index_mapping_indices.copy()
+    lora_indices = index_mapping_indices.copy()
+    prompt_mapping: list[int] = [
+        lora_index_to_id.index(x) if x > 0 else -1
+        for x in mapping.prompt_mapping
+    ]
+    lora_idx = None
+    for i in range(len(index_mapping_indices)):
+        # TODO index can be slow. optimize
+        lora_idx = (lora_index_to_id.index(index_mapping_indices[i])
+                    if index_mapping_indices[i] > 0 else -1)
+        embedding_indices[i] = lora_idx if index_mapping_indices[i] > 0 else 0
+        lora_indices[i] = lora_idx
+    indices_list: list[Union[list[int], torch.Tensor]] = [
+        index_mapping_indices,
+        lora_indices,
+        embedding_indices,
+    ]
+    indices = torch.tensor(indices_list, dtype=torch.long, device=device)
+    prompt_mapping_tensor = torch.tensor(prompt_mapping,
+                                         dtype=torch.long,
+                                         device=device)
+    embeddings_indices = torch.stack([
+        indices[2] * extra_vocab_size,
+        indices[2] * (vocab_size + extra_vocab_size),
+    ])
+    embeddings_indices = torch.where(embeddings_indices == -1, max_loras - 1,
+                                     embeddings_indices)
+    base_indices = indices[1]
+    sampler_indices = prompt_mapping_tensor
+    sampler_indices_padded = sampler_indices.clone()
+    sampler_indices_padded = torch.where(sampler_indices_padded == -1,
+                                         max_loras - 1, sampler_indices_padded)
+    sampler_indices_padded = torch.arange(
+        0, len(sampler_indices_padded), device=device, dtype=torch.long) + (
+            sampler_indices_padded * len(sampler_indices_padded))
+    # Contain length of indices tensors. Used to index into each tensor.
+    indices_len = [
+        base_indices.shape[-1],
+        sampler_indices.shape[-1],
+        sampler_indices_padded.shape[-1],
+        embeddings_indices.shape[-1],
+    ]
+    return (
+        base_indices,
+        sampler_indices,
+        sampler_indices_padded,
+        embeddings_indices,
+        indices_len,
+    )

vllm/lora/request.py ADDED Viewed

@@ -0,0 +1,97 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import warnings
+from typing import Optional
+import msgspec
+class LoRARequest(
+        msgspec.Struct,
+        omit_defaults=True,  # type: ignore[call-arg]
+        array_like=True):  # type: ignore[call-arg]
+    """
+    Request for a LoRA adapter.
+    Note that this class should be used internally. For online
+    serving, it is recommended to not allow users to use this class but
+    instead provide another layer of abstraction to prevent users from
+    accessing unauthorized LoRA adapters.
+    lora_int_id must be globally unique for a given adapter.
+    This is currently not enforced in vLLM.
+    """
+    lora_name: str
+    lora_int_id: int
+    lora_path: str = ""
+    lora_local_path: Optional[str] = msgspec.field(default=None)
+    long_lora_max_len: Optional[int] = None
+    base_model_name: Optional[str] = msgspec.field(default=None)
+    tensorizer_config_dict: Optional[dict] = None
+    def __post_init__(self):
+        if self.lora_int_id < 1:
+            raise ValueError(f"id must be > 0, got {self.lora_int_id}")
+        if self.lora_local_path:
+            warnings.warn(
+                "The 'lora_local_path' attribute is deprecated "
+                "and will be removed in a future version. "
+                "Please use 'lora_path' instead.",
+                DeprecationWarning,
+                stacklevel=2)
+            if not self.lora_path:
+                self.lora_path = self.lora_local_path or ""
+        # Ensure lora_path is not empty
+        assert self.lora_path, "lora_path cannot be empty"
+    @property
+    def adapter_id(self):
+        return self.lora_int_id
+    @property
+    def name(self):
+        return self.lora_name
+    @property
+    def path(self):
+        return self.lora_path
+    @property
+    def local_path(self):
+        warnings.warn(
+            "The 'local_path' attribute is deprecated "
+            "and will be removed in a future version. "
+            "Please use 'path' instead.",
+            DeprecationWarning,
+            stacklevel=2)
+        return self.lora_path
+    @local_path.setter
+    def local_path(self, value):
+        warnings.warn(
+            "The 'local_path' attribute is deprecated "
+            "and will be removed in a future version. "
+            "Please use 'path' instead.",
+            DeprecationWarning,
+            stacklevel=2)
+        self.lora_path = value
+    def __eq__(self, value: object) -> bool:
+        """
+        Overrides the equality method to compare LoRARequest
+        instances based on lora_name. This allows for identification
+        and comparison lora adapter across engines.
+        """
+        return isinstance(value,
+                          self.__class__) and self.lora_name == value.lora_name
+    def __hash__(self) -> int:
+        """
+        Overrides the hash method to hash LoRARequest instances
+        based on lora_name. This ensures that LoRARequest instances
+        can be used in hash-based collections such as sets and dictionaries,
+        identified by their names across engines.
+        """
+        return hash(self.lora_name)

vllm/lora/resolver.py ADDED Viewed

@@ -0,0 +1,85 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from abc import ABC, abstractmethod
+from collections.abc import Set
+from dataclasses import dataclass, field
+from typing import Optional
+from vllm.logger import init_logger
+from vllm.lora.request import LoRARequest
+logger = init_logger(__name__)
+class LoRAResolver(ABC):
+    """Base class for LoRA adapter resolvers.
+    This class defines the interface for resolving and fetching LoRA adapters.
+    Implementations of this class should handle the logic for locating and
+    downloading LoRA adapters from various sources (e.g. S3, cloud storage,
+    etc.).
+    """
+    @abstractmethod
+    async def resolve_lora(self, base_model_name: str,
+                           lora_name: str) -> Optional[LoRARequest]:
+        """Abstract method to resolve and fetch a LoRA model adapter.
+        Implements logic to locate and download LoRA adapter based on the name.
+        Implementations might fetch from a blob storage or other sources.
+        Args:
+            base_model_name: The name/identifier of the base model to resolve.
+            lora_name: The name/identifier of the LoRA model to resolve.
+        Returns:
+            Optional[LoRARequest]: The resolved LoRA model information, or None
+            if the LoRA model cannot be found.
+        """
+        pass
+@dataclass
+class _LoRAResolverRegistry:
+    resolvers: dict[str, LoRAResolver] = field(default_factory=dict)
+    def get_supported_resolvers(self) -> Set[str]:
+        """Get all registered resolver names."""
+        return self.resolvers.keys()
+    def register_resolver(
+        self,
+        resolver_name: str,
+        resolver: LoRAResolver,
+    ) -> None:
+        """Register a LoRA resolver.
+        Args:
+            resolver_name: Name to register the resolver under.
+            resolver: The LoRA resolver instance to register.
+        """
+        if resolver_name in self.resolvers:
+            logger.warning(
+                "LoRA resolver %s is already registered, and will be "
+                "overwritten by the new resolver instance %s.", resolver_name,
+                resolver)
+        self.resolvers[resolver_name] = resolver
+    def get_resolver(self, resolver_name: str) -> LoRAResolver:
+        """Get a registered resolver instance by name.
+        Args:
+            resolver_name: Name of the resolver to get.
+        Returns:
+            The resolver instance.
+        Raises:
+            KeyError: If the resolver is not found in the registry.
+        """
+        if resolver_name not in self.resolvers:
+            raise KeyError(
+                f"LoRA resolver '{resolver_name}' not found. "
+                f"Available resolvers: {list(self.resolvers.keys())}")
+        return self.resolvers[resolver_name]
+LoRAResolverRegistry = _LoRAResolverRegistry()

vllm/lora/utils.py ADDED Viewed

@@ -0,0 +1,246 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import os
+from typing import TYPE_CHECKING, Optional, Union
+import huggingface_hub
+import regex as re
+from huggingface_hub.utils import (EntryNotFoundError, HfHubHTTPError,
+                                   HFValidationError, RepositoryNotFoundError)
+from torch import nn
+from transformers import PretrainedConfig
+from vllm.config.lora import LoRAConfig
+from vllm.logger import init_logger
+# being imported for _all_lora_classes below
+# yapf conflicts with isort for this block
+# yapf: disable
+from vllm.lora.layers import (BaseLayerWithLoRA, ColumnParallelLinearWithLoRA,
+                              ColumnParallelLinearWithShardedLoRA,
+                              LogitsProcessorWithLoRA,
+                              MergedColumnParallelLinearWithLoRA,
+                              MergedColumnParallelLinearWithShardedLoRA,
+                              MergedQKVParallelLinearWithLoRA,
+                              MergedQKVParallelLinearWithShardedLoRA,
+                              QKVParallelLinearWithLoRA,
+                              QKVParallelLinearWithShardedLoRA,
+                              ReplicatedLinearWithLoRA,
+                              RowParallelLinearWithLoRA,
+                              RowParallelLinearWithShardedLoRA,
+                              VocabParallelEmbeddingWithLoRA)
+from vllm.model_executor.layers.linear import LinearBase
+# yapf: enable
+if TYPE_CHECKING:
+    from vllm.model_executor.layers.logits_processor import LogitsProcessor
+    from vllm.model_executor.layers.vocab_parallel_embedding import (
+        ParallelLMHead)
+    from vllm.model_executor.models.utils import WeightsMapper
+logger = init_logger(__name__)
+_all_lora_classes: set[type[BaseLayerWithLoRA]] = {
+    VocabParallelEmbeddingWithLoRA,
+    ColumnParallelLinearWithLoRA,
+    MergedColumnParallelLinearWithLoRA,
+    QKVParallelLinearWithLoRA,
+    MergedQKVParallelLinearWithLoRA,
+    RowParallelLinearWithLoRA,
+    ReplicatedLinearWithLoRA,
+    LogitsProcessorWithLoRA,
+    ColumnParallelLinearWithShardedLoRA,
+    QKVParallelLinearWithShardedLoRA,
+    MergedColumnParallelLinearWithShardedLoRA,
+    MergedQKVParallelLinearWithShardedLoRA,
+    RowParallelLinearWithShardedLoRA,
+}
+def from_layer(layer: nn.Module,
+               max_loras: int,
+               lora_config: LoRAConfig,
+               packed_modules_list: list,
+               model_config: Optional[PretrainedConfig] = None) -> nn.Module:
+    for lora_cls in _all_lora_classes:
+        # specifying kwargs so they can be easily accessed in decorator
+        if lora_cls.can_replace_layer(source_layer=layer,
+                                      lora_config=lora_config,
+                                      packed_modules_list=packed_modules_list,
+                                      model_config=model_config):
+            instance_layer = lora_cls(layer)
+            instance_layer.create_lora_weights(max_loras, lora_config,
+                                               model_config)
+            return instance_layer
+    return layer
+def from_layer_logits_processor(
+    layer: "LogitsProcessor",
+    lm_head: "ParallelLMHead",
+    max_loras: int,
+    lora_config: LoRAConfig,
+    model_config: Optional[PretrainedConfig] = None,
+) -> LogitsProcessorWithLoRA:
+    ret = LogitsProcessorWithLoRA(layer, lm_head.embedding_dim,
+                                  lm_head.weight.dtype, lm_head.weight.device,
+                                  lm_head.get_sharded_to_full_mapping())
+    ret.create_lora_weights(max_loras, lora_config, model_config)
+    return ret
+def replace_submodule(model: nn.Module, module_name: str,
+                      new_module: nn.Module) -> nn.Module:
+    """Replace a submodule in a model with a new module."""
+    parent = model.get_submodule(".".join(module_name.split(".")[:-1]))
+    target_name = module_name.split(".")[-1]
+    setattr(parent, target_name, new_module)
+    return new_module
+def parse_fine_tuned_lora_name(
+    name: str,
+    weights_mapper: Optional["WeightsMapper"] = None
+) -> tuple[str, bool, bool]:
+    """Parse the name of lora weights.
+    args:
+        name: the name of the fine-tuned LoRA, e.g.
+            base_model.model.dense1.weight
+        weights_mapper: maps the name of weight, e.g.
+            `model.` -> `language_model.model.`,
+    return:
+        tuple(module_name, is_lora_a):
+            module_name: the name of the module, e.g. model.dense1,
+            is_lora_a whether the tensor is lora_a or lora_b.
+            is_bias whether the tensor is lora bias.
+    """
+    # LoRA weight qualified name usually starts with `base_model.model.`,
+    # so we remove the prefix `base_model.model.` to make the following
+    # mapping correctly.
+    if name.startswith("base_model.model."):
+        name = name.replace("base_model.model.", "")
+        name = weights_mapper._map_name(name) if weights_mapper else name
+        # recover the prefix `base_model.model.`
+        name = "base_model.model." + name
+    else:
+        name = weights_mapper._map_name(name) if weights_mapper else name
+    # In some situations, we may not start with `base_model.model.`.
+    # If we don't (e.g., ibm-granite/granite-speech-3.3-8b),
+    # we should keep the prefix intact.
+    start_index = 2 if name.startswith("base_model.model.") else 0
+    parts = name.split(".")
+    if parts[-1] == "weight" and (parts[-2] == "lora_A"
+                                  or parts[-2] == "lora_B"):
+        new_name = ".".join(parts[start_index:-2])
+        return new_name, parts[-2] == "lora_A", False
+    if parts[-1] == "lora_embedding_A" or parts[-1] == "lora_embedding_B":
+        new_name = ".".join(parts[start_index:-1])
+        return new_name, parts[-1] == "lora_embedding_A", False
+    if parts[-1] == "bias":
+        new_name = ".".join(parts[start_index:-2])
+        return new_name, False, True
+    raise ValueError(f"{name} is unsupported LoRA weight")
+def is_regex_target_modules(load_modules: Union[str, list[str]],
+                            expected_lora_modules: list[str]) -> bool:
+    """
+    PEFT supports passing `target_modules` in the form of regular expressions,
+    such as `model.*(q_proj|k_proj|v_proj)$`. This function is mainly used to
+    determine whether the suffix in the regular expression is present in the
+    `expected_lora_modules`.
+    """
+    def is_valid_regex(pattern):
+        try:
+            re.compile(pattern)
+            return True
+        except re.error:
+            return False
+    def is_subset(sub_list, full_list):
+        return set(sub_list).issubset(set(full_list))
+    # Similar to PEFT's processing logic, regex-related operations are only
+    #  executed when the load_modules is a `str`.
+    if not isinstance(load_modules, str):
+        return False
+    if is_valid_regex(load_modules):
+        match = re.search(r"\((.*?)\)\$?$", load_modules)
+        if match:
+            suffix = match.group(1).split("|")
+            return is_subset(suffix, expected_lora_modules)
+    return False
+def get_supported_lora_modules(model: nn.Module) -> list[str]:
+    """
+    In vLLM, all linear layers support LoRA.
+    """
+    supported_lora_modules: set[str] = set()
+    for name, module in model.named_modules():
+        # get the embedding modules if the module's embedding_modules
+        # is not empty.
+        embedding_modules = getattr(module, "embedding_modules", None)
+        if embedding_modules is not None:
+            for name in embedding_modules:
+                supported_lora_modules.add(name)
+        # get all the linear subfixes.
+        if isinstance(module, (LinearBase, )):
+            supported_lora_modules.add(name.split(".")[-1])
+    return list(supported_lora_modules)
+def get_adapter_absolute_path(lora_path: str) -> str:
+    """
+    Resolves the given lora_path to an absolute local path.
+    If the lora_path is identified as a Hugging Face model identifier,
+    it will download the model and return the local snapshot path.
+    Otherwise, it treats the lora_path as a local file path and
+    converts it to an absolute path.
+    Parameters:
+    lora_path (str): The path to the lora model, which can be an absolute path,
+                     a relative path, or a Hugging Face model identifier.
+    Returns:
+    str: The resolved absolute local path to the lora model.
+    """
+    # Check if the path is an absolute path. Return it no matter exists or not.
+    if os.path.isabs(lora_path):
+        return lora_path
+    # If the path starts with ~, expand the user home directory.
+    if lora_path.startswith('~'):
+        return os.path.expanduser(lora_path)
+    # Check if the expanded relative path exists locally.
+    if os.path.exists(lora_path):
+        return os.path.abspath(lora_path)
+    # If the path does not exist locally, assume it's a Hugging Face repo.
+    try:
+        local_snapshot_path = huggingface_hub.snapshot_download(
+            repo_id=lora_path)
+    except (HfHubHTTPError, RepositoryNotFoundError, EntryNotFoundError,
+            HFValidationError):
+        # Handle errors that may occur during the download
+        # Return original path instead of throwing error here
+        logger.exception("Error downloading the HuggingFace model")
+        return lora_path
+    return local_snapshot_path