PyPI - vllm-cpu-amxbf16 - Versions diffs - 0.11.2.post2__cp310-cp310-manylinux_2_17_x86_64.whl - Mend

vllm-cpu-amxbf16 0.11.2.post2__cp310-cp310-manylinux_2_17_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (1536) hide show

vllm/lora/layers/logits_processor.py ADDED Viewed

@@ -0,0 +1,252 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import math
+import torch
+import torch.nn as nn
+from transformers import PretrainedConfig
+from vllm.config.lora import LoRAConfig
+from vllm.distributed import (
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+)
+from vllm.model_executor.layers.logits_processor import LogitsProcessor
+from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding
+from vllm.platforms import current_platform
+from .base import BaseLayerWithLoRA
+class LogitsProcessorWithLoRA(BaseLayerWithLoRA):
+    """
+    LoRA wrapper for LogitsProcessor, with extra logic to handle the
+    application of the LoRA adapter and added LoRA vocabulary.
+    Args:
+        base_layer: LogitsProcessor layer
+        hidden_size: hidden size of the model
+        dtype: data type of the model
+        device: device of the model
+        sharded_to_full_mapping: index mapping from sharded vocab to full vocab
+            received from base_layer.get_sharded_to_full_mapping(). If None,
+            no reindexing will be done.
+    """
+    def __init__(
+        self,
+        base_layer: LogitsProcessor,
+        hidden_size: int,
+        dtype: torch.dtype,
+        device: torch.device,
+        sharded_to_full_mapping: list[int] | None,
+    ) -> None:
+        super().__init__()
+        self.base_layer = base_layer
+        self.hidden_size = hidden_size
+        self.dtype = dtype
+        self.device = device
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.tp_rank = get_tensor_model_parallel_rank()
+        self.sharded_to_full_mapping = sharded_to_full_mapping
+    @property
+    def logits_as_input(self):
+        return self.base_layer.logits_as_input
+    @property
+    def vocab_size(self):
+        return self.base_layer.vocab_size
+    @property
+    def scale(self):
+        return self.base_layer.scale
+    @property
+    def soft_cap(self):
+        return self.base_layer.soft_cap
+    @property
+    def use_all_gather(self):
+        return self.base_layer.use_all_gather
+    @property
+    def org_vocab_size(self):
+        return self.base_layer.org_vocab_size
+    @property
+    def include_gpu_probs_tensor(self):
+        return self.base_layer.include_gpu_probs_tensor
+    @property
+    def should_modify_greedy_probs_inplace(self):
+        return self.base_layer.should_modify_greedy_probs_inplace
+    def create_lora_weights(
+        self,
+        max_loras: int,
+        lora_config: LoRAConfig,
+        model_config: PretrainedConfig | None = None,
+    ) -> None:
+        # TODO: Verify if this condition can be further relaxed
+        if 32000 < self.base_layer.vocab_size > 257024:
+            raise ValueError(
+                "When using LoRA, vocab size must be 32000 >= vocab_size <= 257024"
+            )
+        self.lora_a_stacked = torch.zeros(
+            (
+                max_loras,
+                1,
+                lora_config.max_lora_rank,
+                self.hidden_size,
+            ),
+            dtype=lora_config.lora_dtype,
+            device=self.device,
+        )
+        self.lora_b_stacked = torch.zeros(
+            (
+                max_loras,
+                1,
+                # Pad for kernel compatibility
+                math.ceil(
+                    self.base_layer.vocab_size / lora_config.lora_vocab_padding_size
+                )
+                * lora_config.lora_vocab_padding_size,
+                lora_config.max_lora_rank,
+            ),
+            dtype=lora_config.lora_dtype,
+            device=self.device,
+        )
+        self.embeddings_tensors = torch.full(
+            (max_loras, lora_config.lora_extra_vocab_size, self.hidden_size),
+            fill_value=float("-inf"),
+            dtype=self.dtype,
+            device=self.device,
+        )
+        if self.sharded_to_full_mapping is not None:
+            self.sharded_to_full_mapping_gpu = torch.tensor(
+                self.sharded_to_full_mapping, device=self.device, dtype=torch.long
+            )
+        else:
+            self.sharded_to_full_mapping_gpu = None
+    def reset_lora(self, index: int):
+        self.lora_a_stacked[index] = 0
+        self.lora_b_stacked[index] = 0
+        self.embeddings_tensors[index] = float("-inf")
+    def set_lora(
+        self,
+        index: int,
+        lora_a: torch.Tensor,
+        lora_b: torch.Tensor,
+        embeddings_tensor: torch.Tensor | None,
+    ):
+        self.reset_lora(index)
+        self.lora_a_stacked[index, 0, : lora_a.shape[0], : lora_a.shape[1]].copy_(
+            lora_a, non_blocking=True
+        )
+        self.lora_b_stacked[index, 0, : lora_b.shape[0], : lora_b.shape[1]].copy_(
+            lora_b, non_blocking=True
+        )
+        if embeddings_tensor is not None:
+            self.embeddings_tensors[
+                index,
+                : embeddings_tensor.shape[0],
+                : embeddings_tensor.shape[1],
+            ] = embeddings_tensor
+    def _get_logits(
+        self,
+        hidden_states: torch.Tensor,
+        lm_head: VocabParallelEmbedding,
+        embedding_bias: torch.Tensor | None = None,
+    ) -> torch.Tensor | None:
+        # Get the logits for the next tokens.
+        logits = lm_head.quant_method.apply(lm_head, hidden_states)
+        if embedding_bias is not None:
+            logits += embedding_bias
+        # Gather logits for TP
+        logits = self.base_layer._gather_logits(logits)
+        if logits is None:
+            return None
+        if self.sharded_to_full_mapping_gpu is not None:
+            # Reindex full logits tensor to ensure 1:1 mapping between
+            # index and token_id
+            # Example for:
+            #   org_vocab_size = 4
+            #   added_vocab_size = 2
+            #   pad_to_size = 8
+            #   tp_size = 2
+            # indices:  [0, 1, 2,  3, 4, 5, 6,  7]
+            # token_id: [0, 1, 4, -1, 2, 3, 5, -1]
+            # Therefore, the mapping is expected to be:
+            # [0, 1, 4, 6, 2, 3, 5, 7] so that when we reindex,
+            # we get:
+            # indices:  [0, 1, 2, 3, 4, 5,  6,  7]
+            # token_id: [0, 1, 2, 3, 4, 5, -1, -1]
+            logits = logits[:, self.sharded_to_full_mapping_gpu]
+        lora_logits = torch.empty(
+            self.embeddings_tensors.shape[0] + 1,
+            self.embeddings_tensors.shape[1],
+            hidden_states.shape[0],
+            dtype=self.embeddings_tensors.dtype,
+            device=self.embeddings_tensors.device,
+        )
+        torch.matmul(self.embeddings_tensors, hidden_states.T, out=lora_logits[:-1])
+        neg_inf, pos_inf = current_platform.get_infinity_values(lora_logits.dtype)
+        lora_logits[-1] = neg_inf
+        lora_logits = lora_logits.mT
+        indices_padded = self.punica_wrapper.sampler_indices_padded
+        if current_platform.is_tpu() or current_platform.is_xpu():
+            indices_padded = indices_padded[: logits.size(0)]
+        lora_logits = (
+            lora_logits.reshape(
+                lora_logits.shape[0] * lora_logits.shape[1],
+                lora_logits.shape[2],
+            )
+            .index_select(0, indices_padded)
+            .nan_to_num_(nan=neg_inf, posinf=pos_inf, neginf=neg_inf)
+        )
+        logits[
+            :,
+            self.base_layer.org_vocab_size : self.base_layer.org_vocab_size
+            + lora_logits.shape[1],
+        ] = lora_logits
+        lora_output: torch.Tensor | None = self.punica_wrapper.add_lora_logits(
+            logits, hidden_states, self.lora_a_stacked, self.lora_b_stacked, 1.0
+        )
+        if not current_platform.can_update_inplace():
+            logits = lora_output
+        # Remove paddings in vocab (if any).
+        logits = logits[:, : self.base_layer.vocab_size]
+        return logits
+    def forward(self, *args, **kwargs):
+        return type(self.base_layer).forward(self, *args, **kwargs)
+    @classmethod
+    def can_replace_layer(
+        cls,
+        source_layer: nn.Module,
+        lora_config: LoRAConfig,
+        packed_modules_list: list,
+        model_config: PretrainedConfig | None,
+    ) -> bool:
+        # Special handling for the LogitsProcessor.
+        return False

vllm/lora/layers/replicated_linear.py ADDED Viewed

@@ -0,0 +1,70 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import torch
+import torch.nn as nn
+from transformers import PretrainedConfig
+from vllm.config.lora import LoRAConfig
+from vllm.model_executor.layers.linear import ReplicatedLinear
+from .base_linear import BaseLinearLayerWithLoRA
+class ReplicatedLinearWithLoRA(BaseLinearLayerWithLoRA):
+    def __init__(self, base_layer: ReplicatedLinear) -> None:
+        super().__init__(
+            base_layer,
+        )
+        # To ensure interface compatibility, set to 1 always.
+        self.output_size = self.base_layer.output_size
+        self.n_slices = 1
+    def forward(
+        self, input_: torch.Tensor
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor | None]:
+        """Forward of ReplicatedLinearWithLoRA
+        Args:
+            input_: Tensor whose last dimension is `input_size`.
+        Returns:
+            - output
+            - bias
+        """
+        bias = self.base_layer.bias if not self.base_layer.skip_bias_add else None
+        # Matrix multiply.
+        output = self.apply(input_, bias)
+        output_bias = self.base_layer.bias if self.base_layer.skip_bias_add else None
+        if not self.base_layer.return_bias:
+            return output
+        return output, output_bias
+    # ReplicatedLinear should always be replaced, regardless of the fully
+    # sharded LoRAs setting, because it is, by definition, copied per GPU.
+    @classmethod
+    def can_replace_layer(
+        cls,
+        source_layer: nn.Module,
+        lora_config: LoRAConfig,
+        packed_modules_list: list,
+        model_config: PretrainedConfig | None,
+    ) -> bool:
+        return type(source_layer) is ReplicatedLinear
+    def slice_lora_a(
+        self, lora_a: torch.Tensor | list[torch.Tensor | None]
+    ) -> torch.Tensor | list[torch.Tensor | None]:
+        """Slice lora a if splitting for tensor parallelism."""
+        return lora_a
+    def slice_lora_b(
+        self, lora_b: torch.Tensor | list[torch.Tensor | None]
+    ) -> torch.Tensor | list[torch.Tensor | None]:
+        """Slice lora b if splitting with tensor parallelism."""
+        return lora_b

vllm/lora/layers/row_parallel_linear.py ADDED Viewed

@@ -0,0 +1,181 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import torch
+import torch.nn as nn
+from transformers import PretrainedConfig
+from vllm.config.lora import LoRAConfig
+from vllm.distributed import (
+    split_tensor_along_last_dim,
+    tensor_model_parallel_all_reduce,
+)
+from vllm.model_executor.layers.linear import RowParallelLinear
+from vllm.platforms import current_platform
+from .base_linear import BaseLinearLayerWithLoRA
+from .utils import _fully_sharded_can_replace, _not_fully_sharded_can_replace
+class RowParallelLinearWithLoRA(BaseLinearLayerWithLoRA):
+    def __init__(self, base_layer: RowParallelLinear) -> None:
+        super().__init__(base_layer)
+        # reset input_size
+        self.input_size = self.base_layer.input_size_per_partition
+        self.output_size = self.base_layer.output_size
+        # There is only one LoRA layer.
+        self.n_slices = 1
+    def slice_lora_a(self, lora_a: torch.Tensor) -> torch.Tensor:
+        shard_size = self.input_size
+        start_idx = self.tp_rank * shard_size
+        end_idx = (self.tp_rank + 1) * shard_size
+        lora_a = lora_a[:, start_idx:end_idx]
+        return lora_a
+    def slice_lora_b(self, lora_b: torch.Tensor) -> torch.Tensor:
+        return lora_b
+    def forward(
+        self, input_: torch.Tensor
+    ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor | None]:
+        """Forward of RowParallelLinear
+        Args:
+            input_: tensor whose last dimension is `input_size`. If
+                    `input_is_parallel` is set, then the last dimension
+                    is `input_size // tp_size`.
+        Returns:
+            - output
+            - bias
+        """
+        # set up backprop all-reduce.
+        if self.base_layer.input_is_parallel:
+            input_parallel = input_
+        else:
+            # TODO: simplify code below
+            splitted_input = split_tensor_along_last_dim(
+                input_, num_partitions=self.tp_size
+            )
+            input_parallel = splitted_input[self.tp_rank].contiguous()
+        # Matrix multiply.
+        output_parallel = self.apply(input_parallel)
+        if self.base_layer.reduce_results and self.tp_size > 1:
+            output_ = tensor_model_parallel_all_reduce(output_parallel)
+        else:
+            output_ = output_parallel
+        if not self.base_layer.skip_bias_add:
+            output = (
+                output_ + self.base_layer.bias
+                if self.base_layer.bias is not None
+                else output_
+            )
+            output_bias = None
+        else:
+            output = output_
+            output_bias = self.base_layer.bias
+        if not self.base_layer.return_bias:
+            return output
+        return output, output_bias
+    @classmethod
+    @_not_fully_sharded_can_replace
+    def can_replace_layer(
+        cls,
+        source_layer: nn.Module,
+        lora_config: LoRAConfig,
+        packed_modules_list: list,
+        model_config: PretrainedConfig | None,
+    ) -> bool:
+        return type(source_layer) is RowParallelLinear
+# The following layer is based on the tensor parallelism strategy given in
+# Y. Sheng et al., S-LoRA: Serving Thousands of Concurrent LoRA Adapters. 2023,
+# https://arxiv.org/abs/2311.03285.
+class RowParallelLinearWithShardedLoRA(RowParallelLinearWithLoRA):
+    """
+    Differs from RowParallelLinearWithLoRA by slicing the
+    LoRA B's also.
+    Based on S-LoRA, slicing happens along the output dim.
+    This yields a combined partial sum from the row parallel base
+    layer and column partitioned output from the LoRA.
+    """
+    # Keep only this rank's rows of lora_b. The shard size is read from the
+    # already-allocated lora_b_stacked buffer (dim 2 of its first entry).
+    def slice_lora_b(self, lora_b: torch.Tensor) -> torch.Tensor:
+        shard_size = self.lora_b_stacked[0].shape[2]
+        start_idx = self.tp_rank * shard_size
+        end_idx = (self.tp_rank + 1) * shard_size
+        lora_b = lora_b[start_idx:end_idx, :]
+        return lora_b
+    # Base-layer matmul followed by the LoRA shrink / all-reduce / expand
+    # sequence. NOTE: `bias` is accepted but never used in this override.
+    # The returned tensor has the same shape as the base layer's output.
+    def apply(self, x: torch.Tensor, bias: torch.Tensor | None = None) -> torch.Tensor:
+        output = self.base_layer.quant_method.apply(self.base_layer, x)
+        # Flatten both tensors to 2-D for the kernels, remembering the
+        # original output shape so it can be restored at the end.
+        x = x.view(-1, x.shape[-1])
+        output, out_orig_shape = output.view(-1, output.shape[-1]), output.shape
+        # float32 scratch buffer for the shrink result; its last dim is taken
+        # from lora_a_stacked[0].shape[2] (presumably the LoRA rank; confirm).
+        buffer = torch.zeros(
+            (self.n_slices, x.shape[0], self.lora_a_stacked[0].shape[2]),
+            dtype=torch.float32,
+            device=x.device,
+        )
+        shrunk_buffer: torch.Tensor | None = self.punica_wrapper.add_shrink(
+            buffer, x, self.lora_a_stacked, 1.0
+        )
+        # Platforms that cannot update in place return the result instead of
+        # writing it into `buffer`.
+        if not current_platform.can_update_inplace():
+            buffer = shrunk_buffer
+        # Each rank only saw its input shard, so the shrink results are
+        # summed across ranks before the expand step.
+        if self.tp_size > 1:
+            buffer = tensor_model_parallel_all_reduce(buffer)
+        # following S-LoRA, allows the fusing of all_gather and all_reduce
+        # by adding the column partitioned lora output to a slice of output
+        # tensor, which is a partial sum due to row parallel. All that
+        # remains is a standard all_reduce. User should be aware though that
+        # the output is not the same as a normal row_parallel, it should be
+        # reduced before being used
+        # NOTE offset are based on the rank.
+        shard_size = self.lora_b_stacked[0].shape[2]
+        offset_start = self.tp_rank * shard_size
+        lora_output: torch.Tensor | None = self.punica_wrapper.add_expand(
+            output,
+            buffer,
+            self.lora_b_stacked,
+            self.output_slices,
+            offset_start=offset_start,
+            add_input=True,
+        )
+        # Same in-place vs. returned-result handling as for the shrink step.
+        if not current_platform.can_update_inplace():
+            output = lora_output
+        output = output.view(*out_orig_shape)
+        return output
+    # Eligible only when lora_config.fully_sharded_loras is set (added by the
+    # _fully_sharded_can_replace decorator) and the parent's type check passes.
+    @classmethod
+    @_fully_sharded_can_replace
+    def can_replace_layer(
+        cls,
+        source_layer: nn.Module,
+        lora_config: LoRAConfig,
+        packed_modules_list: list,
+        model_config: PretrainedConfig | None,
+    ) -> bool:
+        # specifying kwargs so they can be easily accessed in decorator
+        # decorate=False is consumed by the parent's
+        # _not_fully_sharded_can_replace wrapper, which then skips its
+        # "not fully sharded" condition and leaves only the type check.
+        return super().can_replace_layer(
+            source_layer=source_layer,
+            lora_config=lora_config,
+            packed_modules_list=packed_modules_list,
+            model_config=model_config,
+            decorate=False,
+        )

vllm/lora/layers/utils.py ADDED Viewed

@@ -0,0 +1,65 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import functools
+from dataclasses import dataclass
+import torch
+import torch.nn as nn
+@dataclass
+class LoRAMapping:
+    index_mapping: tuple[int, ...]
+    prompt_mapping: tuple[int, ...]
+    is_prefill: bool = False
+    def __post_init__(self):
+        self.index_mapping = tuple(self.index_mapping)
+        self.prompt_mapping = tuple(self.prompt_mapping)
+def _get_lora_device(base_layer: nn.Module) -> torch.device:
+    # code borrowed from https://github.com/fmmoret/vllm/blob/fm-support-lora-on-quantized-models/vllm/lora/layers.py#L34
+    """Returns the device for where to place the LoRA tensors."""
+    # unquantizedLinear
+    if hasattr(base_layer, "weight"):
+        return base_layer.weight.device
+    # Compressed Tensor
+    elif hasattr(base_layer, "weight_packed"):
+        return base_layer.weight_packed.device
+    # GPTQ/AWQ
+    elif hasattr(base_layer, "qweight"):
+        return base_layer.qweight.device
+    # HQQ marlin
+    elif hasattr(base_layer, "W_q"):
+        return base_layer.W_q.device
+    else:
+        raise ValueError(f"Unsupported base layer: {base_layer}")
+def _not_fully_sharded_can_replace(can_replace):
+    """
+    decorator which adds the condition of not using fully sharded loras
+    intended to wrap can_replace_layer()
+    """
+    def dec(*args, **kwargs):
+        decorate = kwargs.pop("decorate") if "decorate" in kwargs else True
+        condition = not kwargs["lora_config"].fully_sharded_loras if decorate else True
+        return can_replace(*args, **kwargs) and condition
+    return dec
+def _fully_sharded_can_replace(can_replace):
+    """
+    decorator which adds the condition of fully sharded loras
+    intended to wrap can_replace_layer()
+    """
+    def dec(*args, **kwargs):
+        return (
+            can_replace(*args, **kwargs) and kwargs["lora_config"].fully_sharded_loras
+        )
+    return dec