PyPI - vllm-cpu - Versions diffs - 0.8.5.post2__cp310-cp310-manylinux_2_17_x86_64.whl - Mend

vllm-cpu 0.8.5.post2__cp310-cp310-manylinux_2_17_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of vllm-cpu might be problematic. Click here for more details.

Files changed (1103)

vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py ADDED Viewed

@@ -0,0 +1,20 @@
+# SPDX-License-Identifier: Apache-2.0
+from .compressed_tensors_scheme import CompressedTensorsScheme
+from .compressed_tensors_w4a16_24 import (W4A16SPARSE24_SUPPORTED_BITS,
+                                          CompressedTensorsW4A16Sparse24)
+from .compressed_tensors_w8a8_fp8 import CompressedTensorsW8A8Fp8
+from .compressed_tensors_w8a8_int8 import CompressedTensorsW8A8Int8
+from .compressed_tensors_w8a16_fp8 import CompressedTensorsW8A16Fp8
+from .compressed_tensors_wNa16 import (WNA16_SUPPORTED_BITS,
+                                       CompressedTensorsWNA16)
+from .compressed_tensors_24 import CompressedTensors24  # isort: skip
+# Names re-exported by the schemes package: the scheme classes imported above
+# plus the supported-bit-width lists of the WNA16 and W4A16 2:4-sparse schemes.
+__all__ = [
+    "CompressedTensorsScheme", "CompressedTensorsWNA16",
+    "CompressedTensorsW8A16Fp8", "CompressedTensorsW4A16Sparse24",
+    "CompressedTensorsW8A8Int8", "CompressedTensorsW8A8Fp8",
+    "WNA16_SUPPORTED_BITS", "W4A16SPARSE24_SUPPORTED_BITS",
+    "CompressedTensors24"
+]

vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_24.py ADDED Viewed

@@ -0,0 +1,357 @@
+# SPDX-License-Identifier: Apache-2.0
+from typing import Any, Callable, Dict, List, Optional, Tuple
+import torch
+from compressed_tensors import CompressionFormat, ModelCompressor
+from compressed_tensors.quantization import (QuantizationArgs,
+                                             QuantizationStrategy,
+                                             QuantizationType)
+from compressed_tensors.utils import combine_shards
+from vllm import _custom_ops as ops
+from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
+                                               QKVParallelLinear)
+from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
+    CompressedTensorsScheme)
+from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
+    convert_to_channelwise, sparse_cutlass_supported)
+from vllm.model_executor.parameter import (BasevLLMParameter,
+                                           ChannelQuantScaleParameter,
+                                           ModelWeightParameter,
+                                           PerTensorScaleParameter)
+# The scheme class is the only name listed for star-imports of this module.
+__all__ = ["CompressedTensors24"]
+class CompressedTensors24(CompressedTensorsScheme):
+    def __init__(
+        self,
+        quantized: bool = False,
+        weight_quant: Optional[QuantizationArgs] = None,
+        input_quant: Optional[QuantizationArgs] = None,
+        model_compression_config: Optional[Dict[str, Any]] = None,
+    ):
+        self.quantized = quantized
+        self.weight_quant = weight_quant
+        self.input_quant = input_quant
+        self.model_compressor = (
+            ModelCompressor.from_compression_config(model_compression_config)
+            if model_compression_config is not None else None)
+        self.do_sparse_decompress = (
+            self.model_compressor is not None
+            and self.model_compressor.sparsity_config.format
+            == CompressionFormat.sparse_24_bitmask.value)
+    @classmethod
+    def get_min_capability(cls) -> int:
+        # Only cutlass 3.x kernels are implemented so far
+        return 90
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        input_size: int,
+        output_partition_sizes: List[int],
+        input_size_per_partition: int,
+        params_dtype: torch.dtype,
+        weight_loader: Callable,
+        **kwargs,
+    ):
+        if not sparse_cutlass_supported():
+            raise ValueError(
+                "Sparse CUTLASS not supported. vLLM must be built with "
+                "CUDA 12.2 or later to use this feature")
+        layer.logical_widths = output_partition_sizes
+        layer.input_size = input_size
+        layer.input_size_per_partition = input_size_per_partition
+        self.weights_dtype: torch.dtype = self._get_params_dtype(params_dtype)
+        # parameter to store uncompressed weight
+        weight = ModelWeightParameter(
+            data=torch.empty(
+                sum(output_partition_sizes),
+                input_size_per_partition,
+                dtype=self.weights_dtype,
+            ),
+            input_dim=1,
+            output_dim=0,
+            weight_loader=weight_loader,
+        )
+        if self.do_sparse_decompress:
+            assert all(partition_size % 8 == 0
+                       for partition_size in output_partition_sizes
+                       ), "All partitions must be divisible by 8 for "
+            "2:4 sparse compressed models"
+            shape = BasevLLMParameter(
+                data=torch.empty(2, 1, dtype=torch.int64),
+                weight_loader=weight_loader,
+            )
+            compressed_weight = ModelWeightParameter(
+                data=torch.empty(
+                    sum(output_partition_sizes),
+                    input_size_per_partition // 2,
+                    dtype=self.weights_dtype,
+                ),
+                input_dim=1,
+                output_dim=0,
+                weight_loader=weight_loader,
+            )
+            bitmask = ModelWeightParameter(
+                data=torch.empty(
+                    sum(output_partition_sizes),
+                    input_size_per_partition // 8,
+                    dtype=torch.uint8,
+                ),
+                input_dim=1,
+                output_dim=0,
+                weight_loader=weight_loader,
+            )
+            layer.register_parameter("shape", shape)
+            layer.register_parameter("compressed", compressed_weight)
+            layer.register_parameter("bitmask", bitmask)
+        # Check if quantized, not just 2:4 Sparse
+        if self.quantized:
+            if (self.weight_quant and self.weight_quant.strategy
+                    == QuantizationStrategy.CHANNEL.value):
+                weight_scale = ChannelQuantScaleParameter(
+                    data=torch.empty((sum(output_partition_sizes), 1),
+                                     dtype=torch.float32),
+                    output_dim=0,
+                    weight_loader=weight_loader,
+                )
+            else:
+                assert (self.weight_quant and self.weight_quant.strategy
+                        == QuantizationStrategy.TENSOR.value)
+                weight_scale = PerTensorScaleParameter(
+                    data=torch.empty(len(output_partition_sizes),
+                                     dtype=torch.float32),
+                    weight_loader=weight_loader,
+                )
+            layer.register_parameter("weight_scale", weight_scale)
+            # input quant will be non-none
+            if self.input_quant and not self.input_quant.dynamic:
+                # register input quant scale
+                assert (self.input_quant.strategy ==
+                        QuantizationStrategy.TENSOR.value)
+                input_scale = BasevLLMParameter(
+                    data=torch.empty(1, dtype=torch.float32),
+                    weight_loader=weight_loader,
+                )
+                layer.register_parameter("input_scale", input_scale)
+        else:
+            # for sparse-only, pass in 1 for weight/input scales
+            weight_scale = torch.nn.Parameter(data=torch.ones(
+                1, dtype=torch.float32),
+                                              requires_grad=False)
+            input_scale = torch.nn.Parameter(data=torch.ones(
+                1, dtype=torch.float32),
+                                             requires_grad=False)
+            layer.register_parameter("input_scale", input_scale)
+            layer.register_parameter("weight_scale", weight_scale)
+        layer.register_parameter("weight", weight)
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        """
+        Compress weights after loading. Store compressed weight and meta
+            tensor
+        :post-condition: layer.w_compressed and layer.meta are
+            set to the compressed weight and meta tensor in the
+            format expected by the Cutlass kernels
+        :param layer: The layer with the weights to be processed
+        """
+        if self.do_sparse_decompress:
+            layer.weight.data = self._decompress_bitmask_compressed_weight(
+                compressed=layer.compressed,
+                bitmask=layer.bitmask,
+                layer=layer,
+            )
+            # compressed and bitmask tensors
+            # are no longer needed after decompression
+            del layer.compressed
+            del layer.bitmask
+        # torch.compile workaround
+        if hasattr(layer, "input_scale"):
+            layer.input_scale = torch.nn.Parameter(layer.input_scale.data,
+                                                   requires_grad=False)
+        if self.weight_quant:
+            if self.weight_quant.strategy == QuantizationStrategy.TENSOR.value:
+                layer.weight_scale = torch.nn.Parameter(
+                    convert_to_channelwise(
+                        weight_scale=layer.weight_scale,
+                        logical_widths=layer.logical_widths,
+                    ),
+                    requires_grad=False,
+                )
+            else:
+                # torch.compile workaround
+                layer.weight_scale = torch.nn.Parameter(
+                    layer.weight_scale.data, requires_grad=False)
+        # Set all negative zero values to 0 prior to compression
+        if (layer.weight.dtype.is_floating_point
+                and layer.weight.dtype.itemsize >= 2):
+            layer.weight.data[layer.weight.data == -0.0] = 0.0
+        w_compressed, meta = ops.cutlass_sparse_compress(layer.weight.data)
+        layer.weight = torch.nn.Parameter(w_compressed, requires_grad=False)
+        layer.meta = torch.nn.Parameter(meta, requires_grad=False)
+    def apply_weights(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        """
+        Returns the output tensor for the layer with 2:4
+        sparse compressed weights, given the input tensor
+        and bias
+        :param layer: The layer with 2:4 sparse compressed
+            weights to be used for the computation
+        :param x: The input tensor to the layer
+        :param bias: The bias to be added to the output tensor
+        :return: The output tensor of the layer
+        """
+        if self.quantized:
+            scale = None
+            if hasattr(layer, "input_scale"):
+                scale = layer.input_scale
+            if self.weights_dtype == torch.int8:
+                ops_output = ops.scaled_int8_quant(x, scale=scale)
+                q_input = ops_output[0]
+                input_scale = ops_output[1]
+            else:
+                assert self.weights_dtype == torch.float8_e4m3fn
+                if scale is not None:
+                    q_input, input_scale = ops.scaled_fp8_quant(x, scale=scale)
+                else:
+                    q_input, input_scale = ops.scaled_fp8_quant(
+                        x, use_per_token_if_dynamic=True)
+        else:
+            # Not quantized, nothing to do with the input_scales, use as is
+            input_scale = layer.input_scale
+            q_input = x
+        out = ops.cutlass_scaled_sparse_mm(
+            a=q_input,
+            bt_nzs=layer.weight,
+            bt_meta=layer.meta,
+            scale_a=input_scale,
+            scale_b=layer.weight_scale,
+            out_dtype=x.dtype,
+            bias=bias,
+        )
+        assert out.is_contiguous()
+        return out
+    def _get_params_dtype(self, params_dtype: torch.dtype) -> torch.dtype:
+        if not self.quantized:
+            return params_dtype
+        assert self.weight_quant is not None
+        assert self.input_quant is not None
+        is_8_bits = self.weight_quant.num_bits == self.input_quant.num_bits == 8
+        if not is_8_bits:
+            raise ValueError("Cutlass only supports 8-bit quantization")
+        if (self.weight_quant.type == QuantizationType.FLOAT
+                and self.input_quant.type == QuantizationType.FLOAT):
+            return torch.float8_e4m3fn
+        if (self.weight_quant.type == QuantizationType.INT
+                and self.input_quant.type == QuantizationType.INT):
+            return torch.int8
+        raise ValueError("Quantization type not supported by Cutlass")
+    def _decompress_bitmask_compressed_weight(
+        self,
+        compressed: torch.Tensor,
+        bitmask: torch.Tensor,
+        layer: torch.nn.Module,
+    ) -> torch.Tensor:
+        """
+        Decompress a compressed 2:4 sparse weight tensor using the bitmask and
+        return the result.
+        This function also supports sharded decompression.
+        :param compressed: The 2:4 sparse weight tensor compressed using the
+            sparse-24-bitmask compressor. This is different from
+            `cutlass_sparse_compress` which uses a different scheme (2 bits for
+            every nonzero element that represent the coordinate within the block
+            of 4). The bitmask compression here uses a bitmask to indicate the
+            positions of non-zero elements.
+        :param bitmask: The 2:4 bitmask associated with the compressed weights,
+            representing the positions of non-zero elements in the compressed
+            tensor.
+        :param layer: The layer whose weights need to be processed after
+            loading.
+        :return: The decompressed 2:4 sparse weight tensor.
+        """
+        sparsity_compressor = self.model_compressor.sparsity_compressor
+        def _process_split(
+            bitmask_compressed_weight: torch.Tensor,
+            shape,
+            bitmask: torch.Tensor,
+        ) -> torch.Tensor:
+            weight_data = dict(
+                compressed=bitmask_compressed_weight,
+                shape=shape,
+                bitmask=bitmask,
+            )
+            return sparsity_compressor.decompress_weight(weight_data)
+        split_weights: List[torch.Tensor] = []
+        split_bitmask: List[torch.Tensor] = []
+        split_shape: List[Tuple[int, int]] = []
+        if isinstance(layer, (QKVParallelLinear, MergedColumnParallelLinear)):
+            split_weights = torch.split(compressed, layer.logical_widths)
+            split_bitmask = torch.split(bitmask, layer.logical_widths)
+            split_shape = [(out, layer.input_size_per_partition)
+                           for out in layer.logical_widths]
+        if split_weights:
+            decompressed_shards = [
+                _process_split(compressed_weight, shape, bitmask)
+                for compressed_weight, shape, bitmask in zip(
+                    split_weights, split_shape, split_bitmask)
+            ]
+            decompressed = combine_shards(decompressed_shards)
+        else:
+            decompressed = sparsity_compressor.decompress_weight(
+                dict(
+                    compressed=compressed,
+                    shape=(
+                        layer.logical_widths[0],
+                        layer.input_size_per_partition,
+                    ),
+                    bitmask=bitmask,
+                ))
+        return decompressed

vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_scheme.py ADDED Viewed

@@ -0,0 +1,54 @@
+# SPDX-License-Identifier: Apache-2.0
+from abc import ABC, abstractmethod
+from typing import Optional
+import torch
+# The abstract base class is the only name listed for star-imports.
+__all__ = ["CompressedTensorsScheme"]
+class CompressedTensorsScheme(ABC):
+    """
+    Abstract class used to describe the weight creation and forward pass
+    of different quantization schemes supported by CompressedTensors.
+    """
+    @classmethod
+    @abstractmethod
+    def get_min_capability(cls) -> int:
+        """
+        Get minimum device capability.
+        """
+        raise NotImplementedError
+    @abstractmethod
+    def create_weights(self, *args, **kwargs):
+        """
+        Weight creation for the particular scheme. Inputs to this function
+        """
+        raise NotImplementedError
+    @abstractmethod
+    def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor,
+                      bias: Optional[torch.Tensor]):
+        """
+        Run the forward pass for the particular scheme. This is where
+        scheme-specific dequant/quant steps/kernels should be applied.
+        :param layer: torch.nn.Module with the registered weights and
+            other parameters relevant to the particular scheme.
+        :param x: input to the layer
+        :param bias: bias parameter
+        """
+        raise NotImplementedError
+    @abstractmethod
+    def process_weights_after_loading(self, layer: torch.nn.Module):
+        """
+        Called after weight loading is complete for any cleanup that
+        needs to occur.
+        """
+        raise NotImplementedError

vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w4a16_24.py ADDED Viewed

@@ -0,0 +1,159 @@
+# SPDX-License-Identifier: Apache-2.0
+from typing import Callable, List, Optional
+import torch
+from torch.nn import Parameter
+from vllm import _custom_ops as ops
+from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
+    CompressedTensorsScheme)
+from vllm.model_executor.layers.quantization.gptq_marlin_24 import (
+    GPTQ_MARLIN_24_MAX_PARALLEL, GPTQ_MARLIN_24_MIN_THREAD_N)
+from vllm.model_executor.parameter import (BasevLLMParameter,
+                                           ChannelQuantScaleParameter,
+                                           GroupQuantScaleParameter,
+                                           PackedvLLMParameter)
+from vllm.scalar_type import scalar_types
+# The scheme class is the only name listed for star-imports of this module.
+__all__ = ["CompressedTensorsW4A16Sparse24"]
+# Maps each supported weight bit-width to the scalar type that
+# CompressedTensorsW4A16Sparse24 stores as ``quant_type``.
+W4A16SPARSE24_SUPPORTED_TYPES_MAP = {
+    4: scalar_types.uint4b8,
+}
+# Bit-widths accepted by CompressedTensorsW4A16Sparse24 (keys of the map).
+W4A16SPARSE24_SUPPORTED_BITS = list(W4A16SPARSE24_SUPPORTED_TYPES_MAP.keys())
+class CompressedTensorsW4A16Sparse24(CompressedTensorsScheme):
+    def __init__(self,
+                 strategy: str,
+                 num_bits: int,
+                 group_size: Optional[int] = None):
+        self.strategy = strategy
+        self.group_size = group_size
+        self.tile_size = 16
+        if num_bits not in W4A16SPARSE24_SUPPORTED_TYPES_MAP:
+            raise ValueError(
+                f"Unsupported num_bits = {num_bits}. "
+                f"Supported num_bits = {W4A16SPARSE24_SUPPORTED_BITS}")
+        self.quant_type = W4A16SPARSE24_SUPPORTED_TYPES_MAP[num_bits]
+        if self.strategy == "group" and self.group_size is None:
+            raise ValueError(
+                "group_size must be given when using strategy group")
+    @classmethod
+    def get_min_capability(cls) -> int:
+        # ampere + up
+        return 80
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        # required by torch.compile to be torch.nn.Parameter
+        layer.weight_packed = Parameter(layer.weight_packed.data,
+                                        requires_grad=False)
+        layer.scale_packed = Parameter(layer.scale_packed.data,
+                                       requires_grad=False)
+        layer.meta = Parameter(layer.meta.data, requires_grad=False)
+    def create_weights(self, layer: torch.nn.Module, input_size: int,
+                       output_partition_sizes: List[int],
+                       input_size_per_partition: int,
+                       params_dtype: torch.dtype, weight_loader: Callable,
+                       **kwargs):
+        assert params_dtype == torch.float16, (
+            "float16 is required for marlin24 compressed models. Set dtype=torch.float16"  # noqa: E501
+        )
+        pack_factor = 32 // self.quant_type.size_bits
+        output_size_per_partition = sum(output_partition_sizes)
+        qweight = PackedvLLMParameter(data=torch.empty(
+            input_size_per_partition // self.tile_size // 2,
+            output_size_per_partition * self.tile_size // pack_factor,
+            dtype=torch.int32,
+        ),
+                                      input_dim=0,
+                                      output_dim=1,
+                                      packed_dim=1,
+                                      packed_factor=pack_factor,
+                                      marlin_tile_size=self.tile_size,
+                                      weight_loader=weight_loader)
+        input_groups = (1 if self.group_size is None else
+                        input_size_per_partition // self.group_size)
+        weight_scale_args = {
+            "data":
+            torch.empty(
+                input_groups,
+                output_size_per_partition,
+                dtype=params_dtype,
+            ),
+            "weight_loader":
+            weight_loader
+        }
+        if self.group_size is not None:
+            scales = GroupQuantScaleParameter(output_dim=1,
+                                              input_dim=0,
+                                              **weight_scale_args)
+        else:
+            scales = ChannelQuantScaleParameter(output_dim=1,
+                                                **weight_scale_args)
+        weight_shape = BasevLLMParameter(data=torch.empty(2,
+                                                          dtype=torch.int64),
+                                         weight_loader=weight_loader)
+        meta = PackedvLLMParameter(data=torch.empty(
+            input_size_per_partition // 8 // 2 // 2,
+            output_size_per_partition * 2,
+            dtype=torch.int16,
+        ),
+                                   input_dim=0,
+                                   output_dim=1,
+                                   packed_dim=1,
+                                   packed_factor=1,
+                                   marlin_tile_size=2,
+                                   weight_loader=weight_loader)
+        layer.register_parameter("weight_packed", qweight)
+        layer.register_parameter("weight_shape", weight_shape)
+        layer.register_parameter("scale_packed", scales)
+        layer.register_parameter("meta", meta)
+        max_workspace_size = (
+            output_size_per_partition //
+            GPTQ_MARLIN_24_MIN_THREAD_N) * GPTQ_MARLIN_24_MAX_PARALLEL
+        workspace = Parameter(torch.zeros(max_workspace_size, dtype=torch.int),
+                              requires_grad=False)
+        layer.workspace = workspace
+    def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor,
+                      bias: Optional[torch.Tensor]) -> torch.Tensor:
+        qweight = layer.weight_packed
+        meta = layer.meta
+        scales = layer.scale_packed
+        workspace = layer.workspace
+        x_2d = x.view(-1, x.shape[-1])
+        size_m = x_2d.shape[0]
+        size_k = x_2d.shape[1]
+        size_n = scales.shape[1]
+        output_2d = ops.gptq_marlin_24_gemm(x_2d, qweight, meta, scales,
+                                            workspace, self.quant_type, size_m,
+                                            size_n, size_k)
+        output = output_2d.view(x.shape[:-1] + (output_2d.shape[1], ))
+        if bias is not None:
+            output.add_(bias)  # In-place add
+        return output