PyPI - vllm-cpu-amxbf16 - Versions diffs - 0.11.2.post2__cp310-cp310-manylinux_2_17_x86_64.whl - Mend

vllm-cpu-amxbf16 0.11.2.post2__cp310-cp310-manylinux_2_17_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (1536) hide show

vllm/model_executor/layers/quantization/gptq_marlin_24.py ADDED Viewed

@@ -0,0 +1,320 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from typing import Any, Optional
+import torch
+from torch.nn.parameter import Parameter
+from vllm import _custom_ops as ops
+from vllm.logger import init_logger
+from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase
+from vllm.model_executor.layers.quantization import (
+    QuantizationConfig,
+    QuantizationMethods,
+)
+from vllm.model_executor.parameter import (
+    BasevLLMParameter,
+    ChannelQuantScaleParameter,
+    GroupQuantScaleParameter,
+    PackedvLLMParameter,
+)
+from vllm.scalar_type import scalar_types
+# Module-level logger for this quantization method.
+logger = init_logger(__name__)
+# Tile size used by the Marlin kernels.
+GPTQ_MARLIN_24_TILE = 16
+# Minimum out_features granularity: create_weights rejects an output partition
+# whose size is not divisible by this value.
+GPTQ_MARLIN_24_MIN_THREAD_N = 128
+# Minimum in_features granularity: create_weights rejects an input partition
+# whose size is not divisible by this value.
+GPTQ_MARLIN_24_MIN_THREAD_K = 128
+# Max parallel problems to solve at once (improves large batch performance);
+# also used as the multiplier when sizing the kernel workspace.
+GPTQ_MARLIN_24_MAX_PARALLEL = 64
+# Scalar types accepted by GPTQMarlin24Config.
+GPTQ_MARLIN_24_SUPPORTED_QUANT_TYPES = [scalar_types.uint4b8, scalar_types.uint8b128]
+# Group sizes accepted by GPTQMarlin24Config; -1 makes create_weights allocate
+# a single row of scales (channelwise) instead of one row per group.
+GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES = [-1, 128]
+class GPTQMarlin24Config(QuantizationConfig):
+    """Config class for Marlin24."""
+    def __init__(
+        self,
+        weight_bits: int,
+        group_size: int,
+    ) -> None:
+        super().__init__()
+        quant_type = {
+            4: scalar_types.uint4b8,
+            8: scalar_types.uint8b128,
+        }.get(weight_bits)
+        self.group_size = group_size
+        # Verify
+        if quant_type is None or quant_type not in GPTQ_MARLIN_24_SUPPORTED_QUANT_TYPES:
+            raise ValueError(
+                f"Marlin_24 does not support quant_type = {quant_type}. "
+                f"Only weight_bits = {GPTQ_MARLIN_24_SUPPORTED_QUANT_TYPES} "
+                "are supported."
+            )
+        if self.group_size not in GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES:
+            raise ValueError(
+                f"Marlin_24 does not support group_size = {self.group_size}. "
+                f"Only group_sizes = {GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES} "
+                "are supported."
+            )
+        self.quant_type = quant_type
+        # 4 Bits packed into 32 bit datatype.
+        self.pack_factor = 32 // self.quant_type.size_bits
+        # Tile size used by marlin kernels.
+        self.tile_size = 16
+        # Min out_features dim
+        self.min_n_threads = GPTQ_MARLIN_24_MIN_THREAD_N
+        # Min in_features dim
+        self.min_k_threads = GPTQ_MARLIN_24_MIN_THREAD_K
+        # Max parallel problems to solve at once (improves large
+        # batch performance)
+        self.max_parallel = GPTQ_MARLIN_24_MAX_PARALLEL
+        # Permutation length used by the marlin kernels.
+        self.perm_len = 1024
+    def __repr__(self) -> str:
+        return "Marlin24Config(quant_type={}, group_size={})".format(
+            self.quant_type, self.group_size
+        )
+    @classmethod
+    def get_name(cls) -> QuantizationMethods:
+        return "gptq_marlin_24"
+    @classmethod
+    def get_supported_act_dtypes(cls) -> list[torch.dtype]:
+        return [torch.half]
+    @classmethod
+    # Need to figure it out
+    def get_min_capability(cls) -> int:
+        return 80
+    @classmethod
+    def get_config_filenames(cls) -> list[str]:
+        return ["quantize_config.json"]
+    @classmethod
+    def from_config(cls, config: dict[str, Any]) -> "GPTQMarlin24Config":
+        weight_bits = cls.get_from_keys(config, ["bits"])
+        group_size = cls.get_from_keys(config, ["group_size"])
+        return cls(weight_bits, group_size)
+    @classmethod
+    def override_quantization_method(
+        cls, hf_quant_cfg, user_quant
+    ) -> QuantizationMethods | None:
+        is_marlin_24_format = hf_quant_cfg.get("checkpoint_format") == "marlin_24"
+        is_valid_user_quant = (
+            user_quant is None or user_quant == "gptq" or user_quant == "gptq_marlin_24"
+        )
+        if is_marlin_24_format and is_valid_user_quant:
+            msg = "The model is serialized in {} format. Using {} kernel.".format(
+                cls.get_name(), cls.get_name()
+            )
+            logger.info(msg)
+            return cls.get_name()
+        return None
+    def get_quant_method(
+        self, layer: torch.nn.Module, prefix: str
+    ) -> Optional["GPTQMarlin24LinearMethod"]:
+        if isinstance(layer, LinearBase):
+            return GPTQMarlin24LinearMethod(self)
+        return None
+class GPTQMarlin24LinearMethod(LinearMethodBase):
+    """Linear method for Marlin24.
+    Args:
+        quant_config: The Marlin24 quantization config.
+    """
+    def __init__(self, quant_config: GPTQMarlin24Config):
+        self.quant_config = quant_config
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        input_size_per_partition: int,
+        output_partition_sizes: list[int],
+        input_size: int,
+        output_size: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
+        del output_size  # Unused.
+        weight_loader = extra_weight_attrs["weight_loader"]
+        if params_dtype != torch.float16:
+            raise ValueError(
+                f"The params dtype must be float16, but got {params_dtype}"
+            )
+        # Validate output_size_per_partition
+        output_size_per_partition = sum(output_partition_sizes)
+        if output_size_per_partition % self.quant_config.min_n_threads != 0:
+            raise ValueError(
+                f"Weight output_size_per_partition = "
+                f"{output_size_per_partition} is not divisible by "
+                f"min_n_threads = {self.quant_config.min_n_threads}."
+            )
+        if output_size_per_partition % self.quant_config.pack_factor != 0:
+            raise ValueError(
+                f"Weight output_size_per_partition = "
+                f"{output_size_per_partition} is not divisible by "
+                f"pack_factor = {self.quant_config.pack_factor}."
+            )
+        # Validate input_size_per_partition
+        if input_size_per_partition % self.quant_config.min_k_threads != 0:
+            raise ValueError(
+                f"Weight input_size_per_partition = "
+                f"{input_size_per_partition} is not divisible by "
+                f"min_k_threads = {self.quant_config.min_k_threads}."
+            )
+        if (
+            self.quant_config.group_size != -1
+            and input_size_per_partition % self.quant_config.group_size != 0
+        ):
+            raise ValueError(
+                f"Weight input_size_per_partition = "
+                f"{input_size_per_partition} is not divisible by "
+                f"group_size = {self.quant_config.group_size}."
+            )
+        # Check that we have at least 4 tiles horizontally in the shard
+        num_tiles_per_perm = self.quant_config.perm_len // (
+            self.quant_config.tile_size**2
+        )
+        if output_size_per_partition % num_tiles_per_perm != 0:
+            raise ValueError("Each permutation group must reside on the same gpu")
+        # Quantized 4Bit weights packed into Int32.
+        qweight = PackedvLLMParameter(
+            data=torch.empty(
+                input_size_per_partition // self.quant_config.tile_size // 2,
+                output_size_per_partition
+                * self.quant_config.tile_size
+                // self.quant_config.pack_factor,
+                device="cuda",
+                dtype=torch.int32,
+            ),
+            input_dim=0,
+            output_dim=1,
+            packed_dim=1,
+            packed_factor=self.quant_config.pack_factor,
+            marlin_tile_size=self.quant_config.tile_size,
+            weight_loader=weight_loader,
+        )
+        # Meta
+        meta = PackedvLLMParameter(
+            data=torch.empty(
+                input_size_per_partition // 8 // 2 // 2,
+                output_size_per_partition * 2,
+                device="cuda",
+                dtype=torch.int16,
+            ),
+            input_dim=0,
+            output_dim=1,
+            packed_dim=1,
+            packed_factor=1,
+            marlin_tile_size=2,
+            weight_loader=weight_loader,
+        )
+        # Determine if channelwise or not
+        input_groups = (
+            1
+            if self.quant_config.group_size == -1
+            else input_size_per_partition // self.quant_config.group_size
+        )
+        weight_scale_args = {
+            "data": torch.empty(
+                input_groups,
+                output_size_per_partition,
+                device="cuda",
+                dtype=params_dtype,
+            ),
+            "weight_loader": weight_loader,
+        }
+        if input_groups == 1:
+            scales = ChannelQuantScaleParameter(output_dim=1, **weight_scale_args)
+        else:
+            scales = GroupQuantScaleParameter(
+                output_dim=1, input_dim=0, **weight_scale_args
+            )
+        # Allocate workspace (Used for internal locking mechanism)
+        max_workspace_size = (
+            output_size_per_partition // self.quant_config.min_n_threads
+        ) * self.quant_config.max_parallel
+        workspace = BasevLLMParameter(
+            data=torch.zeros(max_workspace_size, device="cuda", dtype=torch.int),
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("B_24", qweight)
+        layer.register_parameter("B_meta", meta)
+        layer.register_parameter("s", scales)
+        layer.register_parameter("workspace", workspace)
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        # required by torch.compile
+        layer.B_24 = Parameter(layer.B_24.data, requires_grad=False)
+        layer.s = Parameter(layer.s.data, requires_grad=False)
+        layer.B_meta = Parameter(layer.B_meta.data, requires_grad=False)
+        layer.workspace = Parameter(layer.workspace.data, requires_grad=False)
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: torch.Tensor | None = None,
+    ) -> torch.Tensor:
+        qweight = layer.B_24
+        meta = layer.B_meta
+        scales = layer.s
+        workspace = layer.workspace
+        x_2d = x.view(-1, x.shape[-1])
+        size_m = x_2d.shape[0]
+        size_k = x_2d.shape[1]
+        size_n = scales.shape[1]
+        output_2d = ops.gptq_marlin_24_gemm(
+            x_2d,
+            qweight,
+            meta,
+            scales,
+            workspace,
+            self.quant_config.quant_type,
+            size_m,
+            size_n,
+            size_k,
+        )
+        output = output_2d.view(x.shape[:-1] + (output_2d.shape[1],))
+        if bias is not None:
+            output.add_(bias)  # In-place add
+        return output

vllm/model_executor/layers/quantization/hqq_marlin.py ADDED Viewed

@@ -0,0 +1,371 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from typing import Any, Optional
+import torch
+from vllm import _custom_ops as ops
+from vllm.logger import init_logger
+from vllm.model_executor.layers.linear import (
+    LinearBase,
+    LinearMethodBase,
+    UnquantizedLinearMethod,
+)
+from vllm.model_executor.layers.quantization import QuantizationMethods
+from vllm.model_executor.layers.quantization.base_config import (
+    QuantizationConfig,
+    QuantizeMethodBase,
+)
+from vllm.model_executor.layers.quantization.utils.marlin_utils import (
+    GPTQ_MARLIN_MAX_PARALLEL,
+    GPTQ_MARLIN_MIN_THREAD_N,
+    marlin_make_empty_g_idx,
+    marlin_permute_bias,
+    marlin_permute_scales,
+)
+from vllm.model_executor.layers.quantization.utils.marlin_utils_test import (
+    MarlinWorkspace,
+)
+from vllm.model_executor.layers.quantization.utils.quant_utils import gptq_pack
+from vllm.model_executor.parameter import (
+    BasevLLMParameter,
+    GroupQuantScaleParameter,
+    PackedvLLMParameter,
+)
+from vllm.scalar_type import scalar_types
+# Module-level logger for the HQQ Marlin quantization method.
+logger = init_logger(__name__)
+class HQQMarlinConfig(QuantizationConfig):
+    """Config class for HQQ Marlin"""
+    def __init__(
+        self,
+        weight_bits: int,
+        group_size: int,
+        skip_modules: list[str] | None = None,
+    ) -> None:
+        super().__init__()
+        assert group_size == 64, "The only supported HQQ group size is currently 64."
+        assert weight_bits == 4, (
+            "The only supported HQQ quantization bitsize is currently 4."
+        )
+        self.weight_bits = weight_bits
+        self.group_size = group_size
+        self.pack_factor = 32 // weight_bits  # packed into int32 in GPTQ format
+        self.quant_type = scalar_types.uint4
+        self.skip_modules = skip_modules
+    def __repr__(self) -> str:
+        return (
+            f"HQQMarlinConfig(quant_type={self.quant_type}, "
+            f"group_size={self.group_size})"
+        )
+    @classmethod
+    def get_name(cls) -> QuantizationMethods:
+        return "hqq"
+    @classmethod
+    def get_supported_act_dtypes(cls) -> list[torch.dtype]:
+        return [torch.half, torch.bfloat16]
+    @classmethod
+    def get_min_capability(cls) -> int:
+        return 80
+    @classmethod
+    def get_config_filenames(cls) -> list[str]:
+        return ["quantize_config.json"]
+    @classmethod
+    def from_config(cls, config: dict[str, Any]) -> "HQQMarlinConfig":
+        wq_params = config["quant_config"]["weight_quant_params"]
+        weight_bits = cls.get_from_keys(wq_params, ["nbits"])
+        group_size = cls.get_from_keys(wq_params, ["group_size"])
+        skip_modules = config["skip_modules"]
+        return cls(weight_bits, group_size, skip_modules)
+    def is_layer_skipped(self, prefix: str) -> bool:
+        # Split the prefix into its dot-separated components
+        components = prefix.split(".")
+        # Check if any of the skip modules exactly matches any component
+        return self.skip_modules is not None and any(
+            module_name in components for module_name in self.skip_modules
+        )
+    def get_quant_method(
+        self, layer: torch.nn.Module, prefix: str
+    ) -> Optional["QuantizeMethodBase"]:
+        if isinstance(layer, LinearBase):
+            if self.is_layer_skipped(prefix):
+                return UnquantizedLinearMethod()
+            return HQQMarlinMethod(self)
+        return None
+# Empty HQQ parameter, will be ignored during loading
+class HQQEmptyParameter(BasevLLMParameter):
+    def load_merged_column_weight(self, loaded_weight: torch.Tensor, **kwargs):
+        pass
+    def load_row_parallel_weight(self, loaded_weight: torch.Tensor):
+        pass
+    def load_qkv_weight(self, loaded_weight: torch.Tensor, **kwargs):
+        pass
+def error_loader(param: torch.Tensor, loaded_weight: torch.Tensor) -> None:
+    raise ValueError("No loader provided for HQQ parameter!")
+# HQQ packing creates issues with sharding - therefore, prior to loading, we
+# repack to GPTQ. We also reshape the weights to their proper GPTQ shape.
+class HQQweightParameter(PackedvLLMParameter):
+    # unpack function from https://github.com/mobiusml/hqq
+    def unpack_4bit_u8(self, W_q: torch.Tensor) -> torch.Tensor:  # uint8/2 > uint8
+        assert self.weight_bits == 4, "Unsupported quant bitsize (must be 4)"
+        dtype = torch.uint8
+        step = W_q.shape[0]
+        tmp = torch.empty([2 * step, W_q.shape[1]], dtype=dtype, device=W_q.device)
+        tmp[:step] = (W_q & 0b11110000) >> 4
+        tmp[step:] = W_q & 0b00001111
+        return tmp
+    def __init__(self, packed_factor: int, packed_dim: int, weight_bits: int, **kwargs):
+        super().__init__(packed_factor, packed_dim, None, **kwargs)
+        self.weight_bits = weight_bits
+        self.input_shape = self.shape[self.input_dim] * self.packed_factor
+        self.output_shape = self.shape[self.output_dim]
+    def load_merged_column_weight(self, loaded_weight: torch.Tensor, **kwargs):
+        loaded_weight = self.unpack_4bit_u8(loaded_weight)
+        loaded_weight = loaded_weight.reshape(-1, self.input_shape).transpose(1, 0)
+        loaded_weight = gptq_pack(
+            loaded_weight,
+            self.weight_bits,
+            loaded_weight.shape[0],
+            loaded_weight.shape[1],
+        )
+        super().load_merged_column_weight(loaded_weight, **kwargs)
+    def load_row_parallel_weight(self, loaded_weight: torch.Tensor):
+        loaded_weight = self.unpack_4bit_u8(loaded_weight)
+        loaded_weight = loaded_weight.reshape(self.output_shape, -1).transpose(1, 0)
+        loaded_weight = gptq_pack(
+            loaded_weight,
+            self.weight_bits,
+            loaded_weight.shape[0],
+            loaded_weight.shape[1],
+        )
+        super().load_row_parallel_weight(loaded_weight)
+    def load_qkv_weight(self, loaded_weight: torch.Tensor, **kwargs):
+        loaded_weight = self.unpack_4bit_u8(loaded_weight)
+        loaded_weight = loaded_weight.reshape(-1, self.input_shape).transpose(1, 0)
+        loaded_weight = gptq_pack(
+            loaded_weight,
+            self.weight_bits,
+            loaded_weight.shape[0],
+            loaded_weight.shape[1],
+        )
+        super().load_qkv_weight(loaded_weight, **kwargs)
+# Zero points and scales in HQQ must also be reshaped to correspond to W_q's
+# GPTQ shape (transposed - we transpose them too when processing weights).
+class HQQZeroScaleParameter(GroupQuantScaleParameter):
+    def load_merged_column_weight(self, loaded_weight: torch.Tensor, **kwargs):
+        loaded_weight = loaded_weight.reshape(-1, self.shape[1])
+        super().load_merged_column_weight(loaded_weight, **kwargs)
+    def load_row_parallel_weight(self, loaded_weight: torch.Tensor):
+        loaded_weight = loaded_weight.reshape(self.shape[0], -1)
+        super().load_row_parallel_weight(loaded_weight)
+    def load_qkv_weight(self, loaded_weight: torch.Tensor, **kwargs):
+        loaded_weight = loaded_weight.reshape(-1, self.shape[1])
+        super().load_qkv_weight(loaded_weight, **kwargs)
+class HQQMarlinMethod(LinearMethodBase):
+    """Linear method for HQQ Marlin."""
+    def __init__(
+        self,
+        quant_config: HQQMarlinConfig,
+    ):
+        self.quant_config = quant_config
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        input_size_per_partition: int,
+        output_partition_sizes: list[int],
+        input_size: int,
+        output_size: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ) -> None:
+        self.output_size_per_partition = sum(output_partition_sizes)
+        self.input_size_per_partition = input_size_per_partition
+        weight_loader = extra_weight_attrs.get("weight_loader", error_loader)
+        self.scales_and_zp_size = (
+            input_size_per_partition // self.quant_config.group_size
+        )
+        qweight = HQQweightParameter(
+            data=torch.empty(
+                self.input_size_per_partition // self.quant_config.pack_factor,
+                self.output_size_per_partition,
+                dtype=torch.int32,
+            ),
+            input_dim=0,
+            output_dim=1,
+            packed_dim=0,
+            packed_factor=self.quant_config.pack_factor,
+            weight_bits=self.quant_config.weight_bits,
+            weight_loader=weight_loader,
+        )
+        zeros = HQQZeroScaleParameter(
+            data=torch.empty(
+                self.output_size_per_partition,
+                self.scales_and_zp_size,
+                dtype=params_dtype,
+            ),
+            input_dim=1,
+            output_dim=0,
+            weight_loader=weight_loader,
+        )
+        scales = HQQZeroScaleParameter(
+            data=torch.empty(
+                self.output_size_per_partition,
+                self.scales_and_zp_size,
+                dtype=params_dtype,
+            ),
+            input_dim=1,
+            output_dim=0,
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("W_q", qweight)
+        layer.register_parameter("zero", zeros)
+        layer.register_parameter("scale", scales)
+        # Ignore extra parameters in the HQQ model.
+        # To be added as needed.
+        ignore_parameters = (
+            "axis",
+            "channel_wise",
+            "compute_dtype",
+            "encoded_state_dict",
+            "group_size",
+            "nbits",
+            "offload_meta",
+            "optimize",
+            "packing",
+            "quant_scale",
+            "quant_zero",
+            "round_zero",
+            "shape",
+            "stores_quant_config",
+            "unpack_view_dtype",
+            "view_as_float",
+        )
+        for name in ignore_parameters:
+            layer.register_parameter(
+                name,
+                HQQEmptyParameter(data=torch.empty(0), weight_loader=weight_loader),
+            )
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        dev = layer.W_q.device
+        # Repack to Marlin
+        sort_indices = torch.empty(0, dtype=torch.int, device=dev)
+        marlin_w_q = ops.gptq_marlin_repack(
+            layer.W_q,
+            sort_indices,
+            self.input_size_per_partition,
+            self.output_size_per_partition,
+            self.quant_config.weight_bits,
+        ).to(dev)
+        marlin_s = marlin_permute_scales(
+            layer.scale.transpose(1, 0),
+            self.input_size_per_partition,
+            self.output_size_per_partition,
+            self.quant_config.group_size,
+        ).to(dev)
+        marlin_zp = marlin_permute_scales(
+            layer.zero.transpose(1, 0),
+            self.input_size_per_partition,
+            self.output_size_per_partition,
+            self.quant_config.group_size,
+        ).to(dev)
+        layer.g_idx = marlin_make_empty_g_idx(dev)
+        layer.g_idx_sort_indices = marlin_make_empty_g_idx(dev)
+        layer.marlin_qweight = marlin_w_q
+        layer.marlin_zeros = marlin_zp
+        layer.marlin_scales = marlin_s
+        if hasattr(layer, "bias") and layer.bias is not None:
+            layer.bias.data = marlin_permute_bias(layer.bias)
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: torch.Tensor | None = None,
+    ) -> torch.Tensor:
+        workspace = MarlinWorkspace(
+            self.output_size_per_partition,
+            GPTQ_MARLIN_MIN_THREAD_N,
+            GPTQ_MARLIN_MAX_PARALLEL,
+        )
+        scales = layer.marlin_scales
+        zeros = layer.marlin_zeros
+        orig_type = x.dtype
+        if orig_type != torch.float16:
+            x = x.to(torch.float16)
+            scales = scales.to(torch.float16)
+            zeros = zeros.to(torch.float16)
+        marlin_out = ops.gptq_marlin_gemm(
+            x,
+            None,
+            layer.marlin_qweight,
+            bias,
+            scales,
+            None,
+            zeros,
+            layer.g_idx,
+            layer.g_idx_sort_indices,
+            workspace.scratch,
+            scalar_types.uint4,
+            x.shape[0],
+            self.output_size_per_partition,
+            self.input_size_per_partition,
+            True,  # is_k_full
+            False,  # use atomic add
+            True,  # use 32-bit reduce
+            True,  # use float zp
+        )
+        if orig_type != torch.float16:
+            marlin_out = marlin_out.to(orig_type)
+        return marlin_out