PyPI - vllm-cpu - Versions diffs - 0.11.0.post2__cp312-cp312-manylinux_2_17_x86_64.whl - Mend

vllm-cpu 0.11.0.post2__cp312-cp312-manylinux_2_17_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (1398) hide show

vllm/model_executor/layers/quantization/kernels/mixed_precision/cutlass.py ADDED Viewed

@@ -0,0 +1,117 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from typing import Optional
+import torch
+from vllm import _custom_ops as ops
+from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8
+from vllm.model_executor.layers.quantization.utils.quant_utils import (
+    GroupShape)
+from vllm.model_executor.parameter import (BasevLLMParameter,
+                                           permute_param_layout_)
+from vllm.platforms import current_platform
+from vllm.scalar_type import scalar_types
+from .MPLinearKernel import MPLinearKernel, MPLinearLayerConfig
+class CutlassW4A8LinearKernel(MPLinearKernel):
+    """Linear kernel pairing int4 weights with fp8 (e4m3) activations via
+    the CUTLASS W4A8 custom ops.
+    Activations are quantized per token on every call; weights and group
+    scales are re-encoded once in `process_weights_after_loading`.
+    """
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        # dynamic per-tok fp8 activation quantization
+        self.quant_fp8 = QuantFP8(static=False,
+                                  group_shape=GroupShape.PER_TOKEN)
+    @classmethod
+    def get_min_capability(cls) -> int:
+        """Return the minimum device capability for this kernel (90)."""
+        return 90
+    @classmethod
+    def can_implement(cls,
+                      c: MPLinearLayerConfig) -> tuple[bool, Optional[str]]:
+        """Decide whether this kernel can serve the layer config `c`.
+        Returns `(True, None)` when every check passes, otherwise
+        `(False, reason)` for the first check that fails.
+        """
+        if not current_platform.is_cuda():
+            return False, "CUTLASS only supported on CUDA"
+        if not current_platform.is_device_capability(90):
+            return False, "CUTLASS W4A8 requires compute capability of 90 "\
+                "(Hopper)"
+        if c.act_type != torch.float8_e4m3fn:
+            return False, "CUTLASS W4A8 only supports FP8 (e4m3) activations"
+        if c.has_g_idx:
+            return False, "Act reordering not supported by CUTLASS W4A8"
+        if c.zero_points:
+            return False, "Zero points not supported by CUTLASS W4A8"
+        if c.weight_type != scalar_types.int4:
+            return False, f"Quant type ({c.weight_type}) not supported by "\
+                           "CUTLASS W4A8, only supported int4"
+        # TODO(czhu): support -1 (column-wise)
+        if c.group_size != 128:
+            return False, "Only group_size 128 is supported"
+        in_features, out_features = c.partition_weight_shape
+        if in_features % 128 or out_features % 128:
+            return False, "K and N must be divisible by 128, got "\
+                           f"{c.partition_weight_shape}"
+        if c.out_type != torch.bfloat16:
+            # The two pieces used to be joined without a separator, which
+            # produced "...currently supportedgot c.out_type=...".
+            return False, "Only bfloat16 output type currently supported, "\
+                           f"got {c.out_type=}"
+        return True, None
+    # note assumes that
+    #  `weight_packed` is: {input_dim = 0, output_dim = 1, packed_dim = 0}
+    #  `weight_scale`  is: {input_dim = 0, output_dim = 1}
+    def process_weights_after_loading(self, layer: torch.nn.Module):
+        """Re-encode the loaded weights and group scales of `layer` into
+        the form consumed by `ops.cutlass_w4a8_mm`.
+        The parameters named by `self.w_q_name`, `self.w_s_name` and
+        "weight_chan_scale" are each passed through `_transform_param`.
+        """
+        # TODO(czhu): optimize speed/mem usage
+        def transform_w_q(x):
+            assert isinstance(x, BasevLLMParameter)
+            permute_param_layout_(x, input_dim=0, output_dim=1, packed_dim=0)
+            # .t().contiguous().t() keeps the shape but changes the
+            # underlying memory order before encoding.
+            x.data = ops.cutlass_encode_and_reorder_int4b(
+                x.data.t().contiguous().t())
+            return x
+        def transform_w_s(x):
+            assert isinstance(x, BasevLLMParameter)
+            permute_param_layout_(x, input_dim=0, output_dim=1)
+            # Group scales are cast to fp8 before being packed.
+            x.data = x.data.contiguous().to(torch.float8_e4m3fn)
+            x.data = ops.cutlass_pack_scale_fp8(x.data)
+            return x
+        # Encode/reorder weights and pack scales
+        self._transform_param(layer, self.w_q_name, transform_w_q)
+        self._transform_param(layer, self.w_s_name, transform_w_s)
+        # Identity transform: the channel scales are left unchanged.
+        # NOTE(review): presumably done only for the side effects of
+        # `_transform_param` (defined outside this file) - confirm.
+        self._transform_param(layer, "weight_chan_scale", lambda x: x)
+    def apply_weights(self,
+                      layer: torch.nn.Module,
+                      x: torch.Tensor,
+                      bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+        """Quantize `x` to fp8 per token and multiply it by the int4 weights.
+        `x` is flattened to 2-D over its last dimension for the matmul and
+        the result is reshaped back so that only the last dimension changes
+        (to `partition_weight_shape[1]`). `bias`, when given, is added to
+        the matmul output in place.
+        """
+        c = self.config
+        w_q, w_s, _, _ = self._get_weight_params(layer)
+        w_ch_s = layer.weight_chan_scale
+        x_2d = x.reshape(-1, x.shape[-1])
+        out_shape = x.shape[:-1] + (c.partition_weight_shape[1], )
+        x_2d, act_scales = self.quant_fp8(x_2d)
+        output = ops.cutlass_w4a8_mm(a=x_2d,
+                                     b_q=w_q,
+                                     b_group_scales=w_s,
+                                     b_group_size=c.group_size,
+                                     a_token_scales=act_scales,
+                                     b_channel_scales=w_ch_s)
+        if bias is not None:
+            output.add_(bias)  # In-place add
+        return output.reshape(out_shape)

vllm/model_executor/layers/quantization/kernels/mixed_precision/dynamic_4bit.py ADDED Viewed

@@ -0,0 +1,92 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from typing import Optional
+import torch
+from vllm.model_executor.layers.quantization.utils import replace_parameter
+from vllm.platforms import CpuArchEnum, current_platform
+from vllm.scalar_type import scalar_types
+from .MPLinearKernel import MPLinearKernel, MPLinearLayerConfig
+class Dynamic4bitLinearKernel(MPLinearKernel):
+    """CPU-only kernel for int4 weights built on the
+    `torch.ops.aten._dyn_quant_*_4bit` operators."""
+    SUPPORTED_QUANT_TYPES = [scalar_types.int4]
+    @classmethod
+    def get_min_capability(cls) -> int:
+        """Return the minimum capability value for this kernel (1)."""
+        return 1
+    @classmethod
+    def can_implement(cls,
+                      c: MPLinearLayerConfig) -> tuple[bool, Optional[str]]:
+        """Report whether config `c` is usable, as `(ok, reason_if_not)`.
+        Checks run in a fixed order and the first failure is returned.
+        """
+        if not current_platform.is_cpu():
+            return False, "Only CPU is supported"
+        if c.weight_type not in cls.SUPPORTED_QUANT_TYPES:
+            return False, f"Unsupported quant type {c.weight_type}"
+        on_arm = current_platform.get_cpu_architecture() == CpuArchEnum.ARM
+        if on_arm and c.act_type != torch.float32:
+            return False, ("Dynamic4bitLinearKernel on Arm requires"
+                           " Float32 activations")
+        in_features = c.full_weight_shape[0]
+        if in_features % c.group_size != 0:
+            return False, (
+                f"Group size ({c.group_size}) does not evenly divide"
+                " the number of input features "
+                f"({in_features})")
+        # On Arm the matmul operator must exist in this PyTorch build;
+        # looking up a missing op raises AttributeError, which hasattr
+        # turns into False.
+        if on_arm and not hasattr(torch.ops.aten, "_dyn_quant_matmul_4bit"):
+            return False, (
+                f"PyTorch {torch.__version__} does not support"
+                " _dyn_quant_matmul_4bit. Install a newer version")
+        return True, None
+    def process_weights_after_loading(self, layer: torch.nn.Module):
+        """Repack the loaded weights, scales and bias of `layer` with
+        `_dyn_quant_pack_4bit_weight`, store the result under
+        `self.w_q_name` and clear the scale attribute."""
+        cfg = self.config
+        in_features, out_features = cfg.partition_weight_shape
+        group_size = cfg.group_size
+        # Add 8 to every value, then fuse each pair of adjacent columns
+        # into one byte: odd column -> high nibble, even column -> low.
+        # NOTE(review): presumably maps signed int4 into [0, 15] - confirm.
+        shifted = getattr(layer, self.w_q_name).add(8)
+        high_nibbles = shifted[::, 1::2] << 4
+        low_nibbles = shifted[::, ::2]
+        uint8_packed = (high_nibbles | low_nibbles).to(torch.uint8)
+        scales = getattr(layer, self.w_s_name)
+        # Handle scaling factors for partitioned weights
+        if group_size == in_features:
+            # Channel-wise scales; the Float32 & Bfloat16 variants require
+            # float32 scales and a float32 bias.
+            scales = scales.to(torch.float32).view(-1, 1)
+            if layer.bias is not None:
+                layer.bias = layer.bias.to(torch.float32)
+        else:
+            # KleidiAI kernel requires bfloat16 scales with groupwise scheme
+            scales = scales.to(torch.bfloat16)
+        # Repack weights as per kernel requirement
+        packed = torch.ops.aten._dyn_quant_pack_4bit_weight(
+            uint8_packed, scales, layer.bias, group_size, in_features,
+            out_features)
+        packed_param = torch.nn.Parameter(packed, requires_grad=False)
+        replace_parameter(layer, self.w_q_name, packed_param)
+        setattr(layer, self.w_s_name, None)
+    def apply_weights(self,
+                      layer: torch.nn.Module,
+                      x: torch.Tensor,
+                      bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+        """Run `_dyn_quant_matmul_4bit` on `x` against the packed weights.
+        `bias` is not read here; `layer.bias` was already handed to the
+        packing op in `process_weights_after_loading`.
+        """
+        cfg = self.config
+        in_features, out_features = cfg.partition_weight_shape
+        flat_x = x.reshape(-1, x.shape[-1])
+        out_shape = x.shape[:-1] + (out_features, )
+        packed_w = getattr(layer, self.w_q_name)
+        result = torch.ops.aten._dyn_quant_matmul_4bit(
+            flat_x, packed_w, cfg.group_size, in_features, out_features)
+        return result.reshape(out_shape)

vllm/model_executor/layers/quantization/kernels/mixed_precision/exllama.py ADDED Viewed

@@ -0,0 +1,143 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from typing import Optional
+import torch
+from vllm import _custom_ops as ops
+from vllm.model_executor.layers.quantization.utils.quant_utils import (
+    pack_quantized_values_into_int32)
+from vllm.model_executor.parameter import (BasevLLMParameter,
+                                           permute_param_layout_)
+from vllm.scalar_type import scalar_types
+from .MPLinearKernel import MPLinearKernel, MPLinearLayerConfig
+class ExllamaLinearKernel(MPLinearKernel):
+    """Mixed-precision linear kernel backed by the `gptq_shuffle` /
+    `gptq_gemm` custom ops (Exllama)."""
+    SUPPORTED_QUANT_TYPES = [scalar_types.uint4b8, scalar_types.uint8b128]
+    # In theory supports `scalar_types.uint2b2, scalar_types.uint3b4` too but
+    # currently untested so not added to the list
+    @classmethod
+    def get_min_capability(cls) -> int:
+        """Return the minimum device capability for this kernel (60)."""
+        return 60
+    @classmethod
+    def can_implement(cls,
+                      c: MPLinearLayerConfig) -> tuple[bool, Optional[str]]:
+        """Decide whether this kernel can serve the layer config `c`.
+        Returns `(True, None)` when every check passes, otherwise
+        `(False, reason)` for the first check that fails.
+        """
+        if c.has_g_idx and\
+            c.partition_weight_shape[0] != c.full_weight_shape[0]:
+            return False, "Act reordering currently not supported by Exllama, "\
+                          "when the input features are partitioned across "\
+                          "devices"
+        if c.partition_weight_shape[1] % (32 // c.weight_type.size_bits) != 0:
+            return False, "Output features must be a multiple of the pack " \
+                            "factor (32 / num_bits) so that we can correctly " \
+                            "pack the zero points"
+        if c.act_type != torch.float16:
+            return False, "Exllama only supports float16 activations"
+        if c.weight_type not in cls.SUPPORTED_QUANT_TYPES:
+            return False, f"Quant type ({c.weight_type}) not supported by "\
+                           "Exllama, supported types are: "\
+                           f"{cls.SUPPORTED_QUANT_TYPES}"
+        if c.full_weight_shape[0] % c.group_size != 0:
+            return False, f"Group size ({c.group_size}) does not evenly divide"\
+                           " the number of input features "\
+                           f"({c.full_weight_shape[0]})"
+        return True, None
+    def process_weights_after_loading(self, layer: torch.nn.Module):
+        """Prepare the parameters of `layer` for `ops.gptq_gemm`.
+        Synthesizes a packed zero-point tensor when the config has none,
+        turns g_idx into a permutation (or installs an empty placeholder),
+        shuffles the packed weights and casts the scales to the activation
+        dtype.
+        Raises:
+            NotImplementedError: if there are no zero points and the weight
+                type has no bias.
+        """
+        c = self.config
+        # Resolve the device up front: both the synthesized zero points and
+        # the empty g_idx placeholder below are allocated on it. It used to
+        # be bound only inside the `not c.zero_points` branch, so a config
+        # with zero points but without g_idx hit an UnboundLocalError.
+        device = getattr(layer, self.w_q_name).device
+        # For Exllama, we need to set a zero-point tensor if there is not one
+        if not c.zero_points:
+            self.w_zp_name = "qzeros"
+            groups = c.partition_weight_shape[0] // c.group_size
+            out_features = c.partition_weight_shape[1]
+            if c.weight_type.has_bias():
+                # if the type has a bias we have to create a zeros tensor that
+                # contains the bias values repeated for each group (-1 due to
+                # a bug in the original GPTQ checkpoint format leading to
+                # exllama kernel adding 1 to the zero points during inference)
+                # Documentation of the bug can be found here:
+                #  https://garden.danieldk.eu/GPTQ-Checkpoint-Format
+                zeros = torch.full((groups, out_features),
+                                   c.weight_type.bias - 1,
+                                   dtype=torch.int32,
+                                   device=device)
+            else:
+                raise NotImplementedError(
+                    "A 0 zero-point is not supported by Exllama due to "
+                    "a bug in the original GPTQ checkpoint format leading to "
+                    "exllama kernel adding 1 to the zero points during "
+                    "inference")
+            zeros = pack_quantized_values_into_int32(zeros,
+                                                     c.weight_type,
+                                                     packed_dim=1)
+            setattr(layer, self.w_zp_name,
+                    torch.nn.Parameter(zeros, requires_grad=False))
+        if c.has_g_idx:
+            def transform_w_g_idx(x):
+                # Exllama wants the permutation array instead of the group
+                # indices
+                return torch.argsort(x).to(torch.int)
+            self._transform_param(layer, self.w_gidx_name, transform_w_g_idx)
+        else:
+            # No act reordering: install an empty g_idx tensor.
+            self.w_gidx_name = "g_idx"
+            empty_g_idx = torch.nn.Parameter(torch.empty((0, ),
+                                                         dtype=torch.int,
+                                                         device=device),
+                                             requires_grad=False)
+            setattr(layer, self.w_gidx_name, empty_g_idx)
+        def transform_w_q(x):
+            assert isinstance(x, BasevLLMParameter)
+            assert self.w_gidx_name is not None
+            g_idx = getattr(layer, self.w_gidx_name)
+            permute_param_layout_(x, input_dim=0, output_dim=1, packed_dim=0)
+            x_cont = x.data.contiguous()
+            # The return value of gptq_shuffle is not used; x_cont is
+            # handed back afterwards.
+            ops.gptq_shuffle(x_cont, g_idx, c.weight_type.size_bits)
+            return x_cont
+        def transform_w_s(x):
+            assert isinstance(x, BasevLLMParameter)
+            permute_param_layout_(x, input_dim=0, output_dim=1)
+            x.data = x.data.contiguous()
+            return x.to(dtype=c.act_type)
+        # Repack weights and scales for Exllama
+        self._transform_param(layer, self.w_q_name, transform_w_q)
+        self._transform_param(layer, self.w_s_name, transform_w_s)
+    def apply_weights(self,
+                      layer: torch.nn.Module,
+                      x: torch.Tensor,
+                      bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+        """Multiply `x` by the quantized weights with `ops.gptq_gemm`.
+        `x` is flattened to 2-D over its last dimension and the result is
+        reshaped so that only the last dimension changes (to
+        `partition_weight_shape[1]`). `bias`, when given, is added to the
+        output in place.
+        """
+        c = self.config
+        x_2d = x.reshape(-1, x.shape[-1])
+        out_shape = x.shape[:-1] + (c.partition_weight_shape[1], )
+        w_q, w_s, w_zp, w_g_idx = self._get_weight_params(layer)
+        assert w_zp is not None, "Zero points are required by Exllama"
+        assert w_g_idx is not None, "Group index is required by Exllama"
+        output = ops.gptq_gemm(x_2d, w_q, w_zp, w_s, w_g_idx, True,
+                               c.weight_type.size_bits)
+        if bias is not None:
+            output.add_(bias)
+        return output.reshape(out_shape)

vllm/model_executor/layers/quantization/kernels/mixed_precision/machete.py ADDED Viewed

@@ -0,0 +1,144 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from functools import partial
+from typing import Optional
+import torch
+from vllm import _custom_ops as ops
+from vllm.model_executor.layers.quantization.utils.machete_utils import (
+    check_machete_supports_shape, query_machete_supported_group_sizes,
+    query_machete_supported_quant_types)
+from vllm.model_executor.layers.quantization.utils.quant_utils import (
+    pack_quantized_values_into_int32, unpack_quantized_values_into_int32)
+from vllm.model_executor.parameter import (BasevLLMParameter,
+                                           permute_param_layout_)
+from vllm.platforms import current_platform
+from .MPLinearKernel import MPLinearKernel, MPLinearLayerConfig
+class MacheteLinearKernel(MPLinearKernel):
+    """Mixed-precision linear kernel backed by the `machete_prepack_B` /
+    `machete_mm` custom ops."""
+    @classmethod
+    def get_min_capability(cls) -> int:
+        """Return the minimum device capability for this kernel (90)."""
+        return 90
+    @classmethod
+    def can_implement(cls,
+                      c: MPLinearLayerConfig) -> tuple[bool, Optional[str]]:
+        """Report whether config `c` is usable, as `(ok, reason_if_not)`.
+        The final shape check is delegated to
+        `check_machete_supports_shape`, whose result is returned as-is.
+        """
+        # Machete uses CUTLASS, so it can only be compatible with Nvidia
+        if not current_platform.is_cuda():
+            return False, "Machete only supported on CUDA"
+        if not current_platform.is_device_capability(90):
+            return False, "Machete requires compute capability of 90 (Hopper)"
+        size_k, size_n = c.partition_weight_shape[0], c.partition_weight_shape[1]
+        if c.has_g_idx and size_k != c.full_weight_shape[0]:
+            return False, ("Act reordering currently not supported by Machete, "
+                           "when the input features are partitioned across "
+                           "devices")
+        supported_types = query_machete_supported_quant_types(c.zero_points)
+        if c.weight_type not in supported_types:
+            return False, (f"Quant type ({c.weight_type}) not supported by "
+                           "Machete, supported types are: "
+                           f"{supported_types}")
+        supported_groups = query_machete_supported_group_sizes(c.act_type)
+        if c.group_size not in supported_groups:
+            return False, (f"Group size ({c.group_size}) not supported by "
+                           "Machete, supported group sizes are: "
+                           f"{supported_groups}")
+        return check_machete_supports_shape(size_k, size_n)
+    # note assumes that
+    #  `weight_packed` is: {input_dim = 0, output_dim = 1, packed_dim = 0}
+    #  `weight_scale`  is: {input_dim = 0, output_dim = 1}
+    #  `weight_zp`     is: {input_dim = 0, output_dim = 1, packed_dim = 1}
+    def process_weights_after_loading(self, layer: torch.nn.Module):
+        """Prepack the weights of `layer` for `ops.machete_mm`.
+        With g_idx present, a permutation is derived from it, applied to
+        the unpacked weight rows, and kept as `self.act_perm` for use on
+        activations. Zero points, when configured, are replaced by
+        `-scale * zero_point`.
+        """
+        c = self.config
+        if c.has_g_idx:
+            assert self.w_gidx_name is not None
+            g_idx = getattr(layer, self.w_gidx_name)
+            perm = torch.argsort(g_idx).to(torch.int)
+            # use `ops.permute_cols` if possible
+            can_use_permute_cols = (
+                c.act_type in [torch.float16, torch.bfloat16]
+                and c.partition_weight_shape[0] % 8 == 0)
+            if can_use_permute_cols:
+                self.act_perm = partial(ops.permute_cols, perm=perm)
+            else:
+                self.act_perm = lambda x: x[:, perm]
+        def transform_w_q(x):
+            assert isinstance(x, BasevLLMParameter)
+            permute_param_layout_(x, input_dim=0, output_dim=1, packed_dim=0)
+            if c.has_g_idx:
+                # Reorder the rows with the same permutation that is
+                # applied to the activation columns.
+                unpacked = unpack_quantized_values_into_int32(
+                    x.data, c.weight_type, packed_dim=0)
+                x.data = pack_quantized_values_into_int32(
+                    unpacked[perm, :], c.weight_type, packed_dim=0)
+            # .t().contiguous().t() keeps the shape but changes the
+            # underlying memory order before prepacking.
+            reordered = x.data.t().contiguous().t()
+            x.data = ops.machete_prepack_B(reordered,
+                                           a_type=c.act_type,
+                                           b_type=c.weight_type,
+                                           group_scales_type=c.act_type)
+            return x
+        def transform_w_s(x):
+            assert isinstance(x, BasevLLMParameter)
+            permute_param_layout_(x, input_dim=0, output_dim=1)
+            x.data = x.data.contiguous()
+            return x
+        def transform_w_zp(x):
+            assert isinstance(x, BasevLLMParameter)
+            permute_param_layout_(x, input_dim=0, output_dim=1, packed_dim=1)
+            unpacked = unpack_quantized_values_into_int32(x.data,
+                                                          c.weight_type,
+                                                          packed_dim=1)
+            scales = getattr(layer, self.w_s_name).data
+            zero_points = unpacked.to(scales.dtype)
+            # pre-apply scales to zero-points
+            x.data = (-1.0 * scales * zero_points).contiguous()
+            return x
+        # Repack weights and scales for Machete
+        self._transform_param(layer, self.w_q_name, transform_w_q)
+        self._transform_param(layer, self.w_s_name, transform_w_s)
+        if c.zero_points:
+            self._transform_param(layer, self.w_zp_name, transform_w_zp)
+    def apply_weights(self,
+                      layer: torch.nn.Module,
+                      x: torch.Tensor,
+                      bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+        """Multiply `x` by the prepacked weights with `ops.machete_mm`.
+        `x` is flattened to 2-D over its last dimension (and permuted when
+        g_idx is in use); the result is reshaped so that only the last
+        dimension changes. `bias`, when given, is added in place.
+        """
+        c = self.config
+        w_q, w_s, w_zp, _ = self._get_weight_params(layer)
+        out_shape = x.shape[:-1] + (c.partition_weight_shape[1], )
+        x_2d = x.reshape(-1, x.shape[-1])
+        if c.has_g_idx:
+            x_2d = self.act_perm(x_2d)
+        if not c.zero_points:
+            # Without configured zero points none are passed to the op.
+            w_zp = None
+        else:
+            assert w_zp is not None
+        output = ops.machete_mm(a=x_2d,
+                                b_q=w_q,
+                                b_type=c.weight_type,
+                                b_group_zeros=w_zp,
+                                b_group_scales=w_s,
+                                b_group_size=c.group_size)
+        if bias is not None:
+            output.add_(bias)  # In-place add
+        return output.reshape(out_shape)

vllm/model_executor/layers/quantization/kernels/mixed_precision/marlin.py ADDED Viewed

@@ -0,0 +1,139 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from typing import Optional
+import torch
+from vllm import _custom_ops as ops
+from vllm.model_executor.layers.quantization.utils.marlin_utils import (
+    MARLIN_SUPPORTED_GROUP_SIZES, apply_gptq_marlin_linear,
+    check_marlin_supports_shape, marlin_is_k_full, marlin_make_empty_g_idx,
+    marlin_make_workspace_new, marlin_permute_bias, marlin_permute_scales,
+    marlin_sort_g_idx, marlin_zero_points, query_marlin_supported_quant_types,
+    unpack_cols)
+from vllm.model_executor.parameter import (BasevLLMParameter,
+                                           permute_param_layout_)
+from vllm.platforms import current_platform
+from .MPLinearKernel import MPLinearKernel, MPLinearLayerConfig
+class MarlinLinearKernel(MPLinearKernel):
+    """Mixed-precision linear kernel backed by the Marlin helpers
+    (`gptq_marlin_repack`, `apply_gptq_marlin_linear`)."""
+    @classmethod
+    def get_min_capability(cls) -> int:
+        """Return the minimum device capability for this kernel (80)."""
+        return 80
+    @classmethod
+    def can_implement(cls,
+                      c: MPLinearLayerConfig) -> tuple[bool, Optional[str]]:
+        """Report whether config `c` is usable, as `(ok, reason_if_not)`.
+        The final shape check is delegated to
+        `check_marlin_supports_shape`, whose result is returned as-is.
+        """
+        # Marlin uses inline PTX, so it can only be compatible with Nvidia
+        if not current_platform.is_cuda():
+            return False, "Marlin only supported on CUDA"
+        quant_types = query_marlin_supported_quant_types(c.zero_points)
+        if c.weight_type not in quant_types:
+            return False, (f"Quant type ({c.weight_type}) not supported by"
+                           f"  Marlin, supported types are: {quant_types}")
+        if c.group_size not in MARLIN_SUPPORTED_GROUP_SIZES:
+            return False, (f"Group size ({c.group_size}) not supported by "
+                           "Marlin, supported group sizes are: "
+                           f"{MARLIN_SUPPORTED_GROUP_SIZES}")
+        partition_in, partition_out = (c.partition_weight_shape[0],
+                                       c.partition_weight_shape[1])
+        return check_marlin_supports_shape(
+            partition_out,  # out_features
+            partition_in,  # in_features
+            c.full_weight_shape[0],  # in_features
+            c.group_size)
+    # note assumes that
+    #  `weight_packed` is: {input_dim = 0, output_dim = 1, packed_dim = 0}
+    #  `weight_scale` is: {input_dim = 0, output_dim = 1}
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        """Repack the parameters of `layer` into the Marlin layout.
+        Also allocates the workspace, records `is_k_full`, and installs
+        empty placeholder tensors for g_idx / zero points when the config
+        does not provide them.
+        """
+        c = self.config
+        size_k, size_n = c.partition_weight_shape
+        num_bits = c.weight_type.size_bits
+        device = getattr(layer, self.w_q_name).device
+        # The layer is row-parallel when its partition covers only part of
+        # the full input dimension.
+        row_parallel = size_k != c.full_weight_shape[0]
+        self.is_k_full = marlin_is_k_full(c.has_g_idx, row_parallel)
+        # Allocate marlin workspace.
+        self.workspace = marlin_make_workspace_new(device)
+        # Default names since marlin requires empty parameters for these,
+        # TODO: remove this requirement from marlin (allow optional tensors)
+        if self.w_gidx_name is None:
+            self.w_gidx_name = "g_idx"
+        if self.w_zp_name is None:
+            self.w_zp_name = "w_zp"
+        # g_idx must be settled first: the weight repack below reads
+        # `layer.g_idx_sort_indices`.
+        if c.has_g_idx:
+            g_idx, g_idx_sort_indices = marlin_sort_g_idx(
+                getattr(layer, self.w_gidx_name))
+            self._transform_param(layer, self.w_gidx_name, lambda _: g_idx)
+            layer.g_idx_sort_indices = g_idx_sort_indices
+        else:
+            setattr(layer, self.w_gidx_name, marlin_make_empty_g_idx(device))
+            layer.g_idx_sort_indices = marlin_make_empty_g_idx(device)
+        if c.zero_points:
+            # A group size of -1 is treated as one group over all of K.
+            if c.group_size != -1:
+                grouped_k = size_k // c.group_size
+            else:
+                grouped_k = 1
+            def transform_w_zp(x):
+                unpacked = unpack_cols(x.t(), num_bits, grouped_k, size_n)
+                return marlin_zero_points(unpacked,
+                                          size_k=grouped_k,
+                                          size_n=size_n,
+                                          num_bits=num_bits)
+            self._transform_param(layer, self.w_zp_name, transform_w_zp)
+        else:
+            setattr(layer, self.w_zp_name, marlin_make_empty_g_idx(device))
+        def transform_w_q(x):
+            assert isinstance(x, BasevLLMParameter)
+            permute_param_layout_(x, input_dim=0, output_dim=1, packed_dim=0)
+            x.data = ops.gptq_marlin_repack(x.data.contiguous(),
+                                            perm=layer.g_idx_sort_indices,
+                                            size_k=size_k,
+                                            size_n=size_n,
+                                            num_bits=num_bits)
+            return x
+        def transform_w_s(x):
+            assert isinstance(x, BasevLLMParameter)
+            permute_param_layout_(x, input_dim=0, output_dim=1)
+            x.data = marlin_permute_scales(x.data.contiguous(),
+                                           size_k=size_k,
+                                           size_n=size_n,
+                                           group_size=c.group_size)
+            return x
+        self._transform_param(layer, self.w_q_name, transform_w_q)
+        self._transform_param(layer, self.w_s_name, transform_w_s)
+        if getattr(layer, "bias", None) is not None:
+            layer.bias.data = marlin_permute_bias(layer.bias)
+    def apply_weights(self,
+                      layer: torch.nn.Module,
+                      x: torch.Tensor,
+                      bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+        """Forward `x` through `apply_gptq_marlin_linear` using the
+        parameters prepared by `process_weights_after_loading`."""
+        c = self.config
+        size_k, size_n = c.partition_weight_shape[0], c.partition_weight_shape[1]
+        w_q, w_s, w_zp, w_gidx = self._get_weight_params(layer)
+        # `process_weights_after_loading` will ensure w_zp and w_gidx are not
+        #  None for marlin
+        return apply_gptq_marlin_linear(
+            input=x,
+            weight=w_q,
+            weight_scale=w_s,
+            weight_zp=w_zp,  # type: ignore
+            g_idx=w_gidx,  # type: ignore
+            g_idx_sort_indices=layer.g_idx_sort_indices,
+            workspace=self.workspace,
+            wtype=c.weight_type,
+            input_size_per_partition=size_k,
+            output_size_per_partition=size_n,
+            is_k_full=self.is_k_full,
+            bias=bias)