PyPI - vllm-cpu-avx512vnni - Versions diffs - 0.13.0__cp313-cp313-manylinux_2_28_x86_64.whl - Mend

vllm-cpu-avx512vnni 0.13.0__cp313-cp313-manylinux_2_28_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of vllm-cpu-avx512vnni might be problematic. Click here for more details.

Files changed (1641) hide show

vllm/model_executor/layers/quantization/kernels/mixed_precision/allspark.py ADDED Viewed

@@ -0,0 +1,115 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import torch
+from vllm import _custom_ops as ops
+from vllm.model_executor.layers.quantization.utils import replace_parameter
+from vllm.model_executor.layers.quantization.utils.allspark_utils import (
+    ALLSPARK_AMPERE_M_CUBLAS_THRESHOLD,
+    check_allspark_supported_dtype_shape,
+)
+from vllm.model_executor.parameter import BasevLLMParameter, permute_param_layout_
+from .MPLinearKernel import MPLinearKernel, MPLinearLayerConfig
+class AllSparkLinearKernel(MPLinearKernel):
+    """Mixed-precision linear kernel backed by the AllSpark W8A16 GEMM op.
+
+    Weights and scales are repacked once in
+    ``process_weights_after_loading``; ``apply_weights`` then feeds them to
+    ``ops.allspark_w8a16_gemm``.
+    """
+    @classmethod
+    def get_min_capability(cls) -> int:
+        """Return the minimum device capability required by this kernel."""
+        return 80
+    @classmethod
+    def can_implement(cls, c: MPLinearLayerConfig) -> tuple[bool, str | None]:
+        """Tell whether this kernel can serve the layer described by ``c``.
+
+        Returns ``(False, reason)`` when act reordering or zero points are
+        requested; otherwise defers to
+        ``check_allspark_supported_dtype_shape`` and returns its result.
+        """
+        if c.has_g_idx:
+            return False, "Act reordering currently not supported by AllSpark"
+        if c.zero_points:
+            return False, "Zero points currently not supported by AllSpark"
+        in_features = c.partition_weight_shape[0]
+        out_features = c.partition_weight_shape[1]
+        return check_allspark_supported_dtype_shape(
+            in_features, out_features, c.group_size, c.weight_type, c.act_type
+        )
+    # note assumes that
+    #  `weight_packed` is: {input_dim = 0, output_dim = 1, packed_dim = 0}
+    #  `weight_scale` is: {input_dim = 0, output_dim = 1}
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        """Repack the layer's quantized weight and scale for the GEMM op.
+
+        Caches the device's SM count and SM version on ``self.gemm_args``,
+        then replaces the weight and scale parameters on ``layer`` with the
+        tensors returned by ``ops.allspark_repack_weight``.
+        """
+        device = getattr(layer, self.w_q_name).device
+        cfg = self.config
+        # Device facts the GEMM call needs; looked up once and cached.
+        props = torch.cuda.get_device_properties(device.index)
+        self.gemm_args = {
+            "sm_count": props.multi_processor_count,
+            "sm_version": props.major * 10 + props.minor,
+        }
+        # Bring both parameters into the layout documented above (in place).
+        packed_weight = getattr(layer, self.w_q_name)
+        scale = getattr(layer, self.w_s_name)
+        assert isinstance(packed_weight, BasevLLMParameter)
+        permute_param_layout_(packed_weight, input_dim=0, output_dim=1, packed_dim=0)
+        assert isinstance(scale, BasevLLMParameter)
+        permute_param_layout_(scale, input_dim=0, output_dim=1)
+        # unpack weight from K / 4 x N int32 to K x N uint8: reinterpret the
+        # bytes on the transposed tensor, then transpose back.
+        weight_u8 = packed_weight.data.t().contiguous().view(dtype=torch.uint8)
+        weight_u8 = weight_u8.t().contiguous()
+        # reorder K x N weight as N32K16 format for Ampere W8A16
+        repacked_weight, repacked_scale, _ = ops.allspark_repack_weight(
+            weight_u8, scale.data, None, cfg.zero_points
+        )
+        replace_parameter(layer, self.w_q_name, repacked_weight)
+        replace_parameter(layer, self.w_s_name, repacked_scale)
+    def apply_weights(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: torch.Tensor | None = None,
+    ) -> torch.Tensor:
+        """Run the quantized linear layer on ``x``.
+
+        ``x`` is flattened to 2-D over its last dimension for the GEMM and
+        the result is reshaped so its last dimension is the partition's
+        output size. ``bias``, when given, is added in place to the GEMM
+        output.
+        """
+        cfg = self.config
+        device_args = self.gemm_args
+        w_q, w_s, _, _ = self._get_weight_params(layer)
+        x_2d = x.reshape(-1, x.shape[-1])
+        n_out = cfg.partition_weight_shape[1]
+        out_shape = x.shape[:-1] + (n_out,)
+        output = ops.allspark_w8a16_gemm(
+            a=x_2d,
+            b_qweight=w_q,
+            b_scales=w_s,
+            b_qzeros=None,
+            n=n_out,
+            group_size=cfg.group_size,
+            sm_count=device_args["sm_count"],
+            sm_version=device_args["sm_version"],
+            CUBLAS_M_THRESHOLD=ALLSPARK_AMPERE_M_CUBLAS_THRESHOLD,
+            has_zp=cfg.zero_points,
+            n32k16_reorder=True,
+        )
+        if bias is not None:
+            output.add_(bias)  # In-place add
+        return output.reshape(out_shape)

vllm/model_executor/layers/quantization/kernels/mixed_precision/bitblas.py ADDED Viewed

@@ -0,0 +1,323 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import torch
+from packaging import version
+from vllm.logger import init_logger
+from vllm.model_executor.layers.quantization import QuantizationConfig
+from vllm.model_executor.layers.quantization.utils import replace_parameter
+from vllm.model_executor.layers.quantization.utils.bitblas_utils import (
+    BITBLAS_OPTIMIZE_FEATURES,
+    BITBLAS_SUPPORTED_GROUP_SIZES,
+    MINIMUM_BITBLAS_VERSION,
+    bitblas_make_empty_g_idx,
+    bitblas_sort_g_idx,
+    check_bitblas_supports_shape,
+    query_bitblas_supported_quant_types,
+    unpack_gptq_qweight,
+    unpack_gptq_qzeros,
+)
+from .MPLinearKernel import MPLinearKernel, MPLinearLayerConfig
+logger = init_logger(__name__)  # module-level logger, named after this module
+class BitBLASLinearKernel(MPLinearKernel):
+    """Mixed-precision linear kernel that runs GPTQ weights through BitBLAS.
+
+    Typical flow, as far as this class shows: ``configure_bitblas_matmul``
+    builds (or fetches from cache) the BitBLAS operator,
+    ``process_weights_after_loading`` repacks the GPTQ tensors for it, and
+    ``apply_gptq_bitblas_linear`` performs the matmul. ``apply_weights`` is
+    intentionally not implemented.
+    """
+    # Values passed as ``M`` to ``MatmulConfig``.
+    OPT_FEATURES: list[int] = BITBLAS_OPTIMIZE_FEATURES
+    # Whether newly created operators are tuned before being cached.
+    ENABLE_TUNING: bool = True
+    # Layout string forwarded to ``MatmulConfig``.
+    MATMUL_LAYOUT: str = "nt"
+    # Maps torch dtypes to the dtype names BitBLAS expects.
+    BITBLAS_DTYPES: dict[torch.dtype, str] = {
+        torch.float32: "float32",
+        torch.float16: "float16",
+        torch.bfloat16: "bfloat16",
+        torch.half: "float16",
+        torch.int8: "int8",
+    }
+    # The BitBLAS operator; populated by ``_configure_bitblas_matmul``.
+    bitblas_matmul: object = None
+    def __init__(
+        self,
+        c: MPLinearLayerConfig,
+        w_q_param_name: str,
+        w_s_param_name: str,
+        w_zp_param_name: str | None = None,
+        w_gidx_param_name: str | None = None,
+        bitblas_quant_config: QuantizationConfig | None = None,
+    ):
+        """Store the quantization config and initialise the base kernel.
+
+        ``bitblas_quant_config`` is read by most other methods (weight bits,
+        storage dtype, group size, ...); they will fail if it is ``None``.
+        """
+        self.quant_config = bitblas_quant_config
+        super().__init__(
+            c, w_q_param_name, w_s_param_name, w_zp_param_name, w_gidx_param_name
+        )
+    def repack_bitblas_from_gptq(
+        self,
+        b_q_weight: torch.Tensor,
+        scales: torch.Tensor,
+        qzeros: torch.Tensor | None = None,
+    ):
+        """Convert GPTQ weight, scales and zeros to the BitBLAS layout.
+
+        Requires ``self.bitblas_matmul`` to be configured already.
+
+        Returns:
+            ``(qweight, scales, zeros)``; ``zeros`` is ``None`` when
+            ``qzeros`` is ``None``.
+
+        Raises:
+            ValueError: if the operator's ``zeros_mode`` is not one of
+                ``"original"``, ``"rescale"`` or ``"quantized"``.
+        """
+        from bitblas.quantization.utils import general_compress
+        assert self.bitblas_matmul is not None, "bitblas_matmul is None"
+        quant_config = self.quant_config
+        # qweight in gptq old quant linear stored with
+        # (outfeatures, infeatures), should be transposed.
+        qweight = b_q_weight.T.contiguous().view(quant_config.torch_storage_dtype)  # type: ignore[union-attr]
+        intweight = unpack_gptq_qweight(qweight, quant_config.weight_bits).contiguous()  # type: ignore[union-attr]
+        if self.bitblas_matmul.weight_transform is not None:  # type: ignore[attr-defined]
+            # The transform is applied on a CPU copy and moved back to CUDA.
+            qweight = self.bitblas_matmul.weight_transform(  # type: ignore[attr-defined]
+                intweight.cpu()
+            ).cuda()
+        # scales in gptq old quant linear stored with
+        # (infeatures // group_size, outfeatures), should be transposed.
+        scales = scales.T.contiguous()
+        if qzeros is None:
+            return qweight, scales, None
+        # qzeros should be de-quantized to int zeros.
+        weight_bits = quant_config.weight_bits  # type: ignore[union-attr]
+        intzeros = unpack_gptq_qzeros(qzeros, weight_bits).T.contiguous()
+        zeros: torch.Tensor | None = None
+        zeros_mode = self.bitblas_matmul.config.zeros_mode  # type: ignore[attr-defined]
+        if zeros_mode == "original":
+            zeros = intzeros.to(torch.float16).contiguous()
+        elif zeros_mode == "rescale":
+            # No pre-allocated buffer exists in this method, so build the
+            # rescaled zeros as a fresh float16 tensor instead of writing
+            # into one.
+            zeros = (
+                (intzeros.to(torch.float16) * scales).to(torch.float16).contiguous()
+            )
+        elif zeros_mode == "quantized":
+            zeros = (
+                torch.Tensor(
+                    general_compress(
+                        intzeros.T.contiguous().cpu().numpy(),
+                        weight_bits,
+                    )
+                )
+                .to(qweight.device)
+                .to(
+                    quant_config.torch_storage_dtype  # type: ignore[union-attr]
+                )
+                .contiguous()
+            )
+        else:
+            raise ValueError(f"Unsupported zeros type: {zeros_mode}")
+        return qweight, scales, zeros
+    @classmethod
+    def get_min_capability(cls) -> int:
+        """Return the minimum device capability required by this kernel."""
+        return 70
+    @classmethod
+    def can_implement(cls, c: MPLinearLayerConfig) -> tuple[bool, str | None]:
+        """Tell whether this kernel can serve the layer described by ``c``.
+
+        Checks, in order: that ``bitblas`` imports and meets
+        ``MINIMUM_BITBLAS_VERSION``, that the weight type and group size are
+        supported, and finally the shape check from
+        ``check_bitblas_supports_shape`` (whose result is returned).
+        """
+        try:
+            import bitblas
+            if version.parse(bitblas.__version__) < version.parse(
+                MINIMUM_BITBLAS_VERSION
+            ):
+                raise ImportError(
+                    "bitblas version is wrong. Please "
+                    f"install bitblas>={MINIMUM_BITBLAS_VERSION}"
+                )
+        except ImportError:
+            # A missing package and a too-old package are reported alike.
+            return (
+                False,
+                "bitblas is not installed. Please install bitblas "
+                "by running `pip install bitblas>="
+                f"{MINIMUM_BITBLAS_VERSION}`",
+            )
+        quant_types = query_bitblas_supported_quant_types(c.zero_points)
+        if c.weight_type not in quant_types:
+            return False, (
+                f"Quant type ({c.weight_type}) not supported by"
+                f"  BitBLAS, supported types are: {quant_types}"
+            )
+        if c.group_size not in BITBLAS_SUPPORTED_GROUP_SIZES:
+            return False, (
+                f"Group size ({c.group_size}) not supported by "
+                "BitBLAS, supported group sizes are: "
+                f"{BITBLAS_SUPPORTED_GROUP_SIZES}"
+            )
+        return check_bitblas_supports_shape(
+            c.partition_weight_shape[1],  # out_features
+            c.partition_weight_shape[0],  # in_features
+            c.full_weight_shape[0],  # in_features
+            c.group_size,
+        )
+    # note assumes that
+    #  `weight_packed` is: {input_dim = 0, output_dim = 1, packed_dim = 0}
+    #  `weight_scale` is: {input_dim = 0, output_dim = 1}
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        """Prepare ``layer``'s tensors for the BitBLAS operator.
+
+        Sorts (or creates an empty) ``g_idx``, installs an empty zero-point
+        tensor, repacks weight/scales/zeros via ``repack_bitblas_from_gptq``
+        and writes the results back onto ``layer``.
+
+        Raises:
+            NotImplementedError: if the config requests zero points.
+        """
+        device = getattr(layer, self.w_q_name).device
+        c = self.config
+        quant_config = self.quant_config
+        # Default names since bitblas requires empty parameters for these,
+        # TODO: remove this requirement from bitblas (allow optional tensors)
+        if getattr(self, "w_gidx_name", None) is None:
+            self.w_gidx_name: str = "g_idx"
+        if getattr(self, "w_zp_name", None) is None:
+            self.w_zp_name: str = "qzeros"
+        if c.has_g_idx:
+            g_idx, g_idx_sort_indices = bitblas_sort_g_idx(
+                getattr(layer, self.w_gidx_name)
+            )
+            self._transform_param(layer, self.w_gidx_name, lambda _: g_idx)
+            layer.g_idx_sort_indices = g_idx_sort_indices
+        else:
+            setattr(layer, self.w_gidx_name, bitblas_make_empty_g_idx(device))
+            layer.g_idx_sort_indices = bitblas_make_empty_g_idx(device)
+        if c.zero_points:
+            raise NotImplementedError("Zero points not supported by BitBLAS")
+        else:
+            setattr(layer, self.w_zp_name, bitblas_make_empty_g_idx(device))
+        # Repack weights
+        # NOTE(review): reads the fixed attribute names qweight / scales /
+        # qzeros rather than self.w_q_name etc. -- confirm they always match.
+        bitblas_qweight, bitblas_scales, bitblas_qzeros = self.repack_bitblas_from_gptq(
+            layer.qweight,
+            layer.scales,
+            None if quant_config.is_sym else layer.qzeros,  # type: ignore[union-attr]
+        )
+        replace_parameter(layer, self.w_q_name, bitblas_qweight)
+        replace_parameter(layer, self.w_s_name, bitblas_scales)
+        if bitblas_qzeros is not None:
+            replace_parameter(layer, self.w_zp_name, bitblas_qzeros)
+    def configure_bitblas_matmul(
+        self,
+        infeatures: int,
+        outfeatures: int,
+        params_dtype: torch.dtype,
+        bias: bool,
+    ) -> None:
+        """Configure ``self.bitblas_matmul`` using the class-level defaults.
+
+        Forwards to ``_configure_bitblas_matmul`` with ``ENABLE_TUNING``,
+        ``MATMUL_LAYOUT`` and the quant config's ``weight_bits``.
+        """
+        enable_tuning = self.ENABLE_TUNING
+        layout = self.MATMUL_LAYOUT
+        bits = self.quant_config.weight_bits  # type: ignore[union-attr]
+        self._configure_bitblas_matmul(
+            infeatures,
+            outfeatures,
+            params_dtype,
+            enable_tuning,
+            bias,
+            layout,
+            bits,
+        )
+    def _configure_bitblas_matmul(
+        self,
+        infeatures,
+        outfeatures,
+        params_dtype,
+        enable_tuning,
+        bias,
+        layout,
+        bits,
+    ):
+        """Build a ``MatmulConfig`` and store the matching operator.
+
+        Raises:
+            KeyError: if ``params_dtype`` is not in ``BITBLAS_DTYPES``.
+            ValueError: if the quant method is anything other than "gptq".
+        """
+        from bitblas import MatmulConfig
+        bitblas_dtype = self.BITBLAS_DTYPES[params_dtype]
+        quant_config = self.quant_config
+        with_scaling = False
+        with_zeros = False
+        group_size = quant_config.group_size  # type: ignore[union-attr]
+        zeros_mode = quant_config.zeros_mode  # type: ignore[union-attr]
+        if quant_config.quant_method == "gptq":  # type: ignore[union-attr]
+            with_scaling = True
+            with_zeros = True
+            W_dtype = f"uint{bits}"
+            # Symmetric quantization: signed weights and no zero points.
+            if quant_config.is_sym:  # type: ignore[union-attr]
+                with_zeros = False
+                W_dtype = f"int{bits}"
+        else:
+            raise ValueError(
+                f"Unsupported quant_method {quant_config.quant_method}"  # type: ignore[union-attr]
+            )  # type: ignore[union-attr]
+        matmul_config = MatmulConfig(
+            M=self.OPT_FEATURES,
+            N=outfeatures,
+            K=infeatures,
+            A_dtype=bitblas_dtype,
+            W_dtype=W_dtype,
+            out_dtype=bitblas_dtype,
+            accum_dtype="int32" if bitblas_dtype == "int8" else bitblas_dtype,
+            storage_dtype=quant_config.storage_dtype,  # type: ignore[union-attr]
+            with_scaling=with_scaling,
+            with_zeros=with_zeros,
+            group_size=group_size,
+            with_bias=bias,
+            layout=layout,
+            zeros_mode=zeros_mode,
+        )
+        self.bitblas_matmul = self._get_or_create_bitblas_operator(
+            matmul_config, enable_tuning
+        )
+    def _get_or_create_bitblas_operator(self, config, enable_tuning):
+        """Return the operator for ``config``, creating it on a cache miss.
+
+        The global operator cache is loaded from the on-disk database the
+        first time it is found empty. A newly created operator is tuned,
+        added to the cache and saved to the database only when
+        ``enable_tuning`` is true; otherwise it is returned uncached.
+        """
+        from bitblas import Matmul, auto_detect_nvidia_target
+        from bitblas.cache import get_database_path, global_operator_cache
+        BITBLAS_DATABASE_PATH = get_database_path()
+        BITBLAS_TARGET = auto_detect_nvidia_target()
+        if global_operator_cache.size() == 0:
+            global_operator_cache.load_from_database(
+                BITBLAS_DATABASE_PATH, BITBLAS_TARGET
+            )
+        bitblas_matmul = global_operator_cache.get(config)
+        if bitblas_matmul is None:
+            bitblas_matmul = Matmul(config, target=BITBLAS_TARGET, enable_tuning=False)
+            if enable_tuning:
+                bitblas_matmul.hardware_aware_finetune(topk=20)
+                global_operator_cache.add(config, bitblas_matmul)
+                global_operator_cache.save_into_database(
+                    BITBLAS_DATABASE_PATH, BITBLAS_TARGET
+                )
+                TUNING_MESSAGE = (
+                    f"BitBLAS Operator {config} tuned and saved to database."
+                )
+                logger.info(TUNING_MESSAGE)
+            else:
+                _message = f"BitBLAS Operator {config} created without tuning. "
+                logger.info(_message)
+        else:
+            _message = f"BitBLAS Operator {config} retrieved from cache."
+            logger.info(_message)
+        return bitblas_matmul
+    def apply_gptq_bitblas_linear(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+    ) -> torch.Tensor:
+        """Run the BitBLAS operator on ``x`` with ``layer``'s GPTQ tensors.
+
+        ``layer.qzeros`` is passed only when the operator was configured
+        with zeros. The result is viewed so that its last dimension is the
+        partition's output size.
+        """
+        output_size_per_partition = self.config.partition_weight_shape[1]
+        out_shape = x.shape[:-1] + (output_size_per_partition,)
+        args = [x, layer.qweight, layer.scales]
+        if self.bitblas_matmul.config.with_zeros:  # type: ignore[attr-defined]
+            args.append(layer.qzeros)
+        output = self.bitblas_matmul(*args)  # type: ignore[operator]
+        return output.view(out_shape)
+    def apply_weights(self, layer, x, bias=None):
+        """Always raise; callers must use ``apply_gptq_bitblas_linear``.
+
+        Raises:
+            NotImplementedError: unconditionally.
+        """
+        NOT_IMPLEMENT_MESSAGE = (
+            f"{self.__class__.__name__}.apply_weights is not implemented. "
+            "Please use BitBLASLinearKernel.apply_gptq_bitblas_linear instead"
+        )
+        raise NotImplementedError(NOT_IMPLEMENT_MESSAGE)

vllm/model_executor/layers/quantization/kernels/mixed_precision/conch.py ADDED Viewed

@@ -0,0 +1,98 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from importlib.util import find_spec
+from typing import Final
+import torch
+from vllm.model_executor.parameter import BasevLLMParameter, permute_param_layout_
+from vllm.scalar_type import scalar_types
+from .MPLinearKernel import MPLinearKernel, MPLinearLayerConfig
+_CONCH_SUPPORTED_WEIGHT_TYPES: Final = [  # weight scalar types can_implement accepts
+    scalar_types.uint4,
+    scalar_types.uint8,
+    scalar_types.uint4b8,
+    scalar_types.uint8b128,
+]
+_CONCH_SUPPORTED_GROUP_SIZES: Final = [-1, 128]  # group sizes can_implement accepts; -1 presumably means no grouping -- TODO confirm
+class ConchLinearKernel(MPLinearKernel):
+    """Mixed-precision linear kernel that dispatches to the ``conch`` package.
+
+    ``conch`` is only imported inside ``apply_weights``; ``can_implement``
+    merely probes for it, so this module loads without the package.
+    """
+    @classmethod
+    def get_min_capability(cls) -> int:
+        """Return the minimum device capability required by this kernel."""
+        return 80
+    @classmethod
+    def can_implement(cls, c: MPLinearLayerConfig) -> tuple[bool, str | None]:
+        """Tell whether this kernel can serve the layer described by ``c``.
+
+        Returns ``(True, None)`` when the weight type and group size are in
+        the supported lists and the ``conch`` package can be located;
+        otherwise ``(False, reason)`` for the first failing check.
+        """
+        if c.weight_type not in _CONCH_SUPPORTED_WEIGHT_TYPES:
+            return False, (
+                f"Weight type ({c.weight_type}) not supported by "
+                "ConchLinearKernel, supported types are: "
+                f"{_CONCH_SUPPORTED_WEIGHT_TYPES}"
+            )
+        if c.group_size not in _CONCH_SUPPORTED_GROUP_SIZES:
+            return False, (
+                f"Group size ({c.group_size}) not supported by "
+                "ConchLinearKernel, supported group sizes are: "
+                f"{_CONCH_SUPPORTED_GROUP_SIZES}"
+            )
+        if find_spec("conch") is None:
+            return False, (
+                "conch-triton-kernels is not installed, please "
+                "install it via `pip install conch-triton-kernels` "
+                "and try again!"
+            )
+        return True, None
+    # note assumes that
+    #  `weight_packed` is: {input_dim = 0, output_dim = 1, packed_dim = 0}
+    #  `weight_scale` is: {input_dim = 0, output_dim = 1}
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        """Permute the weight and scale parameters and make them contiguous.
+
+        Both parameters are modified in place through
+        ``self._transform_param``.
+        """
+        def make_transform(**layout):
+            # Build a transform that permutes a parameter to ``layout`` and
+            # then forces its data to be contiguous.
+            def transform(param):
+                assert isinstance(param, BasevLLMParameter)
+                permute_param_layout_(param, **layout)
+                param.data = param.data.contiguous()
+                return param
+            return transform
+        self._transform_param(
+            layer,
+            self.w_q_name,
+            make_transform(input_dim=0, output_dim=1, packed_dim=0),
+        )
+        self._transform_param(
+            layer, self.w_s_name, make_transform(input_dim=0, output_dim=1)
+        )
+    def apply_weights(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: torch.Tensor | None = None,
+    ) -> torch.Tensor:
+        """Run conch's ``mixed_precision_gemm`` on ``x`` and add ``bias``.
+
+        Zero points are forwarded only when the layer has them. ``bias``,
+        when given, is added in place to the GEMM output.
+        """
+        from conch.ops.quantization.gemm import mixed_precision_gemm
+        w_q, w_s, w_zp, _ = self._get_weight_params(layer)
+        weight_type = self.config.weight_type
+        zero_points = None if w_zp is None else w_zp.data
+        output = mixed_precision_gemm(
+            x=x,
+            w_q_packed=w_q.data,
+            w_s=w_s.data,
+            w_zp=zero_points,
+            weight_size_bits=weight_type.size_bits,
+            weight_bias=weight_type.bias,
+            group_size=self.config.group_size,
+        )
+        if bias is not None:
+            output.add_(bias)  # In-place add
+        return output

vllm/model_executor/layers/quantization/kernels/mixed_precision/cutlass.py ADDED Viewed

@@ -0,0 +1,130 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import torch
+from vllm import _custom_ops as ops
+from vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8
+from vllm.model_executor.layers.quantization.utils.quant_utils import (
+    GroupShape,
+    convert_bf16_scales_to_fp8,
+    convert_packed_uint4b8_to_signed_int4_inplace,
+)
+from vllm.model_executor.parameter import BasevLLMParameter, permute_param_layout_
+from vllm.platforms import current_platform
+from vllm.scalar_type import scalar_types
+from .MPLinearKernel import MPLinearKernel, MPLinearLayerConfig
+class CutlassW4A8LinearKernel(MPLinearKernel):
+    """Mixed-precision linear kernel using the CUTLASS W4A8 matmul op.
+
+    Activations are quantized to FP8 per token on every call; weights and
+    group scales are re-encoded once in ``process_weights_after_loading``.
+    """
+    def __init__(self, *args, **kwargs):
+        """Initialise the base kernel and the activation quantizer."""
+        super().__init__(*args, **kwargs)
+        # dynamic per-tok fp8 activation quantization
+        self.quant_fp8 = QuantFP8(static=False, group_shape=GroupShape.PER_TOKEN)
+    @classmethod
+    def get_min_capability(cls) -> int:
+        """Return the minimum device capability required by this kernel."""
+        return 90
+    @classmethod
+    def can_implement(cls, c: MPLinearLayerConfig) -> tuple[bool, str | None]:
+        """Tell whether this kernel can serve the layer described by ``c``.
+
+        Returns ``(True, None)`` only on a CUDA platform with device
+        capability 90, FP8 (e4m3) activations, int4 weights, no act
+        reordering, no zero points, group size 128, partition dimensions
+        divisible by 128 and a bfloat16 output type. Otherwise returns
+        ``(False, reason)`` for the first failing check.
+        """
+        if not current_platform.is_cuda():
+            return False, "CUTLASS only supported on CUDA"
+        if not current_platform.is_device_capability(90):
+            return False, "CUTLASS W4A8 requires compute capability of 90 (Hopper)"
+        if c.act_type != torch.float8_e4m3fn:
+            return False, "CUTLASS W4A8 only supports FP8 (e4m3) activations"
+        if c.has_g_idx:
+            return False, "Act reordering not supported by CUTLASS W4A8"
+        if c.zero_points:
+            return False, "Zero points not supported by CUTLASS W4A8"
+        if c.weight_type != scalar_types.int4:
+            return (
+                False,
+                f"Quant type ({c.weight_type}) not supported by "
+                "CUTLASS W4A8, only supported int4",
+            )
+        if c.group_size != 128:
+            return False, "Only group_size 128 is supported"
+        in_features, out_features = c.partition_weight_shape
+        if in_features % 128 or out_features % 128:
+            return (
+                False,
+                f"K and N must be divisible by 128, got {c.partition_weight_shape}",
+            )
+        if c.out_type != torch.bfloat16:
+            return (
+                False,
+                f"Only bfloat16 output type currently supported, got {c.out_type=}",
+            )
+        return True, None
+    # note assumes that
+    #  `weight_packed` is: {input_dim = 0, output_dim = 1, packed_dim = 0}
+    #  `weight_scale`  is: {input_dim = 0, output_dim = 1}
+    def process_weights_after_loading(self, layer: torch.nn.Module):
+        """Re-encode the layer's weights and scales for the CUTLASS op.
+
+        Splits the loaded scales into FP8 group scales and per-channel
+        scales (the latter registered on ``layer`` as
+        ``weight_chan_scale``), then encodes/reorders the int4 weights and
+        packs the group scales.
+        """
+        def transform_w_q(x):
+            assert isinstance(x, BasevLLMParameter)
+            # Convert before permuting: the helper rewrites x.data in place.
+            convert_packed_uint4b8_to_signed_int4_inplace(x.data)
+            permute_param_layout_(x, input_dim=0, output_dim=1, packed_dim=0)
+            # t().contiguous().t() keeps the shape but changes the memory
+            # order the encoder sees.
+            x.data = ops.cutlass_encode_and_reorder_int4b(x.data.t().contiguous().t())
+            return x
+        def transform_w_s(x):
+            assert isinstance(x, BasevLLMParameter)
+            permute_param_layout_(x, input_dim=0, output_dim=1)
+            x.data = x.data.contiguous().to(torch.float8_e4m3fn)
+            x.data = ops.cutlass_pack_scale_fp8(x.data)
+            return x
+        # The scale split must run before transform_w_s packs the scales.
+        w_s = getattr(layer, self.w_s_name)
+        fp8_scales, chan_scales = convert_bf16_scales_to_fp8(self.quant_fp8, w_s.data)
+        w_s.data = fp8_scales
+        # register per-channel scales
+        layer.register_parameter(
+            "weight_chan_scale", torch.nn.Parameter(chan_scales, requires_grad=False)
+        )
+        # Encode/reorder weights and pack scales
+        self._transform_param(layer, self.w_q_name, transform_w_q)
+        self._transform_param(layer, self.w_s_name, transform_w_s)
+    def apply_weights(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: torch.Tensor | None = None,
+    ) -> torch.Tensor:
+        """Quantize ``x`` to FP8 and run the CUTLASS W4A8 matmul.
+
+        ``x`` is flattened to 2-D over its last dimension and the result is
+        reshaped so its last dimension is the partition's output size.
+        ``bias``, when given, is added in place to the matmul output.
+        """
+        c = self.config
+        w_q, w_s, _, _ = self._get_weight_params(layer)
+        w_ch_s = layer.weight_chan_scale
+        x_2d = x.reshape(-1, x.shape[-1])
+        out_shape = x.shape[:-1] + (c.partition_weight_shape[1],)
+        x_2d, act_scales = self.quant_fp8(x_2d)
+        output = ops.cutlass_w4a8_mm(
+            a=x_2d,
+            b_q=w_q,
+            b_group_scales=w_s,
+            b_group_size=c.group_size,
+            a_token_scales=act_scales,
+            b_channel_scales=w_ch_s,
+        )
+        if bias is not None:
+            output.add_(bias)  # In-place add
+        return output.reshape(out_shape)