PyPI - vllm-cpu - Versions diffs - 0.11.0.post2__cp312-cp312-manylinux_2_17_x86_64.whl - Mend

vllm-cpu 0.11.0.post2__cp312-cp312-manylinux_2_17_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (1398) hide show

vllm/model_executor/layers/quantization/compressed_tensors/transform/linear.py ADDED Viewed

@@ -0,0 +1,238 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from collections.abc import Generator
+from itertools import accumulate
+from typing import Callable, Optional
+import torch
+from compressed_tensors.transform import (TransformArgs, TransformConfig,
+                                          TransformLocation, TransformScheme)
+from compressed_tensors.utils import is_match
+from vllm.model_executor.layers.linear import (WEIGHT_LOADER_V2_SUPPORTED,
+                                               LinearMethodBase,
+                                               QKVCrossParallelLinear)
+from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import (  # noqa: E501
+    CompressedTensorsScheme)
+from vllm.model_executor.layers.quantization.compressed_tensors.transform.module import (  # noqa: E501
+    HadamardTransform)
+from vllm.model_executor.layers.quantization.compressed_tensors.transform.utils import (  # noqa: E501
+    TransformTuple)
+class CompressedTensorsLinearTransformMethod(LinearMethodBase):
+    """
+    Wraps `CompressedTensorsLinearMethod` or `UnquantizedLinearMethod` and adds
+    input and output transforms to either side of the original apply method.
+    `input_tfms` and `output_tfms` map the partition index of a (possibly
+    fused) layer to the transform matched for that partition.
+    """
+    @classmethod
+    def from_schemes(
+        cls,
+        quant_method: LinearMethodBase,
+        quant_scheme: Optional[CompressedTensorsScheme],
+        input_tfms: dict[int, TransformTuple],
+        output_tfms: dict[int, TransformTuple],
+    ) -> "CompressedTensorsLinearTransformMethod":
+        """
+        Select the linear transform method for a layer.
+        Returns a `QutlassNvFP4LinearMethod` when `is_qutlass_fp4_scheme`
+        accepts `quant_scheme` and `input_tfms`, otherwise an instance of
+        `cls`. At least one of `input_tfms` / `output_tfms` must be non-empty.
+        """
+        # imported here because linear_qutlass_nvfp4 imports from this module
+        from vllm.model_executor.layers.quantization.compressed_tensors.transform.schemes.linear_qutlass_nvfp4 import (  # noqa: E501
+            QutlassNvFP4LinearMethod, is_qutlass_fp4_scheme)
+        assert input_tfms or output_tfms
+        if is_qutlass_fp4_scheme(quant_scheme, input_tfms):
+            return QutlassNvFP4LinearMethod(quant_method, input_tfms,
+                                            output_tfms)
+        # hadacore or dense gemm is selected by Transform module
+        return cls(quant_method, input_tfms, output_tfms)
+    def __init__(self, quant_method: LinearMethodBase,
+                 input_tfms: dict[int, TransformTuple],
+                 output_tfms: dict[int, TransformTuple]):
+        """
+        Store the wrapped method and the per-partition transforms. The
+        transform modules themselves are only built in `create_weights`.
+        """
+        self.quant_method = quant_method
+        self.input_tfms = input_tfms
+        self.output_tfms = output_tfms
+        self.input_transform: Optional[HadamardTransform] = None
+        self.output_transform: Optional[HadamardTransform] = None
+    def create_weights(self, layer: torch.nn.Module,
+                       input_size_per_partition: int,
+                       output_partition_sizes: list[int], input_size: int,
+                       output_size: int, params_dtype: torch.dtype,
+                       **extra_weight_attrs):
+        """
+        Create the wrapped method's weights, then register one
+        `HadamardTransform` submodule on `layer` for the input transforms
+        and one for the output transforms (whichever are present), and
+        record the `(start, length)` range of every output partition.
+        Raises:
+            ValueError: if the transform schemes fail
+                `_validate_tfm_schemes`.
+        """
+        # get weight loader for transforms
+        weight_loader: Callable = extra_weight_attrs.get(
+            "weight_loader")  # type: ignore[assignment]
+        # HACK: UnquantizedLinearMethod does not support weight loader v2, but
+        # transforms (specifically SharedWeightParameter) requires
+        # weight loader v2. Until UnquantizedLinearMethod supports v2, we must
+        # hack around this by getting weight loader v1 so ULM can load correctly
+        quant_method_name = self.quant_method.__class__.__name__
+        if quant_method_name not in WEIGHT_LOADER_V2_SUPPORTED:
+            if isinstance(layer, QKVCrossParallelLinear):
+                weight_loader_v1 = layer.weight_loader_v1
+            else:
+                weight_loader_v1 = layer.weight_loader
+            extra_weight_attrs["weight_loader"] = weight_loader_v1
+        self.quant_method.create_weights(
+            layer=layer,
+            input_size_per_partition=input_size_per_partition,
+            output_partition_sizes=output_partition_sizes,
+            input_size=input_size,
+            output_size=output_size,
+            params_dtype=params_dtype,
+            **extra_weight_attrs)
+        # validate schemes
+        self._validate_tfm_schemes(len(output_partition_sizes))
+        # create submodules for weight loading; the transforms always get the
+        # loader passed by the caller, not the v1 fallback chosen above
+        if self.input_tfms:
+            self.input_transform = self._register_transform(
+                self.input_tfms, layer, weight_loader,
+                input_size_per_partition, output_partition_sizes)
+        if self.output_tfms:
+            self.output_transform = self._register_transform(
+                self.output_tfms, layer, weight_loader,
+                input_size_per_partition, output_partition_sizes)
+        # compute partition ranges for slicing activations
+        starts = accumulate(output_partition_sizes[:-1], initial=0)
+        self.partition_ranges = list(zip(starts, output_partition_sizes))
+    def _register_transform(
+            self, tfms: dict[int, TransformTuple], layer: torch.nn.Module,
+            weight_loader: Callable, input_size_per_partition: int,
+            output_partition_sizes: list[int]) -> HadamardTransform:
+        """
+        Build a `HadamardTransform` for `tfms` and register it on `layer`
+        under the name `{scheme_name}_{location}` of the first transform.
+        """
+        first = next(iter(tfms.values()))
+        transform_name = f"{first.scheme_name}_{first.args.location}"
+        transform = HadamardTransform(tfms, layer, weight_loader,
+                                      input_size_per_partition,
+                                      output_partition_sizes)
+        layer.register_module(transform_name, transform)
+        return transform
+    def process_weights_after_loading(self, layer):
+        """
+        Post-process the wrapped method's weights, then every
+        `HadamardTransform` registered as a direct child of `layer`.
+        """
+        self.quant_method.process_weights_after_loading(layer)
+        for submodule in layer.children():
+            if isinstance(submodule, HadamardTransform):
+                submodule.process_weights_after_loading()
+    def apply(self,
+              layer: torch.nn.Module,
+              x: torch.Tensor,
+              bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+        """
+        Run input transform -> wrapped `apply` -> per-partition output
+        transform. `bias` must be None (asserted).
+        """
+        if self.input_transform is not None:
+            x = self.input_transform(x)
+        assert bias is None
+        x = self.quant_method.apply(layer, x, bias)
+        # In most cases, input transforms are preferred over output transforms
+        # (@ksayers): confirm that this is done concurrently
+        if self.output_transform is not None:
+            # each partition's columns are transformed and written back in
+            # place; slicing on dim 1 assumes a 2D output - TODO confirm
+            for part_id, (start, length) in enumerate(self.partition_ranges):
+                x[:, start:start + length] = self.output_transform(
+                    x[:, start:start + length].contiguous(), part_id=part_id)
+        return x
+    def _validate_tfm_schemes(self, num_partitions: int):
+        """
+        Check that the transforms can be served by one module per side.
+        Input transforms must be present and equal for every partition;
+        output transforms must share one scheme name and one location.
+        Returns:
+            The `(input_tfms, output_tfms)` pair, unchanged.
+        Raises:
+            ValueError: if any of the conditions above is violated.
+        """
+        if self.input_tfms:
+            if 0 not in self.input_tfms:
+                raise ValueError("Must have same input")
+            reference = self.input_tfms[0]
+            for part_index in range(num_partitions):
+                # `.get` so that a partition with no matched transform is
+                # reported as a ValueError rather than a bare KeyError
+                if self.input_tfms.get(part_index) != reference:
+                    raise ValueError("Must have same input")
+        if self.output_tfms:
+            first = next(iter(self.output_tfms.values()))
+            scheme_name = first.scheme_name
+            location = first.args.location
+            for tfm in self.output_tfms.values():
+                if tfm.scheme_name != scheme_name:
+                    raise ValueError("Must have same scheme name")
+                if tfm.args.location != location:
+                    raise ValueError("Must have same location")
+        return self.input_tfms, self.output_tfms
+def get_linear_transform_schemes(
+    layer: torch.nn.Module, layer_name: str,
+    transform_config: Optional[TransformConfig],
+    packed_modules_mapping: dict[str, list[str]]
+) -> tuple[dict[int, TransformTuple], dict[
+        int, TransformTuple]]:  # [input_transform, [output_transform, ...]]
+    """
+    Collect the online transforms which target the partitions of a layer.
+    Returns:
+        `(input_tfms, output_tfms)`, each keyed by partition index. When
+        several schemes match the same partition and location, the one
+        yielded last by `get_schemes_args` wins.
+    Raises:
+        ValueError: if a matching online transform has a location other
+            than `INPUT` or `OUTPUT`.
+    """
+    # there can only be one transform input scheme per (fused) module
+    input_tfms = {}
+    output_tfms = {}
+    part_names = get_layer_partition_names(layer_name,
+                                           packed_modules_mapping)
+    for scheme_name, scheme, args in get_schemes_args(transform_config):
+        for idx, name in enumerate(part_names):
+            matched = is_match(name, layer, args.targets, args.ignore)
+            if not (matched and args.is_online()):
+                continue
+            # pick the destination first so the tuple is only built for
+            # supported locations
+            if args.location == TransformLocation.INPUT:
+                destination = input_tfms
+            elif args.location == TransformLocation.OUTPUT:
+                destination = output_tfms
+            else:
+                raise ValueError(f"Cannot apply `{args.location}` "
+                                 f"transform to `{layer_name}`")
+            destination[idx] = TransformTuple(scheme_name, scheme, args)
+    return (input_tfms, output_tfms)
+def get_schemes_args(
+    transform_config: Optional[TransformConfig]
+) -> Generator[tuple[str, TransformScheme, TransformArgs]]:
+    """
+    Yield `(scheme_name, scheme, args)` for every entry of `scheme.apply`
+    in every config group. Yields nothing when `transform_config` is None.
+    """
+    groups = {} if transform_config is None else (
+        transform_config.config_groups)
+    for name, scheme in groups.items():
+        yield from ((name, scheme, args) for args in scheme.apply)
+def get_layer_partition_names(
+        layer_name: str, packed_modules_mapping: dict[str,
+                                                      list[str]]) -> list[str]:
+    """
+    Get all partition names associated with this layer.
+    Names are returned in order of their partition indices. The first key
+    of `packed_modules_mapping` that `layer_name` ends with decides the
+    expansion; a layer matching no key is its own single partition.
+    ```python
+    mapping = {"gate_up_proj": ["gate_proj", "up_proj"]}
+    assert get_layer_partition_names(
+        "mlp.gate_up_proj", mapping) == ["mlp.gate_proj", "mlp.up_proj"]
+    assert get_layer_partition_names(
+        "mlp.down_proj", mapping) == ["mlp.down_proj"]
+    ```
+    """
+    fused = next((suffix for suffix in packed_modules_mapping
+                  if layer_name.endswith(suffix)), None)
+    if fused is None:
+        return [layer_name]
+    prefix = layer_name.removesuffix(fused)
+    return [prefix + part for part in packed_modules_mapping[fused]]

vllm/model_executor/layers/quantization/compressed_tensors/transform/module.py ADDED Viewed

@@ -0,0 +1,153 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import math
+from collections.abc import Hashable
+from typing import Callable
+import torch
+from compressed_tensors.transform import (TransformArgs, TransformLocation,
+                                          TransformScheme)
+from torch import Tensor
+import vllm._custom_ops as ops
+from vllm.distributed.parallel_state import (
+    get_tensor_model_parallel_world_size)
+from vllm.model_executor.layers.linear import LinearBase
+from vllm.model_executor.layers.quantization.compressed_tensors.transform.utils import (  # noqa: E501
+    TransformTuple)
+from vllm.model_executor.layers.utils import dispatch_unquantized_gemm
+from vllm.model_executor.layers.vocab_parallel_embedding import (
+    VocabParallelEmbedding)
+from vllm.model_executor.parameter import SharedWeightParameter
+class HadamardTransform(torch.nn.Module):
+    """
+    Class which handles weight loading, postprocessing, and application of
+    transforms. Meant to be used with `CompressedTensorsLinearTransformMethod`
+    and attention transforms method (not implemented yet)
+    """
+    transforms: dict[int, TransformTuple]  # info parsed from transforms config
+    weight: SharedWeightParameter  # container for shared tensors
+    scales: dict[int, float]  # hadamard scale, usually sqrt(matrix.size(0))
+    def __init__(self, transforms: dict[int, TransformTuple],
+                 layer: torch.nn.Module, weight_loader: Callable,
+                 input_size_per_partition: int,
+                 output_partition_sizes: list[int]):
+        """
+        Allocate one square weight partition per entry of `transforms`.
+        Raises:
+            NotImplementedError: if the tensor parallel world size is > 1.
+            ValueError: if a weight size cannot be derived for `layer`, or
+                if input transforms do not share their weight data.
+        """
+        super().__init__()
+        self.transforms = transforms
+        self.scales = {}
+        if get_tensor_model_parallel_world_size() > 1:
+            raise NotImplementedError("Online transforms with tensor "
+                                      "parallelism is not supported")
+        # Similar to row/col parallel params, but tensors are separate
+        # to allow for loading with shared memory
+        self.weight = SharedWeightParameter(weight_loader=weight_loader)
+        # create shared partition data for each partition of the original weight
+        input_size = input_size_per_partition
+        for part_index, (_scheme_name, scheme,
+                         args) in self.transforms.items():
+            output_size = output_partition_sizes[part_index]
+            weight_size = self._get_weight_size(layer, scheme, args,
+                                                input_size, output_size)
+            data_key = self._get_data_key(scheme, weight_size)
+            self.weight.add_partition(
+                part_index,
+                data_key,
+                size=(weight_size, weight_size),
+                dtype=scheme.precision,
+            )
+        # validate that shared tensors and schemes are correct
+        self._validate_input_transforms()
+    def process_weights_after_loading(self):
+        """
+        Post-process the shared weight and precompute the scale
+        `1 / sqrt(weight.size(0))` of every partition.
+        """
+        for part_id in self.weight.partitions:
+            data = self.weight.partitions[part_id].data
+            # required by torch.compile
+            # NOTE(review): called once per partition although it takes no
+            # partition argument - confirm whether a single call suffices
+            self.weight.process_weights_after_loading()
+            # precompute scale as a runtime multiply, not division
+            # do not fold into weight in order to utilize FWHT
+            self.scales[part_id] = 1 / math.sqrt(data.size(0))
+            # FUTURE: avoid runtime transpose by processing weights
+            # prior to apply
+    def forward(self, value: Tensor, part_id: int = 0) -> Tensor:
+        """
+        Apply the transform of partition `part_id` to `value`.
+        `value` is returned untouched when the partition has no weight.
+        Schemes of type "hadamard" go through `ops.hadacore_transform`;
+        every other type uses a dense gemm with the stored weight followed
+        by the precomputed scale. With `head_dim` set, the last dimension
+        is split into blocks before the transform and flattened afterwards.
+        """
+        if part_id not in self.weight.partitions:
+            return value
+        tfm = self.transforms[part_id]
+        head_dim = tfm.scheme.head_dim
+        # use hadacore if possible
+        if tfm.scheme.type == "hadamard":
+            if head_dim is not None:
+                value = value.unflatten(-1, (-1, head_dim))
+                value = ops.hadacore_transform(value)
+                return value.flatten(-2, -1)
+            # sylvester transforms are symmetric, inv => transpose => original
+            return ops.hadacore_transform(value)
+        # fall back to dense
+        weight = self.weight.partitions[part_id]
+        if not tfm.args.inverse:
+            weight = weight.T  # linear := x(W.T)
+        scale = self.scales[part_id]
+        # the gemm runs in the weight dtype; the result is cast back
+        out_dtype = value.dtype
+        if head_dim is not None:
+            value = value.unflatten(-1, (-1, weight.size(0)))
+        value = dispatch_unquantized_gemm()(self, value.to(
+            weight.dtype), weight, None).to(out_dtype) * scale
+        if head_dim is not None:
+            value = value.flatten(-2, -1)
+        return value
+    def _get_data_key(self, scheme: TransformScheme,
+                      weight_size: int) -> Hashable:
+        """
+        Key under which partition data is shared: partitions with the same
+        scheme object and weight size get the same key.
+        """
+        return (id(scheme), weight_size)
+    def _get_weight_size(self, layer: torch.nn.Module, scheme: TransformScheme,
+                         args: TransformArgs, input_size: int,
+                         output_size: int) -> int:
+        """
+        Side length of the square transform weight. `scheme.head_dim` wins
+        when set; otherwise the size follows the transform location, with
+        input/output swapped for `VocabParallelEmbedding` layers.
+        Raises:
+            ValueError: for unsupported layer types or locations.
+        """
+        if scheme.head_dim is not None:
+            return scheme.head_dim
+        if isinstance(layer, LinearBase):
+            if args.location == TransformLocation.INPUT:
+                return input_size
+            elif args.location == TransformLocation.OUTPUT:
+                return output_size
+        elif isinstance(layer, VocabParallelEmbedding):
+            if args.location == TransformLocation.INPUT:
+                return output_size
+            elif args.location == TransformLocation.OUTPUT:
+                return input_size
+        raise ValueError(
+            f"Cannot determine transform weight size for location "
+            f"`{args.location}` on layer of type `{type(layer).__name__}`")
+    def _validate_input_transforms(self):
+        """
+        For input transforms, require every partition to point at the same
+        underlying weight data.
+        Raises:
+            ValueError: if two partitions have different data pointers.
+        """
+        assert len(self.transforms) > 0
+        location = next(iter(self.transforms.values())).args.location
+        if location != TransformLocation.INPUT:
+            return
+        partitions = list(self.weight.partitions.values())
+        # any partition works as the reference: all pointers must be equal
+        shared_ptr = partitions[0].data.data_ptr()
+        for partition in partitions:
+            if partition.data.data_ptr() != shared_ptr:
+                raise ValueError("Input transforms of a fused layer must "
+                                 "share the same weight data")

vllm/model_executor/layers/quantization/compressed_tensors/transform/schemes/__init__.py ADDED Viewed

File without changes

vllm/model_executor/layers/quantization/compressed_tensors/transform/schemes/linear_qutlass_nvfp4.py ADDED Viewed

@@ -0,0 +1,46 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from typing import Optional
+import torch
+from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import (  # noqa: E501
+    CompressedTensorsScheme, CompressedTensorsW4A4Fp4)
+from vllm.model_executor.layers.quantization.compressed_tensors.transform.linear import (  # noqa: E501
+    CompressedTensorsLinearTransformMethod, TransformTuple)
+__all__ = ["is_qutlass_fp4_scheme", "QutlassNvFP4LinearMethod"]
+def is_qutlass_fp4_scheme(quant_scheme: Optional[CompressedTensorsScheme],
+                          input_tfms: dict[int, TransformTuple]) -> bool:
+    """
+    Whether the layer qualifies for `QutlassNvFP4LinearMethod`: the
+    quantization scheme is `CompressedTensorsW4A4Fp4` and there is exactly
+    one input transform, on partition 0, whose `head_dim` equals the
+    scheme's `group_size`.
+    """
+    if not isinstance(quant_scheme, CompressedTensorsW4A4Fp4):
+        return False
+    if len(input_tfms) != 1:
+        return False
+    # a lone transform registered under another partition index is simply
+    # not a match; indexing `[0]` here would raise KeyError instead
+    tfm = input_tfms.get(0)
+    if tfm is None:
+        return False
+    return tfm.scheme.head_dim == quant_scheme.group_size
+class QutlassNvFP4LinearMethod(CompressedTensorsLinearTransformMethod):
+    """
+    Transform method for `CompressedTensorsW4A4Fp4` layers with a single
+    input transform whose weight size equals the scheme's group size.
+    `apply` is not implemented.
+    """
+    def create_weights(self, layer, input_size_per_partition,
+                       output_partition_sizes, input_size, output_size,
+                       params_dtype, **extra_weight_attrs):
+        """
+        Create weights through the parent class, then assert that exactly
+        one input transform weight exists and matches the group size.
+        """
+        assert isinstance(layer.scheme, CompressedTensorsW4A4Fp4)
+        # initializes fp4 qparams
+        created = super().create_weights(
+            layer=layer,
+            input_size_per_partition=input_size_per_partition,
+            output_partition_sizes=output_partition_sizes,
+            input_size=input_size,
+            output_size=output_size,
+            params_dtype=params_dtype,
+            **extra_weight_attrs)
+        input_transform = self.input_transform
+        assert input_transform is not None
+        transform_weight = input_transform.weight
+        assert len(transform_weight) == 1
+        assert transform_weight[0].size(0) == layer.scheme.group_size
+        return created
+    def apply(self,
+              layer: torch.nn.Module,
+              x: torch.Tensor,
+              bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+        """Not implemented; always raises `NotImplementedError`."""
+        raise NotImplementedError()

vllm/model_executor/layers/quantization/compressed_tensors/transform/utils.py ADDED Viewed

@@ -0,0 +1,13 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from typing import NamedTuple
+from compressed_tensors.transform import TransformArgs, TransformScheme
+__all__ = ["TransformTuple"]
+class TransformTuple(NamedTuple):
+    """
+    One transform to apply to one partition of a layer: the name of the
+    scheme, the scheme itself, and the args entry that selected the layer.
+    """
+    # name the scheme is registered under in the transform config
+    scheme_name: str
+    # scheme definition shared by all of its args entries
+    scheme: TransformScheme
+    # the specific args entry (targets, location, ...) for this transform
+    args: TransformArgs

vllm/model_executor/layers/quantization/compressed_tensors/triton_scaled_mm.py ADDED Viewed

@@ -0,0 +1,206 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from typing import Optional
+import torch
+from vllm.triton_utils import tl, triton
+def is_weak_contiguous(x: torch.Tensor):
+    """
+    Whether the first two dimensions of `x` are laid out with one of them
+    at unit stride and the other at a stride of at least the unit-stride
+    dimension's length (padding between rows/columns is allowed).
+    """
+    stride_0, stride_1 = x.stride()[0], x.stride()[1]
+    size_0, size_1 = x.shape[0], x.shape[1]
+    # dim 1 is the fast dimension
+    if stride_1 == 1 and stride_0 >= max(1, size_1):
+        return True
+    # dim 0 is the fast dimension
+    return stride_0 == 1 and stride_1 >= max(1, size_0)
+# Triton kernel computing one (BLOCK_SIZE_M, BLOCK_SIZE_N) tile of
+# C = scale_a * (A @ B) * scale_b (+ bias) per program.
+# A is addressed through (stride_am, stride_ak), B through
+# (stride_bk, stride_bn) and C through (stride_cm, stride_cn).
+# BLOCK_SIZE_SCALE_A / BLOCK_SIZE_SCALE_B select how many scale entries are
+# loaded per tile (see the NOTE on the scale offsets below).
+@triton.jit
+def scaled_mm_kernel(a_ptr, b_ptr, scale_a_ptr, scale_b_ptr, c_ptr, bias_ptr,
+                     M, N, K, stride_am, stride_ak, stride_bk, stride_bn,
+                     stride_cm, stride_cn, ACCUMULATOR_DTYPE: tl.constexpr,
+                     BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr,
+                     BLOCK_SIZE_K: tl.constexpr,
+                     BLOCK_SIZE_SCALE_A: tl.constexpr,
+                     BLOCK_SIZE_SCALE_B: tl.constexpr):
+    # The 1D program id is split into a tile row (pid_m) and a tile column
+    # (pid_n), with tile columns varying fastest.
+    pid = tl.program_id(axis=0)
+    num_pid_n = tl.cdiv(N, BLOCK_SIZE_N)
+    pid_m = pid // num_pid_n
+    pid_n = pid % num_pid_n
+    accumulator_dtype = ACCUMULATOR_DTYPE
+    accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N),
+                           dtype=accumulator_dtype)
+    # NOTE: Some tensor inputs are so large, they will cause int32 overflow
+    # so it is necessary to use tl.int64 for all the offsets, else SEGV will
+    # eventually occur.
+    # Offsets and masks.
+    offsets_am = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M).to(tl.int64)
+    masks_am = offsets_am < M
+    offsets_bn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N).to(tl.int64)
+    masks_bn = offsets_bn < N
+    offsets_k = tl.arange(0, BLOCK_SIZE_K).to(tl.int64)
+    # element offsets of the first K-slice of the A tile and of the B tile
+    offsets_a = (stride_am * offsets_am[:, None] +
+                 stride_ak * offsets_k[None, :])
+    offsets_b = (stride_bk * offsets_k[:, None] +
+                 stride_bn * offsets_bn[None, :])
+    # NOTE: BLOCK_SIZE_SCALE_A could be 1 or BLOCK_SIZE_M, so need to create
+    # appropriate offsets and masks for each case. Same goes for
+    # BLOCK_SIZE_SCALE_B.
+    # With a block size of 1 the `(... > 1)` factor is 0, so the single
+    # scale at offset 0 is read regardless of the tile position.
+    offsets_scale_am = (tl.arange(0, BLOCK_SIZE_SCALE_A) +
+                        (BLOCK_SIZE_SCALE_A > 1) * pid_m * BLOCK_SIZE_M)
+    masks_scale_am = offsets_scale_am < M
+    offsets_scale_bn = (tl.arange(0, BLOCK_SIZE_SCALE_B) +
+                        (BLOCK_SIZE_SCALE_B > 1) * pid_n * BLOCK_SIZE_N)
+    masks_scale_bn = offsets_scale_bn < N
+    a_ptrs = a_ptr + offsets_a
+    b_ptrs = b_ptr + offsets_b
+    scale_a_ptrs = scale_a_ptr + offsets_scale_am
+    scale_b_ptrs = scale_b_ptr + offsets_scale_bn
+    # Walk the K dimension one BLOCK_SIZE_K slice at a time.
+    for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
+        masks_k = offsets_k < K
+        masks_a = masks_am[:, None] & masks_k[None, :]
+        # NOTE(review): no `other=` is passed to the masked loads - confirm
+        # that masked-out lanes are safe to feed into tl.dot
+        a = tl.load(a_ptrs, mask=masks_a)
+        masks_b = masks_k[:, None] & masks_bn[None, :]
+        b = tl.load(b_ptrs, mask=masks_b)
+        # Accumulate results.
+        accumulator = tl.dot(a, b, accumulator, out_dtype=accumulator_dtype)
+        # advance the K offsets (for the mask) and both tile pointers
+        offsets_k += BLOCK_SIZE_K
+        a_ptrs += BLOCK_SIZE_K * stride_ak
+        b_ptrs += BLOCK_SIZE_K * stride_bk
+    # Apply scale at end.
+    masks_scale_a = masks_scale_am[:, None] & (tl.arange(0, 1) < 1)[:, None]
+    scale_a = tl.load(scale_a_ptrs[:, None], masks_scale_a)
+    # Need to broadcast to the appropriate size, if scale_a is already
+    # (BLOCK_SIZE_M, 1) then it will broadcast to its own shape. Same goes
+    # for scale_b below.
+    scale_a = scale_a.broadcast_to((BLOCK_SIZE_M, 1))
+    accumulator = scale_a * accumulator.to(tl.float32)
+    masks_scale_b = masks_scale_bn[:, None] & (tl.arange(0, 1) < 1)[None, :]
+    scale_b = tl.load(scale_b_ptrs[:, None], masks_scale_b)
+    scale_b = scale_b.broadcast_to((BLOCK_SIZE_N, 1))
+    # scale_b is loaded as a column, so it is transposed to scale columns
+    accumulator = scale_b.T * accumulator.to(tl.float32)
+    # Convert to output format.
+    c = accumulator.to(c_ptr.type.element_ty)
+    # Add bias, it's already in output format, so add it after conversion.
+    if bias_ptr:
+        # one bias value per output column of this tile
+        offsets_bias = offsets_bn
+        bias_ptrs = bias_ptr + offsets_bias
+        bias_mask = offsets_bias < N
+        bias = tl.load(bias_ptrs, bias_mask)
+        c += bias
+    # Save output
+    offs_cm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M).to(tl.int64)
+    offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N).to(tl.int64)
+    offs_cm = offs_cm.to(tl.int64)
+    offs_cn = offs_cn.to(tl.int64)
+    c_ptrs = (c_ptr + stride_cm * offs_cm[:, None] +
+              stride_cn * offs_cn[None, :])
+    # only elements inside the (M, N) output are written
+    c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < N)
+    tl.store(c_ptrs, c, mask=c_mask)
+# input   - [M, K]
+# weight - [K, N]
+def triton_scaled_mm(input: torch.Tensor,
+                     weight: torch.Tensor,
+                     scale_a: torch.Tensor,
+                     scale_b: torch.Tensor,
+                     out_dtype: torch.dtype,
+                     bias: Optional[torch.Tensor] = None,
+                     block_size_m: int = 32,
+                     block_size_n: int = 32,
+                     block_size_k: int = 32,
+                     use_heuristic=True) -> torch.Tensor:
+    """
+    Scaled matrix multiply `input @ weight` launched on `scaled_mm_kernel`.
+    Args:
+        input: [M, K] matrix; must share its dtype with `weight`.
+        weight: [K, N] matrix.
+        scale_a: floating point scale for `input`, with 1 or M elements.
+        scale_b: floating point scale for `weight`, with 1 or N elements.
+        out_dtype: floating point dtype of the result.
+        bias: optional floating point bias added to the output columns.
+        block_size_m, block_size_n, block_size_k: tile shape, used only
+            when `use_heuristic` is False.
+        use_heuristic: pick the tile shape from M and N instead.
+    Returns:
+        An [M, N] tensor of dtype `out_dtype` on the device of `input`.
+    """
+    M, K = input.shape
+    N = weight.shape[1]
+    assert N > 0 and K > 0 and M > 0
+    assert weight.shape[0] == K
+    assert input.dtype == weight.dtype
+    # 0-d / 1-d scales become column vectors
+    scale_a = scale_a.reshape(-1, 1) if scale_a.dim() <= 1 else scale_a
+    scale_b = scale_b.reshape(-1, 1) if scale_b.dim() <= 1 else scale_b
+    assert scale_a.dtype == scale_b.dtype and scale_a.is_floating_point()
+    assert scale_a.shape[1] == 1 and (scale_a.shape[0] == 1
+                                      or scale_a.shape[0] == M)
+    assert scale_b.shape[1] == 1 and (scale_b.shape[0] == 1
+                                      or scale_b.shape[0] == N)
+    assert out_dtype.is_floating_point
+    assert bias is None or bias.is_floating_point()
+    assert is_weak_contiguous(input)
+    assert is_weak_contiguous(weight)
+    # one program per output tile
+    grid = lambda META: (triton.cdiv(M, META['BLOCK_SIZE_M']) * triton.cdiv(
+        N, META['BLOCK_SIZE_N']), )
+    result = torch.empty((M, N), dtype=out_dtype, device=input.device)
+    def has_scalar(x: torch.Tensor) -> bool:
+        return x.shape[0] == 1 and x.shape[1] == 1
+    if use_heuristic:
+        is_small_N = N < 8192
+        next_power_of_2_M = max(32, triton.next_power_of_2(M))
+        if next_power_of_2_M <= 32:
+            tile_shape = (64, 64, 256) if is_small_N else (64, 128, 256)
+        elif next_power_of_2_M <= 64:
+            tile_shape = (64, 64, 256)
+        elif next_power_of_2_M <= 128:
+            tile_shape = (64, 128, 128)
+        else:
+            tile_shape = (128, 128, 128)
+        # only override the caller's block sizes when a heuristic tile was
+        # chosen; unpacking outside this branch would fail with an unbound
+        # `tile_shape` for use_heuristic=False
+        block_size_m, block_size_n, block_size_k = tile_shape
+    # a single scale is loaded once; otherwise one scale per tile row/column
+    block_size_sa = 1 if has_scalar(scale_a) else block_size_m
+    block_size_sb = 1 if has_scalar(scale_b) else block_size_n
+    accumulator_dtype = tl.float32 if input.is_floating_point() else tl.int32
+    # A = input, B = weight, C = result
+    # A = M x K, B = K x N, C = M x N
+    scaled_mm_kernel[grid](input,
+                           weight,
+                           scale_a,
+                           scale_b,
+                           result,
+                           bias,
+                           M,
+                           N,
+                           K,
+                           input.stride(0),
+                           input.stride(1),
+                           weight.stride(0),
+                           weight.stride(1),
+                           result.stride(0),
+                           result.stride(1),
+                           accumulator_dtype,
+                           BLOCK_SIZE_M=block_size_m,
+                           BLOCK_SIZE_N=block_size_n,
+                           BLOCK_SIZE_K=block_size_k,
+                           BLOCK_SIZE_SCALE_A=block_size_sa,
+                           BLOCK_SIZE_SCALE_B=block_size_sb)
+    # `result` was allocated with `out_dtype`, so no conversion is needed
+    return result