sglang 0.4.3.post4__py3-none-any.whl → 0.4.4.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (131)
  1. sglang/bench_serving.py +1 -1
  2. sglang/lang/chat_template.py +29 -0
  3. sglang/srt/_custom_ops.py +19 -17
  4. sglang/srt/configs/__init__.py +2 -0
  5. sglang/srt/configs/janus_pro.py +629 -0
  6. sglang/srt/configs/model_config.py +24 -14
  7. sglang/srt/conversation.py +80 -2
  8. sglang/srt/custom_op.py +64 -3
  9. sglang/srt/distributed/device_communicators/custom_all_reduce.py +18 -17
  10. sglang/srt/distributed/parallel_state.py +10 -1
  11. sglang/srt/entrypoints/engine.py +5 -3
  12. sglang/srt/entrypoints/http_server.py +1 -1
  13. sglang/srt/function_call_parser.py +33 -2
  14. sglang/srt/hf_transformers_utils.py +16 -1
  15. sglang/srt/layers/attention/flashinfer_backend.py +1 -1
  16. sglang/srt/layers/attention/flashinfer_mla_backend.py +317 -57
  17. sglang/srt/layers/attention/triton_backend.py +1 -3
  18. sglang/srt/layers/attention/triton_ops/decode_attention.py +6 -6
  19. sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +3 -3
  20. sglang/srt/layers/attention/triton_ops/extend_attention.py +4 -4
  21. sglang/srt/layers/attention/triton_ops/rocm_mla_decode_rope.py +3 -3
  22. sglang/srt/layers/attention/vision.py +43 -62
  23. sglang/srt/layers/dp_attention.py +30 -2
  24. sglang/srt/layers/elementwise.py +411 -0
  25. sglang/srt/layers/linear.py +1 -1
  26. sglang/srt/layers/logits_processor.py +1 -0
  27. sglang/srt/layers/moe/ep_moe/kernels.py +2 -1
  28. sglang/srt/layers/moe/ep_moe/layer.py +25 -9
  29. sglang/srt/layers/moe/fused_moe_triton/configs/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  30. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  31. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  32. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  33. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  34. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  35. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +63 -23
  36. sglang/srt/layers/moe/fused_moe_triton/layer.py +16 -4
  37. sglang/srt/layers/moe/router.py +342 -0
  38. sglang/srt/layers/parameter.py +10 -0
  39. sglang/srt/layers/quantization/__init__.py +90 -68
  40. sglang/srt/layers/quantization/blockwise_int8.py +1 -2
  41. sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  42. sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  43. sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  44. sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  45. sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  46. sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  47. sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  48. sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  49. sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  50. sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  51. sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  52. sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  53. sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  54. sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  55. sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  56. sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  57. sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  58. sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  59. sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  60. sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  61. sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  62. sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  63. sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  64. sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  65. sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  66. sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  67. sglang/srt/layers/quantization/fp8.py +174 -106
  68. sglang/srt/layers/quantization/fp8_kernel.py +210 -38
  69. sglang/srt/layers/quantization/fp8_utils.py +156 -15
  70. sglang/srt/layers/quantization/modelopt_quant.py +5 -1
  71. sglang/srt/layers/quantization/w8a8_fp8.py +128 -0
  72. sglang/srt/layers/quantization/w8a8_int8.py +152 -3
  73. sglang/srt/layers/rotary_embedding.py +5 -3
  74. sglang/srt/layers/sampler.py +29 -35
  75. sglang/srt/layers/vocab_parallel_embedding.py +0 -1
  76. sglang/srt/lora/backend/__init__.py +9 -12
  77. sglang/srt/managers/cache_controller.py +74 -8
  78. sglang/srt/managers/data_parallel_controller.py +1 -1
  79. sglang/srt/managers/image_processor.py +37 -631
  80. sglang/srt/managers/image_processors/base_image_processor.py +219 -0
  81. sglang/srt/managers/image_processors/janus_pro.py +79 -0
  82. sglang/srt/managers/image_processors/llava.py +152 -0
  83. sglang/srt/managers/image_processors/minicpmv.py +86 -0
  84. sglang/srt/managers/image_processors/mlama.py +60 -0
  85. sglang/srt/managers/image_processors/qwen_vl.py +161 -0
  86. sglang/srt/managers/io_struct.py +32 -15
  87. sglang/srt/managers/multi_modality_padding.py +134 -0
  88. sglang/srt/managers/schedule_batch.py +213 -118
  89. sglang/srt/managers/schedule_policy.py +40 -8
  90. sglang/srt/managers/scheduler.py +176 -683
  91. sglang/srt/managers/scheduler_output_processor_mixin.py +614 -0
  92. sglang/srt/managers/tokenizer_manager.py +6 -6
  93. sglang/srt/managers/tp_worker_overlap_thread.py +4 -1
  94. sglang/srt/mem_cache/base_prefix_cache.py +6 -8
  95. sglang/srt/mem_cache/chunk_cache.py +12 -44
  96. sglang/srt/mem_cache/hiradix_cache.py +71 -34
  97. sglang/srt/mem_cache/memory_pool.py +81 -17
  98. sglang/srt/mem_cache/paged_allocator.py +283 -0
  99. sglang/srt/mem_cache/radix_cache.py +117 -36
  100. sglang/srt/model_executor/cuda_graph_runner.py +68 -20
  101. sglang/srt/model_executor/forward_batch_info.py +23 -10
  102. sglang/srt/model_executor/model_runner.py +63 -63
  103. sglang/srt/model_loader/loader.py +2 -1
  104. sglang/srt/model_loader/weight_utils.py +1 -1
  105. sglang/srt/models/deepseek_janus_pro.py +2127 -0
  106. sglang/srt/models/deepseek_nextn.py +23 -3
  107. sglang/srt/models/deepseek_v2.py +200 -191
  108. sglang/srt/models/grok.py +374 -119
  109. sglang/srt/models/minicpmv.py +28 -89
  110. sglang/srt/models/mllama.py +1 -1
  111. sglang/srt/models/qwen2.py +0 -1
  112. sglang/srt/models/qwen2_5_vl.py +25 -50
  113. sglang/srt/models/qwen2_vl.py +33 -49
  114. sglang/srt/openai_api/adapter.py +59 -35
  115. sglang/srt/openai_api/protocol.py +8 -1
  116. sglang/srt/sampling/penaltylib/frequency_penalty.py +0 -1
  117. sglang/srt/sampling/penaltylib/presence_penalty.py +0 -1
  118. sglang/srt/server_args.py +24 -16
  119. sglang/srt/speculative/eagle_worker.py +75 -39
  120. sglang/srt/utils.py +104 -9
  121. sglang/test/runners.py +104 -10
  122. sglang/test/test_block_fp8.py +106 -16
  123. sglang/test/test_custom_ops.py +88 -0
  124. sglang/test/test_utils.py +20 -4
  125. sglang/utils.py +0 -4
  126. sglang/version.py +1 -1
  127. {sglang-0.4.3.post4.dist-info → sglang-0.4.4.post1.dist-info}/METADATA +9 -10
  128. {sglang-0.4.3.post4.dist-info → sglang-0.4.4.post1.dist-info}/RECORD +131 -84
  129. {sglang-0.4.3.post4.dist-info → sglang-0.4.4.post1.dist-info}/WHEEL +1 -1
  130. {sglang-0.4.3.post4.dist-info → sglang-0.4.4.post1.dist-info}/LICENSE +0 -0
  131. {sglang-0.4.3.post4.dist-info → sglang-0.4.4.post1.dist-info}/top_level.txt +0 -0
sglang/srt/layers/quantization/fp8_kernel.py
@@ -22,17 +22,54 @@ import torch
 import triton
 import triton.language as tl
 
-from sglang.srt.utils import get_device_core_count, get_device_name, is_hip
-
-is_hip_ = is_hip()
-fp8_type_ = torch.float8_e4m3fnuz if is_hip_ else torch.float8_e4m3fn
-
-_is_cuda = torch.cuda.is_available() and torch.version.cuda
+from sglang.srt.utils import (
+    direct_register_custom_op,
+    get_device_core_count,
+    get_device_name,
+    is_cuda,
+    is_hip,
+    supports_custom_op,
+)
+
+_is_hip = is_hip()
+fp8_type_ = torch.float8_e4m3fnuz if _is_hip else torch.float8_e4m3fn
+
+_is_cuda = is_cuda()
 if _is_cuda:
-    from sgl_kernel import sgl_per_token_group_quant_fp8
+    import deep_gemm  # `pip install "sgl-kernel>=0.0.4.post3"`
+    from sgl_kernel import sgl_per_token_group_quant_fp8, sgl_per_token_quant_fp8
 
 logger = logging.getLogger(__name__)
 
+_enable_jit_deepgemm = int(os.getenv("SGL_ENABLE_JIT_DEEPGEMM", "0"))
+
+if supports_custom_op():
+
+    def deep_gemm_fp8_fp8_bf16_nt(
+        A: torch.Tensor,
+        As: torch.Tensor,
+        B: torch.Tensor,
+        Bs: torch.Tensor,
+        C: torch.Tensor,
+    ) -> None:
+        deep_gemm.gemm_fp8_fp8_bf16_nt((A, As), (B, Bs), C)
+
+    def deep_gemm_fp8_fp8_bf16_nt_fake(
+        A: torch.Tensor,
+        As: torch.Tensor,
+        B: torch.Tensor,
+        Bs: torch.Tensor,
+        C: torch.Tensor,
+    ) -> None:
+        return
+
+    direct_register_custom_op(
+        op_name="deep_gemm_fp8_fp8_bf16_nt",
+        op_func=deep_gemm_fp8_fp8_bf16_nt,
+        mutates_args=["C"],
+        fake_impl=deep_gemm_fp8_fp8_bf16_nt_fake,
+    )
+
 
 @triton.jit
 def _per_token_group_quant_fp8(
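The hunk above wires DeepGEMM in behind an environment flag and registers the call as a torch custom op (with a no-op fake implementation so that graph tracing can handle the in-place write to C). A hedged usage sketch, assuming a CUDA build with deep_gemm and sgl-kernel installed; only the flag name SGL_ENABLE_JIT_DEEPGEMM and the import path come from the diff, everything else is illustrative:

```python
# Illustrative sketch: opting into the JIT DeepGEMM path registered above.
# The flag is read once at module import time, so set it beforehand.
import os

os.environ["SGL_ENABLE_JIT_DEEPGEMM"] = "1"

from sglang.srt.layers.quantization.fp8_kernel import w8a8_block_fp8_matmul

# With the flag set (and a bf16 output tensor), w8a8_block_fp8_matmul dispatches
# to deep_gemm.gemm_fp8_fp8_bf16_nt, via torch.ops.sglang.deep_gemm_fp8_fp8_bf16_nt
# when custom ops are supported, instead of the Triton block-wise kernel.
```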
@@ -70,7 +107,8 @@ def _per_token_group_quant_fp8(
     # Quant
     _absmax = tl.maximum(tl.max(tl.abs(y)), eps)
     y_s = _absmax / fp8_max
-    y_q = tl.clamp(y / y_s, fp8_min, fp8_max).to(y_q_ptr.dtype.element_ty)
+    y_s_inv = 1.0 / y_s
+    y_q = tl.clamp(y * y_s_inv, fp8_min, fp8_max).to(y_q_ptr.dtype.element_ty)
 
     tl.store(y_q_ptr + cols, y_q, mask=mask)
     tl.store(y_s_ptr, y_s)
@@ -140,7 +178,7 @@ def per_token_group_quant_fp8(
         x: The input tenosr with ndim >= 2.
         group_size: The group size used for quantization.
         eps: The minimum to avoid dividing zero.
-        dtype: The dype of output tensor. Note that only `torch.float8_e4m3fn` is supported for now.
+        dtype: The dype of output tensor.
 
     Returns:
         Tuple[torch.Tensor, torch.Tensor]: The quantized tensor and the scaling factor for quantization.
@@ -153,7 +191,7 @@ def per_token_group_quant_fp8(
     finfo = torch.finfo(dtype)
     fp8_max = finfo.max
 
-    if is_hip_:
+    if _is_hip:
         fp8_max = 224.0
 
     fp8_min = -fp8_max
@@ -241,6 +279,132 @@ def sglang_per_token_group_quant_fp8(
     return x_q, x_s
 
 
+def sglang_per_token_quant_fp8(
+    x: torch.Tensor,
+    dtype: torch.dtype = fp8_type_,
+):
+    assert x.is_contiguous(), "`x` is not contiguous"
+
+    x_q = torch.empty_like(x, device=x.device, dtype=dtype)
+    x_s = torch.empty(
+        x.shape[0],
+        1,
+        device=x.device,
+        dtype=torch.float32,
+    )
+
+    sgl_per_token_quant_fp8(x, x_q, x_s)
+
+    return x_q, x_s
+
+
+@triton.jit
+def _static_quant_fp8(
+    # Pointers to inputs and output
+    y_ptr,
+    y_q_ptr,
+    y_s_ptr,
+    y_s_repeat_ptr,
+    # Stride of input
+    y_stride,
+    # Collums of input
+    N,
+    # Information for float8
+    fp8_min,
+    fp8_max,
+    # Meta-parameters
+    BLOCK: tl.constexpr,
+    REPEAT_SCALE: tl.constexpr,
+):
+    """A Triton-accelerated function to perform quantization using the given scale on a
+    tensor
+
+    This function converts the tensor values into float8 values.
+    """
+    # Map the program id to the row of X and Y it should compute.
+    g_id = tl.program_id(0)
+    y_ptr += g_id * y_stride
+    y_q_ptr += g_id * y_stride
+    if REPEAT_SCALE:
+        y_s_repeat_ptr += g_id
+
+    cols = tl.arange(0, BLOCK)  # N <= BLOCK
+    mask = cols < N
+
+    y = tl.load(y_ptr + cols, mask=mask, other=0.0).to(tl.float32)
+    y_s = tl.load(y_s_ptr).to(tl.float32)
+    y_s_inv = 1.0 / y_s
+    y_q = tl.clamp(y * y_s_inv, fp8_min, fp8_max).to(y_q_ptr.dtype.element_ty)
+
+    tl.store(y_q_ptr + cols, y_q, mask=mask)
+    if REPEAT_SCALE:
+        tl.store(y_s_repeat_ptr, y_s)
+
+
+def static_quant_fp8(
+    x: torch.Tensor,
+    x_s: torch.Tensor,
+    repeat_scale: bool = False,
+    dtype: torch.dtype = fp8_type_,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """Function to perform static quantization using the given scale on an input tensor `x`.
+
+    It converts the tensor values into signed float8 values and returns the
+    quantized tensor along with the scaling factor used for quantization.
+
+    Args:
+        x: The input tenosr with ndim >= 2.
+        x_s: The quantization scale.
+        repeat_scale: Whether to broadcast per-tensor scale to per-channel scale.
+        dtype: The dype of output tensor.
+
+    Returns:
+        Tuple[torch.Tensor, torch.Tensor]: The quantized tensor and the scaling factor for quantization.
+    """
+    assert x.is_contiguous(), "`x` is not contiguous"
+    assert x_s.numel() == 1, "only supports per-tensor scale"
+    finfo = torch.finfo(dtype)
+    fp8_max = finfo.max
+
+    if _is_hip:
+        fp8_max = 224.0
+
+    fp8_min = -fp8_max
+
+    x_q = torch.empty_like(x, device=x.device, dtype=dtype)
+    M = x.numel() // x.shape[-1]
+    N = x.shape[-1]
+    if repeat_scale:
+        x_s_repeat = torch.empty(
+            (M, 1),
+            device=x.device,
+            dtype=torch.float32,
+        )
+    else:
+        x_s_repeat = None
+
+    BLOCK = triton.next_power_of_2(N)
+    # heuristics for number of warps
+    num_warps = min(max(BLOCK // 256, 1), 8)
+    num_stages = 1
+    _static_quant_fp8[(M,)](
+        x,
+        x_q,
+        x_s,
+        x_s_repeat,
+        N,
+        N,
+        fp8_min=fp8_min,
+        fp8_max=fp8_max,
+        BLOCK=BLOCK,
+        REPEAT_SCALE=repeat_scale,
+        num_warps=num_warps,
+        num_stages=num_stages,
+    )
+    x_s = x_s_repeat if repeat_scale else x_s
+    return x_q, x_s
+
+
 
 @triton.jit
 def _w8a8_block_fp8_matmul(
@@ -595,34 +759,42 @@ def w8a8_block_fp8_matmul(
     num_workgroups = triton.cdiv(M, config["BLOCK_SIZE_M"]) * triton.cdiv(
         N, config["BLOCK_SIZE_N"]
     )
-    kernel = (
-        _w8a8_block_fp8_matmul_unrolledx4
-        if (is_hip_ == True and num_workgroups <= get_device_core_count())
-        else _w8a8_block_fp8_matmul
-    )
 
-    kernel[grid](
-        A,
-        B,
-        C,
-        As,
-        Bs,
-        M,
-        N,
-        K,
-        block_n,
-        block_k,
-        A.stride(-2),
-        A.stride(-1),
-        B.stride(1),
-        B.stride(0),
-        C.stride(-2),
-        C.stride(-1),
-        As.stride(-2),
-        As.stride(-1),
-        Bs.stride(1),
-        Bs.stride(0),
-        **config,
-    )
+    # deepgemm only support bf16
+    if _is_cuda and C.dtype == torch.bfloat16 and _enable_jit_deepgemm:
+        if supports_custom_op():
+            torch.ops.sglang.deep_gemm_fp8_fp8_bf16_nt(A, As, B, Bs, C)
+        else:
+            deep_gemm.gemm_fp8_fp8_bf16_nt((A, As), (B, Bs), C)
+    else:
+        kernel = (
+            _w8a8_block_fp8_matmul_unrolledx4
+            if (_is_hip == True and num_workgroups <= get_device_core_count())
+            else _w8a8_block_fp8_matmul
+        )
+
+        kernel[grid](
+            A,
+            B,
+            C,
+            As,
+            Bs,
+            M,
+            N,
+            K,
+            block_n,
+            block_k,
+            A.stride(-2),
+            A.stride(-1),
+            B.stride(1),
+            B.stride(0),
+            C.stride(-2),
+            C.stride(-1),
+            As.stride(-2),
+            As.stride(-1),
+            Bs.stride(1),
+            Bs.stride(0),
+            **config,
+        )
 
     return C
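The kernel changes above keep the same group-quantization semantics and only trade the per-element division for a multiply by the reciprocal scale. As a package-independent sketch of what per_token_group_quant_fp8 computes (the eps default and the scale layout here are assumptions; the real kernel can also emit column-major scales):

```python
import torch


def per_token_group_quant_fp8_ref(
    x: torch.Tensor,
    group_size: int,
    eps: float = 1e-10,
    dtype: torch.dtype = torch.float8_e4m3fn,
):
    """Reference: split the last dim into groups, scale each group by absmax / fp8_max."""
    assert x.shape[-1] % group_size == 0
    fp8_max = torch.finfo(dtype).max
    g = x.float().reshape(-1, group_size)  # one row per group
    scale = g.abs().amax(dim=-1, keepdim=True).clamp_min(eps) / fp8_max
    q = (g / scale).clamp(-fp8_max, fp8_max).to(dtype)
    return q.reshape_as(x), scale.reshape(*x.shape[:-1], -1)


x = torch.randn(2, 256)
q, s = per_token_group_quant_fp8_ref(x, group_size=128)
print(q.shape, s.shape)  # torch.Size([2, 256]) torch.Size([2, 2])
```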
sglang/srt/layers/quantization/fp8_utils.py
@@ -1,23 +1,53 @@
-import os
 from typing import List, Optional, Tuple
 
 import torch
 
-from sglang.srt.layers.parameter import RowvLLMParameter, _ColumnvLLMParameter
 from sglang.srt.layers.quantization.fp8_kernel import (
     per_token_group_quant_fp8,
+    static_quant_fp8,
     w8a8_block_fp8_matmul,
 )
-from sglang.srt.utils import get_bool_env_var, is_hip
+from sglang.srt.utils import (
+    get_bool_env_var,
+    get_cuda_version,
+    get_device_capability,
+    is_cuda,
+    is_hip,
+)
+
+use_vllm_cutlass_w8a8_fp8_kernel = get_bool_env_var("USE_VLLM_CUTLASS_W8A8_FP8_KERNEL")
 
-is_hip_ = is_hip()
-if is_hip_ and get_bool_env_var("CK_MOE"):
+_is_hip = is_hip()
+if _is_hip and get_bool_env_var("CK_MOE"):
     from aiter import gemm_a8w8_blockscale
 
-_is_cuda = torch.cuda.is_available() and torch.version.cuda
+_is_cuda = is_cuda()
 if _is_cuda:
     from sgl_kernel import fp8_blockwise_scaled_mm
 
+    from sglang.srt.layers.quantization.fp8_kernel import sglang_per_token_quant_fp8
+
+    if use_vllm_cutlass_w8a8_fp8_kernel:
+        from vllm import _custom_ops as ops
+    else:
+        from sgl_kernel import fp8_scaled_mm
+
+# Input scaling factors are no longer optional in _scaled_mm starting
+# from pytorch 2.5. Allocating a dummy tensor to pass as input_scale
+TORCH_DEVICE_IDENTITY = torch.ones(1, dtype=torch.float32)
+
+
+def cutlass_fp8_supported():
+    if not _is_cuda:
+        return False
+    major, minor = get_device_capability()
+    cuda_version = get_cuda_version()
+    if major >= 9:
+        return cuda_version >= (12, 0)
+    elif major == 8 and minor == 9:
+        return cuda_version >= (12, 4)
+    return False
+
 
 def normalize_e4m3fn_to_e4m3fnuz(
     weight: torch.Tensor,
@@ -44,7 +74,7 @@ def normalize_e4m3fn_to_e4m3fnuz(
 
 
 def cutlass_block_fp8_supported() -> bool:
-    if os.environ.get("SUPPORT_CUTLASS_BLOCK_FP8") is None:
+    if get_bool_env_var("SUPPORT_CUTLASS_BLOCK_FP8"):
         return False
     if _is_cuda:
         major, minor = torch.cuda.get_device_capability()
@@ -81,7 +111,7 @@ def apply_w8a8_block_fp8_linear(
         output = fp8_blockwise_scaled_mm(
             q_input, weight.T, x_scale, weight_scale.T, out_dtype=input.dtype
         )
-    elif is_hip_ and get_bool_env_var("CK_MOE"):
+    elif _is_hip and get_bool_env_var("CK_MOE"):
         q_input, x_scale = per_token_group_quant_fp8(
             input_2d, block_size[1], column_major_scales=False
         )
@@ -112,7 +142,7 @@ def input_to_float8(
     min_val, max_val = x.aminmax()
     amax = torch.maximum(min_val.abs(), max_val.abs()).clamp(min=1e-12)
     fp8_max = finfo.max
-    if is_hip_:
+    if _is_hip:
         fp8_max = 224.0
     scale = fp8_max / amax
     x_scl_sat = (x * scale).clamp(min=-fp8_max, max=fp8_max)
@@ -158,10 +188,121 @@ def block_quant_to_tensor_quant(
     return x_q_tensor, scale
 
 
-class BlockQuantScaleParameter(_ColumnvLLMParameter, RowvLLMParameter):
-    """
-    Parameter class for weight scales loaded for weights with
-    block-wise quantization. Uses both column and row parallelism.
-    """
+def apply_fp8_linear(
+    input: torch.Tensor,
+    weight: torch.Tensor,
+    weight_scale: torch.Tensor,
+    input_scale: Optional[torch.Tensor] = None,
+    input_scale_ub: Optional[torch.Tensor] = None,
+    bias: Optional[torch.Tensor] = None,
+    cutlass_fp8_supported: bool = True,
+    use_per_token_if_dynamic: bool = False,
+) -> torch.Tensor:
+    # View input as 2D matrix for fp8 methods
+    input_2d = input.view(-1, input.shape[-1])
+    output_shape = [*input.shape[:-1], weight.shape[1]]
+
+    # cutlass w8a8 fp8 sgl-kernel only supports per-token scale
+    if input_scale is not None:
+        assert input_scale.numel() == 1
+        # broadcast per-tensor scale to per-token scale when supporting cutlass
+        qinput, x_scale = static_quant_fp8(
+            input_2d, input_scale, repeat_scale=cutlass_fp8_supported
+        )
+    else:
+        # default use per-token quantization if dynamic
+        if _is_cuda:
+            qinput, x_scale = sglang_per_token_quant_fp8(input_2d)
+        else:
+            qinput, x_scale = per_token_group_quant_fp8(
+                input_2d, group_size=input_2d.shape[1]
+            )
+
+    if cutlass_fp8_supported:
+        if use_vllm_cutlass_w8a8_fp8_kernel:
+            # Fall back to vllm cutlass w8a8 fp8 kernel
+            output = ops.cutlass_scaled_mm(
+                qinput,
+                weight,
+                out_dtype=input.dtype,
+                scale_a=x_scale,
+                scale_b=weight_scale,
+                bias=bias,
+            )
+        else:
+            assert (
+                weight_scale.numel() == weight.shape[1]
+            ), "cutlass w8a8 fp8 sgl-kernel only supports per-channel scale"
+            output = fp8_scaled_mm(
+                qinput, weight, x_scale, weight_scale, out_dtype=input.dtype, bias=bias
+            )
+        return output.view(*output_shape)
+
+    # torch.scaled_mm supports per tensor weights + activations only
+    # so fallback to naive if per channel or per token
+    else:
+        per_tensor_weights = weight_scale.numel() == 1
+        per_tensor_activations = x_scale.numel() == 1
+
+        if per_tensor_weights and per_tensor_activations:
+            # Fused GEMM_DQ
+            output = torch._scaled_mm(
+                qinput,
+                weight,
+                out_dtype=input.dtype,
+                scale_a=x_scale,
+                scale_b=weight_scale,
+                bias=bias,
+            )
+            # A fix for discrepancy in scaled_mm which returns tuple
+            # for torch < 2.5 and a single value in torch >= 2.5
+            if type(output) is tuple and len(output) == 2:
+                output = output[0]
+
+            return torch.narrow(output, 0, 0, input_2d.shape[0]).view(*output_shape)
+
+        else:
+            # Fallback for channelwise case, where we use unfused DQ
+            # due to limitations with scaled_mm
+
+            # Symmetric quantized GEMM by definition computes the following:
+            # C = (s_x * X) (s_w * W) + bias
+            # This is equivalent to dequantizing the weights and activations
+            # before applying a GEMM.
+            #
+            # In order to compute quantized operands, a quantized kernel
+            # will rewrite the above like so:
+            # C = s_w * s_x * (X * W) + bias
+            #
+            # For the scaled_mm fallback case, we break this down, since it
+            # does not support s_w being a vector.
+
+            # Making sure the dummy tensor is on the same device as the weight
+            global TORCH_DEVICE_IDENTITY
+            if TORCH_DEVICE_IDENTITY.device != weight.device:
+                TORCH_DEVICE_IDENTITY = TORCH_DEVICE_IDENTITY.to(weight.device)
+
+            # GEMM
+            # This computes C = (X * W).
+            # Output in fp32 to allow subsequent ops to happen in-place
+            output = torch._scaled_mm(
+                qinput,
+                weight,
+                scale_a=TORCH_DEVICE_IDENTITY,
+                scale_b=TORCH_DEVICE_IDENTITY,
+                out_dtype=torch.float32,
+            )
+            # A fix for discrepancy in scaled_mm which returns tuple
+            # for torch < 2.5 and a single value in torch >= 2.5
+            if type(output) is tuple and len(output) == 2:
+                output = output[0]
+            # Unpad (undo num_token_padding)
+            output = torch.narrow(output, 0, 0, input_2d.shape[0])
+            x_scale = torch.narrow(x_scale, 0, 0, input_2d.shape[0])
 
-    pass
+            # DQ
+            # C = sw * sx * (X * W) + bias
+            output = output * x_scale * weight_scale.t()
+            if bias is not None:
+                output = output + bias
+            return output.to(dtype=input.dtype).view(*output_shape)
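The new apply_fp8_linear follows the usual W8A8 recipe: quantize the activations (per token when dynamic), run the FP8 GEMM, then dequantize with the product of activation and weight scales, which is what the channelwise fallback spells out as C = s_x * s_w * (X * W) + bias. A small self-contained sketch of that identity with ordinary float tensors (purely illustrative; no FP8 types involved, and the 127.0 divisor is an arbitrary stand-in for a quantization range):

```python
import torch

torch.manual_seed(0)
M, K, N = 4, 8, 6
X = torch.randn(M, K)
W = torch.randn(K, N)

# Per-token activation scales (M, 1) and per-channel weight scales (N, 1),
# matching the layout the per-channel path above expects.
s_x = X.abs().amax(dim=1, keepdim=True) / 127.0
s_w = W.abs().amax(dim=0, keepdim=True).t() / 127.0

# "Quantized" operands (kept in float here just to show the algebra).
X_q = X / s_x
W_q = W / s_w.t()

# C = s_x * s_w^T * (X_q @ W_q)  ==  X @ W  (up to rounding in a real int8/fp8 path)
C = (X_q @ W_q) * s_x * s_w.t()
print(torch.allclose(C, X @ W, atol=1e-5))  # True
```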
sglang/srt/layers/quantization/modelopt_quant.py
@@ -7,7 +7,7 @@ import torch
 from torch.nn.parameter import Parameter
 from vllm.model_executor.layers.quantization.kv_cache import BaseKVCacheMethod
 from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
-    apply_fp8_linear,
+    convert_to_channelwise,
     cutlass_fp8_supported,
     requantize_with_max_scale,
 )
@@ -19,6 +19,7 @@ from sglang.srt.layers.quantization.base_config import (
     QuantizationConfig,
     QuantizeMethodBase,
 )
+from sglang.srt.layers.quantization.fp8_utils import apply_fp8_linear
 
 # Initialize logger for the module
 logger = logging.getLogger(__name__)
@@ -161,6 +162,9 @@ class ModelOptFp8LinearMethod(LinearMethodBase):
             layer.weight, layer.weight_scale, layer.logical_widths
         )
         layer.weight = Parameter(quantized_weight.t(), requires_grad=False)
+        # cutlass sgl-kernel only supports per-channel scale
+        if self.cutlass_fp8_supported:
+            max_w_scale = convert_to_channelwise(max_w_scale, layer.logical_widths)
         layer.weight_scale = Parameter(max_w_scale, requires_grad=False)
         layer.input_scale = Parameter(layer.input_scale.max(), requires_grad=False)
 
sglang/srt/layers/quantization/w8a8_fp8.py (new file)
@@ -0,0 +1,128 @@
+from typing import Any, Dict, List, Optional
+
+import torch
+from torch.nn.parameter import Parameter
+
+from sglang.srt.layers.linear import LinearMethodBase
+from sglang.srt.layers.parameter import ChannelQuantScaleParameter, ModelWeightParameter
+from sglang.srt.layers.quantization.base_config import (
+    QuantizationConfig,
+    QuantizeMethodBase,
+)
+from sglang.srt.layers.quantization.fp8_utils import (
+    apply_fp8_linear,
+    cutlass_fp8_supported,
+    normalize_e4m3fn_to_e4m3fnuz,
+)
+from sglang.srt.utils import is_hip
+
+_is_hip = is_hip()
+
+
+class W8A8Fp8Config(QuantizationConfig):
+    """Config class for W8A8 FP8 Quantization.
+
+    - Weight: static, per-channel, symmetric
+    - Activation: dynamic, per-token, symmetric
+    """
+
+    def __init__(self):
+        pass
+
+    @classmethod
+    def get_supported_act_dtypes(cls) -> List[torch.dtype]:
+        return [torch.float16, torch.bfloat16]
+
+    @classmethod
+    def get_min_capability(cls) -> int:
+        return 89
+
+    @classmethod
+    def get_name(self) -> str:
+        return "w8a8_fp8"
+
+    @classmethod
+    def get_config_filenames(cls) -> List[str]:
+        return []
+
+    @classmethod
+    def from_config(cls, config: Dict[str, Any]) -> "W8A8Fp8Config":
+        return cls()
+
+    def get_quant_method(
+        self,
+        layer: torch.nn.Module,
+        prefix: str,
+    ) -> Optional["QuantizeMethodBase"]:
+        from sglang.srt.layers.linear import LinearBase
+
+        if isinstance(layer, LinearBase):
+            return W8A8Fp8LinearMethod(self)
+        return None
+
+    def get_scaled_act_names(self) -> List[str]:
+        return []
+
+
+class W8A8Fp8LinearMethod(LinearMethodBase):
+
+    def __init__(self, quantization_config: W8A8Fp8Config):
+        self.cutlass_fp8_supported = cutlass_fp8_supported()
+        self.quantization_config = quantization_config
+
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        weight = layer.weight
+        weight_scale = layer.weight_scale.detach()
+        if _is_hip:
+            weight, weight_scale, _ = normalize_e4m3fn_to_e4m3fnuz(
+                weight=weight, weight_scale=weight_scale
+            )
+        layer.weight = Parameter(weight.t(), requires_grad=False)
+        layer.weight_scale = Parameter(weight_scale, requires_grad=False)
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        input_size_per_partition: int,
+        output_partition_sizes: List[int],
+        input_size: int,
+        output_size: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs
+    ):
+
+        weight_loader = extra_weight_attrs.get("weight_loader")
+        self.logical_widths = output_partition_sizes
+
+        weight = ModelWeightParameter(
+            data=torch.empty(
+                sum(output_partition_sizes),
+                input_size_per_partition,
+                dtype=torch.float8_e4m3fn,
+            ),
+            input_dim=1,
+            output_dim=0,
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("weight", weight)
+
+        weight_scale = ChannelQuantScaleParameter(
+            data=torch.empty((sum(output_partition_sizes), 1), dtype=torch.float32),
+            output_dim=0,
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("weight_scale", weight_scale)
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        bias: Optional[torch.Tensor] = None,
+    ):
+        return apply_fp8_linear(
+            x,
+            layer.weight,
+            layer.weight_scale,
+            bias=bias,
+            cutlass_fp8_supported=self.cutlass_fp8_supported,
+        )
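W8A8Fp8LinearMethod above expects each linear layer's checkpoint to provide an FP8 weight of shape [out_features, in_features] plus a per-output-channel float32 weight_scale of shape [out_features, 1]; after loading, the weight is transposed for the GEMM. A minimal stand-alone sketch of that layout (tensor names mirror the code above; the example values are made up and nothing here is part of the package):

```python
import torch

out_features, in_features = 6, 8

# What the loader is expected to provide per linear layer:
weight = torch.randn(out_features, in_features).to(torch.float8_e4m3fn)  # fp8 weight
weight_scale = torch.rand(out_features, 1, dtype=torch.float32)          # per-channel scale

# process_weights_after_loading() keeps the scale as-is and transposes the
# weight so the runtime GEMM sees an [in_features, out_features] operand.
weight_t = weight.t()

# apply() then calls apply_fp8_linear(x, weight_t, weight_scale, ...), which
# quantizes x per token and dequantizes the product with x_scale * weight_scale.
# Shape-wise, the dequantized weight behaves like:
x = torch.randn(3, in_features)
y_ref = x @ (weight_t.float() * weight_scale.t())
print(weight_t.shape, weight_scale.shape, y_ref.shape)
```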