PyPI - sglang - Versions diffs - 0.4.6.post5__py3-none-any.whl → 0.4.7__py3-none-any.whl - Mend

sglang 0.4.6.post5__py3-none-any.whl → 0.4.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (318) hide show

sglang/srt/layers/layernorm.py CHANGED Viewed

@@ -20,10 +20,11 @@ import torch
 import torch.nn as nn
 from sglang.srt.custom_op import CustomOp
-from sglang.srt.utils import is_cuda, is_hip
+from sglang.srt.utils import get_bool_env_var, is_cuda, is_hip
 _is_cuda = is_cuda()
 _is_hip = is_hip()
+_use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip
 if _is_cuda:
     from sgl_kernel import (
@@ -33,7 +34,10 @@ if _is_cuda:
         rmsnorm,
     )
-if _is_hip:
+if _use_aiter:
+    from aiter import rmsnorm2d_fwd as rms_norm
+    from aiter import rmsnorm2d_fwd_with_add as fused_add_rms_norm
+elif _is_hip:
     from vllm._custom_ops import fused_add_rms_norm, rms_norm
 logger = logging.getLogger(__name__)
@@ -48,16 +52,8 @@ class RMSNorm(CustomOp):
         super().__init__()
         self.weight = nn.Parameter(torch.ones(hidden_size))
         self.variance_epsilon = eps
-    def forward(self, *args, **kwargs):
-        if torch.compiler.is_compiling():
-            return self.forward_native(*args, **kwargs)
-        if _is_cuda:
-            return self.forward_cuda(*args, **kwargs)
-        elif _is_hip:
-            return self.forward_hip(*args, **kwargs)
-        else:
-            return self.forward_native(*args, **kwargs)
+        if _use_aiter:
+            self._forward_method = self.forward_aiter
     def forward_cuda(
         self,
@@ -70,6 +66,25 @@ class RMSNorm(CustomOp):
         out = rmsnorm(x, self.weight.data, self.variance_epsilon)
         return out
+    def forward_aiter(
+        self,
+        x: torch.Tensor,
+        residual: Optional[torch.Tensor] = None,
+    ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
+        if residual is not None:
+            residual_out = torch.empty_like(x)
+            output = torch.empty_like(x)
+            fused_add_rms_norm(
+                output,
+                x,
+                residual,
+                residual_out,
+                self.weight.data,
+                self.variance_epsilon,
+            )
+            return output, residual_out
+        return rms_norm(x, self.weight.data, self.variance_epsilon)
     def forward_hip(
         self,
         x: torch.Tensor,
@@ -117,13 +132,9 @@ class GemmaRMSNorm(CustomOp):
         self.weight = nn.Parameter(torch.zeros(hidden_size))
         self.variance_epsilon = eps
-    def forward(self, *args, **kwargs):
-        if torch.compiler.is_compiling():
-            return self.forward_native(*args, **kwargs)
-        if _is_cuda:
-            return self.forward_cuda(*args, **kwargs)
-        else:
-            return self.forward_native(*args, **kwargs)
+        # Re-dispatch
+        if _is_hip:
+            self._forward_method = self.forward_native
     def forward_native(
         self,

sglang/srt/layers/moe/cutlass_moe.py CHANGED Viewed

@@ -1,4 +1,4 @@
-"""Cutlass MoE kernel."""
+"""CUTLASS based Fused MoE kernels."""
 import functools
 import json
@@ -8,19 +8,24 @@ from typing import Any, Callable, Dict, List, Optional, Tuple
 import torch
+from sglang.srt.layers.moe.cutlass_moe_params import CutlassMoEParams
 from sglang.srt.utils import is_cuda
 _is_cuda = is_cuda()
 if _is_cuda:
     import sgl_kernel
     from sgl_kernel import (
+        apply_shuffle_mul_sum,
+        cutlass_fp4_group_mm,
         fp8_blockwise_scaled_grouped_mm,
         prepare_moe_input,
+        scaled_fp4_experts_quant,
+        shuffle_rows,
         silu_and_mul,
     )
-def cutlass_fused_experts(
+def cutlass_fused_experts_fp8(
     a: torch.Tensor,
     w1_q: torch.Tensor,
     w2_q: torch.Tensor,
@@ -147,8 +152,8 @@ def cutlass_fused_experts(
         k,
     )
-    rep_a_q = a_q.view(dtype=torch.uint8)[a_map].view(dtype=a_q.dtype)
-    rep_a1_scales = a1_scale[a_map]
+    rep_a_q = shuffle_rows(a_q, a_map, (m * topk, k))
+    rep_a1_scales = shuffle_rows(a1_scale, a_map, (m * topk, int(k / 128)))
     c1 = torch.empty((m * topk, n * 2), device=device, dtype=out_dtype)
     c2 = torch.empty((m * topk, k), device=device, dtype=out_dtype)
@@ -202,6 +207,164 @@ def cutlass_fused_experts(
         expert_offsets[:-1],
         workspace,
     )
-    return (
-        c2[c_map].view(m, topk, k) * topk_weights.view(m, topk, 1).to(out_dtype)
-    ).sum(dim=1)
+    result = torch.empty((m, k), device=device, dtype=out_dtype)
+    return apply_shuffle_mul_sum(c2, result, c_map, topk_weights)
+FLOAT4_E2M1_MAX = 6.0
+FLOAT8_E4M3_MAX = 448.0
+def cutlass_moe_fp4(
+    a: torch.Tensor,
+    a1_gscale: torch.Tensor,
+    w1_fp4: torch.Tensor,
+    w1_blockscale: torch.Tensor,
+    w1_alphas: torch.Tensor,
+    a2_gscale: torch.Tensor,
+    w2_fp4: torch.Tensor,
+    w2_blockscale: torch.Tensor,
+    w2_alphas: torch.Tensor,
+    topk_weights: torch.Tensor,
+    topk_ids: torch.Tensor,
+    params: CutlassMoEParams,
+    apply_router_weight_on_input: bool = False,
+):
+    """
+    MoE implementation for FP4 Inputs
+    # Gemm 1
+    a: Input tensor: [m, k] (half/bfloat16)
+    a1_gscale: Activation scale per expert: [e]  (float32)
+    w1(gate up) (not an argument to cutlass_moe_fp4): [e, 2 * n, k]
+    w1_fp4: [e, 2 * n, k // 2], dtype: torch.uint8 (stacked fp4: E2M1)
+    (Note: `n` is the up projection output dim, `k` is the input dim in
+     full precision)
+    w1_blockscale: [e, 2 * n, k // block_size] (float8_e4m3)
+                   (Block size = 16 for NVFP4)
+    # Gemm 2
+    a2_gscale: Activation scale per expert: [e]
+    w2(down projection) (not an argument to cutlass_moe_fp4): [e, k, n]
+    w2_fp4: [e, k, n // 2], dtype: torch.uint8 (stacked E2M1)
+    w2_blockscale: [e, k, n // block_size], dtype: float8_e4m3
+    Strides for activations, weights and output in logical number of elements.
+    The activations & output stride is the number of elements to the next row.
+    The weights stride is the number of elements to the next row per expert.
+    For example, if the weight is [e, n, k], then the b_stride is a tensor of
+    shape [e] with each element being k. Similarly for activations, if the
+    shape is [m, k], then the a_stride has shape [e] with each value k.
+    Similarly for output, if the output is [m, n], then the c_stride is a
+    tensor of shape [e] with each element being n.
+    Note: cutlass_fp4_group_mm is designed to accept the strides of
+    activations and weights to be the same, so it is passed in as a single
+    tensor.
+    ab_strides_13: [e] dtype: int64 [Gemm 1: Activation / Weight strides]
+    ab_strides_2: [e] dtype: int64 [Gemm 2: Activation / Weight strides]
+    c_strides_13: [e] dtype: int64 [Gemm 1: Output Strides]
+    c_strides_2: [e] dtype: int64 [Gemm 2: Output Strides]
+    topk_weights: [m, topk] dtype: float8
+    topk_ids: [m, topk] dtype: float8
+    m, n, k: Unquantized weight shapes, dtype: int
+    e: number of experts for the current rank, dtype: int
+    assumes that topk < k < n to satisfy - up/down projection expectations.
+    """
+    assert topk_weights.shape == topk_ids.shape, "topk shape mismatch"
+    assert w1_fp4.dtype == torch.uint8, "weight 1 must be uint8"
+    assert w2_fp4.dtype == torch.uint8, "weight 2 must be uint8"
+    assert (
+        w1_fp4.ndim == 3
+        and w2_fp4.ndim == 3
+        and w1_blockscale.ndim == 3
+        and w2_blockscale.ndim == 3
+    ), "All Weights must be of rank 3 for cutlass_moe_fp4"
+    m_a, k_a = a.shape
+    e_w1, nx2_w1, half_k_w1 = w1_fp4.shape
+    e_w2, k_w2, half_n_w2 = w2_fp4.shape
+    assert e_w1 == e_w2 and e_w1 == params.num_experts, (
+        "Number of experts must match",
+        " between weights.",
+    )
+    assert (
+        k_a // 2 == half_k_w1 and params.hidden_size == k_w2
+    ), "Hidden size mismatch between a, w1 and w2"
+    assert (
+        nx2_w1 == params.intermediate_size_per_partition * 2
+        and half_n_w2 == params.intermediate_size_per_partition // 2
+    ), ("mismatch in " "expected `n`")
+    assert 2 * half_k_w1 == k_w2, "Hidden size mismatch w2 and w1"
+    assert a.dtype in [torch.half, torch.bfloat16], "Invalid input dtype"
+    out_dtype = a.dtype
+    num_topk = topk_ids.shape[1]
+    device = a.device
+    a_map = torch.empty((topk_ids.numel()), dtype=torch.int32, device=device)
+    c_map = torch.empty((topk_ids.numel()), dtype=torch.int32, device=device)
+    prepare_moe_input(
+        topk_ids,
+        params.expert_offsets,
+        params.problem_sizes1,
+        params.problem_sizes2,
+        a_map,
+        c_map,
+        params.num_experts,
+        params.intermediate_size_per_partition,
+        params.hidden_size,
+        params.blockscale_offsets,
+    )
+    rep_a_fp4, rep_a_blockscale = scaled_fp4_experts_quant(
+        a,
+        a1_gscale,
+        params.expert_offsets,
+        params.blockscale_offsets,
+        num_topk,
+        expert_map=a_map,
+    )
+    c1 = cutlass_fp4_group_mm(
+        rep_a_fp4,
+        w1_fp4,
+        rep_a_blockscale,
+        w1_blockscale,
+        w1_alphas,
+        out_dtype,
+        device,
+        params.to_gemm1_args(),
+    )
+    del rep_a_fp4, rep_a_blockscale
+    # hidden size dimension is split to one half-sized tensor.
+    intermediate = torch.empty(
+        (m_a * num_topk, w1_fp4.shape[1] // 2), device=device, dtype=out_dtype
+    )
+    silu_and_mul(c1, intermediate)
+    int_fp4, int_blockscale = scaled_fp4_experts_quant(
+        intermediate,
+        a2_gscale,
+        params.expert_offsets,
+        params.blockscale_offsets,
+        num_topk,
+    )
+    c2 = cutlass_fp4_group_mm(
+        int_fp4,
+        w2_fp4,
+        int_blockscale,
+        w2_blockscale,
+        w2_alphas,
+        out_dtype,
+        device,
+        params.to_gemm2_args(),
+    )
+    del int_fp4, int_blockscale
+    c2 = shuffle_rows(c2, c_map, (m_a * num_topk, params.hidden_size))
+    c2 = c2.view(m_a, num_topk, params.hidden_size)
+    if not apply_router_weight_on_input:
+        c2 = c2 * topk_weights.view(m_a, num_topk, 1).to(out_dtype)
+    return c2.sum(dim=1).to(out_dtype)

sglang/srt/layers/moe/cutlass_moe_params.py ADDED Viewed

@@ -0,0 +1,169 @@
+from dataclasses import dataclass
+from enum import Enum, auto
+from typing import Optional
+import torch
+class CutlassMoEType(Enum):
+    """
+    Enum for the different types of cutlass moe operations
+    that are currently supported in SGLang.
+    """
+    BlockscaledFP8 = auto()
+    BlockscaledFP4 = auto()
+@dataclass
+class CutlassMoEParams:
+    """
+    Parameters for the cutlass moe operation.
+    """
+    #  Type as defined above
+    cutlass_moe_type: CutlassMoEType
+    # Strides for activations, weights and output in logical number of elements.
+    # The activations & output stride is the number of elements to the next row.
+    # The weights stride is the number of elements to the next row per expert.
+    # For example, if the weight is [e, n, k], then the b_stride is a tensor of
+    # shape [e] with each element being k. Similarly for activations, if the
+    # shape is [m, k], then the a_stride has shape [e] with each value k.
+    # Similarly for output, if the output is [m, n], then the c_stride is a
+    # tensor of shape [e] with each element being n.
+    # Note: cutlass_fp4_group_mm is designed to accept the strides of
+    # activations and weights to be the same, so it is passed in as a single
+    # tensor.
+    # ab_strides_13: [e] dtype: int64 [Gemm 1: Activation / Weight strides]
+    # ab_strides_2: [e] dtype: int64 [Gemm 2: Activation / Weight strides]
+    # c_strides_13: [e] dtype: int64 [Gemm 1: Output Strides]
+    # c_strides_2: [e] dtype: int64 [Gemm 2: Output Strides]
+    ab_strides_13: torch.Tensor
+    ab_strides_2: torch.Tensor
+    c_strides_13: torch.Tensor
+    c_strides_2: torch.Tensor
+    # m: Total number of tokens
+    # n: intermediate size per partition
+    # k: hidden size per expert
+    # e: Number of experts
+    # device: Device to run computation on and store tensors
+    m: int
+    intermediate_size_per_partition: int
+    hidden_size: int
+    num_experts: int
+    device: torch.device
+    # Pointers container for calculating offsets of the input activations for each expert
+    # a_ptrs: [e] dtype: int64
+    a_ptrs: torch.Tensor
+    # Pointers container for calculating offsets of the input weights for each expert
+    # b_ptrs: [e] dtype: int64
+    b_ptrs: torch.Tensor
+    # Pointers container for calculating offsets of the output activations for each expert
+    # out_ptrs: [e] dtype: int64
+    out_ptrs: torch.Tensor
+    # Pointers container for calculating offsets of the input scales for each expert
+    # a_scales_ptrs: [e] dtype: int64
+    # b_scales_ptrs: [e] dtype: int64
+    a_scales_ptrs: torch.Tensor
+    b_scales_ptrs: torch.Tensor
+    # Offsets that mark at which token index each expert begins its computation
+    # The number of tokens computed with expert E is expert_offsets[E + 1] - expert_offsets[E]
+    # expert_offsets: [e+1] dtype: int32
+    expert_offsets: torch.Tensor
+    # Problem size: (num_experts, (m,2n,k)) for first GEMM
+    # problem_sizes1: [e, 3] dtype: int32
+    # Problem size: (num_experts, (m,n,k)) for second GEMM
+    # problem_sizes2: [e, 3] dtype: int32
+    problem_sizes1: torch.Tensor
+    problem_sizes2: torch.Tensor
+    # Similar to expert_offsets, but for blockscales for FP4 blockscaled Group GEMM
+    blockscale_offsets: Optional[torch.Tensor] = None
+    def __init__(
+        self,
+        cutlass_moe_type: CutlassMoEType,
+        device: torch.device,
+        num_experts: int,
+        intermediate_size_per_partition: int,
+        hidden_size: int,
+    ):
+        self.cutlass_moe_type = cutlass_moe_type
+        self.device = device
+        self.num_experts = num_experts
+        self.intermediate_size_per_partition = intermediate_size_per_partition
+        self.hidden_size = hidden_size
+        self.n = self.intermediate_size_per_partition
+        self.k = self.hidden_size
+        self.e = self.num_experts
+        self.ab_strides_13 = torch.full(
+            (self.e,), self.k, dtype=torch.int64, device=self.device
+        )
+        self.ab_strides_2 = torch.full(
+            (self.e,), self.n, dtype=torch.int64, device=self.device
+        )
+        self.c_strides_13 = torch.full(
+            (self.e,), 2 * self.n, dtype=torch.int64, device=self.device
+        )
+        self.c_strides_2 = torch.full(
+            (self.e,), self.k, dtype=torch.int64, device=self.device
+        )
+        self.expert_offsets = torch.empty(
+            (self.e + 1,), dtype=torch.int32, device=self.device
+        )
+        self.problem_sizes1 = torch.empty(
+            (self.e, 3), dtype=torch.int32, device=self.device
+        )
+        self.problem_sizes2 = torch.empty(
+            (self.e, 3), dtype=torch.int32, device=self.device
+        )
+        if self.cutlass_moe_type == CutlassMoEType.BlockscaledFP4:
+            self.blockscale_offsets = torch.empty(
+                (self.e + 1,), dtype=torch.int32, device=self.device
+            )
+        else:
+            self.blockscale_offsets = None
+        self.a_ptrs = torch.empty((self.e,), dtype=torch.int64, device=self.device)
+        self.b_ptrs = torch.empty((self.e,), dtype=torch.int64, device=self.device)
+        self.out_ptrs = torch.empty((self.e,), dtype=torch.int64, device=self.device)
+        self.a_scales_ptrs = torch.empty(
+            (self.e,), dtype=torch.int64, device=self.device
+        )
+        self.b_scales_ptrs = torch.empty(
+            (self.e,), dtype=torch.int64, device=self.device
+        )
+    def to_gemm1_args(self) -> dict:
+        return {
+            "ab_strides": self.ab_strides_13,
+            "c_strides": self.c_strides_13,
+            "problem_sizes": self.problem_sizes1,
+            "expert_offsets": self.expert_offsets[:-1],
+            "blockscale_offsets": self.blockscale_offsets[:-1],
+            #    "a_ptrs": self.a_ptrs,
+            #    "b_ptrs": self.b_ptrs,
+            #    "out_ptrs": self.out_ptrs,
+            #    "a_scales_ptrs": self.a_scales_ptrs,
+            #    "b_scales_ptrs": self.b_scales_ptrs,
+        }
+    def to_gemm2_args(self) -> dict:
+        return {
+            "ab_strides": self.ab_strides_2,
+            "c_strides": self.c_strides_2,
+            "problem_sizes": self.problem_sizes2,
+            "expert_offsets": self.expert_offsets[:-1],
+            "blockscale_offsets": self.blockscale_offsets[:-1],
+            #    "a_ptrs": self.a_ptrs,
+            #    "b_ptrs": self.b_ptrs,
+            #    "out_ptrs": self.out_ptrs,
+            #    "a_scales_ptrs": self.a_scales_ptrs,
+            #    "b_scales_ptrs": self.b_scales_ptrs,
+        }

sglang/srt/layers/moe/ep_moe/kernels.py CHANGED Viewed

@@ -178,26 +178,33 @@ def pre_reorder_triton_kernel(
     topk,
     hidden_size,
     BLOCK_SIZE: tl.constexpr,
+    use_per_token_if_dynamic: tl.constexpr,
 ):
     OutDtype = gateup_input_ptr.dtype.element_ty
     src_idx = tl.program_id(0)
     src2dst_ptr = src2dst_ptr + src_idx * topk
     topk_ids_ptr = topk_ids_ptr + src_idx * topk
     src_ptr = input_ptr + src_idx * hidden_size
+    vec = tl.arange(0, BLOCK_SIZE)
+    if a1_scales_ptr is not None and use_per_token_if_dynamic:
+        scale = 1.0 / tl.load(a1_scales_ptr + src_idx)
     for idx in range(topk):
         expert_id = tl.load(topk_ids_ptr + idx)
         if expert_id >= start_expert_id and expert_id <= end_expert_id:
             if a1_scales_ptr is not None:
-                scale = 1.0 / tl.load(a1_scales_ptr + expert_id - start_expert_id)
+                if not use_per_token_if_dynamic:
+                    scale = 1.0 / tl.load(a1_scales_ptr + expert_id - start_expert_id)
             else:
                 scale = 1.0
             dst_idx = tl.load(src2dst_ptr + idx)
             dst_ptr = gateup_input_ptr + dst_idx * hidden_size
             for start_offset in tl.range(0, hidden_size, BLOCK_SIZE):
-                offset = start_offset + tl.arange(0, BLOCK_SIZE)
+                offset = start_offset + vec
                 mask = offset < hidden_size
                 in_data = tl.load(src_ptr + offset, mask=mask).to(tl.float32)
                 out_data = (in_data * scale).to(OutDtype)
@@ -481,8 +488,11 @@ def post_reorder_triton_kernel(
     computed = False
     store_ptr = output_ptr + src_idx * hidden_size
+    vec = tl.arange(0, BLOCK_SIZE)
     for start_offset in tl.range(0, hidden_size, BLOCK_SIZE):
-        offset = start_offset + tl.arange(0, BLOCK_SIZE)
+        offset = start_offset + vec
         mask = offset < hidden_size
         sum_vec = tl.zeros([BLOCK_SIZE], dtype=InDtype)
@@ -499,7 +509,7 @@ def post_reorder_triton_kernel(
     if computed == False:
         for start_offset in tl.range(0, hidden_size, BLOCK_SIZE):
-            offset = start_offset + tl.arange(0, BLOCK_SIZE)
+            offset = start_offset + vec
             mask = offset < hidden_size
             tl.store(
                 store_ptr + offset, tl.zeros([BLOCK_SIZE], dtype=InDtype), mask=mask
@@ -553,6 +563,7 @@ def grouped_gemm_triton_kernel(
     bs_stride_0: tl.constexpr,
     bs_stride_2: tl.constexpr,
     bs_stride_1: tl.constexpr,
+    use_per_token_if_dynamic: tl.constexpr,
     BLOCK_SIZE_M: tl.constexpr,
     BLOCK_SIZE_N: tl.constexpr,
     BLOCK_SIZE_K: tl.constexpr,
@@ -616,7 +627,10 @@ def grouped_gemm_triton_kernel(
         b_ptr += BLOCK_SIZE_K
     if use_fp8_w8a8 and not (group_k > 0 and group_n > 0):
-        scale_a_value = tl.load(scale_a + expert_id)
+        if use_per_token_if_dynamic:
+            scale_a_value = tl.load(scale_a + (m_range_start + offs_am[:, None]))
+        else:
+            scale_a_value = tl.load(scale_a + expert_id)
         scale_b_value = tl.load(scale_b + expert_id)
         accumulator *= scale_a_value * scale_b_value
@@ -653,6 +667,7 @@ def grouped_gemm_triton(
     scale_b: torch.Tensor = None,
     block_shape: Optional[List[int]] = None,
     c_dtype=None,
+    use_per_token_if_dynamic: bool = True,
 ):
     assert weight_column_major == True  # TODO: more
     if use_fp8_w8a8 and block_shape is None:
@@ -693,6 +708,11 @@ def grouped_gemm_triton(
         triton.cdiv(b.size(1), META["BLOCK_SIZE_N"]),
     )
+    if use_fp8_w8a8 and block_shape is None and use_per_token_if_dynamic:
+        assert (
+            scale_a.shape[0] == a.shape[0]
+        ), f"scale_a.shape: {scale_a.shape}, a.shape: {a.shape}"
     grouped_gemm_triton_kernel[grid](
         a,
         b,
@@ -716,6 +736,7 @@ def grouped_gemm_triton(
         scale_b.stride(0) if scale_b is not None and scale_b.ndim >= 2 else 0,
         scale_b.stride(2) if scale_b is not None and scale_b.ndim == 3 else 0,
         scale_b.stride(1) if scale_b is not None and scale_b.ndim >= 2 else 0,
+        use_per_token_if_dynamic,
         **config,
     )
     return c

sglang 0.4.6.post5__py3-none-any.whl → 0.4.7__py3-none-any.whl

sglang 0.4.6.post5__py3-none-any.whl → 0.4.7__py3-none-any.whl