sglang 0.4.3.post4__py3-none-any.whl → 0.4.4.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (131)
  1. sglang/bench_serving.py +1 -1
  2. sglang/lang/chat_template.py +29 -0
  3. sglang/srt/_custom_ops.py +19 -17
  4. sglang/srt/configs/__init__.py +2 -0
  5. sglang/srt/configs/janus_pro.py +629 -0
  6. sglang/srt/configs/model_config.py +24 -14
  7. sglang/srt/conversation.py +80 -2
  8. sglang/srt/custom_op.py +64 -3
  9. sglang/srt/distributed/device_communicators/custom_all_reduce.py +18 -17
  10. sglang/srt/distributed/parallel_state.py +10 -1
  11. sglang/srt/entrypoints/engine.py +5 -3
  12. sglang/srt/entrypoints/http_server.py +1 -1
  13. sglang/srt/function_call_parser.py +33 -2
  14. sglang/srt/hf_transformers_utils.py +16 -1
  15. sglang/srt/layers/attention/flashinfer_backend.py +1 -1
  16. sglang/srt/layers/attention/flashinfer_mla_backend.py +317 -57
  17. sglang/srt/layers/attention/triton_backend.py +1 -3
  18. sglang/srt/layers/attention/triton_ops/decode_attention.py +6 -6
  19. sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +3 -3
  20. sglang/srt/layers/attention/triton_ops/extend_attention.py +4 -4
  21. sglang/srt/layers/attention/triton_ops/rocm_mla_decode_rope.py +3 -3
  22. sglang/srt/layers/attention/vision.py +43 -62
  23. sglang/srt/layers/dp_attention.py +30 -2
  24. sglang/srt/layers/elementwise.py +411 -0
  25. sglang/srt/layers/linear.py +1 -1
  26. sglang/srt/layers/logits_processor.py +1 -0
  27. sglang/srt/layers/moe/ep_moe/kernels.py +2 -1
  28. sglang/srt/layers/moe/ep_moe/layer.py +25 -9
  29. sglang/srt/layers/moe/fused_moe_triton/configs/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  30. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  31. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  32. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  33. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  34. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  35. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +63 -23
  36. sglang/srt/layers/moe/fused_moe_triton/layer.py +16 -4
  37. sglang/srt/layers/moe/router.py +342 -0
  38. sglang/srt/layers/parameter.py +10 -0
  39. sglang/srt/layers/quantization/__init__.py +90 -68
  40. sglang/srt/layers/quantization/blockwise_int8.py +1 -2
  41. sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  42. sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  43. sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  44. sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  45. sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  46. sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  47. sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  48. sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  49. sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  50. sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  51. sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  52. sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  53. sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  54. sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  55. sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  56. sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  57. sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  58. sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  59. sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  60. sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  61. sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  62. sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  63. sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  64. sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  65. sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  66. sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  67. sglang/srt/layers/quantization/fp8.py +174 -106
  68. sglang/srt/layers/quantization/fp8_kernel.py +210 -38
  69. sglang/srt/layers/quantization/fp8_utils.py +156 -15
  70. sglang/srt/layers/quantization/modelopt_quant.py +5 -1
  71. sglang/srt/layers/quantization/w8a8_fp8.py +128 -0
  72. sglang/srt/layers/quantization/w8a8_int8.py +152 -3
  73. sglang/srt/layers/rotary_embedding.py +5 -3
  74. sglang/srt/layers/sampler.py +29 -35
  75. sglang/srt/layers/vocab_parallel_embedding.py +0 -1
  76. sglang/srt/lora/backend/__init__.py +9 -12
  77. sglang/srt/managers/cache_controller.py +74 -8
  78. sglang/srt/managers/data_parallel_controller.py +1 -1
  79. sglang/srt/managers/image_processor.py +37 -631
  80. sglang/srt/managers/image_processors/base_image_processor.py +219 -0
  81. sglang/srt/managers/image_processors/janus_pro.py +79 -0
  82. sglang/srt/managers/image_processors/llava.py +152 -0
  83. sglang/srt/managers/image_processors/minicpmv.py +86 -0
  84. sglang/srt/managers/image_processors/mlama.py +60 -0
  85. sglang/srt/managers/image_processors/qwen_vl.py +161 -0
  86. sglang/srt/managers/io_struct.py +32 -15
  87. sglang/srt/managers/multi_modality_padding.py +134 -0
  88. sglang/srt/managers/schedule_batch.py +213 -118
  89. sglang/srt/managers/schedule_policy.py +40 -8
  90. sglang/srt/managers/scheduler.py +176 -683
  91. sglang/srt/managers/scheduler_output_processor_mixin.py +614 -0
  92. sglang/srt/managers/tokenizer_manager.py +6 -6
  93. sglang/srt/managers/tp_worker_overlap_thread.py +4 -1
  94. sglang/srt/mem_cache/base_prefix_cache.py +6 -8
  95. sglang/srt/mem_cache/chunk_cache.py +12 -44
  96. sglang/srt/mem_cache/hiradix_cache.py +71 -34
  97. sglang/srt/mem_cache/memory_pool.py +81 -17
  98. sglang/srt/mem_cache/paged_allocator.py +283 -0
  99. sglang/srt/mem_cache/radix_cache.py +117 -36
  100. sglang/srt/model_executor/cuda_graph_runner.py +68 -20
  101. sglang/srt/model_executor/forward_batch_info.py +23 -10
  102. sglang/srt/model_executor/model_runner.py +63 -63
  103. sglang/srt/model_loader/loader.py +2 -1
  104. sglang/srt/model_loader/weight_utils.py +1 -1
  105. sglang/srt/models/deepseek_janus_pro.py +2127 -0
  106. sglang/srt/models/deepseek_nextn.py +23 -3
  107. sglang/srt/models/deepseek_v2.py +200 -191
  108. sglang/srt/models/grok.py +374 -119
  109. sglang/srt/models/minicpmv.py +28 -89
  110. sglang/srt/models/mllama.py +1 -1
  111. sglang/srt/models/qwen2.py +0 -1
  112. sglang/srt/models/qwen2_5_vl.py +25 -50
  113. sglang/srt/models/qwen2_vl.py +33 -49
  114. sglang/srt/openai_api/adapter.py +59 -35
  115. sglang/srt/openai_api/protocol.py +8 -1
  116. sglang/srt/sampling/penaltylib/frequency_penalty.py +0 -1
  117. sglang/srt/sampling/penaltylib/presence_penalty.py +0 -1
  118. sglang/srt/server_args.py +24 -16
  119. sglang/srt/speculative/eagle_worker.py +75 -39
  120. sglang/srt/utils.py +104 -9
  121. sglang/test/runners.py +104 -10
  122. sglang/test/test_block_fp8.py +106 -16
  123. sglang/test/test_custom_ops.py +88 -0
  124. sglang/test/test_utils.py +20 -4
  125. sglang/utils.py +0 -4
  126. sglang/version.py +1 -1
  127. {sglang-0.4.3.post4.dist-info → sglang-0.4.4.post1.dist-info}/METADATA +9 -10
  128. {sglang-0.4.3.post4.dist-info → sglang-0.4.4.post1.dist-info}/RECORD +131 -84
  129. {sglang-0.4.3.post4.dist-info → sglang-0.4.4.post1.dist-info}/WHEEL +1 -1
  130. {sglang-0.4.3.post4.dist-info → sglang-0.4.4.post1.dist-info}/LICENSE +0 -0
  131. {sglang-0.4.3.post4.dist-info → sglang-0.4.4.post1.dist-info}/top_level.txt +0 -0
sglang/srt/layers/moe/router.py (new file)
@@ -0,0 +1,342 @@
+from typing import Tuple
+
+import torch
+import triton
+import triton.language as tl
+
+from sglang.srt.layers.moe.topk import fused_topk
+
+
+@triton.jit
+def fused_moe_router_kernel(
+    input_ptr,  # input (bs, hidden_dim)
+    moe_router_weight_ptr,  # input (num_experts, hidden_dim)
+    topk_weights_ptr,  # output (bs, topk)
+    topk_ids_ptr,  # output (bs, topk)
+    num_experts: tl.constexpr,
+    topk: tl.constexpr,
+    moe_softcapping: tl.constexpr,
+    moe_renormalize: tl.constexpr,  # not supported
+    hidden_dim: tl.constexpr,
+    BLOCK_SIZE: tl.constexpr,
+):
+    pid = tl.program_id(axis=0)
+
+    offsets = tl.arange(0, BLOCK_SIZE)
+    mask = offsets < hidden_dim
+
+    # moe_router_weight is k major
+    expert_offsets = tl.arange(0, num_experts)[:, None]
+    router_mask = mask[None, :]
+    w_router = tl.load(
+        moe_router_weight_ptr + expert_offsets * hidden_dim + offsets[None, :],
+        mask=router_mask,
+        other=0.0,
+    )
+
+    x = tl.load(input_ptr + pid * hidden_dim + offsets, mask=mask, other=0.0)
+
+    # todo: tl.dot?
+    logits = tl.sum((w_router.to(tl.float32) * x[None, :].to(tl.float32)), axis=-1)
+
+    # logit softcap
+    logits_scaled = logits / moe_softcapping
+    exped = tl.exp(2 * logits_scaled)
+    top = exped - 1
+    bottom = exped + 1
+    logits_softcapped = top / bottom * moe_softcapping
+
+    # topk
+    # assert 1 <= topk <= num_experts
+
+    # 5.38 us
+
+    top1 = tl.argmax(logits_softcapped, axis=0)
+    tl.store(topk_ids_ptr + pid * topk + 0, top1)  # 5.63 us
+
+    top1_v = tl.max(logits_softcapped, axis=0)
+    invsumexp = 1.0 / tl.sum(tl.exp(logits_softcapped - top1_v), axis=0)
+
+    tl.store(
+        topk_weights_ptr + pid * topk + 0,
+        invsumexp,
+    )  # 5.73 us
+
+    if topk >= 2:
+        top2 = tl.argmax(
+            tl.where(
+                tl.arange(0, num_experts) != top1, logits_softcapped, float("-inf")
+            ),
+            axis=0,
+        )
+        tl.store(topk_ids_ptr + pid * topk + 1, top2)
+        top2_v = tl.sum(logits_softcapped * (tl.arange(0, num_experts) == top2), axis=0)
+        tl.store(
+            topk_weights_ptr + pid * topk + 1,
+            tl.exp(top2_v - top1_v) * invsumexp,
+        )  # 5.95us
+
+    # probably slow
+    if topk > 2:
+        topk_mask = tl.full(logits_softcapped.shape, 1.0, dtype=logits_softcapped.dtype)
+        topk_mask = tl.where(
+            tl.arange(0, num_experts) != top1, topk_mask, float("-inf")
+        )
+        topk_mask = tl.where(
+            tl.arange(0, num_experts) != top2, topk_mask, float("-inf")
+        )
+        for i in range(2, topk):
+            topi = tl.argmax(logits_softcapped + topk_mask, axis=0)
+            topk_mask = tl.where(
+                tl.arange(0, num_experts) != topi, topk_mask, float("-inf")
+            )
+            tl.store(topk_ids_ptr + pid * topk + i, topi)
+            topi_v = tl.sum(
+                logits_softcapped * (tl.arange(0, num_experts) == topi), axis=0
+            )
+            tl.store(
+                topk_weights_ptr + pid * topk + i,
+                tl.exp(topi_v - top1_v) * invsumexp,
+            )
+    # assert not moe_renormalize, "moe weight renormalization not implemented"
+
+
+def fused_moe_router_impl(
+    x: torch.Tensor,
+    router_weight: torch.Tensor,
+    topk: int,
+    moe_softcapping: float,
+):
+    assert len(x.shape) == 2 and x.shape[1] == router_weight.shape[1]
+    bs, hidden_dim = x.shape
+    num_experts = router_weight.shape[0]
+
+    # router_logits = torch.empty((bs, num_experts), dtype=torch.float32, device=x.device)
+    topk_weights = torch.empty((bs, topk), dtype=torch.float32, device=x.device)
+    topk_ids = torch.empty((bs, topk), dtype=torch.int32, device=x.device)
+
+    grid = lambda meta: (bs,)
+    config = {
+        "BLOCK_SIZE": triton.next_power_of_2(hidden_dim),
+        "num_warps": max(
+            min(triton.next_power_of_2(triton.cdiv(hidden_dim, 256)), 32), 4
+        ),
+    }
+
+    fused_moe_router_kernel[grid](
+        x,
+        router_weight,
+        topk_weights,
+        topk_ids,
+        num_experts=num_experts,
+        topk=topk,
+        moe_softcapping=moe_softcapping,
+        moe_renormalize=False,
+        hidden_dim=hidden_dim,
+        **config,
+    )
+
+    return topk_weights, topk_ids
+
+
+@triton.jit
+def fused_moe_router_large_bs_kernel(
+    a_ptr,  # input (bs, hidden_dim)
+    b_ptr,  # input (num_experts, hidden_dim)
+    topk_weights_ptr,  # output (bs, topk)
+    topk_ids_ptr,  # output (bs, topk)
+    bs,
+    num_experts: tl.constexpr,
+    topk: tl.constexpr,  # only support topk == 1
+    moe_softcapping: tl.constexpr,
+    moe_renormalize: tl.constexpr,  # not supported
+    K: tl.constexpr,
+    BLOCK_SIZE_M: tl.constexpr,
+    BLOCK_SIZE_N: tl.constexpr,
+    BLOCK_SIZE_K: tl.constexpr,
+    stride_am: tl.constexpr,
+    stride_bn: tl.constexpr,
+):
+
+    # 1. get block id
+    pid = tl.program_id(axis=0)
+
+    # 2. create pointers for the first block of A and B
+    # 2.1. setup a_ptrs with offsets in m and k
+    offs_m = pid * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)[:, None]
+    bs_mask = offs_m < bs
+    offs_k = tl.arange(0, BLOCK_SIZE_K)[None, :]
+    a_ptrs = a_ptr + (offs_m * stride_am + offs_k)
+
+    # 2.2. setup b_ptrs with offsets in k and n.
+    # Note: b matrix is k-major.
+    offs_k = tl.arange(0, BLOCK_SIZE_K)[None, :]
+    offs_n = tl.arange(0, BLOCK_SIZE_N)[:, None]
+    expert_mask = offs_n < num_experts
+    b_ptrs = b_ptr + (offs_n * stride_bn + offs_k)
+
+    # 3. Create an accumulator of float32 of size [BLOCK_SIZE_M, BLOCK_SIZE_N]
+    # 3.1. iterate in K dimension
+    # 3.2. transpose tile B
+    acc = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
+    for k in range(0, K // BLOCK_SIZE_K):  # hidden_dim % BLOCK_SIZE_K == 0
+        a = tl.load(
+            a_ptrs,
+            mask=bs_mask,
+            other=0.0,
+        ).to(tl.float32)
+        b = tl.load(b_ptrs, mask=expert_mask, other=0.0).to(tl.float32).T
+        acc += tl.dot(a, b)
+
+        # Advance the ptrs to the next K block.
+        a_ptrs += BLOCK_SIZE_K
+        b_ptrs += BLOCK_SIZE_K
+
+    # 4. logit softcap
+    logits_scaled = acc / moe_softcapping
+    exped = tl.exp(2 * logits_scaled)
+    logits_softcapped = (exped - 1) / (exped + 1) * moe_softcapping
+
+    # 5. top1
+    cond = tl.arange(0, BLOCK_SIZE_N)[None, :] < num_experts
+    top1 = tl.argmax(tl.where(cond, logits_softcapped, float("-inf")), axis=1)
+    top1_v = tl.max(
+        tl.where(cond, logits_softcapped, float("-inf")), axis=1, keep_dims=True
+    )
+    invsumexp = 1.0 / tl.sum(
+        tl.where(cond, tl.exp(logits_softcapped - top1_v), 0.0), axis=1
+    )
+
+    # 6. store to output
+    offs_topk = pid * topk * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
+    topk_mask = offs_topk < bs
+    tl.store(topk_ids_ptr + offs_topk, top1, mask=topk_mask)
+    tl.store(
+        topk_weights_ptr + offs_topk,
+        invsumexp,
+        mask=topk_mask,
+    )
+
+
+def fused_moe_router_large_bs_impl(
+    x: torch.Tensor,
+    router_weight: torch.Tensor,
+    topk: int,
+    moe_softcapping: float,
+    BLOCK_SIZE_M: int,
+    BLOCK_SIZE_N: int,
+    BLOCK_SIZE_K: int,
+):
+    assert len(x.shape) == 2 and x.shape[1] == router_weight.shape[1]
+    bs, hidden_dim = x.shape
+    num_experts = router_weight.shape[0]
+
+    assert num_experts <= BLOCK_SIZE_N
+    assert hidden_dim % BLOCK_SIZE_K == 0
+    assert topk == 1
+
+    topk_weights = torch.empty((bs, topk), dtype=torch.float32, device=x.device)
+    topk_ids = torch.empty((bs, topk), dtype=torch.int32, device=x.device)
+
+    grid = (triton.cdiv(bs, BLOCK_SIZE_M) * triton.cdiv(num_experts, BLOCK_SIZE_N),)
+
+    fused_moe_router_large_bs_kernel[grid](
+        a_ptr=x,
+        b_ptr=router_weight,
+        topk_weights_ptr=topk_weights,
+        topk_ids_ptr=topk_ids,
+        bs=bs,
+        num_experts=num_experts,
+        topk=topk,
+        moe_softcapping=moe_softcapping,
+        moe_renormalize=False,
+        K=hidden_dim,
+        BLOCK_SIZE_M=BLOCK_SIZE_M,
+        BLOCK_SIZE_N=BLOCK_SIZE_N,
+        BLOCK_SIZE_K=BLOCK_SIZE_K,
+        stride_am=hidden_dim,
+        stride_bn=hidden_dim,
+    )
+
+    return topk_weights, topk_ids
+
+
+def fused_moe_router_shim(
+    moe_softcapping,
+    hidden_states,
+    gating_output,
+    topk,
+    renormalize,
+):
+    assert not renormalize
+    assert (
+        len(hidden_states.shape) == 2
+        and hidden_states.shape[1] == gating_output.shape[1]
+    )
+    bs, hidden_dim = hidden_states.shape
+    num_experts = gating_output.shape[0]
+    BLOCK_SIZE_M = 32
+    BLOCK_SIZE_N = 16
+    BLOCK_SIZE_K = 256
+    if (
+        bs >= 512
+        and topk == 1
+        and num_experts <= BLOCK_SIZE_N
+        and hidden_dim % BLOCK_SIZE_K == 0
+    ):
+        return fused_moe_router_large_bs_impl(
+            x=hidden_states,
+            router_weight=gating_output,
+            topk=topk,
+            moe_softcapping=moe_softcapping,
+            BLOCK_SIZE_M=BLOCK_SIZE_M,
+            BLOCK_SIZE_N=BLOCK_SIZE_N,
+            BLOCK_SIZE_K=BLOCK_SIZE_K,
+        )
+    else:
+        return fused_moe_router_impl(
+            x=hidden_states,
+            router_weight=gating_output,
+            topk=topk,
+            moe_softcapping=moe_softcapping,
+        )
+
+
+class FusedMoeRouter:
+    def __init__(self, router_linear, topk, moe_softcapping) -> None:
+        self.router_linear = router_linear
+        self.topk = topk
+        self.moe_softcapping = moe_softcapping
+
+    def __call__(self, *args, **kwargs):
+        return self.forward(*args, **kwargs)
+
+    def forward(
+        self, x: torch.Tensor, residual: torch.Tensor
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        if x.is_cuda:
+            return self.forward_cuda(x, residual)
+        else:
+            return self.forward_vllm(x, residual)
+
+    def forward_cuda(
+        self, x: torch.Tensor, autotune=False
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        return fused_moe_router_shim(
+            moe_softcapping=self.moe_softcapping,
+            hidden_states=x,
+            gating_output=self.router_linear.weight,
+            topk=self.topk,
+            renormalize=False,
+        )
+
+    def forward_vllm(
+        self,
+        x: torch.Tensor,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        # g, _ = self.router_linear.forward(x)
+        g = x.float() @ self.router_linear.weight.T.float()
+
+        g = torch.tanh(g.float() / self.moe_softcapping) * self.moe_softcapping
+
+        return fused_topk(x, g, self.topk, False)
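The new `sglang/srt/layers/moe/router.py` fuses the MoE gating matmul, logit softcapping, and top-k selection into a single Triton launch, with a separate `tl.dot`-based kernel for large batches with `topk == 1`. The softcap is computed as (exp(2x) - 1) / (exp(2x) + 1) * cap, which is algebraically tanh(x) * cap. Below is a minimal, illustrative call into the shim; the tensor shapes and the softcap value 30.0 are assumptions for the example, not values taken from this diff.

```python
# Illustrative sketch only: requires a CUDA device and Triton.
import torch

from sglang.srt.layers.moe.router import fused_moe_router_shim

bs, hidden_dim, num_experts, topk = 8, 4096, 8, 2
hidden_states = torch.randn(bs, hidden_dim, dtype=torch.bfloat16, device="cuda")
router_weight = torch.randn(num_experts, hidden_dim, dtype=torch.bfloat16, device="cuda")

# hidden_states: (bs, hidden_dim); gating_output is the router *weight*, k-major
# (num_experts, hidden_dim). Returns (bs, topk) weights and expert ids per token.
topk_weights, topk_ids = fused_moe_router_shim(
    moe_softcapping=30.0,   # example softcap constant, not from the diff
    hidden_states=hidden_states,
    gating_output=router_weight,
    topk=topk,
    renormalize=False,      # the kernel asserts renormalization is off
)
print(topk_weights.shape, topk_ids.shape)  # torch.Size([8, 2]) torch.Size([8, 2])
```

When bs >= 512, topk == 1, num_experts <= 16, and hidden_dim is divisible by 256, the shim dispatches to the large-batch kernel instead of the per-token kernel.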
sglang/srt/layers/parameter.py
@@ -16,6 +16,7 @@ __all__ = [
     "ModelWeightParameter",
     "ChannelQuantScaleParameter",
     "GroupQuantScaleParameter",
+    "BlockQuantScaleParameter",
     "PackedColumnParameter",
     "RowvLLMParameter",
 ]
@@ -221,6 +222,15 @@ class ChannelQuantScaleParameter(_ColumnvLLMParameter):
     pass


+class BlockQuantScaleParameter(_ColumnvLLMParameter, RowvLLMParameter):
+    """
+    Parameter class for weight scales loaded for weights with
+    block-wise quantization. Uses both column and row parallelism.
+    """
+
+    pass
+
+
 class PerTensorScaleParameter(BasevLLMParameter):
     """
     Parameter class for scales where the number of scales is
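`BlockQuantScaleParameter` now lives in `sglang/srt/layers/parameter.py` (it previously came from `fp8_utils.py`), so block-quantized backends can import it alongside the other parameter classes. The sketch below shows how such a scale parameter is typically constructed, assuming the vLLM-style keyword constructor (`data`, `input_dim`, `output_dim`, `weight_loader`) used by the sibling classes in this module; the shapes and the `load_scale` helper are invented for the example.

```python
# Sketch only: one float32 scale per 128x128 weight block, output (column-parallel)
# dimension first, input (row-parallel) dimension second.
import torch

from sglang.srt.layers.parameter import BlockQuantScaleParameter

def load_scale(param: torch.nn.Parameter, loaded_weight: torch.Tensor) -> None:
    # Trivial loader for the sketch; real loaders shard along the input/output dims.
    param.data.copy_(loaded_weight)

out_features, in_features = 1536, 7168   # e.g. one of the N=1536,K=7168 shapes above
block_n, block_k = 128, 128

weight_scale_inv = BlockQuantScaleParameter(
    data=torch.empty(
        (out_features + block_n - 1) // block_n,
        (in_features + block_k - 1) // block_k,
        dtype=torch.float32,
    ),
    input_dim=1,
    output_dim=0,
    weight_loader=load_scale,
)
```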
sglang/srt/layers/quantization/__init__.py
@@ -1,4 +1,6 @@
 # Adapted from https://raw.githubusercontent.com/vllm-project/vllm/v0.5.5/vllm/model_executor/layers/quantization/__init__.py
+import builtins
+import inspect
 import re
 from copy import deepcopy
 from typing import Callable, Dict, Optional, Type, Union
@@ -6,10 +8,7 @@ from typing import Callable, Dict, Optional, Type, Union
 import torch
 from vllm.model_executor.layers.quantization.aqlm import AQLMConfig
 from vllm.model_executor.layers.quantization.awq import AWQConfig
-from vllm.model_executor.layers.quantization.awq_marlin import (
-    AWQMarlinConfig,
-    AWQMoEMethod,
-)
+from vllm.model_executor.layers.quantization.awq_marlin import AWQMarlinConfig
 from vllm.model_executor.layers.quantization.bitsandbytes import BitsAndBytesConfig
 from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import (
     CompressedTensorsConfig,
@@ -28,6 +27,7 @@ from sglang.srt.layers.quantization.blockwise_int8 import BlockInt8Config
 from sglang.srt.layers.quantization.fp8 import Fp8Config
 from sglang.srt.layers.quantization.gptq import GPTQConfig, GPTQMarlinConfig
 from sglang.srt.layers.quantization.modelopt_quant import ModelOptFp8Config
+from sglang.srt.layers.quantization.w8a8_fp8 import W8A8Fp8Config
 from sglang.srt.layers.quantization.w8a8_int8 import W8A8Int8Config

 QUANTIZATION_METHODS: Dict[str, Type[QuantizationConfig]] = {
@@ -50,6 +50,7 @@ QUANTIZATION_METHODS: Dict[str, Type[QuantizationConfig]] = {
     "qqq": QQQConfig,
     "experts_int8": ExpertsInt8Config,
     "w8a8_int8": W8A8Int8Config,
+    "w8a8_fp8": W8A8Fp8Config,
 }


@@ -178,96 +179,117 @@ def gptq_get_quant_method(self, layer, prefix):
     return None


-def awq_get_quant_method(self, layer, prefix):
-    from vllm.model_executor.layers.quantization.awq import is_layer_skipped_awq
-    from vllm.model_executor.layers.quantization.awq_marlin import (
-        AWQMarlinLinearMethod,
-        AWQMoEMethod,
-    )
+original_isinstance = builtins.isinstance

-    from sglang.srt.layers.linear import LinearBase, UnquantizedLinearMethod
-    from sglang.srt.layers.moe.fused_moe_triton.layer import FusedMoE
-    from sglang.srt.layers.vocab_parallel_embedding import ParallelLMHead

-    if isinstance(layer, LinearBase) or (
-        isinstance(layer, ParallelLMHead) and self.lm_head_quantized
-    ):
-        if is_layer_skipped_awq(prefix, self.modules_to_not_convert):
-            return UnquantizedLinearMethod()
-        return AWQMarlinLinearMethod(self)
-    elif isinstance(layer, FusedMoE):
-        return AWQMoEMethod(self)
-    return None
+def monkey_patch_isinstance_for_vllm_base_layer(reverse: bool = False):
+    """
+    Patch isinstance so that the `get_quant_method` in vllm's QuantizationConfig
+    can recognize sglang layers
+    """

+    if reverse:
+        builtins.isinstance = original_isinstance
+        return

-original_awq_moe_method_apply = AWQMoEMethod.apply
-
-
-def awq_moe_method_apply(
-    self,
-    layer: torch.nn.Module,
-    x: torch.Tensor,
-    router_logits: torch.Tensor,
-    top_k: int,
-    renormalize: bool,
-    use_grouped_topk: bool = False,
-    topk_group: Optional[int] = None,
-    num_expert_group: Optional[int] = None,
-    custom_routing_function: Optional[Callable] = None,
-    scoring_func: str = "softmax",
-    e_score_correction_bias: Optional[torch.Tensor] = None,
-    **kwargs,
-):
-    return original_awq_moe_method_apply(
-        self,
-        layer,
-        x,
-        router_logits,
-        top_k,
-        renormalize,
-        use_grouped_topk,
-        topk_group,
-        num_expert_group,
-        custom_routing_function,
-        scoring_func,
-        e_score_correction_bias,
-    )
-
-
-def patch_vllm_linear_base_isinstance():
-    import builtins
-
+    from vllm.model_executor.layers.fused_moe import FusedMoE
     from vllm.model_executor.layers.linear import LinearBase
+    from vllm.model_executor.layers.vocab_parallel_embedding import (
+        VocabParallelEmbedding,
+    )

     from sglang.srt.layers.linear import LinearBase as PatchedLinearBase
-
-    original_isinstance = builtins.isinstance
+    from sglang.srt.layers.moe.fused_moe_triton.layer import FusedMoE as PatchedFusedMoE
+    from sglang.srt.layers.vocab_parallel_embedding import (
+        VocabParallelEmbedding as PatchedVocabParallelEmbedding,
+    )

     def patched_isinstance(obj, classinfo):
         if classinfo is LinearBase:
             return original_isinstance(obj, PatchedLinearBase)
+        if classinfo is FusedMoE:
+            return original_isinstance(obj, PatchedFusedMoE)
+        if classinfo is VocabParallelEmbedding:
+            return original_isinstance(obj, PatchedVocabParallelEmbedding)
         return original_isinstance(obj, classinfo)

     builtins.isinstance = patched_isinstance


-def apply_monkey_patches():
+def monkey_patch_moe_apply(class_obj: "FusedMoEMethodBase"):
+    """
+    Monkey patch the apply function of vllm's FusedMoEMethodBase.
+    Convert sglang arguments to vllm arguments.
+    """
+    original_apply = class_obj.apply
+    sig = inspect.signature(original_apply)
+    param_names = list(sig.parameters.keys())
+    has_correction_bias = "e_score_correction_bias" in param_names
+
+    def new_apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        router_logits: torch.Tensor,
+        top_k: int,
+        renormalize: bool,
+        use_grouped_topk: bool,
+        topk_group: Optional[int] = None,
+        num_expert_group: Optional[int] = None,
+        custom_routing_function: Optional[Callable] = None,
+        correction_bias: Optional[torch.Tensor] = None,
+        activation: str = "silu",
+        inplace: bool = True,
+        no_combine: bool = False,
+    ):
+        assert activation == "silu"
+        assert inplace and not no_combine
+
+        kwargs = {
+            "self": self,
+            "layer": layer,
+            "x": x,
+            "router_logits": router_logits,
+            "top_k": top_k,
+            "renormalize": renormalize,
+            "use_grouped_topk": use_grouped_topk,
+            "topk_group": topk_group,
+            "num_expert_group": num_expert_group,
+            "custom_routing_function": custom_routing_function,
+        }
+        if correction_bias is not None:
+            if not has_correction_bias:
+                raise ValueError(
+                    "Please increase the version of your vllm. Try `pip install vllm==0.7.2`"
+                )
+            kwargs["e_score_correction_bias"] = correction_bias
+        return original_apply(**kwargs)
+
+    setattr(class_obj, "apply", new_apply)
+
+
+def monkey_patch_quant_configs():
     """Apply all monkey patches in one place."""
     from vllm.model_executor.layers.quantization.awq_marlin import AWQMoEMethod
+    from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors_moe import (
+        CompressedTensorsW8A8Fp8MoEMethod,
+        CompressedTensorsWNA16MoEMethod,
+    )
+    from vllm.model_executor.layers.quantization.gptq_marlin import GPTQMarlinMoEMethod

     setattr(GPTQMarlinConfig, "get_quant_method", gptq_get_quant_method)
     setattr(GPTQConfig, "get_quant_method", gptq_get_quant_method)
-    setattr(AWQMarlinConfig, "get_quant_method", awq_get_quant_method)
-    setattr(AWQMoEMethod, "apply", awq_moe_method_apply)
+
+    monkey_patch_moe_apply(AWQMoEMethod)
+    monkey_patch_moe_apply(GPTQMarlinMoEMethod)
+    monkey_patch_moe_apply(CompressedTensorsW8A8Fp8MoEMethod)
+    monkey_patch_moe_apply(CompressedTensorsWNA16MoEMethod)


-patch_vllm_linear_base_isinstance()
-# Apply patches when module is imported
-apply_monkey_patches()
+monkey_patch_quant_configs()


 __all__ = [
-    "QuantizationConfig",
     "get_quantization_config",
     "QUANTIZATION_METHODS",
 ]
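The per-config `awq_get_quant_method` override is gone; instead, sglang briefly patches `builtins.isinstance` so that vllm's stock `get_quant_method` implementations recognize sglang's `LinearBase`, `FusedMoE`, and `VocabParallelEmbedding` subclasses, and `monkey_patch_moe_apply` adapts sglang's extra MoE arguments to whatever `apply` signature the installed vllm exposes. The sketch below shows the intended patch/unpatch bracket around a vllm call; `resolve_quant_method` and its arguments are hypothetical stand-ins for the real call sites in sglang's layer code.

```python
# Sketch only: quant_config is a vllm QuantizationConfig, layer an sglang layer.
from sglang.srt.layers.quantization import monkey_patch_isinstance_for_vllm_base_layer

def resolve_quant_method(quant_config, layer, prefix: str):
    # Make vllm's isinstance(layer, LinearBase / FusedMoE / VocabParallelEmbedding)
    # checks succeed for the corresponding sglang classes.
    monkey_patch_isinstance_for_vllm_base_layer()
    try:
        return quant_config.get_quant_method(layer, prefix)
    finally:
        # Restore the builtin even if get_quant_method raises.
        monkey_patch_isinstance_for_vllm_base_layer(reverse=True)
```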
sglang/srt/layers/quantization/blockwise_int8.py
@@ -13,12 +13,11 @@ from sglang.srt.layers.linear import (
     LinearMethodBase,
     UnquantizedLinearMethod,
 )
-from sglang.srt.layers.parameter import ModelWeightParameter, PerTensorScaleParameter
+from sglang.srt.layers.parameter import BlockQuantScaleParameter, ModelWeightParameter
 from sglang.srt.layers.quantization.base_config import (
     QuantizationConfig,
     QuantizeMethodBase,
 )
-from sglang.srt.layers.quantization.fp8_utils import BlockQuantScaleParameter
 from sglang.srt.layers.quantization.int8_utils import apply_w8a8_block_int8_linear
 from sglang.srt.utils import set_weight_attrs