sglang 0.4.6.post5__py3-none-any.whl → 0.4.7__py3-none-any.whl
- sglang/bench_offline_throughput.py +10 -4
- sglang/bench_one_batch_server.py +67 -11
- sglang/bench_serving.py +85 -74
- sglang/lang/backend/runtime_endpoint.py +24 -1
- sglang/profiler.py +167 -0
- sglang/srt/_custom_ops.py +34 -0
- sglang/srt/configs/internvl.py +8 -12
- sglang/srt/configs/model_config.py +27 -1
- sglang/srt/constrained/base_grammar_backend.py +5 -2
- sglang/srt/constrained/llguidance_backend.py +9 -8
- sglang/srt/constrained/outlines_backend.py +5 -4
- sglang/srt/constrained/xgrammar_backend.py +18 -18
- sglang/srt/conversation.py +46 -8
- sglang/srt/custom_op.py +38 -3
- sglang/srt/debug_utils.py +74 -0
- sglang/srt/disaggregation/common/__init__.py +1 -0
- sglang/srt/disaggregation/common/conn.py +407 -0
- sglang/srt/disaggregation/decode.py +67 -3
- sglang/srt/disaggregation/fake/conn.py +1 -0
- sglang/srt/disaggregation/kv_events.py +60 -5
- sglang/srt/disaggregation/launch_lb.py +140 -0
- sglang/srt/disaggregation/mini_lb.py +29 -48
- sglang/srt/disaggregation/mooncake/conn.py +432 -140
- sglang/srt/disaggregation/mooncake/transfer_engine.py +32 -16
- sglang/srt/disaggregation/nixl/conn.py +124 -432
- sglang/srt/disaggregation/prefill.py +2 -0
- sglang/srt/disaggregation/utils.py +38 -1
- sglang/srt/distributed/device_communicators/pymscclpp.py +315 -0
- sglang/srt/distributed/parallel_state.py +52 -5
- sglang/srt/entrypoints/EngineBase.py +6 -0
- sglang/srt/entrypoints/engine.py +102 -5
- sglang/srt/entrypoints/http_server.py +15 -2
- sglang/srt/function_call/base_format_detector.py +138 -86
- sglang/srt/function_call/deepseekv3_detector.py +54 -6
- sglang/srt/function_call/ebnf_composer.py +33 -19
- sglang/srt/function_call/function_call_parser.py +27 -0
- sglang/srt/function_call/llama32_detector.py +33 -14
- sglang/srt/function_call/mistral_detector.py +73 -26
- sglang/srt/function_call/pythonic_detector.py +86 -20
- sglang/srt/function_call/qwen25_detector.py +64 -10
- sglang/srt/function_call/utils.py +17 -0
- sglang/srt/hf_transformers_utils.py +4 -0
- sglang/srt/layers/attention/aiter_backend.py +488 -123
- sglang/srt/layers/attention/base_attn_backend.py +4 -0
- sglang/srt/layers/attention/cutlass_mla_backend.py +2 -19
- sglang/srt/layers/attention/flashattention_backend.py +103 -18
- sglang/srt/layers/attention/flashinfer_backend.py +45 -1
- sglang/srt/layers/attention/flashinfer_mla_backend.py +37 -1
- sglang/srt/layers/attention/intel_amx_backend.py +128 -0
- sglang/srt/layers/attention/tbo_backend.py +232 -0
- sglang/srt/layers/attention/torch_native_backend.py +3 -0
- sglang/srt/layers/attention/triton_backend.py +244 -5
- sglang/srt/layers/attention/triton_ops/extend_attention.py +12 -4
- sglang/srt/layers/communicator.py +260 -194
- sglang/srt/layers/dp_attention.py +6 -5
- sglang/srt/layers/layernorm.py +30 -19
- sglang/srt/layers/moe/cutlass_moe.py +170 -7
- sglang/srt/layers/moe/cutlass_moe_params.py +169 -0
- sglang/srt/layers/moe/ep_moe/kernels.py +27 -6
- sglang/srt/layers/moe/ep_moe/layer.py +94 -40
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +13 -8
- sglang/srt/layers/moe/fused_moe_native.py +4 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +220 -25
- sglang/srt/layers/moe/fused_moe_triton/layer.py +34 -4
- sglang/srt/layers/moe/topk.py +44 -18
- sglang/srt/layers/multimodal.py +3 -3
- sglang/srt/layers/quantization/__init__.py +3 -2
- sglang/srt/layers/quantization/blockwise_int8.py +3 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +5 -0
- sglang/srt/layers/quantization/deep_gemm.py +55 -56
- sglang/srt/layers/quantization/fp8.py +28 -23
- sglang/srt/layers/quantization/fp8_kernel.py +118 -66
- sglang/srt/layers/quantization/fp8_utils.py +165 -49
- sglang/srt/layers/quantization/modelopt_quant.py +334 -7
- sglang/srt/layers/quantization/moe_wna16.py +3 -0
- sglang/srt/layers/quantization/w8a8_fp8.py +3 -0
- sglang/srt/layers/quantization/w8a8_int8.py +3 -0
- sglang/srt/layers/rotary_embedding.py +6 -12
- sglang/srt/layers/sampler.py +80 -79
- sglang/srt/layers/utils.py +6 -0
- sglang/srt/lora/layers.py +12 -15
- sglang/srt/lora/lora.py +49 -5
- sglang/srt/lora/lora_manager.py +19 -5
- sglang/srt/lora/mem_pool.py +24 -16
- sglang/srt/lora/utils.py +17 -13
- sglang/srt/managers/data_parallel_controller.py +13 -5
- sglang/srt/managers/eplb_algorithms/__init__.py +63 -0
- sglang/srt/managers/eplb_algorithms/deepseek.py +223 -0
- sglang/srt/managers/{deepseek_eplb.py → eplb_algorithms/deepseek_vec.py} +5 -7
- sglang/srt/managers/eplb_manager.py +55 -14
- sglang/srt/managers/expert_distribution.py +220 -46
- sglang/srt/managers/expert_location.py +110 -56
- sglang/srt/managers/expert_location_dispatch.py +23 -6
- sglang/srt/managers/io_struct.py +15 -4
- sglang/srt/managers/mm_utils.py +88 -38
- sglang/srt/managers/multimodal_processors/base_processor.py +188 -16
- sglang/srt/managers/multimodal_processors/gemma3.py +4 -31
- sglang/srt/managers/multimodal_processors/internvl.py +4 -0
- sglang/srt/managers/multimodal_processors/kimi_vl.py +15 -34
- sglang/srt/managers/multimodal_processors/minicpm.py +2 -1
- sglang/srt/managers/multimodal_processors/phi4mm.py +87 -0
- sglang/srt/managers/multimodal_processors/qwen_vl.py +22 -64
- sglang/srt/managers/schedule_batch.py +140 -38
- sglang/srt/managers/scheduler.py +305 -112
- sglang/srt/managers/tokenizer_manager.py +134 -17
- sglang/srt/managers/utils.py +0 -4
- sglang/srt/metrics/collector.py +9 -0
- sglang/srt/model_executor/cuda_graph_runner.py +72 -61
- sglang/srt/model_executor/expert_location_updater.py +157 -22
- sglang/srt/model_executor/forward_batch_info.py +38 -17
- sglang/srt/model_executor/model_runner.py +96 -56
- sglang/srt/model_loader/utils.py +67 -1
- sglang/srt/models/deepseek_nextn.py +1 -1
- sglang/srt/models/deepseek_v2.py +609 -234
- sglang/srt/models/gemma3_causal.py +7 -0
- sglang/srt/models/gemma3_mm.py +19 -14
- sglang/srt/models/idefics2.py +342 -0
- sglang/srt/models/kimi_vl.py +4 -4
- sglang/srt/models/llama.py +1 -1
- sglang/srt/models/minicpmo.py +2 -5
- sglang/srt/models/minicpmv.py +3 -295
- sglang/srt/models/phi4mm.py +512 -0
- sglang/srt/models/qwen2.py +38 -9
- sglang/srt/models/qwen2_5_vl.py +3 -9
- sglang/srt/models/qwen2_eagle.py +4 -1
- sglang/srt/models/qwen2_moe.py +58 -191
- sglang/srt/models/qwen2_vl.py +3 -9
- sglang/srt/models/qwen3.py +41 -10
- sglang/srt/models/qwen3_moe.py +230 -191
- sglang/srt/models/registry.py +9 -1
- sglang/srt/models/transformers.py +291 -0
- sglang/srt/openai_api/adapter.py +86 -24
- sglang/srt/openai_api/protocol.py +31 -2
- sglang/srt/openai_api/utils.py +172 -0
- sglang/srt/operations.py +37 -2
- sglang/srt/operations_strategy.py +200 -24
- sglang/srt/sampling/sampling_batch_info.py +13 -1
- sglang/srt/sampling/sampling_params.py +2 -1
- sglang/srt/server_args.py +114 -27
- sglang/srt/speculative/build_eagle_tree.py +8 -8
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +8 -11
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +253 -0
- sglang/srt/speculative/eagle_utils.py +51 -91
- sglang/srt/speculative/eagle_worker.py +101 -21
- sglang/srt/two_batch_overlap.py +635 -0
- sglang/srt/utils.py +129 -7
- sglang/test/runners.py +16 -7
- sglang/test/send_one.py +4 -0
- sglang/test/test_cutlass_moe.py +3 -3
- sglang/test/test_fp4_moe.py +248 -0
- sglang/test/test_utils.py +79 -6
- sglang/version.py +1 -1
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/METADATA +14 -11
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/RECORD +318 -291
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/WHEEL +1 -1
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H200.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_L40S.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=96,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/top_level.txt +0 -0
sglang/srt/layers/quantization/modelopt_quant.py

@@ -1,12 +1,17 @@
 # Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/quantization/modelopt.py

 import logging
-from typing import Any, Dict, List, Optional
+from typing import Any, Callable, Dict, List, Optional

 import torch
 from torch.nn.parameter import Parameter

-from sglang.srt.layers.linear import LinearBase, LinearMethodBase
+from sglang.srt.layers.linear import (
+    LinearBase,
+    LinearMethodBase,
+    UnquantizedLinearMethod,
+)
+from sglang.srt.layers.moe.cutlass_moe_params import CutlassMoEParams, CutlassMoEType
 from sglang.srt.layers.parameter import ModelWeightParameter, PerTensorScaleParameter
 from sglang.srt.layers.quantization.base_config import (
     QuantizationConfig,
@@ -15,10 +20,12 @@ from sglang.srt.layers.quantization.base_config import (
 from sglang.srt.layers.quantization.fp8_utils import (
     apply_fp8_linear,
     cutlass_fp8_supported,
+    is_sm100_supported,
 )
 from sglang.srt.layers.quantization.kv_cache import BaseKVCacheMethod
 from sglang.srt.layers.quantization.utils import (
     convert_to_channelwise,
+    is_layer_skipped,
     requantize_with_max_scale,
 )
 from sglang.srt.layers.radix_attention import RadixAttention
@@ -270,9 +277,16 @@ class ModelOptFp4Config(QuantizationConfig):
             )
         is_checkpoint_nvfp4_serialized = "NVFP4" in quant_method
         kv_cache_quant_algo = quant_config["kv_cache_quant_algo"]
+        if not kv_cache_quant_algo:
+            kv_cache_quant_algo = "auto"
         group_size = quant_config["group_size"]
         exclude_modules = quant_config["exclude_modules"]
         if not (group_size and kv_cache_quant_algo and exclude_modules):
+            logger.warning(
+                f"group_size: {group_size},"
+                f"kv_cache_quant_algo: {kv_cache_quant_algo},"
+                f"exclude_modules: {exclude_modules}"
+            )
             raise ValueError(
                 "NVFP4 quantization requires group size and "
                 "kv_cache_quant_algo specified in "
@@ -285,19 +299,30 @@ class ModelOptFp4Config(QuantizationConfig):
             exclude_modules,
         )

+    def is_layer_excluded(self, prefix: str, exclude_modules: list):
+        import regex as re
+
+        for pattern in exclude_modules:
+            regex_str = pattern.replace(".", r"\.").replace("*", r".*")
+            if re.fullmatch(regex_str, prefix):
+                return True
+        return False
+
     def get_quant_method(
         self, layer: torch.nn.Module, prefix: str
     ) -> Optional["QuantizeMethodBase"]:
-        if self.exclude_modules and any(
-            module in prefix for module in self.exclude_modules
-        ):
-            return None
+        from sglang.srt.layers.moe.fused_moe_triton import FusedMoE

         if isinstance(layer, LinearBase):
+            if is_layer_skipped(prefix, self.exclude_modules) or self.is_layer_excluded(
+                prefix, self.exclude_modules
+            ):
+                return UnquantizedLinearMethod()
             return ModelOptFp4LinearMethod(self)
         if self.kv_cache_quant_algo and isinstance(layer, RadixAttention):
             return ModelOptFp8KVCacheMethod(self)
-
+        elif isinstance(layer, FusedMoE):
+            return ModelOptNvFp4FusedMoEMethod(self)
         return None

     def get_scaled_act_names(self) -> List[str]:
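The new `ModelOptFp4Config.is_layer_excluded` treats the entries of `exclude_modules` as ModelOpt-style wildcard patterns rather than plain substrings: literal dots are escaped, `*` becomes `.*`, and the whole layer prefix must match. Below is a minimal standalone sketch of that matching logic; it uses the stdlib `re` instead of the `regex` package the diff imports, and the example prefixes and patterns are made up for illustration.

```python
import re  # the diff imports "regex as re"; stdlib re behaves the same for these patterns


def is_layer_excluded(prefix: str, exclude_modules: list) -> bool:
    """Wildcard matching as added in ModelOptFp4Config.is_layer_excluded."""
    for pattern in exclude_modules:
        # Escape literal dots, turn "*" into ".*", and require a full match.
        regex_str = pattern.replace(".", r"\.").replace("*", r".*")
        if re.fullmatch(regex_str, prefix):
            return True
    return False


# Hypothetical layer prefixes, for illustration only.
print(is_layer_excluded("model.layers.0.mlp.gate", ["*.mlp.gate"]))     # True
print(is_layer_excluded("model.layers.0.mlp.up_proj", ["*.mlp.gate"]))  # False
```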
@@ -461,3 +486,305 @@ class ModelOptFp4LinearMethod(LinearMethodBase):
         if bias is not None:
             out = out + bias
         return out.view(*output_shape)
+
+
+class ModelOptNvFp4FusedMoEMethod:
+    """
+    MoE Method for FP4 Quantization with Blockscales and PerTensorScales
+    Args:
+        quant_config: NVFP4 Quant Config
+    """
+
+    def __new__(cls, *args, **kwargs):
+        from sglang.srt.layers.moe.fused_moe_triton import FusedMoEMethodBase
+
+        if not hasattr(cls, "_initialized"):
+            original_init = cls.__init__
+            new_cls = type(
+                cls.__name__,
+                (FusedMoEMethodBase,),
+                {
+                    "__init__": original_init,
+                    **{k: v for k, v in cls.__dict__.items() if k != "__dict__"},
+                },
+            )
+            obj = super(new_cls, new_cls).__new__(new_cls)
+            obj.__init__(*args, **kwargs)
+            return obj
+        return super().__new__(cls)
+
+    def __init__(self, quant_config: ModelOptFp4Config):
+        self.quant_config = quant_config
+        if not is_sm100_supported():
+            raise ValueError(
+                "Current platform does not support NVFP4"
+                " quantization. Please use Blackwell and"
+                " above."
+            )
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        num_experts: int,
+        hidden_size: int,
+        intermediate_size_per_partition: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
+        if not self.quant_config.is_checkpoint_nvfp4_serialized:
+            raise ValueError(
+                "NVFP4 quantization was selected, "
+                " dynamic quantization is not supported."
+            )
+
+        layer.num_experts = num_experts
+        layer.params_dtype = params_dtype
+        layer.quant_config = self.quant_config
+        weight_dtype = torch.uint8
+        weight_scale_dtype = torch.float8_e4m3fn
+        weight_loader = extra_weight_attrs.get("weight_loader")
+        # GEMM 1
+        w13_weight = ModelWeightParameter(
+            data=torch.empty(
+                num_experts,
+                2 * intermediate_size_per_partition,
+                # 2 fp4 items are packed in the input dimension
+                hidden_size // 2,
+                dtype=weight_dtype,
+            ),
+            input_dim=1,
+            output_dim=2,
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("w13_weight", w13_weight)
+
+        # GEMM 2
+        w2_weight = ModelWeightParameter(
+            data=torch.empty(
+                num_experts,
+                hidden_size,
+                # 2 fp4 items are packed in the input dimension
+                intermediate_size_per_partition // 2,
+                dtype=weight_dtype,
+            ),
+            input_dim=1,
+            output_dim=2,
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("w2_weight", w2_weight)
+
+        w13_weight_scale = ModelWeightParameter(
+            data=torch.empty(
+                num_experts,
+                2 * intermediate_size_per_partition,
+                # 2 fp4 items are packed in the input dimension
+                hidden_size // self.quant_config.group_size,
+                dtype=weight_scale_dtype,
+            ),
+            input_dim=1,
+            output_dim=2,
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("w13_weight_scale", w13_weight_scale)
+
+        w2_weight_scale = ModelWeightParameter(
+            data=torch.empty(
+                num_experts,
+                hidden_size,
+                # 2 fp4 items are packed in the input dimension
+                intermediate_size_per_partition // self.quant_config.group_size,
+                dtype=weight_scale_dtype,
+            ),
+            input_dim=1,
+            output_dim=2,
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("w2_weight_scale", w2_weight_scale)
+
+        from sglang.srt.layers.moe.fused_moe_triton import FusedMoeWeightScaleSupported
+
+        extra_weight_attrs.update(
+            {"quant_method": FusedMoeWeightScaleSupported.BLOCK.value}
+        )
+
+        w13_weight_scale_2 = PerTensorScaleParameter(
+            data=torch.empty(num_experts, 2, dtype=torch.float32),
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("w13_weight_scale_2", w13_weight_scale_2)
+
+        w2_weight_scale_2 = PerTensorScaleParameter(
+            data=torch.empty(num_experts, dtype=torch.float32),
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("w2_weight_scale_2", w2_weight_scale_2)
+
+        extra_weight_attrs.update(
+            {"quant_method": FusedMoeWeightScaleSupported.TENSOR.value}
+        )
+
+        w13_input_scale = PerTensorScaleParameter(
+            data=torch.empty(num_experts, 2, dtype=torch.float32),
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("w13_input_scale", w13_input_scale)
+
+        w2_input_scale = PerTensorScaleParameter(
+            data=torch.empty(num_experts, dtype=torch.float32),
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("w2_input_scale", w2_input_scale)
+
+    def swizzle_blockscale(self, scale: torch.tensor):
+        assert scale.dtype == torch.float8_e4m3fn
+        # Pad and blockwise interleave weight_scale
+        scale_ndim = scale.ndim
+        if scale.ndim == 2:
+            scale = scale.unsqueeze(0)
+        assert scale.ndim == 3
+        B, M, K = scale.shape
+        round_up_multiple = lambda x, m: (x + m - 1) // m * m
+        M_padded = round_up_multiple(M, 128)
+        K_padded = round_up_multiple(K, 4)
+        padded_scale = torch.zeros((B, M_padded, K_padded), dtype=scale.dtype)
+        padded_scale[:B, :M, :K] = scale
+        batches, rows, cols = padded_scale.shape
+        assert rows % 128 == 0
+        assert cols % 4 == 0
+        padded_scale = padded_scale.reshape(batches, rows // 128, 4, 32, cols // 4, 4)
+        swizzled_scale = padded_scale.permute((0, 1, 4, 3, 2, 5))
+        swizzled_scale = swizzled_scale.contiguous().cuda()
+        return (
+            swizzled_scale.reshape(M, K)
+            if scale_ndim == 2
+            else swizzled_scale.reshape(B, M, K)
+        )
+
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+
+        # GEMM 1
+        if not torch.allclose(
+            layer.w13_weight_scale_2[:, 0], layer.w13_weight_scale_2[:, 1]
+        ):
+            logger.warning_once(
+                "w1_weight_scale_2 must match w3_weight_scale_2. "
+                "Accuracy may be affected."
+            )
+
+        w13_weight_scale_2 = layer.w13_weight_scale_2[:, 0]
+        layer.w13_weight_scale_2 = Parameter(w13_weight_scale_2, requires_grad=False)
+
+        w13_input_scale = layer.w13_input_scale.max(dim=1).values.to(torch.float32)
+        layer.g1_alphas = Parameter(
+            (w13_input_scale * w13_weight_scale_2).to(torch.float32),
+            requires_grad=False,
+        )
+
+        assert (
+            layer.w13_weight_scale.shape[2] % 16 == 0
+        ), "Expected weight_scale.dim(1) to be divisible by 16"
+        assert (
+            layer.w13_weight_scale.dtype == torch.float8_e4m3fn
+        ), "Weight Blockscale must be represented as FP8-E4M3"
+        w13_blockscale_swizzled = self.swizzle_blockscale(layer.w13_weight_scale)
+
+        layer.w13_blockscale_swizzled = Parameter(
+            w13_blockscale_swizzled, requires_grad=False
+        )
+
+        # This is for quantization, so we need to invert it.
+        layer.w13_input_scale_quant = Parameter(
+            (1 / w13_input_scale).to(torch.float32), requires_grad=False
+        )
+
+        layer.w13_weight = Parameter(layer.w13_weight.data, requires_grad=False)
+
+        # GEMM 2
+        layer.g2_alphas = Parameter(
+            (layer.w2_input_scale * layer.w2_weight_scale_2).to(torch.float32),
+            requires_grad=False,
+        )
+
+        # This is for quantization, so we need to invert it.
+        layer.w2_input_scale_quant = Parameter(
+            (1 / layer.w2_input_scale).to(torch.float32), requires_grad=False
+        )
+
+        assert (
+            layer.w2_weight_scale.shape[2] % 16 == 0
+        ), "Expected weight_scale.dim(1) to be divisible by 16"
+        assert (
+            layer.w2_weight_scale.dtype == torch.float8_e4m3fn
+        ), "Weight Blockscale must be represented as FP8-E4M3"
+        w2_blockscale_swizzled = self.swizzle_blockscale(layer.w2_weight_scale)
+
+        layer.w2_blockscale_swizzled = Parameter(
+            w2_blockscale_swizzled, requires_grad=False
+        )
+        layer.w2_weight = Parameter(layer.w2_weight.data, requires_grad=False)
+
+        device = layer.w13_weight.device
+        layer.cutlass_moe_params = CutlassMoEParams(
+            CutlassMoEType.BlockscaledFP4,
+            device,
+            num_experts=layer.num_experts,
+            intermediate_size_per_partition=layer.w2_weight.shape[2] * 2,  # n
+            hidden_size=layer.w13_weight.shape[2] * 2,
+        )  # k
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        router_logits: torch.Tensor,
+        top_k: int,
+        renormalize: bool,
+        use_grouped_topk: bool,
+        topk_group: Optional[int] = None,
+        num_expert_group: Optional[int] = None,
+        num_fused_shared_experts: Optional[int] = None,
+        custom_routing_function: Optional[Callable] = None,
+        correction_bias: Optional[torch.Tensor] = None,
+        activation: str = "silu",
+        apply_router_weight_on_input: bool = False,
+        inplace: bool = True,
+        no_combine: bool = False,
+        routed_scaling_factor: Optional[float] = None,
+    ) -> torch.Tensor:
+
+        assert activation == "silu", "Only SiLU activation is supported."
+
+        from sglang.srt.layers.moe.fused_moe_triton.fused_moe import fused_experts
+        from sglang.srt.layers.moe.topk import select_experts
+
+        topk_weights, topk_ids = select_experts(
+            hidden_states=x,
+            router_logits=router_logits,
+            use_grouped_topk=use_grouped_topk,
+            top_k=top_k,
+            renormalize=renormalize,
+            topk_group=topk_group,
+            num_expert_group=num_expert_group,
+            num_fused_shared_experts=num_fused_shared_experts,
+            custom_routing_function=custom_routing_function,
+            correction_bias=correction_bias,
+            routed_scaling_factor=routed_scaling_factor,
+        )
+
+        from sglang.srt.layers.moe.cutlass_moe import cutlass_moe_fp4
+
+        return cutlass_moe_fp4(
+            a=x,
+            a1_gscale=layer.w13_input_scale_quant,
+            w1_fp4=layer.w13_weight,
+            w1_blockscale=layer.w13_blockscale_swizzled,
+            w1_alphas=layer.g1_alphas,
+            a2_gscale=layer.w2_input_scale_quant,
+            w2_fp4=layer.w2_weight,
+            w2_blockscale=layer.w2_blockscale_swizzled,
+            w2_alphas=layer.g2_alphas,
+            topk_weights=topk_weights,
+            topk_ids=topk_ids,
+            params=layer.cutlass_moe_params,
+            apply_router_weight_on_input=apply_router_weight_on_input,
+        ).to(x.dtype)
sglang/srt/layers/quantization/moe_wna16.py

@@ -341,6 +341,7 @@ class MoeWNA16Method:
         use_grouped_topk: bool = False,
         topk_group: Optional[int] = None,
         num_expert_group: Optional[int] = None,
+        num_fused_shared_experts: int = 0,
         custom_routing_function: Optional[Callable] = None,
         correction_bias: Optional[torch.Tensor] = None,
         activation: str = "silu",
@@ -362,6 +363,7 @@ class MoeWNA16Method:
             renormalize=renormalize,
             topk_group=topk_group,
             num_expert_group=num_expert_group,
+            num_fused_shared_experts=num_fused_shared_experts,
             custom_routing_function=custom_routing_function,
             correction_bias=correction_bias,
             routed_scaling_factor=routed_scaling_factor,
@@ -386,6 +388,7 @@ class MoeWNA16Method:
             w2_zp=layer.w2_qzeros if has_zp else None,
             block_shape=[0, layer.group_size],
             no_combine=no_combine,
+            routed_scaling_factor=routed_scaling_factor,
         )

     @staticmethod
sglang/srt/layers/quantization/w8a8_fp8.py

@@ -287,6 +287,7 @@ class W8A8FP8MoEMethod:
         use_grouped_topk: bool,
         topk_group: Optional[int] = None,
         num_expert_group: Optional[int] = None,
+        num_fused_shared_experts: int = 0,
         custom_routing_function: Optional[Callable] = None,
         correction_bias: Optional[torch.Tensor] = None,
         activation: str = "silu",
@@ -306,6 +307,7 @@ class W8A8FP8MoEMethod:
             renormalize=renormalize,
             topk_group=topk_group,
             num_expert_group=num_expert_group,
+            num_fused_shared_experts=num_fused_shared_experts,
             custom_routing_function=custom_routing_function,
             correction_bias=correction_bias,
             routed_scaling_factor=routed_scaling_factor,
@@ -326,4 +328,5 @@ class W8A8FP8MoEMethod:
             a1_scale=layer.w13_input_scale,
             a2_scale=layer.w2_input_scale,
             no_combine=no_combine,
+            routed_scaling_factor=routed_scaling_factor,
         )
sglang/srt/layers/quantization/w8a8_int8.py

@@ -225,6 +225,7 @@ class W8A8Int8MoEMethod:
         use_grouped_topk: bool,
         topk_group: Optional[int] = None,
         num_expert_group: Optional[int] = None,
+        num_fused_shared_experts: int = 0,
         custom_routing_function: Optional[Callable] = None,
         correction_bias: Optional[torch.Tensor] = None,
         activation: str = "silu",
@@ -245,6 +246,7 @@ class W8A8Int8MoEMethod:
             renormalize=renormalize,
             topk_group=topk_group,
             num_expert_group=num_expert_group,
+            num_fused_shared_experts=num_fused_shared_experts,
             custom_routing_function=custom_routing_function,
             correction_bias=correction_bias,
             routed_scaling_factor=routed_scaling_factor,
@@ -266,4 +268,5 @@ class W8A8Int8MoEMethod:
             a1_scale=layer.w13_input_scale,
             a2_scale=layer.w2_input_scale,
             no_combine=no_combine,
+            routed_scaling_factor=routed_scaling_factor,
         )
sglang/srt/layers/rotary_embedding.py

@@ -8,9 +8,10 @@ import torch
 import torch.nn as nn

 from sglang.srt.custom_op import CustomOp
-from sglang.srt.utils import is_cuda
+from sglang.srt.utils import is_cuda, is_hip

 _is_cuda = is_cuda()
+_is_hip = is_hip()

 if _is_cuda:
     from sgl_kernel import apply_rope_with_cos_sin_cache_inplace
@@ -609,6 +610,10 @@ class DeepseekScalingRotaryEmbedding(RotaryEmbedding):
             head_size, rotary_dim, max_position_embeddings, base, is_neox_style, dtype
         )

+        # Re-dispatch
+        if _is_hip:
+            self._forward_method = self.forward_native
+
     def _compute_inv_freq(self, scaling_factor: float) -> torch.Tensor:
         pos_freqs = self.base ** (
             torch.arange(0, self.rotary_dim, 2, dtype=torch.float, device=self.device)
@@ -650,17 +655,6 @@ class DeepseekScalingRotaryEmbedding(RotaryEmbedding):
         cache = torch.cat((cos, sin), dim=-1)
         return cache

-    def forward_hip(self, *args, **kwargs):
-        return self.forward_native(*args, **kwargs)
-
-    def forward(self, *args, **kwargs):
-        if torch.compiler.is_compiling():
-            return self.forward_native(*args, **kwargs)
-        if _is_cuda:
-            return self.forward_cuda(*args, **kwargs)
-        else:
-            return self.forward_native(*args, **kwargs)
-
     def forward_native(
         self,
         positions: torch.Tensor,