sglang 0.4.6.post4__py3-none-any.whl → 0.4.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_offline_throughput.py +16 -10
- sglang/bench_one_batch.py +5 -4
- sglang/bench_one_batch_server.py +86 -22
- sglang/bench_serving.py +197 -110
- sglang/compile_deep_gemm.py +4 -4
- sglang/lang/backend/runtime_endpoint.py +24 -1
- sglang/profiler.py +167 -0
- sglang/srt/_custom_ops.py +34 -0
- sglang/srt/configs/internvl.py +8 -12
- sglang/srt/configs/model_config.py +66 -29
- sglang/srt/constrained/base_grammar_backend.py +5 -2
- sglang/srt/constrained/llguidance_backend.py +9 -8
- sglang/srt/constrained/outlines_backend.py +5 -4
- sglang/srt/constrained/xgrammar_backend.py +18 -18
- sglang/srt/conversation.py +47 -9
- sglang/srt/custom_op.py +38 -3
- sglang/srt/debug_utils.py +74 -0
- sglang/srt/disaggregation/common/__init__.py +1 -0
- sglang/srt/disaggregation/common/conn.py +407 -0
- sglang/srt/disaggregation/decode.py +187 -134
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +142 -0
- sglang/srt/disaggregation/fake/conn.py +4 -13
- sglang/srt/disaggregation/kv_events.py +412 -0
- sglang/srt/disaggregation/launch_lb.py +140 -0
- sglang/srt/disaggregation/mini_lb.py +84 -70
- sglang/srt/disaggregation/mooncake/conn.py +441 -140
- sglang/srt/disaggregation/mooncake/transfer_engine.py +31 -14
- sglang/srt/disaggregation/nixl/conn.py +124 -442
- sglang/srt/disaggregation/prefill.py +128 -44
- sglang/srt/disaggregation/utils.py +154 -6
- sglang/srt/distributed/device_communicators/pymscclpp.py +315 -0
- sglang/srt/distributed/parallel_state.py +52 -5
- sglang/srt/distributed/utils.py +3 -3
- sglang/srt/entrypoints/EngineBase.py +11 -0
- sglang/srt/entrypoints/engine.py +129 -12
- sglang/srt/entrypoints/http_server.py +21 -6
- sglang/srt/entrypoints/http_server_engine.py +5 -2
- sglang/srt/function_call/base_format_detector.py +302 -0
- sglang/srt/function_call/core_types.py +34 -0
- sglang/srt/function_call/deepseekv3_detector.py +205 -0
- sglang/srt/function_call/ebnf_composer.py +248 -0
- sglang/srt/function_call/function_call_parser.py +202 -0
- sglang/srt/function_call/llama32_detector.py +93 -0
- sglang/srt/function_call/mistral_detector.py +131 -0
- sglang/srt/function_call/pythonic_detector.py +229 -0
- sglang/srt/function_call/qwen25_detector.py +121 -0
- sglang/srt/function_call/utils.py +52 -0
- sglang/srt/hf_transformers_utils.py +50 -7
- sglang/srt/layers/attention/aiter_backend.py +878 -0
- sglang/srt/layers/attention/base_attn_backend.py +4 -0
- sglang/srt/layers/attention/cutlass_mla_backend.py +2 -19
- sglang/srt/layers/attention/flashattention_backend.py +166 -35
- sglang/srt/layers/attention/flashinfer_backend.py +45 -1
- sglang/srt/layers/attention/flashinfer_mla_backend.py +45 -5
- sglang/srt/layers/attention/flashmla_backend.py +340 -78
- sglang/srt/layers/attention/intel_amx_backend.py +128 -0
- sglang/srt/layers/attention/tbo_backend.py +232 -0
- sglang/srt/layers/attention/torch_native_backend.py +3 -0
- sglang/srt/layers/attention/triton_backend.py +247 -5
- sglang/srt/layers/attention/triton_ops/extend_attention.py +12 -4
- sglang/srt/layers/attention/utils.py +2 -2
- sglang/srt/layers/attention/vision.py +1 -1
- sglang/srt/layers/communicator.py +517 -0
- sglang/srt/layers/dp_attention.py +6 -15
- sglang/srt/layers/layernorm.py +30 -19
- sglang/srt/layers/moe/cutlass_moe.py +370 -0
- sglang/srt/layers/moe/cutlass_moe_params.py +169 -0
- sglang/srt/layers/moe/ep_moe/kernels.py +60 -17
- sglang/srt/layers/moe/ep_moe/layer.py +195 -87
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +88 -8
- sglang/srt/layers/moe/fused_moe_native.py +4 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +220 -25
- sglang/srt/layers/moe/fused_moe_triton/layer.py +48 -4
- sglang/srt/layers/moe/topk.py +107 -24
- sglang/srt/layers/multimodal.py +70 -0
- sglang/srt/layers/quantization/__init__.py +10 -4
- sglang/srt/layers/quantization/blockwise_int8.py +3 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +5 -0
- sglang/srt/layers/quantization/deep_gemm.py +60 -59
- sglang/srt/layers/quantization/fp8.py +113 -18
- sglang/srt/layers/quantization/fp8_kernel.py +118 -66
- sglang/srt/layers/quantization/fp8_utils.py +165 -43
- sglang/srt/layers/quantization/gptq.py +298 -6
- sglang/srt/layers/quantization/int8_kernel.py +18 -5
- sglang/srt/layers/quantization/modelopt_quant.py +334 -7
- sglang/srt/layers/quantization/moe_wna16.py +3 -0
- sglang/srt/layers/quantization/qoq.py +244 -0
- sglang/srt/layers/quantization/w8a8_fp8.py +3 -0
- sglang/srt/layers/quantization/w8a8_int8.py +3 -0
- sglang/srt/layers/rotary_embedding.py +6 -12
- sglang/srt/layers/sampler.py +80 -79
- sglang/srt/layers/utils.py +6 -0
- sglang/srt/lora/layers.py +12 -15
- sglang/srt/lora/lora.py +49 -5
- sglang/srt/lora/lora_manager.py +20 -8
- sglang/srt/lora/mem_pool.py +24 -16
- sglang/srt/lora/utils.py +17 -13
- sglang/srt/managers/data_parallel_controller.py +13 -5
- sglang/srt/managers/eplb_algorithms/__init__.py +63 -0
- sglang/srt/managers/eplb_algorithms/deepseek.py +223 -0
- sglang/srt/managers/eplb_algorithms/deepseek_vec.py +276 -0
- sglang/srt/managers/eplb_manager.py +96 -0
- sglang/srt/managers/expert_distribution.py +878 -56
- sglang/srt/managers/expert_location.py +448 -0
- sglang/srt/managers/expert_location_dispatch.py +108 -0
- sglang/srt/managers/io_struct.py +29 -5
- sglang/srt/managers/mm_utils.py +355 -151
- sglang/srt/managers/multimodal_processors/base_processor.py +299 -42
- sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +6 -1
- sglang/srt/managers/multimodal_processors/gemma3.py +15 -17
- sglang/srt/managers/multimodal_processors/internvl.py +18 -5
- sglang/srt/managers/multimodal_processors/janus_pro.py +7 -1
- sglang/srt/managers/multimodal_processors/kimi_vl.py +14 -32
- sglang/srt/managers/multimodal_processors/llava.py +3 -3
- sglang/srt/managers/multimodal_processors/minicpm.py +27 -32
- sglang/srt/managers/multimodal_processors/mllama4.py +6 -0
- sglang/srt/managers/multimodal_processors/phi4mm.py +87 -0
- sglang/srt/managers/multimodal_processors/pixtral.py +9 -9
- sglang/srt/managers/multimodal_processors/qwen_vl.py +35 -35
- sglang/srt/managers/schedule_batch.py +185 -55
- sglang/srt/managers/schedule_policy.py +4 -5
- sglang/srt/managers/scheduler.py +389 -154
- sglang/srt/managers/session_controller.py +1 -1
- sglang/srt/managers/tokenizer_manager.py +231 -39
- sglang/srt/managers/utils.py +0 -4
- sglang/srt/mem_cache/base_prefix_cache.py +3 -0
- sglang/srt/mem_cache/chunk_cache.py +3 -1
- sglang/srt/mem_cache/hiradix_cache.py +4 -4
- sglang/srt/mem_cache/memory_pool.py +74 -52
- sglang/srt/mem_cache/multimodal_cache.py +45 -0
- sglang/srt/mem_cache/radix_cache.py +58 -5
- sglang/srt/metrics/collector.py +11 -2
- sglang/srt/mm_utils.py +10 -0
- sglang/srt/model_executor/cuda_graph_runner.py +87 -65
- sglang/srt/model_executor/expert_location_updater.py +557 -0
- sglang/srt/model_executor/forward_batch_info.py +39 -14
- sglang/srt/model_executor/model_runner.py +231 -101
- sglang/srt/model_loader/loader.py +10 -6
- sglang/srt/model_loader/utils.py +67 -1
- sglang/srt/models/clip.py +5 -1
- sglang/srt/models/deepseek_nextn.py +1 -1
- sglang/srt/models/deepseek_v2.py +732 -403
- sglang/srt/models/exaone.py +8 -3
- sglang/srt/models/gemma3_causal.py +7 -0
- sglang/srt/models/gemma3_mm.py +75 -33
- sglang/srt/models/idefics2.py +342 -0
- sglang/srt/models/kimi_vl.py +4 -4
- sglang/srt/models/llama.py +1 -1
- sglang/srt/models/llama4.py +10 -2
- sglang/srt/models/llava.py +26 -18
- sglang/srt/models/mimo_mtp.py +220 -0
- sglang/srt/models/minicpmo.py +7 -17
- sglang/srt/models/minicpmv.py +3 -295
- sglang/srt/models/mistral.py +71 -1
- sglang/srt/models/mllama.py +3 -3
- sglang/srt/models/phi4mm.py +512 -0
- sglang/srt/models/qwen2.py +133 -35
- sglang/srt/models/qwen2_5_vl.py +5 -3
- sglang/srt/models/qwen2_eagle.py +4 -1
- sglang/srt/models/qwen2_moe.py +206 -69
- sglang/srt/models/qwen2_vl.py +3 -3
- sglang/srt/models/qwen3.py +92 -19
- sglang/srt/models/qwen3_moe.py +457 -55
- sglang/srt/models/registry.py +9 -1
- sglang/srt/models/siglip.py +294 -0
- sglang/srt/models/transformers.py +291 -0
- sglang/srt/openai_api/adapter.py +114 -40
- sglang/srt/openai_api/protocol.py +37 -2
- sglang/srt/openai_api/utils.py +172 -0
- sglang/srt/operations.py +189 -0
- sglang/srt/operations_strategy.py +207 -0
- sglang/srt/sampling/sampling_batch_info.py +13 -1
- sglang/srt/sampling/sampling_params.py +2 -1
- sglang/srt/server_args.py +235 -38
- sglang/srt/speculative/build_eagle_tree.py +8 -8
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +8 -11
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +253 -0
- sglang/srt/speculative/eagle_utils.py +181 -90
- sglang/srt/speculative/eagle_worker.py +146 -21
- sglang/srt/two_batch_overlap.py +635 -0
- sglang/srt/utils.py +197 -19
- sglang/test/runners.py +16 -7
- sglang/test/send_one.py +4 -0
- sglang/test/test_cutlass_moe.py +278 -0
- sglang/test/test_fp4_moe.py +248 -0
- sglang/test/test_utils.py +81 -42
- sglang/utils.py +2 -2
- sglang/version.py +1 -1
- {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/METADATA +31 -19
- sglang-0.4.7.dist-info/RECORD +699 -0
- {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/WHEEL +1 -1
- sglang/srt/function_call_parser.py +0 -858
- sglang/srt/platforms/interface.py +0 -371
- sglang-0.4.6.post4.dist-info/RECORD +0 -646
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H200.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_L40S.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=96,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/layers/moe/fused_moe_triton/configs/{E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
- /sglang/srt/models/{xiaomi_mimo.py → mimo.py} +0 -0
- {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/top_level.txt +0 -0
sglang/srt/layers/quantization/modelopt_quant.py:

@@ -1,12 +1,17 @@
 # Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/quantization/modelopt.py
 
 import logging
-from typing import Any, Dict, List, Optional
+from typing import Any, Callable, Dict, List, Optional
 
 import torch
 from torch.nn.parameter import Parameter
 
-from sglang.srt.layers.linear import
+from sglang.srt.layers.linear import (
+    LinearBase,
+    LinearMethodBase,
+    UnquantizedLinearMethod,
+)
+from sglang.srt.layers.moe.cutlass_moe_params import CutlassMoEParams, CutlassMoEType
 from sglang.srt.layers.parameter import ModelWeightParameter, PerTensorScaleParameter
 from sglang.srt.layers.quantization.base_config import (
     QuantizationConfig,
@@ -15,10 +20,12 @@ from sglang.srt.layers.quantization.base_config import (
 from sglang.srt.layers.quantization.fp8_utils import (
     apply_fp8_linear,
     cutlass_fp8_supported,
+    is_sm100_supported,
 )
 from sglang.srt.layers.quantization.kv_cache import BaseKVCacheMethod
 from sglang.srt.layers.quantization.utils import (
     convert_to_channelwise,
+    is_layer_skipped,
     requantize_with_max_scale,
 )
 from sglang.srt.layers.radix_attention import RadixAttention
@@ -270,9 +277,16 @@ class ModelOptFp4Config(QuantizationConfig):
         )
         is_checkpoint_nvfp4_serialized = "NVFP4" in quant_method
         kv_cache_quant_algo = quant_config["kv_cache_quant_algo"]
+        if not kv_cache_quant_algo:
+            kv_cache_quant_algo = "auto"
         group_size = quant_config["group_size"]
         exclude_modules = quant_config["exclude_modules"]
         if not (group_size and kv_cache_quant_algo and exclude_modules):
+            logger.warning(
+                f"group_size: {group_size},"
+                f"kv_cache_quant_algo: {kv_cache_quant_algo},"
+                f"exclude_modules: {exclude_modules}"
+            )
             raise ValueError(
                 "NVFP4 quantization requires group size and "
                 "kv_cache_quant_algo specified in "
@@ -285,19 +299,30 @@ class ModelOptFp4Config(QuantizationConfig):
             exclude_modules,
         )
 
+    def is_layer_excluded(self, prefix: str, exclude_modules: list):
+        import regex as re
+
+        for pattern in exclude_modules:
+            regex_str = pattern.replace(".", r"\.").replace("*", r".*")
+            if re.fullmatch(regex_str, prefix):
+                return True
+        return False
+
     def get_quant_method(
         self, layer: torch.nn.Module, prefix: str
     ) -> Optional["QuantizeMethodBase"]:
-
-            module in prefix for module in self.exclude_modules
-        ):
-            return None
+        from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
 
         if isinstance(layer, LinearBase):
+            if is_layer_skipped(prefix, self.exclude_modules) or self.is_layer_excluded(
+                prefix, self.exclude_modules
+            ):
+                return UnquantizedLinearMethod()
             return ModelOptFp4LinearMethod(self)
         if self.kv_cache_quant_algo and isinstance(layer, RadixAttention):
             return ModelOptFp8KVCacheMethod(self)
-
+        elif isinstance(layer, FusedMoE):
+            return ModelOptNvFp4FusedMoEMethod(self)
         return None
 
     def get_scaled_act_names(self) -> List[str]:
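
The new is_layer_excluded helper above treats entries in exclude_modules as glob-style patterns rather than plain substrings, which is the behavioral change compared with the removed `module in prefix` check. A minimal sketch of that matching behavior (illustration only, not part of the diff; it uses the stdlib re module instead of the third-party regex package the diff imports, and the prefixes and patterns are made-up examples):

```python
import re

def matches_exclude(prefix: str, exclude_modules: list) -> bool:
    # Convert a glob-style pattern ("model.layers.*.mlp") into a regex and
    # require a full match against the layer prefix, as the diff does.
    for pattern in exclude_modules:
        regex_str = pattern.replace(".", r"\.").replace("*", r".*")
        if re.fullmatch(regex_str, prefix):
            return True
    return False

print(matches_exclude("model.layers.3.mlp", ["model.layers.*.mlp"]))        # True
print(matches_exclude("model.layers.3.self_attn", ["model.layers.*.mlp"]))  # False
```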
@@ -461,3 +486,305 @@ class ModelOptFp4LinearMethod(LinearMethodBase):
         if bias is not None:
             out = out + bias
         return out.view(*output_shape)
+
+
+class ModelOptNvFp4FusedMoEMethod:
+    """
+    MoE Method for FP4 Quantization with Blockscales and PerTensorScales
+    Args:
+        quant_config: NVFP4 Quant Config
+    """
+
+    def __new__(cls, *args, **kwargs):
+        from sglang.srt.layers.moe.fused_moe_triton import FusedMoEMethodBase
+
+        if not hasattr(cls, "_initialized"):
+            original_init = cls.__init__
+            new_cls = type(
+                cls.__name__,
+                (FusedMoEMethodBase,),
+                {
+                    "__init__": original_init,
+                    **{k: v for k, v in cls.__dict__.items() if k != "__dict__"},
+                },
+            )
+            obj = super(new_cls, new_cls).__new__(new_cls)
+            obj.__init__(*args, **kwargs)
+            return obj
+        return super().__new__(cls)
+
+    def __init__(self, quant_config: ModelOptFp4Config):
+        self.quant_config = quant_config
+        if not is_sm100_supported():
+            raise ValueError(
+                "Current platform does not support NVFP4"
+                " quantization. Please use Blackwell and"
+                " above."
+            )
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        num_experts: int,
+        hidden_size: int,
+        intermediate_size_per_partition: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
+        if not self.quant_config.is_checkpoint_nvfp4_serialized:
+            raise ValueError(
+                "NVFP4 quantization was selected, "
+                " dynamic quantization is not supported."
+            )
+
+        layer.num_experts = num_experts
+        layer.params_dtype = params_dtype
+        layer.quant_config = self.quant_config
+        weight_dtype = torch.uint8
+        weight_scale_dtype = torch.float8_e4m3fn
+        weight_loader = extra_weight_attrs.get("weight_loader")
+        # GEMM 1
+        w13_weight = ModelWeightParameter(
+            data=torch.empty(
+                num_experts,
+                2 * intermediate_size_per_partition,
+                # 2 fp4 items are packed in the input dimension
+                hidden_size // 2,
+                dtype=weight_dtype,
+            ),
+            input_dim=1,
+            output_dim=2,
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("w13_weight", w13_weight)
+
+        # GEMM 2
+        w2_weight = ModelWeightParameter(
+            data=torch.empty(
+                num_experts,
+                hidden_size,
+                # 2 fp4 items are packed in the input dimension
+                intermediate_size_per_partition // 2,
+                dtype=weight_dtype,
+            ),
+            input_dim=1,
+            output_dim=2,
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("w2_weight", w2_weight)
+
+        w13_weight_scale = ModelWeightParameter(
+            data=torch.empty(
+                num_experts,
+                2 * intermediate_size_per_partition,
+                # 2 fp4 items are packed in the input dimension
+                hidden_size // self.quant_config.group_size,
+                dtype=weight_scale_dtype,
+            ),
+            input_dim=1,
+            output_dim=2,
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("w13_weight_scale", w13_weight_scale)
+
+        w2_weight_scale = ModelWeightParameter(
+            data=torch.empty(
+                num_experts,
+                hidden_size,
+                # 2 fp4 items are packed in the input dimension
+                intermediate_size_per_partition // self.quant_config.group_size,
+                dtype=weight_scale_dtype,
+            ),
+            input_dim=1,
+            output_dim=2,
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("w2_weight_scale", w2_weight_scale)
+
+        from sglang.srt.layers.moe.fused_moe_triton import FusedMoeWeightScaleSupported
+
+        extra_weight_attrs.update(
+            {"quant_method": FusedMoeWeightScaleSupported.BLOCK.value}
+        )
+
+        w13_weight_scale_2 = PerTensorScaleParameter(
+            data=torch.empty(num_experts, 2, dtype=torch.float32),
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("w13_weight_scale_2", w13_weight_scale_2)
+
+        w2_weight_scale_2 = PerTensorScaleParameter(
+            data=torch.empty(num_experts, dtype=torch.float32),
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("w2_weight_scale_2", w2_weight_scale_2)
+
+        extra_weight_attrs.update(
+            {"quant_method": FusedMoeWeightScaleSupported.TENSOR.value}
+        )
+
+        w13_input_scale = PerTensorScaleParameter(
+            data=torch.empty(num_experts, 2, dtype=torch.float32),
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("w13_input_scale", w13_input_scale)
+
+        w2_input_scale = PerTensorScaleParameter(
+            data=torch.empty(num_experts, dtype=torch.float32),
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("w2_input_scale", w2_input_scale)
+
+    def swizzle_blockscale(self, scale: torch.tensor):
+        assert scale.dtype == torch.float8_e4m3fn
+        # Pad and blockwise interleave weight_scale
+        scale_ndim = scale.ndim
+        if scale.ndim == 2:
+            scale = scale.unsqueeze(0)
+        assert scale.ndim == 3
+        B, M, K = scale.shape
+        round_up_multiple = lambda x, m: (x + m - 1) // m * m
+        M_padded = round_up_multiple(M, 128)
+        K_padded = round_up_multiple(K, 4)
+        padded_scale = torch.zeros((B, M_padded, K_padded), dtype=scale.dtype)
+        padded_scale[:B, :M, :K] = scale
+        batches, rows, cols = padded_scale.shape
+        assert rows % 128 == 0
+        assert cols % 4 == 0
+        padded_scale = padded_scale.reshape(batches, rows // 128, 4, 32, cols // 4, 4)
+        swizzled_scale = padded_scale.permute((0, 1, 4, 3, 2, 5))
+        swizzled_scale = swizzled_scale.contiguous().cuda()
+        return (
+            swizzled_scale.reshape(M, K)
+            if scale_ndim == 2
+            else swizzled_scale.reshape(B, M, K)
+        )
+
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+
+        # GEMM 1
+        if not torch.allclose(
+            layer.w13_weight_scale_2[:, 0], layer.w13_weight_scale_2[:, 1]
+        ):
+            logger.warning_once(
+                "w1_weight_scale_2 must match w3_weight_scale_2. "
+                "Accuracy may be affected."
+            )
+
+        w13_weight_scale_2 = layer.w13_weight_scale_2[:, 0]
+        layer.w13_weight_scale_2 = Parameter(w13_weight_scale_2, requires_grad=False)
+
+        w13_input_scale = layer.w13_input_scale.max(dim=1).values.to(torch.float32)
+        layer.g1_alphas = Parameter(
+            (w13_input_scale * w13_weight_scale_2).to(torch.float32),
+            requires_grad=False,
+        )
+
+        assert (
+            layer.w13_weight_scale.shape[2] % 16 == 0
+        ), "Expected weight_scale.dim(1) to be divisible by 16"
+        assert (
+            layer.w13_weight_scale.dtype == torch.float8_e4m3fn
+        ), "Weight Blockscale must be represented as FP8-E4M3"
+        w13_blockscale_swizzled = self.swizzle_blockscale(layer.w13_weight_scale)
+
+        layer.w13_blockscale_swizzled = Parameter(
+            w13_blockscale_swizzled, requires_grad=False
+        )
+
+        # This is for quantization, so we need to invert it.
+        layer.w13_input_scale_quant = Parameter(
+            (1 / w13_input_scale).to(torch.float32), requires_grad=False
+        )
+
+        layer.w13_weight = Parameter(layer.w13_weight.data, requires_grad=False)
+
+        # GEMM 2
+        layer.g2_alphas = Parameter(
+            (layer.w2_input_scale * layer.w2_weight_scale_2).to(torch.float32),
+            requires_grad=False,
+        )
+
+        # This is for quantization, so we need to invert it.
+        layer.w2_input_scale_quant = Parameter(
+            (1 / layer.w2_input_scale).to(torch.float32), requires_grad=False
+        )
+
+        assert (
+            layer.w2_weight_scale.shape[2] % 16 == 0
+        ), "Expected weight_scale.dim(1) to be divisible by 16"
+        assert (
+            layer.w2_weight_scale.dtype == torch.float8_e4m3fn
+        ), "Weight Blockscale must be represented as FP8-E4M3"
+        w2_blockscale_swizzled = self.swizzle_blockscale(layer.w2_weight_scale)
+
+        layer.w2_blockscale_swizzled = Parameter(
+            w2_blockscale_swizzled, requires_grad=False
+        )
+        layer.w2_weight = Parameter(layer.w2_weight.data, requires_grad=False)
+
+        device = layer.w13_weight.device
+        layer.cutlass_moe_params = CutlassMoEParams(
+            CutlassMoEType.BlockscaledFP4,
+            device,
+            num_experts=layer.num_experts,
+            intermediate_size_per_partition=layer.w2_weight.shape[2] * 2,  # n
+            hidden_size=layer.w13_weight.shape[2] * 2,
+        )  # k
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        router_logits: torch.Tensor,
+        top_k: int,
+        renormalize: bool,
+        use_grouped_topk: bool,
+        topk_group: Optional[int] = None,
+        num_expert_group: Optional[int] = None,
+        num_fused_shared_experts: Optional[int] = None,
+        custom_routing_function: Optional[Callable] = None,
+        correction_bias: Optional[torch.Tensor] = None,
+        activation: str = "silu",
+        apply_router_weight_on_input: bool = False,
+        inplace: bool = True,
+        no_combine: bool = False,
+        routed_scaling_factor: Optional[float] = None,
+    ) -> torch.Tensor:
+
+        assert activation == "silu", "Only SiLU activation is supported."
+
+        from sglang.srt.layers.moe.fused_moe_triton.fused_moe import fused_experts
+        from sglang.srt.layers.moe.topk import select_experts
+
+        topk_weights, topk_ids = select_experts(
+            hidden_states=x,
+            router_logits=router_logits,
+            use_grouped_topk=use_grouped_topk,
+            top_k=top_k,
+            renormalize=renormalize,
+            topk_group=topk_group,
+            num_expert_group=num_expert_group,
+            num_fused_shared_experts=num_fused_shared_experts,
+            custom_routing_function=custom_routing_function,
+            correction_bias=correction_bias,
+            routed_scaling_factor=routed_scaling_factor,
+        )
+
+        from sglang.srt.layers.moe.cutlass_moe import cutlass_moe_fp4
+
+        return cutlass_moe_fp4(
+            a=x,
+            a1_gscale=layer.w13_input_scale_quant,
+            w1_fp4=layer.w13_weight,
+            w1_blockscale=layer.w13_blockscale_swizzled,
+            w1_alphas=layer.g1_alphas,
+            a2_gscale=layer.w2_input_scale_quant,
+            w2_fp4=layer.w2_weight,
+            w2_blockscale=layer.w2_blockscale_swizzled,
+            w2_alphas=layer.g2_alphas,
+            topk_weights=topk_weights,
+            topk_ids=topk_ids,
+            params=layer.cutlass_moe_params,
+            apply_router_weight_on_input=apply_router_weight_on_input,
+        ).to(x.dtype)
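
The swizzle_blockscale helper in the new ModelOptNvFp4FusedMoEMethod pads the FP8 block-scale tensor so its row count is a multiple of 128 and its column count a multiple of 4, then interleaves 128x4 tiles with a reshape/permute before the scales feed the CUTLASS FP4 MoE path. A CPU-only shape check of that transformation (a sketch; it uses float32 and toy sizes purely to illustrate the layout, whereas the real code asserts float8_e4m3fn and moves the result to CUDA):

```python
import torch

scale = torch.zeros(1, 130, 6)  # toy (B, M, K) block-scale tensor

round_up = lambda x, m: (x + m - 1) // m * m
M_pad, K_pad = round_up(130, 128), round_up(6, 4)  # 256, 8
padded = torch.zeros(1, M_pad, K_pad)
padded[:, :130, :6] = scale

B, rows, cols = padded.shape
tiled = padded.reshape(B, rows // 128, 4, 32, cols // 4, 4)  # carve out 128x4 tiles
swizzled = tiled.permute(0, 1, 4, 3, 2, 5).contiguous()      # interleave the tile dims
print(swizzled.shape)  # torch.Size([1, 2, 2, 32, 4, 4]) -- same element count as (B, M_pad, K_pad)
```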
sglang/srt/layers/quantization/moe_wna16.py:

@@ -341,6 +341,7 @@ class MoeWNA16Method:
         use_grouped_topk: bool = False,
         topk_group: Optional[int] = None,
         num_expert_group: Optional[int] = None,
+        num_fused_shared_experts: int = 0,
         custom_routing_function: Optional[Callable] = None,
         correction_bias: Optional[torch.Tensor] = None,
         activation: str = "silu",
@@ -362,6 +363,7 @@ class MoeWNA16Method:
             renormalize=renormalize,
             topk_group=topk_group,
             num_expert_group=num_expert_group,
+            num_fused_shared_experts=num_fused_shared_experts,
             custom_routing_function=custom_routing_function,
             correction_bias=correction_bias,
             routed_scaling_factor=routed_scaling_factor,
@@ -386,6 +388,7 @@ class MoeWNA16Method:
             w2_zp=layer.w2_qzeros if has_zp else None,
             block_shape=[0, layer.group_size],
             no_combine=no_combine,
+            routed_scaling_factor=routed_scaling_factor,
         )
 
     @staticmethod
sglang/srt/layers/quantization/qoq.py (new file):

@@ -0,0 +1,244 @@
+from typing import Any, Callable, Dict, List, Optional
+
+import torch
+from torch.nn.parameter import Parameter
+
+from sglang.srt.distributed import get_tensor_model_parallel_world_size
+from sglang.srt.layers.linear import LinearMethodBase
+from sglang.srt.layers.parameter import (
+    ChannelQuantScaleParameter,
+    GroupQuantScaleParameter,
+    ModelWeightParameter,
+)
+from sglang.srt.layers.quantization.base_config import (
+    QuantizationConfig,
+    QuantizeMethodBase,
+)
+from sglang.srt.layers.quantization.int8_kernel import per_token_quant_int8
+from sglang.srt.utils import is_cuda
+
+_is_cuda = is_cuda()
+if _is_cuda:
+    from sgl_kernel import qserve_w4a8_per_chn_gemm, qserve_w4a8_per_group_gemm
+
+
+QoQ_SUPPORTED_WEIGHT_BITS = [4]
+QoQ_SUPPORTED_GROUP_SIZES = [-1, 128]
+
+
+class QoQConfig(QuantizationConfig):
+    """Config class for QoQ Quantization.
+
+    - Weight: static, per-channel/group, asymmetric
+    - Activation: dynamic, per-token, symmetric
+
+    Reference: https://arxiv.org/abs/2405.04532
+        https://github.com/mit-han-lab/omniserve
+    """
+
+    def __init__(self, weight_bits: int, group_size: int) -> None:
+        self.weight_bits = weight_bits
+        self.group_size = group_size
+
+        # Verify
+        if self.weight_bits not in QoQ_SUPPORTED_WEIGHT_BITS:
+            raise ValueError(
+                f"QoQ does not support weight_bits = {self.weight_bits}. "
+                f"Only weight_bits = {QoQ_SUPPORTED_WEIGHT_BITS} "
+                "are supported."
+            )
+        if self.group_size not in QoQ_SUPPORTED_GROUP_SIZES:
+            raise ValueError(
+                f"QoQ does not support group_size = {self.group_size}. "
+                f"Only group_sizes = {QoQ_SUPPORTED_GROUP_SIZES} "
+                "are supported."
+            )
+
+        # 4 bits packed into 8 bit datatype.
+        self.pack_factor = 8 // self.weight_bits
+
+    def __repr__(self) -> str:
+        return "QoQConfig(weight_bits={}, group_size={})".format(
+            self.weight_bits, self.group_size
+        )
+
+    @classmethod
+    def get_supported_act_dtypes(cls) -> List[torch.dtype]:
+        return [torch.float16]
+
+    @classmethod
+    def get_min_capability(cls) -> int:
+        return 80
+
+    @classmethod
+    def get_name(self) -> str:
+        return "qoq"
+
+    @classmethod
+    def get_config_filenames(cls) -> List[str]:
+        """List of filenames to search for in the model directory."""
+        return [
+            "quant_config.json",
+            "quantize_config.json",
+        ]
+
+    @classmethod
+    def from_config(cls, config: Dict[str, Any]) -> "QoQConfig":
+        weight_bits = cls.get_from_keys(config, ["wbits"])
+        group_size = cls.get_from_keys(config, ["group_size"])
+        return cls(weight_bits, group_size)
+
+    def get_quant_method(
+        self,
+        layer: torch.nn.Module,
+        prefix: str,
+    ) -> Optional["QuantizeMethodBase"]:
+        from sglang.srt.layers.linear import LinearBase
+
+        if isinstance(layer, LinearBase):
+            return QoQLinearMethod(self)
+        return None
+
+    def get_scaled_act_names(self) -> List[str]:
+        return []
+
+
+class QoQLinearMethod(LinearMethodBase):
+    """Linear method for QoQ.
+
+    Args:
+        quant_config: The QoQ quantization config.
+    """
+
+    def __init__(self, quant_config: QoQConfig):
+        self.quant_config = quant_config
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        input_size_per_partition: int,
+        output_partition_sizes: List[int],
+        input_size: int,
+        output_size: int,
+        params_dtype: torch.dtype,
params_dtype: torch.dtype,
|
124
|
+
**extra_weight_attrs,
|
125
|
+
):
|
126
|
+
|
127
|
+
weight_loader = extra_weight_attrs.get("weight_loader")
|
128
|
+
|
129
|
+
# Validate output_size_per_partition
|
130
|
+
output_size_per_partition = sum(output_partition_sizes)
|
131
|
+
if output_size_per_partition % 32 != 0:
|
132
|
+
raise ValueError(
|
133
|
+
f"Weight output_size_per_partition = "
|
134
|
+
f"{output_size_per_partition} is not divisible by 32."
|
135
|
+
)
|
136
|
+
|
137
|
+
# Validate input_size_per_partition
|
138
|
+
if input_size_per_partition % self.quant_config.pack_factor != 0:
|
139
|
+
raise ValueError(
|
140
|
+
f"Weight input_size_per_partition = "
|
141
|
+
f"{input_size_per_partition} is not divisible by "
|
142
|
+
f"pack_factor = {self.quant_config.pack_factor}."
|
143
|
+
)
|
144
|
+
if (
|
145
|
+
self.quant_config.group_size != -1
|
146
|
+
and input_size_per_partition % self.quant_config.group_size != 0
|
147
|
+
):
|
148
|
+
raise ValueError(
|
149
|
+
f"Weight input_size_per_partition = "
|
150
|
+
f"{input_size_per_partition} is not divisible by "
|
151
|
+
f"group_size = {self.quant_config.group_size}."
|
152
|
+
)
|
153
|
+
|
154
|
+
qweight = ModelWeightParameter(
|
155
|
+
data=torch.empty(
|
156
|
+
output_size_per_partition,
|
157
|
+
input_size_per_partition // self.quant_config.pack_factor,
|
158
|
+
dtype=torch.int8,
|
159
|
+
),
|
160
|
+
input_dim=1,
|
161
|
+
output_dim=0,
|
162
|
+
weight_loader=weight_loader,
|
163
|
+
)
|
164
|
+
layer.register_parameter("qweight", qweight)
|
165
|
+
|
166
|
+
s1_scales = ChannelQuantScaleParameter(
|
167
|
+
data=torch.empty(output_size_per_partition, dtype=torch.float16),
|
168
|
+
output_dim=0,
|
169
|
+
weight_loader=weight_loader,
|
170
|
+
)
|
171
|
+
layer.register_parameter("s1_scales", s1_scales)
|
172
|
+
|
173
|
+
if self.quant_config.group_size == -1:
|
174
|
+
s1_szeros = ChannelQuantScaleParameter(
|
175
|
+
data=torch.empty(output_size_per_partition, dtype=torch.float16),
|
176
|
+
output_dim=0,
|
177
|
+
weight_loader=weight_loader,
|
178
|
+
)
|
179
|
+
layer.register_parameter("s1_szeros", s1_szeros)
|
180
|
+
else:
|
181
|
+
s2_scales = GroupQuantScaleParameter(
|
182
|
+
data=torch.empty(
|
183
|
+
(
|
184
|
+
input_size_per_partition // self.quant_config.group_size,
|
185
|
+
output_size_per_partition,
|
186
|
+
),
|
187
|
+
dtype=torch.int8,
|
188
|
+
),
|
189
|
+
input_dim=0,
|
190
|
+
output_dim=1,
|
191
|
+
weight_loader=weight_loader,
|
192
|
+
)
|
193
|
+
layer.register_parameter("s2_scales", s2_scales)
|
194
|
+
|
195
|
+
s2_zeros = GroupQuantScaleParameter(
|
196
|
+
data=torch.empty(
|
197
|
+
(
|
198
|
+
input_size_per_partition // self.quant_config.group_size,
|
199
|
+
output_size_per_partition,
|
200
|
+
),
|
201
|
+
dtype=torch.int8,
|
202
|
+
),
|
203
|
+
input_dim=0,
|
204
|
+
output_dim=1,
|
205
|
+
weight_loader=weight_loader,
|
206
|
+
)
|
207
|
+
layer.register_parameter("s2_zeros", s2_zeros)
|
208
|
+
|
209
|
+
def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
|
210
|
+
layer.qweight = Parameter(layer.qweight.data, requires_grad=False)
|
211
|
+
layer.s1_scales = Parameter(layer.s1_scales.data, requires_grad=False)
|
212
|
+
if self.quant_config.group_size == -1:
|
213
|
+
layer.s1_szeros = Parameter(layer.s1_szeros.data, requires_grad=False)
|
214
|
+
else:
|
215
|
+
layer.s2_scales = Parameter(layer.s2_scales.data, requires_grad=False)
|
216
|
+
layer.s2_zeros = Parameter(layer.s2_zeros.data, requires_grad=False)
|
217
|
+
|
218
|
+
def apply(
|
219
|
+
self,
|
220
|
+
layer: torch.nn.Module,
|
221
|
+
x: torch.Tensor,
|
222
|
+
bias: Optional[torch.Tensor] = None,
|
223
|
+
):
|
224
|
+
assert x.dtype == torch.float16, "QoQ only supports float16 input now"
|
225
|
+
if self.quant_config.group_size == -1:
|
226
|
+
x_q, x_scale, x_sum = per_token_quant_int8(
|
227
|
+
x, scale_dtype=x.dtype, cal_sum=True
|
228
|
+
)
|
229
|
+
out = qserve_w4a8_per_chn_gemm(
|
230
|
+
x_q, layer.qweight, layer.s1_scales, x_scale, layer.s1_szeros, x_sum
|
231
|
+
)
|
232
|
+
else:
|
233
|
+
x_q, x_scale = per_token_quant_int8(x, scale_dtype=x.dtype)
|
234
|
+
out = qserve_w4a8_per_group_gemm(
|
235
|
+
x_q,
|
236
|
+
layer.qweight,
|
237
|
+
layer.s2_zeros,
|
238
|
+
layer.s2_scales,
|
239
|
+
layer.s1_scales,
|
240
|
+
x_scale,
|
241
|
+
)
|
242
|
+
if bias is not None:
|
243
|
+
out = out + bias
|
244
|
+
return out
|
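The new QoQ linear method stores two 4-bit weights per int8 byte, per-output-channel fp16 scales (plus scaled zeros when group_size == -1), and quantizes activations per token to int8 before calling the sgl_kernel GEMMs. The following pure-PyTorch reference of the per-channel path is a sketch only: the nibble order, the unsigned 4-bit range, the `scale * q - szeros` dequant convention, and all helper names are assumptions, and the real qserve_w4a8_per_chn_gemm kernel performs the rescaling in fused integer arithmetic using the per-token sums.

```python
import torch

def unpack_int4(qweight):
    # qweight: [out, in // 2] int8, assumed to hold two unsigned 4-bit values per byte
    lo = qweight & 0x0F
    hi = (qweight >> 4) & 0x0F
    return torch.stack((lo, hi), dim=-1).reshape(qweight.shape[0], -1).float()

def per_token_quant_int8_ref(x):
    # Dynamic, symmetric per-token int8 quantization (reference only).
    scale = x.abs().amax(dim=-1, keepdim=True).clamp(min=1e-8) / 127.0
    return torch.clamp(torch.round(x / scale), -128, 127), scale

def qoq_per_chn_gemm_ref(x, qweight, s1_scales, s1_szeros):
    w = unpack_int4(qweight)                              # [out, in], values in [0, 15]
    w = w * s1_scales[:, None] - s1_szeros[:, None]       # per-channel dequant (assumed layout)
    x_q, x_scale = per_token_quant_int8_ref(x)
    return (x_q * x_scale) @ w.t()                        # [tokens, out]

x = torch.randn(4, 64)
qweight = torch.randint(-128, 128, (32, 32), dtype=torch.int8)   # out=32, in=64 packed
s1_scales = torch.rand(32) * 0.01
s1_szeros = torch.rand(32) * 0.05
print(qoq_per_chn_gemm_ref(x, qweight, s1_scales, s1_szeros).shape)  # torch.Size([4, 32])
```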
@@ -287,6 +287,7 @@ class W8A8FP8MoEMethod:
         use_grouped_topk: bool,
         topk_group: Optional[int] = None,
         num_expert_group: Optional[int] = None,
+        num_fused_shared_experts: int = 0,
         custom_routing_function: Optional[Callable] = None,
         correction_bias: Optional[torch.Tensor] = None,
         activation: str = "silu",
@@ -306,6 +307,7 @@ class W8A8FP8MoEMethod:
             renormalize=renormalize,
             topk_group=topk_group,
             num_expert_group=num_expert_group,
+            num_fused_shared_experts=num_fused_shared_experts,
             custom_routing_function=custom_routing_function,
             correction_bias=correction_bias,
             routed_scaling_factor=routed_scaling_factor,
@@ -326,4 +328,5 @@ class W8A8FP8MoEMethod:
             a1_scale=layer.w13_input_scale,
             a2_scale=layer.w2_input_scale,
             no_combine=no_combine,
+            routed_scaling_factor=routed_scaling_factor,
         )
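In the W8A8FP8MoEMethod hunks, the surrounding context (a1_scale=layer.w13_input_scale, a2_scale=layer.w2_input_scale) shows the activation scales that accompany the FP8 weights. As a rough, hedged illustration of FP8 e4m3 activation quantization with an explicit scale: the per-token granularity and the helper name are assumptions for this sketch, it needs a PyTorch build with float8 dtypes, and the real path applies scales inside fused kernels.

```python
import torch

FP8_E4M3_MAX = 448.0  # largest finite value of torch.float8_e4m3fn

def per_token_quant_fp8_ref(x: torch.Tensor):
    # Hypothetical reference helper: per-token dynamic scale into the e4m3 range.
    scale = x.abs().amax(dim=-1, keepdim=True).clamp(min=1e-8) / FP8_E4M3_MAX
    x_q = (x / scale).clamp(-FP8_E4M3_MAX, FP8_E4M3_MAX).to(torch.float8_e4m3fn)
    return x_q, scale  # keep the scale for the GEMM epilogue

x = torch.randn(4, 64)
x_q, x_scale = per_token_quant_fp8_ref(x)
print(x_q.dtype, x_scale.shape)  # torch.float8_e4m3fn torch.Size([4, 1])
```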
@@ -225,6 +225,7 @@ class W8A8Int8MoEMethod:
         use_grouped_topk: bool,
         topk_group: Optional[int] = None,
         num_expert_group: Optional[int] = None,
+        num_fused_shared_experts: int = 0,
         custom_routing_function: Optional[Callable] = None,
         correction_bias: Optional[torch.Tensor] = None,
         activation: str = "silu",
@@ -245,6 +246,7 @@ class W8A8Int8MoEMethod:
             renormalize=renormalize,
             topk_group=topk_group,
             num_expert_group=num_expert_group,
+            num_fused_shared_experts=num_fused_shared_experts,
             custom_routing_function=custom_routing_function,
             correction_bias=correction_bias,
             routed_scaling_factor=routed_scaling_factor,
@@ -266,4 +268,5 @@ class W8A8Int8MoEMethod:
             a1_scale=layer.w13_input_scale,
             a2_scale=layer.w2_input_scale,
             no_combine=no_combine,
+            routed_scaling_factor=routed_scaling_factor,
         )
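The W8A8Int8MoEMethod changes mirror the FP8 ones, and both rest on the same dequant algebra: per-token activation scales and per-channel weight scales factor out of the integer GEMM, so the accumulator is rescaled once at the end. Below is a generic sketch of that rescaling; it is not sglang's kernel, the helper name is made up, and float accumulation stands in for the int32 accumulation the fused kernels use.

```python
import torch

def w8a8_int8_linear_ref(x_q, x_scale, w_q, w_scale):
    # x_q: [tokens, in] int8, x_scale: [tokens, 1] fp32 (per-token)
    # w_q: [out, in] int8,    w_scale: [out] fp32 (per-output-channel)
    acc = x_q.float() @ w_q.float().t()       # real kernels accumulate in int32
    return acc * x_scale * w_scale[None, :]   # rescale once in the epilogue

x_q = torch.randint(-128, 128, (4, 64), dtype=torch.int8)
x_scale = torch.rand(4, 1)
w_q = torch.randint(-128, 128, (32, 64), dtype=torch.int8)
w_scale = torch.rand(32) * 0.01
print(w8a8_int8_linear_ref(x_q, x_scale, w_q, w_scale).shape)  # torch.Size([4, 32])
```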