sglang 0.4.9__py3-none-any.whl → 0.4.9.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_serving.py +2 -2
- sglang/srt/configs/model_config.py +12 -1
- sglang/srt/conversation.py +35 -1
- sglang/srt/disaggregation/mooncake/conn.py +35 -4
- sglang/srt/entrypoints/http_server_engine.py +1 -1
- sglang/srt/layers/communicator.py +3 -1
- sglang/srt/layers/flashinfer_comm_fusion.py +3 -3
- sglang/srt/layers/layernorm.py +2 -2
- sglang/srt/layers/moe/cutlass_w4a8_moe.py +215 -0
- sglang/srt/layers/moe/ep_moe/kernels.py +58 -0
- sglang/srt/layers/moe/ep_moe/layer.py +140 -2
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +2 -0
- sglang/srt/layers/moe/fused_moe_triton/layer.py +135 -58
- sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +176 -0
- sglang/srt/layers/quantization/__init__.py +2 -0
- sglang/srt/layers/quantization/fp8.py +28 -7
- sglang/srt/layers/quantization/modelopt_quant.py +244 -1
- sglang/srt/layers/quantization/w4afp8.py +264 -0
- sglang/srt/layers/vocab_parallel_embedding.py +9 -3
- sglang/srt/lora/triton_ops/gate_up_lora_b.py +30 -19
- sglang/srt/lora/triton_ops/qkv_lora_b.py +30 -19
- sglang/srt/lora/triton_ops/sgemm_lora_a.py +27 -11
- sglang/srt/lora/triton_ops/sgemm_lora_b.py +27 -15
- sglang/srt/managers/cache_controller.py +41 -195
- sglang/srt/managers/io_struct.py +8 -1
- sglang/srt/managers/mm_utils.py +4 -2
- sglang/srt/managers/schedule_batch.py +1 -1
- sglang/srt/managers/scheduler.py +17 -5
- sglang/srt/mem_cache/hiradix_cache.py +2 -0
- sglang/srt/mem_cache/memory_pool.py +113 -63
- sglang/srt/mem_cache/memory_pool_host.py +6 -109
- sglang/srt/mem_cache/radix_cache.py +8 -4
- sglang/srt/models/deepseek_v2.py +16 -2
- sglang/srt/models/mllama4.py +360 -79
- sglang/srt/multimodal/mm_utils.py +2 -2
- sglang/srt/multimodal/processors/mllama4.py +62 -60
- sglang/srt/server_args.py +15 -0
- sglang/srt/two_batch_overlap.py +3 -0
- sglang/srt/utils.py +37 -17
- sglang/test/test_cutlass_w4a8_moe.py +281 -0
- sglang/utils.py +5 -5
- sglang/version.py +1 -1
- {sglang-0.4.9.dist-info → sglang-0.4.9.post1.dist-info}/METADATA +4 -3
- {sglang-0.4.9.dist-info → sglang-0.4.9.post1.dist-info}/RECORD +47 -43
- {sglang-0.4.9.dist-info → sglang-0.4.9.post1.dist-info}/WHEEL +0 -0
- {sglang-0.4.9.dist-info → sglang-0.4.9.post1.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.9.dist-info → sglang-0.4.9.post1.dist-info}/top_level.txt +0 -0
sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py (+176 -0, new file)

```diff
@@ -0,0 +1,176 @@
+# Adapted from https://github.com/vllm-project/vllm/pull/18595/files#diff-f426a6de78c82ffec568eff6811bfbf0043dab5f87f1a8c0cffdbdcb8a81e035
+from typing import Optional
+
+import torch
+from sgl_kernel import gelu_and_mul, silu_and_mul
+from triton_kernels.matmul_ogs import matmul_ogs
+from triton_kernels.routing import GatherIndx, RoutingData, ScatterIndx, routing
+
+from sglang.srt.utils import direct_register_custom_op
+
+
+def triton_kernel_moe_forward(
+    hidden_states: torch.Tensor,
+    w1: torch.Tensor,
+    w2: torch.Tensor,
+    gating_output: torch.Tensor,
+    topk: int,
+    renormalize: bool,
+    inplace: bool = False,
+    activation: str = "silu",
+    apply_router_weight_on_input: bool = False,
+    use_fp8_w8a8: bool = False,
+    per_channel_quant: bool = False,
+    global_num_experts: int = -1,
+    expert_map: Optional[torch.Tensor] = None,
+    w1_scale: Optional[torch.Tensor] = None,
+    w2_scale: Optional[torch.Tensor] = None,
+    a1_scale: Optional[torch.Tensor] = None,
+    a2_scale: Optional[torch.Tensor] = None,
+    block_shape: Optional[list[int]] = None,
+) -> torch.Tensor:
+
+    if not renormalize:
+        gating_output = torch.softmax(gating_output, dim=-1)
+    routing_data, gather_idx, scatter_idx = routing(gating_output, topk, renormalize)
+
+    return triton_kernel_fused_experts(
+        hidden_states,
+        w1,
+        w2,
+        routing_data,
+        gather_idx,
+        scatter_idx,
+        inplace=inplace,
+        activation=activation,
+        apply_router_weight_on_input=apply_router_weight_on_input,
+        use_fp8_w8a8=use_fp8_w8a8,
+        per_channel_quant=per_channel_quant,
+        global_num_experts=global_num_experts,
+        expert_map=expert_map,
+        w1_scale=w1_scale,
+        w2_scale=w2_scale,
+        a1_scale=a1_scale,
+        a2_scale=a2_scale,
+        block_shape=block_shape,
+    )
+
+
+# This is a triton implementation of the fused_experts function
+def triton_kernel_fused_experts(
+    hidden_states: torch.Tensor,
+    w1: torch.Tensor,
+    w2: torch.Tensor,
+    routing_data: RoutingData,
+    gather_indx: GatherIndx,
+    scatter_indx: ScatterIndx,
+    inplace: bool = False,
+    activation: str = "silu",
+    apply_router_weight_on_input: bool = False,
+    use_fp8_w8a8: bool = False,
+    per_channel_quant: bool = False,
+    global_num_experts: int = -1,
+    expert_map: Optional[torch.Tensor] = None,
+    w1_scale: Optional[torch.Tensor] = None,
+    w2_scale: Optional[torch.Tensor] = None,
+    a1_scale: Optional[torch.Tensor] = None,
+    a2_scale: Optional[torch.Tensor] = None,
+    block_shape: Optional[list[int]] = None,
+) -> torch.Tensor:
+
+    assert use_fp8_w8a8 == False, "use_fp8_w8a8 is not supported"
+    assert per_channel_quant == False, "per_channel_quant is not supported"
+    assert expert_map == None, "expert_map is not supported"
+    assert w1_scale == None, "w1_scale is not supported"
+    assert w2_scale == None, "w2_scale is not supported"
+    assert a1_scale == None, "a1_scale is not supported"
+    assert a2_scale == None, "a2_scale is not supported"
+    assert block_shape == None, "block_shape is not supported"
+
+    # type check
+    assert hidden_states.dtype == torch.bfloat16, "hidden_states must be bfloat16"
+    assert w1.dtype == torch.bfloat16, "w1 must be bfloat16"
+    assert w2.dtype == torch.bfloat16, "w2 must be bfloat16"
+
+    # Shape check
+    assert hidden_states.ndim == 2, "hidden_states must be 2D"
+    assert (
+        hidden_states.shape[-1] == w1.shape[-2]
+    ), f"hidden_states shape[-1] {hidden_states.shape} must be equal to w1 shape[-2] {w1.shape}"
+    assert (
+        w2.shape[-1] == w1.shape[1]
+    ), f"w2 shape[-1] {w2.shape[-1]} must be equal to w1 shape[1] {w1.shape[1]}"
+
+    # feature check
+    assert inplace == False, "Inplace is not supported in new triton MoE kernel"
+
+    M, K = hidden_states.shape
+    E, _, N = w1.shape
+    n_expts_act = routing_data.n_expts_act
+    dtype = hidden_states.dtype
+
+    if global_num_experts == -1:
+        global_num_experts = E
+
+    # consistent with default implementation
+    intermediate_cache2 = torch.empty(
+        (M * n_expts_act, N // 2), device="cuda", dtype=dtype
+    )
+
+    intermediate_cache1 = matmul_ogs(
+        hidden_states,
+        w1,
+        None,
+        routing_data,
+        gather_indx=gather_indx,
+        gammas=routing_data.gate_scal if apply_router_weight_on_input else None,
+    )
+
+    if activation == "silu":
+        silu_and_mul(intermediate_cache1.view(-1, N), intermediate_cache2)
+    elif activation == "gelu":
+        gelu_and_mul(intermediate_cache1.view(-1, N), intermediate_cache2)
+    else:
+        raise ValueError(f"Unsupported FusedMoe activation: {activation}")
+
+    intermediate_cache3 = matmul_ogs(
+        intermediate_cache2,
+        w2,
+        None,
+        routing_data,
+        scatter_indx=scatter_indx,
+        gammas=None if apply_router_weight_on_input else routing_data.gate_scal,
+    )
+
+    return intermediate_cache3
+
+
+def triton_kernel_moe_forward_fake(
+    hidden_states: torch.Tensor,
+    w1: torch.Tensor,
+    w2: torch.Tensor,
+    gating_output: torch.Tensor,
+    topk: int,
+    renormalize: bool,
+    inplace: bool = False,
+    activation: str = "silu",
+    apply_router_weight_on_input: bool = False,
+    use_fp8_w8a8: bool = False,
+    per_channel_quant: bool = False,
+    global_num_experts: int = -1,
+    expert_map: Optional[torch.Tensor] = None,
+    w1_scale: Optional[torch.Tensor] = None,
+    w2_scale: Optional[torch.Tensor] = None,
+    a1_scale: Optional[torch.Tensor] = None,
+    a2_scale: Optional[torch.Tensor] = None,
+    block_shape: Optional[list[int]] = None,
+) -> torch.Tensor:
+    return torch.empty_like(hidden_states)
+
+
+direct_register_custom_op(
+    op_name="forward_cuda_triton",
+    op_func=triton_kernel_moe_forward,
+    mutates_args=[],
+    fake_impl=triton_kernel_moe_forward_fake,
+)
```
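For orientation, here is a minimal smoke-test sketch of the new entry point. It is not part of the diff: the tensor shapes and dtypes are guesses derived from the assertions above, and it assumes a CUDA device with the `triton_kernels` and `sgl_kernel` packages installed.

```python
# Hypothetical usage sketch for the new Triton-kernels MoE path (not from the diff).
import torch

from sglang.srt.layers.moe.fused_moe_triton.triton_kernels_moe import (
    triton_kernel_moe_forward,
)

M, K, E, topk = 4, 64, 8, 2  # tokens, hidden size, num experts, experts per token
N = 256                      # w1 output dim, i.e. 2 * intermediate size

# bfloat16 is required by the assertions in triton_kernel_fused_experts.
hidden_states = torch.randn(M, K, device="cuda", dtype=torch.bfloat16)
w1 = torch.randn(E, K, N, device="cuda", dtype=torch.bfloat16)       # gate/up proj
w2 = torch.randn(E, N // 2, K, device="cuda", dtype=torch.bfloat16)  # down proj
gating_output = torch.randn(M, E, device="cuda", dtype=torch.bfloat16)

out = triton_kernel_moe_forward(
    hidden_states, w1, w2, gating_output, topk=topk, renormalize=True
)
print(out.shape)  # expected to match hidden_states, i.e. (M, K)
```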
sglang/srt/layers/quantization/__init__.py (+2 -0)

```diff
@@ -68,6 +68,7 @@ from sglang.srt.layers.quantization.modelopt_quant import (
 )
 from sglang.srt.layers.quantization.moe_wna16 import MoeWNA16Config
 from sglang.srt.layers.quantization.qoq import QoQConfig
+from sglang.srt.layers.quantization.w4afp8 import W4AFp8Config
 from sglang.srt.layers.quantization.w8a8_fp8 import W8A8Fp8Config
 from sglang.srt.layers.quantization.w8a8_int8 import W8A8Int8Config
 
@@ -82,6 +83,7 @@ BASE_QUANTIZATION_METHODS: Dict[str, Type[QuantizationConfig]] = {
     "moe_wna16": MoeWNA16Config,
     "compressed-tensors": CompressedTensorsConfig,
     "qoq": QoQConfig,
+    "w4afp8": W4AFp8Config,
 }
 
 # VLLM-dependent quantization methods
```
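The practical effect here is that `w4afp8` becomes a recognized key in the base quantization registry. A quick, hypothetical check (assuming `BASE_QUANTIZATION_METHODS` remains importable from the package, as the hunk header suggests):

```python
# Hypothetical lookup against the registry edited above (not part of the diff).
from sglang.srt.layers.quantization import BASE_QUANTIZATION_METHODS

quant_cls = BASE_QUANTIZATION_METHODS["w4afp8"]
print(quant_cls.__name__)  # expected: W4AFp8Config
```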
sglang/srt/layers/quantization/fp8.py (+28 -7)

```diff
@@ -1,7 +1,7 @@
 # Adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/model_executor/layers/quantization/fp8.py
 
 import logging
-from typing import Any, Callable, Dict, List, Optional
+from typing import Any, Callable, Dict, List, Optional, Union
 
 import torch
 import torch.nn.functional as F
@@ -88,7 +88,7 @@ _is_fp8_fnuz = is_fp8_fnuz()
 _use_hip_int4 = get_bool_env_var("SGLANG_INT4_WEIGHT")
 _use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip
 
-if _is_hip:
+if _is_hip and (_use_aiter or _use_hip_int4):
     from aiter import ActivationType, QuantType
     from aiter.fused_moe import fused_moe
     from aiter.fused_moe_bf16_asm import asm_moe, ck_moe_2stages
@@ -200,7 +200,7 @@ class Fp8LinearMethod(LinearMethodBase):
         quant_config: The quantization config.
     """
 
-    def __init__(self, quant_config: Fp8Config):
+    def __init__(self, quant_config: Union["Fp8Config", "W4AFp8Config"]):
         self.quant_config = quant_config
         self.cutlass_fp8_supported = cutlass_fp8_supported()
 
@@ -286,7 +286,10 @@ class Fp8LinearMethod(LinearMethodBase):
         if self.quant_config.is_checkpoint_fp8_serialized:
             # WEIGHT SCALE
             if self.block_quant:
-                assert self.quant_config.activation_scheme == "dynamic"
+                if hasattr(self.quant_config, "activation_scheme"):
+                    assert self.quant_config.activation_scheme == "dynamic"
+                elif hasattr(self.quant_config, "linear_activation_scheme"):
+                    assert self.quant_config.linear_activation_scheme == "dynamic"
                 scale = BlockQuantScaleParameter(
                     data=torch.empty(
                         (output_size_per_partition + block_n - 1) // block_n,
@@ -308,7 +311,13 @@ class Fp8LinearMethod(LinearMethodBase):
             layer.register_parameter("weight_scale", scale)
 
             # INPUT ACTIVATION SCALE
-            if self.quant_config.activation_scheme == "static":
+            if (
+                hasattr(self.quant_config, "activation_scheme")
+                and self.quant_config.activation_scheme == "static"
+            ) or (
+                hasattr(self.quant_config, "linear_activation_scheme")
+                and self.quant_config.linear_activation_scheme == "static"
+            ):
                 scale = PerTensorScaleParameter(
                     data=torch.empty(len(output_partition_sizes), dtype=torch.float32),
                     weight_loader=weight_loader,
@@ -371,7 +380,13 @@ class Fp8LinearMethod(LinearMethodBase):
             layer.weight_scale = torch.nn.Parameter(
                 layer.weight_scale.data, requires_grad=False
             )
-            if self.quant_config.activation_scheme == "static":
+            if (
+                hasattr(self.quant_config, "activation_scheme")
+                and self.quant_config.activation_scheme == "static"
+            ) or (
+                hasattr(self.quant_config, "linear_activation_scheme")
+                and self.quant_config.linear_activation_scheme == "static"
+            ):
                 layer.input_scale = torch.nn.Parameter(
                     layer.input_scale.data, requires_grad=False
                 )
@@ -405,7 +420,13 @@ class Fp8LinearMethod(LinearMethodBase):
         # Update layer with new values.
         layer.weight = Parameter(weight.t(), requires_grad=False)
         layer.weight_scale = Parameter(weight_scale, requires_grad=False)
-        if self.quant_config.activation_scheme == "static":
+        if (
+            hasattr(self.quant_config, "activation_scheme")
+            and self.quant_config.activation_scheme == "static"
+        ) or (
+            hasattr(self.quant_config, "linear_activation_scheme")
+            and self.quant_config.linear_activation_scheme == "static"
+        ):
             layer.input_scale = Parameter(
                 layer.input_scale.max(), requires_grad=False
             )
```
sglang/srt/layers/quantization/modelopt_quant.py (+244 -1)

```diff
@@ -26,6 +26,7 @@ from sglang.srt.layers.quantization.kv_cache import BaseKVCacheMethod
 from sglang.srt.layers.quantization.utils import (
     convert_to_channelwise,
     is_layer_skipped,
+    per_tensor_dequantize,
     requantize_with_max_scale,
 )
 from sglang.srt.layers.radix_attention import RadixAttention
@@ -110,7 +111,12 @@ class ModelOptFp8Config(QuantizationConfig):
         self, layer: torch.nn.Module, prefix: str
     ) -> Optional["QuantizeMethodBase"]:
         if self.exclude_modules and any(
-            module in prefix for module in self.exclude_modules
+            module in prefix
+            or (
+                prefix.startswith("language_model.")
+                and module in prefix.removeprefix("language_model.")
+            )
+            for module in self.exclude_modules
         ):
             return None
 
@@ -119,6 +125,12 @@ class ModelOptFp8Config(QuantizationConfig):
         if self.kv_cache_quant_method and isinstance(layer, RadixAttention):
             return ModelOptFp8KVCacheMethod(self)
 
+        # Add MoE support
+        from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
+
+        if isinstance(layer, FusedMoE):
+            return ModelOptFp8MoEMethod(self)
+
         return None
 
     def get_scaled_act_names(self) -> List[str]:
@@ -234,6 +246,237 @@ class ModelOptFp8KVCacheMethod(BaseKVCacheMethod):
         super().__init__(quant_config)
 
 
+class ModelOptFp8MoEMethod:
+    """MoE method for ModelOpt FP8.
+    Supports loading FP8 checkpoints with static weight scale and activation scale.
+
+    Args:
+        quant_config: The ModelOpt quantization config.
+    """
+
+    def __new__(cls, *args, **kwargs):
+        """
+        Dynamic class composition pattern.
+
+        This allows us to effectively "inject" FusedMoEMethodBase as a parent class
+        at runtime while avoiding circular import issues.
+        """
+        from sglang.srt.layers.moe.fused_moe_triton import FusedMoEMethodBase
+
+        if not hasattr(cls, "_initialized"):
+            original_init = cls.__init__
+            new_cls = type(
+                cls.__name__,
+                (FusedMoEMethodBase,),
+                {
+                    "__init__": original_init,
+                    **{k: v for k, v in cls.__dict__.items() if k != "__dict__"},
+                },
+            )
+            obj = super(new_cls, new_cls).__new__(new_cls)
+            obj.__init__(*args, **kwargs)
+            return obj
+        return super().__new__(cls)
+
+    def __init__(self, quant_config: ModelOptFp8Config):
+        self.quant_config = quant_config
+        self.cutlass_fp8_supported = cutlass_fp8_supported()
+
+    def create_weights(
+        self,
+        layer: torch.nn.Module,
+        num_experts: int,
+        hidden_size: int,
+        intermediate_size: int,
+        params_dtype: torch.dtype,
+        **extra_weight_attrs,
+    ):
+        from sglang.srt.layers.moe.fused_moe_triton import FusedMoeWeightScaleSupported
+
+        # Use FP8 dtype if checkpoint is serialized, otherwise use the default dtype
+        weight_dtype = (
+            torch.float8_e4m3fn
+            if self.quant_config.is_checkpoint_fp8_serialized
+            else params_dtype
+        )
+        weight_loader = extra_weight_attrs.get("weight_loader")
+
+        w13_weight = ModelWeightParameter(
+            data=torch.empty(
+                num_experts, 2 * intermediate_size, hidden_size, dtype=weight_dtype
+            ),
+            input_dim=2,
+            output_dim=1,
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("w13_weight", w13_weight)
+
+        w2_weight = ModelWeightParameter(
+            data=torch.empty(
+                num_experts, hidden_size, intermediate_size, dtype=weight_dtype
+            ),
+            input_dim=2,
+            output_dim=1,
+            weight_loader=weight_loader,
+        )
+        layer.register_parameter("w2_weight", w2_weight)
+
+        if self.quant_config.is_checkpoint_fp8_serialized:
+            # WEIGHT SCALES - Per-tensor scaling for ModelOpts
+            # Allocate 2 scales for w1 and w3 respectively.
+            # They will be combined to a single scale after weight loading.
+            w13_weight_scale = PerTensorScaleParameter(
+                data=torch.full(
+                    (num_experts, 2),
+                    torch.finfo(torch.float32).min,
+                    dtype=torch.float32,
+                ),
+                weight_loader=weight_loader,
+            )
+            w2_weight_scale = PerTensorScaleParameter(
+                data=torch.full(
+                    (num_experts,), torch.finfo(torch.float32).min, dtype=torch.float32
+                ),
+                weight_loader=weight_loader,
+            )
+            layer.register_parameter("w13_weight_scale", w13_weight_scale)
+            layer.register_parameter("w2_weight_scale", w2_weight_scale)
+
+            # Set weight loader attributes for scales
+            extra_weight_attrs.update(
+                {"quant_method": FusedMoeWeightScaleSupported.TENSOR.value}
+            )
+
+            # INPUT SCALES - Per-tensor scaling for ModelOpt
+            w13_input_scale = PerTensorScaleParameter(
+                data=torch.full((num_experts,), 1.0, dtype=torch.float32),
+                weight_loader=weight_loader,
+            )
+            w2_input_scale = PerTensorScaleParameter(
+                data=torch.full((num_experts,), 1.0, dtype=torch.float32),
+                weight_loader=weight_loader,
+            )
+            layer.register_parameter("w13_input_scale", w13_input_scale)
+            layer.register_parameter("w2_input_scale", w2_input_scale)
+
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        """Process FP8 MoE weights after loading from serialized checkpoint.
+
+        Only supports pre-quantized checkpoints with FP8 weights and scales.
+        """
+
+        layer.w13_weight = Parameter(layer.w13_weight.data, requires_grad=False)
+        layer.w2_weight = Parameter(layer.w2_weight.data, requires_grad=False)
+
+        # Handle scale parameters
+        if hasattr(layer, "w13_weight_scale") and layer.w13_weight_scale is not None:
+            # Fp8 moe kernel needs single weight scale for w13 per expert.
+            # We take the max of the w1 and w3 scales then dequant and requant each expert.
+            if layer.w13_weight_scale.dim() == 2:  # Shape: (num_experts, 2)
+                from sglang.srt.layers.quantization.fp8_kernel import scaled_fp8_quant
+
+                # Get the maximum scale across w1 and w3 for each expert
+                max_w13_scales = layer.w13_weight_scale.max(dim=1).values
+
+                # Requantize each expert's weights using the combined scale
+                # w13_weight has shape (num_experts, 2 * intermediate_size, hidden_size)
+                # where the first intermediate_size rows are w1, the next are w3
+                intermediate_size = layer.w13_weight.shape[1] // 2
+                for expert_id in range(layer.w13_weight.shape[0]):
+                    start = 0
+                    for shard_id in range(2):  # w1 and w3
+                        # Dequantize using the original scale for this shard
+                        dq_weight = per_tensor_dequantize(
+                            layer.w13_weight[expert_id][
+                                start : start + intermediate_size, :
+                            ],
+                            layer.w13_weight_scale[expert_id][shard_id],
+                        )
+                        # Requantize using the combined max scale
+                        (
+                            layer.w13_weight[expert_id][
+                                start : start + intermediate_size, :
+                            ],
+                            _,
+                        ) = scaled_fp8_quant(dq_weight, max_w13_scales[expert_id])
+
+                        start += intermediate_size
+
+                # Update the scale parameter to be per-expert instead of per-shard
+                layer.w13_weight_scale = Parameter(max_w13_scales, requires_grad=False)
+            else:
+                layer.w13_weight_scale = Parameter(
+                    layer.w13_weight_scale.data, requires_grad=False
+                )
+
+        if hasattr(layer, "w2_weight_scale") and layer.w2_weight_scale is not None:
+            layer.w2_weight_scale = Parameter(
+                layer.w2_weight_scale.data, requires_grad=False
+            )
+        if hasattr(layer, "w13_input_scale") and layer.w13_input_scale is not None:
+            layer.w13_input_scale = Parameter(
+                layer.w13_input_scale.max(), requires_grad=False
+            )
+        if hasattr(layer, "w2_input_scale") and layer.w2_input_scale is not None:
+            layer.w2_input_scale = Parameter(
+                layer.w2_input_scale.max(), requires_grad=False
+            )
+
+    def apply(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        router_logits: torch.Tensor,
+        top_k: int,
+        renormalize: bool,
+        use_grouped_topk: bool,
+        topk_group: Optional[int] = None,
+        num_expert_group: Optional[int] = None,
+        num_fused_shared_experts: Optional[int] = None,
+        custom_routing_function: Optional[Callable] = None,
+        correction_bias: Optional[torch.Tensor] = None,
+        activation: str = "silu",
+        apply_router_weight_on_input: bool = False,
+        inplace: bool = True,
+        no_combine: bool = False,
+        routed_scaling_factor: Optional[float] = None,
+    ) -> torch.Tensor:
+        from sglang.srt.layers.moe.fused_moe_triton.fused_moe import fused_experts
+        from sglang.srt.layers.moe.topk import select_experts
+
+        # Expert selection
+        topk_weights, topk_ids = select_experts(
+            hidden_states=x,
+            router_logits=router_logits,
+            use_grouped_topk=use_grouped_topk,
+            top_k=top_k,
+            renormalize=renormalize,
+            topk_group=topk_group,
+            num_expert_group=num_expert_group,
+            num_fused_shared_experts=num_fused_shared_experts,
+            custom_routing_function=custom_routing_function,
+            correction_bias=correction_bias,
+            routed_scaling_factor=routed_scaling_factor,
+        )
+
+        return fused_experts(
+            x,
+            layer.w13_weight,
+            layer.w2_weight,
+            topk_weights=topk_weights,
+            topk_ids=topk_ids,
+            inplace=inplace,
+            activation=activation,
+            use_fp8_w8a8=True,
+            per_channel_quant=False,  # ModelOpt uses per-tensor quantization
+            w1_scale=layer.w13_weight_scale,
+            w2_scale=layer.w2_weight_scale,
+            a1_scale=layer.w13_input_scale,
+            a2_scale=layer.w2_input_scale,
+            no_combine=no_combine,
+        )
+
+
 class ModelOptFp4Config(QuantizationConfig):
     """Config class for FP4."""
 
```
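The `__new__` override in `ModelOptFp8MoEMethod` rebuilds the class at call time so that `FusedMoEMethodBase` is injected as a base without a module-level import. Below is a self-contained illustration of that pattern with stand-in classes; the names are invented, and it additionally filters `__weakref__`, a small deviation from the diff.

```python
# Stand-alone illustration of runtime base-class injection via type().
# Base plays the role of FusedMoEMethodBase, Method the role of ModelOptFp8MoEMethod.
class Base:
    def create_weights(self):
        raise NotImplementedError


class Method:
    def __new__(cls, *args, **kwargs):
        # Recreate the class with Base injected, copying the original namespace.
        new_cls = type(
            cls.__name__,
            (Base,),
            {
                k: v
                for k, v in cls.__dict__.items()
                if k not in ("__dict__", "__weakref__")
            },
        )
        obj = object.__new__(new_cls)
        obj.__init__(*args, **kwargs)  # __init__ is not re-run automatically here
        return obj

    def __init__(self, tag):
        self.tag = tag

    def create_weights(self):
        return f"weights for {self.tag}"


m = Method("fp8")
print(isinstance(m, Base), m.create_weights())  # True weights for fp8
```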