sglang 0.4.10.post2__py3-none-any.whl → 0.5.0rc0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (105)
  1. sglang/bench_one_batch.py +113 -17
  2. sglang/srt/configs/model_config.py +35 -0
  3. sglang/srt/conversation.py +9 -5
  4. sglang/srt/disaggregation/base/conn.py +5 -2
  5. sglang/srt/disaggregation/decode.py +6 -1
  6. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +3 -0
  7. sglang/srt/disaggregation/mooncake/conn.py +243 -135
  8. sglang/srt/disaggregation/prefill.py +2 -0
  9. sglang/srt/distributed/parallel_state.py +11 -9
  10. sglang/srt/entrypoints/context.py +244 -0
  11. sglang/srt/entrypoints/engine.py +4 -3
  12. sglang/srt/entrypoints/harmony_utils.py +370 -0
  13. sglang/srt/entrypoints/http_server.py +71 -0
  14. sglang/srt/entrypoints/openai/protocol.py +227 -1
  15. sglang/srt/entrypoints/openai/serving_chat.py +278 -42
  16. sglang/srt/entrypoints/openai/serving_responses.py +1273 -0
  17. sglang/srt/entrypoints/openai/tool_server.py +174 -0
  18. sglang/srt/entrypoints/tool.py +87 -0
  19. sglang/srt/eplb/expert_location.py +5 -1
  20. sglang/srt/function_call/harmony_tool_parser.py +130 -0
  21. sglang/srt/hf_transformers_utils.py +30 -3
  22. sglang/srt/jinja_template_utils.py +8 -1
  23. sglang/srt/layers/attention/aiter_backend.py +5 -8
  24. sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1700 -0
  25. sglang/srt/layers/attention/triton_backend.py +85 -14
  26. sglang/srt/layers/attention/triton_ops/decode_attention.py +17 -0
  27. sglang/srt/layers/attention/triton_ops/extend_attention.py +143 -98
  28. sglang/srt/layers/attention/trtllm_mha_backend.py +332 -0
  29. sglang/srt/layers/attention/vision.py +13 -5
  30. sglang/srt/layers/communicator.py +21 -4
  31. sglang/srt/layers/dp_attention.py +12 -0
  32. sglang/srt/layers/linear.py +2 -7
  33. sglang/srt/layers/moe/cutlass_moe.py +20 -6
  34. sglang/srt/layers/moe/ep_moe/layer.py +77 -73
  35. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +101 -12
  36. sglang/srt/layers/moe/fused_moe_triton/layer.py +416 -35
  37. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +188 -3
  38. sglang/srt/layers/moe/topk.py +12 -3
  39. sglang/srt/layers/moe/utils.py +16 -0
  40. sglang/srt/layers/quantization/__init__.py +22 -0
  41. sglang/srt/layers/quantization/fp4.py +557 -0
  42. sglang/srt/layers/quantization/fp8.py +3 -6
  43. sglang/srt/layers/quantization/fp8_utils.py +29 -0
  44. sglang/srt/layers/quantization/modelopt_quant.py +259 -64
  45. sglang/srt/layers/quantization/mxfp4.py +651 -0
  46. sglang/srt/layers/quantization/mxfp4_tensor.py +133 -0
  47. sglang/srt/layers/quantization/quark/__init__.py +0 -0
  48. sglang/srt/layers/quantization/quark/schemes/__init__.py +6 -0
  49. sglang/srt/layers/quantization/quark/schemes/quark_scheme.py +55 -0
  50. sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +118 -0
  51. sglang/srt/layers/quantization/quark/utils.py +107 -0
  52. sglang/srt/layers/quantization/unquant.py +60 -6
  53. sglang/srt/layers/quantization/w4afp8.py +1 -1
  54. sglang/srt/layers/rotary_embedding.py +225 -1
  55. sglang/srt/layers/utils.py +9 -0
  56. sglang/srt/layers/vocab_parallel_embedding.py +8 -3
  57. sglang/srt/lora/lora_manager.py +70 -14
  58. sglang/srt/lora/lora_registry.py +3 -2
  59. sglang/srt/lora/mem_pool.py +43 -5
  60. sglang/srt/managers/cache_controller.py +55 -30
  61. sglang/srt/managers/detokenizer_manager.py +1 -1
  62. sglang/srt/managers/io_struct.py +15 -3
  63. sglang/srt/managers/mm_utils.py +5 -11
  64. sglang/srt/managers/schedule_batch.py +28 -7
  65. sglang/srt/managers/scheduler.py +26 -12
  66. sglang/srt/managers/scheduler_output_processor_mixin.py +1 -2
  67. sglang/srt/managers/scheduler_recv_skipper.py +37 -0
  68. sglang/srt/managers/scheduler_update_weights_mixin.py +6 -0
  69. sglang/srt/managers/template_manager.py +35 -1
  70. sglang/srt/managers/tokenizer_manager.py +24 -6
  71. sglang/srt/managers/tp_worker.py +3 -0
  72. sglang/srt/managers/tp_worker_overlap_thread.py +3 -0
  73. sglang/srt/mem_cache/hiradix_cache.py +53 -5
  74. sglang/srt/mem_cache/memory_pool_host.py +1 -1
  75. sglang/srt/mem_cache/multimodal_cache.py +33 -13
  76. sglang/srt/mem_cache/storage/hf3fs/client_hf3fs.py +2 -2
  77. sglang/srt/model_executor/cuda_graph_runner.py +7 -6
  78. sglang/srt/model_executor/forward_batch_info.py +35 -14
  79. sglang/srt/model_executor/model_runner.py +19 -2
  80. sglang/srt/model_loader/weight_utils.py +10 -0
  81. sglang/srt/models/bailing_moe.py +425 -0
  82. sglang/srt/models/deepseek_v2.py +72 -33
  83. sglang/srt/models/ernie4.py +426 -0
  84. sglang/srt/models/ernie4_eagle.py +203 -0
  85. sglang/srt/models/gemma3n_mm.py +39 -0
  86. sglang/srt/models/glm4_moe.py +24 -12
  87. sglang/srt/models/gpt_oss.py +1134 -0
  88. sglang/srt/models/qwen2.py +6 -0
  89. sglang/srt/models/qwen2_moe.py +6 -0
  90. sglang/srt/models/qwen3_moe.py +32 -6
  91. sglang/srt/models/step3_vl.py +9 -0
  92. sglang/srt/models/transformers.py +2 -5
  93. sglang/srt/multimodal/processors/step3_vl.py +3 -1
  94. sglang/srt/reasoning_parser.py +18 -39
  95. sglang/srt/server_args.py +142 -7
  96. sglang/srt/two_batch_overlap.py +157 -5
  97. sglang/srt/utils.py +38 -2
  98. sglang/test/runners.py +2 -2
  99. sglang/test/test_utils.py +1 -1
  100. sglang/version.py +1 -1
  101. {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc0.dist-info}/METADATA +16 -14
  102. {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc0.dist-info}/RECORD +105 -84
  103. {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc0.dist-info}/WHEEL +0 -0
  104. {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc0.dist-info}/licenses/LICENSE +0 -0
  105. {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc0.dist-info}/top_level.txt +0 -0
@@ -14,13 +14,9 @@ from sglang.srt.layers.moe.ep_moe.kernels import (
     silu_and_mul_masked_post_quant_fwd,
     tma_align_input_scale,
 )
-from sglang.srt.layers.moe.fused_moe_triton.layer import (
-    FlashInferFusedMoE,
-    FusedMoE,
-    should_use_flashinfer_trtllm_moe,
-)
+from sglang.srt.layers.moe.fused_moe_triton.layer import FlashInferFusedMoE, FusedMoE
 from sglang.srt.layers.moe.topk import TopKOutput
-from sglang.srt.layers.moe.utils import DeepEPMode
+from sglang.srt.layers.moe.utils import DeepEPMode, should_use_flashinfer_trtllm_moe
 from sglang.srt.layers.quantization import deep_gemm_wrapper
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.quantization.fp8 import (
@@ -48,7 +44,6 @@ _is_npu = is_npu()
 _is_fp8_fnuz = is_fp8_fnuz()
 _use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip
 
-
 if not (_is_npu or _is_hip):
     from sgl_kernel import silu_and_mul
 
@@ -60,6 +55,22 @@ if _use_aiter:
 logger = logging.getLogger(__name__)
 
 
+# TODO(kaixih@nvidia): ideally we should merge this logic into
+# `fill_gateup_input_triton_kernel` to directly generate e8m0 scale.
+@torch.compile
+def _cast_to_e8m0_with_rounding_up(x: torch.Tensor) -> torch.Tensor:
+    temp = x.to(torch.float32).view(torch.int32)
+    exp = torch.bitwise_right_shift(temp, 23)
+    mant = torch.bitwise_and(temp, 0x7FFFFF)
+    is_ru = torch.logical_and(
+        torch.logical_and((mant > 0), (exp != 0xFE)),
+        ~torch.logical_and((exp == 0), (mant <= 0x400000)),
+    )
+    exp = torch.where(is_ru, exp + 1, exp)
+    new_x = exp.to(torch.uint8).view(torch.int)
+    return new_x.transpose(1, 2).contiguous().transpose(1, 2)
+
+
 class EPMoE(FusedMoE):
     """
     MoE Expert Parallel Impl
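
For reference, a minimal standalone sketch of the e8m0 cast introduced above: it reinterprets float32 block scales as raw bits and rounds the exponent up whenever nonzero mantissa bits would be dropped (with guards at the exponent extremes). This is a copy of the bit logic without `@torch.compile`; shapes and values are illustrative only, not taken from sglang.

import torch

def cast_to_e8m0_with_rounding_up(x: torch.Tensor) -> torch.Tensor:
    # Same bit manipulation as the helper in the hunk above.
    temp = x.to(torch.float32).view(torch.int32)
    exp = torch.bitwise_right_shift(temp, 23)      # biased exponent bits
    mant = torch.bitwise_and(temp, 0x7FFFFF)       # mantissa bits
    is_ru = torch.logical_and(
        torch.logical_and(mant > 0, exp != 0xFE),  # round up unless at the exponent cap
        ~torch.logical_and(exp == 0, mant <= 0x400000),
    )
    exp = torch.where(is_ru, exp + 1, exp)
    new_x = exp.to(torch.uint8).view(torch.int)    # pack 4 e8m0 bytes per int32
    return new_x.transpose(1, 2).contiguous().transpose(1, 2)

scales = torch.rand(2, 8, 8) + 0.5   # (experts, m, k_blocks); last dim must be a multiple of 4
packed = cast_to_e8m0_with_rounding_up(scales)
print(packed.shape, packed.dtype)    # torch.Size([2, 8, 2]) torch.int32

The multiple-of-4 requirement mirrors the `s_mn % 4 == 0 and s_k % 4 == 0` assertion added further down for the UE8M0 path.
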
@@ -81,6 +92,9 @@ class EPMoE(FusedMoE):
         prefix: str = "",
         activation: str = "silu",
         routed_scaling_factor: Optional[float] = None,
+        activation_alpha: Optional[float] = None,
+        swiglu_limit: Optional[float] = None,
+        with_bias: bool = False,
     ):
         super().__init__(
             num_experts=num_experts,
@@ -96,6 +110,9 @@ class EPMoE(FusedMoE):
             activation=activation,
             # apply_router_weight_on_input=apply_router_weight_on_input,
             routed_scaling_factor=routed_scaling_factor,
+            activation_alpha=activation_alpha,
+            swiglu_limit=swiglu_limit,
+            with_bias=with_bias,
         )
 
         self.start_expert_id = self.moe_ep_rank * self.num_local_experts
@@ -203,10 +220,22 @@ class EPMoE(FusedMoE):
 
         dispose_tensor(hidden_states)
 
+        if deep_gemm_wrapper.DEEPGEMM_SCALE_UE8M0:
+            b, s_mn, s_k = gateup_input_scale.shape
+            assert (
+                s_mn % 4 == 0 and s_k % 4 == 0
+            ), f"scales must be aligned to 4, but got ({b}, {s_mn}, {s_k})"
+
         # GroupGemm-0
         gateup_input_fp8 = (
             gateup_input,
-            deep_gemm_wrapper.get_col_major_tma_aligned_tensor(gateup_input_scale),
+            (
+                _cast_to_e8m0_with_rounding_up(gateup_input_scale)
+                if deep_gemm_wrapper.DEEPGEMM_SCALE_UE8M0
+                else deep_gemm_wrapper.get_col_major_tma_aligned_tensor(
+                    gateup_input_scale
+                )
+            ),
         )
         num_groups, m, k = gateup_input_fp8[0].size()
         n = self.w13_weight.size(1)
@@ -214,7 +243,12 @@ class EPMoE(FusedMoE):
             (num_groups, m, n), device=hidden_states_device, dtype=torch.bfloat16
         )
         deep_gemm_wrapper.grouped_gemm_nt_f8f8bf16_masked(
-            gateup_input_fp8, self.w13_weight_fp8, gateup_output, masked_m, expected_m
+            gateup_input_fp8,
+            self.w13_weight_fp8,
+            gateup_output,
+            masked_m,
+            expected_m,
+            recipe=(1, 128, 128) if deep_gemm_wrapper.DEEPGEMM_BLACKWELL else None,
         )
         del gateup_input
         del gateup_input_fp8
@@ -245,6 +279,7 @@ class EPMoE(FusedMoE):
             down_input_scale,
             scale_block_size,
             masked_m,
+            scale_ue8m0=deep_gemm_wrapper.DEEPGEMM_SCALE_UE8M0,
         )
         del gateup_output
 
@@ -252,13 +287,24 @@ class EPMoE(FusedMoE):
         n = self.w2_weight.size(1)
         down_input_fp8 = (
             down_input,
-            deep_gemm_wrapper.get_col_major_tma_aligned_tensor(down_input_scale),
+            (
+                down_input_scale
+                if deep_gemm_wrapper.DEEPGEMM_SCALE_UE8M0
+                else deep_gemm_wrapper.get_col_major_tma_aligned_tensor(
+                    down_input_scale
+                )
+            ),
         )
         down_output = torch.empty(
             (num_groups, m, n), device=hidden_states_device, dtype=torch.bfloat16
        )
         deep_gemm_wrapper.grouped_gemm_nt_f8f8bf16_masked(
-            down_input_fp8, self.w2_weight_fp8, down_output, masked_m, expected_m
+            down_input_fp8,
+            self.w2_weight_fp8,
+            down_output,
+            masked_m,
+            expected_m,
+            recipe=(1, 128, 128) if deep_gemm_wrapper.DEEPGEMM_BLACKWELL else None,
         )
         del down_input
         del down_input_fp8
@@ -678,71 +724,29 @@ class DeepEPMoE(EPMoE):
         return down_output
 
 
-class FlashInferEPMoE(EPMoE):
-    def __init__(self, *args, **kwargs):
-        renormalize = kwargs.pop("renormalize", True)
-        num_fused_shared_experts = kwargs.pop("num_fused_shared_experts", 0)
-        use_grouped_topk = kwargs.pop("use_grouped_topk", False)
-        num_expert_group = kwargs.pop("num_expert_group", None)
-        topk_group = kwargs.pop("topk_group", None)
-        correction_bias = kwargs.pop("correction_bias", None)
-        super().__init__(*args, **kwargs)
-        self.renormalize = renormalize
-        self.num_fused_shared_experts = num_fused_shared_experts
-        self.use_grouped_topk = use_grouped_topk
-        if self.use_grouped_topk:
-            assert num_expert_group is not None and topk_group is not None
-        self.num_expert_group = num_expert_group
-        self.topk_group = topk_group
-        self.correction_bias = correction_bias
-        self.use_flashinfer_trtllm_moe = should_use_flashinfer_trtllm_moe()
-
-    def forward(self, hidden_states: torch.Tensor, router_logits: torch.Tensor):
-        assert self.use_flashinfer_trtllm_moe
-        assert (
-            self.activation == "silu"
-        ), "Only silu is supported for flashinfer blockscale fp8 moe"
-        assert (
-            self.renormalize
-        ), "Renormalize is required for flashinfer blockscale fp8 moe"
-        assert (
-            self.num_fused_shared_experts == 0
-        ), "Fused shared experts are not supported for flashinfer blockscale fp8 moe"
-        a_q, a_sf = sglang_per_token_group_quant_fp8(hidden_states, self.block_shape[1])
-        # NOTE: scales of hidden states have to be transposed!
-        a_sf_t = a_sf.t().contiguous()
-        from flashinfer.fused_moe import trtllm_fp8_block_scale_moe
-
-        return trtllm_fp8_block_scale_moe(
-            routing_logits=router_logits.to(torch.float32),
-            routing_bias=self.correction_bias.to(hidden_states.dtype),
-            hidden_states=a_q,
-            hidden_states_scale=a_sf_t,
-            gemm1_weights=self.w13_weight,
-            gemm1_weights_scale=self.w13_weight_scale_inv,
-            gemm2_weights=self.w2_weight,
-            gemm2_weights_scale=self.w2_weight_scale_inv,
-            num_experts=self.num_experts,
-            top_k=self.top_k,
-            n_group=self.num_expert_group,
-            topk_group=self.topk_group,
-            intermediate_size=self.w2_weight.shape[2],
-            local_expert_offset=self.start_expert_id,
-            local_num_experts=self.num_local_experts,
-            routed_scaling_factor=self.routed_scaling_factor,
-            tile_tokens_dim=get_tile_tokens_dim(
-                hidden_states.shape[0], self.top_k, self.num_experts
-            ),
-            routing_method_type=2,  # DeepSeek-styled routing method
-            use_shuffled_weight=False,
-        )
-
-
 def get_moe_impl_class():
     if global_server_args_dict["moe_a2a_backend"].is_deepep():
         return DeepEPMoE
+
+    # NEW: Direct FP4 detection (bypasses EP requirements)
+    # Check for FP4 quantization with TRTLLM flag, regardless of EP
+    if global_server_args_dict.get("enable_flashinfer_trtllm_moe", False):
+        try:
+            # Check the quantization argument directly
+            quantization = global_server_args_dict.get("quantization")
+            if quantization == "modelopt_fp4":
+                from sglang.srt.layers.moe.fused_moe_triton.layer import (
+                    FlashInferFP4MoE,
+                )
+
+                return FlashInferFP4MoE
+        except:
+            pass
+
+    if should_use_flashinfer_trtllm_moe():
+        return FlashInferFusedMoE
     if global_server_args_dict["enable_flashinfer_cutlass_moe"]:
         return FusedMoE
     if get_moe_expert_parallel_world_size() > 1:
-        return FlashInferEPMoE if should_use_flashinfer_trtllm_moe() else EPMoE
-    return FlashInferFusedMoE if should_use_flashinfer_trtllm_moe() else FusedMoE
+        return EPMoE
+    return FusedMoE
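
The rewritten `get_moe_impl_class` drops the `FlashInferEPMoE` wrapper and moves the FlashInfer TRTLLM checks ahead of the cutlass and expert-parallel checks. A reading aid only, with the server-args lookups reduced to plain parameters and `should_use_flashinfer_trtllm_moe()` approximated by a single boolean flag (this is not sglang code):

from typing import Optional

def select_moe_impl(
    use_deepep: bool,
    trtllm_moe: bool,                  # stand-in for the TRTLLM flag / helper
    quantization: Optional[str],
    cutlass_moe: bool,
    ep_world_size: int,
) -> str:
    if use_deepep:
        return "DeepEPMoE"
    if trtllm_moe and quantization == "modelopt_fp4":
        return "FlashInferFP4MoE"      # new FP4 path, checked regardless of EP size
    if trtllm_moe:
        return "FlashInferFusedMoE"
    if cutlass_moe:
        return "FusedMoE"
    if ep_world_size > 1:
        return "EPMoE"                 # FlashInferEPMoE no longer exists
    return "FusedMoE"

# With EP enabled and modelopt_fp4, the FP4 class now wins over EPMoE:
assert select_moe_impl(False, True, "modelopt_fp4", False, 8) == "FlashInferFP4MoE"
assert select_moe_impl(False, False, None, False, 8) == "EPMoE"
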
@@ -319,6 +319,7 @@ def fused_moe_kernel(
     # Pointers to matrices
     a_ptr,
     b_ptr,
+    bias_ptr,
     c_ptr,
     a_scale_ptr,
     b_scale_ptr,
@@ -340,6 +341,8 @@ def fused_moe_kernel(
     stride_be,
     stride_bk,
     stride_bn,
+    stride_bias_e,
+    stride_bias_n,
     stride_cm,
     stride_cn,
     stride_asm,
@@ -449,6 +452,10 @@ def fused_moe_kernel(
         + off_experts * stride_be
         + (offs_k[:, None] * stride_bk + offs_bn[None, :] * stride_bn)
     )
+    if bias_ptr is not None:
+        bias = tl.load(
+            bias_ptr + off_experts * stride_bias_e + offs_bn[None, :] * stride_bias_n
+        )
     if use_int8_w8a16:
         b_scale_ptrs = (
             b_scale_ptr + off_experts * stride_bse + offs_bn[None, :] * stride_bsn
@@ -526,18 +533,20 @@ def fused_moe_kernel(
         a_ptrs += BLOCK_SIZE_K * stride_ak
         b_ptrs += BLOCK_SIZE_K * stride_bk
 
-    if MUL_ROUTED_WEIGHT:
-        moe_weight = tl.load(topk_weights_ptr + offs_token, mask=token_mask, other=0)
-        accumulator = accumulator * moe_weight[:, None]
     if use_int8_w8a16:
-        accumulator = (accumulator * b_scale).to(compute_type)
+        accumulator *= b_scale
     elif use_fp8_w8a8 or use_int8_w8a8:
-        if group_k > 0 and group_n > 0:
-            accumulator = accumulator.to(compute_type)
-        else:
-            accumulator = (accumulator * a_scale * b_scale).to(compute_type)
-    else:
-        accumulator = accumulator.to(compute_type)
+        if group_k == 0 or group_n == 0:
+            accumulator *= a_scale * b_scale
+
+    if bias_ptr is not None:
+        accumulator += bias
+
+    if MUL_ROUTED_WEIGHT:
+        moe_weight = tl.load(topk_weights_ptr + offs_token, mask=token_mask, other=0)
+        accumulator *= moe_weight[:, None]
+
+    accumulator = accumulator.to(compute_type)
     # -----------------------------------------------------------
     # Write back the block of the output
     offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
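
In the reordered epilogue above, the optional per-expert bias is added after the dequantization scaling and before the routed top-k weight multiply, so the routed weight scales the bias term as well. A dense PyTorch reference of that ordering for the unquantized path; shapes and names below are illustrative, not from sglang:

import torch

tokens, hidden, inter, num_experts = 4, 16, 32, 2
a = torch.randn(tokens, hidden)                     # activations for this block
w = torch.randn(num_experts, inter, hidden)         # expert weights, laid out (E, N, K)
bias = torch.randn(num_experts, inter)              # new optional per-expert bias, (E, N)
expert_id = torch.randint(num_experts, (tokens,))   # expert chosen per routed token
moe_weight = torch.rand(tokens)                     # routed top-k weight per token

acc = torch.einsum("tk,tnk->tn", a, w[expert_id])   # the kernel's accumulator block
acc = acc + bias[expert_id]                         # bias before MUL_ROUTED_WEIGHT
out = acc * moe_weight[:, None]                     # routed weight scales the bias too
print(out.shape)                                    # torch.Size([4, 32])
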
@@ -622,6 +631,7 @@ def moe_align_block_size(
 def invoke_fused_moe_kernel(
     A: torch.Tensor,
     B: torch.Tensor,
+    bias: Optional[torch.Tensor],
     C: torch.Tensor,
     A_scale: Optional[torch.Tensor],
     B_scale: Optional[torch.Tensor],
@@ -711,6 +721,7 @@ def invoke_fused_moe_kernel(
     ):
         assert B_scale is not None and B_scale.ndim == 3
         assert B_zp is None or B_zp.ndim == 3
+        assert bias is None
         fused_moe_kernel_gptq_awq[grid](
             A,
             B,
@@ -754,6 +765,7 @@ def invoke_fused_moe_kernel(
         fused_moe_kernel[grid](
             A,
             B,
+            bias,
             C,
             A_scale,
             B_scale,
@@ -770,6 +782,8 @@ def invoke_fused_moe_kernel(
             B.stride(0),
             B.stride(2),
             B.stride(1),
+            bias.stride(0) if bias is not None else 0,
+            bias.stride(1) if bias is not None else 0,
             C.stride(1),
             C.stride(2),
             A_scale.stride(0) if A_scale is not None and A_scale.ndim == 2 else 0,
994
1008
  w2: torch.Tensor,
995
1009
  topk_weights: torch.Tensor,
996
1010
  topk_ids: torch.Tensor,
1011
+ b1: Optional[torch.Tensor] = None,
1012
+ b2: Optional[torch.Tensor] = None,
997
1013
  activation: str = "silu",
998
1014
  apply_router_weight_on_input: bool = False,
999
1015
  use_fp8_w8a8: bool = False,
@@ -1009,6 +1025,8 @@ def inplace_fused_experts(
1009
1025
  a2_scale: Optional[torch.Tensor] = None,
1010
1026
  block_shape: Optional[List[int]] = None,
1011
1027
  routed_scaling_factor: Optional[float] = None,
1028
+ activation_alpha: Optional[float] = None,
1029
+ swiglu_limit: Optional[float] = None,
1012
1030
  ) -> None:
1013
1031
  fused_experts_impl(
1014
1032
  hidden_states,
@@ -1016,6 +1034,8 @@ def inplace_fused_experts(
1016
1034
  w2,
1017
1035
  topk_weights,
1018
1036
  topk_ids,
1037
+ b1,
1038
+ b2,
1019
1039
  True,
1020
1040
  activation,
1021
1041
  apply_router_weight_on_input,
@@ -1033,6 +1053,8 @@ def inplace_fused_experts(
1033
1053
  block_shape,
1034
1054
  False,
1035
1055
  routed_scaling_factor,
1056
+ activation_alpha,
1057
+ swiglu_limit,
1036
1058
  )
1037
1059
 
1038
1060
 
@@ -1042,6 +1064,8 @@ def inplace_fused_experts_fake(
1042
1064
  w2: torch.Tensor,
1043
1065
  topk_weights: torch.Tensor,
1044
1066
  topk_ids: torch.Tensor,
1067
+ b1: Optional[torch.Tensor] = None,
1068
+ b2: Optional[torch.Tensor] = None,
1045
1069
  activation: str = "silu",
1046
1070
  apply_router_weight_on_input: bool = False,
1047
1071
  use_fp8_w8a8: bool = False,
@@ -1057,6 +1081,8 @@ def inplace_fused_experts_fake(
1057
1081
  a2_scale: Optional[torch.Tensor] = None,
1058
1082
  block_shape: Optional[List[int]] = None,
1059
1083
  routed_scaling_factor: Optional[float] = None,
1084
+ activation_alpha: Optional[float] = None,
1085
+ swiglu_limit: Optional[float] = None,
1060
1086
  ) -> None:
1061
1087
  pass
1062
1088
 
@@ -1075,6 +1101,8 @@ def outplace_fused_experts(
1075
1101
  w2: torch.Tensor,
1076
1102
  topk_weights: torch.Tensor,
1077
1103
  topk_ids: torch.Tensor,
1104
+ b1: Optional[torch.Tensor] = None,
1105
+ b2: Optional[torch.Tensor] = None,
1078
1106
  activation: str = "silu",
1079
1107
  apply_router_weight_on_input: bool = False,
1080
1108
  use_fp8_w8a8: bool = False,
@@ -1091,6 +1119,8 @@ def outplace_fused_experts(
1091
1119
  block_shape: Optional[List[int]] = None,
1092
1120
  no_combine: bool = False,
1093
1121
  routed_scaling_factor: Optional[float] = None,
1122
+ activation_alpha: Optional[float] = None,
1123
+ swiglu_limit: Optional[float] = None,
1094
1124
  ) -> torch.Tensor:
1095
1125
  return fused_experts_impl(
1096
1126
  hidden_states,
@@ -1098,6 +1128,8 @@ def outplace_fused_experts(
1098
1128
  w2,
1099
1129
  topk_weights,
1100
1130
  topk_ids,
1131
+ b1,
1132
+ b2,
1101
1133
  False,
1102
1134
  activation,
1103
1135
  apply_router_weight_on_input,
@@ -1115,6 +1147,8 @@ def outplace_fused_experts(
1115
1147
  block_shape,
1116
1148
  no_combine=no_combine,
1117
1149
  routed_scaling_factor=routed_scaling_factor,
1150
+ activation_alpha=activation_alpha,
1151
+ swiglu_limit=swiglu_limit,
1118
1152
  )
1119
1153
 
1120
1154
 
@@ -1124,6 +1158,8 @@ def outplace_fused_experts_fake(
1124
1158
  w2: torch.Tensor,
1125
1159
  topk_weights: torch.Tensor,
1126
1160
  topk_ids: torch.Tensor,
1161
+ b1: Optional[torch.Tensor] = None,
1162
+ b2: Optional[torch.Tensor] = None,
1127
1163
  activation: str = "silu",
1128
1164
  apply_router_weight_on_input: bool = False,
1129
1165
  use_fp8_w8a8: bool = False,
@@ -1140,6 +1176,8 @@ def outplace_fused_experts_fake(
1140
1176
  block_shape: Optional[List[int]] = None,
1141
1177
  no_combine: bool = False,
1142
1178
  routed_scaling_factor: Optional[float] = None,
1179
+ activation_alpha: Optional[float] = None,
1180
+ swiglu_limit: Optional[float] = None,
1143
1181
  ) -> torch.Tensor:
1144
1182
  return torch.empty_like(hidden_states)
1145
1183
 
@@ -1157,6 +1195,8 @@ def fused_experts(
     w1: torch.Tensor,
     w2: torch.Tensor,
     topk_output: TopKOutput,
+    b1: Optional[torch.Tensor] = None,
+    b2: Optional[torch.Tensor] = None,
     inplace: bool = False,
     activation: str = "silu",
     apply_router_weight_on_input: bool = False,
@@ -1174,6 +1214,8 @@ def fused_experts(
     block_shape: Optional[List[int]] = None,
     no_combine: bool = False,
     routed_scaling_factor: Optional[float] = None,
+    activation_alpha: Optional[float] = None,
+    swiglu_limit: Optional[float] = None,
 ):
     topk_weights, topk_ids, _ = topk_output
     if inplace:
@@ -1184,6 +1226,8 @@ def fused_experts(
             w2,
             topk_weights,
             topk_ids,
+            b1,
+            b2,
             activation,
             apply_router_weight_on_input,
             use_fp8_w8a8,
@@ -1199,6 +1243,8 @@ def fused_experts(
             a2_scale,
             block_shape,
             routed_scaling_factor,
+            activation_alpha,
+            swiglu_limit,
         )
         return hidden_states
     else:
@@ -1208,6 +1254,8 @@ def fused_experts(
             w2,
             topk_weights,
             topk_ids,
+            b1,
+            b2,
             activation,
             apply_router_weight_on_input,
             use_fp8_w8a8,
@@ -1224,6 +1272,8 @@ def fused_experts(
             block_shape,
             no_combine=no_combine,
             routed_scaling_factor=routed_scaling_factor,
+            activation_alpha=activation_alpha,
+            swiglu_limit=swiglu_limit,
         )
 
 
@@ -1319,12 +1369,22 @@ def moe_sum_reduce_torch_compile(x, out, routed_scaling_factor):
     out.mul_(routed_scaling_factor)
 
 
+@torch.compile
+def swiglu_with_alpha_and_limit(x, alpha, limit):
+    gate, up = x[..., ::2], x[..., 1::2]
+    gate = gate.clamp(min=None, max=limit)
+    up = up.clamp(min=-limit, max=limit)
+    return gate * torch.sigmoid(gate * alpha) * (up + 1)
+
+
 def fused_experts_impl(
     hidden_states: torch.Tensor,
     w1: torch.Tensor,
     w2: torch.Tensor,
     topk_weights: torch.Tensor,
     topk_ids: torch.Tensor,
+    b1: Optional[torch.Tensor] = None,
+    b2: Optional[torch.Tensor] = None,
     inplace: bool = False,
     activation: str = "silu",
     apply_router_weight_on_input: bool = False,
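
The new `swiglu_with_alpha_and_limit` helper expects the gate and up projections interleaved along the last dimension (even indices gate, odd indices up) and applies a clamped, alpha-scaled SiLU gate to `up + 1`. A standalone copy without `@torch.compile`; the alpha and limit values below are illustrative, not defaults taken from the diff:

import torch

def swiglu_with_alpha_and_limit(x, alpha, limit):
    gate, up = x[..., ::2], x[..., 1::2]        # interleaved gate/up on the last dim
    gate = gate.clamp(min=None, max=limit)      # clamp gate from above only
    up = up.clamp(min=-limit, max=limit)        # clamp up symmetrically
    return gate * torch.sigmoid(gate * alpha) * (up + 1)

x = torch.randn(4, 2 * 32)                      # (tokens, 2 * intermediate_size)
y = swiglu_with_alpha_and_limit(x, alpha=1.702, limit=7.0)
print(y.shape)                                  # torch.Size([4, 32])
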
@@ -1342,6 +1402,8 @@ def fused_experts_impl(
     block_shape: Optional[List[int]] = None,
     no_combine: bool = False,
     routed_scaling_factor: Optional[float] = None,
+    activation_alpha: Optional[float] = None,
+    swiglu_limit: Optional[float] = None,
 ):
     padded_size = padding_size
     if not (use_fp8_w8a8 or use_int8_w8a8) or block_shape is not None or _use_aiter:
@@ -1353,7 +1415,7 @@ def fused_experts_impl(
     else:
         assert (
             hidden_states.shape[1] == w1.shape[2] - padded_size
-        ), "Hidden size mismatch"
+        ), f"Hidden size mismatch"
     assert topk_weights.shape == topk_ids.shape, "topk shape mismatch"
     assert hidden_states.is_contiguous(), "Hidden_states must be contiguous"
     assert w1.is_contiguous(), "Expert weights1 must be contiguous"
@@ -1449,6 +1511,7 @@ def fused_experts_impl(
         invoke_fused_moe_kernel(
             curr_hidden_states,
             w1,
+            b1,
             intermediate_cache1,
             a1_scale,
             w1_scale,
@@ -1470,13 +1533,24 @@ def fused_experts_impl(
             block_shape=block_shape,
         )
         if activation == "silu":
-            if _is_cuda:
+            if activation_alpha is not None:
+                assert swiglu_limit is not None
+                intermediate_cache2 = swiglu_with_alpha_and_limit(
+                    intermediate_cache1.view(-1, N),
+                    activation_alpha,
+                    swiglu_limit,
+                )
+            elif _is_cuda:
                 silu_and_mul(intermediate_cache1.view(-1, N), intermediate_cache2)
             else:
                 vllm_ops.silu_and_mul(
                     intermediate_cache2, intermediate_cache1.view(-1, N)
                 )
         elif activation == "gelu":
+            assert (
+                activation_alpha is None
+            ), "activation_alpha is not supported for gelu"
+            assert swiglu_limit is None, "swiglu_limit is not supported for gelu"
             if _is_cuda:
                 gelu_and_mul(intermediate_cache1.view(-1, N), intermediate_cache2)
             else:
@@ -1489,6 +1563,7 @@ def fused_experts_impl(
         invoke_fused_moe_kernel(
             intermediate_cache2,
             w2,
+            b2,
             (
                 intermediate_cache3
                 if not no_combine and topk_ids.shape[1] != 1
@@ -1567,6 +1642,8 @@ def fused_moe(
     w1: torch.Tensor,
     w2: torch.Tensor,
     topk_output: TopKOutput,
+    b1: Optional[torch.Tensor] = None,
+    b2: Optional[torch.Tensor] = None,
     inplace: bool = False,
     activation: str = "silu",
     apply_router_weight_on_input: bool = False,
@@ -1584,6 +1661,8 @@ def fused_moe(
     block_shape: Optional[List[int]] = None,
     no_combine: bool = False,
     routed_scaling_factor: Optional[float] = None,
+    activation_alpha: Optional[float] = None,
+    swiglu_limit: Optional[float] = None,
 ) -> torch.Tensor:
     """
     This function computes a Mixture of Experts (MoE) layer using two sets of
@@ -1594,6 +1673,8 @@ def fused_moe(
     - w1 (torch.Tensor): The first set of expert weights.
     - w2 (torch.Tensor): The second set of expert weights.
     - topk_output (TopKOutput): The top-k output of the experts.
+    - b1 (Optional[torch.Tensor]): Optional bias for w1.
+    - b2 (Optional[torch.Tensor]): Optional bias for w2.
     - inplace (bool): If True, perform the operation in-place.
       Defaults to False.
     - use_fp8_w8a8 (bool): If True, use fp8 arithmetic to compute the inner
@@ -1615,6 +1696,10 @@ def fused_moe(
       a2.
     - block_shape: (Optional[List[int]]): Optional block size for block-wise
       quantization.
+    - activation_alpha (Optional[float]): Optional alpha for the activation
+      function.
+    - swiglu_limit (Optional[float]): Optional limit for the swiglu activation
+      function.
 
     Returns:
     - torch.Tensor: The output tensor after applying the MoE layer.
@@ -1625,6 +1710,8 @@ def fused_moe(
         w1,
         w2,
         topk_output,
+        b1=b1,
+        b2=b2,
         inplace=inplace,
         activation=activation,
         apply_router_weight_on_input=apply_router_weight_on_input,
@@ -1642,4 +1729,6 @@ def fused_moe(
         block_shape=block_shape,
         no_combine=no_combine,
         routed_scaling_factor=routed_scaling_factor,
+        activation_alpha=activation_alpha,
+        swiglu_limit=swiglu_limit,
     )