sglang 0.4.10.post2__py3-none-any.whl → 0.5.0rc1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (175)
  1. sglang/__init__.py +8 -3
  2. sglang/bench_one_batch.py +119 -17
  3. sglang/lang/chat_template.py +18 -0
  4. sglang/srt/bench_utils.py +137 -0
  5. sglang/srt/configs/model_config.py +42 -7
  6. sglang/srt/conversation.py +9 -5
  7. sglang/srt/disaggregation/base/conn.py +5 -2
  8. sglang/srt/disaggregation/decode.py +14 -4
  9. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +3 -0
  10. sglang/srt/disaggregation/mooncake/conn.py +286 -160
  11. sglang/srt/disaggregation/mooncake/transfer_engine.py +29 -0
  12. sglang/srt/disaggregation/prefill.py +2 -0
  13. sglang/srt/distributed/parallel_state.py +15 -11
  14. sglang/srt/entrypoints/context.py +227 -0
  15. sglang/srt/entrypoints/engine.py +15 -9
  16. sglang/srt/entrypoints/harmony_utils.py +372 -0
  17. sglang/srt/entrypoints/http_server.py +74 -4
  18. sglang/srt/entrypoints/openai/protocol.py +218 -1
  19. sglang/srt/entrypoints/openai/serving_chat.py +41 -11
  20. sglang/srt/entrypoints/openai/serving_responses.py +1273 -0
  21. sglang/srt/entrypoints/openai/tool_server.py +175 -0
  22. sglang/srt/entrypoints/tool.py +87 -0
  23. sglang/srt/eplb/expert_location.py +5 -1
  24. sglang/srt/function_call/ebnf_composer.py +1 -0
  25. sglang/srt/function_call/function_call_parser.py +2 -0
  26. sglang/srt/function_call/glm4_moe_detector.py +1 -1
  27. sglang/srt/function_call/gpt_oss_detector.py +331 -0
  28. sglang/srt/function_call/kimik2_detector.py +3 -3
  29. sglang/srt/function_call/qwen3_coder_detector.py +219 -9
  30. sglang/srt/hf_transformers_utils.py +30 -3
  31. sglang/srt/jinja_template_utils.py +14 -1
  32. sglang/srt/layers/attention/aiter_backend.py +375 -115
  33. sglang/srt/layers/attention/ascend_backend.py +3 -0
  34. sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1700 -0
  35. sglang/srt/layers/attention/flashattention_backend.py +18 -0
  36. sglang/srt/layers/attention/flashinfer_backend.py +52 -13
  37. sglang/srt/layers/attention/hybrid_attn_backend.py +1 -1
  38. sglang/srt/layers/attention/triton_backend.py +85 -14
  39. sglang/srt/layers/attention/triton_ops/decode_attention.py +17 -0
  40. sglang/srt/layers/attention/triton_ops/extend_attention.py +143 -98
  41. sglang/srt/layers/attention/trtllm_mha_backend.py +332 -0
  42. sglang/srt/layers/attention/trtllm_mla_backend.py +119 -22
  43. sglang/srt/layers/attention/vision.py +22 -6
  44. sglang/srt/layers/attention/wave_backend.py +627 -0
  45. sglang/srt/layers/attention/wave_ops/decode_attention.py +186 -0
  46. sglang/srt/layers/attention/wave_ops/extend_attention.py +149 -0
  47. sglang/srt/layers/attention/wave_ops/prefill_attention.py +79 -0
  48. sglang/srt/layers/communicator.py +29 -14
  49. sglang/srt/layers/dp_attention.py +12 -0
  50. sglang/srt/layers/flashinfer_comm_fusion.py +4 -4
  51. sglang/srt/layers/linear.py +3 -7
  52. sglang/srt/layers/moe/cutlass_moe.py +12 -3
  53. sglang/srt/layers/moe/cutlass_w4a8_moe.py +4 -5
  54. sglang/srt/layers/moe/ep_moe/kernels.py +43 -0
  55. sglang/srt/layers/moe/ep_moe/layer.py +135 -73
  56. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  57. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=384,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  58. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +101 -12
  59. sglang/srt/layers/moe/fused_moe_triton/layer.py +412 -33
  60. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +188 -3
  61. sglang/srt/layers/moe/token_dispatcher/deepep.py +61 -24
  62. sglang/srt/layers/moe/topk.py +16 -4
  63. sglang/srt/layers/moe/utils.py +16 -0
  64. sglang/srt/layers/quantization/__init__.py +27 -3
  65. sglang/srt/layers/quantization/fp4.py +557 -0
  66. sglang/srt/layers/quantization/fp8.py +3 -6
  67. sglang/srt/layers/quantization/fp8_kernel.py +277 -0
  68. sglang/srt/layers/quantization/fp8_utils.py +51 -10
  69. sglang/srt/layers/quantization/modelopt_quant.py +258 -68
  70. sglang/srt/layers/quantization/mxfp4.py +654 -0
  71. sglang/srt/layers/quantization/mxfp4_tensor.py +133 -0
  72. sglang/srt/layers/quantization/quark/schemes/__init__.py +6 -0
  73. sglang/srt/layers/quantization/quark/schemes/quark_scheme.py +55 -0
  74. sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +118 -0
  75. sglang/srt/layers/quantization/quark/utils.py +107 -0
  76. sglang/srt/layers/quantization/unquant.py +60 -6
  77. sglang/srt/layers/quantization/w4afp8.py +21 -12
  78. sglang/srt/layers/quantization/w8a8_int8.py +48 -34
  79. sglang/srt/layers/rotary_embedding.py +506 -3
  80. sglang/srt/layers/utils.py +9 -0
  81. sglang/srt/layers/vocab_parallel_embedding.py +8 -3
  82. sglang/srt/lora/backend/base_backend.py +3 -23
  83. sglang/srt/lora/layers.py +60 -114
  84. sglang/srt/lora/lora.py +17 -62
  85. sglang/srt/lora/lora_manager.py +82 -62
  86. sglang/srt/lora/lora_registry.py +23 -11
  87. sglang/srt/lora/mem_pool.py +63 -68
  88. sglang/srt/lora/triton_ops/qkv_lora_b.py +1 -1
  89. sglang/srt/lora/utils.py +25 -58
  90. sglang/srt/managers/cache_controller.py +75 -58
  91. sglang/srt/managers/detokenizer_manager.py +1 -1
  92. sglang/srt/managers/io_struct.py +20 -8
  93. sglang/srt/managers/mm_utils.py +6 -13
  94. sglang/srt/managers/multimodal_processor.py +1 -1
  95. sglang/srt/managers/schedule_batch.py +61 -25
  96. sglang/srt/managers/schedule_policy.py +6 -6
  97. sglang/srt/managers/scheduler.py +41 -19
  98. sglang/srt/managers/scheduler_output_processor_mixin.py +1 -2
  99. sglang/srt/managers/scheduler_profiler_mixin.py +28 -8
  100. sglang/srt/managers/scheduler_recv_skipper.py +37 -0
  101. sglang/srt/managers/scheduler_update_weights_mixin.py +6 -0
  102. sglang/srt/managers/template_manager.py +35 -1
  103. sglang/srt/managers/tokenizer_manager.py +47 -30
  104. sglang/srt/managers/tp_worker.py +3 -0
  105. sglang/srt/managers/tp_worker_overlap_thread.py +3 -0
  106. sglang/srt/mem_cache/allocator.py +61 -87
  107. sglang/srt/mem_cache/hicache_storage.py +1 -1
  108. sglang/srt/mem_cache/hiradix_cache.py +80 -22
  109. sglang/srt/mem_cache/lora_radix_cache.py +421 -0
  110. sglang/srt/mem_cache/memory_pool_host.py +34 -36
  111. sglang/srt/mem_cache/multimodal_cache.py +33 -13
  112. sglang/srt/mem_cache/radix_cache.py +2 -5
  113. sglang/srt/mem_cache/storage/hf3fs/client_hf3fs.py +2 -2
  114. sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +443 -0
  115. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +139 -67
  116. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +6 -9
  117. sglang/srt/model_executor/cuda_graph_runner.py +29 -9
  118. sglang/srt/model_executor/forward_batch_info.py +61 -19
  119. sglang/srt/model_executor/model_runner.py +148 -37
  120. sglang/srt/model_loader/loader.py +18 -6
  121. sglang/srt/model_loader/weight_utils.py +10 -0
  122. sglang/srt/models/bailing_moe.py +425 -0
  123. sglang/srt/models/deepseek_v2.py +137 -59
  124. sglang/srt/models/ernie4.py +426 -0
  125. sglang/srt/models/ernie4_eagle.py +203 -0
  126. sglang/srt/models/gemma2.py +0 -34
  127. sglang/srt/models/gemma3n_mm.py +38 -0
  128. sglang/srt/models/glm4.py +6 -0
  129. sglang/srt/models/glm4_moe.py +28 -16
  130. sglang/srt/models/glm4v.py +589 -0
  131. sglang/srt/models/glm4v_moe.py +400 -0
  132. sglang/srt/models/gpt_oss.py +1251 -0
  133. sglang/srt/models/granite.py +0 -25
  134. sglang/srt/models/llama.py +0 -25
  135. sglang/srt/models/llama4.py +1 -1
  136. sglang/srt/models/qwen2.py +6 -0
  137. sglang/srt/models/qwen2_5_vl.py +7 -3
  138. sglang/srt/models/qwen2_audio.py +10 -9
  139. sglang/srt/models/qwen2_moe.py +6 -0
  140. sglang/srt/models/qwen3.py +0 -24
  141. sglang/srt/models/qwen3_moe.py +32 -6
  142. sglang/srt/models/registry.py +1 -1
  143. sglang/srt/models/step3_vl.py +9 -0
  144. sglang/srt/models/torch_native_llama.py +0 -24
  145. sglang/srt/models/transformers.py +2 -5
  146. sglang/srt/multimodal/processors/base_processor.py +23 -13
  147. sglang/srt/multimodal/processors/glm4v.py +132 -0
  148. sglang/srt/multimodal/processors/qwen_audio.py +4 -2
  149. sglang/srt/multimodal/processors/step3_vl.py +3 -1
  150. sglang/srt/reasoning_parser.py +332 -37
  151. sglang/srt/server_args.py +186 -75
  152. sglang/srt/speculative/eagle_worker.py +16 -0
  153. sglang/srt/two_batch_overlap.py +169 -9
  154. sglang/srt/utils.py +41 -5
  155. sglang/srt/weight_sync/tensor_bucket.py +106 -0
  156. sglang/test/attention/test_trtllm_mla_backend.py +186 -36
  157. sglang/test/doc_patch.py +59 -0
  158. sglang/test/few_shot_gsm8k.py +1 -1
  159. sglang/test/few_shot_gsm8k_engine.py +1 -1
  160. sglang/test/run_eval.py +4 -1
  161. sglang/test/runners.py +2 -2
  162. sglang/test/simple_eval_common.py +6 -0
  163. sglang/test/simple_eval_gpqa.py +2 -0
  164. sglang/test/test_fp4_moe.py +118 -36
  165. sglang/test/test_utils.py +1 -1
  166. sglang/utils.py +1 -1
  167. sglang/version.py +1 -1
  168. {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc1.dist-info}/METADATA +36 -38
  169. {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc1.dist-info}/RECORD +174 -141
  170. sglang/srt/lora/backend/flashinfer_backend.py +0 -131
  171. /sglang/{api.py → lang/api.py} +0 -0
  172. /sglang/{lang/backend → srt/layers/quantization/quark}/__init__.py +0 -0
  173. {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc1.dist-info}/WHEEL +0 -0
  174. {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc1.dist-info}/licenses/LICENSE +0 -0
  175. {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc1.dist-info}/top_level.txt +0 -0
@@ -319,6 +319,7 @@ def fused_moe_kernel(
     # Pointers to matrices
     a_ptr,
     b_ptr,
+    bias_ptr,
     c_ptr,
     a_scale_ptr,
     b_scale_ptr,
@@ -340,6 +341,8 @@ def fused_moe_kernel(
     stride_be,
     stride_bk,
     stride_bn,
+    stride_bias_e,
+    stride_bias_n,
     stride_cm,
     stride_cn,
     stride_asm,
@@ -449,6 +452,10 @@ def fused_moe_kernel(
         + off_experts * stride_be
         + (offs_k[:, None] * stride_bk + offs_bn[None, :] * stride_bn)
     )
+    if bias_ptr is not None:
+        bias = tl.load(
+            bias_ptr + off_experts * stride_bias_e + offs_bn[None, :] * stride_bias_n
+        )
     if use_int8_w8a16:
         b_scale_ptrs = (
             b_scale_ptr + off_experts * stride_bse + offs_bn[None, :] * stride_bsn
@@ -526,18 +533,20 @@ def fused_moe_kernel(
         a_ptrs += BLOCK_SIZE_K * stride_ak
         b_ptrs += BLOCK_SIZE_K * stride_bk

-    if MUL_ROUTED_WEIGHT:
-        moe_weight = tl.load(topk_weights_ptr + offs_token, mask=token_mask, other=0)
-        accumulator = accumulator * moe_weight[:, None]
     if use_int8_w8a16:
-        accumulator = (accumulator * b_scale).to(compute_type)
+        accumulator *= b_scale
     elif use_fp8_w8a8 or use_int8_w8a8:
-        if group_k > 0 and group_n > 0:
-            accumulator = accumulator.to(compute_type)
-        else:
-            accumulator = (accumulator * a_scale * b_scale).to(compute_type)
-    else:
-        accumulator = accumulator.to(compute_type)
+        if group_k == 0 or group_n == 0:
+            accumulator *= a_scale * b_scale
+
+    if bias_ptr is not None:
+        accumulator += bias
+
+    if MUL_ROUTED_WEIGHT:
+        moe_weight = tl.load(topk_weights_ptr + offs_token, mask=token_mask, other=0)
+        accumulator *= moe_weight[:, None]
+
+    accumulator = accumulator.to(compute_type)
     # -----------------------------------------------------------
     # Write back the block of the output
     offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
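Note: the epilogue rewrite above changes the ordering of the final per-tile operations: dequantization scales are applied first, the optional per-expert bias is added next, the routed top-k weight (when MUL_ROUTED_WEIGHT is set) is applied last, and only then is the accumulator cast once to the output dtype, so the bias is also scaled by the routing weight. A minimal PyTorch sketch of that ordering for a single tile follows; the function name and arguments are hypothetical and only illustrate the order, not the kernel itself.

import torch

def epilogue_order_sketch(acc, a_scale=None, b_scale=None, bias=None,
                          moe_weight=None, out_dtype=torch.bfloat16):
    # 1. Dequantize (fp8/int8 per-tensor scale paths) while still in float32.
    if a_scale is not None and b_scale is not None:
        acc = acc * a_scale * b_scale
    # 2. Add the per-expert bias before routing, so the bias is scaled below too.
    if bias is not None:
        acc = acc + bias
    # 3. Apply the routed top-k weight per token row.
    if moe_weight is not None:
        acc = acc * moe_weight[:, None]
    # 4. Single cast to the output dtype at the very end.
    return acc.to(out_dtype)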
@@ -622,6 +631,7 @@ def moe_align_block_size(
 def invoke_fused_moe_kernel(
     A: torch.Tensor,
     B: torch.Tensor,
+    bias: Optional[torch.Tensor],
     C: torch.Tensor,
     A_scale: Optional[torch.Tensor],
     B_scale: Optional[torch.Tensor],
@@ -711,6 +721,7 @@ def invoke_fused_moe_kernel(
     ):
         assert B_scale is not None and B_scale.ndim == 3
         assert B_zp is None or B_zp.ndim == 3
+        assert bias is None
         fused_moe_kernel_gptq_awq[grid](
             A,
             B,
@@ -754,6 +765,7 @@ def invoke_fused_moe_kernel(
         fused_moe_kernel[grid](
             A,
             B,
+            bias,
             C,
             A_scale,
             B_scale,
@@ -770,6 +782,8 @@ def invoke_fused_moe_kernel(
             B.stride(0),
             B.stride(2),
             B.stride(1),
+            bias.stride(0) if bias is not None else 0,
+            bias.stride(1) if bias is not None else 0,
             C.stride(1),
             C.stride(2),
             A_scale.stride(0) if A_scale is not None and A_scale.ndim == 2 else 0,
@@ -994,6 +1008,8 @@ def inplace_fused_experts(
     w2: torch.Tensor,
     topk_weights: torch.Tensor,
     topk_ids: torch.Tensor,
+    b1: Optional[torch.Tensor] = None,
+    b2: Optional[torch.Tensor] = None,
     activation: str = "silu",
     apply_router_weight_on_input: bool = False,
     use_fp8_w8a8: bool = False,
@@ -1009,6 +1025,8 @@ def inplace_fused_experts(
     a2_scale: Optional[torch.Tensor] = None,
     block_shape: Optional[List[int]] = None,
     routed_scaling_factor: Optional[float] = None,
+    activation_alpha: Optional[float] = None,
+    swiglu_limit: Optional[float] = None,
 ) -> None:
     fused_experts_impl(
         hidden_states,
@@ -1016,6 +1034,8 @@ def inplace_fused_experts(
         w2,
         topk_weights,
         topk_ids,
+        b1,
+        b2,
         True,
         activation,
         apply_router_weight_on_input,
@@ -1033,6 +1053,8 @@ def inplace_fused_experts(
         block_shape,
         False,
         routed_scaling_factor,
+        activation_alpha,
+        swiglu_limit,
     )


@@ -1042,6 +1064,8 @@ def inplace_fused_experts_fake(
     w2: torch.Tensor,
     topk_weights: torch.Tensor,
     topk_ids: torch.Tensor,
+    b1: Optional[torch.Tensor] = None,
+    b2: Optional[torch.Tensor] = None,
     activation: str = "silu",
     apply_router_weight_on_input: bool = False,
     use_fp8_w8a8: bool = False,
@@ -1057,6 +1081,8 @@ def inplace_fused_experts_fake(
     a2_scale: Optional[torch.Tensor] = None,
     block_shape: Optional[List[int]] = None,
     routed_scaling_factor: Optional[float] = None,
+    activation_alpha: Optional[float] = None,
+    swiglu_limit: Optional[float] = None,
 ) -> None:
     pass

@@ -1075,6 +1101,8 @@ def outplace_fused_experts(
     w2: torch.Tensor,
     topk_weights: torch.Tensor,
     topk_ids: torch.Tensor,
+    b1: Optional[torch.Tensor] = None,
+    b2: Optional[torch.Tensor] = None,
     activation: str = "silu",
     apply_router_weight_on_input: bool = False,
     use_fp8_w8a8: bool = False,
@@ -1091,6 +1119,8 @@ def outplace_fused_experts(
     block_shape: Optional[List[int]] = None,
     no_combine: bool = False,
     routed_scaling_factor: Optional[float] = None,
+    activation_alpha: Optional[float] = None,
+    swiglu_limit: Optional[float] = None,
 ) -> torch.Tensor:
     return fused_experts_impl(
         hidden_states,
@@ -1098,6 +1128,8 @@ def outplace_fused_experts(
         w2,
         topk_weights,
         topk_ids,
+        b1,
+        b2,
         False,
         activation,
         apply_router_weight_on_input,
@@ -1115,6 +1147,8 @@ def outplace_fused_experts(
         block_shape,
         no_combine=no_combine,
         routed_scaling_factor=routed_scaling_factor,
+        activation_alpha=activation_alpha,
+        swiglu_limit=swiglu_limit,
     )


@@ -1124,6 +1158,8 @@ def outplace_fused_experts_fake(
     w2: torch.Tensor,
     topk_weights: torch.Tensor,
     topk_ids: torch.Tensor,
+    b1: Optional[torch.Tensor] = None,
+    b2: Optional[torch.Tensor] = None,
     activation: str = "silu",
     apply_router_weight_on_input: bool = False,
     use_fp8_w8a8: bool = False,
@@ -1140,6 +1176,8 @@ def outplace_fused_experts_fake(
     block_shape: Optional[List[int]] = None,
     no_combine: bool = False,
     routed_scaling_factor: Optional[float] = None,
+    activation_alpha: Optional[float] = None,
+    swiglu_limit: Optional[float] = None,
 ) -> torch.Tensor:
     return torch.empty_like(hidden_states)

@@ -1157,6 +1195,8 @@ def fused_experts(
     w1: torch.Tensor,
     w2: torch.Tensor,
     topk_output: TopKOutput,
+    b1: Optional[torch.Tensor] = None,
+    b2: Optional[torch.Tensor] = None,
     inplace: bool = False,
     activation: str = "silu",
     apply_router_weight_on_input: bool = False,
@@ -1174,6 +1214,8 @@ def fused_experts(
     block_shape: Optional[List[int]] = None,
     no_combine: bool = False,
     routed_scaling_factor: Optional[float] = None,
+    activation_alpha: Optional[float] = None,
+    swiglu_limit: Optional[float] = None,
 ):
     topk_weights, topk_ids, _ = topk_output
     if inplace:
@@ -1184,6 +1226,8 @@ def fused_experts(
             w2,
             topk_weights,
             topk_ids,
+            b1,
+            b2,
             activation,
             apply_router_weight_on_input,
             use_fp8_w8a8,
@@ -1199,6 +1243,8 @@ def fused_experts(
             a2_scale,
             block_shape,
             routed_scaling_factor,
+            activation_alpha,
+            swiglu_limit,
         )
         return hidden_states
     else:
@@ -1208,6 +1254,8 @@ def fused_experts(
             w2,
             topk_weights,
             topk_ids,
+            b1,
+            b2,
             activation,
             apply_router_weight_on_input,
             use_fp8_w8a8,
@@ -1224,6 +1272,8 @@ def fused_experts(
             block_shape,
             no_combine=no_combine,
             routed_scaling_factor=routed_scaling_factor,
+            activation_alpha=activation_alpha,
+            swiglu_limit=swiglu_limit,
         )

@@ -1319,12 +1369,22 @@ def moe_sum_reduce_torch_compile(x, out, routed_scaling_factor):
     out.mul_(routed_scaling_factor)


+@torch.compile
+def swiglu_with_alpha_and_limit(x, alpha, limit):
+    gate, up = x[..., ::2], x[..., 1::2]
+    gate = gate.clamp(min=None, max=limit)
+    up = up.clamp(min=-limit, max=limit)
+    return gate * torch.sigmoid(gate * alpha) * (up + 1)
+
+
 def fused_experts_impl(
     hidden_states: torch.Tensor,
     w1: torch.Tensor,
     w2: torch.Tensor,
     topk_weights: torch.Tensor,
     topk_ids: torch.Tensor,
+    b1: Optional[torch.Tensor] = None,
+    b2: Optional[torch.Tensor] = None,
     inplace: bool = False,
     activation: str = "silu",
     apply_router_weight_on_input: bool = False,
@@ -1342,6 +1402,8 @@ def fused_experts_impl(
     block_shape: Optional[List[int]] = None,
     no_combine: bool = False,
     routed_scaling_factor: Optional[float] = None,
+    activation_alpha: Optional[float] = None,
+    swiglu_limit: Optional[float] = None,
 ):
     padded_size = padding_size
     if not (use_fp8_w8a8 or use_int8_w8a8) or block_shape is not None or _use_aiter:
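Note: the new swiglu_with_alpha_and_limit helper expects the gate and up projections interleaved along the last dimension (even columns are gate, odd columns are up), clamps gate from above and up on both sides, and returns gate * sigmoid(alpha * gate) * (up + 1). A small self-contained sketch of that computation; the shapes and the alpha/limit values below are illustrative assumptions, not values shipped in this release.

import torch

# 4 tokens, intermediate size 8, so the fused w1 output has 16 interleaved columns.
x = torch.randn(4, 16)
alpha, limit = 1.702, 7.0  # illustrative values only

gate, up = x[..., ::2], x[..., 1::2]   # de-interleave gate/up columns
gate = gate.clamp(max=limit)           # clamp gate from above only
up = up.clamp(min=-limit, max=limit)   # clamp up symmetrically
out = gate * torch.sigmoid(gate * alpha) * (up + 1)
print(out.shape)  # torch.Size([4, 8]) -- half the input width, like silu_and_mul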
@@ -1353,7 +1415,7 @@ def fused_experts_impl(
     else:
         assert (
             hidden_states.shape[1] == w1.shape[2] - padded_size
-        ), "Hidden size mismatch"
+        ), f"Hidden size mismatch"
     assert topk_weights.shape == topk_ids.shape, "topk shape mismatch"
     assert hidden_states.is_contiguous(), "Hidden_states must be contiguous"
     assert w1.is_contiguous(), "Expert weights1 must be contiguous"
@@ -1449,6 +1511,7 @@ def fused_experts_impl(
         invoke_fused_moe_kernel(
             curr_hidden_states,
             w1,
+            b1,
             intermediate_cache1,
             a1_scale,
             w1_scale,
@@ -1470,13 +1533,24 @@ def fused_experts_impl(
             block_shape=block_shape,
         )
         if activation == "silu":
-            if _is_cuda:
+            if activation_alpha is not None:
+                assert swiglu_limit is not None
+                intermediate_cache2 = swiglu_with_alpha_and_limit(
+                    intermediate_cache1.view(-1, N),
+                    activation_alpha,
+                    swiglu_limit,
+                )
+            elif _is_cuda:
                 silu_and_mul(intermediate_cache1.view(-1, N), intermediate_cache2)
             else:
                 vllm_ops.silu_and_mul(
                     intermediate_cache2, intermediate_cache1.view(-1, N)
                 )
         elif activation == "gelu":
+            assert (
+                activation_alpha is None
+            ), "activation_alpha is not supported for gelu"
+            assert swiglu_limit is None, "swiglu_limit is not supported for gelu"
             if _is_cuda:
                 gelu_and_mul(intermediate_cache1.view(-1, N), intermediate_cache2)
             else:
@@ -1489,6 +1563,7 @@ def fused_experts_impl(
         invoke_fused_moe_kernel(
             intermediate_cache2,
             w2,
+            b2,
             (
                 intermediate_cache3
                 if not no_combine and topk_ids.shape[1] != 1
@@ -1567,6 +1642,8 @@ def fused_moe(
     w1: torch.Tensor,
     w2: torch.Tensor,
     topk_output: TopKOutput,
+    b1: Optional[torch.Tensor] = None,
+    b2: Optional[torch.Tensor] = None,
     inplace: bool = False,
     activation: str = "silu",
     apply_router_weight_on_input: bool = False,
@@ -1584,6 +1661,8 @@ def fused_moe(
     block_shape: Optional[List[int]] = None,
     no_combine: bool = False,
     routed_scaling_factor: Optional[float] = None,
+    activation_alpha: Optional[float] = None,
+    swiglu_limit: Optional[float] = None,
 ) -> torch.Tensor:
     """
     This function computes a Mixture of Experts (MoE) layer using two sets of
@@ -1594,6 +1673,8 @@ def fused_moe(
     - w1 (torch.Tensor): The first set of expert weights.
     - w2 (torch.Tensor): The second set of expert weights.
     - topk_output (TopKOutput): The top-k output of the experts.
+    - b1 (Optional[torch.Tensor]): Optional bias for w1.
+    - b2 (Optional[torch.Tensor]): Optional bias for w2.
     - inplace (bool): If True, perform the operation in-place.
         Defaults to False.
     - use_fp8_w8a8 (bool): If True, use fp8 arithmetic to compute the inner
@@ -1615,6 +1696,10 @@ def fused_moe(
         a2.
     - block_shape: (Optional[List[int]]): Optional block size for block-wise
         quantization.
+    - activation_alpha (Optional[float]): Optional alpha for the activation
+        function.
+    - swiglu_limit (Optional[float]): Optional limit for the swiglu activation
+        function.

     Returns:
     - torch.Tensor: The output tensor after applying the MoE layer.
@@ -1625,6 +1710,8 @@ def fused_moe(
         w1,
         w2,
         topk_output,
+        b1=b1,
+        b2=b2,
         inplace=inplace,
         activation=activation,
         apply_router_weight_on_input=apply_router_weight_on_input,
@@ -1642,4 +1729,6 @@ def fused_moe(
         block_shape=block_shape,
         no_combine=no_combine,
         routed_scaling_factor=routed_scaling_factor,
+        activation_alpha=activation_alpha,
+        swiglu_limit=swiglu_limit,
    )
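Note: taken together, these changes let callers thread per-expert biases and the clamped-SwiGLU parameters through the public fused_moe entry point. A hedged call sketch follows; the tensor shapes are assumptions inferred from the kernel's per-expert, per-output-column bias indexing (b1 matching w1's output dimension, b2 matching w2's), and the alpha/limit values are placeholders that would come from a model config in practice.

# Assumed shapes for E experts, hidden size H, intermediate size I:
#   w1: [E, 2 * I, H], b1: [E, 2 * I]
#   w2: [E, H, I],     b2: [E, H]
output = fused_moe(
    hidden_states,           # [num_tokens, H]
    w1,
    w2,
    topk_output,             # TopKOutput from the existing top-k routing step
    b1=b1,
    b2=b2,
    activation="silu",       # the alpha/limit variant is only wired into the silu path
    activation_alpha=1.702,  # placeholder value
    swiglu_limit=7.0,        # placeholder value
)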