sglang 0.5.4.post1__py3-none-any.whl → 0.5.4.post2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (150)
  1. sglang/bench_one_batch.py +149 -34
  2. sglang/bench_serving.py +18 -3
  3. sglang/compile_deep_gemm.py +13 -7
  4. sglang/srt/batch_invariant_ops/__init__.py +2 -0
  5. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +120 -0
  6. sglang/srt/checkpoint_engine/__init__.py +9 -0
  7. sglang/srt/checkpoint_engine/update.py +317 -0
  8. sglang/srt/configs/__init__.py +2 -0
  9. sglang/srt/configs/deepseek_ocr.py +542 -10
  10. sglang/srt/configs/deepseekvl2.py +95 -194
  11. sglang/srt/configs/kimi_linear.py +160 -0
  12. sglang/srt/configs/mamba_utils.py +66 -0
  13. sglang/srt/configs/model_config.py +25 -2
  14. sglang/srt/constants.py +7 -0
  15. sglang/srt/debug_utils/tensor_dump_forward_hook.py +149 -0
  16. sglang/srt/disaggregation/decode.py +34 -6
  17. sglang/srt/disaggregation/nixl/conn.py +2 -2
  18. sglang/srt/disaggregation/prefill.py +25 -3
  19. sglang/srt/distributed/device_communicators/custom_all_reduce.py +3 -1
  20. sglang/srt/distributed/parallel_state.py +9 -5
  21. sglang/srt/entrypoints/engine.py +13 -5
  22. sglang/srt/entrypoints/http_server.py +22 -3
  23. sglang/srt/entrypoints/openai/protocol.py +7 -1
  24. sglang/srt/entrypoints/openai/serving_chat.py +42 -0
  25. sglang/srt/entrypoints/openai/serving_completions.py +10 -0
  26. sglang/srt/entrypoints/openai/serving_embedding.py +1 -0
  27. sglang/srt/environ.py +7 -0
  28. sglang/srt/eplb/expert_distribution.py +34 -1
  29. sglang/srt/eplb/expert_location.py +106 -36
  30. sglang/srt/grpc/compile_proto.py +3 -0
  31. sglang/srt/layers/attention/ascend_backend.py +233 -5
  32. sglang/srt/layers/attention/attention_registry.py +3 -0
  33. sglang/srt/layers/attention/fla/chunk_delta_h.py +61 -32
  34. sglang/srt/layers/attention/fla/fused_recurrent.py +17 -4
  35. sglang/srt/layers/attention/fla/kda.py +1359 -0
  36. sglang/srt/layers/attention/fla/layernorm_gated.py +7 -1
  37. sglang/srt/layers/attention/flashattention_backend.py +7 -6
  38. sglang/srt/layers/attention/flashinfer_mla_backend.py +3 -1
  39. sglang/srt/layers/attention/flashmla_backend.py +1 -1
  40. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +223 -0
  41. sglang/srt/layers/attention/mamba/mamba.py +20 -11
  42. sglang/srt/layers/attention/nsa/dequant_k_cache.py +138 -6
  43. sglang/srt/layers/attention/nsa/nsa_indexer.py +45 -22
  44. sglang/srt/layers/attention/nsa/quant_k_cache.py +44 -12
  45. sglang/srt/layers/attention/nsa/transform_index.py +1 -1
  46. sglang/srt/layers/attention/nsa_backend.py +157 -23
  47. sglang/srt/layers/attention/triton_backend.py +4 -1
  48. sglang/srt/layers/attention/trtllm_mha_backend.py +10 -4
  49. sglang/srt/layers/attention/trtllm_mla_backend.py +10 -2
  50. sglang/srt/layers/communicator.py +23 -1
  51. sglang/srt/layers/layernorm.py +16 -2
  52. sglang/srt/layers/logits_processor.py +4 -20
  53. sglang/srt/layers/moe/ep_moe/layer.py +0 -18
  54. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  55. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128]_down.json +164 -0
  56. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +68 -22
  57. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +43 -3
  58. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +106 -26
  59. sglang/srt/layers/moe/moe_runner/deep_gemm.py +53 -33
  60. sglang/srt/layers/moe/token_dispatcher/deepep.py +12 -9
  61. sglang/srt/layers/moe/topk.py +31 -6
  62. sglang/srt/layers/pooler.py +21 -2
  63. sglang/srt/layers/quantization/__init__.py +9 -78
  64. sglang/srt/layers/quantization/auto_round.py +394 -0
  65. sglang/srt/layers/quantization/fp8_kernel.py +1 -1
  66. sglang/srt/layers/quantization/fp8_utils.py +2 -2
  67. sglang/srt/layers/quantization/modelopt_quant.py +168 -11
  68. sglang/srt/layers/rotary_embedding.py +117 -45
  69. sglang/srt/lora/lora_registry.py +9 -0
  70. sglang/srt/managers/async_mm_data_processor.py +122 -0
  71. sglang/srt/managers/data_parallel_controller.py +30 -3
  72. sglang/srt/managers/detokenizer_manager.py +3 -0
  73. sglang/srt/managers/io_struct.py +26 -4
  74. sglang/srt/managers/multi_tokenizer_mixin.py +5 -0
  75. sglang/srt/managers/schedule_batch.py +74 -15
  76. sglang/srt/managers/scheduler.py +164 -129
  77. sglang/srt/managers/scheduler_output_processor_mixin.py +40 -3
  78. sglang/srt/managers/scheduler_pp_mixin.py +7 -2
  79. sglang/srt/managers/scheduler_runtime_checker_mixin.py +45 -0
  80. sglang/srt/managers/scheduler_update_weights_mixin.py +18 -3
  81. sglang/srt/managers/session_controller.py +6 -5
  82. sglang/srt/managers/tokenizer_manager.py +154 -59
  83. sglang/srt/managers/tp_worker.py +24 -1
  84. sglang/srt/mem_cache/base_prefix_cache.py +23 -4
  85. sglang/srt/mem_cache/common.py +1 -0
  86. sglang/srt/mem_cache/memory_pool.py +171 -57
  87. sglang/srt/mem_cache/memory_pool_host.py +12 -5
  88. sglang/srt/mem_cache/radix_cache.py +4 -0
  89. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +1 -1
  90. sglang/srt/metrics/collector.py +46 -3
  91. sglang/srt/model_executor/cuda_graph_runner.py +15 -3
  92. sglang/srt/model_executor/forward_batch_info.py +11 -11
  93. sglang/srt/model_executor/model_runner.py +76 -21
  94. sglang/srt/model_executor/npu_graph_runner.py +7 -3
  95. sglang/srt/model_loader/weight_utils.py +1 -1
  96. sglang/srt/models/bailing_moe.py +9 -2
  97. sglang/srt/models/deepseek_nextn.py +11 -2
  98. sglang/srt/models/deepseek_v2.py +149 -34
  99. sglang/srt/models/glm4.py +391 -77
  100. sglang/srt/models/glm4v.py +196 -55
  101. sglang/srt/models/glm4v_moe.py +0 -1
  102. sglang/srt/models/gpt_oss.py +1 -10
  103. sglang/srt/models/kimi_linear.py +678 -0
  104. sglang/srt/models/llama4.py +1 -1
  105. sglang/srt/models/llama_eagle3.py +11 -1
  106. sglang/srt/models/longcat_flash.py +2 -2
  107. sglang/srt/models/minimax_m2.py +1 -1
  108. sglang/srt/models/qwen2.py +1 -1
  109. sglang/srt/models/qwen2_moe.py +30 -15
  110. sglang/srt/models/qwen3.py +1 -1
  111. sglang/srt/models/qwen3_moe.py +16 -8
  112. sglang/srt/models/qwen3_next.py +7 -0
  113. sglang/srt/multimodal/customized_mm_processor_utils.py +35 -0
  114. sglang/srt/multiplex/multiplexing_mixin.py +209 -0
  115. sglang/srt/multiplex/pdmux_context.py +164 -0
  116. sglang/srt/parser/conversation.py +7 -1
  117. sglang/srt/sampling/custom_logit_processor.py +67 -1
  118. sglang/srt/sampling/penaltylib/frequency_penalty.py +6 -8
  119. sglang/srt/sampling/penaltylib/min_new_tokens.py +7 -8
  120. sglang/srt/sampling/penaltylib/orchestrator.py +43 -3
  121. sglang/srt/sampling/penaltylib/presence_penalty.py +6 -8
  122. sglang/srt/server_args.py +103 -22
  123. sglang/srt/single_batch_overlap.py +4 -1
  124. sglang/srt/speculative/draft_utils.py +16 -0
  125. sglang/srt/speculative/eagle_info.py +42 -36
  126. sglang/srt/speculative/eagle_info_v2.py +68 -25
  127. sglang/srt/speculative/eagle_utils.py +261 -16
  128. sglang/srt/speculative/eagle_worker.py +11 -3
  129. sglang/srt/speculative/eagle_worker_v2.py +15 -9
  130. sglang/srt/speculative/spec_info.py +305 -31
  131. sglang/srt/speculative/spec_utils.py +44 -8
  132. sglang/srt/tracing/trace.py +121 -12
  133. sglang/srt/utils/common.py +55 -32
  134. sglang/srt/utils/hf_transformers_utils.py +38 -16
  135. sglang/srt/utils/torch_memory_saver_adapter.py +20 -0
  136. sglang/test/kits/radix_cache_server_kit.py +50 -0
  137. sglang/test/runners.py +31 -7
  138. sglang/test/simple_eval_common.py +5 -3
  139. sglang/test/simple_eval_humaneval.py +1 -0
  140. sglang/test/simple_eval_math.py +1 -0
  141. sglang/test/simple_eval_mmlu.py +1 -0
  142. sglang/test/simple_eval_mmmu_vlm.py +1 -0
  143. sglang/test/test_utils.py +7 -1
  144. sglang/version.py +1 -1
  145. {sglang-0.5.4.post1.dist-info → sglang-0.5.4.post2.dist-info}/METADATA +10 -24
  146. {sglang-0.5.4.post1.dist-info → sglang-0.5.4.post2.dist-info}/RECORD +150 -136
  147. /sglang/test/{kit_matched_stop.py → kits/matched_stop_kit.py} +0 -0
  148. {sglang-0.5.4.post1.dist-info → sglang-0.5.4.post2.dist-info}/WHEEL +0 -0
  149. {sglang-0.5.4.post1.dist-info → sglang-0.5.4.post2.dist-info}/licenses/LICENSE +0 -0
  150. {sglang-0.5.4.post1.dist-info → sglang-0.5.4.post2.dist-info}/top_level.txt +0 -0
@@ -25,6 +25,13 @@ from sglang.srt.utils import (
     is_hip,
 )
 
+try:
+    from triton.tools.tensor_descriptor import TensorDescriptor
+
+    _support_tensor_descriptor = True
+except:
+    _support_tensor_descriptor = False
+
 _is_hip = is_hip()
 _is_cuda = is_cuda()
 _is_cpu_amx_available = cpu_has_amx_support()
@@ -41,6 +48,10 @@ elif _is_hip:
     padding_size = 128 if bool(int(os.getenv("SGLANG_MOE_PADDING", "0"))) else 0
 
 
+def support_tensor_descriptor():
+    return _support_tensor_descriptor
+
+
 @triton.jit
 def write_zeros_to_output(
     c_ptr,
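
A minimal call-site sketch for the probe above, assuming these hunks come from sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py (the import path below is an assumption, not taken from the diff):

from sglang.srt.layers.moe.fused_moe_triton.fused_moe_triton_kernels import (
    support_tensor_descriptor,  # assumed import path; see note above
)

# Gate the new TMA arguments on the probe; callers fall back to the classic
# pointer-arithmetic path on Triton builds without tensor descriptors.
a_use_tma = b_use_tma = support_tensor_descriptor()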
@@ -108,6 +119,7 @@ def fused_moe_kernel_gptq_awq(
     use_int4_w4a16: tl.constexpr,
     use_int8_w8a16: tl.constexpr,
     even_Ks: tl.constexpr,
+    filter_expert: tl.constexpr,
 ):
     """
     Implements the fused computation for a Mixture of Experts (MOE) using
@@ -161,7 +173,7 @@ def fused_moe_kernel_gptq_awq(
     token_mask = offs_token < num_valid_tokens
 
     off_experts = tl.load(expert_ids_ptr + pid_m).to(tl.int64)
-    if off_experts == -1:
+    if filter_expert and off_experts == -1:
         # -----------------------------------------------------------
         # Write back zeros to the output when the expert is not
         # in the current expert parallel rank.
@@ -296,7 +308,9 @@ def fused_moe_kernel_gptq_awq(
 def fused_moe_kernel(
     # Pointers to matrices
     a_ptr,
+    a_desc,
     b_ptr,
+    b_desc,
     bias_ptr,
     c_ptr,
     a_scale_ptr,
@@ -344,6 +358,8 @@ def fused_moe_kernel(
     use_int8_w8a16: tl.constexpr,
     per_channel_quant: tl.constexpr,
     even_Ks: tl.constexpr,
+    c_sorted: tl.constexpr,
+    filter_expert: tl.constexpr,
 ):
     """
     Implements the fused computation for a Mixture of Experts (MOE) using
@@ -399,9 +415,10 @@ def fused_moe_kernel(
     offs_token = offs_token.to(tl.int64)
     token_mask = offs_token < num_valid_tokens
 
-    off_experts = tl.load(expert_ids_ptr + pid_m).to(tl.int64)
+    off_experts_i32 = tl.load(expert_ids_ptr + pid_m)
+    off_experts = off_experts_i32.to(tl.int64)
 
-    if off_experts == -1:
+    if filter_expert and off_experts == -1:
         # -----------------------------------------------------------
         # Write back zeros to the output when the expert is not
         # in the current expert parallel rank.
@@ -421,15 +438,23 @@ def fused_moe_kernel(
 
     offs_bn = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N).to(tl.int64)) % N
     offs_k = tl.arange(0, BLOCK_SIZE_K)
-    a_ptrs = a_ptr + (
-        offs_token[:, None] // top_k * stride_am + offs_k[None, :] * stride_ak
-    )
+    if a_desc is not None:
+        assert use_fp8_w8a8 and group_n > 0 and group_k > 0
+        start_offs_m = pid_m * BLOCK_SIZE_M
+    else:
+        a_ptrs = a_ptr + (
+            offs_token[:, None] // top_k * stride_am + offs_k[None, :] * stride_ak
+        )
+
+    if b_desc is not None:
+        start_offs_n = pid_n * BLOCK_SIZE_N
+    else:
+        b_ptrs = (
+            b_ptr
+            + off_experts * stride_be
+            + (offs_k[:, None] * stride_bk + offs_bn[None, :] * stride_bn)
+        )
 
-    b_ptrs = (
-        b_ptr
-        + off_experts * stride_be
-        + (offs_k[:, None] * stride_bk + offs_bn[None, :] * stride_bn)
-    )
 
     if bias_ptr is not None:
         bias = tl.load(
@@ -443,8 +468,14 @@ def fused_moe_kernel(
     if use_fp8_w8a8 or use_int8_w8a8:
         # block-wise
         if group_k > 0 and group_n > 0:
-            a_scale_ptrs = a_scale_ptr + (offs_token // top_k) * stride_asm
-            offs_bsn = offs_bn // group_n
+            if a_desc is not None:
+                a_scale_ptrs = a_scale_ptr + offs_token_id * stride_asm
+            else:
+                a_scale_ptrs = a_scale_ptr + (offs_token // top_k) * stride_asm
+            if BLOCK_SIZE_N > group_n:
+                offs_bsn = offs_bn // group_n
+            else:
+                offs_bsn = pid_n * BLOCK_SIZE_N // group_n
             b_scale_ptrs = (
                 b_scale_ptr + off_experts * stride_bse + offs_bsn * stride_bsn
             )
@@ -469,37 +500,49 @@ def fused_moe_kernel(
     # `accumulator` will be converted back to fp16 after the loop.
     accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
 
-    for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)):
+    for k_start in range(0, K, BLOCK_SIZE_K):
         # Load the next block of A and B, generate a mask by checking the
         # K dimension.
-        if even_Ks:
+        if a_desc is not None:
+            a = a_desc.load([start_offs_m, k_start])
+        elif even_Ks:
             a = tl.load(
                 a_ptrs,
                 mask=token_mask[:, None],
                 other=0.0,
             )
-            b = tl.load(b_ptrs)
         else:
             a = tl.load(
                 a_ptrs,
-                mask=token_mask[:, None] & (offs_k[None, :] < K - k * BLOCK_SIZE_K),
+                mask=token_mask[:, None] & (offs_k[None, :] < K - k_start),
                 other=0.0,
             )
-            b = tl.load(b_ptrs, mask=offs_k[:, None] < K - k * BLOCK_SIZE_K, other=0.0)
+
+        if b_desc is not None:
+            b = (
+                b_desc.load([off_experts_i32, start_offs_n, k_start])
+                .reshape(BLOCK_SIZE_N, BLOCK_SIZE_K)
+                .T
+            )
+        elif even_Ks:
+            b = tl.load(b_ptrs)
+        else:
+            b = tl.load(b_ptrs, mask=offs_k[:, None] < K - k_start, other=0.0)
 
         # We accumulate along the K dimension.
         if use_int8_w8a16:
             accumulator = tl.dot(a, b.to(compute_type), acc=accumulator)
         elif use_fp8_w8a8 or use_int8_w8a8:
             if group_k > 0 and group_n > 0:
-                k_start = k * BLOCK_SIZE_K
                 offs_ks = k_start // group_k
                 a_scale = tl.load(
                     a_scale_ptrs + offs_ks * stride_ask, mask=token_mask, other=0.0
                 )
                 b_scale = tl.load(b_scale_ptrs + offs_ks * stride_bsk)
-
-                accumulator += tl.dot(a, b) * a_scale[:, None] * b_scale[None, :]
+                if BLOCK_SIZE_N > group_n:
+                    accumulator += tl.dot(a, b) * a_scale[:, None] * b_scale[None, :]
+                else:
+                    accumulator += tl.dot(a, b) * (a_scale[:, None] * b_scale)
             else:
                 if use_fp8_w8a8:
                     accumulator = tl.dot(a, b, acc=accumulator)
@@ -508,8 +551,10 @@ def fused_moe_kernel(
         else:
             accumulator += tl.dot(a, b)
         # Advance the ptrs to the next K block.
-        a_ptrs += BLOCK_SIZE_K * stride_ak
-        b_ptrs += BLOCK_SIZE_K * stride_bk
+        if a_desc is None:
+            a_ptrs += BLOCK_SIZE_K * stride_ak
+        if b_desc is None:
+            b_ptrs += BLOCK_SIZE_K * stride_bk
 
     if use_int8_w8a16:
         accumulator *= b_scale
@@ -528,7 +573,12 @@ def fused_moe_kernel(
     # -----------------------------------------------------------
     # Write back the block of the output
    offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)
-    c_ptrs = c_ptr + stride_cm * offs_token[:, None] + stride_cn * offs_cn[None, :]
+    if c_sorted:
+        c_ptrs = (
+            c_ptr + stride_cm * offs_token_id[:, None] + stride_cn * offs_cn[None, :]
+        )
+    else:
+        c_ptrs = c_ptr + stride_cm * offs_token[:, None] + stride_cn * offs_cn[None, :]
     c_mask = token_mask[:, None] & (offs_cn[None, :] < N)
     tl.store(c_ptrs, accumulator, mask=c_mask)
 
@@ -557,6 +607,10 @@ def invoke_fused_moe_kernel(
     per_channel_quant: bool,
     block_shape: Optional[List[int]] = None,
     no_combine: bool = False,
+    a_use_tma: bool = False,
+    b_use_tma: bool = False,
+    c_sorted: bool = False,
+    filter_expert: bool = True,
 ) -> None:
     assert topk_weights.stride(1) == 1
     assert sorted_token_ids.stride(0) == 1
@@ -662,14 +716,38 @@ def invoke_fused_moe_kernel(
             use_int4_w4a16=use_int4_w4a16,
             use_int8_w8a16=use_int8_w8a16,
             even_Ks=even_Ks,
+            filter_expert=filter_expert,
             **config,
         )
 
     else:
+        if a_use_tma or b_use_tma:
+            # TMA descriptors require a global memory allocation
+            def alloc_fn(size: int, alignment: int, stream: Optional[int]):
+                return torch.empty(size, device="cuda", dtype=torch.int8)
+
+            triton.set_allocator(alloc_fn)
+        if a_use_tma:
+            a_desc = TensorDescriptor(
+                A, A.shape, A.stride(), [config["BLOCK_SIZE_M"], config["BLOCK_SIZE_K"]]
+            )
+        else:
+            a_desc = None
+        if b_use_tma:
+            b_desc = TensorDescriptor(
+                B,
+                B.shape,
+                B.stride(),
+                [1, config["BLOCK_SIZE_N"], config["BLOCK_SIZE_K"]],
+            )
+        else:
+            b_desc = None
 
         fused_moe_kernel[grid](
             A,
+            a_desc,
             B,
+            b_desc,
             bias,
             C,
             A_scale,
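
A minimal, self-contained sketch (not sglang code) of the host-side tensor-descriptor pattern the block above relies on. It assumes a Triton build that exposes triton.tools.tensor_descriptor and a GPU with TMA support (Hopper or newer); all names below are illustrative:

import torch
import triton
import triton.language as tl
from triton.tools.tensor_descriptor import TensorDescriptor


@triton.jit
def copy_tile_kernel(in_desc, out_ptr, BLOCK_M: tl.constexpr, BLOCK_K: tl.constexpr):
    # Load one (BLOCK_M, BLOCK_K) tile through the TMA descriptor and store it back.
    tile = in_desc.load([0, 0])
    offs_m = tl.arange(0, BLOCK_M)[:, None]
    offs_k = tl.arange(0, BLOCK_K)[None, :]
    tl.store(out_ptr + offs_m * BLOCK_K + offs_k, tile)


# TMA descriptors need a global scratch allocator, same as in the diff above.
triton.set_allocator(
    lambda size, alignment, stream: torch.empty(size, device="cuda", dtype=torch.int8)
)

x = torch.randn(64, 64, device="cuda", dtype=torch.float32)
out = torch.empty_like(x)
in_desc = TensorDescriptor(x, x.shape, x.stride(), [64, 64])
copy_tile_kernel[(1,)](in_desc, out, BLOCK_M=64, BLOCK_K=64)
assert torch.equal(out, x)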
@@ -689,8 +767,8 @@ def invoke_fused_moe_kernel(
             B.stride(1),
             bias.stride(0) if bias is not None else 0,
             bias.stride(1) if bias is not None else 0,
-            C.stride(1),
-            C.stride(2),
+            C.stride(-2),
+            C.stride(-1),
             A_scale.stride(0) if A_scale is not None and A_scale.ndim == 2 else 0,
             A_scale.stride(1) if A_scale is not None and A_scale.ndim == 2 else 0,
             B_scale.stride(0) if B_scale is not None and B_scale.ndim >= 2 else 0,
@@ -706,6 +784,8 @@ def invoke_fused_moe_kernel(
             use_int8_w8a16=use_int8_w8a16,
             per_channel_quant=per_channel_quant,
             even_Ks=even_Ks,
+            c_sorted=c_sorted,
+            filter_expert=filter_expert,
             **config,
         )
 
@@ -39,6 +39,9 @@ if not (_is_npu or _is_hip):
     from sgl_kernel import silu_and_mul
 
 
+_MASKED_GEMM_FAST_ACT = get_bool_env_var("SGLANG_MASKED_GEMM_FAST_ACT")
+
+
 # TODO(kaixih@nvidia): ideally we should merge this logic into
 # `fill_gateup_input_triton_kernel` to directly generate e8m0 scale.
 @torch.compile
@@ -214,6 +217,9 @@ class DeepGemmRunnerCore(MoeRunnerCore):
         from sglang.srt.layers.moe.ep_moe.kernels import (
             silu_and_mul_masked_post_quant_fwd,
         )
+        from sglang.srt.layers.quantization.fp8_kernel import (
+            sglang_per_token_group_quant_8bit,
+        )
 
         hidden_states = runner_input.hidden_states
         hidden_states_scale = runner_input.hidden_states_scale
@@ -227,15 +233,16 @@ class DeepGemmRunnerCore(MoeRunnerCore):
 
         hidden_states_device = running_state["hidden_states_device"]
 
-        if deep_gemm_wrapper.DEEPGEMM_SCALE_UE8M0:
-            b, s_mn, s_k = hidden_states_scale.shape
-            assert (
-                s_mn % 4 == 0 and s_k % 4 == 0
-            ), f"scales must be aligned to 4, but got ({b}, {s_mn}, {s_k})"
-
         # GroupGemm-0
         if deep_gemm_wrapper.DEEPGEMM_SCALE_UE8M0:
-            hidden_states_scale = _cast_to_e8m0_with_rounding_up(hidden_states_scale)
+            if hidden_states_scale.dtype != torch.int:
+                b, s_mn, s_k = hidden_states_scale.shape
+                assert (
+                    s_mn % 4 == 0 and s_k % 4 == 0
+                ), f"scales must be aligned to 4, but got ({b}, {s_mn}, {s_k})"
+                hidden_states_scale = _cast_to_e8m0_with_rounding_up(
+                    hidden_states_scale
+                )
         else:
             hidden_states_scale = deep_gemm_wrapper.get_mn_major_tma_aligned_tensor(
                 hidden_states_scale
@@ -257,33 +264,46 @@ class DeepGemmRunnerCore(MoeRunnerCore):
         dispose_tensor(hidden_states_scale)
 
         # Act
-        down_input = torch.empty(
-            (
-                gateup_output.shape[0],
-                gateup_output.shape[1],
-                gateup_output.shape[2] // 2,
-            ),
-            device=hidden_states_device,
-            dtype=torch.float8_e4m3fn,
-        )
         scale_block_size = 128
-        down_input_scale = torch.empty(
-            (
-                gateup_output.shape[0],
-                gateup_output.shape[1],
-                gateup_output.shape[2] // 2 // scale_block_size,
-            ),
-            device=hidden_states_device,
-            dtype=torch.float32,
-        )
-        silu_and_mul_masked_post_quant_fwd(
-            gateup_output,
-            down_input,
-            down_input_scale,
-            scale_block_size,
-            masked_m,
-            scale_ue8m0=deep_gemm_wrapper.DEEPGEMM_SCALE_UE8M0,
-        )
+        if _MASKED_GEMM_FAST_ACT:
+            down_input, down_input_scale = sglang_per_token_group_quant_8bit(
+                x=gateup_output,
+                dst_dtype=torch.float8_e4m3fn,
+                group_size=scale_block_size,
+                masked_m=masked_m,
+                column_major_scales=True,
+                scale_tma_aligned=True,
+                scale_ue8m0=deep_gemm_wrapper.DEEPGEMM_SCALE_UE8M0,
+                fuse_silu_and_mul=True,
+                enable_v2=True,
+            )
+        else:
+            down_input = torch.empty(
+                (
+                    gateup_output.shape[0],
+                    gateup_output.shape[1],
+                    gateup_output.shape[2] // 2,
+                ),
+                device=hidden_states_device,
+                dtype=torch.float8_e4m3fn,
+            )
+            down_input_scale = torch.empty(
+                (
+                    gateup_output.shape[0],
+                    gateup_output.shape[1],
+                    gateup_output.shape[2] // 2 // scale_block_size,
+                ),
+                device=hidden_states_device,
+                dtype=torch.float32,
+            )
+            silu_and_mul_masked_post_quant_fwd(
+                gateup_output,
+                down_input,
+                down_input_scale,
+                scale_block_size,
+                masked_m,
+                scale_ue8m0=deep_gemm_wrapper.DEEPGEMM_SCALE_UE8M0,
+            )
         del gateup_output
 
         # GroupGemm-1
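
The fused silu_and_mul plus per-token-group FP8 quantization path above is opt-in. A small sketch of enabling it; the flag name comes from this diff and is read once at module import via get_bool_env_var, so it has to be in the environment before sglang imports this module (for example, in the server's environment):

import os

# Must be set before sglang is imported / the server process starts.
os.environ["SGLANG_MASKED_GEMM_FAST_ACT"] = "1"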
@@ -97,7 +97,6 @@ class DeepEPNormalCombineInput(NamedTuple):
     hidden_states: torch.Tensor
     topk_ids: torch.Tensor
     topk_weights: torch.Tensor
-    overlap_args: Optional[CombineOverlapArgs] = None
 
     @property
     def format(self) -> CombineInputFormat:
@@ -110,7 +109,6 @@ class DeepEPLLCombineInput(NamedTuple):
     hidden_states: torch.Tensor
     topk_ids: torch.Tensor
     topk_weights: torch.Tensor
-    overlap_args: Optional[CombineOverlapArgs] = None
 
     @property
     def format(self) -> CombineInputFormat:
@@ -333,7 +331,7 @@ class _DeepEPDispatcherImplBase:
         hidden_states: torch.Tensor,
         topk_ids: torch.Tensor,
         topk_weights: torch.Tensor,
-        overlap_args: Optional["CombineOverlapArgs"],
+        overlap_args: Optional[CombineOverlapArgs] = None,
     ):
         raise NotImplementedError
 
@@ -463,7 +461,7 @@ class _DeepEPDispatcherImplNormal(_DeepEPDispatcherImplBase):
         hidden_states: torch.Tensor,
         topk_ids: torch.Tensor,
         topk_weights: torch.Tensor,
-        overlap_args: Optional["CombineOverlapArgs"],
+        overlap_args: Optional[CombineOverlapArgs] = None,
     ):
 
         if deep_gemm_wrapper.ENABLE_JIT_DEEPGEMM or _use_aiter or _is_npu:
@@ -619,7 +617,7 @@ class _DeepEPDispatcherImplLowLatency(_DeepEPDispatcherImplBase):
         hidden_states: torch.Tensor,
         topk_ids: torch.Tensor,
         topk_weights: torch.Tensor,
-        overlap_args: Optional["CombineOverlapArgs"],
+        overlap_args: Optional[CombineOverlapArgs] = None,
     ):
         hidden_states, event, hook = self._combine_core(
             hidden_states,
@@ -645,7 +643,7 @@ class _DeepEPDispatcherImplLowLatency(_DeepEPDispatcherImplBase):
         hidden_states: torch.Tensor,
         topk_ids: torch.Tensor,
         topk_weights: torch.Tensor,
-        overlap_args: Optional["CombineOverlapArgs"],
+        overlap_args: Optional[CombineOverlapArgs] = None,
     ):
         buffer = self._get_buffer()
 
@@ -762,16 +760,21 @@ class DeepEPDispatcher(BaseDispatcher):
         del self._dispatch_intermediate_state
         return self._get_impl().dispatch_b(*inner_state)
 
-    def combine(self, combine_input: CombineInput) -> Tuple:
-        self.combine_a(combine_input)
+    def combine(
+        self,
+        combine_input: CombineInput,
+        overlap_args: Optional[CombineOverlapArgs] = None,
+    ) -> Tuple:
+        self.combine_a(combine_input, overlap_args)
         ret = self.combine_b()
         return ret
 
     def combine_a(
         self,
         combine_input: CombineInput,
+        overlap_args: Optional[CombineOverlapArgs] = None,
    ):
-        hidden_states, topk_ids, topk_weights, overlap_args = combine_input
+        hidden_states, topk_ids, topk_weights = combine_input
         self._update_stage(_Stage.AFTER_DISPATCH_B, _Stage.AFTER_COMBINE_A)
         inner_state = self._get_impl().combine_a(
             hidden_states=hidden_states,
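
A hedged before/after sketch of the call-site change implied by these hunks (variable names are placeholders, not taken from the diff): overlap_args moves out of the *CombineInput NamedTuples and becomes an explicit keyword argument of DeepEPDispatcher.combine()/combine_a().

# 0.5.4.post1: overlap args travelled inside the NamedTuple.
#   combine_input = DeepEPLLCombineInput(hidden_states, topk_ids, topk_weights, overlap_args)
#   out = dispatcher.combine(combine_input)
# 0.5.4.post2: the NamedTuple carries only the tensors; overlap args are passed explicitly.
#   combine_input = DeepEPLLCombineInput(hidden_states, topk_ids, topk_weights)
#   out = dispatcher.combine(combine_input, overlap_args=overlap_args)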
@@ -314,16 +314,41 @@ class TopK(CustomOp):
         num_token_non_padded: Optional[torch.Tensor] = None,
         expert_location_dispatch_info: Optional[ExpertLocationDispatchInfo] = None,
     ) -> TopKOutput:
-        global_num_experts = router_logits.shape[-1]
 
-        # NOTE: now npu_moe_gating_top_k can only support `group_count=256` pattern
-        if global_num_experts == 256:
+        use_grouped_topk = self.topk_config.use_grouped_topk
+        torch_native = self.topk_config.torch_native
+        renormalize = self.topk_config.renormalize
 
+        if not use_grouped_topk and not torch_native:
+            topk_weights, topk_ids, _ = torch_npu.npu_moe_gating_top_k_softmax(
+                router_logits,
+                k=self.topk_config.top_k,
+            )
+            topk_weights = topk_weights.to(torch.float32)
+
+            if renormalize:
+                topk_weights_sum = (
+                    topk_weights.sum(dim=-1, keepdim=True)
+                    if self.topk_config.num_fused_shared_experts == 0
+                    else topk_weights[:, :-1].sum(dim=-1, keepdim=True)
+                )
+                topk_weights = topk_weights / topk_weights_sum
+
+            if expert_location_dispatch_info is not None:
+                topk_ids = topk_ids_logical_to_physical(
+                    topk_ids, expert_location_dispatch_info
+                )
+            get_global_expert_distribution_recorder().on_select_experts(
+                topk_ids=topk_ids
+            )
+
+            return StandardTopKOutput(topk_weights, topk_ids, _)
+        if use_grouped_topk and not torch_native and router_logits.shape[-1] == 256:
+            # NOTE: now npu_moe_gating_top_k can only support `group_count=256` pattern
             routed_scaling_factor = self.topk_config.routed_scaling_factor or 1
-            router_logits = router_logits.to(torch.float32)
 
             topk_weights, topk_ids, _ = torch_npu.npu_moe_gating_top_k(
-                router_logits,
+                router_logits.to(torch.float32),
                 k=self.topk_config.top_k,
                 bias=self.topk_config.correction_bias.to(torch.float32),
                 k_group=self.topk_config.topk_group,
@@ -335,7 +360,7 @@ class TopK(CustomOp):
                 eps=float(1e-20),
             )
 
-            if self.topk_config.renormalize:
+            if renormalize:
                 topk_weights_sum = (
                     topk_weights.sum(dim=-1, keepdim=True)
                     if self.topk_config.num_fused_shared_experts == 0
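
A toy torch check of the renormalization rule shared by both NPU branches above (illustrative only, outside sglang): when a fused shared expert occupies the last column, it is excluded from the normalizing sum, so only the routed experts' weights are driven to sum to 1.

import torch

topk_weights = torch.tensor([[2.0, 1.0, 1.0, 1.0]])  # last column: fused shared expert
num_fused_shared_experts = 1

topk_weights_sum = (
    topk_weights.sum(dim=-1, keepdim=True)
    if num_fused_shared_experts == 0
    else topk_weights[:, :-1].sum(dim=-1, keepdim=True)
)
topk_weights = topk_weights / topk_weights_sum
print(topk_weights)  # tensor([[0.5000, 0.2500, 0.2500, 0.2500]]); routed weights sum to 1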
@@ -20,7 +20,9 @@ class PoolingType(IntEnum):
 
 @dataclass
 class EmbeddingPoolerOutput:
-    embeddings: torch.Tensor
+    # Pooler can return list[tensor] instead of tensor if the dimension of each tensor in the batch is different
+    # due to different per-request matryoshka dim truncation
+    embeddings: torch.Tensor | list[torch.Tensor]
 
 
 class Pooler(nn.Module):
@@ -42,6 +44,7 @@ class Pooler(nn.Module):
     def forward(
         self, hidden_states: torch.Tensor, forward_batch: ForwardBatch
     ) -> EmbeddingPoolerOutput:
+
         if self.pooling_type == PoolingType.LAST:
             last_token_indices = torch.cumsum(forward_batch.extend_seq_lens, dim=0) - 1
             pooled_data = hidden_states[last_token_indices]
@@ -53,8 +56,24 @@ class Pooler(nn.Module):
         else:
             raise ValueError(f"Invalid pooling type: {self.pooling_type}")
 
+        if forward_batch.dimensions is not None:
+            all_same_dimensions = len(set(forward_batch.dimensions)) == 1
+            if all_same_dimensions:
+                pooled_data = pooled_data[..., : forward_batch.dimensions[0]]
+            else:
+                pooled_data = [
+                    tensor[..., :dim]
+                    for tensor, dim in zip(pooled_data, forward_batch.dimensions)
+                ]
+
         if self.normalize:
-            pooled_data = nn.functional.normalize(pooled_data, p=2, dim=1)
+            if isinstance(pooled_data, list):
+                pooled_data = [
+                    nn.functional.normalize(tensor, p=2, dim=-1)
+                    for tensor in pooled_data
+                ]
+            else:
+                pooled_data = nn.functional.normalize(pooled_data, p=2, dim=-1)
 
         return EmbeddingPoolerOutput(embeddings=pooled_data)
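
A self-contained torch sketch (outside sglang, illustrative names) of the per-request matryoshka truncation plus L2 normalization that the new Pooler code implements:

import torch
import torch.nn.functional as F

pooled = torch.randn(3, 8)   # one pooled vector per request, hidden size 8
dimensions = [8, 4, 2]       # per-request matryoshka dims, like forward_batch.dimensions

if len(set(dimensions)) == 1:
    embeddings = F.normalize(pooled[..., : dimensions[0]], p=2, dim=-1)
else:
    embeddings = [
        F.normalize(vec[..., :dim], p=2, dim=-1)
        for vec, dim in zip(pooled, dimensions)
    ]

print([e.shape for e in embeddings])  # [torch.Size([8]), torch.Size([4]), torch.Size([2])]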