sglang 0.4.9.post3__py3-none-any.whl → 0.4.9.post4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75)
  1. sglang/srt/_custom_ops.py +29 -1
  2. sglang/srt/configs/model_config.py +1 -1
  3. sglang/srt/conversation.py +1 -1
  4. sglang/srt/disaggregation/common/conn.py +34 -6
  5. sglang/srt/disaggregation/mini_lb.py +3 -2
  6. sglang/srt/disaggregation/mooncake/conn.py +49 -20
  7. sglang/srt/disaggregation/mooncake/transfer_engine.py +4 -2
  8. sglang/srt/disaggregation/nixl/conn.py +17 -13
  9. sglang/srt/distributed/device_communicators/custom_all_reduce.py +3 -91
  10. sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +96 -1
  11. sglang/srt/distributed/device_communicators/quick_all_reduce.py +273 -0
  12. sglang/srt/distributed/device_communicators/shm_broadcast.py +12 -5
  13. sglang/srt/distributed/parallel_state.py +70 -15
  14. sglang/srt/entrypoints/engine.py +2 -8
  15. sglang/srt/entrypoints/http_server.py +20 -32
  16. sglang/srt/entrypoints/openai/protocol.py +3 -3
  17. sglang/srt/entrypoints/openai/serving_chat.py +27 -4
  18. sglang/srt/function_call/base_format_detector.py +74 -12
  19. sglang/srt/function_call/deepseekv3_detector.py +26 -11
  20. sglang/srt/function_call/ebnf_composer.py +95 -63
  21. sglang/srt/function_call/function_call_parser.py +4 -4
  22. sglang/srt/function_call/kimik2_detector.py +41 -16
  23. sglang/srt/function_call/llama32_detector.py +6 -3
  24. sglang/srt/function_call/mistral_detector.py +11 -3
  25. sglang/srt/function_call/pythonic_detector.py +16 -14
  26. sglang/srt/function_call/qwen25_detector.py +12 -3
  27. sglang/srt/function_call/{qwen3_detector.py → qwen3_coder_detector.py} +10 -9
  28. sglang/srt/layers/activation.py +11 -3
  29. sglang/srt/layers/attention/base_attn_backend.py +3 -1
  30. sglang/srt/layers/communicator.py +12 -12
  31. sglang/srt/layers/dp_attention.py +72 -24
  32. sglang/srt/layers/logits_processor.py +34 -24
  33. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=320,device_name=NVIDIA_H20-3e.json +146 -0
  34. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +25 -224
  35. sglang/srt/layers/moe/topk.py +5 -13
  36. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +2 -9
  37. sglang/srt/layers/quantization/modelopt_quant.py +8 -4
  38. sglang/srt/layers/quantization/utils.py +0 -9
  39. sglang/srt/layers/radix_attention.py +5 -3
  40. sglang/srt/lora/lora_manager.py +133 -169
  41. sglang/srt/lora/lora_registry.py +124 -0
  42. sglang/srt/lora/mem_pool.py +2 -2
  43. sglang/srt/managers/cache_controller.py +53 -6
  44. sglang/srt/managers/io_struct.py +19 -1
  45. sglang/srt/managers/schedule_batch.py +13 -3
  46. sglang/srt/managers/scheduler.py +13 -25
  47. sglang/srt/managers/tokenizer_manager.py +28 -25
  48. sglang/srt/managers/tp_worker.py +2 -4
  49. sglang/srt/mem_cache/allocator.py +67 -7
  50. sglang/srt/mem_cache/hicache_storage.py +17 -1
  51. sglang/srt/mem_cache/hiradix_cache.py +30 -16
  52. sglang/srt/mem_cache/memory_pool_host.py +3 -0
  53. sglang/srt/model_executor/cuda_graph_runner.py +61 -25
  54. sglang/srt/model_executor/forward_batch_info.py +201 -29
  55. sglang/srt/model_executor/model_runner.py +41 -23
  56. sglang/srt/models/deepseek_v2.py +1 -2
  57. sglang/srt/models/mllama4.py +10 -3
  58. sglang/srt/models/qwen2_moe.py +0 -4
  59. sglang/srt/models/qwen3_moe.py +1 -6
  60. sglang/srt/reasoning_parser.py +46 -4
  61. sglang/srt/sampling/sampling_batch_info.py +6 -5
  62. sglang/srt/server_args.py +76 -55
  63. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +33 -28
  64. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +37 -36
  65. sglang/srt/speculative/eagle_utils.py +51 -23
  66. sglang/srt/speculative/eagle_worker.py +59 -44
  67. sglang/srt/two_batch_overlap.py +9 -5
  68. sglang/srt/utils.py +17 -68
  69. sglang/test/test_activation.py +50 -1
  70. sglang/version.py +1 -1
  71. {sglang-0.4.9.post3.dist-info → sglang-0.4.9.post4.dist-info}/METADATA +5 -5
  72. {sglang-0.4.9.post3.dist-info → sglang-0.4.9.post4.dist-info}/RECORD +75 -72
  73. {sglang-0.4.9.post3.dist-info → sglang-0.4.9.post4.dist-info}/WHEEL +0 -0
  74. {sglang-0.4.9.post3.dist-info → sglang-0.4.9.post4.dist-info}/licenses/LICENSE +0 -0
  75. {sglang-0.4.9.post3.dist-info → sglang-0.4.9.post4.dist-info}/top_level.txt +0 -0

sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=320,device_name=NVIDIA_H20-3e.json
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "256": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "512": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    }
+}
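
The JSON above is a Triton tuning table for the fused-MoE kernel on NVIDIA H20 GPUs: each top-level key is a token-count bucket, and its value holds the tile sizes (BLOCK_SIZE_M/N/K, GROUP_SIZE_M) and pipeline settings (num_warps, num_stages) used at that batch size. A minimal sketch of how such a table can be consulted, assuming a nearest-bucket lookup; the pick_moe_config helper below is illustrative, not sglang's actual selection code:

    import json

    def pick_moe_config(configs: dict, num_tokens: int) -> dict:
        # Keys are stringified token counts; pick the bucket whose key is
        # numerically closest to the actual number of tokens (illustrative heuristic).
        best_key = min(configs, key=lambda k: abs(int(k) - num_tokens))
        return configs[best_key]

    with open("E=160,N=320,device_name=NVIDIA_H20-3e.json") as f:
        table = json.load(f)

    # 300 tokens is closest to the "256" bucket in the table above.
    print(pick_moe_config(table, 300))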

sglang/srt/layers/moe/fused_moe_triton/fused_moe.py
@@ -53,9 +53,7 @@ elif _is_hip:
         from aiter import moe_sum
     except ImportError:
         raise ImportError("aiter is required when SGLANG_USE_AITER is set to True")
-else:
-    from vllm import _custom_ops as vllm_ops
-    from vllm._custom_ops import scaled_fp8_quant
+

 if _is_cuda or _is_hip:
     from sgl_kernel import moe_align_block_size as sgl_moe_align_block_size
@@ -63,9 +61,6 @@ if _is_cuda or _is_hip:
 
 logger = logging.getLogger(__name__)
 padding_size = 128 if bool(int(os.getenv("SGLANG_MOE_PADDING", "0"))) else 0
-enable_moe_align_block_size_triton = bool(
-    int(os.getenv("ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON", "0"))
-)
 
 
 @triton.jit
@@ -533,190 +528,6 @@ def fused_moe_kernel(
     tl.store(c_ptrs, accumulator, mask=c_mask)
 
 
-@triton.jit
-def moe_align_block_size_stage1(
-    topk_ids_ptr,
-    tokens_cnts_ptr,
-    num_experts: tl.constexpr,
-    numel: tl.constexpr,
-    tokens_per_thread: tl.constexpr,
-):
-    pid = tl.program_id(0)
-
-    start_idx = pid * tokens_per_thread
-
-    off_c = (pid + 1) * num_experts
-
-    for i in range(tokens_per_thread):
-        if start_idx + i < numel:
-            idx = tl.load(topk_ids_ptr + start_idx + i)
-            token_cnt = tl.load(tokens_cnts_ptr + off_c + idx)
-            tl.store(tokens_cnts_ptr + off_c + idx, token_cnt + 1)
-
-
-@triton.jit
-def moe_align_block_size_stage2(
-    tokens_cnts_ptr,
-    num_experts: tl.constexpr,
-):
-    pid = tl.program_id(0)
-
-    last_cnt = 0
-    for i in range(1, num_experts + 1):
-        token_cnt = tl.load(tokens_cnts_ptr + i * num_experts + pid)
-        last_cnt = last_cnt + token_cnt
-        tl.store(tokens_cnts_ptr + i * num_experts + pid, last_cnt)
-
-
-@triton.jit
-def moe_align_block_size_stage3(
-    total_tokens_post_pad_ptr,
-    tokens_cnts_ptr,
-    cumsum_ptr,
-    num_experts: tl.constexpr,
-    block_size: tl.constexpr,
-):
-    last_cumsum = 0
-    off_cnt = num_experts * num_experts
-    for i in range(1, num_experts + 1):
-        token_cnt = tl.load(tokens_cnts_ptr + off_cnt + i - 1)
-        last_cumsum = last_cumsum + tl.cdiv(token_cnt, block_size) * block_size
-        tl.store(cumsum_ptr + i, last_cumsum)
-    tl.store(total_tokens_post_pad_ptr, last_cumsum)
-
-
-@triton.jit
-def moe_align_block_size_stage4(
-    topk_ids_ptr,
-    sorted_token_ids_ptr,
-    expert_ids_ptr,
-    tokens_cnts_ptr,
-    cumsum_ptr,
-    num_experts: tl.constexpr,
-    block_size: tl.constexpr,
-    numel: tl.constexpr,
-    tokens_per_thread: tl.constexpr,
-):
-    pid = tl.program_id(0)
-    start_idx = tl.load(cumsum_ptr + pid)
-    end_idx = tl.load(cumsum_ptr + pid + 1)
-
-    for i in range(start_idx, end_idx, block_size):
-        tl.store(expert_ids_ptr + i // block_size, pid)
-
-    start_idx = pid * tokens_per_thread
-    off_t = pid * num_experts
-
-    for i in range(start_idx, tl.minimum(start_idx + tokens_per_thread, numel)):
-        expert_id = tl.load(topk_ids_ptr + i)
-        token_cnt = tl.load(tokens_cnts_ptr + off_t + expert_id)
-        rank_post_pad = token_cnt + tl.load(cumsum_ptr + expert_id)
-        tl.store(sorted_token_ids_ptr + rank_post_pad, i)
-        tl.store(tokens_cnts_ptr + off_t + expert_id, token_cnt + 1)
-
-
-def moe_align_block_size_triton(
-    topk_ids: torch.Tensor,
-    num_experts: int,
-    block_size: int,
-    sorted_token_ids: torch.Tensor,
-    expert_ids: torch.Tensor,
-    num_tokens_post_pad: torch.Tensor,
-) -> None:
-    numel = topk_ids.numel()
-    grid = (num_experts,)
-    tokens_cnts = torch.zeros(
-        (num_experts + 1, num_experts), dtype=torch.int32, device=topk_ids.device
-    )
-    cumsum = torch.zeros((num_experts + 1,), dtype=torch.int32, device=topk_ids.device)
-    tokens_per_thread = ceil_div(numel, num_experts)
-
-    moe_align_block_size_stage1[grid](
-        topk_ids,
-        tokens_cnts,
-        num_experts,
-        numel,
-        tokens_per_thread,
-    )
-    moe_align_block_size_stage2[grid](
-        tokens_cnts,
-        num_experts,
-    )
-    moe_align_block_size_stage3[(1,)](
-        num_tokens_post_pad,
-        tokens_cnts,
-        cumsum,
-        num_experts,
-        block_size,
-    )
-    moe_align_block_size_stage4[grid](
-        topk_ids,
-        sorted_token_ids,
-        expert_ids,
-        tokens_cnts,
-        cumsum,
-        num_experts,
-        block_size,
-        numel,
-        tokens_per_thread,
-    )
-
-
-@triton.jit
-def init_sorted_ids_and_cumsum_buffer_kernel(
-    sorted_ids_ptr,
-    cumsum_buffer_ptr,
-    max_num_tokens_padded,
-    topk_ids_numel,
-    num_experts: tl.constexpr,
-    BLOCK_SIZE: tl.constexpr,
-    ALIGNED_NUM_EXPERTS_P1: tl.constexpr,
-):
-    pid = tl.program_id(0)
-    offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
-
-    sorted_ids_blocks = tl.cdiv(max_num_tokens_padded, BLOCK_SIZE)
-
-    if pid < sorted_ids_blocks:
-        mask = offsets < max_num_tokens_padded
-        tl.store(
-            sorted_ids_ptr + offsets,
-            tl.full((BLOCK_SIZE,), topk_ids_numel, dtype=tl.int32),
-            mask=mask,
-        )
-    elif pid == sorted_ids_blocks:
-        offset_e = tl.arange(0, ALIGNED_NUM_EXPERTS_P1)
-        mask_e = offset_e < num_experts + 1
-        tl.store(
-            cumsum_buffer_ptr + offset_e,
-            tl.zeros((ALIGNED_NUM_EXPERTS_P1,), dtype=tl.int32),
-            mask=mask_e,
-        )
-
-
-def init_sorted_ids_and_cumsum_buffer(
-    max_num_tokens_padded: int, topk_ids_numel: int, num_experts: int, device="cuda"
-):
-    sorted_ids = torch.empty((max_num_tokens_padded,), dtype=torch.int32, device=device)
-    cumsum_buffer = torch.empty((num_experts + 1,), dtype=torch.int32, device=device)
-
-    BLOCK_SIZE = 1024
-    sorted_ids_blocks = triton.cdiv(max_num_tokens_padded, BLOCK_SIZE)
-    grid = (sorted_ids_blocks + 1,)
-
-    init_sorted_ids_and_cumsum_buffer_kernel[grid](
-        sorted_ids,
-        cumsum_buffer,
-        max_num_tokens_padded,
-        topk_ids_numel,
-        num_experts,
-        BLOCK_SIZE,
-        next_power_of_2(num_experts + 1),
-    )
-
-    return sorted_ids, cumsum_buffer
-
-
 def moe_align_block_size(
     topk_ids: torch.Tensor, block_size: int, num_experts: int
 ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
@@ -766,42 +577,32 @@ def moe_align_block_size(
         (max_num_m_blocks,), dtype=torch.int32, device=topk_ids.device
     )
     num_tokens_post_pad = torch.empty((1), dtype=torch.int32, device=topk_ids.device)
-    if enable_moe_align_block_size_triton:
-        sorted_ids.fill_(topk_ids.numel())
-        moe_align_block_size_triton(
-            topk_ids,
-            num_experts,
-            block_size,
-            sorted_ids,
-            expert_ids,
-            num_tokens_post_pad,
-        )
-    else:
-        cumsum_buffer = torch.empty(
-            (num_experts + 1,), dtype=torch.int32, device=topk_ids.device
-        )
-        token_cnts_buffer = torch.empty(
-            (num_experts + 1) * num_experts,
-            dtype=torch.int32,
-            device=topk_ids.device,
-        )
 
-        # Threshold based on benchmark results
-        fuse_sorted_ids_padding = sorted_ids.shape[0] <= 4096
-        if not fuse_sorted_ids_padding:
-            sorted_ids.fill_(topk_ids.numel())
+    cumsum_buffer = torch.empty(
+        (num_experts + 1,), dtype=torch.int32, device=topk_ids.device
+    )
+    token_cnts_buffer = torch.empty(
+        (num_experts + 1) * num_experts,
+        dtype=torch.int32,
+        device=topk_ids.device,
+    )
 
-        sgl_moe_align_block_size(
-            topk_ids,
-            num_experts,
-            block_size,
-            sorted_ids,
-            expert_ids,
-            num_tokens_post_pad,
-            token_cnts_buffer,
-            cumsum_buffer,
-            fuse_sorted_ids_padding,
-        )
+    # Threshold based on benchmark results
+    fuse_sorted_ids_padding = sorted_ids.shape[0] <= 4096
+    if not fuse_sorted_ids_padding:
+        sorted_ids.fill_(topk_ids.numel())
+
+    sgl_moe_align_block_size(
+        topk_ids,
+        num_experts,
+        block_size,
+        sorted_ids,
+        expert_ids,
+        num_tokens_post_pad,
+        token_cnts_buffer,
+        cumsum_buffer,
+        fuse_sorted_ids_padding,
+    )
     return sorted_ids, expert_ids, num_tokens_post_pad
 
 
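The fused_moe.py hunks above drop the pure-Triton moe_align_block_size path (and its ENABLE_MOE_ALIGN_BLOCK_SIZE_TRITON switch), so alignment now always goes through the fused sgl_moe_align_block_size op from sgl_kernel. As a reference for the op's contract only, here is a hedged pure-PyTorch sketch of the alignment it performs; it is not the kernel implementation:

    import torch

    def moe_align_block_size_reference(topk_ids, block_size, num_experts):
        # Group token slots by expert and pad each group to a multiple of
        # block_size, so every GEMM tile works on exactly one expert.
        flat = topk_ids.flatten()
        sentinel = flat.numel()  # padded slots point one past the last token slot
        sorted_chunks, expert_ids = [], []
        for e in range(num_experts):
            idx = (flat == e).nonzero(as_tuple=True)[0].to(torch.int32)
            pad = (-idx.numel()) % block_size
            idx = torch.cat([idx, torch.full((pad,), sentinel, dtype=torch.int32)])
            sorted_chunks.append(idx)
            expert_ids.extend([e] * (idx.numel() // block_size))
        sorted_ids = torch.cat(sorted_chunks)
        return (
            sorted_ids,
            torch.tensor(expert_ids, dtype=torch.int32),
            torch.tensor([sorted_ids.numel()], dtype=torch.int32),
        )

    # Example: 3 tokens, top-2 routing over 3 experts, blocks of 4 slots.
    ids, experts, total = moe_align_block_size_reference(
        torch.tensor([[0, 2], [1, 2], [2, 0]]), block_size=4, num_experts=3
    )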

sglang/srt/layers/moe/topk.py
@@ -15,7 +15,7 @@
 from __future__ import annotations
 
 import math
-from typing import TYPE_CHECKING, Callable, NamedTuple, Optional
+from typing import Callable, NamedTuple, Optional
 
 import torch
 import torch.nn.functional as F
@@ -39,10 +39,10 @@ from sglang.srt.utils import (
 
 _is_cuda = is_cuda()
 _is_hip = is_hip()
-_use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip
-_is_cpu_amx_available = cpu_has_amx_support()
 _is_cpu = is_cpu()
+_is_cpu_amx_available = cpu_has_amx_support()
 _is_npu = is_npu()
+_use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip
 
 if _is_cuda:
     from sgl_kernel import moe_fused_gate
@@ -54,7 +54,6 @@ if _use_aiter:
         from aiter import biased_grouped_topk as aiter_biased_grouped_topk
     except ImportError:
         raise ImportError("aiter is required when SGLANG_USE_AITER is set to True")
-
 if _is_npu:
     import torch_npu
 
@@ -387,6 +386,7 @@ def grouped_topk_cpu(
     )
 
 
+@torch.compile(dynamic=True, backend=get_compiler_backend(), disable=_is_npu)
 def biased_grouped_topk_impl(
     hidden_states: torch.Tensor,
     gating_output: torch.Tensor,
@@ -482,7 +482,6 @@ def biased_grouped_topk_gpu(
     renormalize: bool,
     num_expert_group: int = 0,
     topk_group: int = 0,
-    compiled: bool = not _is_npu,
     num_fused_shared_experts: int = 0,
     routed_scaling_factor: Optional[float] = None,
     num_token_non_padded: Optional[torch.Tensor] = None,
@@ -535,14 +534,7 @@
         )
         return topk_weights, topk_ids
     else:
-        biased_grouped_topk_fn = (
-            torch.compile(
-                biased_grouped_topk_impl, dynamic=True, backend=get_compiler_backend()
-            )
-            if compiled
-            else biased_grouped_topk_impl
-        )
-        return biased_grouped_topk_fn(
+        return biased_grouped_topk_impl(
             hidden_states,
             gating_output,
             correction_bias,
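
The topk.py hunks above drop the runtime `compiled` flag and instead decorate `biased_grouped_topk_impl` with `torch.compile(..., disable=_is_npu)`; with `disable=True`, `torch.compile` leaves the eager function untouched. A standalone sketch of the pattern (the function below is a toy, not sglang code):

    import torch

    _is_npu = False  # stand-in for sglang's platform probe

    @torch.compile(dynamic=True, disable=_is_npu)
    def renormalize_topk_weights(weights: torch.Tensor) -> torch.Tensor:
        # When disable=True the decorator is a no-op, so the NPU path runs
        # eagerly without needing a separate un-compiled code branch.
        return weights / weights.sum(dim=-1, keepdim=True)

    print(renormalize_topk_weights(torch.rand(4, 8)).sum(dim=-1))  # ~1.0 per row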

sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py
@@ -28,15 +28,6 @@ if TYPE_CHECKING:
         CompressedTensorsConfig,
     )
 
-_is_cuda = is_cuda()
-_is_npu = is_npu()
-_is_cpu_amx_available = cpu_has_amx_support()
-_is_cpu = is_cpu()
-_is_hip = is_hip()
-
-if not (_is_cuda or _is_npu or (_is_cpu and _is_cpu_amx_available) or _is_hip):
-    from vllm import _custom_ops as vllm_ops
-    from vllm._custom_ops import scaled_fp8_quant
 
 try:
     import vllm
@@ -568,6 +559,8 @@ class CompressedTensorsWNA16MoEMethod(CompressedTensorsMoEMethod):
             requires_grad=False,
         )
 
+        from vllm import _custom_ops as vllm_ops
+
         marlin_w13_qweight = vllm_ops.gptq_marlin_moe_repack(
             layer.w13_weight_packed,
             layer.w13_g_idx_sort_indices,

sglang/srt/layers/quantization/modelopt_quant.py
@@ -952,7 +952,6 @@ class ModelOptNvFp4FusedMoEMethod(FusedMoEMethodBase):
         tp_rank: Optional[int] = None,
         tp_size: Optional[int] = None,
     ) -> torch.Tensor:
-
         assert activation == "silu", "Only SiLU activation is supported."
 
         if self.enable_flashinfer_moe:
@@ -982,13 +981,15 @@ class ModelOptNvFp4FusedMoEMethod(FusedMoEMethodBase):
                 tp_size=tp_size,
                 tp_rank=tp_rank,
                 tune_max_num_tokens=next_power_of_2(x.shape[0]),
-            )
-            return output[0]
+            )[0]
+            if routed_scaling_factor is not None:
+                output *= routed_scaling_factor
+            return output
 
         from sglang.srt.layers.moe.cutlass_moe import cutlass_moe_fp4
 
         topk_weights, topk_ids, _ = topk_output
-        return cutlass_moe_fp4(
+        output = cutlass_moe_fp4(
             a=x,
             a1_gscale=layer.w13_input_scale_quant,
             w1_fp4=layer.w13_weight,
@@ -1003,3 +1004,6 @@ class ModelOptNvFp4FusedMoEMethod(FusedMoEMethodBase):
             params=layer.cutlass_moe_params,
             apply_router_weight_on_input=apply_router_weight_on_input,
         ).to(x.dtype)
+        if routed_scaling_factor is not None:
+            output *= routed_scaling_factor
+        return output

sglang/srt/layers/quantization/utils.py
@@ -17,15 +17,6 @@ from sglang.srt.utils import cpu_has_amx_support, is_cpu, is_cuda, is_hip, is_np
 if TYPE_CHECKING:
     from sglang.srt.layers.quantization.base_config import QuantizationConfig
 
-_is_cuda = is_cuda()
-_is_npu = is_npu()
-_is_cpu_amx_available = cpu_has_amx_support()
-_is_cpu = is_cpu()
-_is_hip = is_hip()
-
-if not (_is_cuda or _is_npu or (_is_cpu and _is_cpu_amx_available) or _is_hip):
-    from vllm._custom_ops import scaled_fp8_quant
-
 
 def is_layer_skipped(
     prefix: str,

sglang/srt/layers/radix_attention.py
@@ -12,14 +12,16 @@
 # limitations under the License.
 # ==============================================================================
 """Radix attention."""
+from __future__ import annotations
 
 from enum import Enum
-from typing import Optional
+from typing import TYPE_CHECKING, Optional
 
 from torch import nn
 
-from sglang.srt.layers.quantization.base_config import QuantizationConfig
-from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+if TYPE_CHECKING:
+    from sglang.srt.layers.quantization.base_config import QuantizationConfig
+    from sglang.srt.model_executor.forward_batch_info import ForwardBatch
 
 
 class AttentionType(Enum):
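
The radix_attention.py hunk moves the QuantizationConfig and ForwardBatch imports behind `typing.TYPE_CHECKING` and adds `from __future__ import annotations`, so those modules are only imported by static type checkers and cannot create runtime import cycles. A minimal sketch of the pattern, using a hypothetical heavy_module:

    from __future__ import annotations

    from typing import TYPE_CHECKING

    if TYPE_CHECKING:
        # Evaluated only by type checkers (mypy, pyright), never at runtime,
        # so this import cannot participate in a circular-import chain.
        from heavy_module import HeavyConfig  # hypothetical module

    def build(config: HeavyConfig) -> None:
        # With postponed evaluation the annotation stays a string at runtime,
        # so HeavyConfig never has to be resolved here.
        print(type(config).__name__)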