sglang 0.4.9__py3-none-any.whl → 0.4.9.post2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (99)
  1. sglang/bench_serving.py +2 -2
  2. sglang/srt/configs/model_config.py +36 -2
  3. sglang/srt/conversation.py +56 -3
  4. sglang/srt/disaggregation/ascend/__init__.py +6 -0
  5. sglang/srt/disaggregation/ascend/conn.py +44 -0
  6. sglang/srt/disaggregation/ascend/transfer_engine.py +58 -0
  7. sglang/srt/disaggregation/mooncake/conn.py +50 -18
  8. sglang/srt/disaggregation/mooncake/transfer_engine.py +17 -8
  9. sglang/srt/disaggregation/utils.py +25 -3
  10. sglang/srt/entrypoints/engine.py +1 -1
  11. sglang/srt/entrypoints/http_server.py +1 -0
  12. sglang/srt/entrypoints/http_server_engine.py +1 -1
  13. sglang/srt/entrypoints/openai/protocol.py +11 -0
  14. sglang/srt/entrypoints/openai/serving_chat.py +7 -0
  15. sglang/srt/function_call/function_call_parser.py +2 -0
  16. sglang/srt/function_call/kimik2_detector.py +220 -0
  17. sglang/srt/hf_transformers_utils.py +18 -0
  18. sglang/srt/jinja_template_utils.py +8 -0
  19. sglang/srt/layers/communicator.py +20 -5
  20. sglang/srt/layers/flashinfer_comm_fusion.py +3 -3
  21. sglang/srt/layers/layernorm.py +2 -2
  22. sglang/srt/layers/linear.py +12 -2
  23. sglang/srt/layers/moe/cutlass_w4a8_moe.py +215 -0
  24. sglang/srt/layers/moe/ep_moe/kernels.py +60 -1
  25. sglang/srt/layers/moe/ep_moe/layer.py +141 -2
  26. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +2 -0
  27. sglang/srt/layers/moe/fused_moe_triton/layer.py +141 -59
  28. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +176 -0
  29. sglang/srt/layers/moe/topk.py +8 -2
  30. sglang/srt/layers/parameter.py +19 -3
  31. sglang/srt/layers/quantization/__init__.py +2 -0
  32. sglang/srt/layers/quantization/fp8.py +28 -7
  33. sglang/srt/layers/quantization/fp8_kernel.py +2 -2
  34. sglang/srt/layers/quantization/modelopt_quant.py +244 -1
  35. sglang/srt/layers/quantization/moe_wna16.py +1 -2
  36. sglang/srt/layers/quantization/w4afp8.py +264 -0
  37. sglang/srt/layers/quantization/w8a8_int8.py +738 -14
  38. sglang/srt/layers/vocab_parallel_embedding.py +9 -3
  39. sglang/srt/lora/triton_ops/gate_up_lora_b.py +30 -19
  40. sglang/srt/lora/triton_ops/qkv_lora_b.py +30 -19
  41. sglang/srt/lora/triton_ops/sgemm_lora_a.py +27 -11
  42. sglang/srt/lora/triton_ops/sgemm_lora_b.py +27 -15
  43. sglang/srt/managers/cache_controller.py +41 -195
  44. sglang/srt/managers/io_struct.py +35 -3
  45. sglang/srt/managers/mm_utils.py +59 -96
  46. sglang/srt/managers/schedule_batch.py +17 -6
  47. sglang/srt/managers/scheduler.py +38 -6
  48. sglang/srt/managers/tokenizer_manager.py +16 -0
  49. sglang/srt/mem_cache/hiradix_cache.py +2 -0
  50. sglang/srt/mem_cache/memory_pool.py +176 -101
  51. sglang/srt/mem_cache/memory_pool_host.py +6 -109
  52. sglang/srt/mem_cache/radix_cache.py +8 -4
  53. sglang/srt/model_executor/forward_batch_info.py +13 -1
  54. sglang/srt/model_loader/loader.py +23 -12
  55. sglang/srt/models/deepseek_janus_pro.py +1 -1
  56. sglang/srt/models/deepseek_v2.py +78 -19
  57. sglang/srt/models/deepseek_vl2.py +1 -1
  58. sglang/srt/models/gemma3_mm.py +1 -1
  59. sglang/srt/models/gemma3n_mm.py +6 -3
  60. sglang/srt/models/internvl.py +8 -2
  61. sglang/srt/models/kimi_vl.py +8 -2
  62. sglang/srt/models/llama.py +2 -0
  63. sglang/srt/models/llava.py +3 -1
  64. sglang/srt/models/llavavid.py +1 -1
  65. sglang/srt/models/minicpmo.py +1 -2
  66. sglang/srt/models/minicpmv.py +1 -1
  67. sglang/srt/models/mixtral_quant.py +4 -0
  68. sglang/srt/models/mllama4.py +372 -82
  69. sglang/srt/models/phi4mm.py +8 -2
  70. sglang/srt/models/phimoe.py +553 -0
  71. sglang/srt/models/qwen2.py +2 -0
  72. sglang/srt/models/qwen2_5_vl.py +10 -7
  73. sglang/srt/models/qwen2_vl.py +12 -1
  74. sglang/srt/models/vila.py +8 -2
  75. sglang/srt/multimodal/mm_utils.py +2 -2
  76. sglang/srt/multimodal/processors/base_processor.py +197 -137
  77. sglang/srt/multimodal/processors/deepseek_vl_v2.py +1 -1
  78. sglang/srt/multimodal/processors/gemma3.py +4 -2
  79. sglang/srt/multimodal/processors/gemma3n.py +1 -1
  80. sglang/srt/multimodal/processors/internvl.py +1 -1
  81. sglang/srt/multimodal/processors/janus_pro.py +1 -1
  82. sglang/srt/multimodal/processors/kimi_vl.py +1 -1
  83. sglang/srt/multimodal/processors/minicpm.py +4 -3
  84. sglang/srt/multimodal/processors/mllama4.py +63 -61
  85. sglang/srt/multimodal/processors/phi4mm.py +1 -1
  86. sglang/srt/multimodal/processors/pixtral.py +1 -1
  87. sglang/srt/multimodal/processors/qwen_vl.py +203 -80
  88. sglang/srt/multimodal/processors/vila.py +1 -1
  89. sglang/srt/server_args.py +26 -4
  90. sglang/srt/two_batch_overlap.py +3 -0
  91. sglang/srt/utils.py +191 -48
  92. sglang/test/test_cutlass_w4a8_moe.py +281 -0
  93. sglang/utils.py +5 -5
  94. sglang/version.py +1 -1
  95. {sglang-0.4.9.dist-info → sglang-0.4.9.post2.dist-info}/METADATA +6 -4
  96. {sglang-0.4.9.dist-info → sglang-0.4.9.post2.dist-info}/RECORD +99 -90
  97. {sglang-0.4.9.dist-info → sglang-0.4.9.post2.dist-info}/WHEEL +0 -0
  98. {sglang-0.4.9.dist-info → sglang-0.4.9.post2.dist-info}/licenses/LICENSE +0 -0
  99. {sglang-0.4.9.dist-info → sglang-0.4.9.post2.dist-info}/top_level.txt +0 -0
sglang/srt/layers/moe/cutlass_w4a8_moe.py (new file)
@@ -0,0 +1,215 @@
+# SPDX-License-Identifier: Apache-2.0
+"""Cutlass W4A8 MoE kernel."""
+from typing import Optional
+
+import torch
+from sgl_kernel import (
+    cutlass_w4a8_moe_mm,
+    get_cutlass_w4a8_moe_mm_data,
+    sgl_per_tensor_quant_fp8,
+    silu_and_mul,
+)
+
+from sglang.srt.layers.moe.ep_moe.kernels import (
+    post_reorder_triton_kernel,
+    pre_reorder_triton_kernel_for_cutlass_moe,
+    run_cutlass_moe_ep_preproess,
+)
+
+
+def cutlass_w4a8_moe(
+    start_expert_id: int,
+    end_expert_id: int,
+    total_num_experts: int,
+    a: torch.Tensor,
+    w1_q: torch.Tensor,
+    w2_q: torch.Tensor,
+    w1_scale: torch.Tensor,
+    w2_scale: torch.Tensor,
+    topk_weights: torch.Tensor,
+    topk_ids_: torch.Tensor,
+    local_topk_ids: torch.Tensor,
+    a_strides1: torch.Tensor,
+    b_strides1: torch.Tensor,
+    c_strides1: torch.Tensor,
+    a_strides2: torch.Tensor,
+    b_strides2: torch.Tensor,
+    c_strides2: torch.Tensor,
+    s_strides13: torch.Tensor,
+    s_strides2: torch.Tensor,
+    expert_offsets: torch.Tensor,
+    problem_sizes1: torch.Tensor,
+    problem_sizes2: torch.Tensor,
+    a1_scale: Optional[torch.Tensor] = None,
+    a2_scale: Optional[torch.Tensor] = None,
+    apply_router_weight_on_input: bool = False,
+) -> torch.Tensor:
+    """
+    This function computes a w4a8-quantized Mixture of Experts (MoE) layer
+    using two sets of quantized weights, w1_q and w2_q, and top-k gating
+    mechanism. The matrix multiplications are implemented with CUTLASS
+    grouped gemm.
+
+    Parameters:
+    - a (torch.Tensor): The input tensor to the MoE layer.
+        Shape: [M, K]
+    - w1_q (torch.Tensor): The first set of int4-quantized expert weights.
+        Shape: [num_experts, N * 2, K // 2]
+        (the weights are passed transposed and int4-packed)
+    - w2_q (torch.Tensor): The second set of int4-quantized expert weights.
+        Shape: [num_experts, K, N // 2]
+        (the weights are passed transposed and int4-packed)
+    - w1_scale (torch.Tensor): The fp32 scale to dequantize w1_q.
+        Shape: [num_experts, K // 512, N * 8]
+    - w2_scale (torch.Tensor): The fp32 scale to dequantize w2_q.
+        Shape: [num_experts, N // 512, K * 4]
+    - topk_weights (torch.Tensor): The weights of each token->expert mapping.
+    - a_strides1 (torch.Tensor): The input strides of the first grouped gemm.
+    - b_strides1 (torch.Tensor): The weights strides of the first grouped gemm.
+    - c_strides1 (torch.Tensor): The output strides of the first grouped gemm.
+    - a_strides2 (torch.Tensor): The input strides of the second grouped gemm.
+    - b_strides2 (torch.Tensor): The weights strides of the second grouped gemm.
+    - c_strides2 (torch.Tensor): The output strides of the second grouped gemm.
+    - s_strides13 (torch.Tensor): The input and scale strides of the first grouped gemm.
+    - s_strides2 (torch.Tensor): The scale strides of the second grouped gemm.
+    - a1_scale (Optional[torch.Tensor]): The optional fp32 scale to quantize a.
+        Shape: scalar or [1, K]
+    - a2_scale (Optional[torch.Tensor]): The optional fp32 scale to
+        quantize the intermediate result between the gemms.
+        Shape: scalar or [1, N]
+    - apply_router_weight_on_input (bool): When true, the topk weights are
+        applied directly on the inputs. This is only applicable when topk is 1.
+
+    Returns:
+    - torch.Tensor: The fp8 output tensor after applying the MoE layer.
+    """
+    assert topk_weights.shape == topk_ids_.shape, "topk shape mismatch"
+    assert w1_q.dtype == torch.int8
+    assert w2_q.dtype == torch.int8
+    assert a.shape[1] // 2 == w1_q.shape[2], "Hidden size mismatch w1"
+    assert w1_q.shape[2] * 2 == w2_q.shape[1], "Hidden size mismatch w2"
+    assert w1_q.shape[0] == w2_q.shape[0], "Expert number mismatch"
+    assert w1_q.shape[0] == w1_scale.shape[0], "w1 scales expert number mismatch"
+    assert w1_q.shape[0] == w2_scale.shape[0], "w2 scales expert number mismatch"
+    assert (
+        w1_scale.shape[1] == w1_q.shape[2] * 2 / 512
+        and w1_scale.shape[2] == w1_q.shape[1] * 4
+    ), "W1 scale shape mismatch"
+    assert (
+        w2_scale.shape[1] == w2_q.shape[2] * 2 / 512
+        and w2_scale.shape[2] == w2_q.shape[1] * 4
+    ), "W2 scale shape mismatch"
+
+    assert a_strides1.shape[0] == w1_q.shape[0], "A Strides 1 expert number mismatch"
+    assert b_strides1.shape[0] == w1_q.shape[0], "B Strides 1 expert number mismatch"
+    assert a_strides2.shape[0] == w2_q.shape[0], "A Strides 2 expert number mismatch"
+    assert b_strides2.shape[0] == w2_q.shape[0], "B Strides 2 expert number mismatch"
+    num_experts = w1_q.size(0)
+    m = a.size(0)
+    k = w1_q.size(2) * 2  # w1_q is transposed and packed
+    n = w2_q.size(2) * 2  # w2_q is transposed and packed
+    topk = topk_ids_.size(1)
+
+    if apply_router_weight_on_input:
+        assert topk == 1, "apply_router_weight_on_input is only implemented for topk=1"
+
+    device = a.device
+
+    _, src2dst, _ = run_cutlass_moe_ep_preproess(
+        local_topk_ids,
+        num_experts,
+    )
+
+    gateup_input = torch.empty(
+        (m * topk, k),
+        device=device,
+        dtype=torch.float8_e4m3fn,
+    )
+
+    pre_reorder_triton_kernel_for_cutlass_moe[(m,)](
+        a,
+        gateup_input,
+        src2dst,
+        local_topk_ids,
+        a1_scale,
+        total_num_experts,
+        topk,
+        k,
+        BLOCK_SIZE=512,
+    )
+
+    # NOTE: a_map and c_map are not used in the get_cutlass_w4a8_moe_mm_data kernel,
+    # they are kept to allow for a quick switch of the permutation logic
+    # from the current triton kernel implementation to the cutlass-based one if needed.
+    a_map = torch.empty((local_topk_ids.numel()), dtype=torch.int32, device=device)
+    c_map = torch.empty((local_topk_ids.numel()), dtype=torch.int32, device=device)
+    get_cutlass_w4a8_moe_mm_data(
+        local_topk_ids,
+        expert_offsets,
+        problem_sizes1,
+        problem_sizes2,
+        a_map,
+        c_map,
+        num_experts,
+        n,
+        k,
+    )
+
+    c1 = torch.empty((m * topk, n * 2), device=device, dtype=torch.half)
+    c2 = torch.zeros((m * topk, k), device=device, dtype=torch.half)
+
+    cutlass_w4a8_moe_mm(
+        c1,
+        gateup_input,
+        w1_q,
+        a1_scale.float(),
+        w1_scale,
+        expert_offsets[:-1],
+        problem_sizes1,
+        a_strides1,
+        b_strides1,
+        c_strides1,
+        s_strides13,
+        128,
+        topk,
+    )
+
+    intermediate = torch.empty((m * topk, n), device=device, dtype=torch.half)
+    silu_and_mul(c1, intermediate)
+
+    intermediate_q = torch.empty(
+        intermediate.shape, dtype=torch.float8_e4m3fn, device=device
+    )
+    sgl_per_tensor_quant_fp8(intermediate, intermediate_q, a2_scale.float(), True)
+
+    cutlass_w4a8_moe_mm(
+        c2,
+        intermediate_q,
+        w2_q,
+        a2_scale.float(),
+        w2_scale,
+        expert_offsets[:-1],
+        problem_sizes2,
+        a_strides2,
+        b_strides2,
+        c_strides2,
+        s_strides2,
+        128,
+        topk,
+    )
+
+    output = torch.empty_like(a)
+    post_reorder_triton_kernel[(m,)](
+        c2,
+        output,
+        src2dst,
+        topk_ids_,
+        topk_weights,
+        start_expert_id,
+        end_expert_id,
+        topk,
+        k,
+        0,
+        BLOCK_SIZE=512,
+    )
+    return output
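
For orientation, below is a minimal, unquantized PyTorch sketch of the computation cutlass_w4a8_moe implements: dispatch each token to its top-k experts, run the gate/up projection, apply silu_and_mul, run the down projection, and combine with the router weights. The reference_moe name and the unpacked w1/w2 layouts are illustrative assumptions, not part of the diff; the real kernel operates on int4-packed weights with fp8 activations via CUTLASS grouped GEMM.

import torch
import torch.nn.functional as F


def reference_moe(
    a: torch.Tensor,             # [M, K] input activations
    w1: torch.Tensor,            # [E, 2 * N, K] gate/up projection per expert
    w2: torch.Tensor,            # [E, K, N] down projection per expert
    topk_weights: torch.Tensor,  # [M, topk] router weights
    topk_ids: torch.Tensor,      # [M, topk] expert indices
) -> torch.Tensor:
    out = torch.zeros_like(a)
    for t in range(a.shape[0]):
        for slot in range(topk_ids.shape[1]):
            e = int(topk_ids[t, slot])
            gate_up = w1[e] @ a[t]       # first grouped GEMM: [2 * N]
            gate, up = gate_up.chunk(2)
            hidden = F.silu(gate) * up   # silu_and_mul
            out[t] += topk_weights[t, slot] * (w2[e] @ hidden)  # second grouped GEMM
    return out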
sglang/srt/layers/moe/ep_moe/kernels.py
@@ -6,6 +6,7 @@ import triton
 
 from sglang.srt.layers.quantization.fp8_kernel import per_token_group_quant_fp8
 from sglang.srt.utils import ceil_div, dispose_tensor, is_cuda
+from sglang.utils import is_in_ci
 
 logger = logging.getLogger(__name__)
 
@@ -146,6 +147,7 @@ def compute_seg_indptr_triton_kernel(reorder_topk_ids, seg_indptr, num_toks):
 
 def run_moe_ep_preproess(topk_ids: torch.Tensor, num_experts: int):
     reorder_topk_ids, reorder_ids = torch.sort(topk_ids.view(-1), stable=True)
+
     seg_indptr = torch.zeros(num_experts + 1, device=topk_ids.device, dtype=torch.int64)
     src2dst = torch.empty(topk_ids.numel(), device=topk_ids.device, dtype=torch.int32)
 
@@ -158,9 +160,66 @@ def run_moe_ep_preproess(topk_ids: torch.Tensor, num_experts: int):
     compute_src2dst_triton_kernel[grid](
         reorder_ids, src2dst, topk_ids.numel(), BLOCK_SIZE
     )
+
     return reorder_topk_ids, src2dst, seg_indptr
 
 
+def run_cutlass_moe_ep_preproess(local_topk_ids: torch.Tensor, local_num_experts: int):
+    reorder_topk_ids, reorder_ids = torch.sort(local_topk_ids.view(-1), stable=True)
+
+    seg_indptr = torch.zeros(
+        local_num_experts + 1, device=local_topk_ids.device, dtype=torch.int64
+    )
+    src2dst = torch.empty(
+        local_topk_ids.numel(), device=local_topk_ids.device, dtype=torch.int32
+    )
+
+    BLOCK_SIZE = 512
+    grid = (triton.cdiv(local_topk_ids.numel(), BLOCK_SIZE),)
+    compute_src2dst_triton_kernel[grid](
+        reorder_ids, src2dst, local_topk_ids.numel(), BLOCK_SIZE
+    )
+
+    return reorder_topk_ids, src2dst, seg_indptr
+
+
+@triton.jit
+def pre_reorder_triton_kernel_for_cutlass_moe(
+    input_ptr,
+    gateup_input_ptr,
+    src2dst_ptr,
+    topk_ids_ptr,
+    a1_scales_ptr,
+    num_experts,
+    topk,
+    hidden_size,
+    BLOCK_SIZE: tl.constexpr,
+):
+    OutDtype = gateup_input_ptr.dtype.element_ty
+
+    src_idx = tl.program_id(0)
+    src2dst_ptr = src2dst_ptr + src_idx * topk
+    topk_ids_ptr = topk_ids_ptr + src_idx * topk
+
+    src_ptr = input_ptr + src_idx * hidden_size
+    for idx in range(topk):
+        expert_id = tl.load(topk_ids_ptr + idx)
+        if expert_id != num_experts:
+            if a1_scales_ptr is not None:
+                scale = 1.0 / tl.load(a1_scales_ptr)
+            else:
+                scale = 1.0
+
+            dst_idx = tl.load(src2dst_ptr + idx)
+            dst_ptr = gateup_input_ptr + dst_idx * hidden_size
+            for start_offset in tl.range(0, hidden_size, BLOCK_SIZE):
+                offset = start_offset + tl.arange(0, BLOCK_SIZE)
+                mask = offset < hidden_size
+                in_data = tl.load(src_ptr + offset, mask=mask).to(tl.float32)
+                out_data = (in_data * scale).to(OutDtype)
+                tl.store(dst_ptr + offset, out_data, mask=mask)
+
+
 @triton.jit
 def pre_reorder_triton_kernel(
     input_ptr,
@@ -1000,7 +1059,7 @@ def ep_gather(
     input_index: torch.Tensor,
     output_tensor: torch.Tensor,
 ):
-    BLOCK_D = 1024  # block size of quantization
+    BLOCK_D = 1024 if not is_in_ci() else 128  # block size of quantization
     num_warps = 2
     num_tokens = output_tensor.shape[0]
     hidden_size = input_tensor.shape[1]
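
The run_cutlass_moe_ep_preproess helper added above sorts the flattened top-k expert ids and produces a src2dst map from each (token, slot) pair to its row in the expert-grouped activation buffer; pre_reorder_triton_kernel_for_cutlass_moe then scatters and fp8-scales each token into those rows. A plain-PyTorch sketch of that bookkeeping, under the assumption that compute_src2dst_triton_kernel writes the inverse permutation of the stable sort:

import torch

topk_ids = torch.tensor([[0, 2], [1, 0], [2, 2]])  # [num_tokens, topk]
flat = topk_ids.view(-1)
reorder_topk_ids, reorder_ids = torch.sort(flat, stable=True)

# src2dst[i]: destination row of flattened slot i in the expert-sorted buffer
src2dst = torch.empty_like(reorder_ids, dtype=torch.int32)
src2dst[reorder_ids] = torch.arange(flat.numel(), dtype=torch.int32)

print(reorder_topk_ids.tolist())  # [0, 0, 1, 2, 2, 2] -> rows grouped by expert id
print(src2dst.tolist())           # [0, 3, 2, 1, 4, 5]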
sglang/srt/layers/moe/ep_moe/layer.py
@@ -20,6 +20,8 @@ from sglang.srt.layers.moe.ep_moe.kernels import (
     moe_ep_deepgemm_preprocess,
     post_reorder_triton_kernel,
     pre_reorder_triton_kernel,
+    pre_reorder_triton_kernel_for_cutlass_moe,
+    run_cutlass_moe_ep_preproess,
     run_moe_ep_preproess,
     silu_and_mul_masked_post_quant_fwd,
     silu_and_mul_triton_kernel,
@@ -41,6 +43,7 @@ from sglang.srt.layers.quantization.fp8_kernel import (
     sglang_per_token_quant_fp8,
 )
 from sglang.srt.layers.quantization.fp8_utils import normalize_e4m3fn_to_e4m3fnuz
+from sglang.srt.layers.quantization.w4afp8 import W4AFp8Config, W4AFp8MoEMethod
 from sglang.srt.managers.schedule_batch import global_server_args_dict
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
 from sglang.srt.utils import (
@@ -61,6 +64,8 @@ _use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip
 if not _is_npu:
     from sgl_kernel import silu_and_mul
 
+    from sglang.srt.layers.moe.cutlass_w4a8_moe import cutlass_w4a8_moe
+
 if _is_hip:
     from vllm._custom_ops import scaled_fp8_quant
 
@@ -191,7 +196,7 @@ class EPMoE(torch.nn.Module):
             num_fused_shared_experts == 0
         ), "num_fused_shared_experts is not supported in EP"
         self.num_fused_shared_experts = num_fused_shared_experts
-        self.num_experts_per_partition = self.num_experts // self.tp_size
+        self.num_experts_per_partition, self.expert_map = self.determine_expert_map()
        self.start_expert_id = self.tp_rank * self.num_experts_per_partition
        self.end_expert_id = self.start_expert_id + self.num_experts_per_partition - 1
 
@@ -215,6 +220,18 @@ class EPMoE(torch.nn.Module):
             self.use_block_quant = False
             self.block_shape = None
             self.activation_scheme = None
+            self.use_w4afp8 = False
+        elif isinstance(quant_config, W4AFp8Config):
+            self.quant_method: Optional[QuantizeMethodBase] = W4AFp8MoEMethod(
+                quant_config
+            )
+            self.use_w4afp8 = True
+            self.use_fp8_w8a8 = False
+            self.use_block_quant = False
+            self.fp8_dtype = torch.float8_e4m3fn
+            self.w13_weight_scale = None
+            self.w2_weight_scale = None
+            self.activation_scheme = quant_config.moe_activation_scheme
         else:
             self.quant_method: Optional[QuantizeMethodBase] = Fp8EPMoEMethod(
                 quant_config
@@ -228,6 +245,7 @@ class EPMoE(torch.nn.Module):
             )
             self.fp8_dtype = torch.float8_e4m3fn
             self.activation_scheme = quant_config.activation_scheme
+            self.use_w4afp8 = False
 
         self.quant_method.create_weights(
             layer=self,
@@ -253,6 +271,49 @@ class EPMoE(torch.nn.Module):
             self.w2_weight_scale_inv if self.use_block_quant else self.w2_weight_scale,
         )
 
+    # Adapted from https://github.com/vllm-project/vllm/blob/9fb52e523abf7bdaf7e60cf2971edb5a1b13dc08/vllm/model_executor/layers/fused_moe/layer.py#L544C1-L586C43
+    # Modifications: use determine_expert_map as a class internal function, set 'global_num_experts' rather than '-1' for experts not assigned to the current rank.
+    def determine_expert_map(self) -> Tuple[int, Optional[torch.Tensor]]:
+        """
+        Calculates how many experts should be assigned to each rank for EP and
+        creates a mapping from global to local expert index. Experts are
+        distributed evenly across ranks. Any remaining are assigned to the
+        last rank.
+
+        Returns:
+            Tuple[int, Optional[torch.Tensor]]: A tuple containing:
+                - local_num_experts (int): The number of experts assigned
+                  to the current rank.
+                - expert_map (Optional[torch.Tensor]): A tensor of shape
+                  (global_num_experts,) mapping from global to local index.
+                  Contains global_num_experts for experts not assigned to the current rank.
+                  Returns None if ep_size is 1.
+        """
+        ep_size = self.tp_size
+        ep_rank = self.tp_rank
+        global_num_experts = self.num_experts
+
+        assert ep_size > 0
+        if ep_size == 1:
+            return (global_num_experts, None)
+
+        local_num_experts = global_num_experts // ep_size
+
+        expert_map = torch.full(
+            (global_num_experts,), self.num_experts, dtype=torch.int32
+        )
+        if ep_rank < (ep_size - 1):
+            expert_map[
+                ep_rank * local_num_experts : (ep_rank + 1) * local_num_experts
+            ] = torch.arange(0, local_num_experts, dtype=torch.int32)
+        else:
+            local_num_experts = global_num_experts - ep_rank * local_num_experts
+
+            expert_map[-local_num_experts:] = torch.arange(
+                0, local_num_experts, dtype=torch.int32
+            )
+        return (local_num_experts, expert_map)
+
     def forward(self, hidden_states: torch.Tensor, router_logits: torch.Tensor):
         if deep_gemm_wrapper.ENABLE_JIT_DEEPGEMM and self.use_fp8_w8a8:
             return self.forward_deepgemm(hidden_states, router_logits)
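
To make the new expert mapping concrete: determine_expert_map splits the global experts evenly across EP ranks, gives any remainder to the last rank, and fills unassigned positions with the sentinel value num_experts (instead of vLLM's -1). The following standalone sketch mirrors that logic for a hypothetical 10 experts on 4 ranks; the function name and sizes are illustrative only.

from typing import Optional, Tuple

import torch


def determine_expert_map_demo(
    ep_size: int, ep_rank: int, global_num_experts: int
) -> Tuple[int, Optional[torch.Tensor]]:
    # Even split, remainder to the last rank, sentinel = global_num_experts
    if ep_size == 1:
        return global_num_experts, None
    local_num_experts = global_num_experts // ep_size
    expert_map = torch.full((global_num_experts,), global_num_experts, dtype=torch.int32)
    if ep_rank < ep_size - 1:
        start = ep_rank * local_num_experts
        expert_map[start : start + local_num_experts] = torch.arange(
            local_num_experts, dtype=torch.int32
        )
    else:
        local_num_experts = global_num_experts - ep_rank * local_num_experts
        expert_map[-local_num_experts:] = torch.arange(local_num_experts, dtype=torch.int32)
    return local_num_experts, expert_map


print(determine_expert_map_demo(4, 0, 10))  # 2 local experts, map [0, 1, 10, 10, ..., 10]
print(determine_expert_map_demo(4, 3, 10))  # 4 local experts, map [10, ..., 10, 0, 1, 2, 3]

In the w4afp8 forward path below, indexing expert_map with topk_ids then effectively yields local_topk_ids, with the sentinel marking tokens routed to experts owned by other ranks.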
@@ -440,6 +501,51 @@
             ),
         )
 
+        if self.use_w4afp8:
+            local_topk_ids = topk_ids
+            if self.expert_map is not None:
+                "Translate info from expert_map to topk_ids"
+                local_topk_ids = torch.where(
+                    self.expert_map[topk_ids] != self.num_experts,
+                    self.expert_map[topk_ids],
+                    self.num_experts,
+                )
+
+            output = cutlass_w4a8_moe(
+                self.start_expert_id,
+                self.end_expert_id,
+                self.num_experts,
+                hidden_states,
+                self.w13_weight,
+                self.w2_weight,
+                self.w13_weight_scale_inv,
+                self.w2_weight_scale_inv,
+                topk_weights,
+                topk_ids,
+                local_topk_ids,
+                self.quant_method.a_strides1,
+                self.quant_method.b_strides1,
+                self.quant_method.c_strides1,
+                self.quant_method.a_strides2,
+                self.quant_method.b_strides2,
+                self.quant_method.c_strides2,
+                self.quant_method.s_strides13,
+                self.quant_method.s_strides2,
+                self.quant_method.expert_offsets,
+                self.quant_method.problem_sizes1,
+                self.quant_method.problem_sizes2,
+                self.w13_input_scale,
+                self.w2_input_scale,
+            )
+            return output
+
+        if self.grouped_gemm_runner is None:
+            self.grouped_gemm_runner = GroupedGemmRunner(
+                hidden_states.device,
+                use_flashinfer=False,  # TODO: use flashinfer
+                use_per_token_if_dynamic=self.use_per_token_if_dynamic,
+            )
+
         reorder_topk_ids, src2dst, seg_indptr = run_moe_ep_preproess(
             topk_ids, self.num_experts
         )
@@ -449,7 +555,7 @@
             device=hidden_states.device,
             dtype=(
                 self.fp8_dtype
-                if (self.use_fp8_w8a8 and not self.use_block_quant)
+                if ((self.use_fp8_w8a8 or self.use_w4afp8) and not self.use_block_quant)
                 else hidden_states.dtype
             ),
         )
@@ -656,6 +762,23 @@
             ]
         ]
 
+    @classmethod
+    def make_expert_input_scale_params_mapping(
+        cls,
+        num_experts: int,
+    ) -> List[Tuple[str, str, int, str]]:
+        # (param_name, weight_name, expert_id, shard_id)
+        return [
+            (
+                "experts.w13_" if shard_id in ["w1", "w3"] else "experts.w2_",
+                f"experts.{expert_id}.{shard_id}.",
+                expert_id,
+                shard_id,
+            )
+            for expert_id in range(num_experts)
+            for shard_id in ["w1", "w2", "w3"]
+        ]
+
     def weight_loader(
         self,
         param: torch.nn.Parameter,
@@ -727,6 +850,15 @@
 
         # Input scales can be loaded directly and should be equal.
         if "input_scale" in weight_name:
+            if self.use_w4afp8:
+                if shard_id == "w1":
+                    param_data[expert_id][0] = loaded_weight
+                elif shard_id == "w3":
+                    param_data[expert_id][1] = loaded_weight
+                else:
+                    param_data[expert_id] = loaded_weight
+                return
+
             if (
                 (shard_id == "w1" or shard_id == "w3")
                 and param_data[expert_id] != 1
@@ -752,6 +884,13 @@
                    ] = loaded_weight
                else:  # w2
                    param_data[expert_id] = loaded_weight
+            elif self.use_w4afp8:
+                if shard_id == "w1":
+                    param_data[expert_id][: self.intermediate_size, :] = loaded_weight
+                elif shard_id == "w3":
+                    param_data[expert_id][self.intermediate_size :, :] = loaded_weight
+                else:
+                    param_data[expert_id] = loaded_weight
            # If we are in merged column case (gate_up_proj)
            else:
                if shard_id in ("w1", "w3"):
sglang/srt/layers/moe/fused_moe_triton/fused_moe.py
@@ -1737,6 +1737,7 @@ def fused_moe(
     renormalize: bool,
     inplace: bool = False,
     activation: str = "silu",
+    apply_router_weight_on_input: bool = False,
     use_grouped_topk: bool = False,
     num_expert_group: Optional[int] = None,
     num_fused_shared_experts: int = 0,
@@ -1822,6 +1823,7 @@
         topk_ids,
         inplace=inplace,
         activation=activation,
+        apply_router_weight_on_input=apply_router_weight_on_input,
         use_fp8_w8a8=use_fp8_w8a8,
         use_int8_w8a8=use_int8_w8a8,
         use_int8_w8a16=use_int8_w8a16,
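
The new apply_router_weight_on_input argument is only threaded through to the underlying kernel here. Operationally, and as the cutlass_w4a8_moe docstring notes for the top-1 case, it moves the router-weight multiplication from the expert output to the expert input. The sketch below contrasts the two conventions with a toy nonlinear expert; all names here are illustrative, not part of the diff.

import torch
import torch.nn.functional as F


def toy_expert(x: torch.Tensor) -> torch.Tensor:
    # Stand-in for the gate/up + SiLU + down expert MLP
    return F.silu(x) * x


x = torch.randn(4, 8)   # [num_tokens, hidden]
w = torch.rand(4, 1)    # one router weight per token (topk == 1)

out_on_output = w * toy_expert(x)  # default: scale the expert output
out_on_input = toy_expert(w * x)   # apply_router_weight_on_input: scale the input

# Not numerically identical for a nonlinear expert; the flag matches the
# convention a given checkpoint was trained with.
print((out_on_output - out_on_input).abs().max())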