sglang 0.4.9.post5__py3-none-any.whl → 0.4.10__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to a supported public registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in that registry.
Files changed (84)
  1. sglang/bench_one_batch.py +3 -0
  2. sglang/srt/configs/__init__.py +8 -0
  3. sglang/srt/configs/model_config.py +6 -0
  4. sglang/srt/configs/step3_vl.py +172 -0
  5. sglang/srt/conversation.py +23 -0
  6. sglang/srt/disaggregation/decode.py +2 -8
  7. sglang/srt/disaggregation/prefill.py +2 -6
  8. sglang/srt/distributed/parallel_state.py +86 -1
  9. sglang/srt/entrypoints/engine.py +14 -18
  10. sglang/srt/entrypoints/http_server.py +23 -3
  11. sglang/srt/entrypoints/openai/protocol.py +3 -1
  12. sglang/srt/entrypoints/openai/serving_base.py +5 -2
  13. sglang/srt/entrypoints/openai/serving_chat.py +2 -21
  14. sglang/srt/eplb/expert_distribution.py +5 -0
  15. sglang/srt/eplb/expert_location.py +17 -6
  16. sglang/srt/eplb/expert_location_dispatch.py +1 -0
  17. sglang/srt/eplb/expert_location_updater.py +2 -0
  18. sglang/srt/function_call/function_call_parser.py +2 -0
  19. sglang/srt/function_call/step3_detector.py +436 -0
  20. sglang/srt/hf_transformers_utils.py +2 -0
  21. sglang/srt/jinja_template_utils.py +4 -1
  22. sglang/srt/layers/moe/cutlass_moe.py +2 -1
  23. sglang/srt/layers/moe/ep_moe/layer.py +98 -603
  24. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +83 -118
  25. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  26. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  27. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +26 -13
  28. sglang/srt/layers/moe/fused_moe_triton/layer.py +97 -38
  29. sglang/srt/layers/moe/token_dispatcher/__init__.py +0 -0
  30. sglang/srt/layers/moe/token_dispatcher/base_dispatcher.py +48 -0
  31. sglang/srt/layers/moe/token_dispatcher/standard.py +19 -0
  32. sglang/srt/layers/moe/topk.py +6 -2
  33. sglang/srt/layers/quantization/fp8.py +0 -18
  34. sglang/srt/layers/quantization/modelopt_quant.py +2 -0
  35. sglang/srt/layers/quantization/unquant.py +0 -8
  36. sglang/srt/layers/quantization/w4afp8.py +1 -0
  37. sglang/srt/managers/cache_controller.py +143 -45
  38. sglang/srt/managers/data_parallel_controller.py +6 -0
  39. sglang/srt/managers/io_struct.py +12 -2
  40. sglang/srt/managers/scheduler.py +116 -669
  41. sglang/srt/managers/scheduler_input_blocker.py +106 -0
  42. sglang/srt/managers/scheduler_metrics_mixin.py +229 -0
  43. sglang/srt/managers/scheduler_profiler_mixin.py +279 -0
  44. sglang/srt/managers/scheduler_update_weights_mixin.py +142 -0
  45. sglang/srt/managers/template_manager.py +62 -19
  46. sglang/srt/managers/tokenizer_manager.py +166 -83
  47. sglang/srt/managers/tp_worker.py +9 -0
  48. sglang/srt/managers/tp_worker_overlap_thread.py +2 -1
  49. sglang/srt/mem_cache/hicache_storage.py +45 -11
  50. sglang/srt/mem_cache/hiradix_cache.py +15 -4
  51. sglang/srt/mem_cache/memory_pool_host.py +73 -1
  52. sglang/srt/mem_cache/mooncake_store/mooncake_store.py +264 -0
  53. sglang/srt/mem_cache/mooncake_store/unit_test.py +40 -0
  54. sglang/srt/mem_cache/storage/hf3fs/client_hf3fs.py +177 -0
  55. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +278 -0
  56. sglang/srt/mem_cache/storage/hf3fs/test_hf3fs_utils.py +43 -0
  57. sglang/srt/model_executor/model_runner.py +20 -13
  58. sglang/srt/models/arcee.py +532 -0
  59. sglang/srt/models/deepseek_v2.py +15 -56
  60. sglang/srt/models/glm4_moe.py +3 -1
  61. sglang/srt/models/granitemoe.py +3 -0
  62. sglang/srt/models/grok.py +3 -0
  63. sglang/srt/models/hunyuan.py +1 -0
  64. sglang/srt/models/llama4.py +3 -0
  65. sglang/srt/models/mixtral.py +3 -0
  66. sglang/srt/models/olmoe.py +3 -0
  67. sglang/srt/models/phimoe.py +1 -0
  68. sglang/srt/models/qwen3_moe.py +12 -69
  69. sglang/srt/models/step3_vl.py +994 -0
  70. sglang/srt/multimodal/processors/base_processor.py +15 -16
  71. sglang/srt/multimodal/processors/step3_vl.py +515 -0
  72. sglang/srt/poll_based_barrier.py +31 -0
  73. sglang/srt/reasoning_parser.py +2 -1
  74. sglang/srt/server_args.py +18 -13
  75. sglang/srt/speculative/eagle_worker.py +2 -0
  76. sglang/srt/two_batch_overlap.py +8 -3
  77. sglang/test/test_utils.py +53 -0
  78. sglang/utils.py +0 -11
  79. sglang/version.py +1 -1
  80. {sglang-0.4.9.post5.dist-info → sglang-0.4.10.dist-info}/METADATA +4 -4
  81. {sglang-0.4.9.post5.dist-info → sglang-0.4.10.dist-info}/RECORD +84 -64
  82. {sglang-0.4.9.post5.dist-info → sglang-0.4.10.dist-info}/WHEEL +0 -0
  83. {sglang-0.4.9.post5.dist-info → sglang-0.4.10.dist-info}/licenses/LICENSE +0 -0
  84. {sglang-0.4.9.post5.dist-info → sglang-0.4.10.dist-info}/top_level.txt +0 -0
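The hunks reproduced below cover sglang/srt/layers/moe/ep_moe/layer.py (entry 23 above, +98 -603). The most visible change there is that DeepEPMoE now owns its DeepEP token dispatcher and splits forward_deepep into dispatch, moe_impl, and combine steps, while much of the old EPMoE-specific plumbing (GroupedGemmRunner, determine_expert_map, run_moe) moves into or is replaced by the FusedMoE base class. For orientation only, here is a minimal, self-contained sketch of that control flow; ToyDispatcher, ToyDeepEPMoE, and the simplified ToyDispatchOutput are illustrative stand-ins, not sglang's actual API.

# Minimal sketch (stand-ins, not sglang's API) of the dispatch -> moe_impl -> combine
# pipeline that the layer.py hunks below introduce.
from dataclasses import dataclass
from typing import List


@dataclass
class ToyDispatchOutput:
    # stand-in for the DispatchOutput / DeepEPNormalOutput types referenced in the diff
    hidden_states: List[float]
    topk_idx: List[int]
    topk_weights: List[float]


class ToyDispatcher:
    # stand-in for MaybeTboDeepEPDispatcher: permutes tokens to experts and back
    def dispatch(self, hidden_states, topk_idx, topk_weights) -> ToyDispatchOutput:
        return ToyDispatchOutput(hidden_states, topk_idx, topk_weights)

    def combine(self, hidden_states, topk_idx, topk_weights):
        return hidden_states


class ToyDeepEPMoE:
    def __init__(self):
        self.deepep_dispatcher = ToyDispatcher()

    def moe_impl(self, dispatch_output: ToyDispatchOutput):
        # the real layer picks forward_aiter / forward_deepgemm_contiguous /
        # forward_deepgemm_masked based on the dispatch output format; the toy
        # just scales each token by its routing weight
        return [
            h * w
            for h, w in zip(dispatch_output.hidden_states, dispatch_output.topk_weights)
        ]

    def forward_deepep(self, hidden_states, topk_idx, topk_weights):
        dispatch_output = self.deepep_dispatcher.dispatch(
            hidden_states, topk_idx, topk_weights
        )
        out = self.moe_impl(dispatch_output)
        return self.deepep_dispatcher.combine(
            out, dispatch_output.topk_idx, dispatch_output.topk_weights
        )


if __name__ == "__main__":
    layer = ToyDeepEPMoE()
    print(layer.forward_deepep([1.0, 2.0], [0, 1], [0.5, 0.25]))  # [0.5, 0.5]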
@@ -1,5 +1,7 @@
+from __future__ import annotations
+
 import logging
-from typing import List, Optional, Tuple
+from typing import TYPE_CHECKING, List, Optional, Tuple
 
 import torch
 
@@ -50,6 +52,13 @@ from sglang.srt.utils import (
     next_power_of_2,
 )
 
+if TYPE_CHECKING:
+    from sglang.srt.layers.moe.ep_moe.token_dispatcher import (
+        DeepEPLLOutput,
+        DeepEPNormalOutput,
+        DispatchOutput,
+    )
+
 _is_hip = is_hip()
 _is_npu = is_npu()
 _is_fp8_fnuz = is_fp8_fnuz()
@@ -77,79 +86,6 @@ if use_flashinfer_trtllm_moe:
 logger = logging.getLogger(__name__)
 
 
-class GroupedGemmRunner(torch.nn.Module):
-    flashinfer_gemm_warpper = None
-
-    def __init__(
-        self,
-        device,
-        use_flashinfer: bool = False,
-        use_per_token_if_dynamic: bool = True,
-    ):
-        super().__init__()
-        self.device = device
-        self.use_flashinfer = use_flashinfer
-        self.use_per_token_if_dynamic = use_per_token_if_dynamic
-        if self.use_flashinfer and GroupedGemmRunner.flashinfer_gemm_warpper is None:
-            GroupedGemmRunner._init_flashinfer_wrapper(device)
-
-    @classmethod
-    def _init_flashinfer_wrapper(cls, device):
-        from flashinfer import SegmentGEMMWrapper
-
-        workspace_buffer = torch.empty(
-            128 * 1024 * 1024, dtype=torch.int8, device=device
-        )
-        cls.flashinfer_gemm_warpper = SegmentGEMMWrapper(workspace_buffer)
-
-    # c = a * b
-    def forward(
-        self,
-        a: torch.Tensor,
-        b: torch.Tensor,
-        c: torch.Tensor,
-        batch_size: int,
-        weight_column_major: bool,
-        seg_indptr: Optional[torch.Tensor] = None,
-        weight_indices: Optional[torch.Tensor] = None,
-        use_fp8_w8a8: bool = False,
-        scale_a: torch.Tensor = None,
-        scale_b: torch.Tensor = None,
-        block_shape: Optional[List[int]] = None,
-        c_dtype=None,
-    ):
-        if self.use_flashinfer:
-            # TODO: flashinfer
-            assert False
-            assert GroupedGemmRunner.flashinfer_gemm_warpper is not None
-            c = GroupedGemmRunner.flashinfer_gemm_warpper.run(
-                x=a,
-                weights=b,
-                batch_size=batch_size,
-                weight_column_major=weight_column_major,
-                seg_indptr=seg_indptr,
-                weight_indices=weight_indices,
-            )
-        else:
-            assert weight_column_major == True
-            c = grouped_gemm_triton(
-                a,
-                b,
-                c,
-                batch_size,
-                weight_column_major,
-                seg_indptr,
-                weight_indices,
-                use_fp8_w8a8,
-                scale_a,
-                scale_b,
-                block_shape=block_shape,
-                c_dtype=c_dtype,
-                use_per_token_if_dynamic=self.use_per_token_if_dynamic,
-            )
-        return c
-
-
 def _get_tile_tokens_dim(num_tokens, top_k, num_experts):
     # Guess tokens per expert assuming perfect expert distribution first.
     num_tokens_per_expert = (num_tokens * top_k) // num_experts
@@ -174,140 +110,57 @@ class EPMoE(FusedMoE):
         hidden_size: int,
         intermediate_size: int,
         layer_id: int,
+        num_fused_shared_experts: int = 0,
         params_dtype: Optional[torch.dtype] = None,
         quant_config: Optional[QuantizationConfig] = None,
         tp_size: Optional[int] = None,
         prefix: str = "",
         activation: str = "silu",
         routed_scaling_factor: Optional[float] = None,
-        use_per_token_if_dynamic: bool = True,
     ):
         super().__init__(
             num_experts=num_experts,
             hidden_size=hidden_size,
             intermediate_size=intermediate_size,
-            top_k=top_k,
+            num_fused_shared_experts=num_fused_shared_experts,
             layer_id=layer_id,
+            top_k=top_k,
            params_dtype=params_dtype,
             quant_config=quant_config,
             tp_size=tp_size,
             prefix=prefix,
             activation=activation,
+            # apply_router_weight_on_input=apply_router_weight_on_input,
             routed_scaling_factor=routed_scaling_factor,
             enable_ep_moe=True,
-            skip_quant=True,
         )
 
-        if params_dtype is None:
-            params_dtype = torch.get_default_dtype()
-
-        self.layer_id = layer_id
-        self.num_local_experts, self.expert_map = self.determine_expert_map()
-        self.start_expert_id = self.ep_rank * self.num_local_experts
+        self.start_expert_id = self.moe_ep_rank * self.num_local_experts
         self.end_expert_id = self.start_expert_id + self.num_local_experts - 1
 
         self.intermediate_size = intermediate_size
-        self.use_per_token_if_dynamic = use_per_token_if_dynamic
 
-        # TODO(ch-wan): move quant preparation to FusedMoE
-        if quant_config is None:
-            self.quant_method: Optional[QuantizeMethodBase] = (
-                UnquantizedFusedMoEMethod()
-            )
-            self.use_fp8_w8a8 = False
-            self.use_block_quant = False
-            self.block_shape = None
-            self.activation_scheme = None
-            self.w13_input_scale = None
-            self.w2_input_scale = None
-            self.w13_weight_scale = None
-            self.w2_weight_scale = None
-        elif isinstance(quant_config, W4AFp8Config):
-            self.quant_method: Optional[QuantizeMethodBase] = W4AFp8MoEMethod(
-                quant_config
-            )
-            self.use_fp8_w8a8 = False
-            self.use_block_quant = False
-            self.fp8_dtype = torch.float8_e4m3fn
-            self.w13_input_scale = None
-            self.w2_input_scale = None
-            self.w13_weight_scale = None
-            self.w2_weight_scale = None
-            self.activation_scheme = quant_config.moe_activation_scheme
-        elif isinstance(quant_config, Fp8Config):
-            self.quant_method: Optional[QuantizeMethodBase] = Fp8MoEMethod(quant_config)
-            self.use_fp8_w8a8 = True
+        if isinstance(quant_config, Fp8Config):
             self.use_block_quant = getattr(self.quant_method, "block_quant", False)
             self.block_shape = (
                 self.quant_method.quant_config.weight_block_size
                 if self.use_block_quant
                 else None
            )
+            self.use_fp8_w8a8 = True
             self.fp8_dtype = torch.float8_e4m3fn
             self.activation_scheme = quant_config.activation_scheme
         else:
-            raise ValueError(f"Unsupported quant_config: {quant_config}")
-
-        self.quant_config = quant_config
-        self.quant_method.create_weights(
-            layer=self,
-            num_experts=self.num_local_experts,
-            hidden_size=hidden_size,
-            intermediate_size=self.intermediate_size,
-            params_dtype=params_dtype,
-            weight_loader=self.weight_loader,
-        )
-
-        self.grouped_gemm_runner = None
-
-    # Adapted from https://github.com/vllm-project/vllm/blob/9fb52e523abf7bdaf7e60cf2971edb5a1b13dc08/vllm/model_executor/layers/fused_moe/layer.py#L544C1-L586C43
-    # Modifications: use determine_expert_map as a class internal function, set 'global_num_experts' rather than '-1' for experts not assigned to the current rank.
-    def determine_expert_map(self) -> Tuple[int, Optional[torch.Tensor]]:
-        """
-        Calculates how many experts should be assigned to each rank for EP and
-        creates a mapping from global to local expert index. Experts are
-        distributed evenly across ranks. Any remaining are assigned to the
-        last rank.
-
-        Returns:
-            Tuple[int, Optional[torch.Tensor]]: A tuple containing:
-                - local_num_experts (int): The number of experts assigned
-                    to the current rank.
-                - expert_map (Optional[torch.Tensor]): A tensor of shape
-                    (global_num_experts,) mapping from global to local index.
-                    Contains global_num_experts for experts not assigned to the current rank.
-                    Returns None if ep_size is 1.
-        """
-        ep_size = self.ep_size
-        ep_rank = self.ep_rank
-        global_num_experts = self.num_experts
-
-        assert ep_size > 0
-        if ep_size == 1:
-            return (global_num_experts, None)
-
-        local_num_experts = global_num_experts // ep_size
-
-        expert_map = torch.full(
-            (global_num_experts,), global_num_experts, dtype=torch.int32
-        )
-        if ep_rank < (ep_size - 1):
-            expert_map[
-                ep_rank * local_num_experts : (ep_rank + 1) * local_num_experts
-            ] = torch.arange(0, local_num_experts, dtype=torch.int32)
-        else:
-            local_num_experts = global_num_experts - ep_rank * local_num_experts
-
-            expert_map[-local_num_experts:] = torch.arange(
-                0, local_num_experts, dtype=torch.int32
-            )
-        return (local_num_experts, expert_map)
+            self.use_fp8_w8a8 = False
+            self.use_block_quant = False
+            self.block_shape = None
+            self.activation_scheme = None
 
     def forward(self, hidden_states: torch.Tensor, topk_output: TopKOutput):
         if deep_gemm_wrapper.ENABLE_JIT_DEEPGEMM and self.use_fp8_w8a8:
             return self.forward_deepgemm(hidden_states, topk_output)
         else:
-            return self.forward_normal(hidden_states, topk_output)
+            return super().forward(hidden_states, topk_output)
 
     def forward_deepgemm(
         self,
@@ -466,294 +319,6 @@ class EPMoE(FusedMoE):
         )
         return output
 
-    def forward_normal(self, hidden_states: torch.Tensor, topk_output: TopKOutput):
-        return self.quant_method.apply(self, hidden_states, topk_output)
-
-    def run_moe(self, hidden_states: torch.Tensor, topk_output: TopKOutput):
-
-        topk_weights, topk_ids, _ = topk_output
-
-        hidden_states_shape = hidden_states.shape
-        hidden_states_dtype = hidden_states.dtype
-        hidden_states_device = hidden_states.device
-        if self.grouped_gemm_runner is None:
-            self.grouped_gemm_runner = GroupedGemmRunner(
-                hidden_states.device,
-                use_flashinfer=False,  # TODO: use flashinfer
-                use_per_token_if_dynamic=self.use_per_token_if_dynamic,
-            )
-
-        num_experts = self.num_experts
-
-        reorder_topk_ids, src2dst, seg_indptr = run_moe_ep_preproess(
-            topk_ids,
-            num_experts,
-        )
-
-        gateup_input = torch.empty(
-            (int(hidden_states.shape[0] * self.top_k), hidden_states.shape[1]),
-            device=hidden_states.device,
-            dtype=(
-                self.fp8_dtype
-                if self.use_fp8_w8a8 and not self.use_block_quant
-                else hidden_states.dtype
-            ),
-        )
-        if self.activation_scheme == "dynamic" and not self.use_block_quant:
-            if self.use_per_token_if_dynamic:
-                max_value = torch.max(hidden_states, dim=1).values.to(torch.float32)
-                self.w13_input_scale = max_value / torch.finfo(self.fp8_dtype).max
-            else:
-                max_value = (
-                    torch.max(hidden_states)
-                    .repeat(self.num_local_experts)
-                    .to(torch.float32)
-                )
-                self.w13_input_scale = max_value / torch.finfo(self.fp8_dtype).max
-
-        # PreReorder
-        pre_reorder_triton_kernel[(hidden_states.shape[0],)](
-            hidden_states,
-            gateup_input,
-            src2dst,
-            topk_ids,
-            self.w13_input_scale,
-            self.start_expert_id,
-            self.end_expert_id,
-            self.top_k,
-            hidden_states.shape[1],
-            BLOCK_SIZE=512,
-            use_per_token_if_dynamic=self.use_per_token_if_dynamic,
-        )
-        dispose_tensor(hidden_states)
-
-        if (
-            self.activation_scheme == "dynamic"
-            and not self.use_block_quant
-            and self.use_per_token_if_dynamic
-        ):
-            scale = torch.empty(
-                hidden_states_shape[0] * self.top_k,
-                device=hidden_states_device,
-                dtype=torch.float32,
-            )
-            scale[src2dst] = (
-                self.w13_input_scale.unsqueeze(1)
-                .expand(hidden_states_shape[0], self.top_k)
-                .reshape(-1)
-            )
-            self.w13_input_scale = scale
-
-        seg_indptr_cur_rank = seg_indptr[self.start_expert_id : self.end_expert_id + 2]
-        weight_indices_cur_rank = torch.arange(
-            0,
-            self.num_local_experts,
-            device=hidden_states_device,
-            dtype=torch.int64,
-        )
-        # GroupGemm-0
-        gateup_output = self.grouped_gemm_runner(
-            a=gateup_input,
-            b=self.w13_weight,
-            c=None,
-            c_dtype=hidden_states_dtype,
-            batch_size=self.num_local_experts,
-            weight_column_major=True,
-            seg_indptr=seg_indptr_cur_rank,
-            weight_indices=weight_indices_cur_rank,
-            use_fp8_w8a8=self.use_fp8_w8a8,
-            scale_a=self.w13_input_scale,
-            scale_b=self.w13_weight_scale,
-            block_shape=self.block_shape,
-        )
-        del gateup_input
-
-        # Act
-        if self.activation_scheme == "dynamic" and not self.use_block_quant:
-            self.w2_input_scale = None
-            down_input = torch.empty(
-                gateup_output.shape[0],
-                gateup_output.shape[1] // 2,
-                device=gateup_output.device,
-                dtype=hidden_states_dtype,
-            )
-        else:
-            down_input = torch.empty(
-                gateup_output.shape[0],
-                gateup_output.shape[1] // 2,
-                device=gateup_output.device,
-                dtype=(
-                    self.fp8_dtype
-                    if (self.use_fp8_w8a8 and not self.use_block_quant)
-                    else hidden_states_dtype
-                ),
-            )
-
-        if self.activation == "silu":
-            silu_and_mul_triton_kernel[(gateup_output.shape[0],)](
-                gateup_output,
-                down_input,
-                gateup_output.shape[1],
-                reorder_topk_ids,
-                self.w2_input_scale,
-                self.start_expert_id,
-                self.end_expert_id,
-                BLOCK_SIZE=512,
-            )
-        elif self.activation == "gelu":
-            gelu_and_mul_triton_kernel[(gateup_output.shape[0],)](
-                gateup_output,
-                down_input,
-                gateup_output.shape[1],
-                reorder_topk_ids,
-                self.w2_input_scale,
-                self.start_expert_id,
-                self.end_expert_id,
-                BLOCK_SIZE=512,
-            )
-        else:
-            raise ValueError(f"Unsupported activation: {self.activation=}")
-        del gateup_output
-
-        if self.activation_scheme == "dynamic" and not self.use_block_quant:
-            if self.use_per_token_if_dynamic:
-                down_input, self.w2_input_scale = sglang_per_token_quant_fp8(down_input)
-            else:
-                self.w2_input_scale = torch.ones(
-                    self.num_local_experts,
-                    dtype=torch.float32,
-                    device=hidden_states_device,
-                )
-
-        # GroupGemm-1
-        down_output = torch.empty(
-            down_input.shape[0],
-            self.w2_weight.shape[1],
-            device=hidden_states_device,
-            dtype=hidden_states_dtype,
-        )
-        down_output = self.grouped_gemm_runner(
-            a=down_input,
-            b=self.w2_weight,
-            c=down_output,
-            batch_size=self.num_local_experts,
-            weight_column_major=True,
-            seg_indptr=seg_indptr_cur_rank,
-            weight_indices=weight_indices_cur_rank,
-            use_fp8_w8a8=self.use_fp8_w8a8,
-            scale_a=self.w2_input_scale,
-            scale_b=self.w2_weight_scale,
-            block_shape=self.block_shape,
-        )
-        del down_input
-
-        # PostReorder
-        output = torch.empty(
-            hidden_states_shape, dtype=hidden_states_dtype, device=hidden_states_device
-        )
-        post_reorder_triton_kernel[(hidden_states_shape[0],)](
-            down_output,
-            output,
-            src2dst,
-            topk_ids,
-            topk_weights,
-            self.start_expert_id,
-            self.end_expert_id,
-            self.top_k,
-            hidden_states_shape[1],
-            0,
-            BLOCK_SIZE=512,
-        )
-        return output
-
-    @classmethod
-    def make_expert_params_mapping(
-        cls,
-        ckpt_gate_proj_name: str,
-        ckpt_down_proj_name: str,
-        ckpt_up_proj_name: str,
-        num_experts: int,
-    ) -> List[Tuple[str, str, int, str]]:
-        return [
-            # (param_name, weight_name, expert_id, shard_id)
-            (
-                (
-                    "experts.w13_"
-                    if weight_name in [ckpt_gate_proj_name, ckpt_up_proj_name]
-                    else "experts.w2_"
-                ),
-                f"experts.{expert_id}.{weight_name}.",
-                expert_id,
-                shard_id,
-            )
-            for expert_id in range(num_experts)
-            for shard_id, weight_name in [
-                ("w1", ckpt_gate_proj_name),
-                ("w2", ckpt_down_proj_name),
-                ("w3", ckpt_up_proj_name),
-            ]
-        ]
-
-    @classmethod
-    def make_expert_input_scale_params_mapping(
-        cls,
-        num_experts: int,
-    ) -> List[Tuple[str, str, int, str]]:
-        # (param_name, weight_name, expert_id, shard_id)
-        return [
-            (
-                "experts.w13_" if shard_id in ["w1", "w3"] else "experts.w2_",
-                f"experts.{expert_id}.{shard_id}.",
-                expert_id,
-                shard_id,
-            )
-            for expert_id in range(num_experts)
-            for shard_id in ["w1", "w2", "w3"]
-        ]
-
-    def weight_loader(
-        self,
-        param: torch.nn.Parameter,
-        loaded_weight: torch.Tensor,
-        weight_name: str,
-        shard_id: str,
-        expert_id: int,
-    ) -> None:
-        physical_expert_ids = (
-            get_global_expert_location_metadata().logical_to_all_physical(
-                self.layer_id, expert_id
-            )
-        )
-        for physical_expert_id in physical_expert_ids:
-            self._weight_loader_physical(
-                param=param,
-                loaded_weight=loaded_weight,
-                weight_name=weight_name,
-                shard_id=shard_id,
-                expert_id=physical_expert_id,
-            )
-
-    def _weight_loader_physical(
-        self,
-        param: torch.nn.Parameter,
-        loaded_weight: torch.Tensor,
-        weight_name: str,
-        shard_id: str,
-        expert_id: int,
-    ) -> None:
-        if expert_id < self.start_expert_id or expert_id > self.end_expert_id:
-            return
-        expert_id = expert_id - self.start_expert_id
-
-        self._weight_loader_impl(
-            param=param,
-            loaded_weight=loaded_weight,
-            weight_name=weight_name,
-            shard_id=shard_id,
-            expert_id=expert_id,
-        )
-        return
-
 
 
 class DeepEPMoE(EPMoE):
     """
@@ -769,6 +334,7 @@ class DeepEPMoE(EPMoE):
         hidden_size: int,
         intermediate_size: int,
         layer_id: int,
+        num_fused_shared_experts: int = 0,
         params_dtype: Optional[torch.dtype] = None,
         quant_config: Optional[QuantizationConfig] = None,
         tp_size: Optional[int] = None,
@@ -783,6 +349,7 @@ class DeepEPMoE(EPMoE):
             hidden_size=hidden_size,
             intermediate_size=intermediate_size,
             layer_id=layer_id,
+            num_fused_shared_experts=num_fused_shared_experts,
             params_dtype=params_dtype,
             quant_config=quant_config,
             tp_size=tp_size,
@@ -791,11 +358,24 @@ class DeepEPMoE(EPMoE):
             routed_scaling_factor=routed_scaling_factor,
         )
         self.deepep_mode = deepep_mode
-        if deep_gemm_wrapper.ENABLE_JIT_DEEPGEMM:
-            assert self.use_fp8_w8a8, (
-                "DeepGEMM requires an fp8_w8a8 model; "
-                "alternatively, you can disable DeepGEMM by turning off the ENABLE_JIT_DEEPGEMM environment variable."
-            )
+
+        # TODO: move to the beginning of the file
+        from sglang.srt.distributed.parallel_state import get_tp_group
+        from sglang.srt.managers.schedule_batch import global_server_args_dict
+        from sglang.srt.two_batch_overlap import MaybeTboDeepEPDispatcher
+
+        self.deepep_dispatcher = MaybeTboDeepEPDispatcher(
+            group=get_tp_group().device_group,
+            router_topk=self.top_k,
+            permute_fusion=True,
+            num_experts=self.num_experts,
+            num_local_experts=self.num_local_experts,
+            hidden_size=hidden_size,
+            params_dtype=params_dtype,
+            deepep_mode=deepep_mode,
+            async_finish=True,  # TODO
+            return_recv_hook=True,
+        )
 
         if self.deepep_mode.enable_low_latency():
             assert (
@@ -837,156 +417,72 @@ class DeepEPMoE(EPMoE):
         hidden_states: torch.Tensor,
         topk_idx: torch.Tensor,
         topk_weights: torch.Tensor,
-        reorder_topk_ids: torch.Tensor,
-        seg_indptr: torch.Tensor,
-        masked_m: torch.Tensor,
-        expected_m: int,
-        num_recv_tokens_per_expert: List[int],
         forward_batch: ForwardBatch,
     ):
-        if _use_aiter:
-            # in forward_aiter, we skip token permutation and unpermutation, which have been fused inside aiter kernel
-            return self.forward_aiter(hidden_states, topk_idx, topk_weights)
-        resolved_deepep_mode = self.deepep_mode.resolve(
-            forward_batch.is_extend_in_batch
+        dispatch_output = self.dispatch(
+            hidden_states, topk_idx, topk_weights, forward_batch
         )
-        if resolved_deepep_mode == DeepEPMode.normal:
-            if deep_gemm_wrapper.ENABLE_JIT_DEEPGEMM:
-                return self.forward_deepgemm_contiguous(
-                    hidden_states, topk_idx, topk_weights, num_recv_tokens_per_expert
-                )
-            else:
-                return self.forward_normal(hidden_states, reorder_topk_ids, seg_indptr)
-        elif resolved_deepep_mode == DeepEPMode.low_latency:
-            return self.forward_deepgemm_masked(hidden_states, masked_m, expected_m)
-        else:
-            raise ValueError(f"Invalid deepep_mode: {self.deepep_mode}")
+        hidden_states = self.moe_impl(dispatch_output)
+        hidden_states = self.combine(
+            hidden_states,
+            dispatch_output.topk_idx,
+            dispatch_output.topk_weights,
+            forward_batch,
+        )
+        return hidden_states
 
-    def forward_normal(
+    def dispatch(
         self,
         hidden_states: torch.Tensor,
-        reorder_topk_ids: torch.Tensor,
-        seg_indptr: torch.Tensor,
+        topk_idx: torch.Tensor,
+        topk_weights: torch.Tensor,
+        forward_batch: ForwardBatch,
     ):
-        hidden_states_dtype = hidden_states.dtype
-        hidden_states_device = hidden_states.device
-
-        assert self.quant_method is not None
-        assert self.activation == "silu"
-        if self.grouped_gemm_runner is None:
-            self.grouped_gemm_runner = GroupedGemmRunner(
-                hidden_states.device, use_flashinfer=False  # TODO: use flashinfer
-            )
-
-        if self.activation_scheme == "dynamic" and not self.use_block_quant:
-            max_value = (
-                torch.max(hidden_states)
-                .repeat(self.num_local_experts)
-                .to(torch.float32)
-            )
-            self.w13_input_scale = max_value / torch.finfo(self.fp8_dtype).max
-        weight_indices_cur_rank = torch.arange(
-            0,
-            self.num_local_experts,
-            device=hidden_states.device,
-            dtype=torch.int64,
+        return self.deepep_dispatcher.dispatch(
+            hidden_states=hidden_states,
+            topk_idx=topk_idx,
+            topk_weights=topk_weights,
+            forward_batch=forward_batch,
         )
 
-        # GroupGemm-0
-        if hidden_states.shape[0] > 0:
-            gateup_output = self.grouped_gemm_runner(
-                a=hidden_states,
-                b=self.w13_weight,
-                c=None,
-                c_dtype=hidden_states.dtype,
-                batch_size=self.num_local_experts,
-                weight_column_major=True,
-                seg_indptr=seg_indptr,
-                weight_indices=weight_indices_cur_rank,
-                use_fp8_w8a8=self.use_fp8_w8a8,
-                scale_a=self.w13_input_scale,
-                scale_b=(
-                    self.w13_weight_scale_inv
-                    if self.use_block_quant
-                    else self.w13_weight_scale
-                ),
-                block_shape=self.block_shape,
-            )
-        else:
-            gateup_output = torch.empty(
-                hidden_states.shape[0],
-                self.w13_weight.shape[1],
-                device=hidden_states.device,
-                dtype=hidden_states.dtype,
-            )
-
-        # Act
-        down_input = torch.empty(
-            gateup_output.shape[0],
-            gateup_output.shape[1] // 2,
-            device=gateup_output.device,
-            dtype=(
-                self.fp8_dtype
-                if (self.use_fp8_w8a8 and not self.use_block_quant)
-                else hidden_states_dtype
-            ),
-        )
-        if self.w2_input_scale is None and not self.use_block_quant:
-            self.w2_input_scale = torch.ones(
-                self.num_local_experts,
-                dtype=torch.float32,
-                device=hidden_states_device,
-            )
-
-        if self.activation == "silu":
-            silu_and_mul_triton_kernel[(gateup_output.shape[0],)](
-                gateup_output,
-                down_input,
-                gateup_output.shape[1],
-                reorder_topk_ids,
-                self.w2_input_scale,
-                0,
-                self.num_local_experts - 1,
-                BLOCK_SIZE=512,
-            )
+    def moe_impl(self, dispatch_output: DispatchOutput):
+        if _use_aiter:
+            # in forward_aiter, we skip token permutation and unpermutation, which have been fused inside aiter kernel
+            return self.forward_aiter(dispatch_output)
+        if dispatch_output.format.is_deepep_normal():
+            assert deep_gemm_wrapper.ENABLE_JIT_DEEPGEMM and self.use_fp8_w8a8
+            return self.forward_deepgemm_contiguous(dispatch_output)
+        elif dispatch_output.format.is_deepep_ll():
+            assert deep_gemm_wrapper.ENABLE_JIT_DEEPGEMM and self.use_fp8_w8a8
            return self.forward_deepgemm_masked(dispatch_output)
         else:
-            raise ValueError(f"Unsupported activation: {self.activation=}")
-
-        del gateup_output
-
-        # GroupGemm-1
-        down_output = torch.empty(
-            down_input.shape[0],
-            self.w2_weight.shape[1],
-            device=hidden_states_device,
-            dtype=hidden_states_dtype,
-        )
-        if down_input.shape[0] > 0:
-            down_output = self.grouped_gemm_runner(
-                a=down_input,
-                b=self.w2_weight,
-                c=down_output,
-                batch_size=self.num_local_experts,
-                weight_column_major=True,
-                seg_indptr=seg_indptr,
-                weight_indices=weight_indices_cur_rank,
-                use_fp8_w8a8=self.use_fp8_w8a8,
-                scale_a=self.w2_input_scale,
-                scale_b=(
-                    self.w2_weight_scale_inv
-                    if self.use_block_quant
-                    else self.w2_weight_scale
-                ),
-                block_shape=self.block_shape,
+            raise ValueError(
                f"Dispatch output format {dispatch_output.format} is not supported"
             )
-        return down_output
 
-    def forward_aiter(
+    def combine(
         self,
         hidden_states: torch.Tensor,
         topk_idx: torch.Tensor,
         topk_weights: torch.Tensor,
+        forward_batch: ForwardBatch,
     ):
+        return self.deepep_dispatcher.combine(
+            hidden_states=hidden_states,
+            topk_idx=topk_idx,
+            topk_weights=topk_weights,
+            forward_batch=forward_batch,
+        )
+
+    def forward_aiter(
+        self,
+        dispatch_output: DeepEPNormalOutput,
+    ):
+        hidden_states, topk_idx, topk_weights = (
+            dispatch_output.hidden_states,
+            dispatch_output.topk_idx,
+            dispatch_output.topk_weights,
+        )
         if hidden_states.shape[0] == 0:
             return hidden_states
         # in original deepep, idx == -1 meaning invalid and will not be processed.
@@ -1014,11 +510,11 @@ class DeepEPMoE(EPMoE):
 
     def forward_deepgemm_contiguous(
         self,
-        hidden_states_fp8: Tuple[torch.Tensor, torch.Tensor],
-        topk_idx,
-        topk_weights,
-        num_recv_tokens_per_expert: List[int],
+        dispatch_output: DeepEPNormalOutput,
     ):
+        hidden_states_fp8, topk_idx, topk_weights, num_recv_tokens_per_expert = (
+            dispatch_output
+        )
         hidden_states_fp8, hidden_states_scale = hidden_states_fp8
         assert self.quant_method is not None
         assert self.activation == "silu"
@@ -1138,10 +634,9 @@ class DeepEPMoE(EPMoE):
 
     def forward_deepgemm_masked(
         self,
-        hidden_states_fp8: Tuple[torch.Tensor, torch.Tensor],
-        masked_m: torch.Tensor,
-        expected_m: int,
+        dispatch_output: DeepEPLLOutput,
     ):
+        hidden_states_fp8, _, _, masked_m, expected_m = dispatch_output
         assert self.quant_method is not None
         assert self.activation == "silu"
 
@@ -1268,7 +763,7 @@ class FlashInferEPMoE(EPMoE):
             topk_group=self.topk_group,
             intermediate_size=self.w2_weight.shape[2],
             local_expert_offset=self.start_expert_id,
-            local_num_experts=self.num_experts_per_partition,
+            local_num_experts=self.num_local_experts,
             routed_scaling_factor=self.routed_scaling_factor,
             tile_tokens_dim=_get_tile_tokens_dim(
                 hidden_states.shape[0], self.top_k, self.num_experts