sglang 0.4.9.post4__py3-none-any.whl → 0.4.9.post5__py3-none-any.whl

This diff shows the changes between two publicly released versions of this package, as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (84)
  1. sglang/lang/chat_template.py +21 -0
  2. sglang/srt/configs/internvl.py +3 -0
  3. sglang/srt/configs/model_config.py +4 -0
  4. sglang/srt/constrained/base_grammar_backend.py +10 -2
  5. sglang/srt/constrained/xgrammar_backend.py +7 -5
  6. sglang/srt/conversation.py +16 -1
  7. sglang/srt/debug_utils/__init__.py +0 -0
  8. sglang/srt/debug_utils/dump_comparator.py +131 -0
  9. sglang/srt/debug_utils/dumper.py +108 -0
  10. sglang/srt/debug_utils/text_comparator.py +172 -0
  11. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +13 -1
  12. sglang/srt/disaggregation/mooncake/conn.py +16 -0
  13. sglang/srt/disaggregation/prefill.py +13 -1
  14. sglang/srt/entrypoints/engine.py +4 -2
  15. sglang/srt/entrypoints/openai/serving_chat.py +132 -79
  16. sglang/srt/function_call/ebnf_composer.py +10 -3
  17. sglang/srt/function_call/function_call_parser.py +2 -0
  18. sglang/srt/function_call/glm4_moe_detector.py +164 -0
  19. sglang/srt/function_call/qwen3_coder_detector.py +1 -0
  20. sglang/srt/layers/attention/hybrid_attn_backend.py +100 -0
  21. sglang/srt/layers/attention/vision.py +56 -8
  22. sglang/srt/layers/layernorm.py +26 -1
  23. sglang/srt/layers/logits_processor.py +14 -3
  24. sglang/srt/layers/moe/ep_moe/layer.py +172 -206
  25. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=160,N=320,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  26. sglang/srt/layers/moe/fused_moe_triton/layer.py +38 -48
  27. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +11 -8
  28. sglang/srt/layers/moe/topk.py +84 -22
  29. sglang/srt/layers/multimodal.py +11 -8
  30. sglang/srt/layers/quantization/fp8.py +25 -247
  31. sglang/srt/layers/quantization/fp8_kernel.py +78 -48
  32. sglang/srt/layers/quantization/modelopt_quant.py +25 -10
  33. sglang/srt/layers/quantization/unquant.py +24 -76
  34. sglang/srt/layers/quantization/w4afp8.py +68 -17
  35. sglang/srt/lora/lora_registry.py +93 -29
  36. sglang/srt/managers/cache_controller.py +9 -7
  37. sglang/srt/managers/mm_utils.py +154 -35
  38. sglang/srt/managers/multimodal_processor.py +3 -14
  39. sglang/srt/managers/schedule_batch.py +14 -8
  40. sglang/srt/managers/scheduler.py +35 -1
  41. sglang/srt/managers/tokenizer_manager.py +37 -6
  42. sglang/srt/managers/tp_worker.py +3 -0
  43. sglang/srt/mem_cache/hiradix_cache.py +5 -2
  44. sglang/srt/model_executor/model_runner.py +68 -14
  45. sglang/srt/models/deepseek_v2.py +62 -28
  46. sglang/srt/models/glm4_moe.py +1035 -0
  47. sglang/srt/models/glm4_moe_nextn.py +167 -0
  48. sglang/srt/models/interns1.py +328 -0
  49. sglang/srt/models/internvl.py +143 -47
  50. sglang/srt/models/llava.py +9 -5
  51. sglang/srt/models/minicpmo.py +4 -1
  52. sglang/srt/models/qwen2_moe.py +2 -2
  53. sglang/srt/models/qwen3_moe.py +5 -2
  54. sglang/srt/multimodal/processors/base_processor.py +20 -6
  55. sglang/srt/multimodal/processors/clip.py +2 -2
  56. sglang/srt/multimodal/processors/deepseek_vl_v2.py +2 -2
  57. sglang/srt/multimodal/processors/gemma3.py +2 -2
  58. sglang/srt/multimodal/processors/gemma3n.py +2 -2
  59. sglang/srt/multimodal/processors/internvl.py +21 -8
  60. sglang/srt/multimodal/processors/janus_pro.py +2 -2
  61. sglang/srt/multimodal/processors/kimi_vl.py +2 -2
  62. sglang/srt/multimodal/processors/llava.py +4 -4
  63. sglang/srt/multimodal/processors/minicpm.py +2 -3
  64. sglang/srt/multimodal/processors/mlama.py +2 -2
  65. sglang/srt/multimodal/processors/mllama4.py +18 -111
  66. sglang/srt/multimodal/processors/phi4mm.py +2 -2
  67. sglang/srt/multimodal/processors/pixtral.py +2 -2
  68. sglang/srt/multimodal/processors/qwen_audio.py +2 -2
  69. sglang/srt/multimodal/processors/qwen_vl.py +2 -2
  70. sglang/srt/multimodal/processors/vila.py +3 -1
  71. sglang/srt/reasoning_parser.py +2 -1
  72. sglang/srt/server_args.py +57 -6
  73. sglang/srt/utils.py +96 -1
  74. sglang/srt/weight_sync/utils.py +119 -0
  75. sglang/test/runners.py +4 -0
  76. sglang/test/test_utils.py +65 -5
  77. sglang/utils.py +19 -0
  78. sglang/version.py +1 -1
  79. {sglang-0.4.9.post4.dist-info → sglang-0.4.9.post5.dist-info}/METADATA +4 -4
  80. {sglang-0.4.9.post4.dist-info → sglang-0.4.9.post5.dist-info}/RECORD +83 -73
  81. sglang/srt/debug_utils.py +0 -74
  82. {sglang-0.4.9.post4.dist-info → sglang-0.4.9.post5.dist-info}/WHEEL +0 -0
  83. {sglang-0.4.9.post4.dist-info → sglang-0.4.9.post5.dist-info}/licenses/LICENSE +0 -0
  84. {sglang-0.4.9.post4.dist-info → sglang-0.4.9.post5.dist-info}/top_level.txt +0 -0
sglang/srt/layers/moe/ep_moe/layer.py
@@ -30,13 +30,13 @@ from sglang.srt.layers.quantization.base_config import (
     QuantizationConfig,
     QuantizeMethodBase,
 )
-from sglang.srt.layers.quantization.fp8 import Fp8EPMoEMethod
+from sglang.srt.layers.quantization.fp8 import Fp8Config, Fp8MoEMethod
 from sglang.srt.layers.quantization.fp8_kernel import (
     is_fp8_fnuz,
     sglang_per_token_group_quant_fp8,
     sglang_per_token_quant_fp8,
 )
-from sglang.srt.layers.quantization.unquant import UnquantizedEPMoEMethod
+from sglang.srt.layers.quantization.unquant import UnquantizedFusedMoEMethod
 from sglang.srt.layers.quantization.w4afp8 import W4AFp8Config, W4AFp8MoEMethod
 from sglang.srt.managers.schedule_batch import global_server_args_dict
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
@@ -47,23 +47,33 @@ from sglang.srt.utils import (
     get_bool_env_var,
     is_hip,
     is_npu,
+    next_power_of_2,
 )

 _is_hip = is_hip()
 _is_npu = is_npu()
 _is_fp8_fnuz = is_fp8_fnuz()
 _use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip
+use_flashinfer_trtllm_moe = (
+    global_server_args_dict["enable_flashinfer_trtllm_moe"]
+    and global_server_args_dict["enable_ep_moe"]
+)

 if not (_is_npu or _is_hip):
     from sgl_kernel import silu_and_mul

-    from sglang.srt.layers.moe.cutlass_w4a8_moe import cutlass_w4a8_moe
-
 if _use_aiter:
     from aiter import ActivationType, QuantType
     from aiter.fused_moe import fused_moe
     from aiter.ops.shuffle import shuffle_weight

+if use_flashinfer_trtllm_moe:
+    try:
+        import flashinfer.fused_moe as fi_fused_moe
+    except ImportError:
+        fi_fused_moe = None
+        use_flashinfer_trtllm_moe = False
+
 logger = logging.getLogger(__name__)


@@ -140,7 +150,17 @@ class GroupedGemmRunner(torch.nn.Module):
         return c


-class EPMoE(torch.nn.Module):
+def _get_tile_tokens_dim(num_tokens, top_k, num_experts):
+    # Guess tokens per expert assuming perfect expert distribution first.
+    num_tokens_per_expert = (num_tokens * top_k) // num_experts
+    # And pad the number to the next power of 2.
+    tile_tokens_dim = next_power_of_2(num_tokens_per_expert)
+    # Cap to 8-64 tokens per CTA tile as it's the range supported by the kernel.
+    tile_tokens_dim = min(max(tile_tokens_dim, 8), 64)
+    return tile_tokens_dim
+
+
+class EPMoE(FusedMoE):
     """
     MoE Expert Parallel Impl

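The new `_get_tile_tokens_dim` helper sizes the CTA tile from the expected per-expert token count. Below is a minimal standalone sketch of that arithmetic; `next_power_of_2` is reimplemented locally purely for illustration, since the real helper is imported from `sglang.srt.utils`.

def next_power_of_2(n: int) -> int:
    # Smallest power of two >= n (treat n <= 1 as 1); stand-in for sglang.srt.utils.next_power_of_2.
    return 1 if n <= 1 else 1 << (n - 1).bit_length()

def get_tile_tokens_dim(num_tokens: int, top_k: int, num_experts: int) -> int:
    # Expected tokens per expert under perfectly balanced routing.
    num_tokens_per_expert = (num_tokens * top_k) // num_experts
    # Round up to a power of two, then clamp to the kernel-supported 8..64 range.
    return min(max(next_power_of_2(num_tokens_per_expert), 8), 64)

# 512 tokens routed top-8 over 256 experts -> 16 tokens per expert -> tile of 16.
assert get_tile_tokens_dim(512, 8, 256) == 16
# Tiny batches floor at 8; very large ones cap at 64.
assert get_tile_tokens_dim(1, 8, 256) == 8
assert get_tile_tokens_dim(65536, 8, 256) == 64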
@@ -162,51 +182,60 @@ class EPMoE(torch.nn.Module):
         routed_scaling_factor: Optional[float] = None,
         use_per_token_if_dynamic: bool = True,
     ):
-        super().__init__()
+        super().__init__(
+            num_experts=num_experts,
+            hidden_size=hidden_size,
+            intermediate_size=intermediate_size,
+            top_k=top_k,
+            layer_id=layer_id,
+            params_dtype=params_dtype,
+            quant_config=quant_config,
+            tp_size=tp_size,
+            prefix=prefix,
+            activation=activation,
+            routed_scaling_factor=routed_scaling_factor,
+            enable_ep_moe=True,
+            skip_quant=True,
+        )

         if params_dtype is None:
             params_dtype = torch.get_default_dtype()

-        self.tp_size = (
-            tp_size if tp_size is not None else get_tensor_model_parallel_world_size()
-        )
-        self.tp_rank = get_tensor_model_parallel_rank()
-
         self.layer_id = layer_id
-        self.num_experts = num_experts
-        assert self.num_experts % self.tp_size == 0
-        self.num_experts_per_partition, self.expert_map = self.determine_expert_map()
-        self.start_expert_id = self.tp_rank * self.num_experts_per_partition
-        self.end_expert_id = self.start_expert_id + self.num_experts_per_partition - 1
+        self.num_local_experts, self.expert_map = self.determine_expert_map()
+        self.start_expert_id = self.ep_rank * self.num_local_experts
+        self.end_expert_id = self.start_expert_id + self.num_local_experts - 1

-        self.top_k = top_k
         self.intermediate_size = intermediate_size
-        self.activation = activation
-        self.routed_scaling_factor = routed_scaling_factor
         self.use_per_token_if_dynamic = use_per_token_if_dynamic

+        # TODO(ch-wan): move quant preparation to FusedMoE
         if quant_config is None:
-            self.quant_method: Optional[QuantizeMethodBase] = UnquantizedEPMoEMethod()
+            self.quant_method: Optional[QuantizeMethodBase] = (
+                UnquantizedFusedMoEMethod()
+            )
             self.use_fp8_w8a8 = False
             self.use_block_quant = False
             self.block_shape = None
             self.activation_scheme = None
-            self.use_w4afp8 = False
+            self.w13_input_scale = None
+            self.w2_input_scale = None
+            self.w13_weight_scale = None
+            self.w2_weight_scale = None
         elif isinstance(quant_config, W4AFp8Config):
             self.quant_method: Optional[QuantizeMethodBase] = W4AFp8MoEMethod(
                 quant_config
             )
-            self.use_w4afp8 = True
             self.use_fp8_w8a8 = False
             self.use_block_quant = False
             self.fp8_dtype = torch.float8_e4m3fn
+            self.w13_input_scale = None
+            self.w2_input_scale = None
             self.w13_weight_scale = None
             self.w2_weight_scale = None
             self.activation_scheme = quant_config.moe_activation_scheme
-        else:
-            self.quant_method: Optional[QuantizeMethodBase] = Fp8EPMoEMethod(
-                quant_config
-            )
+        elif isinstance(quant_config, Fp8Config):
+            self.quant_method: Optional[QuantizeMethodBase] = Fp8MoEMethod(quant_config)
             self.use_fp8_w8a8 = True
             self.use_block_quant = getattr(self.quant_method, "block_quant", False)
             self.block_shape = (
@@ -216,11 +245,13 @@ class EPMoE(torch.nn.Module):
             )
             self.fp8_dtype = torch.float8_e4m3fn
             self.activation_scheme = quant_config.activation_scheme
-            self.use_w4afp8 = False
+        else:
+            raise ValueError(f"Unsupported quant_config: {quant_config}")

+        self.quant_config = quant_config
         self.quant_method.create_weights(
             layer=self,
-            num_experts_per_partition=self.num_experts_per_partition,
+            num_experts=self.num_local_experts,
             hidden_size=hidden_size,
             intermediate_size=self.intermediate_size,
             params_dtype=params_dtype,
@@ -229,19 +260,6 @@ class EPMoE(torch.nn.Module):

         self.grouped_gemm_runner = None

-        self.w13_weight_fp8 = (
-            self.w13_weight,
-            (
-                self.w13_weight_scale_inv
-                if self.use_block_quant
-                else self.w13_weight_scale
-            ),
-        )
-        self.w2_weight_fp8 = (
-            self.w2_weight,
-            self.w2_weight_scale_inv if self.use_block_quant else self.w2_weight_scale,
-        )
-
     # Adapted from https://github.com/vllm-project/vllm/blob/9fb52e523abf7bdaf7e60cf2971edb5a1b13dc08/vllm/model_executor/layers/fused_moe/layer.py#L544C1-L586C43
     # Modifications: use determine_expert_map as a class internal function, set 'global_num_experts' rather than '-1' for experts not assigned to the current rank.
     def determine_expert_map(self) -> Tuple[int, Optional[torch.Tensor]]:
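With `EPMoE` now deriving from `FusedMoE`, the quantization method is selected strictly by config type, and an unrecognized config raises instead of silently taking the FP8 path. A toy sketch of that dispatch shape, using stand-in classes rather than the real sglang config types:

class W4AFp8Config: ...
class Fp8Config: ...

def pick_quant_method(quant_config):
    # Mirrors the branch order in the constructor above:
    # unquantized, W4A FP8, plain FP8, otherwise fail fast.
    if quant_config is None:
        return "UnquantizedFusedMoEMethod"
    if isinstance(quant_config, W4AFp8Config):
        return "W4AFp8MoEMethod"
    if isinstance(quant_config, Fp8Config):
        return "Fp8MoEMethod"
    raise ValueError(f"Unsupported quant_config: {quant_config}")

assert pick_quant_method(None) == "UnquantizedFusedMoEMethod"
assert pick_quant_method(Fp8Config()) == "Fp8MoEMethod"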
@@ -260,8 +278,8 @@ class EPMoE(torch.nn.Module):
                 Contains global_num_experts for experts not assigned to the current rank.
                 Returns None if ep_size is 1.
         """
-        ep_size = self.tp_size
-        ep_rank = self.tp_rank
+        ep_size = self.ep_size
+        ep_rank = self.ep_rank
         global_num_experts = self.num_experts

         assert ep_size > 0
@@ -271,7 +289,7 @@ class EPMoE(torch.nn.Module):
         local_num_experts = global_num_experts // ep_size

         expert_map = torch.full(
-            (global_num_experts,), self.num_experts, dtype=torch.int32
+            (global_num_experts,), global_num_experts, dtype=torch.int32
         )
         if ep_rank < (ep_size - 1):
             expert_map[
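For orientation, a self-contained toy version of the expert map built by `determine_expert_map`: slots owned by the current rank are renumbered 0..local-1, and every other slot keeps the sentinel `global_num_experts`. This sketch assumes an even split; the real method also lets the last rank absorb any remainder.

import torch

def toy_expert_map(global_num_experts: int, ep_size: int, ep_rank: int):
    # Even split assumed for this sketch; the real code handles remainders on the last rank.
    local_num_experts = global_num_experts // ep_size
    expert_map = torch.full(
        (global_num_experts,), global_num_experts, dtype=torch.int32
    )
    start = ep_rank * local_num_experts
    expert_map[start : start + local_num_experts] = torch.arange(
        local_num_experts, dtype=torch.int32
    )
    return local_num_experts, expert_map

# 8 experts over 4 EP ranks, seen from rank 1:
# prints tensor([8, 8, 0, 1, 8, 8, 8, 8], dtype=torch.int32)
print(toy_expert_map(8, 4, 1)[1])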
@@ -296,6 +314,20 @@ class EPMoE(torch.nn.Module):
         hidden_states: torch.Tensor,
         topk_output: TopKOutput,
     ):
+
+        self.w13_weight_fp8 = (
+            self.w13_weight,
+            (
+                self.w13_weight_scale_inv
+                if self.use_block_quant
+                else self.w13_weight_scale
+            ),
+        )
+        self.w2_weight_fp8 = (
+            self.w2_weight,
+            self.w2_weight_scale_inv if self.use_block_quant else self.w2_weight_scale,
+        )
+
         assert self.quant_method is not None
         assert self.activation == "silu"
         hidden_states_shape = hidden_states.shape
@@ -435,7 +467,10 @@ class EPMoE(torch.nn.Module):
         return output

     def forward_normal(self, hidden_states: torch.Tensor, topk_output: TopKOutput):
-        assert self.quant_method is not None
+        return self.quant_method.apply(self, hidden_states, topk_output)
+
+    def run_moe(self, hidden_states: torch.Tensor, topk_output: TopKOutput):
+
         topk_weights, topk_ids, _ = topk_output

         hidden_states_shape = hidden_states.shape
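The refactor above splits the old `forward_normal` in two: the layer now defers to its quant method's `apply`, while the original kernel body lives in `run_moe`. The exact wiring inside the quant method classes is not shown in this diff; the toy classes below only sketch the call shape under that assumption.

class ToyQuantMethod:
    def apply(self, layer, hidden_states, topk_output):
        # A real quant method would quantize/prepare inputs here first,
        # then hand control back to the layer's kernel path.
        return layer.run_moe(hidden_states, topk_output)

class ToyEPMoE:
    def __init__(self):
        self.quant_method = ToyQuantMethod()

    def forward_normal(self, hidden_states, topk_output):
        return self.quant_method.apply(self, hidden_states, topk_output)

    def run_moe(self, hidden_states, topk_output):
        return f"ran MoE on {hidden_states!r} with {topk_output!r}"

print(ToyEPMoE().forward_normal("h", "topk"))  # -> ran MoE on 'h' with 'topk'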
@@ -448,53 +483,11 @@ class EPMoE(torch.nn.Module):
             use_per_token_if_dynamic=self.use_per_token_if_dynamic,
         )

-        if self.use_w4afp8:
-            local_topk_ids = topk_ids
-            if self.expert_map is not None:
-                "Translate info from expert_map to topk_ids"
-                local_topk_ids = torch.where(
-                    self.expert_map[topk_ids] != self.num_experts,
-                    self.expert_map[topk_ids],
-                    self.num_experts,
-                )
-
-            output = cutlass_w4a8_moe(
-                self.start_expert_id,
-                self.end_expert_id,
-                self.num_experts,
-                hidden_states,
-                self.w13_weight,
-                self.w2_weight,
-                self.w13_weight_scale_inv,
-                self.w2_weight_scale_inv,
-                topk_weights,
-                topk_ids,
-                local_topk_ids,
-                self.quant_method.a_strides1,
-                self.quant_method.b_strides1,
-                self.quant_method.c_strides1,
-                self.quant_method.a_strides2,
-                self.quant_method.b_strides2,
-                self.quant_method.c_strides2,
-                self.quant_method.s_strides13,
-                self.quant_method.s_strides2,
-                self.quant_method.expert_offsets,
-                self.quant_method.problem_sizes1,
-                self.quant_method.problem_sizes2,
-                self.w13_input_scale,
-                self.w2_input_scale,
-            )
-            return output
-
-        if self.grouped_gemm_runner is None:
-            self.grouped_gemm_runner = GroupedGemmRunner(
-                hidden_states.device,
-                use_flashinfer=False,  # TODO: use flashinfer
-                use_per_token_if_dynamic=self.use_per_token_if_dynamic,
-            )
+        num_experts = self.num_experts

         reorder_topk_ids, src2dst, seg_indptr = run_moe_ep_preproess(
-            topk_ids, self.num_experts
+            topk_ids,
+            num_experts,
         )

         gateup_input = torch.empty(
@@ -502,7 +495,7 @@ class EPMoE(torch.nn.Module):
             device=hidden_states.device,
             dtype=(
                 self.fp8_dtype
-                if ((self.use_fp8_w8a8 or self.use_w4afp8) and not self.use_block_quant)
+                if self.use_fp8_w8a8 and not self.use_block_quant
                 else hidden_states.dtype
             ),
         )
@@ -513,7 +506,7 @@ class EPMoE(torch.nn.Module):
         else:
             max_value = (
                 torch.max(hidden_states)
-                .repeat(self.num_experts_per_partition)
+                .repeat(self.num_local_experts)
                 .to(torch.float32)
             )
             self.w13_input_scale = max_value / torch.finfo(self.fp8_dtype).max
@@ -554,7 +547,7 @@ class EPMoE(torch.nn.Module):
         seg_indptr_cur_rank = seg_indptr[self.start_expert_id : self.end_expert_id + 2]
         weight_indices_cur_rank = torch.arange(
             0,
-            self.num_experts_per_partition,
+            self.num_local_experts,
             device=hidden_states_device,
             dtype=torch.int64,
         )
@@ -564,17 +557,13 @@ class EPMoE(torch.nn.Module):
             b=self.w13_weight,
             c=None,
             c_dtype=hidden_states_dtype,
-            batch_size=self.num_experts_per_partition,
+            batch_size=self.num_local_experts,
             weight_column_major=True,
             seg_indptr=seg_indptr_cur_rank,
             weight_indices=weight_indices_cur_rank,
             use_fp8_w8a8=self.use_fp8_w8a8,
             scale_a=self.w13_input_scale,
-            scale_b=(
-                self.w13_weight_scale_inv
-                if self.use_block_quant
-                else self.w13_weight_scale
-            ),
+            scale_b=self.w13_weight_scale,
             block_shape=self.block_shape,
         )
         del gateup_input
@@ -631,7 +620,7 @@ class EPMoE(torch.nn.Module):
             down_input, self.w2_input_scale = sglang_per_token_quant_fp8(down_input)
         else:
             self.w2_input_scale = torch.ones(
-                self.num_experts_per_partition,
+                self.num_local_experts,
                 dtype=torch.float32,
                 device=hidden_states_device,
             )
@@ -647,17 +636,13 @@ class EPMoE(torch.nn.Module):
             a=down_input,
             b=self.w2_weight,
             c=down_output,
-            batch_size=self.num_experts_per_partition,
+            batch_size=self.num_local_experts,
             weight_column_major=True,
             seg_indptr=seg_indptr_cur_rank,
             weight_indices=weight_indices_cur_rank,
             use_fp8_w8a8=self.use_fp8_w8a8,
             scale_a=self.w2_input_scale,
-            scale_b=(
-                self.w2_weight_scale_inv
-                if self.use_block_quant
-                else self.w2_weight_scale
-            ),
+            scale_b=self.w2_weight_scale,
             block_shape=self.block_shape,
         )
         del down_input
@@ -760,95 +745,14 @@ class EPMoE(torch.nn.Module):
             return
         expert_id = expert_id - self.start_expert_id

-        if shard_id not in ("w1", "w2", "w3"):
-            raise ValueError(
-                f"shard_id must be ['w1','w2','w3'] but " f"got {shard_id}."
-            )
-
-        # Special case for fp8 scales.
-        if "scale" in weight_name:
-            self._load_fp8_scale(
-                param.data,
-                loaded_weight,
-                weight_name,
-                shard_id,
-                expert_id,
-            )
-            return
-
-        if shard_id == "w2":
-            param.data[expert_id] = loaded_weight
-        elif shard_id == "w1":
-            param.data[expert_id][: self.intermediate_size, :] = loaded_weight
-        elif shard_id == "w3":
-            param.data[expert_id][self.intermediate_size :, :] = loaded_weight
-        else:
-            raise ValueError(f"Expected shard_id w1,w2 or w3 but got {shard_id}")
-
-    def _load_fp8_scale(
-        self,
-        param: torch.nn.Parameter,
-        loaded_weight: torch.Tensor,
-        weight_name: str,
-        shard_id: str,
-        expert_id: int,
-    ) -> None:
-        param_data = param.data
-
-        # Input scales can be loaded directly and should be equal.
-        if "input_scale" in weight_name:
-            if self.use_w4afp8:
-                if shard_id == "w1":
-                    param_data[expert_id][0] = loaded_weight
-                elif shard_id == "w3":
-                    param_data[expert_id][1] = loaded_weight
-                else:
-                    param_data[expert_id] = loaded_weight
-                return
-
-            if (
-                (shard_id == "w1" or shard_id == "w3")
-                and param_data[expert_id] != 1
-                and (param_data[expert_id] - loaded_weight).abs() > 1e-5
-            ):
-                raise ValueError(
-                    "input_scales of w1 and w3 of a layer "
-                    f"must be equal. But got {param_data[expert_id]} "
-                    f"vs. {loaded_weight}"
-                )
-            param_data[expert_id] = loaded_weight
-        # Weight scales
-        elif "weight_scale" in weight_name:
-            if self.use_block_quant:
-                block_n, block_k = self.block_shape[0], self.block_shape[1]
-                if shard_id == "w1":
-                    param_data[expert_id][
-                        : (self.intermediate_size + block_n - 1) // block_n, :
-                    ] = loaded_weight
-                elif shard_id == "w3":
-                    param_data[expert_id][
-                        (self.intermediate_size + block_n - 1) // block_n :, :
-                    ] = loaded_weight
-                else:  # w2
-                    param_data[expert_id] = loaded_weight
-            elif self.use_w4afp8:
-                if shard_id == "w1":
-                    param_data[expert_id][: self.intermediate_size, :] = loaded_weight
-                elif shard_id == "w3":
-                    param_data[expert_id][self.intermediate_size :, :] = loaded_weight
-                else:
-                    param_data[expert_id] = loaded_weight
-        # If we are in merged column case (gate_up_proj)
-        else:
-            if shard_id in ("w1", "w3"):
-                # We have to keep the weight scales of w1 and w3 because
-                # we need to re-quantize w1/w3 weights after weight loading.
-                idx = 0 if shard_id == "w1" else 1
-                param_data[expert_id][idx] = loaded_weight
-
-            # If we are in the row parallel case (down_proj)
-            else:
-                param_data[expert_id] = loaded_weight
+        self._weight_loader_impl(
+            param=param,
+            loaded_weight=loaded_weight,
+            weight_name=weight_name,
+            shard_id=shard_id,
+            expert_id=expert_id,
+        )
+        return


 class DeepEPMoE(EPMoE):
@@ -898,13 +802,13 @@ class DeepEPMoE(EPMoE):
                 deep_gemm_wrapper.ENABLE_JIT_DEEPGEMM
             ), f"DeepEP {self.deepep_mode} mode requires deep_gemm"
         if _use_aiter:
-            # expert_mask is of size (self.num_experts_per_partition + 1),
+            # expert_mask is of size (self.num_local_experts + 1),
             # the extra 1 is for invalid rank_id (in original deepep, the invalid rank_id is -1, but aiter does not allow -1, we use a mask to make those ids invalid)
             # for instance, if we have 4 experts on this rank, we would have a expert_mask like:
             #     self.expert_mask = [1, 1, 1, 1, 0]
             # idx from 0-3 is valid and will be processed, while idx == 4 will be masked out
             self.expert_mask = torch.zeros(
-                (self.num_experts_per_partition + 1),
+                (self.num_local_experts + 1),
                 device=torch.cuda.current_device(),
                 dtype=torch.int,
             )
@@ -977,13 +881,13 @@ class DeepEPMoE(EPMoE):
         if self.activation_scheme == "dynamic" and not self.use_block_quant:
             max_value = (
                 torch.max(hidden_states)
-                .repeat(self.num_experts_per_partition)
+                .repeat(self.num_local_experts)
                 .to(torch.float32)
             )
             self.w13_input_scale = max_value / torch.finfo(self.fp8_dtype).max
         weight_indices_cur_rank = torch.arange(
             0,
-            self.num_experts_per_partition,
+            self.num_local_experts,
             device=hidden_states.device,
             dtype=torch.int64,
         )
@@ -995,7 +899,7 @@ class DeepEPMoE(EPMoE):
             b=self.w13_weight,
             c=None,
             c_dtype=hidden_states.dtype,
-            batch_size=self.num_experts_per_partition,
+            batch_size=self.num_local_experts,
             weight_column_major=True,
             seg_indptr=seg_indptr,
             weight_indices=weight_indices_cur_rank,
@@ -1029,7 +933,7 @@ class DeepEPMoE(EPMoE):
             )
         if self.w2_input_scale is None and not self.use_block_quant:
             self.w2_input_scale = torch.ones(
-                self.num_experts_per_partition,
+                self.num_local_experts,
                 dtype=torch.float32,
                 device=hidden_states_device,
             )
@@ -1042,7 +946,7 @@ class DeepEPMoE(EPMoE):
                 reorder_topk_ids,
                 self.w2_input_scale,
                 0,
-                self.num_experts_per_partition - 1,
+                self.num_local_experts - 1,
                 BLOCK_SIZE=512,
             )
         else:
@@ -1062,7 +966,7 @@ class DeepEPMoE(EPMoE):
             a=down_input,
             b=self.w2_weight,
             c=down_output,
-            batch_size=self.num_experts_per_partition,
+            batch_size=self.num_local_experts,
             weight_column_major=True,
             seg_indptr=seg_indptr,
             weight_indices=weight_indices_cur_rank,
@@ -1087,9 +991,9 @@ class DeepEPMoE(EPMoE):
             return hidden_states
         # in original deepep, idx == -1 meaning invalid and will not be processed.
        # aiter does not accept -1, we use a expert mask to make these idx invalid
-        # (idx == num_experts_per_partition) meaning not used in aiter fused_moe
+        # (idx == num_local_experts) meaning not used in aiter fused_moe
         topk_idx_copy = topk_idx.to(torch.int32)
-        topk_idx_copy[topk_idx_copy == -1] = self.num_experts_per_partition
+        topk_idx_copy[topk_idx_copy == -1] = self.num_local_experts

         return fused_moe(
             hidden_states,
@@ -1315,12 +1219,74 @@ class DeepEPMoE(EPMoE):
         return down_output


+class FlashInferEPMoE(EPMoE):
+    def __init__(self, *args, **kwargs):
+        renormalize = kwargs.pop("renormalize", True)
+        num_fused_shared_experts = kwargs.pop("num_fused_shared_experts", 0)
+        use_grouped_topk = kwargs.pop("use_grouped_topk", False)
+        num_expert_group = kwargs.pop("num_expert_group", None)
+        topk_group = kwargs.pop("topk_group", None)
+        correction_bias = kwargs.pop("correction_bias", None)
+        super().__init__(*args, **kwargs)
+        self.renormalize = renormalize
+        self.num_fused_shared_experts = num_fused_shared_experts
+        self.use_grouped_topk = use_grouped_topk
+        if self.use_grouped_topk:
+            assert num_expert_group is not None and topk_group is not None
+        self.num_expert_group = num_expert_group
+        self.topk_group = topk_group
+        self.correction_bias = correction_bias
+        self.use_flashinfer_trtllm_moe = use_flashinfer_trtllm_moe
+
+    def forward(self, hidden_states: torch.Tensor, router_logits: torch.Tensor):
+        assert use_flashinfer_trtllm_moe
+        assert (
+            self.activation == "silu"
+        ), "Only silu is supported for flashinfer blockscale fp8 moe"
+        assert (
+            self.renormalize
+        ), "Renormalize is required for flashinfer blockscale fp8 moe"
+        assert (
+            self.num_fused_shared_experts == 0
+        ), "Fused shared experts are not supported for flashinfer blockscale fp8 moe"
+        a_q, a_sf = sglang_per_token_group_quant_fp8(hidden_states, self.block_shape[1])
+        # NOTE: scales of hidden states have to be transposed!
+        a_sf_t = a_sf.t().contiguous()
+        assert fi_fused_moe is not None
+        return fi_fused_moe.trtllm_fp8_block_scale_moe(
+            routing_logits=router_logits.to(torch.float32),
+            routing_bias=self.correction_bias.to(hidden_states.dtype),
+            hidden_states=a_q,
+            hidden_states_scale=a_sf_t,
+            gemm1_weights=self.w13_weight,
+            gemm1_weights_scale=self.w13_weight_scale_inv,
+            gemm2_weights=self.w2_weight,
+            gemm2_weights_scale=self.w2_weight_scale_inv,
+            num_experts=self.num_experts,
+            top_k=self.top_k,
+            n_group=self.num_expert_group,
+            topk_group=self.topk_group,
+            intermediate_size=self.w2_weight.shape[2],
+            local_expert_offset=self.start_expert_id,
+            local_num_experts=self.num_experts_per_partition,
+            routed_scaling_factor=self.routed_scaling_factor,
+            tile_tokens_dim=_get_tile_tokens_dim(
+                hidden_states.shape[0], self.top_k, self.num_experts
+            ),
+            routing_method_type=2,  # DeepSeek-styled routing method
+            use_shuffled_weight=False,
+        )
+
+
 def get_moe_impl_class():
     if global_server_args_dict["enable_deepep_moe"]:
         return DeepEPMoE
-    if global_server_args_dict["enable_flashinfer_moe"]:
+    if global_server_args_dict["enable_flashinfer_cutlass_moe"]:
         # Must come before EPMoE because FusedMoE also supports enable_ep_moe
         return FusedMoE
+    if use_flashinfer_trtllm_moe:
+        # Must come before EPMoE because FusedMoE also supports enable_ep_moe
+        return FlashInferEPMoE
     if global_server_args_dict["enable_ep_moe"]:
         return EPMoE
     return FusedMoE
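The selection order in `get_moe_impl_class` after this change: DeepEP first, then the FlashInfer CUTLASS path, then the new FlashInfer TRT-LLM path (active only when both `enable_flashinfer_trtllm_moe` and `enable_ep_moe` are set and the `flashinfer.fused_moe` import above succeeded), then `EPMoE`, and finally `FusedMoE`. A small sketch of that precedence, with a hypothetical flags dict standing in for `global_server_args_dict`:

def pick_moe_impl(flags: dict, flashinfer_available: bool = True) -> str:
    # Mirrors the precedence in get_moe_impl_class above; returns class names
    # as strings so the sketch stays self-contained.
    if flags.get("enable_deepep_moe"):
        return "DeepEPMoE"
    if flags.get("enable_flashinfer_cutlass_moe"):
        return "FusedMoE"
    if (
        flags.get("enable_flashinfer_trtllm_moe")
        and flags.get("enable_ep_moe")
        and flashinfer_available
    ):
        return "FlashInferEPMoE"
    if flags.get("enable_ep_moe"):
        return "EPMoE"
    return "FusedMoE"

assert pick_moe_impl({"enable_ep_moe": True}) == "EPMoE"
assert (
    pick_moe_impl({"enable_ep_moe": True, "enable_flashinfer_trtllm_moe": True})
    == "FlashInferEPMoE"
)
assert pick_moe_impl({}) == "FusedMoE"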