sglang 0.4.9.post6__py3-none-any.whl → 0.4.10.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_offline_throughput.py +20 -0
- sglang/bench_one_batch.py +3 -0
- sglang/srt/configs/__init__.py +8 -0
- sglang/srt/configs/model_config.py +4 -0
- sglang/srt/configs/step3_vl.py +172 -0
- sglang/srt/conversation.py +23 -0
- sglang/srt/disaggregation/decode.py +2 -8
- sglang/srt/disaggregation/launch_lb.py +5 -20
- sglang/srt/disaggregation/mooncake/conn.py +33 -15
- sglang/srt/disaggregation/prefill.py +2 -6
- sglang/srt/distributed/parallel_state.py +86 -1
- sglang/srt/entrypoints/engine.py +14 -18
- sglang/srt/entrypoints/http_server.py +10 -2
- sglang/srt/entrypoints/openai/serving_chat.py +2 -21
- sglang/srt/eplb/expert_distribution.py +5 -0
- sglang/srt/eplb/expert_location.py +17 -6
- sglang/srt/eplb/expert_location_dispatch.py +1 -0
- sglang/srt/eplb/expert_location_updater.py +2 -0
- sglang/srt/function_call/function_call_parser.py +2 -0
- sglang/srt/function_call/step3_detector.py +436 -0
- sglang/srt/hf_transformers_utils.py +2 -0
- sglang/srt/jinja_template_utils.py +4 -1
- sglang/srt/layers/attention/trtllm_mla_backend.py +372 -0
- sglang/srt/layers/attention/utils.py +6 -1
- sglang/srt/layers/moe/cutlass_moe.py +2 -1
- sglang/srt/layers/moe/ep_moe/layer.py +39 -674
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +26 -13
- sglang/srt/layers/moe/fused_moe_triton/layer.py +152 -39
- sglang/srt/layers/quantization/fp8.py +52 -18
- sglang/srt/layers/quantization/unquant.py +0 -8
- sglang/srt/layers/quantization/w4afp8.py +1 -0
- sglang/srt/layers/quantization/w8a8_int8.py +4 -1
- sglang/srt/managers/cache_controller.py +165 -67
- sglang/srt/managers/data_parallel_controller.py +2 -0
- sglang/srt/managers/io_struct.py +0 -2
- sglang/srt/managers/scheduler.py +90 -671
- sglang/srt/managers/scheduler_metrics_mixin.py +229 -0
- sglang/srt/managers/scheduler_profiler_mixin.py +279 -0
- sglang/srt/managers/scheduler_update_weights_mixin.py +142 -0
- sglang/srt/managers/template_manager.py +62 -19
- sglang/srt/managers/tokenizer_manager.py +123 -74
- sglang/srt/managers/tp_worker.py +4 -0
- sglang/srt/managers/tp_worker_overlap_thread.py +2 -1
- sglang/srt/mem_cache/hicache_storage.py +60 -17
- sglang/srt/mem_cache/hiradix_cache.py +36 -8
- sglang/srt/mem_cache/memory_pool.py +15 -118
- sglang/srt/mem_cache/memory_pool_host.py +418 -29
- sglang/srt/mem_cache/mooncake_store/mooncake_store.py +264 -0
- sglang/srt/mem_cache/mooncake_store/unit_test.py +40 -0
- sglang/srt/mem_cache/nixl/hicache_nixl.py +163 -0
- sglang/srt/mem_cache/nixl/nixl_utils.py +238 -0
- sglang/srt/mem_cache/nixl/test_hicache_nixl_storage.py +216 -0
- sglang/srt/mem_cache/storage/hf3fs/client_hf3fs.py +183 -0
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +278 -0
- sglang/srt/mem_cache/storage/hf3fs/test_hf3fs_utils.py +43 -0
- sglang/srt/model_executor/cuda_graph_runner.py +25 -1
- sglang/srt/model_executor/model_runner.py +13 -1
- sglang/srt/model_loader/weight_utils.py +2 -0
- sglang/srt/models/arcee.py +532 -0
- sglang/srt/models/deepseek_v2.py +7 -6
- sglang/srt/models/glm4_moe.py +6 -4
- sglang/srt/models/granitemoe.py +3 -0
- sglang/srt/models/grok.py +3 -0
- sglang/srt/models/hunyuan.py +1 -0
- sglang/srt/models/llama4.py +3 -0
- sglang/srt/models/mixtral.py +3 -0
- sglang/srt/models/olmoe.py +3 -0
- sglang/srt/models/phimoe.py +1 -0
- sglang/srt/models/step3_vl.py +991 -0
- sglang/srt/multimodal/processors/base_processor.py +15 -16
- sglang/srt/multimodal/processors/step3_vl.py +515 -0
- sglang/srt/reasoning_parser.py +2 -1
- sglang/srt/server_args.py +49 -18
- sglang/srt/speculative/eagle_worker.py +2 -0
- sglang/srt/utils.py +1 -0
- sglang/test/attention/test_trtllm_mla_backend.py +945 -0
- sglang/utils.py +0 -11
- sglang/version.py +1 -1
- {sglang-0.4.9.post6.dist-info → sglang-0.4.10.post1.dist-info}/METADATA +3 -4
- {sglang-0.4.9.post6.dist-info → sglang-0.4.10.post1.dist-info}/RECORD +83 -65
- {sglang-0.4.9.post6.dist-info → sglang-0.4.10.post1.dist-info}/WHEEL +0 -0
- {sglang-0.4.9.post6.dist-info → sglang-0.4.10.post1.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.9.post6.dist-info → sglang-0.4.10.post1.dist-info}/top_level.txt +0 -0
@@ -413,18 +413,37 @@ def fused_moe_kernel(
     num_tokens_post_padded = tl.load(num_tokens_post_padded_ptr)
     if pid_m * BLOCK_SIZE_M >= num_tokens_post_padded:
         return
-    offs_token_id = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)
+    offs_token_id = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M).to(tl.int64)
     offs_token = tl.load(sorted_token_ids_ptr + offs_token_id)
     offs_token = offs_token.to(tl.int64)
     token_mask = offs_token < num_valid_tokens
 
-    offs_bn = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N).to(tl.int64)) % N
+    off_experts = tl.load(expert_ids_ptr + pid_m).to(tl.int64)
+
+    if off_experts == -1:
+        # -----------------------------------------------------------
+        # Write back zeros to the output when the expert is not
+        # in the current expert parallel rank.
+        write_zeros_to_output(
+            c_ptr,
+            stride_cm,
+            stride_cn,
+            pid_n,
+            N,
+            offs_token,
+            token_mask,
+            BLOCK_SIZE_M,
+            BLOCK_SIZE_N,
+            compute_type,
+        )
+        return
+
+    offs_bn = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N).to(tl.int64)) % N
     offs_k = tl.arange(0, BLOCK_SIZE_K)
     a_ptrs = a_ptr + (
         offs_token[:, None] // top_k * stride_am + offs_k[None, :] * stride_ak
     )
 
-    off_experts = tl.load(expert_ids_ptr + pid_m)
     b_ptrs = (
         b_ptr
         + off_experts * stride_be
@@ -497,7 +516,6 @@ def fused_moe_kernel(
 
             accumulator += tl.dot(a, b) * a_scale[:, None] * b_scale[None, :]
         else:
-            # fix out of shared memory issue
             if use_fp8_w8a8:
                 accumulator = tl.dot(a, b, acc=accumulator)
             else:
@@ -568,7 +586,7 @@ def moe_align_block_size(
     - The padding ensures that the total number of tokens is now divisible
       by block_size for proper block matrix operations.
     """
-    max_num_tokens_padded = topk_ids.numel() + num_experts * (block_size - 1)
+    max_num_tokens_padded = topk_ids.numel() + (num_experts + 1) * (block_size - 1)
    sorted_ids = torch.empty(
        (max_num_tokens_padded,), dtype=torch.int32, device=topk_ids.device
    )
@@ -578,13 +596,9 @@ def moe_align_block_size(
    )
    num_tokens_post_pad = torch.empty((1), dtype=torch.int32, device=topk_ids.device)
 
+    # In EP, expert_ids for filtered experts are -1. We have num_experts + 1 ids in total.
    cumsum_buffer = torch.empty(
-        (num_experts + 1,), dtype=torch.int32, device=topk_ids.device
-    )
-    token_cnts_buffer = torch.empty(
-        (num_experts + 1) * num_experts,
-        dtype=torch.int32,
-        device=topk_ids.device,
+        (num_experts + 2,), dtype=torch.int32, device=topk_ids.device
    )
 
    # Threshold based on benchmark results
@@ -594,12 +608,11 @@ def moe_align_block_size(
 
    sgl_moe_align_block_size(
        topk_ids,
-        num_experts,
+        num_experts + 1,
        block_size,
        sorted_ids,
        expert_ids,
        num_tokens_post_pad,
-        token_cnts_buffer,
        cumsum_buffer,
        fuse_sorted_ids_padding,
    )
@@ -1,17 +1,25 @@
 # Adapted from https://github.com/vllm-project/vllm/blob/a6221a144af772fd1a68fe7e627935dc53e81738/vllm/model_executor/layers/fused_moe/layer.py
 
+import importlib.util
 import logging
 from enum import Enum
+from functools import lru_cache
 from typing import List, Optional, Tuple
 
 import torch
+from packaging import version as pkg_version
 
 from sglang.srt.distributed import (
+    get_moe_expert_parallel_rank,
+    get_moe_expert_parallel_world_size,
+    get_moe_tensor_parallel_rank,
+    get_moe_tensor_parallel_world_size,
     get_tensor_model_parallel_rank,
     get_tensor_model_parallel_world_size,
     tensor_model_parallel_all_reduce,
 )
-from sglang.srt.
+from sglang.srt.eplb.expert_location import get_global_expert_location_metadata
+from sglang.srt.layers.moe.topk import StandardTopKOutput
 from sglang.srt.layers.quantization.base_config import (
     QuantizationConfig,
     QuantizeMethodBase,
@@ -28,6 +36,15 @@ _is_cpu = is_cpu()
 logger = logging.getLogger(__name__)
 
 
+@lru_cache(maxsize=1)
+def should_use_flashinfer_trtllm_moe():
+    return global_server_args_dict["enable_flashinfer_trtllm_moe"] and (
+        not importlib.util.find_spec("flashinfer")
+        or pkg_version.parse(__import__("flashinfer").__version__)
+        >= pkg_version.parse("0.2.9rc1")
+    )
+
+
 class FusedMoeWeightScaleSupported(Enum):
     TENSOR = "tensor"
     CHANNEL = "channel"
@@ -62,8 +79,9 @@ class FusedMoE(torch.nn.Module):
         num_experts: int,
         hidden_size: int,
         intermediate_size: int,
+        layer_id: int,
         top_k: Optional[int] = None,
-
+        num_fused_shared_experts: int = 0,
         params_dtype: Optional[torch.dtype] = None,
         reduce_results: bool = False,
         quant_config: Optional[QuantizationConfig] = None,
@@ -77,21 +95,19 @@ class FusedMoE(torch.nn.Module):
         routed_scaling_factor: Optional[float] = None,
         enable_flashinfer_cutlass_moe: Optional[bool] = False,
         enable_ep_moe: Optional[bool] = False,
-        skip_quant: Optional[bool] = False,
     ):
         super().__init__()
 
         if params_dtype is None:
             params_dtype = torch.get_default_dtype()
 
+        self.layer_id = layer_id
         self.top_k = top_k
         self.hidden_size = hidden_size
-        self.tp_size = (
-            tp_size if tp_size is not None else get_tensor_model_parallel_world_size()
-        )
-        self.tp_rank = get_tensor_model_parallel_rank()
         self.num_experts = num_experts
-        self.
+        self.num_fused_shared_experts = num_fused_shared_experts
+        self.expert_map_cpu = None
+        self.expert_map_gpu = None
 
         if enable_flashinfer_cutlass_moe and quant_config is None:
             logger.warning("Disable flashinfer MoE when quantization config is None.")
@@ -99,28 +115,28 @@ class FusedMoE(torch.nn.Module):
             enable_ep_moe = False
 
         self.enable_flashinfer_cutlass_moe = enable_flashinfer_cutlass_moe
+        self.moe_ep_size = get_moe_expert_parallel_world_size()
+        self.moe_ep_rank = get_moe_expert_parallel_rank()
+        self.moe_tp_size = get_moe_tensor_parallel_world_size()
+        self.moe_tp_rank = get_moe_tensor_parallel_rank()
+        assert num_experts % self.moe_ep_size == 0
+        self.num_local_experts = num_experts // self.moe_ep_size
         if enable_ep_moe:
-
-            self.ep_rank = self.tp_rank
-            self.tp_size = 1
-            self.tp_rank = 0
+            # TODO(ch-wan): support shared experts fusion
             # Create a tensor of size num_experts filled with -1
-            self.
+            self.expert_map_cpu = torch.full((self.num_experts,), -1, dtype=torch.int32)
             # Create a expert map for the local experts
-
-
-
-            self.ep_rank
-            * self.num_local_experts : (self.ep_rank + 1)
+            self.expert_map_cpu[
+                self.moe_ep_rank
+                * self.num_local_experts : (self.moe_ep_rank + 1)
                 * self.num_local_experts
             ] = torch.arange(0, self.num_local_experts, dtype=torch.int32, device="cpu")
-
-
-
-            self.num_local_experts = num_experts
+            if not self.enable_flashinfer_cutlass_moe:
+                self.expert_map_gpu = self.expert_map_cpu.to(device="cuda")
+
         self.routed_scaling_factor = routed_scaling_factor
-        assert intermediate_size % self.
-        self.intermediate_size_per_partition = intermediate_size // self.
+        assert intermediate_size % self.moe_tp_size == 0
+        self.intermediate_size_per_partition = intermediate_size // self.moe_tp_size
         self.reduce_results = reduce_results
         self.activation = activation
         self.apply_router_weight_on_input = apply_router_weight_on_input
@@ -132,9 +148,6 @@ class FusedMoE(torch.nn.Module):
             not _is_cpu and global_server_args_dict["enable_triton_kernel_moe"]
         )
 
-        if skip_quant:
-            return
-
         if quant_config is None:
             self.quant_method: Optional[QuantizeMethodBase] = UnquantizedFusedMoEMethod(
                 self.use_triton_kernels
@@ -363,9 +376,9 @@ class FusedMoE(torch.nn.Module):
             expert_data.copy_(loaded_weight)
 
     def _map_global_expert_id_to_local_expert_id(self, expert_id: int) -> int:
-        if self.
+        if self.expert_map_cpu is None:
             return expert_id
-        return self.
+        return self.expert_map_cpu[expert_id].item()
 
     def weight_loader(
         self,
@@ -375,10 +388,48 @@ class FusedMoE(torch.nn.Module):
         shard_id: str,
         expert_id: int,
     ) -> None:
+
+        global_expert_location_metadata = get_global_expert_location_metadata()
+        if global_expert_location_metadata is None:
+            self._weight_loader_impl(
+                param=param,
+                loaded_weight=loaded_weight,
+                weight_name=weight_name,
+                shard_id=shard_id,
+                expert_id=expert_id,
+            )
+            return
+
+        if expert_id >= self.num_experts - self.num_fused_shared_experts:
+            # This is a shared expert.
+            physical_expert_ids = [expert_id]
+        else:
+            physical_expert_ids = (
+                global_expert_location_metadata.logical_to_all_physical(
+                    self.layer_id, expert_id
+                )
+            )
+
+        for physical_expert_id in physical_expert_ids:
+            self._weight_loader_physical(
+                param=param,
+                loaded_weight=loaded_weight,
+                weight_name=weight_name,
+                shard_id=shard_id,
+                expert_id=physical_expert_id,
+            )
+
+    def _weight_loader_physical(
+        self,
+        param: torch.nn.Parameter,
+        loaded_weight: torch.Tensor,
+        weight_name: str,
+        shard_id: str,
+        expert_id: int,
+    ) -> None:
         expert_id = self._map_global_expert_id_to_local_expert_id(expert_id)
         if expert_id == -1:
             return
-
         self._weight_loader_impl(
             param=param,
             loaded_weight=loaded_weight,
@@ -396,8 +447,7 @@ class FusedMoE(torch.nn.Module):
         expert_id: int,
     ) -> None:
 
-
-        tp_rank = 0 if self.ep_size > 1 else get_tensor_model_parallel_rank()
+        tp_rank = self.moe_tp_rank
 
         # compressed-tensors checkpoints with packed weights are stored flipped
         # TODO (mgoin): check self.quant_method.quant_config.quant_format
@@ -417,7 +467,7 @@ class FusedMoE(torch.nn.Module):
         )
 
         # Flashinfer assumes w31 format for w13_weight. Same for the scales.
-        if
+        if should_use_flashinfer_trtllm_moe():
             shard_id = {"w1": "w3", "w3": "w1", "w2": "w2"}[shard_id]
 
         WEIGHT_SCALE_SUPPORTED = [e.value for e in FusedMoeWeightScaleSupported]
@@ -571,9 +621,14 @@ class FusedMoE(torch.nn.Module):
         )
         return
 
-    def forward(self, hidden_states: torch.Tensor, topk_output:
+    def forward(self, hidden_states: torch.Tensor, topk_output: StandardTopKOutput):
         assert self.quant_method is not None
 
+        if self.expert_map_gpu is not None:
+            topk_output = topk_output._replace(
+                topk_ids=self.expert_map_gpu[topk_output.topk_ids]
+            )
+
         # Matrix multiply.
         final_hidden_states = self.quant_method.apply(
             layer=self,
@@ -584,17 +639,17 @@ class FusedMoE(torch.nn.Module):
             routed_scaling_factor=self.routed_scaling_factor,
             **(
                 dict(
-                    tp_rank=self.
-                    tp_size=self.
-                    ep_rank=self.
-                    ep_size=self.
+                    tp_rank=self.moe_tp_rank,
+                    tp_size=self.moe_tp_size,
+                    ep_rank=self.moe_ep_rank,
+                    ep_size=self.moe_ep_size,
                 )
                 if self.quant_method.__class__.__name__ == "ModelOptNvFp4FusedMoEMethod"
                 else {}
             ),
         )
 
-        if self.reduce_results and (self.
+        if self.reduce_results and (self.moe_tp_size > 1 or self.moe_ep_size > 1):
             final_hidden_states = tensor_model_parallel_all_reduce(final_hidden_states)
 
         return final_hidden_states
@@ -627,3 +682,61 @@ class FusedMoE(torch.nn.Module):
             ("w3", ckpt_up_proj_name),
         ]
     ]
+
+    @classmethod
+    def make_expert_input_scale_params_mapping(
+        cls,
+        num_experts: int,
+    ) -> List[Tuple[str, str, int, str]]:
+        # (param_name, weight_name, expert_id, shard_id)
+        return [
+            (
+                "experts.w13_" if shard_id in ["w1", "w3"] else "experts.w2_",
+                f"experts.{expert_id}.{shard_id}.",
+                expert_id,
+                shard_id,
+            )
+            for expert_id in range(num_experts)
+            for shard_id in ["w1", "w2", "w3"]
+        ]
+
+
+class FlashInferFusedMoE(FusedMoE):
+    def __init__(self, *args, **kwargs):
+        renormalize = kwargs.pop("renormalize", True)
+        num_fused_shared_experts = kwargs.pop("num_fused_shared_experts", 0)
+        use_grouped_topk = kwargs.pop("use_grouped_topk", False)
+        num_expert_group = kwargs.pop("num_expert_group", None)
+        topk_group = kwargs.pop("topk_group", None)
+        correction_bias = kwargs.pop("correction_bias", None)
+        super().__init__(*args, **kwargs)
+        self.renormalize = renormalize
+        self.num_fused_shared_experts = num_fused_shared_experts
+        self.use_grouped_topk = use_grouped_topk
+        if self.use_grouped_topk:
+            assert num_expert_group is not None and topk_group is not None
+        self.num_expert_group = num_expert_group
+        self.topk_group = topk_group
+        self.correction_bias = correction_bias
+
+    def forward(self, hidden_states: torch.Tensor, router_logits: torch.Tensor):
+        assert self.quant_method is not None
+        assert (
+            self.renormalize
+        ), "Renormalize is required for flashinfer blockscale fp8 moe"
+        assert (
+            self.num_fused_shared_experts == 0
+        ), "Fused shared experts are not supported for flashinfer blockscale fp8 moe"
+        # Matrix multiply.
+        final_hidden_states = self.quant_method.apply_with_router_logits(
+            layer=self,
+            x=hidden_states,
+            router_logits=router_logits,
+            activation=self.activation,
+            routed_scaling_factor=self.routed_scaling_factor,
+        )
+
+        if self.reduce_results and (self.tp_size > 1 or self.ep_size > 1):
+            final_hidden_states = tensor_model_parallel_all_reduce(final_hidden_states)
+
+        return final_hidden_states
@@ -72,6 +72,7 @@ from sglang.srt.utils import (
     is_hip,
     is_npu,
     log_info_on_rank0,
+    next_power_of_2,
     print_warning_once,
     set_weight_attrs,
     use_intel_amx_backend,
@@ -172,7 +173,6 @@ class Fp8Config(QuantizationConfig):
         self, layer: torch.nn.Module, prefix: str
     ) -> Optional[QuantizeMethodBase]:
         from sglang.srt.layers.linear import LinearBase
-        from sglang.srt.layers.moe.ep_moe.layer import EPMoE
         from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
 
         if isinstance(layer, LinearBase):
@@ -181,8 +181,6 @@ class Fp8Config(QuantizationConfig):
             return Fp8LinearMethod(self)
         elif isinstance(layer, FusedMoE):
             return Fp8MoEMethod(self)
-        elif isinstance(layer, EPMoE):
-            return Fp8EPMoEMethod(self)
         return None
 
     def get_scaled_act_names(self) -> List[str]:
@@ -493,6 +491,16 @@ class Fp8LinearMethod(LinearMethodBase):
         )
 
 
+def get_tile_tokens_dim(num_tokens, top_k, num_experts):
+    # Guess tokens per expert assuming perfect expert distribution first.
+    num_tokens_per_expert = (num_tokens * top_k) // num_experts
+    # And pad the number to the next power of 2.
+    tile_tokens_dim = next_power_of_2(num_tokens_per_expert)
+    # Cap to 8-64 tokens per CTA tile as it's the range supported by the kernel.
+    tile_tokens_dim = min(max(tile_tokens_dim, 8), 64)
+    return tile_tokens_dim
+
+
 class Fp8MoEMethod(FusedMoEMethodBase):
     """MoE method for FP8.
     Supports loading FP8 checkpoints with static weight scale and
@@ -984,23 +992,8 @@ class Fp8MoEMethod(FusedMoEMethodBase):
         no_combine: bool = False,
         routed_scaling_factor: Optional[float] = None,
     ) -> torch.Tensor:
-        from sglang.srt.layers.moe.ep_moe.layer import EPMoE
         from sglang.srt.layers.moe.fused_moe_triton.fused_moe import fused_experts
 
-        if isinstance(layer, EPMoE):
-            layer.w13_weight_scale = (
-                layer.w13_weight_scale_inv
-                if self.block_quant
-                else layer.w13_weight_scale
-            )
-            layer.w2_weight_scale = (
-                layer.w2_weight_scale_inv if self.block_quant else layer.w2_weight_scale
-            )
-            return layer.run_moe(
-                hidden_states=x,
-                topk_output=topk_output,
-            )
-
         if use_intel_amx_backend(layer):
             from sglang.srt.layers.moe.topk import apply_topk_weights_cpu
 
@@ -1094,6 +1087,47 @@ class Fp8MoEMethod(FusedMoEMethodBase):
             routed_scaling_factor=routed_scaling_factor,
         )
 
+    def apply_with_router_logits(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        router_logits: torch.Tensor,
+        *,
+        activation: str = "silu",
+        routed_scaling_factor: Optional[float] = None,
+    ) -> torch.Tensor:
+        assert (
+            activation == "silu"
+        ), "Only silu is supported for flashinfer blockscale fp8 moe"
+        a_q, a_sf = per_token_group_quant_fp8(x, self.quant_config.weight_block_size[1])
+        # NOTE: scales of hidden states have to be transposed!
+        a_sf_t = a_sf.t().contiguous()
+        from flashinfer.fused_moe import trtllm_fp8_block_scale_moe
+
+        return trtllm_fp8_block_scale_moe(
+            routing_logits=router_logits.to(torch.float32),
+            routing_bias=layer.correction_bias.to(x.dtype),
+            hidden_states=a_q,
+            hidden_states_scale=a_sf_t,
+            gemm1_weights=layer.w13_weight,
+            gemm1_weights_scale=layer.w13_weight_scale_inv,
+            gemm2_weights=layer.w2_weight,
+            gemm2_weights_scale=layer.w2_weight_scale_inv,
+            num_experts=layer.num_experts,
+            top_k=layer.top_k,
+            n_group=layer.num_expert_group,
+            topk_group=layer.topk_group,
+            intermediate_size=layer.w2_weight.shape[2],
+            local_expert_offset=layer.moe_ep_rank * layer.num_local_experts,
+            local_num_experts=layer.num_local_experts,
+            routed_scaling_factor=routed_scaling_factor,
+            tile_tokens_dim=get_tile_tokens_dim(
+                x.shape[0], layer.top_k, layer.num_experts
+            ),
+            routing_method_type=2,  # DeepSeek-styled routing method
+            use_shuffled_weight=False,
+        )
+
     def maybe_apply_hip_fused_experts(
         self,
         layer: torch.nn.Module,
@@ -204,14 +204,6 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
         routed_scaling_factor: Optional[float] = None,
     ) -> torch.Tensor:
 
-        from sglang.srt.layers.moe.ep_moe.layer import EPMoE
-
-        if isinstance(layer, EPMoE):
-            return layer.run_moe(
-                hidden_states=x,
-                topk_output=topk_output,
-            )
-
         return self.forward(
             x=x,
             layer=layer,
@@ -231,7 +231,10 @@ class W8A8Int8Config(QuantizationConfig):
 
     @classmethod
     def get_config_filenames(cls) -> List[str]:
-        return []
+        filenames = []
+        if _is_npu:
+            filenames.append("quant_model_description.json")
+        return filenames
 
     @classmethod
     def from_config(cls, config: Dict[str, Any]) -> W8A8Int8Config: