sglang 0.4.10__py3-none-any.whl → 0.4.10.post2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_offline_throughput.py +20 -0
- sglang/compile_deep_gemm.py +8 -1
- sglang/global_config.py +5 -1
- sglang/srt/configs/model_config.py +1 -0
- sglang/srt/conversation.py +0 -112
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +1 -0
- sglang/srt/disaggregation/launch_lb.py +5 -20
- sglang/srt/disaggregation/mooncake/conn.py +33 -15
- sglang/srt/disaggregation/prefill.py +1 -0
- sglang/srt/distributed/device_communicators/pynccl.py +7 -0
- sglang/srt/distributed/device_communicators/pynccl_allocator.py +133 -0
- sglang/srt/distributed/device_communicators/pynccl_wrapper.py +42 -3
- sglang/srt/distributed/parallel_state.py +11 -0
- sglang/srt/entrypoints/engine.py +4 -2
- sglang/srt/entrypoints/http_server.py +35 -15
- sglang/srt/eplb/expert_distribution.py +4 -2
- sglang/srt/hf_transformers_utils.py +25 -10
- sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
- sglang/srt/layers/attention/flashattention_backend.py +7 -11
- sglang/srt/layers/attention/trtllm_mla_backend.py +372 -0
- sglang/srt/layers/attention/utils.py +6 -1
- sglang/srt/layers/attention/vision.py +27 -10
- sglang/srt/layers/communicator.py +14 -4
- sglang/srt/layers/linear.py +7 -1
- sglang/srt/layers/logits_processor.py +9 -1
- sglang/srt/layers/moe/ep_moe/layer.py +29 -68
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=352,device_name=NVIDIA_RTX_6000_Ada_Generation,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/layer.py +82 -25
- sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +0 -31
- sglang/srt/layers/moe/token_dispatcher/__init__.py +23 -0
- sglang/srt/layers/moe/token_dispatcher/base_dispatcher.py +12 -1
- sglang/srt/layers/moe/{ep_moe/token_dispatcher.py → token_dispatcher/deepep.py} +8 -15
- sglang/srt/layers/moe/utils.py +43 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +3 -2
- sglang/srt/layers/quantization/deep_gemm_wrapper/compile_utils.py +1 -1
- sglang/srt/layers/quantization/fp8.py +57 -1
- sglang/srt/layers/quantization/fp8_kernel.py +0 -4
- sglang/srt/layers/quantization/w8a8_int8.py +4 -1
- sglang/srt/layers/vocab_parallel_embedding.py +7 -1
- sglang/srt/lora/lora_registry.py +7 -0
- sglang/srt/managers/cache_controller.py +43 -39
- sglang/srt/managers/data_parallel_controller.py +52 -2
- sglang/srt/managers/io_struct.py +6 -1
- sglang/srt/managers/schedule_batch.py +3 -2
- sglang/srt/managers/schedule_policy.py +3 -1
- sglang/srt/managers/scheduler.py +145 -6
- sglang/srt/managers/template_manager.py +25 -22
- sglang/srt/managers/tokenizer_manager.py +114 -62
- sglang/srt/managers/utils.py +45 -1
- sglang/srt/mem_cache/cpp_radix_tree/radix_tree.py +182 -0
- sglang/srt/mem_cache/hicache_storage.py +13 -12
- sglang/srt/mem_cache/hiradix_cache.py +21 -4
- sglang/srt/mem_cache/memory_pool.py +15 -118
- sglang/srt/mem_cache/memory_pool_host.py +350 -33
- sglang/srt/mem_cache/radix_cache_cpp.py +229 -0
- sglang/srt/mem_cache/storage/hf3fs/client_hf3fs.py +8 -2
- sglang/srt/mem_cache/storage/hf3fs/hf3fs_utils.cpp +35 -0
- sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +163 -0
- sglang/srt/mem_cache/storage/nixl/nixl_utils.py +238 -0
- sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +216 -0
- sglang/srt/model_executor/cuda_graph_runner.py +42 -4
- sglang/srt/model_executor/forward_batch_info.py +13 -3
- sglang/srt/model_executor/model_runner.py +13 -1
- sglang/srt/model_loader/weight_utils.py +2 -0
- sglang/srt/models/deepseek_v2.py +28 -23
- sglang/srt/models/glm4_moe.py +85 -22
- sglang/srt/models/grok.py +3 -3
- sglang/srt/models/llama4.py +13 -2
- sglang/srt/models/mixtral.py +3 -3
- sglang/srt/models/mllama4.py +428 -19
- sglang/srt/models/qwen2_moe.py +1 -4
- sglang/srt/models/qwen3_moe.py +7 -8
- sglang/srt/models/step3_vl.py +1 -4
- sglang/srt/multimodal/processors/base_processor.py +4 -3
- sglang/srt/multimodal/processors/gemma3n.py +0 -7
- sglang/srt/operations_strategy.py +1 -1
- sglang/srt/server_args.py +115 -21
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +18 -0
- sglang/srt/two_batch_overlap.py +6 -4
- sglang/srt/utils.py +4 -24
- sglang/srt/weight_sync/utils.py +1 -1
- sglang/test/attention/test_trtllm_mla_backend.py +945 -0
- sglang/test/runners.py +2 -2
- sglang/test/test_utils.py +3 -3
- sglang/version.py +1 -1
- {sglang-0.4.10.dist-info → sglang-0.4.10.post2.dist-info}/METADATA +3 -2
- {sglang-0.4.10.dist-info → sglang-0.4.10.post2.dist-info}/RECORD +92 -81
- /sglang/srt/mem_cache/{mooncake_store → storage/mooncake_store}/mooncake_store.py +0 -0
- /sglang/srt/mem_cache/{mooncake_store → storage/mooncake_store}/unit_test.py +0 -0
- {sglang-0.4.10.dist-info → sglang-0.4.10.post2.dist-info}/WHEEL +0 -0
- {sglang-0.4.10.dist-info → sglang-0.4.10.post2.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.10.dist-info → sglang-0.4.10.post2.dist-info}/top_level.txt +0 -0
sglang/srt/layers/moe/fused_moe_triton/layer.py
CHANGED
@@ -1,20 +1,25 @@
 # Adapted from https://github.com/vllm-project/vllm/blob/a6221a144af772fd1a68fe7e627935dc53e81738/vllm/model_executor/layers/fused_moe/layer.py
 
+import importlib.util
 import logging
 from enum import Enum
+from functools import lru_cache
 from typing import List, Optional, Tuple
 
 import torch
+from packaging import version as pkg_version
 
 from sglang.srt.distributed import (
     get_moe_expert_parallel_rank,
     get_moe_expert_parallel_world_size,
     get_moe_tensor_parallel_rank,
     get_moe_tensor_parallel_world_size,
-
-    get_tensor_model_parallel_world_size,
+    get_tp_group,
     tensor_model_parallel_all_reduce,
 )
+from sglang.srt.distributed.device_communicators.pynccl_allocator import (
+    use_symmetric_memory,
+)
 from sglang.srt.eplb.expert_location import get_global_expert_location_metadata
 from sglang.srt.layers.moe.topk import StandardTopKOutput
 from sglang.srt.layers.quantization.base_config import (
@@ -33,6 +38,15 @@ _is_cpu = is_cpu()
 logger = logging.getLogger(__name__)
 
 
+@lru_cache(maxsize=1)
+def should_use_flashinfer_trtllm_moe():
+    return global_server_args_dict["enable_flashinfer_trtllm_moe"] and (
+        not importlib.util.find_spec("flashinfer")
+        or pkg_version.parse(__import__("flashinfer").__version__)
+        >= pkg_version.parse("0.2.9rc1")
+    )
+
+
 class FusedMoeWeightScaleSupported(Enum):
     TENSOR = "tensor"
     CHANNEL = "channel"
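The helper above is memoized with lru_cache(maxsize=1), so the server-args flag and the installed flashinfer version are inspected only once per process. The "0.2.9rc1" threshold works because packaging orders pre-releases before the corresponding final release; a small standalone check of that assumption (plain Python, not part of the diff):

from packaging import version as pkg_version

# 0.2.9rc1 is a pre-release of 0.2.9, so any released 0.2.9+ satisfies the gate.
assert pkg_version.parse("0.2.9") >= pkg_version.parse("0.2.9rc1")
assert pkg_version.parse("0.3.0") >= pkg_version.parse("0.2.9rc1")
# Older releases do not.
assert not (pkg_version.parse("0.2.8") >= pkg_version.parse("0.2.9rc1"))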
@@ -82,7 +96,6 @@ class FusedMoE(torch.nn.Module):
         no_combine: bool = False,
         routed_scaling_factor: Optional[float] = None,
         enable_flashinfer_cutlass_moe: Optional[bool] = False,
-        enable_ep_moe: Optional[bool] = False,
     ):
         super().__init__()
 
@@ -100,7 +113,6 @@ class FusedMoE(torch.nn.Module):
         if enable_flashinfer_cutlass_moe and quant_config is None:
             logger.warning("Disable flashinfer MoE when quantization config is None.")
             enable_flashinfer_cutlass_moe = False
-            enable_ep_moe = False
 
         self.enable_flashinfer_cutlass_moe = enable_flashinfer_cutlass_moe
         self.moe_ep_size = get_moe_expert_parallel_world_size()
@@ -109,7 +121,7 @@ class FusedMoE(torch.nn.Module):
         self.moe_tp_rank = get_moe_tensor_parallel_rank()
         assert num_experts % self.moe_ep_size == 0
         self.num_local_experts = num_experts // self.moe_ep_size
-        if
+        if self.moe_ep_size > 1:
             # TODO(ch-wan): support shared experts fusion
             # Create a tensor of size num_experts filled with -1
             self.expert_map_cpu = torch.full((self.num_experts,), -1, dtype=torch.int32)
@@ -119,7 +131,8 @@ class FusedMoE(torch.nn.Module):
                 * self.num_local_experts : (self.moe_ep_rank + 1)
                 * self.num_local_experts
             ] = torch.arange(0, self.num_local_experts, dtype=torch.int32, device="cpu")
-
+            if not self.enable_flashinfer_cutlass_moe:
+                self.expert_map_gpu = self.expert_map_cpu.to(device="cuda")
 
         self.routed_scaling_factor = routed_scaling_factor
         assert intermediate_size % self.moe_tp_size == 0
@@ -454,7 +467,7 @@ class FusedMoE(torch.nn.Module):
         )
 
         # Flashinfer assumes w31 format for w13_weight. Same for the scales.
-        if
+        if should_use_flashinfer_trtllm_moe():
             shard_id = {"w1": "w3", "w3": "w1", "w2": "w2"}[shard_id]
 
         WEIGHT_SCALE_SUPPORTED = [e.value for e in FusedMoeWeightScaleSupported]
@@ -617,24 +630,27 @@ class FusedMoE(torch.nn.Module):
         )
 
         # Matrix multiply.
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        with use_symmetric_memory(get_tp_group()) as sm:
+            final_hidden_states = self.quant_method.apply(
+                layer=self,
+                x=hidden_states,
+                topk_output=topk_output,
+                activation=self.activation,
+                apply_router_weight_on_input=self.apply_router_weight_on_input,
+                routed_scaling_factor=self.routed_scaling_factor,
+                **(
+                    dict(
+                        tp_rank=self.moe_tp_rank,
+                        tp_size=self.moe_tp_size,
+                        ep_rank=self.moe_ep_rank,
+                        ep_size=self.moe_ep_size,
+                    )
+                    if self.quant_method.__class__.__name__
+                    == "ModelOptNvFp4FusedMoEMethod"
+                    else {}
+                ),
+            )
+            sm.tag(final_hidden_states)
 
         if self.reduce_results and (self.moe_tp_size > 1 or self.moe_ep_size > 1):
             final_hidden_states = tensor_model_parallel_all_reduce(final_hidden_states)
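Both here and in the vocab_parallel_embedding.py hunk further down, the computation that feeds a tensor-parallel all-reduce is wrapped in use_symmetric_memory(...) and the resulting tensor is tagged. The stand-in below only illustrates the API shape those call sites assume (a context manager taking a process group and yielding an object with a tag() method); it is not the pynccl_allocator implementation that this release adds under sglang/srt/distributed/device_communicators/pynccl_allocator.py.

from contextlib import contextmanager

@contextmanager
def use_symmetric_memory(group):
    # Hypothetical no-op scope: the real allocator is presumably expected to
    # route allocations made inside the block to NCCL symmetric memory so that
    # collectives on `group` can take a faster path; tag() marks the tensor
    # produced inside the scope.
    class _Scope:
        def tag(self, tensor):
            pass

    yield _Scope()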
@@ -686,3 +702,44 @@ class FusedMoE(torch.nn.Module):
             for expert_id in range(num_experts)
             for shard_id in ["w1", "w2", "w3"]
         ]
+
+
+class FlashInferFusedMoE(FusedMoE):
+    def __init__(self, *args, **kwargs):
+        renormalize = kwargs.pop("renormalize", True)
+        num_fused_shared_experts = kwargs.pop("num_fused_shared_experts", 0)
+        use_grouped_topk = kwargs.pop("use_grouped_topk", False)
+        num_expert_group = kwargs.pop("num_expert_group", None)
+        topk_group = kwargs.pop("topk_group", None)
+        correction_bias = kwargs.pop("correction_bias", None)
+        super().__init__(*args, **kwargs)
+        self.renormalize = renormalize
+        self.num_fused_shared_experts = num_fused_shared_experts
+        self.use_grouped_topk = use_grouped_topk
+        if self.use_grouped_topk:
+            assert num_expert_group is not None and topk_group is not None
+        self.num_expert_group = num_expert_group
+        self.topk_group = topk_group
+        self.correction_bias = correction_bias
+
+    def forward(self, hidden_states: torch.Tensor, router_logits: torch.Tensor):
+        assert self.quant_method is not None
+        assert (
+            self.renormalize
+        ), "Renormalize is required for flashinfer blockscale fp8 moe"
+        assert (
+            self.num_fused_shared_experts == 0
+        ), "Fused shared experts are not supported for flashinfer blockscale fp8 moe"
+        # Matrix multiply.
+        final_hidden_states = self.quant_method.apply_with_router_logits(
+            layer=self,
+            x=hidden_states,
+            router_logits=router_logits,
+            activation=self.activation,
+            routed_scaling_factor=self.routed_scaling_factor,
+        )
+
+        if self.reduce_results and (self.tp_size > 1 or self.ep_size > 1):
+            final_hidden_states = tensor_model_parallel_all_reduce(final_hidden_states)
+
+        return final_hidden_states
sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py
CHANGED
@@ -146,34 +146,3 @@ def triton_kernel_fused_experts(
     )
 
     return intermediate_cache3
-
-
-def triton_kernel_moe_forward_fake(
-    hidden_states: torch.Tensor,
-    w1: torch.Tensor,
-    w2: torch.Tensor,
-    gating_output: torch.Tensor,
-    topk: int,
-    renormalize: bool,
-    inplace: bool = False,
-    activation: str = "silu",
-    apply_router_weight_on_input: bool = False,
-    use_fp8_w8a8: bool = False,
-    per_channel_quant: bool = False,
-    global_num_experts: int = -1,
-    expert_map: Optional[torch.Tensor] = None,
-    w1_scale: Optional[torch.Tensor] = None,
-    w2_scale: Optional[torch.Tensor] = None,
-    a1_scale: Optional[torch.Tensor] = None,
-    a2_scale: Optional[torch.Tensor] = None,
-    block_shape: Optional[list[int]] = None,
-) -> torch.Tensor:
-    return torch.empty_like(hidden_states)
-
-
-direct_register_custom_op(
-    op_name="forward_cuda_triton",
-    op_func=triton_kernel_moe_forward,
-    mutates_args=[],
-    fake_impl=triton_kernel_moe_forward_fake,
-)
sglang/srt/layers/moe/token_dispatcher/__init__.py
ADDED
@@ -0,0 +1,23 @@
+from sglang.srt.layers.moe.token_dispatcher.base_dispatcher import (
+    BaseDispatcher,
+    BaseDispatcherConfig,
+    DispatchOutput,
+    DispatchOutputFormat,
+)
+from sglang.srt.layers.moe.token_dispatcher.deepep import (
+    DeepEPConfig,
+    DeepEPDispatcher,
+    DeepEPLLOutput,
+    DeepEPNormalOutput,
+)
+
+__all__ = [
+    "BaseDispatcher",
+    "BaseDispatcherConfig",
+    "DispatchOutput",
+    "DispatchOutputFormat",
+    "DeepEPConfig",
+    "DeepEPDispatcher",
+    "DeepEPNormalOutput",
+    "DeepEPLLOutput",
+]
sglang/srt/layers/moe/token_dispatcher/base_dispatcher.py
CHANGED
@@ -2,11 +2,22 @@ from __future__ import annotations
 
 from abc import ABC, abstractmethod
 from enum import Enum, auto
-from typing import
+from typing import Protocol, runtime_checkable
 
 import torch
 
 
+class MoEA2ABackend(Enum):
+    none = "none"
+    deepep = "deepep"
+
+    def is_none(self):
+        return self == MoEA2ABackend.none
+
+    def is_deepep(self):
+        return self == MoEA2ABackend.deepep
+
+
 class DispatchOutputFormat(Enum):
     standard = auto()
     deepep_normal = auto()
sglang/srt/layers/moe/{ep_moe/token_dispatcher.py → token_dispatcher/deepep.py}
CHANGED
@@ -1,5 +1,3 @@
-# TODO(ch-wan): this file will be moved to sglang/srt/layers/moe/token_dispatcher/deepep.py
-
 from __future__ import annotations
 
 import logging
@@ -22,15 +20,10 @@ from sglang.srt.layers.moe.token_dispatcher.base_dispatcher import (
     DispatchOutput,
     DispatchOutputFormat,
 )
+from sglang.srt.layers.moe.utils import DeepEPMode
 from sglang.srt.layers.quantization import deep_gemm_wrapper
 from sglang.srt.managers.schedule_batch import global_server_args_dict
-from sglang.srt.utils import (
-    DeepEPMode,
-    get_bool_env_var,
-    get_int_env_var,
-    is_hip,
-    load_json_config,
-)
+from sglang.srt.utils import get_bool_env_var, get_int_env_var, is_hip, load_json_config
 
 try:
     from deep_ep import Buffer, Config
@@ -150,9 +143,9 @@ class DeepEPBuffer:
             num_rdma_bytes,
         )
 
-        if deepep_mode == DeepEPMode.
+        if deepep_mode == DeepEPMode.NORMAL:
            num_qps_per_rank = DeepEPConfig.get_instance().num_sms // 2
-        elif deepep_mode in [DeepEPMode.
+        elif deepep_mode in [DeepEPMode.LOW_LATENCY, DeepEPMode.AUTO]:
            num_qps_per_rank = num_experts // group.size()
        else:
            raise NotImplementedError
@@ -161,7 +154,7 @@ class DeepEPBuffer:
             device="cuda"
         ).multi_processor_count
         if (
-            (deepep_mode != DeepEPMode.
+            (deepep_mode != DeepEPMode.LOW_LATENCY)
             and not global_server_args_dict["enable_two_batch_overlap"]
             and (DeepEPConfig.get_instance().num_sms < total_num_sms // 2)
         ):
@@ -611,7 +604,7 @@ class DeepEPDispatcher(BaseDispatcher):
         num_local_experts: int = None,
         hidden_size: int = None,
         params_dtype: torch.dtype = None,
-        deepep_mode: DeepEPMode = DeepEPMode.
+        deepep_mode: DeepEPMode = DeepEPMode.AUTO,
         async_finish: bool = False,
         return_recv_hook: bool = False,
     ):
@@ -697,9 +690,9 @@ class DeepEPDispatcher(BaseDispatcher):
         resolved_deepep_mode = self.deepep_mode.resolve(
             forward_batch.is_extend_in_batch
         )
-        if resolved_deepep_mode == DeepEPMode.
+        if resolved_deepep_mode == DeepEPMode.NORMAL:
             return self._normal_dispatcher
-        elif resolved_deepep_mode == DeepEPMode.
+        elif resolved_deepep_mode == DeepEPMode.LOW_LATENCY:
             return self._low_latency_dispatcher
         else:
             raise ValueError(f"Invalid deepep_mode: {self.deepep_mode}")
sglang/srt/layers/moe/utils.py
ADDED
@@ -0,0 +1,43 @@
+from enum import Enum
+
+
+class MoeA2ABackend(Enum):
+
+    STANDARD = ("standard", "none")
+    DEEPEP = "deepep"
+
+    @classmethod
+    def _missing_(cls, value):
+        if value is None:
+            return cls.STANDARD
+        for member in cls:
+            if value in member.value:
+                return member
+        raise ValueError(f"No {cls.__name__} member for value {value}")
+
+    def is_deepep(self):
+        return self == MoeA2ABackend.DEEPEP
+
+    def is_standard(self):
+        return self == MoeA2ABackend.STANDARD
+
+
+class DeepEPMode(Enum):
+    NORMAL = "normal"
+    LOW_LATENCY = "low_latency"
+    AUTO = "auto"
+
+    def enable_normal(self):
+        return self in [DeepEPMode.NORMAL, DeepEPMode.AUTO]
+
+    def enable_low_latency(self):
+        return self in [DeepEPMode.LOW_LATENCY, DeepEPMode.AUTO]
+
+    def resolve(self, is_extend_in_batch: bool):
+        if self != DeepEPMode.AUTO:
+            return self
+
+        if is_extend_in_batch:
+            return DeepEPMode.NORMAL
+        else:
+            return DeepEPMode.LOW_LATENCY
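The two enums above are the new home for the MoE all-to-all backend and DeepEP mode settings; the DeepEP dispatcher now imports DeepEPMode from here instead of sglang.srt.utils (see the deepep.py hunk above). MoeA2ABackend accepts legacy values through _missing_ (STANDARD carries the tuple ("standard", "none"), so both strings map to it), and DeepEPMode.AUTO picks a concrete mode per batch via resolve(). A short usage sketch of that behavior, assuming the module is importable as added above:

from sglang.srt.layers.moe.utils import DeepEPMode, MoeA2ABackend

# Legacy values and None all normalize to STANDARD.
assert MoeA2ABackend(None) is MoeA2ABackend.STANDARD
assert MoeA2ABackend("none") is MoeA2ABackend.STANDARD
assert MoeA2ABackend("standard") is MoeA2ABackend.STANDARD
assert MoeA2ABackend("deepep").is_deepep()

# AUTO resolves to NORMAL for extend (prefill) batches and LOW_LATENCY otherwise;
# explicit modes resolve to themselves.
assert DeepEPMode.AUTO.resolve(is_extend_in_batch=True) is DeepEPMode.NORMAL
assert DeepEPMode.AUTO.resolve(is_extend_in_batch=False) is DeepEPMode.LOW_LATENCY
assert DeepEPMode.NORMAL.resolve(is_extend_in_batch=False) is DeepEPMode.NORMAL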
sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py
CHANGED
@@ -23,6 +23,7 @@ from sglang.srt.layers.quantization.utils import (
 from sglang.srt.utils import is_cpu, is_cuda, is_hip, is_npu, set_weight_attrs
 
 if TYPE_CHECKING:
+    from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
     from sglang.srt.layers.moe.topk import TopKOutput
     from sglang.srt.layers.quantization.compressed_tensors.compressed_tensors import (
         CompressedTensorsConfig,
@@ -189,7 +190,7 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
         layer.w13_input_scale = None
         layer.w2_input_scale = None
 
-    def process_weights_after_loading(self, layer:
+    def process_weights_after_loading(self, layer: FusedMoE) -> None:
         # Fp8 moe kernels require a single activation scale.
         # We take the max of all the scales in case they differ.
         if self.static_input_scales:
@@ -246,7 +247,7 @@ class CompressedTensorsW8A8Fp8MoEMethod(CompressedTensorsMoEMethod):
         assert layer.w13_weight_scale is not None
         shard_size = layer.intermediate_size_per_partition
         max_w13_scales = layer.w13_weight_scale.max(dim=1).values
-        for expert_id in range(layer.
+        for expert_id in range(layer.num_local_experts):
             start = 0
             for shard_id in range(2):
                 dq_weight = per_tensor_dequantize(
sglang/srt/layers/quantization/deep_gemm_wrapper/compile_utils.py
CHANGED
@@ -148,7 +148,7 @@ def _compile_grouped_gemm_nt_f8f8bf16_masked_one(
         "NUM_MATH_THREADS_PER_GROUP": num_math_threads_per_group,
         "N": n,
         "K": k,
-        "NUM_GROUPS":
+        "NUM_GROUPS": num_groups,
         "BLOCK_M": block_m,
         "BLOCK_N": block_n,
         "BLOCK_K": block_k,
sglang/srt/layers/quantization/fp8.py
CHANGED
@@ -72,6 +72,7 @@ from sglang.srt.utils import (
     is_hip,
     is_npu,
     log_info_on_rank0,
+    next_power_of_2,
     print_warning_once,
     set_weight_attrs,
     use_intel_amx_backend,
@@ -490,6 +491,16 @@ class Fp8LinearMethod(LinearMethodBase):
         )
 
 
+def get_tile_tokens_dim(num_tokens, top_k, num_experts):
+    # Guess tokens per expert assuming perfect expert distribution first.
+    num_tokens_per_expert = (num_tokens * top_k) // num_experts
+    # And pad the number to the next power of 2.
+    tile_tokens_dim = next_power_of_2(num_tokens_per_expert)
+    # Cap to 8-64 tokens per CTA tile as it's the range supported by the kernel.
+    tile_tokens_dim = min(max(tile_tokens_dim, 8), 64)
+    return tile_tokens_dim
+
+
 class Fp8MoEMethod(FusedMoEMethodBase):
     """MoE method for FP8.
     Supports loading FP8 checkpoints with static weight scale and
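get_tile_tokens_dim sizes the per-CTA token tile for the flashinfer TRT-LLM block-scale MoE kernel used below. A standalone illustration of the arithmetic, with a local next_power_of_2 standing in for sglang.srt.utils.next_power_of_2 (assumed here to round up to the nearest power of two):

def next_power_of_2(n: int) -> int:
    # Round up to the nearest power of two; 0 and 1 both map to 1.
    return 1 if n <= 1 else 1 << (n - 1).bit_length()

def get_tile_tokens_dim(num_tokens, top_k, num_experts):
    num_tokens_per_expert = (num_tokens * top_k) // num_experts
    return min(max(next_power_of_2(num_tokens_per_expert), 8), 64)

# 300 tokens routed to 8 of 256 experts: 2400 // 256 = 9 tokens per expert, padded to 16.
assert get_tile_tokens_dim(300, 8, 256) == 16
# A single decode token clamps to the kernel's minimum tile of 8.
assert get_tile_tokens_dim(1, 8, 256) == 8
# Very large batches clamp to the maximum tile of 64.
assert get_tile_tokens_dim(100_000, 8, 256) == 64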
@@ -1028,7 +1039,7 @@ class Fp8MoEMethod(FusedMoEMethodBase):
             from sglang.srt.layers.moe.cutlass_moe import cutlass_fused_experts_fp8
 
             topk_weights, topk_ids, _ = topk_output
-
+            output = cutlass_fused_experts_fp8(
                 x,
                 layer.w13_weight.transpose(1, 2),
                 layer.w2_weight.transpose(1, 2),
@@ -1051,6 +1062,10 @@ class Fp8MoEMethod(FusedMoEMethodBase):
                 self.problem_sizes2,
                 use_fp8_blockscale=True,
             )
+            # TODO: Fuse into select_experts
+            if routed_scaling_factor is not None:
+                output *= routed_scaling_factor
+            return output
         # Expert fusion with FP8 quantization
         return fused_experts(
             x,
@@ -1076,6 +1091,47 @@ class Fp8MoEMethod(FusedMoEMethodBase):
             routed_scaling_factor=routed_scaling_factor,
         )
 
+    def apply_with_router_logits(
+        self,
+        layer: torch.nn.Module,
+        x: torch.Tensor,
+        router_logits: torch.Tensor,
+        *,
+        activation: str = "silu",
+        routed_scaling_factor: Optional[float] = None,
+    ) -> torch.Tensor:
+        assert (
+            activation == "silu"
+        ), "Only silu is supported for flashinfer blockscale fp8 moe"
+        a_q, a_sf = per_token_group_quant_fp8(x, self.quant_config.weight_block_size[1])
+        # NOTE: scales of hidden states have to be transposed!
+        a_sf_t = a_sf.t().contiguous()
+        from flashinfer.fused_moe import trtllm_fp8_block_scale_moe
+
+        return trtllm_fp8_block_scale_moe(
+            routing_logits=router_logits.to(torch.float32),
+            routing_bias=layer.correction_bias.to(x.dtype),
+            hidden_states=a_q,
+            hidden_states_scale=a_sf_t,
+            gemm1_weights=layer.w13_weight,
+            gemm1_weights_scale=layer.w13_weight_scale_inv,
+            gemm2_weights=layer.w2_weight,
+            gemm2_weights_scale=layer.w2_weight_scale_inv,
+            num_experts=layer.num_experts,
+            top_k=layer.top_k,
+            n_group=layer.num_expert_group,
+            topk_group=layer.topk_group,
+            intermediate_size=layer.w2_weight.shape[2],
+            local_expert_offset=layer.moe_ep_rank * layer.num_local_experts,
+            local_num_experts=layer.num_local_experts,
+            routed_scaling_factor=routed_scaling_factor,
+            tile_tokens_dim=get_tile_tokens_dim(
+                x.shape[0], layer.top_k, layer.num_experts
+            ),
+            routing_method_type=2,  # DeepSeek-styled routing method
+            use_shuffled_weight=False,
+        )
+
     def maybe_apply_hip_fused_experts(
         self,
         layer: torch.nn.Module,
sglang/srt/layers/quantization/fp8_kernel.py
CHANGED
@@ -354,10 +354,6 @@ def sglang_per_token_group_quant_fp8(
     ), "the last dimension of `x` cannot be divisible by `group_size`"
     assert x.is_contiguous(), "`x` is not contiguous"
 
-    if scale_ue8m0:
-        # TODO: handle this case by fixing the (token=4, dim=256, group_size=128) UT case
-        assert x.shape[-1] % (group_size * 4) == 0
-
     x_q = torch.empty_like(x, device=x.device, dtype=fp8_dtype)
     x_s = create_per_token_group_quant_fp8_output_scale(
         x_shape=x.shape,
sglang/srt/layers/quantization/w8a8_int8.py
CHANGED
@@ -231,7 +231,10 @@ class W8A8Int8Config(QuantizationConfig):
 
     @classmethod
     def get_config_filenames(cls) -> List[str]:
-
+        filenames = []
+        if _is_npu:
+            filenames.append("quant_model_description.json")
+        return filenames
 
     @classmethod
     def from_config(cls, config: Dict[str, Any]) -> W8A8Int8Config:
sglang/srt/layers/vocab_parallel_embedding.py
CHANGED
@@ -11,8 +11,12 @@ from sglang.srt.distributed import (
     divide,
     get_tensor_model_parallel_rank,
     get_tensor_model_parallel_world_size,
+    parallel_state,
     tensor_model_parallel_all_reduce,
 )
+from sglang.srt.distributed.device_communicators.pynccl_allocator import (
+    use_symmetric_memory,
+)
 from sglang.srt.layers.amx_utils import PackWeightMethod
 from sglang.srt.layers.dp_attention import get_attention_tp_rank, get_attention_tp_size
 from sglang.srt.layers.parameter import BasevLLMParameter
@@ -464,7 +468,9 @@ class VocabParallelEmbedding(torch.nn.Module):
         else:
             masked_input = input_
         # Get the embeddings.
-
+        with use_symmetric_memory(parallel_state.get_tp_group()) as sm:
+            output_parallel = self.quant_method.embedding(self, masked_input.long())
+            sm.tag(output_parallel)
         # Mask the output embedding.
         if self.tp_size > 1:
             output_parallel.masked_fill_(input_mask.unsqueeze(-1), 0)
sglang/srt/lora/lora_registry.py
CHANGED
@@ -186,3 +186,10 @@ class LoRARegistry:
         self._registry[lora_ref.lora_name] = lora_ref
         self._counters[lora_ref.lora_id] = ConcurrentCounter()
         return lora_ref
+
+    @property
+    def num_registered_loras(self) -> int:
+        """
+        Returns the total number of LoRA adapters currently registered.
+        """
+        return len(self._registry)