sglang 0.5.0rc2__py3-none-any.whl → 0.5.1.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (180)
  1. sglang/bench_one_batch.py +0 -6
  2. sglang/bench_one_batch_server.py +7 -2
  3. sglang/bench_serving.py +3 -3
  4. sglang/eval/llama3_eval.py +0 -1
  5. sglang/srt/configs/model_config.py +24 -9
  6. sglang/srt/configs/update_config.py +40 -5
  7. sglang/srt/constrained/xgrammar_backend.py +23 -11
  8. sglang/srt/conversation.py +2 -15
  9. sglang/srt/disaggregation/ascend/conn.py +1 -3
  10. sglang/srt/disaggregation/base/conn.py +1 -0
  11. sglang/srt/disaggregation/decode.py +1 -1
  12. sglang/srt/disaggregation/launch_lb.py +7 -1
  13. sglang/srt/disaggregation/mini_lb.py +11 -5
  14. sglang/srt/disaggregation/mooncake/conn.py +141 -47
  15. sglang/srt/disaggregation/prefill.py +261 -5
  16. sglang/srt/disaggregation/utils.py +2 -1
  17. sglang/srt/distributed/device_communicators/custom_all_reduce.py +1 -1
  18. sglang/srt/distributed/device_communicators/pynccl.py +68 -18
  19. sglang/srt/distributed/device_communicators/pynccl_wrapper.py +52 -0
  20. sglang/srt/distributed/naive_distributed.py +112 -0
  21. sglang/srt/distributed/parallel_state.py +90 -4
  22. sglang/srt/entrypoints/context.py +20 -1
  23. sglang/srt/entrypoints/engine.py +27 -2
  24. sglang/srt/entrypoints/http_server.py +12 -0
  25. sglang/srt/entrypoints/openai/protocol.py +2 -2
  26. sglang/srt/entrypoints/openai/serving_chat.py +22 -6
  27. sglang/srt/entrypoints/openai/serving_completions.py +9 -1
  28. sglang/srt/entrypoints/openai/serving_responses.py +2 -2
  29. sglang/srt/eplb/expert_distribution.py +2 -3
  30. sglang/srt/function_call/deepseekv3_detector.py +1 -1
  31. sglang/srt/hf_transformers_utils.py +24 -0
  32. sglang/srt/host_shared_memory.py +83 -0
  33. sglang/srt/layers/attention/ascend_backend.py +132 -22
  34. sglang/srt/layers/attention/flashattention_backend.py +24 -17
  35. sglang/srt/layers/attention/flashinfer_backend.py +11 -3
  36. sglang/srt/layers/attention/flashinfer_mla_backend.py +226 -76
  37. sglang/srt/layers/attention/triton_backend.py +85 -46
  38. sglang/srt/layers/attention/triton_ops/decode_attention.py +33 -2
  39. sglang/srt/layers/attention/triton_ops/extend_attention.py +32 -2
  40. sglang/srt/layers/attention/trtllm_mha_backend.py +390 -30
  41. sglang/srt/layers/attention/trtllm_mla_backend.py +39 -16
  42. sglang/srt/layers/attention/utils.py +94 -15
  43. sglang/srt/layers/attention/vision.py +40 -13
  44. sglang/srt/layers/attention/vision_utils.py +65 -0
  45. sglang/srt/layers/communicator.py +51 -3
  46. sglang/srt/layers/dp_attention.py +23 -4
  47. sglang/srt/layers/elementwise.py +94 -0
  48. sglang/srt/layers/flashinfer_comm_fusion.py +29 -1
  49. sglang/srt/layers/layernorm.py +8 -1
  50. sglang/srt/layers/linear.py +24 -0
  51. sglang/srt/layers/logits_processor.py +5 -1
  52. sglang/srt/layers/moe/__init__.py +31 -0
  53. sglang/srt/layers/moe/ep_moe/layer.py +37 -33
  54. sglang/srt/layers/moe/fused_moe_native.py +14 -25
  55. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  56. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=129,N=352,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Max-Q_Workstation_Edition,dtype=fp8_w8a8.json +146 -0
  57. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=129,N=704,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +146 -0
  58. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=161,N=384,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Max-Q_Workstation_Edition,dtype=fp8_w8a8.json +146 -0
  59. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +69 -76
  60. sglang/srt/layers/moe/fused_moe_triton/layer.py +66 -123
  61. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +20 -18
  62. sglang/srt/layers/moe/moe_runner/__init__.py +3 -0
  63. sglang/srt/layers/moe/moe_runner/base.py +13 -0
  64. sglang/srt/layers/moe/rocm_moe_utils.py +141 -0
  65. sglang/srt/layers/moe/router.py +15 -9
  66. sglang/srt/layers/moe/token_dispatcher/__init__.py +6 -0
  67. sglang/srt/layers/moe/token_dispatcher/base_dispatcher.py +55 -14
  68. sglang/srt/layers/moe/token_dispatcher/deepep.py +11 -21
  69. sglang/srt/layers/moe/token_dispatcher/standard.py +1 -1
  70. sglang/srt/layers/moe/topk.py +167 -83
  71. sglang/srt/layers/moe/utils.py +159 -18
  72. sglang/srt/layers/quantization/__init__.py +13 -14
  73. sglang/srt/layers/quantization/awq.py +7 -7
  74. sglang/srt/layers/quantization/base_config.py +2 -6
  75. sglang/srt/layers/quantization/blockwise_int8.py +4 -12
  76. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +72 -28
  77. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +5 -0
  78. sglang/srt/layers/quantization/fp8.py +127 -119
  79. sglang/srt/layers/quantization/fp8_kernel.py +195 -24
  80. sglang/srt/layers/quantization/fp8_utils.py +34 -9
  81. sglang/srt/layers/quantization/fpgemm_fp8.py +203 -0
  82. sglang/srt/layers/quantization/gptq.py +5 -4
  83. sglang/srt/layers/quantization/marlin_utils.py +11 -3
  84. sglang/srt/layers/quantization/marlin_utils_fp8.py +352 -0
  85. sglang/srt/layers/quantization/modelopt_quant.py +165 -68
  86. sglang/srt/layers/quantization/moe_wna16.py +10 -15
  87. sglang/srt/layers/quantization/mxfp4.py +206 -37
  88. sglang/srt/layers/quantization/quark/quark.py +390 -0
  89. sglang/srt/layers/quantization/quark/quark_moe.py +197 -0
  90. sglang/srt/layers/quantization/unquant.py +34 -70
  91. sglang/srt/layers/quantization/utils.py +25 -0
  92. sglang/srt/layers/quantization/w4afp8.py +7 -8
  93. sglang/srt/layers/quantization/w8a8_fp8.py +5 -13
  94. sglang/srt/layers/quantization/w8a8_int8.py +5 -13
  95. sglang/srt/layers/radix_attention.py +6 -0
  96. sglang/srt/layers/rotary_embedding.py +1 -0
  97. sglang/srt/lora/lora_manager.py +21 -22
  98. sglang/srt/lora/lora_registry.py +3 -3
  99. sglang/srt/lora/mem_pool.py +26 -24
  100. sglang/srt/lora/utils.py +10 -12
  101. sglang/srt/managers/cache_controller.py +76 -18
  102. sglang/srt/managers/detokenizer_manager.py +10 -2
  103. sglang/srt/managers/io_struct.py +9 -0
  104. sglang/srt/managers/mm_utils.py +1 -1
  105. sglang/srt/managers/schedule_batch.py +4 -9
  106. sglang/srt/managers/scheduler.py +25 -16
  107. sglang/srt/managers/session_controller.py +1 -1
  108. sglang/srt/managers/template_manager.py +7 -5
  109. sglang/srt/managers/tokenizer_manager.py +60 -21
  110. sglang/srt/managers/tp_worker.py +1 -0
  111. sglang/srt/managers/utils.py +59 -1
  112. sglang/srt/mem_cache/allocator.py +7 -5
  113. sglang/srt/mem_cache/allocator_ascend.py +0 -11
  114. sglang/srt/mem_cache/hicache_storage.py +14 -4
  115. sglang/srt/mem_cache/memory_pool.py +3 -3
  116. sglang/srt/mem_cache/memory_pool_host.py +35 -2
  117. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +56 -12
  118. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +8 -4
  119. sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +153 -59
  120. sglang/srt/mem_cache/storage/nixl/nixl_utils.py +19 -53
  121. sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +46 -7
  122. sglang/srt/model_executor/cuda_graph_runner.py +25 -12
  123. sglang/srt/model_executor/forward_batch_info.py +4 -1
  124. sglang/srt/model_executor/model_runner.py +43 -32
  125. sglang/srt/model_executor/npu_graph_runner.py +94 -0
  126. sglang/srt/model_loader/loader.py +24 -6
  127. sglang/srt/models/dbrx.py +12 -6
  128. sglang/srt/models/deepseek.py +2 -1
  129. sglang/srt/models/deepseek_nextn.py +3 -1
  130. sglang/srt/models/deepseek_v2.py +224 -223
  131. sglang/srt/models/ernie4.py +2 -2
  132. sglang/srt/models/glm4_moe.py +25 -63
  133. sglang/srt/models/glm4v.py +52 -1
  134. sglang/srt/models/glm4v_moe.py +8 -11
  135. sglang/srt/models/gpt_oss.py +34 -74
  136. sglang/srt/models/granitemoe.py +0 -1
  137. sglang/srt/models/grok.py +375 -51
  138. sglang/srt/models/interns1.py +12 -47
  139. sglang/srt/models/internvl.py +6 -51
  140. sglang/srt/models/llama4.py +0 -2
  141. sglang/srt/models/minicpm3.py +0 -1
  142. sglang/srt/models/mixtral.py +0 -2
  143. sglang/srt/models/nemotron_nas.py +435 -0
  144. sglang/srt/models/olmoe.py +0 -1
  145. sglang/srt/models/phi4mm.py +3 -21
  146. sglang/srt/models/qwen2_5_vl.py +2 -0
  147. sglang/srt/models/qwen2_moe.py +3 -18
  148. sglang/srt/models/qwen3.py +2 -2
  149. sglang/srt/models/qwen3_classification.py +7 -1
  150. sglang/srt/models/qwen3_moe.py +9 -38
  151. sglang/srt/models/step3_vl.py +2 -1
  152. sglang/srt/models/xverse_moe.py +11 -5
  153. sglang/srt/multimodal/processors/base_processor.py +3 -3
  154. sglang/srt/multimodal/processors/internvl.py +7 -2
  155. sglang/srt/multimodal/processors/llava.py +11 -7
  156. sglang/srt/offloader.py +433 -0
  157. sglang/srt/operations.py +6 -1
  158. sglang/srt/reasoning_parser.py +4 -3
  159. sglang/srt/server_args.py +237 -104
  160. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +1 -0
  161. sglang/srt/speculative/eagle_utils.py +36 -13
  162. sglang/srt/speculative/eagle_worker.py +56 -3
  163. sglang/srt/tokenizer/tiktoken_tokenizer.py +161 -0
  164. sglang/srt/two_batch_overlap.py +16 -11
  165. sglang/srt/utils.py +68 -70
  166. sglang/test/runners.py +8 -5
  167. sglang/test/test_block_fp8.py +5 -6
  168. sglang/test/test_block_fp8_ep.py +13 -19
  169. sglang/test/test_cutlass_moe.py +4 -6
  170. sglang/test/test_cutlass_w4a8_moe.py +4 -3
  171. sglang/test/test_fp4_moe.py +4 -3
  172. sglang/test/test_utils.py +7 -0
  173. sglang/utils.py +0 -1
  174. sglang/version.py +1 -1
  175. {sglang-0.5.0rc2.dist-info → sglang-0.5.1.post1.dist-info}/METADATA +7 -7
  176. {sglang-0.5.0rc2.dist-info → sglang-0.5.1.post1.dist-info}/RECORD +179 -161
  177. sglang/srt/layers/quantization/fp4.py +0 -557
  178. {sglang-0.5.0rc2.dist-info → sglang-0.5.1.post1.dist-info}/WHEEL +0 -0
  179. {sglang-0.5.0rc2.dist-info → sglang-0.5.1.post1.dist-info}/licenses/LICENSE +0 -0
  180. {sglang-0.5.0rc2.dist-info → sglang-0.5.1.post1.dist-info}/top_level.txt +0 -0
sglang/srt/layers/moe/fused_moe_triton/layer.py
@@ -1,10 +1,6 @@
  # Adapted from https://github.com/vllm-project/vllm/blob/a6221a144af772fd1a68fe7e627935dc53e81738/vllm/model_executor/layers/fused_moe/layer.py

- import datetime
- import glob
  import logging
- import os
- import sys
  from enum import Enum
  from typing import List, Optional, Tuple

@@ -22,12 +18,18 @@ from sglang.srt.distributed.device_communicators.pynccl_allocator import (
  use_symmetric_memory,
  )
  from sglang.srt.eplb.expert_location import get_global_expert_location_metadata
- from sglang.srt.layers.moe.topk import StandardTopKOutput
- from sglang.srt.layers.moe.utils import should_use_flashinfer_trtllm_moe
+ from sglang.srt.layers.moe import (
+ MoeRunnerConfig,
+ get_moe_runner_backend,
+ should_use_flashinfer_trtllm_moe,
+ )
+ from sglang.srt.layers.moe.topk import TopKOutput, TopKOutputChecker
  from sglang.srt.layers.quantization.base_config import (
  QuantizationConfig,
  QuantizeMethodBase,
  )
+ from sglang.srt.layers.quantization.fp8 import Fp8MoEMethod
+ from sglang.srt.layers.quantization.modelopt_quant import ModelOptNvFp4FusedMoEMethod
  from sglang.srt.layers.quantization.unquant import UnquantizedFusedMoEMethod
  from sglang.srt.managers.schedule_batch import global_server_args_dict
  from sglang.srt.model_loader.weight_utils import narrow_padded_param_and_loaded_weight
@@ -109,9 +111,8 @@ class FusedMoE(torch.nn.Module):
  hidden_size: Input hidden state size of the transformer
  intermediate_size: Intermediate size of the experts
  params_dtype: Data type for the parameters.
- reduce_results: Whether to all all_reduce on the output of the layer
- renomalize: Whether to renormalize the logits in the fused_moe kernel
- quant_config: Quantization configure.
+ reduce_results: Whether to apply all_reduce on the output of the layer
+ quant_config: Quantization configuration.
  inplace: suggestion to compute inplace (modify input activation).
  """

@@ -126,7 +127,6 @@
  params_dtype: Optional[torch.dtype] = None,
  reduce_results: bool = False,
  quant_config: Optional[QuantizationConfig] = None,
- tp_size: Optional[int] = None,
  prefix: str = "",
  activation: str = "silu",
  apply_router_weight_on_input: bool = False,
@@ -134,9 +134,8 @@
  inplace: bool = True,
  no_combine: bool = False,
  routed_scaling_factor: Optional[float] = None,
- enable_flashinfer_cutlass_moe: Optional[bool] = False,
- activation_alpha: Optional[float] = None,
- swiglu_limit: Optional[float] = None,
+ gemm1_alpha: Optional[float] = None,
+ gemm1_clamp_limit: Optional[float] = None,
  use_weight_loader_fused: bool = False,
  with_bias=False,
  ):
@@ -153,9 +152,17 @@
  self.expert_map_cpu = None
  self.expert_map_gpu = None

- # For activation
- self.activation_alpha = activation_alpha
- self.swiglu_limit = swiglu_limit
+ self.moe_runner_config = MoeRunnerConfig(
+ activation=activation,
+ apply_router_weight_on_input=apply_router_weight_on_input,
+ inplace=inplace,
+ no_combine=no_combine,
+ routed_scaling_factor=routed_scaling_factor,
+ gemm1_alpha=gemm1_alpha,
+ gemm1_clamp_limit=gemm1_clamp_limit,
+ )
+
+ enable_flashinfer_cutlass_moe = get_moe_runner_backend().is_flashinfer_cutlass()

  if enable_flashinfer_cutlass_moe and quant_config is None:
  logger.warning("Disable flashinfer MoE when quantization config is None.")
@@ -174,9 +181,6 @@
  self.expert_map_cpu = torch.full(
  (self.num_experts,), -1, dtype=torch.int32, device="cpu"
  )
- self.expert_map_cpu = torch.full(
- (self.num_experts,), -1, dtype=torch.int32, device="cpu"
- )
  # Create a expert map for the local experts
  self.expert_map_cpu[
  self.moe_ep_rank
@@ -184,20 +188,12 @@
  * self.num_local_experts
  ] = torch.arange(0, self.num_local_experts, dtype=torch.int32, device="cpu")

- self.routed_scaling_factor = routed_scaling_factor
  assert intermediate_size % self.moe_tp_size == 0
  self.intermediate_size_per_partition = intermediate_size // self.moe_tp_size
  self.reduce_results = reduce_results
- self.activation = activation
- self.apply_router_weight_on_input = apply_router_weight_on_input
  self.use_presharded_weights = use_presharded_weights
- self.inplace = inplace
- self.no_combine = no_combine
-
- self.use_triton_kernels = (
- not _is_cpu and global_server_args_dict["enable_triton_kernel_moe"]
- )

+ self.use_triton_kernels = get_moe_runner_backend().is_triton_kernel()
  if quant_config is None:
  self.quant_method: Optional[QuantizeMethodBase] = UnquantizedFusedMoEMethod(
  self.use_triton_kernels
@@ -207,14 +203,12 @@
  assert self.quant_method is not None

  self.quant_config = quant_config
- self.use_enable_flashinfer_mxfp4_moe = global_server_args_dict.get(
- "enable_flashinfer_mxfp4_moe", False
- )
+ self.use_flashinfer_mxfp4_moe = get_moe_runner_backend().is_flashinfer_mxfp4()
  # TODO maybe we should remove this `if`, since `Mxfp4MoEMethod` does another round-up logic
  if (
  self.quant_config is not None
  and self.quant_config.get_name() == "mxfp4"
- and self.use_enable_flashinfer_mxfp4_moe
+ and self.use_flashinfer_mxfp4_moe
  ):
  hidden_size = round_up(hidden_size, 256)
  self.quant_method.create_weights(
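The hunks above drop the ad-hoc flags (the `enable_flashinfer_cutlass_moe` constructor argument, `global_server_args_dict["enable_triton_kernel_moe"]`, `enable_flashinfer_mxfp4_moe`) in favor of a single `get_moe_runner_backend()` query with `is_flashinfer_cutlass()`, `is_triton_kernel()`, and `is_flashinfer_mxfp4()` predicates. A minimal sketch of that predicate-enum pattern, with placeholder member names and a hard-coded selection rather than sglang's actual backend registry:

```python
from enum import Enum


class MoeRunnerBackend(Enum):
    # Member names are illustrative; the real registry lives in
    # sglang.srt.layers.moe.utils and may differ.
    AUTO = "auto"
    TRITON_KERNEL = "triton_kernel"
    FLASHINFER_CUTLASS = "flashinfer_cutlass"
    FLASHINFER_MXFP4 = "flashinfer_mxfp4"

    def is_triton_kernel(self) -> bool:
        return self is MoeRunnerBackend.TRITON_KERNEL

    def is_flashinfer_cutlass(self) -> bool:
        return self is MoeRunnerBackend.FLASHINFER_CUTLASS

    def is_flashinfer_mxfp4(self) -> bool:
        return self is MoeRunnerBackend.FLASHINFER_MXFP4


# Hypothetical module-level selection; in sglang this is derived from server args.
_MOE_RUNNER_BACKEND = MoeRunnerBackend.TRITON_KERNEL


def get_moe_runner_backend() -> MoeRunnerBackend:
    return _MOE_RUNNER_BACKEND


print(get_moe_runner_backend().is_triton_kernel())  # True for the default above
```

Centralizing the choice this way lets each call site ask one object a yes/no question instead of re-reading raw server arguments.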
@@ -477,6 +471,7 @@
  not expert_id
  and self.quant_config is not None
  and self.quant_config.get_name() == "mxfp4"
+ and self.quant_config.is_static_cfg()
  ):
  if "bias" in weight_name:
  dim1 = loaded_weight.shape[1]
@@ -625,9 +620,7 @@

  if "ModelOpt" in self.quant_method.__class__.__name__:
  # Determine per-tensor weight scale patterns based on variant
- is_fp4_variant = (
- "ModelOptNvFp4FusedMoEMethod" in self.quant_method.__class__.__name__
- )
+ is_fp4_variant = isinstance(self.quant_method, ModelOptNvFp4FusedMoEMethod)

  # FP4 uses "weight_scale_2" for per-tensor, FP8 uses "weight_scale" for per-tensor
  per_tensor_conditions = (
@@ -729,7 +722,11 @@
  ) -> None:
  tp_rank = self.moe_tp_rank

- if self.quant_config is not None and self.quant_config.get_name() == "mxfp4":
+ if (
+ self.quant_config is not None
+ and self.quant_config.get_name() == "mxfp4"
+ and self.quant_config.is_static_cfg()
+ ):
  if "bias" in weight_name:
  dim1 = loaded_weight.shape[1]
  param.data[:, :dim1].copy_(loaded_weight)
@@ -794,7 +791,7 @@
  f"Unsupported weight_name {weight_name} for FusedMoE weight_loader_fused. Nothing is loaded."
  )

- def forward(self, hidden_states: torch.Tensor, topk_output: StandardTopKOutput):
+ def forward(self, hidden_states: torch.Tensor, topk_output: TopKOutput):
  origin_hidden_states_dim = hidden_states.shape[-1]
  assert self.quant_method is not None

@@ -803,40 +800,22 @@
  # If we are in EP mode, we need to move the expert map to GPU.
  self.expert_map_gpu = self.expert_map_cpu.to(device="cuda")

- if self.expert_map_gpu is not None and isinstance(
- topk_output, StandardTopKOutput
- ):
- topk_output = topk_output._replace(
- topk_ids=self.expert_map_gpu[topk_output.topk_ids]
- )
+ if self.expert_map_gpu is not None:
+ if TopKOutputChecker.format_is_standard(topk_output):
+ topk_output = topk_output._replace(
+ topk_ids=self.expert_map_gpu[topk_output.topk_ids]
+ )
+ elif TopKOutputChecker.format_is_triton_kernel(topk_output):
+ raise NotImplementedError()

  # Matrix multiply.
  with use_symmetric_memory(get_tp_group()) as sm:
- kwargs = {}
- if self.activation_alpha is not None:
- kwargs["activation_alpha"] = self.activation_alpha
- if self.swiglu_limit is not None:
- kwargs["swiglu_limit"] = self.swiglu_limit

  final_hidden_states = self.quant_method.apply(
  layer=self,
  x=hidden_states,
  topk_output=topk_output,
- activation=self.activation,
- apply_router_weight_on_input=self.apply_router_weight_on_input,
- routed_scaling_factor=self.routed_scaling_factor,
- **(
- dict(
- tp_rank=self.moe_tp_rank,
- tp_size=self.moe_tp_size,
- ep_rank=self.moe_ep_rank,
- ep_size=self.moe_ep_size,
- )
- if self.quant_method.__class__.__name__
- == "ModelOptNvFp4FusedMoEMethod"
- else {}
- ),
- **kwargs,
+ moe_runner_config=self.moe_runner_config,
  )
  sm.tag(final_hidden_states)

@@ -941,53 +920,39 @@
  for shard_id in ["w1", "w2", "w3"]
  ]

+ def should_fuse_routed_scaling_factor_in_topk(self):
+ return isinstance(self.quant_method, ModelOptNvFp4FusedMoEMethod) or (
+ isinstance(self.quant_method, Fp8MoEMethod)
+ and self.quant_method.use_cutlass_fused_experts_fp8
+ )
+

  class FlashInferFusedMoE(FusedMoE):
  def __init__(self, *args, **kwargs):
- renormalize = kwargs.pop("renormalize", True)
- num_fused_shared_experts = kwargs.pop("num_fused_shared_experts", 0)
- use_grouped_topk = kwargs.pop("use_grouped_topk", False)
- num_expert_group = kwargs.pop("num_expert_group", None)
- topk_group = kwargs.pop("topk_group", None)
- correction_bias = kwargs.pop("correction_bias", None)
  super().__init__(*args, **kwargs)
- self.renormalize = renormalize
- self.num_fused_shared_experts = num_fused_shared_experts
- self.use_grouped_topk = use_grouped_topk
- if self.use_grouped_topk:
- assert num_expert_group is not None and topk_group is not None
- self.num_expert_group = num_expert_group
- self.topk_group = topk_group
- self.correction_bias = correction_bias
  self.use_flashinfer_trtllm_moe = should_use_flashinfer_trtllm_moe()

- def forward(self, hidden_states: torch.Tensor, topk_output: tuple):
+ def forward(self, hidden_states: torch.Tensor, topk_output: TopKOutput):
  assert self.use_flashinfer_trtllm_moe
  assert (
- self.activation == "silu"
+ self.moe_runner_config.activation == "silu"
  ), "Only silu is supported for flashinfer blockscale fp8 moe"
  assert self.quant_method is not None
  assert (
- self.renormalize
+ topk_output.topk_config.renormalize
  ), "Renormalize is required for flashinfer blockscale fp8 moe"
  assert (
  self.num_fused_shared_experts == 0
  ), "Fused shared experts are not supported for flashinfer blockscale fp8 moe"

- # TRTLLM mode expects (TopK_config, router_logits) tuple
- if not isinstance(topk_output, tuple) or len(topk_output) != 2:
- raise ValueError(
- f"FlashInferFusedMoE expects (TopK_config, router_logits) tuple, got {type(topk_output)}"
- )
- _, router_logits = topk_output
+ assert TopKOutputChecker.format_is_bypassed(topk_output)

  # Matrix multiply.
  final_hidden_states = self.quant_method.apply_with_router_logits(
  layer=self,
  x=hidden_states,
- router_logits=router_logits,
- activation=self.activation,
- routed_scaling_factor=self.routed_scaling_factor,
+ topk_output=topk_output,
+ moe_runner_config=self.moe_runner_config,
  )

  if self.reduce_results and (self.moe_tp_size > 1 or self.moe_ep_size > 1):
@@ -1000,28 +965,8 @@ class FlashInferFP4MoE(FusedMoE):
  """FP4 TRTLLM MoE implementation using FlashInfer."""

  def __init__(self, *args, **kwargs):
- # Extract DeepSeek-specific parameters
- renormalize = kwargs.pop("renormalize", True)
- num_fused_shared_experts = kwargs.pop("num_fused_shared_experts", 0)
- use_grouped_topk = kwargs.pop("use_grouped_topk", False)
- num_expert_group = kwargs.pop("num_expert_group", None)
- topk_group = kwargs.pop("topk_group", None)
- correction_bias = kwargs.pop("correction_bias", None)
-
- # Extract additional TopK parameters that were previously extracted in forward
- routed_scaling_factor = kwargs.pop("routed_scaling_factor", None)
-
  super().__init__(*args, **kwargs)

- # Store DeepSeek parameters
- self.renormalize = renormalize
- self.num_fused_shared_experts = num_fused_shared_experts
- self.use_grouped_topk = use_grouped_topk
- self.num_expert_group = num_expert_group
- self.topk_group = topk_group
- self.correction_bias = correction_bias
- self.routed_scaling_factor = routed_scaling_factor
-
  # ---------------------------------------------------------------------
  # Helper: quantize hidden states to FP4 each forward pass
  # ---------------------------------------------------------------------
@@ -1052,21 +997,19 @@

  return hs_fp4, hs_sf

- def forward(self, hidden_states: torch.Tensor, topk_output):
+ def forward(self, hidden_states: torch.Tensor, topk_output: TopKOutput):
  """Forward pass using FP4 TRTLLM kernel.

  Args:
  hidden_states: Input tensor
- topk_output: Should be tuple of (TopK_config, router_logits) for TRTLLM mode
+ topk_output: TopKOutput object with Bypassed format
  """
+ assert isinstance(self.quant_method, ModelOptNvFp4FusedMoEMethod)

- # TRTLLM mode expects (TopK_config, router_logits) tuple
- if not isinstance(topk_output, tuple) or len(topk_output) != 2:
- raise ValueError(
- f"FlashInferFP4MoE expects (TopK_config, router_logits) tuple, got {type(topk_output)}"
- )
+ assert TopKOutputChecker.format_is_bypassed(topk_output)

- _, router_logits = topk_output
+ router_logits = topk_output.router_logits
+ topk_config = topk_output.topk_config

  hs_fp4, hs_scale_linear = self._quantize_hidden_states_fp4(hidden_states)

@@ -1074,7 +1017,7 @@

  result = trtllm_fp4_block_scale_moe(
  routing_logits=router_logits,
- routing_bias=self.correction_bias.to(hidden_states.dtype),
+ routing_bias=topk_config.correction_bias.to(hidden_states.dtype),
  hidden_states=hs_fp4,
  hidden_states_scale=hs_scale_linear.view(torch.float8_e4m3fn).flatten(),
  gemm1_weights=self.gemm1_weights_fp4_shuffled.data,
@@ -1094,15 +1037,15 @@
  output1_scale_gate_scalar=self.g1_alphas.data,
  output2_scale_scalar=self.g2_alphas.data,
  num_experts=self.num_experts,
- top_k=self.top_k,
- n_group=self.num_expert_group,
- topk_group=self.topk_group,
+ top_k=topk_config.top_k,
+ n_group=topk_config.num_expert_group,
+ topk_group=topk_config.topk_group,
  intermediate_size=self.intermediate_size_per_partition,
  local_expert_offset=self.moe_ep_rank * self.num_local_experts,
  local_num_experts=self.num_local_experts,
- routed_scaling_factor=self.routed_scaling_factor,
+ routed_scaling_factor=self.moe_runner_config.routed_scaling_factor,
  tile_tokens_dim=_get_tile_tokens_dim(
- hidden_states.shape[0], self.top_k, self.num_local_experts
+ hidden_states.shape[0], topk_config.top_k, self.num_local_experts
  ),
  routing_method_type=RoutingMethodType.DeepSeekV3,
  do_finalize=True,
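The `forward` paths above now branch on the format of `topk_output` through `TopKOutputChecker` instead of `isinstance`/tuple checks: standard outputs get their `topk_ids` remapped through the expert map for expert parallelism, while the FlashInfer subclasses require the "bypassed" format carrying the raw `router_logits` together with a `topk_config`. A rough, self-contained sketch of that checker/dispatch idea; the class and field definitions below are simplified stand-ins, not the actual ones in `sglang.srt.layers.moe.topk`:

```python
from typing import NamedTuple

import torch


class StandardTopKOutput(NamedTuple):
    # Simplified stand-in: per-token expert weights and ids.
    topk_weights: torch.Tensor
    topk_ids: torch.Tensor


class BypassedTopKOutput(NamedTuple):
    # Simplified stand-in: routing is deferred to the fused kernel.
    router_logits: torch.Tensor
    topk_config: dict  # the real code carries a TopKConfig object


class TopKOutputChecker:
    @staticmethod
    def format_is_standard(out) -> bool:
        return isinstance(out, StandardTopKOutput)

    @staticmethod
    def format_is_bypassed(out) -> bool:
        return isinstance(out, BypassedTopKOutput)


def remap_expert_ids(out, expert_map_gpu: torch.Tensor):
    """Mirror the EP remapping step in FusedMoE.forward (sketch only)."""
    if TopKOutputChecker.format_is_standard(out):
        return out._replace(topk_ids=expert_map_gpu[out.topk_ids])
    return out  # bypassed outputs carry raw logits; nothing to remap here


expert_map = torch.tensor([-1, 0, 1, -1], dtype=torch.int32)  # 2 local experts
out = StandardTopKOutput(
    topk_weights=torch.rand(2, 2),
    topk_ids=torch.tensor([[1, 2], [2, 1]]),
)
print(remap_expert_ids(out, expert_map).topk_ids)  # local ids: [[0, 1], [1, 0]]
```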
sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py
@@ -18,6 +18,7 @@ from triton_kernels.routing import GatherIndx, RoutingData, ScatterIndx
  from triton_kernels.swiglu import swiglu_fn

  if TYPE_CHECKING:
+ from sglang.srt.layers.moe.moe_runner import MoeRunnerConfig
  from sglang.srt.layers.moe.topk import TopKOutput


@@ -55,8 +56,7 @@
  w1: torch.Tensor,
  w2: torch.Tensor,
  topk_output: TopKOutput,
- inplace: bool = False,
- activation: str = "silu",
+ moe_runner_config: MoeRunnerConfig,
  apply_router_weight_on_input: bool = False,
  use_fp8_w8a8: bool = False,
  per_channel_quant: bool = False,
@@ -69,7 +69,10 @@
  block_shape: Optional[list[int]] = None,
  ) -> torch.Tensor:

- assert topk_output.format.is_triton_kernel()
+ from sglang.srt.layers.moe.topk import TopKOutputChecker
+
+ assert TopKOutputChecker.format_is_triton_kernel(topk_output)
+
  routing_data, gather_idx, scatter_idx = topk_output

  return triton_kernel_fused_experts(
@@ -79,8 +82,8 @@
  routing_data,
  gather_idx,
  scatter_idx,
- inplace=inplace,
- activation=activation,
+ inplace=False, # triton kernel doesn't support inplace
+ activation=moe_runner_config.activation,
  apply_router_weight_on_input=apply_router_weight_on_input,
  use_fp8_w8a8=use_fp8_w8a8,
  per_channel_quant=per_channel_quant,
@@ -192,8 +195,7 @@
  w2_pcg,
  b2: torch.Tensor,
  topk_output: TopKOutput,
- inplace: bool = False,
- activation: str = "silu",
+ moe_runner_config: MoeRunnerConfig,
  use_fp8_w8a8: bool = False,
  per_channel_quant: bool = False,
  global_num_experts: int = -1,
@@ -203,10 +205,11 @@
  a1_scale: Optional[torch.Tensor] = None,
  a2_scale: Optional[torch.Tensor] = None,
  block_shape: Optional[list[int]] = None,
- activation_alpha: Optional[float] = None,
- swiglu_limit: Optional[int] = None,
  ) -> torch.Tensor:
- assert topk_output.format.is_triton_kernel()
+ from sglang.srt.layers.moe.topk import TopKOutputChecker
+
+ assert TopKOutputChecker.format_is_triton_kernel(topk_output)
+
  routing_data, gather_idx, scatter_idx = topk_output

  return triton_kernel_fused_experts_with_bias(
@@ -220,8 +223,8 @@
  routing_data=routing_data,
  gather_indx=gather_idx,
  scatter_indx=scatter_idx,
- inplace=inplace,
- activation=activation,
+ inplace=False, # triton kernel doesn't support inplace
+ activation=moe_runner_config.activation,
  use_fp8_w8a8=use_fp8_w8a8,
  per_channel_quant=per_channel_quant,
  global_num_experts=global_num_experts,
@@ -231,8 +234,8 @@
  a1_scale=a1_scale,
  a2_scale=a2_scale,
  block_shape=block_shape,
- activation_alpha=activation_alpha,
- swiglu_limit=swiglu_limit,
+ gemm1_alpha=moe_runner_config.gemm1_alpha,
+ gemm1_clamp_limit=moe_runner_config.gemm1_clamp_limit,
  )


@@ -258,10 +261,9 @@
  a1_scale: Optional[torch.Tensor] = None,
  a2_scale: Optional[torch.Tensor] = None,
  block_shape: Optional[list[int]] = None,
- activation_alpha: Optional[float] = None,
- swiglu_limit: Optional[int] = None,
+ gemm1_alpha: Optional[float] = None,
+ gemm1_clamp_limit: Optional[float] = None,
  ) -> torch.Tensor:
- # print(f"here in triton moe with bias", b1.shape, b1.dtype, b2.shape, b2.dtype)
  assert use_fp8_w8a8 == False, "use_fp8_w8a8 is not supported"
  assert per_channel_quant == False, "per_channel_quant is not supported"
  assert expert_map == None, "expert_map is not supported"
@@ -307,7 +309,7 @@

  act = FusedActivation(
  FnSpecs("swiglu", swiglu_fn, ("alpha", "limit")),
- (activation_alpha, swiglu_limit),
+ (gemm1_alpha, gemm1_clamp_limit),
  2,
  )

sglang/srt/layers/moe/moe_runner/__init__.py (new file)
@@ -0,0 +1,3 @@
+ from sglang.srt.layers.moe.moe_runner.base import MoeRunnerConfig
+
+ __all__ = ["MoeRunnerConfig"]
sglang/srt/layers/moe/moe_runner/base.py (new file)
@@ -0,0 +1,13 @@
+ from dataclasses import dataclass
+ from typing import Optional
+
+
+ @dataclass
+ class MoeRunnerConfig:
+ activation: str = "silu"
+ apply_router_weight_on_input: bool = False
+ inplace: bool = True
+ no_combine: bool = False
+ routed_scaling_factor: Optional[float] = None
+ gemm1_alpha: Optional[float] = None
+ gemm1_clamp_limit: Optional[float] = None
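`MoeRunnerConfig` is the dataclass that the earlier constructor hunk builds once per layer and that `quant_method.apply(...)` now receives as a single `moe_runner_config` argument, replacing the old per-call `activation`/`inplace`/`routed_scaling_factor`/`activation_alpha`/`swiglu_limit` keyword plumbing. A small stand-alone usage sketch; the config is re-declared here only so the snippet runs by itself, and the consumer function and numeric values are placeholders:

```python
from dataclasses import dataclass
from typing import Optional


@dataclass
class MoeRunnerConfig:
    activation: str = "silu"
    apply_router_weight_on_input: bool = False
    inplace: bool = True
    no_combine: bool = False
    routed_scaling_factor: Optional[float] = None
    gemm1_alpha: Optional[float] = None
    gemm1_clamp_limit: Optional[float] = None


def apply_moe(moe_runner_config: MoeRunnerConfig) -> str:
    # Hypothetical consumer: reads everything from one object instead of
    # a pile of keyword arguments.
    assert moe_runner_config.activation in ("silu", "gelu")
    return moe_runner_config.activation


config = MoeRunnerConfig(
    activation="silu",
    routed_scaling_factor=2.5,  # placeholder value
    gemm1_clamp_limit=7.0,      # placeholder value
)
print(apply_moe(config))  # "silu"
```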
sglang/srt/layers/moe/rocm_moe_utils.py (new file)
@@ -0,0 +1,141 @@
+ # Adapted from https://github.com/vllm-project/vllm/blob/v0.9.1rc2/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py
+ # SPDX-License-Identifier: Apache-2.0
+ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+ from enum import IntEnum
+ from functools import cache
+ from typing import Optional
+
+ import torch
+
+ from sglang.srt.utils import direct_register_custom_op, get_bool_env_var, is_hip
+
+ _is_hip = is_hip()
+ _use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip
+
+
+ class ActivationMethod(IntEnum):
+ # This allows interfacing with AITER ActivationType enum
+ # without importing the ActivationType enum from AITER globally.
+ SILU = 0
+ GELU = 1
+
+
+ def rocm_aiter_asm_moe_tkw1_impl(
+ hidden_states: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ topk_weights: torch.Tensor,
+ topk_ids: torch.Tensor,
+ fc1_scale: Optional[torch.Tensor] = None,
+ fc2_scale: Optional[torch.Tensor] = None,
+ fc1_smooth_scale: Optional[torch.Tensor] = None,
+ fc2_smooth_scale: Optional[torch.Tensor] = None,
+ a16: bool = False,
+ per_tensor_quant_scale: Optional[torch.Tensor] = None,
+ expert_mask: Optional[torch.Tensor] = None,
+ activation_method: int = ActivationMethod.SILU.value,
+ ) -> torch.Tensor:
+
+ from aiter import ActivationType
+ from aiter.fused_moe_bf16_asm import asm_moe_tkw1
+
+ activation = ActivationType(activation_method)
+
+ return asm_moe_tkw1(
+ hidden_states,
+ w1,
+ w2,
+ topk_weights,
+ topk_ids,
+ fc1_scale=fc1_scale,
+ fc2_scale=fc2_scale,
+ fc1_smooth_scale=fc1_smooth_scale,
+ fc2_smooth_scale=fc2_smooth_scale,
+ a16=a16,
+ per_tensor_quant_scale=per_tensor_quant_scale,
+ expert_mask=expert_mask,
+ activation=activation,
+ )
+
+
+ def rocm_aiter_asm_moe_tkw1_fake(
+ hidden_states: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ topk_weights: torch.Tensor,
+ topk_ids: torch.Tensor,
+ fc1_scale: Optional[torch.Tensor] = None,
+ fc2_scale: Optional[torch.Tensor] = None,
+ fc1_smooth_scale: Optional[torch.Tensor] = None,
+ fc2_smooth_scale: Optional[torch.Tensor] = None,
+ a16: bool = False,
+ per_tensor_quant_scale: Optional[torch.Tensor] = None,
+ expert_mask: Optional[torch.Tensor] = None,
+ activation_method: int = ActivationMethod.SILU.value,
+ ) -> torch.Tensor:
+ return torch.empty_like(hidden_states)
+
+
+ if _use_aiter:
+
+ direct_register_custom_op(
+ op_name="rocm_aiter_asm_moe_tkw1",
+ op_func=rocm_aiter_asm_moe_tkw1_impl,
+ mutates_args=[],
+ fake_impl=rocm_aiter_asm_moe_tkw1_fake,
+ )
+
+
+ def rocm_fused_experts_tkw1(
+ hidden_states: torch.Tensor,
+ w1: torch.Tensor,
+ w2: torch.Tensor,
+ topk_weights: torch.Tensor,
+ topk_ids: torch.Tensor,
+ activation: str = "silu",
+ apply_router_weight_on_input: bool = False,
+ use_fp8_w8a8: bool = False,
+ per_channel_quant: bool = False,
+ w1_scale: Optional[torch.Tensor] = None,
+ w2_scale: Optional[torch.Tensor] = None,
+ a1_scale: Optional[torch.Tensor] = None,
+ a2_scale: Optional[torch.Tensor] = None,
+ block_shape: Optional[list[int]] = None,
+ ) -> torch.Tensor:
+
+ activation_method = (
+ ActivationMethod.SILU if activation == "silu" else ActivationMethod.GELU
+ )
+ # All AITER Fused MoE kernels are expecting the following datatypes
+ topk_weights = topk_weights.to(torch.float32)
+ topk_ids = topk_ids.to(torch.int32)
+
+ # w8a8 per-channel quantization
+ if per_channel_quant and apply_router_weight_on_input and use_fp8_w8a8:
+ # AITER tkw1 kernel for FP8 models with `apply_router_weight_on_input`
+ # This applies topk_weights on the GEMM output of the first FC layer
+ # rather than the second FC.
+ assert (
+ topk_weights.dim() == 2
+ ), "`topk_weights` should be in shape (num_tokens, topk)"
+ assert topk_weights.shape[-1] == 1, (
+ "Only support topk=1 when" " `apply_router_weight_on_input` is True"
+ )
+
+ return torch.ops.sglang.rocm_aiter_asm_moe_tkw1(
+ hidden_states,
+ w1,
+ w2,
+ topk_weights,
+ topk_ids,
+ fc1_scale=w1_scale,
+ fc2_scale=w2_scale,
+ fc1_smooth_scale=None,
+ fc2_smooth_scale=None,
+ a16=False,
+ per_tensor_quant_scale=None,
+ expert_mask=None,
+ activation_method=activation_method,
+ )
+ else:
+ assert False, "This should not be called."
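The new module registers the AITER `asm_moe_tkw1` kernel as a torch custom op through sglang's `direct_register_custom_op` helper, pairing the real implementation with a `*_fake` shape-only implementation so the op can be traced or compiled without executing the ROCm kernel. A rough equivalent of that impl/fake pairing using only the public `torch.library` API (assuming PyTorch >= 2.4; the `demo::scale` op below is a made-up example, not part of sglang):

```python
import torch


# Real implementation (stands in for the AITER asm_moe_tkw1 call above).
@torch.library.custom_op("demo::scale", mutates_args=())
def demo_scale(x: torch.Tensor, factor: float) -> torch.Tensor:
    return x * factor


# Shape/dtype-only implementation used during tracing, playing the role of
# rocm_aiter_asm_moe_tkw1_fake.
@demo_scale.register_fake
def _(x: torch.Tensor, factor: float) -> torch.Tensor:
    return torch.empty_like(x)


out = torch.ops.demo.scale(torch.ones(4), 2.0)
print(out)
```

On a ROCm build with AITER installed, the entry point above is `rocm_fused_experts_tkw1`, which only takes the tkw1 path for per-channel FP8 weights with `apply_router_weight_on_input=True` and topk of 1.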
sglang/srt/layers/moe/router.py
@@ -45,11 +45,14 @@ def fused_moe_router_kernel(
  logits = tl.sum((w_router.to(tl.float32) * x[None, :].to(tl.float32)), axis=-1)

  # logit softcap
- logits_scaled = logits / moe_softcapping
- exped = tl.exp(2 * logits_scaled)
- top = exped - 1
- bottom = exped + 1
- logits_softcapped = top / bottom * moe_softcapping
+ if moe_softcapping == 0:
+ logits_softcapped = logits
+ else:
+ logits_scaled = logits / moe_softcapping
+ exped = tl.exp(2 * logits_scaled)
+ top = exped - 1
+ bottom = exped + 1
+ logits_softcapped = top / bottom * moe_softcapping

  # Add bias after softcapping
  if is_correction_bias:
@@ -207,9 +210,12 @@
  b_ptrs += BLOCK_SIZE_K

  # 4. logit softcap
- logits_scaled = acc / moe_softcapping
- exped = tl.exp(2 * logits_scaled)
- logits_softcapped = (exped - 1) / (exped + 1) * moe_softcapping
+ if moe_softcapping == 0:
+ logits_softcapped = acc
+ else:
+ logits_scaled = acc / moe_softcapping
+ exped = tl.exp(2 * logits_scaled)
+ logits_softcapped = (exped - 1) / (exped + 1) * moe_softcapping

  # 5. top1
  arange_block_size_n = tl.arange(0, BLOCK_SIZE_N)[None, :]
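Both router hunks now skip the softcap entirely when `moe_softcapping == 0` (softcapping disabled) instead of dividing by zero; when enabled, the kernel computes `cap * tanh(logits / cap)` through the identity `tanh(t) = (exp(2t) - 1) / (exp(2t) + 1)`. A quick host-side check of that identity in plain PyTorch (not the Triton kernel itself):

```python
import torch


def softcap(logits: torch.Tensor, cap: float) -> torch.Tensor:
    # Same algebra as the kernel above; cap == 0 means "no softcapping".
    if cap == 0:
        return logits
    scaled = logits / cap
    exped = torch.exp(2 * scaled)
    return (exped - 1) / (exped + 1) * cap


x = torch.randn(8, 4)
assert torch.allclose(softcap(x, 30.0), 30.0 * torch.tanh(x / 30.0), atol=1e-6)
assert torch.equal(softcap(x, 0), x)  # disabled path passes logits through
```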
@@ -234,7 +240,7 @@

  # 7. handle topk == 2
  if topk == 2:
- cond_top2 = (arange_block_size_n < num_experts) and (
+ cond_top2 = (arange_block_size_n < num_experts) & (
  arange_block_size_n != top1[:, None]
  )
  top2 = tl.argmax(
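The last hunk swaps Python's short-circuit `and` for the elementwise `&` when building the top-2 candidate mask: the condition combines two tensors of per-expert booleans, so it needs the bitwise operator rather than a scalar truth test. The same distinction shown in plain PyTorch (illustrative; the kernel itself operates on Triton tensors):

```python
import torch

arange = torch.arange(8)[None, :]  # expert indices
top1 = torch.tensor([[3]])         # previously selected expert

# Elementwise mask, as the kernel needs: "valid expert AND not already top-1".
mask = (arange < 6) & (arange != top1)
print(mask)

# Using `and` instead would try to collapse each tensor to a single bool and
# raise: "Boolean value of Tensor with more than one element is ambiguous".
```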