sglang 0.4.8__py3-none-any.whl → 0.4.8.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/srt/configs/model_config.py +1 -0
- sglang/srt/conversation.py +1 -0
- sglang/srt/custom_op.py +7 -1
- sglang/srt/disaggregation/base/conn.py +2 -0
- sglang/srt/disaggregation/decode.py +1 -1
- sglang/srt/disaggregation/mooncake/conn.py +289 -48
- sglang/srt/disaggregation/mooncake/transfer_engine.py +31 -1
- sglang/srt/disaggregation/nixl/conn.py +94 -46
- sglang/srt/disaggregation/prefill.py +3 -2
- sglang/srt/disaggregation/utils.py +12 -11
- sglang/srt/entrypoints/engine.py +5 -3
- sglang/srt/entrypoints/openai/protocol.py +47 -4
- sglang/srt/entrypoints/openai/serving_chat.py +52 -76
- sglang/srt/entrypoints/openai/serving_completions.py +1 -0
- sglang/srt/entrypoints/openai/serving_embedding.py +1 -0
- sglang/srt/layers/activation.py +7 -0
- sglang/srt/layers/attention/flashattention_backend.py +24 -14
- sglang/srt/layers/layernorm.py +15 -0
- sglang/srt/layers/linear.py +18 -1
- sglang/srt/layers/logits_processor.py +12 -3
- sglang/srt/layers/moe/ep_moe/layer.py +79 -12
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +19 -2
- sglang/srt/layers/moe/fused_moe_native.py +7 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +7 -2
- sglang/srt/layers/moe/fused_moe_triton/layer.py +73 -14
- sglang/srt/layers/moe/topk.py +26 -0
- sglang/srt/layers/quantization/fp8_utils.py +5 -4
- sglang/srt/layers/rotary_embedding.py +103 -11
- sglang/srt/layers/vocab_parallel_embedding.py +14 -1
- sglang/srt/managers/expert_distribution.py +21 -0
- sglang/srt/managers/io_struct.py +10 -2
- sglang/srt/managers/multimodal_processors/base_processor.py +44 -9
- sglang/srt/managers/multimodal_processors/gemma3n.py +97 -0
- sglang/srt/managers/schedule_batch.py +9 -1
- sglang/srt/managers/scheduler.py +42 -6
- sglang/srt/model_executor/cuda_graph_runner.py +1 -1
- sglang/srt/model_executor/model_runner.py +5 -2
- sglang/srt/model_loader/loader.py +45 -10
- sglang/srt/model_loader/weight_utils.py +89 -0
- sglang/srt/models/deepseek_nextn.py +7 -4
- sglang/srt/models/deepseek_v2.py +147 -4
- sglang/srt/models/gemma3n_audio.py +949 -0
- sglang/srt/models/gemma3n_causal.py +1009 -0
- sglang/srt/models/gemma3n_mm.py +511 -0
- sglang/srt/models/hunyuan.py +771 -0
- sglang/srt/server_args.py +16 -2
- sglang/srt/two_batch_overlap.py +4 -1
- sglang/srt/utils.py +71 -0
- sglang/version.py +1 -1
- {sglang-0.4.8.dist-info → sglang-0.4.8.post1.dist-info}/METADATA +1 -1
- {sglang-0.4.8.dist-info → sglang-0.4.8.post1.dist-info}/RECORD +54 -49
- {sglang-0.4.8.dist-info → sglang-0.4.8.post1.dist-info}/WHEEL +0 -0
- {sglang-0.4.8.dist-info → sglang-0.4.8.post1.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.8.dist-info → sglang-0.4.8.post1.dist-info}/top_level.txt +0 -0
sglang/srt/entrypoints/openai/serving_chat.py
CHANGED

@@ -22,6 +22,7 @@ from sglang.srt.entrypoints.openai.protocol import (
     ErrorResponse,
     FunctionResponse,
     LogProbs,
+    MessageProcessingResult,
     ToolCall,
     TopLogprob,
 )
@@ -62,120 +63,81 @@ class OpenAIServingChat(OpenAIServingBase):
         is_multimodal = self.tokenizer_manager.model_config.is_multimodal
 
         # Process messages and apply chat template
-        (
-            prompt,
-            prompt_ids,
-            image_data,
-            audio_data,
-            modalities,
-            stop,
-            tool_call_constraint,
-        ) = self._process_messages(request, is_multimodal)
+        processed_messages = self._process_messages(request, is_multimodal)
 
         # Build sampling parameters
         sampling_params = self._build_sampling_params(
-            request, stop, tool_call_constraint
+            request, processed_messages.stop, processed_messages.tool_call_constraint
         )
 
         # Handle single vs multiple requests
         if is_multimodal:
-            prompt_kwargs = {"text": prompt}
+            prompt_kwargs = {"text": processed_messages.prompt}
         else:
-            if isinstance(prompt_ids, str):
-                prompt_kwargs = {"text": prompt_ids}
+            if isinstance(processed_messages.prompt_ids, str):
+                prompt_kwargs = {"text": processed_messages.prompt_ids}
             else:
-                prompt_kwargs = {"input_ids": prompt_ids}
+                prompt_kwargs = {"input_ids": processed_messages.prompt_ids}
 
         adapted_request = GenerateReqInput(
             **prompt_kwargs,
-            image_data=image_data,
-            audio_data=audio_data,
+            image_data=processed_messages.image_data,
+            audio_data=processed_messages.audio_data,
             sampling_params=sampling_params,
             return_logprob=request.logprobs,
             logprob_start_len=-1,
             top_logprobs_num=request.top_logprobs or 0,
             stream=request.stream,
             return_text_in_logprobs=True,
-            modalities=modalities,
+            modalities=processed_messages.modalities,
             lora_path=request.lora_path,
             bootstrap_host=request.bootstrap_host,
             bootstrap_port=request.bootstrap_port,
             bootstrap_room=request.bootstrap_room,
             return_hidden_states=request.return_hidden_states,
+            rid=request.rid,
         )
 
         return adapted_request, request
 
     def _process_messages(
         self, request: ChatCompletionRequest, is_multimodal: bool
-    ) -> tuple[
-        str,
-        Union[str, List[int]],
-        Optional[Any],
-        Optional[Any],
-        List[str],
-        List[str],
-        Optional[Any],
-    ]:
+    ) -> MessageProcessingResult:
         """Process chat messages and apply chat template"""
         tool_call_constraint = None
-        prompt = ""
-        prompt_ids = []
 
-        if not isinstance(request.messages, str):
-            # Apply chat template and its stop strings
-            tools = None
-            if request.tools and request.tool_choice != "none":
-                request.skip_special_tokens = False
-                if not isinstance(request.tool_choice, str):
-                    tools = [
-                        item.function.model_dump()
-                        for item in request.tools
-                        if item.function.name == request.tool_choice.function.name
-                    ]
-                else:
-                    tools = [item.function.model_dump() for item in request.tools]
+        # Apply chat template and its stop strings
+        tools = None
+        if request.tools and request.tool_choice != "none":
+            request.skip_special_tokens = False
+            if not isinstance(request.tool_choice, str):
+                tools = [
+                    item.function.model_dump()
+                    for item in request.tools
+                    if item.function.name == request.tool_choice.function.name
+                ]
+            else:
+                tools = [item.function.model_dump() for item in request.tools]
 
-                tool_call_parser = self.tokenizer_manager.server_args.tool_call_parser
-                parser = FunctionCallParser(request.tools, tool_call_parser)
-                tool_call_constraint = parser.get_structure_constraint(
-                    request.tool_choice
-                )
+            tool_call_parser = self.tokenizer_manager.server_args.tool_call_parser
+            parser = FunctionCallParser(request.tools, tool_call_parser)
+            tool_call_constraint = parser.get_structure_constraint(request.tool_choice)
 
-            if self.template_manager.chat_template_name is None:
-                prompt, prompt_ids, image_data, audio_data, modalities, stop = (
-                    self._apply_jinja_template(request, tools, is_multimodal)
-                )
-            else:
-                prompt, prompt_ids, image_data, audio_data, modalities, stop = (
-                    self._apply_conversation_template(request, is_multimodal)
-                )
+        # Use chat template
+        if self.template_manager.chat_template_name is None:
+            result = self._apply_jinja_template(request, tools, is_multimodal)
         else:
-            audio_data = None
-            modalities = []
-            prompt = request.messages
-
-        return (
-            prompt,
-            prompt_ids,
-            image_data,
-            audio_data,
-            modalities,
-            stop,
-            tool_call_constraint,
-        )
+            result = self._apply_conversation_template(request, is_multimodal)
+
+        result.tool_call_constraint = tool_call_constraint
+        return result
 
     def _apply_jinja_template(
         self,
         request: ChatCompletionRequest,
         tools: Optional[List[Dict]],
         is_multimodal: bool,
-    ) ->
+    ) -> MessageProcessingResult:
         """Apply Jinja chat template"""
         prompt = ""
         prompt_ids = []
@@ -253,13 +215,20 @@ class OpenAIServingChat(OpenAIServingBase):
         image_data = image_data if image_data else None
         audio_data = audio_data if audio_data else None
         modalities = modalities if modalities else []
-        return prompt, prompt_ids, image_data, audio_data, modalities, stop
+        return MessageProcessingResult(
+            prompt=prompt,
+            prompt_ids=prompt_ids,
+            image_data=image_data,
+            audio_data=audio_data,
+            modalities=modalities,
+            stop=stop,
+        )
 
     def _apply_conversation_template(
         self,
         request: ChatCompletionRequest,
         is_multimodal: bool,
-    ) ->
+    ) -> MessageProcessingResult:
         """Apply conversation template"""
         prompt = ""
         prompt_ids = []
@@ -304,7 +273,14 @@ class OpenAIServingChat(OpenAIServingBase):
         if not is_multimodal:
             prompt_ids = self.tokenizer_manager.tokenizer.encode(prompt)
 
-        return prompt, prompt_ids, image_data, audio_data, modalities, stop
+        return MessageProcessingResult(
+            prompt=prompt,
+            prompt_ids=prompt_ids,
+            image_data=image_data,
+            audio_data=audio_data,
+            modalities=modalities,
+            stop=stop,
+        )
 
     def _build_sampling_params(
         self,
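
The refactor above replaces the seven-element tuple previously returned by _process_messages with a MessageProcessingResult object imported from sglang/srt/entrypoints/openai/protocol.py. Its definition is not part of this section; a minimal sketch consistent with the fields used above (whether the real class is a dataclass or a pydantic model is not visible here) would be:

from dataclasses import dataclass, field
from typing import Any, List, Optional, Union

@dataclass
class MessageProcessingResult:
    # Hypothetical sketch; the real definition lives in sglang/srt/entrypoints/openai/protocol.py.
    prompt: str = ""
    prompt_ids: Union[str, List[int]] = field(default_factory=list)
    image_data: Optional[Any] = None
    audio_data: Optional[Any] = None
    modalities: List[str] = field(default_factory=list)
    stop: List[str] = field(default_factory=list)
    tool_call_constraint: Optional[Any] = None
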
sglang/srt/layers/activation.py
CHANGED

@@ -48,6 +48,9 @@ if _is_cuda:
 
 logger = logging.getLogger(__name__)
 
+if is_npu():
+    import torch_npu
+
 
 class SiluAndMul(CustomOp):
     def forward_native(self, x: torch.Tensor) -> torch.Tensor:
@@ -70,6 +73,10 @@ class SiluAndMul(CustomOp):
         else:
             return self.forward_native(x)
 
+    def forward_npu(self, x: torch.Tensor) -> torch.Tensor:
+        out = torch_npu.npu_swiglu(x)
+        return out
+
 
 class GeluAndMul(CustomOp):
     def __init__(self, approximate="tanh"):
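
The new forward_npu above delegates the fused SwiGLU to torch_npu.npu_swiglu. For orientation, a plain-PyTorch sketch of the operation SiluAndMul computes, assuming the usual split of the last dimension into gate and up halves as in forward_native:

import torch
import torch.nn.functional as F

def silu_and_mul_reference(x: torch.Tensor) -> torch.Tensor:
    # Split the last dimension into [gate, up] halves and return silu(gate) * up.
    d = x.shape[-1] // 2
    return F.silu(x[..., :d]) * x[..., d:]

x = torch.randn(2, 8)            # toy input; last dim must be even
out = silu_and_mul_reference(x)  # shape (2, 4)
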
sglang/srt/layers/attention/flashattention_backend.py
CHANGED

@@ -657,12 +657,16 @@ class FlashAttentionBackend(AttentionBackend):
             )
             k_descale, v_descale = None, None
             # only use kv scaling if: 1) fp8 kv is explicitly enabled, 2) RadixAttention
-            # has corresponding quantization method so that layer.k_scale is not None
-            if layer.k_scale is not None:
-                descale_shape = (forward_batch.batch_size, layer.tp_k_head_num)
-                k_descale = layer.k_scale.expand(descale_shape)
-                v_descale = layer.v_scale.expand(descale_shape)
+            # has corresponding quantization method so that layer.k_scale is not None,
+            # 3) layer.head_dim <= 256 since fa3 kernel require fp16 and bf16 data type in this case.
+            if self.kv_cache_dtype_str != "auto" and layer.head_dim <= 256:
+                if layer.k_scale is not None:
+                    descale_shape = (forward_batch.batch_size, layer.tp_k_head_num)
+                    k_descale = layer.k_scale.expand(descale_shape)
+                    v_descale = layer.v_scale.expand(descale_shape)
             q = q.to(self.kv_cache_dtype)
+            q_rope = q_rope.to(self.kv_cache_dtype) if q_rope is not None else None
+            k_rope = k_rope.to(self.kv_cache_dtype) if k_rope is not None else None
             causal = not layer.is_cross_attention
 
             # Check if we should use local attention
@@ -776,8 +780,8 @@ class FlashAttentionBackend(AttentionBackend):
 
                 output, lse, *rest = flash_attn_varlen_func(
                     q=q.view(-1, layer.tp_q_head_num, layer.head_dim),
-                    k=k.view(-1, layer.tp_k_head_num, layer.head_dim),
-                    v=v.view(-1, layer.tp_k_head_num, layer.v_head_dim),
+                    k=k.view(-1, layer.tp_k_head_num, layer.head_dim).to(q.dtype),
+                    v=v.view(-1, layer.tp_k_head_num, layer.v_head_dim).to(q.dtype),
                     cu_seqlens_q=metadata.cu_seqlens_q,
                     cu_seqlens_k=forward_batch.prefix_chunk_cu_seq_lens[chunk_idx],
                     max_seqlen_q=metadata.max_seq_len_q,
@@ -790,8 +794,8 @@ class FlashAttentionBackend(AttentionBackend):
                 # MHA for extend part of sequence without attending prefix kv cache
                 output, lse, *rest = flash_attn_varlen_func(
                     q=q.view(-1, layer.tp_q_head_num, layer.head_dim),
-                    k=k.view(-1, layer.tp_k_head_num, layer.head_dim),
-                    v=v.view(-1, layer.tp_k_head_num, layer.v_head_dim),
+                    k=k.view(-1, layer.tp_k_head_num, layer.head_dim).to(q.dtype),
+                    v=v.view(-1, layer.tp_k_head_num, layer.v_head_dim).to(q.dtype),
                     cu_seqlens_q=metadata.cu_seqlens_q,
                     cu_seqlens_k=metadata.cu_seqlens_q,
                     max_seqlen_q=metadata.max_seq_len_q,
@@ -803,7 +807,9 @@ class FlashAttentionBackend(AttentionBackend):
                 return output, lse
             else:
                 # Do absorbed multi-latent attention
-                kv_cache = forward_batch.token_to_kv_pool.get_key_buffer(layer.layer_id)
+                kv_cache = forward_batch.token_to_kv_pool.get_key_buffer(
+                    layer.layer_id
+                ).to(q.dtype)
                 k_rope = kv_cache[:, :, layer.v_head_dim :]
                 c_kv = kv_cache[:, :, : layer.v_head_dim]
                 k_rope_cache = k_rope.view(
@@ -933,14 +939,16 @@ class FlashAttentionBackend(AttentionBackend):
 
             k_descale, v_descale = None, None
             # only use kv scaling if: 1) fp8 kv is explicitly enabled, 2) RadixAttention
-            # has corresponding quantization method so that layer.k_scale is not None
-            if self.kv_cache_dtype_str != "auto":
+            # has corresponding quantization method so that layer.k_scale is not None,
+            # 3) layer.head_dim <= 256 since fa3 kernel require fp16 and bf16 data type in this case.
+            if self.kv_cache_dtype_str != "auto" and layer.head_dim <= 256:
                 if layer.k_scale is not None:
                     descale_shape = (forward_batch.batch_size, layer.tp_k_head_num)
                     k_descale = layer.k_scale.expand(descale_shape)
                     v_descale = layer.v_scale.expand(descale_shape)
             q = q.to(self.kv_cache_dtype)
-
+            q_rope = q_rope.to(self.kv_cache_dtype) if q_rope is not None else None
+            k_rope = k_rope.to(self.kv_cache_dtype) if k_rope is not None else None
         if not self.use_mla:
             # Do multi-head attention
 
@@ -1048,7 +1056,9 @@ class FlashAttentionBackend(AttentionBackend):
             o = result
         else:
             # Do absorbed multi-latent attention
-            kv_cache = forward_batch.token_to_kv_pool.get_key_buffer(layer.layer_id)
+            kv_cache = forward_batch.token_to_kv_pool.get_key_buffer(layer.layer_id).to(
+                q.dtype
+            )
            k_rope = kv_cache[:, :, layer.v_head_dim :]
            c_kv = kv_cache[:, :, : layer.v_head_dim]
            k_rope_cache = k_rope.view(
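
The added .to(q.dtype) and .to(self.kv_cache_dtype) casts follow a simple pattern: tensors materialized from the KV cache can carry a different dtype than q, and the FA3 varlen kernel expects matching fp16/bf16 inputs. A toy illustration (shapes and dtypes below are made up):

import torch

q = torch.randn(16, 8, 128, dtype=torch.bfloat16)  # query in the model's compute dtype
k = torch.randn(16, 8, 128, dtype=torch.float16)   # e.g. tensors coming out of a cache buffer
v = torch.randn(16, 8, 128, dtype=torch.float16)

k, v = k.to(q.dtype), v.to(q.dtype)                 # same idea as the added .to(q.dtype) calls
assert q.dtype == k.dtype == v.dtype
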
sglang/srt/layers/layernorm.py
CHANGED

@@ -52,6 +52,9 @@ elif _is_hip:
 
 logger = logging.getLogger(__name__)
 
+if is_npu():
+    import torch_npu
+
 
 class RMSNorm(CustomOp):
     def __init__(
@@ -76,6 +79,18 @@ class RMSNorm(CustomOp):
         out = rmsnorm(x, self.weight.data, self.variance_epsilon)
         return out
 
+    def forward_npu(
+        self,
+        x: torch.Tensor,
+        residual: Optional[torch.Tensor] = None,
+    ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
+        if residual is not None:
+            out, _, residual_out = torch_npu.npu_add_rms_norm(
+                residual, x, self.weight.data, self.variance_epsilon
+            )
+            return out, residual_out
+        return torch_npu.npu_rms_norm(x, self.weight.data, self.variance_epsilon)[0]
+
     def forward_aiter(
         self,
         x: torch.Tensor,
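
Alongside the new forward_npu, a simplified pure-PyTorch sketch of the RMSNorm and fused add + RMSNorm computations that torch_npu.npu_rms_norm / npu_add_rms_norm provide (the real forward_native may additionally upcast to float32 before normalizing):

import torch
from typing import Optional, Tuple, Union

def rms_norm_reference(
    x: torch.Tensor,
    weight: torch.Tensor,
    eps: float,
    residual: Optional[torch.Tensor] = None,
) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
    # Optionally fuse the residual add, then scale by the inverse RMS of x.
    if residual is not None:
        x = x + residual
        residual_out = x
    out = x * torch.rsqrt(x.pow(2).mean(dim=-1, keepdim=True) + eps) * weight
    return out if residual is None else (out, residual_out)
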
sglang/srt/layers/linear.py
CHANGED

@@ -30,7 +30,12 @@ from sglang.srt.layers.quantization.base_config import (
     QuantizationConfig,
     QuantizeMethodBase,
 )
-from sglang.srt.utils import set_weight_attrs
+from sglang.srt.utils import (
+    _process_weight_after_loading,
+    cpu_has_amx_support,
+    is_cpu,
+    set_weight_attrs,
+)
 
 logger = logging.getLogger(__name__)
 
@@ -52,6 +57,9 @@ WEIGHT_LOADER_V2_SUPPORTED = [
     "IPEXAWQLinearMethod",
 ]
 
+_is_cpu_amx_available = cpu_has_amx_support()
+_is_cpu = is_cpu()
+
 
 def adjust_marlin_shard(param, shard_size, shard_offset):
     marlin_tile_size = getattr(param, "marlin_tile_size", None)
@@ -165,6 +173,10 @@ class UnquantizedLinearMethod(LinearMethodBase):
         layer.register_parameter("weight", weight)
         set_weight_attrs(weight, extra_weight_attrs)
 
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        if _is_cpu and _is_cpu_amx_available:
+            _process_weight_after_loading(layer, ["weight"])
+
     def apply(
         self,
         layer: torch.nn.Module,
@@ -172,6 +184,11 @@ class UnquantizedLinearMethod(LinearMethodBase):
         bias: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
 
+        if getattr(layer, "use_intel_amx_backend", False):
+            return torch.ops.sgl_kernel.weight_packed_linear(
+                x, layer.weight, bias, True  # is_vnni
+            )
+
         return F.linear(x, layer.weight, bias)
 
 
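
Reference semantics for the two branches of UnquantizedLinearMethod.apply above: both are expected to produce the same affine map y = x @ W^T + b, with the AMX branch assuming the weight was repacked into a VNNI layout by _process_weight_after_loading. Illustrative sketch using the plain layout only:

import torch
import torch.nn.functional as F

x = torch.randn(4, 64)
weight = torch.randn(128, 64)  # (out_features, in_features), plain (non-VNNI) layout
bias = torch.randn(128)

y = F.linear(x, weight, bias)  # the fallback branch; the AMX kernel targets the same result
assert y.shape == (4, 128)
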
sglang/srt/layers/logits_processor.py
CHANGED

@@ -442,11 +442,20 @@ class LogitsProcessor(nn.Module):
             dp_gather_replicate(hidden_states, local_hidden_states, logits_metadata)
 
         if hasattr(lm_head, "weight"):
-            logits = torch.matmul(
-                hidden_states.to(lm_head.weight.dtype), lm_head.weight.T
-            )
+            if getattr(lm_head, "use_intel_amx_backend", False):
+                logits = torch.ops.sgl_kernel.weight_packed_linear(
+                    hidden_states.to(lm_head.weight.dtype),
+                    lm_head.weight,
+                    None,  # bias
+                    True,  # is_vnni
+                )
+            else:
+                logits = torch.matmul(
+                    hidden_states.to(lm_head.weight.dtype), lm_head.weight.T
+                )
         else:
             # GGUF models
+            # TODO: use weight_packed_linear for GGUF models
             logits = lm_head.quant_method.apply(lm_head, hidden_states, embedding_bias)
 
         if self.logit_scale is not None:
sglang/srt/layers/moe/ep_moe/layer.py
CHANGED

@@ -54,10 +54,16 @@ from sglang.srt.utils import (
 
 _is_hip = is_hip()
 _is_fp8_fnuz = is_fp8_fnuz()
+_use_aiter = get_bool_env_var("SGLANG_USE_AITER") and _is_hip
 
 if _is_hip:
     from vllm._custom_ops import scaled_fp8_quant
 
+if _use_aiter:
+    from aiter import ActivationType, QuantType
+    from aiter.fused_moe import fused_moe
+    from aiter.ops.shuffle import shuffle_weight
+
 logger = logging.getLogger(__name__)
 
 
@@ -1046,6 +1052,15 @@ class Fp8EPMoEMethod(Fp8MoEMethod):
                 w2_weight_scale, requires_grad=False
             )
             layer.w2_input_scale = None
+            if _use_aiter:
+                layer.w13_weight = torch.nn.Parameter(
+                    shuffle_weight(layer.w13_weight.data, (16, 16)),
+                    requires_grad=False,
+                )
+                layer.w2_weight = torch.nn.Parameter(
+                    shuffle_weight(layer.w2_weight.data, (16, 16)),
+                    requires_grad=False,
+                )
             return
 
     def apply(
@@ -1117,18 +1132,36 @@ class DeepEPMoE(EPMoE):
         assert (
             deep_gemm_wrapper.ENABLE_JIT_DEEPGEMM
         ), f"DeepEP {self.deepep_mode} mode requires deep_gemm"
+        if _use_aiter:
+            # expert_mask is of size (self.num_experts_per_partition + 1),
+            # the extra 1 is for invalid rank_id (in original deepep, the invalid rank_id is -1, but aiter does not allow -1, we use a mask to make those ids invalid)
+            # for instance, if we have 4 experts on this rank, we would have a expert_mask like:
+            # self.expert_mask = [1, 1, 1, 1, 0]
+            # idx from 0-3 is valid and will be processed, while idx == 4 will be masked out
+            self.expert_mask = torch.zeros(
+                (self.num_experts_per_partition + 1),
+                device=torch.cuda.current_device(),
+                dtype=torch.int,
+            )
+            # the last one is invalid rank_id
+            self.expert_mask[:-1] = 1
+        else:
+            self.w13_weight_fp8 = (
+                self.w13_weight,
+                (
+                    self.w13_weight_scale_inv
+                    if self.use_block_quant
+                    else self.w13_weight_scale
+                ),
+            )
+            self.w2_weight_fp8 = (
+                self.w2_weight,
+                (
+                    self.w2_weight_scale_inv
+                    if self.use_block_quant
+                    else self.w2_weight_scale
+                ),
+            )
 
     def forward(
         self,
@@ -1142,6 +1175,9 @@ class DeepEPMoE(EPMoE):
         num_recv_tokens_per_expert: List[int],
         forward_mode: ForwardMode,
     ):
+        if _use_aiter:
+            # in forward_aiter, we skip token permutation and unpermutation, which have been fused inside aiter kernel
+            return self.forward_aiter(hidden_states, topk_idx, topk_weights)
         resolved_deepep_mode = self.deepep_mode.resolve(forward_mode)
         if resolved_deepep_mode == DeepEPMode.normal:
             if deep_gemm_wrapper.ENABLE_JIT_DEEPGEMM:
@@ -1274,6 +1310,37 @@ class DeepEPMoE(EPMoE):
         )
         return down_output
 
+    def forward_aiter(
+        self,
+        hidden_states: torch.Tensor,
+        topk_idx: torch.Tensor,
+        topk_weights: torch.Tensor,
+    ):
+        if hidden_states.shape[0] == 0:
+            return hidden_states
+        # in original deepep, idx == -1 meaning invalid and will not be processed.
+        # aiter does not accept -1, we use a expert mask to make these idx invalid
+        # (idx == num_experts_per_partition) meaning not used in aiter fused_moe
+        topk_idx_copy = topk_idx.to(torch.int32)
+        topk_idx_copy[topk_idx_copy == -1] = self.num_experts_per_partition
+
+        return fused_moe(
+            hidden_states,
+            self.w13_weight,
+            self.w2_weight,
+            topk_weights,
+            topk_idx_copy,
+            w1_scale=self.w13_weight_scale_inv,
+            w2_scale=self.w2_weight_scale_inv,
+            quant_type=QuantType.per_128x128,
+            activation=(
+                ActivationType.Silu
+                if self.activation == "silu"
+                else ActivationType.Gelu
+            ),
+            expert_mask=self.expert_mask,
+        )
+
     def forward_deepgemm_contiguous(
         self,
         hidden_states_fp8: Tuple[torch.Tensor, torch.Tensor],
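
A toy illustration of the expert-mask remapping described in the comments above: deepep marks experts that are not on this rank with -1, which aiter's fused_moe does not accept, so those ids are remapped to one extra slot that the mask zeroes out (values below are made up):

import torch

num_experts_per_partition = 4

# -1 means "expert not on this rank" in deepep's convention.
topk_idx = torch.tensor([[0, 2, -1], [3, -1, 1]])

topk_idx_copy = topk_idx.to(torch.int32)
topk_idx_copy[topk_idx_copy == -1] = num_experts_per_partition  # remap -1 -> 4

expert_mask = torch.zeros(num_experts_per_partition + 1, dtype=torch.int)
expert_mask[:-1] = 1  # [1, 1, 1, 1, 0]; slot 4 (the remapped -1) is masked out
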
sglang/srt/layers/moe/ep_moe/token_dispatcher.py
CHANGED

@@ -6,7 +6,13 @@ from sglang.srt.managers.expert_distribution import (
     get_global_expert_distribution_recorder,
 )
 from sglang.srt.managers.schedule_batch import global_server_args_dict
-from sglang.srt.utils import
+from sglang.srt.utils import (
+    DeepEPMode,
+    get_bool_env_var,
+    get_int_env_var,
+    is_hip,
+    load_json_config,
+)
 
 try:
     from deep_ep import Buffer, Config
@@ -32,6 +38,8 @@ from sglang.srt.layers.moe.ep_moe.kernels import (
 )
 from sglang.srt.model_executor.forward_batch_info import ForwardMode
 
+_use_aiter = get_bool_env_var("SGLANG_USE_AITER") and is_hip()
+
 logger = logging.getLogger(__name__)
 
 
@@ -376,6 +384,15 @@ class _DeepEPDispatcherImplNormal(_DeepEPDispatcherImplBase):
         Copy from Megatron-Core token_dispatcher MoEFlexTokenDispatcher
         https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/transformer/moe/token_dispatcher.py
         """
+        if _use_aiter:
+            # skip permutation here as aiter fused_moe has fused inside
+            reorder_topk_ids = torch.empty(
+                (0,), device=hidden_states.device, dtype=torch.int64
+            )
+            seg_indptr = torch.zeros(
+                (self.num_experts + 1,), device=hidden_states.device, dtype=torch.int64
+            )
+            return reorder_topk_ids, seg_indptr, hidden_states
 
         reorder_topk_ids, self.src2dst, seg_indptr = deepep_run_moe_deep_preprocess(
             topk_idx, self.num_experts
@@ -409,7 +426,7 @@ class _DeepEPDispatcherImplNormal(_DeepEPDispatcherImplBase):
         topk_idx: torch.Tensor,
         topk_weights: torch.Tensor,
     ):
-        if deep_gemm_wrapper.ENABLE_JIT_DEEPGEMM:
+        if deep_gemm_wrapper.ENABLE_JIT_DEEPGEMM or _use_aiter:
             output = hidden_states
         else:
             if hidden_states.shape[0] > 0:
sglang/srt/layers/moe/fused_moe_native.py
CHANGED

@@ -77,8 +77,15 @@ def moe_forward_native(
     custom_routing_function: Optional[Callable] = None,
     correction_bias: Optional[torch.Tensor] = None,
     activation: str = "silu",
+    apply_router_weight_on_input: bool = False,
+    inplace: bool = True,
+    no_combine: bool = False,
     routed_scaling_factor: Optional[float] = None,
 ) -> torch.Tensor:
+
+    if apply_router_weight_on_input:
+        raise NotImplementedError()
+
     topk_weights, topk_ids = select_experts(
         hidden_states=x,
         router_logits=router_logits,
sglang/srt/layers/moe/fused_moe_triton/fused_moe.py
CHANGED

@@ -750,9 +750,11 @@ def moe_align_block_size(
     by block_size for proper block matrix operations.
     """
     max_num_tokens_padded = topk_ids.numel() + num_experts * (block_size - 1)
+    sorted_ids = torch.empty(
+        (max_num_tokens_padded,), dtype=torch.int32, device=topk_ids.device
     )
+    sorted_ids.fill_(topk_ids.numel())
+
     max_num_m_blocks = triton.cdiv(max_num_tokens_padded, block_size)
     expert_ids = torch.empty(
         (max_num_m_blocks,), dtype=torch.int32, device=topk_ids.device
@@ -768,6 +770,9 @@ def moe_align_block_size(
             num_tokens_post_pad,
         )
     else:
+        cumsum_buffer = torch.empty(
+            (num_experts + 1,), dtype=torch.int32, device=topk_ids.device
+        )
         token_cnts_buffer = torch.empty(
             (num_experts + 1) * num_experts,
             dtype=torch.int32,