sglang 0.4.10.post1__py3-none-any.whl → 0.4.10.post2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/compile_deep_gemm.py +8 -1
- sglang/global_config.py +5 -1
- sglang/srt/conversation.py +0 -112
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +1 -0
- sglang/srt/disaggregation/prefill.py +1 -0
- sglang/srt/distributed/device_communicators/pynccl.py +7 -0
- sglang/srt/distributed/device_communicators/pynccl_allocator.py +133 -0
- sglang/srt/distributed/device_communicators/pynccl_wrapper.py +42 -3
- sglang/srt/distributed/parallel_state.py +11 -0
- sglang/srt/entrypoints/engine.py +4 -2
- sglang/srt/entrypoints/http_server.py +35 -15
- sglang/srt/eplb/expert_distribution.py +4 -2
- sglang/srt/hf_transformers_utils.py +25 -10
- sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
- sglang/srt/layers/attention/flashattention_backend.py +7 -11
- sglang/srt/layers/attention/trtllm_mla_backend.py +6 -6
- sglang/srt/layers/attention/vision.py +27 -10
- sglang/srt/layers/communicator.py +14 -4
- sglang/srt/layers/linear.py +7 -1
- sglang/srt/layers/logits_processor.py +9 -1
- sglang/srt/layers/moe/ep_moe/layer.py +11 -35
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=352,device_name=NVIDIA_RTX_6000_Ada_Generation,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/layer.py +26 -23
- sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +0 -31
- sglang/srt/layers/moe/token_dispatcher/__init__.py +23 -0
- sglang/srt/layers/moe/token_dispatcher/base_dispatcher.py +12 -1
- sglang/srt/layers/moe/{ep_moe/token_dispatcher.py → token_dispatcher/deepep.py} +8 -15
- sglang/srt/layers/moe/utils.py +43 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +3 -2
- sglang/srt/layers/quantization/deep_gemm_wrapper/compile_utils.py +1 -1
- sglang/srt/layers/quantization/fp8.py +5 -1
- sglang/srt/layers/quantization/fp8_kernel.py +0 -4
- sglang/srt/layers/vocab_parallel_embedding.py +7 -1
- sglang/srt/lora/lora_registry.py +7 -0
- sglang/srt/managers/cache_controller.py +8 -4
- sglang/srt/managers/data_parallel_controller.py +52 -2
- sglang/srt/managers/io_struct.py +6 -1
- sglang/srt/managers/schedule_batch.py +3 -2
- sglang/srt/managers/schedule_policy.py +3 -1
- sglang/srt/managers/scheduler.py +144 -6
- sglang/srt/managers/template_manager.py +25 -22
- sglang/srt/managers/tokenizer_manager.py +114 -62
- sglang/srt/managers/utils.py +45 -1
- sglang/srt/mem_cache/cpp_radix_tree/radix_tree.py +182 -0
- sglang/srt/mem_cache/hicache_storage.py +13 -21
- sglang/srt/mem_cache/radix_cache_cpp.py +229 -0
- sglang/srt/mem_cache/storage/hf3fs/hf3fs_utils.cpp +35 -0
- sglang/srt/model_executor/cuda_graph_runner.py +17 -3
- sglang/srt/model_executor/forward_batch_info.py +13 -3
- sglang/srt/model_executor/model_runner.py +5 -0
- sglang/srt/models/deepseek_v2.py +23 -17
- sglang/srt/models/glm4_moe.py +82 -19
- sglang/srt/models/grok.py +3 -3
- sglang/srt/models/llama4.py +13 -2
- sglang/srt/models/mixtral.py +3 -3
- sglang/srt/models/mllama4.py +428 -19
- sglang/srt/models/qwen2_moe.py +1 -4
- sglang/srt/models/qwen3_moe.py +7 -8
- sglang/srt/models/step3_vl.py +1 -1
- sglang/srt/multimodal/processors/base_processor.py +4 -3
- sglang/srt/multimodal/processors/gemma3n.py +0 -7
- sglang/srt/operations_strategy.py +1 -1
- sglang/srt/server_args.py +80 -20
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +18 -0
- sglang/srt/two_batch_overlap.py +6 -4
- sglang/srt/utils.py +3 -24
- sglang/srt/weight_sync/utils.py +1 -1
- sglang/test/runners.py +2 -2
- sglang/test/test_utils.py +3 -3
- sglang/version.py +1 -1
- {sglang-0.4.10.post1.dist-info → sglang-0.4.10.post2.dist-info}/METADATA +3 -2
- {sglang-0.4.10.post1.dist-info → sglang-0.4.10.post2.dist-info}/RECORD +80 -74
- /sglang/srt/mem_cache/{mooncake_store → storage/mooncake_store}/mooncake_store.py +0 -0
- /sglang/srt/mem_cache/{mooncake_store → storage/mooncake_store}/unit_test.py +0 -0
- /sglang/srt/mem_cache/{nixl → storage/nixl}/hicache_nixl.py +0 -0
- /sglang/srt/mem_cache/{nixl → storage/nixl}/nixl_utils.py +0 -0
- /sglang/srt/mem_cache/{nixl → storage/nixl}/test_hicache_nixl_storage.py +0 -0
- {sglang-0.4.10.post1.dist-info → sglang-0.4.10.post2.dist-info}/WHEEL +0 -0
- {sglang-0.4.10.post1.dist-info → sglang-0.4.10.post2.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.10.post1.dist-info → sglang-0.4.10.post2.dist-info}/top_level.txt +0 -0
sglang/srt/layers/attention/cutlass_mla_backend.py
CHANGED
@@ -102,7 +102,7 @@ class CutlassMLABackend(FlashInferMLAAttnBackend):
             block_kv_indices,
             self.req_to_token.stride(0),
             max_seqlen_pad,
-            PAGE_SIZE,
+            PAGED_SIZE=PAGE_SIZE,
         )
         workspace_size = cutlass_mla_get_workspace_size(
             max_seqlen_pad * PAGE_SIZE, bs, num_kv_splits=1
@@ -165,7 +165,7 @@ class CutlassMLABackend(FlashInferMLAAttnBackend):
             self.cuda_graph_kv_indices,
             self.req_to_token.stride(0),
             self.cuda_graph_kv_indices.stride(0),
-            PAGE_SIZE,
+            PAGED_SIZE=PAGE_SIZE,
         )
         self.forward_metadata = CutlassMLADecodeMetadata(
             self.cuda_graph_mla_workspace,
@@ -206,7 +206,7 @@ class CutlassMLABackend(FlashInferMLAAttnBackend):
                 self.cuda_graph_kv_indices,
                 self.req_to_token.stride(0),
                 self.cuda_graph_kv_indices.stride(0),
-                PAGE_SIZE,
+                PAGED_SIZE=PAGE_SIZE,
             )
         else:
             super().init_forward_metadata_replay_cuda_graph(
sglang/srt/layers/attention/flashattention_backend.py
CHANGED
@@ -1406,7 +1406,7 @@ class FlashAttentionBackend(AttentionBackend):
                 )
                 metadata.page_table = self.decode_cuda_graph_metadata[
                     "page_table_draft_decode"
-                ][req_pool_indices, :]
+                ][:bs, :]
                 self.decode_cuda_graph_metadata[bs] = metadata
             else:
                 # When top k > 1, we need two specific draft decode metadata, and then merge states
@@ -1424,7 +1424,7 @@ class FlashAttentionBackend(AttentionBackend):
                 ][: bs + 1]
                 metadata.page_table = self.draft_decode_metadata_topk_normal[
                     "page_table"
-                ][req_pool_indices, :]
+                ][:bs, :]

                 # 2. The second half of metadata for draft tokens (per_batch_num_tokens = topk)
                 metadata_expand.cache_seqlens_int32 = (
@@ -1461,7 +1461,7 @@ class FlashAttentionBackend(AttentionBackend):
             metadata.max_seq_len_k = seq_lens.max().item()
             # Precompute page table
             metadata.page_table = self.decode_cuda_graph_metadata["page_table"][
-                req_pool_indices, :
+                :bs, :
             ]
             # Precompute cumulative sequence lengths
             metadata.cu_seqlens_q = torch.arange(
@@ -1498,9 +1498,7 @@ class FlashAttentionBackend(AttentionBackend):
                     : (bs + 1)
                 ]

-                metadata.page_table = self.target_verify_metadata["page_table"][
-                    req_pool_indices, :
-                ]
+                metadata.page_table = self.target_verify_metadata["page_table"][:bs, :]

                 self.target_verify_metadata[bs] = metadata
             else:
@@ -1519,7 +1517,7 @@ class FlashAttentionBackend(AttentionBackend):
                 ][: bs + 1]
                 metadata.page_table = self.target_verify_metadata_topk_normal[
                     "page_table"
-                ][req_pool_indices, :]
+                ][:bs, :]

                 # 2. The second half of metadata for draft tokens (per_batch_num_tokens = topk)
                 metadata_expand.cache_seqlens_int32 = (
@@ -1562,9 +1560,7 @@ class FlashAttentionBackend(AttentionBackend):
             metadata.cu_seqlens_k = self.draft_extend_metadata["cu_seqlens_k"][
                 : (bs + 1)
             ]
-            metadata.page_table = self.draft_extend_metadata["page_table"][
-                req_pool_indices, :
-            ]
+            metadata.page_table = self.draft_extend_metadata["page_table"][:bs, :]

             self.draft_extend_metadata[bs] = metadata

@@ -1578,7 +1574,7 @@ class FlashAttentionBackend(AttentionBackend):
             ][: (encoder_bs + 1)]

             metadata.encoder_page_table = self.encoder_metadata["encoder_page_table"][
-                req_pool_indices, :
+                :bs, :
             ]

             self.forward_metadata = metadata
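Note on the FlashAttentionBackend hunks above: every change replaces gathering page-table rows by `req_pool_indices` with slicing the first `bs` rows of the preallocated table when building CUDA-graph metadata. A minimal PyTorch sketch of the two indexing forms (sizes invented for illustration; this is my reading of the diff, not sglang code):

```python
import torch

# Invented sizes for illustration only.
max_bs, max_pages = 8, 64
page_table = torch.zeros(max_bs, max_pages, dtype=torch.int32)

bs = 3
req_pool_indices = torch.tensor([5, 1, 7])

gathered = page_table[req_pool_indices, :]  # advanced indexing: copies the selected rows
sliced = page_table[:bs, :]                 # basic slicing: a view of the first bs rows

assert gathered.shape == sliced.shape == (bs, max_pages)
```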
sglang/srt/layers/attention/trtllm_mla_backend.py
CHANGED
@@ -147,8 +147,8 @@ class TRTLLMMLABackend(FlashInferMLAAttnBackend):
             block_kv_indices,
             self.req_to_token.stride(0),
             max_blocks,
-            TRITON_PAD_NUM_PAGE_PER_BLOCK,
-            self.page_size,
+            NUM_PAGE_PER_BLOCK=TRITON_PAD_NUM_PAGE_PER_BLOCK,
+            PAGED_SIZE=self.page_size,
         )

         return block_kv_indices
@@ -204,8 +204,8 @@ class TRTLLMMLABackend(FlashInferMLAAttnBackend):
             block_kv_indices,
             self.req_to_token.stride(0),
             max_seqlen_pad,
-            TRITON_PAD_NUM_PAGE_PER_BLOCK,
-            self.page_size,
+            NUM_PAGE_PER_BLOCK=TRITON_PAD_NUM_PAGE_PER_BLOCK,
+            PAGED_SIZE=self.page_size,
         )

         metadata = TRTLLMMLADecodeMetadata(self.cuda_graph_workspace, block_kv_indices)
@@ -248,8 +248,8 @@ class TRTLLMMLABackend(FlashInferMLAAttnBackend):
             metadata.block_kv_indices,
             self.req_to_token.stride(0),
             metadata.block_kv_indices.shape[1],
-            TRITON_PAD_NUM_PAGE_PER_BLOCK,
-            self.page_size,
+            NUM_PAGE_PER_BLOCK=TRITON_PAD_NUM_PAGE_PER_BLOCK,
+            PAGED_SIZE=self.page_size,
         )

     def get_cuda_graph_seq_len_fill_value(self) -> int:
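The CutlassMLA and TRTLLM MLA hunks above make the same edit: the trailing arguments of the KV-indices Triton kernel are now passed by keyword (`PAGED_SIZE=...`, `NUM_PAGE_PER_BLOCK=...`) instead of positionally. A self-contained sketch of that calling style; the kernel below is illustrative only and is not the sglang kernel:

```python
import torch
import triton
import triton.language as tl


@triton.jit
def fill_page_ids_kernel(
    out_ptr, n, NUM_PAGE_PER_BLOCK: tl.constexpr, PAGED_SIZE: tl.constexpr
):
    # Each program writes NUM_PAGE_PER_BLOCK entries of token_index // PAGED_SIZE.
    pid = tl.program_id(0)
    offs = pid * NUM_PAGE_PER_BLOCK + tl.arange(0, NUM_PAGE_PER_BLOCK)
    mask = offs < n
    tl.store(out_ptr + offs, offs // PAGED_SIZE, mask=mask)


def fill_page_ids(n: int = 256, paged_size: int = 32) -> torch.Tensor:
    out = torch.empty(n, dtype=torch.int32, device="cuda")
    grid = (triton.cdiv(n, 64),)
    # Constexpr parameters passed by name, mirroring the style adopted above.
    fill_page_ids_kernel[grid](out, n, NUM_PAGE_PER_BLOCK=64, PAGED_SIZE=paged_size)
    return out
```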
sglang/srt/layers/attention/vision.py
CHANGED
@@ -4,7 +4,7 @@ import dataclasses
 import functools
 import math
 from functools import lru_cache, partial
-from typing import Any, Optional, Tuple, Union
+from typing import Any, Callable, Optional, Tuple, Union

 import torch
 import torch.nn as nn
@@ -308,6 +308,7 @@ class VisionFlash3Attention(nn.Module):
         cu_seqlens = cu_seqlens.to(dtype=torch.int32).to(q.device)
         seq_lens = cu_seqlens[1:] - cu_seqlens[:-1]
         max_seqlen = seq_lens.max().item()
+
         output = flash_attn_varlen_func(
             q,
             k,
@@ -358,6 +359,9 @@ class VisionAttention(nn.Module):
         qkv_bias: bool = True,
         qk_normalization: bool = False,
         layer_norm_eps: float = 1e-06,
+        customized_position_embedding_applier: Callable[
+            [torch.Tensor, torch.Tensor, Any, Any], Tuple[torch.Tensor, torch.Tensor]
+        ] = None,
         **kwargs,
     ):
         super().__init__()
@@ -392,6 +396,7 @@ class VisionAttention(nn.Module):
                 self.dummy_dim, eps=layer_norm_eps, var_hidden_size=embed_dim
             )

+        # priority: server_args > passed qkv_backend > sdpa
         if global_server_args_dict["mm_attention_backend"] is None:
             if qkv_backend is None:
                 qkv_backend = "sdpa"
@@ -401,6 +406,9 @@ class VisionAttention(nn.Module):

         print_info_once(f"Using {qkv_backend} as multimodal attention backend.")

+        self.customized_position_embedding_applier = (
+            customized_position_embedding_applier
+        )
         self.qkv_backend = QKV_BACKEND_IMPL[qkv_backend](
             head_dim=self.head_size,
             num_heads=self.num_attention_heads_per_partition,
@@ -473,13 +481,13 @@ class VisionAttention(nn.Module):
         if x.dim() == 2:
             x = x.unsqueeze(0)
         assert x.dim() == 3, x.shape
-        bsz, s, _ = x.shape
+        x_shape = x.shape
+        bsz, s, _ = x_shape
         head = self.num_attention_heads_per_partition
         kv_head = self.num_attention_kv_heads_per_partition
         if self.use_qkv_parallel:
             # [b, s, embed_dim] --> [b, s, embed_dim]
             qkv, _ = self.qkv_proj(x)
-
             q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)

             # [b, s, embed_dim] --> [b * s, head, head_size]
@@ -508,16 +516,25 @@ class VisionAttention(nn.Module):
         ]

         if position_embeddings is not None:
-            cos, sin = position_embeddings
             original_shape = q.shape
-            # [total_tokens, head, head_size]
-            q = q.view(-1, head, self.head_size)
-            k = k.view(-1, head, self.head_size)

-            q, k = apply_rotary_pos_emb(q, k, cos, sin)
+            if self.customized_position_embedding_applier is not None:
+                q, k = self.customized_position_embedding_applier(
+                    q, k, position_embeddings, x_shape
+                )
+                q = q.view(original_shape)
+                k = k.view(original_shape)
+            else:
+                cos, sin = position_embeddings
+
+                # [total_tokens, head, head_size]
+                q = q.view(-1, head, self.head_size)
+                k = k.view(-1, head, self.head_size)
+
+                q, k = apply_rotary_pos_emb(q, k, cos, sin)

-            q = q.view(original_shape)
-            k = k.view(original_shape)
+                q = q.view(original_shape)
+                k = k.view(original_shape)

         if q.dim() == 4:
             # [b, s, head, head_size] --> [b * s, head, head_size]
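The VisionAttention changes above add an optional `customized_position_embedding_applier` hook. Based only on the type annotation and the call site in the diff, the callback receives `(q, k, position_embeddings, x_shape)` and returns the transformed `(q, k)`; when it is not provided, the existing cos/sin rotary path is used. A sketch of a conforming callback (the identity applier is illustrative, not part of sglang):

```python
from typing import Any, Tuple

import torch


def identity_position_embedding_applier(
    q: torch.Tensor,
    k: torch.Tensor,
    position_embeddings: Any,
    x_shape: Any,
) -> Tuple[torch.Tensor, torch.Tensor]:
    # A real applier would use position_embeddings (e.g. precomputed cos/sin tables)
    # and x_shape (the [batch, seq, hidden] shape of the input) to rotate q and k.
    return q, k


# Per the diff, VisionAttention stores the callable and, when it is set, calls
#   q, k = self.customized_position_embedding_applier(q, k, position_embeddings, x_shape)
# and then reshapes q and k back to their original shape.
```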
sglang/srt/layers/communicator.py
CHANGED
@@ -108,7 +108,7 @@ class LayerScatterModes:
         if context.is_layer_sparse:
             return (
                 ScatterMode.SCATTERED
-                if global_server_args_dict["enable_deepep_moe"]
+                if not global_server_args_dict["moe_a2a_backend"].is_standard()
                 else ScatterMode.FULL
             )
         else:
@@ -404,14 +404,24 @@ class CommunicateWithAllReduceAndLayerNormFn:
         if context.attn_dp_size != 1:
             if context.attn_tp_rank == 0:
                 hidden_states += residual
+
+            # Perform layernorm on smaller data before comm. Only valid when attn_tp_size is 1 (tp_size == dp_size)
+            use_layer_norm_before_gather = context.attn_tp_size == 1
+            if use_layer_norm_before_gather:
+                residual.copy_(hidden_states)
+                if hidden_states.shape[0] != 0:
+                    hidden_states = layernorm(hidden_states)
+
             hidden_states, local_hidden_states = (
                 forward_batch.gathered_buffer,
                 hidden_states,
             )
             dp_gather_partial(hidden_states, local_hidden_states, forward_batch)
-            dp_scatter(residual, hidden_states, forward_batch)
-            if hidden_states.shape[0] != 0:
-                hidden_states = layernorm(hidden_states)
+
+            if not use_layer_norm_before_gather:
+                dp_scatter(residual, hidden_states, forward_batch)
+                if hidden_states.shape[0] != 0:
+                    hidden_states = layernorm(hidden_states)
         else:
             # According to the discussion in https://github.com/flashinfer-ai/flashinfer/issues/1223#issuecomment-3047256465
             # We set the max token num to 128 for allreduce fusion with min-latency case(use_oneshot=True).
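The comment added in the hunk above states the intent: when `attn_tp_size == 1`, each rank's pre-gather tensor holds only its own DP shard, so running layernorm before `dp_gather_partial` normalizes fewer rows than doing it after the gather. A toy shape-level illustration with invented numbers:

```python
import torch

dp_size, local_tokens, hidden = 4, 128, 4096
layernorm = torch.nn.LayerNorm(hidden)

local = torch.randn(local_tokens, hidden)                # this rank's shard before gather
gathered = torch.randn(dp_size * local_tokens, hidden)   # the buffer after dp_gather

rows_before = layernorm(local).shape[0]     # 128 rows normalized per rank
rows_after = layernorm(gathered).shape[0]   # 512 rows normalized per rank
print(rows_before, rows_after)
```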
sglang/srt/layers/linear.py
CHANGED
@@ -13,10 +13,14 @@ from sglang.srt.distributed import (
     divide,
     get_tensor_model_parallel_rank,
     get_tensor_model_parallel_world_size,
+    parallel_state,
     split_tensor_along_last_dim,
     tensor_model_parallel_all_gather,
     tensor_model_parallel_all_reduce,
 )
+from sglang.srt.distributed.device_communicators.pynccl_allocator import (
+    use_symmetric_memory,
+)
 from sglang.srt.layers.parameter import (
     BasevLLMParameter,
     BlockQuantScaleParameter,
@@ -1292,7 +1296,9 @@ class RowParallelLinear(LinearBase):
         # Only fuse bias add into GEMM for rank 0 (this ensures that
         # bias will not get added more than once in TP>1 case)
         bias_ = None if (self.tp_rank > 0 or self.skip_bias_add) else self.bias
-        output_parallel = self.quant_method.apply(self, input_parallel, bias=bias_)
+        with use_symmetric_memory(parallel_state.get_tp_group()) as sm:
+            output_parallel = self.quant_method.apply(self, input_parallel, bias=bias_)
+            sm.tag(output_parallel)
         if self.reduce_results and self.tp_size > 1 and not can_fuse_mlp_allreduce:
             output = tensor_model_parallel_all_reduce(output_parallel)
         else:
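The linear.py hunk wraps the partial GEMM in the new `use_symmetric_memory` context manager and tags its output, presumably so the subsequent tensor-parallel all-reduce can operate on symmetric NCCL memory. A usage sketch that mirrors the diff; it assumes an initialized sglang tensor-parallel group, and the plain `nn.Linear` stands in for `self.quant_method.apply`:

```python
import torch

from sglang.srt.distributed import parallel_state, tensor_model_parallel_all_reduce
from sglang.srt.distributed.device_communicators.pynccl_allocator import (
    use_symmetric_memory,
)


def row_parallel_forward(linear: torch.nn.Linear, x: torch.Tensor) -> torch.Tensor:
    # Allocate the GEMM output while the symmetric-memory allocator is active and
    # tag it, as in the RowParallelLinear change above, before the all-reduce.
    with use_symmetric_memory(parallel_state.get_tp_group()) as sm:
        output_parallel = linear(x)
        sm.tag(output_parallel)
    return tensor_model_parallel_all_reduce(output_parallel)
```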
sglang/srt/layers/logits_processor.py
CHANGED
@@ -83,6 +83,7 @@ class LogitsProcessorOutput:
 class LogitsMetadata:
     forward_mode: ForwardMode
     capture_hidden_mode: CaptureHiddenMode = CaptureHiddenMode.NULL
+    next_token_logits_buffer: Optional[torch.Tensor] = None

     extend_return_logprob: bool = False
     extend_return_top_logprob: bool = False
@@ -148,6 +149,7 @@ class LogitsMetadata:
         return cls(
             forward_mode=forward_batch.forward_mode,
             capture_hidden_mode=forward_batch.capture_hidden_mode,
+            next_token_logits_buffer=forward_batch.next_token_logits_buffer,
             extend_return_logprob=extend_return_logprob,
             extend_return_top_logprob=extend_return_top_logprob,
             extend_token_ids_logprob=extend_token_ids_logprob,
@@ -508,7 +510,13 @@ class LogitsProcessor(nn.Module):
             )
             dp_scatter(logits, global_logits, logits_metadata)

-        logits = logits[:, : self.config.vocab_size].float()
+        if logits_metadata.next_token_logits_buffer is not None:
+            logits_buffer = logits_metadata.next_token_logits_buffer
+            assert logits_buffer.dtype == torch.float
+            logits_buffer.copy_(logits[:, : self.config.vocab_size])
+            logits = logits_buffer
+        else:
+            logits = logits[:, : self.config.vocab_size].float()

         if self.final_logit_softcapping:
             fused_softcap(logits, self.final_logit_softcapping)
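The logits_processor.py change lets callers supply a preallocated float32 buffer (`next_token_logits_buffer`) that the final vocab-size slice is copied into, instead of materializing a new tensor with `.float()` on every forward; my reading is that this allows one output buffer to be reused across steps. A self-contained PyTorch illustration of the two paths:

```python
import torch

bs, padded_vocab, vocab_size = 4, 1056, 1024
logits = torch.randn(bs, padded_vocab, dtype=torch.bfloat16)

# Old path: allocate a fresh float32 tensor each call.
fresh = logits[:, :vocab_size].float()

# New path: copy into a caller-provided float32 buffer, reused across calls.
next_token_logits_buffer = torch.empty(bs, vocab_size, dtype=torch.float)
assert next_token_logits_buffer.dtype == torch.float
next_token_logits_buffer.copy_(logits[:, :vocab_size])

torch.testing.assert_close(fresh, next_token_logits_buffer)
```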
sglang/srt/layers/moe/ep_moe/layer.py
CHANGED
@@ -1,28 +1,17 @@
 from __future__ import annotations

 import logging
-from typing import TYPE_CHECKING,
+from typing import TYPE_CHECKING, Optional

 import torch

-from sglang.srt.distributed import (
-    get_tensor_model_parallel_rank,
-    get_tensor_model_parallel_world_size,
-)
-from sglang.srt.eplb.expert_location import get_global_expert_location_metadata
+from sglang.srt.distributed.parallel_state import get_moe_expert_parallel_world_size
 from sglang.srt.layers.moe.ep_moe.kernels import (
     ep_gather,
     ep_scatter,
-    gelu_and_mul_triton_kernel,
-    grouped_gemm_triton,
     moe_ep_deepgemm_preprocess,
     post_reorder_triton_kernel,
-    pre_reorder_triton_kernel,
-    pre_reorder_triton_kernel_for_cutlass_moe,
-    run_cutlass_moe_ep_preproess,
-    run_moe_ep_preproess,
     silu_and_mul_masked_post_quant_fwd,
-    silu_and_mul_triton_kernel,
     tma_align_input_scale,
 )
 from sglang.srt.layers.moe.fused_moe_triton.layer import (
@@ -31,11 +20,9 @@ from sglang.srt.layers.moe.fused_moe_triton.layer import (
     should_use_flashinfer_trtllm_moe,
 )
 from sglang.srt.layers.moe.topk import TopKOutput
+from sglang.srt.layers.moe.utils import DeepEPMode
 from sglang.srt.layers.quantization import deep_gemm_wrapper
-from sglang.srt.layers.quantization.base_config import (
-    QuantizationConfig,
-    QuantizeMethodBase,
-)
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.quantization.fp8 import (
     Fp8Config,
     Fp8MoEMethod,
@@ -44,23 +31,13 @@ from sglang.srt.layers.quantization.fp8 import (
 from sglang.srt.layers.quantization.fp8_kernel import (
     is_fp8_fnuz,
     sglang_per_token_group_quant_fp8,
-    sglang_per_token_quant_fp8,
 )
-from sglang.srt.layers.quantization.unquant import UnquantizedFusedMoEMethod
-from sglang.srt.layers.quantization.w4afp8 import W4AFp8Config, W4AFp8MoEMethod
 from sglang.srt.managers.schedule_batch import global_server_args_dict
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
-from sglang.srt.utils import (
-    DeepEPMode,
-    ceil_div,
-    dispose_tensor,
-    get_bool_env_var,
-    is_hip,
-    is_npu,
-)
+from sglang.srt.utils import ceil_div, dispose_tensor, get_bool_env_var, is_hip, is_npu

 if TYPE_CHECKING:
-    from sglang.srt.layers.moe.ep_moe.token_dispatcher import (
+    from sglang.srt.layers.moe.token_dispatcher import (
         DeepEPLLOutput,
         DeepEPNormalOutput,
         DispatchOutput,
@@ -119,7 +96,6 @@ class EPMoE(FusedMoE):
             activation=activation,
             # apply_router_weight_on_input=apply_router_weight_on_input,
             routed_scaling_factor=routed_scaling_factor,
-            enable_ep_moe=True,
         )

         self.start_expert_id = self.moe_ep_rank * self.num_local_experts
@@ -304,6 +280,8 @@ class EPMoE(FusedMoE):
             m_max * self.start_expert_id,
             BLOCK_SIZE=512,
         )
+        if self.routed_scaling_factor is not None:
+            output *= self.routed_scaling_factor
         return output


@@ -328,7 +306,7 @@ class DeepEPMoE(EPMoE):
         prefix: str = "",
         activation: str = "silu",
         routed_scaling_factor: Optional[float] = None,
-        deepep_mode: DeepEPMode = DeepEPMode.auto,
+        deepep_mode: DeepEPMode = DeepEPMode.AUTO,
     ):
         super().__init__(
             num_experts=num_experts,
@@ -348,7 +326,6 @@ class DeepEPMoE(EPMoE):

         # TODO: move to the beginning of the file
         from sglang.srt.distributed.parallel_state import get_tp_group
-        from sglang.srt.managers.schedule_batch import global_server_args_dict
         from sglang.srt.two_batch_overlap import MaybeTboDeepEPDispatcher

         self.deepep_dispatcher = MaybeTboDeepEPDispatcher(
@@ -762,11 +739,10 @@ class FlashInferEPMoE(EPMoE):


 def get_moe_impl_class():
-    if global_server_args_dict["enable_deepep_moe"]:
+    if global_server_args_dict["moe_a2a_backend"].is_deepep():
         return DeepEPMoE
     if global_server_args_dict["enable_flashinfer_cutlass_moe"]:
-        # Must come before EPMoE because FusedMoE also supports enable_ep_moe
         return FusedMoE
-    if global_server_args_dict["enable_ep_moe"]:
+    if get_moe_expert_parallel_world_size() > 1:
         return FlashInferEPMoE if should_use_flashinfer_trtllm_moe() else EPMoE
     return FlashInferFusedMoE if should_use_flashinfer_trtllm_moe() else FusedMoE
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=352,device_name=NVIDIA_RTX_6000_Ada_Generation,dtype=fp8_w8a8.json
ADDED
@@ -0,0 +1,146 @@
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "2": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "4": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "16": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "24": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "32": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 4
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "64": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 8,
+        "num_stages": 2
+    },
+    "96": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "128": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "256": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "512": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 2
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 3
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 8,
+        "num_stages": 3
+    }
+}
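The new JSON file above is a fused-MoE Triton tuning table: each top-level key is a token count M and each value is a set of launch parameters (block sizes, group size, warps, stages). A sketch of how such a table can be consumed; the nearest-key lookup is an assumption for illustration, not sglang's exact selection code:

```python
import json

# Path of the config file added above (assumed to be in the working directory).
CONFIG_PATH = "E=128,N=352,device_name=NVIDIA_RTX_6000_Ada_Generation,dtype=fp8_w8a8.json"

with open(CONFIG_PATH) as f:
    configs = {int(k): v for k, v in json.load(f).items()}


def pick_config(m: int) -> dict:
    # Choose the tuned entry whose token-count key is closest to the runtime M.
    best_key = min(configs, key=lambda k: abs(k - m))
    return configs[best_key]


print(pick_config(200))  # resolves to the "256" entry in this table
```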
sglang/srt/layers/moe/fused_moe_triton/layer.py
CHANGED
@@ -14,10 +14,12 @@ from sglang.srt.distributed import (
     get_moe_expert_parallel_world_size,
     get_moe_tensor_parallel_rank,
     get_moe_tensor_parallel_world_size,
-    get_tensor_model_parallel_rank,
-    get_tensor_model_parallel_world_size,
+    get_tp_group,
     tensor_model_parallel_all_reduce,
 )
+from sglang.srt.distributed.device_communicators.pynccl_allocator import (
+    use_symmetric_memory,
+)
 from sglang.srt.eplb.expert_location import get_global_expert_location_metadata
 from sglang.srt.layers.moe.topk import StandardTopKOutput
 from sglang.srt.layers.quantization.base_config import (
@@ -94,7 +96,6 @@ class FusedMoE(torch.nn.Module):
         no_combine: bool = False,
         routed_scaling_factor: Optional[float] = None,
         enable_flashinfer_cutlass_moe: Optional[bool] = False,
-        enable_ep_moe: Optional[bool] = False,
     ):
         super().__init__()

@@ -112,7 +113,6 @@ class FusedMoE(torch.nn.Module):
         if enable_flashinfer_cutlass_moe and quant_config is None:
             logger.warning("Disable flashinfer MoE when quantization config is None.")
             enable_flashinfer_cutlass_moe = False
-            enable_ep_moe = False

         self.enable_flashinfer_cutlass_moe = enable_flashinfer_cutlass_moe
         self.moe_ep_size = get_moe_expert_parallel_world_size()
@@ -121,7 +121,7 @@ class FusedMoE(torch.nn.Module):
         self.moe_tp_rank = get_moe_tensor_parallel_rank()
         assert num_experts % self.moe_ep_size == 0
         self.num_local_experts = num_experts // self.moe_ep_size
-        if enable_ep_moe:
+        if self.moe_ep_size > 1:
             # TODO(ch-wan): support shared experts fusion
             # Create a tensor of size num_experts filled with -1
             self.expert_map_cpu = torch.full((self.num_experts,), -1, dtype=torch.int32)
@@ -630,24 +630,27 @@ class FusedMoE(torch.nn.Module):
         )

         # Matrix multiply.
-        final_hidden_states = self.quant_method.apply(
-            layer=self,
-            x=hidden_states,
-            topk_output=topk_output,
-            activation=self.activation,
-            apply_router_weight_on_input=self.apply_router_weight_on_input,
-            routed_scaling_factor=self.routed_scaling_factor,
-            **(
-                dict(
-                    tp_rank=self.moe_tp_rank,
-                    tp_size=self.moe_tp_size,
-                    ep_rank=self.moe_ep_rank,
-                    ep_size=self.moe_ep_size,
-                )
-                if self.quant_method.__class__.__name__ == "ModelOptNvFp4FusedMoEMethod"
-                else {}
-            ),
-        )
+        with use_symmetric_memory(get_tp_group()) as sm:
+            final_hidden_states = self.quant_method.apply(
+                layer=self,
+                x=hidden_states,
+                topk_output=topk_output,
+                activation=self.activation,
+                apply_router_weight_on_input=self.apply_router_weight_on_input,
+                routed_scaling_factor=self.routed_scaling_factor,
+                **(
+                    dict(
+                        tp_rank=self.moe_tp_rank,
+                        tp_size=self.moe_tp_size,
+                        ep_rank=self.moe_ep_rank,
+                        ep_size=self.moe_ep_size,
+                    )
+                    if self.quant_method.__class__.__name__
+                    == "ModelOptNvFp4FusedMoEMethod"
+                    else {}
+                ),
+            )
+            sm.tag(final_hidden_states)

         if self.reduce_results and (self.moe_tp_size > 1 or self.moe_ep_size > 1):
             final_hidden_states = tensor_model_parallel_all_reduce(final_hidden_states)