sglang 0.4.10.post2__py3-none-any.whl → 0.5.0rc0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (105)
  1. sglang/bench_one_batch.py +113 -17
  2. sglang/srt/configs/model_config.py +35 -0
  3. sglang/srt/conversation.py +9 -5
  4. sglang/srt/disaggregation/base/conn.py +5 -2
  5. sglang/srt/disaggregation/decode.py +6 -1
  6. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +3 -0
  7. sglang/srt/disaggregation/mooncake/conn.py +243 -135
  8. sglang/srt/disaggregation/prefill.py +2 -0
  9. sglang/srt/distributed/parallel_state.py +11 -9
  10. sglang/srt/entrypoints/context.py +244 -0
  11. sglang/srt/entrypoints/engine.py +4 -3
  12. sglang/srt/entrypoints/harmony_utils.py +370 -0
  13. sglang/srt/entrypoints/http_server.py +71 -0
  14. sglang/srt/entrypoints/openai/protocol.py +227 -1
  15. sglang/srt/entrypoints/openai/serving_chat.py +278 -42
  16. sglang/srt/entrypoints/openai/serving_responses.py +1273 -0
  17. sglang/srt/entrypoints/openai/tool_server.py +174 -0
  18. sglang/srt/entrypoints/tool.py +87 -0
  19. sglang/srt/eplb/expert_location.py +5 -1
  20. sglang/srt/function_call/harmony_tool_parser.py +130 -0
  21. sglang/srt/hf_transformers_utils.py +30 -3
  22. sglang/srt/jinja_template_utils.py +8 -1
  23. sglang/srt/layers/attention/aiter_backend.py +5 -8
  24. sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1700 -0
  25. sglang/srt/layers/attention/triton_backend.py +85 -14
  26. sglang/srt/layers/attention/triton_ops/decode_attention.py +17 -0
  27. sglang/srt/layers/attention/triton_ops/extend_attention.py +143 -98
  28. sglang/srt/layers/attention/trtllm_mha_backend.py +332 -0
  29. sglang/srt/layers/attention/vision.py +13 -5
  30. sglang/srt/layers/communicator.py +21 -4
  31. sglang/srt/layers/dp_attention.py +12 -0
  32. sglang/srt/layers/linear.py +2 -7
  33. sglang/srt/layers/moe/cutlass_moe.py +20 -6
  34. sglang/srt/layers/moe/ep_moe/layer.py +77 -73
  35. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +101 -12
  36. sglang/srt/layers/moe/fused_moe_triton/layer.py +416 -35
  37. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +188 -3
  38. sglang/srt/layers/moe/topk.py +12 -3
  39. sglang/srt/layers/moe/utils.py +16 -0
  40. sglang/srt/layers/quantization/__init__.py +22 -0
  41. sglang/srt/layers/quantization/fp4.py +557 -0
  42. sglang/srt/layers/quantization/fp8.py +3 -6
  43. sglang/srt/layers/quantization/fp8_utils.py +29 -0
  44. sglang/srt/layers/quantization/modelopt_quant.py +259 -64
  45. sglang/srt/layers/quantization/mxfp4.py +651 -0
  46. sglang/srt/layers/quantization/mxfp4_tensor.py +133 -0
  47. sglang/srt/layers/quantization/quark/__init__.py +0 -0
  48. sglang/srt/layers/quantization/quark/schemes/__init__.py +6 -0
  49. sglang/srt/layers/quantization/quark/schemes/quark_scheme.py +55 -0
  50. sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +118 -0
  51. sglang/srt/layers/quantization/quark/utils.py +107 -0
  52. sglang/srt/layers/quantization/unquant.py +60 -6
  53. sglang/srt/layers/quantization/w4afp8.py +1 -1
  54. sglang/srt/layers/rotary_embedding.py +225 -1
  55. sglang/srt/layers/utils.py +9 -0
  56. sglang/srt/layers/vocab_parallel_embedding.py +8 -3
  57. sglang/srt/lora/lora_manager.py +70 -14
  58. sglang/srt/lora/lora_registry.py +3 -2
  59. sglang/srt/lora/mem_pool.py +43 -5
  60. sglang/srt/managers/cache_controller.py +55 -30
  61. sglang/srt/managers/detokenizer_manager.py +1 -1
  62. sglang/srt/managers/io_struct.py +15 -3
  63. sglang/srt/managers/mm_utils.py +5 -11
  64. sglang/srt/managers/schedule_batch.py +28 -7
  65. sglang/srt/managers/scheduler.py +26 -12
  66. sglang/srt/managers/scheduler_output_processor_mixin.py +1 -2
  67. sglang/srt/managers/scheduler_recv_skipper.py +37 -0
  68. sglang/srt/managers/scheduler_update_weights_mixin.py +6 -0
  69. sglang/srt/managers/template_manager.py +35 -1
  70. sglang/srt/managers/tokenizer_manager.py +24 -6
  71. sglang/srt/managers/tp_worker.py +3 -0
  72. sglang/srt/managers/tp_worker_overlap_thread.py +3 -0
  73. sglang/srt/mem_cache/hiradix_cache.py +53 -5
  74. sglang/srt/mem_cache/memory_pool_host.py +1 -1
  75. sglang/srt/mem_cache/multimodal_cache.py +33 -13
  76. sglang/srt/mem_cache/storage/hf3fs/client_hf3fs.py +2 -2
  77. sglang/srt/model_executor/cuda_graph_runner.py +7 -6
  78. sglang/srt/model_executor/forward_batch_info.py +35 -14
  79. sglang/srt/model_executor/model_runner.py +19 -2
  80. sglang/srt/model_loader/weight_utils.py +10 -0
  81. sglang/srt/models/bailing_moe.py +425 -0
  82. sglang/srt/models/deepseek_v2.py +72 -33
  83. sglang/srt/models/ernie4.py +426 -0
  84. sglang/srt/models/ernie4_eagle.py +203 -0
  85. sglang/srt/models/gemma3n_mm.py +39 -0
  86. sglang/srt/models/glm4_moe.py +24 -12
  87. sglang/srt/models/gpt_oss.py +1134 -0
  88. sglang/srt/models/qwen2.py +6 -0
  89. sglang/srt/models/qwen2_moe.py +6 -0
  90. sglang/srt/models/qwen3_moe.py +32 -6
  91. sglang/srt/models/step3_vl.py +9 -0
  92. sglang/srt/models/transformers.py +2 -5
  93. sglang/srt/multimodal/processors/step3_vl.py +3 -1
  94. sglang/srt/reasoning_parser.py +18 -39
  95. sglang/srt/server_args.py +142 -7
  96. sglang/srt/two_batch_overlap.py +157 -5
  97. sglang/srt/utils.py +38 -2
  98. sglang/test/runners.py +2 -2
  99. sglang/test/test_utils.py +1 -1
  100. sglang/version.py +1 -1
  101. {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc0.dist-info}/METADATA +16 -14
  102. {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc0.dist-info}/RECORD +105 -84
  103. {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc0.dist-info}/WHEEL +0 -0
  104. {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc0.dist-info}/licenses/LICENSE +0 -0
  105. {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc0.dist-info}/top_level.txt +0 -0
sglang/srt/layers/attention/trtllm_mha_backend.py
@@ -0,0 +1,332 @@
+ from __future__ import annotations
+
+ """
+ Support attention backend for TRTLLM MHA kernels from flashinfer.
+ The kernel supports sm100 only, with sliding window and attention sink features.
+ """
+
+ from dataclasses import dataclass
+ from typing import TYPE_CHECKING, Optional
+
+ import torch
+
+ from sglang.srt.layers.attention.flashinfer_backend import FlashInferAttnBackend
+ from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode
+ from sglang.srt.utils import is_flashinfer_available
+
+ if is_flashinfer_available():
+     import flashinfer
+
+ if TYPE_CHECKING:
+     from sglang.srt.layers.radix_attention import RadixAttention
+     from sglang.srt.model_executor.model_runner import ModelRunner
+     from sglang.srt.speculative.spec_info import SpecInfo
+
+ # Constants
+ DEFAULT_WORKSPACE_SIZE_MB = 128  # Memory workspace size in MB
+
+ # Reuse this workspace buffer across all TRTLLM MHA wrappers
+ global_workspace_buffer = None
+
+
+ @dataclass
+ class TRTLLMMHAMetadata:
+     # Sequence lengths for the forward batch
+     cache_seqlens_int32: torch.Tensor = None
+     # Maximum sequence length for query
+     max_seq_len_q: int = 1
+     # Maximum sequence length for key
+     max_seq_len_k: int = 0
+     # Cumulative sequence lengths for query
+     cu_seqlens_q: torch.Tensor = None
+     # Cumulative sequence lengths for key
+     cu_seqlens_k: torch.Tensor = None
+     # Page table, the index of KV Cache Tables/Blocks
+     page_table: torch.Tensor = None
+
+
+ class TRTLLMHAAttnBackend(FlashInferAttnBackend):
+     """TRTLLM MHA attention kernel from flashinfer."""
+
+     def __init__(
+         self,
+         model_runner: ModelRunner,
+         skip_prefill: bool = False,
+         kv_indptr_buf: Optional[torch.Tensor] = None,
+         q_indptr_decode_buf: Optional[torch.Tensor] = None,
+     ):
+         super().__init__(model_runner, skip_prefill, kv_indptr_buf, q_indptr_decode_buf)
+
+         config = model_runner.model_config
+
+         # MHA-specific dimensions
+         self.max_context_len = model_runner.model_config.context_len
+         self.hidden_size = config.hidden_size
+
+         # Runtime parameters
+         self.data_type = model_runner.kv_cache_dtype
+         self.q_data_type = model_runner.dtype
+         self.page_size = model_runner.page_size
+         self.req_to_token = model_runner.req_to_token_pool.req_to_token
+         self.device = model_runner.device
+
+         # Workspace allocation
+         self.workspace_size = DEFAULT_WORKSPACE_SIZE_MB * 1024 * 1024
+         # Allocate buffers
+         global global_workspace_buffer
+         if global_workspace_buffer is None:
+             global_workspace_buffer = torch.empty(
+                 self.workspace_size,
+                 dtype=torch.uint8,
+                 device=model_runner.device,
+             )
+         self.workspace_buffer = global_workspace_buffer
+
+         # CUDA graph state
+         self.decode_cuda_graph_metadata = {}
+
+         # Forward metadata
+         self.forward_metadata: Optional[TRTLLMMHAMetadata] = None
+
+     def init_cuda_graph_state(
+         self,
+         max_bs: int,
+         max_num_tokens: int,
+         kv_indices_buf: Optional[torch.Tensor] = None,
+     ):
+         """Initialize CUDA graph state for TRTLLM MHA."""
+         self.decode_cuda_graph_metadata = {
+             "cache_seqlens": torch.zeros(max_bs, dtype=torch.int32, device=self.device),
+             "page_table": torch.zeros(
+                 max_bs,
+                 (self.max_context_len + self.page_size - 1) // self.page_size,
+                 dtype=torch.int32,
+                 device=self.device,
+             ),
+             "strided_indices": torch.arange(
+                 0, self.max_context_len, self.page_size, device=self.device
+             ),
+         }
+
+     def init_forward_metadata_capture_cuda_graph(
+         self,
+         bs: int,
+         num_tokens: int,
+         req_pool_indices: torch.Tensor,
+         seq_lens: torch.Tensor,
+         encoder_lens: Optional[torch.Tensor],
+         forward_mode: ForwardMode,
+         spec_info: Optional[SpecInfo],
+     ):
+         """Initialize metadata for CUDA graph capture."""
+         metadata = TRTLLMMHAMetadata()
+
+         # Get sequence information
+         metadata.cache_seqlens_int32 = seq_lens[:bs].to(torch.int32)
+
+         # Precompute maximum sequence length
+         metadata.max_seq_len_k = self.max_context_len
+
+         # Precompute page table
+         metadata.page_table = self.decode_cuda_graph_metadata["page_table"][:bs, :]
+         self.decode_cuda_graph_metadata[bs] = metadata
+         self.forward_metadata = metadata
+
+     def init_forward_metadata_replay_cuda_graph(
+         self,
+         bs: int,
+         req_pool_indices: torch.Tensor,
+         seq_lens: torch.Tensor,
+         seq_lens_sum: int,
+         encoder_lens: Optional[torch.Tensor],
+         forward_mode: ForwardMode,
+         spec_info: Optional[SpecInfo],
+         seq_lens_cpu: Optional[torch.Tensor],
+     ):
+         """Replay CUDA graph with new inputs."""
+         seq_lens = seq_lens[:bs]
+         seq_lens_cpu = seq_lens_cpu[:bs]
+         req_pool_indices = req_pool_indices[:bs]
+         device = seq_lens.device
+         metadata = None
+
+         # Normal Decode
+         metadata = self.decode_cuda_graph_metadata[bs]
+         max_len = seq_lens_cpu.max().item()
+         max_seq_pages = (max_len + self.page_size - 1) // self.page_size
+         metadata.max_seq_len_k = self.max_context_len
+
+         metadata.cache_seqlens_int32.copy_(seq_lens)
+         page_indices = self.req_to_token[
+             req_pool_indices[:, None],
+             self.decode_cuda_graph_metadata["strided_indices"][:max_seq_pages][None, :],
+         ]
+         metadata.page_table[:, :max_seq_pages].copy_(page_indices // self.page_size)
+         self.forward_metadata = metadata
+
+     def get_cuda_graph_seq_len_fill_value(self) -> int:
+         """Get the fill value for sequence lengths in CUDA graph."""
+         return 1
+
+     def init_forward_metadata(self, forward_batch: ForwardBatch):
+         """Initialize the metadata for a forward pass."""
+
+         metadata = TRTLLMMHAMetadata()
+         seqlens_in_batch = forward_batch.seq_lens
+         batch_size = forward_batch.batch_size
+         device = seqlens_in_batch.device
+
+         if forward_batch.forward_mode.is_decode_or_idle():
+             # Normal Decode
+             metadata.cache_seqlens_int32 = seqlens_in_batch.to(torch.int32)
+             metadata.max_seq_len_k = forward_batch.seq_lens_cpu.max().item()
+             metadata.page_table = forward_batch.req_to_token_pool.req_to_token[
+                 forward_batch.req_pool_indices, : metadata.max_seq_len_k
+             ]
+         else:
+             metadata.cache_seqlens_int32 = seqlens_in_batch.to(torch.int32)
+             metadata.max_seq_len_k = forward_batch.seq_lens_cpu.max().item()
+             metadata.cu_seqlens_k = torch.nn.functional.pad(
+                 torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0)
+             )
+             metadata.page_table = forward_batch.req_to_token_pool.req_to_token[
+                 forward_batch.req_pool_indices, : metadata.max_seq_len_k
+             ]
+
+             if any(forward_batch.extend_prefix_lens_cpu):
+                 extend_seq_lens = forward_batch.extend_seq_lens
+                 metadata.max_seq_len_q = max(forward_batch.extend_seq_lens_cpu)
+                 metadata.cu_seqlens_q = torch.nn.functional.pad(
+                     torch.cumsum(extend_seq_lens, dim=0, dtype=torch.int32), (1, 0)
+                 )
+             else:
+                 metadata.max_seq_len_q = metadata.max_seq_len_k
+                 metadata.cu_seqlens_q = metadata.cu_seqlens_k
+
+         # Convert the page table to a strided format
+         if self.page_size > 1:
+             self.strided_indices = torch.arange(
+                 0, metadata.page_table.shape[1], self.page_size, device=self.device
+             )
+             metadata.page_table = (
+                 metadata.page_table[:, self.strided_indices] // self.page_size
+             )
+
+         self.forward_metadata = metadata
+
+     def forward_decode(
+         self,
+         q: torch.Tensor,
+         k: torch.Tensor,
+         v: torch.Tensor,
+         layer: RadixAttention,
+         forward_batch: ForwardBatch,
+         save_kv_cache: bool = True,
+         **kwargs,
+     ) -> torch.Tensor:
+         """Run forward for decode using TRTLLM MHA kernel."""
+         cache_loc = forward_batch.out_cache_loc
+         if save_kv_cache and k is not None:
+             forward_batch.token_to_kv_pool.set_kv_buffer(
+                 layer, cache_loc, k, v, layer.k_scale, layer.v_scale
+             )
+
+         q = q.contiguous().view(-1, layer.tp_q_head_num, layer.head_dim)
+         k_cache, v_cache = forward_batch.token_to_kv_pool.get_kv_buffer(layer.layer_id)
+         # shape conversion:
+         # [num_pages, page_size, num_kv_heads, head_dim] -> [num_pages, num_kv_heads, page_size, head_dim]
+         k_cache = k_cache.view(
+             -1, self.page_size, layer.tp_k_head_num, layer.head_dim
+         ).permute(0, 2, 1, 3)
+         v_cache = v_cache.view(
+             -1, self.page_size, layer.tp_v_head_num, layer.head_dim
+         ).permute(0, 2, 1, 3)
+         kv_cache = (k_cache, v_cache)
+
+         # TODO: add support for quantization
+         q_scale = 1.0
+         k_scale = (
+             layer.k_scale_float
+             if getattr(layer, "k_scale_float", None) is not None
+             else 1.0
+         )
+         bmm1_scale = q_scale * k_scale * layer.scaling
+         bmm2_scale = 1.0
+         # sink: additional value per head in the denominator of the softmax.
+         attention_sink = kwargs.get("sinks", None)
+
+         # Call TRT-LLM kernel
+         # raw_out: like q, [bs, acc_q_len, num_q_heads, head_dim] but with output dtype
+         o = flashinfer.decode.trtllm_batch_decode_with_kv_cache(
+             query=q,
+             kv_cache=kv_cache,
+             workspace_buffer=self.workspace_buffer,
+             block_tables=self.forward_metadata.page_table,
+             seq_lens=self.forward_metadata.cache_seqlens_int32,
+             max_seq_len=self.forward_metadata.max_seq_len_k,
+             bmm1_scale=bmm1_scale,
+             bmm2_scale=bmm2_scale,
+             window_left=layer.sliding_window_size,
+             # TODO: add attention_sink operation or nvfp4 scale factor if needed
+             sinks=attention_sink,
+         )
+
+         return o.view(-1, layer.tp_q_head_num * layer.head_dim)
+
+     def forward_extend(
+         self,
+         q: torch.Tensor,
+         k: torch.Tensor,
+         v: torch.Tensor,
+         layer: RadixAttention,
+         forward_batch: ForwardBatch,
+         save_kv_cache=True,
+         **kwargs,
+     ):
+         cache_loc = forward_batch.out_cache_loc
+         if save_kv_cache and k is not None:
+             forward_batch.token_to_kv_pool.set_kv_buffer(
+                 layer, cache_loc, k, v, layer.k_scale, layer.v_scale
+             )
+         q = q.contiguous().view(-1, layer.tp_q_head_num, layer.head_dim)
+         # [num_pages, page_size, num_kv_heads, head_dim] -> [num_pages, num_kv_heads, page_size, head_dim]
+         k_cache, v_cache = forward_batch.token_to_kv_pool.get_kv_buffer(layer.layer_id)
+         k_cache = k_cache.view(
+             -1, self.page_size, layer.tp_k_head_num, layer.head_dim
+         ).permute(0, 2, 1, 3)
+         v_cache = v_cache.view(
+             -1, self.page_size, layer.tp_v_head_num, layer.head_dim
+         ).permute(0, 2, 1, 3)
+         kv_cache = (k_cache, v_cache)
+
+         # sink: additional value per head in the denominator of the softmax.
+         attention_sink = kwargs.get("sinks", None)
+         # TODO: add support for quantization
+         q_scale = 1.0
+         k_scale = (
+             layer.k_scale_float
+             if getattr(layer, "k_scale_float", None) is not None
+             else 1.0
+         )
+         bmm1_scale = q_scale * k_scale * layer.scaling
+         bmm2_scale = 1.0
+
+         o = flashinfer.prefill.trtllm_batch_context_with_kv_cache(
+             query=q,
+             kv_cache=kv_cache,
+             workspace_buffer=self.workspace_buffer,
+             block_tables=self.forward_metadata.page_table,
+             seq_lens=self.forward_metadata.cache_seqlens_int32,
+             max_q_len=self.forward_metadata.max_seq_len_q,
+             max_kv_len=self.forward_metadata.max_seq_len_k,
+             bmm1_scale=bmm1_scale,
+             bmm2_scale=bmm2_scale,
+             batch_size=forward_batch.batch_size,
+             cum_seq_lens_q=self.forward_metadata.cu_seqlens_q,
+             cum_seq_lens_kv=self.forward_metadata.cu_seqlens_k,
+             window_left=layer.sliding_window_size,
+             # TODO: add attention_sink operation or nvfp4 scale factor if needed
+             sinks=attention_sink,
+         )
+
+         return o.view(-1, layer.tp_q_head_num * layer.head_dim)
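
Note: a minimal standalone sketch (not part of the package) of the page-table arithmetic used by init_forward_metadata above. When page_size > 1, the per-token req_to_token mapping is sampled at every page_size-th slot and divided by page_size to yield per-page block indices; the shapes and values below are illustrative.

import torch

# Illustrative shapes: 2 requests, page_size 4, up to 16 token slots per request.
page_size = 4
req_to_token = torch.arange(2 * 16).view(2, 16)  # token slot -> KV-cache location

strided_indices = torch.arange(0, req_to_token.shape[1], page_size)  # [0, 4, 8, 12]
page_table = req_to_token[:, strided_indices] // page_size           # per-page block ids

print(page_table)
# tensor([[0, 1, 2, 3],
#         [4, 5, 6, 7]])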
sglang/srt/layers/attention/vision.py
@@ -11,6 +11,7 @@ import torch.nn as nn
  import torch.nn.functional as F
  from einops import rearrange
 
+ from sglang.srt.layers.dp_attention import get_attention_tp_rank, get_attention_tp_size
  from sglang.srt.utils import is_cuda, print_info_once
 
  _is_cuda = is_cuda()
@@ -365,19 +366,20 @@ class VisionAttention(nn.Module):
          **kwargs,
      ):
          super().__init__()
-         world_size = parallel_state.get_tensor_model_parallel_world_size()
-         self.tp_size = world_size
-         self.tp_rank = parallel_state.get_tensor_model_parallel_rank()
+         attn_tp_rank = get_attention_tp_rank()
+         attn_tp_size = get_attention_tp_size()
+         self.tp_size = attn_tp_size
+         self.tp_rank = attn_tp_rank
          self.dropout = dropout
          self.head_size = embed_dim // num_heads
          self.hidden_size_per_attention_head = dist_utils.divide(
              projection_size, num_heads
          )
          self.num_attention_heads_per_partition = dist_utils.divide(
-             num_dummy_heads + num_heads, world_size
+             num_dummy_heads + num_heads, self.tp_size
          )
          self.num_attention_kv_heads_per_partition = dist_utils.divide(
-             num_dummy_heads + num_heads, world_size
+             num_dummy_heads + num_heads, self.tp_size
          )
 
          self.q_size = self.num_attention_heads_per_partition * self.head_size
@@ -427,6 +429,8 @@ class VisionAttention(nn.Module):
                  total_num_kv_heads=num_dummy_heads + num_heads,
                  bias=qkv_bias,
                  quant_config=quant_config,
+                 tp_rank=self.tp_rank,
+                 tp_size=self.tp_size,
                  prefix=add_prefix("qkv_proj", prefix),
              )
          else:
@@ -435,6 +439,8 @@ class VisionAttention(nn.Module):
                  output_size=3 * self.dummy_dim,
                  bias=qkv_bias,
                  quant_config=quant_config,
+                 tp_rank=self.tp_rank,
+                 tp_size=self.tp_size,
                  prefix=add_prefix("qkv_proj", prefix),
              )
          self.proj = RowParallelLinear(
@@ -442,6 +448,8 @@ class VisionAttention(nn.Module):
              output_size=embed_dim,
              bias=proj_bias,
              quant_config=quant_config,
+             tp_rank=self.tp_rank,
+             tp_size=self.tp_size,
              prefix=add_prefix("proj", prefix),
          )
 
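
Note: the vision.py change above partitions the vision attention heads over the attention tensor-parallel group rather than the full TP world, and passes that rank/size pair down to the projection layers. The per-rank sizes follow from an exact integer division; a small standalone sketch (the divide helper below is an illustrative stand-in for dist_utils.divide, not the package's code):

def divide(numerator: int, denominator: int) -> int:
    # Megatron-style helper: the split must be exact.
    assert numerator % denominator == 0, f"{numerator} is not divisible by {denominator}"
    return numerator // denominator

embed_dim, num_heads, num_dummy_heads, attn_tp_size = 1024, 16, 0, 4
head_size = embed_dim // num_heads                                       # 64
heads_per_partition = divide(num_dummy_heads + num_heads, attn_tp_size)  # 4 heads per rank
q_size = heads_per_partition * head_size                                 # 256 output columns of qkv_proj per rank
print(head_size, heads_per_partition, q_size)  # 64 4 256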
sglang/srt/layers/communicator.py
@@ -27,6 +27,7 @@ from sglang.srt.layers.dp_attention import (
      attn_tp_all_gather_into_tensor,
      attn_tp_reduce_scatter_tensor,
      dp_gather_partial,
+     dp_reduce_scatter_tensor,
      dp_scatter,
      get_attention_dp_size,
      get_attention_tp_rank,
@@ -149,10 +150,13 @@ class LayerCommunicator:
          layer_scatter_modes: LayerScatterModes,
          input_layernorm: torch.nn.Module,
          post_attention_layernorm: torch.nn.Module,
+         # Reduce scatter requires skipping all-reduce in model code after MoE/MLP, so only enable for models which have that implemented. Remove flag once done for all models that use LayerCommunicator.
+         allow_reduce_scatter: bool = False,
      ):
          self.layer_scatter_modes = layer_scatter_modes
          self.input_layernorm = input_layernorm
          self.post_attention_layernorm = post_attention_layernorm
+         self.allow_reduce_scatter = allow_reduce_scatter
 
          self._context = CommunicateContext.init_new()
          self._communicate_simple_fn = CommunicateSimpleFn.get_fn(
@@ -239,6 +243,15 @@ class LayerCommunicator:
              residual=residual,
              forward_batch=forward_batch,
              context=self._context,
+             allow_reduce_scatter=self.allow_reduce_scatter,
+         )
+ 
+     def should_use_reduce_scatter(self, forward_batch: ForwardBatch):
+         return (
+             self.allow_reduce_scatter
+             and self._communicate_summable_tensor_pair_fn
+             is CommunicateSummableTensorPairFn._scatter_hidden_states
+             and forward_batch.dp_padding_mode.is_max_len()
          )
 
 
@@ -524,6 +537,7 @@ class CommunicateSummableTensorPairFn:
          residual: torch.Tensor,
          forward_batch: ForwardBatch,
          context: CommunicateContext,
+         **kwargs,
      ):
          return hidden_states, residual
 
@@ -533,15 +547,17 @@ class CommunicateSummableTensorPairFn:
          residual: torch.Tensor,
          forward_batch: ForwardBatch,
          context: CommunicateContext,
+         allow_reduce_scatter: bool = False,
      ):
-         # TODO(ch-wan): use reduce-scatter in MLP to avoid this scatter
-         # important: forward_batch.gathered_buffer is used both after scatter and after gather.
-         # be careful about this!
          hidden_states, global_hidden_states = (
              forward_batch.gathered_buffer[: forward_batch.input_ids.shape[0]],
              hidden_states,
          )
-         dp_scatter(hidden_states, global_hidden_states, forward_batch)
+         if allow_reduce_scatter and forward_batch.dp_padding_mode.is_max_len():
+             # When using padding, all_reduce is skipped after MLP and MOE and reduce scatter is used here instead.
+             dp_reduce_scatter_tensor(hidden_states, global_hidden_states)
+         else:
+             dp_scatter(hidden_states, global_hidden_states, forward_batch)
          return hidden_states, residual
 
      @staticmethod
@@ -550,6 +566,7 @@ class CommunicateSummableTensorPairFn:
          residual: torch.Tensor,
          forward_batch: ForwardBatch,
          context: CommunicateContext,
+         **kwargs,
      ):
          hidden_states += residual
          residual = None
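
Note: the communicator.py change rests on a standard identity: a reduce-scatter equals an all-reduce followed by keeping only the local chunk. When every DP rank pads to the same length (dp_padding_mode.is_max_len()), the all-reduce after MLP/MoE can therefore be skipped and the reduction folded into the scatter. A single-process sketch of that identity with plain tensors, no process groups involved:

import torch

world_size = 4
torch.manual_seed(0)
# One partial tensor per rank, e.g. partial MLP outputs before any reduction.
partials = [torch.randn(8, 16) for _ in range(world_size)]

# all-reduce then scatter: every rank sums everything, then keeps its own chunk.
summed = torch.stack(partials).sum(dim=0)
scattered = summed.tensor_split(world_size, dim=0)

# reduce-scatter: each rank only ever materializes the sum of its own chunk.
reduce_scattered = [
    torch.stack([p.tensor_split(world_size, dim=0)[r] for p in partials]).sum(dim=0)
    for r in range(world_size)
]

for r in range(world_size):
    assert torch.allclose(scattered[r], reduce_scattered[r], atol=1e-6)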
sglang/srt/layers/dp_attention.py
@@ -12,6 +12,7 @@ import triton.language as tl
 
  from sglang.srt.distributed import (
      GroupCoordinator,
+     get_tensor_model_parallel_rank,
      get_tensor_model_parallel_world_size,
      get_tp_group,
      tensor_model_parallel_all_reduce,
@@ -355,6 +356,17 @@ def dp_scatter(
      )
 
 
+ def dp_reduce_scatter_tensor(output: torch.Tensor, input: torch.Tensor):
+     if get_tensor_model_parallel_world_size() == get_attention_dp_size():
+         get_tp_group().reduce_scatter_tensor(output, input)
+     else:
+         scattered_local_tokens = input.tensor_split(
+             get_tensor_model_parallel_world_size()
+         )[get_tensor_model_parallel_rank()]
+         get_tp_group().reduce_scatter_tensor(scattered_local_tokens, input)
+         get_attention_tp_group().all_gather_into_tensor(output, scattered_local_tokens)
+ 
+ 
  def attn_tp_reduce_scatter_tensor(output: torch.Tensor, input: torch.Tensor):
      return get_attention_tp_group().reduce_scatter_tensor(output, input)
 
sglang/srt/layers/linear.py
@@ -1191,11 +1191,6 @@ class RowParallelLinear(LinearBase):
                  else self.weight_loader
              ),
          )
-         if not reduce_results and (bias and not skip_bias_add):
-             raise ValueError(
-                 "When not reduce the results, adding bias to the "
-                 "results can lead to incorrect results"
-             )
 
          if bias:
              self.bias = Parameter(torch.empty(self.output_size, dtype=params_dtype))
@@ -1282,7 +1277,7 @@ class RowParallelLinear(LinearBase):
          # It does not support additional parameters.
          param.load_row_parallel_weight(loaded_weight)
 
-     def forward(self, input_, can_fuse_mlp_allreduce=False):
+     def forward(self, input_, skip_all_reduce=False):
          if self.input_is_parallel:
              input_parallel = input_
          else:
@@ -1299,7 +1294,7 @@ class RowParallelLinear(LinearBase):
          with use_symmetric_memory(parallel_state.get_tp_group()) as sm:
              output_parallel = self.quant_method.apply(self, input_parallel, bias=bias_)
              sm.tag(output_parallel)
-         if self.reduce_results and self.tp_size > 1 and not can_fuse_mlp_allreduce:
+         if self.reduce_results and self.tp_size > 1 and not skip_all_reduce:
              output = tensor_model_parallel_all_reduce(output_parallel)
          else:
              output = output_parallel
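
Note: in RowParallelLinear the weight is split along the input dimension, so each TP rank produces a partial output that must be summed across ranks; the renamed skip_all_reduce flag lets a caller omit that all-reduce when it performs its own reduction (for example the reduce-scatter path above). A single-process sketch of why the partial products sum to the full result:

import torch

torch.manual_seed(0)
x = torch.randn(4, 8)          # [tokens, input_size]
w = torch.randn(8, 16)         # [input_size, output_size]
x0, x1 = x.chunk(2, dim=1)     # each "rank" holds half of the input features
w0, w1 = w.chunk(2, dim=0)     # and the matching rows of the weight
partial0, partial1 = x0 @ w0, x1 @ w1                          # per-rank partial outputs
assert torch.allclose(partial0 + partial1, x @ w, atol=1e-5)   # the sum is what all-reduce would produce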
sglang/srt/layers/moe/cutlass_moe.py
@@ -9,6 +9,7 @@ from typing import Any, Callable, Dict, List, Optional, Tuple
  import torch
 
  from sglang.srt.layers.moe.cutlass_moe_params import CutlassMoEParams
+ from sglang.srt.layers.utils import is_sm100_supported
  from sglang.srt.utils import is_cuda
 
  _is_cuda = is_cuda()
@@ -123,6 +124,7 @@ def cutlass_fused_experts_fp8(
 
      if is_cuda:
          from sglang.srt.layers.quantization.fp8_kernel import (
+             per_token_group_quant_fp8_hopper_moe_mn_major,
              sglang_per_token_group_quant_fp8,
          )
 
@@ -133,9 +135,7 @@ def cutlass_fused_experts_fp8(
      n = w2_q.size(1)
 
      topk = topk_ids.size(1)
- 
-     a_q, a1_scale = sglang_per_token_group_quant_fp8(a, 128)
-     device = a_q.device
+     device = a.device
 
      a_map = torch.empty((topk_ids.numel()), dtype=torch.int32, device=device)
      c_map = torch.empty((topk_ids.numel()), dtype=torch.int32, device=device)
@@ -152,8 +152,16 @@
          k,
      )
 
-     rep_a_q = shuffle_rows(a_q, a_map, (m * topk, k))
-     rep_a1_scales = shuffle_rows(a1_scale, a_map, (m * topk, int(k / 128)))
+     if is_sm100_supported():
+         a_q, a1_scale = sglang_per_token_group_quant_fp8(a, 128)
+         rep_a_q = shuffle_rows(a_q, a_map, (m * topk, k))
+         rep_a1_scales = shuffle_rows(a1_scale, a_map, (m * topk, int(k / 128)))
+     else:
+         rep_a = shuffle_rows(a, a_map, (m * topk, k))
+         rep_a_q, rep_a1_scales = per_token_group_quant_fp8_hopper_moe_mn_major(
+             rep_a, expert_offsets, problem_sizes1, 128
+         )
+         w1_scale = w1_scale.contiguous()
 
      c1 = torch.empty((m * topk, n * 2), device=device, dtype=out_dtype)
      c2 = torch.empty((m * topk, k), device=device, dtype=out_dtype)
@@ -185,7 +193,13 @@
      intermediate = torch.empty((m * topk, n), device=device, dtype=out_dtype)
      silu_and_mul(c1, intermediate)
 
-     intemediate_q, a2_scale = sglang_per_token_group_quant_fp8(intermediate, 128)
+     if is_sm100_supported():
+         intemediate_q, a2_scale = sglang_per_token_group_quant_fp8(intermediate, 128)
+     else:
+         intemediate_q, a2_scale = per_token_group_quant_fp8_hopper_moe_mn_major(
+             intermediate, expert_offsets, problem_sizes2, 128
+         )
+         w2_scale = w2_scale.contiguous()
 
      fp8_blockwise_scaled_grouped_mm(
          c2,
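
Note: both quantization helpers used in the cutlass_moe.py diff produce one FP8 scale per 128-element group along the hidden dimension, which is why the activation scales have shape (m * topk, k / 128). A rough reference of that layout (an assumption-level sketch, not the package's kernel; the Hopper MN-major variant additionally reorders the scales for the grouped GEMM):

import torch

def per_token_group_quant_fp8_reference(a: torch.Tensor, group_size: int = 128):
    """Quantize [m, k] activations to fp8 with one scale per (token, 128-wide group)."""
    m, k = a.shape
    assert k % group_size == 0
    groups = a.float().view(m, k // group_size, group_size)
    amax = groups.abs().amax(dim=-1, keepdim=True).clamp(min=1e-12)
    scales = amax / 448.0  # 448 is the largest finite value of float8_e4m3fn
    q = (groups / scales).clamp(-448.0, 448.0).to(torch.float8_e4m3fn)
    return q.view(m, k), scales.view(m, k // group_size)

a = torch.randn(4, 256)
a_q, a_scale = per_token_group_quant_fp8_reference(a, 128)
print(a_q.shape, a_scale.shape)  # torch.Size([4, 256]) torch.Size([4, 2])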