sglang 0.5.0rc2__py3-none-any.whl → 0.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +0 -6
- sglang/bench_one_batch_server.py +7 -2
- sglang/bench_serving.py +3 -3
- sglang/eval/llama3_eval.py +0 -1
- sglang/srt/configs/model_config.py +24 -9
- sglang/srt/configs/update_config.py +40 -5
- sglang/srt/constrained/xgrammar_backend.py +23 -11
- sglang/srt/conversation.py +2 -15
- sglang/srt/disaggregation/ascend/conn.py +1 -3
- sglang/srt/disaggregation/base/conn.py +1 -0
- sglang/srt/disaggregation/decode.py +1 -1
- sglang/srt/disaggregation/launch_lb.py +7 -1
- sglang/srt/disaggregation/mini_lb.py +11 -5
- sglang/srt/disaggregation/mooncake/conn.py +141 -47
- sglang/srt/disaggregation/prefill.py +261 -5
- sglang/srt/disaggregation/utils.py +2 -1
- sglang/srt/distributed/device_communicators/custom_all_reduce.py +1 -1
- sglang/srt/distributed/device_communicators/pynccl.py +68 -18
- sglang/srt/distributed/device_communicators/pynccl_wrapper.py +52 -0
- sglang/srt/distributed/naive_distributed.py +112 -0
- sglang/srt/distributed/parallel_state.py +90 -4
- sglang/srt/entrypoints/context.py +20 -1
- sglang/srt/entrypoints/engine.py +27 -2
- sglang/srt/entrypoints/http_server.py +12 -0
- sglang/srt/entrypoints/openai/protocol.py +2 -2
- sglang/srt/entrypoints/openai/serving_chat.py +22 -6
- sglang/srt/entrypoints/openai/serving_completions.py +9 -1
- sglang/srt/entrypoints/openai/serving_responses.py +2 -2
- sglang/srt/eplb/expert_distribution.py +2 -3
- sglang/srt/function_call/deepseekv3_detector.py +1 -1
- sglang/srt/hf_transformers_utils.py +24 -0
- sglang/srt/host_shared_memory.py +83 -0
- sglang/srt/layers/attention/ascend_backend.py +132 -22
- sglang/srt/layers/attention/flashattention_backend.py +24 -17
- sglang/srt/layers/attention/flashinfer_backend.py +11 -3
- sglang/srt/layers/attention/flashinfer_mla_backend.py +226 -76
- sglang/srt/layers/attention/triton_backend.py +85 -46
- sglang/srt/layers/attention/triton_ops/decode_attention.py +33 -2
- sglang/srt/layers/attention/triton_ops/extend_attention.py +32 -2
- sglang/srt/layers/attention/trtllm_mha_backend.py +390 -30
- sglang/srt/layers/attention/trtllm_mla_backend.py +39 -16
- sglang/srt/layers/attention/utils.py +94 -15
- sglang/srt/layers/attention/vision.py +40 -13
- sglang/srt/layers/attention/vision_utils.py +65 -0
- sglang/srt/layers/communicator.py +51 -3
- sglang/srt/layers/dp_attention.py +23 -4
- sglang/srt/layers/elementwise.py +94 -0
- sglang/srt/layers/flashinfer_comm_fusion.py +29 -1
- sglang/srt/layers/layernorm.py +8 -1
- sglang/srt/layers/linear.py +24 -0
- sglang/srt/layers/logits_processor.py +5 -1
- sglang/srt/layers/moe/__init__.py +31 -0
- sglang/srt/layers/moe/ep_moe/layer.py +37 -33
- sglang/srt/layers/moe/fused_moe_native.py +14 -25
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=129,N=352,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Max-Q_Workstation_Edition,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=129,N=704,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=161,N=384,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Max-Q_Workstation_Edition,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +69 -76
- sglang/srt/layers/moe/fused_moe_triton/layer.py +66 -123
- sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +20 -18
- sglang/srt/layers/moe/moe_runner/__init__.py +3 -0
- sglang/srt/layers/moe/moe_runner/base.py +13 -0
- sglang/srt/layers/moe/rocm_moe_utils.py +141 -0
- sglang/srt/layers/moe/router.py +15 -9
- sglang/srt/layers/moe/token_dispatcher/__init__.py +6 -0
- sglang/srt/layers/moe/token_dispatcher/base_dispatcher.py +55 -14
- sglang/srt/layers/moe/token_dispatcher/deepep.py +11 -21
- sglang/srt/layers/moe/token_dispatcher/standard.py +1 -1
- sglang/srt/layers/moe/topk.py +167 -83
- sglang/srt/layers/moe/utils.py +159 -18
- sglang/srt/layers/quantization/__init__.py +13 -14
- sglang/srt/layers/quantization/awq.py +7 -7
- sglang/srt/layers/quantization/base_config.py +2 -6
- sglang/srt/layers/quantization/blockwise_int8.py +4 -12
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +72 -28
- sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +2 -1
- sglang/srt/layers/quantization/fp8.py +127 -119
- sglang/srt/layers/quantization/fp8_kernel.py +195 -24
- sglang/srt/layers/quantization/fp8_utils.py +34 -9
- sglang/srt/layers/quantization/fpgemm_fp8.py +203 -0
- sglang/srt/layers/quantization/gptq.py +5 -4
- sglang/srt/layers/quantization/marlin_utils.py +11 -3
- sglang/srt/layers/quantization/marlin_utils_fp8.py +352 -0
- sglang/srt/layers/quantization/modelopt_quant.py +165 -68
- sglang/srt/layers/quantization/moe_wna16.py +10 -15
- sglang/srt/layers/quantization/mxfp4.py +206 -37
- sglang/srt/layers/quantization/quark/quark.py +390 -0
- sglang/srt/layers/quantization/quark/quark_moe.py +197 -0
- sglang/srt/layers/quantization/unquant.py +34 -70
- sglang/srt/layers/quantization/utils.py +25 -0
- sglang/srt/layers/quantization/w4afp8.py +7 -8
- sglang/srt/layers/quantization/w8a8_fp8.py +5 -13
- sglang/srt/layers/quantization/w8a8_int8.py +5 -13
- sglang/srt/layers/radix_attention.py +6 -0
- sglang/srt/layers/rotary_embedding.py +1 -0
- sglang/srt/lora/lora_manager.py +21 -22
- sglang/srt/lora/lora_registry.py +3 -3
- sglang/srt/lora/mem_pool.py +26 -24
- sglang/srt/lora/utils.py +10 -12
- sglang/srt/managers/cache_controller.py +76 -18
- sglang/srt/managers/detokenizer_manager.py +10 -2
- sglang/srt/managers/io_struct.py +9 -0
- sglang/srt/managers/mm_utils.py +1 -1
- sglang/srt/managers/schedule_batch.py +4 -9
- sglang/srt/managers/scheduler.py +25 -16
- sglang/srt/managers/session_controller.py +1 -1
- sglang/srt/managers/template_manager.py +7 -5
- sglang/srt/managers/tokenizer_manager.py +60 -21
- sglang/srt/managers/tp_worker.py +1 -0
- sglang/srt/managers/utils.py +59 -1
- sglang/srt/mem_cache/allocator.py +7 -5
- sglang/srt/mem_cache/allocator_ascend.py +0 -11
- sglang/srt/mem_cache/hicache_storage.py +14 -4
- sglang/srt/mem_cache/memory_pool.py +3 -3
- sglang/srt/mem_cache/memory_pool_host.py +35 -2
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +56 -12
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +8 -4
- sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +153 -59
- sglang/srt/mem_cache/storage/nixl/nixl_utils.py +19 -53
- sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +46 -7
- sglang/srt/model_executor/cuda_graph_runner.py +25 -12
- sglang/srt/model_executor/forward_batch_info.py +4 -1
- sglang/srt/model_executor/model_runner.py +43 -32
- sglang/srt/model_executor/npu_graph_runner.py +94 -0
- sglang/srt/model_loader/loader.py +24 -6
- sglang/srt/models/dbrx.py +12 -6
- sglang/srt/models/deepseek.py +2 -1
- sglang/srt/models/deepseek_nextn.py +3 -1
- sglang/srt/models/deepseek_v2.py +224 -223
- sglang/srt/models/ernie4.py +2 -2
- sglang/srt/models/glm4_moe.py +25 -63
- sglang/srt/models/glm4v.py +52 -1
- sglang/srt/models/glm4v_moe.py +8 -11
- sglang/srt/models/gpt_oss.py +34 -74
- sglang/srt/models/granitemoe.py +0 -1
- sglang/srt/models/grok.py +376 -48
- sglang/srt/models/interns1.py +12 -47
- sglang/srt/models/internvl.py +6 -51
- sglang/srt/models/llama4.py +0 -2
- sglang/srt/models/minicpm3.py +0 -1
- sglang/srt/models/mixtral.py +0 -2
- sglang/srt/models/nemotron_nas.py +435 -0
- sglang/srt/models/olmoe.py +0 -1
- sglang/srt/models/phi4mm.py +3 -21
- sglang/srt/models/qwen2_5_vl.py +2 -0
- sglang/srt/models/qwen2_moe.py +3 -18
- sglang/srt/models/qwen3.py +2 -2
- sglang/srt/models/qwen3_classification.py +7 -1
- sglang/srt/models/qwen3_moe.py +9 -38
- sglang/srt/models/step3_vl.py +2 -1
- sglang/srt/models/xverse_moe.py +11 -5
- sglang/srt/multimodal/processors/base_processor.py +3 -3
- sglang/srt/multimodal/processors/internvl.py +7 -2
- sglang/srt/multimodal/processors/llava.py +11 -7
- sglang/srt/offloader.py +433 -0
- sglang/srt/operations.py +6 -1
- sglang/srt/reasoning_parser.py +4 -3
- sglang/srt/server_args.py +237 -104
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +1 -0
- sglang/srt/speculative/eagle_utils.py +36 -13
- sglang/srt/speculative/eagle_worker.py +56 -3
- sglang/srt/tokenizer/tiktoken_tokenizer.py +161 -0
- sglang/srt/two_batch_overlap.py +16 -11
- sglang/srt/utils.py +68 -70
- sglang/test/runners.py +8 -5
- sglang/test/test_block_fp8.py +5 -6
- sglang/test/test_block_fp8_ep.py +13 -19
- sglang/test/test_cutlass_moe.py +4 -6
- sglang/test/test_cutlass_w4a8_moe.py +4 -3
- sglang/test/test_fp4_moe.py +4 -3
- sglang/test/test_utils.py +7 -0
- sglang/utils.py +0 -1
- sglang/version.py +1 -1
- {sglang-0.5.0rc2.dist-info → sglang-0.5.1.dist-info}/METADATA +7 -7
- {sglang-0.5.0rc2.dist-info → sglang-0.5.1.dist-info}/RECORD +179 -161
- sglang/srt/layers/quantization/fp4.py +0 -557
- {sglang-0.5.0rc2.dist-info → sglang-0.5.1.dist-info}/WHEEL +0 -0
- {sglang-0.5.0rc2.dist-info → sglang-0.5.1.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.0rc2.dist-info → sglang-0.5.1.dist-info}/top_level.txt +0 -0
sglang/srt/layers/attention/utils.py
CHANGED
@@ -9,18 +9,89 @@ TRITON_PAD_NUM_PAGE_PER_BLOCK = 64
 
 
 @triton.jit
 def create_flashinfer_kv_indices_triton(
-    req_to_token_ptr,
+    req_to_token_ptr,
     req_pool_indices_ptr,
     page_kernel_lens_ptr,
     kv_indptr,
     kv_start_idx,
     kv_indices_ptr,
     req_to_token_ptr_stride: tl.constexpr,
+    PAGE_SIZE: tl.constexpr = 1,
 ):
+    """
+    Create KV indices for FlashInfer attention backend.
+
+    This Triton kernel builds a lookup table that maps from logical request/token
+    coordinates to physical token locations in the global KV cache pool. It's used
+    by FlashInfer attention backends to efficiently access scattered KV cache data.
+
+    The kernel processes each request in parallel and converts the req_to_token
+    lookup table into a flat list of token indices that can be used by attention kernels.
+
+    general idea:
+    blocktables/kv_indices_ptr = [batch_size * max_pages(for graph mode with
+    fixed number of pages)]
+    max_pages = max_context_len / PAGED_SIZE
+    kv_indices_ptr will store the flat list of the pages used by each request
+    Args:
+    Inputs Arguments (non mutable):
+
+    req_to_token_ptr: Request to token location look up table
+        Shape: [max_batch, max_context_len]
+    req_pool_indices_ptr: Request to pool index look up table. Each request uses
+        one pool.
+        Shape: [batch_size]
+    page_kernel_lens_ptr: sequence lengths per request
+        Shape: [batch_size]
+    kv_indptr: Should be computed based on number of pages used by each request.
+        It is used by flashinfer attention kernels to index into the kv_indices_ptr
+        per request.
+        Shape: [batch_size + 1]
+        kv_indptr[i] = start index in kv_indices for request i
+    kv_start_idx: Pointer to array containing start offsets for each request in SGL.
+        Can be None. If provided, adds offset to token positions.
+
+    req_to_token_ptr_stride: Stride for the second dimension of req_to_token.
+        Equal to max_context_len.
+
+    PAGED_SIZE: Number of tokens per page. Default is 1 for FlashInfer.
+
+    Outputs:
+    kv_indices_ptr: Pointer to output array where KV indices will be stored.
+        Shape: [total-num-pages],
+        where total_num_pages = sum(seq_lens // PAGED_SIZE)
+
+    Example:
+    If we have:
+    - req_pool_indices = [0, 1] (request 0 uses pool 0, request 1 uses pool 1)
+    - page_kernel_lens = [3, 2] (request 0 has 3 tokens, request 1 has 2 tokens)
+    - req_to_token = [[10, 11, 12, -1], [20, 21, -1, -1]] (tokens are the elements
+      in radix tree, use them as a pointer to the token location in the kv_indices_ptr)
+
+    The kernel will output:
+    If PAGE_SIZE = 1:
+    packed
+    - kv_indptr (passed in as input arg): [0,3,5]
+    - kv_indices = [10, 11, 12, 20, 21]
+    padded - max_pages is 10 tokens per req
+    - kv_indptr (passed in as input arg): [0,10, 20]
+    - kv_indices = [10, 11, 12, -1, -1, -1, -1, -1, -1, -1,
+                    20, 21, -1, -1, -1, -1, -1, -1, -1, -1]
+
+    If PAGE_SIZE = 2
+    packed:
+    - kv_indptr (passed in as input arg): [0,3,4]
+    - kv_indices = [5,6,10]
+    padded: max_pages is 4
+    - kv_indptr (passed in as input arg): [0,4,8,..] (note that 4 is the max_pages)
+    - kv_indices = [5, 6, -1, -1,
+                    10, -1, -1, -1]
+    This allows attention kernels to directly access the correct KV cache
+    entries for each request's tokens.
+    """
     BLOCK_SIZE: tl.constexpr = 512
+    NUM_PAGES_PER_BLOCK: tl.constexpr = BLOCK_SIZE // PAGE_SIZE
     pid = tl.program_id(axis=0)
-
-    # find the req pool idx, this is for batch to token
     req_pool_index = tl.load(req_pool_indices_ptr + pid)
     kv_indices_offset = tl.load(kv_indptr + pid)
 
@@ -31,19 +102,27 @@ def create_flashinfer_kv_indices_triton(
     kv_end = kv_start
     kv_end += tl.load(page_kernel_lens_ptr + pid).to(tl.int32)
 
-
-
-
-
-
-
-
-
-
-
-
+    kv_range = kv_end - kv_start
+    num_pages = tl.cdiv(kv_range, PAGE_SIZE)
+    num_loops = tl.cdiv(kv_range, BLOCK_SIZE)
+    req_to_token_block_start = (
+        req_to_token_ptr + req_pool_index * req_to_token_ptr_stride + kv_start
+    )
+    for i in range(num_loops):
+        token_offsets_in_block = (
+            tl.arange(0, NUM_PAGES_PER_BLOCK).to(tl.int64) + i * NUM_PAGES_PER_BLOCK
+        ) * PAGE_SIZE
+        page_offsets_in_block = token_offsets_in_block // PAGE_SIZE
+        valid_tokens = token_offsets_in_block < kv_range
+        valid_pages = page_offsets_in_block < num_pages
+        token_numbers = tl.load(
+            req_to_token_block_start + token_offsets_in_block, mask=valid_tokens
+        )
+        tl.store(
+            kv_indices_ptr + kv_indices_offset + page_offsets_in_block,
+            token_numbers // PAGE_SIZE,  # write the page numbers to kv_indices_ptr
+            mask=valid_pages,
         )
-        tl.store(kv_indices_ptr + kv_indices_offset + offset, data, mask=mask)
 
 
 @triton.jit
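To make the paging layout in the new docstring concrete, here is a minimal pure-Python sketch (a hypothetical helper, not part of sglang) that reproduces the packed kv_indptr/kv_indices for the docstring's example inputs:

# Pure-Python illustration of the page-index math described above; the Triton
# kernel does the same mapping in parallel per request.
def build_kv_indices(req_to_token, req_pool_indices, seq_lens, page_size):
    kv_indptr = [0]
    kv_indices = []
    for pool_idx, seq_len in zip(req_pool_indices, seq_lens):
        tokens = req_to_token[pool_idx][:seq_len]
        # one entry per page: the page number of the first token in each page
        pages = [tokens[i] // page_size for i in range(0, seq_len, page_size)]
        kv_indices.extend(pages)
        kv_indptr.append(kv_indptr[-1] + len(pages))
    return kv_indptr, kv_indices


req_to_token = [[10, 11, 12, -1], [20, 21, -1, -1]]
print(build_kv_indices(req_to_token, [0, 1], [3, 2], page_size=1))
# ([0, 3, 5], [10, 11, 12, 20, 21])
print(build_kv_indices(req_to_token, [0, 1], [3, 2], page_size=2))
# ([0, 2, 3], [5, 6, 10])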
sglang/srt/layers/attention/vision.py
CHANGED
@@ -12,7 +12,12 @@ import torch.nn.functional as F
 from einops import rearrange
 
 from sglang.srt.layers.dp_attention import get_attention_tp_rank, get_attention_tp_size
-from sglang.srt.utils import
+from sglang.srt.utils import (
+    get_device_capability,
+    is_blackwell,
+    is_cuda,
+    print_info_once,
+)
 
 _is_cuda = is_cuda()
 
@@ -20,7 +25,6 @@ if _is_cuda:
     from sgl_kernel.flash_attn import flash_attn_varlen_func
 
 from sglang.srt.distributed import (
-    parallel_state,
     split_tensor_along_last_dim,
     tensor_model_parallel_all_gather,
 )
@@ -402,18 +406,14 @@ class VisionAttention(nn.Module):
                 self.dummy_dim, eps=layer_norm_eps, var_hidden_size=embed_dim
             )
 
-        #
-
-
-
-
-
-
-        qkv_backend = "sdpa"
+        # Select attention backend via a unified method
+        _passed_backend = qkv_backend
+        qkv_backend = self._determine_attention_backend(_passed_backend)
+        if (
+            global_server_args_dict["mm_attention_backend"] is None
+            and _passed_backend is None
+        ):
             print_info_once(f"Multimodal attention backend not set. Use {qkv_backend}.")
-        else:
-            qkv_backend = global_server_args_dict["mm_attention_backend"]
-
         print_info_once(f"Using {qkv_backend} as multimodal attention backend.")
 
         self.customized_position_embedding_applier = (
@@ -461,6 +461,33 @@ class VisionAttention(nn.Module):
             prefix=add_prefix("proj", prefix),
        )
 
+    def _determine_attention_backend(self, passed_backend: Optional[str]) -> str:
+        """Decide the multimodal attention backend string.
+
+        Priority: server args override > constructor arg > platform default.
+
+        Platform defaults:
+        - CUDA: "triton_attn"
+        - Non-CUDA: "sdpa"
+        """
+        override_backend = global_server_args_dict["mm_attention_backend"]
+        if override_backend is not None:
+            backend = override_backend
+        elif passed_backend is not None:
+            backend = passed_backend
+        elif is_cuda():
+            major, minor = get_device_capability()
+            if major == 9:
+                backend = "fa3"
+            else:
+                backend = "triton_attn"
+        else:
+            backend = "sdpa"
+        if backend == "fa3" and is_blackwell():
+            raise ValueError("The 'fa3' backend is not supported on Blackwell GPUs")
+
+        return backend
+
     def _apply_qk_norm(self, q: torch.Tensor, k: torch.Tensor):
         """apply qk norm for internvl vit attn"""
         q = q.flatten(1, 2)
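The new _determine_attention_backend method encodes a simple priority chain. Below is a standalone sketch of that chain with the server-args override and device capability passed in explicitly; the helper name and its arguments are illustrative, not sglang API:

from typing import Optional, Tuple


def pick_mm_attention_backend(
    override: Optional[str],           # --mm-attention-backend server arg
    passed_backend: Optional[str],     # qkv_backend passed to the constructor
    cuda_capability: Optional[Tuple[int, int]],  # None when not running on CUDA
    blackwell: bool = False,
) -> str:
    if override is not None:
        backend = override
    elif passed_backend is not None:
        backend = passed_backend
    elif cuda_capability is not None:
        # Hopper (SM 9.x) defaults to FlashAttention 3, other CUDA GPUs to Triton.
        backend = "fa3" if cuda_capability[0] == 9 else "triton_attn"
    else:
        backend = "sdpa"
    if backend == "fa3" and blackwell:
        raise ValueError("The 'fa3' backend is not supported on Blackwell GPUs")
    return backend


assert pick_mm_attention_backend(None, None, (9, 0)) == "fa3"
assert pick_mm_attention_backend(None, None, None) == "sdpa"
assert pick_mm_attention_backend("sdpa", "fa3", (9, 0)) == "sdpa"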
sglang/srt/layers/attention/vision_utils.py
ADDED
@@ -0,0 +1,65 @@
+"""Utility functions for vision attention layers."""
+
+import torch
+
+from sglang.srt.layers.dp_attention import get_attention_tp_size
+
+
+def update_vit_attn_dummy_heads_config(config):
+    """Update HF config to ensure vision attention num_attention_heads is divisible by tp_size"""
+    tp_size = get_attention_tp_size()
+    num_heads = getattr(
+        config.vision_config,
+        "num_heads",
+        getattr(config.vision_config, "num_attention_heads", None),
+    )
+    head_dim = config.vision_config.hidden_size // num_heads
+    num_dummy_heads = 0
+
+    if num_heads % tp_size != 0:
+        num_dummy_heads = ((num_heads + tp_size - 1) // tp_size) * tp_size - num_heads
+
+    setattr(config.vision_config, "head_dim", head_dim)
+    setattr(config.vision_config, "num_dummy_heads", num_dummy_heads)
+
+
+def pad_vit_attn_dummy_heads(config, name: str, loaded_weight: torch.Tensor):
+    """Pad attention qkv weights for dummy heads"""
+    num_dummy_heads = config.vision_config.num_dummy_heads
+    if num_dummy_heads == 0:
+        return loaded_weight
+    head_dim = config.vision_config.head_dim
+
+    if "attn.qkv_proj" in name:
+        wq, wk, wv = loaded_weight.chunk(3, dim=0)
+        if name.endswith(".weight"):
+            dummy_shape = [num_dummy_heads, head_dim, wq.shape[-1]]
+        elif name.endswith(".bias"):
+            dummy_shape = [num_dummy_heads, head_dim]
+        else:
+            raise RuntimeError(f"Unsupported weight with name={name}")
+        pad_func = lambda x: torch.cat(
+            [x.unflatten(0, (-1, head_dim)), x.new_zeros(dummy_shape)], dim=0
+        ).flatten(0, 1)
+        wq, wk, wv = pad_func(wq), pad_func(wk), pad_func(wv)
+        loaded_weight = torch.cat([wq, wk, wv], dim=0)
+    elif any([_ in name for _ in ["attn.q_proj", "attn.k_proj", "attn.v_proj"]]):
+        if name.endswith(".weight"):
+            dummy_shape = [num_dummy_heads, head_dim, loaded_weight.shape[-1]]
+        elif name.endswith(".bias"):
+            dummy_shape = [num_dummy_heads, head_dim]
+        else:
+            raise RuntimeError(f"Unsupported weight with name={name}")
+        padded_weight = loaded_weight.new_zeros(dummy_shape)
+        loaded_weight = torch.cat(
+            [loaded_weight.unflatten(0, (-1, head_dim)), padded_weight], dim=0
+        ).flatten(0, 1)
+    elif "attn.proj.weight" in name:
+        padded_weight = loaded_weight.new_zeros(
+            loaded_weight.shape[0], head_dim * num_dummy_heads
+        )
+        loaded_weight = torch.cat([loaded_weight, padded_weight], dim=-1)
+    elif "attn.q_norm.weight" in name or "attn.k_norm.weight" in name:
+        padded_weight = loaded_weight.new_zeros(head_dim * num_dummy_heads)
+        loaded_weight = torch.cat([loaded_weight, padded_weight], dim=0)
+    return loaded_weight
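The dummy-head count produced by update_vit_attn_dummy_heads_config is just the distance from num_heads to the next multiple of the attention TP size. A small illustrative check (the helper and the numbers below are examples, not part of sglang):

def dummy_heads_needed(num_heads: int, tp_size: int) -> int:
    # Pad num_heads up to the next multiple of tp_size, as the new config helper does.
    if num_heads % tp_size == 0:
        return 0
    return ((num_heads + tp_size - 1) // tp_size) * tp_size - num_heads


# e.g. a 25-head ViT sharded across 4 attention-TP ranks needs 3 dummy heads
# (25 -> 28), so each rank holds 7 heads; the extra heads are zero-padded by
# pad_vit_attn_dummy_heads when the checkpoint is loaded.
assert dummy_heads_needed(25, 4) == 3
assert dummy_heads_needed(16, 8) == 0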
sglang/srt/layers/communicator.py
CHANGED
@@ -17,7 +17,7 @@ from enum import Enum, auto
 from functools import partial
 from typing import Dict, Optional
 
-import torch
+import torch
 
 from sglang.srt.distributed import (
     get_tensor_model_parallel_world_size,
@@ -34,6 +34,11 @@ from sglang.srt.layers.dp_attention import (
     get_attention_tp_size,
     get_global_dp_buffer,
     get_local_dp_buffer,
+    is_dp_attention_enabled,
+)
+from sglang.srt.layers.moe import (
+    get_moe_a2a_backend,
+    should_use_flashinfer_cutlass_moe_fp4_allgather,
 )
 from sglang.srt.layers.utils import is_sm100_supported
 from sglang.srt.managers.schedule_batch import global_server_args_dict
@@ -43,6 +48,8 @@ from sglang.srt.utils import is_cuda, is_flashinfer_available
 _is_flashinfer_available = is_flashinfer_available()
 _is_sm100_supported = is_cuda() and is_sm100_supported()
 
+FUSE_ALLREDUCE_MAX_BATCH_SIZE = 2048
+
 
 class ScatterMode(Enum):
     """
@@ -111,7 +118,11 @@ class LayerScatterModes:
         if context.is_layer_sparse:
             return (
                 ScatterMode.SCATTERED
-                if
+                if (
+                    # Token dispatch/combine will be handled outside of LayerCommunicator for these modes.
+                    not get_moe_a2a_backend().is_none()
+                    or should_use_flashinfer_cutlass_moe_fp4_allgather()
+                )
                 else ScatterMode.FULL
             )
         else:
@@ -154,11 +165,13 @@ class LayerCommunicator:
         post_attention_layernorm: torch.nn.Module,
         # Reduce scatter requires skipping all-reduce in model code after MoE/MLP, so only enable for models which have that implemented. Remove flag once done for all models that use LayerCommunicator.
         allow_reduce_scatter: bool = False,
+        is_last_layer: bool = False,
     ):
         self.layer_scatter_modes = layer_scatter_modes
         self.input_layernorm = input_layernorm
         self.post_attention_layernorm = post_attention_layernorm
         self.allow_reduce_scatter = allow_reduce_scatter
+        self.is_last_layer = is_last_layer
 
         self._context = CommunicateContext.init_new()
         self._communicate_simple_fn = CommunicateSimpleFn.get_fn(
@@ -256,6 +269,41 @@ class LayerCommunicator:
             and forward_batch.dp_padding_mode.is_max_len()
         )
 
+    def should_fuse_mlp_allreduce_with_next_layer(
+        self, forward_batch: ForwardBatch
+    ) -> bool:
+        speculative_algo = global_server_args_dict.get("speculative_algorithm", None)
+        if (
+            is_dp_attention_enabled()
+            and speculative_algo is not None
+            and speculative_algo.is_eagle()
+        ):
+            return False
+
+        batch_size = (
+            forward_batch.input_ids.shape[0]
+            if hasattr(forward_batch, "input_ids")
+            else 0
+        )
+        if batch_size > FUSE_ALLREDUCE_MAX_BATCH_SIZE:
+            return False
+
+        static_conditions_met = (
+            (not self.is_last_layer)
+            and (self._context.tp_size > 1)
+            and global_server_args_dict.get("enable_flashinfer_allreduce_fusion", False)
+            and _is_flashinfer_available
+        )
+
+        if not static_conditions_met:
+            return False
+
+        return (
+            batch_size > 0
+            and batch_size <= FUSE_ALLREDUCE_MAX_BATCH_SIZE
+            and (not self.is_last_layer)
+        )
+
 
 @dataclass
 class CommunicateContext:
@@ -382,7 +430,7 @@ class CommunicateWithAllReduceAndLayerNormFn:
         )
 
         raise NotImplementedError(
-            f"{hidden_states_input_mode=} {residual_input_mode=} {
+            f"{hidden_states_input_mode=} {residual_input_mode=} {hidden_states_output_mode=} {residual_output_mode=}"
        )
 
    @staticmethod
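The new should_fuse_mlp_allreduce_with_next_layer gate combines several conditions. A condensed, dependency-free sketch of the same predicate, with all inputs passed explicitly for illustration (in sglang they come from server args, the forward batch, and the layer state):

FUSE_ALLREDUCE_MAX_BATCH_SIZE = 2048


def can_fuse_mlp_allreduce(
    batch_size: int,
    is_last_layer: bool,
    tp_size: int,
    flashinfer_fusion_enabled: bool,
    flashinfer_available: bool,
    dp_attention_with_eagle: bool,
) -> bool:
    # DP attention combined with EAGLE speculative decoding disables fusion.
    if dp_attention_with_eagle:
        return False
    # Static conditions: not the last layer, TP > 1, FlashInfer fusion on and available.
    if is_last_layer or tp_size <= 1:
        return False
    if not (flashinfer_fusion_enabled and flashinfer_available):
        return False
    # Dynamic condition: non-empty batch below the fusion size cap.
    return 0 < batch_size <= FUSE_ALLREDUCE_MAX_BATCH_SIZE


assert can_fuse_mlp_allreduce(64, False, 8, True, True, False)
assert not can_fuse_mlp_allreduce(4096, False, 8, True, True, False)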
sglang/srt/layers/dp_attention.py
CHANGED
@@ -72,6 +72,7 @@ class _DpGatheredBufferWrapper:
     _device: torch.device
     _global_dp_buffer_len: int
     _local_dp_buffer_len: int
+    _global_num_tokens: Optional[List[int]]
 
     @classmethod
     def set_metadata(cls, hidden_size: int, dtype: torch.dtype, device: torch.device):
@@ -80,9 +81,15 @@ class _DpGatheredBufferWrapper:
         cls._device = device
 
     @classmethod
-    def set_dp_buffer_len(
+    def set_dp_buffer_len(
+        cls,
+        global_dp_buffer_len: int,
+        local_dp_buffer_len: int,
+        global_num_tokens: Optional[List[int]] = None,
+    ):
         cls._global_dp_buffer_len = global_dp_buffer_len
         cls._local_dp_buffer_len = local_dp_buffer_len
+        cls._global_num_tokens = global_num_tokens
 
     @classmethod
     def get_global_dp_buffer(cls) -> torch.Tensor:
@@ -108,10 +115,18 @@ class _DpGatheredBufferWrapper:
     def get_local_dp_buffer_len(cls) -> int:
         return cls._local_dp_buffer_len
 
+    @classmethod
+    def get_dp_global_num_tokens(cls) -> List[int]:
+        return cls._global_num_tokens
+
 
-def set_dp_buffer_len(
+def set_dp_buffer_len(
+    global_dp_buffer_len: int,
+    local_dp_buffer_len: int,
+    global_num_tokens: Optional[List[int]] = None,
+):
     _DpGatheredBufferWrapper.set_dp_buffer_len(
-        global_dp_buffer_len, local_dp_buffer_len
+        global_dp_buffer_len, local_dp_buffer_len, global_num_tokens
     )
 
 
@@ -131,6 +146,10 @@ def get_local_dp_buffer_len() -> int:
     return _DpGatheredBufferWrapper.get_local_dp_buffer_len()
 
 
+def get_dp_global_num_tokens() -> List[int]:
+    return _DpGatheredBufferWrapper.get_dp_global_num_tokens()
+
+
 def compute_dp_attention_world_info(enable_dp_attention, tp_rank, tp_size, dp_size):
     if not enable_dp_attention:
         return tp_rank, tp_size, 0
@@ -215,7 +234,7 @@ def initialize_dp_attention(
     _DpGatheredBufferWrapper.set_metadata(
         hidden_size=model_config.hidden_size,
         dtype=model_config.dtype,
-        device=torch.device(
+        device=torch.device(server_args.device),
     )
 
 
sglang/srt/layers/elementwise.py
CHANGED
@@ -486,3 +486,97 @@ def gelu_and_mul_triton(
         return out_hidden_states, out_scales
     else:
         return out_hidden_states, None
+
+
+# silu on first half of vector
+@triton.jit
+def silu_and_mul_kernel(
+    out_hidden_states_ptr,  # (bs, hidden_dim)
+    out_scales_ptr,  # (bs,)
+    hidden_states_ptr,  # (bs, hidden_dim * 2)
+    quant_max: tl.constexpr,
+    static_scale: tl.constexpr,
+    hidden_dim: tl.constexpr,  # the output hidden_dim
+    BLOCK_SIZE: tl.constexpr,
+):
+    pid = tl.program_id(axis=0)
+
+    input_start = pid * hidden_dim * 2
+    output_start = pid * hidden_dim
+
+    input1_offs = tl.arange(0, BLOCK_SIZE)
+    mask = tl.arange(0, BLOCK_SIZE) < hidden_dim  # shared for input1, input3, output
+    input3_offs = hidden_dim + tl.arange(0, BLOCK_SIZE)
+    output_offs = tl.arange(0, BLOCK_SIZE)
+
+    x1 = tl.load(
+        hidden_states_ptr + input_start + input1_offs, mask=mask, other=0.0
+    ).to(tl.float32)
+    x3 = tl.load(
+        hidden_states_ptr + input_start + input3_offs, mask=mask, other=0.0
+    ).to(tl.float32)
+
+    # silu
+    # cast down before mul to better match training?
+    silu_x1 = x1 * tl.sigmoid(x1)
+    out = x3 * silu_x1.to(hidden_states_ptr.dtype.element_ty)
+
+    if quant_max is not None:
+        raise NotImplementedError()
+
+    tl.store(out_hidden_states_ptr + output_start + output_offs, out, mask=mask)
+
+
+def silu_and_mul_triton(
+    hidden_states,
+    scales=None,
+    quantize=None,  # dtype to quantize to
+    out=None,
+):
+    bs, in_hidden_dim = hidden_states.shape
+    hidden_dim = in_hidden_dim // 2
+
+    if out is None:
+        out_hidden_states = torch.empty(
+            (bs, hidden_dim),
+            dtype=quantize or hidden_states.dtype,
+            device=hidden_states.device,
+        )
+    else:
+        assert out.shape == (bs, hidden_dim)
+        assert out.dtype == (quantize or hidden_states.dtype)
+        out_hidden_states = out
+    out_scales = None
+    static_scale = False
+    if quantize is not None:
+        if scales is None:
+            out_scales = torch.empty(
+                (bs,), dtype=torch.float32, device=hidden_states.device
+            )
+        else:
+            out_scales = scales
+            static_scale = True
+
+    max_warps = 16 if _is_hip else 32
+    config = {
+        # 8 ele per thread (not tuned)
+        "num_warps": max(
+            min(triton.next_power_of_2(triton.cdiv(hidden_dim, 8 * 32)), max_warps), 4
+        ),
+    }
+
+    silu_and_mul_kernel[(bs,)](
+        out_hidden_states,
+        out_scales,
+        hidden_states,
+        quant_max=torch.finfo(quantize).max if quantize is not None else None,
+        static_scale=static_scale,
+        hidden_dim=hidden_dim,
+        BLOCK_SIZE=triton.next_power_of_2(hidden_dim),
+        **config,
+    )
+
+    if quantize is not None:
+        return out_hidden_states, out_scales
+    else:
+        return out_hidden_states, None
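The new silu_and_mul_kernel computes SiLU(x1) * x3 over the two halves of the last dimension. A CPU-side PyTorch reference of that semantics (a sketch for sanity-checking only; it ignores the unimplemented quantization path and the exact fp32/fp16 cast ordering inside the kernel):

import torch


def silu_and_mul_reference(hidden_states: torch.Tensor) -> torch.Tensor:
    # Split (bs, 2 * hidden_dim) into x1 and x3, apply SiLU to x1, multiply by x3.
    x1, x3 = hidden_states.chunk(2, dim=-1)
    return torch.nn.functional.silu(x1.float()).to(hidden_states.dtype) * x3


x = torch.randn(4, 2 * 128, dtype=torch.float16)
out = silu_and_mul_reference(x)
assert out.shape == (4, 128)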
sglang/srt/layers/flashinfer_comm_fusion.py
CHANGED
@@ -5,7 +5,11 @@ import torch
 import torch.distributed as dist
 
 from sglang.srt.distributed import get_tensor_model_parallel_world_size
-from sglang.srt.utils import
+from sglang.srt.utils import (
+    direct_register_custom_op,
+    is_flashinfer_available,
+    supports_custom_op,
+)
 
 logger = logging.getLogger(__name__)
 
@@ -196,6 +200,30 @@ def flashinfer_allreduce_residual_rmsnorm(
     return norm_out, residual_out
 
 
+def fake_flashinfer_allreduce_residual_rmsnorm(
+    input_tensor: torch.Tensor,
+    residual: torch.Tensor,
+    weight: torch.Tensor,
+    eps: float = 1e-6,
+    max_token_num: int = 2048,
+    use_oneshot: Optional[bool] = None,
+    trigger_completion_at_end: bool = False,
+    fp32_acc: bool = False,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    residual_out = torch.empty_like(residual)
+    norm_out = torch.empty_like(input_tensor)
+    return norm_out, residual_out
+
+
+if supports_custom_op():
+    direct_register_custom_op(
+        "flashinfer_allreduce_residual_rmsnorm",
+        flashinfer_allreduce_residual_rmsnorm,
+        mutates_args=["input_tensor", "residual", "weight"],
+        fake_impl=fake_flashinfer_allreduce_residual_rmsnorm,
+    )
+
+
 def cleanup_flashinfer_workspace():
     global _workspace_manager
     if _workspace_manager is not None:
sglang/srt/layers/layernorm.py
CHANGED
@@ -27,6 +27,7 @@ from sglang.srt.utils import (
     is_cuda,
     is_hip,
     is_npu,
+    supports_custom_op,
 )
 
 _is_cuda = is_cuda()
@@ -202,8 +203,14 @@ class RMSNorm(CustomOp):
                 flashinfer_allreduce_residual_rmsnorm,
             )
 
+            fused_op = (
+                torch.ops.sglang.flashinfer_allreduce_residual_rmsnorm
+                if supports_custom_op()
+                else flashinfer_allreduce_residual_rmsnorm
+            )
+
             if get_tensor_model_parallel_world_size() > 1:
-                fused_result =
+                fused_result = fused_op(
                     input_tensor=x,
                     residual=residual,
                     weight=self.weight,
sglang/srt/layers/linear.py
CHANGED
@@ -110,6 +110,20 @@ def adjust_scalar_to_fused_array(param, loaded_weight, shard_id):
     return param[shard_id], loaded_weight
 
 
+def adjust_shard_offsets(shard_offsets, loaded_weight, dim):
+    actual_weight_size = loaded_weight.size(dim)
+    target_weight_size = shard_offsets[-1][-1] + shard_offsets[-1][-2]
+    if actual_weight_size != target_weight_size:
+        new_shard_offsets = []
+        new_offset = 0
+        for shard_id, shard_offset, shard_size in shard_offsets:
+            actual_shard_size = actual_weight_size * shard_size // target_weight_size
+            new_shard_offsets.append((shard_id, new_offset, actual_shard_size))
+            new_offset += actual_shard_size
+        return new_shard_offsets
+    return shard_offsets
+
+
 class LinearBase(torch.nn.Module):
     """Base linear layer.
 
@@ -535,6 +549,11 @@ class MergedColumnParallelLinear(ColumnParallelLinear):
             packed_dim = getattr(param, "packed_dim", None)
 
             use_bitsandbytes_4bit = getattr(param, "use_bitsandbytes_4bit", False)
+            if _is_cpu:
+                shard_offsets = adjust_shard_offsets(
+                    shard_offsets, loaded_weight, output_dim
+                )
+
             for shard_id, shard_offset, shard_size in shard_offsets:
                 # Special case for Quantization.
                 # If quantized, we need to adjust the offset and size to account
@@ -977,6 +996,11 @@ class QKVParallelLinear(ColumnParallelLinear):
             use_bitsandbytes_4bit = getattr(param, "use_bitsandbytes_4bit", False)
 
             packed_dim = getattr(param, "packed_dim", None)
+            if _is_cpu:
+                shard_offsets = adjust_shard_offsets(
+                    shard_offsets, loaded_weight, output_dim
+                )
+
             for shard_id, shard_offset, shard_size in shard_offsets:
                 # Special case for Quantized Weights.
                 # If quantized, we need to adjust the offset and size to account
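The new adjust_shard_offsets helper rescales merged-weight shard offsets when the loaded tensor is smaller than the offsets expect (the CPU path above). A small standalone rework of the same arithmetic with illustrative shard names and sizes:

def adjust_shard_offsets_demo(shard_offsets, actual_weight_size):
    # Expected size is the end of the last shard: last offset + last size.
    target_weight_size = shard_offsets[-1][-1] + shard_offsets[-1][-2]
    if actual_weight_size == target_weight_size:
        return shard_offsets
    new_shard_offsets, new_offset = [], 0
    for shard_id, _, shard_size in shard_offsets:
        # Scale each shard proportionally to the actual tensor size.
        actual_shard_size = actual_weight_size * shard_size // target_weight_size
        new_shard_offsets.append((shard_id, new_offset, actual_shard_size))
        new_offset += actual_shard_size
    return new_shard_offsets


# ("gate", offset 0, size 4096) and ("up", offset 4096, size 4096), but the
# loaded tensor only has 2048 rows along the output dim -> both shards shrink.
print(adjust_shard_offsets_demo([("gate", 0, 4096), ("up", 4096, 4096)], 2048))
# [('gate', 0, 1024), ('up', 1024, 1024)]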
sglang/srt/layers/logits_processor.py
CHANGED
@@ -191,7 +191,11 @@ class LogitsMetadata:
         else:
             self.global_dp_buffer_len = self.global_dp_buffer_len
 
-        set_dp_buffer_len(
+        set_dp_buffer_len(
+            self.global_dp_buffer_len,
+            self.dp_local_num_tokens,
+            self.global_num_tokens_for_logprob_cpu,
+        )
 
 
 class LogitsProcessor(nn.Module):
|