sglang 0.5.0rc1__py3-none-any.whl → 0.5.1__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to a supported public registry. It is provided for informational purposes only.
Files changed (203)
  1. sglang/bench_one_batch.py +0 -7
  2. sglang/bench_one_batch_server.py +7 -2
  3. sglang/bench_serving.py +3 -3
  4. sglang/eval/llama3_eval.py +0 -1
  5. sglang/srt/configs/model_config.py +25 -9
  6. sglang/srt/configs/update_config.py +40 -5
  7. sglang/srt/constrained/xgrammar_backend.py +23 -11
  8. sglang/srt/conversation.py +2 -15
  9. sglang/srt/disaggregation/ascend/conn.py +1 -3
  10. sglang/srt/disaggregation/base/conn.py +1 -0
  11. sglang/srt/disaggregation/decode.py +1 -2
  12. sglang/srt/disaggregation/launch_lb.py +7 -1
  13. sglang/srt/disaggregation/mini_lb.py +11 -5
  14. sglang/srt/disaggregation/mooncake/conn.py +141 -47
  15. sglang/srt/disaggregation/prefill.py +261 -5
  16. sglang/srt/disaggregation/utils.py +2 -1
  17. sglang/srt/distributed/device_communicators/custom_all_reduce.py +1 -1
  18. sglang/srt/distributed/device_communicators/pynccl.py +68 -18
  19. sglang/srt/distributed/device_communicators/pynccl_wrapper.py +52 -0
  20. sglang/srt/distributed/naive_distributed.py +112 -0
  21. sglang/srt/distributed/parallel_state.py +90 -4
  22. sglang/srt/entrypoints/context.py +20 -1
  23. sglang/srt/entrypoints/engine.py +29 -4
  24. sglang/srt/entrypoints/http_server.py +76 -0
  25. sglang/srt/entrypoints/openai/protocol.py +4 -2
  26. sglang/srt/entrypoints/openai/serving_chat.py +23 -6
  27. sglang/srt/entrypoints/openai/serving_completions.py +10 -1
  28. sglang/srt/entrypoints/openai/serving_responses.py +2 -2
  29. sglang/srt/eplb/expert_distribution.py +2 -3
  30. sglang/srt/function_call/deepseekv3_detector.py +1 -1
  31. sglang/srt/hf_transformers_utils.py +24 -0
  32. sglang/srt/host_shared_memory.py +83 -0
  33. sglang/srt/layers/attention/ascend_backend.py +132 -22
  34. sglang/srt/layers/attention/flashattention_backend.py +24 -17
  35. sglang/srt/layers/attention/flashinfer_backend.py +14 -3
  36. sglang/srt/layers/attention/flashinfer_mla_backend.py +227 -76
  37. sglang/srt/layers/attention/triton_backend.py +109 -73
  38. sglang/srt/layers/attention/triton_ops/decode_attention.py +33 -2
  39. sglang/srt/layers/attention/triton_ops/extend_attention.py +32 -2
  40. sglang/srt/layers/attention/trtllm_mha_backend.py +398 -36
  41. sglang/srt/layers/attention/trtllm_mla_backend.py +49 -19
  42. sglang/srt/layers/attention/utils.py +94 -15
  43. sglang/srt/layers/attention/vision.py +40 -13
  44. sglang/srt/layers/attention/vision_utils.py +65 -0
  45. sglang/srt/layers/communicator.py +58 -10
  46. sglang/srt/layers/dp_attention.py +137 -27
  47. sglang/srt/layers/elementwise.py +94 -0
  48. sglang/srt/layers/flashinfer_comm_fusion.py +29 -1
  49. sglang/srt/layers/layernorm.py +8 -1
  50. sglang/srt/layers/linear.py +24 -0
  51. sglang/srt/layers/logits_processor.py +16 -18
  52. sglang/srt/layers/moe/__init__.py +31 -0
  53. sglang/srt/layers/moe/ep_moe/layer.py +37 -33
  54. sglang/srt/layers/moe/fused_moe_native.py +14 -25
  55. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=129,N=352,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  56. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=161,N=192,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  57. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_0/E=16,N=1024,device_name=NVIDIA_B200.json +146 -0
  58. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  59. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20.json +146 -0
  60. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  61. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  62. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  63. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  64. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  65. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  66. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  67. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=129,N=352,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Max-Q_Workstation_Edition,dtype=fp8_w8a8.json +146 -0
  68. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=129,N=704,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +146 -0
  69. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=161,N=384,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Max-Q_Workstation_Edition,dtype=fp8_w8a8.json +146 -0
  70. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +69 -76
  71. sglang/srt/layers/moe/fused_moe_triton/layer.py +66 -123
  72. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +20 -18
  73. sglang/srt/layers/moe/moe_runner/__init__.py +3 -0
  74. sglang/srt/layers/moe/moe_runner/base.py +13 -0
  75. sglang/srt/layers/moe/rocm_moe_utils.py +141 -0
  76. sglang/srt/layers/moe/router.py +15 -9
  77. sglang/srt/layers/moe/token_dispatcher/__init__.py +6 -0
  78. sglang/srt/layers/moe/token_dispatcher/base_dispatcher.py +55 -14
  79. sglang/srt/layers/moe/token_dispatcher/deepep.py +11 -21
  80. sglang/srt/layers/moe/token_dispatcher/standard.py +1 -1
  81. sglang/srt/layers/moe/topk.py +167 -83
  82. sglang/srt/layers/moe/utils.py +159 -18
  83. sglang/srt/layers/multimodal.py +156 -40
  84. sglang/srt/layers/quantization/__init__.py +18 -46
  85. sglang/srt/layers/quantization/awq.py +22 -23
  86. sglang/srt/layers/quantization/base_config.py +2 -6
  87. sglang/srt/layers/quantization/blockwise_int8.py +4 -12
  88. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +72 -29
  89. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +2 -1
  90. sglang/srt/layers/quantization/fp8.py +127 -119
  91. sglang/srt/layers/quantization/fp8_kernel.py +195 -24
  92. sglang/srt/layers/quantization/fp8_utils.py +34 -9
  93. sglang/srt/layers/quantization/fpgemm_fp8.py +203 -0
  94. sglang/srt/layers/quantization/gptq.py +17 -21
  95. sglang/srt/layers/quantization/marlin_utils.py +26 -8
  96. sglang/srt/layers/quantization/marlin_utils_fp8.py +352 -0
  97. sglang/srt/layers/quantization/modelopt_quant.py +217 -98
  98. sglang/srt/layers/quantization/moe_wna16.py +10 -15
  99. sglang/srt/layers/quantization/mxfp4.py +222 -39
  100. sglang/srt/layers/quantization/quark/quark.py +390 -0
  101. sglang/srt/layers/quantization/quark/quark_moe.py +197 -0
  102. sglang/srt/layers/quantization/unquant.py +34 -70
  103. sglang/srt/layers/quantization/utils.py +77 -2
  104. sglang/srt/layers/quantization/w4afp8.py +7 -8
  105. sglang/srt/layers/quantization/w8a8_fp8.py +5 -13
  106. sglang/srt/layers/quantization/w8a8_int8.py +5 -13
  107. sglang/srt/layers/radix_attention.py +6 -0
  108. sglang/srt/layers/rotary_embedding.py +1 -0
  109. sglang/srt/layers/sampler.py +5 -2
  110. sglang/srt/lora/layers.py +6 -2
  111. sglang/srt/lora/lora_manager.py +21 -22
  112. sglang/srt/lora/lora_registry.py +3 -3
  113. sglang/srt/lora/mem_pool.py +26 -24
  114. sglang/srt/lora/utils.py +10 -12
  115. sglang/srt/managers/cache_controller.py +80 -19
  116. sglang/srt/managers/detokenizer_manager.py +10 -2
  117. sglang/srt/managers/io_struct.py +23 -0
  118. sglang/srt/managers/mm_utils.py +1 -1
  119. sglang/srt/managers/schedule_batch.py +22 -48
  120. sglang/srt/managers/scheduler.py +28 -20
  121. sglang/srt/managers/session_controller.py +1 -1
  122. sglang/srt/managers/template_manager.py +7 -5
  123. sglang/srt/managers/tokenizer_manager.py +88 -39
  124. sglang/srt/managers/tp_worker.py +1 -0
  125. sglang/srt/managers/utils.py +59 -1
  126. sglang/srt/mem_cache/allocator.py +10 -157
  127. sglang/srt/mem_cache/allocator_ascend.py +147 -0
  128. sglang/srt/mem_cache/chunk_cache.py +1 -1
  129. sglang/srt/mem_cache/hicache_storage.py +14 -4
  130. sglang/srt/mem_cache/memory_pool.py +3 -3
  131. sglang/srt/mem_cache/memory_pool_host.py +35 -2
  132. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +56 -12
  133. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +8 -4
  134. sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +153 -59
  135. sglang/srt/mem_cache/storage/nixl/nixl_utils.py +19 -53
  136. sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +46 -7
  137. sglang/srt/model_executor/cuda_graph_runner.py +33 -33
  138. sglang/srt/model_executor/forward_batch_info.py +11 -10
  139. sglang/srt/model_executor/model_runner.py +93 -78
  140. sglang/srt/model_executor/npu_graph_runner.py +94 -0
  141. sglang/srt/model_loader/loader.py +24 -6
  142. sglang/srt/models/dbrx.py +12 -6
  143. sglang/srt/models/deepseek.py +2 -1
  144. sglang/srt/models/deepseek_nextn.py +5 -2
  145. sglang/srt/models/deepseek_v2.py +226 -223
  146. sglang/srt/models/ernie4.py +2 -2
  147. sglang/srt/models/glm4_moe.py +27 -65
  148. sglang/srt/models/glm4_moe_nextn.py +2 -1
  149. sglang/srt/models/glm4v.py +52 -1
  150. sglang/srt/models/glm4v_moe.py +8 -11
  151. sglang/srt/models/gpt_oss.py +41 -76
  152. sglang/srt/models/granitemoe.py +0 -1
  153. sglang/srt/models/grok.py +376 -48
  154. sglang/srt/models/interns1.py +12 -47
  155. sglang/srt/models/internvl.py +6 -51
  156. sglang/srt/models/llama.py +10 -2
  157. sglang/srt/models/llama4.py +18 -7
  158. sglang/srt/models/minicpm3.py +0 -1
  159. sglang/srt/models/mixtral.py +0 -2
  160. sglang/srt/models/nemotron_nas.py +435 -0
  161. sglang/srt/models/olmoe.py +0 -1
  162. sglang/srt/models/phi4mm.py +3 -21
  163. sglang/srt/models/qwen2.py +2 -2
  164. sglang/srt/models/qwen2_5_vl.py +2 -0
  165. sglang/srt/models/qwen2_moe.py +23 -23
  166. sglang/srt/models/qwen3.py +2 -2
  167. sglang/srt/models/qwen3_classification.py +84 -0
  168. sglang/srt/models/qwen3_moe.py +27 -43
  169. sglang/srt/models/step3_vl.py +8 -3
  170. sglang/srt/models/xverse_moe.py +11 -5
  171. sglang/srt/multimodal/processors/base_processor.py +3 -3
  172. sglang/srt/multimodal/processors/internvl.py +7 -2
  173. sglang/srt/multimodal/processors/llava.py +11 -7
  174. sglang/srt/offloader.py +433 -0
  175. sglang/srt/operations.py +22 -2
  176. sglang/srt/reasoning_parser.py +4 -3
  177. sglang/srt/sampling/sampling_batch_info.py +7 -4
  178. sglang/srt/server_args.py +264 -105
  179. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +8 -21
  180. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +7 -21
  181. sglang/srt/speculative/eagle_utils.py +36 -13
  182. sglang/srt/speculative/eagle_worker.py +56 -3
  183. sglang/srt/tokenizer/tiktoken_tokenizer.py +161 -0
  184. sglang/srt/two_batch_overlap.py +20 -19
  185. sglang/srt/utils.py +68 -70
  186. sglang/test/runners.py +8 -5
  187. sglang/test/test_block_fp8.py +5 -6
  188. sglang/test/test_block_fp8_ep.py +13 -19
  189. sglang/test/test_cutlass_moe.py +4 -6
  190. sglang/test/test_cutlass_w4a8_moe.py +4 -3
  191. sglang/test/test_fp4_moe.py +4 -3
  192. sglang/test/test_marlin_moe.py +1 -1
  193. sglang/test/test_marlin_utils.py +1 -1
  194. sglang/test/test_utils.py +7 -0
  195. sglang/utils.py +0 -1
  196. sglang/version.py +1 -1
  197. {sglang-0.5.0rc1.dist-info → sglang-0.5.1.dist-info}/METADATA +11 -11
  198. {sglang-0.5.0rc1.dist-info → sglang-0.5.1.dist-info}/RECORD +201 -171
  199. sglang/srt/layers/quantization/fp4.py +0 -557
  200. sglang/srt/layers/quantization/scalar_type.py +0 -352
  201. {sglang-0.5.0rc1.dist-info → sglang-0.5.1.dist-info}/WHEEL +0 -0
  202. {sglang-0.5.0rc1.dist-info → sglang-0.5.1.dist-info}/licenses/LICENSE +0 -0
  203. {sglang-0.5.0rc1.dist-info → sglang-0.5.1.dist-info}/top_level.txt +0 -0
sglang/srt/layers/attention/utils.py
@@ -9,18 +9,89 @@ TRITON_PAD_NUM_PAGE_PER_BLOCK = 64
 
 @triton.jit
 def create_flashinfer_kv_indices_triton(
-    req_to_token_ptr,  # [max_batch, max_context_len]
+    req_to_token_ptr,
     req_pool_indices_ptr,
     page_kernel_lens_ptr,
     kv_indptr,
     kv_start_idx,
     kv_indices_ptr,
     req_to_token_ptr_stride: tl.constexpr,
+    PAGE_SIZE: tl.constexpr = 1,
 ):
+    """
+    Create KV indices for the FlashInfer attention backend.
+
+    This Triton kernel builds a lookup table that maps logical request/token
+    coordinates to physical token locations in the global KV cache pool. It is
+    used by FlashInfer attention backends to efficiently access scattered KV
+    cache data.
+
+    The kernel processes each request in parallel and converts the req_to_token
+    lookup table into a flat list of token indices that attention kernels can use.
+
+    General idea:
+        blocktables/kv_indices_ptr = [batch_size * max_pages] (for graph mode
+        with a fixed number of pages), where max_pages = max_context_len / PAGE_SIZE.
+        kv_indices_ptr stores the flat list of the pages used by each request.
+
+    Args:
+        Input arguments (not mutated):
+
+        req_to_token_ptr: Request-to-token-location lookup table.
+            Shape: [max_batch, max_context_len]
+        req_pool_indices_ptr: Request-to-pool-index lookup table. Each request
+            uses one pool.
+            Shape: [batch_size]
+        page_kernel_lens_ptr: Sequence length of each request.
+            Shape: [batch_size]
+        kv_indptr: Per-request start index into kv_indices_ptr, computed from
+            the number of pages used by each request. FlashInfer attention
+            kernels use it to index into kv_indices_ptr.
+            Shape: [batch_size + 1]
+            kv_indptr[i] = start index in kv_indices for request i
+        kv_start_idx: Pointer to an array of per-request start offsets.
+            Can be None. If provided, the offset is added to token positions.
+
+        req_to_token_ptr_stride: Stride of the second dimension of req_to_token.
+            Equal to max_context_len.
+
+        PAGE_SIZE: Number of tokens per page. Default is 1 for FlashInfer.
+
+    Outputs:
+        kv_indices_ptr: Pointer to the output array where KV indices are stored.
+            Shape: [total_num_pages],
+            where total_num_pages = sum(ceil(seq_len / PAGE_SIZE)) over requests
+
+    Example:
+        If we have:
+        - req_pool_indices = [0, 1] (request 0 uses pool 0, request 1 uses pool 1)
+        - page_kernel_lens = [3, 2] (request 0 has 3 tokens, request 1 has 2 tokens)
+        - req_to_token = [[10, 11, 12, -1], [20, 21, -1, -1]] (the stored values
+          are token locations in the KV cache pool; -1 marks unused slots)
+
+        The kernel will output:
+        If PAGE_SIZE = 1:
+            packed:
+            - kv_indptr (passed in as input arg): [0, 3, 5]
+            - kv_indices = [10, 11, 12, 20, 21]
+            padded (max_pages is 10 pages per request):
+            - kv_indptr (passed in as input arg): [0, 10, 20]
+            - kv_indices = [10, 11, 12, -1, -1, -1, -1, -1, -1, -1,
+                            20, 21, -1, -1, -1, -1, -1, -1, -1, -1]
+
+        If PAGE_SIZE = 2:
+            packed:
+            - kv_indptr (passed in as input arg): [0, 2, 3]
+            - kv_indices = [5, 6, 10]
+            padded (max_pages is 4):
+            - kv_indptr (passed in as input arg): [0, 4, 8] (note that 4 is max_pages)
+            - kv_indices = [5, 6, -1, -1,
+                            10, -1, -1, -1]
+
+    This allows attention kernels to directly access the correct KV cache
+    entries for each request's tokens.
+    """
     BLOCK_SIZE: tl.constexpr = 512
+    NUM_PAGES_PER_BLOCK: tl.constexpr = BLOCK_SIZE // PAGE_SIZE
     pid = tl.program_id(axis=0)
-
-    # find the req pool idx, this is for batch to token
     req_pool_index = tl.load(req_pool_indices_ptr + pid)
     kv_indices_offset = tl.load(kv_indptr + pid)
 
@@ -31,19 +102,27 @@ def create_flashinfer_kv_indices_triton(
     kv_end = kv_start
     kv_end += tl.load(page_kernel_lens_ptr + pid).to(tl.int32)
 
-    num_loop = tl.cdiv(kv_end - kv_start, BLOCK_SIZE)
-    for i in range(num_loop):
-        # index into req_to_token_ptr needs to be int64
-        offset = tl.arange(0, BLOCK_SIZE).to(tl.int64) + i * BLOCK_SIZE
-        mask = offset < kv_end - kv_start
-        data = tl.load(
-            req_to_token_ptr
-            + req_pool_index * req_to_token_ptr_stride
-            + kv_start
-            + offset,
-            mask=mask,
+    kv_range = kv_end - kv_start
+    num_pages = tl.cdiv(kv_range, PAGE_SIZE)
+    num_loops = tl.cdiv(kv_range, BLOCK_SIZE)
+    req_to_token_block_start = (
+        req_to_token_ptr + req_pool_index * req_to_token_ptr_stride + kv_start
+    )
+    for i in range(num_loops):
+        token_offsets_in_block = (
+            tl.arange(0, NUM_PAGES_PER_BLOCK).to(tl.int64) + i * NUM_PAGES_PER_BLOCK
+        ) * PAGE_SIZE
+        page_offsets_in_block = token_offsets_in_block // PAGE_SIZE
+        valid_tokens = token_offsets_in_block < kv_range
+        valid_pages = page_offsets_in_block < num_pages
+        token_numbers = tl.load(
+            req_to_token_block_start + token_offsets_in_block, mask=valid_tokens
+        )
+        tl.store(
+            kv_indices_ptr + kv_indices_offset + page_offsets_in_block,
+            token_numbers // PAGE_SIZE,  # write the page numbers to kv_indices_ptr
+            mask=valid_pages,
         )
-        tl.store(kv_indices_ptr + kv_indices_offset + offset, data, mask=mask)
 
 
 @triton.jit
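
For readers who want to sanity-check the paging math, here is a host-side PyTorch sketch of what the rewritten kernel computes. `build_kv_indices_reference` is a hypothetical helper written for this note, not part of the diff:

import torch

def build_kv_indices_reference(
    req_to_token: torch.Tensor,      # [max_batch, max_context_len]
    req_pool_indices: torch.Tensor,  # [batch_size]
    seq_lens: torch.Tensor,          # [batch_size]
    kv_indptr: torch.Tensor,         # [batch_size + 1]
    kv_indices: torch.Tensor,        # flat output, one entry per page
    page_size: int = 1,
) -> None:
    for i in range(seq_lens.numel()):
        pool = int(req_pool_indices[i])
        tokens = req_to_token[pool, : int(seq_lens[i])]
        # The first token of every page, divided by page_size, is the page number.
        pages = tokens[::page_size] // page_size
        start = int(kv_indptr[i])
        kv_indices[start : start + pages.numel()] = pages

# The PAGE_SIZE = 2 example from the docstring:
req_to_token = torch.tensor([[10, 11, 12, -1], [20, 21, -1, -1]])
kv_indices = torch.full((3,), -1)
build_kv_indices_reference(
    req_to_token,
    torch.tensor([0, 1]),   # req_pool_indices
    torch.tensor([3, 2]),   # seq_lens
    torch.tensor([0, 2, 3]),  # kv_indptr
    kv_indices,
    page_size=2,
)
assert kv_indices.tolist() == [5, 6, 10]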
sglang/srt/layers/attention/vision.py
@@ -12,7 +12,12 @@ import torch.nn.functional as F
 from einops import rearrange
 
 from sglang.srt.layers.dp_attention import get_attention_tp_rank, get_attention_tp_size
-from sglang.srt.utils import is_cuda, print_info_once
+from sglang.srt.utils import (
+    get_device_capability,
+    is_blackwell,
+    is_cuda,
+    print_info_once,
+)
 
 _is_cuda = is_cuda()
 
@@ -20,7 +25,6 @@ if _is_cuda:
     from sgl_kernel.flash_attn import flash_attn_varlen_func
 
 from sglang.srt.distributed import (
-    parallel_state,
     split_tensor_along_last_dim,
    tensor_model_parallel_all_gather,
 )
@@ -402,18 +406,14 @@ class VisionAttention(nn.Module):
             self.dummy_dim, eps=layer_norm_eps, var_hidden_size=embed_dim
         )
 
-        # priority: server_args > passed qkv_backend > sdpa
-        if global_server_args_dict["mm_attention_backend"] is None:
-            if qkv_backend is None:
-                if is_cuda():
-                    # Double prefill throughput by setting attn backend to Triton on CUDA
-                    qkv_backend = "triton_attn"
-                else:
-                    qkv_backend = "sdpa"
+        # Select attention backend via a unified method
+        _passed_backend = qkv_backend
+        qkv_backend = self._determine_attention_backend(_passed_backend)
+        if (
+            global_server_args_dict["mm_attention_backend"] is None
+            and _passed_backend is None
+        ):
             print_info_once(f"Multimodal attention backend not set. Use {qkv_backend}.")
-        else:
-            qkv_backend = global_server_args_dict["mm_attention_backend"]
-
         print_info_once(f"Using {qkv_backend} as multimodal attention backend.")
 
         self.customized_position_embedding_applier = (
@@ -461,6 +461,33 @@
             prefix=add_prefix("proj", prefix),
         )
 
+    def _determine_attention_backend(self, passed_backend: Optional[str]) -> str:
+        """Decide the multimodal attention backend string.
+
+        Priority: server args override > constructor arg > platform default.
+
+        Platform defaults:
+        - CUDA, SM90 (Hopper): "fa3"
+        - Other CUDA: "triton_attn"
+        - Non-CUDA: "sdpa"
+        """
+        override_backend = global_server_args_dict["mm_attention_backend"]
+        if override_backend is not None:
+            backend = override_backend
+        elif passed_backend is not None:
+            backend = passed_backend
+        elif is_cuda():
+            major, minor = get_device_capability()
+            if major == 9:
+                backend = "fa3"
+            else:
+                backend = "triton_attn"
+        else:
+            backend = "sdpa"
+        if backend == "fa3" and is_blackwell():
+            raise ValueError("The 'fa3' backend is not supported on Blackwell GPUs")
+
+        return backend
+
     def _apply_qk_norm(self, q: torch.Tensor, k: torch.Tensor):
         """apply qk norm for internvl vit attn"""
         q = q.flatten(1, 2)
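
As a sanity check on the priority order, here is a self-contained toy mirror of the helper's logic, written for this note and not part of the diff:

from typing import Optional

def resolve_mm_backend(
    server_override: Optional[str],
    constructor_arg: Optional[str],
    on_cuda: bool,
    sm_major: int = 0,
) -> str:
    if server_override is not None:   # --mm-attention-backend wins
        return server_override
    if constructor_arg is not None:   # then the qkv_backend constructor arg
        return constructor_arg
    if on_cuda:                       # then the platform default
        return "fa3" if sm_major == 9 else "triton_attn"
    return "sdpa"

assert resolve_mm_backend("sdpa", "triton_attn", True, 9) == "sdpa"
assert resolve_mm_backend(None, None, True, 9) == "fa3"          # Hopper default
assert resolve_mm_backend(None, None, True, 8) == "triton_attn"
assert resolve_mm_backend(None, None, False) == "sdpa"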
sglang/srt/layers/attention/vision_utils.py (new file)
@@ -0,0 +1,65 @@
+"""Utility functions for vision attention layers."""
+
+import torch
+
+from sglang.srt.layers.dp_attention import get_attention_tp_size
+
+
+def update_vit_attn_dummy_heads_config(config):
+    """Update HF config to ensure vision attention num_attention_heads is divisible by tp_size"""
+    tp_size = get_attention_tp_size()
+    num_heads = getattr(
+        config.vision_config,
+        "num_heads",
+        getattr(config.vision_config, "num_attention_heads", None),
+    )
+    head_dim = config.vision_config.hidden_size // num_heads
+    num_dummy_heads = 0
+
+    if num_heads % tp_size != 0:
+        num_dummy_heads = ((num_heads + tp_size - 1) // tp_size) * tp_size - num_heads
+
+    setattr(config.vision_config, "head_dim", head_dim)
+    setattr(config.vision_config, "num_dummy_heads", num_dummy_heads)
+
+
+def pad_vit_attn_dummy_heads(config, name: str, loaded_weight: torch.Tensor):
+    """Pad attention qkv weights for dummy heads"""
+    num_dummy_heads = config.vision_config.num_dummy_heads
+    if num_dummy_heads == 0:
+        return loaded_weight
+    head_dim = config.vision_config.head_dim
+
+    if "attn.qkv_proj" in name:
+        wq, wk, wv = loaded_weight.chunk(3, dim=0)
+        if name.endswith(".weight"):
+            dummy_shape = [num_dummy_heads, head_dim, wq.shape[-1]]
+        elif name.endswith(".bias"):
+            dummy_shape = [num_dummy_heads, head_dim]
+        else:
+            raise RuntimeError(f"Unsupported weight with name={name}")
+        pad_func = lambda x: torch.cat(
+            [x.unflatten(0, (-1, head_dim)), x.new_zeros(dummy_shape)], dim=0
+        ).flatten(0, 1)
+        wq, wk, wv = pad_func(wq), pad_func(wk), pad_func(wv)
+        loaded_weight = torch.cat([wq, wk, wv], dim=0)
+    elif any([_ in name for _ in ["attn.q_proj", "attn.k_proj", "attn.v_proj"]]):
+        if name.endswith(".weight"):
+            dummy_shape = [num_dummy_heads, head_dim, loaded_weight.shape[-1]]
+        elif name.endswith(".bias"):
+            dummy_shape = [num_dummy_heads, head_dim]
+        else:
+            raise RuntimeError(f"Unsupported weight with name={name}")
+        padded_weight = loaded_weight.new_zeros(dummy_shape)
+        loaded_weight = torch.cat(
+            [loaded_weight.unflatten(0, (-1, head_dim)), padded_weight], dim=0
+        ).flatten(0, 1)
+    elif "attn.proj.weight" in name:
+        padded_weight = loaded_weight.new_zeros(
+            loaded_weight.shape[0], head_dim * num_dummy_heads
+        )
+        loaded_weight = torch.cat([loaded_weight, padded_weight], dim=-1)
+    elif "attn.q_norm.weight" in name or "attn.k_norm.weight" in name:
+        padded_weight = loaded_weight.new_zeros(head_dim * num_dummy_heads)
+        loaded_weight = torch.cat([loaded_weight, padded_weight], dim=0)
+    return loaded_weight
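
A quick worked example of the padding math above. The 25-head, 3200-wide ViT numbers are illustrative (in the spirit of InternVL's vision tower), not taken from the diff:

# num_heads=25 does not divide tp_size=8, so pad up to the next multiple of 8.
num_heads, tp_size, hidden_size = 25, 8, 3200
head_dim = hidden_size // num_heads                                # 128
num_dummy_heads = ((num_heads + tp_size - 1) // tp_size) * tp_size - num_heads
assert num_dummy_heads == 7                                        # 32 - 25
assert (num_heads + num_dummy_heads) % tp_size == 0                # 4 heads per rank
# pad_vit_attn_dummy_heads then zero-pads each of wq/wk/wv with
# num_dummy_heads * head_dim = 896 extra rows (and attn.proj with 896 extra
# columns), so the zero heads contribute nothing to the output.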
sglang/srt/layers/communicator.py
@@ -17,7 +17,7 @@ from enum import Enum, auto
 from functools import partial
 from typing import Dict, Optional
 
-import torch.distributed
+import torch
 
 from sglang.srt.distributed import (
     get_tensor_model_parallel_world_size,
@@ -32,6 +32,13 @@ from sglang.srt.layers.dp_attention import (
     get_attention_dp_size,
     get_attention_tp_rank,
     get_attention_tp_size,
+    get_global_dp_buffer,
+    get_local_dp_buffer,
+    is_dp_attention_enabled,
+)
+from sglang.srt.layers.moe import (
+    get_moe_a2a_backend,
+    should_use_flashinfer_cutlass_moe_fp4_allgather,
 )
 from sglang.srt.layers.utils import is_sm100_supported
 from sglang.srt.managers.schedule_batch import global_server_args_dict
@@ -41,6 +48,8 @@ from sglang.srt.utils import is_cuda, is_flashinfer_available
 _is_flashinfer_available = is_flashinfer_available()
 _is_sm100_supported = is_cuda() and is_sm100_supported()
 
+FUSE_ALLREDUCE_MAX_BATCH_SIZE = 2048
+
 
 class ScatterMode(Enum):
     """
@@ -109,7 +118,11 @@ class LayerScatterModes:
         if context.is_layer_sparse:
             return (
                 ScatterMode.SCATTERED
-                if not global_server_args_dict["moe_a2a_backend"].is_standard()
+                if (
+                    # Token dispatch/combine will be handled outside of LayerCommunicator for these modes.
+                    not get_moe_a2a_backend().is_none()
+                    or should_use_flashinfer_cutlass_moe_fp4_allgather()
+                )
                 else ScatterMode.FULL
             )
         else:
@@ -152,11 +165,13 @@
         post_attention_layernorm: torch.nn.Module,
         # Reduce scatter requires skipping all-reduce in model code after MoE/MLP, so only enable for models which have that implemented. Remove flag once done for all models that use LayerCommunicator.
         allow_reduce_scatter: bool = False,
+        is_last_layer: bool = False,
     ):
         self.layer_scatter_modes = layer_scatter_modes
         self.input_layernorm = input_layernorm
         self.post_attention_layernorm = post_attention_layernorm
         self.allow_reduce_scatter = allow_reduce_scatter
+        self.is_last_layer = is_last_layer
 
         self._context = CommunicateContext.init_new()
         self._communicate_simple_fn = CommunicateSimpleFn.get_fn(
@@ -254,6 +269,41 @@
             and forward_batch.dp_padding_mode.is_max_len()
         )
 
+    def should_fuse_mlp_allreduce_with_next_layer(
+        self, forward_batch: ForwardBatch
+    ) -> bool:
+        speculative_algo = global_server_args_dict.get("speculative_algorithm", None)
+        if (
+            is_dp_attention_enabled()
+            and speculative_algo is not None
+            and speculative_algo.is_eagle()
+        ):
+            return False
+
+        batch_size = (
+            forward_batch.input_ids.shape[0]
+            if hasattr(forward_batch, "input_ids")
+            else 0
+        )
+        if batch_size > FUSE_ALLREDUCE_MAX_BATCH_SIZE:
+            return False
+
+        static_conditions_met = (
+            (not self.is_last_layer)
+            and (self._context.tp_size > 1)
+            and global_server_args_dict.get("enable_flashinfer_allreduce_fusion", False)
+            and _is_flashinfer_available
+        )
+
+        if not static_conditions_met:
+            return False
+
+        return (
+            batch_size > 0
+            and batch_size <= FUSE_ALLREDUCE_MAX_BATCH_SIZE
+            and (not self.is_last_layer)
+        )
+
 
 @dataclass
 class CommunicateContext:
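
For context, a self-contained toy mirror of the gate above (illustrative only; it omits the EAGLE speculative-decoding early-out and reuses the 2048-token cap from the diff):

FUSE_ALLREDUCE_MAX_BATCH_SIZE = 2048

def can_fuse(batch_size: int, tp_size: int, flashinfer_ok: bool,
             fusion_enabled: bool, is_last_layer: bool) -> bool:
    if batch_size <= 0 or batch_size > FUSE_ALLREDUCE_MAX_BATCH_SIZE:
        return False
    return (not is_last_layer) and tp_size > 1 and fusion_enabled and flashinfer_ok

assert can_fuse(512, 8, True, True, False)       # typical batch: fuse allreduce+norm
assert not can_fuse(4096, 8, True, True, False)  # too large: plain all-reduce
assert not can_fuse(512, 1, True, True, False)   # no TP: nothing to reduce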
319
369
  context: CommunicateContext,
320
370
  ) -> torch.Tensor:
321
371
  hidden_states, local_hidden_states = (
322
- forward_batch.gathered_buffer[: forward_batch.input_ids.shape[0]],
372
+ get_local_dp_buffer(),
323
373
  hidden_states,
324
374
  )
325
375
  attn_tp_all_gather_into_tensor(
@@ -380,7 +430,7 @@
         )
 
         raise NotImplementedError(
-            f"{hidden_states_input_mode=} {residual_input_mode=} {residual_output_mode=} {residual_output_mode=}"
+            f"{hidden_states_input_mode=} {residual_input_mode=} {hidden_states_output_mode=} {residual_output_mode=}"
         )
 
     @staticmethod
@@ -408,9 +458,7 @@
     ):
         if residual_input_mode == ScatterMode.SCATTERED and context.attn_tp_size > 1:
             residual, local_residual = (
-                torch.empty_like(
-                    forward_batch.gathered_buffer[: forward_batch.input_ids.shape[0]]
-                ),
+                get_local_dp_buffer(),
                 residual,
             )
             attn_tp_all_gather_into_tensor(residual, local_residual)
@@ -424,7 +472,7 @@
             residual = hidden_states
             hidden_states = layernorm(hidden_states)
             hidden_states, local_hidden_states = (
-                torch.empty_like(forward_batch.gathered_buffer),
+                get_global_dp_buffer(),
                 hidden_states,
             )
             dp_gather_partial(hidden_states, local_hidden_states, forward_batch)
@@ -548,7 +596,7 @@
         allow_reduce_scatter: bool = False,
     ):
         hidden_states, global_hidden_states = (
-            forward_batch.gathered_buffer[: forward_batch.input_ids.shape[0]],
+            get_local_dp_buffer(),
             hidden_states,
         )
         if allow_reduce_scatter and forward_batch.dp_padding_mode.is_max_len():
@@ -569,7 +617,7 @@
             hidden_states += residual
             residual = None
         hidden_states, local_hidden_states = (
-            forward_batch.gathered_buffer[: forward_batch.input_ids.shape[0]],
+            get_local_dp_buffer(),
             hidden_states,
         )
         attn_tp_all_gather_into_tensor(
sglang/srt/layers/dp_attention.py
@@ -4,7 +4,7 @@ import functools
 import logging
 from contextlib import contextmanager
 from enum import IntEnum, auto
-from typing import TYPE_CHECKING, List, Tuple
+from typing import TYPE_CHECKING, List, Optional, Tuple
 
 import torch
 import triton
@@ -18,21 +18,26 @@ from sglang.srt.distributed import (
     tensor_model_parallel_all_reduce,
 )
 
+if TYPE_CHECKING:
+    from sglang.srt.configs.model_config import ModelConfig
+    from sglang.srt.server_args import ServerArgs
+
 logger = logging.getLogger(__name__)
 
 if TYPE_CHECKING:
     from sglang.srt.model_executor.forward_batch_info import ForwardBatch
 
-_ATTN_TP_GROUP = None
-_ATTN_TP_RANK = None
-_ATTN_TP_SIZE = None
-_ATTN_DP_RANK = None
-_ATTN_DP_SIZE = None
-_LOCAL_ATTN_DP_SIZE = None
-_LOCAL_ATTN_DP_RANK = None
+_ATTN_TP_GROUP: Optional[GroupCoordinator] = None
+_ATTN_TP_RANK: Optional[int] = None
+_ATTN_TP_SIZE: Optional[int] = None
+_ATTN_DP_RANK: Optional[int] = None
+_ATTN_DP_SIZE: Optional[int] = None
+_LOCAL_ATTN_DP_SIZE: Optional[int] = None
+_LOCAL_ATTN_DP_RANK: Optional[int] = None
+_ENABLE_DP_ATTENTION_FLAG: bool = False
 
 
-class DPPaddingMode(IntEnum):
+class DpPaddingMode(IntEnum):
 
     # Padding tokens to max length and then gather tokens using `all_gather_into_tensor`
     MAX_LEN = auto()
@@ -40,13 +45,13 @@ class DPPaddingMode(IntEnum):
     SUM_LEN = auto()
 
     def is_max_len(self):
-        return self == DPPaddingMode.MAX_LEN
+        return self == DpPaddingMode.MAX_LEN
 
     def is_sum_len(self):
-        return self == DPPaddingMode.SUM_LEN
+        return self == DpPaddingMode.SUM_LEN
 
     @classmethod
-    def get_dp_padding_mode(cls, global_num_tokens: List[int]) -> DPPaddingMode:
+    def get_dp_padding_mode(cls, global_num_tokens: List[int]) -> DpPaddingMode:
         # we choose the mode that minimizes the communication cost
         max_len = max(global_num_tokens)
         sum_len = sum(global_num_tokens)
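
The trade-off being weighed, with illustrative numbers (the actual comparison is in the elided lines of this method):

global_num_tokens = [7, 3, 2]   # tokens contributed by each DP rank
max_len, sum_len = max(global_num_tokens), sum(global_num_tokens)
padded_rows = max_len * len(global_num_tokens)  # MAX_LEN: 21 rows in the gathered buffer
exact_rows = sum_len                            # SUM_LEN: 12 rows, but a variable-length gather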
@@ -56,10 +61,95 @@
         return cls.SUM_LEN
 
     @classmethod
-    def get_default_mode_in_cuda_graph(cls) -> DPPaddingMode:
+    def get_default_mode_in_cuda_graph(cls) -> DpPaddingMode:
         return cls.MAX_LEN
 
 
+class _DpGatheredBufferWrapper:
+
+    _hidden_size: int
+    _dtype: torch.dtype
+    _device: torch.device
+    _global_dp_buffer_len: int
+    _local_dp_buffer_len: int
+    _global_num_tokens: Optional[List[int]]
+
+    @classmethod
+    def set_metadata(cls, hidden_size: int, dtype: torch.dtype, device: torch.device):
+        cls._hidden_size = hidden_size
+        cls._dtype = dtype
+        cls._device = device
+
+    @classmethod
+    def set_dp_buffer_len(
+        cls,
+        global_dp_buffer_len: int,
+        local_dp_buffer_len: int,
+        global_num_tokens: Optional[List[int]] = None,
+    ):
+        cls._global_dp_buffer_len = global_dp_buffer_len
+        cls._local_dp_buffer_len = local_dp_buffer_len
+        cls._global_num_tokens = global_num_tokens
+
+    @classmethod
+    def get_global_dp_buffer(cls) -> torch.Tensor:
+        return torch.empty(
+            (cls._global_dp_buffer_len, cls._hidden_size),
+            dtype=cls._dtype,
+            device=cls._device,
+        )
+
+    @classmethod
+    def get_local_dp_buffer(cls) -> torch.Tensor:
+        return torch.empty(
+            (cls._local_dp_buffer_len, cls._hidden_size),
+            dtype=cls._dtype,
+            device=cls._device,
+        )
+
+    @classmethod
+    def get_global_dp_buffer_len(cls) -> int:
+        return cls._global_dp_buffer_len
+
+    @classmethod
+    def get_local_dp_buffer_len(cls) -> int:
+        return cls._local_dp_buffer_len
+
+    @classmethod
+    def get_dp_global_num_tokens(cls) -> List[int]:
+        return cls._global_num_tokens
+
+
+def set_dp_buffer_len(
+    global_dp_buffer_len: int,
+    local_dp_buffer_len: int,
+    global_num_tokens: Optional[List[int]] = None,
+):
+    _DpGatheredBufferWrapper.set_dp_buffer_len(
+        global_dp_buffer_len, local_dp_buffer_len, global_num_tokens
+    )
+
+
+def get_global_dp_buffer() -> torch.Tensor:
+    return _DpGatheredBufferWrapper.get_global_dp_buffer()
+
+
+def get_local_dp_buffer() -> torch.Tensor:
+    return _DpGatheredBufferWrapper.get_local_dp_buffer()
+
+
+def get_global_dp_buffer_len() -> int:
+    return _DpGatheredBufferWrapper.get_global_dp_buffer_len()
+
+
+def get_local_dp_buffer_len() -> int:
+    return _DpGatheredBufferWrapper.get_local_dp_buffer_len()
+
+
+def get_dp_global_num_tokens() -> List[int]:
+    return _DpGatheredBufferWrapper.get_dp_global_num_tokens()
+
+
 def compute_dp_attention_world_info(enable_dp_attention, tp_rank, tp_size, dp_size):
     if not enable_dp_attention:
         return tp_rank, tp_size, 0
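
A minimal sketch of the intended call sequence, assuming the per-batch `set_dp_buffer_len` call happens during forward-batch preparation (that call site is outside this diff, and the sizes below are illustrative):

import torch

# Once, at startup (initialize_dp_attention does this with real model values):
_DpGatheredBufferWrapper.set_metadata(
    hidden_size=4096, dtype=torch.bfloat16, device=torch.device("cuda")
)
# Once per forward batch, before the layers run:
set_dp_buffer_len(global_dp_buffer_len=8192, local_dp_buffer_len=2048)
# Inside layers, replacing the old forward_batch.gathered_buffer slicing:
buf = get_local_dp_buffer()   # fresh (2048, 4096) bf16 tensor on the GPU
assert buf.shape == (2048, 4096)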
@@ -89,18 +179,24 @@ compute_dp_attention_local_info(
 
 
 def initialize_dp_attention(
-    enable_dp_attention: bool,
-    tp_rank: int,
-    tp_size: int,
-    dp_size: int,
-    moe_dense_tp_size: int,
-    pp_size: int,
+    server_args: ServerArgs,
+    model_config: ModelConfig,
 ):
     global _ATTN_TP_GROUP, _ATTN_TP_RANK, _ATTN_TP_SIZE, _ATTN_DP_RANK, _ATTN_DP_SIZE
-    global _LOCAL_ATTN_DP_SIZE, _LOCAL_ATTN_DP_RANK
+    global _LOCAL_ATTN_DP_SIZE, _LOCAL_ATTN_DP_RANK, _ENABLE_DP_ATTENTION_FLAG
 
     from sglang.srt.layers.sampler import SYNC_TOKEN_IDS_ACROSS_TP
 
+    enable_dp_attention = server_args.enable_dp_attention
+    tp_size = server_args.tp_size
+    dp_size = server_args.dp_size
+    moe_dense_tp_size = server_args.moe_dense_tp_size
+    pp_size = server_args.pp_size
+
+    tp_rank = get_tensor_model_parallel_rank()
+
+    _ENABLE_DP_ATTENTION_FLAG = enable_dp_attention
+
     _ATTN_TP_RANK, _ATTN_TP_SIZE, _ATTN_DP_RANK = compute_dp_attention_world_info(
         enable_dp_attention, tp_rank, tp_size, dp_size
     )
@@ -135,38 +231,48 @@
         group_name="attention_tp",
     )
 
+    _DpGatheredBufferWrapper.set_metadata(
+        hidden_size=model_config.hidden_size,
+        dtype=model_config.dtype,
+        device=torch.device(server_args.device),
+    )
+
+
+def is_dp_attention_enabled() -> bool:
+    return _ENABLE_DP_ATTENTION_FLAG
 
-def get_attention_tp_group():
+
+def get_attention_tp_group() -> GroupCoordinator:
     assert _ATTN_TP_GROUP is not None, "dp attention not initialized!"
     return _ATTN_TP_GROUP
 
 
-def get_attention_tp_rank():
+def get_attention_tp_rank() -> int:
     assert _ATTN_TP_RANK is not None, "dp attention not initialized!"
     return _ATTN_TP_RANK
 
 
-def get_attention_tp_size():
+def get_attention_tp_size() -> int:
     assert _ATTN_TP_SIZE is not None, "dp attention not initialized!"
     return _ATTN_TP_SIZE
 
 
-def get_attention_dp_rank():
+def get_attention_dp_rank() -> int:
     assert _ATTN_DP_RANK is not None, "dp attention not initialized!"
     return _ATTN_DP_RANK
 
 
-def get_attention_dp_size():
+def get_attention_dp_size() -> int:
     assert _ATTN_DP_SIZE is not None, "dp attention not initialized!"
     return _ATTN_DP_SIZE
 
 
-def get_local_attention_dp_rank():
+def get_local_attention_dp_rank() -> int:
     assert _LOCAL_ATTN_DP_RANK is not None, "dp attention not initialized!"
     return _LOCAL_ATTN_DP_RANK
 
 
-def get_local_attention_dp_size():
+def get_local_attention_dp_size() -> int:
     assert _LOCAL_ATTN_DP_SIZE is not None, "dp attention not initialized!"
     return _LOCAL_ATTN_DP_SIZE
 
@@ -292,6 +398,10 @@ _dp_gather_via_all_gather(
     forward_batch: ForwardBatch,
     is_partial: bool,
 ):
+    if get_attention_tp_size() == 1:
+        get_tp_group().all_gather_into_tensor(global_tokens, local_tokens)
+        return
+
     if not is_partial:
         if get_attention_tp_rank() != 0:
             local_tokens.fill_(0)