PyPI - sglang - Versions diffs - 0.5.0rc1__py3-none-any.whl → 0.5.1__py3-none-any.whl - Mend

sglang 0.5.0rc1py3-none-any.whl → 0.5.1py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (203) hide show

sglang/bench_one_batch.py +0 -7
sglang/bench_one_batch_server.py +7 -2
sglang/bench_serving.py +3 -3
sglang/eval/llama3_eval.py +0 -1
sglang/srt/configs/model_config.py +25 -9
sglang/srt/configs/update_config.py +40 -5
sglang/srt/constrained/xgrammar_backend.py +23 -11
sglang/srt/conversation.py +2 -15
sglang/srt/disaggregation/ascend/conn.py +1 -3
sglang/srt/disaggregation/base/conn.py +1 -0
sglang/srt/disaggregation/decode.py +1 -2
sglang/srt/disaggregation/launch_lb.py +7 -1
sglang/srt/disaggregation/mini_lb.py +11 -5
sglang/srt/disaggregation/mooncake/conn.py +141 -47
sglang/srt/disaggregation/prefill.py +261 -5
sglang/srt/disaggregation/utils.py +2 -1
sglang/srt/distributed/device_communicators/custom_all_reduce.py +1 -1
sglang/srt/distributed/device_communicators/pynccl.py +68 -18
sglang/srt/distributed/device_communicators/pynccl_wrapper.py +52 -0
sglang/srt/distributed/naive_distributed.py +112 -0
sglang/srt/distributed/parallel_state.py +90 -4
sglang/srt/entrypoints/context.py +20 -1
sglang/srt/entrypoints/engine.py +29 -4
sglang/srt/entrypoints/http_server.py +76 -0
sglang/srt/entrypoints/openai/protocol.py +4 -2
sglang/srt/entrypoints/openai/serving_chat.py +23 -6
sglang/srt/entrypoints/openai/serving_completions.py +10 -1
sglang/srt/entrypoints/openai/serving_responses.py +2 -2
sglang/srt/eplb/expert_distribution.py +2 -3
sglang/srt/function_call/deepseekv3_detector.py +1 -1
sglang/srt/hf_transformers_utils.py +24 -0
sglang/srt/host_shared_memory.py +83 -0
sglang/srt/layers/attention/ascend_backend.py +132 -22
sglang/srt/layers/attention/flashattention_backend.py +24 -17
sglang/srt/layers/attention/flashinfer_backend.py +14 -3
sglang/srt/layers/attention/flashinfer_mla_backend.py +227 -76
sglang/srt/layers/attention/triton_backend.py +109 -73
sglang/srt/layers/attention/triton_ops/decode_attention.py +33 -2
sglang/srt/layers/attention/triton_ops/extend_attention.py +32 -2
sglang/srt/layers/attention/trtllm_mha_backend.py +398 -36
sglang/srt/layers/attention/trtllm_mla_backend.py +49 -19
sglang/srt/layers/attention/utils.py +94 -15
sglang/srt/layers/attention/vision.py +40 -13
sglang/srt/layers/attention/vision_utils.py +65 -0
sglang/srt/layers/communicator.py +58 -10
sglang/srt/layers/dp_attention.py +137 -27
sglang/srt/layers/elementwise.py +94 -0
sglang/srt/layers/flashinfer_comm_fusion.py +29 -1
sglang/srt/layers/layernorm.py +8 -1
sglang/srt/layers/linear.py +24 -0
sglang/srt/layers/logits_processor.py +16 -18
sglang/srt/layers/moe/__init__.py +31 -0
sglang/srt/layers/moe/ep_moe/layer.py +37 -33
sglang/srt/layers/moe/fused_moe_native.py +14 -25
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=129,N=352,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=161,N=192,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_0/E=16,N=1024,device_name=NVIDIA_B200.json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20.json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=129,N=352,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Max-Q_Workstation_Edition,dtype=fp8_w8a8.json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=129,N=704,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=161,N=384,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Max-Q_Workstation_Edition,dtype=fp8_w8a8.json +146 -0
sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +69 -76
sglang/srt/layers/moe/fused_moe_triton/layer.py +66 -123
sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +20 -18
sglang/srt/layers/moe/moe_runner/__init__.py +3 -0
sglang/srt/layers/moe/moe_runner/base.py +13 -0
sglang/srt/layers/moe/rocm_moe_utils.py +141 -0
sglang/srt/layers/moe/router.py +15 -9
sglang/srt/layers/moe/token_dispatcher/__init__.py +6 -0
sglang/srt/layers/moe/token_dispatcher/base_dispatcher.py +55 -14
sglang/srt/layers/moe/token_dispatcher/deepep.py +11 -21
sglang/srt/layers/moe/token_dispatcher/standard.py +1 -1
sglang/srt/layers/moe/topk.py +167 -83
sglang/srt/layers/moe/utils.py +159 -18
sglang/srt/layers/multimodal.py +156 -40
sglang/srt/layers/quantization/__init__.py +18 -46
sglang/srt/layers/quantization/awq.py +22 -23
sglang/srt/layers/quantization/base_config.py +2 -6
sglang/srt/layers/quantization/blockwise_int8.py +4 -12
sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +72 -29
sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +2 -1
sglang/srt/layers/quantization/fp8.py +127 -119
sglang/srt/layers/quantization/fp8_kernel.py +195 -24
sglang/srt/layers/quantization/fp8_utils.py +34 -9
sglang/srt/layers/quantization/fpgemm_fp8.py +203 -0
sglang/srt/layers/quantization/gptq.py +17 -21
sglang/srt/layers/quantization/marlin_utils.py +26 -8
sglang/srt/layers/quantization/marlin_utils_fp8.py +352 -0
sglang/srt/layers/quantization/modelopt_quant.py +217 -98
sglang/srt/layers/quantization/moe_wna16.py +10 -15
sglang/srt/layers/quantization/mxfp4.py +222 -39
sglang/srt/layers/quantization/quark/quark.py +390 -0
sglang/srt/layers/quantization/quark/quark_moe.py +197 -0
sglang/srt/layers/quantization/unquant.py +34 -70
sglang/srt/layers/quantization/utils.py +77 -2
sglang/srt/layers/quantization/w4afp8.py +7 -8
sglang/srt/layers/quantization/w8a8_fp8.py +5 -13
sglang/srt/layers/quantization/w8a8_int8.py +5 -13
sglang/srt/layers/radix_attention.py +6 -0
sglang/srt/layers/rotary_embedding.py +1 -0
sglang/srt/layers/sampler.py +5 -2
sglang/srt/lora/layers.py +6 -2
sglang/srt/lora/lora_manager.py +21 -22
sglang/srt/lora/lora_registry.py +3 -3
sglang/srt/lora/mem_pool.py +26 -24
sglang/srt/lora/utils.py +10 -12
sglang/srt/managers/cache_controller.py +80 -19
sglang/srt/managers/detokenizer_manager.py +10 -2
sglang/srt/managers/io_struct.py +23 -0
sglang/srt/managers/mm_utils.py +1 -1
sglang/srt/managers/schedule_batch.py +22 -48
sglang/srt/managers/scheduler.py +28 -20
sglang/srt/managers/session_controller.py +1 -1
sglang/srt/managers/template_manager.py +7 -5
sglang/srt/managers/tokenizer_manager.py +88 -39
sglang/srt/managers/tp_worker.py +1 -0
sglang/srt/managers/utils.py +59 -1
sglang/srt/mem_cache/allocator.py +10 -157
sglang/srt/mem_cache/allocator_ascend.py +147 -0
sglang/srt/mem_cache/chunk_cache.py +1 -1
sglang/srt/mem_cache/hicache_storage.py +14 -4
sglang/srt/mem_cache/memory_pool.py +3 -3
sglang/srt/mem_cache/memory_pool_host.py +35 -2
sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +56 -12
sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +8 -4
sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +153 -59
sglang/srt/mem_cache/storage/nixl/nixl_utils.py +19 -53
sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +46 -7
sglang/srt/model_executor/cuda_graph_runner.py +33 -33
sglang/srt/model_executor/forward_batch_info.py +11 -10
sglang/srt/model_executor/model_runner.py +93 -78
sglang/srt/model_executor/npu_graph_runner.py +94 -0
sglang/srt/model_loader/loader.py +24 -6
sglang/srt/models/dbrx.py +12 -6
sglang/srt/models/deepseek.py +2 -1
sglang/srt/models/deepseek_nextn.py +5 -2
sglang/srt/models/deepseek_v2.py +226 -223
sglang/srt/models/ernie4.py +2 -2
sglang/srt/models/glm4_moe.py +27 -65
sglang/srt/models/glm4_moe_nextn.py +2 -1
sglang/srt/models/glm4v.py +52 -1
sglang/srt/models/glm4v_moe.py +8 -11
sglang/srt/models/gpt_oss.py +41 -76
sglang/srt/models/granitemoe.py +0 -1
sglang/srt/models/grok.py +376 -48
sglang/srt/models/interns1.py +12 -47
sglang/srt/models/internvl.py +6 -51
sglang/srt/models/llama.py +10 -2
sglang/srt/models/llama4.py +18 -7
sglang/srt/models/minicpm3.py +0 -1
sglang/srt/models/mixtral.py +0 -2
sglang/srt/models/nemotron_nas.py +435 -0
sglang/srt/models/olmoe.py +0 -1
sglang/srt/models/phi4mm.py +3 -21
sglang/srt/models/qwen2.py +2 -2
sglang/srt/models/qwen2_5_vl.py +2 -0
sglang/srt/models/qwen2_moe.py +23 -23
sglang/srt/models/qwen3.py +2 -2
sglang/srt/models/qwen3_classification.py +84 -0
sglang/srt/models/qwen3_moe.py +27 -43
sglang/srt/models/step3_vl.py +8 -3
sglang/srt/models/xverse_moe.py +11 -5
sglang/srt/multimodal/processors/base_processor.py +3 -3
sglang/srt/multimodal/processors/internvl.py +7 -2
sglang/srt/multimodal/processors/llava.py +11 -7
sglang/srt/offloader.py +433 -0
sglang/srt/operations.py +22 -2
sglang/srt/reasoning_parser.py +4 -3
sglang/srt/sampling/sampling_batch_info.py +7 -4
sglang/srt/server_args.py +264 -105
sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +8 -21
sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +7 -21
sglang/srt/speculative/eagle_utils.py +36 -13
sglang/srt/speculative/eagle_worker.py +56 -3
sglang/srt/tokenizer/tiktoken_tokenizer.py +161 -0
sglang/srt/two_batch_overlap.py +20 -19
sglang/srt/utils.py +68 -70
sglang/test/runners.py +8 -5
sglang/test/test_block_fp8.py +5 -6
sglang/test/test_block_fp8_ep.py +13 -19
sglang/test/test_cutlass_moe.py +4 -6
sglang/test/test_cutlass_w4a8_moe.py +4 -3
sglang/test/test_fp4_moe.py +4 -3
sglang/test/test_marlin_moe.py +1 -1
sglang/test/test_marlin_utils.py +1 -1
sglang/test/test_utils.py +7 -0
sglang/utils.py +0 -1
sglang/version.py +1 -1
{sglang-0.5.0rc1.dist-info → sglang-0.5.1.dist-info}/METADATA +11 -11
{sglang-0.5.0rc1.dist-info → sglang-0.5.1.dist-info}/RECORD +201 -171
sglang/srt/layers/quantization/fp4.py +0 -557
sglang/srt/layers/quantization/scalar_type.py +0 -352
{sglang-0.5.0rc1.dist-info → sglang-0.5.1.dist-info}/WHEEL +0 -0
{sglang-0.5.0rc1.dist-info → sglang-0.5.1.dist-info}/licenses/LICENSE +0 -0
{sglang-0.5.0rc1.dist-info → sglang-0.5.1.dist-info}/top_level.txt +0 -0

sglang/srt/layers/attention/triton_backend.py CHANGED Viewed

@@ -20,6 +20,14 @@ if TYPE_CHECKING:
     from sglang.srt.speculative.eagle_utils import EagleDraftInput, EagleVerifyInput
+def logit_capping_mod(logit_capping_method, logit_cap):
+    # positive logit_cap -> tanh cap
+    if logit_capping_method == "tanh":
+        return logit_cap
+    else:
+        raise ValueError()
 @dataclass
 class ForwardMetadata:
     attn_logits: torch.Tensor
@@ -35,6 +43,7 @@ class ForwardMetadata:
     window_kv_indptr: torch.Tensor
     window_kv_indices: torch.Tensor
     window_num_kv_splits: torch.Tensor
+    window_kv_offsets: torch.Tensor
 class TritonAttnBackend(AttentionBackend):
@@ -57,16 +66,36 @@ class TritonAttnBackend(AttentionBackend):
         self.decode_attention_fwd = torch.compiler.disable(decode_attention_fwd)
         self.extend_attention_fwd = torch.compiler.disable(extend_attention_fwd)
+        # Parse args
         self.skip_prefill = skip_prefill
         max_bs = model_runner.req_to_token_pool.size
+        self.sliding_window_size = model_runner.sliding_window_size
+        self.req_to_token = model_runner.req_to_token_pool.req_to_token
+        self.token_to_kv_pool_allocator = model_runner.token_to_kv_pool_allocator
+        self.num_draft_tokens = model_runner.server_args.speculative_num_draft_tokens
+        self.speculative_num_steps = model_runner.server_args.speculative_num_steps
+        self.num_head = (
+            model_runner.model_config.num_attention_heads // get_attention_tp_size()
+        )
+        self.num_kv_head = model_runner.model_config.get_num_kv_heads(
+            get_attention_tp_size()
+        )
+        self.v_head_dim = model_runner.token_to_kv_pool.get_value_buffer(0).shape[-1]
+        self.max_context_len = model_runner.model_config.context_len
+        self.device = model_runner.device
+        self.device_core_count = get_device_core_count(model_runner.gpu_id)
+        self.static_kv_splits = get_bool_env_var(
+            "SGLANG_TRITON_DECODE_ATTN_STATIC_KV_SPLITS", "false"
+        )
+        self.max_kv_splits = model_runner.server_args.triton_attention_num_kv_splits
+        # Check arguments
         assert not (
             model_runner.sliding_window_size is not None
             and model_runner.model_config.is_encoder_decoder
         ), "Sliding window and cross attention are not supported together"
-        self.sliding_window_size = model_runner.sliding_window_size
+        # Initialize buffers
         # TODO(Jianan Ji): Make sure it behaves as expected when kv_indptr_buf is provided and sliding window is enabled
         if kv_indptr_buf is None:
             self.kv_indptr = torch.zeros(
@@ -87,9 +116,6 @@ class TritonAttnBackend(AttentionBackend):
                 # When provided a buffer, create a clone for the second buffer
                 self.window_kv_indptr = torch.zeros_like(kv_indptr_buf)
-        self.req_to_token = model_runner.req_to_token_pool.req_to_token
-        self.token_to_kv_pool_allocator = model_runner.token_to_kv_pool_allocator
         if not self.skip_prefill:
             self.qo_indptr = torch.zeros(
                 (max_bs + 1,), dtype=torch.int32, device=model_runner.device
@@ -99,29 +125,9 @@ class TritonAttnBackend(AttentionBackend):
                 (max_bs + 1,), dtype=torch.int64, device=model_runner.device
             )
-        self.num_draft_tokens = model_runner.server_args.speculative_num_draft_tokens
-        self.speculative_num_steps = model_runner.server_args.speculative_num_steps
-        self.num_head = (
-            model_runner.model_config.num_attention_heads // get_attention_tp_size()
-        )
-        self.num_kv_head = model_runner.model_config.get_num_kv_heads(
-            get_attention_tp_size()
-        )
-        self.static_kv_splits = get_bool_env_var(
-            "SGLANG_TRITON_DECODE_ATTN_STATIC_KV_SPLITS", "false"
-        )
-        self.max_kv_splits = model_runner.server_args.triton_attention_num_kv_splits
-        self.v_head_dim = model_runner.token_to_kv_pool.get_value_buffer(0).shape[-1]
+        # Initialize forward metadata
         self.forward_metadata: ForwardMetadata = None
-        self.max_context_len = model_runner.model_config.context_len
-        self.device = model_runner.device
-        self.device_core_count = get_device_core_count(model_runner.gpu_id)
     def get_num_kv_splits(
         self,
         num_kv_splits: torch.Tensor,
@@ -166,6 +172,7 @@ class TritonAttnBackend(AttentionBackend):
         window_kv_indptr = self.window_kv_indptr
         window_kv_indices = None
         window_num_kv_splits = None
+        window_kv_offsets = None
         spec_info = forward_batch.spec_info
         if forward_batch.forward_mode.is_decode_or_idle():
@@ -173,7 +180,7 @@ class TritonAttnBackend(AttentionBackend):
                 kv_indptr[1 : bs + 1] = torch.cumsum(forward_batch.seq_lens, dim=0)
                 kv_indptr = kv_indptr[: bs + 1]
                 kv_indices = torch.empty(
-                    forward_batch.seq_lens_sum, dtype=torch.int32, device=self.device
+                    forward_batch.seq_lens_sum, dtype=torch.int64, device=self.device
                 )
                 create_flashinfer_kv_indices_triton[(bs,)](
                     self.req_to_token,
@@ -189,7 +196,7 @@ class TritonAttnBackend(AttentionBackend):
                     self.sliding_window_size is not None
                     and self.sliding_window_size > 0
                 ):
-                    window_kv_indptr, window_kv_indices, window_kv_lens = (
+                    window_kv_indptr, window_kv_indices, window_kv_lens, _ = (
                         update_sliding_window_buffer(
                             self.window_kv_indptr,
                             self.req_to_token,
@@ -239,7 +246,7 @@ class TritonAttnBackend(AttentionBackend):
             kv_indptr[1 : bs + 1] = torch.cumsum(forward_batch.seq_lens, dim=0)
             kv_indptr = kv_indptr[: bs + 1]
             kv_indices = torch.empty(
-                kv_indptr[-1], dtype=torch.int32, device=self.device
+                kv_indptr[-1], dtype=torch.int64, device=self.device
             )
             create_flashinfer_kv_indices_triton[(bs,)](
                 self.req_to_token,
@@ -252,17 +259,21 @@ class TritonAttnBackend(AttentionBackend):
             )
             if self.sliding_window_size is not None and self.sliding_window_size > 0:
-                window_kv_indptr, window_kv_indices, window_kv_lens = (
-                    update_sliding_window_buffer(
-                        self.window_kv_indptr,
-                        self.req_to_token,
-                        self.sliding_window_size,
-                        forward_batch.seq_lens,
-                        forward_batch.req_pool_indices,
-                        bs,
-                        self.device,
-                        self.token_to_kv_pool_allocator,
-                    )
+                # window_kv_offsets is used to calculate the start position in custom mask
+                (
+                    window_kv_indptr,
+                    window_kv_indices,
+                    window_kv_lens,
+                    window_kv_offsets,
+                ) = update_sliding_window_buffer(
+                    self.window_kv_indptr,
+                    self.req_to_token,
+                    self.sliding_window_size,
+                    forward_batch.seq_lens,
+                    forward_batch.req_pool_indices,
+                    bs,
+                    self.device,
+                    self.token_to_kv_pool_allocator,
                 )
             custom_mask = spec_info.custom_mask
@@ -286,6 +297,7 @@ class TritonAttnBackend(AttentionBackend):
                     self.req_to_token,
                 )
             )
+            kv_indices = kv_indices.to(torch.int64)
             mask_indptr = None
             # TODO(FIXME): This will trigger an invalid Eagle tree when using
             # `max(spec_info.accept_length_cpu)`.
@@ -301,7 +313,7 @@ class TritonAttnBackend(AttentionBackend):
             kv_indptr = kv_indptr[: bs + 1]
             kv_indices = torch.empty(
                 forward_batch.extend_prefix_lens.sum().item(),
-                dtype=torch.int32,
+                dtype=torch.int64,
                 device=self.device,
             )
             create_flashinfer_kv_indices_triton[(bs,)](
@@ -315,15 +327,17 @@ class TritonAttnBackend(AttentionBackend):
             )
             # Sliding window
             if self.sliding_window_size is not None and self.sliding_window_size > 0:
-                window_kv_indptr, window_kv_indices, _ = update_sliding_window_buffer(
-                    self.window_kv_indptr,
-                    self.req_to_token,
-                    self.sliding_window_size,
-                    forward_batch.extend_prefix_lens,
-                    forward_batch.req_pool_indices,
-                    bs,
-                    self.device,
-                    self.token_to_kv_pool_allocator,
+                window_kv_indptr, window_kv_indices, _, _ = (
+                    update_sliding_window_buffer(
+                        self.window_kv_indptr,
+                        self.req_to_token,
+                        self.sliding_window_size,
+                        forward_batch.extend_prefix_lens,
+                        forward_batch.req_pool_indices,
+                        bs,
+                        self.device,
+                        self.token_to_kv_pool_allocator,
+                    )
                 )
             qo_indptr = self.qo_indptr
@@ -333,7 +347,7 @@ class TritonAttnBackend(AttentionBackend):
             mask_indptr = None
             attn_logits = None
             attn_lse = None
-            max_extend_len = torch.max(forward_batch.extend_seq_lens).item()
+            max_extend_len = max(forward_batch.extend_seq_lens_cpu)
             num_kv_splits = None
         self.forward_metadata = ForwardMetadata(
@@ -349,6 +363,7 @@ class TritonAttnBackend(AttentionBackend):
             window_kv_indptr,
             window_kv_indices,
             window_num_kv_splits,
+            window_kv_offsets,
         )
     def init_cuda_graph_state(
@@ -373,7 +388,7 @@ class TritonAttnBackend(AttentionBackend):
         if kv_indices_buf is None:
             self.cuda_graph_kv_indices = torch.zeros(
                 (max_num_tokens * self.max_context_len),
-                dtype=torch.int32,
+                dtype=torch.int64,
                 device=self.device,
             )
         else:
@@ -390,7 +405,7 @@ class TritonAttnBackend(AttentionBackend):
             if kv_indices_buf is None:
                 self.cuda_graph_window_kv_indices = torch.zeros(
                     (max_num_tokens * self.sliding_window_size),
-                    dtype=torch.int32,
+                    dtype=torch.int64,
                     device=self.device,
                 )
             else:
@@ -403,6 +418,12 @@ class TritonAttnBackend(AttentionBackend):
                 device=self.device,
             )
+            self.cuda_graph_window_kv_offsets = torch.zeros(
+                (max_bs,),
+                dtype=torch.int32,
+                device=self.device,
+            )
     def init_forward_metadata_capture_cuda_graph(
         self,
         bs: int,
@@ -417,6 +438,7 @@ class TritonAttnBackend(AttentionBackend):
         window_kv_indptr = self.window_kv_indptr
         window_kv_indices = None
         window_num_kv_splits = None
+        window_kv_offsets = None
         if forward_mode.is_decode_or_idle():
             if spec_info is None:
@@ -439,7 +461,7 @@ class TritonAttnBackend(AttentionBackend):
                 ):
                     window_kv_indices = self.cuda_graph_window_kv_indices
                     window_num_kv_splits = self.cuda_graph_window_num_kv_splits
-                    window_kv_indptr, window_kv_indices, _ = (
+                    window_kv_indptr, window_kv_indices, _, _ = (
                         update_sliding_window_buffer_cuda_graph(
                             self.window_kv_indptr,
                             window_kv_indices,
@@ -486,13 +508,14 @@ class TritonAttnBackend(AttentionBackend):
             if self.sliding_window_size is not None and self.sliding_window_size > 0:
                 window_kv_indices = self.cuda_graph_window_kv_indices
                 window_num_kv_splits = self.cuda_graph_window_num_kv_splits
-                window_kv_indptr, window_kv_indices, _ = (
+                window_kv_offsets = self.cuda_graph_window_kv_offsets
+                window_kv_indptr, window_kv_indices, _, window_kv_offsets[:bs] = (
                     update_sliding_window_buffer_cuda_graph(
                         self.window_kv_indptr,
                         window_kv_indices,
                         self.req_to_token,
                         self.sliding_window_size,
-                        seq_lens,
+                        seq_lens[:bs],
                         req_pool_indices,
                         bs,
                         self.token_to_kv_pool_allocator,
@@ -554,6 +577,7 @@ class TritonAttnBackend(AttentionBackend):
             window_kv_indptr,
             window_kv_indices,
             window_num_kv_splits,
+            window_kv_offsets,
         )
     def init_forward_metadata_replay_cuda_graph(
@@ -592,7 +616,7 @@ class TritonAttnBackend(AttentionBackend):
                 ):
                     window_num_kv_splits = self.cuda_graph_window_num_kv_splits
                     window_kv_indices = self.cuda_graph_window_kv_indices
-                    _, _, window_kv_lens = update_sliding_window_buffer_cuda_graph(
+                    _, _, window_kv_lens, _ = update_sliding_window_buffer_cuda_graph(
                         self.window_kv_indptr,
                         window_kv_indices,
                         self.req_to_token,
@@ -638,15 +662,18 @@ class TritonAttnBackend(AttentionBackend):
             if self.sliding_window_size is not None and self.sliding_window_size > 0:
                 window_num_kv_splits = self.cuda_graph_window_num_kv_splits
                 window_kv_indices = self.cuda_graph_window_kv_indices
-                _, _, window_kv_lens = update_sliding_window_buffer_cuda_graph(
-                    self.window_kv_indptr,
-                    window_kv_indices,
-                    self.req_to_token,
-                    self.sliding_window_size,
-                    seq_lens,
-                    req_pool_indices,
-                    bs,
-                    self.token_to_kv_pool_allocator,
+                window_kv_offsets = self.cuda_graph_window_kv_offsets
+                _, _, window_kv_lens, window_kv_offsets[:bs] = (
+                    update_sliding_window_buffer_cuda_graph(
+                        self.window_kv_indptr,
+                        window_kv_indices,
+                        self.req_to_token,
+                        self.sliding_window_size,
+                        seq_lens[:bs],
+                        req_pool_indices,
+                        bs,
+                        self.token_to_kv_pool_allocator,
+                    )
                 )
             custom_mask = self.cuda_graph_custom_mask
             custom_mask[: spec_info.custom_mask.shape[0]] = spec_info.custom_mask
@@ -699,6 +726,8 @@ class TritonAttnBackend(AttentionBackend):
                 layer, forward_batch.out_cache_loc, k, v
             )
+        logits_soft_cap = logit_capping_mod(layer.logit_capping_method, layer.logit_cap)
         causal = True
         if layer.attn_type == AttentionType.ENCODER_ONLY:
             causal = False
@@ -709,10 +738,12 @@ class TritonAttnBackend(AttentionBackend):
             )  # Needed for sliding window mask
             kv_indptr = self.forward_metadata.window_kv_indptr
             kv_indices = self.forward_metadata.window_kv_indices
+            window_kv_offsets = self.forward_metadata.window_kv_offsets
         else:
             sliding_window_size = -1
             kv_indptr = self.forward_metadata.kv_indptr
             kv_indices = self.forward_metadata.kv_indices
+            window_kv_offsets = None
         self.extend_attention_fwd(
             q.view(-1, layer.tp_q_head_num, layer.qk_head_dim),
@@ -729,9 +760,11 @@ class TritonAttnBackend(AttentionBackend):
             self.forward_metadata.mask_indptr,
             self.forward_metadata.max_extend_len,
             layer.scaling,
-            layer.logit_cap,
+            logit_cap=logits_soft_cap,
             sliding_window_size=sliding_window_size,
             sinks=sinks,
+            window_kv_offsets=window_kv_offsets,
+            xai_temperature_len=layer.xai_temperature_len,
         )
         return o
@@ -755,6 +788,8 @@ class TritonAttnBackend(AttentionBackend):
         else:
             o = torch.empty_like(q)
+        logits_soft_cap = logit_capping_mod(layer.logit_capping_method, layer.logit_cap)
         if save_kv_cache:
             forward_batch.token_to_kv_pool.set_kv_buffer(
                 layer, forward_batch.out_cache_loc, k, v
@@ -779,8 +814,9 @@ class TritonAttnBackend(AttentionBackend):
             self.forward_metadata.num_kv_splits,
             self.max_kv_splits,
             layer.scaling,
-            layer.logit_cap,
+            logit_cap=logits_soft_cap,
             sinks=sinks,
+            xai_temperature_len=layer.xai_temperature_len,
         )
         return o
@@ -867,7 +903,7 @@ class TritonMultiStepDraftBackend:
                 self.speculative_num_steps,
                 forward_batch.batch_size * self.topk * self.max_context_len,
             ),
-            dtype=torch.int32,
+            dtype=torch.int64,
             device=self.device,
         )
@@ -885,7 +921,7 @@ class TritonMultiStepDraftBackend:
     def init_cuda_graph_state(self, max_bs: int, max_num_tokens: int):
         self.cuda_graph_kv_indices = torch.zeros(
             (self.speculative_num_steps, max_num_tokens * self.max_context_len),
-            dtype=torch.int32,
+            dtype=torch.int64,
             device=self.device,
         )
         for i in range(self.speculative_num_steps):
@@ -994,7 +1030,7 @@ def update_sliding_window_buffer(
     window_kv_indptr[1 : bs + 1] = torch.cumsum(window_kv_lens, dim=0)
     window_kv_indptr = window_kv_indptr[: bs + 1]
     window_kv_indices = torch.empty(
-        window_kv_indptr[-1], dtype=torch.int32, device=device
+        window_kv_indptr[-1], dtype=torch.int64, device=device
     )
     window_kv_start_idx = seq_lens - window_kv_lens
     create_flashinfer_kv_indices_triton[(bs,)](
@@ -1014,7 +1050,7 @@ def update_sliding_window_buffer(
                 window_kv_indices[:kv_last_index]
             )
         )
-    return window_kv_indptr, window_kv_indices, window_kv_lens
+    return window_kv_indptr, window_kv_indices, window_kv_lens, window_kv_start_idx
 def update_sliding_window_buffer_cuda_graph(
@@ -1051,4 +1087,4 @@ def update_sliding_window_buffer_cuda_graph(
                 window_kv_indices[:kv_last_index]
             )
         )
-    return window_kv_indptr, window_kv_indices, window_kv_lens
+    return window_kv_indptr, window_kv_indices, window_kv_lens, window_kv_start_idx

sglang/srt/layers/attention/triton_ops/decode_attention.py CHANGED Viewed

@@ -69,6 +69,7 @@ def _fwd_kernel_stage1(
     logit_cap: tl.constexpr,
     Lk: tl.constexpr,
     Lv: tl.constexpr,
+    xai_temperature_len: tl.constexpr,
 ):
     cur_batch = tl.program_id(0)
     cur_head = tl.program_id(1)
@@ -85,6 +86,12 @@ def _fwd_kernel_stage1(
     cur_batch_seq_len = tl.load(kv_indptr + cur_batch + 1) - cur_batch_kv_start_idx
     kv_splits = tl.load(num_kv_splits + cur_batch)
+    if xai_temperature_len > 0:
+        offs_qidx = cur_batch_seq_len - 1
+        xai_temperature_scale = 1.0 / tl.log2(float(xai_temperature_len))
+        _qtemp = tl.log2(offs_qidx.to(tl.float32)) * xai_temperature_scale
+        xai_temperature_reg = tl.where(offs_qidx > xai_temperature_len, _qtemp, 1.0)
     off_q = cur_batch * stride_qbs + cur_head * stride_qh + offs_d
     kv_len_per_split = (
@@ -122,6 +129,9 @@ def _fwd_kernel_stage1(
             if logit_cap > 0:
                 qk = logit_cap * tanh(qk / logit_cap)
+            if xai_temperature_len > 0:
+                qk *= xai_temperature_reg
             qk = tl.where(offs_n < split_kv_end, qk, float("-inf"))
             offs_buf_v = (
@@ -181,6 +191,7 @@ def _decode_att_m_fwd(
     max_kv_splits,
     sm_scale,
     logit_cap,
+    xai_temperature_len=-1,
 ):
     BLOCK = 64
     # [TODO] work around SGPR limit on MI3xx
@@ -190,7 +201,7 @@ def _decode_att_m_fwd(
     Lk = k_buffer.shape[-1]
     Lv = v_buffer.shape[-1]
-    batch, head_num = kv_indptr.shape[0] - 1, q.shape[1]
+    batch, head_num = q.shape[0], q.shape[1]
     grid = (batch, head_num, MAX_KV_SPLITS)
     kv_group_num = q.shape[1] // k_buffer.shape[1]
@@ -230,6 +241,7 @@ def _decode_att_m_fwd(
         BLOCK_N=BLOCK,
         MIN_BLOCK_KV=_MIN_BLOCK_KV,
         logit_cap=logit_cap,
+        xai_temperature_len=xai_temperature_len,
         num_warps=num_warps,
         num_stages=2,
         Lk=Lk,
@@ -266,6 +278,7 @@ def _fwd_grouped_kernel_stage1(
     BLOCK_H: tl.constexpr,
     MIN_BLOCK_KV: tl.constexpr,
     logit_cap: tl.constexpr,
+    xai_temperature_len: tl.constexpr,
     Lk: tl.constexpr,
     Lv: tl.constexpr,
 ):
@@ -291,6 +304,12 @@ def _fwd_grouped_kernel_stage1(
     cur_batch_seq_len = tl.load(kv_indptr + cur_batch + 1) - cur_batch_kv_start_idx
     kv_splits = tl.load(num_kv_splits + cur_batch)
+    if xai_temperature_len > 0:
+        offs_qidx = cur_batch_seq_len - 1
+        xai_temperature_scale = 1.0 / tl.log2(float(xai_temperature_len))
+        _qtemp = tl.log2(offs_qidx.to(tl.float32)) * xai_temperature_scale
+        xai_temperature_reg = tl.where(offs_qidx > xai_temperature_len, _qtemp, 1.0)
     offs_q = cur_batch * stride_qbs + cur_head[:, None] * stride_qh + offs_d[None, :]
     if BLOCK_DPE > 0:
@@ -351,6 +370,9 @@ def _fwd_grouped_kernel_stage1(
             if logit_cap > 0:
                 qk = logit_cap * tanh(qk / logit_cap)
+            if xai_temperature_len > 0:
+                qk *= xai_temperature_reg[:, None]
             qk = tl.where(
                 mask_h[:, None] & (offs_n[None, :] < split_kv_end), qk, float("-inf")
             )
@@ -413,6 +435,7 @@ def _decode_grouped_att_m_fwd(
     max_kv_splits,
     sm_scale,
     logit_cap,
+    xai_temperature_len=-1,
 ):
     BLOCK = 32
     Lk = k_buffer.shape[-1]
@@ -433,7 +456,7 @@ def _decode_grouped_att_m_fwd(
         BLOCK_DPE = 0
     BLOCK_DV = triton.next_power_of_2(Lv)
-    batch, head_num = kv_indptr.shape[0] - 1, q.shape[1]
+    batch, head_num = q.shape[0], q.shape[1]
     kv_group_num = q.shape[1] // k_buffer.shape[1]
     BLOCK_H = 16
@@ -480,6 +503,7 @@ def _decode_grouped_att_m_fwd(
         BLOCK_H=BLOCK_H,
         MIN_BLOCK_KV=_MIN_BLOCK_KV,
         logit_cap=logit_cap,
+        xai_temperature_len=xai_temperature_len,
         num_warps=4,
         num_stages=num_stages,
         Lk=Lk,
@@ -620,6 +644,7 @@ def decode_attention_fwd_normal(
     sm_scale,
     logit_cap=0.0,
     sinks=None,
+    xai_temperature_len=-1,
 ):
     _decode_att_m_fwd(
         q,
@@ -633,6 +658,7 @@ def decode_attention_fwd_normal(
         max_kv_splits,
         sm_scale,
         logit_cap,
+        xai_temperature_len,
     )
     _decode_softmax_reducev_fwd(
         attn_logits,
@@ -661,6 +687,7 @@ def decode_attention_fwd_grouped(
     sm_scale,
     logit_cap=0.0,
     sinks=None,
+    xai_temperature_len=-1,
 ):
     _decode_grouped_att_m_fwd(
         q,
@@ -674,6 +701,7 @@ def decode_attention_fwd_grouped(
         max_kv_splits,
         sm_scale,
         logit_cap,
+        xai_temperature_len,
     )
     _decode_softmax_reducev_fwd(
         attn_logits,
@@ -702,6 +730,7 @@ def decode_attention_fwd(
     sm_scale,
     logit_cap=0.0,
     sinks=None,
+    xai_temperature_len=-1,
 ):
     assert max_kv_splits == attn_logits.shape[2]
     assert q.shape[0] <= kv_indptr.shape[0] - 1
@@ -725,6 +754,7 @@ def decode_attention_fwd(
             sm_scale,
             logit_cap=logit_cap,
             sinks=sinks,
+            xai_temperature_len=xai_temperature_len,
         )
     else:
         # GQA/MQA/MLA
@@ -742,4 +772,5 @@ def decode_attention_fwd(
             sm_scale,
             logit_cap=logit_cap,
             sinks=sinks,
+            xai_temperature_len=xai_temperature_len,
         )

sglang/srt/layers/attention/triton_ops/extend_attention.py CHANGED Viewed

@@ -52,6 +52,7 @@ def _fwd_kernel(
     mask_ptr,
     mask_indptr,
     sink_ptr,
+    window_kv_offset_ptr,
     sm_scale,
     kv_group_num,
     stride_qbs,
@@ -68,6 +69,7 @@ def _fwd_kernel(
     stride_buf_vh,
     SLIDING_WINDOW_SIZE: tl.constexpr,
     logit_cap: tl.constexpr,
+    xai_temperature_len: tl.constexpr,
     Lq: tl.constexpr,
     Lv: tl.constexpr,
     BLOCK_DMODEL: tl.constexpr,
@@ -95,6 +97,11 @@ def _fwd_kernel(
     if USE_CUSTOM_MASK:
         cur_seq_mask_start_idx = tl.load(mask_indptr + cur_seq)
+    # For SWA, we should only load the mask in the sliding window
+    window_kv_offset = 0
+    if USE_CUSTOM_MASK and SLIDING_WINDOW_SIZE > 0:
+        window_kv_offset = tl.load(window_kv_offset_ptr + cur_seq)
     offs_d = tl.arange(0, BLOCK_DMODEL)
     offs_dv = tl.arange(0, BLOCK_DV)
     offs_m = tl.arange(0, BLOCK_M)
@@ -103,6 +110,15 @@ def _fwd_kernel(
     mask_d = offs_d < Lq
     mask_dv = offs_dv < Lv
+    if xai_temperature_len > 0:
+        offs_qidx = cur_seq_len_prefix + cur_block_m * BLOCK_M + offs_m
+        xai_temperature_scale = 1.0 / tl.log2(float(xai_temperature_len))
+        xai_temperature_reg = tl.where(
+            offs_qidx > xai_temperature_len,
+            tl.log2(offs_qidx.to(tl.float32)) * xai_temperature_scale,
+            1.0,
+        )
     offs_q = (
         (cur_seq_extend_start_idx + cur_block_m * BLOCK_M + offs_m[:, None])
         * stride_qbs
@@ -139,7 +155,9 @@ def _fwd_kernel(
             custom_mask = tl.load(
                 mask_ptr
                 + cur_seq_mask_start_idx
-                + (cur_block_m * BLOCK_M + offs_m[:, None]) * cur_seq_len
+                + (cur_block_m * BLOCK_M + offs_m[:, None])
+                * (cur_seq_len + window_kv_offset)
+                + window_kv_offset
                 + start_n
                 + offs_n[None, :],
                 mask=(mask_m[:, None] & mask_n[None, :]),
@@ -195,6 +213,9 @@ def _fwd_kernel(
             if logit_cap > 0:
                 qk = logit_cap * tanh(qk / logit_cap)
+            if xai_temperature_len > 0:
+                qk *= xai_temperature_reg[:, None]
             qk = tl.where(final_mask, qk, float("-inf"))
             row_max = tl.max(qk, 1)
@@ -236,7 +257,9 @@ def _fwd_kernel(
             custom_mask = tl.load(
                 mask_ptr
                 + cur_seq_mask_start_idx
-                + (cur_block_m * BLOCK_M + offs_m[:, None]) * cur_seq_len
+                + (cur_block_m * BLOCK_M + offs_m[:, None])
+                * (cur_seq_len + window_kv_offset)
+                + window_kv_offset
                 + cur_seq_len_prefix
                 + start_n
                 + offs_n[None, :],
@@ -296,6 +319,9 @@ def _fwd_kernel(
             if logit_cap > 0:
                 qk = logit_cap * tanh(qk / logit_cap)
+            if xai_temperature_len > 0:
+                qk *= xai_temperature_reg[:, None]
             qk = tl.where(final_mask, qk, float("-inf"))
             row_max = tl.max(qk, 1)
@@ -362,6 +388,8 @@ def extend_attention_fwd(
     skip_prefix_custom_mask=True,
     sliding_window_size=-1,
     sinks=None,
+    window_kv_offsets=None,
+    xai_temperature_len=-1,
 ):
     """
     q_extend, k_extend, v_extend, o_extend: contiguous tensors
@@ -449,6 +477,7 @@ def extend_attention_fwd(
         custom_mask,
         mask_indptr,
         sinks,
+        window_kv_offsets,
         sm_scale,
         kv_group_num,
         q_extend.stride(0),
@@ -465,6 +494,7 @@ def extend_attention_fwd(
         v_buffer.stride(1),
         SLIDING_WINDOW_SIZE=sliding_window_size,
         logit_cap=logit_cap,
+        xai_temperature_len=xai_temperature_len,
         BLOCK_DMODEL=BLOCK_DMODEL,
         BLOCK_DPE=BLOCK_DPE,
         BLOCK_DV=BLOCK_DV,

sglang 0.5.0rc1__py3-none-any.whl → 0.5.1__py3-none-any.whl

sglang 0.5.0rc1py3-none-any.whl → 0.5.1py3-none-any.whl