sglang 0.3.5.post2__py3-none-any.whl → 0.3.6.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (118)
  1. sglang/__init__.py +2 -2
  2. sglang/api.py +2 -2
  3. sglang/bench_latency.py +1 -553
  4. sglang/bench_offline_throughput.py +48 -20
  5. sglang/bench_one_batch.py +472 -0
  6. sglang/{bench_server_latency.py → bench_one_batch_server.py} +3 -3
  7. sglang/bench_serving.py +125 -6
  8. sglang/check_env.py +3 -6
  9. sglang/lang/backend/base_backend.py +1 -1
  10. sglang/lang/backend/runtime_endpoint.py +2 -2
  11. sglang/srt/configs/model_config.py +13 -14
  12. sglang/srt/constrained/__init__.py +13 -14
  13. sglang/srt/constrained/base_grammar_backend.py +13 -15
  14. sglang/srt/constrained/outlines_backend.py +28 -17
  15. sglang/srt/constrained/outlines_jump_forward.py +13 -15
  16. sglang/srt/constrained/xgrammar_backend.py +47 -58
  17. sglang/srt/conversation.py +13 -15
  18. sglang/srt/hf_transformers_utils.py +13 -15
  19. sglang/srt/layers/activation.py +16 -13
  20. sglang/srt/layers/attention/flashinfer_backend.py +106 -54
  21. sglang/srt/layers/attention/triton_backend.py +9 -7
  22. sglang/srt/layers/attention/triton_ops/decode_attention.py +51 -55
  23. sglang/srt/layers/attention/triton_ops/extend_attention.py +16 -16
  24. sglang/srt/layers/attention/triton_ops/prefill_attention.py +13 -15
  25. sglang/srt/layers/custom_op_util.py +25 -0
  26. sglang/srt/layers/fused_moe_grok/__init__.py +1 -0
  27. sglang/srt/layers/{fused_moe → fused_moe_grok}/fused_moe.py +11 -4
  28. sglang/srt/layers/{fused_moe → fused_moe_grok}/layer.py +4 -9
  29. sglang/srt/layers/{fused_moe/patch.py → fused_moe_patch.py} +5 -0
  30. sglang/srt/layers/fused_moe_triton/__init__.py +44 -0
  31. sglang/srt/layers/fused_moe_triton/fused_moe.py +861 -0
  32. sglang/srt/layers/fused_moe_triton/layer.py +633 -0
  33. sglang/srt/layers/layernorm.py +17 -15
  34. sglang/srt/layers/logits_processor.py +23 -25
  35. sglang/srt/layers/quantization/__init__.py +77 -17
  36. sglang/srt/layers/radix_attention.py +13 -15
  37. sglang/srt/layers/rotary_embedding.py +13 -13
  38. sglang/srt/layers/sampler.py +4 -8
  39. sglang/srt/layers/torchao_utils.py +2 -0
  40. sglang/srt/lora/lora.py +13 -14
  41. sglang/srt/lora/lora_config.py +13 -14
  42. sglang/srt/lora/lora_manager.py +22 -24
  43. sglang/srt/managers/data_parallel_controller.py +98 -27
  44. sglang/srt/managers/detokenizer_manager.py +13 -15
  45. sglang/srt/managers/io_struct.py +63 -21
  46. sglang/srt/managers/schedule_batch.py +154 -59
  47. sglang/srt/managers/schedule_policy.py +18 -16
  48. sglang/srt/managers/scheduler.py +278 -109
  49. sglang/srt/managers/session_controller.py +61 -0
  50. sglang/srt/managers/tokenizer_manager.py +63 -18
  51. sglang/srt/managers/tp_worker.py +25 -16
  52. sglang/srt/managers/tp_worker_overlap_thread.py +62 -67
  53. sglang/srt/metrics/collector.py +13 -15
  54. sglang/srt/metrics/func_timer.py +13 -15
  55. sglang/srt/mm_utils.py +13 -14
  56. sglang/srt/model_executor/cuda_graph_runner.py +63 -25
  57. sglang/srt/model_executor/forward_batch_info.py +128 -32
  58. sglang/srt/model_executor/model_runner.py +132 -64
  59. sglang/srt/model_parallel.py +98 -0
  60. sglang/srt/models/chatglm.py +15 -16
  61. sglang/srt/models/commandr.py +15 -16
  62. sglang/srt/models/dbrx.py +15 -16
  63. sglang/srt/models/deepseek.py +15 -15
  64. sglang/srt/models/deepseek_v2.py +162 -59
  65. sglang/srt/models/exaone.py +14 -15
  66. sglang/srt/models/gemma.py +14 -14
  67. sglang/srt/models/gemma2.py +31 -25
  68. sglang/srt/models/gemma2_reward.py +13 -14
  69. sglang/srt/models/gpt_bigcode.py +14 -14
  70. sglang/srt/models/grok.py +15 -15
  71. sglang/srt/models/internlm2.py +13 -15
  72. sglang/srt/models/internlm2_reward.py +13 -14
  73. sglang/srt/models/llama.py +21 -21
  74. sglang/srt/models/llama_classification.py +13 -14
  75. sglang/srt/models/llama_reward.py +13 -14
  76. sglang/srt/models/llava.py +14 -16
  77. sglang/srt/models/llavavid.py +14 -16
  78. sglang/srt/models/minicpm.py +13 -15
  79. sglang/srt/models/minicpm3.py +13 -15
  80. sglang/srt/models/mistral.py +13 -15
  81. sglang/srt/models/mixtral.py +15 -15
  82. sglang/srt/models/mixtral_quant.py +14 -14
  83. sglang/srt/models/olmo.py +22 -20
  84. sglang/srt/models/olmoe.py +23 -20
  85. sglang/srt/models/phi3_small.py +447 -0
  86. sglang/srt/models/qwen.py +14 -14
  87. sglang/srt/models/qwen2.py +22 -19
  88. sglang/srt/models/qwen2_moe.py +17 -18
  89. sglang/srt/models/qwen2_vl.py +13 -6
  90. sglang/srt/models/stablelm.py +18 -16
  91. sglang/srt/models/torch_native_llama.py +107 -93
  92. sglang/srt/models/xverse.py +13 -14
  93. sglang/srt/models/xverse_moe.py +15 -16
  94. sglang/srt/models/yivl.py +13 -15
  95. sglang/srt/openai_api/adapter.py +19 -17
  96. sglang/srt/openai_api/protocol.py +14 -16
  97. sglang/srt/sampling/penaltylib/orchestrator.py +49 -79
  98. sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +3 -8
  99. sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +3 -9
  100. sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +3 -8
  101. sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +3 -8
  102. sglang/srt/sampling/sampling_batch_info.py +61 -57
  103. sglang/srt/sampling/sampling_params.py +14 -16
  104. sglang/srt/server.py +86 -35
  105. sglang/srt/server_args.py +96 -80
  106. sglang/srt/utils.py +266 -68
  107. sglang/test/few_shot_gsm8k.py +8 -4
  108. sglang/test/runners.py +38 -20
  109. sglang/test/srt/sampling/penaltylib/utils.py +23 -21
  110. sglang/test/test_utils.py +31 -20
  111. sglang/version.py +1 -1
  112. {sglang-0.3.5.post2.dist-info → sglang-0.3.6.post1.dist-info}/LICENSE +1 -1
  113. {sglang-0.3.5.post2.dist-info → sglang-0.3.6.post1.dist-info}/METADATA +66 -57
  114. sglang-0.3.6.post1.dist-info/RECORD +164 -0
  115. {sglang-0.3.5.post2.dist-info → sglang-0.3.6.post1.dist-info}/WHEEL +1 -1
  116. sglang/srt/layers/fused_moe/__init__.py +0 -1
  117. sglang-0.3.5.post2.dist-info/RECORD +0 -156
  118. {sglang-0.3.5.post2.dist-info → sglang-0.3.6.post1.dist-info}/top_level.txt +0 -0

sglang/srt/constrained/xgrammar_backend.py
@@ -1,34 +1,30 @@
-"""
-Copyright 2023-2024 SGLang Team
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-"""
-
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
 """Constrained decoding with xgrammar backend."""
 
 import logging
 from typing import List, Tuple
 
 import torch
-
-try:
-    from xgrammar import CachedGrammarCompiler, CompiledGrammar, GrammarMatcher
-
-    import_error = None
-except ImportError as e:
-    CachedGrammarCompiler = CompiledGrammar = GrammarMatcher = TokenizerInfo = (
-        ImportError
-    )
-    import_error = e
+from xgrammar import (
+    CompiledGrammar,
+    GrammarCompiler,
+    GrammarMatcher,
+    TokenizerInfo,
+    allocate_token_bitmask,
+    apply_token_bitmask_inplace,
+)
 
 from sglang.srt.constrained.base_grammar_backend import (
     BaseGrammarBackend,
@@ -38,7 +34,7 @@ from sglang.srt.constrained.base_grammar_backend import (
 logger = logging.getLogger(__name__)
 
 
-MAX_ROLLBACK_TOKENS = 10
+MAX_ROLLBACK_TOKENS = 200
 
 
 class XGrammarGrammar(BaseGrammarObject):
@@ -80,20 +76,25 @@ class XGrammarGrammar(BaseGrammarObject):
         for i in range(k, len(new_output_ids)):
             assert self.matcher.accept_token(new_output_ids[i])
 
-    def fill_vocab_mask(self, vocab_mask: torch.Tensor):
-        # Note that this bitmask is a bitset, not bool
-        bitmask = self.matcher.get_next_token_bitmask()
-        # Mask the tokens that are not allowed
-        vocab_mask[
-            self.matcher.get_rejected_tokens_from_bitmask(bitmask, self.vocab_size)
-        ] = 1
+    def allocate_vocab_mask(
+        self, vocab_size: int, batch_size: int, device
+    ) -> torch.Tensor:
+        return allocate_token_bitmask(batch_size, vocab_size)
+
+    def fill_vocab_mask(self, vocab_mask: torch.Tensor, idx: int) -> None:
+        self.matcher.fill_next_token_bitmask(vocab_mask, idx)
+
+    @staticmethod
+    def apply_vocab_mask(logits: torch.Tensor, vocab_mask: torch.Tensor) -> None:
+        if vocab_mask.device.type != logits.device.type:
+            # vocab_mask must then be on the same device as logits
+            # when applying the token bitmask, so we check and move if needed
+            vocab_mask = vocab_mask.to(logits.device)
+
+        apply_token_bitmask_inplace(logits, vocab_mask)
 
     def copy(self):
-        matcher = GrammarMatcher(
-            self.ctx,
-            max_rollback_tokens=MAX_ROLLBACK_TOKENS,
-            mask_vocab_size=self.vocab_size,
-        )
+        matcher = GrammarMatcher(self.ctx, max_rollback_tokens=MAX_ROLLBACK_TOKENS)
         return XGrammarGrammar(matcher, self.vocab_size, self.ctx)
 
 
@@ -105,26 +106,18 @@ class XGrammarGrammarBackend(BaseGrammarBackend):
     ):
         super().__init__()
 
-        if import_error:
-            logger.warning(
-                f"Ignore import error for the grammar backend: {import_error}"
-            )
-            self.grammar_cache = None
-            return
-
-        self.grammar_cache = CachedGrammarCompiler(tokenizer_or_vocab=tokenizer)
+        tokenizer_info = TokenizerInfo.from_huggingface(
+            tokenizer, vocab_size=vocab_size
+        )
+        self.grammar_compiler = GrammarCompiler(tokenizer_info=tokenizer_info)
         self.vocab_size = vocab_size
 
     def init_value_impl(self, key: Tuple[str, str]) -> XGrammarGrammar:
-        if import_error:
-            raise import_error
 
         key_type, key_string = key
         if key_type == "json":
             try:
-                ctx = self.grammar_cache.get_compiled_grammar_for_json_schema(
-                    key_string
-                )
+                ctx = self.grammar_compiler.compile_json_schema(schema=key_string)
             except RuntimeError as e:
                 logging.warning(
                     f"Skip invalid json_schema: json_schema={key_string}, {e=}"
@@ -138,13 +131,9 @@ class XGrammarGrammarBackend(BaseGrammarBackend):
         else:
             raise ValueError(f"Invalid key_type: {key_type}")
 
-        matcher = GrammarMatcher(
-            ctx,
-            max_rollback_tokens=MAX_ROLLBACK_TOKENS,
-            mask_vocab_size=self.vocab_size,
-        )
+        matcher = GrammarMatcher(ctx, max_rollback_tokens=MAX_ROLLBACK_TOKENS)
         return XGrammarGrammar(matcher, self.vocab_size, ctx)
 
     def reset(self):
-        if self.grammar_cache:
-            self.grammar_cache.clear()
+        if self.grammar_compiler:
+            self.grammar_compiler.clear_cache()
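
Taken together, the xgrammar_backend.py hunks above replace the removed CachedGrammarCompiler/bitset flow with xgrammar's GrammarCompiler plus token-bitmask helpers. The sketch below only recombines calls that are visible in this diff into a standalone flow; the gpt2 tokenizer, the batch size of 1, and the CPU logits tensor are illustrative assumptions, not part of the package.

import torch
from transformers import AutoTokenizer
from xgrammar import (
    GrammarCompiler,
    GrammarMatcher,
    TokenizerInfo,
    allocate_token_bitmask,
    apply_token_bitmask_inplace,
)

# Illustrative inputs: any HF tokenizer and a batch of next-token logits.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
vocab_size = tokenizer.vocab_size
logits = torch.randn(1, vocab_size)

# Compile a JSON schema once, then drive one matcher per request.
tokenizer_info = TokenizerInfo.from_huggingface(tokenizer, vocab_size=vocab_size)
compiler = GrammarCompiler(tokenizer_info=tokenizer_info)
ctx = compiler.compile_json_schema(schema='{"type": "object"}')
matcher = GrammarMatcher(ctx, max_rollback_tokens=200)

# Allocate one bitmask row per request in the batch and fill row 0 from this matcher.
vocab_mask = allocate_token_bitmask(1, vocab_size)
matcher.fill_next_token_bitmask(vocab_mask, 0)

# The mask must sit on the same device as the logits before it is applied in place.
if vocab_mask.device.type != logits.device.type:
    vocab_mask = vocab_mask.to(logits.device)
apply_token_bitmask_inplace(logits, vocab_mask)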

sglang/srt/conversation.py
@@ -1,18 +1,16 @@
-"""
-Copyright 2023-2024 SGLang Team
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-"""
-
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
 """Conversation chat templates."""
 
 # Adapted from

sglang/srt/hf_transformers_utils.py
@@ -1,18 +1,16 @@
-"""
-Copyright 2023-2024 SGLang Team
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-"""
-
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
 """Utilities for Huggingface Transformers."""
 
 import contextlib

sglang/srt/layers/activation.py
@@ -1,16 +1,16 @@
-"""
-Copyright 2023-2024 SGLang Team
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-"""
-
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
 """Fused operators for activation layers."""
 
 import logging
@@ -32,12 +32,14 @@ from vllm.distributed import (
 )
 from vllm.model_executor.custom_op import CustomOp
 
+from sglang.srt.layers.custom_op_util import register_custom_op
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.utils import set_weight_attrs
 
 logger = logging.getLogger(__name__)
 
 
+@register_custom_op("sglang_silu_and_mul")
 class SiluAndMul(CustomOp):
     def forward_native(self, x: torch.Tensor) -> torch.Tensor:
         d = x.shape[-1] // 2
@@ -51,6 +53,7 @@ class SiluAndMul(CustomOp):
         return out
 
 
+@register_custom_op("sglang_gelu_and_mul")
 class GeluAndMul(CustomOp):
     def __init__(self, approximate="tanh"):
         super().__init__()
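
The activation.py hunks above register each CustomOp subclass under a string name via register_custom_op from the new sglang/srt/layers/custom_op_util.py (file 25 in the list above; its body is not shown in this excerpt). A hypothetical sketch of the registration pattern implied by that decorator usage; the real helper may differ:

from typing import Callable, Dict

# Hypothetical registry; the actual sglang.srt.layers.custom_op_util
# implementation is not included in the hunks shown here.
_CUSTOM_OPS: Dict[str, type] = {}


def register_custom_op(name: str) -> Callable[[type], type]:
    # Class decorator that records an op class under a stable string name.
    def decorator(cls: type) -> type:
        _CUSTOM_OPS[name] = cls
        return cls

    return decorator


@register_custom_op("sglang_silu_and_mul")
class SiluAndMul:
    # Stand-in for the real CustomOp subclass defined in activation.py.
    pass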

sglang/srt/layers/attention/flashinfer_backend.py
@@ -7,8 +7,9 @@ FlashInfer is faster and Triton is easier to customize.
 Each backend supports two operators: extend (i.e. prefill with cached prefix) and decode.
 """
 
+import os
 from enum import Enum, auto
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, List
 
 import torch
 import triton
@@ -45,13 +46,19 @@ class FlashInferAttnBackend(AttentionBackend):
         super().__init__()
 
         # Parse constants
-        if not _grouped_size_compiled_for_decode_kernels(
-            model_runner.model_config.num_attention_heads // model_runner.tp_size,
-            model_runner.model_config.get_num_kv_heads(model_runner.tp_size),
-        ):
-            self.decode_use_tensor_cores = True
+        if "SGLANG_FLASHINFER_USE_TENSOR_CORE" in os.environ:
+            self.decode_use_tensor_cores = (
+                os.environ["SGLANG_FLASHINFER_USE_TENSOR_CORE"].lower() == "true"
+            )
         else:
-            self.decode_use_tensor_cores = False
+            if not _grouped_size_compiled_for_decode_kernels(
+                model_runner.model_config.num_attention_heads // model_runner.tp_size,
+                model_runner.model_config.get_num_kv_heads(model_runner.tp_size),
+            ):
+                self.decode_use_tensor_cores = True
+            else:
+                self.decode_use_tensor_cores = False
+
         self.max_context_len = model_runner.model_config.context_len
 
         assert not (
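
The hunk above lets an environment variable override the grouped-size heuristic that previously chose the decode kernels on its own. A minimal usage sketch (the variable name and the case-insensitive "true" comparison come straight from the hunk; setting it in-process assumes it happens before the backend is constructed):

import os

# Force FlashInfer decode onto tensor-core kernels; any other value disables
# them, and leaving the variable unset falls back to the
# _grouped_size_compiled_for_decode_kernels heuristic shown above.
os.environ["SGLANG_FLASHINFER_USE_TENSOR_CORE"] = "true"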

@@ -136,15 +143,17 @@ class FlashInferAttnBackend(AttentionBackend):
             prefix_lens = forward_batch.extend_prefix_lens
 
             # Some heuristics to check whether to use ragged forward
-            use_ragged = False
             if forward_batch.extend_num_tokens >= 4096 and self.num_wrappers == 1:
                 use_ragged = True
-
-            extend_no_prefix = not torch.any(forward_batch.extend_prefix_lens).item()
+                extend_no_prefix = not any(forward_batch.extend_prefix_lens_cpu)
+            else:
+                use_ragged = False
+                extend_no_prefix = False
 
             self.indices_updater_prefill.update(
                 forward_batch.req_pool_indices,
                 forward_batch.seq_lens,
+                forward_batch.seq_lens_sum,
                 prefix_lens,
                 use_ragged=use_ragged,
                 encoder_lens=forward_batch.encoder_lens,
@@ -314,7 +323,6 @@ class FlashInferIndicesUpdaterDecode:
         self.head_dim = model_runner.model_config.head_dim
         self.data_type = model_runner.kv_cache_dtype
         self.q_data_type = model_runner.dtype
-        self.max_context_len = model_runner.req_to_token_pool.req_to_token.size(1)
        self.sliding_window_size = model_runner.sliding_window_size
 
         self.attn_backend = attn_backend
@@ -335,7 +343,12 @@ class FlashInferIndicesUpdaterDecode:
            self.update = self.update_single_wrapper
 
     def update(
-        self, req_pool_indices, seq_lens, seq_lens_sum, decode_wrappers, encoder_lens
+        self,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        seq_lens_sum: int,
+        decode_wrappers: List,
+        encoder_lens: torch.Tensor,
     ):
         # Keep the signature for type checking. It will be assigned during runtime.
         raise NotImplementedError()
@@ -345,8 +358,8 @@
         req_pool_indices: torch.Tensor,
         seq_lens: torch.Tensor,
         seq_lens_sum: int,
-        decode_wrappers=None,
-        encoder_lens=None,
+        decode_wrappers: List,
+        encoder_lens: torch.Tensor,
     ):
         decode_wrappers = decode_wrappers or self.decode_wrappers
         self.call_begin_forward(
@@ -363,8 +376,8 @@
         req_pool_indices: torch.Tensor,
         seq_lens: torch.Tensor,
         seq_lens_sum: int,
-        decode_wrappers=None,
-        encoder_lens=None,
+        decode_wrappers: List,
+        encoder_lens: torch.Tensor,
     ):
         decode_wrappers = decode_wrappers or self.decode_wrappers
 
@@ -394,11 +407,11 @@
 
     def update_cross_attention(
         self,
-        req_pool_indices,
-        seq_lens,
-        seq_lens_sum,
-        decode_wrappers=None,
-        encoder_lens=None,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        seq_lens_sum: int,
+        decode_wrappers: List,
+        encoder_lens: torch.Tensor,
     ):
         decode_wrappers = decode_wrappers or self.decode_wrappers
 
@@ -425,11 +438,11 @@
     def call_begin_forward(
         self,
         wrapper,
-        req_pool_indices,
-        paged_kernel_lens,
-        paged_kernel_lens_sum,
-        kv_indptr,
-        kv_start_idx,
+        req_pool_indices: torch.Tensor,
+        paged_kernel_lens: torch.Tensor,
+        paged_kernel_lens_sum: int,
+        kv_indptr: torch.Tensor,
+        kv_start_idx: torch.Tensor,
     ):
         bs = len(req_pool_indices)
         kv_indptr[1 : bs + 1] = torch.cumsum(paged_kernel_lens, dim=0)
@@ -445,7 +458,7 @@ class FlashInferIndicesUpdaterDecode:
             kv_indptr,
             kv_start_idx,
             kv_indices,
-            self.max_context_len,
+            self.req_to_token.shape[1],
         )
 
         wrapper.end_forward()
@@ -474,7 +487,6 @@ class FlashInferIndicesUpdaterPrefill:
         self.head_dim = model_runner.model_config.head_dim
         self.data_type = model_runner.kv_cache_dtype
         self.q_data_type = model_runner.dtype
-        self.max_context_len = model_runner.req_to_token_pool.req_to_token.size(1)
        self.sliding_window_size = model_runner.sliding_window_size
 
         self.attn_backend = attn_backend
@@ -496,23 +508,40 @@ class FlashInferIndicesUpdaterPrefill:
            assert self.attn_backend.num_wrappers == 1
            self.update = self.update_single_wrapper
 
-    def update(self, req_pool_indices, seq_lens, prefix_lens, use_ragged, encoder_lens):
+    def update(
+        self,
+        req_pool_indices: torch.Tnesor,
+        seq_lens: torch.Tensor,
+        seq_lens_sum: int,
+        prefix_lens: torch.Tensor,
+        use_ragged: bool,
+        encoder_lens: torch.Tensor,
+    ):
         # Keep the signature for type checking. It will be assigned during runtime.
         raise NotImplementedError()
 
     def update_single_wrapper(
-        self, req_pool_indices, seq_lens, prefix_lens, use_ragged, encoder_lens
+        self,
+        req_pool_indices: torch.Tnesor,
+        seq_lens: torch.Tensor,
+        seq_lens_sum: int,
+        prefix_lens: torch.Tensor,
+        use_ragged: bool,
+        encoder_lens: torch.Tensor,
    ):
         if use_ragged:
             paged_kernel_lens = prefix_lens
+            paged_kernel_lens_sum = paged_kernel_lens.sum().item()
         else:
             paged_kernel_lens = seq_lens
+            paged_kernel_lens_sum = seq_lens_sum
 
         self.call_begin_forward(
             self.wrapper_ragged,
             self.wrappers_paged[0],
             req_pool_indices,
             paged_kernel_lens,
+            paged_kernel_lens_sum,
             seq_lens,
             prefix_lens,
             None,
@@ -522,7 +551,13 @@ class FlashInferIndicesUpdaterPrefill:
         )
 
     def update_sliding_window(
-        self, req_pool_indices, seq_lens, prefix_lens, use_ragged, encoder_lens
+        self,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        seq_lens_sum: int,
+        prefix_lens: torch.Tensor,
+        use_ragged: bool,
+        encoder_lens: torch.Tensor,
     ):
         for wrapper_id in range(2):
             if wrapper_id == 0:
@@ -531,9 +566,12 @@
                     seq_lens,
                     torch.tensor(self.sliding_window_size) + seq_lens - prefix_lens,
                 )
+                paged_kernel_lens_sum = paged_kernel_lens.sum().item()
             else:
                 # full attention
                 paged_kernel_lens = seq_lens
+                paged_kernel_lens_sum = seq_lens_sum
+
             kv_start_idx = seq_lens - paged_kernel_lens
 
             self.call_begin_forward(
@@ -541,6 +579,7 @@
                 self.wrappers_paged[wrapper_id],
                 req_pool_indices,
                 paged_kernel_lens,
+                paged_kernel_lens_sum,
                 seq_lens,
                 prefix_lens,
                 kv_start_idx,
@@ -550,23 +589,32 @@
             )
 
     def update_cross_attention(
-        self, req_pool_indices, seq_lens, prefix_lens, use_ragged, encoder_lens
+        self,
+        req_pool_indices: torch.Tensor,
+        seq_lens: torch.Tensor,
+        seq_lens_sum: int,
+        prefix_lens: torch.Tensor,
+        use_ragged: bool,
+        encoder_lens: torch.Tensor,
    ):
         for wrapper_id in range(2):
             if wrapper_id == 0:
                 # normal attention
                 paged_kernel_lens = seq_lens
                 kv_start_idx = encoder_lens
+                paged_kernel_lens_sum = seq_lens_sum
             else:
                 # cross attention
                 paged_kernel_lens = encoder_lens
                 kv_start_idx = torch.zeros_like(encoder_lens)
+                paged_kernel_lens_sum = paged_kernel_lens.sum().item()
 
             self.call_begin_forward(
                 self.wrapper_ragged,
                 self.wrappers_paged[wrapper_id],
                 req_pool_indices,
                 paged_kernel_lens,
+                paged_kernel_lens_sum,
                 seq_lens,
                 prefix_lens,
                 kv_start_idx,
@@ -579,19 +627,22 @@
         self,
         wrapper_ragged,
         wrapper_paged,
-        req_pool_indices,
-        paged_kernel_lens,
-        seq_lens,
-        prefix_lens,
-        kv_start_idx,
-        kv_indptr,
-        qo_indptr,
-        use_ragged,
+        req_pool_indices: torch.Tensor,
+        paged_kernel_lens: torch.Tensor,
+        paged_kernel_lens_sum: int,
+        seq_lens: torch.Tensor,
+        prefix_lens: torch.Tensor,
+        kv_start_idx: torch.Tensor,
+        kv_indptr: torch.Tensor,
+        qo_indptr: torch.Tensor,
+        use_ragged: bool,
     ):
         bs = len(req_pool_indices)
         kv_indptr[1 : bs + 1] = torch.cumsum(paged_kernel_lens, dim=0)
         kv_indptr = kv_indptr[: bs + 1]
-        kv_indices = torch.empty(kv_indptr[-1], dtype=torch.int32, device="cuda")
+        kv_indices = torch.empty(
+            paged_kernel_lens_sum, dtype=torch.int32, device="cuda"
+        )
         create_flashinfer_kv_indices_triton[(bs,)](
             self.req_to_token,
             req_pool_indices,
@@ -599,7 +650,7 @@ class FlashInferIndicesUpdaterPrefill:
             kv_indptr,
             kv_start_idx,
             kv_indices,
-            self.max_context_len,
+            self.req_to_token.shape[1],
         )
 
         qo_indptr[1 : bs + 1] = torch.cumsum(seq_lens - prefix_lens, dim=0)
@@ -638,10 +689,11 @@ def create_flashinfer_kv_indices_triton(
     kv_indptr,
     kv_start_idx,
     kv_indices_ptr,
-    max_context_len: tl.constexpr,
+    req_to_token_ptr_stride: tl.constexpr,
 ):
     BLOCK_SIZE: tl.constexpr = 512
     pid = tl.program_id(axis=0)
+
     req_pool_index = tl.load(req_pool_indices_ptr + pid)
     kv_indices_offset = tl.load(kv_indptr + pid)
@@ -652,15 +704,15 @@
     kv_end = kv_start
     kv_end += tl.load(page_kernel_lens_ptr + pid).to(tl.int32)
 
-    req_to_token_ptr += req_pool_index * max_context_len
-    kv_indices_ptr += kv_indices_offset
-
-    ld_offset = kv_start + tl.arange(0, BLOCK_SIZE)
-    st_offset = tl.arange(0, BLOCK_SIZE)
     num_loop = tl.cdiv(kv_end - kv_start, BLOCK_SIZE)
-    for _ in range(num_loop):
-        mask = ld_offset < kv_end
-        data = tl.load(req_to_token_ptr + ld_offset, mask=mask)
-        tl.store(kv_indices_ptr + st_offset, data, mask=mask)
-        ld_offset += BLOCK_SIZE
-        st_offset += BLOCK_SIZE
+    for i in range(num_loop):
+        offset = tl.arange(0, BLOCK_SIZE) + i * BLOCK_SIZE
+        mask = offset < kv_end - kv_start
+        data = tl.load(
+            req_to_token_ptr
+            + req_pool_index * req_to_token_ptr_stride
+            + kv_start
+            + offset,
+            mask=mask,
+        )
+        tl.store(kv_indices_ptr + kv_indices_offset + offset, data, mask=mask)
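
The rewritten create_flashinfer_kv_indices_triton kernel indexes req_to_token through an explicit row stride (req_to_token_ptr_stride, passed as req_to_token.shape[1] by the callers above) instead of pre-advancing the pointers. A plain-PyTorch sketch of what one program instance (one per request) now computes, assuming kv_start_idx may be None as in the decode caller:

import torch


def create_kv_indices_reference(
    req_to_token, req_pool_indices, paged_kernel_lens, kv_indptr, kv_start_idx, kv_indices
):
    # Equivalent of one Triton program per request: gather the request's token
    # slots from its row of req_to_token into the flat kv_indices buffer.
    for pid in range(len(req_pool_indices)):
        req_pool_index = int(req_pool_indices[pid])
        kv_start = int(kv_start_idx[pid]) if kv_start_idx is not None else 0
        kv_len = int(paged_kernel_lens[pid])
        out = int(kv_indptr[pid])
        kv_indices[out : out + kv_len] = req_to_token[
            req_pool_index, kv_start : kv_start + kv_len
        ]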

sglang/srt/layers/attention/triton_backend.py
@@ -3,7 +3,6 @@ from __future__ import annotations
 from typing import TYPE_CHECKING
 
 import torch
-import torch.nn as nn
 
 from sglang.srt.layers.attention import AttentionBackend
 from sglang.srt.managers.schedule_batch import global_server_args_dict
@@ -28,9 +27,13 @@ class TritonAttnBackend(AttentionBackend):
 
         self.decode_attention_fwd = decode_attention_fwd
         self.extend_attention_fwd = extend_attention_fwd
-        self.num_head = (
-            model_runner.model_config.num_attention_heads // model_runner.tp_size
-        )
+
+        if model_runner.server_args.enable_dp_attention:
+            self.num_head = model_runner.model_config.num_attention_heads
+        else:
+            self.num_head = (
+                model_runner.model_config.num_attention_heads // model_runner.tp_size
+            )
 
         if global_server_args_dict.get("triton_attention_reduce_in_fp32", False):
             self.reduce_dtype = torch.float32
@@ -50,7 +53,7 @@
             start_loc = torch.zeros_like(forward_batch.seq_lens, dtype=torch.int32)
             start_loc[1:] = torch.cumsum(forward_batch.seq_lens[:-1], dim=0)
 
-            total_num_tokens = torch.sum(forward_batch.seq_lens).item()
+            total_num_tokens = forward_batch.seq_lens_sum
             attn_logits = torch.empty(
                 (self.num_head, total_num_tokens),
                 dtype=self.reduce_dtype,
@@ -61,8 +64,7 @@
             max_extend_len = None
         else:
             start_loc = attn_logits = max_seq_len = None
-            prefix_lens = forward_batch.extend_prefix_lens
-            max_extend_len = torch.max(forward_batch.seq_lens - prefix_lens).item()
+            max_extend_len = torch.max(forward_batch.extend_seq_lens).item()
 
         self.forward_metadata = start_loc, attn_logits, max_seq_len, max_extend_len
 