sglang 0.3.6__py3-none-any.whl → 0.3.6.post2__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to a public registry. It is provided for informational purposes only.
- sglang/__init__.py +2 -2
- sglang/api.py +2 -2
- sglang/bench_one_batch.py +4 -7
- sglang/bench_one_batch_server.py +2 -2
- sglang/bench_serving.py +75 -26
- sglang/check_env.py +7 -1
- sglang/lang/backend/base_backend.py +1 -1
- sglang/lang/backend/runtime_endpoint.py +2 -2
- sglang/lang/tracer.py +1 -1
- sglang/launch_server.py +0 -3
- sglang/srt/configs/model_config.py +15 -20
- sglang/srt/constrained/__init__.py +13 -14
- sglang/srt/constrained/base_grammar_backend.py +13 -15
- sglang/srt/constrained/outlines_backend.py +13 -15
- sglang/srt/constrained/outlines_jump_forward.py +13 -15
- sglang/srt/constrained/xgrammar_backend.py +38 -57
- sglang/srt/conversation.py +13 -15
- sglang/srt/hf_transformers_utils.py +13 -15
- sglang/srt/layers/activation.py +13 -13
- sglang/srt/layers/attention/flashinfer_backend.py +14 -7
- sglang/srt/layers/attention/triton_ops/decode_attention.py +51 -55
- sglang/srt/layers/attention/triton_ops/extend_attention.py +16 -16
- sglang/srt/layers/attention/triton_ops/prefill_attention.py +13 -15
- sglang/srt/layers/custom_op_util.py +13 -14
- sglang/srt/layers/fused_moe_grok/__init__.py +1 -0
- sglang/srt/layers/{fused_moe → fused_moe_grok}/layer.py +4 -9
- sglang/srt/layers/{fused_moe/patch.py → fused_moe_patch.py} +5 -0
- sglang/srt/layers/fused_moe_triton/__init__.py +44 -0
- sglang/srt/layers/fused_moe_triton/fused_moe.py +861 -0
- sglang/srt/layers/fused_moe_triton/layer.py +633 -0
- sglang/srt/layers/layernorm.py +13 -15
- sglang/srt/layers/logits_processor.py +13 -15
- sglang/srt/layers/quantization/__init__.py +77 -17
- sglang/srt/layers/radix_attention.py +13 -15
- sglang/srt/layers/rotary_embedding.py +13 -13
- sglang/srt/layers/sampler.py +1 -1
- sglang/srt/lora/lora.py +13 -14
- sglang/srt/lora/lora_config.py +13 -14
- sglang/srt/lora/lora_manager.py +22 -24
- sglang/srt/managers/data_parallel_controller.py +25 -19
- sglang/srt/managers/detokenizer_manager.py +13 -18
- sglang/srt/managers/image_processor.py +6 -9
- sglang/srt/managers/io_struct.py +43 -28
- sglang/srt/managers/schedule_batch.py +92 -27
- sglang/srt/managers/schedule_policy.py +13 -15
- sglang/srt/managers/scheduler.py +94 -72
- sglang/srt/managers/session_controller.py +29 -19
- sglang/srt/managers/tokenizer_manager.py +29 -22
- sglang/srt/managers/tp_worker.py +13 -15
- sglang/srt/managers/tp_worker_overlap_thread.py +13 -15
- sglang/srt/metrics/collector.py +13 -15
- sglang/srt/metrics/func_timer.py +13 -15
- sglang/srt/mm_utils.py +13 -14
- sglang/srt/model_executor/cuda_graph_runner.py +20 -19
- sglang/srt/model_executor/forward_batch_info.py +19 -17
- sglang/srt/model_executor/model_runner.py +42 -30
- sglang/srt/models/chatglm.py +15 -16
- sglang/srt/models/commandr.py +15 -16
- sglang/srt/models/dbrx.py +15 -16
- sglang/srt/models/deepseek.py +15 -15
- sglang/srt/models/deepseek_v2.py +15 -15
- sglang/srt/models/exaone.py +14 -15
- sglang/srt/models/gemma.py +14 -14
- sglang/srt/models/gemma2.py +24 -19
- sglang/srt/models/gemma2_reward.py +13 -14
- sglang/srt/models/gpt_bigcode.py +14 -14
- sglang/srt/models/grok.py +15 -15
- sglang/srt/models/internlm2.py +13 -15
- sglang/srt/models/internlm2_reward.py +13 -14
- sglang/srt/models/llama.py +21 -21
- sglang/srt/models/llama_classification.py +13 -14
- sglang/srt/models/llama_reward.py +13 -14
- sglang/srt/models/llava.py +20 -16
- sglang/srt/models/llavavid.py +13 -15
- sglang/srt/models/minicpm.py +13 -15
- sglang/srt/models/minicpm3.py +13 -15
- sglang/srt/models/mistral.py +13 -15
- sglang/srt/models/mixtral.py +15 -15
- sglang/srt/models/mixtral_quant.py +14 -14
- sglang/srt/models/olmo.py +21 -19
- sglang/srt/models/olmoe.py +23 -20
- sglang/srt/models/qwen.py +14 -14
- sglang/srt/models/qwen2.py +22 -19
- sglang/srt/models/qwen2_moe.py +17 -18
- sglang/srt/models/stablelm.py +18 -16
- sglang/srt/models/torch_native_llama.py +15 -17
- sglang/srt/models/xverse.py +13 -14
- sglang/srt/models/xverse_moe.py +15 -16
- sglang/srt/models/yivl.py +13 -15
- sglang/srt/openai_api/adapter.py +13 -15
- sglang/srt/openai_api/protocol.py +13 -15
- sglang/srt/sampling/sampling_batch_info.py +4 -1
- sglang/srt/sampling/sampling_params.py +13 -15
- sglang/srt/server.py +60 -34
- sglang/srt/server_args.py +22 -22
- sglang/srt/utils.py +208 -19
- sglang/test/few_shot_gsm8k.py +8 -4
- sglang/test/runners.py +13 -14
- sglang/test/test_utils.py +2 -2
- sglang/version.py +1 -1
- {sglang-0.3.6.dist-info → sglang-0.3.6.post2.dist-info}/LICENSE +1 -1
- {sglang-0.3.6.dist-info → sglang-0.3.6.post2.dist-info}/METADATA +25 -15
- sglang-0.3.6.post2.dist-info/RECORD +164 -0
- sglang/srt/layers/fused_moe/__init__.py +0 -1
- sglang-0.3.6.dist-info/RECORD +0 -161
- /sglang/srt/layers/{fused_moe → fused_moe_grok}/fused_moe.py +0 -0
- {sglang-0.3.6.dist-info → sglang-0.3.6.post2.dist-info}/WHEEL +0 -0
- {sglang-0.3.6.dist-info → sglang-0.3.6.post2.dist-info}/top_level.txt +0 -0
sglang/srt/constrained/xgrammar_backend.py
CHANGED
@@ -1,39 +1,30 @@
-[old docstring-form license header; its text is not captured in this rendering]
-"""
-
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
 """Constrained decoding with xgrammar backend."""
 
 import logging
 from typing import List, Tuple
 
 import torch
-[old guarded xgrammar import; most of its lines are not captured in this rendering]
-import_error = None
-except ImportError as e:
-    CachedGrammarCompiler = CompiledGrammar = GrammarMatcher = TokenizerInfo = (
-        ImportError
-    )
-    import_error = e
+from xgrammar import (
+    CompiledGrammar,
+    GrammarCompiler,
+    GrammarMatcher,
+    TokenizerInfo,
+    allocate_token_bitmask,
+    apply_token_bitmask_inplace,
+)
 
 from sglang.srt.constrained.base_grammar_backend import (
     BaseGrammarBackend,
@@ -43,7 +34,7 @@ from sglang.srt.constrained.base_grammar_backend import (
 logger = logging.getLogger(__name__)
 
 
-MAX_ROLLBACK_TOKENS =
+MAX_ROLLBACK_TOKENS = 200
 
 
 class XGrammarGrammar(BaseGrammarObject):
@@ -88,21 +79,22 @@ class XGrammarGrammar(BaseGrammarObject):
     def allocate_vocab_mask(
         self, vocab_size: int, batch_size: int, device
     ) -> torch.Tensor:
-        return
+        return allocate_token_bitmask(batch_size, vocab_size)
 
     def fill_vocab_mask(self, vocab_mask: torch.Tensor, idx: int) -> None:
         self.matcher.fill_next_token_bitmask(vocab_mask, idx)
 
     @staticmethod
     def apply_vocab_mask(logits: torch.Tensor, vocab_mask: torch.Tensor) -> None:
-
+        if vocab_mask.device.type != logits.device.type:
+            # vocab_mask must then be on the same device as logits
+            # when applying the token bitmask, so we check and move if needed
+            vocab_mask = vocab_mask.to(logits.device)
+
+        apply_token_bitmask_inplace(logits, vocab_mask)
 
     def copy(self):
-        matcher = GrammarMatcher(
-            self.ctx,
-            max_rollback_tokens=MAX_ROLLBACK_TOKENS,
-            vocab_size=self.vocab_size,
-        )
+        matcher = GrammarMatcher(self.ctx, max_rollback_tokens=MAX_ROLLBACK_TOKENS)
         return XGrammarGrammar(matcher, self.vocab_size, self.ctx)
 
 
@@ -114,25 +106,18 @@ class XGrammarGrammarBackend(BaseGrammarBackend):
     ):
         super().__init__()
 
-
-
-
-
-            self.grammar_cache = None
-            return
-
-        tokenizer_info = TokenizerInfo.from_huggingface(tokenizer)
-        self.grammar_cache = CachedGrammarCompiler(tokenizer_info=tokenizer_info)
+        tokenizer_info = TokenizerInfo.from_huggingface(
+            tokenizer, vocab_size=vocab_size
+        )
+        self.grammar_compiler = GrammarCompiler(tokenizer_info=tokenizer_info)
         self.vocab_size = vocab_size
 
     def init_value_impl(self, key: Tuple[str, str]) -> XGrammarGrammar:
-        if import_error:
-            raise import_error
 
         key_type, key_string = key
         if key_type == "json":
             try:
-                ctx = self.
+                ctx = self.grammar_compiler.compile_json_schema(schema=key_string)
             except RuntimeError as e:
                 logging.warning(
                     f"Skip invalid json_schema: json_schema={key_string}, {e=}"
@@ -146,13 +131,9 @@ class XGrammarGrammarBackend(BaseGrammarBackend):
         else:
             raise ValueError(f"Invalid key_type: {key_type}")
 
-        matcher = GrammarMatcher(
-            ctx,
-            max_rollback_tokens=MAX_ROLLBACK_TOKENS,
-            vocab_size=self.vocab_size,
-        )
+        matcher = GrammarMatcher(ctx, max_rollback_tokens=MAX_ROLLBACK_TOKENS)
         return XGrammarGrammar(matcher, self.vocab_size, ctx)
 
     def reset(self):
-        if self.
-            self.
+        if self.grammar_compiler:
+            self.grammar_compiler.clear_cache()
sglang/srt/conversation.py
CHANGED
@@ -1,18 +1,16 @@
-[old docstring-form license header; its text is not captured in this rendering]
-"""
-
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
 """Conversation chat templates."""
 
 # Adapted from
sglang/srt/hf_transformers_utils.py
CHANGED
@@ -1,18 +1,16 @@
-[old docstring-form license header; its text is not captured in this rendering]
-"""
-
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
 """Utilities for Huggingface Transformers."""
 
 import contextlib
sglang/srt/layers/activation.py
CHANGED
@@ -1,16 +1,16 @@
-[old docstring-form license header; its text is not captured in this rendering]
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
 """Fused operators for activation layers."""
 
 import logging
sglang/srt/layers/attention/flashinfer_backend.py
CHANGED
@@ -7,6 +7,7 @@ FlashInfer is faster and Triton is easier to customize.
 Each backend supports two operators: extend (i.e. prefill with cached prefix) and decode.
 """
 
+import os
 from enum import Enum, auto
 from typing import TYPE_CHECKING, List
 
@@ -17,7 +18,7 @@ import triton.language as tl
 from sglang.global_config import global_config
 from sglang.srt.layers.attention import AttentionBackend
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
-from sglang.srt.utils import is_flashinfer_available
+from sglang.srt.utils import get_bool_env_var, is_flashinfer_available
 
 if TYPE_CHECKING:
     from sglang.srt.layers.radix_attention import RadixAttention
@@ -45,13 +46,19 @@ class FlashInferAttnBackend(AttentionBackend):
         super().__init__()
 
         # Parse constants
-        if
-
-
-
-            self.decode_use_tensor_cores = True
+        if "SGLANG_FLASHINFER_USE_TENSOR_CORE" in os.environ:
+            self.decode_use_tensor_cores = get_bool_env_var(
+                "SGLANG_FLASHINFER_USE_TENSOR_CORE"
+            )
         else:
-
+            if not _grouped_size_compiled_for_decode_kernels(
+                model_runner.model_config.num_attention_heads // model_runner.tp_size,
+                model_runner.model_config.get_num_kv_heads(model_runner.tp_size),
+            ):
+                self.decode_use_tensor_cores = True
+            else:
+                self.decode_use_tensor_cores = False
+
         self.max_context_len = model_runner.model_config.context_len
 
         assert not (
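The hunk above lets the SGLANG_FLASHINFER_USE_TENSOR_CORE environment variable override the tensor-core heuristic for FlashInfer decoding. get_bool_env_var comes from sglang/srt/utils.py, whose diff is not reproduced here; the snippet below is a plausible minimal reading of it, stated as an assumption rather than the package's verbatim code.

import os

def get_bool_env_var(name: str, default: str = "false") -> bool:
    # Assumed behavior: common truthy spellings count as True, anything else as False.
    return os.getenv(name, default).strip().lower() in ("1", "true", "yes", "y")

# With the change above, setting the variable bypasses the kernel heuristic entirely:
#   SGLANG_FLASHINFER_USE_TENSOR_CORE=true   -> decode_use_tensor_cores = True
#   SGLANG_FLASHINFER_USE_TENSOR_CORE=false  -> decode_use_tensor_cores = False
# Leaving it unset falls back to _grouped_size_compiled_for_decode_kernels().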
sglang/srt/layers/attention/triton_ops/decode_attention.py
CHANGED
@@ -1,18 +1,16 @@
-[old docstring-form license header; its text is not captured in this rendering]
-"""
-
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
 """
 Memory-efficient attention for decoding.
 It supports page size = 1.
@@ -26,6 +24,8 @@ import triton.language as tl
 
 from sglang.srt.utils import is_hip
 
+is_hip_ = is_hip()
+
 
 @triton.jit
 def tanh(x):
@@ -52,12 +52,13 @@ def _fwd_kernel_stage1(
     kv_group_num: tl.constexpr,
     BLOCK_DMODEL: tl.constexpr,
     BLOCK_N: tl.constexpr,
+    SPLIT_K: tl.constexpr,
     logit_cap: tl.constexpr,
     Lk: tl.constexpr,
 ):
     cur_batch = tl.program_id(0)
     cur_head = tl.program_id(1)
-
+    split_k_id = tl.program_id(2)
 
     reduce_dtype = Att_Out.dtype.element_ty
     cur_kv_head = cur_head // kv_group_num
@@ -67,22 +68,18 @@ def _fwd_kernel_stage1(
     cur_batch_in_all_start_index = tl.load(B_Start_Loc + cur_batch)
     cur_batch_req_idx = tl.load(B_req_idx + cur_batch)
 
-    cur_batch_start_index = 0
-    cur_batch_end_index = cur_batch_seq_len
-
     off_q = cur_batch * stride_qbs + cur_head * stride_qh + offs_d
+    q = tl.load(Q + off_q).to(reduce_dtype)
 
-
+    kv_len_per_split = tl.cdiv(cur_batch_seq_len, SPLIT_K)
+    split_k_start = kv_len_per_split * split_k_id
+    split_k_end = tl.minimum(split_k_start + kv_len_per_split, cur_batch_seq_len)
 
-
-
-
-    for start_mark in range(0, block_mask, 1):
-        q = tl.load(Q + off_q + start_mark).to(reduce_dtype)
-        offs_n_new = cur_batch_start_index + offs_n
+    for start_n in range(split_k_start, split_k_end, BLOCK_N):
+        offs_n = start_n + tl.arange(0, BLOCK_N)
         k_loc = tl.load(
-            Req_to_tokens + stride_req_to_tokens_b * cur_batch_req_idx +
-            mask=
+            Req_to_tokens + stride_req_to_tokens_b * cur_batch_req_idx + offs_n,
+            mask=offs_n < split_k_end,
             other=0,
         )
         offs_buf_k = (
@@ -92,7 +89,7 @@ def _fwd_kernel_stage1(
         )
         k = tl.load(
             K_Buffer + offs_buf_k,
-            mask=(
+            mask=(offs_n[:, None] < split_k_end) & (offs_d[None, :] < Lk),
             other=0.0,
         ).to(reduce_dtype)
         att_value = tl.sum(q[None, :] * k, 1)
@@ -102,7 +99,7 @@ def _fwd_kernel_stage1(
             att_value = logit_cap * tanh(att_value / logit_cap)
 
         off_o = cur_head * att_stride_h + (cur_batch_in_all_start_index + offs_n)
-        tl.store(Att_Out + off_o, att_value, mask=
+        tl.store(Att_Out + off_o, att_value, mask=offs_n < split_k_end)
 
 
 @triton.jit
@@ -191,11 +188,12 @@ def _decode_att_m_fwd(
     logit_cap,
 ):
     BLOCK = 32
+    SPLIT_K = 8
     Lk = k_buffer.shape[-1]
 
     batch, head_num = B_req_idx.shape[0], q.shape[1]
 
-    grid = (batch, head_num,
+    grid = (batch, head_num, SPLIT_K)
     kv_group_num = q.shape[1] // k_buffer.shape[1]
 
     if kv_group_num == 1:
@@ -223,6 +221,7 @@ def _decode_att_m_fwd(
         kv_group_num=kv_group_num,
         BLOCK_DMODEL=BLOCK_DMODEL,
         BLOCK_N=BLOCK,
+        SPLIT_K=SPLIT_K,
         logit_cap=logit_cap,
         num_warps=num_warps,
         num_stages=1,
@@ -294,13 +293,14 @@ def _fwd_grouped_kernel_stage1(
     BLOCK_DPE: tl.constexpr,
     BLOCK_N: tl.constexpr,
     BLOCK_H: tl.constexpr,
+    SPLIT_K: tl.constexpr,
     logit_cap: tl.constexpr,
     Lk: tl.constexpr,
 ):
     cur_batch = tl.program_id(0)
     cur_head_id = tl.program_id(1)
     cur_kv_head = cur_head_id // tl.cdiv(kv_group_num, BLOCK_H)
-
+    split_k_id = tl.program_id(2)
 
     reduce_dtype = Att_Out.dtype.element_ty
 
@@ -317,30 +317,27 @@ def _fwd_grouped_kernel_stage1(
     cur_batch_in_all_start_index = tl.load(B_Start_Loc + cur_batch)
     cur_batch_req_idx = tl.load(B_req_idx + cur_batch)
 
-    cur_batch_start_index = 0
-    cur_batch_end_index = cur_batch_seq_len
-
     offs_q = cur_batch * stride_qbs + cur_head[:, None] * stride_qh + offs_d[None, :]
+    q = tl.load(
+        Q + offs_q, mask=(mask_h[:, None]) & (offs_d[None, :] < Lk), other=0.0
+    ).to(reduce_dtype)
 
     if BLOCK_DPE > 0:
         offs_dpe = BLOCK_DMODEL + tl.arange(0, BLOCK_DPE)
         off_qpe = (
             cur_batch * stride_qbs + cur_head[:, None] * stride_qh + offs_dpe[None, :]
         )
+        qpe = tl.load(Q + off_qpe, mask=mask_h[:, None], other=0.0).to(reduce_dtype)
 
-
-
-
-    block_mask = tl.where(block_stard_index < cur_batch_seq_len, 1, 0)
+    kv_len_per_split = tl.cdiv(cur_batch_seq_len, SPLIT_K)
+    split_k_start = kv_len_per_split * split_k_id
+    split_k_end = tl.minimum(split_k_start + kv_len_per_split, cur_batch_seq_len)
 
-    for
-
-            Q + offs_q + start_mark, mask=(mask_h[:, None]) & (offs_d[None, :] < Lk)
-        ).to(reduce_dtype)
-        offs_n_new = cur_batch_start_index + offs_n
+    for start_n in range(split_k_start, split_k_end, BLOCK_N):
+        offs_n = start_n + tl.arange(0, BLOCK_N)
         k_loc = tl.load(
-            Req_to_tokens + stride_req_to_tokens_b * cur_batch_req_idx +
-            mask=
+            Req_to_tokens + stride_req_to_tokens_b * cur_batch_req_idx + offs_n,
+            mask=offs_n < split_k_end,
            other=0,
        )
        offs_buf_k = (
@@ -350,14 +347,11 @@ def _fwd_grouped_kernel_stage1(
        )
        k = tl.load(
            K_Buffer + offs_buf_k,
-            mask=(
+            mask=(offs_n[None, :] < split_k_end) & (offs_d[:, None] < Lk),
            other=0.0,
        ).to(reduce_dtype)
        qk = tl.dot(q, k)
        if BLOCK_DPE > 0:
-            qpe = tl.load(Q + off_qpe + start_mark, mask=mask_h[:, None]).to(
-                reduce_dtype
-            )
            offs_buf_kpe = (
                k_loc[None, :] * stride_buf_kbs
                + cur_kv_head * stride_buf_kh
@@ -365,7 +359,7 @@ def _fwd_grouped_kernel_stage1(
            )
            kpe = tl.load(
                K_Buffer + offs_buf_kpe,
-                mask=
+                mask=offs_n[None, :] < split_k_end,
                other=0.0,
            ).to(reduce_dtype)
            qk += tl.dot(qpe, kpe)
@@ -381,7 +375,7 @@ def _fwd_grouped_kernel_stage1(
        tl.store(
            Att_Out + offs_o,
            qk,
-            mask=mask_h[:, None] & (
+            mask=mask_h[:, None] & (offs_n[None, :] < split_k_end),
        )
 
 
@@ -499,16 +493,17 @@ def _decode_grouped_att_m_fwd(
     kv_group_num = q.shape[1] // k_buffer.shape[1]
 
     BLOCK_H = max(16, min(64, triton.next_power_of_2(kv_group_num)))
+    SPLIT_K = 8
     grid = (
         batch,
         triton.cdiv(head_num, min(BLOCK_H, kv_group_num)),
-
+        SPLIT_K,
     )
 
     num_warps = 4
 
     extra_kargs = {}
-    if
+    if is_hip_:
         # https://rocm.docs.amd.com/en/docs-6.2.0/how-to/llm-fine-tuning-optimization/optimizing-triton-kernel.html
         # https://github.com/triton-lang/triton/blob/main/third_party/amd/backend/compiler.py
         extra_kargs = {"waves_per_eu": 4, "matrix_instr_nonkdim": 16, "kpack": 2}
@@ -534,6 +529,7 @@ def _decode_grouped_att_m_fwd(
         BLOCK_DPE=BLOCK_DPE,
         BLOCK_N=BLOCK,
         BLOCK_H=BLOCK_H,
+        SPLIT_K=SPLIT_K,
         logit_cap=logit_cap,
         num_warps=num_warps,
         num_stages=1,
@@ -563,7 +559,7 @@ def _decode_grouped_softmax_reducev_fwd(
     BLOCK_DMODEL = triton.next_power_of_2(Lv)
 
     extra_kargs = {}
-    if
+    if is_hip_:
         # https://rocm.docs.amd.com/en/docs-6.2.0/how-to/llm-fine-tuning-optimization/optimizing-triton-kernel.html
         # https://github.com/triton-lang/triton/blob/main/third_party/amd/backend/compiler.py
         extra_kargs = {"waves_per_eu": 4, "matrix_instr_nonkdim": 16, "kpack": 2}
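The substantive change in these kernels is a split-K decode: the stage-1 kernels now take a third grid dimension (SPLIT_K = 8), and each program processes one contiguous slice of the request's KV cache, masked by split_k_end instead of the full sequence length. The slice arithmetic, restated as plain Python for illustration (not kernel code):

import math

SPLIT_K = 8  # matches the constant set in _decode_att_m_fwd / _decode_grouped_att_m_fwd

def split_k_bounds(cur_batch_seq_len: int, split_k_id: int) -> tuple:
    # Mirrors the kernel: tl.cdiv, then clamp the last slice to the sequence length.
    kv_len_per_split = math.ceil(cur_batch_seq_len / SPLIT_K)
    split_k_start = kv_len_per_split * split_k_id
    split_k_end = min(split_k_start + kv_len_per_split, cur_batch_seq_len)
    return split_k_start, split_k_end

# A 100-token KV cache is cut into 13-token slices, with the last one clipped:
# [(0, 13), (13, 26), (26, 39), (39, 52), (52, 65), (65, 78), (78, 91), (91, 100)]
print([split_k_bounds(100, i) for i in range(SPLIT_K)])

Slices that would start past the end of the sequence get an empty range and store nothing, which is why the load and store masks above compare against split_k_end rather than the full sequence length.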
sglang/srt/layers/attention/triton_ops/extend_attention.py
CHANGED
@@ -1,18 +1,16 @@
-[old docstring-form license header; its text is not captured in this rendering]
-"""
-
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
 """
 Memory-efficient attention for prefill.
 It supports page size = 1 and prefill with KV cache (i.e. extend).
@@ -31,6 +29,8 @@ is_cuda_available = torch.cuda.is_available()
 if is_cuda_available:
     CUDA_CAPABILITY = torch.cuda.get_device_capability()
 
+is_hip_ = is_hip()
+
 
 @triton.jit
 def tanh(x):
@@ -313,7 +313,7 @@ def extend_attention_fwd(
     num_stages = 1
 
     extra_kargs = {}
-    if
+    if is_hip_:
         extra_kargs = {"waves_per_eu": 4, "matrix_instr_nonkdim": 16, "kpack": 2}
 
     _fwd_kernel[grid](
|
@@ -1,18 +1,16 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
"""
|
15
|
-
|
1
|
+
# Copyright 2023-2024 SGLang Team
|
2
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
3
|
+
# you may not use this file except in compliance with the License.
|
4
|
+
# You may obtain a copy of the License at
|
5
|
+
#
|
6
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
7
|
+
#
|
8
|
+
# Unless required by applicable law or agreed to in writing, software
|
9
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
10
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
11
|
+
# See the License for the specific language governing permissions and
|
12
|
+
# limitations under the License.
|
13
|
+
# ==============================================================================
|
16
14
|
"""
|
17
15
|
Memory-efficient attention for prefill.
|
18
16
|
It supporst page size = 1.
|