sglang 0.2.15__py3-none-any.whl → 0.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_latency.py +10 -6
- sglang/bench_serving.py +33 -38
- sglang/global_config.py +0 -4
- sglang/lang/backend/runtime_endpoint.py +13 -6
- sglang/lang/interpreter.py +1 -1
- sglang/launch_server.py +3 -6
- sglang/launch_server_llavavid.py +7 -8
- sglang/srt/{model_config.py → configs/model_config.py} +5 -0
- sglang/srt/constrained/__init__.py +2 -0
- sglang/srt/constrained/fsm_cache.py +29 -38
- sglang/srt/constrained/jump_forward.py +0 -1
- sglang/srt/conversation.py +4 -1
- sglang/srt/hf_transformers_utils.py +2 -4
- sglang/srt/layers/attention_backend.py +480 -0
- sglang/srt/layers/flashinfer_utils.py +235 -0
- sglang/srt/layers/logits_processor.py +64 -77
- sglang/srt/layers/radix_attention.py +11 -161
- sglang/srt/layers/sampler.py +40 -35
- sglang/srt/layers/torchao_utils.py +75 -0
- sglang/srt/layers/{decode_attention.py → triton_attention/decode_attention.py} +67 -63
- sglang/srt/layers/{extend_attention.py → triton_attention/extend_attention.py} +40 -132
- sglang/srt/layers/{prefill_attention.py → triton_attention/prefill_attention.py} +13 -7
- sglang/srt/lora/lora.py +403 -0
- sglang/srt/lora/lora_config.py +43 -0
- sglang/srt/lora/lora_manager.py +256 -0
- sglang/srt/managers/controller_multi.py +1 -5
- sglang/srt/managers/controller_single.py +0 -5
- sglang/srt/managers/io_struct.py +16 -1
- sglang/srt/managers/policy_scheduler.py +122 -5
- sglang/srt/managers/schedule_batch.py +110 -74
- sglang/srt/managers/tokenizer_manager.py +24 -15
- sglang/srt/managers/tp_worker.py +181 -115
- sglang/srt/model_executor/cuda_graph_runner.py +60 -133
- sglang/srt/model_executor/forward_batch_info.py +35 -312
- sglang/srt/model_executor/model_runner.py +118 -141
- sglang/srt/models/baichuan.py +416 -0
- sglang/srt/models/chatglm.py +6 -8
- sglang/srt/models/commandr.py +1 -5
- sglang/srt/models/dbrx.py +1 -5
- sglang/srt/models/deepseek.py +1 -5
- sglang/srt/models/deepseek_v2.py +1 -5
- sglang/srt/models/exaone.py +8 -43
- sglang/srt/models/gemma.py +1 -5
- sglang/srt/models/gemma2.py +1 -5
- sglang/srt/models/gpt_bigcode.py +1 -5
- sglang/srt/models/grok.py +1 -5
- sglang/srt/models/internlm2.py +1 -5
- sglang/srt/models/{llama2.py → llama.py} +48 -26
- sglang/srt/models/llama_classification.py +14 -40
- sglang/srt/models/llama_embedding.py +7 -6
- sglang/srt/models/llava.py +38 -16
- sglang/srt/models/llavavid.py +7 -8
- sglang/srt/models/minicpm.py +1 -5
- sglang/srt/models/minicpm3.py +665 -0
- sglang/srt/models/mistral.py +2 -3
- sglang/srt/models/mixtral.py +6 -5
- sglang/srt/models/mixtral_quant.py +1 -5
- sglang/srt/models/qwen.py +1 -5
- sglang/srt/models/qwen2.py +1 -5
- sglang/srt/models/qwen2_moe.py +6 -5
- sglang/srt/models/stablelm.py +1 -5
- sglang/srt/models/xverse.py +375 -0
- sglang/srt/models/xverse_moe.py +445 -0
- sglang/srt/openai_api/adapter.py +65 -46
- sglang/srt/openai_api/protocol.py +11 -3
- sglang/srt/sampling/sampling_batch_info.py +67 -58
- sglang/srt/server.py +24 -14
- sglang/srt/server_args.py +130 -28
- sglang/srt/utils.py +12 -0
- sglang/test/few_shot_gsm8k.py +132 -0
- sglang/test/runners.py +114 -22
- sglang/test/test_programs.py +70 -0
- sglang/test/test_utils.py +89 -1
- sglang/utils.py +38 -4
- sglang/version.py +1 -1
- {sglang-0.2.15.dist-info → sglang-0.3.1.dist-info}/METADATA +31 -18
- sglang-0.3.1.dist-info/RECORD +129 -0
- {sglang-0.2.15.dist-info → sglang-0.3.1.dist-info}/WHEEL +1 -1
- sglang-0.2.15.dist-info/RECORD +0 -118
- {sglang-0.2.15.dist-info → sglang-0.3.1.dist-info}/LICENSE +0 -0
- {sglang-0.2.15.dist-info → sglang-0.3.1.dist-info}/top_level.txt +0 -0
sglang/srt/layers/sampler.py
CHANGED
@@ -1,6 +1,6 @@
 import dataclasses
 import logging
-from typing import Union
+from typing import Tuple, Union
 
 import torch
 from flashinfer.sampling import (
@@ -9,6 +9,7 @@ from flashinfer.sampling import (
     top_k_top_p_sampling_from_probs,
     top_p_renorm_prob,
 )
+from torch.library import custom_op as torch_custom_op
 from vllm.model_executor.custom_op import CustomOp
 
 from sglang.srt.layers.logits_processor import LogitsProcessorOutput
@@ -30,43 +31,18 @@ class SampleOutput:
 class Sampler(CustomOp):
     def __init__(self):
         super().__init__()
+        # FIXME: torch.multinomial has too many bugs
+        self.forward_native = self.forward_cuda
+        self.is_torch_compile = False
 
-    def _apply_penalties(self, logits: torch.Tensor, sampling_info: SamplingBatchInfo):
-        # min-token, presence, frequency
-        if sampling_info.linear_penalties is not None:
-            logits += sampling_info.linear_penalties
-
-        # repetition
-        if sampling_info.scaling_penalties is not None:
-            logits = torch.where(
-                logits > 0,
-                logits / sampling_info.scaling_penalties,
-                logits * sampling_info.scaling_penalties,
-            )
-
-        return logits
-
-    def _get_probs(
-        self,
-        logits: torch.Tensor,
-        sampling_info: SamplingBatchInfo,
-        is_torch_compile: bool = False,
-    ):
+    def _get_probs(self, logits: torch.Tensor, sampling_info: SamplingBatchInfo):
         # Post process logits
         logits = logits.contiguous()
         logits.div_(sampling_info.temperatures)
-        if is_torch_compile:
+        if self.is_torch_compile:
             # FIXME: Temporary workaround for unknown bugs in torch.compile
             logits.add_(0)
 
-        if sampling_info.logit_bias is not None:
-            logits.add_(sampling_info.logit_bias)
-
-        if sampling_info.vocab_mask is not None:
-            logits = logits.masked_fill(sampling_info.vocab_mask, float("-inf"))
-
-        logits = self._apply_penalties(logits, sampling_info)
-
         return torch.softmax(logits, dim=-1)
 
     def forward_cuda(
@@ -79,7 +55,7 @@ class Sampler(CustomOp):
 
         probs = self._get_probs(logits, sampling_info)
 
-        if not global_server_args_dict["disable_flashinfer_sampling"]:
+        if global_server_args_dict["sampling_backend"] == "flashinfer":
             max_top_k_round, batch_size = 32, probs.shape[0]
             uniform_samples = torch.rand(
                 (max_top_k_round, batch_size), device=probs.device
@@ -91,14 +67,18 @@
                     probs, uniform_samples, sampling_info.min_ps
                 )
             else:
-                batch_next_token_ids, success = top_k_top_p_sampling_from_probs(
+                batch_next_token_ids, success = flashinfer_top_k_top_p(
                     probs, uniform_samples, sampling_info.top_ks, sampling_info.top_ps
                 )
-        else:
+        elif global_server_args_dict["sampling_backend"] == "pytorch":
             # Here we provide a slower fallback implementation.
             batch_next_token_ids, success = top_k_top_p_min_p_sampling_from_probs_torch(
                 probs, sampling_info.top_ks, sampling_info.top_ps, sampling_info.min_ps
             )
+        else:
+            raise ValueError(
+                f"Invalid sampling backend: {global_server_args_dict['sampling_backend']}"
+            )
 
         return SampleOutput(success, probs, batch_next_token_ids)
 
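Note: the body of `top_k_top_p_min_p_sampling_from_probs_torch` is not part of this diff. For readers following the "pytorch" fallback branch above, a minimal, illustrative sketch of that style of filtering (function and variable names here are examples, not the package's code; the real helper also returns a success flag):

```python
# Illustrative top-k / top-p / min-p filtering over a [batch, vocab] probability
# tensor. probs, top_ks, top_ps, min_ps mirror the tensors named in the diff.
import torch


def sample_top_k_top_p_min_p(probs, top_ks, top_ps, min_ps):
    probs_sort, probs_idx = probs.sort(dim=-1, descending=True)
    probs_sum = torch.cumsum(probs_sort, dim=-1)
    # min-p: drop tokens whose probability is below min_p * p_max
    min_p_thresholds = probs_sort[:, 0] * min_ps
    probs_sort[probs_sort < min_p_thresholds.unsqueeze(-1)] = 0.0
    # top-p: drop the tail once the cumulative mass before a token exceeds top_p
    probs_sort[(probs_sum - probs_sort) > top_ps.unsqueeze(-1)] = 0.0
    # top-k: keep only the first top_k entries of each sorted row
    ranks = torch.arange(probs.shape[-1], device=probs.device)
    probs_sort[ranks.unsqueeze(0) >= top_ks.unsqueeze(-1)] = 0.0
    # renormalize, sample in sorted order, then map back to vocabulary ids
    probs_sort = probs_sort / probs_sort.sum(dim=-1, keepdim=True)
    sampled = torch.multinomial(probs_sort, num_samples=1)
    return torch.gather(probs_idx, 1, sampled).squeeze(-1)
```

Because the masks are applied in sorted order, the highest-probability token is never zeroed, so the renormalization step cannot divide by zero even with aggressive top_k/top_p/min_p settings.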
@@ -110,7 +90,7 @@ class Sampler(CustomOp):
         if isinstance(logits, LogitsProcessorOutput):
             logits = logits.next_token_logits
 
-        probs = self._get_probs(logits, sampling_info, is_torch_compile=True)
+        probs = self._get_probs(logits, sampling_info)
 
         batch_next_token_ids, success = top_k_top_p_min_p_sampling_from_probs_torch(
             probs, sampling_info.top_ks, sampling_info.top_ps, sampling_info.min_ps
@@ -119,6 +99,31 @@
         return SampleOutput(success, probs, batch_next_token_ids)
 
 
+@torch_custom_op("my_lib::flashinfer_top_k_top_p", mutates_args={})
+def flashinfer_top_k_top_p(
+    probs: torch.Tensor,
+    uniform_samples: torch.Tensor,
+    top_ks: torch.Tensor,
+    top_ps: torch.Tensor,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    # NOTE: we do not use min_p neither in CUDA nor in torch.compile
+    return top_k_top_p_sampling_from_probs(probs, uniform_samples, top_ks, top_ps)
+
+
+@flashinfer_top_k_top_p.register_fake
+def _(
+    probs: torch.Tensor,
+    uniform_samples: torch.Tensor,
+    top_ks: torch.Tensor,
+    top_ps: torch.Tensor,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    bs = probs.shape[0]
+    return (
+        torch.ones(bs, dtype=torch.bool, device=probs.device),
+        torch.zeros(bs, dtype=torch.int32, device=probs.device),
+    )
+
+
 def top_k_top_p_min_p_sampling_from_probs_torch(
     probs: torch.Tensor,
     top_ks: torch.Tensor,
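The added `flashinfer_top_k_top_p` wrapper uses `torch.library.custom_op` plus `register_fake` so that `torch.compile` can trace the FlashInfer sampling call by shape alone instead of tracing into the kernel. A self-contained sketch of the same pattern on a toy op (assumes PyTorch 2.4+; `demo_lib::scale_add` is an illustrative name, not part of sglang):

```python
import torch
from torch.library import custom_op


@custom_op("demo_lib::scale_add", mutates_args=())
def scale_add(x: torch.Tensor, y: torch.Tensor, alpha: float) -> torch.Tensor:
    # Real implementation: what actually runs in eager mode and inside the graph.
    return x + alpha * y


@scale_add.register_fake
def _(x: torch.Tensor, y: torch.Tensor, alpha: float) -> torch.Tensor:
    # Fake implementation: only shapes, dtypes, and devices matter here.
    return torch.empty_like(x)


@torch.compile
def fused(x, y):
    return scale_add(x, y, 2.0).relu()


out = fused(torch.randn(4), torch.randn(4))
```

The fake implementation only has to produce outputs with the right shapes and dtypes; the real body still runs whenever the op is actually executed.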
sglang/srt/layers/torchao_utils.py
ADDED
@@ -0,0 +1,75 @@
+"""
+Common utilities for torchao.
+"""
+
+from typing import Dict, Set
+
+import torch
+
+
+def torchao_quantize_param_data(param: torch.Tensor, torchao_config: str):
+    """Quantize a Tensor with torchao quantization specified by torchao_config
+
+    Args:
+       `param`: weight parameter of the linear module
+       `torchao_config`: type of quantization and their arguments we want to use to
+        quantize the Tensor, e.g. int4wo-128 means int4 weight only quantization with group_size
+        128
+    """
+    # Lazy import to suppress some warnings
+    from torchao.quantization import (
+        int4_weight_only,
+        int8_dynamic_activation_int8_weight,
+        int8_weight_only,
+        quantize_,
+    )
+
+    dummy_linear = torch.nn.Linear(param.shape[1], param.shape[0], bias=False)
+    dummy_linear.weight = param
+    if "int8wo" in torchao_config:
+        quantize_(dummy_linear, int8_weight_only())
+    elif "int8dq" in torchao_config:
+        quantize_(dummy_linear, int8_dynamic_activation_int8_weight())
+    elif "int4wo" in torchao_config:
+        group_size = int(torchao_config.split("-")[-1])
+        assert group_size in [
+            32,
+            64,
+            128,
+            256,
+        ], f"int4wo groupsize needs to be one of [32, 64, 128, 256] but got {group_size}"
+        quantize_(dummy_linear, int4_weight_only(group_size=group_size))
+    elif "fp8wo" in torchao_config:
+        from torchao.quantization import float8_weight_only
+
+        # this requires newer hardware
+        # [rank0]: AssertionError: fp8e4nv data type is not supported on CUDA arch < 89
+        quantize_(dummy_linear, float8_weight_only())
+    return dummy_linear.weight
+
+
+def apply_torchao_config_(
+    self: torch.nn.Module,
+    params_dict: Dict[str, torch.Tensor],
+    param_suffixes: Set[str],
+) -> None:
+    """A util function used for quantizing the weight parameters after they are loaded if
+       self.torchao_config is specified
+
+    Args:
+      `self`: the model we want to quantize
+      `params_dict`: dictionary mapping from param_name to the parameter Tensor
+      `param_suffixes`: a set of suffixes, we'll quantize the Tensor matching these suffixes
+
+    Returns:
+       None, the `params_dict` is modified inplace and the weights of `self` model are quantized
+    """
+    if self.torchao_config:
+        for param_suffix in param_suffixes:
+            for name in params_dict:
+                param = params_dict[name]
+                if param_suffix in name and param.ndim == 2:
+                    params_dict[name] = torchao_quantize_param_data(
+                        param, self.torchao_config
+                    )
+        self.load_state_dict(params_dict, assign=True)
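A hedged usage sketch for the new torchao helpers, applied to a toy module rather than an sglang model (requires the torchao package; the module, suffix, and config string here are examples, and in sglang the `torchao_config` attribute is populated from a server option rather than hard-coded):

```python
import torch

from sglang.srt.layers.torchao_utils import apply_torchao_config_


class ToyModel(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.proj = torch.nn.Linear(256, 256, bias=False)
        # apply_torchao_config_ reads this attribute from the module.
        self.torchao_config = "int8wo"

    def forward(self, x):
        return self.proj(x)


model = ToyModel()
params_dict = dict(model.named_parameters())
# Quantize every 2-D parameter whose name contains one of the given suffixes,
# then rebind the module to the quantized tensors via load_state_dict(assign=True).
apply_torchao_config_(model, params_dict, param_suffixes={"proj.weight"})
```

Only 2-D weights are touched, so biases, norms, and embeddings pass through unchanged.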
sglang/srt/layers/{decode_attention.py → triton_attention/decode_attention.py}
CHANGED
@@ -15,24 +15,15 @@ limitations under the License.
 
 """
 Memory-efficient attention for decoding.
+It supports page size = 1.
 """
 
 # Adapted from
 # https://github.com/ModelTC/lightllm/blob/f2a54f0912293f683bf1d1695fd12c4098a5bf82/lightllm/models/llama/triton_kernel/token_attention_nopad_att1.py
 # https://github.com/ModelTC/lightllm/blob/f2a54f0912293f683bf1d1695fd12c4098a5bf82/lightllm/models/llama/triton_kernel/token_attention_softmax_and_reducev.py
-import torch
 import triton
 import triton.language as tl
 
-from sglang.srt.managers.schedule_batch import global_server_args_dict
-
-if global_server_args_dict.get("triton_attention_reduce_in_fp32", False):
-    REDUCE_TRITON_TYPE = tl.float32
-    REDUCE_TORCH_TYPE = torch.float32
-else:
-    REDUCE_TRITON_TYPE = tl.float16
-    REDUCE_TORCH_TYPE = torch.float16
-
 
 @triton.jit
 def tanh(x):
@@ -60,11 +51,13 @@ def _fwd_kernel_stage1(
     BLOCK_DMODEL: tl.constexpr,
     BLOCK_N: tl.constexpr,
     logit_cap: tl.constexpr,
+    Lk: tl.constexpr,
 ):
     cur_batch = tl.program_id(0)
     cur_head = tl.program_id(1)
     start_n = tl.program_id(2)
 
+    reduce_dtype = Att_Out.dtype.element_ty
     cur_kv_head = cur_head // kv_group_num
 
     offs_d = tl.arange(0, BLOCK_DMODEL)
@@ -83,7 +76,7 @@
     block_mask = tl.where(block_stard_index < cur_batch_seq_len, 1, 0)
 
     for start_mark in range(0, block_mask, 1):
-        q = tl.load(Q + off_q + start_mark).to(REDUCE_TRITON_TYPE)
+        q = tl.load(Q + off_q + start_mark).to(reduce_dtype)
        offs_n_new = cur_batch_start_index + offs_n
        k_loc = tl.load(
            Req_to_tokens + stride_req_to_tokens_b * cur_batch_req_idx + offs_n_new,
@@ -97,9 +90,9 @@
        )
        k = tl.load(
            K_Buffer + offs_buf_k,
-            mask=offs_n_new[:, None] < cur_batch_end_index,
+            mask=(offs_n_new[:, None] < cur_batch_end_index) & (offs_d[None, :] < Lk),
            other=0.0,
-        ).to(REDUCE_TRITON_TYPE)
+        ).to(reduce_dtype)
        att_value = tl.sum(q[None, :] * k, 1)
        att_value *= sm_scale
 
@@ -112,7 +105,7 @@
 
 @triton.jit
 def _fwd_kernel_stage2(
-    logics,
+    logits,
     V_Buffer,
     Out,
     Req_to_tokens,
@@ -128,6 +121,7 @@ def _fwd_kernel_stage2(
     kv_group_num: tl.constexpr,
     BLOCK_DMODEL: tl.constexpr,
     BLOCK_N: tl.constexpr,
+    Lv: tl.constexpr,
 ):
     cur_batch = tl.program_id(0)
     cur_head = tl.program_id(1)
@@ -159,7 +153,7 @@
        )
 
        qk = tl.load(
-            logics
+            logits
            + cur_head * stride_logic_h
            + (cur_batch_start_loc + start_n + offs_n),
            mask=start_n + offs_n < cur_batch_seq_len,
@@ -170,14 +164,16 @@
        old_scale = tl.exp(e_max - n_e_max)
        p = tl.exp(qk - n_e_max)
        e_sum = e_sum * old_scale + tl.sum(p, 0)
-        v = tl.load(v_ptrs + v_index[:, None] * stride_buf_vbs)
+        v = tl.load(
+            v_ptrs + v_index[:, None] * stride_buf_vbs, mask=(offs_d[None, :] < Lv)
+        )
        acc = acc * old_scale + tl.sum(p[:, None] * v, 0)
        e_max = n_e_max
 
     acc = acc / e_sum
     off_o = cur_batch * stride_obs + cur_head * stride_oh + offs_d
     out_ptrs = Out + off_o
-    tl.store(out_ptrs, acc)
+    tl.store(out_ptrs, acc, mask=(offs_d < Lv))
 
 
 def _decode_att_m_fwd(
@@ -193,10 +189,7 @@ def _decode_att_m_fwd(
     logit_cap,
 ):
     BLOCK = 32
-
-    Lq, Lk = q.shape[-1], k_buffer.shape[-1]
-    assert Lq == Lk
-    assert Lk in {16, 32, 64, 128, 256}
+    Lk = k_buffer.shape[-1]
 
     batch, head_num = B_req_idx.shape[0], q.shape[1]
 
@@ -208,6 +201,8 @@
     else:
        num_warps = 2
 
+    BLOCK_DMODEL = triton.next_power_of_2(Lk)
+
     _fwd_kernel_stage1[grid](
        q,
        k_buffer,
@@ -224,16 +219,17 @@
        k_buffer.stride(1),
        att_out.stride(0),
        kv_group_num=kv_group_num,
-        BLOCK_DMODEL=Lk,
+        BLOCK_DMODEL=BLOCK_DMODEL,
        BLOCK_N=BLOCK,
        logit_cap=logit_cap,
        num_warps=num_warps,
        num_stages=1,
+        Lk=Lk,
     )
 
 
 def _decode_softmax_reducev_fwd(
-    logics,
+    logits,
     v_buffer,
     o,
     req_to_tokens,
@@ -242,31 +238,35 @@ def _decode_softmax_reducev_fwd(
     b_seq_len,
 ):
     BLOCK = 64
-    batch, head = b_seq_len.shape[0], logics.shape[0]
+    batch, head = b_seq_len.shape[0], logits.shape[0]
     grid = (batch, head, 1)
-    kv_group_num = logics.shape[0] // v_buffer.shape[1]
+    kv_group_num = logits.shape[0] // v_buffer.shape[1]
 
     num_warps = 1
 
+    Lv = v_buffer.shape[-1]
+    BLOCK_DMODEL = triton.next_power_of_2(Lv)
+
     _fwd_kernel_stage2[grid](
-        logics,
+        logits,
        v_buffer,
        o,
        req_to_tokens,
        b_req_idx,
        b_start_loc,
        b_seq_len,
-        logics.stride(0),
+        logits.stride(0),
        v_buffer.stride(0),
        v_buffer.stride(1),
        o.stride(0),
        o.stride(1),
        req_to_tokens.stride(0),
        kv_group_num=kv_group_num,
-        BLOCK_DMODEL=v_buffer.shape[-1],
+        BLOCK_DMODEL=BLOCK_DMODEL,
        BLOCK_N=BLOCK,
        num_warps=num_warps,
        num_stages=3,
+        Lv=Lv,
     )
 
 
@@ -293,11 +293,13 @@ def _fwd_grouped_kernel_stage1(
     BLOCK_N: tl.constexpr,
     BLOCK_H: tl.constexpr,
     logit_cap: tl.constexpr,
+    Lk: tl.constexpr,
 ):
     cur_batch = tl.program_id(0)
     cur_kv_head = tl.program_id(1)
     start_n = tl.program_id(2)
 
+    reduce_dtype = Att_Out.dtype.element_ty
     cur_head = cur_kv_head * kv_group_num + tl.arange(0, BLOCK_H)
     mask_h = cur_head < (cur_kv_head + 1) * kv_group_num
     mask_h = mask_h & (cur_head < q_head_num)
@@ -324,9 +326,9 @@
     block_mask = tl.where(block_stard_index < cur_batch_seq_len, 1, 0)
 
     for start_mark in range(0, block_mask, 1):
-        q = tl.load(Q + offs_q + start_mark, mask=mask_h[:, None]).to(
-            REDUCE_TRITON_TYPE
-        )
+        q = tl.load(
+            Q + offs_q + start_mark, mask=(mask_h[:, None]) & (offs_d[None, :] < Lk)
+        ).to(reduce_dtype)
        offs_n_new = cur_batch_start_index + offs_n
        k_loc = tl.load(
            Req_to_tokens + stride_req_to_tokens_b * cur_batch_req_idx + offs_n_new,
@@ -340,13 +342,13 @@ def _fwd_grouped_kernel_stage1(
        )
        k = tl.load(
            K_Buffer + offs_buf_k,
-            mask=offs_n_new[None, :] < cur_batch_end_index,
+            mask=(offs_n_new[None, :] < cur_batch_end_index) & (offs_d[:, None] < Lk),
            other=0.0,
-        ).to(REDUCE_TRITON_TYPE)
+        ).to(reduce_dtype)
        qk = tl.dot(q, k)
        if BLOCK_DPE > 0:
            qpe = tl.load(Q + off_qpe + start_mark, mask=mask_h[:, None]).to(
-                REDUCE_TRITON_TYPE
+                reduce_dtype
            )
            offs_buf_kpe = (
                k_loc[None, :] * stride_buf_kbs
@@ -357,7 +359,7 @@ def _fwd_grouped_kernel_stage1(
                K_Buffer + offs_buf_kpe,
                mask=offs_n_new[None, :] < cur_batch_end_index,
                other=0.0,
-            ).to(REDUCE_TRITON_TYPE)
+            ).to(reduce_dtype)
            qk += tl.dot(qpe, kpe)
        qk *= sm_scale
 
@@ -377,7 +379,7 @@
 
 @triton.jit
 def _fwd_grouped_kernel_stage2(
-    logics,
+    logits,
     V_Buffer,
     Out,
     Req_to_tokens,
@@ -395,6 +397,7 @@ def _fwd_grouped_kernel_stage2(
     BLOCK_DMODEL: tl.constexpr,
     BLOCK_N: tl.constexpr,
     BLOCK_H: tl.constexpr,
+    Lv: tl.constexpr,
 ):
     cur_batch = tl.program_id(0)
     cur_kv_head = tl.program_id(1)
@@ -432,7 +435,7 @@ def _fwd_grouped_kernel_stage2(
        )
 
        qk = tl.load(
-            logics + offs_qk,
+            logits + offs_qk,
            mask=mask_h[:, None] & (start_n + offs_n[None, :] < cur_batch_seq_len),
            other=float("-inf"),
        )
@@ -441,7 +444,9 @@ def _fwd_grouped_kernel_stage2(
        old_scale = tl.exp(e_max - n_e_max)
        p = tl.exp(qk - n_e_max[:, None])
        e_sum = e_sum * old_scale + tl.sum(p, 1)
-        v = tl.load(v_ptrs + v_index[:, None] * stride_buf_vbs)
+        v = tl.load(
+            v_ptrs + v_index[:, None] * stride_buf_vbs, mask=(offs_d[None, :] < Lv)
+        )
        p = p.to(v.dtype)
        acc = acc * old_scale[:, None] + tl.dot(p, v)
        e_max = n_e_max
@@ -449,7 +454,7 @@ def _fwd_grouped_kernel_stage2(
     acc = acc / e_sum[:, None]
     off_o = cur_batch * stride_obs + cur_head[:, None] * stride_oh + offs_d[None, :]
     out_ptrs = Out + off_o
-    tl.store(out_ptrs, acc, mask=mask_h[:, None])
+    tl.store(out_ptrs, acc, mask=(mask_h[:, None]) & (offs_d[None, :] < Lv))
 
 
 def _decode_grouped_att_m_fwd(
@@ -464,17 +469,17 @@ def _decode_grouped_att_m_fwd(
     sm_scale,
     logit_cap,
 ):
-    BLOCK = 32
-
-    Lq, Lk = q.shape[-1], k_buffer.shape[-1]
-    assert Lq == Lk
-    assert Lk in {16, 32, 64, 128, 256, 576}
+    BLOCK = 64
+    Lk = k_buffer.shape[-1]
 
     if Lk == 576:
        BLOCK_DMODEL = 512
        BLOCK_DPE = 64
+    elif Lk == 288:
+        BLOCK_DMODEL = 256
+        BLOCK_DPE = 32
     else:
-        BLOCK_DMODEL = Lk
+        BLOCK_DMODEL = triton.next_power_of_2(Lk)
        BLOCK_DPE = 0
 
     batch, head_num = B_req_idx.shape[0], q.shape[1]
@@ -513,11 +518,12 @@ def _decode_grouped_att_m_fwd(
        logit_cap=logit_cap,
        num_warps=num_warps,
        num_stages=1,
+        Lk=Lk,
     )
 
 
 def _decode_grouped_softmax_reducev_fwd(
-    logics,
+    logits,
     v_buffer,
     o,
     req_to_tokens,
@@ -526,22 +532,25 @@ def _decode_grouped_softmax_reducev_fwd(
     b_seq_len,
 ):
     BLOCK = 128
-    batch, head_num = b_seq_len.shape[0], logics.shape[0]
-    kv_group_num = logics.shape[0] // v_buffer.shape[1]
+    batch, head_num = b_seq_len.shape[0], logits.shape[0]
+    kv_group_num = logits.shape[0] // v_buffer.shape[1]
     BLOCK_H = max(16, triton.next_power_of_2(kv_group_num))
     grid = (batch, triton.cdiv(head_num, min(BLOCK_H, kv_group_num)), 1)
 
     num_warps = 8
 
+    Lv = v_buffer.shape[-1]
+    BLOCK_DMODEL = triton.next_power_of_2(Lv)
+
     _fwd_grouped_kernel_stage2[grid](
-        logics,
+        logits,
        v_buffer,
        o,
        req_to_tokens,
        b_req_idx,
        b_start_loc,
        b_seq_len,
-        logics.stride(0),
+        logits.stride(0),
        v_buffer.stride(0),
        v_buffer.stride(1),
        o.stride(0),
@@ -549,9 +558,10 @@ def _decode_grouped_softmax_reducev_fwd(
        req_to_tokens.stride(0),
        kv_group_num=kv_group_num,
        q_head_num=head_num,
-        BLOCK_DMODEL=v_buffer.shape[-1],
+        BLOCK_DMODEL=BLOCK_DMODEL,
        BLOCK_N=BLOCK,
        BLOCK_H=BLOCK_H,
+        Lv=Lv,
        num_warps=num_warps,
        num_stages=1,
     )
@@ -566,17 +576,11 @@ def decode_attention_fwd(
     b_req_idx,
     b_start_loc,
     b_seq_len,
+    attn_logits,
     max_len_in_batch,
-    total_num_tokens,
     sm_scale,
-    logit_cap=-1,
-    att_m=None,
+    logit_cap=0.0,
 ):
-    if att_m is None:
-        att_m = torch.empty(
-            (q.shape[-2], total_num_tokens), dtype=REDUCE_TORCH_TYPE, device="cuda"
-        )
-
     kv_group_num = q.shape[1] // v_buffer.shape[1]
 
     if kv_group_num == 1:
@@ -584,7 +588,7 @@ def decode_attention_fwd(
        _decode_att_m_fwd(
            q,
            k_buffer,
-            att_m,
+            attn_logits,
            req_to_token,
            b_req_idx,
            b_start_loc,
@@ -594,7 +598,7 @@ def decode_attention_fwd(
            logit_cap,
        )
        _decode_softmax_reducev_fwd(
-            att_m,
+            attn_logits,
            v_buffer,
            o,
            req_to_token,
@@ -607,7 +611,7 @@ def decode_attention_fwd(
        _decode_grouped_att_m_fwd(
            q,
            k_buffer,
-            att_m,
+            attn_logits,
            req_to_token,
            b_req_idx,
            b_start_loc,
@@ -617,7 +621,7 @@ def decode_attention_fwd(
            logit_cap,
        )
        _decode_grouped_softmax_reducev_fwd(
-            att_m,
+            attn_logits,
            v_buffer,
            o,
            req_to_token,
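As the last hunks show, `decode_attention_fwd` no longer allocates its own `att_m` scratch tensor; callers now pass in an `attn_logits` buffer, and `total_num_tokens` disappears from the signature. A hedged sketch of what a caller-side allocation might look like, with shape and dtype inferred from the removed `att_m` allocation rather than taken from the new attention-backend code (which is not part of this diff):

```python
import torch

num_q_heads = 32          # corresponds to q.shape[-2] in the removed allocation
max_total_tokens = 16384  # size of the KV token pool; an example value
attn_logits = torch.empty(
    (num_q_heads, max_total_tokens),
    dtype=torch.float32,  # or float16; the kernels now read the dtype from this buffer
    device="cuda",
)

# decode_attention_fwd(
#     q, k_buffer, v_buffer, o, req_to_token,
#     b_req_idx, b_start_loc, b_seq_len,
#     attn_logits, max_len_in_batch, sm_scale, logit_cap=logit_cap,
# )
```

Allocating the buffer once in the caller lets it be reused across decode steps and makes the reduction precision a caller-side choice.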