sglang 0.2.9__py3-none-any.whl → 0.2.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_latency.py +114 -63
- sglang/check_env.py +2 -0
- sglang/lang/backend/runtime_endpoint.py +0 -11
- sglang/srt/hf_transformers_utils.py +2 -2
- sglang/srt/layers/extend_attention.py +59 -7
- sglang/srt/layers/radix_attention.py +22 -9
- sglang/srt/layers/token_attention.py +28 -2
- sglang/srt/managers/io_struct.py +9 -4
- sglang/srt/managers/schedule_batch.py +15 -11
- sglang/srt/managers/tokenizer_manager.py +28 -13
- sglang/srt/mem_cache/memory_pool.py +65 -24
- sglang/srt/model_config.py +11 -0
- sglang/srt/model_executor/model_runner.py +52 -21
- sglang/srt/models/deepseek_v2.py +198 -16
- sglang/srt/openai_api/adapter.py +120 -20
- sglang/srt/openai_api/protocol.py +1 -1
- sglang/srt/server.py +87 -78
- sglang/srt/server_args.py +8 -2
- sglang/srt/utils.py +25 -20
- sglang/test/run_eval.py +21 -10
- sglang/test/runners.py +237 -0
- sglang/test/simple_eval_common.py +12 -12
- sglang/test/simple_eval_gpqa.py +92 -0
- sglang/test/simple_eval_humaneval.py +5 -5
- sglang/test/simple_eval_math.py +72 -0
- sglang/test/test_utils.py +94 -13
- sglang/utils.py +15 -37
- sglang/version.py +1 -1
- {sglang-0.2.9.dist-info → sglang-0.2.10.dist-info}/METADATA +29 -27
- {sglang-0.2.9.dist-info → sglang-0.2.10.dist-info}/RECORD +33 -30
- {sglang-0.2.9.dist-info → sglang-0.2.10.dist-info}/LICENSE +0 -0
- {sglang-0.2.9.dist-info → sglang-0.2.10.dist-info}/WHEEL +0 -0
- {sglang-0.2.9.dist-info → sglang-0.2.10.dist-info}/top_level.txt +0 -0
sglang/bench_latency.py
CHANGED
@@ -1,13 +1,13 @@
 """
 Benchmark the latency of a given model. It accepts arguments similar to those of launch_server.py.
 
-# Usage (latency test):
+# Usage (latency test) with dummy weights:
 python -m sglang.bench_latency --model-path meta-llama/Meta-Llama-3-8B-Instruct --load-format dummy
 
 # Usage (correctness test):
 python -m sglang.bench_latency --model-path TinyLlama/TinyLlama-1.1B-Chat-v0.4 --correct
 
-### Reference output:
+### Reference output (of the correctness test above, can be gpu dependent):
 prefill logits (first half) tensor([[-10.0312, -9.5000, 0.8936, ..., -4.9414, -3.2402, -3.3633],
 [-10.0312, -9.5000, 0.8936, ..., -4.9414, -3.2402, -3.3633],
 [ -9.1875, -10.2500, 2.7109, ..., -4.3359, -4.0664, -4.1328]],
@@ -31,7 +31,9 @@ import dataclasses
 import logging
 import multiprocessing
 import time
+from typing import Tuple
 
+import jsonlines
 import numpy as np
 import torch
 import torch.distributed as dist
@@ -47,25 +49,34 @@ from sglang.srt.utils import suppress_other_loggers
 
 @dataclasses.dataclass
 class BenchArgs:
-    batch_size: int = 1
+    batch_size: Tuple[int] = (1,)
     input_len: int = 1024
     output_len: int = 4
+    result_filename: str = ""
     correctness_test: bool = False
     # This is only used for correctness test
     cut_len: int = 4
 
     @staticmethod
     def add_cli_args(parser: argparse.ArgumentParser):
-        parser.add_argument("--batch-size", type=int, default=BenchArgs.batch_size)
+        parser.add_argument(
+            "--batch-size", type=int, nargs="+", default=BenchArgs.batch_size
+        )
         parser.add_argument("--input-len", type=int, default=BenchArgs.input_len)
         parser.add_argument("--output-len", type=int, default=BenchArgs.output_len)
+        parser.add_argument(
+            "--result-filename", type=str, default=BenchArgs.result_filename
+        )
        parser.add_argument("--correctness-test", action="store_true")
        parser.add_argument("--cut-len", type=int, default=BenchArgs.cut_len)
 
     @classmethod
     def from_cli_args(cls, args: argparse.Namespace):
-        attrs = [attr.name for attr in dataclasses.fields(cls)]
-        return cls(**{attr: getattr(args, attr) for attr in attrs})
+        # use the default value's type to case the args into correct types.
+        attrs = [(attr.name, type(attr.default)) for attr in dataclasses.fields(cls)]
+        return cls(
+            **{attr: attr_type(getattr(args, attr)) for attr, attr_type in attrs}
+        )
 
 
 def load_model(server_args, tp_rank):
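Note: the reworked `BenchArgs.from_cli_args` casts each argparse value back into the type of the field's default, which is how the `nargs="+"` list from `--batch-size` becomes a tuple. A standalone sketch of that behavior (trimmed to two fields; the values are illustrative, not from the package):

```python
import argparse
import dataclasses
from typing import Tuple


@dataclasses.dataclass
class BenchArgs:
    batch_size: Tuple[int] = (1,)
    input_len: int = 1024

    @classmethod
    def from_cli_args(cls, args: argparse.Namespace):
        # type(attr.default) is tuple for batch_size and int for input_len,
        # so each argparse value is cast back into its declared field type.
        attrs = [(attr.name, type(attr.default)) for attr in dataclasses.fields(cls)]
        return cls(**{attr: attr_type(getattr(args, attr)) for attr, attr_type in attrs})


parser = argparse.ArgumentParser()
parser.add_argument("--batch-size", type=int, nargs="+", default=BenchArgs.batch_size)
parser.add_argument("--input-len", type=int, default=BenchArgs.input_len)
args = parser.parse_args(["--batch-size", "1", "8", "32"])
print(BenchArgs.from_cli_args(args))  # BenchArgs(batch_size=(1, 8, 32), input_len=1024)
```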
@@ -93,7 +104,7 @@ def load_model(server_args, tp_rank):
     return model_runner, tokenizer
 
 
-def prepare_inputs(bench_args, tokenizer):
+def prepare_inputs_for_correctness_test(bench_args, tokenizer):
     prompts = [
         "The capital of France is",
         "The capital of the United Kindom is",
@@ -119,7 +130,9 @@ def prepare_inputs(bench_args, tokenizer):
     return input_ids, reqs
 
 
-def prepare_extend_inputs(bench_args, input_ids, reqs, model_runner):
+def prepare_extend_inputs_for_correctness_test(
+    bench_args, input_ids, reqs, model_runner
+):
     for i in range(len(reqs)):
         req = reqs[i]
         req.input_ids += input_ids[i][bench_args.cut_len :]
@@ -129,8 +142,8 @@ def prepare_extend_inputs(bench_args, input_ids, reqs, model_runner):
     return reqs
 
 
-def
-    input_ids = np.ones((
+def prepare_synthetic_inputs_for_latency_test(batch_size, input_len):
+    input_ids = np.ones((batch_size, input_len), dtype=np.int32)
     sampling_params = SamplingParams(
         temperature=0,
         max_new_tokens=BenchArgs.output_len,
@@ -179,7 +192,7 @@ def correctness_test(
     model_runner, tokenizer = load_model(server_args, tp_rank)
 
     # Prepare inputs
-    input_ids, reqs = prepare_inputs(bench_args, tokenizer)
+    input_ids, reqs = prepare_inputs_for_correctness_test(bench_args, tokenizer)
 
     if bench_args.cut_len > 0:
         # Prefill
@@ -187,7 +200,9 @@ def correctness_test(
         rank_print("prefill logits (first half)", next_token_logits)
 
     # Prepare extend inputs
-    reqs = prepare_extend_inputs(bench_args, input_ids, reqs, model_runner)
+    reqs = prepare_extend_inputs_for_correctness_test(
+        bench_args, input_ids, reqs, model_runner
+    )
 
     # Extend
     next_token_ids, next_token_logits, batch = extend(reqs, model_runner)
@@ -205,6 +220,68 @@ def correctness_test(
         rank_print(tokenizer.decode(output_ids[i]))
 
 
+@torch.inference_mode()
+def latency_test_run_once(
+    model_runner, rank_print, reqs, batch_size, input_len, output_len
+):
+
+    # Clear the pools.
+    model_runner.req_to_token_pool.clear()
+    model_runner.token_to_kv_pool.clear()
+
+    measurement_results = {
+        "run_name": "before",
+        "batch_size": batch_size,
+        "input_len": input_len,
+        "output_len": output_len,
+    }
+
+    tot_latency = 0
+
+    # Prefill
+    torch.cuda.synchronize()
+    tic = time.time()
+    next_token_ids, _, batch = extend(reqs, model_runner)
+    torch.cuda.synchronize()
+    prefill_latency = time.time() - tic
+    tot_latency += prefill_latency
+    throughput = input_len * batch_size / prefill_latency
+    rank_print(
+        f"Prefill. latency: {prefill_latency:6.5f} s, throughput: {throughput:9.2f} token/s"
+    )
+    measurement_results["prefill_latency"] = prefill_latency
+    measurement_results["prefill_throughput"] = throughput
+
+    # Decode
+    for i in range(output_len):
+        torch.cuda.synchronize()
+        tic = time.time()
+        next_token_ids, _ = decode(next_token_ids, batch, model_runner)
+        torch.cuda.synchronize()
+        latency = time.time() - tic
+        tot_latency += latency
+        throughput = batch_size / latency
+        if i < 5:
+            rank_print(
+                f"Decode. latency: {latency:6.5f} s, throughput: {throughput:9.2f} token/s"
+            )
+    avg_decode_latency = (tot_latency - prefill_latency) / output_len
+    avg_decode_throughput = batch_size / avg_decode_latency
+    rank_print(
+        f"Decode. avg latency: {avg_decode_latency:6.5f} s, avg throughput: {avg_decode_throughput:9.2f} token/s"
+    )
+    measurement_results["avg_decode_latency"] = avg_decode_latency
+    measurement_results["avg_decode_throughput"] = avg_decode_throughput
+
+    throughput = (input_len + output_len) * batch_size / tot_latency
+    rank_print(
+        f"Total. latency: {tot_latency:6.3f} s, throughput: {throughput:9.2f} token/s"
+    )
+    measurement_results["total_latency"] = tot_latency
+    measurement_results["total_throughput"] = throughput
+    return measurement_results
+
+
 def latency_test(
     server_args,
     bench_args,
@@ -218,62 +295,36 @@ def latency_test(
         f"max_batch_size={model_runner.max_total_num_tokens // (bench_args.input_len + bench_args.output_len)}"
     )
 
-    #
-
-
-    def clear():
-        model_runner.req_to_token_pool.clear()
-        model_runner.token_to_kv_pool.clear()
-
-    @torch.inference_mode()
-    def run_once(output_len):
-        # Prefill
-        torch.cuda.synchronize()
-        tot_latency = 0
-        tic = time.time()
-        next_token_ids, _, batch = extend(reqs, model_runner)
-        torch.cuda.synchronize()
-        prefill_latency = time.time() - tic
-        tot_latency += prefill_latency
-        throughput = bench_args.input_len * bench_args.batch_size / prefill_latency
-        rank_print(
-            f"Prefill. latency: {prefill_latency:6.5f} s, throughput: {throughput:9.2f} token/s"
-        )
+    # To make this PR easier to review, for now, only do the first element in batch_size tuple.
+    bench_args.batch_size = bench_args.batch_size[0]
 
-
-
-
-
-            next_token_ids, _ = decode(next_token_ids, batch, model_runner)
-            torch.cuda.synchronize()
-            latency = time.time() - tic
-            tot_latency += latency
-            throughput = bench_args.batch_size / latency
-            if i < 5:
-                rank_print(
-                    f"Decode. latency: {latency:6.5f} s, throughput: {throughput:9.2f} token/s"
-                )
-        avg_decode_latency = (tot_latency - prefill_latency) / output_len
-        avg_decode_throughput = bench_args.batch_size / avg_decode_latency
-        rank_print(
-            f"Decode. avg latency: {avg_decode_latency:6.5f} s, avg throughput: {avg_decode_throughput:9.2f} token/s"
-        )
-
-        throughput = (
-            (bench_args.input_len + bench_args.output_len)
-            * bench_args.batch_size
-            / tot_latency
-        )
-        rank_print(
-            f"Total. latency: {tot_latency:6.3f} s, throughput: {throughput:9.2f} token/s"
-        )
+    # Prepare inputs
+    reqs = prepare_synthetic_inputs_for_latency_test(
+        bench_args.batch_size, bench_args.input_len
+    )
 
     # Warm up
-
-
+    latency_test_run_once(
+        model_runner, rank_print, reqs, bench_args.batch_size, bench_args.input_len, 4
+    )
 
     # Run again
-
+    result_list = []
+    result_list.append(
+        latency_test_run_once(
+            model_runner,
+            rank_print,
+            reqs,
+            bench_args.batch_size,
+            bench_args.input_len,
+            bench_args.output_len,
+        )
+    )
+
+    # Write results in jsonlines format.
+    if bench_args.result_filename:
+        with jsonlines.open(bench_args.result_filename, "a") as f:
+            f.write_all(result_list)
 
 
 def main(server_args, bench_args):
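Note: with the new `--result-filename` flag, each benchmark run appends one JSON object per line via `jsonlines`. A minimal sketch of reading those records back (the filename below is a placeholder, not from the package):

```python
import jsonlines

# Keys match what latency_test_run_once stores in measurement_results.
with jsonlines.open("results.jsonl") as reader:  # placeholder filename
    for record in reader:
        print(
            f"{record['run_name']}: batch_size={record['batch_size']}, "
            f"prefill={record['prefill_latency']:.5f}s, "
            f"avg_decode={record['avg_decode_latency']:.5f}s, "
            f"total_throughput={record['total_throughput']:.2f} token/s"
        )
```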
sglang/check_env.py
CHANGED
@@ -13,6 +13,7 @@ import torch
 PACKAGE_LIST = [
     "sglang",
     "flashinfer",
+    "triton",
     "requests",
     "tqdm",
     "numpy",
@@ -30,6 +31,7 @@ PACKAGE_LIST = [
     "zmq",
     "vllm",
     "outlines",
+    "multipart",
     "openai",
     "tiktoken",
     "anthropic",
sglang/lang/backend/runtime_endpoint.py
CHANGED
@@ -15,7 +15,6 @@ class RuntimeEndpoint(BaseBackend):
     def __init__(
         self,
         base_url: str,
-        auth_token: Optional[str] = None,
         api_key: Optional[str] = None,
         verify: Optional[str] = None,
     ):
@@ -23,13 +22,11 @@ class RuntimeEndpoint(BaseBackend):
         self.support_concate_and_append = True
 
         self.base_url = base_url
-        self.auth_token = auth_token
         self.api_key = api_key
         self.verify = verify
 
         res = http_request(
             self.base_url + "/get_model_info",
-            auth_token=self.auth_token,
             api_key=self.api_key,
             verify=self.verify,
         )
@@ -67,7 +64,6 @@ class RuntimeEndpoint(BaseBackend):
         res = http_request(
             self.base_url + "/generate",
             json={"text": prefix_str, "sampling_params": {"max_new_tokens": 0}},
-            auth_token=self.auth_token,
             api_key=self.api_key,
             verify=self.verify,
         )
@@ -79,7 +75,6 @@ class RuntimeEndpoint(BaseBackend):
         res = http_request(
             self.base_url + "/generate",
             json=data,
-            auth_token=self.auth_token,
             api_key=self.api_key,
             verify=self.verify,
         )
@@ -91,7 +86,6 @@ class RuntimeEndpoint(BaseBackend):
         res = http_request(
             self.base_url + "/generate",
             json=data,
-            auth_token=self.auth_token,
             api_key=self.api_key,
             verify=self.verify,
         )
@@ -139,7 +133,6 @@ class RuntimeEndpoint(BaseBackend):
         res = http_request(
             self.base_url + "/generate",
             json=data,
-            auth_token=self.auth_token,
             api_key=self.api_key,
             verify=self.verify,
         )
@@ -193,7 +186,6 @@ class RuntimeEndpoint(BaseBackend):
             self.base_url + "/generate",
             json=data,
             stream=True,
-            auth_token=self.auth_token,
             api_key=self.api_key,
             verify=self.verify,
         )
@@ -225,7 +217,6 @@ class RuntimeEndpoint(BaseBackend):
         res = http_request(
             self.base_url + "/generate",
             json=data,
-            auth_token=self.auth_token,
             api_key=self.api_key,
             verify=self.verify,
         )
@@ -243,7 +234,6 @@ class RuntimeEndpoint(BaseBackend):
         res = http_request(
             self.base_url + "/generate",
             json=data,
-            auth_token=self.auth_token,
             api_key=self.api_key,
             verify=self.verify,
         )
@@ -267,7 +257,6 @@ class RuntimeEndpoint(BaseBackend):
         res = http_request(
             self.base_url + "/concate_and_append_request",
             json={"src_rids": src_rids, "dst_rid": dst_rid},
-            auth_token=self.auth_token,
             api_key=self.api_key,
             verify=self.verify,
         )
sglang/srt/hf_transformers_utils.py
CHANGED
@@ -19,7 +19,7 @@ import functools
 import json
 import os
 import warnings
-from typing import AbstractSet, Collection, Dict, Literal, Optional, Type, Union
+from typing import AbstractSet, Collection, Dict, List, Literal, Optional, Type, Union
 
 from huggingface_hub import snapshot_download
 from transformers import (
@@ -259,7 +259,7 @@ class TiktokenTokenizer:
             Literal["all"], AbstractSet[str]
         ] = set(),  # noqa: B006
        disallowed_special: Union[Literal["all"], Collection[str]] = "all",
-    ) ->
+    ) -> List[int]:
        if isinstance(allowed_special, set):
            allowed_special |= self._default_allowed_special
        return tiktoken.Encoding.encode(
sglang/srt/layers/extend_attention.py
CHANGED
@@ -57,6 +57,8 @@ def _fwd_kernel(
     stride_buf_vh,
     stride_req_to_tokens_b,
     BLOCK_DMODEL: tl.constexpr,
+    BLOCK_DPE: tl.constexpr,
+    BLOCK_DV: tl.constexpr,
     BLOCK_M: tl.constexpr,
     BLOCK_N: tl.constexpr,
     logit_cap: tl.constexpr,
@@ -75,8 +77,10 @@ def _fwd_kernel(
     cur_batch_req_idx = tl.load(B_req_idx + cur_seq)
 
     offs_d = tl.arange(0, BLOCK_DMODEL)
+    offs_dv = tl.arange(0, BLOCK_DV)
     offs_m = tl.arange(0, BLOCK_M)
     mask_m = (cur_block_m * BLOCK_M + offs_m) < cur_seq_len_extend
+
     offs_q = (
         (cur_seq_extend_start_contiguous + cur_block_m * BLOCK_M + offs_m[:, None])
         * stride_qbs
@@ -85,10 +89,20 @@ def _fwd_kernel(
     )
     q = tl.load(Q_Extend + offs_q, mask=mask_m[:, None], other=0.0)
 
+    if BLOCK_DPE > 0:
+        offs_dpe = BLOCK_DMODEL + tl.arange(0, BLOCK_DPE)
+        offs_qpe = (
+            (cur_seq_extend_start_contiguous + cur_block_m * BLOCK_M + offs_m[:, None])
+            * stride_qbs
+            + cur_head * stride_qh
+            + offs_dpe[None, :]
+        )
+        qpe = tl.load(Q_Extend + offs_qpe, mask=mask_m[:, None], other=0.0)
+
     # stage1: compute scores with prefix
     offs_n = tl.arange(0, BLOCK_N)
 
-    acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32)
+    acc = tl.zeros([BLOCK_M, BLOCK_DV], dtype=tl.float32)
     deno = tl.zeros([BLOCK_M], dtype=tl.float32)
     e_max = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf")
 
@@ -110,6 +124,18 @@ def _fwd_kernel(
 
         qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
         qk += tl.dot(q, k)
+        if BLOCK_DPE > 0:
+            offs_kpe = (
+                offs_kv_loc[None, :] * stride_buf_kbs
+                + cur_kv_head * stride_buf_kh
+                + offs_dpe[:, None]
+            )
+            kpe = tl.load(
+                K_Buffer + offs_kpe,
+                mask=mask_n[None, :],
+                other=0.0,
+            )
+            qk += tl.dot(qpe, kpe)
         qk *= sm_scale
 
         if logit_cap > 0:
@@ -125,7 +151,7 @@ def _fwd_kernel(
         offs_buf_v = (
             offs_kv_loc[:, None] * stride_buf_vbs
             + cur_kv_head * stride_buf_vh
-            + offs_d[None, :]
+            + offs_dv[None, :]
         )
         v = tl.load(V_Buffer + offs_buf_v, mask=mask_n[:, None], other=0.0)
         p = p.to(v.dtype)
@@ -150,6 +176,21 @@ def _fwd_kernel(
 
         qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
         qk += tl.dot(q, k)
+
+        if BLOCK_DPE > 0:
+            offs_kpe = (
+                (cur_seq_extend_start_contiguous + start_n + offs_n[None, :])
+                * stride_kbs
+                + cur_kv_head * stride_kh
+                + offs_dpe[:, None]
+            )
+            kpe = tl.load(
+                K_Extend + offs_kpe,
+                mask=mask_n[None, :],
+                other=0.0,
+            )
+            qk += tl.dot(qpe, kpe)
+
         qk *= sm_scale
 
         if logit_cap > 0:
@@ -169,7 +210,7 @@ def _fwd_kernel(
         offs_v = (
             (cur_seq_extend_start_contiguous + start_n + offs_n[:, None]) * stride_vbs
             + cur_kv_head * stride_vh
-            + offs_d[None, :]
+            + offs_dv[None, :]
         )
         v = tl.load(V_Extend + offs_v, mask=mask_n[:, None], other=0.0)
         p = p.to(v.dtype)
@@ -181,7 +222,7 @@ def _fwd_kernel(
         (cur_seq_extend_start_contiguous + cur_block_m * BLOCK_M + offs_m[:, None])
         * stride_obs
         + cur_head * stride_oh
-        + offs_d[None, :]
+        + offs_dv[None, :]
     )
     tl.store(O_Extend + offs_o, acc / deno[:, None], mask=mask_m[:, None])
 
@@ -217,8 +258,17 @@ def extend_attention_fwd(
         o_extend.shape[-1],
     )
 
-    assert Lq == Lk and
-    assert Lq in {16, 32, 64, 128, 256}
+    assert Lq == Lk and Lv == Lo
+    assert Lq in {16, 32, 64, 128, 256, 576}
+    assert Lv in {16, 32, 64, 128, 256, 512}
+
+    if Lq == 576:
+        BLOCK_DMODEL = 512
+        BLOCK_DPE = 64
+    else:
+        BLOCK_DMODEL = Lq
+        BLOCK_DPE = 0
+    BLOCK_DV = Lv
 
     if CUDA_CAPABILITY[0] >= 8:
         BLOCK_M, BLOCK_N = (128, 128) if Lq <= 128 else (64, 64)
@@ -260,7 +310,9 @@ def extend_attention_fwd(
         v_buffer.stride(0),
         v_buffer.stride(1),
         req_to_tokens.stride(0),
-        BLOCK_DMODEL=Lq,
+        BLOCK_DMODEL=BLOCK_DMODEL,
+        BLOCK_DPE=BLOCK_DPE,
+        BLOCK_DV=BLOCK_DV,
         BLOCK_M=BLOCK_M,
         BLOCK_N=BLOCK_N,
         num_warps=num_warps,
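Note: the new `BLOCK_DPE` path handles query/key heads of width 576, which matches the MLA attention in the deepseek_v2.py changes listed above (a 512-dim no-position slice plus a 64-dim rotary slice, as the 512/64 split suggests). The kernel loads the two slices separately and sums their partial dot products; a small NumPy sketch of that equivalence (the shapes are illustrative assumptions):

```python
import numpy as np

BLOCK_DMODEL, BLOCK_DPE = 512, 64  # the split chosen for Lq == 576
q = np.random.randn(576).astype(np.float32)
k = np.random.randn(576).astype(np.float32)

# One full dot product equals the sum of the two partial dot products
# (qk += tl.dot(q, k); qk += tl.dot(qpe, kpe) in the kernel).
full = q @ k
split = q[:BLOCK_DMODEL] @ k[:BLOCK_DMODEL] + q[BLOCK_DMODEL:] @ k[BLOCK_DMODEL:]
assert np.allclose(full, split, rtol=1e-5)
```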
sglang/srt/layers/radix_attention.py
CHANGED
@@ -38,16 +38,22 @@ class RadixAttention(nn.Module):
         num_kv_heads: int,
         layer_id: int,
         logit_cap: int = -1,
+        v_head_dim: int = -1,
     ):
         super().__init__()
         self.tp_q_head_num = num_heads
         self.tp_k_head_num = num_kv_heads
         self.tp_v_head_num = num_kv_heads
         self.head_dim = head_dim
+        self.qk_head_dim = head_dim
+        self.v_head_dim = v_head_dim if v_head_dim != -1 else head_dim
         self.scaling = scaling
         self.layer_id = layer_id
 
-        if not global_server_args_dict.get("disable_flashinfer", False):
+        if (
+            not global_server_args_dict.get("disable_flashinfer", False)
+            and self.qk_head_dim == self.v_head_dim
+        ):
             self.extend_forward = self.extend_forward_flashinfer
             self.decode_forward = self.decode_forward_flashinfer
         else:
@@ -57,13 +63,17 @@ class RadixAttention(nn.Module):
         self.logit_cap = logit_cap if logit_cap is not None and logit_cap > 0 else 0
 
     def extend_forward_triton(self, q, k, v, input_metadata: InputMetadata):
-        o = torch.empty_like(q)
+        if self.qk_head_dim != self.v_head_dim:
+            o = q.new_empty((q.shape[0], self.tp_q_head_num * self.v_head_dim))
+        else:
+            o = torch.empty_like(q)
+
         self.store_kv_cache(k, v, input_metadata)
         extend_attention_fwd(
-            q.view(-1, self.tp_q_head_num, self.head_dim),
+            q.view(-1, self.tp_q_head_num, self.qk_head_dim),
             k.contiguous(),
             v.contiguous(),
-            o.view(-1, self.tp_q_head_num, self.head_dim),
+            o.view(-1, self.tp_q_head_num, self.v_head_dim),
             input_metadata.token_to_kv_pool.get_key_buffer(self.layer_id),
             input_metadata.token_to_kv_pool.get_value_buffer(self.layer_id),
             input_metadata.req_to_token_pool.req_to_token,
@@ -82,14 +92,17 @@ class RadixAttention(nn.Module):
         return o
 
     def decode_forward_triton(self, q, k, v, input_metadata: InputMetadata):
-        o = torch.empty_like(q)
+        if self.qk_head_dim != self.v_head_dim:
+            o = q.new_empty((q.shape[0], self.tp_q_head_num * self.v_head_dim))
+        else:
+            o = torch.empty_like(q)
         self.store_kv_cache(k, v, input_metadata)
 
         token_attention_fwd(
-            q.view(-1, self.tp_q_head_num, self.head_dim),
+            q.view(-1, self.tp_q_head_num, self.qk_head_dim),
             input_metadata.token_to_kv_pool.get_key_buffer(self.layer_id),
             input_metadata.token_to_kv_pool.get_value_buffer(self.layer_id),
-            o.view(-1, self.tp_q_head_num, self.head_dim),
+            o.view(-1, self.tp_q_head_num, self.v_head_dim),
             input_metadata.req_to_token_pool.req_to_token,
             input_metadata.req_pool_indices,
             input_metadata.triton_start_loc,
@@ -160,8 +173,8 @@ class RadixAttention(nn.Module):
         return o.view(-1, self.tp_q_head_num * self.head_dim)
 
     def forward(self, q, k, v, input_metadata: InputMetadata):
-        k = k.view(-1, self.tp_k_head_num, self.head_dim)
-        v = v.view(-1, self.tp_v_head_num, self.head_dim)
+        k = k.view(-1, self.tp_k_head_num, self.qk_head_dim)
+        v = v.view(-1, self.tp_v_head_num, self.v_head_dim)
 
         if input_metadata.forward_mode == ForwardMode.EXTEND:
             return self.extend_forward(q, k, v, input_metadata)
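Note: when `v_head_dim` differs from `qk_head_dim`, the Triton path can no longer reuse q's shape for the output, hence the `q.new_empty(...)` allocation above. A toy check of the shape bookkeeping (the sizes are illustrative assumptions):

```python
import torch

tokens, num_heads = 8, 16
qk_head_dim, v_head_dim = 576, 512  # asymmetric head dims, e.g. MLA-style

q = torch.randn(tokens, num_heads * qk_head_dim)
if qk_head_dim != v_head_dim:
    o = q.new_empty((q.shape[0], num_heads * v_head_dim))
else:
    o = torch.empty_like(q)

print(q.view(-1, num_heads, qk_head_dim).shape)  # torch.Size([8, 16, 576])
print(o.view(-1, num_heads, v_head_dim).shape)   # torch.Size([8, 16, 512])
```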
sglang/srt/layers/token_attention.py
CHANGED
@@ -54,6 +54,7 @@ def _fwd_kernel_stage1(
     att_stride_h,
     kv_group_num: tl.constexpr,
     BLOCK_DMODEL: tl.constexpr,
+    BLOCK_DPE: tl.constexpr,
     BLOCK_N: tl.constexpr,
     logit_cap: tl.constexpr,
 ):
@@ -73,6 +74,10 @@ def _fwd_kernel_stage1(
 
     off_q = cur_batch * stride_qbs + cur_head * stride_qh + offs_d
 
+    if BLOCK_DPE > 0:
+        offs_dpe = BLOCK_DMODEL + tl.arange(0, BLOCK_DPE)
+        off_qpe = cur_batch * stride_qbs + cur_head * stride_qh + offs_dpe
+
     offs_n = start_n * BLOCK_N + tl.arange(0, BLOCK_N)
 
     block_stard_index = start_n * BLOCK_N
@@ -97,6 +102,19 @@ def _fwd_kernel_stage1(
             other=0.0,
         ).to(REDUCE_TRITON_TYPE)
         att_value = tl.sum(q[None, :] * k, 1)
+        if BLOCK_DPE > 0:
+            qpe = tl.load(Q + off_qpe + start_mark).to(REDUCE_TRITON_TYPE)
+            offs_buf_kpe = (
+                k_loc[:, None] * stride_buf_kbs
+                + cur_kv_head * stride_buf_kh
+                + offs_dpe[None, :]
+            )
+            kpe = tl.load(
+                K_Buffer + offs_buf_kpe,
+                mask=offs_n_new[:, None] < cur_batch_end_index,
+                other=0.0,
+            ).to(REDUCE_TRITON_TYPE)
+            att_value += tl.sum(qpe[None, :] * kpe, 1)
         att_value *= sm_scale
 
         if logit_cap > 0:
@@ -192,7 +210,14 @@ def _token_att_m_fwd(
     # shape constraints
     Lq, Lk = q.shape[-1], k_buffer.shape[-1]
     assert Lq == Lk
-    assert Lk in {16, 32, 64, 128, 256}
+    assert Lk in {16, 32, 64, 128, 256, 576}
+
+    if Lk == 576:
+        BLOCK_DMODEL = 512
+        BLOCK_DPE = 64
+    else:
+        BLOCK_DMODEL = Lk
+        BLOCK_DPE = 0
 
     batch, head_num = B_req_idx.shape[0], q.shape[1]
 
@@ -220,7 +245,8 @@ def _token_att_m_fwd(
         k_buffer.stride(1),
         att_out.stride(0),
         kv_group_num=kv_group_num,
-        BLOCK_DMODEL=Lk,
+        BLOCK_DMODEL=BLOCK_DMODEL,
+        BLOCK_DPE=BLOCK_DPE,
         BLOCK_N=BLOCK,
         logit_cap=logit_cap,
         num_warps=num_warps,