sglang 0.2.9__tar.gz → 0.2.10__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sglang-0.2.9/sglang.egg-info → sglang-0.2.10}/PKG-INFO +6 -4
- {sglang-0.2.9 → sglang-0.2.10}/README.md +3 -3
- {sglang-0.2.9 → sglang-0.2.10}/pyproject.toml +5 -3
- {sglang-0.2.9 → sglang-0.2.10}/sglang/bench_latency.py +114 -63
- {sglang-0.2.9 → sglang-0.2.10}/sglang/check_env.py +2 -0
- {sglang-0.2.9 → sglang-0.2.10}/sglang/lang/backend/runtime_endpoint.py +0 -11
- {sglang-0.2.9 → sglang-0.2.10}/sglang/srt/hf_transformers_utils.py +2 -2
- {sglang-0.2.9 → sglang-0.2.10}/sglang/srt/layers/extend_attention.py +59 -7
- {sglang-0.2.9 → sglang-0.2.10}/sglang/srt/layers/radix_attention.py +22 -9
- {sglang-0.2.9 → sglang-0.2.10}/sglang/srt/layers/token_attention.py +28 -2
- {sglang-0.2.9 → sglang-0.2.10}/sglang/srt/managers/io_struct.py +9 -4
- {sglang-0.2.9 → sglang-0.2.10}/sglang/srt/managers/schedule_batch.py +15 -11
- {sglang-0.2.9 → sglang-0.2.10}/sglang/srt/managers/tokenizer_manager.py +28 -13
- {sglang-0.2.9 → sglang-0.2.10}/sglang/srt/mem_cache/memory_pool.py +65 -24
- {sglang-0.2.9 → sglang-0.2.10}/sglang/srt/model_config.py +11 -0
- {sglang-0.2.9 → sglang-0.2.10}/sglang/srt/model_executor/model_runner.py +52 -21
- {sglang-0.2.9 → sglang-0.2.10}/sglang/srt/models/deepseek_v2.py +198 -16
- {sglang-0.2.9 → sglang-0.2.10}/sglang/srt/openai_api/adapter.py +120 -20
- {sglang-0.2.9 → sglang-0.2.10}/sglang/srt/openai_api/protocol.py +1 -1
- {sglang-0.2.9 → sglang-0.2.10}/sglang/srt/server.py +87 -78
- {sglang-0.2.9 → sglang-0.2.10}/sglang/srt/server_args.py +8 -2
- {sglang-0.2.9 → sglang-0.2.10}/sglang/srt/utils.py +25 -20
- {sglang-0.2.9 → sglang-0.2.10}/sglang/test/run_eval.py +21 -10
- sglang-0.2.10/sglang/test/runners.py +237 -0
- {sglang-0.2.9 → sglang-0.2.10}/sglang/test/simple_eval_common.py +12 -12
- sglang-0.2.10/sglang/test/simple_eval_gpqa.py +92 -0
- {sglang-0.2.9 → sglang-0.2.10}/sglang/test/simple_eval_humaneval.py +5 -5
- sglang-0.2.10/sglang/test/simple_eval_math.py +72 -0
- {sglang-0.2.9 → sglang-0.2.10}/sglang/test/test_utils.py +94 -13
- {sglang-0.2.9 → sglang-0.2.10}/sglang/utils.py +15 -37
- sglang-0.2.10/sglang/version.py +1 -0
- {sglang-0.2.9 → sglang-0.2.10/sglang.egg-info}/PKG-INFO +6 -4
- {sglang-0.2.9 → sglang-0.2.10}/sglang.egg-info/SOURCES.txt +3 -0
- {sglang-0.2.9 → sglang-0.2.10}/sglang.egg-info/requires.txt +2 -0
- sglang-0.2.9/sglang/version.py +0 -1
- {sglang-0.2.9 → sglang-0.2.10}/LICENSE +0 -0
- {sglang-0.2.9 → sglang-0.2.10}/setup.cfg +0 -0
- {sglang-0.2.9 → sglang-0.2.10}/sglang/__init__.py +0 -0
- {sglang-0.2.9 → sglang-0.2.10}/sglang/api.py +0 -0
- {sglang-0.2.9 → sglang-0.2.10}/sglang/bench_serving.py +0 -0
- {sglang-0.2.9 → sglang-0.2.10}/sglang/global_config.py +0 -0
- {sglang-0.2.9 → sglang-0.2.10}/sglang/lang/__init__.py +0 -0
- {sglang-0.2.9 → sglang-0.2.10}/sglang/lang/backend/__init__.py +0 -0
- {sglang-0.2.9 → sglang-0.2.10}/sglang/lang/backend/anthropic.py +0 -0
- {sglang-0.2.9 → sglang-0.2.10}/sglang/lang/backend/base_backend.py +0 -0
- {sglang-0.2.9 → sglang-0.2.10}/sglang/lang/backend/litellm.py +0 -0
- {sglang-0.2.9 → sglang-0.2.10}/sglang/lang/backend/openai.py +0 -0
- {sglang-0.2.9 → sglang-0.2.10}/sglang/lang/backend/vertexai.py +0 -0
- {sglang-0.2.9 → sglang-0.2.10}/sglang/lang/chat_template.py +0 -0
- {sglang-0.2.9 → sglang-0.2.10}/sglang/lang/compiler.py +0 -0
- {sglang-0.2.9 → sglang-0.2.10}/sglang/lang/interpreter.py +0 -0
- {sglang-0.2.9 → sglang-0.2.10}/sglang/lang/ir.py +0 -0
- {sglang-0.2.9 → sglang-0.2.10}/sglang/lang/tracer.py +0 -0
- {sglang-0.2.9 → sglang-0.2.10}/sglang/launch_server.py +0 -0
- {sglang-0.2.9 → sglang-0.2.10}/sglang/launch_server_llavavid.py +0 -0
- {sglang-0.2.9 → sglang-0.2.10}/sglang/srt/constrained/__init__.py +0 -0
- {sglang-0.2.9 → sglang-0.2.10}/sglang/srt/constrained/base_tool_cache.py +0 -0
- {sglang-0.2.9 → sglang-0.2.10}/sglang/srt/constrained/fsm_cache.py +0 -0
- {sglang-0.2.9 → sglang-0.2.10}/sglang/srt/constrained/jump_forward.py +0 -0
- {sglang-0.2.9 → sglang-0.2.10}/sglang/srt/conversation.py +0 -0
- {sglang-0.2.9 → sglang-0.2.10}/sglang/srt/layers/context_flashattention_nopad.py +0 -0
- {sglang-0.2.9 → sglang-0.2.10}/sglang/srt/layers/fused_moe.py +0 -0
- {sglang-0.2.9 → sglang-0.2.10}/sglang/srt/layers/linear.py +0 -0
- {sglang-0.2.9 → sglang-0.2.10}/sglang/srt/layers/logits_processor.py +0 -0
- {sglang-0.2.9 → sglang-0.2.10}/sglang/srt/layers/quantization/__init__.py +0 -0
- {sglang-0.2.9 → sglang-0.2.10}/sglang/srt/layers/quantization/fp8.py +0 -0
- {sglang-0.2.9 → sglang-0.2.10}/sglang/srt/managers/controller_multi.py +0 -0
- {sglang-0.2.9 → sglang-0.2.10}/sglang/srt/managers/controller_single.py +0 -0
- {sglang-0.2.9 → sglang-0.2.10}/sglang/srt/managers/detokenizer_manager.py +0 -0
- {sglang-0.2.9 → sglang-0.2.10}/sglang/srt/managers/policy_scheduler.py +0 -0
- {sglang-0.2.9 → sglang-0.2.10}/sglang/srt/managers/tp_worker.py +0 -0
- {sglang-0.2.9 → sglang-0.2.10}/sglang/srt/mem_cache/base_cache.py +0 -0
- {sglang-0.2.9 → sglang-0.2.10}/sglang/srt/mem_cache/chunk_cache.py +0 -0
- {sglang-0.2.9 → sglang-0.2.10}/sglang/srt/mem_cache/flush_cache.py +0 -0
- {sglang-0.2.9 → sglang-0.2.10}/sglang/srt/mem_cache/radix_cache.py +0 -0
- {sglang-0.2.9 → sglang-0.2.10}/sglang/srt/mm_utils.py +0 -0
- {sglang-0.2.9 → sglang-0.2.10}/sglang/srt/model_executor/cuda_graph_runner.py +0 -0
- {sglang-0.2.9 → sglang-0.2.10}/sglang/srt/model_loader/model_loader.py +0 -0
- {sglang-0.2.9 → sglang-0.2.10}/sglang/srt/model_loader/utils.py +0 -0
- {sglang-0.2.9 → sglang-0.2.10}/sglang/srt/models/chatglm.py +0 -0
- {sglang-0.2.9 → sglang-0.2.10}/sglang/srt/models/commandr.py +0 -0
- {sglang-0.2.9 → sglang-0.2.10}/sglang/srt/models/dbrx.py +0 -0
- {sglang-0.2.9 → sglang-0.2.10}/sglang/srt/models/deepseek.py +0 -0
- {sglang-0.2.9 → sglang-0.2.10}/sglang/srt/models/gemma.py +0 -0
- {sglang-0.2.9 → sglang-0.2.10}/sglang/srt/models/gemma2.py +0 -0
- {sglang-0.2.9 → sglang-0.2.10}/sglang/srt/models/gpt_bigcode.py +0 -0
- {sglang-0.2.9 → sglang-0.2.10}/sglang/srt/models/grok.py +0 -0
- {sglang-0.2.9 → sglang-0.2.10}/sglang/srt/models/internlm2.py +0 -0
- {sglang-0.2.9 → sglang-0.2.10}/sglang/srt/models/llama2.py +0 -0
- {sglang-0.2.9 → sglang-0.2.10}/sglang/srt/models/llama_classification.py +0 -0
- {sglang-0.2.9 → sglang-0.2.10}/sglang/srt/models/llava.py +0 -0
- {sglang-0.2.9 → sglang-0.2.10}/sglang/srt/models/llavavid.py +0 -0
- {sglang-0.2.9 → sglang-0.2.10}/sglang/srt/models/minicpm.py +0 -0
- {sglang-0.2.9 → sglang-0.2.10}/sglang/srt/models/mistral.py +0 -0
- {sglang-0.2.9 → sglang-0.2.10}/sglang/srt/models/mixtral.py +0 -0
- {sglang-0.2.9 → sglang-0.2.10}/sglang/srt/models/mixtral_quant.py +0 -0
- {sglang-0.2.9 → sglang-0.2.10}/sglang/srt/models/qwen.py +0 -0
- {sglang-0.2.9 → sglang-0.2.10}/sglang/srt/models/qwen2.py +0 -0
- {sglang-0.2.9 → sglang-0.2.10}/sglang/srt/models/qwen2_moe.py +0 -0
- {sglang-0.2.9 → sglang-0.2.10}/sglang/srt/models/stablelm.py +0 -0
- {sglang-0.2.9 → sglang-0.2.10}/sglang/srt/models/yivl.py +0 -0
- {sglang-0.2.9 → sglang-0.2.10}/sglang/srt/sampling_params.py +0 -0
- {sglang-0.2.9 → sglang-0.2.10}/sglang/test/simple_eval_mmlu.py +0 -0
- {sglang-0.2.9 → sglang-0.2.10}/sglang/test/test_programs.py +0 -0
- {sglang-0.2.9 → sglang-0.2.10}/sglang.egg-info/dependency_links.txt +0 -0
- {sglang-0.2.9 → sglang-0.2.10}/sglang.egg-info/top_level.txt +0 -0
{sglang-0.2.9/sglang.egg-info → sglang-0.2.10}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.2.9
+Version: 0.2.10
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
                                  Version 2.0, January 2004
@@ -220,10 +220,12 @@ Requires-Dist: fastapi; extra == "srt"
 Requires-Dist: hf_transfer; extra == "srt"
 Requires-Dist: huggingface_hub; extra == "srt"
 Requires-Dist: interegular; extra == "srt"
+Requires-Dist: jsonlines; extra == "srt"
 Requires-Dist: packaging; extra == "srt"
 Requires-Dist: pillow; extra == "srt"
 Requires-Dist: psutil; extra == "srt"
 Requires-Dist: pydantic; extra == "srt"
+Requires-Dist: python-multipart; extra == "srt"
 Requires-Dist: torch; extra == "srt"
 Requires-Dist: uvicorn; extra == "srt"
 Requires-Dist: uvloop; extra == "srt"
@@ -299,8 +301,8 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
 
 ### Method 2: From source
 ```
-# Use the
-git clone -b v0.2.9 https://github.com/sgl-project/sglang.git
+# Use the last release branch
+git clone -b v0.2.10 https://github.com/sgl-project/sglang.git
 cd sglang
 
 pip install --upgrade pip
@@ -452,7 +454,7 @@ Instructions for supporting a new model are [here](https://github.com/sgl-projec
 
 ### Benchmark Performance
 
-- Benchmark a single static batch by running the following command without launching a server. The arguments are the same as
+- Benchmark a single static batch by running the following command without launching a server. The arguments are the same as for `launch_server.py`. Note that this is not a dynamic batching server, so it may run out of memory for a batch size that a real server can handle. A real server truncates the prefill into several batches, while this unit test does not. For accurate large batch testing, consider using `sglang.bench_serving`.
 ```
 python -m sglang.bench_latency --model-path meta-llama/Meta-Llama-3-8B-Instruct --batch 32 --input-len 256 --output-len 32
 ```
{sglang-0.2.9 → sglang-0.2.10}/README.md

@@ -54,8 +54,8 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
 
 ### Method 2: From source
 ```
-# Use the
-git clone -b v0.2.9 https://github.com/sgl-project/sglang.git
+# Use the last release branch
+git clone -b v0.2.10 https://github.com/sgl-project/sglang.git
 cd sglang
 
 pip install --upgrade pip
@@ -207,7 +207,7 @@ Instructions for supporting a new model are [here](https://github.com/sgl-projec
 
 ### Benchmark Performance
 
-- Benchmark a single static batch by running the following command without launching a server. The arguments are the same as
+- Benchmark a single static batch by running the following command without launching a server. The arguments are the same as for `launch_server.py`. Note that this is not a dynamic batching server, so it may run out of memory for a batch size that a real server can handle. A real server truncates the prefill into several batches, while this unit test does not. For accurate large batch testing, consider using `sglang.bench_serving`.
 ```
 python -m sglang.bench_latency --model-path meta-llama/Meta-Llama-3-8B-Instruct --batch 32 --input-len 256 --output-len 32
 ```
{sglang-0.2.9 → sglang-0.2.10}/pyproject.toml

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "sglang"
-version = "0.2.9"
+version = "0.2.10"
 description = "SGLang is yet another fast serving framework for large language models and vision language models."
 readme = "README.md"
 requires-python = ">=3.8"
@@ -20,8 +20,10 @@ dependencies = [
 ]
 
 [project.optional-dependencies]
-srt = ["aiohttp", "fastapi", "hf_transfer", "huggingface_hub", "interegular", "
-       "
+srt = ["aiohttp", "fastapi", "hf_transfer", "huggingface_hub", "interegular", "jsonlines",
+       "packaging", "pillow", "psutil", "pydantic", "python-multipart",
+       "torch", "uvicorn", "uvloop", "zmq",
+       "vllm==0.5.3.post1", "outlines>=0.0.44"]
 openai = ["openai>=1.0", "tiktoken"]
 anthropic = ["anthropic>=0.20.0"]
 litellm = ["litellm>=1.0.0"]
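The `srt` extra now also pulls in `jsonlines` (used by `sglang.bench_latency` for `--result-filename` output) and `python-multipart` (needed for form parsing in the HTTP server). A minimal, illustrative way to confirm the new dependencies are present in an environment, using only the standard library; the distribution names are taken from the pyproject.toml hunk above:

```python
# Illustrative check only; prints the installed version of each srt dependency, or a notice.
from importlib.metadata import PackageNotFoundError, version

for dist in ("jsonlines", "python-multipart", "vllm", "outlines"):
    try:
        print(f"{dist}=={version(dist)}")
    except PackageNotFoundError:
        print(f"{dist}: not installed")
```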
{sglang-0.2.9 → sglang-0.2.10}/sglang/bench_latency.py

@@ -1,13 +1,13 @@
 """
 Benchmark the latency of a given model. It accepts arguments similar to those of launch_server.py.
 
-# Usage (latency test):
+# Usage (latency test) with dummy weights:
 python -m sglang.bench_latency --model-path meta-llama/Meta-Llama-3-8B-Instruct --load-format dummy
 
 # Usage (correctness test):
 python -m sglang.bench_latency --model-path TinyLlama/TinyLlama-1.1B-Chat-v0.4 --correct
 
-### Reference output:
+### Reference output (of the correctness test above, can be gpu dependent):
 prefill logits (first half) tensor([[-10.0312, -9.5000, 0.8936, ..., -4.9414, -3.2402, -3.3633],
         [-10.0312, -9.5000, 0.8936, ..., -4.9414, -3.2402, -3.3633],
         [ -9.1875, -10.2500, 2.7109, ..., -4.3359, -4.0664, -4.1328]],
@@ -31,7 +31,9 @@ import dataclasses
 import logging
 import multiprocessing
 import time
+from typing import Tuple
 
+import jsonlines
 import numpy as np
 import torch
 import torch.distributed as dist
@@ -47,25 +49,34 @@ from sglang.srt.utils import suppress_other_loggers
 
 @dataclasses.dataclass
 class BenchArgs:
-    batch_size: int = 1
+    batch_size: Tuple[int] = (1,)
     input_len: int = 1024
     output_len: int = 4
+    result_filename: str = ""
     correctness_test: bool = False
     # This is only used for correctness test
     cut_len: int = 4
 
     @staticmethod
     def add_cli_args(parser: argparse.ArgumentParser):
-        parser.add_argument(
+        parser.add_argument(
+            "--batch-size", type=int, nargs="+", default=BenchArgs.batch_size
+        )
         parser.add_argument("--input-len", type=int, default=BenchArgs.input_len)
         parser.add_argument("--output-len", type=int, default=BenchArgs.output_len)
+        parser.add_argument(
+            "--result-filename", type=str, default=BenchArgs.result_filename
+        )
         parser.add_argument("--correctness-test", action="store_true")
        parser.add_argument("--cut-len", type=int, default=BenchArgs.cut_len)
 
     @classmethod
     def from_cli_args(cls, args: argparse.Namespace):
-
-
+        # use the default value's type to case the args into correct types.
+        attrs = [(attr.name, type(attr.default)) for attr in dataclasses.fields(cls)]
+        return cls(
+            **{attr: attr_type(getattr(args, attr)) for attr, attr_type in attrs}
+        )
 
 
 def load_model(server_args, tp_rank):
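The reworked `BenchArgs.from_cli_args` casts each parsed argparse value with the type of the corresponding dataclass default, which is what lets `--batch-size` accept several integers (`nargs="+"` yields a list, and `tuple(...)` converts it back to the `Tuple[int]` default's type). A standalone sketch of the same pattern; `DemoArgs` and its fields are illustrative, not part of sglang:

```python
import argparse
import dataclasses
from typing import Tuple


@dataclasses.dataclass
class DemoArgs:
    batch_size: Tuple[int] = (1,)
    input_len: int = 1024
    result_filename: str = ""

    @classmethod
    def from_cli_args(cls, args: argparse.Namespace):
        # type(f.default) is tuple / int / str, so tuple([1, 8, 16]) -> (1, 8, 16), etc.
        attrs = [(f.name, type(f.default)) for f in dataclasses.fields(cls)]
        return cls(**{name: cast(getattr(args, name)) for name, cast in attrs})


parser = argparse.ArgumentParser()
parser.add_argument("--batch-size", type=int, nargs="+", default=DemoArgs.batch_size)
parser.add_argument("--input-len", type=int, default=DemoArgs.input_len)
parser.add_argument("--result-filename", type=str, default=DemoArgs.result_filename)

print(DemoArgs.from_cli_args(parser.parse_args(["--batch-size", "1", "8", "16"])))
# DemoArgs(batch_size=(1, 8, 16), input_len=1024, result_filename='')
```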
@@ -93,7 +104,7 @@ def load_model(server_args, tp_rank):
     return model_runner, tokenizer
 
 
-def prepare_inputs(bench_args, tokenizer):
+def prepare_inputs_for_correctness_test(bench_args, tokenizer):
     prompts = [
         "The capital of France is",
         "The capital of the United Kindom is",
@@ -119,7 +130,9 @@ def prepare_inputs(bench_args, tokenizer):
     return input_ids, reqs
 
 
-def prepare_extend_inputs(bench_args, input_ids, reqs, model_runner):
+def prepare_extend_inputs_for_correctness_test(
+    bench_args, input_ids, reqs, model_runner
+):
     for i in range(len(reqs)):
         req = reqs[i]
         req.input_ids += input_ids[i][bench_args.cut_len :]
@@ -129,8 +142,8 @@ def prepare_extend_inputs(bench_args, input_ids, reqs, model_runner):
     return reqs
 
 
-def
-    input_ids = np.ones((
+def prepare_synthetic_inputs_for_latency_test(batch_size, input_len):
+    input_ids = np.ones((batch_size, input_len), dtype=np.int32)
     sampling_params = SamplingParams(
         temperature=0,
         max_new_tokens=BenchArgs.output_len,
@@ -179,7 +192,7 @@ def correctness_test(
     model_runner, tokenizer = load_model(server_args, tp_rank)
 
     # Prepare inputs
-    input_ids, reqs =
+    input_ids, reqs = prepare_inputs_for_correctness_test(bench_args, tokenizer)
 
     if bench_args.cut_len > 0:
         # Prefill
@@ -187,7 +200,9 @@ def correctness_test(
     rank_print("prefill logits (first half)", next_token_logits)
 
     # Prepare extend inputs
-    reqs =
+    reqs = prepare_extend_inputs_for_correctness_test(
+        bench_args, input_ids, reqs, model_runner
+    )
 
     # Extend
     next_token_ids, next_token_logits, batch = extend(reqs, model_runner)
@@ -205,6 +220,68 @@ def correctness_test(
         rank_print(tokenizer.decode(output_ids[i]))
 
 
+@torch.inference_mode()
+def latency_test_run_once(
+    model_runner, rank_print, reqs, batch_size, input_len, output_len
+):
+
+    # Clear the pools.
+    model_runner.req_to_token_pool.clear()
+    model_runner.token_to_kv_pool.clear()
+
+    measurement_results = {
+        "run_name": "before",
+        "batch_size": batch_size,
+        "input_len": input_len,
+        "output_len": output_len,
+    }
+
+    tot_latency = 0
+
+    # Prefill
+    torch.cuda.synchronize()
+    tic = time.time()
+    next_token_ids, _, batch = extend(reqs, model_runner)
+    torch.cuda.synchronize()
+    prefill_latency = time.time() - tic
+    tot_latency += prefill_latency
+    throughput = input_len * batch_size / prefill_latency
+    rank_print(
+        f"Prefill. latency: {prefill_latency:6.5f} s, throughput: {throughput:9.2f} token/s"
+    )
+    measurement_results["prefill_latency"] = prefill_latency
+    measurement_results["prefill_throughput"] = throughput
+
+    # Decode
+    for i in range(output_len):
+        torch.cuda.synchronize()
+        tic = time.time()
+        next_token_ids, _ = decode(next_token_ids, batch, model_runner)
+        torch.cuda.synchronize()
+        latency = time.time() - tic
+        tot_latency += latency
+        throughput = batch_size / latency
+        if i < 5:
+            rank_print(
+                f"Decode. latency: {latency:6.5f} s, throughput: {throughput:9.2f} token/s"
+            )
+    avg_decode_latency = (tot_latency - prefill_latency) / output_len
+    avg_decode_throughput = batch_size / avg_decode_latency
+    rank_print(
+        f"Decode. avg latency: {avg_decode_latency:6.5f} s, avg throughput: {avg_decode_throughput:9.2f} token/s"
+    )
+    measurement_results["avg_decode_latency"] = avg_decode_latency
+    measurement_results["avg_decode_throughput"] = avg_decode_throughput
+
+    throughput = (input_len + output_len) * batch_size / tot_latency
+    rank_print(
+        f"Total. latency: {tot_latency:6.3f} s, throughput: {throughput:9.2f} token/s"
+    )
+    measurement_results["total_latency"] = tot_latency
+    measurement_results["total_throughput"] = throughput
+    return measurement_results
+
+
 def latency_test(
     server_args,
     bench_args,
@@ -218,62 +295,36 @@ def latency_test(
         f"max_batch_size={model_runner.max_total_num_tokens // (bench_args.input_len + bench_args.output_len)}"
     )
 
-    #
-
-
-    def clear():
-        model_runner.req_to_token_pool.clear()
-        model_runner.token_to_kv_pool.clear()
-
-    @torch.inference_mode()
-    def run_once(output_len):
-        # Prefill
-        torch.cuda.synchronize()
-        tot_latency = 0
-        tic = time.time()
-        next_token_ids, _, batch = extend(reqs, model_runner)
-        torch.cuda.synchronize()
-        prefill_latency = time.time() - tic
-        tot_latency += prefill_latency
-        throughput = bench_args.input_len * bench_args.batch_size / prefill_latency
-        rank_print(
-            f"Prefill. latency: {prefill_latency:6.5f} s, throughput: {throughput:9.2f} token/s"
-        )
+    # To make this PR easier to review, for now, only do the first element in batch_size tuple.
+    bench_args.batch_size = bench_args.batch_size[0]
 
-
-
-
-
-            next_token_ids, _ = decode(next_token_ids, batch, model_runner)
-            torch.cuda.synchronize()
-            latency = time.time() - tic
-            tot_latency += latency
-            throughput = bench_args.batch_size / latency
-            if i < 5:
-                rank_print(
-                    f"Decode. latency: {latency:6.5f} s, throughput: {throughput:9.2f} token/s"
-                )
-        avg_decode_latency = (tot_latency - prefill_latency) / output_len
-        avg_decode_throughput = bench_args.batch_size / avg_decode_latency
-        rank_print(
-            f"Decode. avg latency: {avg_decode_latency:6.5f} s, avg throughput: {avg_decode_throughput:9.2f} token/s"
-        )
-
-        throughput = (
-            (bench_args.input_len + bench_args.output_len)
-            * bench_args.batch_size
-            / tot_latency
-        )
-        rank_print(
-            f"Total. latency: {tot_latency:6.3f} s, throughput: {throughput:9.2f} token/s"
-        )
+    # Prepare inputs
+    reqs = prepare_synthetic_inputs_for_latency_test(
+        bench_args.batch_size, bench_args.input_len
+    )
 
     # Warm up
-
-
+    latency_test_run_once(
+        model_runner, rank_print, reqs, bench_args.batch_size, bench_args.input_len, 4
+    )
 
     # Run again
-
+    result_list = []
+    result_list.append(
+        latency_test_run_once(
+            model_runner,
+            rank_print,
+            reqs,
+            bench_args.batch_size,
+            bench_args.input_len,
+            bench_args.output_len,
+        )
+    )
+
+    # Write results in jsonlines format.
+    if bench_args.result_filename:
+        with jsonlines.open(bench_args.result_filename, "a") as f:
+            f.write_all(result_list)
 
 
 def main(server_args, bench_args):
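When `--result-filename` is set, each run appends one JSON object per line (the `measurement_results` dict built in `latency_test_run_once`). A small, hedged sketch of reading such a file back with the same `jsonlines` package; `results.jsonl` is just an example path:

```python
import jsonlines

with jsonlines.open("results.jsonl") as reader:  # file produced via --result-filename
    for record in reader:
        print(
            f"bs={record['batch_size']:4d}  "
            f"prefill {record['prefill_throughput']:9.2f} tok/s  "
            f"decode {record['avg_decode_throughput']:9.2f} tok/s  "
            f"total {record['total_throughput']:9.2f} tok/s"
        )
```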
{sglang-0.2.9 → sglang-0.2.10}/sglang/check_env.py

@@ -13,6 +13,7 @@ import torch
 PACKAGE_LIST = [
     "sglang",
     "flashinfer",
+    "triton",
     "requests",
     "tqdm",
     "numpy",
@@ -30,6 +31,7 @@ PACKAGE_LIST = [
     "zmq",
     "vllm",
     "outlines",
+    "multipart",
     "openai",
     "tiktoken",
     "anthropic",
{sglang-0.2.9 → sglang-0.2.10}/sglang/lang/backend/runtime_endpoint.py

@@ -15,7 +15,6 @@ class RuntimeEndpoint(BaseBackend):
     def __init__(
         self,
         base_url: str,
-        auth_token: Optional[str] = None,
         api_key: Optional[str] = None,
         verify: Optional[str] = None,
     ):
@@ -23,13 +22,11 @@ class RuntimeEndpoint(BaseBackend):
         self.support_concate_and_append = True
 
         self.base_url = base_url
-        self.auth_token = auth_token
         self.api_key = api_key
         self.verify = verify
 
         res = http_request(
             self.base_url + "/get_model_info",
-            auth_token=self.auth_token,
             api_key=self.api_key,
             verify=self.verify,
         )
@@ -67,7 +64,6 @@ class RuntimeEndpoint(BaseBackend):
         res = http_request(
             self.base_url + "/generate",
             json={"text": prefix_str, "sampling_params": {"max_new_tokens": 0}},
-            auth_token=self.auth_token,
             api_key=self.api_key,
             verify=self.verify,
         )
@@ -79,7 +75,6 @@ class RuntimeEndpoint(BaseBackend):
         res = http_request(
             self.base_url + "/generate",
             json=data,
-            auth_token=self.auth_token,
             api_key=self.api_key,
             verify=self.verify,
         )
@@ -91,7 +86,6 @@ class RuntimeEndpoint(BaseBackend):
         res = http_request(
             self.base_url + "/generate",
             json=data,
-            auth_token=self.auth_token,
             api_key=self.api_key,
             verify=self.verify,
         )
@@ -139,7 +133,6 @@ class RuntimeEndpoint(BaseBackend):
         res = http_request(
             self.base_url + "/generate",
             json=data,
-            auth_token=self.auth_token,
             api_key=self.api_key,
             verify=self.verify,
         )
@@ -193,7 +186,6 @@ class RuntimeEndpoint(BaseBackend):
             self.base_url + "/generate",
             json=data,
             stream=True,
-            auth_token=self.auth_token,
             api_key=self.api_key,
             verify=self.verify,
         )
@@ -225,7 +217,6 @@ class RuntimeEndpoint(BaseBackend):
         res = http_request(
             self.base_url + "/generate",
             json=data,
-            auth_token=self.auth_token,
             api_key=self.api_key,
             verify=self.verify,
         )
@@ -243,7 +234,6 @@ class RuntimeEndpoint(BaseBackend):
         res = http_request(
             self.base_url + "/generate",
             json=data,
-            auth_token=self.auth_token,
             api_key=self.api_key,
             verify=self.verify,
         )
@@ -267,7 +257,6 @@ class RuntimeEndpoint(BaseBackend):
         res = http_request(
             self.base_url + "/concate_and_append_request",
             json={"src_rids": src_rids, "dst_rid": dst_rid},
-            auth_token=self.auth_token,
             api_key=self.api_key,
             verify=self.verify,
         )
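With `auth_token` removed, `RuntimeEndpoint` authenticates only via `api_key` (forwarded to `http_request`). A hedged usage sketch against a locally launched server; the URL, port, and the separate launch step are assumptions, not part of this diff:

```python
# Assumes a server was started separately, e.g.:
#   python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000
import sglang as sgl
from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint

# auth_token is gone in 0.2.10; only api_key and verify remain.
backend = RuntimeEndpoint("http://127.0.0.1:30000", api_key=None)
sgl.set_default_backend(backend)
```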
{sglang-0.2.9 → sglang-0.2.10}/sglang/srt/hf_transformers_utils.py

@@ -19,7 +19,7 @@ import functools
 import json
 import os
 import warnings
-from typing import AbstractSet, Collection, Dict, Literal, Optional, Type, Union
+from typing import AbstractSet, Collection, Dict, List, Literal, Optional, Type, Union
 
 from huggingface_hub import snapshot_download
 from transformers import (
@@ -259,7 +259,7 @@ class TiktokenTokenizer:
             Literal["all"], AbstractSet[str]
         ] = set(),  # noqa: B006
         disallowed_special: Union[Literal["all"], Collection[str]] = "all",
-    ) ->
+    ) -> List[int]:
         if isinstance(allowed_special, set):
             allowed_special |= self._default_allowed_special
         return tiktoken.Encoding.encode(
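The annotation change only documents that `TiktokenTokenizer.encode` returns a plain `List[int]`, matching upstream `tiktoken`. A hedged illustration with stock `tiktoken` rather than sglang's wrapper:

```python
from typing import List

import tiktoken

enc = tiktoken.get_encoding("cl100k_base")  # any tiktoken encoding works for the illustration
ids: List[int] = enc.encode("SGLang is fast.", allowed_special=set(), disallowed_special="all")
print(ids, enc.decode(ids))
```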
{sglang-0.2.9 → sglang-0.2.10}/sglang/srt/layers/extend_attention.py

@@ -57,6 +57,8 @@ def _fwd_kernel(
     stride_buf_vh,
     stride_req_to_tokens_b,
     BLOCK_DMODEL: tl.constexpr,
+    BLOCK_DPE: tl.constexpr,
+    BLOCK_DV: tl.constexpr,
     BLOCK_M: tl.constexpr,
     BLOCK_N: tl.constexpr,
     logit_cap: tl.constexpr,
@@ -75,8 +77,10 @@ def _fwd_kernel(
     cur_batch_req_idx = tl.load(B_req_idx + cur_seq)
 
     offs_d = tl.arange(0, BLOCK_DMODEL)
+    offs_dv = tl.arange(0, BLOCK_DV)
     offs_m = tl.arange(0, BLOCK_M)
     mask_m = (cur_block_m * BLOCK_M + offs_m) < cur_seq_len_extend
+
     offs_q = (
         (cur_seq_extend_start_contiguous + cur_block_m * BLOCK_M + offs_m[:, None])
         * stride_qbs
@@ -85,10 +89,20 @@ def _fwd_kernel(
     )
     q = tl.load(Q_Extend + offs_q, mask=mask_m[:, None], other=0.0)
 
+    if BLOCK_DPE > 0:
+        offs_dpe = BLOCK_DMODEL + tl.arange(0, BLOCK_DPE)
+        offs_qpe = (
+            (cur_seq_extend_start_contiguous + cur_block_m * BLOCK_M + offs_m[:, None])
+            * stride_qbs
+            + cur_head * stride_qh
+            + offs_dpe[None, :]
+        )
+        qpe = tl.load(Q_Extend + offs_qpe, mask=mask_m[:, None], other=0.0)
+
     # stage1: compute scores with prefix
     offs_n = tl.arange(0, BLOCK_N)
 
-    acc = tl.zeros([BLOCK_M,
+    acc = tl.zeros([BLOCK_M, BLOCK_DV], dtype=tl.float32)
     deno = tl.zeros([BLOCK_M], dtype=tl.float32)
     e_max = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf")
 
@@ -110,6 +124,18 @@ def _fwd_kernel(
 
         qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
         qk += tl.dot(q, k)
+        if BLOCK_DPE > 0:
+            offs_kpe = (
+                offs_kv_loc[None, :] * stride_buf_kbs
+                + cur_kv_head * stride_buf_kh
+                + offs_dpe[:, None]
+            )
+            kpe = tl.load(
+                K_Buffer + offs_kpe,
+                mask=mask_n[None, :],
+                other=0.0,
+            )
+            qk += tl.dot(qpe, kpe)
         qk *= sm_scale
 
         if logit_cap > 0:
@@ -125,7 +151,7 @@ def _fwd_kernel(
         offs_buf_v = (
             offs_kv_loc[:, None] * stride_buf_vbs
             + cur_kv_head * stride_buf_vh
-            +
+            + offs_dv[None, :]
         )
         v = tl.load(V_Buffer + offs_buf_v, mask=mask_n[:, None], other=0.0)
         p = p.to(v.dtype)
@@ -150,6 +176,21 @@ def _fwd_kernel(
 
         qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
         qk += tl.dot(q, k)
+
+        if BLOCK_DPE > 0:
+            offs_kpe = (
+                (cur_seq_extend_start_contiguous + start_n + offs_n[None, :])
+                * stride_kbs
+                + cur_kv_head * stride_kh
+                + offs_dpe[:, None]
+            )
+            kpe = tl.load(
+                K_Extend + offs_kpe,
+                mask=mask_n[None, :],
+                other=0.0,
+            )
+            qk += tl.dot(qpe, kpe)
+
         qk *= sm_scale
 
         if logit_cap > 0:
@@ -169,7 +210,7 @@ def _fwd_kernel(
         offs_v = (
             (cur_seq_extend_start_contiguous + start_n + offs_n[:, None]) * stride_vbs
             + cur_kv_head * stride_vh
-            +
+            + offs_dv[None, :]
         )
         v = tl.load(V_Extend + offs_v, mask=mask_n[:, None], other=0.0)
         p = p.to(v.dtype)
@@ -181,7 +222,7 @@ def _fwd_kernel(
         (cur_seq_extend_start_contiguous + cur_block_m * BLOCK_M + offs_m[:, None])
         * stride_obs
         + cur_head * stride_oh
-        +
+        + offs_dv[None, :]
     )
     tl.store(O_Extend + offs_o, acc / deno[:, None], mask=mask_m[:, None])
 
@@ -217,8 +258,17 @@ def extend_attention_fwd(
         o_extend.shape[-1],
     )
 
-    assert Lq == Lk and
-    assert Lq in {16, 32, 64, 128, 256}
+    assert Lq == Lk and Lv == Lo
+    assert Lq in {16, 32, 64, 128, 256, 576}
+    assert Lv in {16, 32, 64, 128, 256, 512}
+
+    if Lq == 576:
+        BLOCK_DMODEL = 512
+        BLOCK_DPE = 64
+    else:
+        BLOCK_DMODEL = Lq
+        BLOCK_DPE = 0
+    BLOCK_DV = Lv
 
     if CUDA_CAPABILITY[0] >= 8:
         BLOCK_M, BLOCK_N = (128, 128) if Lq <= 128 else (64, 64)
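The `Lq == 576` branch splits each query/key head into a 512-dim part (`BLOCK_DMODEL`) and a 64-dim positional part (`BLOCK_DPE`), with 512-dim values (`BLOCK_DV`); the kernel then accumulates `tl.dot(q, k)` and `tl.dot(qpe, kpe)` into the same score tile. This layout matches the MLA-style heads of `deepseek_v2.py`, which is also updated in this release. Below is a hedged PyTorch reference of just that score split, not of the Triton kernel; shapes and `sm_scale` are illustrative:

```python
import torch

BLOCK_DMODEL, BLOCK_DPE, BLOCK_DV = 512, 64, 512
m, n = 8, 16                                        # query tokens, key/value tokens
sm_scale = 1.0 / (BLOCK_DMODEL + BLOCK_DPE) ** 0.5  # illustrative; the kernel takes sm_scale as an argument

q = torch.randn(m, BLOCK_DMODEL + BLOCK_DPE)   # 576-dim queries
k = torch.randn(n, BLOCK_DMODEL + BLOCK_DPE)   # 576-dim keys
v = torch.randn(n, BLOCK_DV)                   # 512-dim values

qk = q[:, :BLOCK_DMODEL] @ k[:, :BLOCK_DMODEL].T    # main part, as in tl.dot(q, k)
qk += q[:, BLOCK_DMODEL:] @ k[:, BLOCK_DMODEL:].T   # positional part, as in tl.dot(qpe, kpe)
out = torch.softmax(qk * sm_scale, dim=-1) @ v      # (m, BLOCK_DV)
print(out.shape)
```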
@@ -260,7 +310,9 @@ def extend_attention_fwd(
         v_buffer.stride(0),
         v_buffer.stride(1),
         req_to_tokens.stride(0),
-        BLOCK_DMODEL=
+        BLOCK_DMODEL=BLOCK_DMODEL,
+        BLOCK_DPE=BLOCK_DPE,
+        BLOCK_DV=BLOCK_DV,
         BLOCK_M=BLOCK_M,
         BLOCK_N=BLOCK_N,
         num_warps=num_warps,