sglang 0.2.15__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
Files changed (81)
  1. sglang/bench_latency.py +10 -6
  2. sglang/bench_serving.py +33 -38
  3. sglang/global_config.py +0 -4
  4. sglang/lang/backend/runtime_endpoint.py +13 -6
  5. sglang/lang/interpreter.py +1 -1
  6. sglang/launch_server.py +3 -6
  7. sglang/launch_server_llavavid.py +7 -8
  8. sglang/srt/{model_config.py → configs/model_config.py} +5 -0
  9. sglang/srt/constrained/__init__.py +2 -0
  10. sglang/srt/constrained/fsm_cache.py +29 -38
  11. sglang/srt/constrained/jump_forward.py +0 -1
  12. sglang/srt/conversation.py +4 -1
  13. sglang/srt/hf_transformers_utils.py +2 -4
  14. sglang/srt/layers/attention_backend.py +480 -0
  15. sglang/srt/layers/flashinfer_utils.py +235 -0
  16. sglang/srt/layers/logits_processor.py +64 -77
  17. sglang/srt/layers/radix_attention.py +11 -161
  18. sglang/srt/layers/sampler.py +40 -35
  19. sglang/srt/layers/torchao_utils.py +75 -0
  20. sglang/srt/layers/{decode_attention.py → triton_attention/decode_attention.py} +67 -63
  21. sglang/srt/layers/{extend_attention.py → triton_attention/extend_attention.py} +40 -132
  22. sglang/srt/layers/{prefill_attention.py → triton_attention/prefill_attention.py} +13 -7
  23. sglang/srt/lora/lora.py +403 -0
  24. sglang/srt/lora/lora_config.py +43 -0
  25. sglang/srt/lora/lora_manager.py +256 -0
  26. sglang/srt/managers/controller_multi.py +1 -5
  27. sglang/srt/managers/controller_single.py +0 -5
  28. sglang/srt/managers/io_struct.py +16 -1
  29. sglang/srt/managers/policy_scheduler.py +122 -5
  30. sglang/srt/managers/schedule_batch.py +110 -74
  31. sglang/srt/managers/tokenizer_manager.py +24 -15
  32. sglang/srt/managers/tp_worker.py +181 -115
  33. sglang/srt/model_executor/cuda_graph_runner.py +60 -133
  34. sglang/srt/model_executor/forward_batch_info.py +35 -312
  35. sglang/srt/model_executor/model_runner.py +118 -141
  36. sglang/srt/models/baichuan.py +416 -0
  37. sglang/srt/models/chatglm.py +6 -8
  38. sglang/srt/models/commandr.py +1 -5
  39. sglang/srt/models/dbrx.py +1 -5
  40. sglang/srt/models/deepseek.py +1 -5
  41. sglang/srt/models/deepseek_v2.py +1 -5
  42. sglang/srt/models/exaone.py +8 -43
  43. sglang/srt/models/gemma.py +1 -5
  44. sglang/srt/models/gemma2.py +1 -5
  45. sglang/srt/models/gpt_bigcode.py +1 -5
  46. sglang/srt/models/grok.py +1 -5
  47. sglang/srt/models/internlm2.py +1 -5
  48. sglang/srt/models/{llama2.py → llama.py} +48 -26
  49. sglang/srt/models/llama_classification.py +14 -40
  50. sglang/srt/models/llama_embedding.py +7 -6
  51. sglang/srt/models/llava.py +38 -16
  52. sglang/srt/models/llavavid.py +7 -8
  53. sglang/srt/models/minicpm.py +1 -5
  54. sglang/srt/models/minicpm3.py +665 -0
  55. sglang/srt/models/mistral.py +2 -3
  56. sglang/srt/models/mixtral.py +6 -5
  57. sglang/srt/models/mixtral_quant.py +1 -5
  58. sglang/srt/models/qwen.py +1 -5
  59. sglang/srt/models/qwen2.py +1 -5
  60. sglang/srt/models/qwen2_moe.py +6 -5
  61. sglang/srt/models/stablelm.py +1 -5
  62. sglang/srt/models/xverse.py +375 -0
  63. sglang/srt/models/xverse_moe.py +445 -0
  64. sglang/srt/openai_api/adapter.py +65 -46
  65. sglang/srt/openai_api/protocol.py +11 -3
  66. sglang/srt/sampling/sampling_batch_info.py +67 -58
  67. sglang/srt/server.py +24 -14
  68. sglang/srt/server_args.py +130 -28
  69. sglang/srt/utils.py +12 -0
  70. sglang/test/few_shot_gsm8k.py +132 -0
  71. sglang/test/runners.py +114 -22
  72. sglang/test/test_programs.py +70 -0
  73. sglang/test/test_utils.py +89 -1
  74. sglang/utils.py +38 -4
  75. sglang/version.py +1 -1
  76. {sglang-0.2.15.dist-info → sglang-0.3.1.dist-info}/METADATA +31 -18
  77. sglang-0.3.1.dist-info/RECORD +129 -0
  78. {sglang-0.2.15.dist-info → sglang-0.3.1.dist-info}/WHEEL +1 -1
  79. sglang-0.2.15.dist-info/RECORD +0 -118
  80. {sglang-0.2.15.dist-info → sglang-0.3.1.dist-info}/LICENSE +0 -0
  81. {sglang-0.2.15.dist-info → sglang-0.3.1.dist-info}/top_level.txt +0 -0
sglang/srt/layers/{extend_attention.py → triton_attention/extend_attention.py}
@@ -15,14 +15,14 @@ limitations under the License.
 
 """
 Memory-efficient attention for prefill.
-It supporst page size = 1 and prefill with KV cache (i.e. extend).
+It supports page size = 1 and prefill with KV cache (i.e. extend).
 """
 
 import torch
 import triton
 import triton.language as tl
 
-from sglang.srt.layers.prefill_attention import context_attention_fwd
+from sglang.srt.layers.triton_attention.prefill_attention import context_attention_fwd
 
 CUDA_CAPABILITY = torch.cuda.get_device_capability()
 
@@ -61,12 +61,14 @@ def _fwd_kernel(
     stride_buf_vbs,
     stride_buf_vh,
     stride_req_to_tokens_b,
+    logit_cap: tl.constexpr,
+    Lq: tl.constexpr,
+    Lv: tl.constexpr,
     BLOCK_DMODEL: tl.constexpr,
     BLOCK_DPE: tl.constexpr,
     BLOCK_DV: tl.constexpr,
     BLOCK_M: tl.constexpr,
     BLOCK_N: tl.constexpr,
-    logit_cap: tl.constexpr,
 ):
     cur_seq = tl.program_id(0)
     cur_head = tl.program_id(1)
@@ -86,13 +88,18 @@ def _fwd_kernel(
     offs_m = tl.arange(0, BLOCK_M)
     mask_m = (cur_block_m * BLOCK_M + offs_m) < cur_seq_len_extend
 
+    mask_d = offs_d < Lq
+    mask_dv = offs_dv < Lv
+
     offs_q = (
         (cur_seq_extend_start_contiguous + cur_block_m * BLOCK_M + offs_m[:, None])
         * stride_qbs
         + cur_head * stride_qh
         + offs_d[None, :]
     )
-    q = tl.load(Q_Extend + offs_q, mask=mask_m[:, None], other=0.0)
+    q = tl.load(
+        Q_Extend + offs_q, mask=(mask_m[:, None]) & (mask_d[None, :]), other=0.0
+    )
 
     if BLOCK_DPE > 0:
         offs_dpe = BLOCK_DMODEL + tl.arange(0, BLOCK_DPE)
@@ -104,7 +111,7 @@ def _fwd_kernel(
         )
         qpe = tl.load(Q_Extend + offs_qpe, mask=mask_m[:, None], other=0.0)
 
-    # stage1: compute scores with prefix
+    # stage 1: compute scores with prefix
     offs_n = tl.arange(0, BLOCK_N)
 
     acc = tl.zeros([BLOCK_M, BLOCK_DV], dtype=tl.float32)
@@ -125,7 +132,9 @@ def _fwd_kernel(
             + cur_kv_head * stride_buf_kh
             + offs_d[:, None]
         )
-        k = tl.load(K_Buffer + offs_buf_k, mask=mask_n[None, :], other=0.0)
+        k = tl.load(
+            K_Buffer + offs_buf_k, mask=(mask_n[None, :]) & (mask_d[:, None]), other=0.0
+        )
 
         qk = tl.dot(q.to(k.dtype), k)
         if BLOCK_DPE > 0:
@@ -157,13 +166,15 @@ def _fwd_kernel(
             + cur_kv_head * stride_buf_vh
             + offs_dv[None, :]
         )
-        v = tl.load(V_Buffer + offs_buf_v, mask=mask_n[:, None], other=0.0)
+        v = tl.load(
+            V_Buffer + offs_buf_v, mask=mask_n[:, None] & mask_dv[None, :], other=0.0
+        )
         p = p.to(v.dtype)
         acc = acc * re_scale[:, None] + tl.dot(p, v)
 
         e_max = n_e_max
 
-    # stage2: compute the trianlge part
+    # stage 2: compute the trianlge part
 
     cur_block_m_end = tl.minimum(cur_seq_len_extend, (cur_block_m + 1) * BLOCK_M)
     for start_n in range(0, cur_block_m_end, BLOCK_N):
@@ -176,7 +187,9 @@ def _fwd_kernel(
             + cur_kv_head * stride_kh
             + offs_d[:, None]
         )
-        k = tl.load(K_Extend + offs_k, mask=mask_n[None, :], other=0.0)
+        k = tl.load(
+            K_Extend + offs_k, mask=(mask_n[None, :]) & (mask_d[:, None]), other=0.0
+        )
 
         qk = tl.dot(q, k, out_dtype=tl.float32)
         if BLOCK_DPE > 0:
@@ -214,7 +227,9 @@ def _fwd_kernel(
             + cur_kv_head * stride_vh
             + offs_dv[None, :]
         )
-        v = tl.load(V_Extend + offs_v, mask=mask_n[:, None], other=0.0)
+        v = tl.load(
+            V_Extend + offs_v, mask=mask_n[:, None] & mask_dv[None, :], other=0.0
+        )
         p = p.to(v.dtype)
         acc = acc * re_scale[:, None] + tl.dot(p, v)
 
@@ -226,7 +241,9 @@ def _fwd_kernel(
         + cur_head * stride_oh
         + offs_dv[None, :]
     )
-    tl.store(O_Extend + offs_o, acc / deno[:, None], mask=mask_m[:, None])
+    tl.store(
+        O_Extend + offs_o, acc / deno[:, None], mask=mask_m[:, None] & mask_dv[None, :]
+    )
 
 
 def extend_attention_fwd(
@@ -238,39 +255,34 @@ def extend_attention_fwd(
     v_buffer,
     req_to_tokens,
     b_req_idx,
-    b_start_loc,
     b_seq_len,
-    b_seq_len_prefix,
-    b_start_loc_extend,
     b_seq_len_extend,
-    max_len_in_batch,
+    b_start_loc_extend,
     max_len_extend,
     sm_scale=None,
-    logit_cap=-1,
+    logit_cap=0.0,
 ):
     """
     q_extend, k_extend, v_extend, o_extend: contiguous tensors
 
     k_buffer, v_buffer: (prefix + extend) tensors in mem_manager
     """
-    Lq, Lk, Lv, Lo = (
+    Lq, Lk, Lv = (
        q_extend.shape[-1],
        k_extend.shape[-1],
        v_extend.shape[-1],
-       o_extend.shape[-1],
    )
 
-    assert Lq == Lk and Lv == Lo
-    assert Lq in {16, 32, 64, 128, 256, 576}
-    assert Lv in {16, 32, 64, 128, 256, 512}
-
     if Lq == 576:
         BLOCK_DMODEL = 512
         BLOCK_DPE = 64
+    elif Lq == 288:
+        BLOCK_DMODEL = 256
+        BLOCK_DPE = 32
     else:
-        BLOCK_DMODEL = Lq
+        BLOCK_DMODEL = triton.next_power_of_2(Lq)
         BLOCK_DPE = 0
-    BLOCK_DV = Lv
+    BLOCK_DV = triton.next_power_of_2(Lv)
 
     if CUDA_CAPABILITY[0] >= 9:
         if Lq <= 256:
@@ -287,7 +299,7 @@ def extend_attention_fwd(
     else:
         BLOCK_M, BLOCK_N = (64, 64) if Lq <= 128 else (32, 32)
 
-    sm_scale = 1.0 / (Lq**0.5) if sm_scale is None else sm_scale
+    sm_scale = sm_scale or 1.0 / (Lq**0.5)
     batch_size, head_num = b_seq_len.shape[0], q_extend.shape[1]
     kv_group_num = q_extend.shape[1] // k_extend.shape[1]
 
@@ -322,25 +334,24 @@ def extend_attention_fwd(
         v_buffer.stride(0),
         v_buffer.stride(1),
         req_to_tokens.stride(0),
+        logit_cap=logit_cap,
         BLOCK_DMODEL=BLOCK_DMODEL,
         BLOCK_DPE=BLOCK_DPE,
         BLOCK_DV=BLOCK_DV,
         BLOCK_M=BLOCK_M,
         BLOCK_N=BLOCK_N,
+        Lq=Lq,
+        Lv=Lv,
         num_warps=num_warps,
         num_stages=num_stages,
-        logit_cap=logit_cap,
     )
 
 
 def redundant_attention(
     q_extend,
-    k_extend,
-    v_extend,
     o_extend,
     k_buffer,
     v_buffer,
-    req_to_tokens,
     b_req_idx,
     b_start_loc,
     b_seq_len,
@@ -371,106 +382,3 @@ def redundant_attention(
         pl, pr = b_start_loc[i] + b_seq_len_prefix[i], b_start_loc[i] + b_seq_len[i]
         o_extend[pt : pt + cur_seq_len_extend] = o_buffer[pl:pr]
         pt += cur_seq_len_extend
-
-
-def test():
-    torch.manual_seed(0)
-
-    B, N_CTX, H_Q, H_KV, D = 19, 12331, 12, 4, 128
-    dtype = torch.float16
-
-    b_seq_len_prefix = torch.randint(
-        1, N_CTX // 2, (B,), dtype=torch.int32, device="cuda"
-    )
-    b_seq_len_extend = torch.randint(
-        1, N_CTX // 2, (B,), dtype=torch.int32, device="cuda"
-    )
-    b_seq_len = b_seq_len_prefix + b_seq_len_extend
-    max_len_in_batch = torch.max(b_seq_len, 0)[0].item()
-
-    b_req_idx = torch.arange(B, dtype=torch.int32, device="cuda")
-    req_to_tokens = torch.empty((B, max_len_in_batch), dtype=torch.int32, device="cuda")
-    b_start_loc = torch.zeros((B,), dtype=torch.int32, device="cuda")
-    b_start_loc[1:] = torch.cumsum(b_seq_len[:-1], 0)
-    b_start_loc_extend = torch.zeros((B,), dtype=torch.int32, device="cuda")
-    b_start_loc_extend[1:] = torch.cumsum(b_seq_len_extend[:-1], 0)
-    for i in range(B):
-        req_to_tokens[i, : b_seq_len[i]] = torch.arange(
-            b_start_loc[i], b_start_loc[i] + b_seq_len[i]
-        )
-
-    total_token_num = torch.sum(b_seq_len).item()
-    extend_token_num = torch.sum(b_seq_len_extend).item()
-    k_buffer = torch.empty(
-        (total_token_num, H_KV, D), dtype=dtype, device="cuda"
-    ).normal_(mean=0.1, std=0.2)
-    v_buffer = torch.empty(
-        (total_token_num, H_KV, D), dtype=dtype, device="cuda"
-    ).normal_(mean=0.1, std=0.2)
-
-    k_extend = torch.empty((extend_token_num, H_KV, D), dtype=dtype, device="cuda")
-    v_extend = torch.empty((extend_token_num, H_KV, D), dtype=dtype, device="cuda")
-    q_extend = torch.empty((extend_token_num, H_Q, D), dtype=dtype, device="cuda")
-    for i in range(B):
-        extend_start_in_buffer = b_start_loc[i] + b_seq_len_prefix[i]
-        extend_end_in_buffer = b_start_loc[i] + b_seq_len[i]
-        extend_start = b_start_loc_extend[i]
-        extend_end = b_start_loc_extend[i] + b_seq_len_extend[i]
-        k_extend[extend_start:extend_end] = k_buffer[
-            extend_start_in_buffer:extend_end_in_buffer
-        ]
-        v_extend[extend_start:extend_end] = v_buffer[
-            extend_start_in_buffer:extend_end_in_buffer
-        ]
-        q_extend[extend_start:extend_end] = torch.empty(
-            (b_seq_len_extend[i], H_Q, D), dtype=dtype, device="cuda"
-        ).normal_(mean=0.1, std=0.2)
-
-    o_extend = torch.empty((extend_token_num, H_Q, D), dtype=dtype, device="cuda")
-    o_redundant = torch.empty((extend_token_num, H_Q, D), dtype=dtype, device="cuda")
-
-    b_seq_len_extend = b_seq_len - b_seq_len_prefix
-    b_start_loc_extend = torch.zeros_like(b_seq_len)
-    b_start_loc_extend[1:] = torch.cumsum(b_seq_len_extend[:-1], 0)
-    max_len_extend = torch.max(b_seq_len_extend, 0)[0].item()
-    extend_attention_fwd(
-        q_extend,
-        k_extend,
-        v_extend,
-        o_extend,
-        k_buffer,
-        v_buffer,
-        req_to_tokens,
-        b_req_idx,
-        b_start_loc,
-        b_seq_len,
-        b_seq_len_prefix,
-        b_start_loc_extend,
-        b_seq_len_extend,
-        max_len_in_batch,
-        max_len_extend,
-    )
-
-    redundant_attention(
-        q_extend,
-        k_extend,
-        v_extend,
-        o_redundant,
-        k_buffer,
-        v_buffer,
-        req_to_tokens,
-        b_req_idx,
-        b_start_loc,
-        b_seq_len,
-        b_seq_len_prefix,
-        max_len_in_batch,
-    )
-
-    print("Mean: ", torch.mean(torch.abs(o_extend - o_redundant)))
-    print("Max: ", torch.max(torch.abs(o_extend - o_redundant)))
-
-    assert torch.allclose(o_extend, o_redundant, rtol=1e-2)
-
-
-if __name__ == "__main__":
-    test()
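The in-file test() removed above also serves as a reference for how the reworked entry point is called. Below is a minimal sketch of a call against the new signature; the argument order and defaults are taken from the diff, while the batch size, head counts, sequence lengths, and the non-power-of-two head dim (96) are hypothetical values chosen only to exercise the new next_power_of_2 padding and Lq/Lv masking, with tensor contents that are placeholders rather than a semantically consistent KV cache.

import torch

from sglang.srt.layers.triton_attention.extend_attention import extend_attention_fwd

# Hypothetical sizes for illustration only; D = 96 is not a power of two.
B, H_Q, H_KV, D = 2, 8, 2, 96
prefix_len, extend_len = 16, 4
seq_len = prefix_len + extend_len
total_tokens, extend_tokens = B * seq_len, B * extend_len
dtype, device = torch.float16, "cuda"

q_extend = torch.randn(extend_tokens, H_Q, D, dtype=dtype, device=device)
k_extend = torch.randn(extend_tokens, H_KV, D, dtype=dtype, device=device)
v_extend = torch.randn(extend_tokens, H_KV, D, dtype=dtype, device=device)
o_extend = torch.empty_like(q_extend)
k_buffer = torch.randn(total_tokens, H_KV, D, dtype=dtype, device=device)
v_buffer = torch.randn(total_tokens, H_KV, D, dtype=dtype, device=device)

# One request per row; token slots laid out contiguously per request.
req_to_tokens = torch.arange(total_tokens, dtype=torch.int32, device=device).view(B, seq_len)
b_req_idx = torch.arange(B, dtype=torch.int32, device=device)
b_seq_len = torch.full((B,), seq_len, dtype=torch.int32, device=device)
b_seq_len_extend = torch.full((B,), extend_len, dtype=torch.int32, device=device)
b_start_loc_extend = torch.arange(0, extend_tokens, extend_len, dtype=torch.int32, device=device)

extend_attention_fwd(
    q_extend, k_extend, v_extend, o_extend,
    k_buffer, v_buffer,
    req_to_tokens, b_req_idx,
    b_seq_len, b_seq_len_extend, b_start_loc_extend,
    max_len_extend=extend_len,
)  # sm_scale defaults to 1/sqrt(Lq); logit_cap's default changed from -1 to 0.0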
sglang/srt/layers/{prefill_attention.py → triton_attention/prefill_attention.py}
@@ -48,6 +48,7 @@ def _fwd_kernel(
     BLOCK_M: tl.constexpr,
     BLOCK_DMODEL: tl.constexpr,
     BLOCK_N: tl.constexpr,
+    Lk: tl.constexpr,
 ):
     cur_batch = tl.program_id(0)
     cur_head = tl.program_id(1)
@@ -72,7 +73,11 @@
     off_k = offs_n[None, :] * stride_kbs + cur_kv_head * stride_kh + offs_d[:, None]
     off_v = offs_n[:, None] * stride_vbs + cur_kv_head * stride_vh + offs_d[None, :]
 
-    q = tl.load(Q + off_q, mask=offs_m[:, None] < cur_batch_seq_len, other=0.0)
+    mask_d = offs_d < Lk
+
+    q = tl.load(
+        Q + off_q, mask=(offs_m[:, None] < cur_batch_seq_len) & (mask_d), other=0.0
+    )
 
     k_ptrs = K + off_k
     v_ptrs = V + off_v
@@ -89,7 +94,7 @@
         # -- compute qk ----
         k = tl.load(
             k_ptrs + (cur_batch_in_all_start_index + start_n) * stride_kbs,
-            mask=(start_n + offs_n[None, :]) < cur_batch_seq_len,
+            mask=((start_n + offs_n[None, :]) < cur_batch_seq_len) & (mask_d[:, None]),
             other=0.0,
         )
         # mask = tl.load(mask_ptrs + start_n, mask=start_n + offs_n < cur_batch_end_loc, other=0.0)
@@ -118,7 +123,7 @@
         # update acc
         v = tl.load(
             v_ptrs + (cur_batch_in_all_start_index + start_n) * stride_vbs,
-            mask=(start_n + offs_n[:, None]) < cur_batch_seq_len,
+            mask=((start_n + offs_n[:, None]) < cur_batch_seq_len) & (mask_d[None, :]),
             other=0.0,
         )
 
@@ -134,7 +139,9 @@
         + offs_d[None, :]
     )
     out_ptrs = Out + off_o
-    tl.store(out_ptrs, acc, mask=offs_m[:, None] < cur_batch_seq_len)
+    tl.store(
+        out_ptrs, acc, mask=(offs_m[:, None] < cur_batch_seq_len) & (mask_d[None, :])
+    )
 
 
 def context_attention_fwd(q, k, v, o, b_start_loc, b_seq_len, max_input_len):
@@ -144,8 +151,6 @@ def context_attention_fwd(q, k, v, o, b_start_loc, b_seq_len, max_input_len):
     BLOCK = 64
 
     Lq, Lk, Lv = q.shape[-1], k.shape[-1], v.shape[-1]
-    assert Lq == Lk and Lk == Lv
-    assert Lk in {16, 32, 64, 128, 256}
 
     sm_scale = 1.0 / (Lq**0.5)
     batch, head = b_seq_len.shape[0], q.shape[1]
@@ -172,8 +177,9 @@ def context_attention_fwd(q, k, v, o, b_start_loc, b_seq_len, max_input_len):
         o.stride(1),
         kv_group_num=kv_group_num,
         BLOCK_M=BLOCK,
-        BLOCK_DMODEL=Lk,
+        BLOCK_DMODEL=triton.next_power_of_2(Lk),
         BLOCK_N=BLOCK,
         num_warps=num_warps,
         num_stages=1,
+        Lk=Lk,
    )
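With the asserts removed and BLOCK_DMODEL padded to the next power of two (with the extra lanes masked via the new Lk argument), context_attention_fwd can be invoked with head dims that are not powers of two. A minimal call sketch follows; the sequence lengths and the head dim of 80 are hypothetical, chosen only because 80 would have failed the removed asserts, and the tensor contents are placeholders.

import torch

from sglang.srt.layers.triton_attention.prefill_attention import context_attention_fwd

seq_lens = [5, 7]                      # two sequences packed along the token dim
total, H, D = sum(seq_lens), 4, 80     # D = 80 is not a power of two
dtype, device = torch.float16, "cuda"

q = torch.randn(total, H, D, dtype=dtype, device=device)
k = torch.randn(total, H, D, dtype=dtype, device=device)
v = torch.randn(total, H, D, dtype=dtype, device=device)
o = torch.empty_like(q)

# Per-sequence start offsets and lengths, as in the removed test() above.
b_seq_len = torch.tensor(seq_lens, dtype=torch.int32, device=device)
b_start_loc = torch.zeros_like(b_seq_len)
b_start_loc[1:] = torch.cumsum(b_seq_len[:-1], 0)

context_attention_fwd(q, k, v, o, b_start_loc, b_seq_len, max(seq_lens))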