sglang 0.2.9.post1__py3-none-any.whl → 0.2.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66)
  1. sglang/__init__.py +8 -0
  2. sglang/api.py +10 -2
  3. sglang/bench_latency.py +234 -74
  4. sglang/check_env.py +25 -2
  5. sglang/global_config.py +0 -1
  6. sglang/lang/backend/base_backend.py +3 -1
  7. sglang/lang/backend/openai.py +8 -3
  8. sglang/lang/backend/runtime_endpoint.py +46 -40
  9. sglang/lang/choices.py +164 -0
  10. sglang/lang/interpreter.py +6 -13
  11. sglang/lang/ir.py +11 -2
  12. sglang/srt/hf_transformers_utils.py +2 -2
  13. sglang/srt/layers/extend_attention.py +59 -7
  14. sglang/srt/layers/logits_processor.py +1 -1
  15. sglang/srt/layers/radix_attention.py +24 -14
  16. sglang/srt/layers/token_attention.py +28 -2
  17. sglang/srt/managers/io_struct.py +9 -4
  18. sglang/srt/managers/schedule_batch.py +98 -323
  19. sglang/srt/managers/tokenizer_manager.py +34 -16
  20. sglang/srt/managers/tp_worker.py +20 -22
  21. sglang/srt/mem_cache/memory_pool.py +74 -38
  22. sglang/srt/model_config.py +11 -0
  23. sglang/srt/model_executor/cuda_graph_runner.py +3 -3
  24. sglang/srt/model_executor/forward_batch_info.py +256 -0
  25. sglang/srt/model_executor/model_runner.py +51 -26
  26. sglang/srt/models/chatglm.py +1 -1
  27. sglang/srt/models/commandr.py +1 -1
  28. sglang/srt/models/dbrx.py +1 -1
  29. sglang/srt/models/deepseek.py +1 -1
  30. sglang/srt/models/deepseek_v2.py +199 -17
  31. sglang/srt/models/gemma.py +1 -1
  32. sglang/srt/models/gemma2.py +1 -1
  33. sglang/srt/models/gpt_bigcode.py +1 -1
  34. sglang/srt/models/grok.py +1 -1
  35. sglang/srt/models/internlm2.py +1 -1
  36. sglang/srt/models/llama2.py +1 -1
  37. sglang/srt/models/llama_classification.py +1 -1
  38. sglang/srt/models/llava.py +1 -2
  39. sglang/srt/models/llavavid.py +1 -2
  40. sglang/srt/models/minicpm.py +1 -1
  41. sglang/srt/models/mixtral.py +1 -1
  42. sglang/srt/models/mixtral_quant.py +1 -1
  43. sglang/srt/models/qwen.py +1 -1
  44. sglang/srt/models/qwen2.py +1 -1
  45. sglang/srt/models/qwen2_moe.py +1 -1
  46. sglang/srt/models/stablelm.py +1 -1
  47. sglang/srt/openai_api/adapter.py +151 -29
  48. sglang/srt/openai_api/protocol.py +7 -1
  49. sglang/srt/server.py +111 -84
  50. sglang/srt/server_args.py +12 -2
  51. sglang/srt/utils.py +25 -20
  52. sglang/test/run_eval.py +21 -10
  53. sglang/test/runners.py +237 -0
  54. sglang/test/simple_eval_common.py +12 -12
  55. sglang/test/simple_eval_gpqa.py +92 -0
  56. sglang/test/simple_eval_humaneval.py +5 -5
  57. sglang/test/simple_eval_math.py +72 -0
  58. sglang/test/test_utils.py +95 -14
  59. sglang/utils.py +15 -37
  60. sglang/version.py +1 -1
  61. {sglang-0.2.9.post1.dist-info → sglang-0.2.11.dist-info}/METADATA +59 -48
  62. sglang-0.2.11.dist-info/RECORD +102 -0
  63. sglang-0.2.9.post1.dist-info/RECORD +0 -97
  64. {sglang-0.2.9.post1.dist-info → sglang-0.2.11.dist-info}/LICENSE +0 -0
  65. {sglang-0.2.9.post1.dist-info → sglang-0.2.11.dist-info}/WHEEL +0 -0
  66. {sglang-0.2.9.post1.dist-info → sglang-0.2.11.dist-info}/top_level.txt +0 -0

sglang/lang/backend/runtime_endpoint.py CHANGED
@@ -1,21 +1,24 @@
 import json
 from typing import List, Optional

-import numpy as np
-
 from sglang.global_config import global_config
 from sglang.lang.backend.base_backend import BaseBackend
 from sglang.lang.chat_template import get_chat_template_by_model_path
+from sglang.lang.choices import (
+    ChoicesDecision,
+    ChoicesSamplingMethod,
+    token_length_normalized,
+)
 from sglang.lang.interpreter import StreamExecutor
 from sglang.lang.ir import SglSamplingParams
 from sglang.utils import http_request


 class RuntimeEndpoint(BaseBackend):
+
     def __init__(
         self,
         base_url: str,
-        auth_token: Optional[str] = None,
         api_key: Optional[str] = None,
         verify: Optional[str] = None,
     ):
@@ -23,13 +26,11 @@ class RuntimeEndpoint(BaseBackend):
         self.support_concate_and_append = True

         self.base_url = base_url
-        self.auth_token = auth_token
         self.api_key = api_key
         self.verify = verify

         res = http_request(
             self.base_url + "/get_model_info",
-            auth_token=self.auth_token,
             api_key=self.api_key,
             verify=self.verify,
         )
@@ -46,7 +47,7 @@ class RuntimeEndpoint(BaseBackend):
     def flush_cache(self):
         res = http_request(
             self.base_url + "/flush_cache",
-            auth_token=self.auth_token,
+            api_key=self.api_key,
             verify=self.verify,
         )
         self._assert_success(res)
@@ -54,7 +55,7 @@ class RuntimeEndpoint(BaseBackend):
     def get_server_args(self):
         res = http_request(
             self.base_url + "/get_server_args",
-            auth_token=self.auth_token,
+            api_key=self.api_key,
             verify=self.verify,
         )
         self._assert_success(res)
@@ -67,7 +68,6 @@ class RuntimeEndpoint(BaseBackend):
         res = http_request(
             self.base_url + "/generate",
             json={"text": prefix_str, "sampling_params": {"max_new_tokens": 0}},
-            auth_token=self.auth_token,
             api_key=self.api_key,
             verify=self.verify,
         )
@@ -79,7 +79,6 @@ class RuntimeEndpoint(BaseBackend):
         res = http_request(
             self.base_url + "/generate",
             json=data,
-            auth_token=self.auth_token,
             api_key=self.api_key,
             verify=self.verify,
         )
@@ -91,7 +90,6 @@ class RuntimeEndpoint(BaseBackend):
         res = http_request(
             self.base_url + "/generate",
             json=data,
-            auth_token=self.auth_token,
             api_key=self.api_key,
             verify=self.verify,
         )
@@ -139,7 +137,6 @@ class RuntimeEndpoint(BaseBackend):
         res = http_request(
             self.base_url + "/generate",
             json=data,
-            auth_token=self.auth_token,
             api_key=self.api_key,
             verify=self.verify,
         )
@@ -193,7 +190,6 @@ class RuntimeEndpoint(BaseBackend):
             self.base_url + "/generate",
             json=data,
             stream=True,
-            auth_token=self.auth_token,
             api_key=self.api_key,
             verify=self.verify,
         )
@@ -216,21 +212,14 @@
         s: StreamExecutor,
         choices: List[str],
         temperature: float,
-    ):
+        choices_method: ChoicesSamplingMethod,
+    ) -> ChoicesDecision:
         assert temperature <= 1e-5

         # Cache common prefix
         data = {"text": s.text_, "sampling_params": {"max_new_tokens": 0}}
-        self._add_images(s, data)
-        res = http_request(
-            self.base_url + "/generate",
-            json=data,
-            auth_token=self.auth_token,
-            api_key=self.api_key,
-            verify=self.verify,
-        )
-        self._assert_success(res)
-        prompt_len = res.json()["meta_info"]["prompt_tokens"]
+        obj = self._generate_http_request(s, data)
+        prompt_len = obj["meta_info"]["prompt_tokens"]

         # Compute logprob
         data = {
@@ -239,40 +228,57 @@
             "return_logprob": True,
             "logprob_start_len": max(prompt_len - 2, 0),
         }
-        self._add_images(s, data)
-        res = http_request(
-            self.base_url + "/generate",
-            json=data,
-            auth_token=self.auth_token,
-            api_key=self.api_key,
-            verify=self.verify,
-        )
-        self._assert_success(res)
-        obj = res.json()
+        obj = self._generate_http_request(s, data)
+
         normalized_prompt_logprobs = [
             r["meta_info"]["normalized_prompt_logprob"] for r in obj
         ]
-        decision = choices[np.argmax(normalized_prompt_logprobs)]
         input_token_logprobs = [r["meta_info"]["input_token_logprobs"] for r in obj]
         output_token_logprobs = [r["meta_info"]["output_token_logprobs"] for r in obj]

-        return (
-            decision,
-            normalized_prompt_logprobs,
-            input_token_logprobs,
-            output_token_logprobs,
+        # Compute unconditional logprobs if required
+        if choices_method.requires_unconditional_logprobs:
+            input_ids = [[el[1] for el in subl] for subl in input_token_logprobs]
+            data = {
+                "input_ids": input_ids,
+                "sampling_params": {"max_new_tokens": 0},
+                "return_logprob": True,
+            }
+            obj = self._generate_http_request(s, data)
+            unconditional_token_logprobs = [
+                r["meta_info"]["input_token_logprobs"] for r in obj
+            ]
+        else:
+            unconditional_token_logprobs = None
+
+        return choices_method(
+            choices=choices,
+            normalized_prompt_logprobs=normalized_prompt_logprobs,
+            input_token_logprobs=input_token_logprobs,
+            output_token_logprobs=output_token_logprobs,
+            unconditional_token_logprobs=unconditional_token_logprobs,
         )

     def concatenate_and_append(self, src_rids: List[str], dst_rid: str):
         res = http_request(
             self.base_url + "/concate_and_append_request",
             json={"src_rids": src_rids, "dst_rid": dst_rid},
-            auth_token=self.auth_token,
             api_key=self.api_key,
             verify=self.verify,
         )
         self._assert_success(res)

+    def _generate_http_request(self, s: StreamExecutor, data):
+        self._add_images(s, data)
+        res = http_request(
+            self.base_url + "/generate",
+            json=data,
+            api_key=self.api_key,
+            verify=self.verify,
+        )
+        self._assert_success(res)
+        return res.json()
+
     def _add_images(self, s: StreamExecutor, data):
         if s.images_:
             assert len(s.images_) == 1, "Only support one image."
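
The refactored select() above now delegates scoring to a pluggable ChoicesSamplingMethod and returns a single ChoicesDecision instead of a 4-tuple. A minimal sketch of that contract, exercising the default method from the choices module added below with made-up logprob data (token tuples are shaped (logprob, token_id), matching what this code reads from the /generate response):

```python
from sglang.lang.choices import token_length_normalized

decision = token_length_normalized(
    choices=["yes", "no"],
    normalized_prompt_logprobs=[-0.4, -1.9],  # toy values, one per choice
    input_token_logprobs=[[(-0.4, 9820)], [(-1.9, 2201)]],
    output_token_logprobs=[[], []],
)
print(decision.decision)  # "yes" -- argmax of the normalized prompt logprobs
print(decision.meta_info["normalized_prompt_logprobs"])
```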
sglang/lang/choices.py ADDED
@@ -0,0 +1,164 @@
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+from typing import Any, Dict, List, Optional
+
+import numpy as np
+
+
+@dataclass
+class ChoicesDecision:
+    decision: str
+    meta_info: Optional[Dict[str, Any]] = None
+
+
+class ChoicesSamplingMethod(ABC):
+
+    @property
+    def requires_unconditional_logprobs(self) -> bool:
+        return False
+
+    @abstractmethod
+    def __call__(
+        self,
+        *,
+        choices: List[str],
+        normalized_prompt_logprobs: List[float],
+        input_token_logprobs: List[List[Any]],
+        output_token_logprobs: List[List[Any]],
+        unconditional_token_logprobs: Optional[List[List[Any]]] = None,
+    ) -> ChoicesDecision: ...
+
+
+class TokenLengthNormalized(ChoicesSamplingMethod):
+
+    def __call__(
+        self,
+        *,
+        choices: List[str],
+        normalized_prompt_logprobs: List[float],
+        input_token_logprobs: List[List[Any]],
+        output_token_logprobs: List[List[Any]],
+        unconditional_token_logprobs: Optional[List[List[Any]]] = None,
+    ) -> ChoicesDecision:
+        """Select the option with the highest token length normalized prompt logprob."""
+        best_choice = choices[np.argmax(normalized_prompt_logprobs)]
+        meta_info = {
+            "normalized_prompt_logprobs": normalized_prompt_logprobs,
+            "input_token_logprobs": input_token_logprobs,
+            "output_token_logprobs": output_token_logprobs,
+        }
+        return ChoicesDecision(decision=best_choice, meta_info=meta_info)
+
+
+token_length_normalized = TokenLengthNormalized()
+
+
+class GreedyTokenSelection(ChoicesSamplingMethod):
+
+    def __call__(
+        self,
+        *,
+        choices: List[str],
+        normalized_prompt_logprobs: List[float],
+        input_token_logprobs: List[List[Any]],
+        output_token_logprobs: List[List[Any]],
+        unconditional_token_logprobs: Optional[List[List[Any]]] = None,
+    ) -> ChoicesDecision:
+        """Select the option based on greedy logprob selection. For overlapping options
+        where one option is a subset of a longer option, extend the shorter option using
+        its average logprob for comparison against the longer option."""
+
+        num_options = len(choices)
+        max_tokens = max(len(option) for option in input_token_logprobs)
+        logprob_matrix = self._build_logprob_matrix(
+            input_token_logprobs, max_tokens, num_options
+        )
+        remaining = self._greedy_selection(logprob_matrix, num_options, max_tokens)
+
+        best_choice = choices[remaining[0]]
+        meta_info = {
+            "normalized_prompt_logprobs": normalized_prompt_logprobs,
+            "input_token_logprobs": input_token_logprobs,
+            "output_token_logprobs": output_token_logprobs,
+            "greedy_logprob_matrix": logprob_matrix.tolist(),
+        }
+        return ChoicesDecision(decision=best_choice, meta_info=meta_info)
+
+    def _build_logprob_matrix(self, input_token_logprobs, max_tokens, num_options):
+        logprob_matrix = np.zeros((num_options, max_tokens))
+        for i, option in enumerate(input_token_logprobs):
+            actual_logprobs = [token[0] for token in option]
+            avg_logprob = np.mean(actual_logprobs)
+            logprob_matrix[i, : len(option)] = actual_logprobs
+            if len(option) < max_tokens:
+                logprob_matrix[i, len(option) :] = avg_logprob
+        return logprob_matrix
+
+    def _greedy_selection(self, logprob_matrix, num_options, max_tokens):
+        remaining = np.arange(num_options)
+        for j in range(max_tokens):
+            max_logprob = np.max(logprob_matrix[remaining, j])
+            remaining = remaining[logprob_matrix[remaining, j] == max_logprob]
+            if len(remaining) == 1:
+                break
+        return remaining
+
+
+greedy_token_selection = GreedyTokenSelection()
+
+
+class UnconditionalLikelihoodNormalized(ChoicesSamplingMethod):
+
+    @property
+    def requires_unconditional_logprobs(self) -> bool:
+        return True
+
+    def __call__(
+        self,
+        *,
+        choices: List[str],
+        normalized_prompt_logprobs: List[float],
+        input_token_logprobs: List[List[Any]],
+        output_token_logprobs: List[List[Any]],
+        unconditional_token_logprobs: Optional[List[List[Any]]] = None,
+    ) -> ChoicesDecision:
+        """Select the option with the highest average token logprob once normalized by
+        the unconditional token logprobs.
+
+        The first unconditional token logprob is assumed to be None. If so, it is
+        replaced with 0 for the purposes of normalization."""
+
+        if unconditional_token_logprobs is None:
+            raise ValueError(
+                "Unconditional token logprobs are required for this method."
+            )
+
+        normalized_unconditional_prompt_logprobs = self._normalize_logprobs(
+            input_token_logprobs, unconditional_token_logprobs
+        )
+
+        best_choice = choices[np.argmax(normalized_unconditional_prompt_logprobs)]
+        meta_info = {
+            "normalized_prompt_logprobs": normalized_prompt_logprobs,
+            "input_token_logprobs": input_token_logprobs,
+            "output_token_logprobs": output_token_logprobs,
+            "unconditional_token_logprobs": unconditional_token_logprobs,
+            "normalized_unconditional_prompt_logprobs": normalized_unconditional_prompt_logprobs,
+        }
+        return ChoicesDecision(decision=best_choice, meta_info=meta_info)
+
+    def _normalize_logprobs(self, input_token_logprobs, unconditional_token_logprobs):
+        normalized_unconditional_prompt_logprobs = []
+        for inputs, unconditionals in zip(
+            input_token_logprobs, unconditional_token_logprobs
+        ):
+            inputs_logprobs = np.array([token[0] for token in inputs])
+            unconditionals_logprobs = np.array([token[0] for token in unconditionals])
+            unconditionals_logprobs[0] = unconditionals_logprobs[0] or 0
+            normalized_unconditional_prompt_logprobs.append(
+                float(np.mean(inputs_logprobs - unconditionals_logprobs))
+            )
+        return normalized_unconditional_prompt_logprobs
+
+
+unconditional_likelihood_normalized = UnconditionalLikelihoodNormalized()
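
The module exposes each strategy as a ready-made singleton (token_length_normalized, greedy_token_selection, unconditional_likelihood_normalized). A usage sketch, assuming the sgl.select signature this release extends with a choices_method parameter; the function and prompt below are illustrative:

```python
import sglang as sgl
from sglang.lang.choices import greedy_token_selection

@sgl.function
def classify(s, review):
    s += "Review: " + review + "\nSentiment: "
    # greedy_token_selection resolves overlapping choices token by token,
    # rather than comparing length-normalized prompt logprobs.
    s += sgl.select(
        "sentiment",
        choices=["positive", "negative"],
        choices_method=greedy_token_selection,
    )
```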

sglang/lang/interpreter.py CHANGED
@@ -538,24 +538,17 @@ class StreamExecutor:
             self.stream_var_event[name].set()

     def _execute_select(self, expr: SglSelect):
-        (
-            decision,
-            normalized_prompt_logprobs,
-            input_token_logprobs,
-            output_token_logprobs,
-        ) = self.backend.select(self, expr.choices, expr.temperature)
+        choices_decision = self.backend.select(
+            self, expr.choices, expr.temperature, expr.choices_method
+        )
         if expr.name is not None:
             name = expr.name
-            self.variables[name] = decision
-            self.meta_info[name] = {
-                "normalized_prompt_logprobs": normalized_prompt_logprobs,
-                "input_token_logprobs": input_token_logprobs,
-                "output_token_logprobs": output_token_logprobs,
-            }
+            self.variables[name] = choices_decision.decision
+            self.meta_info[name] = choices_decision.meta_info
             self.variable_event[name].set()
             if self.stream_var_event:
                 self.stream_var_event[name].set()
-        self.text_ += decision
+        self.text_ += choices_decision.decision

     def _execute_variable(self, expr: SglVariable):
         src_executor = expr.source_stream_executor
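
Since _execute_select now stores ChoicesDecision.meta_info verbatim, the per-choice scores surface in the program state's meta info. A short follow-on sketch, assuming the classify program above and state.get_meta_info, which sglang's program state exposes:

```python
state = classify.run(review="Great food, slow service.")
print(state["sentiment"])                  # the selected choice
info = state.get_meta_info("sentiment")    # ChoicesDecision.meta_info
print(info["normalized_prompt_logprobs"])  # one score per choice
```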
sglang/lang/ir.py CHANGED
@@ -6,6 +6,7 @@ import warnings
 from typing import List, Optional, Union

 from sglang.global_config import global_config
+from sglang.lang.choices import ChoicesSamplingMethod

 REGEX_INT = r"[-+]?[0-9]+"
 REGEX_FLOAT = r"[-+]?[0-9]*\.?[0-9]+"
@@ -461,14 +462,22 @@ class SglRoleEnd(SglExpr):


 class SglSelect(SglExpr):
-    def __init__(self, name: str, choices: List[str], temperature: float):
+
+    def __init__(
+        self,
+        name: str,
+        choices: List[str],
+        temperature: float,
+        choices_method: ChoicesSamplingMethod,
+    ):
         super().__init__()
         self.name = name
         self.choices = choices
         self.temperature = temperature
+        self.choices_method = choices_method

     def __repr__(self):
-        return f"Select({self.name}, choices={self.choices})"
+        return f"Select({self.name}, choices={self.choices}, choices_method={self.choices_method})"


 class SglFork(SglExpr):

sglang/srt/hf_transformers_utils.py CHANGED
@@ -19,7 +19,7 @@ import functools
 import json
 import os
 import warnings
-from typing import AbstractSet, Collection, Dict, Literal, Optional, Type, Union
+from typing import AbstractSet, Collection, Dict, List, Literal, Optional, Type, Union

 from huggingface_hub import snapshot_download
 from transformers import (
@@ -259,7 +259,7 @@ class TiktokenTokenizer:
             Literal["all"], AbstractSet[str]
         ] = set(),  # noqa: B006
         disallowed_special: Union[Literal["all"], Collection[str]] = "all",
-    ) -> list[int]:
+    ) -> List[int]:
         if isinstance(allowed_special, set):
            allowed_special |= self._default_allowed_special
         return tiktoken.Encoding.encode(
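
Our reading of the annotation change: `list[int]` as a runtime-evaluated return annotation requires Python >= 3.9 (PEP 585), so switching to typing.List keeps the tokenizer importable on 3.8:

```python
from typing import List

def encode(text: str) -> List[int]:  # evaluates on 3.8; `list[int]` raises TypeError there
    ...
```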

sglang/srt/layers/extend_attention.py CHANGED
@@ -57,6 +57,8 @@ def _fwd_kernel(
     stride_buf_vh,
     stride_req_to_tokens_b,
     BLOCK_DMODEL: tl.constexpr,
+    BLOCK_DPE: tl.constexpr,
+    BLOCK_DV: tl.constexpr,
     BLOCK_M: tl.constexpr,
     BLOCK_N: tl.constexpr,
     logit_cap: tl.constexpr,
@@ -75,8 +77,10 @@
     cur_batch_req_idx = tl.load(B_req_idx + cur_seq)

     offs_d = tl.arange(0, BLOCK_DMODEL)
+    offs_dv = tl.arange(0, BLOCK_DV)
     offs_m = tl.arange(0, BLOCK_M)
     mask_m = (cur_block_m * BLOCK_M + offs_m) < cur_seq_len_extend
+
     offs_q = (
         (cur_seq_extend_start_contiguous + cur_block_m * BLOCK_M + offs_m[:, None])
         * stride_qbs
@@ -85,10 +89,20 @@
     )
     q = tl.load(Q_Extend + offs_q, mask=mask_m[:, None], other=0.0)

+    if BLOCK_DPE > 0:
+        offs_dpe = BLOCK_DMODEL + tl.arange(0, BLOCK_DPE)
+        offs_qpe = (
+            (cur_seq_extend_start_contiguous + cur_block_m * BLOCK_M + offs_m[:, None])
+            * stride_qbs
+            + cur_head * stride_qh
+            + offs_dpe[None, :]
+        )
+        qpe = tl.load(Q_Extend + offs_qpe, mask=mask_m[:, None], other=0.0)
+
     # stage1: compute scores with prefix
     offs_n = tl.arange(0, BLOCK_N)

-    acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32)
+    acc = tl.zeros([BLOCK_M, BLOCK_DV], dtype=tl.float32)
     deno = tl.zeros([BLOCK_M], dtype=tl.float32)
     e_max = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf")
@@ -110,6 +124,18 @@

         qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
         qk += tl.dot(q, k)
+        if BLOCK_DPE > 0:
+            offs_kpe = (
+                offs_kv_loc[None, :] * stride_buf_kbs
+                + cur_kv_head * stride_buf_kh
+                + offs_dpe[:, None]
+            )
+            kpe = tl.load(
+                K_Buffer + offs_kpe,
+                mask=mask_n[None, :],
+                other=0.0,
+            )
+            qk += tl.dot(qpe, kpe)
         qk *= sm_scale

         if logit_cap > 0:
@@ -125,7 +151,7 @@
         offs_buf_v = (
             offs_kv_loc[:, None] * stride_buf_vbs
             + cur_kv_head * stride_buf_vh
-            + offs_d[None, :]
+            + offs_dv[None, :]
         )
         v = tl.load(V_Buffer + offs_buf_v, mask=mask_n[:, None], other=0.0)
         p = p.to(v.dtype)
@@ -150,6 +176,21 @@

         qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
         qk += tl.dot(q, k)
+
+        if BLOCK_DPE > 0:
+            offs_kpe = (
+                (cur_seq_extend_start_contiguous + start_n + offs_n[None, :])
+                * stride_kbs
+                + cur_kv_head * stride_kh
+                + offs_dpe[:, None]
+            )
+            kpe = tl.load(
+                K_Extend + offs_kpe,
+                mask=mask_n[None, :],
+                other=0.0,
+            )
+            qk += tl.dot(qpe, kpe)
+
         qk *= sm_scale

         if logit_cap > 0:
@@ -169,7 +210,7 @@
         offs_v = (
             (cur_seq_extend_start_contiguous + start_n + offs_n[:, None]) * stride_vbs
             + cur_kv_head * stride_vh
-            + offs_d[None, :]
+            + offs_dv[None, :]
         )
         v = tl.load(V_Extend + offs_v, mask=mask_n[:, None], other=0.0)
         p = p.to(v.dtype)
@@ -181,7 +222,7 @@
         (cur_seq_extend_start_contiguous + cur_block_m * BLOCK_M + offs_m[:, None])
         * stride_obs
         + cur_head * stride_oh
-        + offs_d[None, :]
+        + offs_dv[None, :]
     )
     tl.store(O_Extend + offs_o, acc / deno[:, None], mask=mask_m[:, None])
@@ -217,8 +258,17 @@
         o_extend.shape[-1],
     )

-    assert Lq == Lk and Lk == Lv and Lv == Lo
-    assert Lq in {16, 32, 64, 128, 256}
+    assert Lq == Lk and Lv == Lo
+    assert Lq in {16, 32, 64, 128, 256, 576}
+    assert Lv in {16, 32, 64, 128, 256, 512}
+
+    if Lq == 576:
+        BLOCK_DMODEL = 512
+        BLOCK_DPE = 64
+    else:
+        BLOCK_DMODEL = Lq
+        BLOCK_DPE = 0
+    BLOCK_DV = Lv

     if CUDA_CAPABILITY[0] >= 8:
         BLOCK_M, BLOCK_N = (128, 128) if Lq <= 128 else (64, 64)
@@ -260,7 +310,9 @@
         v_buffer.stride(0),
         v_buffer.stride(1),
         req_to_tokens.stride(0),
-        BLOCK_DMODEL=Lq,
+        BLOCK_DMODEL=BLOCK_DMODEL,
+        BLOCK_DPE=BLOCK_DPE,
+        BLOCK_DV=BLOCK_DV,
         BLOCK_M=BLOCK_M,
         BLOCK_N=BLOCK_N,
         num_warps=num_warps,
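
The new Lq == 576 branch serves MLA-style attention (deepseek_v2.py gains +199 lines in this same release), where the q/k head dim carries an extra positional-encoding slice on top of a 512-dim value head. A sketch mirroring the split the launcher now performs; the constants come straight from the diff, while reading 512/64 as "latent + rotary" is our interpretation, not stated in the code:

```python
def split_head_dims(Lq: int, Lv: int):
    """Mirror the BLOCK_DMODEL / BLOCK_DPE / BLOCK_DV selection above."""
    assert Lq in {16, 32, 64, 128, 256, 576}
    assert Lv in {16, 32, 64, 128, 256, 512}
    if Lq == 576:
        # 576-dim q/k splits into a 512-dim "model" part plus a 64-dim
        # positional-encoding (DPE) part that gets its own tl.dot in the kernel.
        return 512, 64, Lv
    return Lq, 0, Lv

print(split_head_dims(576, 512))  # (512, 64, 512)
print(split_head_dims(128, 128))  # (128, 0, 128)
```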

sglang/srt/layers/logits_processor.py CHANGED
@@ -25,7 +25,7 @@ from vllm.distributed import (
     tensor_model_parallel_all_gather,
 )

-from sglang.srt.model_executor.model_runner import ForwardMode, InputMetadata
+from sglang.srt.model_executor.forward_batch_info import ForwardMode, InputMetadata


 @dataclasses.dataclass

sglang/srt/layers/radix_attention.py CHANGED
@@ -22,11 +22,8 @@ from torch import nn
 from sglang.global_config import global_config
 from sglang.srt.layers.extend_attention import extend_attention_fwd
 from sglang.srt.layers.token_attention import token_attention_fwd
-from sglang.srt.model_executor.model_runner import (
-    ForwardMode,
-    InputMetadata,
-    global_server_args_dict,
-)
+from sglang.srt.model_executor.forward_batch_info import ForwardMode, InputMetadata
+from sglang.srt.model_executor.model_runner import global_server_args_dict


 class RadixAttention(nn.Module):
@@ -38,16 +35,22 @@ class RadixAttention(nn.Module):
         num_kv_heads: int,
         layer_id: int,
         logit_cap: int = -1,
+        v_head_dim: int = -1,
     ):
         super().__init__()
         self.tp_q_head_num = num_heads
         self.tp_k_head_num = num_kv_heads
         self.tp_v_head_num = num_kv_heads
         self.head_dim = head_dim
+        self.qk_head_dim = head_dim
+        self.v_head_dim = v_head_dim if v_head_dim != -1 else head_dim
         self.scaling = scaling
         self.layer_id = layer_id

-        if not global_server_args_dict.get("disable_flashinfer", False):
+        if (
+            not global_server_args_dict.get("disable_flashinfer", False)
+            and self.qk_head_dim == self.v_head_dim
+        ):
             self.extend_forward = self.extend_forward_flashinfer
             self.decode_forward = self.decode_forward_flashinfer
         else:
@@ -57,13 +60,17 @@ class RadixAttention(nn.Module):
         self.logit_cap = logit_cap if logit_cap is not None and logit_cap > 0 else 0

     def extend_forward_triton(self, q, k, v, input_metadata: InputMetadata):
-        o = torch.empty_like(q)
+        if self.qk_head_dim != self.v_head_dim:
+            o = q.new_empty((q.shape[0], self.tp_q_head_num * self.v_head_dim))
+        else:
+            o = torch.empty_like(q)
+
         self.store_kv_cache(k, v, input_metadata)
         extend_attention_fwd(
-            q.view(-1, self.tp_q_head_num, self.head_dim),
+            q.view(-1, self.tp_q_head_num, self.qk_head_dim),
             k.contiguous(),
             v.contiguous(),
-            o.view(-1, self.tp_q_head_num, self.head_dim),
+            o.view(-1, self.tp_q_head_num, self.v_head_dim),
             input_metadata.token_to_kv_pool.get_key_buffer(self.layer_id),
             input_metadata.token_to_kv_pool.get_value_buffer(self.layer_id),
             input_metadata.req_to_token_pool.req_to_token,
@@ -82,14 +89,17 @@ class RadixAttention(nn.Module):
         return o

     def decode_forward_triton(self, q, k, v, input_metadata: InputMetadata):
-        o = torch.empty_like(q)
+        if self.qk_head_dim != self.v_head_dim:
+            o = q.new_empty((q.shape[0], self.tp_q_head_num * self.v_head_dim))
+        else:
+            o = torch.empty_like(q)
         self.store_kv_cache(k, v, input_metadata)

         token_attention_fwd(
-            q.view(-1, self.tp_q_head_num, self.head_dim),
+            q.view(-1, self.tp_q_head_num, self.qk_head_dim),
             input_metadata.token_to_kv_pool.get_key_buffer(self.layer_id),
             input_metadata.token_to_kv_pool.get_value_buffer(self.layer_id),
-            o.view(-1, self.tp_q_head_num, self.head_dim),
+            o.view(-1, self.tp_q_head_num, self.v_head_dim),
             input_metadata.req_to_token_pool.req_to_token,
             input_metadata.req_pool_indices,
             input_metadata.triton_start_loc,
@@ -160,8 +170,8 @@ class RadixAttention(nn.Module):
         return o.view(-1, self.tp_q_head_num * self.head_dim)

     def forward(self, q, k, v, input_metadata: InputMetadata):
-        k = k.view(-1, self.tp_k_head_num, self.head_dim)
-        v = v.view(-1, self.tp_v_head_num, self.head_dim)
+        k = k.view(-1, self.tp_k_head_num, self.qk_head_dim)
+        v = v.view(-1, self.tp_v_head_num, self.v_head_dim)

         if input_metadata.forward_mode == ForwardMode.EXTEND:
             return self.extend_forward(q, k, v, input_metadata)
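
RadixAttention now tracks qk_head_dim and v_head_dim separately and only takes the flashinfer path when they match. A hedged construction sketch: the keyword arguments mirror this diff, while the concrete sizes (576 = 512 + 64 vs. 512) are DeepSeek-V2-like and purely illustrative:

```python
from sglang.srt.layers.radix_attention import RadixAttention

# head_dim becomes qk_head_dim; a differing v_head_dim forces the
# Triton extend/decode kernels selected in __init__ above.
attn = RadixAttention(
    num_heads=16,
    head_dim=576,
    scaling=576 ** -0.5,
    num_kv_heads=1,
    layer_id=0,
    v_head_dim=512,
)
```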