sglang 0.2.10__py3-none-any.whl → 0.2.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89)
  1. sglang/__init__.py +8 -0
  2. sglang/api.py +10 -2
  3. sglang/bench_latency.py +151 -40
  4. sglang/bench_serving.py +46 -22
  5. sglang/check_env.py +24 -2
  6. sglang/global_config.py +0 -1
  7. sglang/lang/backend/base_backend.py +3 -1
  8. sglang/lang/backend/openai.py +8 -3
  9. sglang/lang/backend/runtime_endpoint.py +46 -29
  10. sglang/lang/choices.py +164 -0
  11. sglang/lang/compiler.py +2 -2
  12. sglang/lang/interpreter.py +6 -13
  13. sglang/lang/ir.py +14 -5
  14. sglang/srt/constrained/base_tool_cache.py +1 -1
  15. sglang/srt/constrained/fsm_cache.py +12 -2
  16. sglang/srt/layers/activation.py +33 -0
  17. sglang/srt/layers/{token_attention.py → decode_attention.py} +9 -5
  18. sglang/srt/layers/extend_attention.py +6 -1
  19. sglang/srt/layers/layernorm.py +65 -0
  20. sglang/srt/layers/logits_processor.py +6 -1
  21. sglang/srt/layers/pooler.py +50 -0
  22. sglang/srt/layers/{context_flashattention_nopad.py → prefill_attention.py} +5 -0
  23. sglang/srt/layers/radix_attention.py +4 -7
  24. sglang/srt/managers/detokenizer_manager.py +31 -9
  25. sglang/srt/managers/io_struct.py +63 -0
  26. sglang/srt/managers/policy_scheduler.py +173 -25
  27. sglang/srt/managers/schedule_batch.py +174 -380
  28. sglang/srt/managers/tokenizer_manager.py +197 -112
  29. sglang/srt/managers/tp_worker.py +299 -364
  30. sglang/srt/mem_cache/{base_cache.py → base_prefix_cache.py} +9 -4
  31. sglang/srt/mem_cache/chunk_cache.py +43 -20
  32. sglang/srt/mem_cache/memory_pool.py +10 -15
  33. sglang/srt/mem_cache/radix_cache.py +74 -40
  34. sglang/srt/model_executor/cuda_graph_runner.py +27 -12
  35. sglang/srt/model_executor/forward_batch_info.py +319 -0
  36. sglang/srt/model_executor/model_runner.py +30 -47
  37. sglang/srt/models/chatglm.py +1 -1
  38. sglang/srt/models/commandr.py +1 -1
  39. sglang/srt/models/dbrx.py +1 -1
  40. sglang/srt/models/deepseek.py +1 -1
  41. sglang/srt/models/deepseek_v2.py +1 -1
  42. sglang/srt/models/gemma.py +1 -1
  43. sglang/srt/models/gemma2.py +1 -2
  44. sglang/srt/models/gpt_bigcode.py +1 -1
  45. sglang/srt/models/grok.py +1 -1
  46. sglang/srt/models/internlm2.py +3 -8
  47. sglang/srt/models/llama2.py +5 -5
  48. sglang/srt/models/llama_classification.py +1 -1
  49. sglang/srt/models/llama_embedding.py +88 -0
  50. sglang/srt/models/llava.py +1 -2
  51. sglang/srt/models/llavavid.py +1 -2
  52. sglang/srt/models/minicpm.py +1 -1
  53. sglang/srt/models/mixtral.py +1 -1
  54. sglang/srt/models/mixtral_quant.py +1 -1
  55. sglang/srt/models/qwen.py +1 -1
  56. sglang/srt/models/qwen2.py +1 -1
  57. sglang/srt/models/qwen2_moe.py +1 -12
  58. sglang/srt/models/stablelm.py +1 -1
  59. sglang/srt/openai_api/adapter.py +189 -39
  60. sglang/srt/openai_api/protocol.py +43 -1
  61. sglang/srt/sampling/penaltylib/__init__.py +13 -0
  62. sglang/srt/sampling/penaltylib/orchestrator.py +357 -0
  63. sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +80 -0
  64. sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +105 -0
  65. sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +79 -0
  66. sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +83 -0
  67. sglang/srt/sampling_params.py +31 -4
  68. sglang/srt/server.py +93 -21
  69. sglang/srt/server_args.py +30 -19
  70. sglang/srt/utils.py +31 -13
  71. sglang/test/run_eval.py +10 -1
  72. sglang/test/runners.py +63 -63
  73. sglang/test/simple_eval_humaneval.py +2 -8
  74. sglang/test/simple_eval_mgsm.py +203 -0
  75. sglang/test/srt/sampling/penaltylib/utils.py +337 -0
  76. sglang/test/test_layernorm.py +60 -0
  77. sglang/test/test_programs.py +4 -2
  78. sglang/test/test_utils.py +21 -3
  79. sglang/utils.py +0 -1
  80. sglang/version.py +1 -1
  81. {sglang-0.2.10.dist-info → sglang-0.2.12.dist-info}/METADATA +50 -31
  82. sglang-0.2.12.dist-info/RECORD +112 -0
  83. sglang/srt/layers/linear.py +0 -884
  84. sglang/srt/layers/quantization/__init__.py +0 -64
  85. sglang/srt/layers/quantization/fp8.py +0 -677
  86. sglang-0.2.10.dist-info/RECORD +0 -100
  87. {sglang-0.2.10.dist-info → sglang-0.2.12.dist-info}/LICENSE +0 -0
  88. {sglang-0.2.10.dist-info → sglang-0.2.12.dist-info}/WHEEL +0 -0
  89. {sglang-0.2.10.dist-info → sglang-0.2.12.dist-info}/top_level.txt +0 -0
sglang/srt/layers/{context_flashattention_nopad.py → prefill_attention.py}
@@ -13,6 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License.
 """
 
+"""
+Memory-efficient attention for prefill.
+It supporst page size = 1.
+"""
+
 # Adapted from
 # https://github.com/ModelTC/lightllm/blob/f2a54f0912293f683bf1d1695fd12c4098a5bf82/lightllm/models/llama/triton_kernel/context_flashattention_nopad.py#L1
 import torch
sglang/srt/layers/radix_attention.py
@@ -20,13 +20,10 @@ from flashinfer.cascade import merge_state
 from torch import nn
 
 from sglang.global_config import global_config
+from sglang.srt.layers.decode_attention import decode_attention_fwd
 from sglang.srt.layers.extend_attention import extend_attention_fwd
-from sglang.srt.layers.token_attention import token_attention_fwd
-from sglang.srt.model_executor.model_runner import (
-    ForwardMode,
-    InputMetadata,
-    global_server_args_dict,
-)
+from sglang.srt.model_executor.forward_batch_info import ForwardMode, InputMetadata
+from sglang.srt.model_executor.model_runner import global_server_args_dict
 
 
 class RadixAttention(nn.Module):
@@ -98,7 +95,7 @@ class RadixAttention(nn.Module):
         o = torch.empty_like(q)
         self.store_kv_cache(k, v, input_metadata)
 
-        token_attention_fwd(
+        decode_attention_fwd(
            q.view(-1, self.tp_q_head_num, self.qk_head_dim),
            input_metadata.token_to_kv_pool.get_key_buffer(self.layer_id),
            input_metadata.token_to_kv_pool.get_value_buffer(self.layer_id),
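The decode attention kernel and the forward-pass metadata types moved in this release. A minimal sketch of the resulting import migration for downstream code, using only the paths shown in the hunks above:

# Old imports (sglang 0.2.10):
#   from sglang.srt.layers.token_attention import token_attention_fwd
#   from sglang.srt.model_executor.model_runner import ForwardMode, InputMetadata, global_server_args_dict
# New imports (sglang 0.2.12):
from sglang.srt.layers.decode_attention import decode_attention_fwd
from sglang.srt.model_executor.forward_batch_info import ForwardMode, InputMetadata
from sglang.srt.model_executor.model_runner import global_server_args_dict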
sglang/srt/managers/detokenizer_manager.py
@@ -25,10 +25,14 @@ import zmq
 import zmq.asyncio
 
 from sglang.srt.hf_transformers_utils import get_tokenizer
-from sglang.srt.managers.io_struct import BatchStrOut, BatchTokenIDOut
+from sglang.srt.managers.io_struct import (
+    BatchEmbeddingOut,
+    BatchStrOut,
+    BatchTokenIDOut,
+)
 from sglang.srt.managers.schedule_batch import FINISH_MATCHED_STR
 from sglang.srt.server_args import PortArgs, ServerArgs
-from sglang.utils import find_printable_text, get_exception_traceback, graceful_registry
+from sglang.utils import find_printable_text, get_exception_traceback
 
 asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
 
@@ -55,20 +59,40 @@ class DetokenizerManager:
         self.send_to_tokenizer = context.socket(zmq.PUSH)
         self.send_to_tokenizer.connect(f"tcp://127.0.0.1:{port_args.tokenizer_port}")
 
-        self.tokenizer = get_tokenizer(
-            server_args.tokenizer_path,
-            tokenizer_mode=server_args.tokenizer_mode,
-            trust_remote_code=server_args.trust_remote_code,
-        )
+        if server_args.skip_tokenizer_init:
+            self.tokenizer = None
+        else:
+            self.tokenizer = get_tokenizer(
+                server_args.tokenizer_path,
+                tokenizer_mode=server_args.tokenizer_mode,
+                trust_remote_code=server_args.trust_remote_code,
+            )
 
         self.decode_status = {}
 
     async def handle_loop(self):
         while True:
             recv_obj: BatchTokenIDOut = await self.recv_from_router.recv_pyobj()
+
+            if isinstance(recv_obj, BatchEmbeddingOut):
+                self.send_to_tokenizer.send_pyobj(
+                    BatchEmbeddingOut(
+                        rids=recv_obj.rids,
+                        embeddings=recv_obj.embeddings,
+                        meta_info=recv_obj.meta_info,
+                        finished_reason=recv_obj.finished_reason,
+                    )
+                )
+                continue
+
             assert isinstance(recv_obj, BatchTokenIDOut)
             bs = len(recv_obj.rids)
 
+            if self.tokenizer is None:
+                # Send BatchTokenIDOut if no tokenizer init'ed.
+                self.send_to_tokenizer.send_pyobj(recv_obj)
+                continue
+
             # Initialize decode status
             read_ids, surr_ids = [], []
             for i in range(bs):
@@ -140,8 +164,6 @@ def start_detokenizer_process(
     port_args: PortArgs,
     pipe_writer,
 ):
-    graceful_registry(inspect.currentframe().f_code.co_name)
-
     try:
         manager = DetokenizerManager(server_args, port_args)
     except Exception:
sglang/srt/managers/io_struct.py
@@ -22,6 +22,8 @@ import uuid
 from dataclasses import dataclass
 from typing import Dict, List, Optional, Union
 
+import torch
+
 from sglang.srt.managers.schedule_batch import BaseFinishReason
 from sglang.srt.sampling_params import SamplingParams
 
@@ -166,6 +168,59 @@ class TokenizedGenerateReqInput:
     stream: bool
 
 
+@dataclass
+class EmbeddingReqInput:
+    # The input prompt. It can be a single prompt or a batch of prompts.
+    text: Optional[Union[List[str], str]] = None
+    # The token ids for text; one can either specify text or input_ids.
+    input_ids: Optional[Union[List[List[int]], List[int]]] = None
+    # The request id.
+    rid: Optional[Union[List[str], str]] = None
+    # Dummy sampling params for compatibility
+    sampling_params: Union[List[Dict], Dict] = None
+
+    def post_init(self):
+        if (self.text is None and self.input_ids is None) or (
+            self.text is not None and self.input_ids is not None
+        ):
+            raise ValueError("Either text or input_ids should be provided.")
+
+        if self.text is not None:
+            is_single = isinstance(self.text, str)
+        else:
+            is_single = isinstance(self.input_ids[0], int)
+        self.is_single = is_single
+
+        if is_single:
+            if self.rid is None:
+                self.rid = uuid.uuid4().hex
+            if self.sampling_params is None:
+                self.sampling_params = {}
+            self.sampling_params["max_new_tokens"] = 1
+        else:
+            # support select operation
+            self.batch_size = (
+                len(self.text) if self.text is not None else len(self.input_ids)
+            )
+            if self.rid is None:
+                self.rid = [uuid.uuid4().hex for _ in range(self.batch_size)]
+            else:
+                if not isinstance(self.rid, list):
+                    raise ValueError("The rid should be a list.")
+            if self.sampling_params is None:
+                self.sampling_params = [{}] * self.batch_size
+            for i in range(self.batch_size):
+                self.sampling_params[i]["max_new_tokens"] = 1
+
+
+@dataclass
+class TokenizedEmbeddingReqInput:
+    rid: str
+    input_text: str
+    input_ids: List[int]
+    sampling_params: SamplingParams
+
+
 @dataclass
 class BatchTokenIDOut:
     rids: List[str]
@@ -187,6 +242,14 @@ class BatchStrOut:
     finished_reason: List[BaseFinishReason]
 
 
+@dataclass
+class BatchEmbeddingOut:
+    rids: List[str]
+    embeddings: List[List[float]]
+    meta_info: List[Dict]
+    finished_reason: List[BaseFinishReason]
+
+
 @dataclass
 class FlushCacheReq:
     pass
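The new EmbeddingReqInput normalizes single and batched embedding requests and pins max_new_tokens to 1. A minimal usage sketch, assuming sglang 0.2.12 is installed (note that post_init is an ordinary method the caller invokes, not a dataclass __post_init__):

from sglang.srt.managers.io_struct import EmbeddingReqInput

# Single prompt: a request id is generated and dummy sampling params are attached.
single = EmbeddingReqInput(text="The capital of France is")
single.post_init()
print(single.is_single)        # True
print(single.sampling_params)  # {'max_new_tokens': 1}

# Batched prompts via token ids: one rid per prompt, one sampling dict per prompt.
batch = EmbeddingReqInput(input_ids=[[1, 2, 3], [4, 5]])
batch.post_init()
print(batch.batch_size)        # 2
print(len(batch.rid))          # 2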
sglang/srt/managers/policy_scheduler.py
@@ -15,44 +15,54 @@ limitations under the License.
 
 """Request policy scheduler"""
 
+import os
 import random
 from collections import defaultdict
+from contextlib import contextmanager
+from typing import Dict, List, Optional
+
+from sglang.srt.managers.schedule_batch import Req, ScheduleBatch
+from sglang.srt.mem_cache.base_prefix_cache import BasePrefixCache
+from sglang.srt.mem_cache.radix_cache import TreeNode
+
+# Clip the estimation of max_new_tokens for the request whose max_new_tokens is very large.
+# This can prevent the server from being too conservative.
+# Note that this only clips the estimation in the scheduler but does not change the stop
+# condition. The request can still generate tokens until it hits the unclipped max_new_tokens.
+CLIP_MAX_NEW_TOKENS = int(os.environ.get("SGLANG_CLIP_MAX_NEW_TOKENS", "4096"))
 
 
 class PolicyScheduler:
-    def __init__(
-        self,
-        policy,
-        max_running_seqs,
-        max_prefill_num_tokens,
-        max_total_num_tokens,
-        tree_cache,
-    ):
-        if tree_cache.disable and policy == "lpm":
-            # LMP is meaningless when the tree cache is disabled.
+    def __init__(self, policy: str, tree_cache: BasePrefixCache):
+        if tree_cache.disable and policy in ["lpm", "dfs-weight"]:
+            # LPM and DFS-weight is meaningless when the tree cache is disabled.
             policy = "fcfs"
 
         self.policy = policy
-        self.max_running_seqs = max_running_seqs
-        self.max_prefill_num_tokens = max_prefill_num_tokens
-        self.max_total_num_tokens = max_total_num_tokens
         self.tree_cache = tree_cache
 
-    def get_priority_queue(self, waiting_queue):
+    def calc_priority(self, waiting_queue: List[Req]):
+        # Compute matched prefix length
+        prefix_computed = False
+        if self.policy in ["lpm", "dfs-weight"]:
+            for r in waiting_queue:
+                # NOTE: the prefix_indices must always be aligned with last_node
+                r.prefix_indices, r.last_node = self.tree_cache.match_prefix(
+                    rid=r.rid, key=r.adjust_max_prefix_ids()
+                )
+            prefix_computed = True
+
         if self.policy == "lpm":
-            # longest prefix match
+            # Longest Prefix Match
             waiting_queue.sort(key=lambda x: -len(x.prefix_indices))
-            return waiting_queue
         elif self.policy == "fcfs":
             # first come first serve
-            return waiting_queue
+            pass
         elif self.policy == "lof":
             # longest output first
             waiting_queue.sort(key=lambda x: -x.sampling_params.max_new_tokens)
-            return waiting_queue
         elif self.policy == "random":
             random.shuffle(waiting_queue)
-            return waiting_queue
         elif self.policy == "dfs-weight":
             last_node_to_reqs = defaultdict(list)
             for req in waiting_queue:
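Before the second hunk of this file below, a quick illustration: calc_priority (shown above) now reorders the waiting queue in place rather than returning a new list. A toy sketch of the "lpm" (Longest Prefix Match) ordering, using stand-in objects instead of sglang's Req class:

from types import SimpleNamespace

# Stand-ins for Req: only the field the "lpm" sort key touches is modeled here.
waiting_queue = [
    SimpleNamespace(rid="a", prefix_indices=[0, 1]),
    SimpleNamespace(rid="b", prefix_indices=[0, 1, 2, 3]),
    SimpleNamespace(rid="c", prefix_indices=[]),
]

# "lpm": requests with the longest matched radix-cache prefix run first.
waiting_queue.sort(key=lambda x: -len(x.prefix_indices))
print([r.rid for r in waiting_queue])  # ['b', 'a', 'c']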
@@ -63,23 +73,161 @@ class PolicyScheduler:
                 node_to_weight[node] = len(last_node_to_reqs[node])
             self.calc_weight(self.tree_cache.root_node, node_to_weight)
 
-            q = []
+            waiting_queue.clear()
             self.get_dfs_priority(
-                self.tree_cache.root_node, node_to_weight, last_node_to_reqs, q
+                self.tree_cache.root_node,
+                node_to_weight,
+                last_node_to_reqs,
+                waiting_queue,
             )
-            assert len(q) == len(waiting_queue)
-            return q
         else:
             raise ValueError(f"Unknown schedule_policy: {self.policy}")
 
-    def calc_weight(self, cur_node, node_to_weight):
+        return prefix_computed
+
+    def calc_weight(self, cur_node: TreeNode, node_to_weight: Dict):
         for child in cur_node.children.values():
             self.calc_weight(child, node_to_weight)
             node_to_weight[cur_node] += node_to_weight[child]
 
-    def get_dfs_priority(self, cur_node, node_to_priority, last_node_to_reqs, q):
+    def get_dfs_priority(
+        self,
+        cur_node: TreeNode,
+        node_to_priority: Dict,
+        last_node_to_reqs: Dict,
+        q: List,
+    ):
         childs = [child for child in cur_node.children.values()]
         childs.sort(key=lambda x: -node_to_priority[x])
         for child in childs:
             self.get_dfs_priority(child, node_to_priority, last_node_to_reqs, q)
         q.extend(last_node_to_reqs[cur_node])
+
+
+class PrefillAdder:
+    def __init__(
+        self,
+        tree_cache: BasePrefixCache,
+        rem_total_tokens: int,
+        rem_input_tokens: int,
+        rem_chunk_tokens: Optional[int],
+    ):
+        self.tree_cache = tree_cache
+        self.rem_total_tokens = rem_total_tokens
+        self.rem_input_tokens = rem_input_tokens
+        self.rem_chunk_tokens = rem_chunk_tokens
+
+        self.can_run_list = []
+        self.new_inflight_req = None
+        self.log_hit_tokens = 0
+        self.log_input_tokens = 0
+
+    def no_remaining_tokens(self):
+        return (
+            self.rem_total_tokens <= 0
+            or self.rem_input_tokens <= 0
+            or (
+                self.rem_chunk_tokens <= 0
+                if self.rem_chunk_tokens is not None
+                else False
+            )
+        )
+
+    def remove_running_tokens(
+        self, running_batch: ScheduleBatch, new_token_ratio: float
+    ):
+        self.rem_total_tokens -= sum(
+            [
+                min(
+                    (r.sampling_params.max_new_tokens - len(r.output_ids)),
+                    CLIP_MAX_NEW_TOKENS,
+                )
+                * new_token_ratio
+                for r in running_batch.reqs
+            ]
+        )
+
+    def _prefill_one_req(
+        self, prefix_len: int, extend_input_len: int, max_new_tokens: int
+    ):
+        self.rem_total_tokens -= extend_input_len + max_new_tokens
+        self.rem_input_tokens -= extend_input_len
+        if self.rem_chunk_tokens is not None:
+            self.rem_chunk_tokens -= extend_input_len
+
+        self.log_hit_tokens += prefix_len
+        self.log_input_tokens += extend_input_len
+
+    def add_inflight_req(self, req: Req):
+        truncated = req.extend_input_len > self.rem_chunk_tokens
+        req.extend_input_len = min(req.extend_input_len, self.rem_chunk_tokens)
+        req.fill_ids = req.fill_ids[: len(req.prefix_indices) + req.extend_input_len]
+        self.can_run_list.append(req)
+
+        self._prefill_one_req(
+            len(req.prefix_indices),
+            req.extend_input_len,
+            (
+                min(req.sampling_params.max_new_tokens, CLIP_MAX_NEW_TOKENS)
+                if not truncated
+                else 0
+            ),
+        )
+
+        # Return if chunked prefill not finished
+        return req if truncated else None
+
+    @contextmanager
+    def _lock_node(self, last_node: TreeNode):
+        try:
+            delta = self.tree_cache.inc_lock_ref(last_node)
+            self.rem_total_tokens += delta
+            yield None
+        finally:
+            delta = self.tree_cache.dec_lock_ref(last_node)
+            self.rem_total_tokens += delta
+
+    def add_one_req(self, req: Req):
+        total_tokens = req.extend_input_len + min(
+            req.sampling_params.max_new_tokens, CLIP_MAX_NEW_TOKENS
+        )
+        input_tokens = req.extend_input_len
+        prefix_len = len(req.prefix_indices)
+
+        if total_tokens >= self.rem_total_tokens:
+            return False
+
+        if input_tokens > self.rem_input_tokens and len(self.can_run_list) != 0:
+            return False
+
+        with self._lock_node(req.last_node):
+            if total_tokens > self.rem_total_tokens:
+                return False
+
+            if (
+                self.rem_chunk_tokens is None
+                or input_tokens <= self.rem_chunk_tokens
+                or (req.return_logprob and req.normalized_prompt_logprob is None)
+            ):
+                # Non-chunked prefill
+                self.can_run_list.append(req)
+                self.tree_cache.inc_lock_ref(req.last_node)
+                self._prefill_one_req(
+                    prefix_len,
+                    input_tokens,
+                    min(req.sampling_params.max_new_tokens, CLIP_MAX_NEW_TOKENS),
+                )
+            else:
+                # Chunked prefill
+                trunc_len = self.rem_chunk_tokens
+                if trunc_len == 0:
+                    return False
+
+                req.extend_input_len = trunc_len
+                req.fill_ids = req.fill_ids[: len(req.prefix_indices) + trunc_len]
+                self.can_run_list.append(req)
+                self.new_inflight_req = req
+                self.tree_cache.inc_lock_ref(req.last_node)
+                self._prefill_one_req(prefix_len, trunc_len, 0)
+
+        return True
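The comment on CLIP_MAX_NEW_TOKENS above describes how the scheduler clips its per-request token estimate. A standalone sketch of that arithmetic (a simplified re-statement, not the sglang implementation) showing the decode budget a running request reserves in PrefillAdder.remove_running_tokens:

# Default from the diff; overridable via the SGLANG_CLIP_MAX_NEW_TOKENS env var.
CLIP_MAX_NEW_TOKENS = 4096

def reserved_decode_budget(max_new_tokens: int, already_generated: int,
                           new_token_ratio: float) -> float:
    # Mirrors the per-request term inside PrefillAdder.remove_running_tokens:
    # the remaining new tokens are clipped before being scaled by new_token_ratio.
    return min(max_new_tokens - already_generated, CLIP_MAX_NEW_TOKENS) * new_token_ratio

# A request that asked for 100_000 new tokens only reserves the clipped estimate,
# so the scheduler does not become overly conservative about admitting new prefills.
print(reserved_decode_budget(100_000, 10, 0.5))  # 2048.0
print(reserved_decode_budget(256, 200, 0.5))     # 28.0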