sglang 0.2.11__py3-none-any.whl → 0.2.12__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to their public registry, and is provided for informational purposes only.
Files changed (63)
  1. sglang/bench_latency.py +6 -4
  2. sglang/bench_serving.py +46 -22
  3. sglang/lang/compiler.py +2 -2
  4. sglang/lang/ir.py +3 -3
  5. sglang/srt/constrained/base_tool_cache.py +1 -1
  6. sglang/srt/constrained/fsm_cache.py +12 -2
  7. sglang/srt/layers/activation.py +33 -0
  8. sglang/srt/layers/{token_attention.py → decode_attention.py} +9 -5
  9. sglang/srt/layers/extend_attention.py +6 -1
  10. sglang/srt/layers/layernorm.py +65 -0
  11. sglang/srt/layers/logits_processor.py +5 -0
  12. sglang/srt/layers/pooler.py +50 -0
  13. sglang/srt/layers/{context_flashattention_nopad.py → prefill_attention.py} +5 -0
  14. sglang/srt/layers/radix_attention.py +2 -2
  15. sglang/srt/managers/detokenizer_manager.py +31 -9
  16. sglang/srt/managers/io_struct.py +63 -0
  17. sglang/srt/managers/policy_scheduler.py +173 -25
  18. sglang/srt/managers/schedule_batch.py +110 -87
  19. sglang/srt/managers/tokenizer_manager.py +193 -111
  20. sglang/srt/managers/tp_worker.py +289 -352
  21. sglang/srt/mem_cache/{base_cache.py → base_prefix_cache.py} +9 -4
  22. sglang/srt/mem_cache/chunk_cache.py +43 -20
  23. sglang/srt/mem_cache/memory_pool.py +2 -2
  24. sglang/srt/mem_cache/radix_cache.py +74 -40
  25. sglang/srt/model_executor/cuda_graph_runner.py +24 -9
  26. sglang/srt/model_executor/forward_batch_info.py +168 -105
  27. sglang/srt/model_executor/model_runner.py +24 -37
  28. sglang/srt/models/gemma2.py +0 -1
  29. sglang/srt/models/internlm2.py +2 -7
  30. sglang/srt/models/llama2.py +4 -4
  31. sglang/srt/models/llama_embedding.py +88 -0
  32. sglang/srt/models/qwen2_moe.py +0 -11
  33. sglang/srt/openai_api/adapter.py +155 -27
  34. sglang/srt/openai_api/protocol.py +37 -1
  35. sglang/srt/sampling/penaltylib/__init__.py +13 -0
  36. sglang/srt/sampling/penaltylib/orchestrator.py +357 -0
  37. sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +80 -0
  38. sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +105 -0
  39. sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +79 -0
  40. sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +83 -0
  41. sglang/srt/sampling_params.py +31 -4
  42. sglang/srt/server.py +69 -15
  43. sglang/srt/server_args.py +26 -19
  44. sglang/srt/utils.py +31 -13
  45. sglang/test/run_eval.py +10 -1
  46. sglang/test/runners.py +63 -63
  47. sglang/test/simple_eval_humaneval.py +2 -8
  48. sglang/test/simple_eval_mgsm.py +203 -0
  49. sglang/test/srt/sampling/penaltylib/utils.py +337 -0
  50. sglang/test/test_layernorm.py +60 -0
  51. sglang/test/test_programs.py +4 -2
  52. sglang/test/test_utils.py +20 -2
  53. sglang/utils.py +0 -1
  54. sglang/version.py +1 -1
  55. {sglang-0.2.11.dist-info → sglang-0.2.12.dist-info}/METADATA +23 -14
  56. sglang-0.2.12.dist-info/RECORD +112 -0
  57. sglang/srt/layers/linear.py +0 -884
  58. sglang/srt/layers/quantization/__init__.py +0 -64
  59. sglang/srt/layers/quantization/fp8.py +0 -677
  60. sglang-0.2.11.dist-info/RECORD +0 -102
  61. {sglang-0.2.11.dist-info → sglang-0.2.12.dist-info}/LICENSE +0 -0
  62. {sglang-0.2.11.dist-info → sglang-0.2.12.dist-info}/WHEEL +0 -0
  63. {sglang-0.2.11.dist-info → sglang-0.2.12.dist-info}/top_level.txt +0 -0
sglang/srt/managers/io_struct.py

@@ -22,6 +22,8 @@ import uuid
 from dataclasses import dataclass
 from typing import Dict, List, Optional, Union
 
+import torch
+
 from sglang.srt.managers.schedule_batch import BaseFinishReason
 from sglang.srt.sampling_params import SamplingParams
 
@@ -166,6 +168,59 @@ class TokenizedGenerateReqInput:
     stream: bool
 
 
+@dataclass
+class EmbeddingReqInput:
+    # The input prompt. It can be a single prompt or a batch of prompts.
+    text: Optional[Union[List[str], str]] = None
+    # The token ids for text; one can either specify text or input_ids.
+    input_ids: Optional[Union[List[List[int]], List[int]]] = None
+    # The request id.
+    rid: Optional[Union[List[str], str]] = None
+    # Dummy sampling params for compatibility
+    sampling_params: Union[List[Dict], Dict] = None
+
+    def post_init(self):
+        if (self.text is None and self.input_ids is None) or (
+            self.text is not None and self.input_ids is not None
+        ):
+            raise ValueError("Either text or input_ids should be provided.")
+
+        if self.text is not None:
+            is_single = isinstance(self.text, str)
+        else:
+            is_single = isinstance(self.input_ids[0], int)
+        self.is_single = is_single
+
+        if is_single:
+            if self.rid is None:
+                self.rid = uuid.uuid4().hex
+            if self.sampling_params is None:
+                self.sampling_params = {}
+            self.sampling_params["max_new_tokens"] = 1
+        else:
+            # support select operation
+            self.batch_size = (
+                len(self.text) if self.text is not None else len(self.input_ids)
+            )
+            if self.rid is None:
+                self.rid = [uuid.uuid4().hex for _ in range(self.batch_size)]
+            else:
+                if not isinstance(self.rid, list):
+                    raise ValueError("The rid should be a list.")
+            if self.sampling_params is None:
+                self.sampling_params = [{}] * self.batch_size
+            for i in range(self.batch_size):
+                self.sampling_params[i]["max_new_tokens"] = 1
+
+
+@dataclass
+class TokenizedEmbeddingReqInput:
+    rid: str
+    input_text: str
+    input_ids: List[int]
+    sampling_params: SamplingParams
+
+
 @dataclass
 class BatchTokenIDOut:
     rids: List[str]
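EmbeddingReqInput reuses the generate-request plumbing but pins a dummy max_new_tokens of 1, since an embedding request decodes nothing. A minimal sketch, assuming only the fields shown above, of how post_init normalizes a single prompt versus a batch (the prompt strings are made up):

from sglang.srt.managers.io_struct import EmbeddingReqInput

# Single prompt: rid and sampling_params are filled in automatically.
single = EmbeddingReqInput(text="the quick brown fox")
single.post_init()
print(single.is_single)                          # True
print(single.sampling_params["max_new_tokens"])  # 1 (dummy value)

# Batch of prompts: one rid and one sampling_params dict per prompt.
batch = EmbeddingReqInput(text=["hello", "world"])
batch.post_init()
print(batch.batch_size, len(batch.rid), len(batch.sampling_params))  # 2 2 2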
@@ -187,6 +242,14 @@ class BatchStrOut:
     finished_reason: List[BaseFinishReason]
 
 
+@dataclass
+class BatchEmbeddingOut:
+    rids: List[str]
+    embeddings: List[List[float]]
+    meta_info: List[Dict]
+    finished_reason: List[BaseFinishReason]
+
+
 @dataclass
 class FlushCacheReq:
     pass
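BatchEmbeddingOut is the embedding counterpart of BatchStrOut: one pooled vector plus per-request meta info, keyed by request id. A hedged, hand-constructed example; the vectors and meta fields below are invented for illustration:

from sglang.srt.managers.io_struct import BatchEmbeddingOut

out = BatchEmbeddingOut(
    rids=["req-0", "req-1"],
    embeddings=[[0.12, -0.34, 0.56], [0.98, 0.01, -0.44]],  # made-up 3-dim vectors
    meta_info=[{"prompt_tokens": 5}, {"prompt_tokens": 7}],
    finished_reason=[None, None],  # populated by the server in practice
)
print(len(out.rids), len(out.embeddings[0]))  # 2 3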
sglang/srt/managers/policy_scheduler.py

@@ -15,44 +15,54 @@ limitations under the License.
 
 """Request policy scheduler"""
 
+import os
 import random
 from collections import defaultdict
+from contextlib import contextmanager
+from typing import Dict, List, Optional
+
+from sglang.srt.managers.schedule_batch import Req, ScheduleBatch
+from sglang.srt.mem_cache.base_prefix_cache import BasePrefixCache
+from sglang.srt.mem_cache.radix_cache import TreeNode
+
+# Clip the estimation of max_new_tokens for the request whose max_new_tokens is very large.
+# This can prevent the server from being too conservative.
+# Note that this only clips the estimation in the scheduler but does not change the stop
+# condition. The request can still generate tokens until it hits the unclipped max_new_tokens.
+CLIP_MAX_NEW_TOKENS = int(os.environ.get("SGLANG_CLIP_MAX_NEW_TOKENS", "4096"))
 
 
 class PolicyScheduler:
-    def __init__(
-        self,
-        policy,
-        max_running_seqs,
-        max_prefill_num_tokens,
-        max_total_num_tokens,
-        tree_cache,
-    ):
-        if tree_cache.disable and policy == "lpm":
-            # LMP is meaningless when the tree cache is disabled.
+    def __init__(self, policy: str, tree_cache: BasePrefixCache):
+        if tree_cache.disable and policy in ["lpm", "dfs-weight"]:
+            # LPM and DFS-weight is meaningless when the tree cache is disabled.
             policy = "fcfs"
 
         self.policy = policy
-        self.max_running_seqs = max_running_seqs
-        self.max_prefill_num_tokens = max_prefill_num_tokens
-        self.max_total_num_tokens = max_total_num_tokens
         self.tree_cache = tree_cache
 
-    def get_priority_queue(self, waiting_queue):
+    def calc_priority(self, waiting_queue: List[Req]):
+        # Compute matched prefix length
+        prefix_computed = False
+        if self.policy in ["lpm", "dfs-weight"]:
+            for r in waiting_queue:
+                # NOTE: the prefix_indices must always be aligned with last_node
+                r.prefix_indices, r.last_node = self.tree_cache.match_prefix(
+                    rid=r.rid, key=r.adjust_max_prefix_ids()
+                )
+            prefix_computed = True
+
         if self.policy == "lpm":
-            # longest prefix match
+            # Longest Prefix Match
             waiting_queue.sort(key=lambda x: -len(x.prefix_indices))
-            return waiting_queue
         elif self.policy == "fcfs":
             # first come first serve
-            return waiting_queue
+            pass
         elif self.policy == "lof":
             # longest output first
             waiting_queue.sort(key=lambda x: -x.sampling_params.max_new_tokens)
-            return waiting_queue
         elif self.policy == "random":
             random.shuffle(waiting_queue)
-            return waiting_queue
         elif self.policy == "dfs-weight":
             last_node_to_reqs = defaultdict(list)
             for req in waiting_queue:
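CLIP_MAX_NEW_TOKENS only caps the scheduler's memory estimate; a request can still run to its real max_new_tokens. A small sketch of the reservation arithmetic, using a hypothetical helper name (estimated_decode_budget is not part of sglang):

import os

CLIP_MAX_NEW_TOKENS = int(os.environ.get("SGLANG_CLIP_MAX_NEW_TOKENS", "4096"))

def estimated_decode_budget(max_new_tokens: int, num_generated: int) -> int:
    # The scheduler reserves at most CLIP_MAX_NEW_TOKENS more tokens per request,
    # even though generation may continue past that estimate.
    return min(max_new_tokens - num_generated, CLIP_MAX_NEW_TOKENS)

print(estimated_decode_budget(32768, 100))  # 4096, not 32668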
@@ -63,23 +73,161 @@ class PolicyScheduler:
                 node_to_weight[node] = len(last_node_to_reqs[node])
             self.calc_weight(self.tree_cache.root_node, node_to_weight)
 
-            q = []
+            waiting_queue.clear()
             self.get_dfs_priority(
-                self.tree_cache.root_node, node_to_weight, last_node_to_reqs, q
+                self.tree_cache.root_node,
+                node_to_weight,
+                last_node_to_reqs,
+                waiting_queue,
             )
-            assert len(q) == len(waiting_queue)
-            return q
         else:
             raise ValueError(f"Unknown schedule_policy: {self.policy}")
 
-    def calc_weight(self, cur_node, node_to_weight):
+        return prefix_computed
+
+    def calc_weight(self, cur_node: TreeNode, node_to_weight: Dict):
         for child in cur_node.children.values():
             self.calc_weight(child, node_to_weight)
             node_to_weight[cur_node] += node_to_weight[child]
 
-    def get_dfs_priority(self, cur_node, node_to_priority, last_node_to_reqs, q):
+    def get_dfs_priority(
+        self,
+        cur_node: TreeNode,
+        node_to_priority: Dict,
+        last_node_to_reqs: Dict,
+        q: List,
+    ):
         childs = [child for child in cur_node.children.values()]
         childs.sort(key=lambda x: -node_to_priority[x])
         for child in childs:
             self.get_dfs_priority(child, node_to_priority, last_node_to_reqs, q)
         q.extend(last_node_to_reqs[cur_node])
+
+
+class PrefillAdder:
+    def __init__(
+        self,
+        tree_cache: BasePrefixCache,
+        rem_total_tokens: int,
+        rem_input_tokens: int,
+        rem_chunk_tokens: Optional[int],
+    ):
+        self.tree_cache = tree_cache
+        self.rem_total_tokens = rem_total_tokens
+        self.rem_input_tokens = rem_input_tokens
+        self.rem_chunk_tokens = rem_chunk_tokens
+
+        self.can_run_list = []
+        self.new_inflight_req = None
+        self.log_hit_tokens = 0
+        self.log_input_tokens = 0
+
+    def no_remaining_tokens(self):
+        return (
+            self.rem_total_tokens <= 0
+            or self.rem_input_tokens <= 0
+            or (
+                self.rem_chunk_tokens <= 0
+                if self.rem_chunk_tokens is not None
+                else False
+            )
+        )
+
+    def remove_running_tokens(
+        self, running_batch: ScheduleBatch, new_token_ratio: float
+    ):
+        self.rem_total_tokens -= sum(
+            [
+                min(
+                    (r.sampling_params.max_new_tokens - len(r.output_ids)),
+                    CLIP_MAX_NEW_TOKENS,
+                )
+                * new_token_ratio
+                for r in running_batch.reqs
+            ]
+        )
+
+    def _prefill_one_req(
+        self, prefix_len: int, extend_input_len: int, max_new_tokens: int
+    ):
+        self.rem_total_tokens -= extend_input_len + max_new_tokens
+        self.rem_input_tokens -= extend_input_len
+        if self.rem_chunk_tokens is not None:
+            self.rem_chunk_tokens -= extend_input_len
+
+        self.log_hit_tokens += prefix_len
+        self.log_input_tokens += extend_input_len
+
+    def add_inflight_req(self, req: Req):
+        truncated = req.extend_input_len > self.rem_chunk_tokens
+        req.extend_input_len = min(req.extend_input_len, self.rem_chunk_tokens)
+        req.fill_ids = req.fill_ids[: len(req.prefix_indices) + req.extend_input_len]
+        self.can_run_list.append(req)
+
+        self._prefill_one_req(
+            len(req.prefix_indices),
+            req.extend_input_len,
+            (
+                min(req.sampling_params.max_new_tokens, CLIP_MAX_NEW_TOKENS)
+                if not truncated
+                else 0
+            ),
+        )
+
+        # Return if chunked prefill not finished
+        return req if truncated else None
+
+    @contextmanager
+    def _lock_node(self, last_node: TreeNode):
+        try:
+            delta = self.tree_cache.inc_lock_ref(last_node)
+            self.rem_total_tokens += delta
+            yield None
+        finally:
+            delta = self.tree_cache.dec_lock_ref(last_node)
+            self.rem_total_tokens += delta
+
+    def add_one_req(self, req: Req):
+        total_tokens = req.extend_input_len + min(
+            req.sampling_params.max_new_tokens, CLIP_MAX_NEW_TOKENS
+        )
+        input_tokens = req.extend_input_len
+        prefix_len = len(req.prefix_indices)
+
+        if total_tokens >= self.rem_total_tokens:
+            return False
+
+        if input_tokens > self.rem_input_tokens and len(self.can_run_list) != 0:
+            return False
+
+        with self._lock_node(req.last_node):
+            if total_tokens > self.rem_total_tokens:
+                return False
+
+            if (
+                self.rem_chunk_tokens is None
+                or input_tokens <= self.rem_chunk_tokens
+                or (req.return_logprob and req.normalized_prompt_logprob is None)
+            ):
+                # Non-chunked prefill
+                self.can_run_list.append(req)
+                self.tree_cache.inc_lock_ref(req.last_node)
+                self._prefill_one_req(
+                    prefix_len,
+                    input_tokens,
+                    min(req.sampling_params.max_new_tokens, CLIP_MAX_NEW_TOKENS),
+                )
+            else:
+                # Chunked prefill
+                trunc_len = self.rem_chunk_tokens
+                if trunc_len == 0:
+                    return False
+
+                req.extend_input_len = trunc_len
+                req.fill_ids = req.fill_ids[: len(req.prefix_indices) + trunc_len]
+                self.can_run_list.append(req)
+                self.new_inflight_req = req
+                self.tree_cache.inc_lock_ref(req.last_node)
+                self._prefill_one_req(prefix_len, trunc_len, 0)
+
+        return True
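PrefillAdder tracks the remaining KV-pool, prefill-input, and chunk budgets and collects the requests that fit into one prefill batch. A minimal, self-contained sketch of a driver loop; the Dummy* classes and the budget numbers are hypothetical stand-ins for sglang's real Req and prefix-cache objects:

from sglang.srt.managers.policy_scheduler import PrefillAdder

class DummyNode:
    pass

class DummyCache:
    # Lock bookkeeping is a no-op here; the real radix cache returns token deltas.
    def inc_lock_ref(self, node):
        return 0

    def dec_lock_ref(self, node):
        return 0

class DummyParams:
    max_new_tokens = 128

class DummyReq:
    def __init__(self, n_input):
        self.extend_input_len = n_input
        self.fill_ids = list(range(n_input))
        self.prefix_indices = []
        self.last_node = DummyNode()
        self.sampling_params = DummyParams()
        self.return_logprob = False
        self.normalized_prompt_logprob = None

adder = PrefillAdder(
    tree_cache=DummyCache(),
    rem_total_tokens=4096,   # free KV-pool slots
    rem_input_tokens=2048,   # max prefill tokens per batch
    rem_chunk_tokens=None,   # chunked prefill disabled
)

for req in [DummyReq(512), DummyReq(1024), DummyReq(4096)]:
    if not adder.add_one_req(req):
        break

print(len(adder.can_run_list))  # 2: the third request no longer fits the budget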