sglang 0.2.11__py3-none-any.whl → 0.2.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63)
  1. sglang/bench_latency.py +6 -4
  2. sglang/bench_serving.py +46 -22
  3. sglang/lang/compiler.py +2 -2
  4. sglang/lang/ir.py +3 -3
  5. sglang/srt/constrained/base_tool_cache.py +1 -1
  6. sglang/srt/constrained/fsm_cache.py +12 -2
  7. sglang/srt/layers/activation.py +33 -0
  8. sglang/srt/layers/{token_attention.py → decode_attention.py} +9 -5
  9. sglang/srt/layers/extend_attention.py +6 -1
  10. sglang/srt/layers/layernorm.py +65 -0
  11. sglang/srt/layers/logits_processor.py +5 -0
  12. sglang/srt/layers/pooler.py +50 -0
  13. sglang/srt/layers/{context_flashattention_nopad.py → prefill_attention.py} +5 -0
  14. sglang/srt/layers/radix_attention.py +2 -2
  15. sglang/srt/managers/detokenizer_manager.py +31 -9
  16. sglang/srt/managers/io_struct.py +63 -0
  17. sglang/srt/managers/policy_scheduler.py +173 -25
  18. sglang/srt/managers/schedule_batch.py +110 -87
  19. sglang/srt/managers/tokenizer_manager.py +193 -111
  20. sglang/srt/managers/tp_worker.py +289 -352
  21. sglang/srt/mem_cache/{base_cache.py → base_prefix_cache.py} +9 -4
  22. sglang/srt/mem_cache/chunk_cache.py +43 -20
  23. sglang/srt/mem_cache/memory_pool.py +2 -2
  24. sglang/srt/mem_cache/radix_cache.py +74 -40
  25. sglang/srt/model_executor/cuda_graph_runner.py +24 -9
  26. sglang/srt/model_executor/forward_batch_info.py +168 -105
  27. sglang/srt/model_executor/model_runner.py +24 -37
  28. sglang/srt/models/gemma2.py +0 -1
  29. sglang/srt/models/internlm2.py +2 -7
  30. sglang/srt/models/llama2.py +4 -4
  31. sglang/srt/models/llama_embedding.py +88 -0
  32. sglang/srt/models/qwen2_moe.py +0 -11
  33. sglang/srt/openai_api/adapter.py +155 -27
  34. sglang/srt/openai_api/protocol.py +37 -1
  35. sglang/srt/sampling/penaltylib/__init__.py +13 -0
  36. sglang/srt/sampling/penaltylib/orchestrator.py +357 -0
  37. sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +80 -0
  38. sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +105 -0
  39. sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +79 -0
  40. sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +83 -0
  41. sglang/srt/sampling_params.py +31 -4
  42. sglang/srt/server.py +69 -15
  43. sglang/srt/server_args.py +26 -19
  44. sglang/srt/utils.py +31 -13
  45. sglang/test/run_eval.py +10 -1
  46. sglang/test/runners.py +63 -63
  47. sglang/test/simple_eval_humaneval.py +2 -8
  48. sglang/test/simple_eval_mgsm.py +203 -0
  49. sglang/test/srt/sampling/penaltylib/utils.py +337 -0
  50. sglang/test/test_layernorm.py +60 -0
  51. sglang/test/test_programs.py +4 -2
  52. sglang/test/test_utils.py +20 -2
  53. sglang/utils.py +0 -1
  54. sglang/version.py +1 -1
  55. {sglang-0.2.11.dist-info → sglang-0.2.12.dist-info}/METADATA +23 -14
  56. sglang-0.2.12.dist-info/RECORD +112 -0
  57. sglang/srt/layers/linear.py +0 -884
  58. sglang/srt/layers/quantization/__init__.py +0 -64
  59. sglang/srt/layers/quantization/fp8.py +0 -677
  60. sglang-0.2.11.dist-info/RECORD +0 -102
  61. {sglang-0.2.11.dist-info → sglang-0.2.12.dist-info}/LICENSE +0 -0
  62. {sglang-0.2.11.dist-info → sglang-0.2.12.dist-info}/WHEEL +0 -0
  63. {sglang-0.2.11.dist-info → sglang-0.2.12.dist-info}/top_level.txt +0 -0
sglang/srt/mem_cache/{base_cache.py → base_prefix_cache.py}

@@ -1,4 +1,5 @@
 from abc import ABC, abstractmethod
+from typing import Callable


 class BasePrefixCache(ABC):
@@ -17,11 +18,15 @@ class BasePrefixCache(ABC):
         pass

     @abstractmethod
-    def cache_req(self, **kwargs):
+    def cache_finished_req(self, **kwargs):
         pass

     @abstractmethod
-    def evict(self, num_tokens, evict_callback):
+    def cache_unfinished_req(self, **kwargs):
+        pass
+
+    @abstractmethod
+    def evict(self, num_tokens: int, evict_callback: Callable):
         pass

     @abstractmethod
@@ -37,7 +42,7 @@ class BasePrefixCache(ABC):
         pass

     def total_size(self):
-        raise NotImplementedError
+        raise NotImplementedError()

     def pretty_print(self):
-        raise NotImplementedError
+        raise NotImplementedError()
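
The practical effect of this interface change shows up at call sites: the old cache_req(..., del_in_memory_pool=...) flag is replaced by two explicit methods. A minimal sketch of the new calling convention, assuming a scheduler that holds some BasePrefixCache implementation as tree_cache (the finished() check is illustrative, not part of this diff):

    # Hypothetical scheduler snippet; `req` follows the Req type used below.
    if req.finished():
        # Request is done: free its slot and drop the cache's reference.
        tree_cache.cache_finished_req(req)
    else:
        # Chunked prefill continues: keep the prefix cached and pinned.
        tree_cache.cache_unfinished_req(req)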
sglang/srt/mem_cache/chunk_cache.py

@@ -1,6 +1,14 @@
+from __future__ import annotations
+
 """Cache for chunked prefill, used when RadixCache is disabled."""

-from sglang.srt.mem_cache.base_cache import BasePrefixCache
+from typing import TYPE_CHECKING, Callable, List, Optional
+
+from sglang.srt.mem_cache.base_prefix_cache import BasePrefixCache
+from sglang.srt.mem_cache.memory_pool import BaseTokenToKVPool, ReqToTokenPool
+
+if TYPE_CHECKING:
+    from sglang.srt.managers.schedule_batch import Req


 class ChunkCacheEntry:
@@ -10,7 +18,9 @@ class ChunkCacheEntry:


 class ChunkCache(BasePrefixCache):
-    def __init__(self, req_to_token_pool, token_to_kv_pool):
+    def __init__(
+        self, req_to_token_pool: ReqToTokenPool, token_to_kv_pool: BaseTokenToKVPool
+    ):
         self.disable = True
         self.req_to_token_pool = req_to_token_pool
         self.token_to_kv_pool = token_to_kv_pool
@@ -20,34 +30,47 @@ class ChunkCache(BasePrefixCache):
     def reset(self):
         self.entries = {}

-    def match_prefix(self, rid, **kwargs):
+    def match_prefix(self, rid: int, key: List[int]):
         if rid not in self.entries:
             return [], None

         entry = self.entries[rid]
-        return entry.value, entry
+        max_prefix_len = len(key)
+        return entry.value[:max_prefix_len], entry

-    def cache_req(
-        self, rid, token_ids, req_pool_idx, del_in_memory_pool=True, **kwargs
-    ):
-        indices = self.req_to_token_pool.req_to_token[req_pool_idx, : len(token_ids)]
-        if del_in_memory_pool:
-            assert rid in self.entries
-            self.req_to_token_pool.free(req_pool_idx)
-            self.token_to_kv_pool.free(indices)
-            return
+    def cache_finished_req(self, req: Req, token_ids: Optional[List[int]] = None):
+        if token_ids is None:
+            token_ids = (req.origin_input_ids + req.output_ids)[:-1]

-        if rid not in self.entries:
-            self.entries[rid] = ChunkCacheEntry(rid, indices)
+        kv_indices = self.req_to_token_pool.req_to_token[
+            req.req_pool_idx, : len(token_ids)
+        ]
+        self.req_to_token_pool.free(req.req_pool_idx)
+        self.token_to_kv_pool.free(kv_indices)

-        entry = self.entries[rid]
-        entry.value = indices
-        return indices, entry
+        if req.rid in self.entries:
+            del self.entries[req.rid]
+
+    def cache_unfinished_req(self, req: Req, token_ids: Optional[List[int]] = None):
+        if token_ids is None:
+            token_ids = req.fill_ids
+
+        kv_indices = self.req_to_token_pool.req_to_token[
+            req.req_pool_idx, : len(token_ids)
+        ]
+
+        if req.rid not in self.entries:
+            self.entries[req.rid] = ChunkCacheEntry(req.rid, kv_indices)
+
+        entry = self.entries[req.rid]
+        entry.value = kv_indices
+        req.prefix_indices = kv_indices
+        req.last_node = entry

     def insert(self):
-        raise NotImplementedError
+        raise NotImplementedError()

-    def evict(self, num_tokens, evict_callback):
+    def evict(self, num_tokens: int, evict_callback: Callable):
         pass

     def inc_lock_ref(self, node):
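
ChunkCache keys its entries by request id rather than by token prefix, so each chunk of a chunked prefill can reuse the KV indices written by the previous chunk. A hedged sketch of that flow, with the pool objects and Req fields assumed to match the annotations above:

    # Minimal usage sketch, assuming initialized pools and a Req `req`.
    cache = ChunkCache(req_to_token_pool, token_to_kv_pool)

    cache.cache_unfinished_req(req)   # store this chunk's KV indices under req.rid
    prefix, entry = cache.match_prefix(req.rid, req.fill_ids)  # reuse them next chunk
    cache.cache_finished_req(req)     # free both pools and drop the entry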
sglang/srt/mem_cache/memory_pool.py

@@ -16,7 +16,7 @@ limitations under the License.
 """Memory pool."""

 import logging
-from typing import List
+from typing import List, Union

 import torch

@@ -42,7 +42,7 @@ class ReqToTokenPool:

         return select_index

-    def free(self, free_index):
+    def free(self, free_index: Union[int, List[int]]):
         if isinstance(free_index, (int,)):
             self.free_slots.append(free_index)
         else:
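
The widened annotation documents behavior free() already had: it accepts either a single request slot index or a batch of them. For illustration:

    req_to_token_pool.free(3)          # return one slot to the pool
    req_to_token_pool.free([4, 5, 6])  # return several slots at once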
sglang/srt/mem_cache/radix_cache.py

@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 """
 Copyright 2023-2024 SGLang Team
 Licensed under the Apache License, Version 2.0 (the "License");
@@ -20,10 +22,15 @@ The radix tree data structure for managing the KV cache.
 import heapq
 import time
 from collections import defaultdict
+from typing import TYPE_CHECKING, Callable, List, Optional

 import torch

-from sglang.srt.mem_cache.base_cache import BasePrefixCache
+from sglang.srt.mem_cache.base_prefix_cache import BasePrefixCache
+from sglang.srt.mem_cache.memory_pool import BaseTokenToKVPool, ReqToTokenPool
+
+if TYPE_CHECKING:
+    from sglang.srt.managers.schedule_batch import Req


 class TreeNode:
@@ -39,7 +46,7 @@ class TreeNode:
         return self.last_access_time < other.last_access_time


-def _key_match(key0, key1):
+def _key_match(key0: List, key1: List):
     i = 0
     for k0, k1 in zip(key0, key1):
         if k0 != k1:
@@ -49,7 +56,12 @@ def _key_match(key0, key1):


 class RadixCache(BasePrefixCache):
-    def __init__(self, req_to_token_pool, token_to_kv_pool, disable: bool = False):
+    def __init__(
+        self,
+        req_to_token_pool: ReqToTokenPool,
+        token_to_kv_pool: BaseTokenToKVPool,
+        disable: bool = False,
+    ):
         self.req_to_token_pool = req_to_token_pool
         self.token_to_kv_pool = token_to_kv_pool
         self.disable = disable
@@ -64,7 +76,7 @@ class RadixCache(BasePrefixCache):
         self.root_node.lock_ref = 1
         self.evictable_size_ = 0

-    def match_prefix(self, key, **kwargs):
+    def match_prefix(self, key: List, **kwargs):
         if self.disable:
             return [], self.root_node

@@ -74,10 +86,10 @@ class RadixCache(BasePrefixCache):
         if value:
             value = torch.concat(value)
         else:
-            value = torch.tensor([], dtype=torch.int64)
+            value = torch.tensor([], dtype=torch.int32)
         return value, last_node[0]

-    def insert(self, key, value=None):
+    def insert(self, key: List, value=None):
         if self.disable:
             return 0

@@ -85,40 +97,54 @@ class RadixCache(BasePrefixCache):
             value = [x for x in key]
         return self._insert_helper(self.root_node, key, value)

-    def cache_req(
-        self,
-        token_ids,
-        last_uncached_pos,
-        req_pool_idx,
-        del_in_memory_pool=True,
-        old_last_node=None,
-        **kwargs,
-    ):
-        # Insert the request into radix cache
-        indices = self.req_to_token_pool.req_to_token[req_pool_idx, : len(token_ids)]
-        new_prefix_len = self.insert(token_ids, indices.clone())
+    def cache_finished_req(self, req: Req, token_ids: Optional[List[int]] = None):
+        """Cache request when it finishes."""
+        if token_ids is None:
+            token_ids = (req.origin_input_ids + req.output_ids)[:-1]
+        kv_indices = self.req_to_token_pool.req_to_token[
+            req.req_pool_idx, : len(token_ids)
+        ]

         if self.disable:
-            if del_in_memory_pool:
-                self.token_to_kv_pool.free(indices)
-            else:
-                return torch.tensor([], dtype=torch.int64), self.root_node
+            self.token_to_kv_pool.free(kv_indices)
+            self.req_to_token_pool.free(req.req_pool_idx)
+            return

         # Radix Cache takes one ref in memory pool
-        self.token_to_kv_pool.free(indices[last_uncached_pos:new_prefix_len])
+        new_prefix_len = self.insert(token_ids, kv_indices.clone())
+        self.token_to_kv_pool.free(kv_indices[len(req.prefix_indices) : new_prefix_len])

-        if del_in_memory_pool:
-            self.req_to_token_pool.free(req_pool_idx)
-        else:
-            cached_indices, new_last_node = self.match_prefix(token_ids)
-            assert len(cached_indices) == len(token_ids)
+        # Remove req slot release the cache lock
+        self.req_to_token_pool.free(req.req_pool_idx)
+        self.dec_lock_ref(req.last_node)
+
+    def cache_unfinished_req(self, req: Req, token_ids: Optional[List[int]] = None):
+        """Cache request when it is unfinished."""
+        if self.disable:
+            return

-            self.req_to_token_pool.req_to_token[
-                req_pool_idx, last_uncached_pos : len(cached_indices)
-            ] = cached_indices[last_uncached_pos:]
-            self.dec_lock_ref(old_last_node)
-            self.inc_lock_ref(new_last_node)
-            return cached_indices, new_last_node
+        if token_ids is None:
+            token_ids = req.fill_ids
+
+        kv_indices = self.req_to_token_pool.req_to_token[
+            req.req_pool_idx, : len(token_ids)
+        ]
+
+        # Radix Cache takes one ref in memory pool
+        new_prefix_len = self.insert(token_ids, kv_indices.clone())
+        self.token_to_kv_pool.free(kv_indices[len(req.prefix_indices) : new_prefix_len])
+
+        # The prefix indices could be updated, reuse it
+        new_indices, new_last_node = self.match_prefix(token_ids)
+        assert len(new_indices) == len(token_ids)
+        self.req_to_token_pool.req_to_token[
+            req.req_pool_idx, len(req.prefix_indices) : len(new_indices)
+        ] = new_indices[len(req.prefix_indices) :]
+
+        self.dec_lock_ref(req.last_node)
+        self.inc_lock_ref(new_last_node)
+        req.prefix_indices = new_indices
+        req.last_node = new_last_node

     def pretty_print(self):
         self._print_helper(self.root_node, 0)
@@ -127,7 +153,7 @@ class RadixCache(BasePrefixCache):
     def total_size(self):
         return self._total_size_helper(self.root_node)

-    def evict(self, num_tokens, evict_callback):
+    def evict(self, num_tokens: int, evict_callback: Callable):
         if self.disable:
             return

@@ -151,6 +177,9 @@ class RadixCache(BasePrefixCache):
                 heapq.heappush(leaves, x.parent)

     def inc_lock_ref(self, node: TreeNode):
+        if self.disable:
+            return 0
+
         delta = 0
         while node != self.root_node:
             if node.lock_ref == 0:
@@ -161,6 +190,9 @@ class RadixCache(BasePrefixCache):
         return delta

     def dec_lock_ref(self, node: TreeNode):
+        if self.disable:
+            return 0
+
         delta = 0
         while node != self.root_node:
             if node.lock_ref == 1:
@@ -175,7 +207,9 @@ class RadixCache(BasePrefixCache):

     ##### Internal Helper Functions #####

-    def _match_prefix_helper(self, node, key, value, last_node):
+    def _match_prefix_helper(
+        self, node: TreeNode, key: List, value, last_node: TreeNode
+    ):
         node.last_access_time = time.time()
         if len(key) == 0:
             return
@@ -192,7 +226,7 @@ class RadixCache(BasePrefixCache):
                 last_node[0] = child
                 self._match_prefix_helper(child, key[prefix_len:], value, last_node)

-    def _split_node(self, key, child: TreeNode, split_len):
+    def _split_node(self, key, child: TreeNode, split_len: int):
         # new_node -> child
         new_node = TreeNode()
         new_node.children = {key[split_len:][0]: child}
@@ -206,7 +240,7 @@ class RadixCache(BasePrefixCache):
         new_node.parent.children[key[:split_len][0]] = new_node
         return new_node

-    def _insert_helper(self, node, key, value):
+    def _insert_helper(self, node: TreeNode, key: List, value):
         node.last_access_time = time.time()
         if len(key) == 0:
             return 0
@@ -237,7 +271,7 @@ class RadixCache(BasePrefixCache):
             self.evictable_size_ += len(value)
             return 0

-    def _print_helper(self, node: TreeNode, indent):
+    def _print_helper(self, node: TreeNode, indent: int):
         for _, child in node.children.items():
             print(" " * indent, len(child.key), child.key[:10], f"r={child.lock_ref}")
             self._print_helper(child, indent=indent + 2)
@@ -249,7 +283,7 @@ class RadixCache(BasePrefixCache):
             del node.parent.children[k]
         self.evictable_size_ -= len(node.key)

-    def _total_size_helper(self, node):
+    def _total_size_helper(self, node: TreeNode):
         x = len(node.value)
         for child in node.children.values():
             x += self._total_size_helper(child)
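
RadixCache mirrors the ChunkCache flow but adds per-node reference counting: an unfinished request keeps its prefix path pinned so evict() cannot reclaim it, and cache_finished_req releases the pin. A hedged lifecycle sketch built only from the methods in this diff (pool objects and `req` are assumed initialized as annotated above):

    cache = RadixCache(req_to_token_pool, token_to_kv_pool, disable=False)

    # Before prefill: find the longest cached prefix and pin it.
    prefix, last_node = cache.match_prefix(key=req.fill_ids)
    cache.inc_lock_ref(last_node)

    # After each prefill chunk: insert new tokens, re-match, and re-pin.
    cache.cache_unfinished_req(req)

    # After the final token: insert, free the req slot, and unpin.
    cache.cache_finished_req(req)

    # Under memory pressure, unlocked leaves are reclaimed least-recently-used
    # first (TreeNode orders by last_access_time in the eviction heap).
    cache.evict(num_tokens=512, evict_callback=token_to_kv_pool.free)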
sglang/srt/model_executor/cuda_graph_runner.py

@@ -33,7 +33,7 @@ from sglang.srt.managers.schedule_batch import ScheduleBatch
 from sglang.srt.model_executor.forward_batch_info import (
     ForwardMode,
     InputMetadata,
-    init_flashinfer_args,
+    update_flashinfer_indices,
 )
 from sglang.srt.utils import monkey_patch_vllm_all_gather

@@ -71,6 +71,18 @@ def patch_model(
         tp_group.ca_comm = backup_ca_comm


+def set_torch_compile_config():
+    import torch._dynamo.config
+    import torch._inductor.config
+
+    torch._inductor.config.coordinate_descent_tuning = True
+    torch._inductor.config.triton.unique_kernel_names = True
+    torch._inductor.config.fx_graph_cache = True  # Experimental feature to reduce compilation times, will be on by default in future
+
+    # FIXME: tmp workaround
+    torch._dynamo.config.accumulated_cache_size_limit = 1024
+
+
 class CudaGraphRunner:
     def __init__(self, model_runner, max_batch_size_to_capture, use_torch_compile):
         self.model_runner = model_runner
@@ -112,6 +124,9 @@ class CudaGraphRunner:

         self.compile_bs = [1, 2, 4, 8, 16, 24, 32] if use_torch_compile else []

+        if use_torch_compile:
+            set_torch_compile_config()
+
     def can_run(self, batch_size):
         return batch_size < self.max_bs

@@ -165,7 +180,7 @@ class CudaGraphRunner:
             paged_kv_indices_buffer=self.flashinfer_kv_indices,
             paged_kv_last_page_len_buffer=self.flashinfer_kv_last_page_len[:bs],
         )
-        init_flashinfer_args(
+        update_flashinfer_indices(
             ForwardMode.DECODE,
             self.model_runner,
             req_pool_indices,
@@ -176,19 +191,19 @@ class CudaGraphRunner:

         # Run and capture
         def run_once():
-            input_metadata = InputMetadata.create(
-                self.model_runner,
+            input_metadata = InputMetadata(
                 forward_mode=ForwardMode.DECODE,
+                batch_size=bs,
                 req_pool_indices=req_pool_indices,
                 seq_lens=seq_lens,
-                prefix_lens=None,
-                position_ids_offsets=position_ids_offsets,
+                req_to_token_pool=self.model_runner.req_to_token_pool,
+                token_to_kv_pool=self.model_runner.token_to_kv_pool,
                 out_cache_loc=out_cache_loc,
                 return_logprob=False,
                 top_logprobs_nums=0,
-                skip_flashinfer_init=True,
+                positions=(seq_lens - 1).to(torch.int64),
+                flashinfer_decode_wrapper=flashinfer_decode_wrapper,
             )
-            input_metadata.flashinfer_decode_wrapper = flashinfer_decode_wrapper

             return forward(input_ids, input_metadata.positions, input_metadata)

@@ -222,7 +237,7 @@ class CudaGraphRunner:
         self.out_cache_loc[:raw_bs] = batch.out_cache_loc

         # FlashInfer inputs
-        init_flashinfer_args(
+        update_flashinfer_indices(
             ForwardMode.DECODE,
             self.model_runner,
             self.req_pool_indices[:bs],
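
The new set_torch_compile_config() gathers the inductor/dynamo settings in one place and is applied only when compilation is requested. A minimal construction sketch under that flag (model_runner is assumed to be an already-initialized model runner; the batch sizes are illustrative):

    runner = CudaGraphRunner(
        model_runner,
        max_batch_size_to_capture=32,
        use_torch_compile=True,  # triggers set_torch_compile_config()
    )

    # Decode batches below the captured maximum can replay a CUDA graph;
    # larger batches fall back to a normal forward pass.
    if runner.can_run(batch_size=8):
        ...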