sglang 0.4.3.post4__py3-none-any.whl → 0.4.4.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (131)
  1. sglang/bench_serving.py +1 -1
  2. sglang/lang/chat_template.py +29 -0
  3. sglang/srt/_custom_ops.py +19 -17
  4. sglang/srt/configs/__init__.py +2 -0
  5. sglang/srt/configs/janus_pro.py +629 -0
  6. sglang/srt/configs/model_config.py +24 -14
  7. sglang/srt/conversation.py +80 -2
  8. sglang/srt/custom_op.py +64 -3
  9. sglang/srt/distributed/device_communicators/custom_all_reduce.py +18 -17
  10. sglang/srt/distributed/parallel_state.py +10 -1
  11. sglang/srt/entrypoints/engine.py +5 -3
  12. sglang/srt/entrypoints/http_server.py +1 -1
  13. sglang/srt/function_call_parser.py +33 -2
  14. sglang/srt/hf_transformers_utils.py +16 -1
  15. sglang/srt/layers/attention/flashinfer_backend.py +1 -1
  16. sglang/srt/layers/attention/flashinfer_mla_backend.py +317 -57
  17. sglang/srt/layers/attention/triton_backend.py +1 -3
  18. sglang/srt/layers/attention/triton_ops/decode_attention.py +6 -6
  19. sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +3 -3
  20. sglang/srt/layers/attention/triton_ops/extend_attention.py +4 -4
  21. sglang/srt/layers/attention/triton_ops/rocm_mla_decode_rope.py +3 -3
  22. sglang/srt/layers/attention/vision.py +43 -62
  23. sglang/srt/layers/dp_attention.py +30 -2
  24. sglang/srt/layers/elementwise.py +411 -0
  25. sglang/srt/layers/linear.py +1 -1
  26. sglang/srt/layers/logits_processor.py +1 -0
  27. sglang/srt/layers/moe/ep_moe/kernels.py +2 -1
  28. sglang/srt/layers/moe/ep_moe/layer.py +25 -9
  29. sglang/srt/layers/moe/fused_moe_triton/configs/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  30. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  31. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  32. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  33. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  34. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
  35. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +63 -23
  36. sglang/srt/layers/moe/fused_moe_triton/layer.py +16 -4
  37. sglang/srt/layers/moe/router.py +342 -0
  38. sglang/srt/layers/parameter.py +10 -0
  39. sglang/srt/layers/quantization/__init__.py +90 -68
  40. sglang/srt/layers/quantization/blockwise_int8.py +1 -2
  41. sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  42. sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  43. sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  44. sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  45. sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  46. sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  47. sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  48. sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  49. sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  50. sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  51. sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  52. sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  53. sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  54. sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  55. sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  56. sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  57. sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  58. sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  59. sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  60. sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  61. sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  62. sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  63. sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  64. sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  65. sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  66. sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  67. sglang/srt/layers/quantization/fp8.py +174 -106
  68. sglang/srt/layers/quantization/fp8_kernel.py +210 -38
  69. sglang/srt/layers/quantization/fp8_utils.py +156 -15
  70. sglang/srt/layers/quantization/modelopt_quant.py +5 -1
  71. sglang/srt/layers/quantization/w8a8_fp8.py +128 -0
  72. sglang/srt/layers/quantization/w8a8_int8.py +152 -3
  73. sglang/srt/layers/rotary_embedding.py +5 -3
  74. sglang/srt/layers/sampler.py +29 -35
  75. sglang/srt/layers/vocab_parallel_embedding.py +0 -1
  76. sglang/srt/lora/backend/__init__.py +9 -12
  77. sglang/srt/managers/cache_controller.py +74 -8
  78. sglang/srt/managers/data_parallel_controller.py +1 -1
  79. sglang/srt/managers/image_processor.py +37 -631
  80. sglang/srt/managers/image_processors/base_image_processor.py +219 -0
  81. sglang/srt/managers/image_processors/janus_pro.py +79 -0
  82. sglang/srt/managers/image_processors/llava.py +152 -0
  83. sglang/srt/managers/image_processors/minicpmv.py +86 -0
  84. sglang/srt/managers/image_processors/mlama.py +60 -0
  85. sglang/srt/managers/image_processors/qwen_vl.py +161 -0
  86. sglang/srt/managers/io_struct.py +32 -15
  87. sglang/srt/managers/multi_modality_padding.py +134 -0
  88. sglang/srt/managers/schedule_batch.py +213 -118
  89. sglang/srt/managers/schedule_policy.py +40 -8
  90. sglang/srt/managers/scheduler.py +176 -683
  91. sglang/srt/managers/scheduler_output_processor_mixin.py +614 -0
  92. sglang/srt/managers/tokenizer_manager.py +6 -6
  93. sglang/srt/managers/tp_worker_overlap_thread.py +4 -1
  94. sglang/srt/mem_cache/base_prefix_cache.py +6 -8
  95. sglang/srt/mem_cache/chunk_cache.py +12 -44
  96. sglang/srt/mem_cache/hiradix_cache.py +71 -34
  97. sglang/srt/mem_cache/memory_pool.py +81 -17
  98. sglang/srt/mem_cache/paged_allocator.py +283 -0
  99. sglang/srt/mem_cache/radix_cache.py +117 -36
  100. sglang/srt/model_executor/cuda_graph_runner.py +68 -20
  101. sglang/srt/model_executor/forward_batch_info.py +23 -10
  102. sglang/srt/model_executor/model_runner.py +63 -63
  103. sglang/srt/model_loader/loader.py +2 -1
  104. sglang/srt/model_loader/weight_utils.py +1 -1
  105. sglang/srt/models/deepseek_janus_pro.py +2127 -0
  106. sglang/srt/models/deepseek_nextn.py +23 -3
  107. sglang/srt/models/deepseek_v2.py +200 -191
  108. sglang/srt/models/grok.py +374 -119
  109. sglang/srt/models/minicpmv.py +28 -89
  110. sglang/srt/models/mllama.py +1 -1
  111. sglang/srt/models/qwen2.py +0 -1
  112. sglang/srt/models/qwen2_5_vl.py +25 -50
  113. sglang/srt/models/qwen2_vl.py +33 -49
  114. sglang/srt/openai_api/adapter.py +59 -35
  115. sglang/srt/openai_api/protocol.py +8 -1
  116. sglang/srt/sampling/penaltylib/frequency_penalty.py +0 -1
  117. sglang/srt/sampling/penaltylib/presence_penalty.py +0 -1
  118. sglang/srt/server_args.py +24 -16
  119. sglang/srt/speculative/eagle_worker.py +75 -39
  120. sglang/srt/utils.py +104 -9
  121. sglang/test/runners.py +104 -10
  122. sglang/test/test_block_fp8.py +106 -16
  123. sglang/test/test_custom_ops.py +88 -0
  124. sglang/test/test_utils.py +20 -4
  125. sglang/utils.py +0 -4
  126. sglang/version.py +1 -1
  127. {sglang-0.4.3.post4.dist-info → sglang-0.4.4.post1.dist-info}/METADATA +9 -10
  128. {sglang-0.4.3.post4.dist-info → sglang-0.4.4.post1.dist-info}/RECORD +131 -84
  129. {sglang-0.4.3.post4.dist-info → sglang-0.4.4.post1.dist-info}/WHEEL +1 -1
  130. {sglang-0.4.3.post4.dist-info → sglang-0.4.4.post1.dist-info}/LICENSE +0 -0
  131. {sglang-0.4.3.post4.dist-info → sglang-0.4.4.post1.dist-info}/top_level.txt +0 -0
sglang/srt/mem_cache/chunk_cache.py
@@ -1,7 +1,8 @@
 from __future__ import annotations
 
 """Cache for chunked prefill, used when RadixCache is disabled."""
-from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Tuple
+
+from typing import TYPE_CHECKING, Any, Callable, List, Tuple
 
 import torch
 
@@ -24,73 +25,40 @@ class ChunkCache(BasePrefixCache):
         req_to_token_pool: ReqToTokenPool,
         token_to_kv_pool_allocator: TokenToKVPoolAllocator,
     ):
-        self.disable = True
         self.req_to_token_pool = req_to_token_pool
         self.token_to_kv_pool_allocator = token_to_kv_pool_allocator
-        self.entries: Dict[str, ChunkCacheEntry] = {}
-
-        self.reset()
 
     def reset(self):
-        self.entries = {}
-
-    def match_prefix(self, rid: int, key: List[int]) -> Tuple[List[int], int]:
-        if rid not in self.entries:
-            return [], None
-
-        entry = self.entries[rid]
-        max_prefix_len = len(key)
-        return entry.value[:max_prefix_len], entry
+        pass
 
-    def cache_finished_req(self, req: Req, token_ids: Optional[List[int]] = None):
-        if token_ids is None:
-            token_id_len = len(req.origin_input_ids) + len(req.output_ids) - 1
-        else:
-            token_id_len = len(token_ids)
+    def match_prefix(self, **unused_kwargs) -> Tuple[List[int], int]:
+        return [], None
 
+    def cache_finished_req(self, req: Req):
         kv_indices = self.req_to_token_pool.req_to_token[
-            req.req_pool_idx, :token_id_len
+            req.req_pool_idx, : len(req.origin_input_ids) + len(req.output_ids) - 1
         ]
         self.req_to_token_pool.free(req.req_pool_idx)
         self.token_to_kv_pool_allocator.free(kv_indices)
 
-        if req.rid in self.entries:
-            del self.entries[req.rid]
-
     def cache_unfinished_req(self, req: Req):
-        token_id_len = len(req.fill_ids)
-
         kv_indices = self.req_to_token_pool.req_to_token[
-            req.req_pool_idx, :token_id_len
+            req.req_pool_idx, : len(req.fill_ids)
         ]
 
-        if req.rid not in self.entries:
-            self.entries[req.rid] = ChunkCacheEntry(req.rid, kv_indices)
-
-        entry = self.entries[req.rid]
-        entry.value = kv_indices
+        # `req.prefix_indices` will be used in `PrefillAdder::add_chunked_req` later
         req.prefix_indices = kv_indices
-        req.last_node = entry
 
     def insert(self):
         raise NotImplementedError()
 
-    def evict(self, num_tokens: int, evict_callback: Callable):
+    def evict(self, num_tokens: int):
         pass
 
-    def inc_lock_ref(self, node):
+    def inc_lock_ref(self, node: Any):
         return 0
 
-    def dec_lock_ref(self, node):
-        return 0
-
-    def evictable_size(self):
-        return 0
-
-    def pretty_print(self):
-        return ""
-
-    def protected_size(self):
+    def dec_lock_ref(self, node: Any):
         return 0
 
     def pretty_print(self):
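The rewritten ChunkCache is now a stateless fallback used when the radix cache is disabled: match_prefix always reports a miss, and cache_finished_req derives the token count from the request itself. Below is a minimal usage sketch, assuming the constructor takes exactly the two pools shown above; FakeReqToTokenPool, FakeKVAllocator, and SimpleReq are hypothetical stand-ins that model only the calls ChunkCache makes, and the driver code is illustrative, not sglang's scheduler.

# Illustrative only: requires the sglang wheel for ChunkCache; everything else is a stub.
import torch
from sglang.srt.mem_cache.chunk_cache import ChunkCache

class FakeReqToTokenPool:
    def __init__(self, max_reqs, max_tokens):
        self.req_to_token = torch.zeros(max_reqs, max_tokens, dtype=torch.int64)
    def free(self, req_pool_idx):
        pass  # slot bookkeeping omitted in this sketch

class FakeKVAllocator:
    def free(self, indices):
        pass  # slot bookkeeping omitted in this sketch

class SimpleReq:
    def __init__(self, rid, input_ids):
        self.rid, self.req_pool_idx = rid, 0
        self.origin_input_ids = input_ids
        self.fill_ids = input_ids      # prompt tokens filled so far
        self.output_ids = []
        self.prefix_indices = None

cache = ChunkCache(FakeReqToTokenPool(4, 64), FakeKVAllocator())
req = SimpleReq("r0", [1, 2, 3, 4])

prefix, node = cache.match_prefix(rid=req.rid, key=req.fill_ids)
assert prefix == [] and node is None        # always a miss: no radix tree is kept

cache.cache_unfinished_req(req)             # records kv indices on req.prefix_indices
req.output_ids = [5, 6]
cache.cache_finished_req(req)               # frees the req slot and its kv indices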
sglang/srt/mem_cache/hiradix_cache.py
@@ -1,5 +1,6 @@
 import heapq
 import logging
+import threading
 import time
 from typing import List, Optional
 
@@ -7,11 +8,12 @@ import torch
 
 from sglang.srt.managers.cache_controller import HiCacheController
 from sglang.srt.mem_cache.memory_pool import (
-    MHATokenToKVPool,
     MHATokenToKVPoolHost,
     ReqToTokenPool,
+    TokenToKVPoolAllocator,
 )
-from sglang.srt.mem_cache.radix_cache import RadixCache, TreeNode, _key_match
+from sglang.srt.mem_cache.radix_cache import RadixCache, TreeNode
+from sglang.srt.mem_cache.radix_cache import _key_match_page_size1 as _key_match
 
 logger = logging.getLogger(__name__)
 
@@ -21,11 +23,25 @@ class HiRadixCache(RadixCache):
     def __init__(
         self,
         req_to_token_pool: ReqToTokenPool,
-        token_to_kv_pool: MHATokenToKVPool,
+        token_to_kv_pool_allocator: TokenToKVPoolAllocator,
+        tp_cache_group: torch.distributed.ProcessGroup,
+        page_size: int,
     ):
-        self.token_to_kv_pool_host = MHATokenToKVPoolHost(token_to_kv_pool)
+        if page_size != 1:
+            raise ValueError(
+                "Page size larger than 1 is not yet supported in HiRadixCache."
+            )
+        self.token_to_kv_pool_host = MHATokenToKVPoolHost(
+            token_to_kv_pool_allocator.get_kvcache()
+        )
+        self.tp_group = tp_cache_group
+        self.page_size = page_size
+
+        self.load_cache_event = threading.Event()
         self.cache_controller = HiCacheController(
-            token_to_kv_pool, self.token_to_kv_pool_host
+            token_to_kv_pool_allocator,
+            self.token_to_kv_pool_host,
+            load_cache_event=self.load_cache_event,
        )
 
         # record the nodes with ongoing write through
@@ -35,7 +51,9 @@ class HiRadixCache(RadixCache):
         # todo: dynamically adjust the threshold
         self.write_through_threshold = 1
         self.load_back_threshold = 10
-        super().__init__(req_to_token_pool, token_to_kv_pool, disable=False)
+        super().__init__(
+            req_to_token_pool, token_to_kv_pool_allocator, self.page_size, disable=False
+        )
 
     def reset(self):
         TreeNode.counter = 0
@@ -53,14 +71,12 @@ class HiRadixCache(RadixCache):
     def write_backup(self, node: TreeNode):
         host_indices = self.cache_controller.write(
             device_indices=node.value,
-            priority=-self.get_height(node),
             node_id=node.id,
         )
         if host_indices is None:
             self.evict_host(len(node.value))
             host_indices = self.cache_controller.write(
                 device_indices=node.value,
-                priority=-self.get_height(node),
                 node_id=node.id,
             )
         if host_indices is not None:
@@ -81,14 +97,20 @@ class HiRadixCache(RadixCache):
         node.hit_count = 0
 
     def writing_check(self):
-        while not self.cache_controller.ack_write_queue.empty():
-            try:
-                ack_id = self.cache_controller.ack_write_queue.get_nowait()
-                self.dec_lock_ref(self.ongoing_write_through[ack_id])
-                # clear the reference
-                del self.ongoing_write_through[ack_id]
-            except Exception:
-                break
+        queue_size = torch.tensor(
+            self.cache_controller.ack_write_queue.qsize(), dtype=torch.int
+        )
+        if torch.distributed.get_world_size(group=self.tp_group) > 1:
+            # synchrnoize TP workers to make the same update to radix cache
+            torch.distributed.all_reduce(
+                queue_size,
+                op=torch.distributed.ReduceOp.MIN,
+                group=self.tp_group,
+            )
+        for _ in range(queue_size.item()):
+            ack_id = self.cache_controller.ack_write_queue.get()
+            self.dec_lock_ref(self.ongoing_write_through[ack_id])
+            del self.ongoing_write_through[ack_id]
 
     def loading_check(self):
         while not self.cache_controller.ack_load_queue.empty():
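The new writing_check no longer drains whatever happens to be in the ack queue; it first agrees with the other tensor-parallel workers on a common number of completed write-backs (ReduceOp.MIN), so every rank applies identical updates to its radix tree. Below is a standalone sketch of just that synchronization pattern (gloo backend, two CPU processes); it does not touch sglang's HiCacheController, and the worker setup is purely illustrative.

import torch
import torch.distributed as dist
import torch.multiprocessing as mp

def synced_drain_count(local_queue_size: int, group=None) -> int:
    queue_size = torch.tensor(local_queue_size, dtype=torch.int)
    if dist.get_world_size(group=group) > 1:
        # MIN: only drain entries that every rank has already acknowledged
        dist.all_reduce(queue_size, op=dist.ReduceOp.MIN, group=group)
    return int(queue_size.item())

def _worker(rank: int, world_size: int):
    dist.init_process_group(
        "gloo", init_method="tcp://127.0.0.1:29511", rank=rank, world_size=world_size
    )
    local = 3 if rank == 0 else 5   # ranks finished different numbers of write-backs
    print(f"rank {rank}: draining {synced_drain_count(local)} entries")  # both print 3
    dist.destroy_process_group()

if __name__ == "__main__":
    mp.spawn(_worker, args=(2,), nprocs=2)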
@@ -106,11 +128,9 @@ class HiRadixCache(RadixCache):
                 break
 
     def evictable_size(self):
-        self.writing_check()
-        self.loading_check()
         return self.evictable_size_
 
-    def evict(self, num_tokens: int, evict_callback=None):
+    def evict(self, num_tokens: int):
         leaves = self._collect_leaves_device()
         heapq.heapify(leaves)
 
@@ -160,7 +180,7 @@ class HiRadixCache(RadixCache):
 
     def _evict_write_through_selective(self, node: TreeNode):
         # evict a node not initiated write to host
-        self.cache_controller.mem_pool_device.free(node.value)
+        self.cache_controller.mem_pool_device_allocator.free(node.value)
         num_evicted = len(node.value)
         self._delete_leaf(node)
         return num_evicted
@@ -240,10 +260,6 @@ class HiRadixCache(RadixCache):
 
         return device_indices
 
-    def loading_complete(self, node: TreeNode):
-        self.loading_check()
-        return node.loading == False
-
     def init_load_back(
         self,
         last_node: TreeNode,
@@ -270,28 +286,49 @@ class HiRadixCache(RadixCache):
 
         return last_node, prefix_indices
 
-    def _match_prefix_helper(
-        self, node: TreeNode, key: List, value, last_node: TreeNode
-    ):
-        node.last_access_time = time.time()
-        if len(key) == 0:
-            return
+    def read_to_load_cache(self):
+        self.load_cache_event.set()
 
-        if key[0] in node.children.keys():
+    def match_prefix(self, key: List[int], include_evicted=False, **kwargs):
+        if self.disable:
+            return [], self.root_node
+
+        value, last_node = self._match_prefix_helper(self.root_node, key)
+        if value:
+            value = torch.concat(value)
+        else:
+            value = torch.tensor([], dtype=torch.int32)
+
+        last_node_global = last_node
+        while last_node.evicted:
+            last_node = last_node.parent
+
+        if include_evicted:
+            return value, last_node, last_node_global
+        else:
+            return value, last_node
+
+    def _match_prefix_helper(self, node: TreeNode, key: List):
+        node.last_access_time = time.time()
+        value = []
+        while len(key) > 0 and key[0] in node.children.keys():
             child = node.children[key[0]]
+            child.last_access_time = time.time()
             prefix_len = _key_match(child.key, key)
             if prefix_len < len(child.key):
                 new_node = self._split_node(child.key, child, prefix_len)
                 self.inc_hit_count(new_node)
                 if not new_node.evicted:
                     value.append(new_node.value)
-                last_node[0] = new_node
+                node = new_node
+                break
             else:
                 self.inc_hit_count(child)
                 if not child.evicted:
                     value.append(child.value)
-                last_node[0] = child
-                self._match_prefix_helper(child, key[prefix_len:], value, last_node)
+                node = child
+                key = key[prefix_len:]
+        return value, node
 
     def _split_node(self, key, child: TreeNode, split_len: int):
         # child node split into new_node -> child
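match_prefix now walks the tree iteratively and, with include_evicted=True, also reports the deepest matching node even when its KV entries currently live only in host memory. A hedged usage sketch follows; the surrounding control flow is hypothetical, and only the method names and return shapes come from the hunk above.

def probe_prefix(cache, token_ids):
    # Default form: device-resident KV indices plus the deepest non-evicted node.
    prefix, last_device_node = cache.match_prefix(key=token_ids)

    # Extended form: additionally returns the deepest node overall, which may be
    # evicted; comparing the two tells the caller whether a host-to-device
    # load-back (init_load_back + read_to_load_cache) could extend the prefix.
    prefix, last_device_node, last_global_node = cache.match_prefix(
        key=token_ids, include_evicted=True
    )
    needs_load_back = last_global_node is not last_device_node
    return prefix.numel(), needs_load_back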
sglang/srt/mem_cache/memory_pool.py
@@ -129,6 +129,7 @@ class TokenToKVPoolAllocator:
         self.size = size
         self.dtype = dtype
         self.device = device
+        self.page_size = 1
 
         self.free_slots = None
         self.is_not_in_free_group = True
@@ -149,15 +150,14 @@
 
         select_index = self.free_slots[:need_size]
         self.free_slots = self.free_slots[need_size:]
-
-        return select_index.to(self.device, non_blocking=True)
+        return select_index
 
     def free(self, free_index: torch.Tensor):
         if free_index.numel() == 0:
             return
 
         if self.is_not_in_free_group:
-            self.free_slots = torch.concat((self.free_slots, free_index.cpu()))
+            self.free_slots = torch.concat((self.free_slots, free_index))
         else:
             self.free_group.append(free_index)
 
@@ -172,7 +172,9 @@
 
     def clear(self):
         # The padded slot 0 is used for writing dummy outputs from padded tokens.
-        self.free_slots = torch.arange(1, self.size + 1, dtype=torch.int32)
+        self.free_slots = torch.arange(
+            1, self.size + 1, dtype=torch.int64, device=self.device
+        )
         self.is_in_free_group = False
         self.free_group = []
 
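Across these three TokenToKVPoolAllocator hunks the free-slot list becomes an int64 tensor that stays on the target device, so alloc and free no longer bounce through the CPU. The following is a stripped-down, self-contained re-implementation of that slot-allocator pattern for illustration only; it is not sglang's class.

import torch

class SlotAllocator:
    """Minimal sketch of the device-resident free-slot allocator pattern."""

    def __init__(self, size: int, device: str = "cpu"):
        self.size = size
        self.device = device
        self.clear()

    def clear(self):
        # Slot 0 is reserved as a dummy slot, so usable slots are 1..size.
        self.free_slots = torch.arange(
            1, self.size + 1, dtype=torch.int64, device=self.device
        )

    def alloc(self, need_size: int):
        if need_size > self.free_slots.numel():
            return None
        select_index = self.free_slots[:need_size]
        self.free_slots = self.free_slots[need_size:]
        return select_index  # stays on self.device, no host round trip

    def free(self, free_index: torch.Tensor):
        if free_index.numel() == 0:
            return
        self.free_slots = torch.concat((self.free_slots, free_index))

alloc = SlotAllocator(size=8)
a = alloc.alloc(3)          # tensor([1, 2, 3])
alloc.free(a)
print(alloc.alloc(5))       # tensor([4, 5, 6, 7, 8])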
@@ -182,6 +184,7 @@ class MHATokenToKVPool(KVCache):
     def __init__(
         self,
         size: int,
+        page_size: int,
         dtype: torch.dtype,
         head_num: int,
         head_dim: int,
@@ -190,6 +193,7 @@ class MHATokenToKVPool(KVCache):
         enable_memory_saver: bool,
     ):
         self.size = size
+        self.page_size = page_size
         self.dtype = dtype
         self.device = device
         if dtype in (torch.float8_e5m2, torch.float8_e4m3fn):
@@ -206,6 +210,10 @@
         self.layer_num = layer_num
         self._create_buffers()
 
+        self.layer_transfer_counter = None
+        self.capture_mode = False
+        self.alt_stream = torch.cuda.Stream()
+
         k_size, v_size = self.get_kv_size_bytes()
         logger.info(
             f"KV Cache is allocated. #tokens: {size}, K size: {k_size / GB:.2f} GB, V size: {v_size / GB:.2f} GB"
@@ -216,16 +224,16 @@
             # [size, head_num, head_dim] for each layer
             # The padded slot 0 is used for writing dummy outputs from padded tokens.
             self.k_buffer = [
-                torch.empty(
-                    (self.size + 1, self.head_num, self.head_dim),
+                torch.zeros(
+                    (self.size + self.page_size, self.head_num, self.head_dim),
                     dtype=self.store_dtype,
                     device=self.device,
                 )
                 for _ in range(self.layer_num)
             ]
             self.v_buffer = [
-                torch.empty(
-                    (self.size + 1, self.head_num, self.head_dim),
+                torch.zeros(
+                    (self.size + self.page_size, self.head_num, self.head_dim),
                     dtype=self.store_dtype,
                     device=self.device,
                 )
@@ -267,12 +275,28 @@
             self.k_buffer[i][indices] = k_data[i]
             self.v_buffer[i][indices] = v_data[i]
 
+    def register_layer_transfer_counter(self, layer_transfer_counter):
+        self.layer_transfer_counter = layer_transfer_counter
+
+    def transfer_per_layer(self, indices, flat_data, layer_id):
+        # transfer prepared data from host to device
+        flat_data = flat_data.to(device=self.device, non_blocking=False)
+        k_data, v_data = flat_data[0], flat_data[1]
+        self.k_buffer[layer_id][indices] = k_data
+        self.v_buffer[layer_id][indices] = v_data
+
     def get_key_buffer(self, layer_id: int):
+        if self.layer_transfer_counter is not None:
+            self.layer_transfer_counter.wait_until(layer_id)
+
         if self.store_dtype != self.dtype:
             return self.k_buffer[layer_id].view(self.dtype)
         return self.k_buffer[layer_id]
 
     def get_value_buffer(self, layer_id: int):
+        if self.layer_transfer_counter is not None:
+            self.layer_transfer_counter.wait_until(layer_id)
+
         if self.store_dtype != self.dtype:
             return self.v_buffer[layer_id].view(self.dtype)
         return self.v_buffer[layer_id]
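The layer_transfer_counter hook lets attention layers block until the host-to-device copy for their layer has landed: the loading path calls transfer_per_layer per layer, while get_key_buffer/get_value_buffer wait on wait_until(layer_id) before reading. The object sglang actually registers is not shown in this diff; the toy class below is a hypothetical stand-in that only illustrates the wait_until contract.

import threading

class ToyLayerTransferCounter:
    """Hypothetical stand-in for the object passed to
    register_layer_transfer_counter(); only wait_until() matters here."""

    def __init__(self, num_layers: int):
        self.num_layers = num_layers
        self._done_upto = -1                 # highest layer fully transferred
        self._cond = threading.Condition()

    def mark_layer_done(self, layer_id: int):
        # called by the loading thread after the copy for layer_id finished
        with self._cond:
            self._done_upto = max(self._done_upto, layer_id)
            self._cond.notify_all()

    def wait_until(self, layer_id: int):
        # called from get_key_buffer()/get_value_buffer() before reading layer_id
        with self._cond:
            self._cond.wait_for(lambda: self._done_upto >= layer_id)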
@@ -297,14 +321,44 @@ class MHATokenToKVPool(KVCache):
                 cache_v.div_(v_scale)
             cache_k = cache_k.to(self.dtype)
             cache_v = cache_v.to(self.dtype)
+
         if self.store_dtype != self.dtype:
-            self.k_buffer[layer_id][loc] = cache_k.view(self.store_dtype)
-            self.v_buffer[layer_id][loc] = cache_v.view(self.store_dtype)
+            cache_k = cache_k.view(self.store_dtype)
+            cache_v = cache_v.view(self.store_dtype)
+
+        if self.capture_mode and cache_k.shape[0] < 4:
+            self.alt_stream.wait_stream(torch.cuda.current_stream())
+            with torch.cuda.stream(self.alt_stream):
+                self.k_buffer[layer_id][loc] = cache_k
+                self.v_buffer[layer_id][loc] = cache_v
+            torch.cuda.current_stream().wait_stream(self.alt_stream)
         else:
             self.k_buffer[layer_id][loc] = cache_k
             self.v_buffer[layer_id][loc] = cache_v
 
 
+@torch.compile
+def fused_downcast(
+    cache_k: torch.Tensor,
+    cache_v: torch.Tensor,
+    k_scale: torch.Tensor,
+    v_scale: torch.Tensor,
+    dtype: torch.dtype,
+    store_dtype: torch.dtype,
+    max_fp8: float,
+    min_fp8: float,
+):
+    cache_k = cache_k / k_scale
+    cache_k = torch.clamp(cache_k, min_fp8, max_fp8)
+    cache_v = cache_v / v_scale
+    cache_v = torch.clamp(cache_v, min_fp8, max_fp8)
+    cache_k = cache_k.to(dtype)
+    cache_v = cache_v.to(dtype)
+    cache_k = cache_k.view(store_dtype)
+    cache_v = cache_v.view(store_dtype)
+    return cache_k, cache_v
+
+
 # This compiled version is slower in the unit test
 # python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_non_stream_small_batch_size
 @torch.compile(dynamic=True, backend=get_compiler_backend())
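In capture mode, very small KV writes (fewer than 4 tokens) are issued on a pre-created side stream and fenced with wait_stream in both directions, so the scatter can overlap with other work on the main stream. Below is a generic, self-contained sketch of that fencing pattern under the stated assumptions; it is not tied to sglang's buffers and needs a CUDA device to run.

import torch

def copy_on_side_stream(dst, src, loc, alt_stream: torch.cuda.Stream):
    # Fence 1: the side stream must see everything already queued on the
    # current stream (e.g. the kernel that produced `src`).
    alt_stream.wait_stream(torch.cuda.current_stream())
    with torch.cuda.stream(alt_stream):
        dst[loc] = src          # small scatter runs on the side stream
    # Fence 2: the current stream may not read or reuse `dst`/`src`
    # until the side-stream copy has finished.
    torch.cuda.current_stream().wait_stream(alt_stream)

if torch.cuda.is_available():
    buf = torch.zeros(16, 8, device="cuda")
    new_rows = torch.randn(2, 8, device="cuda")
    side = torch.cuda.Stream()
    copy_on_side_stream(buf, new_rows, torch.tensor([3, 7], device="cuda"), side)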
@@ -317,6 +371,7 @@ class MLATokenToKVPool(KVCache):
     def __init__(
         self,
         size: int,
+        page_size: int,
         dtype: torch.dtype,
         kv_lora_rank: int,
         qk_rope_head_dim: int,
@@ -341,8 +396,8 @@
         with memory_saver_adapter.region():
             # The padded slot 0 is used for writing dummy outputs from padded tokens.
             self.kv_buffer = [
-                torch.empty(
-                    (size + 1, 1, kv_lora_rank + qk_rope_head_dim),
+                torch.zeros(
+                    (size + page_size, 1, kv_lora_rank + qk_rope_head_dim),
                     dtype=self.store_dtype,
                     device=device,
                 )
@@ -382,6 +437,7 @@ class DoubleSparseTokenToKVPool(KVCache):
     def __init__(
         self,
         size: int,
+        page_size: int,
         dtype: torch.dtype,
         head_num: int,
         head_dim: int,
@@ -391,6 +447,7 @@ class DoubleSparseTokenToKVPool(KVCache):
         enable_memory_saver: bool,
     ):
         self.size = size
+        self.page_size = page_size
         self.dtype = dtype
         self.device = device
         if dtype in (torch.float8_e5m2, torch.float8_e4m3fn):
@@ -405,17 +462,21 @@
         with memory_saver_adapter.region():
             # [size, head_num, head_dim] for each layer
             self.k_buffer = [
-                torch.empty((size + 1, head_num, head_dim), dtype=dtype, device=device)
+                torch.zeros(
+                    (size + page_size, head_num, head_dim), dtype=dtype, device=device
+                )
                 for _ in range(layer_num)
             ]
             self.v_buffer = [
-                torch.empty((size + 1, head_num, head_dim), dtype=dtype, device=device)
+                torch.zeros(
+                    (size + page_size, head_num, head_dim), dtype=dtype, device=device
+                )
                 for _ in range(layer_num)
             ]
 
             # [size, head_num, heavy_channel_num] for each layer
             self.label_buffer = [
-                torch.empty(
+                torch.zeros(
                     (size + 1, head_num, heavy_channel_num), dtype=dtype, device=device
                 )
                 for _ in range(layer_num)
@@ -470,7 +531,7 @@ class MHATokenToKVPoolHost:
     def __init__(
         self,
        device_pool: MHATokenToKVPool,
-        host_to_device_ratio: float = 2.0,
+        host_to_device_ratio: float = 3.0,
        pin_memory: bool = False,  # no need to use pin memory with the double buffering
        device: str = "cpu",
    ):
@@ -510,7 +571,7 @@ class MHATokenToKVPoolHost:
             f"Allocating {requested_bytes / 1e9:.2f} GB host memory for hierarchical KV cache."
         )
 
-        self.kv_buffer = torch.empty(
+        self.kv_buffer = torch.zeros(
             (2, self.layer_num, self.size, self.head_num, self.head_dim),
             dtype=self.dtype,
             device=self.device,
@@ -530,6 +591,9 @@
     def get_flat_data(self, indices):
         return self.kv_buffer[:, :, indices]
 
+    def get_flat_data_by_layer(self, indices, layer_id):
+        return self.kv_buffer[:, layer_id, indices]
+
     def assign_flat_data(self, indices, flat_data):
         self.kv_buffer[:, :, indices] = flat_data
 
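The host-side pool keeps K and V for all layers in one tensor of shape (2, layer_num, size, head_num, head_dim), with index 0/1 on the first axis selecting K or V. get_flat_data slices all layers at once, while the new get_flat_data_by_layer yields the per-layer slab that a layer-by-layer transfer consumes. A small shape-only illustration, using made-up dimensions:

import torch

layer_num, size, head_num, head_dim = 4, 32, 2, 8
kv_buffer = torch.zeros(2, layer_num, size, head_num, head_dim)  # [K/V, layer, token, head, dim]

indices = torch.tensor([3, 7, 9])          # host slots holding one node's tokens

whole = kv_buffer[:, :, indices]           # get_flat_data: all layers at once
per_layer = kv_buffer[:, 1, indices]       # get_flat_data_by_layer(indices, layer_id=1)

print(whole.shape)      # torch.Size([2, 4, 3, 2, 8])
print(per_layer.shape)  # torch.Size([2, 3, 2, 8])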