sglang 0.4.3.post4__py3-none-any.whl → 0.4.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_serving.py +1 -1
- sglang/lang/chat_template.py +29 -0
- sglang/srt/_custom_ops.py +19 -17
- sglang/srt/configs/__init__.py +2 -0
- sglang/srt/configs/janus_pro.py +629 -0
- sglang/srt/configs/model_config.py +24 -14
- sglang/srt/conversation.py +80 -2
- sglang/srt/custom_op.py +64 -3
- sglang/srt/distributed/device_communicators/custom_all_reduce.py +18 -17
- sglang/srt/distributed/parallel_state.py +10 -1
- sglang/srt/entrypoints/engine.py +5 -3
- sglang/srt/entrypoints/http_server.py +1 -1
- sglang/srt/hf_transformers_utils.py +16 -1
- sglang/srt/layers/attention/flashinfer_backend.py +1 -1
- sglang/srt/layers/attention/flashinfer_mla_backend.py +317 -57
- sglang/srt/layers/attention/triton_backend.py +1 -3
- sglang/srt/layers/attention/triton_ops/decode_attention.py +6 -6
- sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +3 -3
- sglang/srt/layers/attention/triton_ops/extend_attention.py +4 -4
- sglang/srt/layers/attention/triton_ops/rocm_mla_decode_rope.py +3 -3
- sglang/srt/layers/attention/vision.py +43 -62
- sglang/srt/layers/linear.py +1 -1
- sglang/srt/layers/moe/ep_moe/kernels.py +2 -1
- sglang/srt/layers/moe/ep_moe/layer.py +25 -9
- sglang/srt/layers/moe/fused_moe_triton/configs/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +63 -23
- sglang/srt/layers/moe/fused_moe_triton/layer.py +16 -4
- sglang/srt/layers/parameter.py +10 -0
- sglang/srt/layers/quantization/__init__.py +90 -68
- sglang/srt/layers/quantization/blockwise_int8.py +1 -2
- sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/fp8.py +174 -106
- sglang/srt/layers/quantization/fp8_kernel.py +210 -38
- sglang/srt/layers/quantization/fp8_utils.py +156 -15
- sglang/srt/layers/quantization/modelopt_quant.py +5 -1
- sglang/srt/layers/quantization/w8a8_fp8.py +128 -0
- sglang/srt/layers/quantization/w8a8_int8.py +152 -3
- sglang/srt/layers/rotary_embedding.py +5 -3
- sglang/srt/layers/sampler.py +29 -35
- sglang/srt/layers/vocab_parallel_embedding.py +0 -1
- sglang/srt/lora/backend/__init__.py +9 -12
- sglang/srt/managers/cache_controller.py +72 -8
- sglang/srt/managers/image_processor.py +37 -631
- sglang/srt/managers/image_processors/base_image_processor.py +219 -0
- sglang/srt/managers/image_processors/janus_pro.py +79 -0
- sglang/srt/managers/image_processors/llava.py +152 -0
- sglang/srt/managers/image_processors/minicpmv.py +86 -0
- sglang/srt/managers/image_processors/mlama.py +60 -0
- sglang/srt/managers/image_processors/qwen_vl.py +161 -0
- sglang/srt/managers/io_struct.py +32 -15
- sglang/srt/managers/multi_modality_padding.py +134 -0
- sglang/srt/managers/schedule_batch.py +212 -117
- sglang/srt/managers/schedule_policy.py +40 -8
- sglang/srt/managers/scheduler.py +124 -665
- sglang/srt/managers/scheduler_output_processor_mixin.py +611 -0
- sglang/srt/managers/tokenizer_manager.py +6 -6
- sglang/srt/managers/tp_worker_overlap_thread.py +4 -1
- sglang/srt/mem_cache/base_prefix_cache.py +6 -8
- sglang/srt/mem_cache/chunk_cache.py +12 -44
- sglang/srt/mem_cache/hiradix_cache.py +63 -34
- sglang/srt/mem_cache/memory_pool.py +78 -17
- sglang/srt/mem_cache/paged_allocator.py +283 -0
- sglang/srt/mem_cache/radix_cache.py +117 -36
- sglang/srt/model_executor/cuda_graph_runner.py +9 -4
- sglang/srt/model_executor/forward_batch_info.py +12 -8
- sglang/srt/model_executor/model_runner.py +63 -63
- sglang/srt/model_loader/loader.py +2 -1
- sglang/srt/model_loader/weight_utils.py +1 -1
- sglang/srt/models/deepseek_janus_pro.py +2127 -0
- sglang/srt/models/deepseek_nextn.py +23 -3
- sglang/srt/models/deepseek_v2.py +25 -19
- sglang/srt/models/minicpmv.py +28 -89
- sglang/srt/models/mllama.py +1 -1
- sglang/srt/models/qwen2.py +0 -1
- sglang/srt/models/qwen2_5_vl.py +25 -50
- sglang/srt/models/qwen2_vl.py +33 -49
- sglang/srt/openai_api/adapter.py +37 -15
- sglang/srt/openai_api/protocol.py +8 -1
- sglang/srt/sampling/penaltylib/frequency_penalty.py +0 -1
- sglang/srt/sampling/penaltylib/presence_penalty.py +0 -1
- sglang/srt/server_args.py +19 -11
- sglang/srt/speculative/eagle_worker.py +75 -39
- sglang/srt/utils.py +104 -9
- sglang/test/runners.py +104 -10
- sglang/test/test_block_fp8.py +106 -16
- sglang/test/test_custom_ops.py +88 -0
- sglang/test/test_utils.py +20 -4
- sglang/utils.py +0 -4
- sglang/version.py +1 -1
- {sglang-0.4.3.post4.dist-info → sglang-0.4.4.dist-info}/METADATA +9 -10
- {sglang-0.4.3.post4.dist-info → sglang-0.4.4.dist-info}/RECORD +124 -79
- {sglang-0.4.3.post4.dist-info → sglang-0.4.4.dist-info}/WHEEL +1 -1
- {sglang-0.4.3.post4.dist-info → sglang-0.4.4.dist-info}/LICENSE +0 -0
- {sglang-0.4.3.post4.dist-info → sglang-0.4.4.dist-info}/top_level.txt +0 -0
sglang/srt/managers/schedule_policy.py

```diff
@@ -73,16 +73,29 @@ class CacheAgnosticPolicy(Enum):
 class SchedulePolicy:
     Policy = Union[CacheAwarePolicy, CacheAgnosticPolicy]
 
-    def __init__(self, policy: str, tree_cache: BasePrefixCache):
+    def __init__(
+        self,
+        policy: str,
+        tree_cache: BasePrefixCache,
+        enable_hierarchical_cache: bool,
+    ):
         self.policy = self._validate_and_adjust_policy(policy, tree_cache)
         self.tree_cache = tree_cache
+        self.enable_hierarchical_cache = enable_hierarchical_cache
 
         # It is used to find the matching prefix for in-batch prefix caching.
         self.waiting_queue_radix_tree = RadixCache(
-            req_to_token_pool=None, token_to_kv_pool=None, disable=False
+            req_to_token_pool=None,
+            token_to_kv_pool_allocator=None,
+            page_size=1,
+            disable=False,
         )
 
     def calc_priority(self, waiting_queue: List[Req]) -> bool:
+        if self.policy == CacheAgnosticPolicy.FCFS:
+            # A shortcut for FCFS
+            return
+
         policy = self._determine_active_policy(waiting_queue)
 
         prefix_computed = False
```
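The early return added to `calc_priority` means FCFS now bypasses prefix matching and queue sorting entirely. A minimal standalone sketch of that dispatch, using stub enums and a plain list rather than the real sglang classes:

```python
from enum import Enum, auto


class CacheAwarePolicy(Enum):
    LPM = auto()   # longest prefix match


class CacheAgnosticPolicy(Enum):
    FCFS = auto()  # first come, first served


def calc_priority(policy, waiting_queue):
    """Sketch only: FCFS leaves the queue in arrival order, so the
    expensive prefix-matching-and-sorting pass is skipped outright."""
    if policy == CacheAgnosticPolicy.FCFS:
        return  # a shortcut for FCFS, as in the diff
    # ... a cache-aware policy would match prefixes and sort here ...
    waiting_queue.sort()


calc_priority(CacheAgnosticPolicy.FCFS, ["req-b", "req-a"])  # no-op
calc_priority(CacheAwarePolicy.LPM, ["req-b", "req-a"])      # sorts
```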
```diff
@@ -112,7 +125,7 @@ class SchedulePolicy:
         return prefix_computed
 
     def _determine_active_policy(self, waiting_queue: List[Req]) -> Policy:
-        if len(waiting_queue) > 128 and self.policy == CacheAwarePolicy.LPM:
+        if self.policy == CacheAwarePolicy.LPM and len(waiting_queue) > 128:
             # Turn off the expensive prefix matching and sorting when the #queue is large.
             return CacheAgnosticPolicy.FCFS
         return self.policy
```
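The rewritten condition in `_determine_active_policy` keeps the same large-queue fallback: longest-prefix-match (LPM) degrades to FCFS once the waiting queue exceeds 128 entries, because matching and sorting costs grow with queue length. A hedged sketch with plain strings standing in for the policy enums:

```python
def determine_active_policy(policy: str, queue_len: int) -> str:
    # Plain strings stand in for CacheAwarePolicy / CacheAgnosticPolicy.
    if policy == "lpm" and queue_len > 128:
        # Prefix matching and sorting cost grows with the queue, so a
        # large backlog falls back to first-come-first-served.
        return "fcfs"
    return policy


assert determine_active_policy("lpm", 200) == "fcfs"
assert determine_active_policy("lpm", 64) == "lpm"
assert determine_active_policy("fcfs", 200) == "fcfs"
```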
```diff
@@ -149,9 +162,14 @@ class SchedulePolicy:
             prefix_ids = r.adjust_max_prefix_ids()
 
             # NOTE: the prefix_indices must always be aligned with last_node
-            r.prefix_indices, r.last_node = self.tree_cache.match_prefix(
-                rid=r.rid, key=prefix_ids
-            )
+            if self.enable_hierarchical_cache:
+                r.prefix_indices, r.last_node, r.last_node_global = (
+                    self.tree_cache.match_prefix(key=prefix_ids, include_evicted=True)
+                )
+            else:
+                r.prefix_indices, r.last_node = self.tree_cache.match_prefix(
+                    rid=r.rid, key=prefix_ids
+                )
 
             # NOTE(sang): This logic is for in-batch prefix caching;
             # If there are more than 1 request that have small matching prefix from
```
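Note the two `match_prefix` call shapes above: the hierarchical path returns a third value, `last_node_global`, the deepest match across cache tiers, which may point at data already evicted from device memory. A sketch of that unpacking contract, with a stub cache in place of the real radix tree:

```python
from dataclasses import dataclass
from typing import List


@dataclass
class Node:
    evicted: bool = False


class StubCache:
    """Stand-in for the radix tree; returns canned results only."""

    def match_prefix(self, key: List[int], include_evicted: bool = False):
        matched = key[:2]                  # pretend two tokens hit the cache
        device_node = Node(evicted=False)  # deepest node still on device
        if include_evicted:
            # Hierarchical path: also report the deepest match across all
            # tiers, which may have been evicted to host memory.
            return matched, device_node, Node(evicted=True)
        return matched, device_node


cache = StubCache()
prefix, last_node, last_node_global = cache.match_prefix([1, 2, 3], include_evicted=True)
prefix, last_node = cache.match_prefix([1, 2, 3])
```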
```diff
@@ -428,8 +446,10 @@ class PrefillAdder:
 
         return self.budget_state()
 
-    def add_one_req(self, req: Req, has_chunked_req: bool):
-        if req.sampling_params.ignore_eos and self.tree_cache.disable:
+    def add_one_req(
+        self, req: Req, has_chunked_req: bool, enable_hierarchical_cache: bool = False
+    ):
+        if req.sampling_params.ignore_eos and getattr(self.tree_cache, "disable", True):
             return self.add_one_req_ignore_eos(req, has_chunked_req)
 
         total_tokens = req.extend_input_len + min(
```
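A subtle detail in the new `add_one_req`: the `ignore_eos` fast path reads `getattr(self.tree_cache, "disable", True)` rather than the attribute directly, so a cache object that lacks the flag (or a missing cache) is treated as disabled:

```python
class CacheWithFlag:
    disable = False


class CacheWithoutFlag:
    pass


# getattr with a default treats anything lacking the flag as disabled:
assert getattr(CacheWithFlag(), "disable", True) is False
assert getattr(CacheWithoutFlag(), "disable", True) is True
assert getattr(None, "disable", True) is True  # even a missing cache object
```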
```diff
@@ -448,6 +468,18 @@ class PrefillAdder:
         if total_tokens > self.rem_total_tokens:
             return AddReqResult.NO_TOKEN
 
+        if (
+            enable_hierarchical_cache
+            and req.last_node_global is not None
+            and req.last_node_global.evicted
+        ):
+            req.last_node, req.prefix_indices = self.tree_cache.init_load_back(
+                req.last_node_global, req.prefix_indices
+            )
+            req.extend_input_len = len(req.fill_ids) - len(req.prefix_indices)
+            input_tokens = req.extend_input_len
+            prefix_len = len(req.prefix_indices)
+
         if self.rem_chunk_tokens is None or input_tokens <= self.rem_chunk_tokens:
             # Non-chunked prefill
             self.can_run_list.append(req)
```
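The block appended in this last hunk redoes the prefill accounting after an evicted prefix is loaded back: `init_load_back` extends `prefix_indices`, shrinking the number of tokens that still need prefill compute. A worked numeric example with made-up sizes:

```python
# Illustrative sizes; the fill_ids / prefix_indices arithmetic follows the diff.
fill_ids = list(range(100))       # all prompt tokens for the request
prefix_indices = list(range(60))  # prefix restored by init_load_back

# With the evicted prefix loaded back from the hierarchical cache, only
# the uncached tail still needs prefill compute:
extend_input_len = len(fill_ids) - len(prefix_indices)
input_tokens = extend_input_len
prefix_len = len(prefix_indices)

assert (extend_input_len, input_tokens, prefix_len) == (40, 40, 60)
```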