sglang 0.4.5.post2__py3-none-any.whl → 0.4.5.post3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62)
  1. sglang/bench_serving.py +3 -2
  2. sglang/compile_deep_gemm.py +136 -0
  3. sglang/lang/backend/openai.py +5 -1
  4. sglang/lang/backend/runtime_endpoint.py +5 -1
  5. sglang/srt/configs/model_config.py +4 -1
  6. sglang/srt/constrained/xgrammar_backend.py +1 -0
  7. sglang/srt/disaggregation/decode.py +43 -0
  8. sglang/srt/disaggregation/mini_lb.py +69 -8
  9. sglang/srt/disaggregation/mooncake/conn.py +1 -1
  10. sglang/srt/disaggregation/nixl/__init__.py +1 -0
  11. sglang/srt/disaggregation/nixl/conn.py +622 -0
  12. sglang/srt/disaggregation/prefill.py +100 -16
  13. sglang/srt/disaggregation/utils.py +17 -0
  14. sglang/srt/entrypoints/engine.py +4 -0
  15. sglang/srt/entrypoints/http_server.py +3 -7
  16. sglang/srt/function_call_parser.py +60 -0
  17. sglang/srt/layers/activation.py +2 -2
  18. sglang/srt/layers/attention/flashattention_backend.py +781 -150
  19. sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +5 -5
  20. sglang/srt/layers/attention/triton_ops/extend_attention.py +5 -5
  21. sglang/srt/layers/attention/triton_ops/prefill_attention.py +7 -3
  22. sglang/srt/layers/dp_attention.py +1 -1
  23. sglang/srt/layers/layernorm.py +19 -4
  24. sglang/srt/layers/moe/ep_moe/layer.py +2 -0
  25. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +1 -0
  26. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +2 -0
  27. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +153 -0
  28. sglang/srt/layers/quantization/deep_gemm.py +378 -0
  29. sglang/srt/layers/quantization/fp8_kernel.py +7 -38
  30. sglang/srt/layers/quantization/fp8_utils.py +2 -2
  31. sglang/srt/layers/quantization/gptq.py +13 -7
  32. sglang/srt/layers/quantization/modelopt_quant.py +2 -2
  33. sglang/srt/layers/quantization/w8a8_int8.py +3 -3
  34. sglang/srt/layers/rotary_embedding.py +6 -6
  35. sglang/srt/layers/sampler.py +2 -2
  36. sglang/srt/managers/data_parallel_controller.py +7 -1
  37. sglang/srt/managers/io_struct.py +14 -3
  38. sglang/srt/managers/schedule_batch.py +13 -0
  39. sglang/srt/managers/scheduler.py +16 -6
  40. sglang/srt/managers/tokenizer_manager.py +115 -29
  41. sglang/srt/managers/tp_worker.py +1 -0
  42. sglang/srt/mem_cache/hiradix_cache.py +40 -32
  43. sglang/srt/mem_cache/memory_pool.py +31 -13
  44. sglang/srt/model_executor/cuda_graph_runner.py +13 -8
  45. sglang/srt/model_executor/model_runner.py +19 -4
  46. sglang/srt/models/deepseek_v2.py +9 -6
  47. sglang/srt/models/minicpm3.py +2 -2
  48. sglang/srt/models/minicpmo.py +17 -6
  49. sglang/srt/openai_api/adapter.py +71 -4
  50. sglang/srt/openai_api/protocol.py +6 -1
  51. sglang/srt/server_args.py +52 -40
  52. sglang/srt/speculative/build_eagle_tree.py +2 -2
  53. sglang/srt/speculative/eagle_utils.py +2 -2
  54. sglang/srt/speculative/eagle_worker.py +2 -7
  55. sglang/srt/utils.py +46 -5
  56. sglang/test/test_utils.py +3 -1
  57. sglang/version.py +1 -1
  58. {sglang-0.4.5.post2.dist-info → sglang-0.4.5.post3.dist-info}/METADATA +3 -3
  59. {sglang-0.4.5.post2.dist-info → sglang-0.4.5.post3.dist-info}/RECORD +62 -57
  60. {sglang-0.4.5.post2.dist-info → sglang-0.4.5.post3.dist-info}/WHEEL +0 -0
  61. {sglang-0.4.5.post2.dist-info → sglang-0.4.5.post3.dist-info}/licenses/LICENSE +0 -0
  62. {sglang-0.4.5.post2.dist-info → sglang-0.4.5.post3.dist-info}/top_level.txt +0 -0
@@ -96,8 +96,8 @@ class GenerateReqInput:
     return_hidden_states: bool = False
 
     # For disaggregated inference
-    bootstrap_host: Optional[str] = None
-    bootstrap_room: Optional[int] = None
+    bootstrap_host: Optional[Union[List[str], str]] = None
+    bootstrap_room: Optional[Union[List[int], int]] = None
 
     def normalize_batch_and_arguments(self):
         """
@@ -397,6 +397,12 @@ class GenerateReqInput:
                 else None
             ),
             return_hidden_states=self.return_hidden_states,
+            bootstrap_host=(
+                self.bootstrap_host[i] if self.bootstrap_host is not None else None
+            ),
+            bootstrap_room=(
+                self.bootstrap_room[i] if self.bootstrap_room is not None else None
+            ),
         )
 
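For context, the two hunks above widen `bootstrap_host` and `bootstrap_room` so a batched `GenerateReqInput` can carry one bootstrap target per request in disaggregated mode, and the per-item accessor hands each sub-request its own value. Below is a minimal standalone sketch of that indexing pattern; the `MiniReq` class is hypothetical and assumes the batched fields are lists with one entry per item, only the field names and the conditional indexing come from the diff.

    from dataclasses import dataclass
    from typing import List, Optional, Union


    @dataclass
    class MiniReq:
        # Batched fields: one value per item when the request is a batch.
        text: List[str]
        bootstrap_host: Optional[Union[List[str], str]] = None
        bootstrap_room: Optional[Union[List[int], int]] = None

        def __getitem__(self, i: int) -> "MiniReq":
            # Hand each sub-request its own bootstrap target, mirroring the diff above.
            return MiniReq(
                text=[self.text[i]],
                bootstrap_host=(
                    self.bootstrap_host[i] if self.bootstrap_host is not None else None
                ),
                bootstrap_room=(
                    self.bootstrap_room[i] if self.bootstrap_room is not None else None
                ),
            )


    batch = MiniReq(
        text=["a", "b"],
        bootstrap_host=["prefill-0", "prefill-1"],
        bootstrap_room=[1001, 1002],
    )
    print(batch[1].bootstrap_host, batch[1].bootstrap_room)  # prefill-1 1002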
@@ -665,10 +671,15 @@ class BatchEmbeddingOut:
 
 
 @dataclass
-class FlushCacheReq:
+class FlushCacheReqInput:
     pass
 
 
+@dataclass
+class FlushCacheReqOutput:
+    success: bool
+
+
 @dataclass
 class UpdateWeightFromDiskReqInput:
     # The model path with the new weights
@@ -539,6 +539,11 @@ class Req:
         # The first output_id transferred from prefill instance.
         self.transferred_output_id: Optional[int] = None
 
+        # For overlap schedule, we delay the kv transfer until `process_batch_result_disagg_prefill` rather than `process_prefill_chunk` in non-overlap
+        # This is because kv is not ready in `process_prefill_chunk`.
+        # We use `tmp_end_idx` to store the end index of the kv cache to send.
+        self.tmp_end_idx: int = -1
+
     @property
     def seqlen(self):
         return len(self.origin_input_ids) + len(self.output_ids)
@@ -571,6 +576,14 @@ class Req:
             self.prefix_indices, self.last_node = tree_cache.match_prefix(
                 rid=self.rid, key=self.adjust_max_prefix_ids()
             )
+        elif enable_hierarchical_cache:
+            # in case last_node is evicted during scheduling, we need to update the prefix_indices
+            while self.last_node.evicted:
+                self.prefix_indices = self.prefix_indices[
+                    : -len(self.last_node.host_value)
+                ]
+                self.last_node = self.last_node.parent
+
         self.extend_input_len = len(self.fill_ids) - len(self.prefix_indices)
 
     def adjust_max_prefix_ids(self):
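The new `elif enable_hierarchical_cache` branch handles the case where the matched node was evicted to host between scheduling and execution: it walks up the radix tree and drops the corresponding tail of `prefix_indices`. A self-contained toy version of that walk follows; the `Node` class is hypothetical, only the loop shape follows the diff.

    from dataclasses import dataclass
    from typing import List, Optional


    @dataclass
    class Node:
        # host_value stands in for the host-side backup of this node's tokens.
        host_value: List[int]
        evicted: bool = False
        parent: Optional["Node"] = None


    def trim_evicted_prefix(prefix_indices: List[int], last_node: Node):
        # Drop the tail of prefix_indices for every evicted node on the path, as in the diff.
        while last_node.evicted:
            prefix_indices = prefix_indices[: -len(last_node.host_value)]
            last_node = last_node.parent
        return prefix_indices, last_node


    root = Node(host_value=[])
    mid = Node(host_value=[0, 1, 2], parent=root)
    leaf = Node(host_value=[3, 4], evicted=True, parent=mid)
    print(trim_evicted_prefix([0, 1, 2, 3, 4], leaf)[0])  # [0, 1, 2]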
@@ -60,7 +60,8 @@ from sglang.srt.managers.io_struct import (
     CloseSessionReqInput,
     ExpertDistributionReq,
     ExpertDistributionReqOutput,
-    FlushCacheReq,
+    FlushCacheReqInput,
+    FlushCacheReqOutput,
     GetInternalStateReq,
     GetInternalStateReqOutput,
     GetWeightsByNameReqInput,
@@ -402,7 +403,7 @@ class Scheduler(
             [
                 (TokenizedGenerateReqInput, self.handle_generate_request),
                 (TokenizedEmbeddingReqInput, self.handle_embedding_request),
-                (FlushCacheReq, self.flush_cache_wrapped),
+                (FlushCacheReqInput, self.flush_cache_wrapped),
                 (AbortReq, self.abort_request),
                 (OpenSessionReqInput, self.open_session),
                 (CloseSessionReqInput, self.close_session),
@@ -488,6 +489,8 @@ class Scheduler(
                 tp_cache_group=self.tp_cpu_group,
                 page_size=self.page_size,
                 hicache_ratio=server_args.hicache_ratio,
+                hicache_size=server_args.hicache_size,
+                hicache_write_policy=server_args.hicache_write_policy,
             )
         else:
             self.tree_cache = RadixCache(
@@ -1596,8 +1599,9 @@ class Scheduler(
             time.sleep(5)
             self.parent_process.send_signal(signal.SIGQUIT)
 
-    def flush_cache_wrapped(self, recv_req: FlushCacheReq):
-        self.flush_cache()
+    def flush_cache_wrapped(self, recv_req: FlushCacheReqInput):
+        success = self.flush_cache()
+        return FlushCacheReqOutput(success=success)
 
     def flush_cache(self):
         """Flush the memory pool and cache."""
@@ -2010,9 +2014,15 @@ def run_scheduler_process(
         else:
             scheduler.event_loop_normal()
     elif disaggregation_mode == DisaggregationMode.PREFILL:
-        scheduler.event_loop_normal_disagg_prefill()
+        if scheduler.enable_overlap:
+            scheduler.event_loop_overlap_disagg_prefill()
+        else:
+            scheduler.event_loop_normal_disagg_prefill()
     elif disaggregation_mode == DisaggregationMode.DECODE:
-        scheduler.event_loop_normal_disagg_decode()
+        if scheduler.enable_overlap:
+            scheduler.event_loop_overlap_disagg_decode()
+        else:
+            scheduler.event_loop_normal_disagg_decode()
 
     except Exception:
         traceback = get_exception_traceback()
@@ -66,7 +66,8 @@ from sglang.srt.managers.io_struct import (
     EmbeddingReqInput,
     ExpertDistributionReq,
     ExpertDistributionReqOutput,
-    FlushCacheReq,
+    FlushCacheReqInput,
+    FlushCacheReqOutput,
     GenerateReqInput,
     GetInternalStateReq,
     GetInternalStateReqOutput,
@@ -264,6 +265,9 @@ class TokenizerManager:
         self.resume_memory_occupation_communicator = _Communicator(
             self.send_to_scheduler, server_args.dp_size
         )
+        self.flush_cache_communicator = _Communicator(
+            self.send_to_scheduler, server_args.dp_size
+        )
         self.start_profile_communicator = _Communicator(
             self.send_to_scheduler, server_args.dp_size
         )
@@ -314,6 +318,10 @@ class TokenizerManager:
                 ResumeMemoryOccupationReqOutput,
                 self.resume_memory_occupation_communicator.handle_recv,
             ),
+            (
+                FlushCacheReqOutput,
+                self.flush_cache_communicator.handle_recv,
+            ),
             (
                 ProfileReqOutput,
                 self.start_profile_communicator.handle_recv,
@@ -415,38 +423,60 @@ class TokenizerManager:
             )
             if image_inputs and "input_ids" in image_inputs:
                 input_ids = image_inputs["input_ids"]
-        if self.is_generation:
-            return_logprob = obj.return_logprob
-            logprob_start_len = obj.logprob_start_len
-            top_logprobs_num = obj.top_logprobs_num
-            token_ids_logprob = obj.token_ids_logprob
-            session_params = (
-                SessionParams(**obj.session_params) if obj.session_params else None
-            )
+
+        self._validate_token_len(obj, input_ids)
+        return self._create_tokenized_object(
+            obj, input_text, input_ids, input_embeds, image_inputs
+        )
+
+    def _validate_token_len(
+        self, obj: Union[GenerateReqInput, EmbeddingReqInput], input_ids: List[int]
+    ) -> None:
+        """Validates that the input token count and the requested token count doesn't exceed the model's context length."""
 
         input_token_num = len(input_ids) if input_ids is not None else 0
+        # Check if input alone exceeds context length
         if input_token_num >= self.context_len:
             raise ValueError(
                 f"The input ({input_token_num} tokens) is longer than the "
                 f"model's context length ({self.context_len} tokens)."
             )
 
+        # Check total tokens (input + max_new_tokens)
+        max_new_tokens = obj.sampling_params.get("max_new_tokens")
         if (
-            obj.sampling_params.get("max_new_tokens") is not None
-            and obj.sampling_params.get("max_new_tokens") + input_token_num
-            >= self.context_len
+            max_new_tokens is not None
+            and (max_new_tokens + input_token_num) >= self.context_len
         ):
-            raise ValueError(
+            total_tokens = max_new_tokens + input_token_num
+            error_msg = (
                 f"Requested token count exceeds the model's maximum context length "
-                f"of {self.context_len} tokens. You requested a total of "
-                f"{obj.sampling_params.get('max_new_tokens') + input_token_num} "
+                f"of {self.context_len} tokens. You requested a total of {total_tokens} "
                 f"tokens: {input_token_num} tokens from the input messages and "
-                f"{obj.sampling_params.get('max_new_tokens')} tokens for the "
-                f"completion. Please reduce the number of tokens in the input "
-                f"messages or the completion to fit within the limit."
+                f"{max_new_tokens} tokens for the completion. Please reduce the number "
+                f"of tokens in the input messages or the completion to fit within the limit."
+            )
+            raise ValueError(error_msg)
+
+    def _create_tokenized_object(
+        self,
+        obj: Union[GenerateReqInput, EmbeddingReqInput],
+        input_text: str,
+        input_ids: List[int],
+        input_embeds: Optional[Union[List[float], None]] = None,
+        image_inputs: Optional[Dict] = None,
+    ) -> Union[TokenizedGenerateReqInput, TokenizedEmbeddingReqInput]:
+        """Create a tokenized request object from common parameters."""
+
+        if self.is_generation:
+            return_logprob = obj.return_logprob
+            logprob_start_len = obj.logprob_start_len
+            top_logprobs_num = obj.top_logprobs_num
+            token_ids_logprob = obj.token_ids_logprob
+            session_params = (
+                SessionParams(**obj.session_params) if obj.session_params else None
             )
 
-        # Parse sampling parameters
         sampling_params = SamplingParams(**obj.sampling_params)
         sampling_params.normalize(self.tokenizer)
         sampling_params.verify()
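The refactor above splits `_tokenize_one_request` into a validation step and an object-construction step. The validation itself is plain arithmetic over the prompt length, `max_new_tokens`, and the context window; a standalone sketch of the same two checks follows (the function name and error wording here are illustrative, not the exact sglang messages).

    from typing import List, Optional


    def validate_token_len(
        input_ids: List[int], max_new_tokens: Optional[int], context_len: int
    ) -> None:
        input_token_num = len(input_ids) if input_ids is not None else 0
        # 1) The prompt alone must fit in the context window.
        if input_token_num >= context_len:
            raise ValueError(
                f"The input ({input_token_num} tokens) is longer than the "
                f"model's context length ({context_len} tokens)."
            )
        # 2) Prompt plus requested completion must also fit.
        if max_new_tokens is not None and input_token_num + max_new_tokens >= context_len:
            raise ValueError(
                f"Requested {input_token_num + max_new_tokens} tokens in total, "
                f"which exceeds the context length of {context_len} tokens."
            )


    validate_token_len(list(range(100)), max_new_tokens=20, context_len=4096)  # passes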
@@ -483,6 +513,50 @@ class TokenizerManager:
 
         return tokenized_obj
 
+    async def _batch_tokenize_and_process(
+        self, batch_size: int, obj: Union[GenerateReqInput, EmbeddingReqInput]
+    ) -> List[Union[TokenizedGenerateReqInput, TokenizedEmbeddingReqInput]]:
+        """Handle batch tokenization for text inputs only."""
+        logger.debug(f"Starting batch tokenization for {batch_size} text requests")
+
+        # Collect requests and texts
+        requests = [obj[i] for i in range(batch_size)]
+        texts = [req.text for req in requests]
+
+        # Batch tokenize all texts
+        encoded = self.tokenizer(texts)
+        input_ids_list = encoded["input_ids"]
+
+        # Process all requests
+        tokenized_objs = []
+        for i, req in enumerate(requests):
+            self._validate_token_len(obj[i], input_ids_list[i])
+            tokenized_objs.append(
+                self._create_tokenized_object(
+                    req, req.text, input_ids_list[i], None, None
+                )
+            )
+        logger.debug(f"Completed batch processing for {batch_size} requests")
+        return tokenized_objs
+
+    def _validate_batch_tokenization_constraints(
+        self, batch_size: int, obj: Union[GenerateReqInput, EmbeddingReqInput]
+    ) -> None:
+        """Validate constraints for batch tokenization processing."""
+        for i in range(batch_size):
+            if self.is_generation and obj[i].image_data:
+                raise ValueError(
+                    "For image input processing do not set `enable_tokenizer_batch_encode`."
+                )
+            if obj[i].input_ids is not None:
+                raise ValueError(
+                    "Batch tokenization is not needed for pre-tokenized input_ids. Do not set `enable_tokenizer_batch_encode`."
+                )
+            if obj[i].input_embeds is not None:
+                raise ValueError(
+                    "Batch tokenization is not needed for input_embeds. Do not set `enable_tokenizer_batch_encode`."
+                )
+
     def _send_one_request(
         self,
         obj: Union[GenerateReqInput, EmbeddingReqInput],
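`_batch_tokenize_and_process` leans on the Hugging Face tokenizer's ability to encode a list of strings in one call, which returns one `input_ids` list per text and amortizes per-call overhead. A minimal sketch of that pattern outside sglang is shown below; the model name and sample texts are arbitrary choices for illustration, and running it requires the `transformers` package and a model download.

    from transformers import AutoTokenizer

    # Any HF tokenizer works here; gpt2 is just a small, convenient choice.
    tokenizer = AutoTokenizer.from_pretrained("gpt2")

    texts = ["hello world", "batch tokenization amortizes per-call overhead"]
    encoded = tokenizer(texts)          # one call for the whole batch
    input_ids_list = encoded["input_ids"]

    for text, ids in zip(texts, input_ids_list):
        print(len(ids), "tokens for:", text)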
@@ -560,14 +634,27 @@ class TokenizerManager:
 
         generators = []
         rids = []
+
         if getattr(obj, "parallel_sample_num", 1) == 1:
-            # Send all requests
-            for i in range(batch_size):
-                tmp_obj = obj[i]
-                tokenized_obj = await self._tokenize_one_request(tmp_obj)
-                self._send_one_request(tmp_obj, tokenized_obj, created_time)
-                generators.append(self._wait_one_response(tmp_obj, request))
-                rids.append(tmp_obj.rid)
+            if self.server_args.enable_tokenizer_batch_encode:
+                # Validate batch tokenization constraints
+                self._validate_batch_tokenization_constraints(batch_size, obj)
+
+                tokenized_objs = await self._batch_tokenize_and_process(batch_size, obj)
+
+                for i, tokenized_obj in enumerate(tokenized_objs):
+                    tmp_obj = obj[i]
+                    self._send_one_request(tmp_obj, tokenized_obj, created_time)
+                    generators.append(self._wait_one_response(tmp_obj, request))
+                    rids.append(tmp_obj.rid)
+            else:
+                # Sequential tokenization and processing
+                for i in range(batch_size):
+                    tmp_obj = obj[i]
+                    tokenized_obj = await self._tokenize_one_request(tmp_obj)
+                    self._send_one_request(tmp_obj, tokenized_obj, created_time)
+                    generators.append(self._wait_one_response(tmp_obj, request))
+                    rids.append(tmp_obj.rid)
         else:
             # FIXME: When using batch and parallel_sample_num together, the perf is not optimal.
             if batch_size > 128:
@@ -628,9 +715,8 @@ class TokenizerManager:
         except StopAsyncIteration:
             pass
 
-    def flush_cache(self):
-        req = FlushCacheReq()
-        self.send_to_scheduler.send_pyobj(req)
+    async def flush_cache(self) -> FlushCacheReqOutput:
+        return (await self.flush_cache_communicator(FlushCacheReqInput()))[0]
 
     def abort_request(self, rid: str):
         if rid not in self.rid_to_state:
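With `flush_cache` now routed through a `_Communicator`, callers get a `FlushCacheReqOutput` back instead of a fire-and-forget message. The sketch below shows how a caller at the HTTP layer might await it and surface the result; the endpoint wiring and the `FakeTokenizerManager` stand-in are hypothetical, only the request/response names come from the diff.

    import asyncio
    from dataclasses import dataclass


    @dataclass
    class FlushCacheReqOutput:
        success: bool


    class FakeTokenizerManager:
        # Stand-in for TokenizerManager: pretend the scheduler acked the flush.
        async def flush_cache(self) -> FlushCacheReqOutput:
            await asyncio.sleep(0)          # simulate the round trip to the scheduler
            return FlushCacheReqOutput(success=True)


    async def flush_cache_endpoint(manager: FakeTokenizerManager) -> dict:
        result = await manager.flush_cache()
        # The caller can now report whether the flush actually happened.
        return {"success": result.success}


    print(asyncio.run(flush_cache_endpoint(FakeTokenizerManager())))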
@@ -116,6 +116,7 @@ class TpModelWorker:
             ),
             self.model_runner.req_to_token_pool.size,
         )
+        assert self.max_running_requests > 0, "max_running_request is zero"
         self.max_req_len = min(
             self.model_config.context_len - 1,
             self.max_total_num_tokens - 1,
@@ -29,15 +29,17 @@ class HiRadixCache(RadixCache):
         tp_cache_group: torch.distributed.ProcessGroup,
         page_size: int,
         hicache_ratio: float,
+        hicache_size: int,
+        hicache_write_policy: str,
     ):
         self.kv_cache = token_to_kv_pool_allocator.get_kvcache()
         if isinstance(self.kv_cache, MHATokenToKVPool):
             self.token_to_kv_pool_host = MHATokenToKVPoolHost(
-                self.kv_cache, hicache_ratio, page_size
+                self.kv_cache, hicache_ratio, hicache_size, page_size
             )
         elif isinstance(self.kv_cache, MLATokenToKVPool):
             self.token_to_kv_pool_host = MLATokenToKVPoolHost(
-                self.kv_cache, hicache_ratio, page_size
+                self.kv_cache, hicache_ratio, hicache_size, page_size
             )
         else:
             raise ValueError(f"HiRadixCache only supports MHA and MLA yet")
@@ -50,6 +52,7 @@ class HiRadixCache(RadixCache):
             self.token_to_kv_pool_host,
             page_size,
             load_cache_event=self.load_cache_event,
+            write_policy=hicache_write_policy,
         )
 
         # record the nodes with ongoing write through
@@ -57,7 +60,9 @@ class HiRadixCache(RadixCache):
         # record the node segments with ongoing load back
         self.ongoing_load_back = {}
         # todo: dynamically adjust the threshold
-        self.write_through_threshold = 1
+        self.write_through_threshold = (
+            1 if hicache_write_policy == "write_through" else 3
+        )
         self.load_back_threshold = 10
         super().__init__(
             req_to_token_pool, token_to_kv_pool_allocator, page_size, disable=False
@@ -76,7 +81,7 @@ class HiRadixCache(RadixCache):
             height += 1
         return height
 
-    def write_backup(self, node: TreeNode):
+    def write_backup(self, node: TreeNode, write_back=False):
         host_indices = self.cache_controller.write(
             device_indices=node.value,
             node_id=node.id,
@@ -90,21 +95,29 @@ class HiRadixCache(RadixCache):
         if host_indices is not None:
             node.host_value = host_indices
             self.ongoing_write_through[node.id] = node
-            self.inc_lock_ref(node)
+            if not write_back:
+                # no need to lock nodes if write back
+                self.inc_lock_ref(node)
         else:
             return 0
 
         return len(host_indices)
 
     def inc_hit_count(self, node: TreeNode):
-        if self.cache_controller.write_policy != "write_through_selective":
+        if node.backuped or self.cache_controller.write_policy == "write_back":
             return
         node.hit_count += 1
-        if node.host_value is None and node.hit_count > self.write_through_threshold:
+        if node.hit_count >= self.write_through_threshold:
             self.write_backup(node)
             node.hit_count = 0
 
-    def writing_check(self):
+    def writing_check(self, write_back=False):
+        if write_back:
+            # blocking till all write back complete
+            while len(self.ongoing_write_through) > 0:
+                ack_id = self.cache_controller.ack_write_queue.get()
+                del self.ongoing_write_through[ack_id]
+            return
         queue_size = torch.tensor(
             self.cache_controller.ack_write_queue.qsize(), dtype=torch.int
         )
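`inc_hit_count` now drives write-through for every policy except `write_back`: a node is copied to host only after it has been hit `write_through_threshold` times (1 under `write_through`, 3 otherwise), and nodes that already have a host copy are skipped. A toy model of that promotion logic follows; the classes are hypothetical stand-ins, only the thresholds and the guard conditions mirror the diff.

    class ToyNode:
        def __init__(self):
            self.hit_count = 0
            self.backuped = False      # True once a host copy exists


    class ToyCache:
        def __init__(self, write_policy: str):
            self.write_policy = write_policy
            # write_through backs up on the first hit; other policies wait for 3 hits.
            self.write_through_threshold = 1 if write_policy == "write_through" else 3

        def inc_hit_count(self, node: ToyNode) -> None:
            if node.backuped or self.write_policy == "write_back":
                return
            node.hit_count += 1
            if node.hit_count >= self.write_through_threshold:
                node.backuped = True   # stand-in for write_backup(node)
                node.hit_count = 0


    cache = ToyCache("write_through_selective")
    node = ToyNode()
    for _ in range(3):
        cache.inc_hit_count(node)
    print(node.backuped)  # True after the third hit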
@@ -143,29 +156,25 @@ class HiRadixCache(RadixCache):
         heapq.heapify(leaves)
 
         num_evicted = 0
-        pending_nodes = []
+        write_back_nodes = []
         while num_evicted < num_tokens and len(leaves):
             x = heapq.heappop(leaves)
 
             if x.lock_ref > 0:
                 continue
 
-            if x.host_value is None:
+            if not x.backuped:
                 if self.cache_controller.write_policy == "write_back":
-                    num_evicted += self.write_backup(x)
-                    pending_nodes.append(x)
-                elif self.cache_controller.write_policy == "write_through_selective":
-                    num_evicted += self._evict_write_through_selective(x)
+                    # write to host if the node is not backuped
+                    num_evicted += self.write_backup(x, write_back=True)
+                    write_back_nodes.append(x)
                 else:
-                    assert (
-                        self.cache_controller.write_policy != "write_through"
-                    ), "write_through should be inclusive"
-                    raise NotImplementedError
+                    num_evicted += self._evict_regular(x)
             else:
-                num_evicted += self._evict_write_through(x)
+                num_evicted += self._evict_backuped(x)
 
             for child in x.parent.children.values():
-                if child in pending_nodes:
+                if child in write_back_nodes:
                     continue
                 if not child.evicted:
                     break
@@ -174,15 +183,12 @@ class HiRadixCache(RadixCache):
                 heapq.heappush(leaves, x.parent)
 
         if self.cache_controller.write_policy == "write_back":
-            # blocking till all write back complete
-            while len(self.ongoing_write_through) > 0:
-                self.writing_check()
-                time.sleep(0.1)
-            for node in pending_nodes:
-                assert node.host_value is not None
-                self._evict_write_through(node)
+            self.writing_check(write_back=True)
+            for node in write_back_nodes:
+                assert node.backuped
+                self._evict_backuped(node)
 
-    def _evict_write_through(self, node: TreeNode):
+    def _evict_backuped(self, node: TreeNode):
         # evict a node already written to host
         num_evicted = self.cache_controller.evict_device(node.value, node.host_value)
         assert num_evicted > 0
@@ -190,7 +196,7 @@
         node.value = None
         return num_evicted
 
-    def _evict_write_through_selective(self, node: TreeNode):
+    def _evict_regular(self, node: TreeNode):
         # evict a node not initiated write to host
         self.cache_controller.mem_pool_device_allocator.free(node.value)
         num_evicted = len(node.value)
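Taken together, the eviction hunks collapse the old three-way policy switch into two leaf cases: a node that already has a host backup goes through `_evict_backuped`, and one that does not is either written back first (under `write_back`) or freed directly via `_evict_regular`. The compact sketch below mirrors only that branch order; the `LeafStub` class and its fields are hypothetical stand-ins for the real tree nodes and cache controller.

    class LeafStub:
        """Hypothetical stand-in for a radix-tree leaf."""

        def __init__(self, backuped: bool, num_tokens: int):
            self.backuped = backuped              # True once a host copy exists
            self.num_tokens = num_tokens
            self.device_value = list(range(num_tokens))


    def evict_leaf(leaf: LeafStub, write_policy: str, write_back_nodes: list) -> int:
        """Mirror the branch order of the evict() hunks above; returns tokens reclaimed."""
        if not leaf.backuped:
            if write_policy == "write_back":
                # Not on host yet: start a write-back and defer freeing until it is acked.
                leaf.backuped = True              # stand-in for write_backup(leaf, write_back=True)
                write_back_nodes.append(leaf)
            else:
                # _evict_regular: no host copy needed, free the device memory directly.
                leaf.device_value = None
        else:
            # _evict_backuped: a host copy already exists, drop only the device copy.
            leaf.device_value = None
        return leaf.num_tokens


    pending = []
    freed = evict_leaf(LeafStub(backuped=False, num_tokens=16), "write_back", pending)
    print(freed, len(pending))  # 16 1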
@@ -339,11 +345,13 @@ class HiRadixCache(RadixCache):
             prefix_len = self.key_match_fn(child.key, key)
             if prefix_len < len(child.key):
                 new_node = self._split_node(child.key, child, prefix_len)
+                self.inc_hit_count(new_node)
                 if not new_node.evicted:
                     value.append(new_node.value)
                 node = new_node
                 break
             else:
+                self.inc_hit_count(child)
                 if not child.evicted:
                     value.append(child.value)
                 node = child
@@ -369,7 +377,7 @@ class HiRadixCache(RadixCache):
         else:
             new_node.value = child.value[:split_len]
             child.value = child.value[split_len:]
-        if child.host_value is not None:
+        if child.backuped:
             new_node.host_value = child.host_value[:split_len]
             child.host_value = child.host_value[split_len:]
         child.parent = new_node
@@ -426,8 +434,8 @@ class HiRadixCache(RadixCache):
         node.children[child_key] = new_node
         self.evictable_size_ += len(value)
 
-        if self.cache_controller.write_policy == "write_through":
-            self.write_backup(new_node)
+        if self.cache_controller.write_policy != "write_back":
+            self.inc_hit_count(new_node)
         return total_prefix_length
 
     def _collect_leaves_device(self):
@@ -446,13 +446,28 @@ class MLATokenToKVPool(KVCache):
         ]
 
         self.layer_transfer_counter = None
+        self.page_size = page_size
+
+        kv_size = self.get_kv_size_bytes()
+        logger.info(
+            f"KV Cache is allocated. #tokens: {size}, KV size: {kv_size / GB:.2f} GB"
+        )
+
+    def get_kv_size_bytes(self):
+        assert hasattr(self, "kv_buffer")
+        kv_size_bytes = 0
+        for kv_cache in self.kv_buffer:
+            kv_size_bytes += np.prod(kv_cache.shape) * kv_cache.dtype.itemsize
+        return kv_size_bytes
 
     # for disagg
     def get_contiguous_buf_infos(self):
         # MLA has only one kv_buffer, so only the information of this buffer needs to be returned.
         kv_data_ptrs = [self.kv_buffer[i].data_ptr() for i in range(self.layer_num)]
         kv_data_lens = [self.kv_buffer[i].nbytes for i in range(self.layer_num)]
-        kv_item_lens = [self.kv_buffer[i][0].nbytes for i in range(self.layer_num)]
+        kv_item_lens = [
+            self.kv_buffer[i][0].nbytes * self.page_size for i in range(self.layer_num)
+        ]
        return kv_data_ptrs, kv_data_lens, kv_item_lens
 
     def get_key_buffer(self, layer_id: int):
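`get_kv_size_bytes` is plain shape-times-itemsize accounting over the per-layer buffers. The sketch below reproduces the same arithmetic with ordinary torch tensors; the layer count, dtype, dimensions, and the `GB = 1024**3` constant are assumptions made for illustration, and `numel() * element_size()` is used as an equivalent of the diff's `np.prod(shape) * dtype.itemsize`.

    import torch

    GB = 1024**3

    # Pretend MLA KV buffers: one tensor per layer, shape (num_tokens, 1, head_dim).
    layer_num, num_tokens, head_dim = 4, 1024, 576
    kv_buffer = [
        torch.zeros(num_tokens, 1, head_dim, dtype=torch.bfloat16)
        for _ in range(layer_num)
    ]

    kv_size_bytes = 0
    for kv_cache in kv_buffer:
        # Element count times bytes per element, summed over all layers.
        kv_size_bytes += kv_cache.numel() * kv_cache.element_size()

    print(f"KV size: {kv_size_bytes / GB:.4f} GB")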
@@ -621,26 +636,27 @@ class HostKVCache(abc.ABC):
         self,
         device_pool: MHATokenToKVPool,
         host_to_device_ratio: float,
+        host_size: int,
         pin_memory: bool,
         device: str,
         page_size: int,
     ):
-        assert (
-            host_to_device_ratio >= 1
-        ), "The host memory should be larger than the device memory with the current protocol"
-        # todo, other ways of configuring the size
-
         self.device_pool = device_pool
-        self.host_to_device_ratio = host_to_device_ratio
+        self.dtype = device_pool.store_dtype
         self.pin_memory = pin_memory
         self.device = device
         self.page_size = page_size
-
-        self.size = int(device_pool.size * host_to_device_ratio)
+        self.size_per_token = self.get_size_per_token()
+        if host_size > 0:
+            self.size = int(host_size * 1e9 // self.size_per_token)
+        else:
+            self.size = int(device_pool.size * host_to_device_ratio)
         # Align the host memory pool size to the page size
         self.size = self.size - (self.size % self.page_size)
-        self.dtype = device_pool.store_dtype
-        self.size_per_token = self.get_size_per_token()
+
+        assert (
+            self.size > device_pool.size
+        ), "The host memory should be larger than the device memory with the current protocol"
 
         # Verify there is enough available host memory.
         host_mem = psutil.virtual_memory()
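Host KV pool sizing now prefers an explicit `hicache_size` (interpreted as GB, converted with `1e9`) and falls back to the old device-size ratio; the result is then rounded down to a page boundary and must still exceed the device pool. The arithmetic in isolation, as a small sketch with names following the diff and arbitrary example values:

    def host_pool_size(
        host_size_gb: int,
        host_to_device_ratio: float,
        device_pool_size: int,
        size_per_token: int,
        page_size: int,
    ) -> int:
        if host_size_gb > 0:
            # Explicit size in GB wins: convert bytes to a token budget.
            size = int(host_size_gb * 1e9 // size_per_token)
        else:
            # Fall back to scaling the device pool by the configured ratio.
            size = int(device_pool_size * host_to_device_ratio)
        # Align the host memory pool size to the page size.
        size -= size % page_size
        assert size > device_pool_size, "host pool must exceed the device pool"
        return size


    # e.g. 10 GB of host memory, ~70 KB per token, page size 64
    print(host_pool_size(10, 2.0, 65536, 70 * 1024, 64))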
@@ -792,12 +808,13 @@ class MHATokenToKVPoolHost(HostKVCache):
         self,
         device_pool: MHATokenToKVPool,
         host_to_device_ratio: float,
+        host_size: int,
         page_size: int,
         pin_memory: bool = True,
         device: str = "cpu",
     ):
         super().__init__(
-            device_pool, host_to_device_ratio, pin_memory, device, page_size
+            device_pool, host_to_device_ratio, host_size, pin_memory, device, page_size
         )
 
     def get_size_per_token(self):
@@ -866,12 +883,13 @@ class MLATokenToKVPoolHost(HostKVCache):
         self,
         device_pool: MLATokenToKVPool,
         host_to_device_ratio: float,
+        host_size: int,
         page_size: int,
         pin_memory: bool = True,
         device: str = "cpu",
     ):
         super().__init__(
-            device_pool, host_to_device_ratio, pin_memory, device, page_size
+            device_pool, host_to_device_ratio, host_size, pin_memory, device, page_size
         )
 
     def get_size_per_token(self):
@@ -35,7 +35,11 @@ from sglang.srt.model_executor.forward_batch_info import (
     ForwardMode,
 )
 from sglang.srt.patch_torch import monkey_patch_torch_compile
-from sglang.srt.utils import get_available_gpu_memory, is_hip
+from sglang.srt.utils import (
+    get_available_gpu_memory,
+    get_device_memory_capacity,
+    is_hip,
+)
 
 if TYPE_CHECKING:
     from sglang.srt.model_executor.model_runner import ModelRunner
@@ -129,7 +133,8 @@ def get_batch_sizes_to_capture(model_runner: ModelRunner):
             list(range(1, 9)) + list(range(10, 33, 2)) + list(range(40, 161, 16))
         )
 
-        if _is_hip:
+        gpu_mem = get_device_memory_capacity()
+        if gpu_mem is not None and gpu_mem > 81920:
             capture_bs += list(range(160, 257, 8))
 
     if max(capture_bs) > model_runner.req_to_token_pool.size:
@@ -140,12 +145,11 @@ def get_batch_sizes_to_capture(model_runner: ModelRunner):
         ]
 
     capture_bs = list(sorted(set(capture_bs)))
-    capture_bs = [
-        bs
-        for bs in capture_bs
-        if bs <= model_runner.req_to_token_pool.size
-        and bs <= server_args.cuda_graph_max_bs
-    ]
+
+    assert len(capture_bs) > 0 and capture_bs[0] > 0
+    capture_bs = [bs for bs in capture_bs if bs <= model_runner.req_to_token_pool.size]
+    if server_args.cuda_graph_max_bs:
+        capture_bs = [bs for bs in capture_bs if bs <= server_args.cuda_graph_max_bs]
     compile_bs = (
         [bs for bs in capture_bs if bs <= server_args.torch_compile_max_bs]
         if server_args.enable_torch_compile
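The CUDA graph batch-size selection now keys off total device memory rather than the HIP flag, and the `cuda_graph_max_bs` cap is applied only when it is set. The sketch below shows the resulting list construction in isolation; the 81920 threshold comes from the diff (presumably MB, i.e. roughly 80 GB), while the helper function itself and its parameters are hypothetical.

    from typing import List, Optional


    def batch_sizes_to_capture(
        gpu_mem_mb: Optional[int],
        req_to_token_pool_size: int,
        cuda_graph_max_bs: Optional[int],
    ) -> List[int]:
        capture_bs = list(range(1, 9)) + list(range(10, 33, 2)) + list(range(40, 161, 16))
        # GPUs with more than ~80 GB also capture the larger decode batch sizes.
        if gpu_mem_mb is not None and gpu_mem_mb > 81920:
            capture_bs += list(range(160, 257, 8))
        capture_bs = sorted(set(capture_bs))
        assert len(capture_bs) > 0 and capture_bs[0] > 0
        capture_bs = [bs for bs in capture_bs if bs <= req_to_token_pool_size]
        if cuda_graph_max_bs:
            capture_bs = [bs for bs in capture_bs if bs <= cuda_graph_max_bs]
        return capture_bs


    print(batch_sizes_to_capture(gpu_mem_mb=98304, req_to_token_pool_size=4096, cuda_graph_max_bs=160)[-5:])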
@@ -186,6 +190,7 @@ class CudaGraphRunner:
 
         # Batch sizes to capture
         self.capture_bs, self.compile_bs = get_batch_sizes_to_capture(model_runner)
+
         self.capture_forward_mode = ForwardMode.DECODE
         self.capture_hidden_mode = CaptureHiddenMode.NULL
         self.num_tokens_per_bs = 1