sglang 0.4.9.post2__py3-none-any.whl → 0.4.9.post3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (168)
  1. sglang/bench_one_batch.py +2 -1
  2. sglang/eval/loogle_eval.py +7 -0
  3. sglang/srt/configs/deepseekvl2.py +11 -2
  4. sglang/srt/configs/internvl.py +3 -0
  5. sglang/srt/configs/janus_pro.py +3 -0
  6. sglang/srt/configs/model_config.py +9 -7
  7. sglang/srt/configs/update_config.py +3 -1
  8. sglang/srt/conversation.py +1 -0
  9. sglang/srt/custom_op.py +5 -2
  10. sglang/srt/disaggregation/decode.py +9 -1
  11. sglang/srt/disaggregation/mooncake/conn.py +44 -56
  12. sglang/srt/distributed/parallel_state.py +33 -0
  13. sglang/srt/entrypoints/engine.py +30 -26
  14. sglang/srt/entrypoints/openai/serving_chat.py +21 -2
  15. sglang/srt/eplb/expert_location_dispatch.py +1 -1
  16. sglang/srt/function_call/function_call_parser.py +2 -0
  17. sglang/srt/function_call/qwen3_detector.py +150 -0
  18. sglang/srt/hf_transformers_utils.py +0 -1
  19. sglang/srt/layers/activation.py +13 -0
  20. sglang/srt/layers/attention/flashattention_backend.py +3 -3
  21. sglang/srt/layers/attention/flashinfer_backend.py +40 -1
  22. sglang/srt/layers/linear.py +13 -102
  23. sglang/srt/layers/moe/ep_moe/kernels.py +4 -2
  24. sglang/srt/layers/moe/ep_moe/layer.py +23 -402
  25. sglang/srt/layers/moe/fused_moe_native.py +7 -47
  26. sglang/srt/layers/moe/fused_moe_triton/__init__.py +4 -4
  27. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=384,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  28. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=384,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  29. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=384,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  30. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=385,N=128,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  31. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=385,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  32. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +35 -45
  33. sglang/srt/layers/moe/fused_moe_triton/layer.py +14 -396
  34. sglang/srt/layers/moe/topk.py +187 -12
  35. sglang/srt/layers/quantization/__init__.py +20 -134
  36. sglang/srt/layers/quantization/awq.py +578 -11
  37. sglang/srt/layers/quantization/awq_triton.py +339 -0
  38. sglang/srt/layers/quantization/base_config.py +85 -10
  39. sglang/srt/layers/quantization/blockwise_int8.py +17 -55
  40. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +13 -11
  41. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +24 -73
  42. sglang/srt/layers/quantization/fp8.py +273 -62
  43. sglang/srt/layers/quantization/fp8_kernel.py +210 -46
  44. sglang/srt/layers/quantization/fp8_utils.py +2 -2
  45. sglang/srt/layers/quantization/gptq.py +501 -143
  46. sglang/srt/layers/quantization/marlin_utils.py +790 -0
  47. sglang/srt/layers/quantization/modelopt_quant.py +26 -108
  48. sglang/srt/layers/quantization/moe_wna16.py +45 -49
  49. sglang/srt/layers/quantization/petit.py +252 -0
  50. sglang/srt/layers/quantization/petit_utils.py +104 -0
  51. sglang/srt/layers/quantization/qoq.py +7 -6
  52. sglang/srt/layers/quantization/scalar_type.py +352 -0
  53. sglang/srt/layers/quantization/unquant.py +422 -0
  54. sglang/srt/layers/quantization/utils.py +343 -3
  55. sglang/srt/layers/quantization/w4afp8.py +8 -4
  56. sglang/srt/layers/quantization/w8a8_fp8.py +17 -51
  57. sglang/srt/layers/quantization/w8a8_int8.py +51 -115
  58. sglang/srt/layers/vocab_parallel_embedding.py +1 -41
  59. sglang/srt/lora/lora.py +0 -4
  60. sglang/srt/lora/lora_manager.py +87 -53
  61. sglang/srt/lora/mem_pool.py +81 -33
  62. sglang/srt/lora/utils.py +12 -5
  63. sglang/srt/managers/cache_controller.py +241 -0
  64. sglang/srt/managers/io_struct.py +41 -29
  65. sglang/srt/managers/mm_utils.py +7 -8
  66. sglang/srt/managers/schedule_batch.py +150 -110
  67. sglang/srt/managers/schedule_policy.py +68 -27
  68. sglang/srt/managers/scheduler.py +243 -61
  69. sglang/srt/managers/scheduler_output_processor_mixin.py +22 -4
  70. sglang/srt/managers/tokenizer_manager.py +11 -3
  71. sglang/srt/managers/tp_worker.py +14 -0
  72. sglang/srt/managers/tp_worker_overlap_thread.py +11 -0
  73. sglang/srt/mem_cache/allocator.py +7 -16
  74. sglang/srt/mem_cache/base_prefix_cache.py +14 -2
  75. sglang/srt/mem_cache/chunk_cache.py +5 -2
  76. sglang/srt/mem_cache/hicache_storage.py +152 -0
  77. sglang/srt/mem_cache/hiradix_cache.py +179 -4
  78. sglang/srt/mem_cache/memory_pool.py +16 -1
  79. sglang/srt/mem_cache/memory_pool_host.py +41 -2
  80. sglang/srt/mem_cache/radix_cache.py +26 -0
  81. sglang/srt/mem_cache/swa_radix_cache.py +1025 -0
  82. sglang/srt/metrics/collector.py +9 -0
  83. sglang/srt/model_executor/cuda_graph_runner.py +5 -6
  84. sglang/srt/model_executor/forward_batch_info.py +14 -1
  85. sglang/srt/model_executor/model_runner.py +109 -22
  86. sglang/srt/model_loader/loader.py +7 -1
  87. sglang/srt/model_loader/utils.py +4 -4
  88. sglang/srt/models/clip.py +1 -1
  89. sglang/srt/models/deepseek.py +9 -6
  90. sglang/srt/models/deepseek_janus_pro.py +1 -1
  91. sglang/srt/models/deepseek_v2.py +191 -171
  92. sglang/srt/models/deepseek_vl2.py +5 -5
  93. sglang/srt/models/gemma.py +48 -0
  94. sglang/srt/models/gemma2.py +52 -0
  95. sglang/srt/models/gemma3_causal.py +63 -0
  96. sglang/srt/models/gemma3_mm.py +1 -1
  97. sglang/srt/models/gemma3n_mm.py +2 -4
  98. sglang/srt/models/granitemoe.py +385 -0
  99. sglang/srt/models/grok.py +9 -3
  100. sglang/srt/models/hunyuan.py +63 -16
  101. sglang/srt/models/internvl.py +1 -1
  102. sglang/srt/models/kimi_vl.py +1 -1
  103. sglang/srt/models/llama.py +41 -0
  104. sglang/srt/models/llama4.py +11 -11
  105. sglang/srt/models/llava.py +2 -2
  106. sglang/srt/models/llavavid.py +1 -1
  107. sglang/srt/models/minicpm.py +0 -2
  108. sglang/srt/models/minicpmo.py +3 -7
  109. sglang/srt/models/minicpmv.py +1 -1
  110. sglang/srt/models/mistral.py +1 -1
  111. sglang/srt/models/mixtral.py +9 -2
  112. sglang/srt/models/mllama.py +3 -5
  113. sglang/srt/models/mllama4.py +3 -3
  114. sglang/srt/models/olmoe.py +8 -5
  115. sglang/srt/models/persimmon.py +330 -0
  116. sglang/srt/models/phi.py +321 -0
  117. sglang/srt/models/phi4mm.py +44 -4
  118. sglang/srt/models/phi4mm_audio.py +1260 -0
  119. sglang/srt/models/phi4mm_utils.py +1917 -0
  120. sglang/srt/models/phimoe.py +9 -3
  121. sglang/srt/models/qwen.py +37 -0
  122. sglang/srt/models/qwen2.py +41 -0
  123. sglang/srt/models/qwen2_5_vl.py +4 -4
  124. sglang/srt/models/qwen2_audio.py +1 -1
  125. sglang/srt/models/qwen2_moe.py +53 -5
  126. sglang/srt/models/qwen2_vl.py +4 -4
  127. sglang/srt/models/qwen3.py +65 -1
  128. sglang/srt/models/qwen3_moe.py +56 -18
  129. sglang/srt/models/vila.py +1 -1
  130. sglang/srt/multimodal/processors/base_processor.py +91 -97
  131. sglang/srt/multimodal/processors/clip.py +21 -19
  132. sglang/srt/multimodal/processors/deepseek_vl_v2.py +8 -26
  133. sglang/srt/multimodal/processors/gemma3.py +13 -17
  134. sglang/srt/multimodal/processors/gemma3n.py +19 -23
  135. sglang/srt/multimodal/processors/internvl.py +9 -10
  136. sglang/srt/multimodal/processors/janus_pro.py +12 -27
  137. sglang/srt/multimodal/processors/kimi_vl.py +12 -14
  138. sglang/srt/multimodal/processors/llava.py +4 -2
  139. sglang/srt/multimodal/processors/minicpm.py +35 -44
  140. sglang/srt/multimodal/processors/mlama.py +21 -18
  141. sglang/srt/multimodal/processors/mllama4.py +4 -5
  142. sglang/srt/multimodal/processors/phi4mm.py +63 -39
  143. sglang/srt/multimodal/processors/pixtral.py +14 -35
  144. sglang/srt/multimodal/processors/qwen_audio.py +65 -0
  145. sglang/srt/multimodal/processors/qwen_vl.py +16 -21
  146. sglang/srt/multimodal/processors/vila.py +14 -14
  147. sglang/srt/sampling/sampling_params.py +8 -1
  148. sglang/srt/server_args.py +393 -230
  149. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +9 -1
  150. sglang/srt/two_batch_overlap.py +1 -0
  151. sglang/srt/utils.py +27 -1
  152. sglang/test/runners.py +14 -3
  153. sglang/test/test_block_fp8.py +8 -3
  154. sglang/test/test_block_fp8_ep.py +1 -1
  155. sglang/test/test_custom_ops.py +12 -7
  156. sglang/test/test_cutlass_w4a8_moe.py +1 -3
  157. sglang/test/test_fp4_moe.py +1 -3
  158. sglang/test/test_marlin_moe.py +286 -0
  159. sglang/test/test_marlin_utils.py +171 -0
  160. sglang/test/test_utils.py +35 -0
  161. sglang/version.py +1 -1
  162. {sglang-0.4.9.post2.dist-info → sglang-0.4.9.post3.dist-info}/METADATA +8 -8
  163. {sglang-0.4.9.post2.dist-info → sglang-0.4.9.post3.dist-info}/RECORD +166 -146
  164. sglang/srt/layers/quantization/quant_utils.py +0 -166
  165. sglang/srt/managers/multimodal_processors/qwen_audio.py +0 -94
  166. {sglang-0.4.9.post2.dist-info → sglang-0.4.9.post3.dist-info}/WHEEL +0 -0
  167. {sglang-0.4.9.post2.dist-info → sglang-0.4.9.post3.dist-info}/licenses/LICENSE +0 -0
  168. {sglang-0.4.9.post2.dist-info → sglang-0.4.9.post3.dist-info}/top_level.txt +0 -0
sglang/srt/managers/tokenizer_manager.py

@@ -574,7 +574,7 @@ class TokenizerManager:
                 "The server is not configured to enable custom logit processor. "
                 "Please set `--enable-custom-logits-processor` to enable this feature."
             )
-        if self.server_args.lora_paths and obj.lora_path:
+        if self.server_args.enable_lora and obj.lora_path:
             self._validate_lora_adapters(obj)

     def _validate_input_ids_in_vocab(
@@ -604,7 +604,7 @@ class TokenizerManager:
         sampling_kwargs = obj.sampling_params
         sampling_params = SamplingParams(**sampling_kwargs)
         sampling_params.normalize(self.tokenizer)
-        sampling_params.verify()
+        sampling_params.verify(self.model_config.vocab_size)

         # Build return object
         if isinstance(obj, GenerateReqInput):
@@ -1037,6 +1037,10 @@ class TokenizerManager:
         _: Optional[fastapi.Request] = None,
     ) -> LoadLoRAAdapterReqOutput:
         self.auto_create_handle_loop()
+        if not self.server_args.enable_lora:
+            raise ValueError(
+                "LoRA is not enabled. Please set `--enable-lora` to enable LoRA."
+            )

         # TODO (lifuhuang): Remove this after we verify that dynamic lora loading works
         # with dp_size > 1.
@@ -1060,6 +1064,10 @@ class TokenizerManager:
         _: Optional[fastapi.Request] = None,
     ) -> UnloadLoRAAdapterReqOutput:
         self.auto_create_handle_loop()
+        if not self.server_args.enable_lora:
+            raise ValueError(
+                "LoRA is not enabled. Please set `--enable-lora` to enable LoRA."
+            )

         # TODO (lifuhuang): Remove this after we verify that dynamic lora loading works
         # with dp_size > 1.
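
Both adapter endpoints now fail fast when the server was launched without `--enable-lora`, instead of failing later in the pipeline. A minimal client-side sketch of what the guard means in practice; the endpoint path and JSON keys below are assumptions for illustration, not verified against this release:

```python
# Hypothetical client call against a locally running sglang server.
# The /load_lora_adapter path and payload keys are assumptions.
import requests

resp = requests.post(
    "http://localhost:30000/load_lora_adapter",
    json={"lora_name": "my-adapter", "lora_path": "/models/my-adapter"},
)
# If the server was started without --enable-lora, the request is rejected with:
# "LoRA is not enabled. Please set `--enable-lora` to enable LoRA."
print(resp.status_code, resp.text)
```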
@@ -1359,7 +1367,7 @@ class TokenizerManager:
         while True:
             recv_obj = await self.recv_from_detokenizer.recv_pyobj()
             self._result_dispatcher(recv_obj)
-            self.last_receive_tstamp = time.time()
+            self.last_receive_tstamp = time.perf_counter()

     def _handle_batch_output(
         self,
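
The timestamp switch is more than cosmetic: `time.time()` follows the wall clock and can jump backwards under NTP adjustments, while `time.perf_counter()` is monotonic with higher resolution, so intervals derived from `last_receive_tstamp` can never go negative. A quick illustration:

```python
import time

# perf_counter() is monotonic: successive readings never decrease,
# which makes it safe for measuring intervals between events.
t0 = time.perf_counter()
time.sleep(0.05)
elapsed = time.perf_counter() - t0
assert elapsed >= 0.05  # sleep() guarantees at least the requested duration
print(f"elapsed: {elapsed * 1000:.1f} ms")
```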
sglang/srt/managers/tp_worker.py

@@ -174,6 +174,20 @@ class TpModelWorker:
             self.model_runner.token_to_kv_pool.size,
         )

+    @property
+    def sliding_window_size(self) -> Optional[int]:
+        return self.model_runner.sliding_window_size
+
+    @property
+    def is_hybrid(self) -> bool:
+        return self.model_runner.is_hybrid is not None
+
+    def get_tokens_per_layer_info(self):
+        return (
+            self.model_runner.full_max_total_num_tokens,
+            self.model_runner.swa_max_total_num_tokens,
+        )
+
     def get_pad_input_ids_func(self):
         return getattr(self.model_runner.model, "pad_input_ids", None)
sglang/srt/managers/tp_worker_overlap_thread.py

@@ -102,6 +102,17 @@ class TpModelWorkerClient:
     def get_worker_info(self):
         return self.worker.get_worker_info()

+    def get_tokens_per_layer_info(self):
+        return self.worker.get_tokens_per_layer_info()
+
+    @property
+    def sliding_window_size(self) -> Optional[int]:
+        return self.worker.sliding_window_size
+
+    @property
+    def is_hybrid(self) -> bool:
+        return self.worker.is_hybrid
+
     def get_pad_input_ids_func(self):
         return self.worker.get_pad_input_ids_func()
sglang/srt/mem_cache/allocator.py

@@ -57,11 +57,6 @@ class BaseTokenToKVPoolAllocator(abc.ABC):
     def debug_print(self) -> str:
         return ""

-    def log_usage(self, evictable_size: int = 0):
-        num_used = self.size - (self.available_size() + evictable_size)
-        msg = f"#token: {num_used}, token usage: {num_used / self.size:.2f}, "
-        return msg, num_used
-
     def available_size(self):
         return len(self.free_pages) * self.page_size
@@ -190,7 +185,7 @@ class SWATokenToKVPoolAllocator(BaseTokenToKVPoolAllocator):
         self._kvcache.full_to_swa_index_mapping = self.full_to_swa_index_mapping

     def available_size(self):
-        return min(self.full_available_size(), self.swa_available_size())
+        raise NotImplementedError()

     def full_available_size(self):
         return self.full_attn_allocator.available_size()
@@ -214,16 +209,6 @@ class SWATokenToKVPoolAllocator(BaseTokenToKVPoolAllocator):
         )
         return msg

-    def log_usage(self, swa_evictable_size: int = 0, full_evictable_size: int = 0):
-        used_full = self.size_full - (self.full_available_size() + full_evictable_size)
-        used_swa = self.size_swa - (self.swa_available_size() + swa_evictable_size)
-        msg = (
-            f"#token: full={used_full}, swa={used_swa}, "
-            f"token usage: full={used_full / self.size_full:.2f}, "
-            f"swa={used_swa / self.size_swa:.2f}, "
-        )
-        return msg, used_full
-
     def get_kvcache(self):
         return self._kvcache
@@ -541,6 +526,12 @@ class PagedTokenToKVPoolAllocator(BaseTokenToKVPoolAllocator):
         self.is_not_in_free_group = True
         self.free_group = []

+    def get_cpu_copy(self, indices):
+        return self._kvcache.get_cpu_copy(indices)
+
+    def load_cpu_copy(self, kv_cache_cpu, indices):
+        return self._kvcache.load_cpu_copy(kv_cache_cpu, indices)
+

 def alloc_extend_kernel_ascend(
     prefix_lens,
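
The two new pass-through methods let the paged allocator participate in host offloading by delegating to its underlying KV cache. A toy stand-in showing the round trip this interface enables; `_ToyKVCache` below is illustrative, not sglang's real pool:

```python
import torch

class _ToyKVCache:
    """Illustrative stand-in for the real KV cache pool."""

    def __init__(self, num_slots: int, dim: int = 8):
        self.buf = torch.randn(num_slots, dim)

    def get_cpu_copy(self, indices):
        # Snapshot the selected KV slots into host memory.
        return self.buf[indices].clone()

    def load_cpu_copy(self, kv_cache_cpu, indices):
        # Restore a host snapshot back into the device slots.
        self.buf[indices] = kv_cache_cpu

cache = _ToyKVCache(128)
idx = torch.arange(16)
snapshot = cache.get_cpu_copy(idx)   # offload to host
cache.buf[idx] = 0                   # slots get reused in the meantime
cache.load_cpu_copy(snapshot, idx)   # bring the pages back
assert torch.equal(cache.buf[idx], snapshot)
```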
sglang/srt/mem_cache/base_prefix_cache.py

@@ -1,5 +1,5 @@
 from abc import ABC, abstractmethod
-from typing import TYPE_CHECKING, Any, List, NamedTuple, Tuple
+from typing import TYPE_CHECKING, Any, List, NamedTuple, Optional, Tuple

 import torch
@@ -56,15 +56,27 @@ class BasePrefixCache(ABC):
         pass

     @abstractmethod
-    def dec_lock_ref(self, node: Any):
+    def dec_lock_ref(self, node: Any, swa_uuid_for_lock: Optional[str] = None):
         pass

     def evictable_size(self):
         return 0

+    def full_evictable_size(self):
+        return 0
+
+    def swa_evictable_size(self):
+        return 0
+
     def protected_size(self):
         return 0

+    def full_protected_size(self):
+        return 0
+
+    def swa_protected_size(self):
+        return 0
+
     def total_size(self):
         raise NotImplementedError()
sglang/srt/mem_cache/chunk_cache.py

@@ -61,7 +61,7 @@ class ChunkCache(BasePrefixCache):
     def inc_lock_ref(self, node: Any):
         return 0

-    def dec_lock_ref(self, node: Any):
+    def dec_lock_ref(self, node: Any, swa_uuid_for_lock: Optional[str] = None):
         return 0

     def pretty_print(self):
@@ -80,7 +80,7 @@ class SWAChunkCache(ChunkCache):
         super().__init__(req_to_token_pool, token_to_kv_pool_allocator, page_size)
         assert isinstance(token_to_kv_pool_allocator, SWATokenToKVPoolAllocator)

-    def evict(
+    def evict_swa(
         self,
         req: Req,
         prelen: int,
@@ -95,3 +95,6 @@ class SWAChunkCache(ChunkCache):
         ]
         self.token_to_kv_pool_allocator.free_swa(free_slots)
         req.evicted_seqlen_local = new_evicted_seqlen_local
+
+    def evict(self, num_tokens: int):
+        pass
sglang/srt/mem_cache/hicache_storage.py (new file)

@@ -0,0 +1,152 @@
+import hashlib
+import logging
+import os
+from abc import ABC, abstractmethod
+from typing import List, Optional
+
+import torch
+
+logger = logging.getLogger(__name__)
+
+
+def get_hash_str(token_ids: List[int], prior_hash: Optional[str] = None) -> str:
+    hasher = hashlib.sha256()
+
+    if prior_hash:
+        hasher.update(bytes.fromhex(prior_hash))
+
+    for t in token_ids:
+        hasher.update(t.to_bytes(4, byteorder="little", signed=False))
+
+    return hasher.hexdigest()
+
+
+class HiCacheStorage(ABC):
+    """
+    HiCacheStorage is a class that provides a generic key-value interface for storing and retrieving KV cache.
+    It abstracts the underlying storage mechanism, allowing different implementations to be used.
+    """
+
+    # todo, translate tensor object access for different TP ranks
+    # potentially pass model and TP configs into storage backend
+    # todo, the page size of the storage backend does not have to be the same as the host memory pool
+
+    @abstractmethod
+    def get(
+        self, key: str, target_location: Optional[torch.Tensor] = None
+    ) -> torch.Tensor | None:
+        """
+        Retrieve the value associated with the given key.
+        Returns None if the key does not exist.
+        """
+        pass
+
+    @abstractmethod
+    def batch_get(
+        self, keys: List[str], target_locations: Optional[List[torch.Tensor]] = None
+    ) -> List[torch.Tensor | None]:
+        """
+        Retrieve values for multiple keys.
+        Returns a list of tensors or None for each key.
+        """
+        pass
+
+    @abstractmethod
+    def set(self, key, value) -> bool:
+        """
+        Store the value associated with the given key.
+        Returns True if the operation was successful, False otherwise.
+        """
+        pass
+
+    @abstractmethod
+    def batch_set(self, keys: List[str], values: List[torch.Tensor]) -> bool:
+        """
+        Store multiple key-value pairs.
+        Returns True if all operations were successful, False otherwise.
+        """
+        pass
+
+    @abstractmethod
+    def exists(self, key: str) -> bool:
+        """
+        Check if the key exists in the storage.
+        Returns True if the key exists, False otherwise.
+        """
+        pass
+
+
+class HiCacheFile(HiCacheStorage):
+
+    def __init__(self, file_path: str = "/tmp/hicache"):
+        self.file_path = file_path
+        if not os.path.exists(self.file_path):
+            os.makedirs(self.file_path)
+            logger.info(f"Created HiCacheFile storage directory at {self.file_path}")
+
+    def get(
+        self, key: str, target_location: Optional[torch.Tensor] = None
+    ) -> torch.Tensor | None:
+        tensor_path = os.path.join(self.file_path, f"{key}.bin")
+        try:
+            # todo: fixing the target_location logic to enable in-place loading
+            loaded_tensor = torch.load(tensor_path)
+            if isinstance(loaded_tensor, torch.Tensor):
+                return loaded_tensor
+            else:
+                logger.error(f"Loaded data for key {key} is not a tensor.")
+                return None
+        except FileNotFoundError:
+            return None
+
+    def batch_get(
+        self,
+        keys: List[str],
+        target_locations: Optional[List[torch.Tensor]] = None,
+    ) -> List[torch.Tensor | None]:
+        return [
+            self.get(key, target_location)
+            for key, target_location in zip(
+                keys, target_locations or [None] * len(keys)
+            )
+        ]
+
+    def set(self, key: str, value: torch.Tensor) -> bool:
+        tensor_path = os.path.join(self.file_path, f"{key}.bin")
+        if self.exists(key):
+            logger.debug(f"Key {key} already exists. Skipped.")
+            return True
+        try:
+            torch.save(value, tensor_path)
+            return True
+        except Exception as e:
+            logger.error(f"Failed to save tensor {key}: {e}")
+            return False
+
+    def batch_set(self, keys: List[str], values: List[torch.Tensor]) -> bool:
+        for key, value in zip(keys, values):
+            if not self.set(key, value):
+                return False
+        return True
+
+    def exists(self, key: str) -> bool:
+        tensor_path = os.path.join(self.file_path, f"{key}.bin")
+        return os.path.exists(tensor_path)
+
+    def delete(self, key: str) -> None:
+        tensor_path = os.path.join(self.file_path, f"{key}.bin")
+        try:
+            os.remove(tensor_path)
+        except FileNotFoundError:
+            logger.warning(f"Key {key} does not exist. Cannot delete.")
+            return
+
+    def clear(self) -> None:
+        try:
+            for filename in os.listdir(self.file_path):
+                file_path = os.path.join(self.file_path, filename)
+                if os.path.isfile(file_path):
+                    os.remove(file_path)
+            logger.info("Cleared all entries in HiCacheFile storage.")
+        except Exception as e:
+            logger.error(f"Failed to clear HiCacheFile storage: {e}")
sglang/srt/mem_cache/hiradix_cache.py

@@ -35,6 +35,7 @@ class HiRadixCache(RadixCache):
         hicache_size: int,
         hicache_write_policy: str,
         hicache_io_backend: str,
+        hicache_storage_backend: Optional[str] = None,
     ):
         self.kv_cache = token_to_kv_pool_allocator.get_kvcache()
         if isinstance(self.kv_cache, MHATokenToKVPool):
@@ -49,6 +50,9 @@ class HiRadixCache(RadixCache):
             raise ValueError(f"HiRadixCache only supports MHA and MLA yet")

         self.tp_group = tp_cache_group
+        self.enable_storage = hicache_storage_backend is not None
+        # todo: customizable storage prefetch threshold
+        self.prefetch_threshold = 256

         self.load_cache_event = threading.Event()
         self.cache_controller = HiCacheController(
@@ -58,16 +62,22 @@
             load_cache_event=self.load_cache_event,
             write_policy=hicache_write_policy,
             io_backend=hicache_io_backend,
+            storage_backend=hicache_storage_backend,
+            prefetch_threshold=self.prefetch_threshold,
         )

         # record the nodes with ongoing write through
         self.ongoing_write_through = {}
         # record the node segments with ongoing load back
         self.ongoing_load_back = {}
+        # record the ongoing prefetch requests
+        self.ongoing_prefetch = {}
+        self.ongoing_backup = {}
         # todo: dynamically adjust the threshold
         self.write_through_threshold = (
             1 if hicache_write_policy == "write_through" else 3
         )
+        self.write_through_threshold_storage = 3
         self.load_back_threshold = 10
         super().__init__(
             req_to_token_pool, token_to_kv_pool_allocator, page_size, disable=False
@@ -108,13 +118,30 @@

         return len(host_indices)

+    def write_backup_storage(self, node: TreeNode):
+        operation_id = self.cache_controller.write_storage(
+            node.host_value, node.key, node.parent.get_last_hash_value()
+        )
+        self.ongoing_backup[operation_id] = node
+        node.protect_host()
+
     def inc_hit_count(self, node: TreeNode):
-        if node.backuped or self.cache_controller.write_policy == "write_back":
+        if self.cache_controller.write_policy == "write_back":
             return
         node.hit_count += 1
-        if node.hit_count >= self.write_through_threshold:
-            self.write_backup(node)
-            node.hit_count = 0
+
+        if not node.backuped:
+            if node.hit_count >= self.write_through_threshold:
+                # write to host if the node is not backuped
+                self.write_backup(node)
+        else:
+            if (
+                self.enable_storage
+                and (not node.backuped_storage)
+                and node.hit_count >= self.write_through_threshold_storage
+            ):
+                # if the node is backuped on host memory but not on storage
+                self.write_backup_storage(node)

     def writing_check(self, write_back=False):
         if write_back:
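
`inc_hit_count` now implements a two-tier write-through policy: a hot node is first backed up to host memory, and only once it is host-resident (and a storage backend is enabled) do further hits trigger a backup to storage. A toy model of the escalation, using the non-write-through default threshold of 3 at each tier from this diff; the names are stand-ins for the real tree nodes:

```python
WRITE_THROUGH_THRESHOLD = 3          # tier 1: device -> host memory
WRITE_THROUGH_THRESHOLD_STORAGE = 3  # tier 2: host memory -> storage backend

class ToyNode:
    def __init__(self):
        self.hit_count = 0
        self.backuped = False          # host-memory backup exists
        self.backuped_storage = False  # storage-backend backup exists

def inc_hit_count(node: ToyNode, enable_storage: bool = True):
    node.hit_count += 1
    if not node.backuped:
        if node.hit_count >= WRITE_THROUGH_THRESHOLD:
            node.backuped = True          # stands in for write_backup()
    elif enable_storage and not node.backuped_storage:
        if node.hit_count >= WRITE_THROUGH_THRESHOLD_STORAGE:
            node.backuped_storage = True  # stands in for write_backup_storage()

node = ToyNode()
for hit in range(1, 5):
    inc_hit_count(node)
    print(hit, node.backuped, node.backuped_storage)
# hit 3 promotes the node to host; hit 4 promotes it to storage
```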
@@ -221,6 +248,10 @@
             if not x.evicted:
                 continue

+            # node is protected from eviction as it has ongoing prefetch or backup to storage
+            if x.host_ref_counter > 0:
+                continue
+
             num_evicted += self.cache_controller.evict_host(x.host_value)

             for k, v in x.parent.children.items():
@@ -314,6 +345,85 @@
     def check_hicache_events(self):
         self.writing_check()
         self.loading_check()
+        if self.enable_storage:
+            self.check_revoked_prefetch()
+            self.check_backup_progress()
+
+    def check_revoked_prefetch(self):
+        queue_size = torch.tensor(
+            self.cache_controller.prefetch_revoke_queue.qsize(), dtype=torch.int
+        )
+        if torch.distributed.get_world_size(group=self.tp_group) > 1:
+            # synchronize TP workers to make the same update to hiradix cache
+            torch.distributed.all_reduce(
+                queue_size,
+                op=torch.distributed.ReduceOp.MIN,
+                group=self.tp_group,
+            )
+        for _ in range(queue_size.item()):
+            req_id = self.cache_controller.prefetch_revoke_queue.get()
+            if req_id in self.ongoing_prefetch:
+                last_host_node, _, host_indices, _ = self.ongoing_prefetch[req_id]
+                last_host_node.release_host()
+                self.cache_controller.mem_pool_host.free(host_indices)
+                del self.ongoing_prefetch[req_id]
+
+    def check_backup_progress(self):
+        queue_size = torch.tensor(
+            self.cache_controller.ack_backup_queue.qsize(), dtype=torch.int
+        )
+        if torch.distributed.get_world_size(group=self.tp_group) > 1:
+            # synchronize TP workers to make the same update to hiradix cache
+            torch.distributed.all_reduce(
+                queue_size,
+                op=torch.distributed.ReduceOp.MIN,
+                group=self.tp_group,
+            )
+        for _ in range(queue_size.item()):
+            ack_id, hash_value = self.cache_controller.ack_backup_queue.get()
+            self.ongoing_backup[ack_id].hash_value = hash_value
+            self.ongoing_backup[ack_id].release_host()
+            del self.ongoing_backup[ack_id]
+
+    def check_prefetch_progress(self, req_id: str):
+        if req_id not in self.ongoing_prefetch:
+            # there is no ongoing prefetch for this request or it has been revoked
+            return
+
+        # todo: more policies for prefetch progress such as timeout
+        # the current policy is to prefetch with best effort and terminate when queuing is over
+        last_host_node, token_ids, host_indices, operation = self.ongoing_prefetch[
+            req_id
+        ]
+        completed_tokens, hash_value = self.cache_controller.terminate_prefetch(
+            operation
+        )
+        logger.debug(f"Prefetch {req_id} completed with {completed_tokens} tokens")
+
+        min_completed_tokens = torch.tensor(completed_tokens, dtype=torch.int)
+        if torch.distributed.get_world_size(group=self.tp_group) > 1:
+            # synchronize TP workers to make the same update to hiradix cache
+            torch.distributed.all_reduce(
+                min_completed_tokens,
+                op=torch.distributed.ReduceOp.MIN,
+                group=self.tp_group,
+            )
+        min_completed_tokens = min_completed_tokens.item()
+        fetched_token_ids = token_ids[:min_completed_tokens]
+        written_indices = host_indices[:min_completed_tokens]
+        matched_length = self._insert_helper_host(
+            last_host_node,
+            fetched_token_ids,
+            written_indices,
+            hash_value[:min_completed_tokens],
+        )
+
+        self.cache_controller.mem_pool_host.free(host_indices[:matched_length])
+        self.cache_controller.mem_pool_host.free(
+            host_indices[min_completed_tokens:completed_tokens]
+        )
+        last_host_node.release_host()
+        del self.ongoing_prefetch[req_id]

     def match_prefix(self, key: List[int], **kwargs):
         empty_value = torch.empty((0,), dtype=torch.int64, device=self.device)
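
All three checks share the same synchronization idiom: each TP rank reduces its local progress counter with `ReduceOp.MIN`, so every rank applies the identical, most conservative update to its copy of the radix tree. A self-contained sketch of the pattern (single-process `gloo` group, purely illustrative):

```python
import os
import torch
import torch.distributed as dist

os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
os.environ.setdefault("MASTER_PORT", "29500")
dist.init_process_group("gloo", rank=0, world_size=1)

# e.g. number of tokens whose prefetch completed on this rank
local_progress = torch.tensor(42, dtype=torch.int)
if dist.get_world_size() > 1:
    # every rank ends up with the minimum value across all ranks
    dist.all_reduce(local_progress, op=dist.ReduceOp.MIN)
completed = local_progress.item()
print(completed)

dist.destroy_process_group()
```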
@@ -348,6 +458,71 @@
             host_hit_length=host_hit_length,
         )

+    def prefetch_from_storage(
+        self,
+        req_id: str,
+        last_host_node: TreeNode,
+        new_input_tokens: List[int],
+        last_hash: Optional[str] = None,
+    ):
+        if not self.enable_storage or len(new_input_tokens) < self.prefetch_threshold:
+            return
+
+        last_host_node.protect_host()
+        host_indices = self.cache_controller.mem_pool_host.alloc(len(new_input_tokens))
+        if host_indices is None:
+            self.evict_host(len(new_input_tokens))
+            host_indices = self.cache_controller.mem_pool_host.alloc(
+                len(new_input_tokens)
+            )
+        if host_indices is None:
+            last_host_node.release_host()
+            # not enough host memory to prefetch
+            return
+        operation = self.cache_controller.prefetch(
+            req_id, host_indices, new_input_tokens, last_hash
+        )
+        self.ongoing_prefetch[req_id] = (
+            last_host_node,
+            new_input_tokens,
+            host_indices,
+            operation,
+        )
+
+    def _insert_helper_host(self, node: TreeNode, key: List, host_value, hash_value):
+        node.last_access_time = time.monotonic()
+        if len(key) == 0:
+            return 0
+
+        child_key = self.get_child_key_fn(key)
+
+        matched_length = 0
+        while len(key) > 0 and child_key in node.children.keys():
+            node = node.children[child_key]
+            node.last_access_time = time.monotonic()
+            prefix_len = self.key_match_fn(node.key, key)
+            key = key[prefix_len:]
+            host_value = host_value[prefix_len:]
+            hash_value = hash_value[prefix_len:]
+            matched_length += prefix_len
+
+            if prefix_len < len(node.key):
+                new_node = self._split_node(node.key, node, prefix_len)
+                node = new_node
+
+            if len(key):
+                child_key = self.get_child_key_fn(key)
+
+        if len(key):
+            new_node = TreeNode()
+            new_node.parent = node
+            new_node.key = key
+            new_node.value = None
+            new_node.host_value = host_value
+            new_node.hash_value = hash_value
+            node.children[child_key] = new_node
+        return matched_length
+
     def _match_prefix_helper(self, node: TreeNode, key: List):
         node.last_access_time = time.monotonic()
         child_key = self.get_child_key_fn(key)
sglang/srt/mem_cache/memory_pool.py

@@ -520,8 +520,13 @@ class SWAKVPool(KVCache):
             self.layers_mapping[global_layer_id] = (swa_layer_id, True)
         self.full_to_swa_index_mapping: Optional[torch.Tensor] = None

+        k_size, v_size = self.get_kv_size_bytes()
+        self.mem_usage = (k_size + v_size) / GB
+
     def get_kv_size_bytes(self):
-        raise NotImplementedError
+        k_size, v_size = self.full_kv_pool.get_kv_size_bytes()
+        k_size_swa, v_size_swa = self.swa_kv_pool.get_kv_size_bytes()
+        return k_size + k_size_swa, v_size + v_size_swa

     def get_contiguous_buf_infos(self):
         full_kv_data_ptrs, full_kv_data_lens, full_kv_item_lens = (
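
With `get_kv_size_bytes` now summing the full-attention and sliding-window pools, `mem_usage` reflects the hybrid pool's combined footprint. A back-of-envelope check, assuming `GB = 2**30` as the module's constant (an assumption here) and hypothetical byte counts:

```python
GB = 1 << 30  # assumed definition of the module's GB constant

full_k, full_v = 6 * GB, 6 * GB  # hypothetical full-attention K/V bytes
swa_k, swa_v = 2 * GB, 2 * GB    # hypothetical sliding-window K/V bytes

k_size, v_size = full_k + swa_k, full_v + swa_v
mem_usage = (k_size + v_size) / GB
print(mem_usage)  # 16.0 GB total KV footprint
```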
@@ -597,6 +602,16 @@
             layer_id_override=layer_id_pool,
         )

+    def load_from_host_per_layer(
+        self, host_pool, host_indices, device_indices, layer_id, io_backend
+    ):
+        raise NotImplementedError("HiCache not supported for SWAKVPool.")
+
+    def backup_to_host_all_layer(
+        self, host_pool, host_indices, device_indices, io_backend
+    ):
+        raise NotImplementedError("HiCache not supported for SWAKVPool.")
+

 class AscendTokenToKVPool(MHATokenToKVPool):