sglang 0.5.1.post2__py3-none-any.whl → 0.5.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (256)
  1. sglang/bench_one_batch.py +3 -0
  2. sglang/bench_one_batch_server.py +89 -54
  3. sglang/bench_serving.py +437 -40
  4. sglang/lang/interpreter.py +1 -1
  5. sglang/profiler.py +0 -1
  6. sglang/srt/configs/__init__.py +4 -0
  7. sglang/srt/configs/internvl.py +6 -0
  8. sglang/srt/configs/longcat_flash.py +104 -0
  9. sglang/srt/configs/model_config.py +37 -7
  10. sglang/srt/configs/qwen3_next.py +326 -0
  11. sglang/srt/connector/__init__.py +1 -1
  12. sglang/srt/connector/base_connector.py +1 -2
  13. sglang/srt/connector/redis.py +2 -2
  14. sglang/srt/connector/serde/__init__.py +1 -1
  15. sglang/srt/connector/serde/safe_serde.py +4 -3
  16. sglang/srt/custom_op.py +11 -1
  17. sglang/srt/debug_utils/dump_comparator.py +81 -44
  18. sglang/srt/debug_utils/dump_loader.py +97 -0
  19. sglang/srt/debug_utils/dumper.py +11 -3
  20. sglang/srt/debug_utils/text_comparator.py +73 -11
  21. sglang/srt/disaggregation/ascend/conn.py +75 -0
  22. sglang/srt/disaggregation/base/conn.py +1 -1
  23. sglang/srt/disaggregation/common/conn.py +15 -12
  24. sglang/srt/disaggregation/decode.py +6 -4
  25. sglang/srt/disaggregation/fake/conn.py +1 -1
  26. sglang/srt/disaggregation/mini_lb.py +6 -420
  27. sglang/srt/disaggregation/mooncake/conn.py +18 -10
  28. sglang/srt/disaggregation/nixl/conn.py +180 -16
  29. sglang/srt/disaggregation/prefill.py +6 -4
  30. sglang/srt/disaggregation/utils.py +5 -50
  31. sglang/srt/distributed/parallel_state.py +94 -58
  32. sglang/srt/entrypoints/engine.py +34 -14
  33. sglang/srt/entrypoints/http_server.py +172 -47
  34. sglang/srt/entrypoints/openai/protocol.py +90 -27
  35. sglang/srt/entrypoints/openai/serving_base.py +6 -2
  36. sglang/srt/entrypoints/openai/serving_chat.py +82 -26
  37. sglang/srt/entrypoints/openai/serving_completions.py +25 -4
  38. sglang/srt/entrypoints/openai/serving_embedding.py +8 -4
  39. sglang/srt/entrypoints/openai/serving_responses.py +7 -4
  40. sglang/srt/eplb/eplb_manager.py +28 -4
  41. sglang/srt/eplb/expert_distribution.py +55 -15
  42. sglang/srt/eplb/expert_location.py +8 -3
  43. sglang/srt/eplb/expert_location_updater.py +1 -1
  44. sglang/srt/function_call/deepseekv31_detector.py +222 -0
  45. sglang/srt/function_call/ebnf_composer.py +11 -9
  46. sglang/srt/function_call/function_call_parser.py +2 -0
  47. sglang/srt/function_call/glm4_moe_detector.py +1 -1
  48. sglang/srt/function_call/gpt_oss_detector.py +144 -256
  49. sglang/srt/function_call/qwen3_coder_detector.py +1 -1
  50. sglang/srt/hf_transformers_utils.py +28 -7
  51. sglang/srt/layers/activation.py +44 -9
  52. sglang/srt/layers/attention/aiter_backend.py +93 -68
  53. sglang/srt/layers/attention/ascend_backend.py +381 -136
  54. sglang/srt/layers/attention/fla/chunk.py +242 -0
  55. sglang/srt/layers/attention/fla/chunk_delta_h.py +314 -0
  56. sglang/srt/layers/attention/fla/chunk_o.py +178 -0
  57. sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +151 -0
  58. sglang/srt/layers/attention/fla/cumsum.py +300 -0
  59. sglang/srt/layers/attention/fla/fused_recurrent.py +640 -0
  60. sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +232 -0
  61. sglang/srt/layers/attention/fla/index.py +37 -0
  62. sglang/srt/layers/attention/fla/l2norm.py +150 -0
  63. sglang/srt/layers/attention/fla/layernorm_gated.py +326 -0
  64. sglang/srt/layers/attention/fla/op.py +66 -0
  65. sglang/srt/layers/attention/fla/solve_tril.py +465 -0
  66. sglang/srt/layers/attention/fla/utils.py +331 -0
  67. sglang/srt/layers/attention/fla/wy_fast.py +158 -0
  68. sglang/srt/layers/attention/flashattention_backend.py +241 -7
  69. sglang/srt/layers/attention/flashinfer_backend.py +11 -6
  70. sglang/srt/layers/attention/flashinfer_mla_backend.py +21 -14
  71. sglang/srt/layers/attention/hybrid_attn_backend.py +47 -8
  72. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +584 -0
  73. sglang/srt/layers/attention/intel_amx_backend.py +3 -0
  74. sglang/srt/layers/attention/mamba/causal_conv1d.py +128 -0
  75. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +1052 -0
  76. sglang/srt/layers/attention/mamba/mamba.py +64 -0
  77. sglang/srt/layers/attention/torch_native_backend.py +12 -6
  78. sglang/srt/layers/attention/trtllm_mla_backend.py +126 -36
  79. sglang/srt/layers/attention/wave_ops/decode_attention.py +2 -4
  80. sglang/srt/layers/attention/wave_ops/extend_attention.py +1 -3
  81. sglang/srt/layers/communicator.py +45 -8
  82. sglang/srt/layers/layernorm.py +54 -12
  83. sglang/srt/layers/logits_processor.py +10 -3
  84. sglang/srt/layers/moe/__init__.py +2 -1
  85. sglang/srt/layers/moe/cutlass_moe.py +0 -8
  86. sglang/srt/layers/moe/cutlass_w4a8_moe.py +4 -12
  87. sglang/srt/layers/moe/ep_moe/kernels.py +74 -0
  88. sglang/srt/layers/moe/ep_moe/layer.py +111 -56
  89. sglang/srt/layers/moe/fused_moe_native.py +5 -3
  90. sglang/srt/layers/moe/fused_moe_triton/__init__.py +5 -3
  91. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  92. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=129,N=352,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +146 -0
  93. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/{E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json } +29 -29
  94. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=257,N=64,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  95. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  96. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
  97. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
  98. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  99. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +9 -1049
  100. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +212 -0
  101. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +799 -0
  102. sglang/srt/layers/moe/fused_moe_triton/layer.py +56 -45
  103. sglang/srt/layers/moe/fused_moe_triton/moe_align_block_size.py +87 -0
  104. sglang/srt/layers/moe/moe_runner/__init__.py +2 -1
  105. sglang/srt/layers/moe/moe_runner/base.py +274 -1
  106. sglang/srt/layers/moe/moe_runner/runner.py +80 -0
  107. sglang/srt/layers/moe/moe_runner/triton.py +448 -0
  108. sglang/srt/layers/moe/token_dispatcher/__init__.py +16 -4
  109. sglang/srt/layers/moe/token_dispatcher/{base_dispatcher.py → base.py} +67 -17
  110. sglang/srt/layers/moe/token_dispatcher/deepep.py +41 -38
  111. sglang/srt/layers/moe/token_dispatcher/standard.py +44 -2
  112. sglang/srt/layers/moe/topk.py +43 -12
  113. sglang/srt/layers/moe/utils.py +6 -5
  114. sglang/srt/layers/quantization/awq.py +19 -7
  115. sglang/srt/layers/quantization/base_config.py +11 -6
  116. sglang/srt/layers/quantization/blockwise_int8.py +38 -27
  117. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +50 -30
  118. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +13 -1
  119. sglang/srt/layers/quantization/deep_gemm_wrapper/compile_utils.py +141 -235
  120. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +5 -10
  121. sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +31 -22
  122. sglang/srt/layers/quantization/fp8.py +78 -48
  123. sglang/srt/layers/quantization/fp8_kernel.py +2 -2
  124. sglang/srt/layers/quantization/fp8_utils.py +45 -31
  125. sglang/srt/layers/quantization/gptq.py +25 -17
  126. sglang/srt/layers/quantization/modelopt_quant.py +107 -40
  127. sglang/srt/layers/quantization/moe_wna16.py +21 -18
  128. sglang/srt/layers/quantization/mxfp4.py +93 -68
  129. sglang/srt/layers/quantization/mxfp4_tensor.py +3 -1
  130. sglang/srt/layers/quantization/quark/quark_moe.py +32 -27
  131. sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +49 -30
  132. sglang/srt/layers/quantization/quark/utils.py +97 -0
  133. sglang/srt/layers/quantization/rocm_mxfp4_utils.py +13 -0
  134. sglang/srt/layers/quantization/unquant.py +135 -47
  135. sglang/srt/layers/quantization/utils.py +13 -0
  136. sglang/srt/layers/quantization/w4afp8.py +60 -42
  137. sglang/srt/layers/quantization/w8a8_fp8.py +35 -20
  138. sglang/srt/layers/quantization/w8a8_int8.py +83 -41
  139. sglang/srt/layers/rocm_linear_utils.py +44 -0
  140. sglang/srt/layers/rotary_embedding.py +28 -19
  141. sglang/srt/layers/sampler.py +29 -5
  142. sglang/srt/layers/utils.py +0 -14
  143. sglang/srt/lora/backend/base_backend.py +50 -8
  144. sglang/srt/lora/backend/triton_backend.py +90 -2
  145. sglang/srt/lora/layers.py +32 -0
  146. sglang/srt/lora/lora.py +4 -1
  147. sglang/srt/lora/lora_manager.py +35 -112
  148. sglang/srt/lora/mem_pool.py +24 -10
  149. sglang/srt/lora/utils.py +18 -9
  150. sglang/srt/managers/cache_controller.py +396 -365
  151. sglang/srt/managers/data_parallel_controller.py +30 -15
  152. sglang/srt/managers/detokenizer_manager.py +18 -2
  153. sglang/srt/managers/disagg_service.py +46 -0
  154. sglang/srt/managers/io_struct.py +190 -11
  155. sglang/srt/managers/mm_utils.py +6 -1
  156. sglang/srt/managers/multi_tokenizer_mixin.py +579 -0
  157. sglang/srt/managers/schedule_batch.py +27 -44
  158. sglang/srt/managers/schedule_policy.py +4 -3
  159. sglang/srt/managers/scheduler.py +148 -122
  160. sglang/srt/managers/scheduler_metrics_mixin.py +114 -8
  161. sglang/srt/managers/scheduler_output_processor_mixin.py +29 -19
  162. sglang/srt/managers/scheduler_profiler_mixin.py +1 -1
  163. sglang/srt/managers/scheduler_update_weights_mixin.py +8 -1
  164. sglang/srt/managers/template_manager.py +3 -3
  165. sglang/srt/managers/tokenizer_communicator_mixin.py +491 -0
  166. sglang/srt/managers/tokenizer_manager.py +77 -480
  167. sglang/srt/managers/tp_worker.py +16 -4
  168. sglang/srt/managers/tp_worker_overlap_thread.py +8 -10
  169. sglang/srt/mem_cache/allocator.py +1 -1
  170. sglang/srt/mem_cache/chunk_cache.py +1 -1
  171. sglang/srt/mem_cache/hicache_storage.py +53 -40
  172. sglang/srt/mem_cache/hiradix_cache.py +196 -104
  173. sglang/srt/mem_cache/lora_radix_cache.py +1 -1
  174. sglang/srt/mem_cache/memory_pool.py +395 -53
  175. sglang/srt/mem_cache/memory_pool_host.py +27 -19
  176. sglang/srt/mem_cache/radix_cache.py +6 -6
  177. sglang/srt/mem_cache/radix_cache_cpp.py +1 -1
  178. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +164 -0
  179. sglang/srt/mem_cache/storage/hf3fs/{client_hf3fs.py → hf3fs_usrbio_client.py} +5 -1
  180. sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +61 -34
  181. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +152 -23
  182. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +280 -0
  183. sglang/srt/mem_cache/storage/lmcache/unit_test.py +121 -0
  184. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +154 -95
  185. sglang/srt/mem_cache/storage/mooncake_store/test_mooncake_store.py +161 -0
  186. sglang/srt/mem_cache/swa_radix_cache.py +1 -3
  187. sglang/srt/metrics/collector.py +484 -63
  188. sglang/srt/metrics/startup_func_log_and_timer.py +150 -0
  189. sglang/srt/metrics/utils.py +48 -0
  190. sglang/srt/model_executor/cpu_graph_runner.py +640 -0
  191. sglang/srt/model_executor/cuda_graph_runner.py +13 -5
  192. sglang/srt/model_executor/forward_batch_info.py +72 -18
  193. sglang/srt/model_executor/model_runner.py +190 -32
  194. sglang/srt/model_loader/__init__.py +9 -3
  195. sglang/srt/model_loader/loader.py +33 -28
  196. sglang/srt/model_loader/utils.py +12 -0
  197. sglang/srt/model_loader/weight_utils.py +2 -1
  198. sglang/srt/models/deepseek_v2.py +323 -53
  199. sglang/srt/models/gemma3n_mm.py +1 -1
  200. sglang/srt/models/glm4_moe.py +10 -1
  201. sglang/srt/models/glm4v.py +4 -2
  202. sglang/srt/models/gpt_oss.py +7 -19
  203. sglang/srt/models/internvl.py +28 -0
  204. sglang/srt/models/llama4.py +9 -0
  205. sglang/srt/models/llama_eagle3.py +17 -0
  206. sglang/srt/models/longcat_flash.py +1026 -0
  207. sglang/srt/models/longcat_flash_nextn.py +699 -0
  208. sglang/srt/models/minicpmv.py +165 -3
  209. sglang/srt/models/mllama4.py +25 -0
  210. sglang/srt/models/opt.py +637 -0
  211. sglang/srt/models/qwen2.py +33 -3
  212. sglang/srt/models/qwen2_5_vl.py +91 -42
  213. sglang/srt/models/qwen2_moe.py +79 -14
  214. sglang/srt/models/qwen3.py +8 -2
  215. sglang/srt/models/qwen3_moe.py +39 -8
  216. sglang/srt/models/qwen3_next.py +1039 -0
  217. sglang/srt/models/qwen3_next_mtp.py +109 -0
  218. sglang/srt/models/torch_native_llama.py +1 -1
  219. sglang/srt/models/transformers.py +1 -1
  220. sglang/srt/multimodal/processors/base_processor.py +4 -2
  221. sglang/srt/multimodal/processors/glm4v.py +9 -9
  222. sglang/srt/multimodal/processors/internvl.py +141 -129
  223. sglang/srt/{conversation.py → parser/conversation.py} +38 -5
  224. sglang/srt/parser/harmony_parser.py +588 -0
  225. sglang/srt/parser/reasoning_parser.py +309 -0
  226. sglang/srt/sampling/penaltylib/orchestrator.py +14 -2
  227. sglang/srt/sampling/sampling_batch_info.py +18 -15
  228. sglang/srt/server_args.py +307 -80
  229. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +5 -0
  230. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +10 -1
  231. sglang/srt/speculative/eagle_worker.py +216 -120
  232. sglang/srt/speculative/spec_info.py +5 -0
  233. sglang/srt/speculative/standalone_worker.py +109 -0
  234. sglang/srt/tokenizer/tiktoken_tokenizer.py +6 -1
  235. sglang/srt/utils.py +96 -7
  236. sglang/srt/weight_sync/utils.py +1 -1
  237. sglang/test/attention/test_trtllm_mla_backend.py +181 -8
  238. sglang/test/few_shot_gsm8k.py +1 -0
  239. sglang/test/runners.py +4 -0
  240. sglang/test/test_cutlass_moe.py +24 -6
  241. sglang/test/test_cutlass_w4a8_moe.py +24 -9
  242. sglang/test/test_disaggregation_utils.py +66 -0
  243. sglang/test/test_utils.py +25 -1
  244. sglang/utils.py +5 -0
  245. sglang/version.py +1 -1
  246. {sglang-0.5.1.post2.dist-info → sglang-0.5.2.dist-info}/METADATA +13 -10
  247. {sglang-0.5.1.post2.dist-info → sglang-0.5.2.dist-info}/RECORD +253 -201
  248. sglang/srt/disaggregation/launch_lb.py +0 -131
  249. sglang/srt/mem_cache/storage/mooncake_store/unit_test.py +0 -40
  250. sglang/srt/reasoning_parser.py +0 -553
  251. /sglang/srt/{model_parallel.py → layers/model_parallel.py} +0 -0
  252. /sglang/srt/{code_completion_parser.py → parser/code_completion_parser.py} +0 -0
  253. /sglang/srt/{jinja_template_utils.py → parser/jinja_template_utils.py} +0 -0
  254. {sglang-0.5.1.post2.dist-info → sglang-0.5.2.dist-info}/WHEEL +0 -0
  255. {sglang-0.5.1.post2.dist-info → sglang-0.5.2.dist-info}/licenses/LICENSE +0 -0
  256. {sglang-0.5.1.post2.dist-info → sglang-0.5.2.dist-info}/top_level.txt +0 -0
sglang/srt/managers/cache_controller.py

@@ -18,53 +18,78 @@ import math
 import threading
 import time
 from queue import Empty, Full, PriorityQueue, Queue
-from typing import TYPE_CHECKING, List, Optional
+from typing import TYPE_CHECKING, List, NamedTuple, Optional, Set, Tuple

 import torch

+from sglang.srt.mem_cache.hicache_storage import HiCacheStorageConfig
+
 if TYPE_CHECKING:
     from sglang.srt.mem_cache.allocator import BaseTokenToKVPoolAllocator
     from sglang.srt.mem_cache.memory_pool_host import HostKVCache

-from sglang.srt.distributed import get_tensor_model_parallel_rank
-from sglang.srt.mem_cache.memory_pool_host import MLATokenToKVPoolHost
+from sglang.srt.distributed import (
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+)
+from sglang.srt.layers.dp_attention import (
+    get_attention_dp_rank,
+    get_attention_tp_rank,
+    get_attention_tp_size,
+    is_dp_attention_enabled,
+)
+from sglang.srt.mem_cache.memory_pool import MHATokenToKVPool, MLATokenToKVPool

 logger = logging.getLogger(__name__)


+class LayerLoadingEvent:
+    def __init__(self, num_layers: int):
+        self._num_layers = num_layers
+        self.load_events = [torch.cuda.Event() for _ in range(num_layers)]
+        self.start_event = torch.cuda.Event()  # start event on controller stream
+
+    def complete(self, layer_index: int):
+        assert 0 <= layer_index < self._num_layers
+        self.load_events[layer_index].record()
+
+    def wait(self, layer_index: int):
+        torch.cuda.current_stream().wait_event(self.load_events[layer_index])
+
+    @property
+    def finish_event(self):
+        return self.load_events[-1]
+
+
 class LayerDoneCounter:
-    def __init__(self, num_layers):
+    def __init__(self, num_layers: int):
         self.num_layers = num_layers
         # extra producer and consumer counters for overlap mode
         self.num_counters = 3
-        self.counters = [num_layers] * self.num_counters
-        self.conditions = [threading.Condition() for _ in range(self.num_counters)]
-        self.producer_index = 0
-        self.consumer_index = 0
-
-    def next_producer(self):
-        return (self.producer_index + 1) % self.num_counters
+        self.events = [LayerLoadingEvent(num_layers) for _ in range(self.num_counters)]
+        self.producer_index = -1
+        self.consumer_index = -1

     def update_producer(self):
-        self.producer_index = self.next_producer()
+        self.producer_index = (self.producer_index + 1) % self.num_counters
+        assert self.events[
+            self.producer_index
+        ].finish_event.query(), (
+            "Producer finish event should be ready before being reused."
+        )
         return self.producer_index

-    def set_consumer(self, index):
+    def set_consumer(self, index: int):
         self.consumer_index = index

-    def increment(self):
-        with self.conditions[self.producer_index]:
-            self.counters[self.producer_index] += 1
-            self.conditions[self.producer_index].notify_all()
-
-    def wait_until(self, threshold):
-        with self.conditions[self.consumer_index]:
-            while self.counters[self.consumer_index] <= threshold:
-                self.conditions[self.consumer_index].wait()
+    def wait_until(self, threshold: int):
+        if self.consumer_index < 0:
+            return
+        self.events[self.consumer_index].wait(threshold)

     def reset(self):
-        with self.conditions[self.producer_index]:
-            self.counters[self.producer_index] = 0
+        self.producer_index = -1
+        self.consumer_index = -1


 class CacheOperation:
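The key change in this hunk: LayerDoneCounter no longer counts finished layers behind threading.Condition locks; each producer slot owns a LayerLoadingEvent whose per-layer torch.cuda.Event lets the consumer stall only the GPU stream that actually needs layer i. A minimal stand-alone sketch of that pattern (illustrative names, not the sglang API; assumes a CUDA device is available):

    import torch

    num_layers = 4
    load_stream = torch.cuda.Stream()  # producer stream for host-to-device copies
    events = [torch.cuda.Event() for _ in range(num_layers)]

    src = [torch.randn(1024, pin_memory=True) for _ in range(num_layers)]
    dst = [torch.empty(1024, device="cuda") for _ in range(num_layers)]

    with torch.cuda.stream(load_stream):
        for i in range(num_layers):
            dst[i].copy_(src[i], non_blocking=True)  # async H2D copy of layer i
            events[i].record()                       # publish: layer i is loaded

    for i in range(num_layers):
        # The compute stream waits on the GPU timeline instead of blocking a
        # Python thread on a condition variable, as the old counter did.
        torch.cuda.current_stream().wait_event(events[i])
        dst[i].mul_(2.0)  # stand-in for compute that consumes layer i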
@@ -88,36 +113,30 @@ class CacheOperation:
         # default priority is the order of creation
         self.priority = priority if priority is not None else self.id

-    def merge(self, other: "CacheOperation") -> None:
-        # multiple operations can be merged into a single operation for batch processing
-        self.host_indices = torch.cat([self.host_indices, other.host_indices])
-        self.device_indices = torch.cat([self.device_indices, other.device_indices])
-        self.priority = min(self.priority, other.priority)
-        self.node_ids.extend(other.node_ids)
-
-    def split(self, factor) -> List["CacheOperation"]:
-        # split an operation into smaller operations to reduce the size of intermediate buffers
-        if factor <= 1:
-            return [self]
-
-        chunk_size = math.ceil(len(self.host_indices) / factor)
-        split_ops = []
-        for i in range(0, len(self.host_indices), chunk_size):
-            split_ops.append(
-                CacheOperation(
-                    host_indices=self.host_indices[i : i + chunk_size],
-                    device_indices=self.device_indices[i : i + chunk_size],
-                    node_id=0,
-                )
-            )
-        # Inherit the node_ids on the final chunk
-        if split_ops:
-            split_ops[-1].node_ids = self.node_ids
+    @staticmethod
+    def merge_ops(ops: List[CacheOperation]) -> CacheOperation:
+        assert len(ops) > 0
+        if len(ops) == 1:
+            return ops[0]
+
+        host_indices = torch.cat([op.host_indices for op in ops])
+        device_indices = torch.cat([op.device_indices for op in ops])
+        node_ids = []
+        priority = min(op.priority for op in ops)
+        for op in ops:
+            node_ids.extend(op.node_ids)
+        merged_op = CacheOperation(host_indices, device_indices, -1, priority)
+        merged_op.node_ids = node_ids
+        return merged_op
+
+    def __lt__(self, other: CacheOperation):
+        return self.priority < other.priority

-        return split_ops

-    def __lt__(self, other: "CacheOperation"):
-        return self.priority < other.priority
+class HiCacheAck(NamedTuple):
+    start_event: torch.cuda.Event
+    finish_event: torch.cuda.Event
+    node_ids: List[int]


 class TransferBuffer:
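merge_ops replaces the pairwise merge()/split() methods: the whole queue is collapsed into one operation in a single shot, concatenating the index tensors in queue order, taking the numerically smallest (most urgent) priority, and unioning the node ids so every tree node is still acknowledged once the batch finishes. A toy check of that invariant (hypothetical Op class, not sglang code):

    import torch

    class Op:  # stand-in for CacheOperation
        def __init__(self, host, device, priority, node_ids):
            self.host_indices, self.device_indices = host, device
            self.priority, self.node_ids = priority, node_ids

    ops = [
        Op(torch.tensor([0, 1]), torch.tensor([10, 11]), priority=5, node_ids=[7]),
        Op(torch.tensor([2, 3]), torch.tensor([12, 13]), priority=2, node_ids=[8, 9]),
    ]
    host = torch.cat([op.host_indices for op in ops])
    device = torch.cat([op.device_indices for op in ops])
    priority = min(op.priority for op in ops)
    node_ids = [n for op in ops for n in op.node_ids]

    # Concatenation preserves the per-token pairing of host and device slots.
    assert host.tolist() == [0, 1, 2, 3] and device.tolist() == [10, 11, 12, 13]
    assert priority == 2 and node_ids == [7, 8, 9]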
@@ -196,26 +215,25 @@ class PrefetchOperation(StorageOperation):
     ):
         self.request_id = request_id

-        self._done_flag = False
         self._lock = threading.Lock()
-
+        self._terminated_flag = False
         self.start_time = time.monotonic()

         super().__init__(host_indices, token_ids, last_hash)

     def increment(self, num_tokens: int):
         with self._lock:
-            if self._done_flag:
+            if self._terminated_flag:
                 return False
             self.completed_tokens += num_tokens
             return True

-    def mark_done(self):
+    def mark_terminate(self):
         with self._lock:
-            self._done_flag = True
+            self._terminated_flag = True

-    def is_done(self) -> bool:
-        return self._done_flag
+    def is_terminated(self) -> bool:
+        return self._terminated_flag


 class HiCacheController:
@@ -226,11 +244,13 @@ class HiCacheController:
         mem_pool_host: HostKVCache,
         page_size: int,
         tp_group: torch.distributed.ProcessGroup,
-        load_cache_event: threading.Event = None,
+        load_cache_event: threading.Event,
         write_policy: str = "write_through_selective",
         io_backend: str = "",
         storage_backend: Optional[str] = None,
         prefetch_threshold: int = 256,
+        model_name: Optional[str] = None,
+        storage_backend_extra_config: Optional[str] = None,
     ):
         self.mem_pool_device_allocator = token_to_kv_pool_allocator
         self.mem_pool_device = token_to_kv_pool_allocator.get_kvcache()
@@ -238,30 +258,37 @@ class HiCacheController:
         self.write_policy = write_policy
         self.page_size = page_size
         self.io_backend = io_backend
-
         self.enable_storage = False
-        self.is_mla = isinstance(self.mem_pool_host, MLATokenToKVPoolHost)
-        # todo: move backend initialization to storage backend module
+
         if storage_backend is not None:
             self.storage_backend_type = storage_backend
-            from sglang.srt.mem_cache.hicache_storage import HiCacheFile, get_hash_str
+            from sglang.srt.mem_cache.hicache_storage import get_hash_str
+
+            self.get_hash_str = get_hash_str
+            self.storage_config = self._generate_storage_config(
+                model_name, storage_backend_extra_config
+            )
+            # for MLA models, only one rank needs to backup the KV cache
+            self.backup_skip = (
+                self.storage_config.is_mla_model
+                # todo: load balancing
+                and self.storage_config.tp_rank != 0
+            )

             if storage_backend == "file":
-                self.storage_backend = HiCacheFile(is_mla=self.is_mla)
-                self.get_hash_str = get_hash_str
+                from sglang.srt.mem_cache.hicache_storage import HiCacheFile
+
+                self.storage_backend = HiCacheFile(self.storage_config)
             elif storage_backend == "nixl":
                 from sglang.srt.mem_cache.storage.nixl.hicache_nixl import HiCacheNixl

                 self.storage_backend = HiCacheNixl()
-                self.get_hash_str = get_hash_str
             elif storage_backend == "mooncake":
                 from sglang.srt.mem_cache.storage.mooncake_store.mooncake_store import (
                     MooncakeStore,
-                    get_hash_str_mooncake,
                 )

-                self.storage_backend = MooncakeStore(is_mla=self.is_mla)
-                self.get_hash_str = get_hash_str_mooncake
+                self.storage_backend = MooncakeStore(self.storage_config)
                 self.storage_backend.register_buffer(self.mem_pool_host.kv_buffer)
                 assert self.mem_pool_host.layout == "page_first"
             elif storage_backend == "hf3fs":
@@ -279,19 +306,21 @@
                 )
                 dtype = mem_pool_host.dtype
                 self.storage_backend = HiCacheHF3FS.from_env_config(
-                    bytes_per_page, dtype
+                    bytes_per_page, dtype, self.storage_config
                 )
-                self.get_hash_str = get_hash_str
             else:
                 raise NotImplementedError(
                     f"Unsupported storage backend: {storage_backend}"
                 )
+
             self.enable_storage = True
             # todo: threshold policy for prefetching
             self.prefetch_threshold = max(prefetch_threshold, self.page_size)
             self.prefetch_capacity_limit = int(
                 0.8 * (self.mem_pool_host.size - self.mem_pool_device.size)
             )
+            # granularity of batch storage IO operations, in number of pages
+            self.storage_batch_size = 128
             # tracking the number of tokens locked in prefetching, updated by the main scheduler thread
             self.prefetch_tokens_occupied = 0
@@ -302,15 +331,26 @@
             self.prefetch_tp_group = torch.distributed.new_group(
                 group_ranks, backend="gloo"
             )
-            self.prefetch_io_tp_group = torch.distributed.new_group(
-                group_ranks, backend="gloo"
-            )
-            self.backup_tp_group = torch.distributed.new_group(
-                group_ranks, backend="gloo"
-            )

-        self.load_cache_event = load_cache_event
-        self.layer_done_counter = LayerDoneCounter(self.mem_pool_device.layer_num)
+        # Select the get and set functions
+        self.page_get_func = self._generic_page_get
+        self.page_set_func = self._generic_page_set
+        self.batch_exists_func = self.storage_backend.batch_exists
+        self.is_3fs_zerocopy = (
+            self.storage_backend_type == "hf3fs"
+            and self.mem_pool_host.layout == "page_first"
+        )
+        if self.storage_backend_type == "mooncake":
+            self.page_get_func = self._mooncake_page_get
+            self.page_set_func = self._mooncake_page_set
+        elif self.is_3fs_zerocopy:
+            self.page_get_func = self._3fs_zero_copy_page_get
+            self.page_set_func = self._3fs_zero_copy_page_set
+            self.batch_exists_func = self._3fs_zero_copy_batch_exists
+
+        self.device = self.mem_pool_device.device
+        self.layer_num = self.mem_pool_device.layer_num
+        self.layer_done_counter = LayerDoneCounter(self.layer_num)
         self.mem_pool_device.register_layer_transfer_counter(self.layer_done_counter)

         if write_policy not in [
@@ -320,11 +360,11 @@
         ]:
             raise ValueError(f"Invalid write policy: {write_policy}")

-        self.write_queue = PriorityQueue()
-        self.load_queue = PriorityQueue()
-
-        self.ack_write_queue = Queue()
-        self.ack_load_queue = Queue()
+        # self.write_queue = PriorityQueue[CacheOperation]()
+        self.load_queue: List[CacheOperation] = []
+        self.write_queue: List[CacheOperation] = []
+        self.ack_load_queue: List[HiCacheAck] = []
+        self.ack_write_queue: List[HiCacheAck] = []

         self.stop_event = threading.Event()
         self.write_buffer = TransferBuffer(self.stop_event)
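With the writer/loader threads gone (next hunk), write_queue and load_queue become plain lists that are flushed synchronously, and the ack queues now carry HiCacheAck event pairs instead of bare node ids: an ack is final only once its finish_event has executed on the GPU. A hedged sketch of how a consumer might drain such a queue without blocking (Ack and drain_acks are illustrative, not sglang code):

    from typing import List, NamedTuple

    import torch

    class Ack(NamedTuple):  # stand-in for HiCacheAck
        finish_event: torch.cuda.Event
        node_ids: List[int]

    def drain_acks(ack_queue: List[Ack]) -> List[int]:
        # query() polls the GPU timeline and never blocks; acks complete in
        # submission order, so stop at the first unfinished one.
        done: List[int] = []
        while ack_queue and ack_queue[0].finish_event.query():
            done.extend(ack_queue.pop(0).node_ids)
        return done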
@@ -335,16 +375,6 @@
         self.write_stream = torch.cuda.Stream()
         self.load_stream = torch.cuda.Stream()

-        self.write_thread = threading.Thread(
-            target=self.write_thread_func_direct, daemon=True
-        )
-        self.load_thread = threading.Thread(
-            target=self.load_thread_func_layer_by_layer, daemon=True
-        )
-
-        self.write_thread.start()
-        self.load_thread.start()
-
         if self.enable_storage:
             self.prefetch_thread = threading.Thread(
                 target=self.prefetch_thread_func, daemon=True
@@ -357,21 +387,57 @@

             self.prefetch_revoke_queue = Queue()
             self.ack_backup_queue = Queue()
+            self.host_mem_release_queue = Queue()

             self.prefetch_thread.start()
             self.backup_thread.start()

+    def _generate_storage_config(
+        self,
+        model_name: Optional[str] = None,
+        storage_backend_extra_config: Optional[str] = None,
+    ):
+
+        if is_dp_attention_enabled():
+            self.tp_rank = get_attention_tp_rank()
+            self.tp_size = get_attention_tp_size()
+            self.dp_rank = get_attention_dp_rank()
+        else:
+            self.tp_rank = get_tensor_model_parallel_rank()
+            self.tp_size = get_tensor_model_parallel_world_size()
+            self.dp_rank = 0
+
+        # Currently, AscendMLAPagedTokenToKVPool is the subclass of MLATokenToKVPool.
+        is_mla_backend = isinstance(self.mem_pool_device, MLATokenToKVPool)
+
+        # Parse extra config JSON if provided
+        extra_config = None
+        if storage_backend_extra_config:
+            try:
+                import json
+
+                extra_config = json.loads(storage_backend_extra_config)
+            except Exception as e:
+                logger.error(f"Invalid backend extra config JSON: {e}")
+
+        return HiCacheStorageConfig(
+            tp_rank=self.tp_rank,
+            tp_size=self.tp_size,
+            is_mla_model=is_mla_backend,
+            is_page_first_layout=self.mem_pool_host.layout == "page_first",
+            model_name=model_name,
+            extra_config=extra_config,
+        )
+
     def reset(self):
         self.stop_event.set()
-        self.write_thread.join()
-        self.load_thread.join()

-        self.write_queue.queue.clear()
-        self.load_queue.queue.clear()
+        self.write_queue.clear()
+        self.load_queue.clear()
         self.write_buffer.clear()
         self.load_buffer.clear()
-        self.ack_write_queue.queue.clear()
-        self.ack_load_queue.queue.clear()
+        self.ack_write_queue.clear()
+        self.ack_load_queue.clear()
         if self.enable_storage:
             self.prefetch_thread.join()
             self.backup_thread.join()
@@ -380,15 +446,7 @@
             self.prefetch_revoke_queue.queue.clear()
             self.ack_backup_queue.queue.clear()

-        self.write_thread = threading.Thread(
-            target=self.write_thread_func_direct, daemon=True
-        )
-        self.load_thread = threading.Thread(
-            target=self.load_thread_func_layer_by_layer, daemon=True
-        )
         self.stop_event.clear()
-        self.write_thread.start()
-        self.load_thread.start()

         if self.enable_storage:
             self.prefetch_thread = threading.Thread(
@@ -400,20 +458,11 @@
             self.prefetch_thread.start()
             self.backup_thread.start()

-    @property
-    def backup_skip(self):
-        return (
-            self.is_mla
-            and get_tensor_model_parallel_rank() != 0
-            # todo: only support file and mooncake
-            and self.storage_backend_type in ["file", "mooncake"]
-        )
-
     def write(
         self,
         device_indices: torch.Tensor,
         priority: Optional[int] = None,
-        node_id: int = 0,
+        node_id: int = -1,
     ) -> Optional[torch.Tensor]:
         """
         Back up KV caches from device memory to host memory.
@@ -422,17 +471,46 @@
         if host_indices is None:
             return None
         self.mem_pool_host.protect_write(host_indices)
-        torch.cuda.current_stream().synchronize()
-        self.write_queue.put(
+        self.write_queue.append(
             CacheOperation(host_indices, device_indices, node_id, priority)
         )
+        self.start_writing()
         return host_indices

+    def start_writing(self) -> None:
+        if len(self.write_queue) == 0:
+            return
+
+        op = CacheOperation.merge_ops(self.write_queue)
+        host_indices, device_indices = self.move_indices(op)
+        self.write_queue.clear()
+
+        start_event = torch.cuda.Event()
+        finish_event = torch.cuda.Event()
+
+        start_event.record()
+        with torch.cuda.stream(self.write_stream):
+            start_event.wait(self.write_stream)
+            self.mem_pool_host.backup_from_device_all_layer(
+                self.mem_pool_device, host_indices, device_indices, self.io_backend
+            )
+            self.mem_pool_host.complete_io(op.host_indices)
+            finish_event.record()
+            # NOTE: We must save the host indices and device indices here,
+            # this is because we need to guarantee that these tensors are
+            # still alive when the write stream is executing.
+            if host_indices.is_cuda:
+                host_indices.record_stream(self.write_stream)
+            if device_indices.is_cuda:
+                device_indices.record_stream(self.write_stream)
+
+        self.ack_write_queue.append(HiCacheAck(start_event, finish_event, op.node_ids))
+
     def load(
         self,
         host_indices: torch.Tensor,
         priority: Optional[int] = None,
-        node_id: int = 0,
+        node_id: int = -1,
     ) -> Optional[torch.Tensor]:
         """
         Load KV caches from host memory to device memory.
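start_writing is the synchronous replacement for the old write thread: the copy is enqueued on write_stream between a start event (which orders it after work already submitted on the current stream, avoiding the old torch.cuda.current_stream().synchronize()) and a finish event that later backs the HiCacheAck, while record_stream tells the caching allocator the index tensors are still in use by the side stream. A minimal sketch of the bracketing pattern (illustrative tensors, assumes CUDA):

    import torch

    side_stream = torch.cuda.Stream()
    start_event, finish_event = torch.cuda.Event(), torch.cuda.Event()

    data = torch.randn(1 << 20, device="cuda")
    dst = torch.empty_like(data)

    start_event.record()                 # capture the current stream's position
    with torch.cuda.stream(side_stream):
        start_event.wait(side_stream)    # side stream starts after that position
        dst.copy_(data)                  # the actual transfer
        finish_event.record()
        # Keep the tensors alive for the side stream even if Python drops them:
        data.record_stream(side_stream)
        dst.record_stream(side_stream)

    finish_event.synchronize()  # sglang instead polls finish_event via the acks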
@@ -441,17 +519,18 @@
         if device_indices is None:
             return None
         self.mem_pool_host.protect_load(host_indices)
-        # to ensure the device indices are ready before accessed by another CUDA stream
-        torch.cuda.current_stream().synchronize()
-        self.load_queue.put(
+        self.load_queue.append(
             CacheOperation(host_indices, device_indices, node_id, priority)
         )
         return device_indices

-    def move_indices(self, host_indices, device_indices):
+    def move_indices(self, op: CacheOperation):
+        host_indices, device_indices = op.host_indices, op.device_indices
         # move indices to GPU if using kernels, to host if using direct indexing
         if self.io_backend == "kernel":
-            return host_indices.to(self.mem_pool_device.device), device_indices
+            if not host_indices.is_cuda:
+                host_indices = host_indices.to(self.device, non_blocking=True)
+            return host_indices, device_indices
         elif self.io_backend == "direct":
             device_indices = device_indices.cpu()
             host_indices, idx = host_indices.sort()
@@ -459,58 +538,20 @@
         else:
             raise ValueError(f"Unsupported io backend")

-    def write_thread_func_direct(self):
-        """
-        Directly write through KV caches to host memory without buffering.
-        """
-        torch.cuda.set_stream(self.write_stream)
-        while not self.stop_event.is_set():
-            try:
-                operation = self.write_queue.get(block=True, timeout=1)
-                host_indices, device_indices = self.move_indices(
-                    operation.host_indices, operation.device_indices
-                )
-                self.mem_pool_host.backup_from_device_all_layer(
-                    self.mem_pool_device, host_indices, device_indices, self.io_backend
-                )
-                self.write_stream.synchronize()
-                self.mem_pool_host.complete_io(operation.host_indices)
-                for node_id in operation.node_ids:
-                    if node_id != 0:
-                        self.ack_write_queue.put(node_id)
-            except Empty:
-                continue
-            except Exception as e:
-                logger.error(e)
+    def start_loading(self) -> int:
+        if len(self.load_queue) == 0:
+            return -1

-    def load_thread_func_layer_by_layer(self):
-        """
-        Load KV caches from host memory to device memory layer by layer.
-        """
-        torch.cuda.set_stream(self.load_stream)
-        while not self.stop_event.is_set():
-            self.load_cache_event.wait(timeout=1)
-            if not self.load_cache_event.is_set():
-                continue
-            self.load_cache_event.clear()
-            self.layer_done_counter.update_producer()
-
-            batch_operation = None
-            while self.load_queue.qsize() > 0:
-                op = self.load_queue.get(block=True)
-                if batch_operation is None:
-                    batch_operation = op
-                else:
-                    batch_operation.merge(op)
-            if batch_operation is None:
-                continue
+        producer_id = self.layer_done_counter.update_producer()
+        op = CacheOperation.merge_ops(self.load_queue)
+        host_indices, device_indices = self.move_indices(op)
+        self.load_queue.clear()
+        producer_event = self.layer_done_counter.events[producer_id]
+        producer_event.start_event.record()

-            # start layer-wise KV cache transfer from CPU to GPU
-            self.layer_done_counter.reset()
-            host_indices, device_indices = self.move_indices(
-                batch_operation.host_indices, batch_operation.device_indices
-            )
-            for i in range(self.mem_pool_host.layer_num):
+        with torch.cuda.stream(self.load_stream):
+            producer_event.start_event.wait(self.load_stream)
+            for i in range(self.layer_num):
                 self.mem_pool_host.load_to_device_per_layer(
                     self.mem_pool_device,
                     host_indices,
@@ -518,13 +559,24 @@
                     i,
                     self.io_backend,
                 )
-                self.load_stream.synchronize()
-                self.layer_done_counter.increment()
-
-            self.mem_pool_host.complete_io(batch_operation.host_indices)
-            for node_id in batch_operation.node_ids:
-                if node_id != 0:
-                    self.ack_load_queue.put(node_id)
+                producer_event.complete(i)
+            self.mem_pool_host.complete_io(op.host_indices)
+            # NOTE: We must save the host indices and device indices here,
+            # this is because we need to guarantee that these tensors are
+            # still alive when the load stream is executing.
+            if host_indices.is_cuda:
+                host_indices.record_stream(self.load_stream)
+            if device_indices.is_cuda:
+                device_indices.record_stream(self.load_stream)
+
+        self.ack_load_queue.append(
+            HiCacheAck(
+                start_event=producer_event.start_event,
+                finish_event=producer_event.finish_event,
+                node_ids=op.node_ids,
+            )
+        )
+        return producer_id

     def evict_device(
         self, device_indices: torch.Tensor, host_indices: torch.Tensor
@@ -567,63 +619,93 @@
         return operation

     def terminate_prefetch(self, operation):
-        operation.mark_done()
+        operation.mark_terminate()
        return operation.completed_tokens, operation.hash_value

-    def zerocopy_page_transfer(self, operation, batch_size=8):
-        hashes, dsts = self.mem_pool_host.get_buffer_with_hash(
-            operation.hash_value, operation.host_indices
+    def append_host_mem_release(self, host_indices: torch.Tensor):
+        chunks = host_indices.split(self.mem_pool_host.page_size)
+        for chunk in chunks:
+            self.host_mem_release_queue.put(chunk)
+
+    def _3fs_zero_copy_batch_exists(self, batch_hashes):
+        _batch_hashes, _, factor = self.mem_pool_host.get_buffer_with_hash(batch_hashes)
+        hit_page_num = self.storage_backend.batch_exists(_batch_hashes) // factor
+        return hit_page_num
+
+    def _3fs_zero_copy_page_get(self, operation, hash_values, host_indices):
+        hashes, dsts, factor = self.mem_pool_host.get_buffer_with_hash(
+            hash_values, host_indices
         )
-        for i in range(0, len(hashes), batch_size):
-            page_hashes = hashes[i : i + batch_size]
-            page_dsts = dsts[i : i + batch_size]
-            page_data = self.storage_backend.batch_get(page_hashes, page_dsts)
-            if page_data is None:
-                logger.warning(
-                    f"Prefetch operation {operation.request_id} failed to retrieve page {page_hashes}."
-                )
-                break
-            completed_tokens = operation.completed_tokens
-            if operation.increment(self.page_size * len(page_hashes)):
-                for i in range(len(page_hashes)):
-                    completed_tokens += self.page_size
-            else:
-                break
+        page_data = self.storage_backend.batch_get(hashes, dsts)
+        if page_data:
+            inc = self.page_size * len(hashes) // factor
+            operation.increment(inc)
+        else:
+            logger.warning(
+                f"Prefetch operation {operation.request_id} failed to retrieve page {hashes}."
+            )

-    def generic_page_transfer(self, operation, batch_size=8):
-        for i in range(0, len(operation.hash_value), batch_size):
-            page_hashes = operation.hash_value[i : i + batch_size]
-            # todo: zero copy
-            dummy_page_dst = [
-                self.mem_pool_host.get_dummy_flat_data_page()
-                for _ in range(len(page_hashes))
-            ]
-            page_data = self.storage_backend.batch_get(page_hashes, dummy_page_dst)
-            if page_data is None:
+    def _mooncake_page_get(self, operation, hash_values, host_indices):
+        key_strs, buffer_ptrs, buffer_sizes = self.mem_pool_host.get_buffer_meta(
+            hash_values,
+            host_indices,
+            self.storage_config.tp_rank,
+        )
+        get_result = self.storage_backend.batch_get(
+            key_strs,
+            target_locations=buffer_ptrs,
+            target_sizes=buffer_sizes,
+        )
+        if get_result != len(hash_values):
+            logger.warning(
+                f"Prefetch operation {operation.request_id} failed or partially failed."
+            )
+        if get_result != 0:
+            operation.increment(get_result * self.page_size)
+
+    def _generic_page_get(self, operation, hash_values, host_indices):
+        dummy_page_dst = [
+            self.mem_pool_host.get_dummy_flat_data_page() for _ in hash_values
+        ]
+        page_data = self.storage_backend.batch_get(hash_values, dummy_page_dst)
+        if page_data is None:
+            return
+        for i in range(len(hash_values)):
+            if page_data[i] is None:
                 logger.warning(
-                    f"Prefetch operation {operation.request_id} failed to retrieve page {page_hashes}."
+                    f"Prefetch operation {operation.request_id} failed to retrieve page {hash_values[i]}."
                 )
                 break
-            completed_tokens = operation.completed_tokens
-            if operation.increment(self.page_size * len(page_hashes)):
-                for i in range(len(page_hashes)):
-                    self.mem_pool_host.set_from_flat_data_page(
-                        operation.host_indices[completed_tokens],
-                        page_data[i],
-                    )
-                    completed_tokens += self.page_size
-            else:
-                break
-
-    def mooncake_page_transfer(self, operation):
-        key_strs, buffer_ptrs, buffer_sizes = self.mem_pool_host.get_buffer_meta(
-            operation.hash_value, operation.host_indices
+            # Must set the data before increasing the completed tokens.
+            # Otherwise this page may be read before being set.
+            self.mem_pool_host.set_from_flat_data_page(
+                host_indices[i * self.page_size],
+                page_data[i],
+            )
+            if not operation.increment(self.page_size):
+                break  # Operation terminated by controller
+
+    def _page_transfer(self, operation):
+        # Transfer batch by batch
+        for i in range(0, len(operation.hash_value), self.storage_batch_size):
+            batch_hashes = operation.hash_value[i : i + self.storage_batch_size]
+            batch_host_indices = operation.host_indices[
+                i * self.page_size : (i + len(batch_hashes)) * self.page_size
+            ]
+            prev_completed_tokens = operation.completed_tokens
+            # Get one batch token, and update the completed_tokens if succeed
+            self.page_get_func(operation, batch_hashes, batch_host_indices)
+            # Check termination
+            if (
+                operation.completed_tokens
+                != prev_completed_tokens + len(batch_hashes) * self.page_size
+            ):
+                operation.mark_terminate()
+                break  # Some operations fail or operation terminated by controller
+        # release pre-allocated memory
+        self.append_host_mem_release(
+            operation.host_indices[operation.completed_tokens :]
         )
-        self.storage_backend.batch_get(key_strs, buffer_ptrs, buffer_sizes)
-        operation.increment(len(operation.hash_value) * self.page_size)
-
-    def is_mooncake_backend(self):
-        return self.storage_backend_type == "mooncake"

     def prefetch_io_aux_func(self):
         """
@@ -632,35 +714,50 @@
         while not self.stop_event.is_set():
             try:
                 operation = self.prefetch_buffer.get(block=True, timeout=1)
-                if self.is_mooncake_backend():
-                    self.mooncake_page_transfer(operation)
-                elif self.storage_backend_type == "hf3fs":
-                    if self.mem_pool_host.layout == "page_first":
-                        self.zerocopy_page_transfer(operation, batch_size=128)
-                    elif self.mem_pool_host.layout == "layer_first":
-                        self.generic_page_transfer(operation, batch_size=128)
-                else:
-                    self.generic_page_transfer(operation)
-
-                if self.tp_world_size > 1:
-                    # to ensure all TP workers release the host memory at the same time
-                    torch.distributed.barrier(group=self.prefetch_io_tp_group)
+                self._page_transfer(operation)
                 # operation terminated by controller, release pre-allocated memory
-                self.mem_pool_host.free(
+                self.append_host_mem_release(
                     operation.host_indices[operation.completed_tokens :]
                 )
             except Empty:
                 continue

-    def prefetch_rate_limit_check(self) -> bool:
+    def prefetch_rate_limited(self) -> bool:
         """
         Rate limit the prefetching operations to avoid overwhelming the storage backend.
         """
         # cancel prefetch if too much memory is occupied
         if self.prefetch_tokens_occupied >= self.prefetch_capacity_limit:
-            return False
+            return True
         # todo: more sophisticated rate limiting based on storage backend performance
-        return True
+        return False
+
+    def _storage_hit_query(self, operation) -> tuple[list[str], int]:
+        last_hash = operation.last_hash
+        tokens_to_fetch = operation.token_ids
+
+        storage_query_count = 0
+        hash_value = []
+
+        for start in range(
+            0, len(tokens_to_fetch), self.page_size * self.storage_batch_size
+        ):
+            end = min(
+                start + self.page_size * self.storage_batch_size, len(tokens_to_fetch)
+            )
+            batch_tokens = tokens_to_fetch[start:end]
+            batch_hashes = []
+            for i in range(0, len(batch_tokens), self.page_size):
+                last_hash = self.get_hash_str(
+                    batch_tokens[i : i + self.page_size], last_hash
+                )
+                batch_hashes.append(last_hash)
+            hit_page_num = self.batch_exists_func(batch_hashes)
+            hash_value.extend(batch_hashes[:hit_page_num])
+            storage_query_count += hit_page_num * self.page_size
+            if hit_page_num < len(batch_hashes):
+                break
+        return hash_value, storage_query_count

     def prefetch_thread_func(self):
         """
@@ -675,39 +772,7 @@
                 if operation is None:
                     continue

-                storage_hit_count = 0
-                if (
-                    operation.host_indices is not None
-                ) and self.prefetch_rate_limit_check():
-                    last_hash = operation.last_hash
-                    tokens_to_fetch = operation.token_ids
-
-                    remaining_tokens = len(tokens_to_fetch)
-                    hash_value = []
-                    while remaining_tokens >= self.page_size:
-                        last_hash = self.get_hash_str(
-                            tokens_to_fetch[
-                                storage_hit_count : storage_hit_count + self.page_size
-                            ],
-                            last_hash,
-                        )
-
-                        # todo, more unified interface
-                        if not self.is_mooncake_backend():
-                            if not self.storage_backend.exists(last_hash):
-                                break
-                        hash_value.append(last_hash)
-                        storage_hit_count += self.page_size
-                        remaining_tokens -= self.page_size
-
-                    if self.is_mooncake_backend():
-                        # deferring to batch exists for mooncake store
-                        exist_result = self.storage_backend.exists(hash_value)
-                        storage_hit_count = (
-                            sum(1 for v in exist_result.values() if v != 0)
-                            * self.page_size
-                        )
-
+                hash_value, storage_hit_count = self._storage_hit_query(operation)
                 if self.tp_world_size > 1:
                     storage_hit_count_tensor = torch.tensor(
                         storage_hit_count, dtype=torch.int
@@ -722,8 +787,7 @@
                 if storage_hit_count < self.prefetch_threshold:
                     # not to prefetch if not enough benefits
                     self.prefetch_revoke_queue.put(operation.request_id)
-                    if operation.host_indices is not None:
-                        self.mem_pool_host.free(operation.host_indices)
+                    self.append_host_mem_release(operation.host_indices)
                     logger.debug(
                         f"Revoking prefetch for request {operation.request_id} due to insufficient hits ({storage_hit_count})."
                     )
@@ -732,7 +796,9 @@
                         : (storage_hit_count // self.page_size)
                     ]
                     # free the pre-allocated memory for pages that are not hit
-                    self.mem_pool_host.free(operation.host_indices[storage_hit_count:])
+                    self.append_host_mem_release(
+                        operation.host_indices[storage_hit_count:]
+                    )
                     operation.host_indices = operation.host_indices[:storage_hit_count]
                     logger.debug(
                         f"Prefetching {len(operation.hash_value)} pages for request {operation.request_id}."
@@ -755,59 +821,52 @@
             self.backup_queue.put(operation)
         return operation.id

-    def zerocopy_page_backup(self, operation, batch_size=8):
-        hashes, dsts = self.mem_pool_host.get_buffer_with_hash(
-            operation.hash_value, operation.host_indices
+    # non-zero copy
+    def _generic_page_set(self, hash_values, host_indices) -> bool:
+        data = [
+            self.mem_pool_host.get_flat_data_page(host_indices[i * self.page_size])
+            for i in range(len(hash_values))
+        ]
+        return self.storage_backend.batch_set(hash_values, data)
+
+    # zero copy
+    def _mooncake_page_set(self, hash_values, host_indices) -> bool:
+        key_strs, buffer_ptrs, buffer_sizes = self.mem_pool_host.get_buffer_meta(
+            hash_values,
+            host_indices,
+            self.storage_config.tp_rank,
         )
-        for i in range(0, len(hashes), batch_size):
-            page_hashes = hashes[i : i + batch_size]
-            page_data = dsts[i : i + batch_size]
-            success = self.storage_backend.batch_set(page_hashes, page_data)
-            if not success:
-                logger.warning(f"Failed to write page {page_hashes} to storage.")
-                break
-            operation.completed_tokens += self.page_size * len(page_hashes)
-
-    def generic_page_backup(self, operation, batch_size=8):
-        for i in range(0, len(operation.hash_value), batch_size):
-            page_hashes = operation.hash_value[i : i + batch_size]
-            page_data = [
-                self.mem_pool_host.get_flat_data_page(
-                    operation.host_indices[j * self.page_size]
-                )
-                for j in range(i, i + len(page_hashes))
+        success = self.storage_backend.batch_set(
+            key_strs,
+            target_locations=buffer_ptrs,
+            target_sizes=buffer_sizes,
+        )
+        return success
+
+    # zero copy
+    def _3fs_zero_copy_page_set(self, hash_values, host_indices) -> bool:
+        hashes, dsts, _ = self.mem_pool_host.get_buffer_with_hash(
+            hash_values, host_indices
+        )
+        return self.storage_backend.batch_set(hashes, dsts)
+
+    # Backup batch by batch
+    def _page_backup(self, operation):
+        # Backup batch by batch
+        for i in range(0, len(operation.hash_value), self.storage_batch_size):
+            batch_hashes = operation.hash_value[i : i + self.storage_batch_size]
+            batch_host_indices = operation.host_indices[
+                i * self.page_size : (i + len(batch_hashes)) * self.page_size
             ]
-            success = self.storage_backend.batch_set(page_hashes, page_data)
+            # Set one batch token, and record if success.
+            # todo: allow partial success
+            success = self.page_set_func(batch_hashes, batch_host_indices)
             if not success:
-                logger.warning(f"Failed to write page {page_hashes} to storage.")
-                break
-            operation.completed_tokens += self.page_size * len(page_hashes)
-
-    def mooncake_page_backup(self, operation):
-        if len(operation.hash_value):
-            exist_hashvalues = self.storage_backend.exists(operation.hash_value)
-            indices = operation.host_indices.tolist()
-            non_exist_keys = []
-            non_exist_indices = []
-            for i in range(len(operation.hash_value)):
-                if not exist_hashvalues[operation.hash_value[i]]:
-                    non_exist_keys.append(operation.hash_value[i])
-                    non_exist_indices.extend(
-                        indices[i * self.page_size : (i + 1) * self.page_size]
-                    )
-            if len(non_exist_keys) > 0:
-                key_strs, buffer_ptrs, buffer_sizes = (
-                    self.mem_pool_host.get_buffer_meta(
-                        non_exist_keys, non_exist_indices
-                    )
-                )
-                # TODO: check the return value of batch set to see how many tokens are set successfully
-                self.storage_backend.batch_set(
-                    key_strs,
-                    target_location=buffer_ptrs,
-                    target_sizes=buffer_sizes,
+                logger.warning(
+                    f"Write page to storage: {len(batch_hashes)} pages failed."
                 )
-            operation.completed_tokens += len(operation.hash_value) * self.page_size
+                break
+            operation.completed_tokens += self.page_size * len(batch_hashes)

     def backup_thread_func(self):
         """
@@ -820,36 +879,8 @@
                     continue

                 if not self.backup_skip:
-                    if self.is_mooncake_backend():
-                        self.mooncake_page_backup(operation)
-                    elif self.storage_backend_type == "hf3fs":
-                        if self.mem_pool_host.layout == "page_first":
-                            self.zerocopy_page_backup(operation, batch_size=128)
-                        elif self.mem_pool_host.layout == "layer_first":
-                            self.generic_page_backup(operation, batch_size=128)
-                    else:
-                        self.generic_page_backup(operation)
-                    min_completed_tokens = operation.completed_tokens
-                else:
-                    min_completed_tokens = len(operation.token_ids)
-
-                if self.tp_world_size > 1:
-                    completed_tokens_tensor = torch.tensor(
-                        min_completed_tokens, dtype=torch.int
-                    )
-                    torch.distributed.all_reduce(
-                        completed_tokens_tensor,
-                        op=torch.distributed.ReduceOp.MIN,
-                        group=self.backup_tp_group,
-                    )
-                    min_completed_tokens = completed_tokens_tensor.item()
-
-                self.ack_backup_queue.put(
-                    (
-                        operation.id,
-                        min_completed_tokens,
-                    )
-                )
+                    self._page_backup(operation)
+                self.ack_backup_queue.put(operation)

             except Empty:
                 continue