PyPI - sglang - Versions diffs - 0.5.3rc0__py3-none-any.whl → 0.5.4__py3-none-any.whl - Mend

sglang 0.5.3rc0py3-none-any.whl → 0.5.4py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (482) hide show

sglang/bench_one_batch.py +54 -37
sglang/bench_one_batch_server.py +340 -34
sglang/bench_serving.py +340 -159
sglang/check_env.py +1 -1
sglang/compile_deep_gemm.py +6 -2
sglang/global_config.py +1 -25
sglang/lang/api.py +6 -0
sglang/lang/backend/runtime_endpoint.py +1 -1
sglang/lang/interpreter.py +1 -0
sglang/lang/ir.py +13 -0
sglang/launch_server.py +9 -2
sglang/profiler.py +20 -3
sglang/srt/_custom_ops.py +1 -1
sglang/srt/batch_invariant_ops/__init__.py +27 -0
sglang/srt/batch_invariant_ops/batch_invariant_ops.py +547 -0
sglang/srt/checkpoint_engine/checkpoint_engine_worker.py +142 -0
sglang/srt/compilation/backend.py +437 -0
sglang/srt/compilation/compilation_config.py +20 -0
sglang/srt/compilation/compilation_counter.py +47 -0
sglang/srt/compilation/compile.py +210 -0
sglang/srt/compilation/compiler_interface.py +503 -0
sglang/srt/compilation/cuda_piecewise_backend.py +228 -0
sglang/srt/compilation/fix_functionalization.py +134 -0
sglang/srt/compilation/fx_utils.py +83 -0
sglang/srt/compilation/inductor_pass.py +140 -0
sglang/srt/compilation/pass_manager.py +66 -0
sglang/srt/compilation/piecewise_context_manager.py +40 -0
sglang/srt/compilation/weak_ref_tensor_jit.py +16 -0
sglang/srt/configs/__init__.py +8 -0
sglang/srt/configs/deepseek_ocr.py +262 -0
sglang/srt/configs/deepseekvl2.py +194 -96
sglang/srt/configs/dots_ocr.py +64 -0
sglang/srt/configs/dots_vlm.py +2 -7
sglang/srt/configs/falcon_h1.py +309 -0
sglang/srt/configs/load_config.py +33 -2
sglang/srt/configs/mamba_utils.py +117 -0
sglang/srt/configs/model_config.py +284 -118
sglang/srt/configs/modelopt_config.py +30 -0
sglang/srt/configs/nemotron_h.py +286 -0
sglang/srt/configs/olmo3.py +105 -0
sglang/srt/configs/points_v15_chat.py +29 -0
sglang/srt/configs/qwen3_next.py +11 -47
sglang/srt/configs/qwen3_omni.py +613 -0
sglang/srt/configs/qwen3_vl.py +576 -0
sglang/srt/connector/remote_instance.py +1 -1
sglang/srt/constrained/base_grammar_backend.py +6 -1
sglang/srt/constrained/llguidance_backend.py +5 -0
sglang/srt/constrained/outlines_backend.py +1 -1
sglang/srt/constrained/outlines_jump_forward.py +1 -1
sglang/srt/constrained/reasoner_grammar_backend.py +9 -6
sglang/srt/constrained/utils.py +12 -0
sglang/srt/constrained/xgrammar_backend.py +26 -15
sglang/srt/debug_utils/dumper.py +10 -3
sglang/srt/disaggregation/ascend/conn.py +2 -2
sglang/srt/disaggregation/ascend/transfer_engine.py +48 -10
sglang/srt/disaggregation/base/conn.py +17 -4
sglang/srt/disaggregation/common/conn.py +268 -98
sglang/srt/disaggregation/decode.py +172 -39
sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
sglang/srt/disaggregation/decode_schedule_batch_mixin.py +25 -16
sglang/srt/disaggregation/fake/conn.py +11 -3
sglang/srt/disaggregation/mooncake/conn.py +203 -555
sglang/srt/disaggregation/nixl/conn.py +217 -63
sglang/srt/disaggregation/prefill.py +113 -270
sglang/srt/disaggregation/utils.py +36 -5
sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
sglang/srt/distributed/device_communicators/custom_all_reduce.py +6 -6
sglang/srt/distributed/device_communicators/pymscclpp.py +2 -2
sglang/srt/distributed/device_communicators/pynccl.py +24 -12
sglang/srt/distributed/device_communicators/pynccl_allocator.py +2 -2
sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
sglang/srt/distributed/naive_distributed.py +5 -4
sglang/srt/distributed/parallel_state.py +203 -97
sglang/srt/elastic_ep/elastic_ep.py +74 -0
sglang/srt/entrypoints/context.py +3 -2
sglang/srt/entrypoints/engine.py +85 -65
sglang/srt/entrypoints/grpc_server.py +632 -305
sglang/srt/entrypoints/harmony_utils.py +2 -2
sglang/srt/entrypoints/http_server.py +169 -17
sglang/srt/entrypoints/http_server_engine.py +1 -7
sglang/srt/entrypoints/openai/protocol.py +327 -34
sglang/srt/entrypoints/openai/serving_base.py +74 -8
sglang/srt/entrypoints/openai/serving_chat.py +202 -118
sglang/srt/entrypoints/openai/serving_classify.py +204 -0
sglang/srt/entrypoints/openai/serving_completions.py +20 -4
sglang/srt/entrypoints/openai/serving_embedding.py +1 -0
sglang/srt/entrypoints/openai/serving_responses.py +47 -2
sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
sglang/srt/environ.py +323 -0
sglang/srt/eplb/eplb_algorithms/__init__.py +18 -1
sglang/srt/eplb/eplb_algorithms/deepseek.py +0 -2
sglang/srt/eplb/eplb_algorithms/elasticity_aware.py +87 -0
sglang/srt/eplb/expert_distribution.py +3 -4
sglang/srt/eplb/expert_location.py +30 -5
sglang/srt/eplb/expert_location_dispatch.py +2 -2
sglang/srt/eplb/expert_location_updater.py +2 -2
sglang/srt/function_call/base_format_detector.py +17 -18
sglang/srt/function_call/function_call_parser.py +21 -16
sglang/srt/function_call/glm4_moe_detector.py +4 -8
sglang/srt/function_call/gpt_oss_detector.py +24 -1
sglang/srt/function_call/json_array_parser.py +61 -0
sglang/srt/function_call/kimik2_detector.py +17 -4
sglang/srt/function_call/utils.py +98 -7
sglang/srt/grpc/compile_proto.py +245 -0
sglang/srt/grpc/grpc_request_manager.py +915 -0
sglang/srt/grpc/health_servicer.py +189 -0
sglang/srt/grpc/scheduler_launcher.py +181 -0
sglang/srt/grpc/sglang_scheduler_pb2.py +81 -68
sglang/srt/grpc/sglang_scheduler_pb2.pyi +124 -61
sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +92 -1
sglang/srt/layers/activation.py +11 -7
sglang/srt/layers/attention/aiter_backend.py +17 -18
sglang/srt/layers/attention/ascend_backend.py +125 -10
sglang/srt/layers/attention/attention_registry.py +226 -0
sglang/srt/layers/attention/base_attn_backend.py +32 -4
sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
sglang/srt/layers/attention/double_sparsity_backend.py +2 -2
sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
sglang/srt/layers/attention/fla/chunk.py +0 -1
sglang/srt/layers/attention/fla/chunk_o.py +1 -1
sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +2 -2
sglang/srt/layers/attention/fla/fused_recurrent.py +4 -4
sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +2 -2
sglang/srt/layers/attention/fla/index.py +0 -2
sglang/srt/layers/attention/fla/layernorm_gated.py +50 -32
sglang/srt/layers/attention/fla/utils.py +0 -3
sglang/srt/layers/attention/fla/wy_fast.py +0 -2
sglang/srt/layers/attention/flashattention_backend.py +52 -15
sglang/srt/layers/attention/flashinfer_backend.py +357 -212
sglang/srt/layers/attention/flashinfer_mla_backend.py +31 -33
sglang/srt/layers/attention/flashmla_backend.py +9 -7
sglang/srt/layers/attention/hybrid_attn_backend.py +12 -4
sglang/srt/layers/attention/hybrid_linear_attn_backend.py +236 -133
sglang/srt/layers/attention/intel_amx_backend.py +1 -1
sglang/srt/layers/attention/mamba/causal_conv1d.py +2 -1
sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +24 -103
sglang/srt/layers/attention/mamba/mamba.py +514 -1
sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +214 -0
sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +562 -0
sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +646 -0
sglang/srt/layers/attention/mamba/ops/ssd_combined.py +261 -0
sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +264 -0
sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
sglang/srt/layers/attention/nsa/nsa_indexer.py +718 -0
sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
sglang/srt/layers/attention/nsa/transform_index.py +144 -0
sglang/srt/layers/attention/nsa/triton_kernel.py +136 -0
sglang/srt/layers/attention/nsa/utils.py +23 -0
sglang/srt/layers/attention/nsa_backend.py +1201 -0
sglang/srt/layers/attention/tbo_backend.py +6 -6
sglang/srt/layers/attention/torch_flex_backend.py +325 -0
sglang/srt/layers/attention/triton_backend.py +249 -42
sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +2 -2
sglang/srt/layers/attention/triton_ops/extend_attention.py +539 -44
sglang/srt/layers/attention/trtllm_mha_backend.py +7 -9
sglang/srt/layers/attention/trtllm_mla_backend.py +523 -48
sglang/srt/layers/attention/utils.py +11 -7
sglang/srt/layers/attention/vision.py +61 -3
sglang/srt/layers/attention/wave_backend.py +4 -4
sglang/srt/layers/attention/xpu_backend.py +1028 -0
sglang/srt/layers/communicator.py +19 -7
sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/compile_utils.py +4 -8
sglang/srt/layers/deep_gemm_wrapper/configurer.py +25 -0
sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/entrypoint.py +3 -3
sglang/srt/layers/dp_attention.py +28 -1
sglang/srt/layers/elementwise.py +3 -1
sglang/srt/layers/layernorm.py +47 -15
sglang/srt/layers/linear.py +30 -5
sglang/srt/layers/logits_processor.py +161 -18
sglang/srt/layers/modelopt_utils.py +11 -0
sglang/srt/layers/moe/cutlass_moe.py +0 -2
sglang/srt/layers/moe/cutlass_w4a8_moe.py +213 -21
sglang/srt/layers/moe/ep_moe/kernels.py +36 -458
sglang/srt/layers/moe/ep_moe/layer.py +243 -448
sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +52 -25
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_B200.json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +17 -5
sglang/srt/layers/moe/fused_moe_triton/layer.py +86 -81
sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +18 -42
sglang/srt/layers/moe/moe_runner/deep_gemm.py +304 -0
sglang/srt/layers/moe/moe_runner/runner.py +3 -0
sglang/srt/layers/moe/moe_runner/triton.py +3 -1
sglang/srt/layers/moe/rocm_moe_utils.py +0 -1
sglang/srt/layers/moe/router.py +51 -15
sglang/srt/layers/moe/token_dispatcher/__init__.py +10 -0
sglang/srt/layers/moe/token_dispatcher/base.py +1 -1
sglang/srt/layers/moe/token_dispatcher/deepep.py +177 -106
sglang/srt/layers/moe/token_dispatcher/mooncake.py +386 -0
sglang/srt/layers/moe/token_dispatcher/standard.py +46 -0
sglang/srt/layers/moe/topk.py +3 -2
sglang/srt/layers/moe/utils.py +27 -1
sglang/srt/layers/parameter.py +23 -6
sglang/srt/layers/quantization/__init__.py +2 -53
sglang/srt/layers/quantization/awq.py +183 -6
sglang/srt/layers/quantization/awq_triton.py +29 -0
sglang/srt/layers/quantization/base_config.py +20 -1
sglang/srt/layers/quantization/compressed_tensors/__init__.py +7 -0
sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +21 -49
sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +421 -70
sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +5 -0
sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +4 -22
sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +339 -0
sglang/srt/layers/quantization/fp8.py +86 -20
sglang/srt/layers/quantization/fp8_kernel.py +55 -10
sglang/srt/layers/quantization/fp8_utils.py +43 -15
sglang/srt/layers/quantization/fpgemm_fp8.py +2 -3
sglang/srt/layers/quantization/gptq.py +0 -1
sglang/srt/layers/quantization/int8_kernel.py +18 -2
sglang/srt/layers/quantization/marlin_utils.py +12 -0
sglang/srt/layers/quantization/modelopt_quant.py +141 -81
sglang/srt/layers/quantization/mxfp4.py +17 -34
sglang/srt/layers/quantization/petit.py +1 -1
sglang/srt/layers/quantization/quark/quark.py +3 -1
sglang/srt/layers/quantization/quark/quark_moe.py +18 -5
sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +0 -7
sglang/srt/layers/quantization/unquant.py +1 -4
sglang/srt/layers/quantization/utils.py +0 -1
sglang/srt/layers/quantization/w4afp8.py +51 -24
sglang/srt/layers/quantization/w8a8_int8.py +45 -27
sglang/srt/layers/radix_attention.py +59 -9
sglang/srt/layers/rotary_embedding.py +750 -46
sglang/srt/layers/sampler.py +84 -16
sglang/srt/layers/sparse_pooler.py +98 -0
sglang/srt/layers/utils.py +23 -1
sglang/srt/layers/vocab_parallel_embedding.py +4 -1
sglang/srt/lora/backend/base_backend.py +3 -3
sglang/srt/lora/backend/chunked_backend.py +348 -0
sglang/srt/lora/backend/triton_backend.py +9 -4
sglang/srt/lora/eviction_policy.py +139 -0
sglang/srt/lora/lora.py +7 -5
sglang/srt/lora/lora_manager.py +33 -7
sglang/srt/lora/lora_registry.py +1 -1
sglang/srt/lora/mem_pool.py +41 -17
sglang/srt/lora/triton_ops/__init__.py +4 -0
sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +176 -0
sglang/srt/lora/utils.py +7 -5
sglang/srt/managers/cache_controller.py +83 -152
sglang/srt/managers/data_parallel_controller.py +156 -87
sglang/srt/managers/detokenizer_manager.py +51 -24
sglang/srt/managers/io_struct.py +223 -129
sglang/srt/managers/mm_utils.py +49 -10
sglang/srt/managers/multi_tokenizer_mixin.py +83 -98
sglang/srt/managers/multimodal_processor.py +1 -2
sglang/srt/managers/overlap_utils.py +130 -0
sglang/srt/managers/schedule_batch.py +340 -529
sglang/srt/managers/schedule_policy.py +158 -18
sglang/srt/managers/scheduler.py +665 -620
sglang/srt/managers/scheduler_input_blocker.py +1 -1
sglang/srt/managers/scheduler_metrics_mixin.py +150 -131
sglang/srt/managers/scheduler_output_processor_mixin.py +337 -122
sglang/srt/managers/scheduler_pp_mixin.py +341 -0
sglang/srt/managers/scheduler_profiler_mixin.py +62 -15
sglang/srt/managers/scheduler_runtime_checker_mixin.py +217 -0
sglang/srt/managers/scheduler_update_weights_mixin.py +40 -14
sglang/srt/managers/tokenizer_communicator_mixin.py +141 -19
sglang/srt/managers/tokenizer_manager.py +462 -226
sglang/srt/managers/tp_worker.py +217 -156
sglang/srt/managers/utils.py +79 -47
sglang/srt/mem_cache/allocator.py +21 -22
sglang/srt/mem_cache/allocator_ascend.py +42 -28
sglang/srt/mem_cache/base_prefix_cache.py +3 -3
sglang/srt/mem_cache/chunk_cache.py +20 -2
sglang/srt/mem_cache/common.py +480 -0
sglang/srt/mem_cache/evict_policy.py +38 -0
sglang/srt/mem_cache/hicache_storage.py +44 -2
sglang/srt/mem_cache/hiradix_cache.py +134 -34
sglang/srt/mem_cache/mamba_radix_cache.py +993 -0
sglang/srt/mem_cache/memory_pool.py +602 -208
sglang/srt/mem_cache/memory_pool_host.py +134 -183
sglang/srt/mem_cache/multimodal_cache.py +0 -1
sglang/srt/mem_cache/radix_cache.py +263 -78
sglang/srt/mem_cache/radix_cache_cpp.py +29 -21
sglang/srt/mem_cache/storage/__init__.py +10 -0
sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +157 -0
sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +97 -0
sglang/srt/mem_cache/storage/backend_factory.py +223 -0
sglang/srt/mem_cache/storage/eic/eic_storage.py +777 -0
sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +0 -1
sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +180 -59
sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +15 -9
sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +217 -26
sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +38 -9
sglang/srt/mem_cache/storage/nixl/nixl_utils.py +1 -1
sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +17 -2
sglang/srt/mem_cache/swa_radix_cache.py +115 -58
sglang/srt/metrics/collector.py +113 -120
sglang/srt/metrics/func_timer.py +3 -8
sglang/srt/metrics/utils.py +8 -1
sglang/srt/model_executor/cpu_graph_runner.py +2 -2
sglang/srt/model_executor/cuda_graph_runner.py +81 -36
sglang/srt/model_executor/forward_batch_info.py +40 -50
sglang/srt/model_executor/model_runner.py +507 -319
sglang/srt/model_executor/npu_graph_runner.py +11 -5
sglang/srt/model_executor/piecewise_cuda_graph_runner.py +539 -0
sglang/srt/model_loader/__init__.py +1 -1
sglang/srt/model_loader/loader.py +438 -37
sglang/srt/model_loader/utils.py +0 -1
sglang/srt/model_loader/weight_utils.py +200 -27
sglang/srt/models/apertus.py +2 -3
sglang/srt/models/arcee.py +2 -2
sglang/srt/models/bailing_moe.py +40 -56
sglang/srt/models/bailing_moe_nextn.py +3 -4
sglang/srt/models/bert.py +1 -1
sglang/srt/models/deepseek_nextn.py +25 -4
sglang/srt/models/deepseek_ocr.py +1516 -0
sglang/srt/models/deepseek_v2.py +793 -235
sglang/srt/models/dots_ocr.py +171 -0
sglang/srt/models/dots_vlm.py +0 -1
sglang/srt/models/dots_vlm_vit.py +1 -1
sglang/srt/models/falcon_h1.py +570 -0
sglang/srt/models/gemma3_causal.py +0 -2
sglang/srt/models/gemma3_mm.py +17 -1
sglang/srt/models/gemma3n_mm.py +2 -3
sglang/srt/models/glm4_moe.py +17 -40
sglang/srt/models/glm4_moe_nextn.py +4 -4
sglang/srt/models/glm4v.py +3 -2
sglang/srt/models/glm4v_moe.py +6 -6
sglang/srt/models/gpt_oss.py +12 -35
sglang/srt/models/grok.py +10 -23
sglang/srt/models/hunyuan.py +2 -7
sglang/srt/models/interns1.py +0 -1
sglang/srt/models/kimi_vl.py +1 -7
sglang/srt/models/kimi_vl_moonvit.py +4 -2
sglang/srt/models/llama.py +6 -2
sglang/srt/models/llama_eagle3.py +1 -1
sglang/srt/models/longcat_flash.py +6 -23
sglang/srt/models/longcat_flash_nextn.py +4 -15
sglang/srt/models/mimo.py +2 -13
sglang/srt/models/mimo_mtp.py +1 -2
sglang/srt/models/minicpmo.py +7 -5
sglang/srt/models/mixtral.py +1 -4
sglang/srt/models/mllama.py +1 -1
sglang/srt/models/mllama4.py +27 -6
sglang/srt/models/nemotron_h.py +511 -0
sglang/srt/models/olmo2.py +31 -4
sglang/srt/models/opt.py +5 -5
sglang/srt/models/phi.py +1 -1
sglang/srt/models/phi4mm.py +1 -1
sglang/srt/models/phimoe.py +0 -1
sglang/srt/models/pixtral.py +0 -3
sglang/srt/models/points_v15_chat.py +186 -0
sglang/srt/models/qwen.py +0 -1
sglang/srt/models/qwen2.py +0 -7
sglang/srt/models/qwen2_5_vl.py +5 -5
sglang/srt/models/qwen2_audio.py +2 -15
sglang/srt/models/qwen2_moe.py +70 -4
sglang/srt/models/qwen2_vl.py +6 -3
sglang/srt/models/qwen3.py +18 -3
sglang/srt/models/qwen3_moe.py +50 -38
sglang/srt/models/qwen3_next.py +43 -21
sglang/srt/models/qwen3_next_mtp.py +3 -4
sglang/srt/models/qwen3_omni_moe.py +661 -0
sglang/srt/models/qwen3_vl.py +791 -0
sglang/srt/models/qwen3_vl_moe.py +343 -0
sglang/srt/models/registry.py +15 -3
sglang/srt/models/roberta.py +55 -3
sglang/srt/models/sarashina2_vision.py +268 -0
sglang/srt/models/solar.py +505 -0
sglang/srt/models/starcoder2.py +357 -0
sglang/srt/models/step3_vl.py +3 -5
sglang/srt/models/torch_native_llama.py +9 -2
sglang/srt/models/utils.py +61 -0
sglang/srt/multimodal/processors/base_processor.py +21 -9
sglang/srt/multimodal/processors/deepseek_ocr.py +37 -0
sglang/srt/multimodal/processors/deepseek_vl_v2.py +0 -3
sglang/srt/multimodal/processors/dots_vlm.py +2 -4
sglang/srt/multimodal/processors/glm4v.py +1 -5
sglang/srt/multimodal/processors/internvl.py +20 -10
sglang/srt/multimodal/processors/janus_pro.py +0 -1
sglang/srt/multimodal/processors/mllama4.py +0 -8
sglang/srt/multimodal/processors/phi4mm.py +0 -1
sglang/srt/multimodal/processors/points_v15_chat.py +52 -0
sglang/srt/multimodal/processors/qwen_vl.py +83 -17
sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
sglang/srt/multimodal/processors/step3_vl.py +1 -1
sglang/srt/parser/conversation.py +41 -0
sglang/srt/parser/jinja_template_utils.py +6 -0
sglang/srt/parser/reasoning_parser.py +0 -1
sglang/srt/sampling/custom_logit_processor.py +77 -2
sglang/srt/sampling/sampling_batch_info.py +36 -23
sglang/srt/sampling/sampling_params.py +75 -0
sglang/srt/server_args.py +1300 -338
sglang/srt/server_args_config_parser.py +146 -0
sglang/srt/single_batch_overlap.py +161 -0
sglang/srt/speculative/base_spec_worker.py +34 -0
sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
sglang/srt/speculative/cpp_ngram/param.h +125 -0
sglang/srt/speculative/cpp_ngram/queue.h +71 -0
sglang/srt/speculative/draft_utils.py +226 -0
sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +26 -8
sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +26 -3
sglang/srt/speculative/eagle_info.py +786 -0
sglang/srt/speculative/eagle_info_v2.py +458 -0
sglang/srt/speculative/eagle_utils.py +113 -1270
sglang/srt/speculative/eagle_worker.py +120 -285
sglang/srt/speculative/eagle_worker_v2.py +702 -0
sglang/srt/speculative/ngram_info.py +433 -0
sglang/srt/speculative/ngram_worker.py +246 -0
sglang/srt/speculative/spec_info.py +49 -0
sglang/srt/speculative/spec_utils.py +641 -0
sglang/srt/speculative/standalone_worker.py +4 -14
sglang/srt/tokenizer/tiktoken_tokenizer.py +2 -2
sglang/srt/tracing/trace.py +32 -6
sglang/srt/two_batch_overlap.py +35 -18
sglang/srt/utils/__init__.py +2 -0
sglang/srt/{bench_utils.py → utils/bench_utils.py} +4 -2
sglang/srt/{utils.py → utils/common.py} +583 -113
sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +86 -19
sglang/srt/{host_shared_memory.py → utils/host_shared_memory.py} +0 -1
sglang/srt/{offloader.py → utils/offloader.py} +4 -4
sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
sglang/srt/utils/profile_merger.py +199 -0
sglang/srt/utils/rpd_utils.py +452 -0
sglang/srt/utils/slow_rank_detector.py +71 -0
sglang/srt/{torch_memory_saver_adapter.py → utils/torch_memory_saver_adapter.py} +5 -7
sglang/srt/warmup.py +8 -4
sglang/srt/weight_sync/utils.py +1 -1
sglang/test/attention/test_flashattn_backend.py +1 -1
sglang/test/attention/test_flashattn_mla_backend.py +0 -1
sglang/test/attention/test_prefix_chunk_info.py +0 -2
sglang/test/attention/test_trtllm_mla_backend.py +221 -53
sglang/test/few_shot_gsm8k_engine.py +2 -4
sglang/test/get_logits_ut.py +57 -0
sglang/test/kit_matched_stop.py +157 -0
sglang/test/longbench_v2/__init__.py +1 -0
sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
sglang/test/run_eval.py +120 -11
sglang/test/runners.py +3 -1
sglang/test/send_one.py +42 -7
sglang/test/simple_eval_common.py +8 -2
sglang/test/simple_eval_gpqa.py +0 -1
sglang/test/simple_eval_humaneval.py +0 -3
sglang/test/simple_eval_longbench_v2.py +344 -0
sglang/test/simple_eval_mmmu_vlm.py +441 -0
sglang/test/test_block_fp8.py +3 -4
sglang/test/test_block_fp8_deep_gemm_blackwell.py +0 -1
sglang/test/test_cutlass_moe.py +1 -2
sglang/test/test_cutlass_w4a8_moe.py +10 -20
sglang/test/test_deterministic.py +430 -0
sglang/test/test_deterministic_utils.py +73 -0
sglang/test/test_disaggregation_utils.py +93 -1
sglang/test/test_marlin_moe.py +0 -1
sglang/test/test_programs.py +1 -1
sglang/test/test_utils.py +432 -16
sglang/utils.py +10 -1
sglang/version.py +1 -1
{sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/METADATA +64 -43
{sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/RECORD +476 -346
sglang/srt/entrypoints/grpc_request_manager.py +0 -580
sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +0 -32
sglang/srt/managers/tp_worker_overlap_thread.py +0 -319
sglang/srt/mem_cache/lora_radix_cache.py +0 -421
sglang/srt/speculative/build_eagle_tree.py +0 -427
sglang/test/test_block_fp8_ep.py +0 -358
/sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/__init__.py +0 -0
/sglang/srt/{remote_instance_weight_loader_utils.py → model_loader/remote_instance_weight_loader_utils.py} +0 -0
/sglang/srt/{aio_rwlock.py → utils/aio_rwlock.py} +0 -0
/sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
{sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/WHEEL +0 -0
{sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/licenses/LICENSE +0 -0
{sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/top_level.txt +0 -0

sglang/srt/managers/cache_controller.py CHANGED Viewed

@@ -14,15 +14,17 @@ limitations under the License.
 """
 import logging
-import math
 import threading
 import time
-from queue import Empty, Full, PriorityQueue, Queue
-from typing import TYPE_CHECKING, List, NamedTuple, Optional, Set, Tuple
+from queue import Empty, Full, Queue
+from typing import TYPE_CHECKING, List, NamedTuple, Optional
 import torch
-from sglang.srt.mem_cache.hicache_storage import HiCacheStorageConfig
+from sglang.srt.mem_cache.hicache_storage import (
+    HiCacheStorageConfig,
+    HiCacheStorageExtraInfo,
+)
 if TYPE_CHECKING:
     from sglang.srt.mem_cache.allocator import BaseTokenToKVPoolAllocator
@@ -38,7 +40,7 @@ from sglang.srt.layers.dp_attention import (
     get_attention_tp_size,
     is_dp_attention_enabled,
 )
-from sglang.srt.mem_cache.memory_pool import MHATokenToKVPool, MLATokenToKVPool
+from sglang.srt.mem_cache.memory_pool import MLATokenToKVPool
 logger = logging.getLogger(__name__)
@@ -191,12 +193,14 @@ class StorageOperation:
         token_ids: List[int],
         last_hash: Optional[str] = None,
         hash_value: Optional[List[str]] = None,
+        prefix_keys: Optional[List[str]] = None,
     ):
         self.host_indices = host_indices
         self.token_ids = token_ids
         self.last_hash = last_hash
         self.completed_tokens = 0
         self.hash_value = hash_value if hash_value is not None else []
+        self.prefix_keys = prefix_keys
         self.id = StorageOperation.counter
         StorageOperation.counter += 1
@@ -212,6 +216,7 @@ class PrefetchOperation(StorageOperation):
         host_indices: torch.Tensor,
         token_ids: List[int],
         last_hash: Optional[str] = None,
+        prefix_keys: Optional[List[str]] = None,
     ):
         self.request_id = request_id
@@ -219,7 +224,7 @@ class PrefetchOperation(StorageOperation):
         self._terminated_flag = False
         self.start_time = time.monotonic()
-        super().__init__(host_indices, token_ids, last_hash)
+        super().__init__(host_indices, token_ids, last_hash, prefix_keys=prefix_keys)
     def increment(self, num_tokens: int):
         with self._lock:
@@ -250,7 +255,7 @@ class HiCacheController:
         storage_backend: Optional[str] = None,
         prefetch_threshold: int = 256,
         model_name: Optional[str] = None,
-        storage_backend_extra_config: Optional[str] = None,
+        storage_backend_extra_config: Optional[dict] = None,
     ):
         self.mem_pool_device_allocator = token_to_kv_pool_allocator
         self.mem_pool_device = token_to_kv_pool_allocator.get_kvcache()
@@ -275,43 +280,17 @@ class HiCacheController:
                 and self.storage_config.tp_rank != 0
             )
-            if storage_backend == "file":
-                from sglang.srt.mem_cache.hicache_storage import HiCacheFile
-                self.storage_backend = HiCacheFile(self.storage_config)
-            elif storage_backend == "nixl":
-                from sglang.srt.mem_cache.storage.nixl.hicache_nixl import HiCacheNixl
+            # Use storage backend factory for dynamic backend creation
+            from sglang.srt.mem_cache.storage import StorageBackendFactory
-                self.storage_backend = HiCacheNixl()
-            elif storage_backend == "mooncake":
-                from sglang.srt.mem_cache.storage.mooncake_store.mooncake_store import (
-                    MooncakeStore,
-                )
-                self.storage_backend = MooncakeStore(self.storage_config)
-                self.storage_backend.register_buffer(self.mem_pool_host.kv_buffer)
-                assert self.mem_pool_host.layout == "page_first"
-            elif storage_backend == "hf3fs":
-                from sglang.srt.mem_cache.storage.hf3fs.storage_hf3fs import (
-                    HiCacheHF3FS,
+            try:
+                self.storage_backend = StorageBackendFactory.create_backend(
+                    storage_backend, self.storage_config, self.mem_pool_host
                 )
+            except ValueError as e:
+                raise ValueError(f"Failed to create storage backend: {e}") from e
-                if self.mem_pool_host.layout == "page_first":
-                    bytes_per_page = (
-                        mem_pool_host.get_ksize_per_token() * mem_pool_host.page_size
-                    )
-                elif self.mem_pool_host.layout == "layer_first":
-                    bytes_per_page = (
-                        mem_pool_host.get_size_per_token() * mem_pool_host.page_size
-                    )
-                dtype = mem_pool_host.dtype
-                self.storage_backend = HiCacheHF3FS.from_env_config(
-                    bytes_per_page, dtype, self.storage_config
-                )
-            else:
-                raise NotImplementedError(
-                    f"Unsupported storage backend: {storage_backend}"
-                )
+            self.storage_backend.register_mem_pool_host(self.mem_pool_host)
             self.enable_storage = True
             # todo: threshold policy for prefetching
@@ -335,18 +314,10 @@ class HiCacheController:
             # Select the get and set functions
             self.page_get_func = self._generic_page_get
             self.page_set_func = self._generic_page_set
-            self.batch_exists_func = self.storage_backend.batch_exists
-            self.is_3fs_zerocopy = (
-                self.storage_backend_type == "hf3fs"
-                and self.mem_pool_host.layout == "page_first"
-            )
-            if self.storage_backend_type == "mooncake":
-                self.page_get_func = self._mooncake_page_get
-                self.page_set_func = self._mooncake_page_set
-            elif self.is_3fs_zerocopy:
-                self.page_get_func = self._3fs_zero_copy_page_get
-                self.page_set_func = self._3fs_zero_copy_page_set
-                self.batch_exists_func = self._3fs_zero_copy_batch_exists
+            if self.storage_backend_type in ["hf3fs", "mooncake", "eic"]:
+                self.page_get_func = self._page_get_zero_copy
+                self.page_set_func = self._page_set_zero_copy
         self.device = self.mem_pool_device.device
         self.layer_num = self.mem_pool_device.layer_num
@@ -395,7 +366,7 @@ class HiCacheController:
     def _generate_storage_config(
         self,
         model_name: Optional[str] = None,
-        storage_backend_extra_config: Optional[str] = None,
+        storage_backend_extra_config: Optional[dict] = None,
     ):
         if is_dp_attention_enabled():
@@ -410,23 +381,13 @@ class HiCacheController:
         # Currently, AscendMLAPagedTokenToKVPool is the subclass of MLATokenToKVPool.
         is_mla_backend = isinstance(self.mem_pool_device, MLATokenToKVPool)
-        # Parse extra config JSON if provided
-        extra_config = None
-        if storage_backend_extra_config:
-            try:
-                import json
-                extra_config = json.loads(storage_backend_extra_config)
-            except Exception as e:
-                logger.error(f"Invalid backend extra config JSON: {e}")
         return HiCacheStorageConfig(
             tp_rank=self.tp_rank,
             tp_size=self.tp_size,
             is_mla_model=is_mla_backend,
             is_page_first_layout=self.mem_pool_host.layout == "page_first",
             model_name=model_name,
-            extra_config=extra_config,
+            extra_config=storage_backend_extra_config,
         )
     def reset(self):
@@ -470,7 +431,6 @@ class HiCacheController:
         host_indices = self.mem_pool_host.alloc(len(device_indices))
         if host_indices is None:
             return None
-        self.mem_pool_host.protect_write(host_indices)
         self.write_queue.append(
             CacheOperation(host_indices, device_indices, node_id, priority)
         )
@@ -494,7 +454,6 @@ class HiCacheController:
             self.mem_pool_host.backup_from_device_all_layer(
                 self.mem_pool_device, host_indices, device_indices, self.io_backend
             )
-            self.mem_pool_host.complete_io(op.host_indices)
             finish_event.record()
             # NOTE: We must save the host indices and device indices here,
             # this is because we need to guarantee that these tensors are
@@ -518,7 +477,6 @@ class HiCacheController:
         device_indices = self.mem_pool_device_allocator.alloc(len(host_indices))
         if device_indices is None:
             return None
-        self.mem_pool_host.protect_load(host_indices)
         self.load_queue.append(
             CacheOperation(host_indices, device_indices, node_id, priority)
         )
@@ -563,7 +521,6 @@ class HiCacheController:
                     self.io_backend,
                 )
                 producer_event.complete(i)
-            self.mem_pool_host.complete_io(op.host_indices)
             # NOTE: We must save the host indices and device indices here,
             # this is because we need to guarantee that these tensors are
             # still alive when the load stream is executing.
@@ -581,29 +538,16 @@ class HiCacheController:
         )
         return producer_id
-    def evict_device(
-        self, device_indices: torch.Tensor, host_indices: torch.Tensor
-    ) -> int:
-        if self.mem_pool_host.is_synced(host_indices):
-            self.mem_pool_device_allocator.free(device_indices)
-            self.mem_pool_host.update_backup(host_indices)
-            return len(device_indices)
-        else:
-            raise ValueError(
-                f"Inconsistent states: {self.mem_pool_host.get_state(host_indices)}"
-            )
+    def evict_device(self, device_indices: torch.Tensor) -> int:
+        self.mem_pool_device_allocator.free(device_indices)
+        return len(device_indices)
     def evict_host(self, host_indices: torch.Tensor, backup_only: bool = True) -> int:
         if not backup_only:
             raise ValueError("Other eviction policies are not supported yet.")
-        if self.mem_pool_host.is_backup(host_indices):
-            self.mem_pool_host.free(host_indices)
-            return len(host_indices)
-        else:
-            raise ValueError(
-                f"Inconsistent states: {self.mem_pool_host.get_state(host_indices)}"
-            )
+        self.mem_pool_host.free(host_indices)
+        return len(host_indices)
     def prefetch(
         self,
@@ -611,12 +555,13 @@ class HiCacheController:
         host_indices: torch.Tensor,
         new_input_tokens: List[int],
         last_hash: Optional[str] = None,
+        prefix_keys: Optional[List[str]] = None,
     ) -> PrefetchOperation:
         """
         Prefetch KV caches from storage backend to host memory.
         """
         operation = PrefetchOperation(
-            request_id, host_indices, new_input_tokens, last_hash
+            request_id, host_indices, new_input_tokens, last_hash, prefix_keys
         )
         self.prefetch_queue.put(operation)
         return operation
@@ -626,47 +571,30 @@ class HiCacheController:
         return operation.completed_tokens, operation.hash_value
     def append_host_mem_release(self, host_indices: torch.Tensor):
-        chunks = host_indices.split(self.mem_pool_host.page_size)
-        for chunk in chunks:
-            self.host_mem_release_queue.put(chunk)
-    def _3fs_zero_copy_batch_exists(self, batch_hashes):
-        _batch_hashes, _, factor = self.mem_pool_host.get_buffer_with_hash(batch_hashes)
-        hit_page_num = self.storage_backend.batch_exists(_batch_hashes) // factor
-        return hit_page_num
-    def _3fs_zero_copy_page_get(self, operation, hash_values, host_indices):
-        hashes, dsts, factor = self.mem_pool_host.get_buffer_with_hash(
-            hash_values, host_indices
-        )
-        page_data = self.storage_backend.batch_get(hashes, dsts)
-        if page_data:
-            inc = self.page_size * len(hashes) // factor
-            operation.increment(inc)
-        else:
-            logger.warning(
-                f"Prefetch operation {operation.request_id} failed to retrieve page {hashes}."
-            )
+        if host_indices.numel() == 0:
+            return
+        pages = host_indices.split(self.mem_pool_host.page_size)
+        for page in pages:
+            self.host_mem_release_queue.put(page)
-    def _mooncake_page_get(self, operation, hash_values, host_indices):
-        key_strs, buffer_ptrs, buffer_sizes = self.mem_pool_host.get_buffer_meta(
-            hash_values,
-            host_indices,
-            self.storage_config.tp_rank,
-        )
-        get_result = self.storage_backend.batch_get(
-            key_strs,
-            target_locations=buffer_ptrs,
-            target_sizes=buffer_sizes,
+    def _page_get_zero_copy(
+        self, operation, hash_values, host_indices, extra_info=None
+    ):
+        results = self.storage_backend.batch_get_v1(
+            hash_values, host_indices, extra_info
         )
-        if get_result != len(hash_values):
-            logger.warning(
-                f"Prefetch operation {operation.request_id} failed or partially failed."
-            )
-        if get_result != 0:
-            operation.increment(get_result * self.page_size)
+        inc = 0
+        for i in range(len(hash_values)):
+            if not results[i]:
+                logger.warning(
+                    f"Prefetch operation {operation.request_id} failed to retrieve page {hash_values[i]}."
+                )
+                break
+            inc += self.page_size
+        operation.increment(inc)
-    def _generic_page_get(self, operation, hash_values, host_indices):
+    # todo: deprecate
+    def _generic_page_get(self, operation, hash_values, host_indices, extra_info=None):
         dummy_page_dst = [
             self.mem_pool_host.get_dummy_flat_data_page() for _ in hash_values
         ]
@@ -690,6 +618,7 @@ class HiCacheController:
     def _page_transfer(self, operation):
         # Transfer batch by batch
+        prefix_keys = operation.prefix_keys
         for i in range(0, len(operation.hash_value), self.storage_batch_size):
             batch_hashes = operation.hash_value[i : i + self.storage_batch_size]
             batch_host_indices = operation.host_indices[
@@ -697,7 +626,8 @@ class HiCacheController:
             ]
             prev_completed_tokens = operation.completed_tokens
             # Get one batch token, and update the completed_tokens if succeed
-            self.page_get_func(operation, batch_hashes, batch_host_indices)
+            extra_info = HiCacheStorageExtraInfo(prefix_keys=prefix_keys)
+            self.page_get_func(operation, batch_hashes, batch_host_indices, extra_info)
             # Check termination
             if (
                 operation.completed_tokens
@@ -705,6 +635,10 @@ class HiCacheController:
             ):
                 operation.mark_terminate()
                 break  # Some operations fail or operation terminated by controller
+            if prefix_keys and len(prefix_keys) > 0:
+                prefix_keys += batch_hashes
         # release pre-allocated memory
         self.append_host_mem_release(
             operation.host_indices[operation.completed_tokens :]
@@ -738,6 +672,7 @@ class HiCacheController:
     def _storage_hit_query(self, operation) -> tuple[list[str], int]:
         last_hash = operation.last_hash
         tokens_to_fetch = operation.token_ids
+        prefix_keys = operation.prefix_keys.copy() if operation.prefix_keys else None
         storage_query_count = 0
         hash_value = []
@@ -755,11 +690,15 @@ class HiCacheController:
                     batch_tokens[i : i + self.page_size], last_hash
                 )
                 batch_hashes.append(last_hash)
-            hit_page_num = self.batch_exists_func(batch_hashes)
+            extra_info = HiCacheStorageExtraInfo(prefix_keys=prefix_keys)
+            hit_page_num = self.storage_backend.batch_exists(batch_hashes, extra_info)
             hash_value.extend(batch_hashes[:hit_page_num])
             storage_query_count += hit_page_num * self.page_size
             if hit_page_num < len(batch_hashes):
                 break
+            if prefix_keys and len(prefix_keys) > 0:
+                prefix_keys += batch_hashes
         return hash_value, storage_query_count
     def prefetch_thread_func(self):
@@ -816,46 +755,34 @@ class HiCacheController:
         host_indices: torch.Tensor,
         token_ids: List[int],
         hash_value: Optional[List[str]] = None,
+        prefix_keys: Optional[List[str]] = None,
     ) -> int:
         """
         Write KV caches from host memory to storage backend.
         """
-        operation = StorageOperation(host_indices, token_ids, hash_value=hash_value)
+        operation = StorageOperation(
+            host_indices, token_ids, hash_value=hash_value, prefix_keys=prefix_keys
+        )
         self.backup_queue.put(operation)
         return operation.id
-    # non-zero copy
-    def _generic_page_set(self, hash_values, host_indices) -> bool:
+    # todo: deprecate
+    def _generic_page_set(self, hash_values, host_indices, extra_info=None) -> bool:
         data = [
-            self.mem_pool_host.get_flat_data_page(host_indices[i * self.page_size])
+            self.mem_pool_host.get_data_page(host_indices[i * self.page_size])
             for i in range(len(hash_values))
         ]
         return self.storage_backend.batch_set(hash_values, data)
-    # zero copy
-    def _mooncake_page_set(self, hash_values, host_indices) -> bool:
-        key_strs, buffer_ptrs, buffer_sizes = self.mem_pool_host.get_buffer_meta(
-            hash_values,
-            host_indices,
-            self.storage_config.tp_rank,
-        )
-        success = self.storage_backend.batch_set(
-            key_strs,
-            target_locations=buffer_ptrs,
-            target_sizes=buffer_sizes,
+    def _page_set_zero_copy(self, hash_values, host_indices, extra_info=None) -> bool:
+        return all(
+            self.storage_backend.batch_set_v1(hash_values, host_indices, extra_info)
         )
-        return success
-    # zero copy
-    def _3fs_zero_copy_page_set(self, hash_values, host_indices) -> bool:
-        hashes, dsts, _ = self.mem_pool_host.get_buffer_with_hash(
-            hash_values, host_indices
-        )
-        return self.storage_backend.batch_set(hashes, dsts)
     # Backup batch by batch
     def _page_backup(self, operation):
         # Backup batch by batch
+        prefix_keys = operation.prefix_keys
         for i in range(0, len(operation.hash_value), self.storage_batch_size):
             batch_hashes = operation.hash_value[i : i + self.storage_batch_size]
             batch_host_indices = operation.host_indices[
@@ -863,12 +790,16 @@ class HiCacheController:
             ]
             # Set one batch token, and record if success.
             # todo: allow partial success
-            success = self.page_set_func(batch_hashes, batch_host_indices)
+            extra_info = HiCacheStorageExtraInfo(prefix_keys=prefix_keys)
+            success = self.page_set_func(batch_hashes, batch_host_indices, extra_info)
             if not success:
                 logger.warning(
                     f"Write page to storage: {len(batch_hashes)} pages failed."
                 )
                 break
+            if prefix_keys and len(prefix_keys) > 0:
+                prefix_keys += batch_hashes
             operation.completed_tokens += self.page_size * len(batch_hashes)
     def backup_thread_func(self):

sglang 0.5.3rc0__py3-none-any.whl → 0.5.4__py3-none-any.whl

sglang 0.5.3rc0py3-none-any.whl → 0.5.4py3-none-any.whl