sglang 0.5.2rc1__py3-none-any.whl → 0.5.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +7 -9
- sglang/bench_one_batch_server.py +330 -31
- sglang/bench_serving.py +267 -32
- sglang/global_config.py +2 -2
- sglang/lang/backend/runtime_endpoint.py +1 -1
- sglang/lang/interpreter.py +1 -1
- sglang/launch_server.py +14 -0
- sglang/profiler.py +2 -2
- sglang/srt/batch_invariant_ops/__init__.py +27 -0
- sglang/srt/batch_invariant_ops/batch_invariant_ops.py +549 -0
- sglang/srt/configs/__init__.py +8 -0
- sglang/srt/configs/device_config.py +3 -1
- sglang/srt/configs/dots_ocr.py +64 -0
- sglang/srt/configs/dots_vlm.py +139 -0
- sglang/srt/configs/falcon_h1.py +360 -0
- sglang/srt/configs/internvl.py +6 -0
- sglang/srt/configs/load_config.py +9 -0
- sglang/srt/configs/model_config.py +181 -82
- sglang/srt/configs/qwen3_next.py +326 -0
- sglang/srt/configs/qwen3_vl.py +586 -0
- sglang/srt/connector/__init__.py +8 -1
- sglang/srt/connector/remote_instance.py +82 -0
- sglang/srt/constrained/base_grammar_backend.py +49 -12
- sglang/srt/constrained/llguidance_backend.py +0 -1
- sglang/srt/constrained/outlines_backend.py +0 -1
- sglang/srt/constrained/outlines_jump_forward.py +1 -1
- sglang/srt/constrained/xgrammar_backend.py +30 -9
- sglang/srt/custom_op.py +11 -1
- sglang/srt/debug_utils/dump_comparator.py +81 -44
- sglang/srt/debug_utils/dump_loader.py +97 -0
- sglang/srt/debug_utils/dumper.py +21 -6
- sglang/srt/debug_utils/text_comparator.py +73 -11
- sglang/srt/disaggregation/ascend/conn.py +2 -2
- sglang/srt/disaggregation/ascend/transfer_engine.py +47 -9
- sglang/srt/disaggregation/base/conn.py +1 -1
- sglang/srt/disaggregation/common/conn.py +279 -108
- sglang/srt/disaggregation/decode.py +71 -19
- sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +29 -17
- sglang/srt/disaggregation/fake/conn.py +1 -1
- sglang/srt/disaggregation/mini_lb.py +6 -445
- sglang/srt/disaggregation/mooncake/conn.py +55 -537
- sglang/srt/disaggregation/nixl/conn.py +326 -53
- sglang/srt/disaggregation/prefill.py +36 -17
- sglang/srt/disaggregation/utils.py +40 -54
- sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
- sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
- sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
- sglang/srt/distributed/parallel_state.py +192 -113
- sglang/srt/entrypoints/engine.py +59 -18
- sglang/srt/entrypoints/grpc_request_manager.py +855 -0
- sglang/srt/entrypoints/grpc_server.py +810 -0
- sglang/srt/entrypoints/http_server.py +132 -57
- sglang/srt/entrypoints/openai/protocol.py +115 -7
- sglang/srt/entrypoints/openai/serving_base.py +65 -3
- sglang/srt/entrypoints/openai/serving_chat.py +207 -58
- sglang/srt/entrypoints/openai/serving_completions.py +17 -4
- sglang/srt/entrypoints/openai/serving_embedding.py +10 -4
- sglang/srt/entrypoints/openai/serving_rerank.py +3 -1
- sglang/srt/entrypoints/openai/serving_responses.py +49 -4
- sglang/srt/entrypoints/openai/serving_score.py +1 -0
- sglang/srt/environ.py +285 -0
- sglang/srt/eplb/eplb_manager.py +2 -2
- sglang/srt/eplb/expert_distribution.py +26 -13
- sglang/srt/eplb/expert_location.py +38 -8
- sglang/srt/eplb/expert_location_updater.py +1 -1
- sglang/srt/function_call/base_format_detector.py +3 -6
- sglang/srt/function_call/ebnf_composer.py +11 -9
- sglang/srt/function_call/function_call_parser.py +9 -2
- sglang/srt/function_call/glm4_moe_detector.py +4 -4
- sglang/srt/function_call/gpt_oss_detector.py +24 -1
- sglang/srt/function_call/json_array_parser.py +63 -0
- sglang/srt/function_call/kimik2_detector.py +17 -4
- sglang/srt/function_call/qwen3_coder_detector.py +1 -1
- sglang/srt/function_call/utils.py +96 -5
- sglang/srt/grpc/__init__.py +1 -0
- sglang/srt/grpc/compile_proto.py +245 -0
- sglang/srt/grpc/sglang_scheduler_pb2.py +111 -0
- sglang/srt/grpc/sglang_scheduler_pb2.pyi +434 -0
- sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +239 -0
- sglang/srt/layers/activation.py +143 -9
- sglang/srt/layers/attention/aiter_backend.py +106 -82
- sglang/srt/layers/attention/ascend_backend.py +115 -9
- sglang/srt/layers/attention/attention_registry.py +206 -0
- sglang/srt/layers/attention/base_attn_backend.py +12 -3
- sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
- sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
- sglang/srt/layers/attention/fla/chunk.py +242 -0
- sglang/srt/layers/attention/fla/chunk_delta_h.py +314 -0
- sglang/srt/layers/attention/fla/chunk_o.py +178 -0
- sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +151 -0
- sglang/srt/layers/attention/fla/cumsum.py +300 -0
- sglang/srt/layers/attention/fla/fused_recurrent.py +640 -0
- sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +232 -0
- sglang/srt/layers/attention/fla/index.py +37 -0
- sglang/srt/layers/attention/fla/l2norm.py +150 -0
- sglang/srt/layers/attention/fla/layernorm_gated.py +326 -0
- sglang/srt/layers/attention/fla/op.py +66 -0
- sglang/srt/layers/attention/fla/solve_tril.py +465 -0
- sglang/srt/layers/attention/fla/utils.py +331 -0
- sglang/srt/layers/attention/fla/wy_fast.py +158 -0
- sglang/srt/layers/attention/flashattention_backend.py +41 -8
- sglang/srt/layers/attention/flashinfer_backend.py +118 -198
- sglang/srt/layers/attention/flashinfer_mla_backend.py +27 -27
- sglang/srt/layers/attention/flashmla_backend.py +7 -5
- sglang/srt/layers/attention/hybrid_attn_backend.py +68 -53
- sglang/srt/layers/attention/hybrid_linear_attn_backend.py +602 -0
- sglang/srt/layers/attention/intel_amx_backend.py +3 -0
- sglang/srt/layers/attention/mamba/causal_conv1d.py +129 -0
- sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +969 -0
- sglang/srt/layers/attention/mamba/mamba.py +629 -0
- sglang/srt/layers/attention/mamba/mamba_utils.py +81 -0
- sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
- sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
- sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
- sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +264 -0
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +622 -0
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +757 -0
- sglang/srt/layers/attention/mamba/ops/ssd_combined.py +262 -0
- sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +275 -0
- sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
- sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
- sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
- sglang/srt/layers/attention/nsa/nsa_indexer.py +761 -0
- sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
- sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
- sglang/srt/layers/attention/nsa/transform_index.py +144 -0
- sglang/srt/layers/attention/nsa/utils.py +24 -0
- sglang/srt/layers/attention/nsa_backend.py +887 -0
- sglang/srt/layers/attention/tbo_backend.py +6 -6
- sglang/srt/layers/attention/torch_flex_backend.py +325 -0
- sglang/srt/layers/attention/torch_native_backend.py +12 -6
- sglang/srt/layers/attention/triton_backend.py +57 -7
- sglang/srt/layers/attention/trtllm_mha_backend.py +5 -7
- sglang/srt/layers/attention/trtllm_mla_backend.py +276 -39
- sglang/srt/layers/attention/vision.py +58 -0
- sglang/srt/layers/attention/wave_backend.py +4 -4
- sglang/srt/layers/attention/wave_ops/decode_attention.py +2 -4
- sglang/srt/layers/attention/wave_ops/extend_attention.py +1 -3
- sglang/srt/layers/communicator.py +53 -7
- sglang/srt/layers/dp_attention.py +41 -2
- sglang/srt/layers/elementwise.py +3 -1
- sglang/srt/layers/layernorm.py +34 -15
- sglang/srt/layers/linear.py +55 -7
- sglang/srt/layers/logits_processor.py +44 -12
- sglang/srt/layers/moe/__init__.py +2 -1
- sglang/srt/layers/moe/cutlass_w4a8_moe.py +3 -3
- sglang/srt/layers/moe/ep_moe/kernels.py +2 -2
- sglang/srt/layers/moe/ep_moe/layer.py +256 -63
- sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +183 -0
- sglang/srt/layers/moe/fused_moe_native.py +5 -3
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=352,device_name=NVIDIA_RTX_5880_Ada_Generation,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=512,device_name=NVIDIA_H20.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/{E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json } +29 -29
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -2
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +7 -3
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +23 -20
- sglang/srt/layers/moe/fused_moe_triton/layer.py +71 -70
- sglang/srt/layers/moe/moe_runner/__init__.py +2 -1
- sglang/srt/layers/moe/moe_runner/base.py +274 -1
- sglang/srt/layers/moe/moe_runner/runner.py +80 -0
- sglang/srt/layers/moe/moe_runner/triton.py +448 -0
- sglang/srt/layers/moe/token_dispatcher/__init__.py +16 -4
- sglang/srt/layers/moe/token_dispatcher/{base_dispatcher.py → base.py} +67 -17
- sglang/srt/layers/moe/token_dispatcher/deepep.py +118 -56
- sglang/srt/layers/moe/token_dispatcher/standard.py +44 -2
- sglang/srt/layers/moe/topk.py +30 -9
- sglang/srt/layers/moe/utils.py +22 -7
- sglang/srt/layers/parameter.py +23 -6
- sglang/srt/layers/quantization/awq.py +19 -7
- sglang/srt/layers/quantization/base_config.py +11 -6
- sglang/srt/layers/quantization/blockwise_int8.py +38 -27
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +1 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +50 -30
- sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +2 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +13 -1
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
- sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +2 -10
- sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +27 -0
- sglang/srt/layers/quantization/fp8.py +78 -49
- sglang/srt/layers/quantization/fp8_utils.py +51 -32
- sglang/srt/layers/quantization/gptq.py +25 -17
- sglang/srt/layers/quantization/modelopt_quant.py +225 -57
- sglang/srt/layers/quantization/moe_wna16.py +21 -18
- sglang/srt/layers/quantization/mxfp4.py +77 -42
- sglang/srt/layers/quantization/quark/quark_moe.py +48 -30
- sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +49 -30
- sglang/srt/layers/quantization/quark/utils.py +97 -0
- sglang/srt/layers/quantization/rocm_mxfp4_utils.py +13 -0
- sglang/srt/layers/quantization/unquant.py +135 -47
- sglang/srt/layers/quantization/w4afp8.py +26 -17
- sglang/srt/layers/quantization/w8a8_fp8.py +35 -20
- sglang/srt/layers/quantization/w8a8_int8.py +91 -41
- sglang/srt/layers/rocm_linear_utils.py +44 -0
- sglang/srt/layers/rotary_embedding.py +78 -49
- sglang/srt/layers/sampler.py +213 -21
- sglang/srt/layers/utils.py +23 -0
- sglang/srt/lora/backend/base_backend.py +50 -8
- sglang/srt/lora/backend/chunked_backend.py +348 -0
- sglang/srt/lora/backend/triton_backend.py +99 -5
- sglang/srt/lora/layers.py +32 -0
- sglang/srt/lora/lora.py +8 -3
- sglang/srt/lora/lora_manager.py +52 -118
- sglang/srt/lora/mem_pool.py +25 -11
- sglang/srt/lora/triton_ops/__init__.py +4 -0
- sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
- sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +174 -0
- sglang/srt/lora/utils.py +22 -11
- sglang/srt/managers/async_dynamic_batch_tokenizer.py +170 -0
- sglang/srt/managers/cache_controller.py +215 -314
- sglang/srt/managers/data_parallel_controller.py +115 -80
- sglang/srt/managers/detokenizer_manager.py +19 -15
- sglang/srt/managers/disagg_service.py +46 -0
- sglang/srt/managers/io_struct.py +340 -109
- sglang/srt/managers/mm_utils.py +44 -6
- sglang/srt/managers/multi_tokenizer_mixin.py +358 -404
- sglang/srt/managers/multimodal_processor.py +1 -2
- sglang/srt/managers/overlap_utils.py +53 -0
- sglang/srt/managers/schedule_batch.py +240 -138
- sglang/srt/managers/schedule_policy.py +147 -19
- sglang/srt/managers/scheduler.py +501 -304
- sglang/srt/managers/scheduler_input_blocker.py +1 -1
- sglang/srt/managers/scheduler_metrics_mixin.py +119 -40
- sglang/srt/managers/scheduler_output_processor_mixin.py +75 -22
- sglang/srt/managers/scheduler_profiler_mixin.py +6 -6
- sglang/srt/managers/scheduler_update_weights_mixin.py +7 -0
- sglang/srt/managers/template_manager.py +3 -3
- sglang/srt/managers/tokenizer_communicator_mixin.py +675 -0
- sglang/srt/managers/tokenizer_manager.py +321 -632
- sglang/srt/managers/tp_worker.py +81 -22
- sglang/srt/managers/tp_worker_overlap_thread.py +71 -56
- sglang/srt/managers/utils.py +1 -45
- sglang/srt/mem_cache/allocator.py +15 -21
- sglang/srt/mem_cache/allocator_ascend.py +41 -27
- sglang/srt/mem_cache/base_prefix_cache.py +1 -1
- sglang/srt/mem_cache/chunk_cache.py +8 -1
- sglang/srt/mem_cache/evict_policy.py +23 -0
- sglang/srt/mem_cache/hicache_storage.py +58 -34
- sglang/srt/mem_cache/hiradix_cache.py +227 -80
- sglang/srt/mem_cache/memory_pool.py +535 -58
- sglang/srt/mem_cache/memory_pool_host.py +239 -223
- sglang/srt/mem_cache/radix_cache.py +222 -73
- sglang/srt/mem_cache/radix_cache_cpp.py +11 -8
- sglang/srt/mem_cache/storage/__init__.py +10 -0
- sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +151 -0
- sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +109 -0
- sglang/srt/mem_cache/storage/backend_factory.py +223 -0
- sglang/srt/mem_cache/storage/eic/eic_storage.py +778 -0
- sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
- sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +164 -0
- sglang/srt/mem_cache/storage/hf3fs/{client_hf3fs.py → hf3fs_usrbio_client.py} +5 -1
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +268 -63
- sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +284 -0
- sglang/srt/mem_cache/storage/lmcache/unit_test.py +121 -0
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +198 -30
- sglang/srt/mem_cache/storage/mooncake_store/test_mooncake_store.py +161 -0
- sglang/srt/mem_cache/swa_radix_cache.py +25 -36
- sglang/srt/metrics/collector.py +519 -132
- sglang/srt/metrics/func_timer.py +2 -7
- sglang/srt/metrics/startup_func_log_and_timer.py +150 -0
- sglang/srt/metrics/utils.py +55 -0
- sglang/srt/model_executor/cpu_graph_runner.py +640 -0
- sglang/srt/model_executor/cuda_graph_runner.py +52 -37
- sglang/srt/model_executor/forward_batch_info.py +98 -57
- sglang/srt/model_executor/model_runner.py +433 -158
- sglang/srt/model_executor/npu_graph_runner.py +12 -5
- sglang/srt/model_loader/__init__.py +9 -3
- sglang/srt/model_loader/loader.py +133 -5
- sglang/srt/model_loader/remote_instance_weight_loader_utils.py +69 -0
- sglang/srt/model_loader/weight_utils.py +158 -3
- sglang/srt/models/apertus.py +686 -0
- sglang/srt/models/bailing_moe.py +820 -217
- sglang/srt/models/bailing_moe_nextn.py +168 -0
- sglang/srt/models/deepseek_nextn.py +6 -1
- sglang/srt/models/deepseek_v2.py +833 -152
- sglang/srt/models/dots_ocr.py +173 -0
- sglang/srt/models/dots_vlm.py +174 -0
- sglang/srt/models/dots_vlm_vit.py +337 -0
- sglang/srt/models/ernie4.py +1 -1
- sglang/srt/models/falcon_h1.py +576 -0
- sglang/srt/models/gemma3_causal.py +0 -2
- sglang/srt/models/gemma3_mm.py +1 -1
- sglang/srt/models/gemma3n_mm.py +2 -2
- sglang/srt/models/glm4_moe.py +14 -5
- sglang/srt/models/glm4_moe_nextn.py +2 -2
- sglang/srt/models/glm4v.py +5 -3
- sglang/srt/models/glm4v_moe.py +4 -1
- sglang/srt/models/gpt_oss.py +8 -31
- sglang/srt/models/internvl.py +28 -0
- sglang/srt/models/kimi_vl_moonvit.py +2 -2
- sglang/srt/models/llama.py +4 -0
- sglang/srt/models/llama4.py +9 -0
- sglang/srt/models/llama_eagle3.py +13 -0
- sglang/srt/models/longcat_flash.py +3 -3
- sglang/srt/models/longcat_flash_nextn.py +1 -1
- sglang/srt/models/minicpmv.py +165 -3
- sglang/srt/models/mllama4.py +40 -4
- sglang/srt/models/opt.py +637 -0
- sglang/srt/models/qwen2_5_vl.py +29 -5
- sglang/srt/models/qwen2_audio.py +1 -1
- sglang/srt/models/qwen2_moe.py +124 -14
- sglang/srt/models/qwen2_vl.py +1 -1
- sglang/srt/models/qwen3.py +26 -5
- sglang/srt/models/qwen3_moe.py +71 -12
- sglang/srt/models/qwen3_next.py +1069 -0
- sglang/srt/models/qwen3_next_mtp.py +112 -0
- sglang/srt/models/qwen3_vl.py +787 -0
- sglang/srt/models/qwen3_vl_moe.py +471 -0
- sglang/srt/models/registry.py +15 -3
- sglang/srt/models/sarashina2_vision.py +269 -0
- sglang/srt/models/solar.py +505 -0
- sglang/srt/models/starcoder2.py +357 -0
- sglang/srt/models/step3_vl.py +1 -1
- sglang/srt/models/torch_native_llama.py +10 -3
- sglang/srt/models/utils.py +51 -0
- sglang/srt/multimodal/processors/base_processor.py +15 -7
- sglang/srt/multimodal/processors/dots_vlm.py +98 -0
- sglang/srt/multimodal/processors/glm4v.py +9 -9
- sglang/srt/multimodal/processors/internvl.py +153 -129
- sglang/srt/multimodal/processors/qwen_vl.py +23 -6
- sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
- sglang/srt/offloader.py +27 -3
- sglang/srt/{jinja_template_utils.py → parser/jinja_template_utils.py} +6 -0
- sglang/srt/{reasoning_parser.py → parser/reasoning_parser.py} +1 -1
- sglang/srt/sampling/sampling_batch_info.py +38 -17
- sglang/srt/sampling/sampling_params.py +7 -0
- sglang/srt/server_args.py +1030 -254
- sglang/srt/server_args_config_parser.py +146 -0
- sglang/srt/single_batch_overlap.py +151 -0
- sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
- sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
- sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
- sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
- sglang/srt/speculative/cpp_ngram/param.h +125 -0
- sglang/srt/speculative/cpp_ngram/queue.h +71 -0
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +7 -1
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +13 -2
- sglang/srt/speculative/{eagle_utils.py → eagle_info.py} +207 -757
- sglang/srt/speculative/eagle_worker.py +253 -136
- sglang/srt/speculative/ngram_utils.py +428 -0
- sglang/srt/speculative/ngram_worker.py +245 -0
- sglang/srt/speculative/spec_info.py +52 -0
- sglang/srt/speculative/spec_utils.py +606 -0
- sglang/srt/speculative/standalone_worker.py +109 -0
- sglang/srt/torch_memory_saver_adapter.py +5 -7
- sglang/srt/tracing/trace.py +578 -0
- sglang/srt/two_batch_overlap.py +8 -5
- sglang/srt/utils/__init__.py +2 -0
- sglang/srt/{utils.py → utils/common.py} +445 -77
- sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +53 -5
- sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
- sglang/srt/utils/rpd_utils.py +452 -0
- sglang/srt/utils/slow_rank_detector.py +71 -0
- sglang/srt/warmup.py +8 -4
- sglang/srt/weight_sync/utils.py +2 -2
- sglang/test/attention/test_trtllm_mla_backend.py +169 -5
- sglang/test/few_shot_gsm8k.py +1 -0
- sglang/test/get_logits_ut.py +57 -0
- sglang/test/run_eval.py +79 -11
- sglang/test/runners.py +5 -1
- sglang/test/simple_eval_common.py +5 -2
- sglang/test/simple_eval_mmmu_vlm.py +441 -0
- sglang/test/test_block_fp8.py +2 -2
- sglang/test/test_cutlass_moe.py +24 -6
- sglang/test/test_deterministic.py +297 -0
- sglang/test/test_disaggregation_utils.py +77 -0
- sglang/test/test_fp4_moe.py +370 -1
- sglang/test/test_programs.py +1 -1
- sglang/test/test_utils.py +383 -5
- sglang/utils.py +22 -1
- sglang/version.py +1 -1
- {sglang-0.5.2rc1.dist-info → sglang-0.5.3.dist-info}/METADATA +69 -124
- {sglang-0.5.2rc1.dist-info → sglang-0.5.3.dist-info}/RECORD +392 -258
- sglang/srt/disaggregation/launch_lb.py +0 -118
- sglang/srt/mem_cache/lora_radix_cache.py +0 -421
- sglang/srt/mem_cache/storage/mooncake_store/unit_test.py +0 -40
- /sglang/srt/{model_parallel.py → layers/model_parallel.py} +0 -0
- /sglang/srt/{code_completion_parser.py → parser/code_completion_parser.py} +0 -0
- /sglang/srt/{conversation.py → parser/conversation.py} +0 -0
- /sglang/srt/{harmony_parser.py → parser/harmony_parser.py} +0 -0
- /sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
- {sglang-0.5.2rc1.dist-info → sglang-0.5.3.dist-info}/WHEEL +0 -0
- {sglang-0.5.2rc1.dist-info → sglang-0.5.3.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.2rc1.dist-info → sglang-0.5.3.dist-info}/top_level.txt +0 -0
@@ -5,14 +5,21 @@ import logging
|
|
5
5
|
import os
|
6
6
|
import signal
|
7
7
|
import threading
|
8
|
+
import time
|
8
9
|
from abc import ABC, abstractmethod
|
9
10
|
from functools import wraps
|
10
11
|
from typing import Any, List, Optional, Tuple
|
11
12
|
|
12
13
|
import torch
|
13
14
|
|
14
|
-
from sglang.srt.mem_cache.hicache_storage import
|
15
|
-
|
15
|
+
from sglang.srt.mem_cache.hicache_storage import (
|
16
|
+
HiCacheStorage,
|
17
|
+
HiCacheStorageConfig,
|
18
|
+
HiCacheStorageExtraInfo,
|
19
|
+
)
|
20
|
+
from sglang.srt.mem_cache.memory_pool_host import HostKVCache
|
21
|
+
from sglang.srt.mem_cache.storage.hf3fs.hf3fs_client import Hf3fsClient
|
22
|
+
from sglang.srt.metrics.collector import StorageMetrics
|
16
23
|
|
17
24
|
logger = logging.getLogger(__name__)
|
18
25
|
|
@@ -112,6 +119,33 @@ def synchronized():
|
|
112
119
|
return _decorator
|
113
120
|
|
114
121
|
|
122
|
+
def create_hf3fs_client(
|
123
|
+
path: str, size: int, bytes_per_page: int, entries: int, use_mock: bool = False
|
124
|
+
) -> Hf3fsClient:
|
125
|
+
"""Factory function to create appropriate HF3FS client.
|
126
|
+
|
127
|
+
Args:
|
128
|
+
path: File path for storage
|
129
|
+
size: Total size of storage file
|
130
|
+
bytes_per_page: Bytes per page
|
131
|
+
entries: Number of entries for batch operations
|
132
|
+
use_mock: Whether to use mock client instead of real usrbio client
|
133
|
+
|
134
|
+
Returns:
|
135
|
+
"""
|
136
|
+
if use_mock:
|
137
|
+
from sglang.srt.mem_cache.storage.hf3fs.hf3fs_client import Hf3fsMockClient
|
138
|
+
|
139
|
+
logger.info(f"[Rank Using Hf3fsMockClient for testing")
|
140
|
+
return Hf3fsMockClient(path, size, bytes_per_page, entries)
|
141
|
+
else:
|
142
|
+
from sglang.srt.mem_cache.storage.hf3fs.hf3fs_usrbio_client import (
|
143
|
+
Hf3fsUsrBioClient,
|
144
|
+
)
|
145
|
+
|
146
|
+
return Hf3fsUsrBioClient(path, size, bytes_per_page, entries)
|
147
|
+
|
148
|
+
|
115
149
|
class HiCacheHF3FS(HiCacheStorage):
|
116
150
|
"""HiCache backend that stores KV cache pages in HF3FS files."""
|
117
151
|
|
@@ -128,16 +162,20 @@ class HiCacheHF3FS(HiCacheStorage):
|
|
128
162
|
dtype: torch.dtype,
|
129
163
|
metadata_client: Hf3fsMetadataInterface,
|
130
164
|
is_mla_model: bool = False,
|
165
|
+
is_page_first_layout: bool = False,
|
166
|
+
use_mock_client: bool = False,
|
131
167
|
):
|
132
168
|
self.rank = rank
|
133
169
|
self.file_path = file_path
|
134
170
|
self.file_size = file_size
|
135
171
|
self.numjobs = numjobs
|
136
172
|
self.bytes_per_page = bytes_per_page
|
173
|
+
self.gb_per_page = bytes_per_page / (1 << 30)
|
137
174
|
self.entries = entries
|
138
175
|
self.dtype = dtype
|
139
176
|
self.metadata_client = metadata_client
|
140
177
|
self.is_mla_model = is_mla_model
|
178
|
+
self.is_page_first_layout = is_page_first_layout
|
141
179
|
self.numel = self.bytes_per_page // self.dtype.itemsize
|
142
180
|
self.num_pages = self.file_size // self.bytes_per_page
|
143
181
|
self.skip_backup = False
|
@@ -145,17 +183,24 @@ class HiCacheHF3FS(HiCacheStorage):
|
|
145
183
|
self.skip_backup = True
|
146
184
|
self.rank = 0
|
147
185
|
|
186
|
+
self.is_zero_copy = False
|
187
|
+
|
148
188
|
logger.info(
|
149
189
|
f"[Rank {self.rank}] HiCacheHF3FS Client Initializing: "
|
150
190
|
f"file_path={self.file_path}, "
|
151
191
|
f"file_size={self.file_size / (2 ** 30):.2f} GB, "
|
152
|
-
f"num_pages={self.num_pages}"
|
192
|
+
f"num_pages={self.num_pages}, "
|
193
|
+
f"is_mla_model={self.is_mla_model}"
|
153
194
|
)
|
154
195
|
|
155
196
|
self.ac = AtomicCounter(self.numjobs)
|
156
197
|
self.clients = [
|
157
|
-
|
158
|
-
self.file_path,
|
198
|
+
create_hf3fs_client(
|
199
|
+
self.file_path,
|
200
|
+
self.file_size,
|
201
|
+
self.bytes_per_page,
|
202
|
+
self.entries,
|
203
|
+
use_mock_client,
|
159
204
|
)
|
160
205
|
for _ in range(numjobs)
|
161
206
|
]
|
@@ -172,6 +217,11 @@ class HiCacheHF3FS(HiCacheStorage):
|
|
172
217
|
signal.signal(signal.SIGTERM, lambda sig, frame: self.close())
|
173
218
|
signal.signal(signal.SIGQUIT, lambda sig, frame: self.close())
|
174
219
|
|
220
|
+
self.prefetch_pgs = []
|
221
|
+
self.backup_pgs = []
|
222
|
+
self.prefetch_bandwidth = []
|
223
|
+
self.backup_bandwidth = []
|
224
|
+
|
175
225
|
@staticmethod
|
176
226
|
def from_env_config(
|
177
227
|
bytes_per_page: int,
|
@@ -192,10 +242,24 @@ class HiCacheHF3FS(HiCacheStorage):
|
|
192
242
|
Hf3fsLocalMetadataClient,
|
193
243
|
)
|
194
244
|
|
245
|
+
use_mock_client = False
|
195
246
|
if storage_config is not None:
|
196
|
-
rank, is_mla_model =
|
247
|
+
rank, is_mla_model, is_page_first_layout = (
|
248
|
+
storage_config.tp_rank,
|
249
|
+
storage_config.is_mla_model,
|
250
|
+
storage_config.is_page_first_layout,
|
251
|
+
)
|
252
|
+
|
253
|
+
if storage_config.extra_config is not None:
|
254
|
+
use_mock_client = storage_config.extra_config.get(
|
255
|
+
"use_mock_hf3fs_client", False
|
256
|
+
)
|
197
257
|
else:
|
198
|
-
rank, is_mla_model =
|
258
|
+
rank, is_mla_model, is_page_first_layout = (
|
259
|
+
0,
|
260
|
+
False,
|
261
|
+
False,
|
262
|
+
)
|
199
263
|
|
200
264
|
mla_unsupported_msg = f"MLA model is not supported without global metadata server, please refer to https://github.com/sgl-project/sglang/blob/main/python/sglang/srt/mem_cache/storage/hf3fs/docs/deploy_sglang_3fs_multinode.md"
|
201
265
|
|
@@ -213,6 +277,8 @@ class HiCacheHF3FS(HiCacheStorage):
|
|
213
277
|
entries=8,
|
214
278
|
dtype=dtype,
|
215
279
|
metadata_client=Hf3fsLocalMetadataClient(),
|
280
|
+
is_page_first_layout=is_page_first_layout,
|
281
|
+
use_mock_client=use_mock_client,
|
216
282
|
)
|
217
283
|
|
218
284
|
try:
|
@@ -261,27 +327,16 @@ class HiCacheHF3FS(HiCacheStorage):
|
|
261
327
|
dtype=dtype,
|
262
328
|
metadata_client=metadata_client,
|
263
329
|
is_mla_model=is_mla_model,
|
330
|
+
is_page_first_layout=is_page_first_layout,
|
331
|
+
use_mock_client=use_mock_client,
|
264
332
|
)
|
265
333
|
|
266
|
-
def get(
|
267
|
-
self,
|
268
|
-
key: str,
|
269
|
-
target_location: Optional[Any] = None,
|
270
|
-
target_sizes: Optional[Any] = None,
|
271
|
-
) -> torch.Tensor | None:
|
272
|
-
return self.batch_get(
|
273
|
-
[key],
|
274
|
-
[target_location] if target_location is not None else None,
|
275
|
-
[target_sizes] if target_sizes is not None else None,
|
276
|
-
)[0]
|
277
|
-
|
278
334
|
@synchronized()
|
279
|
-
def
|
335
|
+
def _batch_get(
|
280
336
|
self,
|
281
337
|
keys: List[str],
|
282
|
-
|
283
|
-
|
284
|
-
) -> List[torch.Tensor | None]:
|
338
|
+
values: List[torch.Tensor],
|
339
|
+
) -> List[bool]:
|
285
340
|
page_indices = self.metadata_client.get_page_indices(self.rank, keys)
|
286
341
|
|
287
342
|
batch_indices, file_offsets = [], []
|
@@ -290,15 +345,11 @@ class HiCacheHF3FS(HiCacheStorage):
|
|
290
345
|
batch_indices.append(i)
|
291
346
|
file_offsets.append(page_index * self.bytes_per_page)
|
292
347
|
|
293
|
-
|
294
|
-
|
295
|
-
|
296
|
-
|
297
|
-
|
298
|
-
file_results = [
|
299
|
-
torch.empty(self.numel, dtype=self.dtype)
|
300
|
-
for _ in range(len(batch_indices))
|
301
|
-
]
|
348
|
+
for target_location in values:
|
349
|
+
assert target_location.is_contiguous()
|
350
|
+
file_results = values
|
351
|
+
|
352
|
+
start_time = time.perf_counter()
|
302
353
|
|
303
354
|
futures = [
|
304
355
|
self.executor.submit(
|
@@ -310,12 +361,17 @@ class HiCacheHF3FS(HiCacheStorage):
|
|
310
361
|
]
|
311
362
|
read_results = [result for future in futures for result in future.result()]
|
312
363
|
|
313
|
-
|
314
|
-
|
315
|
-
|
316
|
-
|
364
|
+
end_time = time.perf_counter()
|
365
|
+
ionum = len(batch_indices)
|
366
|
+
self.prefetch_pgs.append(ionum)
|
367
|
+
self.prefetch_bandwidth.append(
|
368
|
+
ionum / (end_time - start_time) * self.gb_per_page
|
369
|
+
)
|
370
|
+
|
371
|
+
results = [False] * len(keys)
|
372
|
+
for batch_index, read_result in zip(batch_indices, read_results):
|
317
373
|
if read_result == self.bytes_per_page:
|
318
|
-
results[batch_index] =
|
374
|
+
results[batch_index] = True
|
319
375
|
else:
|
320
376
|
logger.error(
|
321
377
|
f"[Rank {self.rank}] HiCacheHF3FS get {keys[batch_index]} failed"
|
@@ -323,27 +379,12 @@ class HiCacheHF3FS(HiCacheStorage):
|
|
323
379
|
|
324
380
|
return results
|
325
381
|
|
326
|
-
|
327
|
-
|
328
|
-
key: str,
|
329
|
-
value: Optional[Any] = None,
|
330
|
-
target_location: Optional[Any] = None,
|
331
|
-
target_sizes: Optional[Any] = None,
|
332
|
-
) -> bool:
|
333
|
-
return self.batch_set(
|
334
|
-
[key],
|
335
|
-
[value] if value is not None else None,
|
336
|
-
[target_location] if target_location is not None else None,
|
337
|
-
[target_sizes] if target_sizes is not None else None,
|
338
|
-
)
|
339
|
-
|
340
|
-
def batch_set(
|
382
|
+
@synchronized()
|
383
|
+
def _batch_set(
|
341
384
|
self,
|
342
385
|
keys: List[str],
|
343
386
|
values: Optional[Any] = None,
|
344
|
-
|
345
|
-
target_sizes: Optional[Any] = None,
|
346
|
-
) -> bool:
|
387
|
+
) -> List[bool]:
|
347
388
|
# In MLA backend, only one rank needs to backup the KV cache
|
348
389
|
if self.skip_backup:
|
349
390
|
return True
|
@@ -366,6 +407,8 @@ class HiCacheHF3FS(HiCacheStorage):
|
|
366
407
|
assert value.is_contiguous()
|
367
408
|
file_values.append(value)
|
368
409
|
|
410
|
+
start_time = time.perf_counter()
|
411
|
+
|
369
412
|
futures = [
|
370
413
|
self.executor.submit(
|
371
414
|
self.clients[self.ac.next()].batch_write,
|
@@ -380,6 +423,11 @@ class HiCacheHF3FS(HiCacheStorage):
|
|
380
423
|
for result in future.result()
|
381
424
|
]
|
382
425
|
|
426
|
+
end_time = time.perf_counter()
|
427
|
+
ionum = len(batch_indices)
|
428
|
+
self.backup_pgs.append(ionum)
|
429
|
+
self.backup_bandwidth.append(ionum / (end_time - start_time) * self.gb_per_page)
|
430
|
+
|
383
431
|
written_keys_to_confirm = []
|
384
432
|
results = [index[0] for index in indices]
|
385
433
|
for batch_index, write_result in zip(batch_indices, write_results):
|
@@ -397,7 +445,7 @@ class HiCacheHF3FS(HiCacheStorage):
|
|
397
445
|
self.rank, written_keys_to_confirm, pages_to_release
|
398
446
|
)
|
399
447
|
|
400
|
-
return
|
448
|
+
return results
|
401
449
|
|
402
450
|
def delete(self, key: str) -> None:
|
403
451
|
self.metadata_client.delete_keys(self.rank, [key])
|
@@ -407,21 +455,25 @@ class HiCacheHF3FS(HiCacheStorage):
|
|
407
455
|
return result[0] if result else False
|
408
456
|
|
409
457
|
def batch_exists(self, keys: List[str]) -> int:
    """Return how many leading entries of ``keys`` are already stored.

    The metadata service is asked about every key in one call and the
    length of the initial run of hits is returned; counting stops at the
    first miss.

    When the backend runs zero-copy on a non-MLA model, each key is looked
    up as its ``"<key>-k"`` / ``"<key>-v"`` pair, so the run length is
    halved with floor division: a key only counts when both of its
    entries were found.

    Args:
        keys: Keys to probe, in priority order.

    Returns:
        Number of keys at the front of ``keys`` that exist.
    """
    factor = 1
    if self.is_zero_copy and not self.is_mla_model:
        keys = self._get_mha_zero_copy_keys(keys)
        factor = 2

    results = self.metadata_client.exists(self.rank, keys)

    # Walk the leading run of hits; ``i`` ends on the first miss (or len(keys)).
    i = 0
    while i < len(keys) and results[i]:
        i += 1

    return i // factor
|
470
|
+
|
471
|
+
def clear(self) -> None:
    """Ask the metadata service to clear this rank's entries.

    Best effort: any exception is logged and swallowed, so callers get no
    success/failure signal from this method.
    """
    try:
        self.metadata_client.clear(self.rank)
        logger.info(f"Cleared HiCacheHF3FS for rank {self.rank}")
    except Exception as e:
        logger.error(f"Failed to clear HiCacheHF3FS: {e}")
|
425
477
|
|
426
478
|
def close(self) -> None:
|
427
479
|
try:
|
@@ -431,3 +483,156 @@ class HiCacheHF3FS(HiCacheStorage):
|
|
431
483
|
except Exception as e:
|
432
484
|
logger.error(f"close HiCacheHF3FS: {e}")
|
433
485
|
logger.info("close HiCacheHF3FS")
|
486
|
+
|
487
|
+
@synchronized()
def get_stats(self):
    """Return the samples collected since the last call and reset them.

    The four per-batch sample lists kept on the instance (``prefetch_pgs``,
    ``backup_pgs``, ``prefetch_bandwidth``, ``backup_bandwidth``) are copied
    into a fresh ``StorageMetrics`` and then emptied, so each sample is
    reported once.

    The write path that appends the backup samples carries the same
    ``@synchronized()`` decorator.
    NOTE(review): presumably this serialises the snapshot against
    concurrent recording; confirm against the decorator's definition.

    Returns:
        A ``StorageMetrics`` holding the drained samples.
    """
    storage_metrics = StorageMetrics()

    # Copy everything first ...
    storage_metrics.prefetch_pgs.extend(self.prefetch_pgs)
    storage_metrics.backup_pgs.extend(self.backup_pgs)
    storage_metrics.prefetch_bandwidth.extend(self.prefetch_bandwidth)
    storage_metrics.backup_bandwidth.extend(self.backup_bandwidth)

    # ... then reset the accumulators for the next reporting window.
    self.prefetch_pgs.clear()
    self.backup_pgs.clear()
    self.prefetch_bandwidth.clear()
    self.backup_bandwidth.clear()

    return storage_metrics
|
499
|
+
|
500
|
+
def register_mem_pool_host(self, mem_pool_host: HostKVCache):
    """Attach the host KV pool and decide whether zero-copy I/O is used.

    Registration itself is delegated to the parent class. Afterwards
    ``self.is_zero_copy`` is set to True exactly when the pool's ``layout``
    equals ``"page_first"``, and the decision is logged.

    Args:
        mem_pool_host: Host-side KV cache pool to register.
    """
    super().register_mem_pool_host(mem_pool_host)
    self.is_zero_copy = self.mem_pool_host.layout == "page_first"
    logger.info(f"{self.is_zero_copy=}")
|
504
|
+
|
505
|
+
def _get_mha_zero_copy_keys(self, keys: List[str]) -> List[str]:
    """Expand every key into its ``"<key>-k"`` and ``"<key>-v"`` storage keys.

    Order is preserved and the two derived keys of one input are adjacent
    (``-k`` first), so the output has exactly twice as many entries.

    Args:
        keys: Logical page keys.

    Returns:
        ``[k0-k, k0-v, k1-k, k1-v, ...]``.
    """
    return [f"{k}{suffix}" for k in keys for suffix in ("-k", "-v")]
|
511
|
+
|
512
|
+
def _get_mha_zero_copy_values(
    self, values: List[torch.Tensor]
) -> List[torch.Tensor]:
    """Split every value into its element 0 and element 1, interleaved.

    The output order (``value[0]`` then ``value[1]`` for each input) lines
    up with the ``-k`` / ``-v`` key order of ``_get_mha_zero_copy_keys``.
    NOTE(review): presumably index 0 is the K part and index 1 the V part
    of a page; confirm against the host pool's page layout.

    Args:
        values: One indexable value per page.

    Returns:
        A list twice as long: ``[v0[0], v0[1], v1[0], v1[1], ...]``.
    """
    return [part for value in values for part in (value[0], value[1])]
|
520
|
+
|
521
|
+
def _batch_get_preprocess(self, keys, host_indices):
    """Build the (keys, destination buffers) pair for a batched read.

    One buffer is prepared per page, where the page count is
    ``len(host_indices) // page_size`` and a page is addressed by the
    first of its host indices.

    * Zero-copy: the buffers come from ``get_data_page(..., flat=False)``
      on the host pool.
    * Otherwise: dummy flat pages are used; ``_batch_get_postprocess``
      later copies them into the pool.

    For zero-copy on a non-MLA model, keys and buffers are additionally
    expanded into their ``-k`` / ``-v`` halves.

    Args:
        keys: Logical page keys.
        host_indices: Host slot indices, ``page_size`` entries per page.

    Returns:
        Tuple ``(keys, values)`` ready to hand to the batched read.
    """
    page_size = self.mem_pool_host.page_size
    page_num = len(host_indices) // page_size

    if self.is_zero_copy:
        # ``flat`` is always False on this path (it was ``not is_zero_copy``).
        values = [
            self.mem_pool_host.get_data_page(host_indices[i * page_size], flat=False)
            for i in range(page_num)
        ]
    else:
        values = [
            self.mem_pool_host.get_dummy_flat_data_page() for _ in range(page_num)
        ]

    if self.is_zero_copy and not self.is_mla_model:
        keys = self._get_mha_zero_copy_keys(keys)
        values = self._get_mha_zero_copy_values(values)

    return keys, values
|
543
|
+
|
544
|
+
def _batch_get_postprocess(self, host_indices, values, results):
    """Turn raw read results into one result per page and land the data.

    * Zero-copy: nothing is copied. On a non-MLA model the results arrive
      as two entries per page and are folded with ``and``. In both cases
      the list is cut to the page count before being returned.
    * Otherwise: each successfully read flat buffer is copied into the
      host pool, stopping at the first failed page. ``results`` is
      returned unchanged, so entries after the first failure may still be
      truthy although their data was not copied.

    Args:
        host_indices: Host slot indices, ``page_size`` entries per page.
        values: Buffers that were passed to the batched read.
        results: Per-buffer outcomes of the batched read.

    Returns:
        Per-page outcomes.
    """
    page_size = self.mem_pool_host.page_size
    page_num = len(host_indices) // page_size

    if self.is_zero_copy:
        if not self.is_mla_model:
            # Entries 2*i and 2*i+1 belong to page i; both must succeed.
            results = [
                (results[2 * i] and results[2 * i + 1]) for i in range(page_num)
            ]
        return results[:page_num]

    for i in range(page_num):
        if not results[i]:
            break
        self.mem_pool_host.set_from_flat_data_page(
            host_indices[i * page_size], values[i]
        )

    return results
|
563
|
+
|
564
|
+
def batch_get_v1(
    self,
    keys: List[str],
    host_indices: torch.Tensor,
    extra_info: Optional[HiCacheStorageExtraInfo] = None,
) -> List[bool]:
    """Read the pages for ``keys`` into the host pool slots ``host_indices``.

    Runs three steps: prepare keys and destination buffers, issue the
    batched read, then post-process the outcome into per-page results.

    Args:
        keys: Logical page keys.
        host_indices: Host slot indices, ``page_size`` entries per page.
        extra_info: Accepted for interface compatibility; not used here.

    Returns:
        Per-page results as produced by ``_batch_get_postprocess``.
    """
    keys, values = self._batch_get_preprocess(keys, host_indices)
    results = self._batch_get(keys, values)
    return self._batch_get_postprocess(host_indices, values, results)
|
573
|
+
|
574
|
+
def _batch_set_preprocess(self, keys, host_indices):
    """Build the (keys, source buffers) pair for a batched write.

    One page is fetched from the host pool per ``page_size`` host indices,
    addressed by the first index of the page. Pages are requested flat
    unless the backend runs zero-copy. For zero-copy on a non-MLA model,
    keys and pages are additionally expanded into their ``-k`` / ``-v``
    halves.

    Args:
        keys: Logical page keys.
        host_indices: Host slot indices, ``page_size`` entries per page.

    Returns:
        Tuple ``(keys, values)`` ready to hand to the batched write.
    """
    page_size = self.mem_pool_host.page_size
    page_num = len(host_indices) // page_size
    flat = not self.is_zero_copy

    values = [
        self.mem_pool_host.get_data_page(host_indices[i * page_size], flat=flat)
        for i in range(page_num)
    ]

    if self.is_zero_copy and not self.is_mla_model:
        keys = self._get_mha_zero_copy_keys(keys)
        values = self._get_mha_zero_copy_values(values)

    return keys, values
|
590
|
+
|
591
|
+
def batch_set_v1(
    self,
    keys: List[str],
    host_indices: torch.Tensor,
    extra_info: Optional[HiCacheStorageExtraInfo] = None,
) -> List[bool]:
    """Write the host pool pages at ``host_indices`` under ``keys``.

    Prepares keys and source pages, then delegates to ``_batch_set`` and
    returns its result untouched.

    NOTE(review): ``_batch_set`` returns a bare ``True`` when
    ``skip_backup`` is set, and on the zero-copy non-MLA path the keys it
    receives are the expanded ``-k`` / ``-v`` keys. Callers should
    therefore not assume one list entry per input key; confirm what the
    callers expect.

    Args:
        keys: Logical page keys.
        host_indices: Host slot indices, ``page_size`` entries per page.
        extra_info: Accepted for interface compatibility; not used here.

    Returns:
        Whatever ``_batch_set`` reports for the prepared keys.
    """
    keys, values = self._batch_set_preprocess(keys, host_indices)
    return self._batch_set(keys, values)
|
601
|
+
|
602
|
+
# Deprecated
def get(
    self,
    key: str,
    target_location: Optional[Any] = None,
    target_sizes: Optional[Any] = None,
) -> torch.Tensor | None:
    """Deprecated no-op kept so the method still exists on the class.

    All arguments are ignored and nothing is read.

    Returns:
        Always ``None``.
    """
    return None
|
610
|
+
|
611
|
+
# Deprecated
def batch_get(
    self,
    keys: List[str],
    target_locations: Optional[Any] = None,
    target_sizes: Optional[Any] = None,
) -> List[torch.Tensor | None] | int:
    """Deprecated no-op kept so the method still exists on the class.

    All arguments are ignored and nothing is read. Note that the value
    actually returned is ``None``, not the annotated list/int.

    Returns:
        Always ``None``.
    """
    return None
|
619
|
+
|
620
|
+
# Deprecated
def set(
    self,
    key: str,
    value: Optional[Any] = None,
    target_location: Optional[Any] = None,
    target_sizes: Optional[Any] = None,
) -> bool:
    """Deprecated no-op kept so the method still exists on the class.

    All arguments are ignored and nothing is written. Note that the value
    actually returned is ``None`` (falsy), not the annotated ``bool``.

    Returns:
        Always ``None``.
    """
    return None
|
629
|
+
|
630
|
+
# Deprecated
def batch_set(
    self,
    keys: List[str],
    values: Optional[Any] = None,
    target_locations: Optional[Any] = None,
    target_sizes: Optional[Any] = None,
) -> bool:
    """Deprecated no-op kept so the method still exists on the class.

    All arguments are ignored and nothing is written. Note that the value
    actually returned is ``None`` (falsy), not the annotated ``bool``.

    Returns:
        Always ``None``.
    """
    return None
|