sglang 0.5.2rc1__py3-none-any.whl → 0.5.3__py3-none-any.whl
This diff compares two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- sglang/bench_one_batch.py +7 -9
- sglang/bench_one_batch_server.py +330 -31
- sglang/bench_serving.py +267 -32
- sglang/global_config.py +2 -2
- sglang/lang/backend/runtime_endpoint.py +1 -1
- sglang/lang/interpreter.py +1 -1
- sglang/launch_server.py +14 -0
- sglang/profiler.py +2 -2
- sglang/srt/batch_invariant_ops/__init__.py +27 -0
- sglang/srt/batch_invariant_ops/batch_invariant_ops.py +549 -0
- sglang/srt/configs/__init__.py +8 -0
- sglang/srt/configs/device_config.py +3 -1
- sglang/srt/configs/dots_ocr.py +64 -0
- sglang/srt/configs/dots_vlm.py +139 -0
- sglang/srt/configs/falcon_h1.py +360 -0
- sglang/srt/configs/internvl.py +6 -0
- sglang/srt/configs/load_config.py +9 -0
- sglang/srt/configs/model_config.py +181 -82
- sglang/srt/configs/qwen3_next.py +326 -0
- sglang/srt/configs/qwen3_vl.py +586 -0
- sglang/srt/connector/__init__.py +8 -1
- sglang/srt/connector/remote_instance.py +82 -0
- sglang/srt/constrained/base_grammar_backend.py +49 -12
- sglang/srt/constrained/llguidance_backend.py +0 -1
- sglang/srt/constrained/outlines_backend.py +0 -1
- sglang/srt/constrained/outlines_jump_forward.py +1 -1
- sglang/srt/constrained/xgrammar_backend.py +30 -9
- sglang/srt/custom_op.py +11 -1
- sglang/srt/debug_utils/dump_comparator.py +81 -44
- sglang/srt/debug_utils/dump_loader.py +97 -0
- sglang/srt/debug_utils/dumper.py +21 -6
- sglang/srt/debug_utils/text_comparator.py +73 -11
- sglang/srt/disaggregation/ascend/conn.py +2 -2
- sglang/srt/disaggregation/ascend/transfer_engine.py +47 -9
- sglang/srt/disaggregation/base/conn.py +1 -1
- sglang/srt/disaggregation/common/conn.py +279 -108
- sglang/srt/disaggregation/decode.py +71 -19
- sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +29 -17
- sglang/srt/disaggregation/fake/conn.py +1 -1
- sglang/srt/disaggregation/mini_lb.py +6 -445
- sglang/srt/disaggregation/mooncake/conn.py +55 -537
- sglang/srt/disaggregation/nixl/conn.py +326 -53
- sglang/srt/disaggregation/prefill.py +36 -17
- sglang/srt/disaggregation/utils.py +40 -54
- sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
- sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
- sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
- sglang/srt/distributed/parallel_state.py +192 -113
- sglang/srt/entrypoints/engine.py +59 -18
- sglang/srt/entrypoints/grpc_request_manager.py +855 -0
- sglang/srt/entrypoints/grpc_server.py +810 -0
- sglang/srt/entrypoints/http_server.py +132 -57
- sglang/srt/entrypoints/openai/protocol.py +115 -7
- sglang/srt/entrypoints/openai/serving_base.py +65 -3
- sglang/srt/entrypoints/openai/serving_chat.py +207 -58
- sglang/srt/entrypoints/openai/serving_completions.py +17 -4
- sglang/srt/entrypoints/openai/serving_embedding.py +10 -4
- sglang/srt/entrypoints/openai/serving_rerank.py +3 -1
- sglang/srt/entrypoints/openai/serving_responses.py +49 -4
- sglang/srt/entrypoints/openai/serving_score.py +1 -0
- sglang/srt/environ.py +285 -0
- sglang/srt/eplb/eplb_manager.py +2 -2
- sglang/srt/eplb/expert_distribution.py +26 -13
- sglang/srt/eplb/expert_location.py +38 -8
- sglang/srt/eplb/expert_location_updater.py +1 -1
- sglang/srt/function_call/base_format_detector.py +3 -6
- sglang/srt/function_call/ebnf_composer.py +11 -9
- sglang/srt/function_call/function_call_parser.py +9 -2
- sglang/srt/function_call/glm4_moe_detector.py +4 -4
- sglang/srt/function_call/gpt_oss_detector.py +24 -1
- sglang/srt/function_call/json_array_parser.py +63 -0
- sglang/srt/function_call/kimik2_detector.py +17 -4
- sglang/srt/function_call/qwen3_coder_detector.py +1 -1
- sglang/srt/function_call/utils.py +96 -5
- sglang/srt/grpc/__init__.py +1 -0
- sglang/srt/grpc/compile_proto.py +245 -0
- sglang/srt/grpc/sglang_scheduler_pb2.py +111 -0
- sglang/srt/grpc/sglang_scheduler_pb2.pyi +434 -0
- sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +239 -0
- sglang/srt/layers/activation.py +143 -9
- sglang/srt/layers/attention/aiter_backend.py +106 -82
- sglang/srt/layers/attention/ascend_backend.py +115 -9
- sglang/srt/layers/attention/attention_registry.py +206 -0
- sglang/srt/layers/attention/base_attn_backend.py +12 -3
- sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
- sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
- sglang/srt/layers/attention/fla/chunk.py +242 -0
- sglang/srt/layers/attention/fla/chunk_delta_h.py +314 -0
- sglang/srt/layers/attention/fla/chunk_o.py +178 -0
- sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +151 -0
- sglang/srt/layers/attention/fla/cumsum.py +300 -0
- sglang/srt/layers/attention/fla/fused_recurrent.py +640 -0
- sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +232 -0
- sglang/srt/layers/attention/fla/index.py +37 -0
- sglang/srt/layers/attention/fla/l2norm.py +150 -0
- sglang/srt/layers/attention/fla/layernorm_gated.py +326 -0
- sglang/srt/layers/attention/fla/op.py +66 -0
- sglang/srt/layers/attention/fla/solve_tril.py +465 -0
- sglang/srt/layers/attention/fla/utils.py +331 -0
- sglang/srt/layers/attention/fla/wy_fast.py +158 -0
- sglang/srt/layers/attention/flashattention_backend.py +41 -8
- sglang/srt/layers/attention/flashinfer_backend.py +118 -198
- sglang/srt/layers/attention/flashinfer_mla_backend.py +27 -27
- sglang/srt/layers/attention/flashmla_backend.py +7 -5
- sglang/srt/layers/attention/hybrid_attn_backend.py +68 -53
- sglang/srt/layers/attention/hybrid_linear_attn_backend.py +602 -0
- sglang/srt/layers/attention/intel_amx_backend.py +3 -0
- sglang/srt/layers/attention/mamba/causal_conv1d.py +129 -0
- sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +969 -0
- sglang/srt/layers/attention/mamba/mamba.py +629 -0
- sglang/srt/layers/attention/mamba/mamba_utils.py +81 -0
- sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
- sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
- sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
- sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +264 -0
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +622 -0
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +757 -0
- sglang/srt/layers/attention/mamba/ops/ssd_combined.py +262 -0
- sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +275 -0
- sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
- sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
- sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
- sglang/srt/layers/attention/nsa/nsa_indexer.py +761 -0
- sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
- sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
- sglang/srt/layers/attention/nsa/transform_index.py +144 -0
- sglang/srt/layers/attention/nsa/utils.py +24 -0
- sglang/srt/layers/attention/nsa_backend.py +887 -0
- sglang/srt/layers/attention/tbo_backend.py +6 -6
- sglang/srt/layers/attention/torch_flex_backend.py +325 -0
- sglang/srt/layers/attention/torch_native_backend.py +12 -6
- sglang/srt/layers/attention/triton_backend.py +57 -7
- sglang/srt/layers/attention/trtllm_mha_backend.py +5 -7
- sglang/srt/layers/attention/trtllm_mla_backend.py +276 -39
- sglang/srt/layers/attention/vision.py +58 -0
- sglang/srt/layers/attention/wave_backend.py +4 -4
- sglang/srt/layers/attention/wave_ops/decode_attention.py +2 -4
- sglang/srt/layers/attention/wave_ops/extend_attention.py +1 -3
- sglang/srt/layers/communicator.py +53 -7
- sglang/srt/layers/dp_attention.py +41 -2
- sglang/srt/layers/elementwise.py +3 -1
- sglang/srt/layers/layernorm.py +34 -15
- sglang/srt/layers/linear.py +55 -7
- sglang/srt/layers/logits_processor.py +44 -12
- sglang/srt/layers/moe/__init__.py +2 -1
- sglang/srt/layers/moe/cutlass_w4a8_moe.py +3 -3
- sglang/srt/layers/moe/ep_moe/kernels.py +2 -2
- sglang/srt/layers/moe/ep_moe/layer.py +256 -63
- sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +183 -0
- sglang/srt/layers/moe/fused_moe_native.py +5 -3
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=352,device_name=NVIDIA_RTX_5880_Ada_Generation,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=512,device_name=NVIDIA_H20.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/{E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json } +29 -29
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -2
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +7 -3
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +23 -20
- sglang/srt/layers/moe/fused_moe_triton/layer.py +71 -70
- sglang/srt/layers/moe/moe_runner/__init__.py +2 -1
- sglang/srt/layers/moe/moe_runner/base.py +274 -1
- sglang/srt/layers/moe/moe_runner/runner.py +80 -0
- sglang/srt/layers/moe/moe_runner/triton.py +448 -0
- sglang/srt/layers/moe/token_dispatcher/__init__.py +16 -4
- sglang/srt/layers/moe/token_dispatcher/{base_dispatcher.py → base.py} +67 -17
- sglang/srt/layers/moe/token_dispatcher/deepep.py +118 -56
- sglang/srt/layers/moe/token_dispatcher/standard.py +44 -2
- sglang/srt/layers/moe/topk.py +30 -9
- sglang/srt/layers/moe/utils.py +22 -7
- sglang/srt/layers/parameter.py +23 -6
- sglang/srt/layers/quantization/awq.py +19 -7
- sglang/srt/layers/quantization/base_config.py +11 -6
- sglang/srt/layers/quantization/blockwise_int8.py +38 -27
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +1 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +50 -30
- sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +2 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +13 -1
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
- sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +2 -10
- sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +27 -0
- sglang/srt/layers/quantization/fp8.py +78 -49
- sglang/srt/layers/quantization/fp8_utils.py +51 -32
- sglang/srt/layers/quantization/gptq.py +25 -17
- sglang/srt/layers/quantization/modelopt_quant.py +225 -57
- sglang/srt/layers/quantization/moe_wna16.py +21 -18
- sglang/srt/layers/quantization/mxfp4.py +77 -42
- sglang/srt/layers/quantization/quark/quark_moe.py +48 -30
- sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +49 -30
- sglang/srt/layers/quantization/quark/utils.py +97 -0
- sglang/srt/layers/quantization/rocm_mxfp4_utils.py +13 -0
- sglang/srt/layers/quantization/unquant.py +135 -47
- sglang/srt/layers/quantization/w4afp8.py +26 -17
- sglang/srt/layers/quantization/w8a8_fp8.py +35 -20
- sglang/srt/layers/quantization/w8a8_int8.py +91 -41
- sglang/srt/layers/rocm_linear_utils.py +44 -0
- sglang/srt/layers/rotary_embedding.py +78 -49
- sglang/srt/layers/sampler.py +213 -21
- sglang/srt/layers/utils.py +23 -0
- sglang/srt/lora/backend/base_backend.py +50 -8
- sglang/srt/lora/backend/chunked_backend.py +348 -0
- sglang/srt/lora/backend/triton_backend.py +99 -5
- sglang/srt/lora/layers.py +32 -0
- sglang/srt/lora/lora.py +8 -3
- sglang/srt/lora/lora_manager.py +52 -118
- sglang/srt/lora/mem_pool.py +25 -11
- sglang/srt/lora/triton_ops/__init__.py +4 -0
- sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
- sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +174 -0
- sglang/srt/lora/utils.py +22 -11
- sglang/srt/managers/async_dynamic_batch_tokenizer.py +170 -0
- sglang/srt/managers/cache_controller.py +215 -314
- sglang/srt/managers/data_parallel_controller.py +115 -80
- sglang/srt/managers/detokenizer_manager.py +19 -15
- sglang/srt/managers/disagg_service.py +46 -0
- sglang/srt/managers/io_struct.py +340 -109
- sglang/srt/managers/mm_utils.py +44 -6
- sglang/srt/managers/multi_tokenizer_mixin.py +358 -404
- sglang/srt/managers/multimodal_processor.py +1 -2
- sglang/srt/managers/overlap_utils.py +53 -0
- sglang/srt/managers/schedule_batch.py +240 -138
- sglang/srt/managers/schedule_policy.py +147 -19
- sglang/srt/managers/scheduler.py +501 -304
- sglang/srt/managers/scheduler_input_blocker.py +1 -1
- sglang/srt/managers/scheduler_metrics_mixin.py +119 -40
- sglang/srt/managers/scheduler_output_processor_mixin.py +75 -22
- sglang/srt/managers/scheduler_profiler_mixin.py +6 -6
- sglang/srt/managers/scheduler_update_weights_mixin.py +7 -0
- sglang/srt/managers/template_manager.py +3 -3
- sglang/srt/managers/tokenizer_communicator_mixin.py +675 -0
- sglang/srt/managers/tokenizer_manager.py +321 -632
- sglang/srt/managers/tp_worker.py +81 -22
- sglang/srt/managers/tp_worker_overlap_thread.py +71 -56
- sglang/srt/managers/utils.py +1 -45
- sglang/srt/mem_cache/allocator.py +15 -21
- sglang/srt/mem_cache/allocator_ascend.py +41 -27
- sglang/srt/mem_cache/base_prefix_cache.py +1 -1
- sglang/srt/mem_cache/chunk_cache.py +8 -1
- sglang/srt/mem_cache/evict_policy.py +23 -0
- sglang/srt/mem_cache/hicache_storage.py +58 -34
- sglang/srt/mem_cache/hiradix_cache.py +227 -80
- sglang/srt/mem_cache/memory_pool.py +535 -58
- sglang/srt/mem_cache/memory_pool_host.py +239 -223
- sglang/srt/mem_cache/radix_cache.py +222 -73
- sglang/srt/mem_cache/radix_cache_cpp.py +11 -8
- sglang/srt/mem_cache/storage/__init__.py +10 -0
- sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +151 -0
- sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +109 -0
- sglang/srt/mem_cache/storage/backend_factory.py +223 -0
- sglang/srt/mem_cache/storage/eic/eic_storage.py +778 -0
- sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
- sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +164 -0
- sglang/srt/mem_cache/storage/hf3fs/{client_hf3fs.py → hf3fs_usrbio_client.py} +5 -1
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +268 -63
- sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +284 -0
- sglang/srt/mem_cache/storage/lmcache/unit_test.py +121 -0
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +198 -30
- sglang/srt/mem_cache/storage/mooncake_store/test_mooncake_store.py +161 -0
- sglang/srt/mem_cache/swa_radix_cache.py +25 -36
- sglang/srt/metrics/collector.py +519 -132
- sglang/srt/metrics/func_timer.py +2 -7
- sglang/srt/metrics/startup_func_log_and_timer.py +150 -0
- sglang/srt/metrics/utils.py +55 -0
- sglang/srt/model_executor/cpu_graph_runner.py +640 -0
- sglang/srt/model_executor/cuda_graph_runner.py +52 -37
- sglang/srt/model_executor/forward_batch_info.py +98 -57
- sglang/srt/model_executor/model_runner.py +433 -158
- sglang/srt/model_executor/npu_graph_runner.py +12 -5
- sglang/srt/model_loader/__init__.py +9 -3
- sglang/srt/model_loader/loader.py +133 -5
- sglang/srt/model_loader/remote_instance_weight_loader_utils.py +69 -0
- sglang/srt/model_loader/weight_utils.py +158 -3
- sglang/srt/models/apertus.py +686 -0
- sglang/srt/models/bailing_moe.py +820 -217
- sglang/srt/models/bailing_moe_nextn.py +168 -0
- sglang/srt/models/deepseek_nextn.py +6 -1
- sglang/srt/models/deepseek_v2.py +833 -152
- sglang/srt/models/dots_ocr.py +173 -0
- sglang/srt/models/dots_vlm.py +174 -0
- sglang/srt/models/dots_vlm_vit.py +337 -0
- sglang/srt/models/ernie4.py +1 -1
- sglang/srt/models/falcon_h1.py +576 -0
- sglang/srt/models/gemma3_causal.py +0 -2
- sglang/srt/models/gemma3_mm.py +1 -1
- sglang/srt/models/gemma3n_mm.py +2 -2
- sglang/srt/models/glm4_moe.py +14 -5
- sglang/srt/models/glm4_moe_nextn.py +2 -2
- sglang/srt/models/glm4v.py +5 -3
- sglang/srt/models/glm4v_moe.py +4 -1
- sglang/srt/models/gpt_oss.py +8 -31
- sglang/srt/models/internvl.py +28 -0
- sglang/srt/models/kimi_vl_moonvit.py +2 -2
- sglang/srt/models/llama.py +4 -0
- sglang/srt/models/llama4.py +9 -0
- sglang/srt/models/llama_eagle3.py +13 -0
- sglang/srt/models/longcat_flash.py +3 -3
- sglang/srt/models/longcat_flash_nextn.py +1 -1
- sglang/srt/models/minicpmv.py +165 -3
- sglang/srt/models/mllama4.py +40 -4
- sglang/srt/models/opt.py +637 -0
- sglang/srt/models/qwen2_5_vl.py +29 -5
- sglang/srt/models/qwen2_audio.py +1 -1
- sglang/srt/models/qwen2_moe.py +124 -14
- sglang/srt/models/qwen2_vl.py +1 -1
- sglang/srt/models/qwen3.py +26 -5
- sglang/srt/models/qwen3_moe.py +71 -12
- sglang/srt/models/qwen3_next.py +1069 -0
- sglang/srt/models/qwen3_next_mtp.py +112 -0
- sglang/srt/models/qwen3_vl.py +787 -0
- sglang/srt/models/qwen3_vl_moe.py +471 -0
- sglang/srt/models/registry.py +15 -3
- sglang/srt/models/sarashina2_vision.py +269 -0
- sglang/srt/models/solar.py +505 -0
- sglang/srt/models/starcoder2.py +357 -0
- sglang/srt/models/step3_vl.py +1 -1
- sglang/srt/models/torch_native_llama.py +10 -3
- sglang/srt/models/utils.py +51 -0
- sglang/srt/multimodal/processors/base_processor.py +15 -7
- sglang/srt/multimodal/processors/dots_vlm.py +98 -0
- sglang/srt/multimodal/processors/glm4v.py +9 -9
- sglang/srt/multimodal/processors/internvl.py +153 -129
- sglang/srt/multimodal/processors/qwen_vl.py +23 -6
- sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
- sglang/srt/offloader.py +27 -3
- sglang/srt/{jinja_template_utils.py → parser/jinja_template_utils.py} +6 -0
- sglang/srt/{reasoning_parser.py → parser/reasoning_parser.py} +1 -1
- sglang/srt/sampling/sampling_batch_info.py +38 -17
- sglang/srt/sampling/sampling_params.py +7 -0
- sglang/srt/server_args.py +1030 -254
- sglang/srt/server_args_config_parser.py +146 -0
- sglang/srt/single_batch_overlap.py +151 -0
- sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
- sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
- sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
- sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
- sglang/srt/speculative/cpp_ngram/param.h +125 -0
- sglang/srt/speculative/cpp_ngram/queue.h +71 -0
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +7 -1
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +13 -2
- sglang/srt/speculative/{eagle_utils.py → eagle_info.py} +207 -757
- sglang/srt/speculative/eagle_worker.py +253 -136
- sglang/srt/speculative/ngram_utils.py +428 -0
- sglang/srt/speculative/ngram_worker.py +245 -0
- sglang/srt/speculative/spec_info.py +52 -0
- sglang/srt/speculative/spec_utils.py +606 -0
- sglang/srt/speculative/standalone_worker.py +109 -0
- sglang/srt/torch_memory_saver_adapter.py +5 -7
- sglang/srt/tracing/trace.py +578 -0
- sglang/srt/two_batch_overlap.py +8 -5
- sglang/srt/utils/__init__.py +2 -0
- sglang/srt/{utils.py → utils/common.py} +445 -77
- sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +53 -5
- sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
- sglang/srt/utils/rpd_utils.py +452 -0
- sglang/srt/utils/slow_rank_detector.py +71 -0
- sglang/srt/warmup.py +8 -4
- sglang/srt/weight_sync/utils.py +2 -2
- sglang/test/attention/test_trtllm_mla_backend.py +169 -5
- sglang/test/few_shot_gsm8k.py +1 -0
- sglang/test/get_logits_ut.py +57 -0
- sglang/test/run_eval.py +79 -11
- sglang/test/runners.py +5 -1
- sglang/test/simple_eval_common.py +5 -2
- sglang/test/simple_eval_mmmu_vlm.py +441 -0
- sglang/test/test_block_fp8.py +2 -2
- sglang/test/test_cutlass_moe.py +24 -6
- sglang/test/test_deterministic.py +297 -0
- sglang/test/test_disaggregation_utils.py +77 -0
- sglang/test/test_fp4_moe.py +370 -1
- sglang/test/test_programs.py +1 -1
- sglang/test/test_utils.py +383 -5
- sglang/utils.py +22 -1
- sglang/version.py +1 -1
- {sglang-0.5.2rc1.dist-info → sglang-0.5.3.dist-info}/METADATA +69 -124
- {sglang-0.5.2rc1.dist-info → sglang-0.5.3.dist-info}/RECORD +392 -258
- sglang/srt/disaggregation/launch_lb.py +0 -118
- sglang/srt/mem_cache/lora_radix_cache.py +0 -421
- sglang/srt/mem_cache/storage/mooncake_store/unit_test.py +0 -40
- /sglang/srt/{model_parallel.py → layers/model_parallel.py} +0 -0
- /sglang/srt/{code_completion_parser.py → parser/code_completion_parser.py} +0 -0
- /sglang/srt/{conversation.py → parser/conversation.py} +0 -0
- /sglang/srt/{harmony_parser.py → parser/harmony_parser.py} +0 -0
- /sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
- {sglang-0.5.2rc1.dist-info → sglang-0.5.3.dist-info}/WHEEL +0 -0
- {sglang-0.5.2rc1.dist-info → sglang-0.5.3.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.2rc1.dist-info → sglang-0.5.3.dist-info}/top_level.txt +0 -0
--- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py (0.5.2rc1)
+++ sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py (0.5.3)
@@ -1,4 +1,3 @@
-import hashlib
 import json
 import logging
 import os
@@ -6,15 +5,18 @@ import uuid
 from dataclasses import dataclass
 from typing import Any, List, Optional
 
-import numpy as np
 import torch
 
-from sglang.srt.
-
+from sglang.srt.mem_cache.hicache_storage import (
+    HiCacheStorage,
+    HiCacheStorageConfig,
+    HiCacheStorageExtraInfo,
+)
+from sglang.srt.mem_cache.memory_pool_host import HostKVCache
 
 DEFAULT_GLOBAL_SEGMENT_SIZE = 4 * 1024 * 1024 * 1024  # 4 GiB
 DEFAULT_LOCAL_BUFFER_SIZE = 16 * 1024 * 1024  # 16 MB
-
+DEFAULT_MOONCAKE_CONFIG_PATH_ENV = "SGLANG_HICACHE_MOONCAKE_CONFIG_PATH"
 logger = logging.getLogger(__name__)
 
 
@@ -31,13 +33,13 @@ class MooncakeStoreConfig:
     @staticmethod
     def from_file() -> "MooncakeStoreConfig":
         """Load the config from a JSON file."""
-        file_path = os.getenv(
-
-
-
-
-
-
+        file_path = os.getenv(DEFAULT_MOONCAKE_CONFIG_PATH_ENV)
+        try:
+            with open(file_path) as fin:
+                config = json.load(fin)
+        except Exception as e:
+            raise RuntimeError(f"Failed to load config from {file_path}: {str(e)}")
+
         return MooncakeStoreConfig(
             local_hostname=config.get("local_hostname"),
             metadata_server=config.get("metadata_server"),
@@ -75,6 +77,26 @@ class MooncakeStoreConfig:
             master_server_address=os.getenv("MOONCAKE_MASTER"),
         )
 
+    @staticmethod
+    def load_from_extra_config(extra_config: dict) -> "MooncakeStoreConfig":
+        """Load config from extra_config dictionary."""
+        if "master_server_address" not in extra_config:
+            raise ValueError("master_server_address is required in extra_config")
+
+        return MooncakeStoreConfig(
+            local_hostname=extra_config.get("local_hostname", "localhost"),
+            metadata_server=extra_config.get("metadata_server", "P2PHANDSHAKE"),
+            global_segment_size=extra_config.get(
+                "global_segment_size", DEFAULT_GLOBAL_SEGMENT_SIZE
+            ),
+            local_buffer_size=extra_config.get(
+                "local_buffer_size", DEFAULT_LOCAL_BUFFER_SIZE
+            ),
+            protocol=extra_config.get("protocol", "tcp"),
+            device_name=extra_config.get("device_name", "auto"),
+            master_server_address=extra_config["master_server_address"],
+        )
+
     def __post_init__(self):
         if self.device_name == "auto":
             os.environ["MC_MS_AUTO_DISC"] = "1"
@@ -84,6 +106,7 @@ class MooncakeStoreConfig:
 
 
 class MooncakeStore(HiCacheStorage):
+
     def __init__(self, storage_config: HiCacheStorageConfig = None):
         try:
             from mooncake.store import MooncakeDistributedStore
@@ -96,14 +119,43 @@ class MooncakeStore(HiCacheStorage):
 
         try:
             self.store = MooncakeDistributedStore()
-
-
+
+            extra_config = (
+                getattr(storage_config, "extra_config", None)
+                if storage_config
+                else None
+            )
+            # Load configuration with master_server_address prioritized from extra_config if available
+            if (
+                extra_config is not None
+                and extra_config.get("master_server_address") is not None
+            ):
+                # Load from extra_config
+                self.config = MooncakeStoreConfig.load_from_extra_config(extra_config)
+                logger.info(
+                    "Mooncake Configuration loaded from extra_config successfully."
+                )
+            elif os.getenv(DEFAULT_MOONCAKE_CONFIG_PATH_ENV):
+                # Load from config file
+                self.config = MooncakeStoreConfig.from_file()
+                logger.info("Mooncake Configuration loaded from file successfully.")
+            else:
+                # Load from environment variables
+                self.config = MooncakeStoreConfig.load_from_env()
+                logger.info("Mooncake Configuration loaded from env successfully.")
+
+            tp_scale_factor = 1 if storage_config is None else storage_config.tp_size
+
+            per_tp_global_segment_size = (
+                self.config.global_segment_size // tp_scale_factor
+            )
+            per_tp_local_buffer_size = self.config.local_buffer_size // tp_scale_factor
 
             ret_code = self.store.setup(
                 self.config.local_hostname,
                 self.config.metadata_server,
-
-
+                per_tp_global_segment_size,
+                per_tp_local_buffer_size,
                 self.config.protocol,
                 self.config.device_name,
                 self.config.master_server_address,
@@ -136,7 +188,13 @@ class MooncakeStore(HiCacheStorage):
         assert self.store.is_exist(warmup_key) == 1
         assert self.store.get(warmup_key) == warmup_value
 
-    def
+    def register_mem_pool_host(self, mem_pool_host: HostKVCache):
+        super().register_mem_pool_host(mem_pool_host)
+        assert self.mem_pool_host.layout in [
+            "page_first",
+            "page_first_direct",
+        ], "mooncake store storage backend only support page first or page first direct layout"
+        buffer = self.mem_pool_host.kv_buffer
         try:
             buffer_ptr = buffer.data_ptr()
             buffer_size = buffer.numel() * buffer.element_size()
@@ -147,6 +205,97 @@ class MooncakeStore(HiCacheStorage):
             logger.error("Failed to register buffer to Mooncake Store: %s", err)
             raise TypeError("Mooncake Store Register Buffer Error.") from err
 
+    def _get_mha_buffer_meta(self, keys, indices):
+        ptr_list, element_size_list = self.mem_pool_host.get_page_buffer_meta(indices)
+        key_list = []
+        for key_ in keys:
+            key_list.append(f"{key_}_{self.local_rank}_k")
+            key_list.append(f"{key_}_{self.local_rank}_v")
+        assert len(key_list) == len(ptr_list)
+        return key_list, ptr_list, element_size_list
+
+    def _get_mla_buffer_meta(self, keys, indices):
+        ptr_list, element_size_list = self.mem_pool_host.get_page_buffer_meta(indices)
+        key_list = []
+        for key_ in keys:
+            key_list.append(f"{key_}_k")
+        assert len(key_list) == len(ptr_list)
+        return key_list, ptr_list, element_size_list
+
+    def _batch_preprocess(self, keys, host_indices):
+        assert len(keys) > 0
+        assert len(keys) == len(host_indices) // self.mem_pool_host.page_size
+        if self.is_mla_backend:
+            return self._get_mla_buffer_meta(keys, host_indices)
+        else:
+            return self._get_mha_buffer_meta(keys, host_indices)
+
+    def _batch_postprocess(self, results: List[int], is_set_operate=False):
+        """
+        refer to https://github.com/kvcache-ai/Mooncake/blob/main/mooncake-store/include/pybind_client.h
+        for batch_get_into, results is Vector of integers,
+        where each element is the number of bytes read on success, or a negative value on error
+        for batch_put_from, results is Vector of integers,
+        where each element is 0 on success, or a negative value on error
+        """
+        if self.is_mla_backend:
+            return [k_res == 0 if is_set_operate else k_res > 0 for k_res in results]
+        else:
+            kv_pairs = zip(results[::2], results[1::2])
+            return [
+                (
+                    (k_res == 0 and v_res == 0)
+                    if is_set_operate
+                    else (k_res > 0 and v_res > 0)
+                )
+                for k_res, v_res in kv_pairs
+            ]
+
+    def batch_get_v1(
+        self,
+        keys: List[str],
+        host_indices: torch.Tensor,
+        extra_info: Optional[HiCacheStorageExtraInfo] = None,
+    ) -> List[bool]:
+        key_strs, buffer_ptrs, buffer_sizes = self._batch_preprocess(keys, host_indices)
+        get_results = self._get_batch_zero_copy_impl(
+            key_strs, buffer_ptrs, buffer_sizes
+        )
+        return self._batch_postprocess(get_results, is_set_operate=False)
+
+    def batch_set_v1(
+        self,
+        keys: List[str],
+        host_indices: torch.Tensor,
+        extra_info: Optional[HiCacheStorageExtraInfo] = None,
+    ) -> List[bool]:
+        key_strs, buffer_ptrs, buffer_sizes = self._batch_preprocess(keys, host_indices)
+        exist_result = self._batch_exist(key_strs)
+
+        set_keys = []
+        set_buffer_ptrs = []
+        set_buffer_sizes = []
+        set_indices = []
+        set_results = [-1] * len(key_strs)
+        for i in range(len(key_strs)):
+            if exist_result[i] != 1:
+                set_keys.append(key_strs[i])
+                set_buffer_ptrs.append(buffer_ptrs[i])
+                set_buffer_sizes.append(buffer_sizes[i])
+                set_indices.append(i)
+            else:
+                set_results[i] = 0
+
+        # Only set non-existing keys to storage
+        if len(set_keys) > 0:
+            put_results = self._put_batch_zero_copy_impl(
+                set_keys, set_buffer_ptrs, set_buffer_sizes
+            )
+            for i in range(len(set_indices)):
+                set_results[set_indices[i]] = put_results[i]
+
+        return self._batch_postprocess(set_results, is_set_operate=True)
+
     def set(
         self,
         key,
@@ -154,21 +303,36 @@ class MooncakeStore(HiCacheStorage):
         target_location: Optional[List[int]] = None,
         target_sizes: Optional[List[int]] = None,
     ) -> bool:
-
+        # Only support zero copy set for now
+        assert target_location is not None and target_sizes is not None
+        exist_result = self._batch_exist([key])
+        if exist_result[0] == 1:
+            return True
+        put_result = self._put_batch_zero_copy_impl(
+            [key], [target_location], [target_sizes]
+        )
+        return put_result[0] == 0
 
     def batch_set(
         self,
         keys: List[str],
         values: Optional[List[torch.Tensor]] = None,
-
+        target_locations: Optional[List[int]] = None,
         target_sizes: Optional[List[int]] = None,
     ) -> bool:
-
+        # Only support zero copy set for now
+        assert target_locations is not None and target_sizes is not None
+        assert len(keys) == len(target_locations) == len(target_sizes)
+
         if len(keys) == 0:
             return False
 
         for i in range(len(keys)):
-            if
+            if (
+                keys[i] is None
+                or target_locations[i] is None
+                or target_sizes[i] is None
+            ):
                 return False
 
         exist_result = self._batch_exist(keys)
@@ -179,7 +343,7 @@ class MooncakeStore(HiCacheStorage):
         for i in range(len(keys)):
             if exist_result[i] != 1:
                 set_keys.append(keys[i])
-                set_target_locations.append(
+                set_target_locations.append(target_locations[i])
                 set_target_sizes.append(target_sizes[i])
                 set_indices.append(i)
         # Only set non-existing keys to storage
@@ -204,18 +368,24 @@ class MooncakeStore(HiCacheStorage):
         target_location: Optional[Any] = None,
         target_sizes: Optional[Any] = None,
     ) -> bool:
-
+        assert target_location is not None and target_sizes is not None
+        get_result = self._get_batch_zero_copy_impl(
+            [key], [target_location], [target_sizes]
+        )
+        return get_result[0] >= 0
 
     def batch_get(
        self,
        keys: List[str],
-
+        target_locations: Optional[Any] = None,
        target_sizes: Optional[Any] = None,
    ) -> int:
-        assert len(keys) == len(
+        assert len(keys) == len(target_locations) == len(target_sizes)
        if len(keys) == 0:
            return 0
-        get_result = self._get_batch_zero_copy_impl(
+        get_result = self._get_batch_zero_copy_impl(
+            keys, target_locations, target_sizes
+        )
        if self.is_mla_backend:
            key_multiplier = 1
        else:
@@ -226,7 +396,8 @@ class MooncakeStore(HiCacheStorage):
            return len(keys) // key_multiplier
 
    def exists(self, key) -> bool:
-
+        exist_result = self._batch_exist([key])
+        return exist_result[0] == 1
 
    def batch_exists(self, keys) -> int:
        if self.is_mla_backend:
@@ -245,9 +416,6 @@ class MooncakeStore(HiCacheStorage):
                return i // key_multiplier
        return len(query_keys) // key_multiplier
 
-    def delete(self, key) -> None:
-        raise (NotImplementedError)
-
    def close(self):
        # MooncakeDistributedStore will automatically call the destructor, so
        # it is unnecessary to close it manually.
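The constructor changes above give `MooncakeStore` a three-way configuration precedence: an `extra_config` dict carried by the storage config (when it supplies `master_server_address`), then a JSON file pointed to by `SGLANG_HICACHE_MOONCAKE_CONFIG_PATH`, then plain environment variables; the configured segment and buffer sizes are then divided by the tensor-parallel size before `store.setup()`. Below is a minimal sketch of exercising the new `load_from_extra_config` helper on its own; the address and protocol values are placeholders, not taken from any tested deployment.

```python
# Hypothetical usage sketch (not from the package) of the config loader added
# in this release. Only "master_server_address" is required; every other key
# falls back to the defaults visible in load_from_extra_config above.
from sglang.srt.mem_cache.storage.mooncake_store.mooncake_store import (
    MooncakeStoreConfig,
)

extra_config = {
    "master_server_address": "10.0.0.1:50051",  # placeholder, required
    "protocol": "rdma",  # overrides the "tcp" default
}

config = MooncakeStoreConfig.load_from_extra_config(extra_config)
assert config.metadata_server == "P2PHANDSHAKE"  # default when unspecified
assert config.global_segment_size == 4 * 1024 * 1024 * 1024  # 4 GiB default
```

Keys omitted from the dict fall back to the defaults shown in the diff (localhost hostname, "P2PHANDSHAKE" metadata server, "tcp" protocol, "auto" device discovery, and the 4 GiB / 16 MB segment and buffer sizes).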
--- /dev/null
+++ sglang/srt/mem_cache/storage/mooncake_store/test_mooncake_store.py (0.5.3)
@@ -0,0 +1,161 @@
+import logging
+import uuid
+
+import torch
+from mooncake_store import MooncakeStore
+
+from sglang.srt.mem_cache.hicache_storage import HiCacheStorageConfig
+
+logging.basicConfig(
+    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
+)
+logger = logging.getLogger(__name__)
+
+
+def generate_batch_query_keys(kv_num: int, config: HiCacheStorageConfig):
+    keys = []
+    for _ in range(kv_num):
+        key = "test_" + str(uuid.uuid4())
+        keys.append(key)
+    set_keys = []
+    for key in keys:
+        if config.is_mla_model:
+            set_keys.append(key + "_k")
+        else:
+            set_keys.append(key + f"_{config.tp_rank}_k")
+            set_keys.append(key + f"_{config.tp_rank}_v")
+    get_keys = set_keys
+    exist_keys = keys
+    return set_keys, get_keys, exist_keys
+
+
+def test_single_operation():
+    """Test the set API with a single key-value pair."""
+    print("=" * 100)
+    print("Testing single operation")
+
+    buffer_size = 1024 * 1024 * 16  # 16MB
+    value_elements = 1024
+    store = MooncakeStore()
+    buffer = torch.randn(buffer_size, dtype=torch.float32)
+    store.register_buffer(buffer)
+    value_size = value_elements * buffer.element_size()
+
+    key = str(uuid.uuid4())
+    set_slice = buffer[:value_elements]
+    get_slice = buffer[value_elements : 2 * value_elements]
+    set_location = set_slice.data_ptr()
+    get_location = get_slice.data_ptr()
+
+    # Test set operation
+    result = store.set(key, target_location=set_location, target_sizes=value_size)
+    assert result is True, f"❌set operation failed for key: {key}"
+
+    # Test exists operation
+    assert store.exists(key), f"❌key {key} should exist after set operation"
+
+    # Test get operation
+    result = store.get(key, target_location=get_location, target_sizes=value_size)
+    assert result is True, f"❌get operation failed for key: {key}"
+
+    # Compare the data using proper tensor indices
+    assert torch.allclose(
+        set_slice, get_slice, atol=1e-6
+    ), f"❌get operation failed for key: {key}"
+
+    logger.info(f"✅ Single operation passed")
+
+
+def test_batch_operation(config: HiCacheStorageConfig):
+    """Test the batch set/get APIs with multiple key-value pairs."""
+    print("=" * 100)
+    print(f"Testing batch operation with config: {config}")
+
+    buffer_size = 1024 * 1024 * 16  # 16MB
+    value_elements = 256
+    kv_num = 13
+    store = MooncakeStore(config)
+    buffer = torch.randn(buffer_size, dtype=torch.float32)
+    store.register_buffer(buffer)
+    value_size = value_elements * buffer.element_size()
+
+    set_keys, get_keys, exist_keys = generate_batch_query_keys(kv_num, config)
+    set_slices = [
+        buffer[i * value_elements : (i + 1) * value_elements]
+        for i in range(len(set_keys))
+    ]
+    set_locations = [set_slice.data_ptr() for set_slice in set_slices]
+    target_sizes = [value_size for _ in range(len(set_keys))]
+
+    # Test batch set operation
+    result = store.batch_set(
+        set_keys, target_locations=set_locations, target_sizes=target_sizes
+    )
+    assert result is True, f"❌batch set operation failed"
+
+    # Test batch exists operation
+    assert store.batch_exists(
+        exist_keys
+    ), f"❌keys should exist after batch set operation"
+
+    # Test batch get operation
+    get_slices = [
+        buffer[
+            (len(set_keys) + i)
+            * value_elements : (len(set_keys) + i + 1)
+            * value_elements
+        ]
+        for i in range(len(get_keys))
+    ]
+    get_locations = [get_slice.data_ptr() for get_slice in get_slices]
+    result = store.batch_get(
+        get_keys, target_locations=get_locations, target_sizes=target_sizes
+    )
+    assert result == kv_num, f"❌batch get operation failed"
+    for i in range(len(get_keys)):
+        assert torch.allclose(
+            set_slices[i], get_slices[i], atol=1e-6
+        ), f"❌batch get operation failed for key: {get_keys[i]}"
+
+    logger.info(f"✅ Batch operation passed")
+
+
+if __name__ == "__main__":
+    test_single_operation()
+    test_batch_operation(
+        HiCacheStorageConfig(
+            is_mla_model=False,
+            tp_rank=0,
+            tp_size=1,
+            model_name=None,
+            is_page_first_layout=True,
+        )
+    )
+    test_batch_operation(
+        HiCacheStorageConfig(
+            is_mla_model=True,
+            tp_rank=0,
+            tp_size=1,
+            model_name=None,
+            is_page_first_layout=True,
+        )
+    )
+    test_batch_operation(
+        HiCacheStorageConfig(
+            is_mla_model=False,
+            tp_rank=1,
+            tp_size=4,
+            model_name=None,
+            is_page_first_layout=True,
+        )
+    )
+    test_batch_operation(
+        HiCacheStorageConfig(
+            is_mla_model=True,
+            tp_rank=3,
+            tp_size=8,
+            model_name=None,
+            is_page_first_layout=True,
+        )
+    )
+    logger.info(f"✅ All tests passed")
--- sglang/srt/mem_cache/swa_radix_cache.py (0.5.2rc1)
+++ sglang/srt/mem_cache/swa_radix_cache.py (0.5.3)
@@ -30,6 +30,12 @@ import torch
 from sglang.srt.mem_cache.allocator import SWATokenToKVPoolAllocator
 from sglang.srt.mem_cache.base_prefix_cache import BasePrefixCache, MatchResult
 from sglang.srt.mem_cache.memory_pool import ReqToTokenPool
+from sglang.srt.mem_cache.radix_cache import (
+    RadixKey,
+    _key_match_page_size1,
+    _key_match_paged,
+    get_child_key,
+)
 
 if TYPE_CHECKING:
     from sglang.srt.managers.schedule_batch import Req
@@ -47,7 +53,7 @@ class TreeNode:
     def __init__(self, id: Optional[int] = None):
         self.children = defaultdict(TreeNode)
         self.parent: TreeNode = None
-        self.key:
+        self.key: RadixKey = None
         self.value: Optional[torch.Tensor] = None
         # swa_tombstone is used to indicate the kv indices have been freed for swa layers
         self.swa_tombstone = False
@@ -60,8 +66,6 @@ class TreeNode:
         self.last_access_time = time.monotonic()
 
         self.hit_count = 0
-        # indicating the node is loading KV cache from host
-        self.loading = False
         # store the host indices of KV cache
         self.host_value = None
 
@@ -89,27 +93,6 @@ class TreeNode:
         return self.last_access_time < other.last_access_time
 
 
-def _key_match_page_size1(key0: List, key1: List):
-    i = 0
-    for k0, k1 in zip(key0, key1):
-        if k0 != k1:
-            break
-        i += 1
-    return i
-
-
-def _key_match_paged(key0: List, key1: List, page_size: int):
-    min_len = min(len(key0), len(key1))
-
-    i = 0
-    while i < min_len:
-        if key0[i : i + page_size] != key1[i : i + page_size]:
-            break
-        i += page_size
-
-    return i
-
-
 def gen_swa_uuid() -> int:
     TreeNode.swa_uuid_counter += 1
     return TreeNode.swa_uuid_counter
@@ -358,10 +341,10 @@ class SWARadixCache(BasePrefixCache):
 
         if self.page_size == 1:
             self.key_match_fn = _key_match_page_size1
-            self.get_child_key_fn =
+            self.get_child_key_fn = get_child_key
         else:
             self.key_match_fn = partial(_key_match_paged, page_size=page_size)
-            self.get_child_key_fn =
+            self.get_child_key_fn = partial(get_child_key, page_size=page_size)
 
         self.sliding_window_size = sliding_window_size
         self.reset()
@@ -382,10 +365,10 @@ class SWARadixCache(BasePrefixCache):
         self.full_lru_list = LRUList(swa=False)
         self.swa_lru_list = LRUList(swa=True)
 
-    def match_prefix(self, key:
+    def match_prefix(self, key: RadixKey, **kwargs) -> MatchResult:
         """Find the matching prefix from the radix tree.
         Args:
-            key: A
+            key: A RadixKey contains token IDs to find a matching prefix.
         Returns:
             A tuple of a tensor of matching prefix token IDs and
             the last node that contains the prefix values. Note that
@@ -419,12 +402,12 @@ class SWARadixCache(BasePrefixCache):
             last_host_node=last_node,
         )
 
-    def insert(self, key:
+    def insert(self, key: RadixKey, value=None, prev_prefix_len: int = 0) -> int:
         if self.disable:
             return 0
 
         if value is None:
-            value = [x for x in key]
+            value = torch.tensor([x for x in key.token_ids], dtype=torch.int64)
         return self._insert_helper(self.root_node, key, value, prev_prefix_len)
 
     def cache_finished_req(self, req: Req) -> None:
@@ -455,7 +438,7 @@ class SWARadixCache(BasePrefixCache):
         # insert the token_ids and kv_indices into the radix tree
         # Note: the insert function already frees the overlapped kv_indices
         new_prefix_len = self.insert(
-            token_ids[:page_aligned_len],
+            RadixKey(token_ids[:page_aligned_len], req.extra_key),
             page_aligned_kv_indices,
             len(req.prefix_indices),
         )
@@ -491,11 +474,15 @@ class SWARadixCache(BasePrefixCache):
         # Radix Cache takes one ref in memory pool
         # Note: the insert function already frees the overlapped kv_indices
         new_prefix_len = self.insert(
-            page_aligned_token_ids,
+            RadixKey(page_aligned_token_ids, req.extra_key),
+            page_aligned_kv_indices,
+            len(req.prefix_indices),
         )
 
         # The prefix indices could be updated, reuse it
-        new_indices, new_last_node, _, _ = self.match_prefix(
+        new_indices, new_last_node, _, _ = self.match_prefix(
+            RadixKey(page_aligned_token_ids, req.extra_key)
+        )
         assert len(req.prefix_indices) <= len(
             new_indices
         ), f"{req.prefix_indices=}, {new_indices=}"
@@ -734,7 +721,9 @@ class SWARadixCache(BasePrefixCache):
 
     ##### Internal Helper Functions #####
 
-    def _match_prefix_helper(
+    def _match_prefix_helper(
+        self, key: RadixKey
+    ) -> Tuple[List[torch.Tensor], TreeNode]:
         """
         SWA prefix matching helper. It factors in the sliding window size such that
         the matched node is guaranteed to either 1. connected to root without swa tombstone,
@@ -798,7 +787,7 @@ class SWARadixCache(BasePrefixCache):
 
         return value[:best_value_len], best_last_node
 
-    def _split_node(self, key:
+    def _split_node(self, key: RadixKey, child: TreeNode, split_len: int) -> TreeNode:
         # new_node -> child
         new_node = TreeNode()
         new_node.children = {self.get_child_key_fn(key[split_len:]): child}
@@ -833,7 +822,7 @@ class SWARadixCache(BasePrefixCache):
         return new_node
 
     def _insert_helper(
-        self, node: TreeNode, key:
+        self, node: TreeNode, key: RadixKey, value, update_kv_after_len: int
     ) -> int:
         # Update the last access time from root to leaf, so that
         # swa will tombstone the node closer to root first