PyPI - sglang - Versions diffs - 0.5.3rc2__py3-none-any.whl → 0.5.4.post1__py3-none-any.whl - Mend

sglang 0.5.3rc2py3-none-any.whl → 0.5.4.post1py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (419) hide show

sglang/bench_one_batch.py +47 -28
sglang/bench_one_batch_server.py +41 -25
sglang/bench_serving.py +378 -160
sglang/check_env.py +1 -1
sglang/compile_deep_gemm.py +6 -2
sglang/global_config.py +1 -25
sglang/lang/api.py +6 -0
sglang/lang/interpreter.py +1 -0
sglang/lang/ir.py +13 -0
sglang/launch_server.py +10 -15
sglang/profiler.py +18 -1
sglang/srt/_custom_ops.py +1 -1
sglang/srt/batch_invariant_ops/batch_invariant_ops.py +105 -10
sglang/srt/checkpoint_engine/checkpoint_engine_worker.py +142 -0
sglang/srt/compilation/backend.py +437 -0
sglang/srt/compilation/compilation_config.py +20 -0
sglang/srt/compilation/compilation_counter.py +47 -0
sglang/srt/compilation/compile.py +210 -0
sglang/srt/compilation/compiler_interface.py +503 -0
sglang/srt/compilation/cuda_piecewise_backend.py +228 -0
sglang/srt/compilation/fix_functionalization.py +134 -0
sglang/srt/compilation/fx_utils.py +83 -0
sglang/srt/compilation/inductor_pass.py +140 -0
sglang/srt/compilation/pass_manager.py +66 -0
sglang/srt/compilation/piecewise_context_manager.py +40 -0
sglang/srt/compilation/weak_ref_tensor_jit.py +16 -0
sglang/srt/configs/__init__.py +4 -0
sglang/srt/configs/deepseek_ocr.py +262 -0
sglang/srt/configs/deepseekvl2.py +194 -96
sglang/srt/configs/dots_vlm.py +2 -7
sglang/srt/configs/falcon_h1.py +13 -64
sglang/srt/configs/load_config.py +25 -2
sglang/srt/configs/mamba_utils.py +117 -0
sglang/srt/configs/model_config.py +136 -25
sglang/srt/configs/modelopt_config.py +30 -0
sglang/srt/configs/nemotron_h.py +286 -0
sglang/srt/configs/olmo3.py +105 -0
sglang/srt/configs/points_v15_chat.py +29 -0
sglang/srt/configs/qwen3_next.py +11 -47
sglang/srt/configs/qwen3_omni.py +613 -0
sglang/srt/configs/qwen3_vl.py +0 -10
sglang/srt/connector/remote_instance.py +1 -1
sglang/srt/constrained/base_grammar_backend.py +5 -1
sglang/srt/constrained/llguidance_backend.py +5 -0
sglang/srt/constrained/outlines_backend.py +1 -1
sglang/srt/constrained/reasoner_grammar_backend.py +9 -6
sglang/srt/constrained/utils.py +12 -0
sglang/srt/constrained/xgrammar_backend.py +20 -11
sglang/srt/disaggregation/ascend/transfer_engine.py +1 -1
sglang/srt/disaggregation/base/conn.py +17 -4
sglang/srt/disaggregation/common/conn.py +4 -2
sglang/srt/disaggregation/decode.py +123 -31
sglang/srt/disaggregation/decode_kvcache_offload_manager.py +1 -1
sglang/srt/disaggregation/fake/conn.py +11 -3
sglang/srt/disaggregation/mooncake/conn.py +157 -19
sglang/srt/disaggregation/nixl/conn.py +69 -24
sglang/srt/disaggregation/prefill.py +96 -270
sglang/srt/distributed/device_communicators/all_reduce_utils.py +4 -4
sglang/srt/distributed/device_communicators/custom_all_reduce.py +6 -6
sglang/srt/distributed/device_communicators/pymscclpp.py +2 -2
sglang/srt/distributed/device_communicators/pynccl.py +24 -12
sglang/srt/distributed/device_communicators/pynccl_allocator.py +2 -2
sglang/srt/distributed/device_communicators/symm_mem.py +1 -1
sglang/srt/distributed/naive_distributed.py +5 -4
sglang/srt/distributed/parallel_state.py +63 -19
sglang/srt/elastic_ep/elastic_ep.py +74 -0
sglang/srt/entrypoints/context.py +3 -2
sglang/srt/entrypoints/engine.py +83 -80
sglang/srt/entrypoints/grpc_server.py +430 -234
sglang/srt/entrypoints/harmony_utils.py +2 -2
sglang/srt/entrypoints/http_server.py +195 -102
sglang/srt/entrypoints/http_server_engine.py +1 -7
sglang/srt/entrypoints/openai/protocol.py +225 -37
sglang/srt/entrypoints/openai/serving_base.py +49 -2
sglang/srt/entrypoints/openai/serving_chat.py +29 -74
sglang/srt/entrypoints/openai/serving_classify.py +204 -0
sglang/srt/entrypoints/openai/serving_completions.py +15 -1
sglang/srt/entrypoints/openai/serving_responses.py +5 -2
sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
sglang/srt/environ.py +58 -6
sglang/srt/eplb/eplb_algorithms/__init__.py +18 -1
sglang/srt/eplb/eplb_algorithms/deepseek.py +0 -2
sglang/srt/eplb/eplb_algorithms/elasticity_aware.py +87 -0
sglang/srt/eplb/expert_distribution.py +33 -4
sglang/srt/eplb/expert_location_dispatch.py +2 -2
sglang/srt/eplb/expert_location_updater.py +2 -2
sglang/srt/function_call/base_format_detector.py +17 -18
sglang/srt/function_call/function_call_parser.py +20 -14
sglang/srt/function_call/glm4_moe_detector.py +1 -5
sglang/srt/function_call/gpt_oss_detector.py +1 -1
sglang/srt/function_call/json_array_parser.py +0 -2
sglang/srt/function_call/minimax_m2.py +367 -0
sglang/srt/function_call/utils.py +2 -2
sglang/srt/grpc/compile_proto.py +3 -3
sglang/srt/{entrypoints → grpc}/grpc_request_manager.py +112 -52
sglang/srt/grpc/health_servicer.py +189 -0
sglang/srt/grpc/scheduler_launcher.py +181 -0
sglang/srt/grpc/sglang_scheduler_pb2.py +78 -70
sglang/srt/grpc/sglang_scheduler_pb2.pyi +66 -10
sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +89 -1
sglang/srt/layers/activation.py +10 -1
sglang/srt/layers/attention/aiter_backend.py +3 -3
sglang/srt/layers/attention/ascend_backend.py +17 -1
sglang/srt/layers/attention/attention_registry.py +43 -23
sglang/srt/layers/attention/base_attn_backend.py +20 -1
sglang/srt/layers/attention/double_sparsity_backend.py +2 -2
sglang/srt/layers/attention/fla/chunk.py +0 -1
sglang/srt/layers/attention/fla/chunk_o.py +1 -1
sglang/srt/layers/attention/fla/index.py +0 -2
sglang/srt/layers/attention/fla/layernorm_gated.py +50 -32
sglang/srt/layers/attention/fla/utils.py +0 -3
sglang/srt/layers/attention/fla/wy_fast.py +0 -2
sglang/srt/layers/attention/flashattention_backend.py +24 -10
sglang/srt/layers/attention/flashinfer_backend.py +258 -22
sglang/srt/layers/attention/flashinfer_mla_backend.py +38 -28
sglang/srt/layers/attention/flashmla_backend.py +2 -2
sglang/srt/layers/attention/hybrid_attn_backend.py +1 -1
sglang/srt/layers/attention/hybrid_linear_attn_backend.py +165 -62
sglang/srt/layers/attention/intel_amx_backend.py +1 -1
sglang/srt/layers/attention/mamba/causal_conv1d.py +1 -1
sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +9 -5
sglang/srt/layers/attention/mamba/mamba.py +189 -241
sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +0 -50
sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +0 -60
sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +0 -111
sglang/srt/layers/attention/mamba/ops/ssd_combined.py +0 -1
sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +0 -11
sglang/srt/layers/attention/npu_ops/mla_preprocess.py +1 -1
sglang/srt/layers/attention/nsa/nsa_indexer.py +40 -83
sglang/srt/layers/attention/nsa/triton_kernel.py +136 -0
sglang/srt/layers/attention/nsa/utils.py +0 -1
sglang/srt/layers/attention/nsa_backend.py +404 -90
sglang/srt/layers/attention/triton_backend.py +208 -34
sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +2 -2
sglang/srt/layers/attention/triton_ops/extend_attention.py +539 -44
sglang/srt/layers/attention/trtllm_mha_backend.py +2 -2
sglang/srt/layers/attention/trtllm_mla_backend.py +362 -43
sglang/srt/layers/attention/utils.py +89 -7
sglang/srt/layers/attention/vision.py +3 -3
sglang/srt/layers/attention/xpu_backend.py +1028 -0
sglang/srt/layers/communicator.py +12 -7
sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/compile_utils.py +5 -9
sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/configurer.py +4 -3
sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/entrypoint.py +3 -3
sglang/srt/layers/dp_attention.py +17 -0
sglang/srt/layers/layernorm.py +64 -19
sglang/srt/layers/linear.py +9 -1
sglang/srt/layers/logits_processor.py +152 -17
sglang/srt/layers/modelopt_utils.py +11 -0
sglang/srt/layers/moe/cutlass_moe.py +0 -2
sglang/srt/layers/moe/cutlass_w4a8_moe.py +351 -21
sglang/srt/layers/moe/ep_moe/kernels.py +229 -457
sglang/srt/layers/moe/ep_moe/layer.py +154 -625
sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +1 -1
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_B200.json +146 -0
sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +11 -3
sglang/srt/layers/moe/fused_moe_triton/layer.py +79 -73
sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +25 -46
sglang/srt/layers/moe/moe_runner/deep_gemm.py +569 -0
sglang/srt/layers/moe/moe_runner/runner.py +6 -0
sglang/srt/layers/moe/moe_runner/triton.py +3 -1
sglang/srt/layers/moe/moe_runner/triton_kernels.py +194 -0
sglang/srt/layers/moe/rocm_moe_utils.py +0 -1
sglang/srt/layers/moe/router.py +51 -15
sglang/srt/layers/moe/token_dispatcher/__init__.py +14 -4
sglang/srt/layers/moe/token_dispatcher/base.py +12 -6
sglang/srt/layers/moe/token_dispatcher/deepep.py +127 -110
sglang/srt/layers/moe/token_dispatcher/mooncake.py +386 -0
sglang/srt/layers/moe/token_dispatcher/standard.py +46 -0
sglang/srt/layers/moe/topk.py +7 -6
sglang/srt/layers/moe/utils.py +20 -5
sglang/srt/layers/quantization/__init__.py +5 -58
sglang/srt/layers/quantization/awq.py +183 -9
sglang/srt/layers/quantization/awq_triton.py +29 -0
sglang/srt/layers/quantization/base_config.py +27 -1
sglang/srt/layers/quantization/compressed_tensors/__init__.py +7 -0
sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +20 -49
sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +421 -70
sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +3 -0
sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +4 -22
sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +339 -0
sglang/srt/layers/quantization/fp8.py +152 -81
sglang/srt/layers/quantization/fp8_kernel.py +55 -10
sglang/srt/layers/quantization/fp8_utils.py +42 -14
sglang/srt/layers/quantization/fpgemm_fp8.py +2 -3
sglang/srt/layers/quantization/gguf.py +566 -0
sglang/srt/layers/quantization/gptq.py +0 -1
sglang/srt/layers/quantization/int8_kernel.py +18 -2
sglang/srt/layers/quantization/marlin_utils.py +12 -0
sglang/srt/layers/quantization/modelopt_quant.py +125 -100
sglang/srt/layers/quantization/mxfp4.py +35 -68
sglang/srt/layers/quantization/petit.py +1 -1
sglang/srt/layers/quantization/quark/quark.py +3 -1
sglang/srt/layers/quantization/quark/quark_moe.py +3 -3
sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +0 -7
sglang/srt/layers/quantization/unquant.py +23 -48
sglang/srt/layers/quantization/utils.py +0 -1
sglang/srt/layers/quantization/w4afp8.py +87 -20
sglang/srt/layers/quantization/w8a8_int8.py +30 -24
sglang/srt/layers/radix_attention.py +62 -9
sglang/srt/layers/rotary_embedding.py +686 -17
sglang/srt/layers/sampler.py +47 -16
sglang/srt/layers/sparse_pooler.py +98 -0
sglang/srt/layers/utils.py +0 -1
sglang/srt/layers/vocab_parallel_embedding.py +4 -1
sglang/srt/lora/backend/triton_backend.py +0 -1
sglang/srt/lora/eviction_policy.py +139 -0
sglang/srt/lora/lora_manager.py +24 -9
sglang/srt/lora/lora_registry.py +1 -1
sglang/srt/lora/mem_pool.py +40 -16
sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +1 -1
sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +4 -2
sglang/srt/managers/cache_controller.py +48 -17
sglang/srt/managers/data_parallel_controller.py +146 -42
sglang/srt/managers/detokenizer_manager.py +40 -13
sglang/srt/managers/io_struct.py +69 -16
sglang/srt/managers/mm_utils.py +20 -18
sglang/srt/managers/multi_tokenizer_mixin.py +83 -82
sglang/srt/managers/overlap_utils.py +96 -19
sglang/srt/managers/schedule_batch.py +241 -511
sglang/srt/managers/schedule_policy.py +15 -2
sglang/srt/managers/scheduler.py +420 -514
sglang/srt/managers/scheduler_metrics_mixin.py +73 -18
sglang/srt/managers/scheduler_output_processor_mixin.py +317 -111
sglang/srt/managers/scheduler_pp_mixin.py +341 -0
sglang/srt/managers/scheduler_profiler_mixin.py +60 -14
sglang/srt/managers/scheduler_runtime_checker_mixin.py +217 -0
sglang/srt/managers/scheduler_update_weights_mixin.py +33 -14
sglang/srt/managers/tokenizer_communicator_mixin.py +71 -55
sglang/srt/managers/tokenizer_manager.py +375 -95
sglang/srt/managers/tp_worker.py +212 -161
sglang/srt/managers/utils.py +78 -2
sglang/srt/mem_cache/allocator.py +7 -2
sglang/srt/mem_cache/allocator_ascend.py +2 -2
sglang/srt/mem_cache/base_prefix_cache.py +2 -2
sglang/srt/mem_cache/chunk_cache.py +13 -2
sglang/srt/mem_cache/common.py +480 -0
sglang/srt/mem_cache/evict_policy.py +16 -1
sglang/srt/mem_cache/hicache_storage.py +11 -2
sglang/srt/mem_cache/hiradix_cache.py +16 -3
sglang/srt/mem_cache/mamba_radix_cache.py +993 -0
sglang/srt/mem_cache/memory_pool.py +517 -219
sglang/srt/mem_cache/memory_pool_host.py +0 -1
sglang/srt/mem_cache/multimodal_cache.py +0 -1
sglang/srt/mem_cache/radix_cache.py +53 -19
sglang/srt/mem_cache/radix_cache_cpp.py +19 -14
sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +8 -2
sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +1 -13
sglang/srt/mem_cache/storage/backend_factory.py +2 -2
sglang/srt/mem_cache/storage/eic/eic_storage.py +5 -6
sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +0 -1
sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +3 -2
sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +9 -3
sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +5 -3
sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +101 -17
sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +38 -9
sglang/srt/mem_cache/storage/nixl/nixl_utils.py +1 -1
sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +17 -2
sglang/srt/mem_cache/swa_radix_cache.py +92 -26
sglang/srt/metrics/collector.py +31 -0
sglang/srt/metrics/func_timer.py +1 -1
sglang/srt/model_executor/cuda_graph_runner.py +43 -5
sglang/srt/model_executor/forward_batch_info.py +71 -25
sglang/srt/model_executor/model_runner.py +362 -270
sglang/srt/model_executor/npu_graph_runner.py +2 -3
sglang/srt/model_executor/piecewise_cuda_graph_runner.py +549 -0
sglang/srt/model_loader/__init__.py +1 -1
sglang/srt/model_loader/loader.py +424 -27
sglang/srt/model_loader/utils.py +0 -1
sglang/srt/model_loader/weight_utils.py +47 -28
sglang/srt/models/apertus.py +2 -3
sglang/srt/models/arcee.py +2 -2
sglang/srt/models/bailing_moe.py +13 -52
sglang/srt/models/bailing_moe_nextn.py +3 -4
sglang/srt/models/bert.py +1 -1
sglang/srt/models/deepseek_nextn.py +19 -3
sglang/srt/models/deepseek_ocr.py +1516 -0
sglang/srt/models/deepseek_v2.py +418 -140
sglang/srt/models/dots_ocr.py +0 -2
sglang/srt/models/dots_vlm.py +0 -1
sglang/srt/models/dots_vlm_vit.py +1 -1
sglang/srt/models/falcon_h1.py +13 -19
sglang/srt/models/gemma3_mm.py +16 -0
sglang/srt/models/gemma3n_mm.py +1 -2
sglang/srt/models/glm4_moe.py +327 -382
sglang/srt/models/glm4_moe_nextn.py +6 -16
sglang/srt/models/glm4v.py +2 -1
sglang/srt/models/glm4v_moe.py +32 -199
sglang/srt/models/gpt_oss.py +5 -5
sglang/srt/models/grok.py +10 -23
sglang/srt/models/hunyuan.py +2 -7
sglang/srt/models/interns1.py +0 -1
sglang/srt/models/kimi_vl.py +1 -7
sglang/srt/models/kimi_vl_moonvit.py +3 -1
sglang/srt/models/llama.py +2 -2
sglang/srt/models/llama_eagle3.py +1 -1
sglang/srt/models/longcat_flash.py +5 -22
sglang/srt/models/longcat_flash_nextn.py +3 -14
sglang/srt/models/mimo.py +2 -13
sglang/srt/models/mimo_mtp.py +1 -2
sglang/srt/models/minicpmo.py +7 -5
sglang/srt/models/minimax_m2.py +922 -0
sglang/srt/models/mixtral.py +1 -4
sglang/srt/models/mllama.py +1 -1
sglang/srt/models/mllama4.py +13 -3
sglang/srt/models/nemotron_h.py +511 -0
sglang/srt/models/nvila.py +355 -0
sglang/srt/models/nvila_lite.py +184 -0
sglang/srt/models/olmo2.py +31 -4
sglang/srt/models/opt.py +5 -5
sglang/srt/models/phi.py +1 -1
sglang/srt/models/phi4mm.py +1 -1
sglang/srt/models/phimoe.py +0 -1
sglang/srt/models/pixtral.py +0 -3
sglang/srt/models/points_v15_chat.py +186 -0
sglang/srt/models/qwen.py +0 -1
sglang/srt/models/qwen2.py +22 -1
sglang/srt/models/qwen2_5_vl.py +3 -3
sglang/srt/models/qwen2_audio.py +2 -15
sglang/srt/models/qwen2_moe.py +15 -12
sglang/srt/models/qwen2_vl.py +5 -2
sglang/srt/models/qwen3.py +34 -4
sglang/srt/models/qwen3_moe.py +19 -37
sglang/srt/models/qwen3_next.py +7 -12
sglang/srt/models/qwen3_next_mtp.py +3 -4
sglang/srt/models/qwen3_omni_moe.py +661 -0
sglang/srt/models/qwen3_vl.py +37 -33
sglang/srt/models/qwen3_vl_moe.py +57 -185
sglang/srt/models/roberta.py +55 -3
sglang/srt/models/sarashina2_vision.py +0 -1
sglang/srt/models/step3_vl.py +3 -5
sglang/srt/models/utils.py +11 -1
sglang/srt/multimodal/processors/base_processor.py +7 -2
sglang/srt/multimodal/processors/deepseek_ocr.py +37 -0
sglang/srt/multimodal/processors/deepseek_vl_v2.py +0 -3
sglang/srt/multimodal/processors/dots_vlm.py +0 -1
sglang/srt/multimodal/processors/glm4v.py +2 -6
sglang/srt/multimodal/processors/internvl.py +0 -2
sglang/srt/multimodal/processors/janus_pro.py +0 -1
sglang/srt/multimodal/processors/mllama4.py +0 -8
sglang/srt/multimodal/processors/{vila.py → nvila.py} +32 -24
sglang/srt/multimodal/processors/phi4mm.py +0 -1
sglang/srt/multimodal/processors/points_v15_chat.py +52 -0
sglang/srt/multimodal/processors/qwen_vl.py +75 -16
sglang/srt/multimodal/processors/step3_vl.py +1 -1
sglang/srt/parser/conversation.py +41 -0
sglang/srt/parser/reasoning_parser.py +28 -2
sglang/srt/sampling/custom_logit_processor.py +77 -2
sglang/srt/sampling/sampling_batch_info.py +17 -22
sglang/srt/sampling/sampling_params.py +70 -2
sglang/srt/server_args.py +846 -163
sglang/srt/server_args_config_parser.py +1 -1
sglang/srt/single_batch_overlap.py +36 -31
sglang/srt/speculative/base_spec_worker.py +34 -0
sglang/srt/speculative/draft_utils.py +226 -0
sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +24 -7
sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +23 -2
sglang/srt/speculative/eagle_info.py +57 -18
sglang/srt/speculative/eagle_info_v2.py +458 -0
sglang/srt/speculative/eagle_utils.py +138 -0
sglang/srt/speculative/eagle_worker.py +83 -280
sglang/srt/speculative/eagle_worker_v2.py +702 -0
sglang/srt/speculative/{ngram_utils.py → ngram_info.py} +14 -9
sglang/srt/speculative/ngram_worker.py +12 -11
sglang/srt/speculative/spec_info.py +2 -0
sglang/srt/speculative/spec_utils.py +38 -3
sglang/srt/speculative/standalone_worker.py +4 -14
sglang/srt/tokenizer/tiktoken_tokenizer.py +2 -2
sglang/srt/two_batch_overlap.py +28 -14
sglang/srt/utils/__init__.py +1 -1
sglang/srt/{bench_utils.py → utils/bench_utils.py} +4 -2
sglang/srt/utils/common.py +272 -82
sglang/srt/utils/hf_transformers_utils.py +44 -17
sglang/srt/{host_shared_memory.py → utils/host_shared_memory.py} +0 -1
sglang/srt/{offloader.py → utils/offloader.py} +4 -4
sglang/srt/utils/profile_merger.py +199 -0
sglang/test/attention/test_flashattn_backend.py +1 -1
sglang/test/attention/test_flashattn_mla_backend.py +0 -1
sglang/test/attention/test_prefix_chunk_info.py +0 -2
sglang/test/attention/test_trtllm_mla_backend.py +221 -53
sglang/test/few_shot_gsm8k_engine.py +2 -4
sglang/test/kit_matched_stop.py +157 -0
sglang/test/longbench_v2/__init__.py +1 -0
sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
sglang/test/run_eval.py +41 -0
sglang/test/runners.py +2 -0
sglang/test/send_one.py +42 -7
sglang/test/simple_eval_common.py +3 -0
sglang/test/simple_eval_gpqa.py +0 -1
sglang/test/simple_eval_humaneval.py +0 -3
sglang/test/simple_eval_longbench_v2.py +344 -0
sglang/test/test_block_fp8.py +1 -2
sglang/test/test_block_fp8_deep_gemm_blackwell.py +0 -1
sglang/test/test_cutlass_moe.py +1 -2
sglang/test/test_cutlass_w4a8_moe.py +10 -20
sglang/test/test_deterministic.py +463 -107
sglang/test/test_deterministic_utils.py +74 -0
sglang/test/test_disaggregation_utils.py +81 -0
sglang/test/test_marlin_moe.py +0 -1
sglang/test/test_utils.py +85 -20
sglang/version.py +1 -1
{sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/METADATA +48 -35
{sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/RECORD +414 -350
sglang/srt/layers/attention/mamba/mamba_utils.py +0 -81
sglang/srt/managers/tp_worker_overlap_thread.py +0 -311
sglang/srt/models/vila.py +0 -306
sglang/srt/speculative/build_eagle_tree.py +0 -427
sglang/test/test_block_fp8_ep.py +0 -358
/sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/__init__.py +0 -0
/sglang/srt/{aio_rwlock.py → utils/aio_rwlock.py} +0 -0
/sglang/srt/{torch_memory_saver_adapter.py → utils/torch_memory_saver_adapter.py} +0 -0
{sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/WHEEL +0 -0
{sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/licenses/LICENSE +0 -0
{sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/top_level.txt +0 -0

sglang/srt/mem_cache/memory_pool_host.py CHANGED Viewed

@@ -1,7 +1,6 @@
 import abc
 import logging
 import threading
-from enum import IntEnum
 from functools import wraps
 from typing import Optional

sglang/srt/mem_cache/multimodal_cache.py CHANGED Viewed

@@ -1,6 +1,5 @@
 import logging
 from collections import OrderedDict
-from typing import Dict
 import torch

sglang/srt/mem_cache/radix_cache.py CHANGED Viewed

@@ -22,8 +22,8 @@ The radix tree data structure for managing the KV cache.
 import heapq
 import time
 from collections import defaultdict
-from functools import partial
-from typing import TYPE_CHECKING, Any, Iterator, List, Optional, Tuple, Union
+from functools import lru_cache, partial
+from typing import TYPE_CHECKING, Iterator, List, Optional, Tuple, Union
 import torch
@@ -34,7 +34,14 @@ from sglang.srt.disaggregation.kv_events import (
 )
 from sglang.srt.mem_cache.allocator import BaseTokenToKVPoolAllocator
 from sglang.srt.mem_cache.base_prefix_cache import BasePrefixCache, MatchResult
-from sglang.srt.mem_cache.evict_policy import EvictionStrategy, LFUStrategy, LRUStrategy
+from sglang.srt.mem_cache.evict_policy import (
+    EvictionStrategy,
+    FIFOStrategy,
+    FILOStrategy,
+    LFUStrategy,
+    LRUStrategy,
+    MRUStrategy,
+)
 from sglang.srt.mem_cache.memory_pool import ReqToTokenPool
 if TYPE_CHECKING:
@@ -76,6 +83,7 @@ class TreeNode:
         self.value: Optional[torch.Tensor] = None
         self.lock_ref = 0
         self.last_access_time = time.monotonic()
+        self.creation_time = time.monotonic()
         self.hit_count = 0
         # indicating the node is locked to protect from eviction
@@ -114,6 +122,13 @@ class TreeNode:
             return None
         return self.hash_value[-1]
+    @lru_cache(maxsize=1)
+    def get_prefix_hash_values(self, node: TreeNode) -> List[str]:
+        if node is None or node.hash_value is None:
+            return []
+        return node.get_prefix_hash_values(node.parent) + node.hash_value
     def __lt__(self, other: "TreeNode"):
         return self.last_access_time < other.last_access_time
@@ -209,9 +224,15 @@ class RadixCache(BasePrefixCache):
             self.eviction_strategy: EvictionStrategy = LRUStrategy()
         elif eviction_policy.lower() == "lfu":
             self.eviction_strategy: EvictionStrategy = LFUStrategy()
+        elif eviction_policy.lower() == "fifo":
+            self.eviction_strategy: EvictionStrategy = FIFOStrategy()
+        elif eviction_policy.lower() == "mru":
+            self.eviction_strategy: EvictionStrategy = MRUStrategy()
+        elif eviction_policy.lower() == "filo":
+            self.eviction_strategy: EvictionStrategy = FILOStrategy()
         else:
             raise ValueError(
-                f"Unknown eviction policy: {eviction_policy}. Supported policies: 'lru', 'lfu'."
+                f"Unknown eviction policy: {eviction_policy}. Supported policies: 'lru', 'lfu', 'fifo', 'mru', 'filo'."
             )
         self.reset()
@@ -314,18 +335,20 @@ class RadixCache(BasePrefixCache):
         return self._insert_helper(self.root_node, key, value)
-    def cache_finished_req(self, req: Req):
+    def cache_finished_req(self, req: Req, is_insert: bool = True):
         """Cache request when it finishes."""
+        all_token_len = len(req.origin_input_ids) + max(len(req.output_ids) - 1, 0)
         if self.disable:
             kv_indices = self.req_to_token_pool.req_to_token[
-                req.req_pool_idx, : len(req.origin_input_ids) + len(req.output_ids) - 1
+                req.req_pool_idx, :all_token_len
             ]
             self.token_to_kv_pool_allocator.free(kv_indices)
             self.req_to_token_pool.free(req.req_pool_idx)
             return
-        token_ids = (req.origin_input_ids + req.output_ids)[:-1]
-        all_token_len = len(token_ids)
+        token_ids = (req.origin_input_ids + req.output_ids)[:all_token_len]
+        # For EAGLE radix cache, we will convert the key to bigram key, e.g. [1,2,3,4] -> [(1,2), (2,3), (3,4)], the length will -1. ((len([(1,2), (2,3), (3,4)]) = len([1,2,3,4]) - 1))
+        # So for the corresponding kv length should also -1. Then we get the actual_kv_len, and use it to do later calculation and slicing.
         actual_kv_len = all_token_len - 1 if self.is_eagle else all_token_len
         kv_indices = self.req_to_token_pool.req_to_token[
             req.req_pool_idx, :all_token_len
@@ -336,12 +359,9 @@ class RadixCache(BasePrefixCache):
             page_aligned_kv_indices = kv_indices[:page_aligned_len].to(
                 dtype=torch.int64, copy=True
             )
-            self.token_to_kv_pool_allocator.free(kv_indices[page_aligned_len:])
         else:
             page_aligned_len = actual_kv_len
             page_aligned_kv_indices = kv_indices.to(dtype=torch.int64, copy=True)
-            if self.is_eagle:
-                self.token_to_kv_pool_allocator.free(kv_indices[page_aligned_len:])
         page_aligned_token_len = (
             page_aligned_len + 1 if self.is_eagle else page_aligned_len
@@ -349,15 +369,27 @@ class RadixCache(BasePrefixCache):
         old_prefix_len = len(req.prefix_indices)
         if self.is_eagle and old_prefix_len > req.last_matched_prefix_len:
-            # prefix_indices attached partial part (for page_size > 1) and one unmatched token (for EAGLE)
+            # In EAGLE chunked prefill case, the prefix_indices included one unmatched token (kv_indices[actual_kv_len:])
+            # Here we -1 to make sure the kv of the unmatched token can be freed correctly to avoid memory leak
             old_prefix_len -= 1
         # Radix Cache takes one ref in memory pool
-        new_prefix_len = self.insert(
-            RadixKey(token_ids[:page_aligned_token_len], req.extra_key),
-            page_aligned_kv_indices,
-        )
-        self.token_to_kv_pool_allocator.free(kv_indices[old_prefix_len:new_prefix_len])
+        if is_insert:
+            new_prefix_len = self.insert(
+                RadixKey(token_ids[:page_aligned_token_len], req.extra_key),
+                page_aligned_kv_indices,
+            )
+            # Free the duplicates that were already in the tree
+            self.token_to_kv_pool_allocator.free(
+                kv_indices[old_prefix_len:new_prefix_len]
+            )
+        else:
+            self.token_to_kv_pool_allocator.free(
+                kv_indices[old_prefix_len:page_aligned_len]
+            )
+        # free the unaligned tail
+        self.token_to_kv_pool_allocator.free(kv_indices[page_aligned_len:])
         # Remove req slot release the cache lock
         self.req_to_token_pool.free(req.req_pool_idx)
@@ -370,7 +402,8 @@ class RadixCache(BasePrefixCache):
         token_ids = req.fill_ids
         all_token_len = len(token_ids)
-        # The actual kv len for EAGLE is len(token_ids), since EAGLE uses bigram key
+        # For EAGLE radix cache, we will convert the key to bigram key, e.g. [1,2,3,4] -> [(1,2), (2,3), (3,4)], the length will -1. ((len([(1,2), (2,3), (3,4)]) = len([1,2,3,4]) - 1))
+        # So for the corresponding kv length should also -1. Then we get the actual_kv_len, and use it to do later calculation and slicing.
         actual_kv_len = all_token_len - 1 if self.is_eagle else all_token_len
         kv_indices = self.req_to_token_pool.req_to_token[
             req.req_pool_idx, :all_token_len
@@ -393,7 +426,8 @@ class RadixCache(BasePrefixCache):
         old_prefix_len = len(req.prefix_indices)
         if self.is_eagle and old_prefix_len > req.last_matched_prefix_len:
-            # prefix_indices attached partial part (for page_size > 1) and one unmatched token (for EAGLE)
+            # In EAGLE chunked prefill case, the prefix_indices included one unmatched token (kv_indices[actual_kv_len:])
+            # Here we -1 to make sure the kv of the unmatched token can be freed correctly to avoid memory leak
             old_prefix_len -= 1
         # Radix Cache takes one ref in memory pool

sglang/srt/mem_cache/radix_cache_cpp.py CHANGED Viewed

@@ -151,32 +151,37 @@ class RadixCacheCpp(BasePrefixCache):
     def total_size(self):
         return self.tree.total_size()
-    def cache_finished_req(self, req: Req):
+    def cache_finished_req(self, req: Req, is_insert: bool = True):
         """Cache request when it finishes."""
         assert req.req_pool_idx is not None
-        token_ids = (req.origin_input_ids + req.output_ids)[:-1]
+        all_token_len = len(req.origin_input_ids) + max(len(req.output_ids) - 1, 0)
+        token_ids = (req.origin_input_ids + req.output_ids)[:all_token_len]
         overall_len = len(token_ids)  # prefill + decode
         kv_indices = self.req_to_token_pool.req_to_token[req.req_pool_idx, :overall_len]
         # NOTE: our C++ implementation don't need `token_ids` and `kv_indices` to be page-aligned
         # it will automatically align them, but length of them should be equal
         old_prefix_len = len(req.prefix_indices) // self.page_size * self.page_size
-        new_prefix_len = self._insert(RadixKey(token_ids, req.extra_key), kv_indices)
+        page_aligned_overall_len = overall_len // self.page_size * self.page_size
-        # NOTE: kv_indices[:old_prefix_len] == req.prefix_indices
-        assert old_prefix_len <= new_prefix_len, "Wrong prefix indices"
-        # KVCache between old & new is newly generated, but already exists in the pool
-        # we need to free this newly generated kv indices
-        if old_prefix_len < new_prefix_len:
-            self.token_to_kv_pool.free(kv_indices[old_prefix_len:new_prefix_len])
+        if is_insert:
+            new_prefix_len = self._insert(
+                RadixKey(token_ids, req.extra_key), kv_indices
+            )
+            # NOTE: kv_indices[:old_prefix_len] == req.prefix_indices
+            assert old_prefix_len <= new_prefix_len, "Wrong prefix indices"
+            # Free duplicates that were already in the pool
+            if old_prefix_len < new_prefix_len:
+                self.token_to_kv_pool.free(kv_indices[old_prefix_len:new_prefix_len])
+        else:
+            self.token_to_kv_pool.free(
+                kv_indices[old_prefix_len:page_aligned_overall_len]
+            )
         # need to free the unaligned part, since it cannot be inserted into the radix tree
-        if self.page_size != 1 and (  # unaligned tail only exists when page_size > 1
-            (unaligned_len := overall_len % self.page_size) > 0
-        ):
+        if page_aligned_overall_len < overall_len:
             # NOTE: sglang PagedAllocator support unaligned free (which will automatically align it)
-            self.token_to_kv_pool.free(kv_indices[overall_len - unaligned_len :])
+            self.token_to_kv_pool.free(kv_indices[page_aligned_overall_len:])
         # Remove req slot release the cache lock
         self.dec_lock_ref(req.last_node)

sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py CHANGED Viewed

@@ -13,7 +13,11 @@ from aibrix_kvcache import (
 )
 from aibrix_kvcache.common.absl_logging import log_every_n_seconds
-from sglang.srt.mem_cache.hicache_storage import HiCacheStorage, HiCacheStorageConfig
+from sglang.srt.mem_cache.hicache_storage import (
+    HiCacheStorage,
+    HiCacheStorageConfig,
+    HiCacheStorageExtraInfo,
+)
 from sglang.srt.mem_cache.memory_pool_host import HostKVCache
 logger = logging.getLogger(__name__)
@@ -140,7 +144,9 @@ class AibrixKVCacheStorage(HiCacheStorage):
     ) -> bool:
         return self.batch_set([key], [value], [target_location], [target_size])
-    def batch_exists(self, keys: List[str]) -> int:
+    def batch_exists(
+        self, keys: List[str], extra_info: Optional[HiCacheStorageExtraInfo] = None
+    ) -> int:
         block_hash = BlockHashes(keys, self.page_size)
         status = self.kv_cache_manager.exists(None, block_hash)
         if status.is_ok():

sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py CHANGED Viewed

@@ -3,20 +3,8 @@ import os
 import torch
 import torch.distributed
-from aibrix_kvcache import (
-    BaseKVCacheManager,
-    GroupAwareKVCacheManager,
-    KVCacheBlockLayout,
-    KVCacheBlockSpec,
-    KVCacheConfig,
-    KVCacheMetrics,
-    KVCacheTensorSpec,
-    ModelSpec,
-    TokenListView,
-)
-from aibrix_kvcache.common.absl_logging import getLogger, log_every_n_seconds, log_if
+from aibrix_kvcache.common.absl_logging import log_every_n_seconds
 from aibrix_kvcache_storage import AibrixKVCacheStorage
-from torch.distributed import Backend, ProcessGroup
 from sglang.srt.mem_cache.hicache_storage import HiCacheStorageConfig
 from sglang.srt.mem_cache.memory_pool import MHATokenToKVPool

sglang/srt/mem_cache/storage/backend_factory.py CHANGED Viewed

@@ -161,7 +161,7 @@ class StorageBackendFactory:
         if backend_name == "file":
             return backend_class(storage_config)
         elif backend_name == "nixl":
-            return backend_class()
+            return backend_class(storage_config)
         elif backend_name == "mooncake":
             backend = backend_class(storage_config)
             return backend
@@ -170,7 +170,7 @@ class StorageBackendFactory:
             return backend
         elif backend_name == "hf3fs":
             # Calculate bytes_per_page based on memory pool layout
-            if mem_pool_host.layout == "page_first":
+            if mem_pool_host.layout in ["page_first", "page_first_direct"]:
                 bytes_per_page = (
                     mem_pool_host.get_ksize_per_token() * mem_pool_host.page_size
                 )

sglang/srt/mem_cache/storage/eic/eic_storage.py CHANGED Viewed

@@ -2,21 +2,18 @@ import json
 import logging
 import os
 import time
-import uuid
-from dataclasses import dataclass
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Any, List, Optional, Tuple
 import eic
 import torch
 import yaml
-from sglang.srt.layers.dp_attention import get_attention_tp_rank, get_attention_tp_size
 from sglang.srt.mem_cache.hicache_storage import (
     HiCacheStorage,
     HiCacheStorageConfig,
     HiCacheStorageExtraInfo,
 )
-from sglang.srt.mem_cache.memory_pool_host import HostKVCache, MLATokenToKVPoolHost
+from sglang.srt.mem_cache.memory_pool_host import HostKVCache
 logger = logging.getLogger(__name__)
@@ -408,7 +405,9 @@ class EICStorage(HiCacheStorage):
         exist_num = self.batch_exists([key])
         return exist_num == 1
-    def batch_exists(self, keys) -> int:
+    def batch_exists(
+        self, keys, extra_info: Optional[HiCacheStorageExtraInfo] = None
+    ) -> int:
         if len(keys) == 0:
             return 0
         if self.use_zero_copy and not self.is_mla_model:

sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py CHANGED Viewed

@@ -1,6 +1,5 @@
 import logging
 import os
-import threading
 from abc import ABC, abstractmethod
 from typing import List

sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py CHANGED Viewed

@@ -3,8 +3,9 @@ import atexit
 import json
 import logging
 import threading
+from collections import OrderedDict
 from pathlib import Path
-from typing import Dict, List, Optional, OrderedDict, Tuple
+from typing import Dict, List, Optional, Tuple
 import orjson
 import requests
@@ -136,7 +137,7 @@ class GlobalMetadataState:
                     num_pages = data["num_pages"]
                     rank_meta = RankMetadata(num_pages)
                     rank_meta.free_pages = data["free_pages"]
-                    rank_meta.key_to_index = dict(data["key_to_index"])
+                    rank_meta.key_to_index = OrderedDict(data["key_to_index"])
                     self.ranks[rank_id] = rank_meta
                 logging.info(
                     f"Successfully loaded metadata for {len(self.ranks)} ranks."

sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py CHANGED Viewed

@@ -454,7 +454,9 @@ class HiCacheHF3FS(HiCacheStorage):
         result = self.metadata_client.exists(self.rank, [key])
         return result[0] if result else False
-    def batch_exists(self, keys: List[str]) -> int:
+    def batch_exists(
+        self, keys: List[str], extra_info: Optional[HiCacheStorageExtraInfo] = None
+    ) -> int:
         factor = 1
         if self.is_zero_copy and not self.is_mla_model:
             keys = self._get_mha_zero_copy_keys(keys)
@@ -499,8 +501,12 @@ class HiCacheHF3FS(HiCacheStorage):
     def register_mem_pool_host(self, mem_pool_host: HostKVCache):
         super().register_mem_pool_host(mem_pool_host)
-        self.is_zero_copy = self.mem_pool_host.layout == "page_first"
-        logger.info(f"{self.is_zero_copy=}")
+        self.is_zero_copy = self.mem_pool_host.layout in [
+            "page_first",
+            "page_first_direct",
+        ]
+        logger.info(f"{self.is_zero_copy=}, layout={self.mem_pool_host.layout}")
     def _get_mha_zero_copy_keys(self, keys: List[str]) -> List[str]:
         _keys = []

sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py CHANGED Viewed

@@ -2,7 +2,7 @@ from __future__ import annotations
 import logging
 import threading
-from typing import TYPE_CHECKING, List, Optional
+from typing import TYPE_CHECKING, Optional
 import torch
@@ -217,10 +217,12 @@ class LMCRadixCache(RadixCache):
         return base_res
-    def cache_finished_req(self, req: "Req") -> None:  # type: ignore[override]
+    def cache_finished_req(self, req: "Req", is_insert: bool = True) -> None:  # type: ignore[override]
         """On request completion, insert device KV into radix and store to LMCache."""
-        super().cache_finished_req(req)
+        super().cache_finished_req(req, is_insert=is_insert)
+        if not is_insert:
+            return
         token_ids = (req.origin_input_ids + req.output_ids)[:-1]
         kv_indices = self.req_to_token_pool.req_to_token[

sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py CHANGED Viewed

@@ -1,10 +1,12 @@
 import json
 import logging
 import os
+import time
 import uuid
 from dataclasses import dataclass
 from typing import Any, List, Optional
+import requests
 import torch
 from sglang.srt.mem_cache.hicache_storage import (
@@ -17,9 +19,29 @@ from sglang.srt.mem_cache.memory_pool_host import HostKVCache
 DEFAULT_GLOBAL_SEGMENT_SIZE = 4 * 1024 * 1024 * 1024  # 4 GiB
 DEFAULT_LOCAL_BUFFER_SIZE = 16 * 1024 * 1024  # 16 MB
 DEFAULT_MOONCAKE_CONFIG_PATH_ENV = "SGLANG_HICACHE_MOONCAKE_CONFIG_PATH"
+SETUP_TIMEOUT = 600  # 10min
+DEFAULT_MASTER_METRICS_PORT = 9003
+DEFAULT_CHECK_SERVER = False
 logger = logging.getLogger(__name__)
+def _parse_global_segment_size(value) -> int:
+    if isinstance(value, int):
+        return value
+    if isinstance(value, str):
+        s = value.strip().lower()
+        if s.endswith("gb"):
+            num = s[:-2].strip()
+            if not num:
+                raise ValueError(
+                    "Invalid global_segment_size: missing number before 'gb'"
+                )
+            return int(num) * 1024 * 1024 * 1024
+        return int(s)
+    return int(value)
 @dataclass
 class MooncakeStoreConfig:
     local_hostname: str
@@ -29,6 +51,8 @@ class MooncakeStoreConfig:
     protocol: str
     device_name: str
     master_server_address: str
+    master_metrics_port: int
+    check_server: bool
     @staticmethod
     def from_file() -> "MooncakeStoreConfig":
@@ -43,14 +67,18 @@ class MooncakeStoreConfig:
         return MooncakeStoreConfig(
             local_hostname=config.get("local_hostname"),
             metadata_server=config.get("metadata_server"),
-            global_segment_size=config.get(
-                "global_segment_size", DEFAULT_GLOBAL_SEGMENT_SIZE
+            global_segment_size=_parse_global_segment_size(
+                config.get("global_segment_size", DEFAULT_GLOBAL_SEGMENT_SIZE)
             ),
             # Zero copy interface does not need local buffer
             local_buffer_size=DEFAULT_LOCAL_BUFFER_SIZE,
             protocol=config.get("protocol", "tcp"),
-            device_name=config.get("device_name", "auto"),
+            device_name=config.get("device_name", ""),
             master_server_address=config.get("master_server_address"),
+            master_metrics_port=config.get(
+                "master_metrics_port", DEFAULT_MASTER_METRICS_PORT
+            ),
+            check_server=config.get("check_server", DEFAULT_CHECK_SERVER),
         )
     @staticmethod
@@ -58,7 +86,7 @@ class MooncakeStoreConfig:
         """Load config from a file specified in the environment variable.
         export MOONCAKE_MASTER=10.13.3.232:50051
         export MOONCAKE_PROTOCOL="rdma"
-        export MOONCAKE_DEVICE="auto"
+        export MOONCAKE_DEVICE=""
         export MOONCAKE_TE_META_DATA_SERVER="P2PHANDSHAKE"
         """
         # other required environment variables...
@@ -67,14 +95,18 @@ class MooncakeStoreConfig:
         return MooncakeStoreConfig(
             local_hostname=os.getenv("LOCAL_HOSTNAME", "localhost"),
             metadata_server=os.getenv("MOONCAKE_TE_META_DATA_SERVER", "P2PHANDSHAKE"),
-            global_segment_size=int(
+            global_segment_size=_parse_global_segment_size(
                 os.getenv("MOONCAKE_GLOBAL_SEGMENT_SIZE", DEFAULT_GLOBAL_SEGMENT_SIZE)
             ),
             # Zero copy interface does not need local buffer
             local_buffer_size=DEFAULT_LOCAL_BUFFER_SIZE,
             protocol=os.getenv("MOONCAKE_PROTOCOL", "tcp"),
-            device_name=os.getenv("MOONCAKE_DEVICE", "auto"),
+            device_name=os.getenv("MOONCAKE_DEVICE", ""),
             master_server_address=os.getenv("MOONCAKE_MASTER"),
+            master_metrics_port=int(
+                os.getenv("MOONCAKE_MASTER_METRICS_PORT", DEFAULT_GLOBAL_SEGMENT_SIZE)
+            ),
+            check_server=bool(os.getenv("MOONCAKE_CHECK_SERVER", DEFAULT_CHECK_SERVER)),
         )
     @staticmethod
@@ -86,24 +118,21 @@ class MooncakeStoreConfig:
         return MooncakeStoreConfig(
             local_hostname=extra_config.get("local_hostname", "localhost"),
             metadata_server=extra_config.get("metadata_server", "P2PHANDSHAKE"),
-            global_segment_size=extra_config.get(
-                "global_segment_size", DEFAULT_GLOBAL_SEGMENT_SIZE
+            global_segment_size=_parse_global_segment_size(
+                extra_config.get("global_segment_size", DEFAULT_GLOBAL_SEGMENT_SIZE)
             ),
             local_buffer_size=extra_config.get(
                 "local_buffer_size", DEFAULT_LOCAL_BUFFER_SIZE
             ),
             protocol=extra_config.get("protocol", "tcp"),
-            device_name=extra_config.get("device_name", "auto"),
+            device_name=extra_config.get("device_name", ""),
             master_server_address=extra_config["master_server_address"],
+            master_metrics_port=extra_config.get(
+                "master_metrics_port", DEFAULT_MASTER_METRICS_PORT
+            ),
+            check_server=extra_config.get("check_server", DEFAULT_CHECK_SERVER),
         )
-    def __post_init__(self):
-        if self.device_name == "auto":
-            os.environ["MC_MS_AUTO_DISC"] = "1"
-            os.environ["MC_MS_FILTERS"] = (
-                "mlx5_bond_0, mlx5_bond_1, mlx5_bond_2, mlx5_bond_3"
-            )
 class MooncakeStore(HiCacheStorage):
@@ -151,6 +180,16 @@ class MooncakeStore(HiCacheStorage):
             )
             per_tp_local_buffer_size = self.config.local_buffer_size // tp_scale_factor
+            # Check if extra_backend_tag should be passed to MooncakeDistributedStore
+            self.extra_backend_tag = None
+            if extra_config and "extra_backend_tag" in extra_config:
+                self.extra_backend_tag = extra_config["extra_backend_tag"]
+                logger.info(f"Using extra_backend_tag: {self.extra_backend_tag}")
+            # Check server status
+            if self.config.check_server:
+                self.check_server()
             ret_code = self.store.setup(
                 self.config.local_hostname,
                 self.config.metadata_server,
@@ -181,6 +220,39 @@ class MooncakeStore(HiCacheStorage):
             logger.error("An error occurred while loading the configuration: %s", exc)
             raise
+    def check_server(self):
+        master_server_ip = self.config.master_server_address.split(":")[0]
+        segments_url = f"http://{master_server_ip}:{self.config.master_metrics_port}/get_all_segments"
+        start_time = time.perf_counter()
+        check_result = False
+        while time.perf_counter() - start_time < SETUP_TIMEOUT:
+            try:
+                check_segments_resp = requests.get(segments_url, timeout=3)
+            except Exception:
+                logger.info(
+                    "waiting mooncake store server started, cost_time: %.2f seconds.",
+                    time.perf_counter() - start_time,
+                )
+                time.sleep(3)
+                continue
+            if check_segments_resp.text == "":
+                logger.info(
+                    "waiting mooncake store server started, cost_time: %.2f seconds.",
+                    time.perf_counter() - start_time,
+                )
+                time.sleep(3)
+                continue
+            logger.info("Mooncake store server started successfully.")
+            check_result = True
+            break
+        if not check_result:
+            logger.error("Launch mooncake store server timeout")
+            raise ValueError("Launch mooncake store server timeout")
     def warmup(self):
         warmup_key = "sglang_mooncake_store_warmup_key" + uuid.uuid4().hex
         warmup_value = bytes(4 * 1024)  # 4 KB
@@ -257,6 +329,11 @@ class MooncakeStore(HiCacheStorage):
         host_indices: torch.Tensor,
         extra_info: Optional[HiCacheStorageExtraInfo] = None,
     ) -> List[bool]:
+        # Apply extra_backend_tag prefix if available
+        if self.extra_backend_tag is not None:
+            prefix = self.extra_backend_tag
+            keys = [f"{prefix}_{key}" for key in keys]
         key_strs, buffer_ptrs, buffer_sizes = self._batch_preprocess(keys, host_indices)
         get_results = self._get_batch_zero_copy_impl(
             key_strs, buffer_ptrs, buffer_sizes
@@ -269,6 +346,11 @@ class MooncakeStore(HiCacheStorage):
         host_indices: torch.Tensor,
         extra_info: Optional[HiCacheStorageExtraInfo] = None,
     ) -> List[bool]:
+        # Apply extra_backend_tag prefix if available
+        if self.extra_backend_tag is not None:
+            prefix = self.extra_backend_tag
+            keys = [f"{prefix}_{key}" for key in keys]
         key_strs, buffer_ptrs, buffer_sizes = self._batch_preprocess(keys, host_indices)
         exist_result = self._batch_exist(key_strs)
@@ -399,7 +481,9 @@ class MooncakeStore(HiCacheStorage):
         exist_result = self._batch_exist([key])
         return exist_result[0] == 1
-    def batch_exists(self, keys) -> int:
+    def batch_exists(
+        self, keys, extra_info: Optional[HiCacheStorageExtraInfo] = None
+    ) -> int:
         if self.is_mla_backend:
             query_keys = [f"{key}_k" for key in keys]
             key_multiplier = 1

sglang 0.5.3rc2__py3-none-any.whl → 0.5.4.post1__py3-none-any.whl

sglang 0.5.3rc2py3-none-any.whl → 0.5.4.post1py3-none-any.whl