sglang 0.5.3rc0__py3-none-any.whl → 0.5.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (482) hide show
  1. sglang/bench_one_batch.py +54 -37
  2. sglang/bench_one_batch_server.py +340 -34
  3. sglang/bench_serving.py +340 -159
  4. sglang/check_env.py +1 -1
  5. sglang/compile_deep_gemm.py +6 -2
  6. sglang/global_config.py +1 -25
  7. sglang/lang/api.py +6 -0
  8. sglang/lang/backend/runtime_endpoint.py +1 -1
  9. sglang/lang/interpreter.py +1 -0
  10. sglang/lang/ir.py +13 -0
  11. sglang/launch_server.py +9 -2
  12. sglang/profiler.py +20 -3
  13. sglang/srt/_custom_ops.py +1 -1
  14. sglang/srt/batch_invariant_ops/__init__.py +27 -0
  15. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +547 -0
  16. sglang/srt/checkpoint_engine/checkpoint_engine_worker.py +142 -0
  17. sglang/srt/compilation/backend.py +437 -0
  18. sglang/srt/compilation/compilation_config.py +20 -0
  19. sglang/srt/compilation/compilation_counter.py +47 -0
  20. sglang/srt/compilation/compile.py +210 -0
  21. sglang/srt/compilation/compiler_interface.py +503 -0
  22. sglang/srt/compilation/cuda_piecewise_backend.py +228 -0
  23. sglang/srt/compilation/fix_functionalization.py +134 -0
  24. sglang/srt/compilation/fx_utils.py +83 -0
  25. sglang/srt/compilation/inductor_pass.py +140 -0
  26. sglang/srt/compilation/pass_manager.py +66 -0
  27. sglang/srt/compilation/piecewise_context_manager.py +40 -0
  28. sglang/srt/compilation/weak_ref_tensor_jit.py +16 -0
  29. sglang/srt/configs/__init__.py +8 -0
  30. sglang/srt/configs/deepseek_ocr.py +262 -0
  31. sglang/srt/configs/deepseekvl2.py +194 -96
  32. sglang/srt/configs/dots_ocr.py +64 -0
  33. sglang/srt/configs/dots_vlm.py +2 -7
  34. sglang/srt/configs/falcon_h1.py +309 -0
  35. sglang/srt/configs/load_config.py +33 -2
  36. sglang/srt/configs/mamba_utils.py +117 -0
  37. sglang/srt/configs/model_config.py +284 -118
  38. sglang/srt/configs/modelopt_config.py +30 -0
  39. sglang/srt/configs/nemotron_h.py +286 -0
  40. sglang/srt/configs/olmo3.py +105 -0
  41. sglang/srt/configs/points_v15_chat.py +29 -0
  42. sglang/srt/configs/qwen3_next.py +11 -47
  43. sglang/srt/configs/qwen3_omni.py +613 -0
  44. sglang/srt/configs/qwen3_vl.py +576 -0
  45. sglang/srt/connector/remote_instance.py +1 -1
  46. sglang/srt/constrained/base_grammar_backend.py +6 -1
  47. sglang/srt/constrained/llguidance_backend.py +5 -0
  48. sglang/srt/constrained/outlines_backend.py +1 -1
  49. sglang/srt/constrained/outlines_jump_forward.py +1 -1
  50. sglang/srt/constrained/reasoner_grammar_backend.py +9 -6
  51. sglang/srt/constrained/utils.py +12 -0
  52. sglang/srt/constrained/xgrammar_backend.py +26 -15
  53. sglang/srt/debug_utils/dumper.py +10 -3
  54. sglang/srt/disaggregation/ascend/conn.py +2 -2
  55. sglang/srt/disaggregation/ascend/transfer_engine.py +48 -10
  56. sglang/srt/disaggregation/base/conn.py +17 -4
  57. sglang/srt/disaggregation/common/conn.py +268 -98
  58. sglang/srt/disaggregation/decode.py +172 -39
  59. sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
  60. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +25 -16
  61. sglang/srt/disaggregation/fake/conn.py +11 -3
  62. sglang/srt/disaggregation/mooncake/conn.py +203 -555
  63. sglang/srt/disaggregation/nixl/conn.py +217 -63
  64. sglang/srt/disaggregation/prefill.py +113 -270
  65. sglang/srt/disaggregation/utils.py +36 -5
  66. sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
  67. sglang/srt/distributed/device_communicators/custom_all_reduce.py +6 -6
  68. sglang/srt/distributed/device_communicators/pymscclpp.py +2 -2
  69. sglang/srt/distributed/device_communicators/pynccl.py +24 -12
  70. sglang/srt/distributed/device_communicators/pynccl_allocator.py +2 -2
  71. sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
  72. sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
  73. sglang/srt/distributed/naive_distributed.py +5 -4
  74. sglang/srt/distributed/parallel_state.py +203 -97
  75. sglang/srt/elastic_ep/elastic_ep.py +74 -0
  76. sglang/srt/entrypoints/context.py +3 -2
  77. sglang/srt/entrypoints/engine.py +85 -65
  78. sglang/srt/entrypoints/grpc_server.py +632 -305
  79. sglang/srt/entrypoints/harmony_utils.py +2 -2
  80. sglang/srt/entrypoints/http_server.py +169 -17
  81. sglang/srt/entrypoints/http_server_engine.py +1 -7
  82. sglang/srt/entrypoints/openai/protocol.py +327 -34
  83. sglang/srt/entrypoints/openai/serving_base.py +74 -8
  84. sglang/srt/entrypoints/openai/serving_chat.py +202 -118
  85. sglang/srt/entrypoints/openai/serving_classify.py +204 -0
  86. sglang/srt/entrypoints/openai/serving_completions.py +20 -4
  87. sglang/srt/entrypoints/openai/serving_embedding.py +1 -0
  88. sglang/srt/entrypoints/openai/serving_responses.py +47 -2
  89. sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
  90. sglang/srt/environ.py +323 -0
  91. sglang/srt/eplb/eplb_algorithms/__init__.py +18 -1
  92. sglang/srt/eplb/eplb_algorithms/deepseek.py +0 -2
  93. sglang/srt/eplb/eplb_algorithms/elasticity_aware.py +87 -0
  94. sglang/srt/eplb/expert_distribution.py +3 -4
  95. sglang/srt/eplb/expert_location.py +30 -5
  96. sglang/srt/eplb/expert_location_dispatch.py +2 -2
  97. sglang/srt/eplb/expert_location_updater.py +2 -2
  98. sglang/srt/function_call/base_format_detector.py +17 -18
  99. sglang/srt/function_call/function_call_parser.py +21 -16
  100. sglang/srt/function_call/glm4_moe_detector.py +4 -8
  101. sglang/srt/function_call/gpt_oss_detector.py +24 -1
  102. sglang/srt/function_call/json_array_parser.py +61 -0
  103. sglang/srt/function_call/kimik2_detector.py +17 -4
  104. sglang/srt/function_call/utils.py +98 -7
  105. sglang/srt/grpc/compile_proto.py +245 -0
  106. sglang/srt/grpc/grpc_request_manager.py +915 -0
  107. sglang/srt/grpc/health_servicer.py +189 -0
  108. sglang/srt/grpc/scheduler_launcher.py +181 -0
  109. sglang/srt/grpc/sglang_scheduler_pb2.py +81 -68
  110. sglang/srt/grpc/sglang_scheduler_pb2.pyi +124 -61
  111. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +92 -1
  112. sglang/srt/layers/activation.py +11 -7
  113. sglang/srt/layers/attention/aiter_backend.py +17 -18
  114. sglang/srt/layers/attention/ascend_backend.py +125 -10
  115. sglang/srt/layers/attention/attention_registry.py +226 -0
  116. sglang/srt/layers/attention/base_attn_backend.py +32 -4
  117. sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
  118. sglang/srt/layers/attention/double_sparsity_backend.py +2 -2
  119. sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
  120. sglang/srt/layers/attention/fla/chunk.py +0 -1
  121. sglang/srt/layers/attention/fla/chunk_o.py +1 -1
  122. sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +2 -2
  123. sglang/srt/layers/attention/fla/fused_recurrent.py +4 -4
  124. sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +2 -2
  125. sglang/srt/layers/attention/fla/index.py +0 -2
  126. sglang/srt/layers/attention/fla/layernorm_gated.py +50 -32
  127. sglang/srt/layers/attention/fla/utils.py +0 -3
  128. sglang/srt/layers/attention/fla/wy_fast.py +0 -2
  129. sglang/srt/layers/attention/flashattention_backend.py +52 -15
  130. sglang/srt/layers/attention/flashinfer_backend.py +357 -212
  131. sglang/srt/layers/attention/flashinfer_mla_backend.py +31 -33
  132. sglang/srt/layers/attention/flashmla_backend.py +9 -7
  133. sglang/srt/layers/attention/hybrid_attn_backend.py +12 -4
  134. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +236 -133
  135. sglang/srt/layers/attention/intel_amx_backend.py +1 -1
  136. sglang/srt/layers/attention/mamba/causal_conv1d.py +2 -1
  137. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +24 -103
  138. sglang/srt/layers/attention/mamba/mamba.py +514 -1
  139. sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
  140. sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
  141. sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
  142. sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
  143. sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
  144. sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +214 -0
  145. sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +562 -0
  146. sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +646 -0
  147. sglang/srt/layers/attention/mamba/ops/ssd_combined.py +261 -0
  148. sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +264 -0
  149. sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
  150. sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
  151. sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
  152. sglang/srt/layers/attention/nsa/nsa_indexer.py +718 -0
  153. sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
  154. sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
  155. sglang/srt/layers/attention/nsa/transform_index.py +144 -0
  156. sglang/srt/layers/attention/nsa/triton_kernel.py +136 -0
  157. sglang/srt/layers/attention/nsa/utils.py +23 -0
  158. sglang/srt/layers/attention/nsa_backend.py +1201 -0
  159. sglang/srt/layers/attention/tbo_backend.py +6 -6
  160. sglang/srt/layers/attention/torch_flex_backend.py +325 -0
  161. sglang/srt/layers/attention/triton_backend.py +249 -42
  162. sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +2 -2
  163. sglang/srt/layers/attention/triton_ops/extend_attention.py +539 -44
  164. sglang/srt/layers/attention/trtllm_mha_backend.py +7 -9
  165. sglang/srt/layers/attention/trtllm_mla_backend.py +523 -48
  166. sglang/srt/layers/attention/utils.py +11 -7
  167. sglang/srt/layers/attention/vision.py +61 -3
  168. sglang/srt/layers/attention/wave_backend.py +4 -4
  169. sglang/srt/layers/attention/xpu_backend.py +1028 -0
  170. sglang/srt/layers/communicator.py +19 -7
  171. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/compile_utils.py +4 -8
  172. sglang/srt/layers/deep_gemm_wrapper/configurer.py +25 -0
  173. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/entrypoint.py +3 -3
  174. sglang/srt/layers/dp_attention.py +28 -1
  175. sglang/srt/layers/elementwise.py +3 -1
  176. sglang/srt/layers/layernorm.py +47 -15
  177. sglang/srt/layers/linear.py +30 -5
  178. sglang/srt/layers/logits_processor.py +161 -18
  179. sglang/srt/layers/modelopt_utils.py +11 -0
  180. sglang/srt/layers/moe/cutlass_moe.py +0 -2
  181. sglang/srt/layers/moe/cutlass_w4a8_moe.py +213 -21
  182. sglang/srt/layers/moe/ep_moe/kernels.py +36 -458
  183. sglang/srt/layers/moe/ep_moe/layer.py +243 -448
  184. sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +52 -25
  185. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  186. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_B200.json +146 -0
  187. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  188. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  189. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
  190. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +17 -5
  191. sglang/srt/layers/moe/fused_moe_triton/layer.py +86 -81
  192. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +18 -42
  193. sglang/srt/layers/moe/moe_runner/deep_gemm.py +304 -0
  194. sglang/srt/layers/moe/moe_runner/runner.py +3 -0
  195. sglang/srt/layers/moe/moe_runner/triton.py +3 -1
  196. sglang/srt/layers/moe/rocm_moe_utils.py +0 -1
  197. sglang/srt/layers/moe/router.py +51 -15
  198. sglang/srt/layers/moe/token_dispatcher/__init__.py +10 -0
  199. sglang/srt/layers/moe/token_dispatcher/base.py +1 -1
  200. sglang/srt/layers/moe/token_dispatcher/deepep.py +177 -106
  201. sglang/srt/layers/moe/token_dispatcher/mooncake.py +386 -0
  202. sglang/srt/layers/moe/token_dispatcher/standard.py +46 -0
  203. sglang/srt/layers/moe/topk.py +3 -2
  204. sglang/srt/layers/moe/utils.py +27 -1
  205. sglang/srt/layers/parameter.py +23 -6
  206. sglang/srt/layers/quantization/__init__.py +2 -53
  207. sglang/srt/layers/quantization/awq.py +183 -6
  208. sglang/srt/layers/quantization/awq_triton.py +29 -0
  209. sglang/srt/layers/quantization/base_config.py +20 -1
  210. sglang/srt/layers/quantization/compressed_tensors/__init__.py +7 -0
  211. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +21 -49
  212. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +421 -70
  213. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +5 -0
  214. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +4 -22
  215. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
  216. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +339 -0
  217. sglang/srt/layers/quantization/fp8.py +86 -20
  218. sglang/srt/layers/quantization/fp8_kernel.py +55 -10
  219. sglang/srt/layers/quantization/fp8_utils.py +43 -15
  220. sglang/srt/layers/quantization/fpgemm_fp8.py +2 -3
  221. sglang/srt/layers/quantization/gptq.py +0 -1
  222. sglang/srt/layers/quantization/int8_kernel.py +18 -2
  223. sglang/srt/layers/quantization/marlin_utils.py +12 -0
  224. sglang/srt/layers/quantization/modelopt_quant.py +141 -81
  225. sglang/srt/layers/quantization/mxfp4.py +17 -34
  226. sglang/srt/layers/quantization/petit.py +1 -1
  227. sglang/srt/layers/quantization/quark/quark.py +3 -1
  228. sglang/srt/layers/quantization/quark/quark_moe.py +18 -5
  229. sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +0 -7
  230. sglang/srt/layers/quantization/unquant.py +1 -4
  231. sglang/srt/layers/quantization/utils.py +0 -1
  232. sglang/srt/layers/quantization/w4afp8.py +51 -24
  233. sglang/srt/layers/quantization/w8a8_int8.py +45 -27
  234. sglang/srt/layers/radix_attention.py +59 -9
  235. sglang/srt/layers/rotary_embedding.py +750 -46
  236. sglang/srt/layers/sampler.py +84 -16
  237. sglang/srt/layers/sparse_pooler.py +98 -0
  238. sglang/srt/layers/utils.py +23 -1
  239. sglang/srt/layers/vocab_parallel_embedding.py +4 -1
  240. sglang/srt/lora/backend/base_backend.py +3 -3
  241. sglang/srt/lora/backend/chunked_backend.py +348 -0
  242. sglang/srt/lora/backend/triton_backend.py +9 -4
  243. sglang/srt/lora/eviction_policy.py +139 -0
  244. sglang/srt/lora/lora.py +7 -5
  245. sglang/srt/lora/lora_manager.py +33 -7
  246. sglang/srt/lora/lora_registry.py +1 -1
  247. sglang/srt/lora/mem_pool.py +41 -17
  248. sglang/srt/lora/triton_ops/__init__.py +4 -0
  249. sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
  250. sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +176 -0
  251. sglang/srt/lora/utils.py +7 -5
  252. sglang/srt/managers/cache_controller.py +83 -152
  253. sglang/srt/managers/data_parallel_controller.py +156 -87
  254. sglang/srt/managers/detokenizer_manager.py +51 -24
  255. sglang/srt/managers/io_struct.py +223 -129
  256. sglang/srt/managers/mm_utils.py +49 -10
  257. sglang/srt/managers/multi_tokenizer_mixin.py +83 -98
  258. sglang/srt/managers/multimodal_processor.py +1 -2
  259. sglang/srt/managers/overlap_utils.py +130 -0
  260. sglang/srt/managers/schedule_batch.py +340 -529
  261. sglang/srt/managers/schedule_policy.py +158 -18
  262. sglang/srt/managers/scheduler.py +665 -620
  263. sglang/srt/managers/scheduler_input_blocker.py +1 -1
  264. sglang/srt/managers/scheduler_metrics_mixin.py +150 -131
  265. sglang/srt/managers/scheduler_output_processor_mixin.py +337 -122
  266. sglang/srt/managers/scheduler_pp_mixin.py +341 -0
  267. sglang/srt/managers/scheduler_profiler_mixin.py +62 -15
  268. sglang/srt/managers/scheduler_runtime_checker_mixin.py +217 -0
  269. sglang/srt/managers/scheduler_update_weights_mixin.py +40 -14
  270. sglang/srt/managers/tokenizer_communicator_mixin.py +141 -19
  271. sglang/srt/managers/tokenizer_manager.py +462 -226
  272. sglang/srt/managers/tp_worker.py +217 -156
  273. sglang/srt/managers/utils.py +79 -47
  274. sglang/srt/mem_cache/allocator.py +21 -22
  275. sglang/srt/mem_cache/allocator_ascend.py +42 -28
  276. sglang/srt/mem_cache/base_prefix_cache.py +3 -3
  277. sglang/srt/mem_cache/chunk_cache.py +20 -2
  278. sglang/srt/mem_cache/common.py +480 -0
  279. sglang/srt/mem_cache/evict_policy.py +38 -0
  280. sglang/srt/mem_cache/hicache_storage.py +44 -2
  281. sglang/srt/mem_cache/hiradix_cache.py +134 -34
  282. sglang/srt/mem_cache/mamba_radix_cache.py +993 -0
  283. sglang/srt/mem_cache/memory_pool.py +602 -208
  284. sglang/srt/mem_cache/memory_pool_host.py +134 -183
  285. sglang/srt/mem_cache/multimodal_cache.py +0 -1
  286. sglang/srt/mem_cache/radix_cache.py +263 -78
  287. sglang/srt/mem_cache/radix_cache_cpp.py +29 -21
  288. sglang/srt/mem_cache/storage/__init__.py +10 -0
  289. sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +157 -0
  290. sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +97 -0
  291. sglang/srt/mem_cache/storage/backend_factory.py +223 -0
  292. sglang/srt/mem_cache/storage/eic/eic_storage.py +777 -0
  293. sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
  294. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +0 -1
  295. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +180 -59
  296. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +15 -9
  297. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +217 -26
  298. sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +38 -9
  299. sglang/srt/mem_cache/storage/nixl/nixl_utils.py +1 -1
  300. sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +17 -2
  301. sglang/srt/mem_cache/swa_radix_cache.py +115 -58
  302. sglang/srt/metrics/collector.py +113 -120
  303. sglang/srt/metrics/func_timer.py +3 -8
  304. sglang/srt/metrics/utils.py +8 -1
  305. sglang/srt/model_executor/cpu_graph_runner.py +2 -2
  306. sglang/srt/model_executor/cuda_graph_runner.py +81 -36
  307. sglang/srt/model_executor/forward_batch_info.py +40 -50
  308. sglang/srt/model_executor/model_runner.py +507 -319
  309. sglang/srt/model_executor/npu_graph_runner.py +11 -5
  310. sglang/srt/model_executor/piecewise_cuda_graph_runner.py +539 -0
  311. sglang/srt/model_loader/__init__.py +1 -1
  312. sglang/srt/model_loader/loader.py +438 -37
  313. sglang/srt/model_loader/utils.py +0 -1
  314. sglang/srt/model_loader/weight_utils.py +200 -27
  315. sglang/srt/models/apertus.py +2 -3
  316. sglang/srt/models/arcee.py +2 -2
  317. sglang/srt/models/bailing_moe.py +40 -56
  318. sglang/srt/models/bailing_moe_nextn.py +3 -4
  319. sglang/srt/models/bert.py +1 -1
  320. sglang/srt/models/deepseek_nextn.py +25 -4
  321. sglang/srt/models/deepseek_ocr.py +1516 -0
  322. sglang/srt/models/deepseek_v2.py +793 -235
  323. sglang/srt/models/dots_ocr.py +171 -0
  324. sglang/srt/models/dots_vlm.py +0 -1
  325. sglang/srt/models/dots_vlm_vit.py +1 -1
  326. sglang/srt/models/falcon_h1.py +570 -0
  327. sglang/srt/models/gemma3_causal.py +0 -2
  328. sglang/srt/models/gemma3_mm.py +17 -1
  329. sglang/srt/models/gemma3n_mm.py +2 -3
  330. sglang/srt/models/glm4_moe.py +17 -40
  331. sglang/srt/models/glm4_moe_nextn.py +4 -4
  332. sglang/srt/models/glm4v.py +3 -2
  333. sglang/srt/models/glm4v_moe.py +6 -6
  334. sglang/srt/models/gpt_oss.py +12 -35
  335. sglang/srt/models/grok.py +10 -23
  336. sglang/srt/models/hunyuan.py +2 -7
  337. sglang/srt/models/interns1.py +0 -1
  338. sglang/srt/models/kimi_vl.py +1 -7
  339. sglang/srt/models/kimi_vl_moonvit.py +4 -2
  340. sglang/srt/models/llama.py +6 -2
  341. sglang/srt/models/llama_eagle3.py +1 -1
  342. sglang/srt/models/longcat_flash.py +6 -23
  343. sglang/srt/models/longcat_flash_nextn.py +4 -15
  344. sglang/srt/models/mimo.py +2 -13
  345. sglang/srt/models/mimo_mtp.py +1 -2
  346. sglang/srt/models/minicpmo.py +7 -5
  347. sglang/srt/models/mixtral.py +1 -4
  348. sglang/srt/models/mllama.py +1 -1
  349. sglang/srt/models/mllama4.py +27 -6
  350. sglang/srt/models/nemotron_h.py +511 -0
  351. sglang/srt/models/olmo2.py +31 -4
  352. sglang/srt/models/opt.py +5 -5
  353. sglang/srt/models/phi.py +1 -1
  354. sglang/srt/models/phi4mm.py +1 -1
  355. sglang/srt/models/phimoe.py +0 -1
  356. sglang/srt/models/pixtral.py +0 -3
  357. sglang/srt/models/points_v15_chat.py +186 -0
  358. sglang/srt/models/qwen.py +0 -1
  359. sglang/srt/models/qwen2.py +0 -7
  360. sglang/srt/models/qwen2_5_vl.py +5 -5
  361. sglang/srt/models/qwen2_audio.py +2 -15
  362. sglang/srt/models/qwen2_moe.py +70 -4
  363. sglang/srt/models/qwen2_vl.py +6 -3
  364. sglang/srt/models/qwen3.py +18 -3
  365. sglang/srt/models/qwen3_moe.py +50 -38
  366. sglang/srt/models/qwen3_next.py +43 -21
  367. sglang/srt/models/qwen3_next_mtp.py +3 -4
  368. sglang/srt/models/qwen3_omni_moe.py +661 -0
  369. sglang/srt/models/qwen3_vl.py +791 -0
  370. sglang/srt/models/qwen3_vl_moe.py +343 -0
  371. sglang/srt/models/registry.py +15 -3
  372. sglang/srt/models/roberta.py +55 -3
  373. sglang/srt/models/sarashina2_vision.py +268 -0
  374. sglang/srt/models/solar.py +505 -0
  375. sglang/srt/models/starcoder2.py +357 -0
  376. sglang/srt/models/step3_vl.py +3 -5
  377. sglang/srt/models/torch_native_llama.py +9 -2
  378. sglang/srt/models/utils.py +61 -0
  379. sglang/srt/multimodal/processors/base_processor.py +21 -9
  380. sglang/srt/multimodal/processors/deepseek_ocr.py +37 -0
  381. sglang/srt/multimodal/processors/deepseek_vl_v2.py +0 -3
  382. sglang/srt/multimodal/processors/dots_vlm.py +2 -4
  383. sglang/srt/multimodal/processors/glm4v.py +1 -5
  384. sglang/srt/multimodal/processors/internvl.py +20 -10
  385. sglang/srt/multimodal/processors/janus_pro.py +0 -1
  386. sglang/srt/multimodal/processors/mllama4.py +0 -8
  387. sglang/srt/multimodal/processors/phi4mm.py +0 -1
  388. sglang/srt/multimodal/processors/points_v15_chat.py +52 -0
  389. sglang/srt/multimodal/processors/qwen_vl.py +83 -17
  390. sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
  391. sglang/srt/multimodal/processors/step3_vl.py +1 -1
  392. sglang/srt/parser/conversation.py +41 -0
  393. sglang/srt/parser/jinja_template_utils.py +6 -0
  394. sglang/srt/parser/reasoning_parser.py +0 -1
  395. sglang/srt/sampling/custom_logit_processor.py +77 -2
  396. sglang/srt/sampling/sampling_batch_info.py +36 -23
  397. sglang/srt/sampling/sampling_params.py +75 -0
  398. sglang/srt/server_args.py +1300 -338
  399. sglang/srt/server_args_config_parser.py +146 -0
  400. sglang/srt/single_batch_overlap.py +161 -0
  401. sglang/srt/speculative/base_spec_worker.py +34 -0
  402. sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
  403. sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
  404. sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
  405. sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
  406. sglang/srt/speculative/cpp_ngram/param.h +125 -0
  407. sglang/srt/speculative/cpp_ngram/queue.h +71 -0
  408. sglang/srt/speculative/draft_utils.py +226 -0
  409. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +26 -8
  410. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +26 -3
  411. sglang/srt/speculative/eagle_info.py +786 -0
  412. sglang/srt/speculative/eagle_info_v2.py +458 -0
  413. sglang/srt/speculative/eagle_utils.py +113 -1270
  414. sglang/srt/speculative/eagle_worker.py +120 -285
  415. sglang/srt/speculative/eagle_worker_v2.py +702 -0
  416. sglang/srt/speculative/ngram_info.py +433 -0
  417. sglang/srt/speculative/ngram_worker.py +246 -0
  418. sglang/srt/speculative/spec_info.py +49 -0
  419. sglang/srt/speculative/spec_utils.py +641 -0
  420. sglang/srt/speculative/standalone_worker.py +4 -14
  421. sglang/srt/tokenizer/tiktoken_tokenizer.py +2 -2
  422. sglang/srt/tracing/trace.py +32 -6
  423. sglang/srt/two_batch_overlap.py +35 -18
  424. sglang/srt/utils/__init__.py +2 -0
  425. sglang/srt/{bench_utils.py → utils/bench_utils.py} +4 -2
  426. sglang/srt/{utils.py → utils/common.py} +583 -113
  427. sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +86 -19
  428. sglang/srt/{host_shared_memory.py → utils/host_shared_memory.py} +0 -1
  429. sglang/srt/{offloader.py → utils/offloader.py} +4 -4
  430. sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
  431. sglang/srt/utils/profile_merger.py +199 -0
  432. sglang/srt/utils/rpd_utils.py +452 -0
  433. sglang/srt/utils/slow_rank_detector.py +71 -0
  434. sglang/srt/{torch_memory_saver_adapter.py → utils/torch_memory_saver_adapter.py} +5 -7
  435. sglang/srt/warmup.py +8 -4
  436. sglang/srt/weight_sync/utils.py +1 -1
  437. sglang/test/attention/test_flashattn_backend.py +1 -1
  438. sglang/test/attention/test_flashattn_mla_backend.py +0 -1
  439. sglang/test/attention/test_prefix_chunk_info.py +0 -2
  440. sglang/test/attention/test_trtllm_mla_backend.py +221 -53
  441. sglang/test/few_shot_gsm8k_engine.py +2 -4
  442. sglang/test/get_logits_ut.py +57 -0
  443. sglang/test/kit_matched_stop.py +157 -0
  444. sglang/test/longbench_v2/__init__.py +1 -0
  445. sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
  446. sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
  447. sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
  448. sglang/test/run_eval.py +120 -11
  449. sglang/test/runners.py +3 -1
  450. sglang/test/send_one.py +42 -7
  451. sglang/test/simple_eval_common.py +8 -2
  452. sglang/test/simple_eval_gpqa.py +0 -1
  453. sglang/test/simple_eval_humaneval.py +0 -3
  454. sglang/test/simple_eval_longbench_v2.py +344 -0
  455. sglang/test/simple_eval_mmmu_vlm.py +441 -0
  456. sglang/test/test_block_fp8.py +3 -4
  457. sglang/test/test_block_fp8_deep_gemm_blackwell.py +0 -1
  458. sglang/test/test_cutlass_moe.py +1 -2
  459. sglang/test/test_cutlass_w4a8_moe.py +10 -20
  460. sglang/test/test_deterministic.py +430 -0
  461. sglang/test/test_deterministic_utils.py +73 -0
  462. sglang/test/test_disaggregation_utils.py +93 -1
  463. sglang/test/test_marlin_moe.py +0 -1
  464. sglang/test/test_programs.py +1 -1
  465. sglang/test/test_utils.py +432 -16
  466. sglang/utils.py +10 -1
  467. sglang/version.py +1 -1
  468. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/METADATA +64 -43
  469. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/RECORD +476 -346
  470. sglang/srt/entrypoints/grpc_request_manager.py +0 -580
  471. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +0 -32
  472. sglang/srt/managers/tp_worker_overlap_thread.py +0 -319
  473. sglang/srt/mem_cache/lora_radix_cache.py +0 -421
  474. sglang/srt/speculative/build_eagle_tree.py +0 -427
  475. sglang/test/test_block_fp8_ep.py +0 -358
  476. /sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/__init__.py +0 -0
  477. /sglang/srt/{remote_instance_weight_loader_utils.py → model_loader/remote_instance_weight_loader_utils.py} +0 -0
  478. /sglang/srt/{aio_rwlock.py → utils/aio_rwlock.py} +0 -0
  479. /sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
  480. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/WHEEL +0 -0
  481. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/licenses/LICENSE +0 -0
  482. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/top_level.txt +0 -0
@@ -1,20 +1,47 @@
1
1
  import json
2
2
  import logging
3
3
  import os
4
+ import time
4
5
  import uuid
5
6
  from dataclasses import dataclass
6
7
  from typing import Any, List, Optional
7
8
 
9
+ import requests
8
10
  import torch
9
11
 
10
- from sglang.srt.mem_cache.hicache_storage import HiCacheStorage, HiCacheStorageConfig
12
+ from sglang.srt.mem_cache.hicache_storage import (
13
+ HiCacheStorage,
14
+ HiCacheStorageConfig,
15
+ HiCacheStorageExtraInfo,
16
+ )
17
+ from sglang.srt.mem_cache.memory_pool_host import HostKVCache
11
18
 
12
19
  DEFAULT_GLOBAL_SEGMENT_SIZE = 4 * 1024 * 1024 * 1024 # 4 GiB
13
20
  DEFAULT_LOCAL_BUFFER_SIZE = 16 * 1024 * 1024 # 16 MB
21
+ DEFAULT_MOONCAKE_CONFIG_PATH_ENV = "SGLANG_HICACHE_MOONCAKE_CONFIG_PATH"
22
+ SETUP_TIMEOUT = 600 # 10min
23
+ DEFAULT_MASTER_METRICS_PORT = 9003
24
+ DEFAULT_CHECK_SERVER = False
14
25
 
15
26
  logger = logging.getLogger(__name__)
16
27
 
17
28
 
29
+ def _parse_global_segment_size(value) -> int:
30
+ if isinstance(value, int):
31
+ return value
32
+ if isinstance(value, str):
33
+ s = value.strip().lower()
34
+ if s.endswith("gb"):
35
+ num = s[:-2].strip()
36
+ if not num:
37
+ raise ValueError(
38
+ "Invalid global_segment_size: missing number before 'gb'"
39
+ )
40
+ return int(num) * 1024 * 1024 * 1024
41
+ return int(s)
42
+ return int(value)
43
+
44
+
18
45
  @dataclass
19
46
  class MooncakeStoreConfig:
20
47
  local_hostname: str
@@ -24,28 +51,34 @@ class MooncakeStoreConfig:
24
51
  protocol: str
25
52
  device_name: str
26
53
  master_server_address: str
54
+ master_metrics_port: int
55
+ check_server: bool
27
56
 
28
57
  @staticmethod
29
58
  def from_file() -> "MooncakeStoreConfig":
30
59
  """Load the config from a JSON file."""
31
- file_path = os.getenv("MOONCAKE_CONFIG_PATH")
32
- if file_path is None:
33
- raise ValueError(
34
- "The environment variable 'MOONCAKE_CONFIG_PATH' is not set."
35
- )
36
- with open(file_path) as fin:
37
- config = json.load(fin)
60
+ file_path = os.getenv(DEFAULT_MOONCAKE_CONFIG_PATH_ENV)
61
+ try:
62
+ with open(file_path) as fin:
63
+ config = json.load(fin)
64
+ except Exception as e:
65
+ raise RuntimeError(f"Failed to load config from {file_path}: {str(e)}")
66
+
38
67
  return MooncakeStoreConfig(
39
68
  local_hostname=config.get("local_hostname"),
40
69
  metadata_server=config.get("metadata_server"),
41
- global_segment_size=config.get(
42
- "global_segment_size", DEFAULT_GLOBAL_SEGMENT_SIZE
70
+ global_segment_size=_parse_global_segment_size(
71
+ config.get("global_segment_size", DEFAULT_GLOBAL_SEGMENT_SIZE)
43
72
  ),
44
73
  # Zero copy interface does not need local buffer
45
74
  local_buffer_size=DEFAULT_LOCAL_BUFFER_SIZE,
46
75
  protocol=config.get("protocol", "tcp"),
47
- device_name=config.get("device_name", "auto"),
76
+ device_name=config.get("device_name", ""),
48
77
  master_server_address=config.get("master_server_address"),
78
+ master_metrics_port=config.get(
79
+ "master_metrics_port", DEFAULT_MASTER_METRICS_PORT
80
+ ),
81
+ check_server=config.get("check_server", DEFAULT_CHECK_SERVER),
49
82
  )
50
83
 
51
84
  @staticmethod
@@ -53,7 +86,7 @@ class MooncakeStoreConfig:
53
86
  """Load config from a file specified in the environment variable.
54
87
  export MOONCAKE_MASTER=10.13.3.232:50051
55
88
  export MOONCAKE_PROTOCOL="rdma"
56
- export MOONCAKE_DEVICE="auto"
89
+ export MOONCAKE_DEVICE=""
57
90
  export MOONCAKE_TE_META_DATA_SERVER="P2PHANDSHAKE"
58
91
  """
59
92
  # other required environment variables...
@@ -62,14 +95,18 @@ class MooncakeStoreConfig:
62
95
  return MooncakeStoreConfig(
63
96
  local_hostname=os.getenv("LOCAL_HOSTNAME", "localhost"),
64
97
  metadata_server=os.getenv("MOONCAKE_TE_META_DATA_SERVER", "P2PHANDSHAKE"),
65
- global_segment_size=int(
98
+ global_segment_size=_parse_global_segment_size(
66
99
  os.getenv("MOONCAKE_GLOBAL_SEGMENT_SIZE", DEFAULT_GLOBAL_SEGMENT_SIZE)
67
100
  ),
68
101
  # Zero copy interface does not need local buffer
69
102
  local_buffer_size=DEFAULT_LOCAL_BUFFER_SIZE,
70
103
  protocol=os.getenv("MOONCAKE_PROTOCOL", "tcp"),
71
- device_name=os.getenv("MOONCAKE_DEVICE", "auto"),
104
+ device_name=os.getenv("MOONCAKE_DEVICE", ""),
72
105
  master_server_address=os.getenv("MOONCAKE_MASTER"),
106
+ master_metrics_port=int(
107
+ os.getenv("MOONCAKE_MASTER_METRICS_PORT", DEFAULT_GLOBAL_SEGMENT_SIZE)
108
+ ),
109
+ check_server=bool(os.getenv("MOONCAKE_CHECK_SERVER", DEFAULT_CHECK_SERVER)),
73
110
  )
74
111
 
75
112
  @staticmethod
@@ -81,26 +118,24 @@ class MooncakeStoreConfig:
81
118
  return MooncakeStoreConfig(
82
119
  local_hostname=extra_config.get("local_hostname", "localhost"),
83
120
  metadata_server=extra_config.get("metadata_server", "P2PHANDSHAKE"),
84
- global_segment_size=extra_config.get(
85
- "global_segment_size", DEFAULT_GLOBAL_SEGMENT_SIZE
121
+ global_segment_size=_parse_global_segment_size(
122
+ extra_config.get("global_segment_size", DEFAULT_GLOBAL_SEGMENT_SIZE)
86
123
  ),
87
124
  local_buffer_size=extra_config.get(
88
125
  "local_buffer_size", DEFAULT_LOCAL_BUFFER_SIZE
89
126
  ),
90
127
  protocol=extra_config.get("protocol", "tcp"),
91
- device_name=extra_config.get("device_name", "auto"),
128
+ device_name=extra_config.get("device_name", ""),
92
129
  master_server_address=extra_config["master_server_address"],
130
+ master_metrics_port=extra_config.get(
131
+ "master_metrics_port", DEFAULT_MASTER_METRICS_PORT
132
+ ),
133
+ check_server=extra_config.get("check_server", DEFAULT_CHECK_SERVER),
93
134
  )
94
135
 
95
- def __post_init__(self):
96
- if self.device_name == "auto":
97
- os.environ["MC_MS_AUTO_DISC"] = "1"
98
- os.environ["MC_MS_FILTERS"] = (
99
- "mlx5_bond_0, mlx5_bond_1, mlx5_bond_2, mlx5_bond_3"
100
- )
101
-
102
136
 
103
137
  class MooncakeStore(HiCacheStorage):
138
+
104
139
  def __init__(self, storage_config: HiCacheStorageConfig = None):
105
140
  try:
106
141
  from mooncake.store import MooncakeDistributedStore
@@ -129,6 +164,10 @@ class MooncakeStore(HiCacheStorage):
129
164
  logger.info(
130
165
  "Mooncake Configuration loaded from extra_config successfully."
131
166
  )
167
+ elif os.getenv(DEFAULT_MOONCAKE_CONFIG_PATH_ENV):
168
+ # Load from config file
169
+ self.config = MooncakeStoreConfig.from_file()
170
+ logger.info("Mooncake Configuration loaded from file successfully.")
132
171
  else:
133
172
  # Load from environment variables
134
173
  self.config = MooncakeStoreConfig.load_from_env()
@@ -141,6 +180,16 @@ class MooncakeStore(HiCacheStorage):
141
180
  )
142
181
  per_tp_local_buffer_size = self.config.local_buffer_size // tp_scale_factor
143
182
 
183
+ # Check if extra_backend_tag should be passed to MooncakeDistributedStore
184
+ self.extra_backend_tag = None
185
+ if extra_config and "extra_backend_tag" in extra_config:
186
+ self.extra_backend_tag = extra_config["extra_backend_tag"]
187
+ logger.info(f"Using extra_backend_tag: {self.extra_backend_tag}")
188
+
189
+ # Check server status
190
+ if self.config.check_server:
191
+ self.check_server()
192
+
144
193
  ret_code = self.store.setup(
145
194
  self.config.local_hostname,
146
195
  self.config.metadata_server,
@@ -171,6 +220,39 @@ class MooncakeStore(HiCacheStorage):
171
220
  logger.error("An error occurred while loading the configuration: %s", exc)
172
221
  raise
173
222
 
223
def check_server(self):
    """Wait until the Mooncake master's metrics endpoint answers, or time out.

    Polls ``http://<master host>:<master_metrics_port>/get_all_segments``,
    where the host is the part of ``self.config.master_server_address``
    before the first ``:``. A request that raises, or a response with an
    empty body, counts as "not ready yet" and is retried after 3 seconds.

    Raises:
        ValueError: if ``master_server_address`` is not configured, or the
            endpoint did not answer with a non-empty body within
            ``SETUP_TIMEOUT`` seconds.
    """
    master_server_address = self.config.master_server_address
    if not master_server_address:
        # The address may legitimately be None when loaded from file/env;
        # report that clearly instead of failing on None.split(...).
        raise ValueError(
            "master_server_address must be configured to check the "
            "mooncake store server"
        )

    master_server_ip = master_server_address.split(":")[0]
    segments_url = f"http://{master_server_ip}:{self.config.master_metrics_port}/get_all_segments"
    start_time = time.perf_counter()

    while time.perf_counter() - start_time < SETUP_TIMEOUT:
        try:
            server_ready = requests.get(segments_url, timeout=3).text != ""
        except Exception:
            # Any failure to reach the endpoint is treated as "not up yet".
            server_ready = False

        if server_ready:
            logger.info("Mooncake store server started successfully.")
            return

        logger.info(
            "waiting mooncake store server started, cost_time: %.2f seconds.",
            time.perf_counter() - start_time,
        )
        time.sleep(3)

    logger.error("Launch mooncake store server timeout")
    raise ValueError("Launch mooncake store server timeout")
255
+
174
256
  def warmup(self):
175
257
  warmup_key = "sglang_mooncake_store_warmup_key" + uuid.uuid4().hex
176
258
  warmup_value = bytes(4 * 1024) # 4 KB
@@ -178,7 +260,13 @@ class MooncakeStore(HiCacheStorage):
178
260
  assert self.store.is_exist(warmup_key) == 1
179
261
  assert self.store.get(warmup_key) == warmup_value
180
262
 
181
- def register_buffer(self, buffer: torch.Tensor) -> None:
263
+ def register_mem_pool_host(self, mem_pool_host: HostKVCache):
264
+ super().register_mem_pool_host(mem_pool_host)
265
+ assert self.mem_pool_host.layout in [
266
+ "page_first",
267
+ "page_first_direct",
268
+ ], "mooncake store storage backend only support page first or page first direct layout"
269
+ buffer = self.mem_pool_host.kv_buffer
182
270
  try:
183
271
  buffer_ptr = buffer.data_ptr()
184
272
  buffer_size = buffer.numel() * buffer.element_size()
@@ -189,6 +277,107 @@ class MooncakeStore(HiCacheStorage):
189
277
  logger.error("Failed to register buffer to Mooncake Store: %s", err)
190
278
  raise TypeError("Mooncake Store Register Buffer Error.") from err
191
279
 
280
+ def _get_mha_buffer_meta(self, keys, indices):
281
+ ptr_list, element_size_list = self.mem_pool_host.get_page_buffer_meta(indices)
282
+ key_list = []
283
+ for key_ in keys:
284
+ key_list.append(f"{key_}_{self.local_rank}_k")
285
+ key_list.append(f"{key_}_{self.local_rank}_v")
286
+ assert len(key_list) == len(ptr_list)
287
+ return key_list, ptr_list, element_size_list
288
+
289
+ def _get_mla_buffer_meta(self, keys, indices):
290
+ ptr_list, element_size_list = self.mem_pool_host.get_page_buffer_meta(indices)
291
+ key_list = []
292
+ for key_ in keys:
293
+ key_list.append(f"{key_}_k")
294
+ assert len(key_list) == len(ptr_list)
295
+ return key_list, ptr_list, element_size_list
296
+
297
+ def _batch_preprocess(self, keys, host_indices):
298
+ assert len(keys) > 0
299
+ assert len(keys) == len(host_indices) // self.mem_pool_host.page_size
300
+ if self.is_mla_backend:
301
+ return self._get_mla_buffer_meta(keys, host_indices)
302
+ else:
303
+ return self._get_mha_buffer_meta(keys, host_indices)
304
+
305
+ def _batch_postprocess(self, results: List[int], is_set_operate=False):
306
+ """
307
+ refer to https://github.com/kvcache-ai/Mooncake/blob/main/mooncake-store/include/pybind_client.h
308
+ for batch_get_into, results is Vector of integers,
309
+ where each element is the number of bytes read on success, or a negative value on error
310
+ for batch_put_from, results is Vector of integers,
311
+ where each element is 0 on success, or a negative value on error
312
+ """
313
+ if self.is_mla_backend:
314
+ return [k_res == 0 if is_set_operate else k_res > 0 for k_res in results]
315
+ else:
316
+ kv_pairs = zip(results[::2], results[1::2])
317
+ return [
318
+ (
319
+ (k_res == 0 and v_res == 0)
320
+ if is_set_operate
321
+ else (k_res > 0 and v_res > 0)
322
+ )
323
+ for k_res, v_res in kv_pairs
324
+ ]
325
+
326
def batch_get_v1(
    self,
    keys: List[str],
    host_indices: torch.Tensor,
    extra_info: Optional[HiCacheStorageExtraInfo] = None,
) -> List[bool]:
    """Read a batch of pages from the store into the host buffer.

    When ``self.extra_backend_tag`` is set, every key is prefixed with
    ``<tag>_`` before lookup. ``extra_info`` is accepted but not used here.

    Returns:
        One success flag per page, as produced by ``_batch_postprocess``
        in get mode.
    """
    tag = self.extra_backend_tag
    if tag is not None:
        # Apply extra_backend_tag prefix
        keys = [f"{tag}_{key}" for key in keys]

    key_strs, buffer_ptrs, buffer_sizes = self._batch_preprocess(keys, host_indices)
    return self._batch_postprocess(
        self._get_batch_zero_copy_impl(key_strs, buffer_ptrs, buffer_sizes),
        is_set_operate=False,
    )
342
+
343
def batch_set_v1(
    self,
    keys: List[str],
    host_indices: torch.Tensor,
    extra_info: Optional[HiCacheStorageExtraInfo] = None,
) -> List[bool]:
    """Write a batch of pages from the host buffer to the store.

    When ``self.extra_backend_tag`` is set, every key is prefixed with
    ``<tag>_``. Keys that ``_batch_exist`` reports as present (result 1)
    are not written again and count as success. ``extra_info`` is accepted
    but not used here.

    Returns:
        One success flag per page, as produced by ``_batch_postprocess``
        in set mode.
    """
    tag = self.extra_backend_tag
    if tag is not None:
        # Apply extra_backend_tag prefix
        keys = [f"{tag}_{key}" for key in keys]

    key_strs, buffer_ptrs, buffer_sizes = self._batch_preprocess(keys, host_indices)
    exist_result = self._batch_exist(key_strs)

    # Existing keys are pre-marked 0 (success); keys still to be written
    # start at -1 until the put result overwrites them.
    set_results = [0] * len(key_strs)
    pending = []
    for pos, _ in enumerate(key_strs):
        if exist_result[pos] != 1:
            pending.append(pos)
            set_results[pos] = -1

    # Only set non-existing keys to storage
    if pending:
        put_results = self._put_batch_zero_copy_impl(
            [key_strs[pos] for pos in pending],
            [buffer_ptrs[pos] for pos in pending],
            [buffer_sizes[pos] for pos in pending],
        )
        for slot, pos in enumerate(pending):
            set_results[pos] = put_results[slot]

    return self._batch_postprocess(set_results, is_set_operate=True)
380
+
192
381
  def set(
193
382
  self,
194
383
  key,
@@ -292,7 +481,9 @@ class MooncakeStore(HiCacheStorage):
292
481
  exist_result = self._batch_exist([key])
293
482
  return exist_result[0] == 1
294
483
 
295
- def batch_exists(self, keys) -> int:
484
+ def batch_exists(
485
+ self, keys, extra_info: Optional[HiCacheStorageExtraInfo] = None
486
+ ) -> int:
296
487
  if self.is_mla_backend:
297
488
  query_keys = [f"{key}_k" for key in keys]
298
489
  key_multiplier = 1
@@ -1,13 +1,12 @@
1
- import hashlib
2
1
  import logging
3
2
  import os
4
3
  import time
5
4
  import uuid
6
- from typing import Any, Dict, List, Optional, Tuple, Union
5
+ from typing import Any, List, Optional, Union
7
6
 
8
7
  import torch
9
8
 
10
- from sglang.srt.mem_cache.hicache_storage import HiCacheStorage
9
+ from sglang.srt.mem_cache.hicache_storage import HiCacheStorage, HiCacheStorageConfig
11
10
 
12
11
  from .nixl_utils import NixlBackendSelection, NixlFileManager, NixlRegistration
13
12
 
@@ -26,7 +25,12 @@ logger = logging.getLogger(__name__)
26
25
  class HiCacheNixl(HiCacheStorage):
27
26
  """HiCacheNixl provides high-performance storage using NIXL plugins."""
28
27
 
29
- def __init__(self, file_path: str = "/tmp/hicache_storage", plugin: str = "auto"):
28
+ def __init__(
29
+ self,
30
+ storage_config: HiCacheStorageConfig,
31
+ file_path: str = "/tmp/hicache_storage",
32
+ plugin: str = "auto",
33
+ ):
30
34
  """Initialize NIXL storage connector."""
31
35
  # Might be better to be unified across HiCache backends and moved to HiCacheController
32
36
  file_path = os.getenv("SGLANG_HICACHE_NIXL_BACKEND_STORAGE_DIR", file_path)
@@ -36,6 +40,19 @@ class HiCacheNixl(HiCacheStorage):
36
40
  else None
37
41
  )
38
42
 
43
+ # Initialize suffix based on storage config
44
+ tp_rank, tp_size, model_name, is_mla_model = (
45
+ storage_config.tp_rank,
46
+ storage_config.tp_size,
47
+ storage_config.model_name,
48
+ storage_config.is_mla_model,
49
+ )
50
+ model_name = "-".join(model_name.split("/")) if model_name else ""
51
+ if is_mla_model:
52
+ self.config_suffix = f"_{model_name}"
53
+ else:
54
+ self.config_suffix = f"_{model_name}_{tp_rank}_{tp_size}"
55
+
39
56
  agent_config = nixl_agent_config(backends=[])
40
57
  self.agent_name = f"hicache_nixl_{str(uuid.uuid4())}"
41
58
  self.agent = nixl_agent(self.agent_name, agent_config)
@@ -46,6 +63,9 @@ class HiCacheNixl(HiCacheStorage):
46
63
 
47
64
  self.registration = NixlRegistration(self.agent)
48
65
 
66
+ def _get_suffixed_key(self, key: str) -> str:
67
+ return key + self.config_suffix
68
+
49
69
  def register_buffers(
50
70
  self, buffers: Union[torch.Tensor, List[torch.Tensor], List[tuple]]
51
71
  ) -> Optional[Any]:
@@ -194,11 +214,14 @@ class HiCacheNixl(HiCacheStorage):
194
214
  else:
195
215
  dest = target_locations
196
216
 
217
+ # Add suffix to keys
218
+ suffixed_keys = [self._get_suffixed_key(key) for key in keys]
219
+
197
220
  if self.backend_selector.mem_type == "FILE":
198
- file_paths = [self.file_manager.get_file_path(key) for key in keys]
221
+ file_paths = [self.file_manager.get_file_path(key) for key in suffixed_keys]
199
222
  success = self._execute_transfer(dest, file_paths, "READ")
200
223
  else:
201
- success = self._execute_transfer(dest, keys, "READ")
224
+ success = self._execute_transfer(dest, suffixed_keys, "READ")
202
225
  return target_locations if success and not target_sizes else [None] * len(keys)
203
226
 
204
227
  def set(
@@ -227,9 +250,12 @@ class HiCacheNixl(HiCacheStorage):
227
250
  if not values:
228
251
  values = list(zip(target_locations, target_sizes))
229
252
 
253
+ # Add suffix to keys
254
+ suffixed_keys = [self._get_suffixed_key(key) for key in keys]
255
+
230
256
  if self.backend_selector.mem_type == "FILE":
231
257
  file_paths = []
232
- for key in keys:
258
+ for key in suffixed_keys:
233
259
  file_path = self.file_manager.get_file_path(key)
234
260
  # New file per set, to be updated when partial writes is added to HiCache
235
261
  if not self.file_manager.create_file(file_path):
@@ -238,11 +264,14 @@ class HiCacheNixl(HiCacheStorage):
238
264
  file_paths.append(file_path)
239
265
  return self._execute_transfer(values, file_paths, "WRITE")
240
266
  else: # mem_type == "OBJ"
241
- return self._execute_transfer(values, keys, "WRITE")
267
+ return self._execute_transfer(values, suffixed_keys, "WRITE")
242
268
 
243
269
  def exists(self, key: str) -> bool:
270
+ # Add suffix to key
271
+ suffixed_key = self._get_suffixed_key(key)
272
+
244
273
  tuples = self.registration.create_query_tuples(
245
- key,
274
+ suffixed_key,
246
275
  self.backend_selector.mem_type,
247
276
  self.file_manager if self.backend_selector.mem_type == "FILE" else None,
248
277
  )
@@ -1,6 +1,6 @@
1
1
  import logging
2
2
  import os
3
- from typing import Any, Dict, List, Optional, Tuple, Union
3
+ from typing import Any, List, Optional, Tuple, Union
4
4
 
5
5
  import torch
6
6
 
@@ -2,11 +2,12 @@
2
2
 
3
3
  import os
4
4
  import unittest
5
- from typing import List, Optional
5
+ from typing import List
6
6
  from unittest.mock import MagicMock
7
7
 
8
8
  import torch
9
9
 
10
+ from sglang.srt.mem_cache.hicache_storage import HiCacheStorageConfig
10
11
  from sglang.srt.mem_cache.storage.nixl.hicache_nixl import HiCacheNixl
11
12
  from sglang.srt.mem_cache.storage.nixl.nixl_utils import (
12
13
  NixlFileManager,
@@ -31,8 +32,22 @@ class TestNixlUnified(unittest.TestCase):
31
32
  # Create instances
32
33
  self.file_manager = NixlFileManager(self.test_dir)
33
34
  self.registration = NixlRegistration(self.mock_agent)
35
+
36
+ # Create storage config for testing
37
+ self.storage_config = HiCacheStorageConfig(
38
+ tp_rank=0,
39
+ tp_size=2,
40
+ is_mla_model=False,
41
+ is_page_first_layout=False,
42
+ model_name="test_model",
43
+ )
44
+
34
45
  try:
35
- self.hicache = HiCacheNixl(file_path=self.test_dir, plugin="POSIX")
46
+ self.hicache = HiCacheNixl(
47
+ storage_config=self.storage_config,
48
+ file_path=self.test_dir,
49
+ plugin="POSIX",
50
+ )
36
51
  except ImportError:
37
52
  self.skipTest("NIXL not available, skipping NIXL storage tests")
38
53