sglang 0.5.2rc2__py3-none-any.whl → 0.5.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (377)
  1. sglang/bench_one_batch.py +7 -9
  2. sglang/bench_one_batch_server.py +330 -31
  3. sglang/bench_serving.py +267 -32
  4. sglang/global_config.py +2 -2
  5. sglang/lang/backend/runtime_endpoint.py +1 -1
  6. sglang/launch_server.py +14 -0
  7. sglang/profiler.py +2 -2
  8. sglang/srt/batch_invariant_ops/__init__.py +27 -0
  9. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +549 -0
  10. sglang/srt/configs/__init__.py +8 -0
  11. sglang/srt/configs/device_config.py +3 -1
  12. sglang/srt/configs/dots_ocr.py +64 -0
  13. sglang/srt/configs/dots_vlm.py +139 -0
  14. sglang/srt/configs/falcon_h1.py +360 -0
  15. sglang/srt/configs/load_config.py +9 -0
  16. sglang/srt/configs/model_config.py +181 -82
  17. sglang/srt/configs/qwen3_next.py +326 -0
  18. sglang/srt/configs/qwen3_vl.py +586 -0
  19. sglang/srt/connector/__init__.py +8 -1
  20. sglang/srt/connector/remote_instance.py +82 -0
  21. sglang/srt/constrained/base_grammar_backend.py +49 -12
  22. sglang/srt/constrained/llguidance_backend.py +0 -1
  23. sglang/srt/constrained/outlines_backend.py +0 -1
  24. sglang/srt/constrained/outlines_jump_forward.py +1 -1
  25. sglang/srt/constrained/xgrammar_backend.py +30 -9
  26. sglang/srt/custom_op.py +11 -1
  27. sglang/srt/debug_utils/dump_comparator.py +81 -44
  28. sglang/srt/debug_utils/dump_loader.py +97 -0
  29. sglang/srt/debug_utils/dumper.py +21 -6
  30. sglang/srt/debug_utils/text_comparator.py +73 -11
  31. sglang/srt/disaggregation/ascend/conn.py +2 -2
  32. sglang/srt/disaggregation/ascend/transfer_engine.py +47 -9
  33. sglang/srt/disaggregation/base/conn.py +1 -1
  34. sglang/srt/disaggregation/common/conn.py +279 -108
  35. sglang/srt/disaggregation/decode.py +71 -19
  36. sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
  37. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +29 -17
  38. sglang/srt/disaggregation/fake/conn.py +1 -1
  39. sglang/srt/disaggregation/mini_lb.py +6 -445
  40. sglang/srt/disaggregation/mooncake/conn.py +55 -537
  41. sglang/srt/disaggregation/nixl/conn.py +326 -53
  42. sglang/srt/disaggregation/prefill.py +36 -17
  43. sglang/srt/disaggregation/utils.py +40 -54
  44. sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
  45. sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
  46. sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
  47. sglang/srt/distributed/parallel_state.py +156 -80
  48. sglang/srt/entrypoints/engine.py +59 -18
  49. sglang/srt/entrypoints/grpc_request_manager.py +855 -0
  50. sglang/srt/entrypoints/grpc_server.py +810 -0
  51. sglang/srt/entrypoints/http_server.py +130 -59
  52. sglang/srt/entrypoints/openai/protocol.py +112 -4
  53. sglang/srt/entrypoints/openai/serving_base.py +65 -3
  54. sglang/srt/entrypoints/openai/serving_chat.py +204 -55
  55. sglang/srt/entrypoints/openai/serving_completions.py +14 -3
  56. sglang/srt/entrypoints/openai/serving_embedding.py +9 -3
  57. sglang/srt/entrypoints/openai/serving_rerank.py +3 -1
  58. sglang/srt/entrypoints/openai/serving_responses.py +48 -3
  59. sglang/srt/entrypoints/openai/serving_score.py +1 -0
  60. sglang/srt/environ.py +285 -0
  61. sglang/srt/eplb/eplb_manager.py +2 -2
  62. sglang/srt/eplb/expert_distribution.py +26 -13
  63. sglang/srt/eplb/expert_location.py +38 -8
  64. sglang/srt/eplb/expert_location_updater.py +1 -1
  65. sglang/srt/function_call/base_format_detector.py +3 -6
  66. sglang/srt/function_call/ebnf_composer.py +11 -9
  67. sglang/srt/function_call/function_call_parser.py +9 -2
  68. sglang/srt/function_call/glm4_moe_detector.py +4 -4
  69. sglang/srt/function_call/gpt_oss_detector.py +23 -0
  70. sglang/srt/function_call/json_array_parser.py +63 -0
  71. sglang/srt/function_call/kimik2_detector.py +17 -4
  72. sglang/srt/function_call/qwen3_coder_detector.py +1 -1
  73. sglang/srt/function_call/utils.py +96 -5
  74. sglang/srt/grpc/__init__.py +1 -0
  75. sglang/srt/grpc/compile_proto.py +245 -0
  76. sglang/srt/grpc/sglang_scheduler_pb2.py +111 -0
  77. sglang/srt/grpc/sglang_scheduler_pb2.pyi +434 -0
  78. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +239 -0
  79. sglang/srt/layers/activation.py +143 -9
  80. sglang/srt/layers/attention/aiter_backend.py +14 -15
  81. sglang/srt/layers/attention/ascend_backend.py +115 -9
  82. sglang/srt/layers/attention/attention_registry.py +206 -0
  83. sglang/srt/layers/attention/base_attn_backend.py +12 -3
  84. sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
  85. sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
  86. sglang/srt/layers/attention/fla/chunk.py +242 -0
  87. sglang/srt/layers/attention/fla/chunk_delta_h.py +314 -0
  88. sglang/srt/layers/attention/fla/chunk_o.py +178 -0
  89. sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +151 -0
  90. sglang/srt/layers/attention/fla/cumsum.py +300 -0
  91. sglang/srt/layers/attention/fla/fused_recurrent.py +640 -0
  92. sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +232 -0
  93. sglang/srt/layers/attention/fla/index.py +37 -0
  94. sglang/srt/layers/attention/fla/l2norm.py +150 -0
  95. sglang/srt/layers/attention/fla/layernorm_gated.py +326 -0
  96. sglang/srt/layers/attention/fla/op.py +66 -0
  97. sglang/srt/layers/attention/fla/solve_tril.py +465 -0
  98. sglang/srt/layers/attention/fla/utils.py +331 -0
  99. sglang/srt/layers/attention/fla/wy_fast.py +158 -0
  100. sglang/srt/layers/attention/flashattention_backend.py +41 -8
  101. sglang/srt/layers/attention/flashinfer_backend.py +118 -198
  102. sglang/srt/layers/attention/flashinfer_mla_backend.py +27 -27
  103. sglang/srt/layers/attention/flashmla_backend.py +7 -5
  104. sglang/srt/layers/attention/hybrid_attn_backend.py +68 -53
  105. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +602 -0
  106. sglang/srt/layers/attention/intel_amx_backend.py +3 -0
  107. sglang/srt/layers/attention/mamba/causal_conv1d.py +129 -0
  108. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +969 -0
  109. sglang/srt/layers/attention/mamba/mamba.py +629 -0
  110. sglang/srt/layers/attention/mamba/mamba_utils.py +81 -0
  111. sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
  112. sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
  113. sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
  114. sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +264 -0
  115. sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +622 -0
  116. sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +757 -0
  117. sglang/srt/layers/attention/mamba/ops/ssd_combined.py +262 -0
  118. sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +275 -0
  119. sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
  120. sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
  121. sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
  122. sglang/srt/layers/attention/nsa/nsa_indexer.py +761 -0
  123. sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
  124. sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
  125. sglang/srt/layers/attention/nsa/transform_index.py +144 -0
  126. sglang/srt/layers/attention/nsa/utils.py +24 -0
  127. sglang/srt/layers/attention/nsa_backend.py +887 -0
  128. sglang/srt/layers/attention/tbo_backend.py +6 -6
  129. sglang/srt/layers/attention/torch_flex_backend.py +325 -0
  130. sglang/srt/layers/attention/torch_native_backend.py +12 -6
  131. sglang/srt/layers/attention/triton_backend.py +57 -7
  132. sglang/srt/layers/attention/trtllm_mha_backend.py +5 -7
  133. sglang/srt/layers/attention/trtllm_mla_backend.py +276 -39
  134. sglang/srt/layers/attention/vision.py +58 -0
  135. sglang/srt/layers/attention/wave_backend.py +4 -4
  136. sglang/srt/layers/attention/wave_ops/decode_attention.py +2 -4
  137. sglang/srt/layers/attention/wave_ops/extend_attention.py +1 -3
  138. sglang/srt/layers/communicator.py +8 -0
  139. sglang/srt/layers/dp_attention.py +41 -2
  140. sglang/srt/layers/elementwise.py +3 -1
  141. sglang/srt/layers/layernorm.py +34 -15
  142. sglang/srt/layers/linear.py +55 -7
  143. sglang/srt/layers/logits_processor.py +44 -12
  144. sglang/srt/layers/moe/__init__.py +2 -1
  145. sglang/srt/layers/moe/cutlass_w4a8_moe.py +3 -3
  146. sglang/srt/layers/moe/ep_moe/kernels.py +2 -2
  147. sglang/srt/layers/moe/ep_moe/layer.py +256 -63
  148. sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +183 -0
  149. sglang/srt/layers/moe/fused_moe_native.py +5 -3
  150. sglang/srt/layers/moe/fused_moe_triton/configs/{triton_3_4_0/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json } +35 -35
  151. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=352,device_name=NVIDIA_RTX_5880_Ada_Generation,dtype=fp8_w8a8.json +146 -0
  152. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  153. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=512,device_name=NVIDIA_H20.json +146 -0
  154. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  155. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
  156. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
  157. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  158. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
  159. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
  160. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
  161. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  162. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
  163. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -2
  164. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +7 -3
  165. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +23 -20
  166. sglang/srt/layers/moe/fused_moe_triton/layer.py +71 -70
  167. sglang/srt/layers/moe/moe_runner/__init__.py +2 -1
  168. sglang/srt/layers/moe/moe_runner/base.py +274 -1
  169. sglang/srt/layers/moe/moe_runner/runner.py +80 -0
  170. sglang/srt/layers/moe/moe_runner/triton.py +448 -0
  171. sglang/srt/layers/moe/token_dispatcher/__init__.py +16 -4
  172. sglang/srt/layers/moe/token_dispatcher/{base_dispatcher.py → base.py} +67 -17
  173. sglang/srt/layers/moe/token_dispatcher/deepep.py +118 -56
  174. sglang/srt/layers/moe/token_dispatcher/standard.py +44 -2
  175. sglang/srt/layers/moe/topk.py +30 -9
  176. sglang/srt/layers/moe/utils.py +22 -6
  177. sglang/srt/layers/parameter.py +23 -6
  178. sglang/srt/layers/quantization/awq.py +19 -7
  179. sglang/srt/layers/quantization/base_config.py +11 -6
  180. sglang/srt/layers/quantization/blockwise_int8.py +38 -27
  181. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +1 -0
  182. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +50 -30
  183. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +2 -0
  184. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +13 -1
  185. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
  186. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +2 -10
  187. sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +27 -0
  188. sglang/srt/layers/quantization/fp8.py +78 -49
  189. sglang/srt/layers/quantization/fp8_utils.py +51 -32
  190. sglang/srt/layers/quantization/gptq.py +25 -17
  191. sglang/srt/layers/quantization/modelopt_quant.py +190 -55
  192. sglang/srt/layers/quantization/moe_wna16.py +21 -18
  193. sglang/srt/layers/quantization/mxfp4.py +74 -42
  194. sglang/srt/layers/quantization/quark/quark_moe.py +48 -30
  195. sglang/srt/layers/quantization/unquant.py +135 -47
  196. sglang/srt/layers/quantization/w4afp8.py +26 -17
  197. sglang/srt/layers/quantization/w8a8_fp8.py +35 -20
  198. sglang/srt/layers/quantization/w8a8_int8.py +91 -41
  199. sglang/srt/layers/rotary_embedding.py +78 -31
  200. sglang/srt/layers/sampler.py +213 -21
  201. sglang/srt/layers/utils.py +23 -0
  202. sglang/srt/lora/backend/base_backend.py +50 -8
  203. sglang/srt/lora/backend/chunked_backend.py +348 -0
  204. sglang/srt/lora/backend/triton_backend.py +99 -5
  205. sglang/srt/lora/layers.py +32 -0
  206. sglang/srt/lora/lora.py +8 -3
  207. sglang/srt/lora/lora_manager.py +52 -118
  208. sglang/srt/lora/mem_pool.py +25 -11
  209. sglang/srt/lora/triton_ops/__init__.py +4 -0
  210. sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
  211. sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +174 -0
  212. sglang/srt/lora/utils.py +22 -11
  213. sglang/srt/managers/async_dynamic_batch_tokenizer.py +170 -0
  214. sglang/srt/managers/cache_controller.py +199 -301
  215. sglang/srt/managers/data_parallel_controller.py +115 -80
  216. sglang/srt/managers/detokenizer_manager.py +19 -15
  217. sglang/srt/managers/disagg_service.py +46 -0
  218. sglang/srt/managers/io_struct.py +340 -109
  219. sglang/srt/managers/mm_utils.py +44 -6
  220. sglang/srt/managers/multi_tokenizer_mixin.py +357 -407
  221. sglang/srt/managers/multimodal_processor.py +1 -2
  222. sglang/srt/managers/overlap_utils.py +53 -0
  223. sglang/srt/managers/schedule_batch.py +240 -138
  224. sglang/srt/managers/schedule_policy.py +144 -17
  225. sglang/srt/managers/scheduler.py +502 -209
  226. sglang/srt/managers/scheduler_input_blocker.py +1 -1
  227. sglang/srt/managers/scheduler_metrics_mixin.py +99 -126
  228. sglang/srt/managers/scheduler_output_processor_mixin.py +75 -22
  229. sglang/srt/managers/scheduler_profiler_mixin.py +6 -6
  230. sglang/srt/managers/scheduler_update_weights_mixin.py +7 -0
  231. sglang/srt/managers/tokenizer_communicator_mixin.py +675 -0
  232. sglang/srt/managers/tokenizer_manager.py +320 -632
  233. sglang/srt/managers/tp_worker.py +81 -22
  234. sglang/srt/managers/tp_worker_overlap_thread.py +71 -56
  235. sglang/srt/managers/utils.py +1 -45
  236. sglang/srt/mem_cache/allocator.py +14 -20
  237. sglang/srt/mem_cache/allocator_ascend.py +41 -27
  238. sglang/srt/mem_cache/base_prefix_cache.py +1 -1
  239. sglang/srt/mem_cache/chunk_cache.py +8 -1
  240. sglang/srt/mem_cache/evict_policy.py +23 -0
  241. sglang/srt/mem_cache/hicache_storage.py +43 -24
  242. sglang/srt/mem_cache/hiradix_cache.py +222 -75
  243. sglang/srt/mem_cache/memory_pool.py +535 -58
  244. sglang/srt/mem_cache/memory_pool_host.py +239 -228
  245. sglang/srt/mem_cache/radix_cache.py +222 -73
  246. sglang/srt/mem_cache/radix_cache_cpp.py +11 -8
  247. sglang/srt/mem_cache/storage/__init__.py +10 -0
  248. sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +151 -0
  249. sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +109 -0
  250. sglang/srt/mem_cache/storage/backend_factory.py +223 -0
  251. sglang/srt/mem_cache/storage/eic/eic_storage.py +778 -0
  252. sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
  253. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +164 -0
  254. sglang/srt/mem_cache/storage/hf3fs/{client_hf3fs.py → hf3fs_usrbio_client.py} +5 -1
  255. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +259 -62
  256. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +284 -0
  257. sglang/srt/mem_cache/storage/lmcache/unit_test.py +121 -0
  258. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +166 -17
  259. sglang/srt/mem_cache/swa_radix_cache.py +25 -36
  260. sglang/srt/metrics/collector.py +511 -132
  261. sglang/srt/metrics/func_timer.py +2 -7
  262. sglang/srt/metrics/startup_func_log_and_timer.py +150 -0
  263. sglang/srt/metrics/utils.py +8 -1
  264. sglang/srt/model_executor/cpu_graph_runner.py +640 -0
  265. sglang/srt/model_executor/cuda_graph_runner.py +52 -37
  266. sglang/srt/model_executor/forward_batch_info.py +82 -40
  267. sglang/srt/model_executor/model_runner.py +432 -157
  268. sglang/srt/model_executor/npu_graph_runner.py +12 -5
  269. sglang/srt/model_loader/__init__.py +9 -3
  270. sglang/srt/model_loader/loader.py +133 -5
  271. sglang/srt/model_loader/remote_instance_weight_loader_utils.py +69 -0
  272. sglang/srt/model_loader/weight_utils.py +158 -3
  273. sglang/srt/models/apertus.py +686 -0
  274. sglang/srt/models/bailing_moe.py +820 -217
  275. sglang/srt/models/bailing_moe_nextn.py +168 -0
  276. sglang/srt/models/deepseek_nextn.py +6 -1
  277. sglang/srt/models/deepseek_v2.py +607 -130
  278. sglang/srt/models/dots_ocr.py +173 -0
  279. sglang/srt/models/dots_vlm.py +174 -0
  280. sglang/srt/models/dots_vlm_vit.py +337 -0
  281. sglang/srt/models/ernie4.py +1 -1
  282. sglang/srt/models/falcon_h1.py +576 -0
  283. sglang/srt/models/gemma3_causal.py +0 -2
  284. sglang/srt/models/gemma3_mm.py +1 -1
  285. sglang/srt/models/gemma3n_mm.py +2 -2
  286. sglang/srt/models/glm4_moe.py +4 -4
  287. sglang/srt/models/glm4_moe_nextn.py +2 -2
  288. sglang/srt/models/glm4v.py +5 -3
  289. sglang/srt/models/glm4v_moe.py +4 -1
  290. sglang/srt/models/gpt_oss.py +8 -31
  291. sglang/srt/models/kimi_vl_moonvit.py +2 -2
  292. sglang/srt/models/llama.py +4 -0
  293. sglang/srt/models/llama4.py +9 -0
  294. sglang/srt/models/llama_eagle3.py +13 -0
  295. sglang/srt/models/longcat_flash.py +3 -3
  296. sglang/srt/models/longcat_flash_nextn.py +1 -1
  297. sglang/srt/models/mllama4.py +40 -4
  298. sglang/srt/models/opt.py +637 -0
  299. sglang/srt/models/qwen2_5_vl.py +29 -5
  300. sglang/srt/models/qwen2_audio.py +1 -1
  301. sglang/srt/models/qwen2_moe.py +120 -13
  302. sglang/srt/models/qwen2_vl.py +1 -1
  303. sglang/srt/models/qwen3.py +18 -3
  304. sglang/srt/models/qwen3_moe.py +32 -4
  305. sglang/srt/models/qwen3_next.py +1069 -0
  306. sglang/srt/models/qwen3_next_mtp.py +112 -0
  307. sglang/srt/models/qwen3_vl.py +787 -0
  308. sglang/srt/models/qwen3_vl_moe.py +471 -0
  309. sglang/srt/models/registry.py +15 -3
  310. sglang/srt/models/sarashina2_vision.py +269 -0
  311. sglang/srt/models/solar.py +505 -0
  312. sglang/srt/models/starcoder2.py +357 -0
  313. sglang/srt/models/step3_vl.py +1 -1
  314. sglang/srt/models/torch_native_llama.py +9 -2
  315. sglang/srt/models/utils.py +51 -0
  316. sglang/srt/multimodal/processors/base_processor.py +15 -7
  317. sglang/srt/multimodal/processors/dots_vlm.py +98 -0
  318. sglang/srt/multimodal/processors/glm4v.py +9 -9
  319. sglang/srt/multimodal/processors/internvl.py +153 -129
  320. sglang/srt/multimodal/processors/qwen_vl.py +23 -6
  321. sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
  322. sglang/srt/offloader.py +27 -3
  323. sglang/srt/parser/jinja_template_utils.py +6 -0
  324. sglang/srt/sampling/sampling_batch_info.py +38 -17
  325. sglang/srt/sampling/sampling_params.py +7 -0
  326. sglang/srt/server_args.py +966 -267
  327. sglang/srt/server_args_config_parser.py +146 -0
  328. sglang/srt/single_batch_overlap.py +151 -0
  329. sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
  330. sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
  331. sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
  332. sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
  333. sglang/srt/speculative/cpp_ngram/param.h +125 -0
  334. sglang/srt/speculative/cpp_ngram/queue.h +71 -0
  335. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +7 -1
  336. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +13 -2
  337. sglang/srt/speculative/{eagle_utils.py → eagle_info.py} +207 -757
  338. sglang/srt/speculative/eagle_worker.py +99 -28
  339. sglang/srt/speculative/ngram_utils.py +428 -0
  340. sglang/srt/speculative/ngram_worker.py +245 -0
  341. sglang/srt/speculative/spec_info.py +52 -0
  342. sglang/srt/speculative/spec_utils.py +606 -0
  343. sglang/srt/speculative/standalone_worker.py +109 -0
  344. sglang/srt/torch_memory_saver_adapter.py +5 -7
  345. sglang/srt/tracing/trace.py +578 -0
  346. sglang/srt/two_batch_overlap.py +8 -5
  347. sglang/srt/utils/__init__.py +2 -0
  348. sglang/srt/{utils.py → utils/common.py} +433 -77
  349. sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +53 -5
  350. sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
  351. sglang/srt/utils/rpd_utils.py +452 -0
  352. sglang/srt/utils/slow_rank_detector.py +71 -0
  353. sglang/srt/warmup.py +8 -4
  354. sglang/srt/weight_sync/utils.py +2 -2
  355. sglang/test/attention/test_trtllm_mla_backend.py +169 -5
  356. sglang/test/get_logits_ut.py +57 -0
  357. sglang/test/run_eval.py +79 -11
  358. sglang/test/runners.py +5 -1
  359. sglang/test/simple_eval_common.py +5 -2
  360. sglang/test/simple_eval_mmmu_vlm.py +441 -0
  361. sglang/test/test_block_fp8.py +2 -2
  362. sglang/test/test_cutlass_moe.py +24 -6
  363. sglang/test/test_deterministic.py +297 -0
  364. sglang/test/test_disaggregation_utils.py +77 -0
  365. sglang/test/test_fp4_moe.py +370 -1
  366. sglang/test/test_programs.py +1 -1
  367. sglang/test/test_utils.py +383 -5
  368. sglang/utils.py +21 -1
  369. sglang/version.py +1 -1
  370. {sglang-0.5.2rc2.dist-info → sglang-0.5.3.dist-info}/METADATA +69 -124
  371. {sglang-0.5.2rc2.dist-info → sglang-0.5.3.dist-info}/RECORD +375 -245
  372. sglang/srt/disaggregation/launch_lb.py +0 -118
  373. sglang/srt/mem_cache/lora_radix_cache.py +0 -421
  374. /sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
  375. {sglang-0.5.2rc2.dist-info → sglang-0.5.3.dist-info}/WHEEL +0 -0
  376. {sglang-0.5.2rc2.dist-info → sglang-0.5.3.dist-info}/licenses/LICENSE +0 -0
  377. {sglang-0.5.2rc2.dist-info → sglang-0.5.3.dist-info}/top_level.txt +0 -0
@@ -5,14 +5,21 @@ import logging
5
5
  import os
6
6
  import signal
7
7
  import threading
8
+ import time
8
9
  from abc import ABC, abstractmethod
9
10
  from functools import wraps
10
11
  from typing import Any, List, Optional, Tuple
11
12
 
12
13
  import torch
13
14
 
14
- from sglang.srt.mem_cache.hicache_storage import HiCacheStorage, HiCacheStorageConfig
15
- from sglang.srt.mem_cache.storage.hf3fs.client_hf3fs import Hf3fsClient
15
+ from sglang.srt.mem_cache.hicache_storage import (
16
+ HiCacheStorage,
17
+ HiCacheStorageConfig,
18
+ HiCacheStorageExtraInfo,
19
+ )
20
+ from sglang.srt.mem_cache.memory_pool_host import HostKVCache
21
+ from sglang.srt.mem_cache.storage.hf3fs.hf3fs_client import Hf3fsClient
22
+ from sglang.srt.metrics.collector import StorageMetrics
16
23
 
17
24
  logger = logging.getLogger(__name__)
18
25
 
@@ -112,6 +119,33 @@ def synchronized():
112
119
  return _decorator
113
120
 
114
121
 
122
+ def create_hf3fs_client(
123
+ path: str, size: int, bytes_per_page: int, entries: int, use_mock: bool = False
124
+ ) -> Hf3fsClient:
125
+ """Factory function to create appropriate HF3FS client.
126
+
127
+ Args:
128
+ path: File path for storage
129
+ size: Total size of storage file
130
+ bytes_per_page: Bytes per page
131
+ entries: Number of entries for batch operations
132
+ use_mock: Whether to use mock client instead of real usrbio client
133
+
134
+ Returns:
135
+ """
136
+ if use_mock:
137
+ from sglang.srt.mem_cache.storage.hf3fs.hf3fs_client import Hf3fsMockClient
138
+
139
+ logger.info(f"[Rank Using Hf3fsMockClient for testing")
140
+ return Hf3fsMockClient(path, size, bytes_per_page, entries)
141
+ else:
142
+ from sglang.srt.mem_cache.storage.hf3fs.hf3fs_usrbio_client import (
143
+ Hf3fsUsrBioClient,
144
+ )
145
+
146
+ return Hf3fsUsrBioClient(path, size, bytes_per_page, entries)
147
+
148
+
115
149
  class HiCacheHF3FS(HiCacheStorage):
116
150
  """HiCache backend that stores KV cache pages in HF3FS files."""
117
151
 
@@ -129,12 +163,14 @@ class HiCacheHF3FS(HiCacheStorage):
129
163
  metadata_client: Hf3fsMetadataInterface,
130
164
  is_mla_model: bool = False,
131
165
  is_page_first_layout: bool = False,
166
+ use_mock_client: bool = False,
132
167
  ):
133
168
  self.rank = rank
134
169
  self.file_path = file_path
135
170
  self.file_size = file_size
136
171
  self.numjobs = numjobs
137
172
  self.bytes_per_page = bytes_per_page
173
+ self.gb_per_page = bytes_per_page / (1 << 30)
138
174
  self.entries = entries
139
175
  self.dtype = dtype
140
176
  self.metadata_client = metadata_client
@@ -147,17 +183,24 @@ class HiCacheHF3FS(HiCacheStorage):
147
183
  self.skip_backup = True
148
184
  self.rank = 0
149
185
 
186
+ self.is_zero_copy = False
187
+
150
188
  logger.info(
151
189
  f"[Rank {self.rank}] HiCacheHF3FS Client Initializing: "
152
190
  f"file_path={self.file_path}, "
153
191
  f"file_size={self.file_size / (2 ** 30):.2f} GB, "
154
- f"num_pages={self.num_pages}"
192
+ f"num_pages={self.num_pages}, "
193
+ f"is_mla_model={self.is_mla_model}"
155
194
  )
156
195
 
157
196
  self.ac = AtomicCounter(self.numjobs)
158
197
  self.clients = [
159
- Hf3fsClient(
160
- self.file_path, self.file_size, self.bytes_per_page, self.entries
198
+ create_hf3fs_client(
199
+ self.file_path,
200
+ self.file_size,
201
+ self.bytes_per_page,
202
+ self.entries,
203
+ use_mock_client,
161
204
  )
162
205
  for _ in range(numjobs)
163
206
  ]
@@ -174,6 +217,11 @@ class HiCacheHF3FS(HiCacheStorage):
174
217
  signal.signal(signal.SIGTERM, lambda sig, frame: self.close())
175
218
  signal.signal(signal.SIGQUIT, lambda sig, frame: self.close())
176
219
 
220
+ self.prefetch_pgs = []
221
+ self.backup_pgs = []
222
+ self.prefetch_bandwidth = []
223
+ self.backup_bandwidth = []
224
+
177
225
  @staticmethod
178
226
  def from_env_config(
179
227
  bytes_per_page: int,
@@ -194,14 +242,24 @@ class HiCacheHF3FS(HiCacheStorage):
194
242
  Hf3fsLocalMetadataClient,
195
243
  )
196
244
 
245
+ use_mock_client = False
197
246
  if storage_config is not None:
198
247
  rank, is_mla_model, is_page_first_layout = (
199
248
  storage_config.tp_rank,
200
249
  storage_config.is_mla_model,
201
250
  storage_config.is_page_first_layout,
202
251
  )
252
+
253
+ if storage_config.extra_config is not None:
254
+ use_mock_client = storage_config.extra_config.get(
255
+ "use_mock_hf3fs_client", False
256
+ )
203
257
  else:
204
- rank, is_mla_model, is_page_first_layout = 0, False, False
258
+ rank, is_mla_model, is_page_first_layout = (
259
+ 0,
260
+ False,
261
+ False,
262
+ )
205
263
 
206
264
  mla_unsupported_msg = f"MLA model is not supported without global metadata server, please refer to https://github.com/sgl-project/sglang/blob/main/python/sglang/srt/mem_cache/storage/hf3fs/docs/deploy_sglang_3fs_multinode.md"
207
265
 
@@ -220,6 +278,7 @@ class HiCacheHF3FS(HiCacheStorage):
220
278
  dtype=dtype,
221
279
  metadata_client=Hf3fsLocalMetadataClient(),
222
280
  is_page_first_layout=is_page_first_layout,
281
+ use_mock_client=use_mock_client,
223
282
  )
224
283
 
225
284
  try:
@@ -269,27 +328,15 @@ class HiCacheHF3FS(HiCacheStorage):
269
328
  metadata_client=metadata_client,
270
329
  is_mla_model=is_mla_model,
271
330
  is_page_first_layout=is_page_first_layout,
331
+ use_mock_client=use_mock_client,
272
332
  )
273
333
 
274
- def get(
275
- self,
276
- key: str,
277
- target_location: Optional[Any] = None,
278
- target_sizes: Optional[Any] = None,
279
- ) -> torch.Tensor | None:
280
- return self.batch_get(
281
- [key],
282
- [target_location] if target_location is not None else None,
283
- [target_sizes] if target_sizes is not None else None,
284
- )[0]
285
-
286
334
  @synchronized()
287
- def batch_get(
335
+ def _batch_get(
288
336
  self,
289
337
  keys: List[str],
290
- target_locations: Optional[Any] = None,
291
- target_sizes: Optional[Any] = None,
292
- ) -> List[torch.Tensor | None]:
338
+ values: List[torch.Tensor],
339
+ ) -> List[bool]:
293
340
  page_indices = self.metadata_client.get_page_indices(self.rank, keys)
294
341
 
295
342
  batch_indices, file_offsets = [], []
@@ -298,15 +345,11 @@ class HiCacheHF3FS(HiCacheStorage):
298
345
  batch_indices.append(i)
299
346
  file_offsets.append(page_index * self.bytes_per_page)
300
347
 
301
- if target_locations is not None:
302
- for target_location in target_locations:
303
- assert target_location.is_contiguous()
304
- file_results = target_locations
305
- else:
306
- file_results = [
307
- torch.empty(self.numel, dtype=self.dtype)
308
- for _ in range(len(batch_indices))
309
- ]
348
+ for target_location in values:
349
+ assert target_location.is_contiguous()
350
+ file_results = values
351
+
352
+ start_time = time.perf_counter()
310
353
 
311
354
  futures = [
312
355
  self.executor.submit(
@@ -318,12 +361,17 @@ class HiCacheHF3FS(HiCacheStorage):
318
361
  ]
319
362
  read_results = [result for future in futures for result in future.result()]
320
363
 
321
- results = [None] * len(keys)
322
- for batch_index, file_result, read_result in zip(
323
- batch_indices, file_results, read_results
324
- ):
364
+ end_time = time.perf_counter()
365
+ ionum = len(batch_indices)
366
+ self.prefetch_pgs.append(ionum)
367
+ self.prefetch_bandwidth.append(
368
+ ionum / (end_time - start_time) * self.gb_per_page
369
+ )
370
+
371
+ results = [False] * len(keys)
372
+ for batch_index, read_result in zip(batch_indices, read_results):
325
373
  if read_result == self.bytes_per_page:
326
- results[batch_index] = file_result
374
+ results[batch_index] = True
327
375
  else:
328
376
  logger.error(
329
377
  f"[Rank {self.rank}] HiCacheHF3FS get {keys[batch_index]} failed"
@@ -331,27 +379,12 @@ class HiCacheHF3FS(HiCacheStorage):
331
379
 
332
380
  return results
333
381
 
334
- def set(
335
- self,
336
- key: str,
337
- value: Optional[Any] = None,
338
- target_location: Optional[Any] = None,
339
- target_sizes: Optional[Any] = None,
340
- ) -> bool:
341
- return self.batch_set(
342
- [key],
343
- [value] if value is not None else None,
344
- [target_location] if target_location is not None else None,
345
- [target_sizes] if target_sizes is not None else None,
346
- )
347
-
348
- def batch_set(
382
+ @synchronized()
383
+ def _batch_set(
349
384
  self,
350
385
  keys: List[str],
351
386
  values: Optional[Any] = None,
352
- target_locations: Optional[Any] = None,
353
- target_sizes: Optional[Any] = None,
354
- ) -> bool:
387
+ ) -> List[bool]:
355
388
  # In MLA backend, only one rank needs to backup the KV cache
356
389
  if self.skip_backup:
357
390
  return True
@@ -374,6 +407,8 @@ class HiCacheHF3FS(HiCacheStorage):
374
407
  assert value.is_contiguous()
375
408
  file_values.append(value)
376
409
 
410
+ start_time = time.perf_counter()
411
+
377
412
  futures = [
378
413
  self.executor.submit(
379
414
  self.clients[self.ac.next()].batch_write,
@@ -388,6 +423,11 @@ class HiCacheHF3FS(HiCacheStorage):
388
423
  for result in future.result()
389
424
  ]
390
425
 
426
+ end_time = time.perf_counter()
427
+ ionum = len(batch_indices)
428
+ self.backup_pgs.append(ionum)
429
+ self.backup_bandwidth.append(ionum / (end_time - start_time) * self.gb_per_page)
430
+
391
431
  written_keys_to_confirm = []
392
432
  results = [index[0] for index in indices]
393
433
  for batch_index, write_result in zip(batch_indices, write_results):
@@ -405,7 +445,7 @@ class HiCacheHF3FS(HiCacheStorage):
405
445
  self.rank, written_keys_to_confirm, pages_to_release
406
446
  )
407
447
 
408
- return all(results)
448
+ return results
409
449
 
410
450
  def delete(self, key: str) -> None:
411
451
  self.metadata_client.delete_keys(self.rank, [key])
@@ -415,21 +455,25 @@ class HiCacheHF3FS(HiCacheStorage):
415
455
  return result[0] if result else False
416
456
 
417
457
def batch_exists(self, keys: List[str]) -> int:
    """Return how many leading entries of ``keys`` exist in the metadata store.

    In zero-copy mode for non-MLA models every key is expanded into a
    ``-k``/``-v`` pair before the lookup, and the length of the leading run
    of hits is divided by two so the result is again counted in input keys.

    Args:
        keys: Keys to probe, in priority order.

    Returns:
        The number of consecutive keys, starting from the first, that exist.
    """
    factor = 1
    if self.is_zero_copy and not self.is_mla_model:
        keys = self._get_mha_zero_copy_keys(keys)
        factor = 2

    results = self.metadata_client.exists(self.rank, keys)

    # zip() stops at the shorter sequence, so an empty or truncated reply
    # counts as misses instead of raising IndexError (the single-key
    # ``exists`` guards against an empty result in the same spirit).
    hit_count = 0
    for _, found in zip(keys, results or ()):
        if not found:
            break
        hit_count += 1

    return hit_count // factor
470
+
471
def clear(self) -> None:
    """Ask the metadata client to clear the entries of this rank.

    Any exception raised along the way is logged and swallowed; the method
    returns nothing in either case.
    """
    try:
        self.metadata_client.clear(self.rank)
        logger.info(f"Cleared HiCacheHF3FS for rank {self.rank}")
    except Exception as e:
        # Report the failure without propagating it to the caller.
        logger.error(f"Failed to clear HiCacheHF3FS: {e}")
433
477
 
434
478
  def close(self) -> None:
435
479
  try:
@@ -439,3 +483,156 @@ class HiCacheHF3FS(HiCacheStorage):
439
483
  except Exception as e:
440
484
  logger.error(f"close HiCacheHF3FS: {e}")
441
485
  logger.info("close HiCacheHF3FS")
486
+
487
@synchronized()
def get_stats(self):
    """Return the metrics accumulated so far and reset the accumulators.

    The prefetch/backup page counts and bandwidth samples held on the
    instance are copied into a fresh ``StorageMetrics`` object, after which
    the instance-level lists are emptied.
    """
    snapshot = StorageMetrics()
    for field in (
        "prefetch_pgs",
        "backup_pgs",
        "prefetch_bandwidth",
        "backup_bandwidth",
    ):
        pending = getattr(self, field)
        getattr(snapshot, field).extend(pending)
        # Empty the accumulator in place once its samples are copied.
        pending.clear()
    return snapshot
499
+
500
def register_mem_pool_host(self, mem_pool_host: HostKVCache):
    """Register the host KV cache pool and derive the zero-copy flag.

    Registration itself is delegated to the parent class; afterwards
    ``is_zero_copy`` is set to whether the registered pool's ``layout``
    equals ``"page_first"``, and the resulting flag is logged.
    """
    super().register_mem_pool_host(mem_pool_host)
    layout = self.mem_pool_host.layout
    self.is_zero_copy = layout == "page_first"
    logger.info(f"{self.is_zero_copy=}")
504
+
505
+ def _get_mha_zero_copy_keys(self, keys: List[str]) -> List[str]:
506
+ _keys = []
507
+ for k in keys:
508
+ _keys.append(f"{k}-k")
509
+ _keys.append(f"{k}-v")
510
+ return _keys
511
+
512
+ def _get_mha_zero_copy_values(
513
+ self, values: List[torch.Tensor]
514
+ ) -> List[torch.Tensor]:
515
+ _values = []
516
+ for value in values:
517
+ _values.append(value[0])
518
+ _values.append(value[1])
519
+ return _values
520
+
521
+ def _batch_get_preprocess(self, keys, host_indices):
522
+ page_num = len(host_indices) // self.mem_pool_host.page_size
523
+ # host_indices to kv_buffer
524
+ flat = not self.is_zero_copy
525
+ values = (
526
+ [
527
+ self.mem_pool_host.get_data_page(
528
+ host_indices[i * self.mem_pool_host.page_size], flat=flat
529
+ )
530
+ for i in range(page_num)
531
+ ]
532
+ if self.is_zero_copy
533
+ else [
534
+ self.mem_pool_host.get_dummy_flat_data_page() for _ in range(page_num)
535
+ ]
536
+ )
537
+
538
+ if self.is_zero_copy and not self.is_mla_model:
539
+ keys = self._get_mha_zero_copy_keys(keys)
540
+ values = self._get_mha_zero_copy_values(values)
541
+
542
+ return keys, values
543
+
544
+ def _batch_get_postprocess(self, host_indices, values, results):
545
+ page_num = len(host_indices) // self.mem_pool_host.page_size
546
+
547
+ if self.is_zero_copy:
548
+ if not self.is_mla_model:
549
+ results = [
550
+ (results[2 * i] and results[2 * i + 1]) for i in range(page_num)
551
+ ]
552
+ results = results[:page_num]
553
+ return results
554
+
555
+ for i in range(page_num):
556
+ if not results[i]:
557
+ break
558
+ self.mem_pool_host.set_from_flat_data_page(
559
+ host_indices[i * self.mem_pool_host.page_size], values[i]
560
+ )
561
+
562
+ return results
563
+
564
def batch_get_v1(
    self,
    keys: List[str],
    host_indices: torch.Tensor,
    extra_info: Optional[HiCacheStorageExtraInfo] = None,
) -> List[bool]:
    """Read the pages stored under ``keys`` into the host pool.

    Runs the three read stages in order: build storage keys and buffers,
    perform the batched read, then post-process the raw results.
    ``extra_info`` is accepted but not used here.

    Returns:
        The flags produced by ``_batch_get_postprocess``.
    """
    storage_keys, buffers = self._batch_get_preprocess(keys, host_indices)
    raw_results = self._batch_get(storage_keys, buffers)
    return self._batch_get_postprocess(host_indices, buffers, raw_results)
573
+
574
+ def _batch_set_preprocess(self, keys, host_indices):
575
+ page_num = len(host_indices) // self.mem_pool_host.page_size
576
+ # host_indices to kv_buffer
577
+ flat = not self.is_zero_copy
578
+ values = [
579
+ self.mem_pool_host.get_data_page(
580
+ host_indices[i * self.mem_pool_host.page_size], flat=flat
581
+ )
582
+ for i in range(page_num)
583
+ ]
584
+
585
+ if self.is_zero_copy and not self.is_mla_model:
586
+ keys = self._get_mha_zero_copy_keys(keys)
587
+ values = self._get_mha_zero_copy_values(values)
588
+
589
+ return keys, values
590
+
591
def batch_set_v1(
    self,
    keys: List[str],
    host_indices: torch.Tensor,
    extra_info: Optional[HiCacheStorageExtraInfo] = None,
) -> List[bool]:
    """Write the host-pool pages addressed by ``host_indices`` under ``keys``.

    Args:
        keys: One storage key per page.
        host_indices: Host pool indices; the first index of each page selects
            the data page to write.
        extra_info: Accepted for interface compatibility; not used here.

    Returns:
        One flag per entry of ``keys``.
    """
    key_count = len(keys)
    storage_keys, values = self._batch_set_preprocess(keys, host_indices)
    results = self._batch_set(storage_keys, values)

    # ``_batch_set`` short-circuits with a bare ``True`` when backup is
    # skipped; expand it so callers always receive a list.
    if isinstance(results, bool):
        return [results] * key_count

    # For zero-copy non-MLA models every key was split into a K and a V
    # entry; fold each pair back into one flag per input key, mirroring what
    # the read path does for its results.
    if self.is_zero_copy and not self.is_mla_model:
        return [
            (k_ok and v_ok) for k_ok, v_ok in zip(results[0::2], results[1::2])
        ]

    return results
601
+
602
+ # Deprecated
603
+ def get(
604
+ self,
605
+ key: str,
606
+ target_location: Optional[Any] = None,
607
+ target_sizes: Optional[Any] = None,
608
+ ) -> torch.Tensor | None:
609
+ pass
610
+
611
+ # Deprecated
612
+ def batch_get(
613
+ self,
614
+ keys: List[str],
615
+ target_locations: Optional[Any] = None,
616
+ target_sizes: Optional[Any] = None,
617
+ ) -> List[torch.Tensor | None] | int:
618
+ pass
619
+
620
+ # Deprecated
621
def set(
    self,
    key: str,
    value: Optional[Any] = None,
    target_location: Optional[Any] = None,
    target_sizes: Optional[Any] = None,
) -> bool:
    """Deprecated entry point; performs no work and returns ``None``."""
    return None
629
+
630
+ # Deprecated
631
def batch_set(
    self,
    keys: List[str],
    values: Optional[Any] = None,
    target_locations: Optional[Any] = None,
    target_sizes: Optional[Any] = None,
) -> bool:
    """Deprecated entry point; performs no work and returns ``None``."""
    return None