sglang 0.5.3rc0__py3-none-any.whl → 0.5.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (482)
  1. sglang/bench_one_batch.py +54 -37
  2. sglang/bench_one_batch_server.py +340 -34
  3. sglang/bench_serving.py +340 -159
  4. sglang/check_env.py +1 -1
  5. sglang/compile_deep_gemm.py +6 -2
  6. sglang/global_config.py +1 -25
  7. sglang/lang/api.py +6 -0
  8. sglang/lang/backend/runtime_endpoint.py +1 -1
  9. sglang/lang/interpreter.py +1 -0
  10. sglang/lang/ir.py +13 -0
  11. sglang/launch_server.py +9 -2
  12. sglang/profiler.py +20 -3
  13. sglang/srt/_custom_ops.py +1 -1
  14. sglang/srt/batch_invariant_ops/__init__.py +27 -0
  15. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +547 -0
  16. sglang/srt/checkpoint_engine/checkpoint_engine_worker.py +142 -0
  17. sglang/srt/compilation/backend.py +437 -0
  18. sglang/srt/compilation/compilation_config.py +20 -0
  19. sglang/srt/compilation/compilation_counter.py +47 -0
  20. sglang/srt/compilation/compile.py +210 -0
  21. sglang/srt/compilation/compiler_interface.py +503 -0
  22. sglang/srt/compilation/cuda_piecewise_backend.py +228 -0
  23. sglang/srt/compilation/fix_functionalization.py +134 -0
  24. sglang/srt/compilation/fx_utils.py +83 -0
  25. sglang/srt/compilation/inductor_pass.py +140 -0
  26. sglang/srt/compilation/pass_manager.py +66 -0
  27. sglang/srt/compilation/piecewise_context_manager.py +40 -0
  28. sglang/srt/compilation/weak_ref_tensor_jit.py +16 -0
  29. sglang/srt/configs/__init__.py +8 -0
  30. sglang/srt/configs/deepseek_ocr.py +262 -0
  31. sglang/srt/configs/deepseekvl2.py +194 -96
  32. sglang/srt/configs/dots_ocr.py +64 -0
  33. sglang/srt/configs/dots_vlm.py +2 -7
  34. sglang/srt/configs/falcon_h1.py +309 -0
  35. sglang/srt/configs/load_config.py +33 -2
  36. sglang/srt/configs/mamba_utils.py +117 -0
  37. sglang/srt/configs/model_config.py +284 -118
  38. sglang/srt/configs/modelopt_config.py +30 -0
  39. sglang/srt/configs/nemotron_h.py +286 -0
  40. sglang/srt/configs/olmo3.py +105 -0
  41. sglang/srt/configs/points_v15_chat.py +29 -0
  42. sglang/srt/configs/qwen3_next.py +11 -47
  43. sglang/srt/configs/qwen3_omni.py +613 -0
  44. sglang/srt/configs/qwen3_vl.py +576 -0
  45. sglang/srt/connector/remote_instance.py +1 -1
  46. sglang/srt/constrained/base_grammar_backend.py +6 -1
  47. sglang/srt/constrained/llguidance_backend.py +5 -0
  48. sglang/srt/constrained/outlines_backend.py +1 -1
  49. sglang/srt/constrained/outlines_jump_forward.py +1 -1
  50. sglang/srt/constrained/reasoner_grammar_backend.py +9 -6
  51. sglang/srt/constrained/utils.py +12 -0
  52. sglang/srt/constrained/xgrammar_backend.py +26 -15
  53. sglang/srt/debug_utils/dumper.py +10 -3
  54. sglang/srt/disaggregation/ascend/conn.py +2 -2
  55. sglang/srt/disaggregation/ascend/transfer_engine.py +48 -10
  56. sglang/srt/disaggregation/base/conn.py +17 -4
  57. sglang/srt/disaggregation/common/conn.py +268 -98
  58. sglang/srt/disaggregation/decode.py +172 -39
  59. sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
  60. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +25 -16
  61. sglang/srt/disaggregation/fake/conn.py +11 -3
  62. sglang/srt/disaggregation/mooncake/conn.py +203 -555
  63. sglang/srt/disaggregation/nixl/conn.py +217 -63
  64. sglang/srt/disaggregation/prefill.py +113 -270
  65. sglang/srt/disaggregation/utils.py +36 -5
  66. sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
  67. sglang/srt/distributed/device_communicators/custom_all_reduce.py +6 -6
  68. sglang/srt/distributed/device_communicators/pymscclpp.py +2 -2
  69. sglang/srt/distributed/device_communicators/pynccl.py +24 -12
  70. sglang/srt/distributed/device_communicators/pynccl_allocator.py +2 -2
  71. sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
  72. sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
  73. sglang/srt/distributed/naive_distributed.py +5 -4
  74. sglang/srt/distributed/parallel_state.py +203 -97
  75. sglang/srt/elastic_ep/elastic_ep.py +74 -0
  76. sglang/srt/entrypoints/context.py +3 -2
  77. sglang/srt/entrypoints/engine.py +85 -65
  78. sglang/srt/entrypoints/grpc_server.py +632 -305
  79. sglang/srt/entrypoints/harmony_utils.py +2 -2
  80. sglang/srt/entrypoints/http_server.py +169 -17
  81. sglang/srt/entrypoints/http_server_engine.py +1 -7
  82. sglang/srt/entrypoints/openai/protocol.py +327 -34
  83. sglang/srt/entrypoints/openai/serving_base.py +74 -8
  84. sglang/srt/entrypoints/openai/serving_chat.py +202 -118
  85. sglang/srt/entrypoints/openai/serving_classify.py +204 -0
  86. sglang/srt/entrypoints/openai/serving_completions.py +20 -4
  87. sglang/srt/entrypoints/openai/serving_embedding.py +1 -0
  88. sglang/srt/entrypoints/openai/serving_responses.py +47 -2
  89. sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
  90. sglang/srt/environ.py +323 -0
  91. sglang/srt/eplb/eplb_algorithms/__init__.py +18 -1
  92. sglang/srt/eplb/eplb_algorithms/deepseek.py +0 -2
  93. sglang/srt/eplb/eplb_algorithms/elasticity_aware.py +87 -0
  94. sglang/srt/eplb/expert_distribution.py +3 -4
  95. sglang/srt/eplb/expert_location.py +30 -5
  96. sglang/srt/eplb/expert_location_dispatch.py +2 -2
  97. sglang/srt/eplb/expert_location_updater.py +2 -2
  98. sglang/srt/function_call/base_format_detector.py +17 -18
  99. sglang/srt/function_call/function_call_parser.py +21 -16
  100. sglang/srt/function_call/glm4_moe_detector.py +4 -8
  101. sglang/srt/function_call/gpt_oss_detector.py +24 -1
  102. sglang/srt/function_call/json_array_parser.py +61 -0
  103. sglang/srt/function_call/kimik2_detector.py +17 -4
  104. sglang/srt/function_call/utils.py +98 -7
  105. sglang/srt/grpc/compile_proto.py +245 -0
  106. sglang/srt/grpc/grpc_request_manager.py +915 -0
  107. sglang/srt/grpc/health_servicer.py +189 -0
  108. sglang/srt/grpc/scheduler_launcher.py +181 -0
  109. sglang/srt/grpc/sglang_scheduler_pb2.py +81 -68
  110. sglang/srt/grpc/sglang_scheduler_pb2.pyi +124 -61
  111. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +92 -1
  112. sglang/srt/layers/activation.py +11 -7
  113. sglang/srt/layers/attention/aiter_backend.py +17 -18
  114. sglang/srt/layers/attention/ascend_backend.py +125 -10
  115. sglang/srt/layers/attention/attention_registry.py +226 -0
  116. sglang/srt/layers/attention/base_attn_backend.py +32 -4
  117. sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
  118. sglang/srt/layers/attention/double_sparsity_backend.py +2 -2
  119. sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
  120. sglang/srt/layers/attention/fla/chunk.py +0 -1
  121. sglang/srt/layers/attention/fla/chunk_o.py +1 -1
  122. sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +2 -2
  123. sglang/srt/layers/attention/fla/fused_recurrent.py +4 -4
  124. sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +2 -2
  125. sglang/srt/layers/attention/fla/index.py +0 -2
  126. sglang/srt/layers/attention/fla/layernorm_gated.py +50 -32
  127. sglang/srt/layers/attention/fla/utils.py +0 -3
  128. sglang/srt/layers/attention/fla/wy_fast.py +0 -2
  129. sglang/srt/layers/attention/flashattention_backend.py +52 -15
  130. sglang/srt/layers/attention/flashinfer_backend.py +357 -212
  131. sglang/srt/layers/attention/flashinfer_mla_backend.py +31 -33
  132. sglang/srt/layers/attention/flashmla_backend.py +9 -7
  133. sglang/srt/layers/attention/hybrid_attn_backend.py +12 -4
  134. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +236 -133
  135. sglang/srt/layers/attention/intel_amx_backend.py +1 -1
  136. sglang/srt/layers/attention/mamba/causal_conv1d.py +2 -1
  137. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +24 -103
  138. sglang/srt/layers/attention/mamba/mamba.py +514 -1
  139. sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
  140. sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
  141. sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
  142. sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
  143. sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
  144. sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +214 -0
  145. sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +562 -0
  146. sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +646 -0
  147. sglang/srt/layers/attention/mamba/ops/ssd_combined.py +261 -0
  148. sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +264 -0
  149. sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
  150. sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
  151. sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
  152. sglang/srt/layers/attention/nsa/nsa_indexer.py +718 -0
  153. sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
  154. sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
  155. sglang/srt/layers/attention/nsa/transform_index.py +144 -0
  156. sglang/srt/layers/attention/nsa/triton_kernel.py +136 -0
  157. sglang/srt/layers/attention/nsa/utils.py +23 -0
  158. sglang/srt/layers/attention/nsa_backend.py +1201 -0
  159. sglang/srt/layers/attention/tbo_backend.py +6 -6
  160. sglang/srt/layers/attention/torch_flex_backend.py +325 -0
  161. sglang/srt/layers/attention/triton_backend.py +249 -42
  162. sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +2 -2
  163. sglang/srt/layers/attention/triton_ops/extend_attention.py +539 -44
  164. sglang/srt/layers/attention/trtllm_mha_backend.py +7 -9
  165. sglang/srt/layers/attention/trtllm_mla_backend.py +523 -48
  166. sglang/srt/layers/attention/utils.py +11 -7
  167. sglang/srt/layers/attention/vision.py +61 -3
  168. sglang/srt/layers/attention/wave_backend.py +4 -4
  169. sglang/srt/layers/attention/xpu_backend.py +1028 -0
  170. sglang/srt/layers/communicator.py +19 -7
  171. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/compile_utils.py +4 -8
  172. sglang/srt/layers/deep_gemm_wrapper/configurer.py +25 -0
  173. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/entrypoint.py +3 -3
  174. sglang/srt/layers/dp_attention.py +28 -1
  175. sglang/srt/layers/elementwise.py +3 -1
  176. sglang/srt/layers/layernorm.py +47 -15
  177. sglang/srt/layers/linear.py +30 -5
  178. sglang/srt/layers/logits_processor.py +161 -18
  179. sglang/srt/layers/modelopt_utils.py +11 -0
  180. sglang/srt/layers/moe/cutlass_moe.py +0 -2
  181. sglang/srt/layers/moe/cutlass_w4a8_moe.py +213 -21
  182. sglang/srt/layers/moe/ep_moe/kernels.py +36 -458
  183. sglang/srt/layers/moe/ep_moe/layer.py +243 -448
  184. sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +52 -25
  185. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  186. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_B200.json +146 -0
  187. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  188. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  189. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
  190. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +17 -5
  191. sglang/srt/layers/moe/fused_moe_triton/layer.py +86 -81
  192. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +18 -42
  193. sglang/srt/layers/moe/moe_runner/deep_gemm.py +304 -0
  194. sglang/srt/layers/moe/moe_runner/runner.py +3 -0
  195. sglang/srt/layers/moe/moe_runner/triton.py +3 -1
  196. sglang/srt/layers/moe/rocm_moe_utils.py +0 -1
  197. sglang/srt/layers/moe/router.py +51 -15
  198. sglang/srt/layers/moe/token_dispatcher/__init__.py +10 -0
  199. sglang/srt/layers/moe/token_dispatcher/base.py +1 -1
  200. sglang/srt/layers/moe/token_dispatcher/deepep.py +177 -106
  201. sglang/srt/layers/moe/token_dispatcher/mooncake.py +386 -0
  202. sglang/srt/layers/moe/token_dispatcher/standard.py +46 -0
  203. sglang/srt/layers/moe/topk.py +3 -2
  204. sglang/srt/layers/moe/utils.py +27 -1
  205. sglang/srt/layers/parameter.py +23 -6
  206. sglang/srt/layers/quantization/__init__.py +2 -53
  207. sglang/srt/layers/quantization/awq.py +183 -6
  208. sglang/srt/layers/quantization/awq_triton.py +29 -0
  209. sglang/srt/layers/quantization/base_config.py +20 -1
  210. sglang/srt/layers/quantization/compressed_tensors/__init__.py +7 -0
  211. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +21 -49
  212. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +421 -70
  213. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +5 -0
  214. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +4 -22
  215. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
  216. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +339 -0
  217. sglang/srt/layers/quantization/fp8.py +86 -20
  218. sglang/srt/layers/quantization/fp8_kernel.py +55 -10
  219. sglang/srt/layers/quantization/fp8_utils.py +43 -15
  220. sglang/srt/layers/quantization/fpgemm_fp8.py +2 -3
  221. sglang/srt/layers/quantization/gptq.py +0 -1
  222. sglang/srt/layers/quantization/int8_kernel.py +18 -2
  223. sglang/srt/layers/quantization/marlin_utils.py +12 -0
  224. sglang/srt/layers/quantization/modelopt_quant.py +141 -81
  225. sglang/srt/layers/quantization/mxfp4.py +17 -34
  226. sglang/srt/layers/quantization/petit.py +1 -1
  227. sglang/srt/layers/quantization/quark/quark.py +3 -1
  228. sglang/srt/layers/quantization/quark/quark_moe.py +18 -5
  229. sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +0 -7
  230. sglang/srt/layers/quantization/unquant.py +1 -4
  231. sglang/srt/layers/quantization/utils.py +0 -1
  232. sglang/srt/layers/quantization/w4afp8.py +51 -24
  233. sglang/srt/layers/quantization/w8a8_int8.py +45 -27
  234. sglang/srt/layers/radix_attention.py +59 -9
  235. sglang/srt/layers/rotary_embedding.py +750 -46
  236. sglang/srt/layers/sampler.py +84 -16
  237. sglang/srt/layers/sparse_pooler.py +98 -0
  238. sglang/srt/layers/utils.py +23 -1
  239. sglang/srt/layers/vocab_parallel_embedding.py +4 -1
  240. sglang/srt/lora/backend/base_backend.py +3 -3
  241. sglang/srt/lora/backend/chunked_backend.py +348 -0
  242. sglang/srt/lora/backend/triton_backend.py +9 -4
  243. sglang/srt/lora/eviction_policy.py +139 -0
  244. sglang/srt/lora/lora.py +7 -5
  245. sglang/srt/lora/lora_manager.py +33 -7
  246. sglang/srt/lora/lora_registry.py +1 -1
  247. sglang/srt/lora/mem_pool.py +41 -17
  248. sglang/srt/lora/triton_ops/__init__.py +4 -0
  249. sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
  250. sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +176 -0
  251. sglang/srt/lora/utils.py +7 -5
  252. sglang/srt/managers/cache_controller.py +83 -152
  253. sglang/srt/managers/data_parallel_controller.py +156 -87
  254. sglang/srt/managers/detokenizer_manager.py +51 -24
  255. sglang/srt/managers/io_struct.py +223 -129
  256. sglang/srt/managers/mm_utils.py +49 -10
  257. sglang/srt/managers/multi_tokenizer_mixin.py +83 -98
  258. sglang/srt/managers/multimodal_processor.py +1 -2
  259. sglang/srt/managers/overlap_utils.py +130 -0
  260. sglang/srt/managers/schedule_batch.py +340 -529
  261. sglang/srt/managers/schedule_policy.py +158 -18
  262. sglang/srt/managers/scheduler.py +665 -620
  263. sglang/srt/managers/scheduler_input_blocker.py +1 -1
  264. sglang/srt/managers/scheduler_metrics_mixin.py +150 -131
  265. sglang/srt/managers/scheduler_output_processor_mixin.py +337 -122
  266. sglang/srt/managers/scheduler_pp_mixin.py +341 -0
  267. sglang/srt/managers/scheduler_profiler_mixin.py +62 -15
  268. sglang/srt/managers/scheduler_runtime_checker_mixin.py +217 -0
  269. sglang/srt/managers/scheduler_update_weights_mixin.py +40 -14
  270. sglang/srt/managers/tokenizer_communicator_mixin.py +141 -19
  271. sglang/srt/managers/tokenizer_manager.py +462 -226
  272. sglang/srt/managers/tp_worker.py +217 -156
  273. sglang/srt/managers/utils.py +79 -47
  274. sglang/srt/mem_cache/allocator.py +21 -22
  275. sglang/srt/mem_cache/allocator_ascend.py +42 -28
  276. sglang/srt/mem_cache/base_prefix_cache.py +3 -3
  277. sglang/srt/mem_cache/chunk_cache.py +20 -2
  278. sglang/srt/mem_cache/common.py +480 -0
  279. sglang/srt/mem_cache/evict_policy.py +38 -0
  280. sglang/srt/mem_cache/hicache_storage.py +44 -2
  281. sglang/srt/mem_cache/hiradix_cache.py +134 -34
  282. sglang/srt/mem_cache/mamba_radix_cache.py +993 -0
  283. sglang/srt/mem_cache/memory_pool.py +602 -208
  284. sglang/srt/mem_cache/memory_pool_host.py +134 -183
  285. sglang/srt/mem_cache/multimodal_cache.py +0 -1
  286. sglang/srt/mem_cache/radix_cache.py +263 -78
  287. sglang/srt/mem_cache/radix_cache_cpp.py +29 -21
  288. sglang/srt/mem_cache/storage/__init__.py +10 -0
  289. sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +157 -0
  290. sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +97 -0
  291. sglang/srt/mem_cache/storage/backend_factory.py +223 -0
  292. sglang/srt/mem_cache/storage/eic/eic_storage.py +777 -0
  293. sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
  294. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +0 -1
  295. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +180 -59
  296. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +15 -9
  297. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +217 -26
  298. sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +38 -9
  299. sglang/srt/mem_cache/storage/nixl/nixl_utils.py +1 -1
  300. sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +17 -2
  301. sglang/srt/mem_cache/swa_radix_cache.py +115 -58
  302. sglang/srt/metrics/collector.py +113 -120
  303. sglang/srt/metrics/func_timer.py +3 -8
  304. sglang/srt/metrics/utils.py +8 -1
  305. sglang/srt/model_executor/cpu_graph_runner.py +2 -2
  306. sglang/srt/model_executor/cuda_graph_runner.py +81 -36
  307. sglang/srt/model_executor/forward_batch_info.py +40 -50
  308. sglang/srt/model_executor/model_runner.py +507 -319
  309. sglang/srt/model_executor/npu_graph_runner.py +11 -5
  310. sglang/srt/model_executor/piecewise_cuda_graph_runner.py +539 -0
  311. sglang/srt/model_loader/__init__.py +1 -1
  312. sglang/srt/model_loader/loader.py +438 -37
  313. sglang/srt/model_loader/utils.py +0 -1
  314. sglang/srt/model_loader/weight_utils.py +200 -27
  315. sglang/srt/models/apertus.py +2 -3
  316. sglang/srt/models/arcee.py +2 -2
  317. sglang/srt/models/bailing_moe.py +40 -56
  318. sglang/srt/models/bailing_moe_nextn.py +3 -4
  319. sglang/srt/models/bert.py +1 -1
  320. sglang/srt/models/deepseek_nextn.py +25 -4
  321. sglang/srt/models/deepseek_ocr.py +1516 -0
  322. sglang/srt/models/deepseek_v2.py +793 -235
  323. sglang/srt/models/dots_ocr.py +171 -0
  324. sglang/srt/models/dots_vlm.py +0 -1
  325. sglang/srt/models/dots_vlm_vit.py +1 -1
  326. sglang/srt/models/falcon_h1.py +570 -0
  327. sglang/srt/models/gemma3_causal.py +0 -2
  328. sglang/srt/models/gemma3_mm.py +17 -1
  329. sglang/srt/models/gemma3n_mm.py +2 -3
  330. sglang/srt/models/glm4_moe.py +17 -40
  331. sglang/srt/models/glm4_moe_nextn.py +4 -4
  332. sglang/srt/models/glm4v.py +3 -2
  333. sglang/srt/models/glm4v_moe.py +6 -6
  334. sglang/srt/models/gpt_oss.py +12 -35
  335. sglang/srt/models/grok.py +10 -23
  336. sglang/srt/models/hunyuan.py +2 -7
  337. sglang/srt/models/interns1.py +0 -1
  338. sglang/srt/models/kimi_vl.py +1 -7
  339. sglang/srt/models/kimi_vl_moonvit.py +4 -2
  340. sglang/srt/models/llama.py +6 -2
  341. sglang/srt/models/llama_eagle3.py +1 -1
  342. sglang/srt/models/longcat_flash.py +6 -23
  343. sglang/srt/models/longcat_flash_nextn.py +4 -15
  344. sglang/srt/models/mimo.py +2 -13
  345. sglang/srt/models/mimo_mtp.py +1 -2
  346. sglang/srt/models/minicpmo.py +7 -5
  347. sglang/srt/models/mixtral.py +1 -4
  348. sglang/srt/models/mllama.py +1 -1
  349. sglang/srt/models/mllama4.py +27 -6
  350. sglang/srt/models/nemotron_h.py +511 -0
  351. sglang/srt/models/olmo2.py +31 -4
  352. sglang/srt/models/opt.py +5 -5
  353. sglang/srt/models/phi.py +1 -1
  354. sglang/srt/models/phi4mm.py +1 -1
  355. sglang/srt/models/phimoe.py +0 -1
  356. sglang/srt/models/pixtral.py +0 -3
  357. sglang/srt/models/points_v15_chat.py +186 -0
  358. sglang/srt/models/qwen.py +0 -1
  359. sglang/srt/models/qwen2.py +0 -7
  360. sglang/srt/models/qwen2_5_vl.py +5 -5
  361. sglang/srt/models/qwen2_audio.py +2 -15
  362. sglang/srt/models/qwen2_moe.py +70 -4
  363. sglang/srt/models/qwen2_vl.py +6 -3
  364. sglang/srt/models/qwen3.py +18 -3
  365. sglang/srt/models/qwen3_moe.py +50 -38
  366. sglang/srt/models/qwen3_next.py +43 -21
  367. sglang/srt/models/qwen3_next_mtp.py +3 -4
  368. sglang/srt/models/qwen3_omni_moe.py +661 -0
  369. sglang/srt/models/qwen3_vl.py +791 -0
  370. sglang/srt/models/qwen3_vl_moe.py +343 -0
  371. sglang/srt/models/registry.py +15 -3
  372. sglang/srt/models/roberta.py +55 -3
  373. sglang/srt/models/sarashina2_vision.py +268 -0
  374. sglang/srt/models/solar.py +505 -0
  375. sglang/srt/models/starcoder2.py +357 -0
  376. sglang/srt/models/step3_vl.py +3 -5
  377. sglang/srt/models/torch_native_llama.py +9 -2
  378. sglang/srt/models/utils.py +61 -0
  379. sglang/srt/multimodal/processors/base_processor.py +21 -9
  380. sglang/srt/multimodal/processors/deepseek_ocr.py +37 -0
  381. sglang/srt/multimodal/processors/deepseek_vl_v2.py +0 -3
  382. sglang/srt/multimodal/processors/dots_vlm.py +2 -4
  383. sglang/srt/multimodal/processors/glm4v.py +1 -5
  384. sglang/srt/multimodal/processors/internvl.py +20 -10
  385. sglang/srt/multimodal/processors/janus_pro.py +0 -1
  386. sglang/srt/multimodal/processors/mllama4.py +0 -8
  387. sglang/srt/multimodal/processors/phi4mm.py +0 -1
  388. sglang/srt/multimodal/processors/points_v15_chat.py +52 -0
  389. sglang/srt/multimodal/processors/qwen_vl.py +83 -17
  390. sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
  391. sglang/srt/multimodal/processors/step3_vl.py +1 -1
  392. sglang/srt/parser/conversation.py +41 -0
  393. sglang/srt/parser/jinja_template_utils.py +6 -0
  394. sglang/srt/parser/reasoning_parser.py +0 -1
  395. sglang/srt/sampling/custom_logit_processor.py +77 -2
  396. sglang/srt/sampling/sampling_batch_info.py +36 -23
  397. sglang/srt/sampling/sampling_params.py +75 -0
  398. sglang/srt/server_args.py +1300 -338
  399. sglang/srt/server_args_config_parser.py +146 -0
  400. sglang/srt/single_batch_overlap.py +161 -0
  401. sglang/srt/speculative/base_spec_worker.py +34 -0
  402. sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
  403. sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
  404. sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
  405. sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
  406. sglang/srt/speculative/cpp_ngram/param.h +125 -0
  407. sglang/srt/speculative/cpp_ngram/queue.h +71 -0
  408. sglang/srt/speculative/draft_utils.py +226 -0
  409. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +26 -8
  410. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +26 -3
  411. sglang/srt/speculative/eagle_info.py +786 -0
  412. sglang/srt/speculative/eagle_info_v2.py +458 -0
  413. sglang/srt/speculative/eagle_utils.py +113 -1270
  414. sglang/srt/speculative/eagle_worker.py +120 -285
  415. sglang/srt/speculative/eagle_worker_v2.py +702 -0
  416. sglang/srt/speculative/ngram_info.py +433 -0
  417. sglang/srt/speculative/ngram_worker.py +246 -0
  418. sglang/srt/speculative/spec_info.py +49 -0
  419. sglang/srt/speculative/spec_utils.py +641 -0
  420. sglang/srt/speculative/standalone_worker.py +4 -14
  421. sglang/srt/tokenizer/tiktoken_tokenizer.py +2 -2
  422. sglang/srt/tracing/trace.py +32 -6
  423. sglang/srt/two_batch_overlap.py +35 -18
  424. sglang/srt/utils/__init__.py +2 -0
  425. sglang/srt/{bench_utils.py → utils/bench_utils.py} +4 -2
  426. sglang/srt/{utils.py → utils/common.py} +583 -113
  427. sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +86 -19
  428. sglang/srt/{host_shared_memory.py → utils/host_shared_memory.py} +0 -1
  429. sglang/srt/{offloader.py → utils/offloader.py} +4 -4
  430. sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
  431. sglang/srt/utils/profile_merger.py +199 -0
  432. sglang/srt/utils/rpd_utils.py +452 -0
  433. sglang/srt/utils/slow_rank_detector.py +71 -0
  434. sglang/srt/{torch_memory_saver_adapter.py → utils/torch_memory_saver_adapter.py} +5 -7
  435. sglang/srt/warmup.py +8 -4
  436. sglang/srt/weight_sync/utils.py +1 -1
  437. sglang/test/attention/test_flashattn_backend.py +1 -1
  438. sglang/test/attention/test_flashattn_mla_backend.py +0 -1
  439. sglang/test/attention/test_prefix_chunk_info.py +0 -2
  440. sglang/test/attention/test_trtllm_mla_backend.py +221 -53
  441. sglang/test/few_shot_gsm8k_engine.py +2 -4
  442. sglang/test/get_logits_ut.py +57 -0
  443. sglang/test/kit_matched_stop.py +157 -0
  444. sglang/test/longbench_v2/__init__.py +1 -0
  445. sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
  446. sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
  447. sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
  448. sglang/test/run_eval.py +120 -11
  449. sglang/test/runners.py +3 -1
  450. sglang/test/send_one.py +42 -7
  451. sglang/test/simple_eval_common.py +8 -2
  452. sglang/test/simple_eval_gpqa.py +0 -1
  453. sglang/test/simple_eval_humaneval.py +0 -3
  454. sglang/test/simple_eval_longbench_v2.py +344 -0
  455. sglang/test/simple_eval_mmmu_vlm.py +441 -0
  456. sglang/test/test_block_fp8.py +3 -4
  457. sglang/test/test_block_fp8_deep_gemm_blackwell.py +0 -1
  458. sglang/test/test_cutlass_moe.py +1 -2
  459. sglang/test/test_cutlass_w4a8_moe.py +10 -20
  460. sglang/test/test_deterministic.py +430 -0
  461. sglang/test/test_deterministic_utils.py +73 -0
  462. sglang/test/test_disaggregation_utils.py +93 -1
  463. sglang/test/test_marlin_moe.py +0 -1
  464. sglang/test/test_programs.py +1 -1
  465. sglang/test/test_utils.py +432 -16
  466. sglang/utils.py +10 -1
  467. sglang/version.py +1 -1
  468. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/METADATA +64 -43
  469. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/RECORD +476 -346
  470. sglang/srt/entrypoints/grpc_request_manager.py +0 -580
  471. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +0 -32
  472. sglang/srt/managers/tp_worker_overlap_thread.py +0 -319
  473. sglang/srt/mem_cache/lora_radix_cache.py +0 -421
  474. sglang/srt/speculative/build_eagle_tree.py +0 -427
  475. sglang/test/test_block_fp8_ep.py +0 -358
  476. /sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/__init__.py +0 -0
  477. /sglang/srt/{remote_instance_weight_loader_utils.py → model_loader/remote_instance_weight_loader_utils.py} +0 -0
  478. /sglang/srt/{aio_rwlock.py → utils/aio_rwlock.py} +0 -0
  479. /sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
  480. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/WHEEL +0 -0
  481. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/licenses/LICENSE +0 -0
  482. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/top_level.txt +0 -0
@@ -14,10 +14,10 @@
14
14
  """Utilities for Prometheus Metrics Collection."""
15
15
  import time
16
16
  from dataclasses import dataclass, field
17
- from enum import Enum
18
17
  from typing import Dict, List, Optional, Union
19
18
 
20
- from sglang.srt.metrics.utils import generate_buckets
19
+ from sglang.srt.disaggregation.utils import DisaggregationMode
20
+ from sglang.srt.metrics.utils import exponential_buckets, generate_buckets
21
21
  from sglang.srt.server_args import ServerArgs
22
22
  from sglang.srt.utils import get_bool_env_var
23
23
 
@@ -34,6 +34,7 @@ class TimeStats:
34
34
  Decode: prealloc_queue -> transfer_queue -> wait_queue -> forward -> completion
35
35
  """
36
36
 
37
+ disagg_mode: DisaggregationMode = DisaggregationMode.NULL
37
38
  lb_entry_time: float = 0.0
38
39
  wait_queue_entry_time: float = 0.0
39
40
  forward_entry_time: float = 0.0
@@ -43,20 +44,11 @@ class TimeStats:
43
44
  decode_prealloc_queue_entry_time: float = 0.0
44
45
  decode_transfer_queue_entry_time: float = 0.0
45
46
 
46
- class RequestType(Enum):
47
- UNIFIED = "unified"
48
- PREFILL = "prefill"
49
- DECODE = "decode"
50
- INVALID = "invalid"
51
-
52
47
  def get_queueing_time(self) -> float:
53
48
  return self.forward_entry_time - self.wait_queue_entry_time
54
49
 
55
- def __str__(self) -> str:
56
- # if unified
57
- _type = self.get_type()
58
-
59
- if _type == self.RequestType.UNIFIED:
50
+ def convert_to_duration(self) -> str:
51
+ if self.disagg_mode == DisaggregationMode.NULL:
60
52
  queue_duration = self.forward_entry_time - self.wait_queue_entry_time
61
53
  forward_duration = self.completion_time - self.forward_entry_time
62
54
 
@@ -65,30 +57,28 @@ class TimeStats:
65
57
  queue_duration >= 0 and forward_duration >= 0
66
58
  ), f"queue_duration={queue_duration} < 0 or forward_duration={forward_duration} < 0"
67
59
 
68
- return f"queue_duration={self.format_duration(queue_duration)}, forward_duration={self.format_duration(forward_duration)}, start_time={self.wait_queue_entry_time}"
69
- elif _type == self.RequestType.PREFILL:
60
+ return f"queue_duration={self.format_duration(queue_duration)}, forward_duration={self.format_duration(forward_duration)}, start_time={self.wait_queue_entry_time:.3f}"
61
+ elif self.disagg_mode == DisaggregationMode.PREFILL:
70
62
  bootstrap_duration = (
71
63
  self.wait_queue_entry_time - self.prefill_bootstrap_queue_entry_time
72
64
  )
73
-
74
65
  queue_duration = self.forward_entry_time - self.wait_queue_entry_time
75
-
76
66
  forward_duration = self.completion_time - self.forward_entry_time
77
67
 
78
68
  if SGLANG_TEST_REQUEST_TIME_STATS:
79
- assert (
80
- bootstrap_duration >= 0
81
- and queue_duration >= 0
82
- and forward_duration >= 0
83
- ), f"bootstrap_duration={bootstrap_duration} < 0 or queue_duration={queue_duration} < 0 or forward_duration={forward_duration} < 0"
84
- return f"bootstrap_duration={self.format_duration(bootstrap_duration)}, queue_duration={self.format_duration(queue_duration)}, forward_duration={self.format_duration(forward_duration)}, start_time={self.prefill_bootstrap_queue_entry_time}"
85
- # if decode
86
- elif _type == self.RequestType.DECODE:
69
+ if self.wait_queue_entry_time > 0:
70
+ assert (
71
+ bootstrap_duration >= 0
72
+ and queue_duration >= 0
73
+ and forward_duration >= 0
74
+ ), f"bootstrap_duration={bootstrap_duration} < 0 or queue_duration={queue_duration} < 0 or forward_duration={forward_duration} < 0"
75
+
76
+ return f"bootstrap_duration={self.format_duration(bootstrap_duration)}, queue_duration={self.format_duration(queue_duration)}, forward_duration={self.format_duration(forward_duration)}, start_time={self.prefill_bootstrap_queue_entry_time:.3f}"
77
+ elif self.disagg_mode == DisaggregationMode.DECODE:
87
78
  prealloc_duration = (
88
79
  self.decode_transfer_queue_entry_time
89
80
  - self.decode_prealloc_queue_entry_time
90
81
  )
91
-
92
82
  transfer_duration = (
93
83
  self.wait_queue_entry_time - self.decode_transfer_queue_entry_time
94
84
  )
@@ -96,42 +86,30 @@ class TimeStats:
96
86
  forward_duration = self.completion_time - self.forward_entry_time
97
87
 
98
88
  if SGLANG_TEST_REQUEST_TIME_STATS:
99
- assert (
100
- prealloc_duration >= 0
101
- and transfer_duration >= 0
102
- and queue_duration >= 0
103
- and forward_duration >= 0
104
- ), f"prealloc_duration={prealloc_duration} < 0 or transfer_duration={transfer_duration} < 0 or queue_duration={queue_duration} < 0 or forward_duration={forward_duration} < 0"
105
-
106
- return f"prealloc_duration={self.format_duration(prealloc_duration)}, transfer_duration={self.format_duration(transfer_duration)}, queue_duration={self.format_duration(queue_duration)}, forward_duration={self.format_duration(forward_duration)}, start_time={self.decode_prealloc_queue_entry_time}"
89
+ if self.wait_queue_entry_time > 0:
90
+ assert (
91
+ prealloc_duration >= 0
92
+ and transfer_duration >= 0
93
+ and queue_duration >= 0
94
+ and forward_duration >= 0
95
+ ), f"prealloc_duration={prealloc_duration} < 0 or transfer_duration={transfer_duration} < 0 or queue_duration={queue_duration} < 0 or forward_duration={forward_duration} < 0. {self=}"
96
+
97
+ return f"prealloc_duration={self.format_duration(prealloc_duration)}, transfer_duration={self.format_duration(transfer_duration)}, queue_duration={self.format_duration(queue_duration)}, forward_duration={self.format_duration(forward_duration)}, start_time={self.decode_prealloc_queue_entry_time:.3f}"
107
98
  else:
108
- return "Invalid Time Stats"
99
+ return "Unknown Time Stats"
109
100
 
110
101
  def format_duration(self, duration: float) -> str:
111
102
  return f"{duration * 1e3:.2f}ms"
112
103
 
113
- def get_type(self) -> RequestType:
114
- """Determine the type of request based on timestamp values."""
115
- if (
116
- self.prefill_bootstrap_queue_entry_time == 0.0
117
- and self.prefill_transfer_queue_entry_time == 0.0
118
- and self.decode_prealloc_queue_entry_time == 0.0
119
- and self.decode_transfer_queue_entry_time == 0.0
120
- ):
121
- return self.RequestType.UNIFIED
122
- elif (
123
- self.prefill_bootstrap_queue_entry_time > 0.0
124
- and self.prefill_transfer_queue_entry_time > 0.0
125
- ):
126
- return self.RequestType.PREFILL
127
- elif (
128
- self.decode_prealloc_queue_entry_time > 0.0
129
- and self.decode_transfer_queue_entry_time > 0.0
130
- and self.wait_queue_entry_time > 0.0
131
- ):
132
- return self.RequestType.DECODE
104
+ def disagg_mode_str(self) -> str:
105
+ if self.disagg_mode == DisaggregationMode.NULL:
106
+ return "unified"
107
+ elif self.disagg_mode == DisaggregationMode.DECODE:
108
+ return "decode"
109
+ elif self.disagg_mode == DisaggregationMode.PREFILL:
110
+ return "prefill"
133
111
  else:
134
- return self.RequestType.INVALID
112
+ return "unknown"
135
113
 
136
114
 
137
115
  @dataclass
@@ -140,16 +118,21 @@ class SchedulerStats:
140
118
  num_running_reqs: int = 0
141
119
  num_used_tokens: int = 0
142
120
  token_usage: float = 0.0
121
+ pending_prealloc_token_usage: float = 0.0
143
122
  swa_token_usage: float = 0.0
144
123
  gen_throughput: float = 0.0
145
124
  num_queue_reqs: int = 0
146
125
  num_grammar_queue_reqs: int = 0
147
126
  num_running_reqs_offline_batch: int = 0
148
- avg_request_queue_latency: float = 0.0
149
127
  cache_hit_rate: float = 0.0
150
128
 
151
129
  # Speculative decoding
152
130
  spec_accept_length: float = 0.0
131
+ spec_accept_rate: float = 0.0
132
+
133
+ # Retract
134
+ num_retracted_reqs: int = 0
135
+ num_paused_reqs: int = 0
153
136
 
154
137
  # PD disaggregation
155
138
  num_prefill_prealloc_queue_reqs: int = 0
@@ -159,11 +142,6 @@ class SchedulerStats:
159
142
  kv_transfer_speed_gb_s: float = 0.0
160
143
  kv_transfer_latency_ms: float = 0.0
161
144
 
162
- # Retract
163
- total_retracted_reqs: int = 0
164
- num_retracted_reqs: int = 0
165
- num_paused_reqs: int = 0
166
-
167
145
  # Utilization
168
146
  utilization: float = 0.0
169
147
  max_running_requests_under_SLO: Optional[int] = None
@@ -172,6 +150,9 @@ class SchedulerStats:
172
150
  engine_startup_time: float = 0.0
173
151
  engine_load_weights_time: float = 0.0
174
152
 
153
+ # CUDA graph
154
+ is_cuda_graph: float = 0.0
155
+
175
156
 
176
157
  class SchedulerMetricsCollector:
177
158
 
@@ -200,6 +181,12 @@ class SchedulerMetricsCollector:
200
181
  labelnames=labels.keys(),
201
182
  multiprocess_mode="mostrecent",
202
183
  )
184
+ self.pending_prealloc_token_usage = Gauge(
185
+ name="sglang:pending_prealloc_token_usage",
186
+ documentation="The token usage for pending preallocated tokens (not preallocated yet).",
187
+ labelnames=labels.keys(),
188
+ multiprocess_mode="mostrecent",
189
+ )
203
190
  self.swa_token_usage = Gauge(
204
191
  name="sglang:swa_token_usage",
205
192
  documentation="The token usage for SWA layers.",
@@ -230,12 +217,6 @@ class SchedulerMetricsCollector:
230
217
  labelnames=labels.keys(),
231
218
  multiprocess_mode="mostrecent",
232
219
  )
233
- self.avg_request_queue_latency = Gauge(
234
- name="sglang:avg_request_queue_latency",
235
- documentation="The average request queue latency for the last batch of requests in seconds.",
236
- labelnames=labels.keys(),
237
- multiprocess_mode="mostrecent",
238
- )
239
220
  self.cache_hit_rate = Gauge(
240
221
  name="sglang:cache_hit_rate",
241
222
  documentation="The prefix cache hit rate.",
@@ -250,6 +231,24 @@ class SchedulerMetricsCollector:
250
231
  labelnames=labels.keys(),
251
232
  multiprocess_mode="mostrecent",
252
233
  )
234
+ self.spec_accept_rate = Gauge(
235
+ name="sglang:spec_accept_rate",
236
+ documentation="The average acceptance rate of speculative decoding (`accepted tokens / total draft tokens` in batch).",
237
+ labelnames=labels.keys(),
238
+ multiprocess_mode="mostrecent",
239
+ )
240
+
241
+ # Retract
242
+ self.num_retracted_reqs = Gauge(
243
+ name="sglang:num_retracted_reqs",
244
+ documentation="The number of retracted requests.",
245
+ labelnames=labels.keys(),
246
+ )
247
+ self.num_paused_reqs = Gauge(
248
+ name="sglang:num_paused_reqs",
249
+ documentation="The number of paused requests by async weight sync.",
250
+ labelnames=labels.keys(),
251
+ )
253
252
 
254
253
  # PD disaggregation
255
254
  self.num_prefill_prealloc_queue_reqs = Gauge(
@@ -299,24 +298,6 @@ class SchedulerMetricsCollector:
299
298
  multiprocess_mode="mostrecent",
300
299
  )
301
300
 
302
- # Retract
303
- self.total_retracted_reqs = Gauge(
304
- name="sglang:total_retracted_reqs",
305
- documentation="The total number of retracted requests due to kvcache full.",
306
- labelnames=labels.keys(),
307
- multiprocess_mode="mostrecent",
308
- )
309
- self.num_retracted_reqs = Gauge(
310
- name="sglang:num_retracted_reqs",
311
- documentation="The number of retracted requests.",
312
- labelnames=labels.keys(),
313
- )
314
- self.num_paused_reqs = Gauge(
315
- name="sglang:num_paused_reqs",
316
- documentation="The number of paused requests by async weight sync.",
317
- labelnames=labels.keys(),
318
- )
319
-
320
301
  # Utilization
321
302
  self.utilization = Gauge(
322
303
  name="sglang:utilization",
@@ -347,7 +328,7 @@ class SchedulerMetricsCollector:
347
328
 
348
329
  # Additional queueing time histogram
349
330
  self.queue_time = Histogram(
350
- name="sglang:queue_time_s",
331
+ name="sglang:queue_time_seconds",
351
332
  documentation="Histogram of queueing time in seconds.",
352
333
  labelnames=labels.keys(),
353
334
  buckets=[
@@ -513,11 +494,26 @@ class SchedulerMetricsCollector:
513
494
  buckets=tree_traversal_time_buckets,
514
495
  )
515
496
 
497
+ self.per_stage_req_latency_seconds = Histogram(
498
+ name="sglang:per_stage_req_latency_seconds",
499
+ documentation="The latency of each stage of requests.",
500
+ # captures latency in range [1ms - ~1191s]
501
+ buckets=exponential_buckets(start=0.001, width=1.62, length=30),
502
+ labelnames=list(labels.keys()) + ["stage"],
503
+ )
504
+
505
+ self.is_cuda_graph = Gauge(
506
+ name="sglang:is_cuda_graph",
507
+ documentation="Whether the batch is using CUDA graph.",
508
+ labelnames=labels.keys(),
509
+ multiprocess_mode="mostrecent",
510
+ )
511
+
516
512
  def _log_gauge(self, gauge, data: Union[int, float]) -> None:
517
513
  # Convenience function for logging to gauge.
518
514
  gauge.labels(**self.labels).set(data)
519
515
 
520
- def log_histogram(self, histogram, data: Union[int, float]) -> None:
516
+ def _log_histogram(self, histogram, data: Union[int, float]) -> None:
521
517
  histogram.labels(**self.labels).observe(data)
522
518
 
523
519
  def increment_bootstrap_failed_reqs(self) -> None:
@@ -526,10 +522,20 @@ class SchedulerMetricsCollector:
526
522
  def increment_transfer_failed_reqs(self) -> None:
527
523
  self.num_transfer_failed_reqs.labels(**self.labels).inc(1)
528
524
 
525
+ def observe_per_stage_req_latency(self, stage: str, latency: float) -> None:
526
+ labels_with_stage = {**self.labels, "stage": stage}
527
+ self.per_stage_req_latency_seconds.labels(**labels_with_stage).observe(latency)
528
+
529
+ def observe_queue_time(self, latency: float) -> None:
530
+ self._log_histogram(self.queue_time, latency)
531
+
529
532
  def log_stats(self, stats: SchedulerStats) -> None:
530
533
  self._log_gauge(self.num_running_reqs, stats.num_running_reqs)
531
534
  self._log_gauge(self.num_used_tokens, stats.num_used_tokens)
532
535
  self._log_gauge(self.token_usage, stats.token_usage)
536
+ self._log_gauge(
537
+ self.pending_prealloc_token_usage, stats.pending_prealloc_token_usage
538
+ )
533
539
  self._log_gauge(self.swa_token_usage, stats.swa_token_usage)
534
540
  self._log_gauge(self.gen_throughput, stats.gen_throughput)
535
541
  self._log_gauge(self.num_queue_reqs, stats.num_queue_reqs)
@@ -538,10 +544,10 @@ class SchedulerMetricsCollector:
538
544
  self.num_running_reqs_offline_batch, stats.num_running_reqs_offline_batch
539
545
  )
540
546
  self._log_gauge(self.cache_hit_rate, stats.cache_hit_rate)
541
- self._log_gauge(self.avg_request_queue_latency, stats.avg_request_queue_latency)
542
547
 
543
548
  # Speculative decoding
544
549
  self._log_gauge(self.spec_accept_length, stats.spec_accept_length)
550
+ self._log_gauge(self.spec_accept_rate, stats.spec_accept_rate)
545
551
 
546
552
  # PD disaggregation
547
553
  self._log_gauge(
@@ -560,7 +566,6 @@ class SchedulerMetricsCollector:
560
566
  self._log_gauge(self.kv_transfer_latency_ms, stats.kv_transfer_latency_ms)
561
567
 
562
568
  # Retract
563
- self._log_gauge(self.total_retracted_reqs, stats.total_retracted_reqs)
564
569
  self._log_gauge(self.num_retracted_reqs, stats.num_retracted_reqs)
565
570
  self._log_gauge(self.num_paused_reqs, stats.num_paused_reqs)
566
571
 
@@ -579,24 +584,27 @@ class SchedulerMetricsCollector:
579
584
  self.engine_load_weights_time, stats.engine_load_weights_time
580
585
  )
581
586
 
587
+ # CUDA graph
588
+ self._log_gauge(self.is_cuda_graph, stats.is_cuda_graph)
589
+
582
590
  self.last_log_time = time.perf_counter()
583
591
 
584
592
  def log_grammar_stats(self, grammar_stats) -> None:
585
593
  # Duck-typed GrammarStats to avoid cross-package dependency
586
594
  if getattr(grammar_stats, "compilation_time", None) is not None:
587
- self.log_histogram(
595
+ self._log_histogram(
588
596
  self.grammar_compilation_time, grammar_stats.compilation_time
589
597
  )
590
598
  if getattr(grammar_stats, "schema_count", None) is not None:
591
- self.log_histogram(self.grammar_schema_count, grammar_stats.schema_count)
599
+ self._log_histogram(self.grammar_schema_count, grammar_stats.schema_count)
592
600
  if getattr(grammar_stats, "ebnf_size", None) is not None:
593
- self.log_histogram(self.grammar_ebnf_size, grammar_stats.ebnf_size)
601
+ self._log_histogram(self.grammar_ebnf_size, grammar_stats.ebnf_size)
594
602
  tree_times = getattr(grammar_stats, "tree_traversal_time", None)
595
603
  if tree_times:
596
604
  max_time = max(tree_times)
597
605
  avg_time = sum(tree_times) / len(tree_times)
598
- self.log_histogram(self.grammar_tree_traversal_time_max, max_time)
599
- self.log_histogram(self.grammar_tree_traversal_time_avg, avg_time)
606
+ self._log_histogram(self.grammar_tree_traversal_time_max, max_time)
607
+ self._log_histogram(self.grammar_tree_traversal_time_avg, avg_time)
600
608
  if getattr(grammar_stats, "is_cache_hit", False):
601
609
  self.num_grammar_cache_hit.labels(**self.labels).inc(1)
602
610
  if getattr(grammar_stats, "is_grammar_aborted", False):
@@ -702,7 +710,7 @@ class TokenizerMetricsCollector:
702
710
  )
703
711
 
704
712
  self.num_aborted_requests_total = Counter(
705
- name="sglang:num_aborted_requests",
713
+ name="sglang:num_aborted_requests_total",
706
714
  documentation="Number of requests aborted.",
707
715
  labelnames=labels.keys(),
708
716
  )
@@ -789,7 +797,7 @@ class TokenizerMetricsCollector:
789
797
  buckets=bucket_time_to_first_token,
790
798
  )
791
799
 
792
- self.histogram_inter_token_latency_seconds = Histogram(
800
+ self.histogram_inter_token_latency = Histogram(
793
801
  name="sglang:inter_token_latency_seconds",
794
802
  documentation="Histogram of inter-token latency in seconds.",
795
803
  labelnames=labels.keys(),
@@ -803,14 +811,6 @@ class TokenizerMetricsCollector:
803
811
  buckets=bucket_e2e_request_latency,
804
812
  )
805
813
 
806
- # Offline batch specific TTFB histogram
807
- self.histogram_time_to_first_token_offline_batch = Histogram(
808
- name="sglang:time_to_first_token_seconds_offline_batch",
809
- documentation="Histogram of time to first token in seconds for offline batch requests.",
810
- labelnames=labels.keys(),
811
- buckets=bucket_time_to_first_token,
812
- )
813
-
814
814
  def observe_one_finished_request(
815
815
  self,
816
816
  labels: Dict[str, str],
@@ -834,26 +834,19 @@ class TokenizerMetricsCollector:
834
834
  float(generation_tokens)
835
835
  )
836
836
 
837
- def observe_time_to_first_token(
838
- self, labels: Dict[str, str], value: float, type: str = ""
839
- ):
840
- if type == "batch":
841
- self.histogram_time_to_first_token_offline_batch.labels(**labels).observe(
842
- value
843
- )
844
- else:
845
- self.histogram_time_to_first_token.labels(**labels).observe(value)
837
+ def observe_time_to_first_token(self, labels: Dict[str, str], value: float):
838
+ self.histogram_time_to_first_token.labels(**labels).observe(value)
846
839
 
847
840
  def check_time_to_first_token_straggler(self, value: float) -> bool:
848
841
  his = self.histogram_time_to_first_token.labels(**self.labels)
849
842
  total_observations = sum(bucket._value for bucket in his._buckets)
850
- if total_observations < 100:
843
+ if total_observations < 1000:
851
844
  return False
852
- p99_threshold = total_observations * 0.99
845
+ p999_threshold = total_observations * 0.999
853
846
  cumulative_count = 0
854
847
  for i, bucket in enumerate(his._buckets):
855
848
  cumulative_count += bucket._value
856
- if cumulative_count > p99_threshold:
849
+ if cumulative_count > p999_threshold:
857
850
  return value >= his._upper_bounds[i]
858
851
  return False
859
852
 
@@ -864,7 +857,7 @@ class TokenizerMetricsCollector:
864
857
 
865
858
  # A faster version of the Histogram::observe which observes multiple values at the same time.
866
859
  # reference: https://github.com/prometheus/client_python/blob/v0.21.1/prometheus_client/metrics.py#L639
867
- his = self.histogram_inter_token_latency_seconds.labels(**labels)
860
+ his = self.histogram_inter_token_latency.labels(**labels)
868
861
  his._sum.inc(internval)
869
862
 
870
863
  for i, bound in enumerate(his._upper_bounds):
@@ -872,8 +865,8 @@ class TokenizerMetricsCollector:
872
865
  his._buckets[i].inc(num_new_tokens)
873
866
  break
874
867
 
875
- def observe_one_aborted_request(self):
876
- self.num_aborted_requests_total.labels(**self.labels).inc(1)
868
+ def observe_one_aborted_request(self, labels: Dict[str, str]):
869
+ self.num_aborted_requests_total.labels(**labels).inc(1)
877
870
 
878
871
 
879
872
  @dataclass
@@ -18,7 +18,9 @@ Records the latency of some functions
18
18
  import asyncio
19
19
  import time
20
20
  from functools import wraps
21
- from typing import Any, Callable, List, Optional
21
+ from typing import Any, Callable, Optional
22
+
23
+ from sglang.srt.metrics.utils import exponential_buckets
22
24
 
23
25
  enable_metrics = False
24
26
 
@@ -42,13 +44,6 @@ def enable_func_timer():
42
44
  FUNC_LATENCY = None
43
45
 
44
46
 
45
- def exponential_buckets(start: float, width: float, length: int) -> List[float]:
46
- buckets = []
47
- for i in range(length):
48
- buckets.append(start * (width**i))
49
- return buckets
50
-
51
-
52
47
  def time_func_latency(
53
48
  func: Callable = None, name: Optional[str] = None
54
49
  ) -> Callable[..., Any]:
@@ -44,5 +44,12 @@ def generate_buckets(
44
44
  return two_sides_exponential_buckets(float(middle), float(base), int(count))
45
45
  if rule == "default":
46
46
  return sorted(set(default_buckets))
47
- assert rule == "customer"
47
+ assert rule == "custom"
48
48
  return sorted(set([float(x) for x in buckets_rule[1:]]))
49
+
50
+
51
+ def exponential_buckets(start: float, width: float, length: int) -> List[float]:
52
+ buckets = []
53
+ for i in range(length):
54
+ buckets.append(start * (width**i))
55
+ return buckets
@@ -34,7 +34,6 @@ from sglang.srt.model_executor.forward_batch_info import (
34
34
  ForwardMode,
35
35
  PPProxyTensors,
36
36
  )
37
- from sglang.srt.patch_torch import monkey_patch_torch_compile
38
37
  from sglang.srt.speculative.spec_info import SpeculativeAlgorithm
39
38
  from sglang.srt.utils import (
40
39
  log_info_on_rank0,
@@ -43,6 +42,7 @@ from sglang.srt.utils import (
43
42
  require_mlp_sync,
44
43
  require_mlp_tp_gather,
45
44
  )
45
+ from sglang.srt.utils.patch_torch import monkey_patch_torch_compile
46
46
 
47
47
  logger = logging.getLogger(__name__)
48
48
 
@@ -607,7 +607,7 @@ class CPUGraphRunner:
607
607
  def get_spec_info(self, num_tokens: int):
608
608
  spec_info = None
609
609
  if self.model_runner.spec_algorithm.is_eagle():
610
- from sglang.srt.speculative.eagle_utils import EagleVerifyInput
610
+ from sglang.srt.speculative.eagle_info import EagleVerifyInput
611
611
 
612
612
  if self.model_runner.is_draft_worker:
613
613
  raise RuntimeError("This should not happen.")