sglang 0.5.2rc1__py3-none-any.whl → 0.5.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (395) hide show
  1. sglang/bench_one_batch.py +7 -9
  2. sglang/bench_one_batch_server.py +330 -31
  3. sglang/bench_serving.py +267 -32
  4. sglang/global_config.py +2 -2
  5. sglang/lang/backend/runtime_endpoint.py +1 -1
  6. sglang/lang/interpreter.py +1 -1
  7. sglang/launch_server.py +14 -0
  8. sglang/profiler.py +2 -2
  9. sglang/srt/batch_invariant_ops/__init__.py +27 -0
  10. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +549 -0
  11. sglang/srt/configs/__init__.py +8 -0
  12. sglang/srt/configs/device_config.py +3 -1
  13. sglang/srt/configs/dots_ocr.py +64 -0
  14. sglang/srt/configs/dots_vlm.py +139 -0
  15. sglang/srt/configs/falcon_h1.py +360 -0
  16. sglang/srt/configs/internvl.py +6 -0
  17. sglang/srt/configs/load_config.py +9 -0
  18. sglang/srt/configs/model_config.py +181 -82
  19. sglang/srt/configs/qwen3_next.py +326 -0
  20. sglang/srt/configs/qwen3_vl.py +586 -0
  21. sglang/srt/connector/__init__.py +8 -1
  22. sglang/srt/connector/remote_instance.py +82 -0
  23. sglang/srt/constrained/base_grammar_backend.py +49 -12
  24. sglang/srt/constrained/llguidance_backend.py +0 -1
  25. sglang/srt/constrained/outlines_backend.py +0 -1
  26. sglang/srt/constrained/outlines_jump_forward.py +1 -1
  27. sglang/srt/constrained/xgrammar_backend.py +30 -9
  28. sglang/srt/custom_op.py +11 -1
  29. sglang/srt/debug_utils/dump_comparator.py +81 -44
  30. sglang/srt/debug_utils/dump_loader.py +97 -0
  31. sglang/srt/debug_utils/dumper.py +21 -6
  32. sglang/srt/debug_utils/text_comparator.py +73 -11
  33. sglang/srt/disaggregation/ascend/conn.py +2 -2
  34. sglang/srt/disaggregation/ascend/transfer_engine.py +47 -9
  35. sglang/srt/disaggregation/base/conn.py +1 -1
  36. sglang/srt/disaggregation/common/conn.py +279 -108
  37. sglang/srt/disaggregation/decode.py +71 -19
  38. sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
  39. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +29 -17
  40. sglang/srt/disaggregation/fake/conn.py +1 -1
  41. sglang/srt/disaggregation/mini_lb.py +6 -445
  42. sglang/srt/disaggregation/mooncake/conn.py +55 -537
  43. sglang/srt/disaggregation/nixl/conn.py +326 -53
  44. sglang/srt/disaggregation/prefill.py +36 -17
  45. sglang/srt/disaggregation/utils.py +40 -54
  46. sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
  47. sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
  48. sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
  49. sglang/srt/distributed/parallel_state.py +192 -113
  50. sglang/srt/entrypoints/engine.py +59 -18
  51. sglang/srt/entrypoints/grpc_request_manager.py +855 -0
  52. sglang/srt/entrypoints/grpc_server.py +810 -0
  53. sglang/srt/entrypoints/http_server.py +132 -57
  54. sglang/srt/entrypoints/openai/protocol.py +115 -7
  55. sglang/srt/entrypoints/openai/serving_base.py +65 -3
  56. sglang/srt/entrypoints/openai/serving_chat.py +207 -58
  57. sglang/srt/entrypoints/openai/serving_completions.py +17 -4
  58. sglang/srt/entrypoints/openai/serving_embedding.py +10 -4
  59. sglang/srt/entrypoints/openai/serving_rerank.py +3 -1
  60. sglang/srt/entrypoints/openai/serving_responses.py +49 -4
  61. sglang/srt/entrypoints/openai/serving_score.py +1 -0
  62. sglang/srt/environ.py +285 -0
  63. sglang/srt/eplb/eplb_manager.py +2 -2
  64. sglang/srt/eplb/expert_distribution.py +26 -13
  65. sglang/srt/eplb/expert_location.py +38 -8
  66. sglang/srt/eplb/expert_location_updater.py +1 -1
  67. sglang/srt/function_call/base_format_detector.py +3 -6
  68. sglang/srt/function_call/ebnf_composer.py +11 -9
  69. sglang/srt/function_call/function_call_parser.py +9 -2
  70. sglang/srt/function_call/glm4_moe_detector.py +4 -4
  71. sglang/srt/function_call/gpt_oss_detector.py +24 -1
  72. sglang/srt/function_call/json_array_parser.py +63 -0
  73. sglang/srt/function_call/kimik2_detector.py +17 -4
  74. sglang/srt/function_call/qwen3_coder_detector.py +1 -1
  75. sglang/srt/function_call/utils.py +96 -5
  76. sglang/srt/grpc/__init__.py +1 -0
  77. sglang/srt/grpc/compile_proto.py +245 -0
  78. sglang/srt/grpc/sglang_scheduler_pb2.py +111 -0
  79. sglang/srt/grpc/sglang_scheduler_pb2.pyi +434 -0
  80. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +239 -0
  81. sglang/srt/layers/activation.py +143 -9
  82. sglang/srt/layers/attention/aiter_backend.py +106 -82
  83. sglang/srt/layers/attention/ascend_backend.py +115 -9
  84. sglang/srt/layers/attention/attention_registry.py +206 -0
  85. sglang/srt/layers/attention/base_attn_backend.py +12 -3
  86. sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
  87. sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
  88. sglang/srt/layers/attention/fla/chunk.py +242 -0
  89. sglang/srt/layers/attention/fla/chunk_delta_h.py +314 -0
  90. sglang/srt/layers/attention/fla/chunk_o.py +178 -0
  91. sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +151 -0
  92. sglang/srt/layers/attention/fla/cumsum.py +300 -0
  93. sglang/srt/layers/attention/fla/fused_recurrent.py +640 -0
  94. sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +232 -0
  95. sglang/srt/layers/attention/fla/index.py +37 -0
  96. sglang/srt/layers/attention/fla/l2norm.py +150 -0
  97. sglang/srt/layers/attention/fla/layernorm_gated.py +326 -0
  98. sglang/srt/layers/attention/fla/op.py +66 -0
  99. sglang/srt/layers/attention/fla/solve_tril.py +465 -0
  100. sglang/srt/layers/attention/fla/utils.py +331 -0
  101. sglang/srt/layers/attention/fla/wy_fast.py +158 -0
  102. sglang/srt/layers/attention/flashattention_backend.py +41 -8
  103. sglang/srt/layers/attention/flashinfer_backend.py +118 -198
  104. sglang/srt/layers/attention/flashinfer_mla_backend.py +27 -27
  105. sglang/srt/layers/attention/flashmla_backend.py +7 -5
  106. sglang/srt/layers/attention/hybrid_attn_backend.py +68 -53
  107. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +602 -0
  108. sglang/srt/layers/attention/intel_amx_backend.py +3 -0
  109. sglang/srt/layers/attention/mamba/causal_conv1d.py +129 -0
  110. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +969 -0
  111. sglang/srt/layers/attention/mamba/mamba.py +629 -0
  112. sglang/srt/layers/attention/mamba/mamba_utils.py +81 -0
  113. sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
  114. sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
  115. sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
  116. sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +264 -0
  117. sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +622 -0
  118. sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +757 -0
  119. sglang/srt/layers/attention/mamba/ops/ssd_combined.py +262 -0
  120. sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +275 -0
  121. sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
  122. sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
  123. sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
  124. sglang/srt/layers/attention/nsa/nsa_indexer.py +761 -0
  125. sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
  126. sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
  127. sglang/srt/layers/attention/nsa/transform_index.py +144 -0
  128. sglang/srt/layers/attention/nsa/utils.py +24 -0
  129. sglang/srt/layers/attention/nsa_backend.py +887 -0
  130. sglang/srt/layers/attention/tbo_backend.py +6 -6
  131. sglang/srt/layers/attention/torch_flex_backend.py +325 -0
  132. sglang/srt/layers/attention/torch_native_backend.py +12 -6
  133. sglang/srt/layers/attention/triton_backend.py +57 -7
  134. sglang/srt/layers/attention/trtllm_mha_backend.py +5 -7
  135. sglang/srt/layers/attention/trtllm_mla_backend.py +276 -39
  136. sglang/srt/layers/attention/vision.py +58 -0
  137. sglang/srt/layers/attention/wave_backend.py +4 -4
  138. sglang/srt/layers/attention/wave_ops/decode_attention.py +2 -4
  139. sglang/srt/layers/attention/wave_ops/extend_attention.py +1 -3
  140. sglang/srt/layers/communicator.py +53 -7
  141. sglang/srt/layers/dp_attention.py +41 -2
  142. sglang/srt/layers/elementwise.py +3 -1
  143. sglang/srt/layers/layernorm.py +34 -15
  144. sglang/srt/layers/linear.py +55 -7
  145. sglang/srt/layers/logits_processor.py +44 -12
  146. sglang/srt/layers/moe/__init__.py +2 -1
  147. sglang/srt/layers/moe/cutlass_w4a8_moe.py +3 -3
  148. sglang/srt/layers/moe/ep_moe/kernels.py +2 -2
  149. sglang/srt/layers/moe/ep_moe/layer.py +256 -63
  150. sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +183 -0
  151. sglang/srt/layers/moe/fused_moe_native.py +5 -3
  152. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  153. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=352,device_name=NVIDIA_RTX_5880_Ada_Generation,dtype=fp8_w8a8.json +146 -0
  154. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  155. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=512,device_name=NVIDIA_H20.json +146 -0
  156. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/{E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json } +29 -29
  157. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  158. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
  159. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
  160. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  161. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
  162. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
  163. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
  164. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  165. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
  166. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -2
  167. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +7 -3
  168. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +23 -20
  169. sglang/srt/layers/moe/fused_moe_triton/layer.py +71 -70
  170. sglang/srt/layers/moe/moe_runner/__init__.py +2 -1
  171. sglang/srt/layers/moe/moe_runner/base.py +274 -1
  172. sglang/srt/layers/moe/moe_runner/runner.py +80 -0
  173. sglang/srt/layers/moe/moe_runner/triton.py +448 -0
  174. sglang/srt/layers/moe/token_dispatcher/__init__.py +16 -4
  175. sglang/srt/layers/moe/token_dispatcher/{base_dispatcher.py → base.py} +67 -17
  176. sglang/srt/layers/moe/token_dispatcher/deepep.py +118 -56
  177. sglang/srt/layers/moe/token_dispatcher/standard.py +44 -2
  178. sglang/srt/layers/moe/topk.py +30 -9
  179. sglang/srt/layers/moe/utils.py +22 -7
  180. sglang/srt/layers/parameter.py +23 -6
  181. sglang/srt/layers/quantization/awq.py +19 -7
  182. sglang/srt/layers/quantization/base_config.py +11 -6
  183. sglang/srt/layers/quantization/blockwise_int8.py +38 -27
  184. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +1 -0
  185. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +50 -30
  186. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +2 -0
  187. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +13 -1
  188. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
  189. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +2 -10
  190. sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +27 -0
  191. sglang/srt/layers/quantization/fp8.py +78 -49
  192. sglang/srt/layers/quantization/fp8_utils.py +51 -32
  193. sglang/srt/layers/quantization/gptq.py +25 -17
  194. sglang/srt/layers/quantization/modelopt_quant.py +225 -57
  195. sglang/srt/layers/quantization/moe_wna16.py +21 -18
  196. sglang/srt/layers/quantization/mxfp4.py +77 -42
  197. sglang/srt/layers/quantization/quark/quark_moe.py +48 -30
  198. sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +49 -30
  199. sglang/srt/layers/quantization/quark/utils.py +97 -0
  200. sglang/srt/layers/quantization/rocm_mxfp4_utils.py +13 -0
  201. sglang/srt/layers/quantization/unquant.py +135 -47
  202. sglang/srt/layers/quantization/w4afp8.py +26 -17
  203. sglang/srt/layers/quantization/w8a8_fp8.py +35 -20
  204. sglang/srt/layers/quantization/w8a8_int8.py +91 -41
  205. sglang/srt/layers/rocm_linear_utils.py +44 -0
  206. sglang/srt/layers/rotary_embedding.py +78 -49
  207. sglang/srt/layers/sampler.py +213 -21
  208. sglang/srt/layers/utils.py +23 -0
  209. sglang/srt/lora/backend/base_backend.py +50 -8
  210. sglang/srt/lora/backend/chunked_backend.py +348 -0
  211. sglang/srt/lora/backend/triton_backend.py +99 -5
  212. sglang/srt/lora/layers.py +32 -0
  213. sglang/srt/lora/lora.py +8 -3
  214. sglang/srt/lora/lora_manager.py +52 -118
  215. sglang/srt/lora/mem_pool.py +25 -11
  216. sglang/srt/lora/triton_ops/__init__.py +4 -0
  217. sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
  218. sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +174 -0
  219. sglang/srt/lora/utils.py +22 -11
  220. sglang/srt/managers/async_dynamic_batch_tokenizer.py +170 -0
  221. sglang/srt/managers/cache_controller.py +215 -314
  222. sglang/srt/managers/data_parallel_controller.py +115 -80
  223. sglang/srt/managers/detokenizer_manager.py +19 -15
  224. sglang/srt/managers/disagg_service.py +46 -0
  225. sglang/srt/managers/io_struct.py +340 -109
  226. sglang/srt/managers/mm_utils.py +44 -6
  227. sglang/srt/managers/multi_tokenizer_mixin.py +358 -404
  228. sglang/srt/managers/multimodal_processor.py +1 -2
  229. sglang/srt/managers/overlap_utils.py +53 -0
  230. sglang/srt/managers/schedule_batch.py +240 -138
  231. sglang/srt/managers/schedule_policy.py +147 -19
  232. sglang/srt/managers/scheduler.py +501 -304
  233. sglang/srt/managers/scheduler_input_blocker.py +1 -1
  234. sglang/srt/managers/scheduler_metrics_mixin.py +119 -40
  235. sglang/srt/managers/scheduler_output_processor_mixin.py +75 -22
  236. sglang/srt/managers/scheduler_profiler_mixin.py +6 -6
  237. sglang/srt/managers/scheduler_update_weights_mixin.py +7 -0
  238. sglang/srt/managers/template_manager.py +3 -3
  239. sglang/srt/managers/tokenizer_communicator_mixin.py +675 -0
  240. sglang/srt/managers/tokenizer_manager.py +321 -632
  241. sglang/srt/managers/tp_worker.py +81 -22
  242. sglang/srt/managers/tp_worker_overlap_thread.py +71 -56
  243. sglang/srt/managers/utils.py +1 -45
  244. sglang/srt/mem_cache/allocator.py +15 -21
  245. sglang/srt/mem_cache/allocator_ascend.py +41 -27
  246. sglang/srt/mem_cache/base_prefix_cache.py +1 -1
  247. sglang/srt/mem_cache/chunk_cache.py +8 -1
  248. sglang/srt/mem_cache/evict_policy.py +23 -0
  249. sglang/srt/mem_cache/hicache_storage.py +58 -34
  250. sglang/srt/mem_cache/hiradix_cache.py +227 -80
  251. sglang/srt/mem_cache/memory_pool.py +535 -58
  252. sglang/srt/mem_cache/memory_pool_host.py +239 -223
  253. sglang/srt/mem_cache/radix_cache.py +222 -73
  254. sglang/srt/mem_cache/radix_cache_cpp.py +11 -8
  255. sglang/srt/mem_cache/storage/__init__.py +10 -0
  256. sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +151 -0
  257. sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +109 -0
  258. sglang/srt/mem_cache/storage/backend_factory.py +223 -0
  259. sglang/srt/mem_cache/storage/eic/eic_storage.py +778 -0
  260. sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
  261. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +164 -0
  262. sglang/srt/mem_cache/storage/hf3fs/{client_hf3fs.py → hf3fs_usrbio_client.py} +5 -1
  263. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +268 -63
  264. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +284 -0
  265. sglang/srt/mem_cache/storage/lmcache/unit_test.py +121 -0
  266. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +198 -30
  267. sglang/srt/mem_cache/storage/mooncake_store/test_mooncake_store.py +161 -0
  268. sglang/srt/mem_cache/swa_radix_cache.py +25 -36
  269. sglang/srt/metrics/collector.py +519 -132
  270. sglang/srt/metrics/func_timer.py +2 -7
  271. sglang/srt/metrics/startup_func_log_and_timer.py +150 -0
  272. sglang/srt/metrics/utils.py +55 -0
  273. sglang/srt/model_executor/cpu_graph_runner.py +640 -0
  274. sglang/srt/model_executor/cuda_graph_runner.py +52 -37
  275. sglang/srt/model_executor/forward_batch_info.py +98 -57
  276. sglang/srt/model_executor/model_runner.py +433 -158
  277. sglang/srt/model_executor/npu_graph_runner.py +12 -5
  278. sglang/srt/model_loader/__init__.py +9 -3
  279. sglang/srt/model_loader/loader.py +133 -5
  280. sglang/srt/model_loader/remote_instance_weight_loader_utils.py +69 -0
  281. sglang/srt/model_loader/weight_utils.py +158 -3
  282. sglang/srt/models/apertus.py +686 -0
  283. sglang/srt/models/bailing_moe.py +820 -217
  284. sglang/srt/models/bailing_moe_nextn.py +168 -0
  285. sglang/srt/models/deepseek_nextn.py +6 -1
  286. sglang/srt/models/deepseek_v2.py +833 -152
  287. sglang/srt/models/dots_ocr.py +173 -0
  288. sglang/srt/models/dots_vlm.py +174 -0
  289. sglang/srt/models/dots_vlm_vit.py +337 -0
  290. sglang/srt/models/ernie4.py +1 -1
  291. sglang/srt/models/falcon_h1.py +576 -0
  292. sglang/srt/models/gemma3_causal.py +0 -2
  293. sglang/srt/models/gemma3_mm.py +1 -1
  294. sglang/srt/models/gemma3n_mm.py +2 -2
  295. sglang/srt/models/glm4_moe.py +14 -5
  296. sglang/srt/models/glm4_moe_nextn.py +2 -2
  297. sglang/srt/models/glm4v.py +5 -3
  298. sglang/srt/models/glm4v_moe.py +4 -1
  299. sglang/srt/models/gpt_oss.py +8 -31
  300. sglang/srt/models/internvl.py +28 -0
  301. sglang/srt/models/kimi_vl_moonvit.py +2 -2
  302. sglang/srt/models/llama.py +4 -0
  303. sglang/srt/models/llama4.py +9 -0
  304. sglang/srt/models/llama_eagle3.py +13 -0
  305. sglang/srt/models/longcat_flash.py +3 -3
  306. sglang/srt/models/longcat_flash_nextn.py +1 -1
  307. sglang/srt/models/minicpmv.py +165 -3
  308. sglang/srt/models/mllama4.py +40 -4
  309. sglang/srt/models/opt.py +637 -0
  310. sglang/srt/models/qwen2_5_vl.py +29 -5
  311. sglang/srt/models/qwen2_audio.py +1 -1
  312. sglang/srt/models/qwen2_moe.py +124 -14
  313. sglang/srt/models/qwen2_vl.py +1 -1
  314. sglang/srt/models/qwen3.py +26 -5
  315. sglang/srt/models/qwen3_moe.py +71 -12
  316. sglang/srt/models/qwen3_next.py +1069 -0
  317. sglang/srt/models/qwen3_next_mtp.py +112 -0
  318. sglang/srt/models/qwen3_vl.py +787 -0
  319. sglang/srt/models/qwen3_vl_moe.py +471 -0
  320. sglang/srt/models/registry.py +15 -3
  321. sglang/srt/models/sarashina2_vision.py +269 -0
  322. sglang/srt/models/solar.py +505 -0
  323. sglang/srt/models/starcoder2.py +357 -0
  324. sglang/srt/models/step3_vl.py +1 -1
  325. sglang/srt/models/torch_native_llama.py +10 -3
  326. sglang/srt/models/utils.py +51 -0
  327. sglang/srt/multimodal/processors/base_processor.py +15 -7
  328. sglang/srt/multimodal/processors/dots_vlm.py +98 -0
  329. sglang/srt/multimodal/processors/glm4v.py +9 -9
  330. sglang/srt/multimodal/processors/internvl.py +153 -129
  331. sglang/srt/multimodal/processors/qwen_vl.py +23 -6
  332. sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
  333. sglang/srt/offloader.py +27 -3
  334. sglang/srt/{jinja_template_utils.py → parser/jinja_template_utils.py} +6 -0
  335. sglang/srt/{reasoning_parser.py → parser/reasoning_parser.py} +1 -1
  336. sglang/srt/sampling/sampling_batch_info.py +38 -17
  337. sglang/srt/sampling/sampling_params.py +7 -0
  338. sglang/srt/server_args.py +1030 -254
  339. sglang/srt/server_args_config_parser.py +146 -0
  340. sglang/srt/single_batch_overlap.py +151 -0
  341. sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
  342. sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
  343. sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
  344. sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
  345. sglang/srt/speculative/cpp_ngram/param.h +125 -0
  346. sglang/srt/speculative/cpp_ngram/queue.h +71 -0
  347. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +7 -1
  348. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +13 -2
  349. sglang/srt/speculative/{eagle_utils.py → eagle_info.py} +207 -757
  350. sglang/srt/speculative/eagle_worker.py +253 -136
  351. sglang/srt/speculative/ngram_utils.py +428 -0
  352. sglang/srt/speculative/ngram_worker.py +245 -0
  353. sglang/srt/speculative/spec_info.py +52 -0
  354. sglang/srt/speculative/spec_utils.py +606 -0
  355. sglang/srt/speculative/standalone_worker.py +109 -0
  356. sglang/srt/torch_memory_saver_adapter.py +5 -7
  357. sglang/srt/tracing/trace.py +578 -0
  358. sglang/srt/two_batch_overlap.py +8 -5
  359. sglang/srt/utils/__init__.py +2 -0
  360. sglang/srt/{utils.py → utils/common.py} +445 -77
  361. sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +53 -5
  362. sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
  363. sglang/srt/utils/rpd_utils.py +452 -0
  364. sglang/srt/utils/slow_rank_detector.py +71 -0
  365. sglang/srt/warmup.py +8 -4
  366. sglang/srt/weight_sync/utils.py +2 -2
  367. sglang/test/attention/test_trtllm_mla_backend.py +169 -5
  368. sglang/test/few_shot_gsm8k.py +1 -0
  369. sglang/test/get_logits_ut.py +57 -0
  370. sglang/test/run_eval.py +79 -11
  371. sglang/test/runners.py +5 -1
  372. sglang/test/simple_eval_common.py +5 -2
  373. sglang/test/simple_eval_mmmu_vlm.py +441 -0
  374. sglang/test/test_block_fp8.py +2 -2
  375. sglang/test/test_cutlass_moe.py +24 -6
  376. sglang/test/test_deterministic.py +297 -0
  377. sglang/test/test_disaggregation_utils.py +77 -0
  378. sglang/test/test_fp4_moe.py +370 -1
  379. sglang/test/test_programs.py +1 -1
  380. sglang/test/test_utils.py +383 -5
  381. sglang/utils.py +22 -1
  382. sglang/version.py +1 -1
  383. {sglang-0.5.2rc1.dist-info → sglang-0.5.3.dist-info}/METADATA +69 -124
  384. {sglang-0.5.2rc1.dist-info → sglang-0.5.3.dist-info}/RECORD +392 -258
  385. sglang/srt/disaggregation/launch_lb.py +0 -118
  386. sglang/srt/mem_cache/lora_radix_cache.py +0 -421
  387. sglang/srt/mem_cache/storage/mooncake_store/unit_test.py +0 -40
  388. /sglang/srt/{model_parallel.py → layers/model_parallel.py} +0 -0
  389. /sglang/srt/{code_completion_parser.py → parser/code_completion_parser.py} +0 -0
  390. /sglang/srt/{conversation.py → parser/conversation.py} +0 -0
  391. /sglang/srt/{harmony_parser.py → parser/harmony_parser.py} +0 -0
  392. /sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
  393. {sglang-0.5.2rc1.dist-info → sglang-0.5.3.dist-info}/WHEEL +0 -0
  394. {sglang-0.5.2rc1.dist-info → sglang-0.5.3.dist-info}/licenses/LICENSE +0 -0
  395. {sglang-0.5.2rc1.dist-info → sglang-0.5.3.dist-info}/top_level.txt +0 -0
@@ -12,12 +12,13 @@
12
12
  # limitations under the License.
13
13
  # ==============================================================================
14
14
  """Utilities for Prometheus Metrics Collection."""
15
-
16
15
  import time
17
- from dataclasses import dataclass
18
- from enum import Enum
16
+ from dataclasses import dataclass, field
19
17
  from typing import Dict, List, Optional, Union
20
18
 
19
+ from sglang.srt.disaggregation.utils import DisaggregationMode
20
+ from sglang.srt.metrics.utils import exponential_buckets, generate_buckets
21
+ from sglang.srt.server_args import ServerArgs
21
22
  from sglang.srt.utils import get_bool_env_var
22
23
 
23
24
  SGLANG_TEST_REQUEST_TIME_STATS = get_bool_env_var("SGLANG_TEST_REQUEST_TIME_STATS")
@@ -33,6 +34,7 @@ class TimeStats:
33
34
  Decode: prealloc_queue -> transfer_queue -> wait_queue -> forward -> completion
34
35
  """
35
36
 
37
+ disagg_mode: DisaggregationMode = DisaggregationMode.NULL
36
38
  lb_entry_time: float = 0.0
37
39
  wait_queue_entry_time: float = 0.0
38
40
  forward_entry_time: float = 0.0
@@ -42,17 +44,11 @@ class TimeStats:
42
44
  decode_prealloc_queue_entry_time: float = 0.0
43
45
  decode_transfer_queue_entry_time: float = 0.0
44
46
 
45
- class RequestType(Enum):
46
- UNIFIED = "unified"
47
- PREFILL = "prefill"
48
- DECODE = "decode"
49
- INVALID = "invalid"
50
-
51
- def __str__(self) -> str:
52
- # if unified
53
- _type = self.get_type()
47
+ def get_queueing_time(self) -> float:
48
+ return self.forward_entry_time - self.wait_queue_entry_time
54
49
 
55
- if _type == self.RequestType.UNIFIED:
50
+ def convert_to_duration(self) -> str:
51
+ if self.disagg_mode == DisaggregationMode.NULL:
56
52
  queue_duration = self.forward_entry_time - self.wait_queue_entry_time
57
53
  forward_duration = self.completion_time - self.forward_entry_time
58
54
 
@@ -61,30 +57,28 @@ class TimeStats:
61
57
  queue_duration >= 0 and forward_duration >= 0
62
58
  ), f"queue_duration={queue_duration} < 0 or forward_duration={forward_duration} < 0"
63
59
 
64
- return f"queue_duration={self.format_duration(queue_duration)}, forward_duration={self.format_duration(forward_duration)}, start_time={self.wait_queue_entry_time}"
65
- elif _type == self.RequestType.PREFILL:
60
+ return f"queue_duration={self.format_duration(queue_duration)}, forward_duration={self.format_duration(forward_duration)}, start_time={self.wait_queue_entry_time:.3f}"
61
+ elif self.disagg_mode == DisaggregationMode.PREFILL:
66
62
  bootstrap_duration = (
67
63
  self.wait_queue_entry_time - self.prefill_bootstrap_queue_entry_time
68
64
  )
69
-
70
65
  queue_duration = self.forward_entry_time - self.wait_queue_entry_time
71
-
72
66
  forward_duration = self.completion_time - self.forward_entry_time
73
67
 
74
68
  if SGLANG_TEST_REQUEST_TIME_STATS:
75
- assert (
76
- bootstrap_duration >= 0
77
- and queue_duration >= 0
78
- and forward_duration >= 0
79
- ), f"bootstrap_duration={bootstrap_duration} < 0 or queue_duration={queue_duration} < 0 or forward_duration={forward_duration} < 0"
80
- return f"bootstrap_duration={self.format_duration(bootstrap_duration)}, queue_duration={self.format_duration(queue_duration)}, forward_duration={self.format_duration(forward_duration)}, start_time={self.prefill_bootstrap_queue_entry_time}"
81
- # if decode
82
- elif _type == self.RequestType.DECODE:
69
+ if self.wait_queue_entry_time > 0:
70
+ assert (
71
+ bootstrap_duration >= 0
72
+ and queue_duration >= 0
73
+ and forward_duration >= 0
74
+ ), f"bootstrap_duration={bootstrap_duration} < 0 or queue_duration={queue_duration} < 0 or forward_duration={forward_duration} < 0"
75
+
76
+ return f"bootstrap_duration={self.format_duration(bootstrap_duration)}, queue_duration={self.format_duration(queue_duration)}, forward_duration={self.format_duration(forward_duration)}, start_time={self.prefill_bootstrap_queue_entry_time:.3f}"
77
+ elif self.disagg_mode == DisaggregationMode.DECODE:
83
78
  prealloc_duration = (
84
79
  self.decode_transfer_queue_entry_time
85
80
  - self.decode_prealloc_queue_entry_time
86
81
  )
87
-
88
82
  transfer_duration = (
89
83
  self.wait_queue_entry_time - self.decode_transfer_queue_entry_time
90
84
  )
@@ -92,67 +86,74 @@ class TimeStats:
92
86
  forward_duration = self.completion_time - self.forward_entry_time
93
87
 
94
88
  if SGLANG_TEST_REQUEST_TIME_STATS:
95
- assert (
96
- prealloc_duration >= 0
97
- and transfer_duration >= 0
98
- and queue_duration >= 0
99
- and forward_duration >= 0
100
- ), f"prealloc_duration={prealloc_duration} < 0 or transfer_duration={transfer_duration} < 0 or queue_duration={queue_duration} < 0 or forward_duration={forward_duration} < 0"
101
-
102
- return f"prealloc_duration={self.format_duration(prealloc_duration)}, transfer_duration={self.format_duration(transfer_duration)}, queue_duration={self.format_duration(queue_duration)}, forward_duration={self.format_duration(forward_duration)}, start_time={self.decode_prealloc_queue_entry_time}"
89
+ if self.wait_queue_entry_time > 0:
90
+ assert (
91
+ prealloc_duration >= 0
92
+ and transfer_duration >= 0
93
+ and queue_duration >= 0
94
+ and forward_duration >= 0
95
+ ), f"prealloc_duration={prealloc_duration} < 0 or transfer_duration={transfer_duration} < 0 or queue_duration={queue_duration} < 0 or forward_duration={forward_duration} < 0. {self=}"
96
+
97
+ return f"prealloc_duration={self.format_duration(prealloc_duration)}, transfer_duration={self.format_duration(transfer_duration)}, queue_duration={self.format_duration(queue_duration)}, forward_duration={self.format_duration(forward_duration)}, start_time={self.decode_prealloc_queue_entry_time:.3f}"
103
98
  else:
104
- return "Invalid Time Stats"
99
+ return "Unknown Time Stats"
105
100
 
106
101
  def format_duration(self, duration: float) -> str:
107
102
  return f"{duration * 1e3:.2f}ms"
108
103
 
109
- def get_type(self) -> RequestType:
110
- """Determine the type of request based on timestamp values."""
111
- if (
112
- self.prefill_bootstrap_queue_entry_time == 0.0
113
- and self.prefill_transfer_queue_entry_time == 0.0
114
- and self.decode_prealloc_queue_entry_time == 0.0
115
- and self.decode_transfer_queue_entry_time == 0.0
116
- ):
117
- return self.RequestType.UNIFIED
118
- elif (
119
- self.prefill_bootstrap_queue_entry_time > 0.0
120
- and self.prefill_transfer_queue_entry_time > 0.0
121
- ):
122
- return self.RequestType.PREFILL
123
- elif (
124
- self.decode_prealloc_queue_entry_time > 0.0
125
- and self.decode_transfer_queue_entry_time > 0.0
126
- and self.wait_queue_entry_time > 0.0
127
- ):
128
- return self.RequestType.DECODE
104
+ def disagg_mode_str(self) -> str:
105
+ if self.disagg_mode == DisaggregationMode.NULL:
106
+ return "unified"
107
+ elif self.disagg_mode == DisaggregationMode.DECODE:
108
+ return "decode"
109
+ elif self.disagg_mode == DisaggregationMode.PREFILL:
110
+ return "prefill"
129
111
  else:
130
- return self.RequestType.INVALID
112
+ return "unknown"
131
113
 
132
114
 
133
115
  @dataclass
134
116
  class SchedulerStats:
117
+ # Basics
135
118
  num_running_reqs: int = 0
136
119
  num_used_tokens: int = 0
137
120
  token_usage: float = 0.0
121
+ swa_token_usage: float = 0.0
138
122
  gen_throughput: float = 0.0
139
123
  num_queue_reqs: int = 0
140
- cache_hit_rate: float = 0.0
141
124
  num_grammar_queue_reqs: int = 0
125
+ num_running_reqs_offline_batch: int = 0
126
+ cache_hit_rate: float = 0.0
127
+
128
+ # Speculative decoding
142
129
  spec_accept_length: float = 0.0
143
- avg_request_queue_latency: float = 0.0
130
+
131
+ # Retract
132
+ num_retracted_reqs: int = 0
133
+ num_paused_reqs: int = 0
134
+
135
+ # PD disaggregation
144
136
  num_prefill_prealloc_queue_reqs: int = 0
145
137
  num_prefill_inflight_queue_reqs: int = 0
146
138
  num_decode_prealloc_queue_reqs: int = 0
147
139
  num_decode_transfer_queue_reqs: int = 0
148
- total_retracted_reqs: int = 0
140
+ kv_transfer_speed_gb_s: float = 0.0
141
+ kv_transfer_latency_ms: float = 0.0
142
+
143
+ # Utilization
144
+ utilization: float = 0.0
145
+ max_running_requests_under_SLO: Optional[int] = None
146
+
147
+ # Engine startup
148
+ engine_startup_time: float = 0.0
149
+ engine_load_weights_time: float = 0.0
149
150
 
150
151
 
151
152
  class SchedulerMetricsCollector:
152
153
 
153
154
  def __init__(self, labels: Dict[str, str]) -> None:
154
155
  # We need to import prometheus_client after setting the env variable `PROMETHEUS_MULTIPROC_DIR`
155
- from prometheus_client import Counter, Gauge
156
+ from prometheus_client import Counter, Gauge, Histogram
156
157
 
157
158
  self.labels = labels
158
159
  self.last_log_time = time.perf_counter()
@@ -163,42 +164,48 @@ class SchedulerMetricsCollector:
163
164
  labelnames=labels.keys(),
164
165
  multiprocess_mode="mostrecent",
165
166
  )
166
-
167
167
  self.num_used_tokens = Gauge(
168
168
  name="sglang:num_used_tokens",
169
169
  documentation="The number of used tokens.",
170
170
  labelnames=labels.keys(),
171
171
  multiprocess_mode="mostrecent",
172
172
  )
173
-
174
173
  self.token_usage = Gauge(
175
174
  name="sglang:token_usage",
176
175
  documentation="The token usage.",
177
176
  labelnames=labels.keys(),
178
177
  multiprocess_mode="mostrecent",
179
178
  )
180
-
179
+ self.swa_token_usage = Gauge(
180
+ name="sglang:swa_token_usage",
181
+ documentation="The token usage for SWA layers.",
182
+ labelnames=labels.keys(),
183
+ multiprocess_mode="mostrecent",
184
+ )
181
185
  self.gen_throughput = Gauge(
182
186
  name="sglang:gen_throughput",
183
187
  documentation="The generation throughput (token/s).",
184
188
  labelnames=labels.keys(),
185
189
  multiprocess_mode="mostrecent",
186
190
  )
187
-
188
191
  self.num_queue_reqs = Gauge(
189
192
  name="sglang:num_queue_reqs",
190
193
  documentation="The number of requests in the waiting queue.",
191
194
  labelnames=labels.keys(),
192
195
  multiprocess_mode="mostrecent",
193
196
  )
194
-
195
197
  self.num_grammar_queue_reqs = Gauge(
196
198
  name="sglang:num_grammar_queue_reqs",
197
199
  documentation="The number of requests in the grammar waiting queue.",
198
200
  labelnames=labels.keys(),
199
201
  multiprocess_mode="mostrecent",
200
202
  )
201
-
203
+ self.num_running_reqs_offline_batch = Gauge(
204
+ name="sglang:num_running_reqs_offline_batch",
205
+ documentation="The number of running low-priority offline batch requests(label is 'batch').",
206
+ labelnames=labels.keys(),
207
+ multiprocess_mode="mostrecent",
208
+ )
202
209
  self.cache_hit_rate = Gauge(
203
210
  name="sglang:cache_hit_rate",
204
211
  documentation="The prefix cache hit rate.",
@@ -206,6 +213,7 @@ class SchedulerMetricsCollector:
206
213
  multiprocess_mode="mostrecent",
207
214
  )
208
215
 
216
+ # Speculative decoding
209
217
  self.spec_accept_length = Gauge(
210
218
  name="sglang:spec_accept_length",
211
219
  documentation="The average acceptance length of speculative decoding.",
@@ -213,83 +221,307 @@ class SchedulerMetricsCollector:
213
221
  multiprocess_mode="mostrecent",
214
222
  )
215
223
 
216
- self.avg_request_queue_latency = Gauge(
217
- name="sglang:avg_request_queue_latency",
218
- documentation="The average request queue latency for the last batch of requests in seconds.",
224
+ # Retract
225
+ self.num_retracted_reqs = Gauge(
226
+ name="sglang:num_retracted_reqs",
227
+ documentation="The number of retracted requests.",
219
228
  labelnames=labels.keys(),
220
- multiprocess_mode="mostrecent",
221
229
  )
222
-
223
- self.total_retracted_reqs = Gauge(
224
- name="sglang:total_retracted_reqs",
225
- documentation="The total number of retracted requests due to kvcache full.",
230
+ self.num_paused_reqs = Gauge(
231
+ name="sglang:num_paused_reqs",
232
+ documentation="The number of paused requests by async weight sync.",
226
233
  labelnames=labels.keys(),
227
- multiprocess_mode="mostrecent",
228
234
  )
229
235
 
230
- # Disaggregation queue metrics
236
+ # PD disaggregation
231
237
  self.num_prefill_prealloc_queue_reqs = Gauge(
232
238
  name="sglang:num_prefill_prealloc_queue_reqs",
233
239
  documentation="The number of requests in the prefill prealloc queue.",
234
240
  labelnames=labels.keys(),
235
241
  multiprocess_mode="mostrecent",
236
242
  )
237
-
238
243
  self.num_prefill_inflight_queue_reqs = Gauge(
239
244
  name="sglang:num_prefill_inflight_queue_reqs",
240
245
  documentation="The number of requests in the prefill inflight queue.",
241
246
  labelnames=labels.keys(),
242
247
  multiprocess_mode="mostrecent",
243
248
  )
244
-
245
249
  self.num_decode_prealloc_queue_reqs = Gauge(
246
250
  name="sglang:num_decode_prealloc_queue_reqs",
247
251
  documentation="The number of requests in the decode prealloc queue.",
248
252
  labelnames=labels.keys(),
249
253
  multiprocess_mode="mostrecent",
250
254
  )
251
-
252
255
  self.num_decode_transfer_queue_reqs = Gauge(
253
256
  name="sglang:num_decode_transfer_queue_reqs",
254
257
  documentation="The number of requests in the decode transfer queue.",
255
258
  labelnames=labels.keys(),
256
259
  multiprocess_mode="mostrecent",
257
260
  )
258
-
259
261
  self.num_bootstrap_failed_reqs = Counter(
260
- name="sglang:num_bootstrap_failed_reqs",
262
+ name="sglang:num_bootstrap_failed_reqs_total",
261
263
  documentation="The number of bootstrap failed requests.",
262
264
  labelnames=labels.keys(),
263
265
  )
264
-
265
266
  self.num_transfer_failed_reqs = Counter(
266
- name="sglang:num_transfer_failed_reqs",
267
+ name="sglang:num_transfer_failed_reqs_total",
267
268
  documentation="The number of transfer failed requests.",
268
269
  labelnames=labels.keys(),
269
270
  )
271
+ self.kv_transfer_speed_gb_s = Gauge(
272
+ name="sglang:kv_transfer_speed_gb_s",
273
+ documentation="The transfer speed of the KV cache in GB/s.",
274
+ labelnames=labels.keys(),
275
+ multiprocess_mode="mostrecent",
276
+ )
277
+ self.kv_transfer_latency_ms = Gauge(
278
+ name="sglang:kv_transfer_latency_ms",
279
+ documentation="The transfer latency of the KV cache in ms.",
280
+ labelnames=labels.keys(),
281
+ multiprocess_mode="mostrecent",
282
+ )
283
+
284
+ # Utilization
285
+ self.utilization = Gauge(
286
+ name="sglang:utilization",
287
+ documentation="The utilization.",
288
+ labelnames=labels.keys(),
289
+ multiprocess_mode="mostrecent",
290
+ )
291
+ self.max_running_requests_under_SLO = Gauge(
292
+ name="sglang:max_running_requests_under_SLO",
293
+ documentation="The maximum number of running requests under SLO.",
294
+ labelnames=labels.keys(),
295
+ multiprocess_mode="mostrecent",
296
+ )
297
+
298
+ # Engine startup
299
+ self.engine_startup_time = Gauge(
300
+ name="sglang:engine_startup_time",
301
+ documentation="The time taken for the engine to start up.",
302
+ labelnames=labels.keys(),
303
+ multiprocess_mode="mostrecent",
304
+ )
305
+ self.engine_load_weights_time = Gauge(
306
+ name="sglang:engine_load_weights_time",
307
+ documentation="The time taken for the engine to load weights.",
308
+ labelnames=labels.keys(),
309
+ multiprocess_mode="mostrecent",
310
+ )
311
+
312
+ # Additional queueing time histogram
313
+ self.queue_time = Histogram(
314
+ name="sglang:queue_time_seconds",
315
+ documentation="Histogram of queueing time in seconds.",
316
+ labelnames=labels.keys(),
317
+ buckets=[
318
+ 0.0,
319
+ 0.1,
320
+ 0.2,
321
+ 0.5,
322
+ 1,
323
+ 2,
324
+ 3,
325
+ 4,
326
+ 5,
327
+ 10,
328
+ 15,
329
+ 20,
330
+ 30,
331
+ 40,
332
+ 50,
333
+ 60,
334
+ 70,
335
+ 80,
336
+ 90,
337
+ 100,
338
+ 200,
339
+ 300,
340
+ 400,
341
+ 500,
342
+ 600,
343
+ 700,
344
+ 800,
345
+ 900,
346
+ 1000,
347
+ 1200,
348
+ 1400,
349
+ 1600,
350
+ 1800,
351
+ 2000,
352
+ 2500,
353
+ 3000,
354
+ ],
355
+ )
356
+
357
+ # Grammar metrics
358
+ self.grammar_compilation_time = Histogram(
359
+ name="sglang:grammar_compilation_time_seconds",
360
+ documentation="Histogram of grammar compilation time in seconds.",
361
+ labelnames=labels.keys(),
362
+ buckets=[
363
+ 0.0,
364
+ 0.01,
365
+ 0.02,
366
+ 0.05,
367
+ 0.1,
368
+ 0.2,
369
+ 0.5,
370
+ 1,
371
+ 2,
372
+ 5,
373
+ 10,
374
+ 20,
375
+ 30,
376
+ 60,
377
+ 90,
378
+ 120,
379
+ 240,
380
+ ],
381
+ )
382
+ self.num_grammar_cache_hit = Counter(
383
+ name="sglang:num_grammar_cache_hit_total",
384
+ documentation="Number of grammar cache hits.",
385
+ labelnames=labels.keys(),
386
+ )
387
+ self.num_grammar_aborted = Counter(
388
+ name="sglang:num_grammar_aborted_total",
389
+ documentation="Number of grammar aborted requests.",
390
+ labelnames=labels.keys(),
391
+ )
392
+ self.num_grammar_total = Counter(
393
+ name="sglang:num_grammar_total",
394
+ documentation="Number of the total grammar requests.",
395
+ labelnames=labels.keys(),
396
+ )
397
+ self.grammar_schema_count = Histogram(
398
+ name="sglang:grammar_schema_count",
399
+ documentation="Histogram of grammar schema count.",
400
+ labelnames=labels.keys(),
401
+ buckets=[
402
+ 0,
403
+ 1,
404
+ 2,
405
+ 5,
406
+ 10,
407
+ 20,
408
+ 30,
409
+ 40,
410
+ 60,
411
+ 80,
412
+ 100,
413
+ 120,
414
+ 140,
415
+ 160,
416
+ 180,
417
+ 200,
418
+ 300,
419
+ 400,
420
+ 500,
421
+ 700,
422
+ 1000,
423
+ ],
424
+ )
425
+ self.grammar_ebnf_size = Histogram(
426
+ name="sglang:grammar_ebnf_size",
427
+ documentation="Histogram of grammar EBNF size.",
428
+ labelnames=labels.keys(),
429
+ buckets=[
430
+ 0,
431
+ 50,
432
+ 100,
433
+ 200,
434
+ 300,
435
+ 500,
436
+ 1000,
437
+ 2000,
438
+ 3000,
439
+ 5000,
440
+ 10000,
441
+ 20000,
442
+ 30000,
443
+ 50000,
444
+ 100000,
445
+ ],
446
+ )
447
+
448
+ tree_traversal_time_buckets = [
449
+ 0.0,
450
+ 0.01,
451
+ 0.02,
452
+ 0.05,
453
+ 0.1,
454
+ 0.2,
455
+ 0.5,
456
+ 1,
457
+ 2,
458
+ 5,
459
+ 10,
460
+ 15,
461
+ 30,
462
+ 60,
463
+ 90,
464
+ 120,
465
+ 240,
466
+ ]
467
+ self.grammar_tree_traversal_time_avg = Histogram(
468
+ name="sglang:grammar_tree_traversal_time_avg",
469
+ documentation="Histogram of average grammar tree traversal time in seconds.",
470
+ labelnames=labels.keys(),
471
+ buckets=tree_traversal_time_buckets,
472
+ )
473
+ self.grammar_tree_traversal_time_max = Histogram(
474
+ name="sglang:grammar_tree_traversal_time_max",
475
+ documentation="Histogram of max grammar tree traversal time in seconds.",
476
+ labelnames=labels.keys(),
477
+ buckets=tree_traversal_time_buckets,
478
+ )
479
+
480
+ self.per_stage_req_latency_seconds = Histogram(
481
+ name="sglang:per_stage_req_latency_seconds",
482
+ documentation="The latency of each stage of requests.",
483
+ # captures latency in range [1ms - ~1191s]
484
+ buckets=exponential_buckets(start=0.001, width=1.62, length=30),
485
+ labelnames=list(labels.keys()) + ["stage"],
486
+ )
270
487
 
271
488
  def _log_gauge(self, gauge, data: Union[int, float]) -> None:
272
489
  # Convenience function for logging to gauge.
273
490
  gauge.labels(**self.labels).set(data)
274
491
 
492
+ def _log_histogram(self, histogram, data: Union[int, float]) -> None:
493
+ histogram.labels(**self.labels).observe(data)
494
+
275
495
  def increment_bootstrap_failed_reqs(self) -> None:
276
496
  self.num_bootstrap_failed_reqs.labels(**self.labels).inc(1)
277
497
 
278
498
  def increment_transfer_failed_reqs(self) -> None:
279
499
  self.num_transfer_failed_reqs.labels(**self.labels).inc(1)
280
500
 
501
+ def observe_per_stage_req_latency(self, stage: str, latency: float) -> None:
502
+ labels_with_stage = {**self.labels, "stage": stage}
503
+ self.per_stage_req_latency_seconds.labels(**labels_with_stage).observe(latency)
504
+
505
+ def observe_queue_time(self, latency: float) -> None:
506
+ self._log_histogram(self.queue_time, latency)
507
+
281
508
  def log_stats(self, stats: SchedulerStats) -> None:
282
509
  self._log_gauge(self.num_running_reqs, stats.num_running_reqs)
283
510
  self._log_gauge(self.num_used_tokens, stats.num_used_tokens)
284
511
  self._log_gauge(self.token_usage, stats.token_usage)
512
+ self._log_gauge(self.swa_token_usage, stats.swa_token_usage)
285
513
  self._log_gauge(self.gen_throughput, stats.gen_throughput)
286
514
  self._log_gauge(self.num_queue_reqs, stats.num_queue_reqs)
287
515
  self._log_gauge(self.num_grammar_queue_reqs, stats.num_grammar_queue_reqs)
516
+ self._log_gauge(
517
+ self.num_running_reqs_offline_batch, stats.num_running_reqs_offline_batch
518
+ )
288
519
  self._log_gauge(self.cache_hit_rate, stats.cache_hit_rate)
520
+
521
+ # Speculative decoding
289
522
  self._log_gauge(self.spec_accept_length, stats.spec_accept_length)
290
- self._log_gauge(self.total_retracted_reqs, stats.total_retracted_reqs)
291
523
 
292
- # Disaggregation metrics
524
+ # PD disaggregation
293
525
  self._log_gauge(
294
526
  self.num_prefill_prealloc_queue_reqs, stats.num_prefill_prealloc_queue_reqs
295
527
  )
@@ -302,14 +534,58 @@ class SchedulerMetricsCollector:
302
534
  self._log_gauge(
303
535
  self.num_decode_transfer_queue_reqs, stats.num_decode_transfer_queue_reqs
304
536
  )
537
+ self._log_gauge(self.kv_transfer_speed_gb_s, stats.kv_transfer_speed_gb_s)
538
+ self._log_gauge(self.kv_transfer_latency_ms, stats.kv_transfer_latency_ms)
539
+
540
+ # Retract
541
+ self._log_gauge(self.num_retracted_reqs, stats.num_retracted_reqs)
542
+ self._log_gauge(self.num_paused_reqs, stats.num_paused_reqs)
543
+
544
+ # Utilization
545
+ self._log_gauge(self.utilization, stats.utilization)
546
+ if stats.max_running_requests_under_SLO is not None:
547
+ self._log_gauge(
548
+ self.max_running_requests_under_SLO,
549
+ stats.max_running_requests_under_SLO,
550
+ )
551
+
552
+ # Engine startup time
553
+ self._log_gauge(self.engine_startup_time, stats.engine_startup_time)
554
+ if stats.engine_load_weights_time is not None:
555
+ self._log_gauge(
556
+ self.engine_load_weights_time, stats.engine_load_weights_time
557
+ )
305
558
 
306
559
  self.last_log_time = time.perf_counter()
307
560
 
561
+ def log_grammar_stats(self, grammar_stats) -> None:
562
+ # Duck-typed GrammarStats to avoid cross-package dependency
563
+ if getattr(grammar_stats, "compilation_time", None) is not None:
564
+ self._log_histogram(
565
+ self.grammar_compilation_time, grammar_stats.compilation_time
566
+ )
567
+ if getattr(grammar_stats, "schema_count", None) is not None:
568
+ self._log_histogram(self.grammar_schema_count, grammar_stats.schema_count)
569
+ if getattr(grammar_stats, "ebnf_size", None) is not None:
570
+ self._log_histogram(self.grammar_ebnf_size, grammar_stats.ebnf_size)
571
+ tree_times = getattr(grammar_stats, "tree_traversal_time", None)
572
+ if tree_times:
573
+ max_time = max(tree_times)
574
+ avg_time = sum(tree_times) / len(tree_times)
575
+ self._log_histogram(self.grammar_tree_traversal_time_max, max_time)
576
+ self._log_histogram(self.grammar_tree_traversal_time_avg, avg_time)
577
+ if getattr(grammar_stats, "is_cache_hit", False):
578
+ self.num_grammar_cache_hit.labels(**self.labels).inc(1)
579
+ if getattr(grammar_stats, "is_grammar_aborted", False):
580
+ self.num_grammar_aborted.labels(**self.labels).inc(1)
581
+ self.num_grammar_total.labels(**self.labels).inc(1)
582
+
308
583
 
309
584
  class TokenizerMetricsCollector:
310
585
  def __init__(
311
586
  self,
312
- labels: Dict[str, str],
587
+ server_args: Optional[ServerArgs] = None,
588
+ labels: Dict[str, str] = None,
313
589
  bucket_time_to_first_token: Optional[List[float]] = None,
314
590
  bucket_inter_token_latency: Optional[List[float]] = None,
315
591
  bucket_e2e_request_latency: Optional[List[float]] = None,
@@ -318,7 +594,7 @@ class TokenizerMetricsCollector:
318
594
  # We need to import prometheus_client after setting the env variable `PROMETHEUS_MULTIPROC_DIR`
319
595
  from prometheus_client import Counter, Histogram
320
596
 
321
- self.labels = labels
597
+ self.labels = labels or {}
322
598
  self.collect_tokens_histogram = collect_tokens_histogram
323
599
 
324
600
  self.prompt_tokens_total = Counter(
@@ -334,7 +610,7 @@ class TokenizerMetricsCollector:
334
610
  )
335
611
 
336
612
  if collect_tokens_histogram:
337
- bucket_prompt_tokens = [
613
+ default_bucket_prompt_tokens = [
338
614
  100,
339
615
  300,
340
616
  500,
@@ -358,39 +634,30 @@ class TokenizerMetricsCollector:
358
634
  30000,
359
635
  35000,
360
636
  40000,
637
+ 66000,
638
+ 99000,
639
+ 132000,
640
+ 300000,
641
+ 600000,
642
+ 900000,
643
+ 1100000,
361
644
  ]
362
645
  self.prompt_tokens_histogram = Histogram(
363
646
  name="sglang:prompt_tokens_histogram",
364
647
  documentation="Histogram of prompt token length.",
365
648
  labelnames=labels.keys(),
366
- buckets=bucket_prompt_tokens,
649
+ buckets=generate_buckets(
650
+ server_args.prompt_tokens_buckets, default_bucket_prompt_tokens
651
+ ),
367
652
  )
368
- bucket_generation_tokens = [
369
- 100,
370
- 300,
371
- 500,
372
- 1000,
373
- 1200,
374
- 1500,
375
- 1700,
376
- 2000,
377
- 2500,
378
- 3000,
379
- 3500,
380
- 4000,
381
- 4500,
382
- 5000,
383
- 6000,
384
- 7000,
385
- 8000,
386
- 9000,
387
- 10000,
388
- ]
389
653
  self.generation_tokens_histogram = Histogram(
390
654
  name="sglang:generation_tokens_histogram",
391
655
  documentation="Histogram of generation token length.",
392
656
  labelnames=labels.keys(),
393
- buckets=bucket_generation_tokens,
657
+ buckets=generate_buckets(
658
+ server_args.generation_tokens_buckets,
659
+ default_bucket_prompt_tokens,
660
+ ),
394
661
  )
395
662
 
396
663
  self.cached_tokens_total = Counter(
@@ -412,7 +679,7 @@ class TokenizerMetricsCollector:
412
679
  )
413
680
 
414
681
  self.num_aborted_requests_total = Counter(
415
- name="sglang:num_aborted_requests",
682
+ name="sglang:num_aborted_requests_total",
416
683
  documentation="Number of requests aborted.",
417
684
  labelnames=labels.keys(),
418
685
  )
@@ -459,7 +726,10 @@ class TokenizerMetricsCollector:
459
726
  100,
460
727
  200,
461
728
  400,
462
- 800,
729
+ 600,
730
+ 1200,
731
+ 1800,
732
+ 2400,
463
733
  ]
464
734
 
465
735
  if bucket_inter_token_latency is None:
@@ -496,7 +766,7 @@ class TokenizerMetricsCollector:
496
766
  buckets=bucket_time_to_first_token,
497
767
  )
498
768
 
499
- self.histogram_inter_token_latency_seconds = Histogram(
769
+ self.histogram_inter_token_latency = Histogram(
500
770
  name="sglang:inter_token_latency_seconds",
501
771
  documentation="Histogram of inter-token latency in seconds.",
502
772
  labelnames=labels.keys(),
@@ -510,38 +780,53 @@ class TokenizerMetricsCollector:
510
780
  buckets=bucket_e2e_request_latency,
511
781
  )
512
782
 
513
- def _log_histogram(self, histogram, data: Union[int, float]) -> None:
514
- histogram.labels(**self.labels).observe(data)
515
-
516
783
  def observe_one_finished_request(
517
784
  self,
785
+ labels: Dict[str, str],
518
786
  prompt_tokens: int,
519
787
  generation_tokens: int,
520
788
  cached_tokens: int,
521
789
  e2e_latency: float,
522
790
  has_grammar: bool,
523
791
  ):
524
- self.prompt_tokens_total.labels(**self.labels).inc(prompt_tokens)
525
- self.generation_tokens_total.labels(**self.labels).inc(generation_tokens)
792
+ self.prompt_tokens_total.labels(**labels).inc(prompt_tokens)
793
+ self.generation_tokens_total.labels(**labels).inc(generation_tokens)
526
794
  if cached_tokens > 0:
527
- self.cached_tokens_total.labels(**self.labels).inc(cached_tokens)
528
- self.num_requests_total.labels(**self.labels).inc(1)
795
+ self.cached_tokens_total.labels(**labels).inc(cached_tokens)
796
+ self.num_requests_total.labels(**labels).inc(1)
529
797
  if has_grammar:
530
- self.num_so_requests_total.labels(**self.labels).inc(1)
531
- self._log_histogram(self.histogram_e2e_request_latency, e2e_latency)
798
+ self.num_so_requests_total.labels(**labels).inc(1)
799
+ self.histogram_e2e_request_latency.labels(**labels).observe(float(e2e_latency))
532
800
  if self.collect_tokens_histogram:
533
- self._log_histogram(self.prompt_tokens_histogram, prompt_tokens)
534
- self._log_histogram(self.generation_tokens_histogram, generation_tokens)
535
-
536
- def observe_time_to_first_token(self, value: float):
537
- self.histogram_time_to_first_token.labels(**self.labels).observe(value)
801
+ self.prompt_tokens_histogram.labels(**labels).observe(float(prompt_tokens))
802
+ self.generation_tokens_histogram.labels(**labels).observe(
803
+ float(generation_tokens)
804
+ )
538
805
 
539
- def observe_inter_token_latency(self, internval: float, num_new_tokens: int):
806
+ def observe_time_to_first_token(self, labels: Dict[str, str], value: float):
807
+ self.histogram_time_to_first_token.labels(**labels).observe(value)
808
+
809
+ def check_time_to_first_token_straggler(self, value: float) -> bool:
810
+ his = self.histogram_time_to_first_token.labels(**self.labels)
811
+ total_observations = sum(bucket._value for bucket in his._buckets)
812
+ if total_observations < 1000:
813
+ return False
814
+ p999_threshold = total_observations * 0.999
815
+ cumulative_count = 0
816
+ for i, bucket in enumerate(his._buckets):
817
+ cumulative_count += bucket._value
818
+ if cumulative_count > p999_threshold:
819
+ return value >= his._upper_bounds[i]
820
+ return False
821
+
822
+ def observe_inter_token_latency(
823
+ self, labels: Dict[str, str], internval: float, num_new_tokens: int
824
+ ):
540
825
  adjusted_interval = internval / num_new_tokens
541
826
 
542
827
  # A faster version of the Histogram::observe which observes multiple values at the same time.
543
828
  # reference: https://github.com/prometheus/client_python/blob/v0.21.1/prometheus_client/metrics.py#L639
544
- his = self.histogram_inter_token_latency_seconds.labels(**self.labels)
829
+ his = self.histogram_inter_token_latency.labels(**labels)
545
830
  his._sum.inc(internval)
546
831
 
547
832
  for i, bound in enumerate(his._upper_bounds):
@@ -549,5 +834,107 @@ class TokenizerMetricsCollector:
549
834
  his._buckets[i].inc(num_new_tokens)
550
835
  break
551
836
 
552
- def observe_one_aborted_request(self):
553
- self.num_aborted_requests_total.labels(**self.labels).inc(1)
837
+ def observe_one_aborted_request(self, labels: Dict[str, str]):
838
+ self.num_aborted_requests_total.labels(**labels).inc(1)
839
+
840
+
841
+ @dataclass
842
+ class StorageMetrics:
843
+ prefetch_pgs: List[int] = field(default_factory=list)
844
+ backup_pgs: List[int] = field(default_factory=list)
845
+ prefetch_bandwidth: List[float] = field(default_factory=list)
846
+ backup_bandwidth: List[float] = field(default_factory=list)
847
+
848
+
849
+ class StorageMetricsCollector:
850
+ def __init__(
851
+ self,
852
+ labels: Dict[str, str],
853
+ ):
854
+ from prometheus_client import Counter, Histogram
855
+
856
+ self.labels = labels
857
+
858
+ self.prefetched_tokens_total = Counter(
859
+ name="sglang:prefetched_tokens_total",
860
+ documentation="Number of prefetched prompt tokens.",
861
+ labelnames=labels.keys(),
862
+ )
863
+
864
+ self.backuped_tokens_total = Counter(
865
+ name="sglang:backuped_tokens_total",
866
+ documentation="Number of backuped tokens.",
867
+ labelnames=labels.keys(),
868
+ )
869
+
870
+ bucket_io = [
871
+ 1,
872
+ 5,
873
+ 10,
874
+ 50,
875
+ 100,
876
+ ]
877
+
878
+ bucket_bandwidth = [
879
+ 0.1,
880
+ 0.5,
881
+ 1,
882
+ 5,
883
+ 10,
884
+ 50,
885
+ 100,
886
+ ]
887
+
888
+ self.histogram_prefetch_pgs = Histogram(
889
+ name="sglang:prefetch_pgs",
890
+ documentation="Histogram of prefetch pages of batches.",
891
+ labelnames=labels.keys(),
892
+ buckets=bucket_io,
893
+ )
894
+
895
+ self.histogram_backup_pgs = Histogram(
896
+ name="sglang:backup_pgs",
897
+ documentation="Histogram of backup pages of batches.",
898
+ labelnames=labels.keys(),
899
+ buckets=bucket_io,
900
+ )
901
+
902
+ self.histogram_prefetch_bandwidth = Histogram(
903
+ name="sglang:prefetch_bandwidth",
904
+ documentation="Histogram of prefetch bandwidth in GB/s.",
905
+ labelnames=labels.keys(),
906
+ buckets=bucket_bandwidth,
907
+ )
908
+
909
+ self.histogram_backup_bandwidth = Histogram(
910
+ name="sglang:backup_bandwidth",
911
+ documentation="Histogram of backup bandwidth in GB/s.",
912
+ labelnames=labels.keys(),
913
+ buckets=bucket_bandwidth,
914
+ )
915
+
916
+ def log_prefetched_tokens(self, prefetched_tokens: int):
917
+ if prefetched_tokens > 0:
918
+ self.prefetched_tokens_total.labels(**self.labels).inc(prefetched_tokens)
919
+
920
+ def log_backuped_tokens(self, backuped_tokens: int):
921
+ if backuped_tokens > 0:
922
+ self.backuped_tokens_total.labels(**self.labels).inc(backuped_tokens)
923
+
924
+ def _log_histogram(self, histogram, data: Union[int, float]):
925
+ histogram.labels(**self.labels).observe(data)
926
+
927
+ def log_storage_metrics(self, storage_metrics: Optional[StorageMetrics] = None):
928
+ if storage_metrics is None:
929
+ return
930
+
931
+ assert isinstance(storage_metrics, StorageMetrics)
932
+
933
+ for v in storage_metrics.prefetch_pgs:
934
+ self._log_histogram(self.histogram_prefetch_pgs, v)
935
+ for v in storage_metrics.backup_pgs:
936
+ self._log_histogram(self.histogram_backup_pgs, v)
937
+ for v in storage_metrics.prefetch_bandwidth:
938
+ self._log_histogram(self.histogram_prefetch_bandwidth, v)
939
+ for v in storage_metrics.backup_bandwidth:
940
+ self._log_histogram(self.histogram_backup_bandwidth, v)