sglang 0.5.2rc2__py3-none-any.whl → 0.5.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (377) hide show
  1. sglang/bench_one_batch.py +7 -9
  2. sglang/bench_one_batch_server.py +330 -31
  3. sglang/bench_serving.py +267 -32
  4. sglang/global_config.py +2 -2
  5. sglang/lang/backend/runtime_endpoint.py +1 -1
  6. sglang/launch_server.py +14 -0
  7. sglang/profiler.py +2 -2
  8. sglang/srt/batch_invariant_ops/__init__.py +27 -0
  9. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +549 -0
  10. sglang/srt/configs/__init__.py +8 -0
  11. sglang/srt/configs/device_config.py +3 -1
  12. sglang/srt/configs/dots_ocr.py +64 -0
  13. sglang/srt/configs/dots_vlm.py +139 -0
  14. sglang/srt/configs/falcon_h1.py +360 -0
  15. sglang/srt/configs/load_config.py +9 -0
  16. sglang/srt/configs/model_config.py +181 -82
  17. sglang/srt/configs/qwen3_next.py +326 -0
  18. sglang/srt/configs/qwen3_vl.py +586 -0
  19. sglang/srt/connector/__init__.py +8 -1
  20. sglang/srt/connector/remote_instance.py +82 -0
  21. sglang/srt/constrained/base_grammar_backend.py +49 -12
  22. sglang/srt/constrained/llguidance_backend.py +0 -1
  23. sglang/srt/constrained/outlines_backend.py +0 -1
  24. sglang/srt/constrained/outlines_jump_forward.py +1 -1
  25. sglang/srt/constrained/xgrammar_backend.py +30 -9
  26. sglang/srt/custom_op.py +11 -1
  27. sglang/srt/debug_utils/dump_comparator.py +81 -44
  28. sglang/srt/debug_utils/dump_loader.py +97 -0
  29. sglang/srt/debug_utils/dumper.py +21 -6
  30. sglang/srt/debug_utils/text_comparator.py +73 -11
  31. sglang/srt/disaggregation/ascend/conn.py +2 -2
  32. sglang/srt/disaggregation/ascend/transfer_engine.py +47 -9
  33. sglang/srt/disaggregation/base/conn.py +1 -1
  34. sglang/srt/disaggregation/common/conn.py +279 -108
  35. sglang/srt/disaggregation/decode.py +71 -19
  36. sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
  37. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +29 -17
  38. sglang/srt/disaggregation/fake/conn.py +1 -1
  39. sglang/srt/disaggregation/mini_lb.py +6 -445
  40. sglang/srt/disaggregation/mooncake/conn.py +55 -537
  41. sglang/srt/disaggregation/nixl/conn.py +326 -53
  42. sglang/srt/disaggregation/prefill.py +36 -17
  43. sglang/srt/disaggregation/utils.py +40 -54
  44. sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
  45. sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
  46. sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
  47. sglang/srt/distributed/parallel_state.py +156 -80
  48. sglang/srt/entrypoints/engine.py +59 -18
  49. sglang/srt/entrypoints/grpc_request_manager.py +855 -0
  50. sglang/srt/entrypoints/grpc_server.py +810 -0
  51. sglang/srt/entrypoints/http_server.py +130 -59
  52. sglang/srt/entrypoints/openai/protocol.py +112 -4
  53. sglang/srt/entrypoints/openai/serving_base.py +65 -3
  54. sglang/srt/entrypoints/openai/serving_chat.py +204 -55
  55. sglang/srt/entrypoints/openai/serving_completions.py +14 -3
  56. sglang/srt/entrypoints/openai/serving_embedding.py +9 -3
  57. sglang/srt/entrypoints/openai/serving_rerank.py +3 -1
  58. sglang/srt/entrypoints/openai/serving_responses.py +48 -3
  59. sglang/srt/entrypoints/openai/serving_score.py +1 -0
  60. sglang/srt/environ.py +285 -0
  61. sglang/srt/eplb/eplb_manager.py +2 -2
  62. sglang/srt/eplb/expert_distribution.py +26 -13
  63. sglang/srt/eplb/expert_location.py +38 -8
  64. sglang/srt/eplb/expert_location_updater.py +1 -1
  65. sglang/srt/function_call/base_format_detector.py +3 -6
  66. sglang/srt/function_call/ebnf_composer.py +11 -9
  67. sglang/srt/function_call/function_call_parser.py +9 -2
  68. sglang/srt/function_call/glm4_moe_detector.py +4 -4
  69. sglang/srt/function_call/gpt_oss_detector.py +23 -0
  70. sglang/srt/function_call/json_array_parser.py +63 -0
  71. sglang/srt/function_call/kimik2_detector.py +17 -4
  72. sglang/srt/function_call/qwen3_coder_detector.py +1 -1
  73. sglang/srt/function_call/utils.py +96 -5
  74. sglang/srt/grpc/__init__.py +1 -0
  75. sglang/srt/grpc/compile_proto.py +245 -0
  76. sglang/srt/grpc/sglang_scheduler_pb2.py +111 -0
  77. sglang/srt/grpc/sglang_scheduler_pb2.pyi +434 -0
  78. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +239 -0
  79. sglang/srt/layers/activation.py +143 -9
  80. sglang/srt/layers/attention/aiter_backend.py +14 -15
  81. sglang/srt/layers/attention/ascend_backend.py +115 -9
  82. sglang/srt/layers/attention/attention_registry.py +206 -0
  83. sglang/srt/layers/attention/base_attn_backend.py +12 -3
  84. sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
  85. sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
  86. sglang/srt/layers/attention/fla/chunk.py +242 -0
  87. sglang/srt/layers/attention/fla/chunk_delta_h.py +314 -0
  88. sglang/srt/layers/attention/fla/chunk_o.py +178 -0
  89. sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +151 -0
  90. sglang/srt/layers/attention/fla/cumsum.py +300 -0
  91. sglang/srt/layers/attention/fla/fused_recurrent.py +640 -0
  92. sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +232 -0
  93. sglang/srt/layers/attention/fla/index.py +37 -0
  94. sglang/srt/layers/attention/fla/l2norm.py +150 -0
  95. sglang/srt/layers/attention/fla/layernorm_gated.py +326 -0
  96. sglang/srt/layers/attention/fla/op.py +66 -0
  97. sglang/srt/layers/attention/fla/solve_tril.py +465 -0
  98. sglang/srt/layers/attention/fla/utils.py +331 -0
  99. sglang/srt/layers/attention/fla/wy_fast.py +158 -0
  100. sglang/srt/layers/attention/flashattention_backend.py +41 -8
  101. sglang/srt/layers/attention/flashinfer_backend.py +118 -198
  102. sglang/srt/layers/attention/flashinfer_mla_backend.py +27 -27
  103. sglang/srt/layers/attention/flashmla_backend.py +7 -5
  104. sglang/srt/layers/attention/hybrid_attn_backend.py +68 -53
  105. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +602 -0
  106. sglang/srt/layers/attention/intel_amx_backend.py +3 -0
  107. sglang/srt/layers/attention/mamba/causal_conv1d.py +129 -0
  108. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +969 -0
  109. sglang/srt/layers/attention/mamba/mamba.py +629 -0
  110. sglang/srt/layers/attention/mamba/mamba_utils.py +81 -0
  111. sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
  112. sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
  113. sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
  114. sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +264 -0
  115. sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +622 -0
  116. sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +757 -0
  117. sglang/srt/layers/attention/mamba/ops/ssd_combined.py +262 -0
  118. sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +275 -0
  119. sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
  120. sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
  121. sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
  122. sglang/srt/layers/attention/nsa/nsa_indexer.py +761 -0
  123. sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
  124. sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
  125. sglang/srt/layers/attention/nsa/transform_index.py +144 -0
  126. sglang/srt/layers/attention/nsa/utils.py +24 -0
  127. sglang/srt/layers/attention/nsa_backend.py +887 -0
  128. sglang/srt/layers/attention/tbo_backend.py +6 -6
  129. sglang/srt/layers/attention/torch_flex_backend.py +325 -0
  130. sglang/srt/layers/attention/torch_native_backend.py +12 -6
  131. sglang/srt/layers/attention/triton_backend.py +57 -7
  132. sglang/srt/layers/attention/trtllm_mha_backend.py +5 -7
  133. sglang/srt/layers/attention/trtllm_mla_backend.py +276 -39
  134. sglang/srt/layers/attention/vision.py +58 -0
  135. sglang/srt/layers/attention/wave_backend.py +4 -4
  136. sglang/srt/layers/attention/wave_ops/decode_attention.py +2 -4
  137. sglang/srt/layers/attention/wave_ops/extend_attention.py +1 -3
  138. sglang/srt/layers/communicator.py +8 -0
  139. sglang/srt/layers/dp_attention.py +41 -2
  140. sglang/srt/layers/elementwise.py +3 -1
  141. sglang/srt/layers/layernorm.py +34 -15
  142. sglang/srt/layers/linear.py +55 -7
  143. sglang/srt/layers/logits_processor.py +44 -12
  144. sglang/srt/layers/moe/__init__.py +2 -1
  145. sglang/srt/layers/moe/cutlass_w4a8_moe.py +3 -3
  146. sglang/srt/layers/moe/ep_moe/kernels.py +2 -2
  147. sglang/srt/layers/moe/ep_moe/layer.py +256 -63
  148. sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +183 -0
  149. sglang/srt/layers/moe/fused_moe_native.py +5 -3
  150. sglang/srt/layers/moe/fused_moe_triton/configs/{triton_3_4_0/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json } +35 -35
  151. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=352,device_name=NVIDIA_RTX_5880_Ada_Generation,dtype=fp8_w8a8.json +146 -0
  152. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  153. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=512,device_name=NVIDIA_H20.json +146 -0
  154. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  155. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
  156. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
  157. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  158. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
  159. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
  160. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
  161. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  162. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
  163. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -2
  164. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +7 -3
  165. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +23 -20
  166. sglang/srt/layers/moe/fused_moe_triton/layer.py +71 -70
  167. sglang/srt/layers/moe/moe_runner/__init__.py +2 -1
  168. sglang/srt/layers/moe/moe_runner/base.py +274 -1
  169. sglang/srt/layers/moe/moe_runner/runner.py +80 -0
  170. sglang/srt/layers/moe/moe_runner/triton.py +448 -0
  171. sglang/srt/layers/moe/token_dispatcher/__init__.py +16 -4
  172. sglang/srt/layers/moe/token_dispatcher/{base_dispatcher.py → base.py} +67 -17
  173. sglang/srt/layers/moe/token_dispatcher/deepep.py +118 -56
  174. sglang/srt/layers/moe/token_dispatcher/standard.py +44 -2
  175. sglang/srt/layers/moe/topk.py +30 -9
  176. sglang/srt/layers/moe/utils.py +22 -6
  177. sglang/srt/layers/parameter.py +23 -6
  178. sglang/srt/layers/quantization/awq.py +19 -7
  179. sglang/srt/layers/quantization/base_config.py +11 -6
  180. sglang/srt/layers/quantization/blockwise_int8.py +38 -27
  181. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +1 -0
  182. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +50 -30
  183. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +2 -0
  184. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +13 -1
  185. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
  186. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +2 -10
  187. sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +27 -0
  188. sglang/srt/layers/quantization/fp8.py +78 -49
  189. sglang/srt/layers/quantization/fp8_utils.py +51 -32
  190. sglang/srt/layers/quantization/gptq.py +25 -17
  191. sglang/srt/layers/quantization/modelopt_quant.py +190 -55
  192. sglang/srt/layers/quantization/moe_wna16.py +21 -18
  193. sglang/srt/layers/quantization/mxfp4.py +74 -42
  194. sglang/srt/layers/quantization/quark/quark_moe.py +48 -30
  195. sglang/srt/layers/quantization/unquant.py +135 -47
  196. sglang/srt/layers/quantization/w4afp8.py +26 -17
  197. sglang/srt/layers/quantization/w8a8_fp8.py +35 -20
  198. sglang/srt/layers/quantization/w8a8_int8.py +91 -41
  199. sglang/srt/layers/rotary_embedding.py +78 -31
  200. sglang/srt/layers/sampler.py +213 -21
  201. sglang/srt/layers/utils.py +23 -0
  202. sglang/srt/lora/backend/base_backend.py +50 -8
  203. sglang/srt/lora/backend/chunked_backend.py +348 -0
  204. sglang/srt/lora/backend/triton_backend.py +99 -5
  205. sglang/srt/lora/layers.py +32 -0
  206. sglang/srt/lora/lora.py +8 -3
  207. sglang/srt/lora/lora_manager.py +52 -118
  208. sglang/srt/lora/mem_pool.py +25 -11
  209. sglang/srt/lora/triton_ops/__init__.py +4 -0
  210. sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
  211. sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +174 -0
  212. sglang/srt/lora/utils.py +22 -11
  213. sglang/srt/managers/async_dynamic_batch_tokenizer.py +170 -0
  214. sglang/srt/managers/cache_controller.py +199 -301
  215. sglang/srt/managers/data_parallel_controller.py +115 -80
  216. sglang/srt/managers/detokenizer_manager.py +19 -15
  217. sglang/srt/managers/disagg_service.py +46 -0
  218. sglang/srt/managers/io_struct.py +340 -109
  219. sglang/srt/managers/mm_utils.py +44 -6
  220. sglang/srt/managers/multi_tokenizer_mixin.py +357 -407
  221. sglang/srt/managers/multimodal_processor.py +1 -2
  222. sglang/srt/managers/overlap_utils.py +53 -0
  223. sglang/srt/managers/schedule_batch.py +240 -138
  224. sglang/srt/managers/schedule_policy.py +144 -17
  225. sglang/srt/managers/scheduler.py +502 -209
  226. sglang/srt/managers/scheduler_input_blocker.py +1 -1
  227. sglang/srt/managers/scheduler_metrics_mixin.py +99 -126
  228. sglang/srt/managers/scheduler_output_processor_mixin.py +75 -22
  229. sglang/srt/managers/scheduler_profiler_mixin.py +6 -6
  230. sglang/srt/managers/scheduler_update_weights_mixin.py +7 -0
  231. sglang/srt/managers/tokenizer_communicator_mixin.py +675 -0
  232. sglang/srt/managers/tokenizer_manager.py +320 -632
  233. sglang/srt/managers/tp_worker.py +81 -22
  234. sglang/srt/managers/tp_worker_overlap_thread.py +71 -56
  235. sglang/srt/managers/utils.py +1 -45
  236. sglang/srt/mem_cache/allocator.py +14 -20
  237. sglang/srt/mem_cache/allocator_ascend.py +41 -27
  238. sglang/srt/mem_cache/base_prefix_cache.py +1 -1
  239. sglang/srt/mem_cache/chunk_cache.py +8 -1
  240. sglang/srt/mem_cache/evict_policy.py +23 -0
  241. sglang/srt/mem_cache/hicache_storage.py +43 -24
  242. sglang/srt/mem_cache/hiradix_cache.py +222 -75
  243. sglang/srt/mem_cache/memory_pool.py +535 -58
  244. sglang/srt/mem_cache/memory_pool_host.py +239 -228
  245. sglang/srt/mem_cache/radix_cache.py +222 -73
  246. sglang/srt/mem_cache/radix_cache_cpp.py +11 -8
  247. sglang/srt/mem_cache/storage/__init__.py +10 -0
  248. sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +151 -0
  249. sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +109 -0
  250. sglang/srt/mem_cache/storage/backend_factory.py +223 -0
  251. sglang/srt/mem_cache/storage/eic/eic_storage.py +778 -0
  252. sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
  253. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +164 -0
  254. sglang/srt/mem_cache/storage/hf3fs/{client_hf3fs.py → hf3fs_usrbio_client.py} +5 -1
  255. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +259 -62
  256. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +284 -0
  257. sglang/srt/mem_cache/storage/lmcache/unit_test.py +121 -0
  258. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +166 -17
  259. sglang/srt/mem_cache/swa_radix_cache.py +25 -36
  260. sglang/srt/metrics/collector.py +511 -132
  261. sglang/srt/metrics/func_timer.py +2 -7
  262. sglang/srt/metrics/startup_func_log_and_timer.py +150 -0
  263. sglang/srt/metrics/utils.py +8 -1
  264. sglang/srt/model_executor/cpu_graph_runner.py +640 -0
  265. sglang/srt/model_executor/cuda_graph_runner.py +52 -37
  266. sglang/srt/model_executor/forward_batch_info.py +82 -40
  267. sglang/srt/model_executor/model_runner.py +432 -157
  268. sglang/srt/model_executor/npu_graph_runner.py +12 -5
  269. sglang/srt/model_loader/__init__.py +9 -3
  270. sglang/srt/model_loader/loader.py +133 -5
  271. sglang/srt/model_loader/remote_instance_weight_loader_utils.py +69 -0
  272. sglang/srt/model_loader/weight_utils.py +158 -3
  273. sglang/srt/models/apertus.py +686 -0
  274. sglang/srt/models/bailing_moe.py +820 -217
  275. sglang/srt/models/bailing_moe_nextn.py +168 -0
  276. sglang/srt/models/deepseek_nextn.py +6 -1
  277. sglang/srt/models/deepseek_v2.py +607 -130
  278. sglang/srt/models/dots_ocr.py +173 -0
  279. sglang/srt/models/dots_vlm.py +174 -0
  280. sglang/srt/models/dots_vlm_vit.py +337 -0
  281. sglang/srt/models/ernie4.py +1 -1
  282. sglang/srt/models/falcon_h1.py +576 -0
  283. sglang/srt/models/gemma3_causal.py +0 -2
  284. sglang/srt/models/gemma3_mm.py +1 -1
  285. sglang/srt/models/gemma3n_mm.py +2 -2
  286. sglang/srt/models/glm4_moe.py +4 -4
  287. sglang/srt/models/glm4_moe_nextn.py +2 -2
  288. sglang/srt/models/glm4v.py +5 -3
  289. sglang/srt/models/glm4v_moe.py +4 -1
  290. sglang/srt/models/gpt_oss.py +8 -31
  291. sglang/srt/models/kimi_vl_moonvit.py +2 -2
  292. sglang/srt/models/llama.py +4 -0
  293. sglang/srt/models/llama4.py +9 -0
  294. sglang/srt/models/llama_eagle3.py +13 -0
  295. sglang/srt/models/longcat_flash.py +3 -3
  296. sglang/srt/models/longcat_flash_nextn.py +1 -1
  297. sglang/srt/models/mllama4.py +40 -4
  298. sglang/srt/models/opt.py +637 -0
  299. sglang/srt/models/qwen2_5_vl.py +29 -5
  300. sglang/srt/models/qwen2_audio.py +1 -1
  301. sglang/srt/models/qwen2_moe.py +120 -13
  302. sglang/srt/models/qwen2_vl.py +1 -1
  303. sglang/srt/models/qwen3.py +18 -3
  304. sglang/srt/models/qwen3_moe.py +32 -4
  305. sglang/srt/models/qwen3_next.py +1069 -0
  306. sglang/srt/models/qwen3_next_mtp.py +112 -0
  307. sglang/srt/models/qwen3_vl.py +787 -0
  308. sglang/srt/models/qwen3_vl_moe.py +471 -0
  309. sglang/srt/models/registry.py +15 -3
  310. sglang/srt/models/sarashina2_vision.py +269 -0
  311. sglang/srt/models/solar.py +505 -0
  312. sglang/srt/models/starcoder2.py +357 -0
  313. sglang/srt/models/step3_vl.py +1 -1
  314. sglang/srt/models/torch_native_llama.py +9 -2
  315. sglang/srt/models/utils.py +51 -0
  316. sglang/srt/multimodal/processors/base_processor.py +15 -7
  317. sglang/srt/multimodal/processors/dots_vlm.py +98 -0
  318. sglang/srt/multimodal/processors/glm4v.py +9 -9
  319. sglang/srt/multimodal/processors/internvl.py +153 -129
  320. sglang/srt/multimodal/processors/qwen_vl.py +23 -6
  321. sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
  322. sglang/srt/offloader.py +27 -3
  323. sglang/srt/parser/jinja_template_utils.py +6 -0
  324. sglang/srt/sampling/sampling_batch_info.py +38 -17
  325. sglang/srt/sampling/sampling_params.py +7 -0
  326. sglang/srt/server_args.py +966 -267
  327. sglang/srt/server_args_config_parser.py +146 -0
  328. sglang/srt/single_batch_overlap.py +151 -0
  329. sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
  330. sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
  331. sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
  332. sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
  333. sglang/srt/speculative/cpp_ngram/param.h +125 -0
  334. sglang/srt/speculative/cpp_ngram/queue.h +71 -0
  335. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +7 -1
  336. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +13 -2
  337. sglang/srt/speculative/{eagle_utils.py → eagle_info.py} +207 -757
  338. sglang/srt/speculative/eagle_worker.py +99 -28
  339. sglang/srt/speculative/ngram_utils.py +428 -0
  340. sglang/srt/speculative/ngram_worker.py +245 -0
  341. sglang/srt/speculative/spec_info.py +52 -0
  342. sglang/srt/speculative/spec_utils.py +606 -0
  343. sglang/srt/speculative/standalone_worker.py +109 -0
  344. sglang/srt/torch_memory_saver_adapter.py +5 -7
  345. sglang/srt/tracing/trace.py +578 -0
  346. sglang/srt/two_batch_overlap.py +8 -5
  347. sglang/srt/utils/__init__.py +2 -0
  348. sglang/srt/{utils.py → utils/common.py} +433 -77
  349. sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +53 -5
  350. sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
  351. sglang/srt/utils/rpd_utils.py +452 -0
  352. sglang/srt/utils/slow_rank_detector.py +71 -0
  353. sglang/srt/warmup.py +8 -4
  354. sglang/srt/weight_sync/utils.py +2 -2
  355. sglang/test/attention/test_trtllm_mla_backend.py +169 -5
  356. sglang/test/get_logits_ut.py +57 -0
  357. sglang/test/run_eval.py +79 -11
  358. sglang/test/runners.py +5 -1
  359. sglang/test/simple_eval_common.py +5 -2
  360. sglang/test/simple_eval_mmmu_vlm.py +441 -0
  361. sglang/test/test_block_fp8.py +2 -2
  362. sglang/test/test_cutlass_moe.py +24 -6
  363. sglang/test/test_deterministic.py +297 -0
  364. sglang/test/test_disaggregation_utils.py +77 -0
  365. sglang/test/test_fp4_moe.py +370 -1
  366. sglang/test/test_programs.py +1 -1
  367. sglang/test/test_utils.py +383 -5
  368. sglang/utils.py +21 -1
  369. sglang/version.py +1 -1
  370. {sglang-0.5.2rc2.dist-info → sglang-0.5.3.dist-info}/METADATA +69 -124
  371. {sglang-0.5.2rc2.dist-info → sglang-0.5.3.dist-info}/RECORD +375 -245
  372. sglang/srt/disaggregation/launch_lb.py +0 -118
  373. sglang/srt/mem_cache/lora_radix_cache.py +0 -421
  374. /sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
  375. {sglang-0.5.2rc2.dist-info → sglang-0.5.3.dist-info}/WHEEL +0 -0
  376. {sglang-0.5.2rc2.dist-info → sglang-0.5.3.dist-info}/licenses/LICENSE +0 -0
  377. {sglang-0.5.2rc2.dist-info → sglang-0.5.3.dist-info}/top_level.txt +0 -0
@@ -12,13 +12,12 @@
12
12
  # limitations under the License.
13
13
  # ==============================================================================
14
14
  """Utilities for Prometheus Metrics Collection."""
15
-
16
15
  import time
17
- from dataclasses import dataclass
18
- from enum import Enum
16
+ from dataclasses import dataclass, field
19
17
  from typing import Dict, List, Optional, Union
20
18
 
21
- from sglang.srt.metrics.utils import generate_buckets
19
+ from sglang.srt.disaggregation.utils import DisaggregationMode
20
+ from sglang.srt.metrics.utils import exponential_buckets, generate_buckets
22
21
  from sglang.srt.server_args import ServerArgs
23
22
  from sglang.srt.utils import get_bool_env_var
24
23
 
@@ -35,6 +34,7 @@ class TimeStats:
35
34
  Decode: prealloc_queue -> transfer_queue -> wait_queue -> forward -> completion
36
35
  """
37
36
 
37
+ disagg_mode: DisaggregationMode = DisaggregationMode.NULL
38
38
  lb_entry_time: float = 0.0
39
39
  wait_queue_entry_time: float = 0.0
40
40
  forward_entry_time: float = 0.0
@@ -44,17 +44,11 @@ class TimeStats:
44
44
  decode_prealloc_queue_entry_time: float = 0.0
45
45
  decode_transfer_queue_entry_time: float = 0.0
46
46
 
47
- class RequestType(Enum):
48
- UNIFIED = "unified"
49
- PREFILL = "prefill"
50
- DECODE = "decode"
51
- INVALID = "invalid"
52
-
53
- def __str__(self) -> str:
54
- # if unified
55
- _type = self.get_type()
47
+ def get_queueing_time(self) -> float:
48
+ return self.forward_entry_time - self.wait_queue_entry_time
56
49
 
57
- if _type == self.RequestType.UNIFIED:
50
+ def convert_to_duration(self) -> str:
51
+ if self.disagg_mode == DisaggregationMode.NULL:
58
52
  queue_duration = self.forward_entry_time - self.wait_queue_entry_time
59
53
  forward_duration = self.completion_time - self.forward_entry_time
60
54
 
@@ -63,30 +57,28 @@ class TimeStats:
63
57
  queue_duration >= 0 and forward_duration >= 0
64
58
  ), f"queue_duration={queue_duration} < 0 or forward_duration={forward_duration} < 0"
65
59
 
66
- return f"queue_duration={self.format_duration(queue_duration)}, forward_duration={self.format_duration(forward_duration)}, start_time={self.wait_queue_entry_time}"
67
- elif _type == self.RequestType.PREFILL:
60
+ return f"queue_duration={self.format_duration(queue_duration)}, forward_duration={self.format_duration(forward_duration)}, start_time={self.wait_queue_entry_time:.3f}"
61
+ elif self.disagg_mode == DisaggregationMode.PREFILL:
68
62
  bootstrap_duration = (
69
63
  self.wait_queue_entry_time - self.prefill_bootstrap_queue_entry_time
70
64
  )
71
-
72
65
  queue_duration = self.forward_entry_time - self.wait_queue_entry_time
73
-
74
66
  forward_duration = self.completion_time - self.forward_entry_time
75
67
 
76
68
  if SGLANG_TEST_REQUEST_TIME_STATS:
77
- assert (
78
- bootstrap_duration >= 0
79
- and queue_duration >= 0
80
- and forward_duration >= 0
81
- ), f"bootstrap_duration={bootstrap_duration} < 0 or queue_duration={queue_duration} < 0 or forward_duration={forward_duration} < 0"
82
- return f"bootstrap_duration={self.format_duration(bootstrap_duration)}, queue_duration={self.format_duration(queue_duration)}, forward_duration={self.format_duration(forward_duration)}, start_time={self.prefill_bootstrap_queue_entry_time}"
83
- # if decode
84
- elif _type == self.RequestType.DECODE:
69
+ if self.wait_queue_entry_time > 0:
70
+ assert (
71
+ bootstrap_duration >= 0
72
+ and queue_duration >= 0
73
+ and forward_duration >= 0
74
+ ), f"bootstrap_duration={bootstrap_duration} < 0 or queue_duration={queue_duration} < 0 or forward_duration={forward_duration} < 0"
75
+
76
+ return f"bootstrap_duration={self.format_duration(bootstrap_duration)}, queue_duration={self.format_duration(queue_duration)}, forward_duration={self.format_duration(forward_duration)}, start_time={self.prefill_bootstrap_queue_entry_time:.3f}"
77
+ elif self.disagg_mode == DisaggregationMode.DECODE:
85
78
  prealloc_duration = (
86
79
  self.decode_transfer_queue_entry_time
87
80
  - self.decode_prealloc_queue_entry_time
88
81
  )
89
-
90
82
  transfer_duration = (
91
83
  self.wait_queue_entry_time - self.decode_transfer_queue_entry_time
92
84
  )
@@ -94,67 +86,74 @@ class TimeStats:
94
86
  forward_duration = self.completion_time - self.forward_entry_time
95
87
 
96
88
  if SGLANG_TEST_REQUEST_TIME_STATS:
97
- assert (
98
- prealloc_duration >= 0
99
- and transfer_duration >= 0
100
- and queue_duration >= 0
101
- and forward_duration >= 0
102
- ), f"prealloc_duration={prealloc_duration} < 0 or transfer_duration={transfer_duration} < 0 or queue_duration={queue_duration} < 0 or forward_duration={forward_duration} < 0"
103
-
104
- return f"prealloc_duration={self.format_duration(prealloc_duration)}, transfer_duration={self.format_duration(transfer_duration)}, queue_duration={self.format_duration(queue_duration)}, forward_duration={self.format_duration(forward_duration)}, start_time={self.decode_prealloc_queue_entry_time}"
89
+ if self.wait_queue_entry_time > 0:
90
+ assert (
91
+ prealloc_duration >= 0
92
+ and transfer_duration >= 0
93
+ and queue_duration >= 0
94
+ and forward_duration >= 0
95
+ ), f"prealloc_duration={prealloc_duration} < 0 or transfer_duration={transfer_duration} < 0 or queue_duration={queue_duration} < 0 or forward_duration={forward_duration} < 0. {self=}"
96
+
97
+ return f"prealloc_duration={self.format_duration(prealloc_duration)}, transfer_duration={self.format_duration(transfer_duration)}, queue_duration={self.format_duration(queue_duration)}, forward_duration={self.format_duration(forward_duration)}, start_time={self.decode_prealloc_queue_entry_time:.3f}"
105
98
  else:
106
- return "Invalid Time Stats"
99
+ return "Unknown Time Stats"
107
100
 
108
101
  def format_duration(self, duration: float) -> str:
109
102
  return f"{duration * 1e3:.2f}ms"
110
103
 
111
- def get_type(self) -> RequestType:
112
- """Determine the type of request based on timestamp values."""
113
- if (
114
- self.prefill_bootstrap_queue_entry_time == 0.0
115
- and self.prefill_transfer_queue_entry_time == 0.0
116
- and self.decode_prealloc_queue_entry_time == 0.0
117
- and self.decode_transfer_queue_entry_time == 0.0
118
- ):
119
- return self.RequestType.UNIFIED
120
- elif (
121
- self.prefill_bootstrap_queue_entry_time > 0.0
122
- and self.prefill_transfer_queue_entry_time > 0.0
123
- ):
124
- return self.RequestType.PREFILL
125
- elif (
126
- self.decode_prealloc_queue_entry_time > 0.0
127
- and self.decode_transfer_queue_entry_time > 0.0
128
- and self.wait_queue_entry_time > 0.0
129
- ):
130
- return self.RequestType.DECODE
104
+ def disagg_mode_str(self) -> str:
105
+ if self.disagg_mode == DisaggregationMode.NULL:
106
+ return "unified"
107
+ elif self.disagg_mode == DisaggregationMode.DECODE:
108
+ return "decode"
109
+ elif self.disagg_mode == DisaggregationMode.PREFILL:
110
+ return "prefill"
131
111
  else:
132
- return self.RequestType.INVALID
112
+ return "unknown"
133
113
 
134
114
 
135
115
  @dataclass
136
116
  class SchedulerStats:
117
+ # Basics
137
118
  num_running_reqs: int = 0
138
119
  num_used_tokens: int = 0
139
120
  token_usage: float = 0.0
121
+ swa_token_usage: float = 0.0
140
122
  gen_throughput: float = 0.0
141
123
  num_queue_reqs: int = 0
142
- cache_hit_rate: float = 0.0
143
124
  num_grammar_queue_reqs: int = 0
125
+ num_running_reqs_offline_batch: int = 0
126
+ cache_hit_rate: float = 0.0
127
+
128
+ # Speculative decoding
144
129
  spec_accept_length: float = 0.0
145
- avg_request_queue_latency: float = 0.0
130
+
131
+ # Retract
132
+ num_retracted_reqs: int = 0
133
+ num_paused_reqs: int = 0
134
+
135
+ # PD disaggregation
146
136
  num_prefill_prealloc_queue_reqs: int = 0
147
137
  num_prefill_inflight_queue_reqs: int = 0
148
138
  num_decode_prealloc_queue_reqs: int = 0
149
139
  num_decode_transfer_queue_reqs: int = 0
150
- total_retracted_reqs: int = 0
140
+ kv_transfer_speed_gb_s: float = 0.0
141
+ kv_transfer_latency_ms: float = 0.0
142
+
143
+ # Utilization
144
+ utilization: float = 0.0
145
+ max_running_requests_under_SLO: Optional[int] = None
146
+
147
+ # Engine startup
148
+ engine_startup_time: float = 0.0
149
+ engine_load_weights_time: float = 0.0
151
150
 
152
151
 
153
152
  class SchedulerMetricsCollector:
154
153
 
155
154
  def __init__(self, labels: Dict[str, str]) -> None:
156
155
  # We need to import prometheus_client after setting the env variable `PROMETHEUS_MULTIPROC_DIR`
157
- from prometheus_client import Counter, Gauge
156
+ from prometheus_client import Counter, Gauge, Histogram
158
157
 
159
158
  self.labels = labels
160
159
  self.last_log_time = time.perf_counter()
@@ -165,42 +164,48 @@ class SchedulerMetricsCollector:
165
164
  labelnames=labels.keys(),
166
165
  multiprocess_mode="mostrecent",
167
166
  )
168
-
169
167
  self.num_used_tokens = Gauge(
170
168
  name="sglang:num_used_tokens",
171
169
  documentation="The number of used tokens.",
172
170
  labelnames=labels.keys(),
173
171
  multiprocess_mode="mostrecent",
174
172
  )
175
-
176
173
  self.token_usage = Gauge(
177
174
  name="sglang:token_usage",
178
175
  documentation="The token usage.",
179
176
  labelnames=labels.keys(),
180
177
  multiprocess_mode="mostrecent",
181
178
  )
182
-
179
+ self.swa_token_usage = Gauge(
180
+ name="sglang:swa_token_usage",
181
+ documentation="The token usage for SWA layers.",
182
+ labelnames=labels.keys(),
183
+ multiprocess_mode="mostrecent",
184
+ )
183
185
  self.gen_throughput = Gauge(
184
186
  name="sglang:gen_throughput",
185
187
  documentation="The generation throughput (token/s).",
186
188
  labelnames=labels.keys(),
187
189
  multiprocess_mode="mostrecent",
188
190
  )
189
-
190
191
  self.num_queue_reqs = Gauge(
191
192
  name="sglang:num_queue_reqs",
192
193
  documentation="The number of requests in the waiting queue.",
193
194
  labelnames=labels.keys(),
194
195
  multiprocess_mode="mostrecent",
195
196
  )
196
-
197
197
  self.num_grammar_queue_reqs = Gauge(
198
198
  name="sglang:num_grammar_queue_reqs",
199
199
  documentation="The number of requests in the grammar waiting queue.",
200
200
  labelnames=labels.keys(),
201
201
  multiprocess_mode="mostrecent",
202
202
  )
203
-
203
+ self.num_running_reqs_offline_batch = Gauge(
204
+ name="sglang:num_running_reqs_offline_batch",
205
+ documentation="The number of running low-priority offline batch requests(label is 'batch').",
206
+ labelnames=labels.keys(),
207
+ multiprocess_mode="mostrecent",
208
+ )
204
209
  self.cache_hit_rate = Gauge(
205
210
  name="sglang:cache_hit_rate",
206
211
  documentation="The prefix cache hit rate.",
@@ -208,6 +213,7 @@ class SchedulerMetricsCollector:
208
213
  multiprocess_mode="mostrecent",
209
214
  )
210
215
 
216
+ # Speculative decoding
211
217
  self.spec_accept_length = Gauge(
212
218
  name="sglang:spec_accept_length",
213
219
  documentation="The average acceptance length of speculative decoding.",
@@ -215,83 +221,307 @@ class SchedulerMetricsCollector:
215
221
  multiprocess_mode="mostrecent",
216
222
  )
217
223
 
218
- self.avg_request_queue_latency = Gauge(
219
- name="sglang:avg_request_queue_latency",
220
- documentation="The average request queue latency for the last batch of requests in seconds.",
224
+ # Retract
225
+ self.num_retracted_reqs = Gauge(
226
+ name="sglang:num_retracted_reqs",
227
+ documentation="The number of retracted requests.",
221
228
  labelnames=labels.keys(),
222
- multiprocess_mode="mostrecent",
223
229
  )
224
-
225
- self.total_retracted_reqs = Gauge(
226
- name="sglang:total_retracted_reqs",
227
- documentation="The total number of retracted requests due to kvcache full.",
230
+ self.num_paused_reqs = Gauge(
231
+ name="sglang:num_paused_reqs",
232
+ documentation="The number of paused requests by async weight sync.",
228
233
  labelnames=labels.keys(),
229
- multiprocess_mode="mostrecent",
230
234
  )
231
235
 
232
- # Disaggregation queue metrics
236
+ # PD disaggregation
233
237
  self.num_prefill_prealloc_queue_reqs = Gauge(
234
238
  name="sglang:num_prefill_prealloc_queue_reqs",
235
239
  documentation="The number of requests in the prefill prealloc queue.",
236
240
  labelnames=labels.keys(),
237
241
  multiprocess_mode="mostrecent",
238
242
  )
239
-
240
243
  self.num_prefill_inflight_queue_reqs = Gauge(
241
244
  name="sglang:num_prefill_inflight_queue_reqs",
242
245
  documentation="The number of requests in the prefill inflight queue.",
243
246
  labelnames=labels.keys(),
244
247
  multiprocess_mode="mostrecent",
245
248
  )
246
-
247
249
  self.num_decode_prealloc_queue_reqs = Gauge(
248
250
  name="sglang:num_decode_prealloc_queue_reqs",
249
251
  documentation="The number of requests in the decode prealloc queue.",
250
252
  labelnames=labels.keys(),
251
253
  multiprocess_mode="mostrecent",
252
254
  )
253
-
254
255
  self.num_decode_transfer_queue_reqs = Gauge(
255
256
  name="sglang:num_decode_transfer_queue_reqs",
256
257
  documentation="The number of requests in the decode transfer queue.",
257
258
  labelnames=labels.keys(),
258
259
  multiprocess_mode="mostrecent",
259
260
  )
260
-
261
261
  self.num_bootstrap_failed_reqs = Counter(
262
- name="sglang:num_bootstrap_failed_reqs",
262
+ name="sglang:num_bootstrap_failed_reqs_total",
263
263
  documentation="The number of bootstrap failed requests.",
264
264
  labelnames=labels.keys(),
265
265
  )
266
-
267
266
  self.num_transfer_failed_reqs = Counter(
268
- name="sglang:num_transfer_failed_reqs",
267
+ name="sglang:num_transfer_failed_reqs_total",
269
268
  documentation="The number of transfer failed requests.",
270
269
  labelnames=labels.keys(),
271
270
  )
271
+ self.kv_transfer_speed_gb_s = Gauge(
272
+ name="sglang:kv_transfer_speed_gb_s",
273
+ documentation="The transfer speed of the KV cache in GB/s.",
274
+ labelnames=labels.keys(),
275
+ multiprocess_mode="mostrecent",
276
+ )
277
+ self.kv_transfer_latency_ms = Gauge(
278
+ name="sglang:kv_transfer_latency_ms",
279
+ documentation="The transfer latency of the KV cache in ms.",
280
+ labelnames=labels.keys(),
281
+ multiprocess_mode="mostrecent",
282
+ )
283
+
284
+ # Utilization
285
+ self.utilization = Gauge(
286
+ name="sglang:utilization",
287
+ documentation="The utilization.",
288
+ labelnames=labels.keys(),
289
+ multiprocess_mode="mostrecent",
290
+ )
291
+ self.max_running_requests_under_SLO = Gauge(
292
+ name="sglang:max_running_requests_under_SLO",
293
+ documentation="The maximum number of running requests under SLO.",
294
+ labelnames=labels.keys(),
295
+ multiprocess_mode="mostrecent",
296
+ )
297
+
298
+ # Engine startup
299
+ self.engine_startup_time = Gauge(
300
+ name="sglang:engine_startup_time",
301
+ documentation="The time taken for the engine to start up.",
302
+ labelnames=labels.keys(),
303
+ multiprocess_mode="mostrecent",
304
+ )
305
+ self.engine_load_weights_time = Gauge(
306
+ name="sglang:engine_load_weights_time",
307
+ documentation="The time taken for the engine to load weights.",
308
+ labelnames=labels.keys(),
309
+ multiprocess_mode="mostrecent",
310
+ )
311
+
312
+ # Additional queueing time histogram
313
+ self.queue_time = Histogram(
314
+ name="sglang:queue_time_seconds",
315
+ documentation="Histogram of queueing time in seconds.",
316
+ labelnames=labels.keys(),
317
+ buckets=[
318
+ 0.0,
319
+ 0.1,
320
+ 0.2,
321
+ 0.5,
322
+ 1,
323
+ 2,
324
+ 3,
325
+ 4,
326
+ 5,
327
+ 10,
328
+ 15,
329
+ 20,
330
+ 30,
331
+ 40,
332
+ 50,
333
+ 60,
334
+ 70,
335
+ 80,
336
+ 90,
337
+ 100,
338
+ 200,
339
+ 300,
340
+ 400,
341
+ 500,
342
+ 600,
343
+ 700,
344
+ 800,
345
+ 900,
346
+ 1000,
347
+ 1200,
348
+ 1400,
349
+ 1600,
350
+ 1800,
351
+ 2000,
352
+ 2500,
353
+ 3000,
354
+ ],
355
+ )
356
+
357
+ # Grammar metrics
358
+ self.grammar_compilation_time = Histogram(
359
+ name="sglang:grammar_compilation_time_seconds",
360
+ documentation="Histogram of grammar compilation time in seconds.",
361
+ labelnames=labels.keys(),
362
+ buckets=[
363
+ 0.0,
364
+ 0.01,
365
+ 0.02,
366
+ 0.05,
367
+ 0.1,
368
+ 0.2,
369
+ 0.5,
370
+ 1,
371
+ 2,
372
+ 5,
373
+ 10,
374
+ 20,
375
+ 30,
376
+ 60,
377
+ 90,
378
+ 120,
379
+ 240,
380
+ ],
381
+ )
382
+ self.num_grammar_cache_hit = Counter(
383
+ name="sglang:num_grammar_cache_hit_total",
384
+ documentation="Number of grammar cache hits.",
385
+ labelnames=labels.keys(),
386
+ )
387
+ self.num_grammar_aborted = Counter(
388
+ name="sglang:num_grammar_aborted_total",
389
+ documentation="Number of grammar aborted requests.",
390
+ labelnames=labels.keys(),
391
+ )
392
+ self.num_grammar_total = Counter(
393
+ name="sglang:num_grammar_total",
394
+ documentation="Number of the total grammar requests.",
395
+ labelnames=labels.keys(),
396
+ )
397
+ self.grammar_schema_count = Histogram(
398
+ name="sglang:grammar_schema_count",
399
+ documentation="Histogram of grammar schema count.",
400
+ labelnames=labels.keys(),
401
+ buckets=[
402
+ 0,
403
+ 1,
404
+ 2,
405
+ 5,
406
+ 10,
407
+ 20,
408
+ 30,
409
+ 40,
410
+ 60,
411
+ 80,
412
+ 100,
413
+ 120,
414
+ 140,
415
+ 160,
416
+ 180,
417
+ 200,
418
+ 300,
419
+ 400,
420
+ 500,
421
+ 700,
422
+ 1000,
423
+ ],
424
+ )
425
+ self.grammar_ebnf_size = Histogram(
426
+ name="sglang:grammar_ebnf_size",
427
+ documentation="Histogram of grammar EBNF size.",
428
+ labelnames=labels.keys(),
429
+ buckets=[
430
+ 0,
431
+ 50,
432
+ 100,
433
+ 200,
434
+ 300,
435
+ 500,
436
+ 1000,
437
+ 2000,
438
+ 3000,
439
+ 5000,
440
+ 10000,
441
+ 20000,
442
+ 30000,
443
+ 50000,
444
+ 100000,
445
+ ],
446
+ )
447
+
448
+ tree_traversal_time_buckets = [
449
+ 0.0,
450
+ 0.01,
451
+ 0.02,
452
+ 0.05,
453
+ 0.1,
454
+ 0.2,
455
+ 0.5,
456
+ 1,
457
+ 2,
458
+ 5,
459
+ 10,
460
+ 15,
461
+ 30,
462
+ 60,
463
+ 90,
464
+ 120,
465
+ 240,
466
+ ]
467
+ self.grammar_tree_traversal_time_avg = Histogram(
468
+ name="sglang:grammar_tree_traversal_time_avg",
469
+ documentation="Histogram of average grammar tree traversal time in seconds.",
470
+ labelnames=labels.keys(),
471
+ buckets=tree_traversal_time_buckets,
472
+ )
473
+ self.grammar_tree_traversal_time_max = Histogram(
474
+ name="sglang:grammar_tree_traversal_time_max",
475
+ documentation="Histogram of max grammar tree traversal time in seconds.",
476
+ labelnames=labels.keys(),
477
+ buckets=tree_traversal_time_buckets,
478
+ )
479
+
480
+ self.per_stage_req_latency_seconds = Histogram(
481
+ name="sglang:per_stage_req_latency_seconds",
482
+ documentation="The latency of each stage of requests.",
483
+ # captures latency in range [1ms - ~1191s]
484
+ buckets=exponential_buckets(start=0.001, width=1.62, length=30),
485
+ labelnames=list(labels.keys()) + ["stage"],
486
+ )
272
487
 
273
488
  def _log_gauge(self, gauge, data: Union[int, float]) -> None:
274
489
  # Convenience function for logging to gauge.
275
490
  gauge.labels(**self.labels).set(data)
276
491
 
492
+ def _log_histogram(self, histogram, data: Union[int, float]) -> None:
493
+ histogram.labels(**self.labels).observe(data)
494
+
277
495
  def increment_bootstrap_failed_reqs(self) -> None:
278
496
  self.num_bootstrap_failed_reqs.labels(**self.labels).inc(1)
279
497
 
280
498
  def increment_transfer_failed_reqs(self) -> None:
281
499
  self.num_transfer_failed_reqs.labels(**self.labels).inc(1)
282
500
 
501
+ def observe_per_stage_req_latency(self, stage: str, latency: float) -> None:
502
+ labels_with_stage = {**self.labels, "stage": stage}
503
+ self.per_stage_req_latency_seconds.labels(**labels_with_stage).observe(latency)
504
+
505
+ def observe_queue_time(self, latency: float) -> None:
506
+ self._log_histogram(self.queue_time, latency)
507
+
283
508
  def log_stats(self, stats: SchedulerStats) -> None:
284
509
  self._log_gauge(self.num_running_reqs, stats.num_running_reqs)
285
510
  self._log_gauge(self.num_used_tokens, stats.num_used_tokens)
286
511
  self._log_gauge(self.token_usage, stats.token_usage)
512
+ self._log_gauge(self.swa_token_usage, stats.swa_token_usage)
287
513
  self._log_gauge(self.gen_throughput, stats.gen_throughput)
288
514
  self._log_gauge(self.num_queue_reqs, stats.num_queue_reqs)
289
515
  self._log_gauge(self.num_grammar_queue_reqs, stats.num_grammar_queue_reqs)
516
+ self._log_gauge(
517
+ self.num_running_reqs_offline_batch, stats.num_running_reqs_offline_batch
518
+ )
290
519
  self._log_gauge(self.cache_hit_rate, stats.cache_hit_rate)
520
+
521
+ # Speculative decoding
291
522
  self._log_gauge(self.spec_accept_length, stats.spec_accept_length)
292
- self._log_gauge(self.total_retracted_reqs, stats.total_retracted_reqs)
293
523
 
294
- # Disaggregation metrics
524
+ # PD disaggregation
295
525
  self._log_gauge(
296
526
  self.num_prefill_prealloc_queue_reqs, stats.num_prefill_prealloc_queue_reqs
297
527
  )
@@ -304,15 +534,58 @@ class SchedulerMetricsCollector:
304
534
  self._log_gauge(
305
535
  self.num_decode_transfer_queue_reqs, stats.num_decode_transfer_queue_reqs
306
536
  )
537
+ self._log_gauge(self.kv_transfer_speed_gb_s, stats.kv_transfer_speed_gb_s)
538
+ self._log_gauge(self.kv_transfer_latency_ms, stats.kv_transfer_latency_ms)
539
+
540
+ # Retract
541
+ self._log_gauge(self.num_retracted_reqs, stats.num_retracted_reqs)
542
+ self._log_gauge(self.num_paused_reqs, stats.num_paused_reqs)
543
+
544
+ # Utilization
545
+ self._log_gauge(self.utilization, stats.utilization)
546
+ if stats.max_running_requests_under_SLO is not None:
547
+ self._log_gauge(
548
+ self.max_running_requests_under_SLO,
549
+ stats.max_running_requests_under_SLO,
550
+ )
551
+
552
+ # Engine startup time
553
+ self._log_gauge(self.engine_startup_time, stats.engine_startup_time)
554
+ if stats.engine_load_weights_time is not None:
555
+ self._log_gauge(
556
+ self.engine_load_weights_time, stats.engine_load_weights_time
557
+ )
307
558
 
308
559
  self.last_log_time = time.perf_counter()
309
560
 
561
+ def log_grammar_stats(self, grammar_stats) -> None:
562
+ # Duck-typed GrammarStats to avoid cross-package dependency
563
+ if getattr(grammar_stats, "compilation_time", None) is not None:
564
+ self._log_histogram(
565
+ self.grammar_compilation_time, grammar_stats.compilation_time
566
+ )
567
+ if getattr(grammar_stats, "schema_count", None) is not None:
568
+ self._log_histogram(self.grammar_schema_count, grammar_stats.schema_count)
569
+ if getattr(grammar_stats, "ebnf_size", None) is not None:
570
+ self._log_histogram(self.grammar_ebnf_size, grammar_stats.ebnf_size)
571
+ tree_times = getattr(grammar_stats, "tree_traversal_time", None)
572
+ if tree_times:
573
+ max_time = max(tree_times)
574
+ avg_time = sum(tree_times) / len(tree_times)
575
+ self._log_histogram(self.grammar_tree_traversal_time_max, max_time)
576
+ self._log_histogram(self.grammar_tree_traversal_time_avg, avg_time)
577
+ if getattr(grammar_stats, "is_cache_hit", False):
578
+ self.num_grammar_cache_hit.labels(**self.labels).inc(1)
579
+ if getattr(grammar_stats, "is_grammar_aborted", False):
580
+ self.num_grammar_aborted.labels(**self.labels).inc(1)
581
+ self.num_grammar_total.labels(**self.labels).inc(1)
582
+
310
583
 
311
584
  class TokenizerMetricsCollector:
312
585
  def __init__(
313
586
  self,
314
- server_args: ServerArgs,
315
- labels: Dict[str, str],
587
+ server_args: Optional[ServerArgs] = None,
588
+ labels: Dict[str, str] = None,
316
589
  bucket_time_to_first_token: Optional[List[float]] = None,
317
590
  bucket_inter_token_latency: Optional[List[float]] = None,
318
591
  bucket_e2e_request_latency: Optional[List[float]] = None,
@@ -321,7 +594,7 @@ class TokenizerMetricsCollector:
321
594
  # We need to import prometheus_client after setting the env variable `PROMETHEUS_MULTIPROC_DIR`
322
595
  from prometheus_client import Counter, Histogram
323
596
 
324
- self.labels = labels
597
+ self.labels = labels or {}
325
598
  self.collect_tokens_histogram = collect_tokens_histogram
326
599
 
327
600
  self.prompt_tokens_total = Counter(
@@ -361,6 +634,13 @@ class TokenizerMetricsCollector:
361
634
  30000,
362
635
  35000,
363
636
  40000,
637
+ 66000,
638
+ 99000,
639
+ 132000,
640
+ 300000,
641
+ 600000,
642
+ 900000,
643
+ 1100000,
364
644
  ]
365
645
  self.prompt_tokens_histogram = Histogram(
366
646
  name="sglang:prompt_tokens_histogram",
@@ -370,34 +650,13 @@ class TokenizerMetricsCollector:
370
650
  server_args.prompt_tokens_buckets, default_bucket_prompt_tokens
371
651
  ),
372
652
  )
373
- default_bucket_generation_tokens = [
374
- 100,
375
- 300,
376
- 500,
377
- 1000,
378
- 1200,
379
- 1500,
380
- 1700,
381
- 2000,
382
- 2500,
383
- 3000,
384
- 3500,
385
- 4000,
386
- 4500,
387
- 5000,
388
- 6000,
389
- 7000,
390
- 8000,
391
- 9000,
392
- 10000,
393
- ]
394
653
  self.generation_tokens_histogram = Histogram(
395
654
  name="sglang:generation_tokens_histogram",
396
655
  documentation="Histogram of generation token length.",
397
656
  labelnames=labels.keys(),
398
657
  buckets=generate_buckets(
399
658
  server_args.generation_tokens_buckets,
400
- default_bucket_generation_tokens,
659
+ default_bucket_prompt_tokens,
401
660
  ),
402
661
  )
403
662
 
@@ -420,7 +679,7 @@ class TokenizerMetricsCollector:
420
679
  )
421
680
 
422
681
  self.num_aborted_requests_total = Counter(
423
- name="sglang:num_aborted_requests",
682
+ name="sglang:num_aborted_requests_total",
424
683
  documentation="Number of requests aborted.",
425
684
  labelnames=labels.keys(),
426
685
  )
@@ -467,7 +726,10 @@ class TokenizerMetricsCollector:
467
726
  100,
468
727
  200,
469
728
  400,
470
- 800,
729
+ 600,
730
+ 1200,
731
+ 1800,
732
+ 2400,
471
733
  ]
472
734
 
473
735
  if bucket_inter_token_latency is None:
@@ -504,7 +766,7 @@ class TokenizerMetricsCollector:
504
766
  buckets=bucket_time_to_first_token,
505
767
  )
506
768
 
507
- self.histogram_inter_token_latency_seconds = Histogram(
769
+ self.histogram_inter_token_latency = Histogram(
508
770
  name="sglang:inter_token_latency_seconds",
509
771
  documentation="Histogram of inter-token latency in seconds.",
510
772
  labelnames=labels.keys(),
@@ -518,38 +780,53 @@ class TokenizerMetricsCollector:
518
780
  buckets=bucket_e2e_request_latency,
519
781
  )
520
782
 
521
- def _log_histogram(self, histogram, data: Union[int, float]) -> None:
522
- histogram.labels(**self.labels).observe(data)
523
-
524
783
  def observe_one_finished_request(
525
784
  self,
785
+ labels: Dict[str, str],
526
786
  prompt_tokens: int,
527
787
  generation_tokens: int,
528
788
  cached_tokens: int,
529
789
  e2e_latency: float,
530
790
  has_grammar: bool,
531
791
  ):
532
- self.prompt_tokens_total.labels(**self.labels).inc(prompt_tokens)
533
- self.generation_tokens_total.labels(**self.labels).inc(generation_tokens)
792
+ self.prompt_tokens_total.labels(**labels).inc(prompt_tokens)
793
+ self.generation_tokens_total.labels(**labels).inc(generation_tokens)
534
794
  if cached_tokens > 0:
535
- self.cached_tokens_total.labels(**self.labels).inc(cached_tokens)
536
- self.num_requests_total.labels(**self.labels).inc(1)
795
+ self.cached_tokens_total.labels(**labels).inc(cached_tokens)
796
+ self.num_requests_total.labels(**labels).inc(1)
537
797
  if has_grammar:
538
- self.num_so_requests_total.labels(**self.labels).inc(1)
539
- self._log_histogram(self.histogram_e2e_request_latency, e2e_latency)
798
+ self.num_so_requests_total.labels(**labels).inc(1)
799
+ self.histogram_e2e_request_latency.labels(**labels).observe(float(e2e_latency))
540
800
  if self.collect_tokens_histogram:
541
- self._log_histogram(self.prompt_tokens_histogram, prompt_tokens)
542
- self._log_histogram(self.generation_tokens_histogram, generation_tokens)
543
-
544
- def observe_time_to_first_token(self, value: float):
545
- self.histogram_time_to_first_token.labels(**self.labels).observe(value)
801
+ self.prompt_tokens_histogram.labels(**labels).observe(float(prompt_tokens))
802
+ self.generation_tokens_histogram.labels(**labels).observe(
803
+ float(generation_tokens)
804
+ )
546
805
 
547
- def observe_inter_token_latency(self, internval: float, num_new_tokens: int):
806
+ def observe_time_to_first_token(self, labels: Dict[str, str], value: float):
807
+ self.histogram_time_to_first_token.labels(**labels).observe(value)
808
+
809
+ def check_time_to_first_token_straggler(self, value: float) -> bool:
810
+ his = self.histogram_time_to_first_token.labels(**self.labels)
811
+ total_observations = sum(bucket._value for bucket in his._buckets)
812
+ if total_observations < 1000:
813
+ return False
814
+ p999_threshold = total_observations * 0.999
815
+ cumulative_count = 0
816
+ for i, bucket in enumerate(his._buckets):
817
+ cumulative_count += bucket._value
818
+ if cumulative_count > p999_threshold:
819
+ return value >= his._upper_bounds[i]
820
+ return False
821
+
822
+ def observe_inter_token_latency(
823
+ self, labels: Dict[str, str], internval: float, num_new_tokens: int
824
+ ):
548
825
  adjusted_interval = internval / num_new_tokens
549
826
 
550
827
  # A faster version of the Histogram::observe which observes multiple values at the same time.
551
828
  # reference: https://github.com/prometheus/client_python/blob/v0.21.1/prometheus_client/metrics.py#L639
552
- his = self.histogram_inter_token_latency_seconds.labels(**self.labels)
829
+ his = self.histogram_inter_token_latency.labels(**labels)
553
830
  his._sum.inc(internval)
554
831
 
555
832
  for i, bound in enumerate(his._upper_bounds):
@@ -557,5 +834,107 @@ class TokenizerMetricsCollector:
557
834
  his._buckets[i].inc(num_new_tokens)
558
835
  break
559
836
 
560
- def observe_one_aborted_request(self):
561
- self.num_aborted_requests_total.labels(**self.labels).inc(1)
837
+ def observe_one_aborted_request(self, labels: Dict[str, str]):
838
+ self.num_aborted_requests_total.labels(**labels).inc(1)
839
+
840
+
841
+ @dataclass
842
+ class StorageMetrics:
843
+ prefetch_pgs: List[int] = field(default_factory=list)
844
+ backup_pgs: List[int] = field(default_factory=list)
845
+ prefetch_bandwidth: List[float] = field(default_factory=list)
846
+ backup_bandwidth: List[float] = field(default_factory=list)
847
+
848
+
849
+ class StorageMetricsCollector:
850
+ def __init__(
851
+ self,
852
+ labels: Dict[str, str],
853
+ ):
854
+ from prometheus_client import Counter, Histogram
855
+
856
+ self.labels = labels
857
+
858
+ self.prefetched_tokens_total = Counter(
859
+ name="sglang:prefetched_tokens_total",
860
+ documentation="Number of prefetched prompt tokens.",
861
+ labelnames=labels.keys(),
862
+ )
863
+
864
+ self.backuped_tokens_total = Counter(
865
+ name="sglang:backuped_tokens_total",
866
+ documentation="Number of backuped tokens.",
867
+ labelnames=labels.keys(),
868
+ )
869
+
870
+ bucket_io = [
871
+ 1,
872
+ 5,
873
+ 10,
874
+ 50,
875
+ 100,
876
+ ]
877
+
878
+ bucket_bandwidth = [
879
+ 0.1,
880
+ 0.5,
881
+ 1,
882
+ 5,
883
+ 10,
884
+ 50,
885
+ 100,
886
+ ]
887
+
888
+ self.histogram_prefetch_pgs = Histogram(
889
+ name="sglang:prefetch_pgs",
890
+ documentation="Histogram of prefetch pages of batches.",
891
+ labelnames=labels.keys(),
892
+ buckets=bucket_io,
893
+ )
894
+
895
+ self.histogram_backup_pgs = Histogram(
896
+ name="sglang:backup_pgs",
897
+ documentation="Histogram of backup pages of batches.",
898
+ labelnames=labels.keys(),
899
+ buckets=bucket_io,
900
+ )
901
+
902
+ self.histogram_prefetch_bandwidth = Histogram(
903
+ name="sglang:prefetch_bandwidth",
904
+ documentation="Histogram of prefetch bandwidth in GB/s.",
905
+ labelnames=labels.keys(),
906
+ buckets=bucket_bandwidth,
907
+ )
908
+
909
+ self.histogram_backup_bandwidth = Histogram(
910
+ name="sglang:backup_bandwidth",
911
+ documentation="Histogram of backup bandwidth in GB/s.",
912
+ labelnames=labels.keys(),
913
+ buckets=bucket_bandwidth,
914
+ )
915
+
916
+ def log_prefetched_tokens(self, prefetched_tokens: int):
917
+ if prefetched_tokens > 0:
918
+ self.prefetched_tokens_total.labels(**self.labels).inc(prefetched_tokens)
919
+
920
+ def log_backuped_tokens(self, backuped_tokens: int):
921
+ if backuped_tokens > 0:
922
+ self.backuped_tokens_total.labels(**self.labels).inc(backuped_tokens)
923
+
924
+ def _log_histogram(self, histogram, data: Union[int, float]):
925
+ histogram.labels(**self.labels).observe(data)
926
+
927
+ def log_storage_metrics(self, storage_metrics: Optional[StorageMetrics] = None):
928
+ if storage_metrics is None:
929
+ return
930
+
931
+ assert isinstance(storage_metrics, StorageMetrics)
932
+
933
+ for v in storage_metrics.prefetch_pgs:
934
+ self._log_histogram(self.histogram_prefetch_pgs, v)
935
+ for v in storage_metrics.backup_pgs:
936
+ self._log_histogram(self.histogram_backup_pgs, v)
937
+ for v in storage_metrics.prefetch_bandwidth:
938
+ self._log_histogram(self.histogram_prefetch_bandwidth, v)
939
+ for v in storage_metrics.backup_bandwidth:
940
+ self._log_histogram(self.histogram_backup_bandwidth, v)