sglang 0.5.2rc2__py3-none-any.whl → 0.5.3.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (396)
  1. sglang/bench_one_batch.py +7 -11
  2. sglang/bench_one_batch_server.py +330 -31
  3. sglang/bench_serving.py +474 -142
  4. sglang/compile_deep_gemm.py +3 -0
  5. sglang/global_config.py +2 -2
  6. sglang/lang/backend/runtime_endpoint.py +1 -1
  7. sglang/profiler.py +2 -2
  8. sglang/srt/batch_invariant_ops/__init__.py +27 -0
  9. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +549 -0
  10. sglang/srt/configs/__init__.py +10 -0
  11. sglang/srt/configs/device_config.py +3 -1
  12. sglang/srt/configs/dots_ocr.py +64 -0
  13. sglang/srt/configs/dots_vlm.py +139 -0
  14. sglang/srt/configs/falcon_h1.py +314 -0
  15. sglang/srt/configs/load_config.py +9 -0
  16. sglang/srt/configs/mamba_utils.py +117 -0
  17. sglang/srt/configs/model_config.py +228 -92
  18. sglang/srt/configs/nemotron_h.py +286 -0
  19. sglang/srt/configs/qwen3_next.py +294 -0
  20. sglang/srt/configs/qwen3_vl.py +586 -0
  21. sglang/srt/connector/__init__.py +8 -1
  22. sglang/srt/connector/remote_instance.py +82 -0
  23. sglang/srt/constrained/base_grammar_backend.py +49 -12
  24. sglang/srt/constrained/llguidance_backend.py +0 -1
  25. sglang/srt/constrained/outlines_backend.py +0 -1
  26. sglang/srt/constrained/outlines_jump_forward.py +1 -1
  27. sglang/srt/constrained/xgrammar_backend.py +30 -9
  28. sglang/srt/custom_op.py +11 -1
  29. sglang/srt/debug_utils/dump_comparator.py +81 -44
  30. sglang/srt/debug_utils/dump_loader.py +97 -0
  31. sglang/srt/debug_utils/dumper.py +21 -6
  32. sglang/srt/debug_utils/text_comparator.py +73 -11
  33. sglang/srt/disaggregation/ascend/conn.py +2 -2
  34. sglang/srt/disaggregation/ascend/transfer_engine.py +47 -9
  35. sglang/srt/disaggregation/base/conn.py +1 -1
  36. sglang/srt/disaggregation/common/conn.py +279 -108
  37. sglang/srt/disaggregation/decode.py +78 -37
  38. sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
  39. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +29 -17
  40. sglang/srt/disaggregation/fake/conn.py +1 -1
  41. sglang/srt/disaggregation/mini_lb.py +6 -445
  42. sglang/srt/disaggregation/mooncake/conn.py +55 -537
  43. sglang/srt/disaggregation/nixl/conn.py +373 -68
  44. sglang/srt/disaggregation/prefill.py +53 -49
  45. sglang/srt/disaggregation/utils.py +40 -54
  46. sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
  47. sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
  48. sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
  49. sglang/srt/distributed/parallel_state.py +156 -80
  50. sglang/srt/entrypoints/engine.py +59 -18
  51. sglang/srt/entrypoints/grpc_request_manager.py +842 -0
  52. sglang/srt/entrypoints/grpc_server.py +950 -0
  53. sglang/srt/entrypoints/http_server.py +179 -60
  54. sglang/srt/entrypoints/openai/protocol.py +265 -29
  55. sglang/srt/entrypoints/openai/serving_base.py +65 -3
  56. sglang/srt/entrypoints/openai/serving_chat.py +213 -122
  57. sglang/srt/entrypoints/openai/serving_completions.py +14 -3
  58. sglang/srt/entrypoints/openai/serving_embedding.py +9 -3
  59. sglang/srt/entrypoints/openai/serving_rerank.py +3 -1
  60. sglang/srt/entrypoints/openai/serving_responses.py +48 -3
  61. sglang/srt/entrypoints/openai/serving_score.py +1 -0
  62. sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
  63. sglang/srt/environ.py +289 -0
  64. sglang/srt/eplb/eplb_manager.py +2 -2
  65. sglang/srt/eplb/expert_distribution.py +26 -13
  66. sglang/srt/eplb/expert_location.py +38 -8
  67. sglang/srt/eplb/expert_location_updater.py +1 -1
  68. sglang/srt/function_call/base_format_detector.py +3 -6
  69. sglang/srt/function_call/ebnf_composer.py +11 -9
  70. sglang/srt/function_call/function_call_parser.py +17 -8
  71. sglang/srt/function_call/glm4_moe_detector.py +4 -4
  72. sglang/srt/function_call/gpt_oss_detector.py +23 -0
  73. sglang/srt/function_call/json_array_parser.py +63 -0
  74. sglang/srt/function_call/kimik2_detector.py +17 -4
  75. sglang/srt/function_call/qwen3_coder_detector.py +1 -1
  76. sglang/srt/function_call/utils.py +96 -5
  77. sglang/srt/grpc/__init__.py +1 -0
  78. sglang/srt/grpc/compile_proto.py +245 -0
  79. sglang/srt/grpc/sglang_scheduler_pb2.py +119 -0
  80. sglang/srt/grpc/sglang_scheduler_pb2.pyi +492 -0
  81. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +327 -0
  82. sglang/srt/layers/activation.py +143 -9
  83. sglang/srt/layers/attention/aiter_backend.py +14 -15
  84. sglang/srt/layers/attention/ascend_backend.py +115 -9
  85. sglang/srt/layers/attention/attention_registry.py +215 -0
  86. sglang/srt/layers/attention/base_attn_backend.py +12 -3
  87. sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
  88. sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
  89. sglang/srt/layers/attention/fla/chunk.py +242 -0
  90. sglang/srt/layers/attention/fla/chunk_delta_h.py +314 -0
  91. sglang/srt/layers/attention/fla/chunk_o.py +178 -0
  92. sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +151 -0
  93. sglang/srt/layers/attention/fla/cumsum.py +300 -0
  94. sglang/srt/layers/attention/fla/fused_recurrent.py +640 -0
  95. sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +232 -0
  96. sglang/srt/layers/attention/fla/index.py +37 -0
  97. sglang/srt/layers/attention/fla/l2norm.py +150 -0
  98. sglang/srt/layers/attention/fla/layernorm_gated.py +343 -0
  99. sglang/srt/layers/attention/fla/op.py +66 -0
  100. sglang/srt/layers/attention/fla/solve_tril.py +465 -0
  101. sglang/srt/layers/attention/fla/utils.py +331 -0
  102. sglang/srt/layers/attention/fla/wy_fast.py +158 -0
  103. sglang/srt/layers/attention/flashattention_backend.py +40 -8
  104. sglang/srt/layers/attention/flashinfer_backend.py +341 -204
  105. sglang/srt/layers/attention/flashinfer_mla_backend.py +28 -28
  106. sglang/srt/layers/attention/flashmla_backend.py +7 -5
  107. sglang/srt/layers/attention/hybrid_attn_backend.py +68 -53
  108. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +708 -0
  109. sglang/srt/layers/attention/intel_amx_backend.py +3 -0
  110. sglang/srt/layers/attention/mamba/causal_conv1d.py +129 -0
  111. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +974 -0
  112. sglang/srt/layers/attention/mamba/mamba.py +577 -0
  113. sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
  114. sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
  115. sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
  116. sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
  117. sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
  118. sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +214 -0
  119. sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +562 -0
  120. sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +646 -0
  121. sglang/srt/layers/attention/mamba/ops/ssd_combined.py +262 -0
  122. sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +264 -0
  123. sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
  124. sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
  125. sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
  126. sglang/srt/layers/attention/nsa/nsa_indexer.py +761 -0
  127. sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
  128. sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
  129. sglang/srt/layers/attention/nsa/transform_index.py +144 -0
  130. sglang/srt/layers/attention/nsa/utils.py +24 -0
  131. sglang/srt/layers/attention/nsa_backend.py +887 -0
  132. sglang/srt/layers/attention/tbo_backend.py +6 -6
  133. sglang/srt/layers/attention/torch_flex_backend.py +325 -0
  134. sglang/srt/layers/attention/torch_native_backend.py +12 -6
  135. sglang/srt/layers/attention/triton_backend.py +57 -7
  136. sglang/srt/layers/attention/trtllm_mha_backend.py +5 -7
  137. sglang/srt/layers/attention/trtllm_mla_backend.py +276 -39
  138. sglang/srt/layers/attention/vision.py +58 -0
  139. sglang/srt/layers/attention/wave_backend.py +4 -4
  140. sglang/srt/layers/attention/wave_ops/decode_attention.py +2 -4
  141. sglang/srt/layers/attention/wave_ops/extend_attention.py +1 -3
  142. sglang/srt/layers/communicator.py +8 -0
  143. sglang/srt/layers/dp_attention.py +41 -2
  144. sglang/srt/layers/elementwise.py +3 -1
  145. sglang/srt/layers/layernorm.py +34 -15
  146. sglang/srt/layers/linear.py +55 -7
  147. sglang/srt/layers/logits_processor.py +180 -18
  148. sglang/srt/layers/modelopt_utils.py +11 -0
  149. sglang/srt/layers/moe/__init__.py +2 -1
  150. sglang/srt/layers/moe/cutlass_w4a8_moe.py +21 -24
  151. sglang/srt/layers/moe/ep_moe/kernels.py +33 -454
  152. sglang/srt/layers/moe/ep_moe/layer.py +248 -333
  153. sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +183 -0
  154. sglang/srt/layers/moe/fused_moe_native.py +5 -3
  155. sglang/srt/layers/moe/fused_moe_triton/configs/{triton_3_4_0/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json } +35 -35
  156. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=352,device_name=NVIDIA_RTX_5880_Ada_Generation,dtype=fp8_w8a8.json +146 -0
  157. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  158. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=512,device_name=NVIDIA_H20.json +146 -0
  159. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  160. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
  161. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
  162. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  163. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
  164. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
  165. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
  166. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  167. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
  168. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -2
  169. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +7 -3
  170. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +23 -20
  171. sglang/srt/layers/moe/fused_moe_triton/layer.py +68 -72
  172. sglang/srt/layers/moe/moe_runner/__init__.py +2 -1
  173. sglang/srt/layers/moe/moe_runner/base.py +274 -1
  174. sglang/srt/layers/moe/moe_runner/deep_gemm.py +304 -0
  175. sglang/srt/layers/moe/moe_runner/runner.py +83 -0
  176. sglang/srt/layers/moe/moe_runner/triton.py +448 -0
  177. sglang/srt/layers/moe/token_dispatcher/__init__.py +16 -4
  178. sglang/srt/layers/moe/token_dispatcher/{base_dispatcher.py → base.py} +67 -17
  179. sglang/srt/layers/moe/token_dispatcher/deepep.py +118 -56
  180. sglang/srt/layers/moe/token_dispatcher/standard.py +44 -2
  181. sglang/srt/layers/moe/topk.py +30 -9
  182. sglang/srt/layers/moe/utils.py +29 -7
  183. sglang/srt/layers/parameter.py +23 -6
  184. sglang/srt/layers/quantization/__init__.py +1 -1
  185. sglang/srt/layers/quantization/awq.py +19 -7
  186. sglang/srt/layers/quantization/base_config.py +11 -6
  187. sglang/srt/layers/quantization/blockwise_int8.py +38 -27
  188. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +1 -0
  189. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +50 -30
  190. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +2 -0
  191. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +13 -1
  192. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
  193. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +2 -10
  194. sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +27 -0
  195. sglang/srt/layers/quantization/fp8.py +155 -60
  196. sglang/srt/layers/quantization/fp8_utils.py +51 -32
  197. sglang/srt/layers/quantization/gptq.py +25 -17
  198. sglang/srt/layers/quantization/modelopt_quant.py +191 -56
  199. sglang/srt/layers/quantization/moe_wna16.py +21 -18
  200. sglang/srt/layers/quantization/mxfp4.py +74 -42
  201. sglang/srt/layers/quantization/quark/quark.py +3 -1
  202. sglang/srt/layers/quantization/quark/quark_moe.py +48 -30
  203. sglang/srt/layers/quantization/unquant.py +135 -47
  204. sglang/srt/layers/quantization/w4afp8.py +28 -33
  205. sglang/srt/layers/quantization/w8a8_fp8.py +35 -20
  206. sglang/srt/layers/quantization/w8a8_int8.py +91 -41
  207. sglang/srt/layers/rotary_embedding.py +78 -31
  208. sglang/srt/layers/sampler.py +213 -21
  209. sglang/srt/layers/utils.py +23 -0
  210. sglang/srt/lora/backend/base_backend.py +50 -8
  211. sglang/srt/lora/backend/chunked_backend.py +348 -0
  212. sglang/srt/lora/backend/triton_backend.py +99 -5
  213. sglang/srt/lora/layers.py +32 -0
  214. sglang/srt/lora/lora.py +8 -3
  215. sglang/srt/lora/lora_manager.py +44 -118
  216. sglang/srt/lora/mem_pool.py +25 -11
  217. sglang/srt/lora/triton_ops/__init__.py +4 -0
  218. sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
  219. sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +174 -0
  220. sglang/srt/lora/utils.py +22 -11
  221. sglang/srt/managers/async_dynamic_batch_tokenizer.py +170 -0
  222. sglang/srt/managers/cache_controller.py +199 -301
  223. sglang/srt/managers/data_parallel_controller.py +115 -80
  224. sglang/srt/managers/detokenizer_manager.py +19 -15
  225. sglang/srt/managers/disagg_service.py +46 -0
  226. sglang/srt/managers/io_struct.py +340 -109
  227. sglang/srt/managers/mm_utils.py +44 -6
  228. sglang/srt/managers/multi_tokenizer_mixin.py +357 -407
  229. sglang/srt/managers/multimodal_processor.py +1 -2
  230. sglang/srt/managers/overlap_utils.py +55 -0
  231. sglang/srt/managers/schedule_batch.py +343 -212
  232. sglang/srt/managers/schedule_policy.py +145 -18
  233. sglang/srt/managers/scheduler.py +653 -273
  234. sglang/srt/managers/scheduler_input_blocker.py +1 -1
  235. sglang/srt/managers/scheduler_metrics_mixin.py +99 -126
  236. sglang/srt/managers/scheduler_output_processor_mixin.py +255 -108
  237. sglang/srt/managers/scheduler_profiler_mixin.py +6 -6
  238. sglang/srt/managers/scheduler_update_weights_mixin.py +7 -0
  239. sglang/srt/managers/tokenizer_communicator_mixin.py +675 -0
  240. sglang/srt/managers/tokenizer_manager.py +579 -674
  241. sglang/srt/managers/tp_worker.py +96 -26
  242. sglang/srt/managers/utils.py +1 -45
  243. sglang/srt/mem_cache/allocator.py +21 -22
  244. sglang/srt/mem_cache/allocator_ascend.py +41 -27
  245. sglang/srt/mem_cache/base_prefix_cache.py +1 -1
  246. sglang/srt/mem_cache/chunk_cache.py +9 -2
  247. sglang/srt/mem_cache/evict_policy.py +23 -0
  248. sglang/srt/mem_cache/hicache_storage.py +43 -24
  249. sglang/srt/mem_cache/hiradix_cache.py +222 -75
  250. sglang/srt/mem_cache/memory_pool.py +651 -80
  251. sglang/srt/mem_cache/memory_pool_host.py +239 -228
  252. sglang/srt/mem_cache/radix_cache.py +227 -73
  253. sglang/srt/mem_cache/radix_cache_cpp.py +11 -8
  254. sglang/srt/mem_cache/storage/__init__.py +10 -0
  255. sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +151 -0
  256. sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +109 -0
  257. sglang/srt/mem_cache/storage/backend_factory.py +223 -0
  258. sglang/srt/mem_cache/storage/eic/eic_storage.py +778 -0
  259. sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
  260. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +164 -0
  261. sglang/srt/mem_cache/storage/hf3fs/{client_hf3fs.py → hf3fs_usrbio_client.py} +5 -1
  262. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +259 -62
  263. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +284 -0
  264. sglang/srt/mem_cache/storage/lmcache/unit_test.py +121 -0
  265. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +166 -17
  266. sglang/srt/mem_cache/swa_radix_cache.py +93 -48
  267. sglang/srt/metrics/collector.py +511 -132
  268. sglang/srt/metrics/func_timer.py +2 -7
  269. sglang/srt/metrics/startup_func_log_and_timer.py +150 -0
  270. sglang/srt/metrics/utils.py +8 -1
  271. sglang/srt/model_executor/cpu_graph_runner.py +640 -0
  272. sglang/srt/model_executor/cuda_graph_runner.py +52 -37
  273. sglang/srt/model_executor/forward_batch_info.py +74 -46
  274. sglang/srt/model_executor/model_runner.py +455 -176
  275. sglang/srt/model_executor/npu_graph_runner.py +12 -5
  276. sglang/srt/model_loader/__init__.py +10 -4
  277. sglang/srt/model_loader/loader.py +319 -10
  278. sglang/srt/model_loader/remote_instance_weight_loader_utils.py +69 -0
  279. sglang/srt/model_loader/weight_utils.py +161 -3
  280. sglang/srt/models/apertus.py +686 -0
  281. sglang/srt/models/bailing_moe.py +820 -217
  282. sglang/srt/models/bailing_moe_nextn.py +168 -0
  283. sglang/srt/models/deepseek_nextn.py +6 -1
  284. sglang/srt/models/deepseek_v2.py +607 -130
  285. sglang/srt/models/dots_ocr.py +173 -0
  286. sglang/srt/models/dots_vlm.py +174 -0
  287. sglang/srt/models/dots_vlm_vit.py +337 -0
  288. sglang/srt/models/ernie4.py +1 -1
  289. sglang/srt/models/falcon_h1.py +578 -0
  290. sglang/srt/models/gemma3_causal.py +0 -2
  291. sglang/srt/models/gemma3_mm.py +17 -1
  292. sglang/srt/models/gemma3n_mm.py +2 -2
  293. sglang/srt/models/glm4_moe.py +4 -4
  294. sglang/srt/models/glm4_moe_nextn.py +2 -2
  295. sglang/srt/models/glm4v.py +5 -3
  296. sglang/srt/models/glm4v_moe.py +4 -1
  297. sglang/srt/models/gpt_oss.py +8 -31
  298. sglang/srt/models/grok.py +5 -13
  299. sglang/srt/models/kimi_vl_moonvit.py +2 -2
  300. sglang/srt/models/llama.py +4 -0
  301. sglang/srt/models/llama4.py +9 -0
  302. sglang/srt/models/llama_eagle3.py +13 -0
  303. sglang/srt/models/longcat_flash.py +3 -3
  304. sglang/srt/models/longcat_flash_nextn.py +1 -1
  305. sglang/srt/models/mixtral.py +1 -3
  306. sglang/srt/models/mllama4.py +50 -4
  307. sglang/srt/models/nemotron_h.py +514 -0
  308. sglang/srt/models/opt.py +637 -0
  309. sglang/srt/models/qwen2_5_vl.py +29 -5
  310. sglang/srt/models/qwen2_audio.py +1 -1
  311. sglang/srt/models/qwen2_moe.py +120 -13
  312. sglang/srt/models/qwen2_vl.py +1 -1
  313. sglang/srt/models/qwen3.py +18 -3
  314. sglang/srt/models/qwen3_moe.py +32 -4
  315. sglang/srt/models/qwen3_next.py +1069 -0
  316. sglang/srt/models/qwen3_next_mtp.py +112 -0
  317. sglang/srt/models/qwen3_vl.py +787 -0
  318. sglang/srt/models/qwen3_vl_moe.py +471 -0
  319. sglang/srt/models/registry.py +15 -3
  320. sglang/srt/models/sarashina2_vision.py +269 -0
  321. sglang/srt/models/solar.py +505 -0
  322. sglang/srt/models/starcoder2.py +357 -0
  323. sglang/srt/models/step3_vl.py +1 -1
  324. sglang/srt/models/torch_native_llama.py +9 -2
  325. sglang/srt/models/utils.py +55 -0
  326. sglang/srt/multimodal/processors/base_processor.py +15 -7
  327. sglang/srt/multimodal/processors/dots_vlm.py +98 -0
  328. sglang/srt/multimodal/processors/glm4v.py +9 -9
  329. sglang/srt/multimodal/processors/internvl.py +153 -129
  330. sglang/srt/multimodal/processors/qwen_vl.py +23 -6
  331. sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
  332. sglang/srt/offloader.py +27 -3
  333. sglang/srt/parser/jinja_template_utils.py +6 -0
  334. sglang/srt/sampling/sampling_batch_info.py +49 -26
  335. sglang/srt/sampling/sampling_params.py +7 -0
  336. sglang/srt/server_args.py +1051 -285
  337. sglang/srt/server_args_config_parser.py +146 -0
  338. sglang/srt/single_batch_overlap.py +151 -0
  339. sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
  340. sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
  341. sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
  342. sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
  343. sglang/srt/speculative/cpp_ngram/param.h +125 -0
  344. sglang/srt/speculative/cpp_ngram/queue.h +71 -0
  345. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +7 -1
  346. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +13 -2
  347. sglang/srt/speculative/{eagle_utils.py → eagle_info.py} +207 -757
  348. sglang/srt/speculative/eagle_worker.py +98 -29
  349. sglang/srt/speculative/ngram_info.py +428 -0
  350. sglang/srt/speculative/ngram_worker.py +246 -0
  351. sglang/srt/speculative/spec_info.py +52 -0
  352. sglang/srt/speculative/spec_utils.py +605 -0
  353. sglang/srt/speculative/standalone_worker.py +109 -0
  354. sglang/srt/torch_memory_saver_adapter.py +5 -7
  355. sglang/srt/tracing/trace.py +578 -0
  356. sglang/srt/two_batch_overlap.py +9 -5
  357. sglang/srt/utils/__init__.py +2 -0
  358. sglang/srt/{utils.py → utils/common.py} +451 -77
  359. sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +55 -5
  360. sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
  361. sglang/srt/utils/rpd_utils.py +452 -0
  362. sglang/srt/utils/slow_rank_detector.py +71 -0
  363. sglang/srt/warmup.py +8 -4
  364. sglang/srt/weight_sync/utils.py +2 -2
  365. sglang/test/attention/test_trtllm_mla_backend.py +169 -5
  366. sglang/test/get_logits_ut.py +57 -0
  367. sglang/test/longbench_v2/__init__.py +1 -0
  368. sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
  369. sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
  370. sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
  371. sglang/test/run_eval.py +119 -11
  372. sglang/test/runners.py +5 -1
  373. sglang/test/simple_eval_common.py +5 -2
  374. sglang/test/simple_eval_longbench_v2.py +332 -0
  375. sglang/test/simple_eval_mmmu_vlm.py +441 -0
  376. sglang/test/test_block_fp8.py +2 -2
  377. sglang/test/test_cutlass_moe.py +24 -6
  378. sglang/test/test_cutlass_w4a8_moe.py +9 -19
  379. sglang/test/test_deterministic.py +313 -0
  380. sglang/test/test_deterministic_utils.py +81 -0
  381. sglang/test/test_disaggregation_utils.py +140 -0
  382. sglang/test/test_fp4_moe.py +370 -1
  383. sglang/test/test_programs.py +1 -1
  384. sglang/test/test_utils.py +407 -8
  385. sglang/utils.py +21 -1
  386. sglang/version.py +1 -1
  387. {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/METADATA +69 -124
  388. {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/RECORD +392 -251
  389. sglang/srt/disaggregation/launch_lb.py +0 -118
  390. sglang/srt/managers/tp_worker_overlap_thread.py +0 -296
  391. sglang/srt/mem_cache/lora_radix_cache.py +0 -421
  392. sglang/test/test_block_fp8_ep.py +0 -358
  393. /sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
  394. {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/WHEEL +0 -0
  395. {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/licenses/LICENSE +0 -0
  396. {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/top_level.txt +0 -0
@@ -17,7 +17,7 @@ from enum import Enum, auto
 from typing import Any, List, Optional
 
 from sglang.srt.managers.io_struct import BlockReqInput, BlockReqType
-from sglang.srt.poll_based_barrier import PollBasedBarrier
+from sglang.srt.utils.poll_based_barrier import PollBasedBarrier
 
 logger = logging.getLogger(__name__)
 
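The hunk above is one instance of a broader relocation in this release: several top-level helpers under sglang/srt/ moved into the new sglang/srt/utils/ package (see items 357-360 and 393 in the file list). A minimal sketch of the corresponding import update for downstream code; both import paths appear verbatim in the hunk above, nothing else is added:

    # Before (sglang 0.5.2rc2)
    from sglang.srt.poll_based_barrier import PollBasedBarrier

    # After (sglang 0.5.3.post1): the module now lives in the sglang.srt.utils package
    from sglang.srt.utils.poll_based_barrier import PollBasedBarrier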
@@ -12,7 +12,6 @@ from sglang.srt.disaggregation.utils import DisaggregationMode
 from sglang.srt.managers.io_struct import TokenizedGenerateReqInput
 from sglang.srt.managers.schedule_policy import PrefillAdder
 from sglang.srt.managers.scheduler import Req, ScheduleBatch
-from sglang.srt.managers.utils import DPBalanceMeta
 from sglang.srt.metrics.collector import SchedulerMetricsCollector, SchedulerStats
 from sglang.srt.utils import get_bool_env_var
 
@@ -47,8 +46,11 @@ class SchedulerMetricsMixin:
         self.spec_num_total_forward_ct = 0
         self.cum_spec_accept_length = 0
         self.cum_spec_accept_count = 0
-        self.total_retracted_reqs = 0
+        self.kv_transfer_speed_gb_s: float = 0.0
+        self.kv_transfer_latency_ms: float = 0.0
+
         self.stats = SchedulerStats()
+
         if self.enable_metrics:
             engine_type = "unified"
             labels = {
@@ -61,33 +63,30 @@ class SchedulerMetricsMixin:
                 labels["dp_rank"] = dp_rank
             self.metrics_collector = SchedulerMetricsCollector(labels=labels)
 
-    def init_dp_balance(self: Scheduler, dp_balance_meta: Optional[DPBalanceMeta]):
-        self.balance_meta = dp_balance_meta
-        if (
-            self.server_args.enable_dp_attention
-            and self.server_args.load_balance_method == "minimum_tokens"
-        ):
-            assert dp_balance_meta is not None
-
-            self.recv_dp_balance_id_this_term = []
-
     def init_kv_events(self: Scheduler, kv_events_config: Optional[str]):
         if self.enable_kv_cache_events:
             self.kv_event_publisher = EventPublisherFactory.create(
                 kv_events_config, self.attn_dp_rank
             )
 
+    def update_spec_metrics(self, bs: int, num_accepted_tokens: int):
+        self.spec_num_total_accepted_tokens += num_accepted_tokens + bs
+        self.spec_num_total_forward_ct += bs
+        self.num_generated_tokens += num_accepted_tokens
+
     def log_prefill_stats(
         self: Scheduler,
         adder: PrefillAdder,
         can_run_list: List[Req],
         running_bs: int,
+        running_bs_offline_batch: int,
     ):
         gap_latency = time.perf_counter() - self.last_prefill_stats_tic
         self.last_prefill_stats_tic = time.perf_counter()
         self.last_input_throughput = self.last_prefill_tokens / gap_latency
         self.last_prefill_tokens = adder.log_input_tokens
 
+        # TODO: generalize this for various memory pools
         if self.is_hybrid:
             (
                 full_num_used,
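The update_spec_metrics helper added above centralizes the speculative-decoding counters that the decode-stats path later reduces to the logged accept length. A rough worked example, assuming (not shown in this hunk) that accept length is reported as total accepted tokens per forward pass:

    # Hypothetical numbers for one speculative decode step
    bs = 4                    # requests in the running batch
    num_accepted_tokens = 12  # draft tokens accepted across the batch

    # update_spec_metrics(bs, num_accepted_tokens) then advances:
    #   spec_num_total_accepted_tokens += 12 + 4  # + bs, presumably the one token each request always emits
    #   spec_num_total_forward_ct      += 4
    #   num_generated_tokens           += 12
    # giving a running accept length of 16 / 4 = 4.0 tokens per request per step.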
@@ -101,51 +100,53 @@ class SchedulerMetricsMixin:
             ) = self._get_swa_token_info()
             num_used = max(full_num_used, swa_num_used)
             token_usage = max(full_token_usage, swa_token_usage)
-            token_msg = (
+            token_usage_msg = (
                 f"full token usage: {full_token_usage:.2f}, "
                 f"swa token usage: {swa_token_usage:.2f}, "
             )
         else:
             num_used, token_usage, _, _ = self._get_token_info()
-            token_msg = f"token usage: {token_usage:.2f}, "
+            token_usage_msg = f"token usage: {token_usage:.2f}, "
 
-        num_new_seq = len(can_run_list)
         f = (
             f"Prefill batch. "
-            f"#new-seq: {num_new_seq}, "
+            f"#new-seq: {len(can_run_list)}, "
             f"#new-token: {adder.log_input_tokens}, "
             f"#cached-token: {adder.log_hit_tokens}, "
-            f"{token_msg}"
+            f"{token_usage_msg}"
+            f"#running-req: {running_bs}, "
+            f"#queue-req: {len(self.waiting_queue)}, "
         )
 
         if self.disaggregation_mode == DisaggregationMode.PREFILL:
-            f += f"#unbootstrapped-req: {len(self.disagg_prefill_bootstrap_queue.queue)}, "
-            f += f"#queue-req: {len(self.waiting_queue)}, "
-            f += f"#transferring-req: {len(self.disagg_prefill_inflight_queue)}, "
-            f += f"input throughput (token/s): {self.last_input_throughput:.2f}, "
-        else:
-            f += f"#running-req: {running_bs}, "
-            f += f"#queue-req: {len(self.waiting_queue)}, "
+            f += f"#prealloc-req: {len(self.disagg_prefill_bootstrap_queue.queue)}, "
+            f += f"#inflight-req: {len(self.disagg_prefill_inflight_queue)}, "
 
         logger.info(f)
 
         if self.enable_metrics:
+            # Basics
            total_tokens = adder.log_input_tokens + adder.log_hit_tokens
-
            cache_hit_rate = (
                adder.log_hit_tokens / total_tokens if total_tokens > 0 else 0.0
            )
+
            self.stats.num_running_reqs = running_bs
+            self.stats.num_running_reqs_offline_batch = running_bs_offline_batch
            self.stats.num_used_tokens = num_used
-            self.stats.token_usage = round(token_usage, 2)
+            self.stats.token_usage = token_usage
+            if self.is_hybrid:
+                self.stats.swa_token_usage = swa_token_usage
            self.stats.num_queue_reqs = len(self.waiting_queue)
+            self.stats.num_grammar_queue_reqs = len(self.grammar_queue)
            self.stats.cache_hit_rate = cache_hit_rate
 
-            total_queue_latency = 0
-            for req in can_run_list:
-                total_queue_latency += req.queue_time_end - req.queue_time_start
-            self.stats.avg_request_queue_latency = total_queue_latency / num_new_seq
+            # Retract
+            self.stats.num_retracted_reqs = self.num_retracted_reqs
+            self.stats.num_paused_reqs = self.num_paused_reqs
+            self.num_retracted_reqs = self.num_paused_reqs = 0
 
+            # PD disaggregation
            if self.disaggregation_mode == DisaggregationMode.PREFILL:
                self.stats.num_prefill_prealloc_queue_reqs = len(
                    self.disagg_prefill_bootstrap_queue.queue
@@ -153,7 +154,18 @@ class SchedulerMetricsMixin:
                 self.stats.num_prefill_inflight_queue_reqs = len(
                     self.disagg_prefill_inflight_queue
                 )
+                self.stats.kv_transfer_speed_gb_s = self.kv_transfer_speed_gb_s
+                self.stats.kv_transfer_latency_ms = self.kv_transfer_latency_ms
+            elif self.disaggregation_mode == DisaggregationMode.DECODE:
+                self.stats.num_decode_prealloc_queue_reqs = len(
+                    self.disagg_decode_prealloc_queue.queue
+                )
+                self.stats.num_decode_transfer_queue_reqs = len(
+                    self.disagg_decode_transfer_queue.queue
+                )
 
+            # Others
+            self.calculate_utilization()
             self.metrics_collector.log_stats(self.stats)
             self._emit_kv_metrics()
             self._publish_kv_events()
@@ -166,8 +178,12 @@ class SchedulerMetricsMixin:
         gap_latency = time.perf_counter() - self.last_decode_stats_tic
         self.last_decode_stats_tic = time.perf_counter()
         self.last_gen_throughput = self.num_generated_tokens / gap_latency
+
         self.num_generated_tokens = 0
         num_running_reqs = len(batch.reqs)
+        num_running_reqs_offline_batch = 0
+
+        # TODO: generalize this for various memory pools
         if self.is_hybrid:
             (
                 full_num_used,
@@ -181,7 +197,7 @@ class SchedulerMetricsMixin:
             ) = self._get_swa_token_info()
             num_used = max(full_num_used, swa_num_used)
             token_usage = max(full_token_usage, swa_token_usage)
-            token_msg = (
+            token_usage_msg = (
                 f"#full token: {full_num_used}, "
                 f"full token usage: {full_token_usage:.2f}, "
                 f"#swa token: {swa_num_used}, "
@@ -189,14 +205,14 @@ class SchedulerMetricsMixin:
             )
         else:
             num_used, token_usage, _, _ = self._get_token_info()
-            token_msg = f"#token: {num_used}, " f"token usage: {token_usage:.2f}, "
+            token_usage_msg = f"#token: {num_used}, token usage: {token_usage:.2f}, "
 
         if RECORD_STEP_TIME:
             self.step_time_dict[num_running_reqs].append(
                 gap_latency / self.server_args.decode_log_interval
             )
 
-        msg = f"Decode batch. #running-req: {num_running_reqs}, {token_msg}"
+        msg = f"Decode batch. #running-req: {num_running_reqs}, {token_usage_msg}"
 
         if self.spec_algorithm.is_none():
             spec_accept_length = 0
@@ -208,40 +224,66 @@ class SchedulerMetricsMixin:
             self.cum_spec_accept_count += self.spec_num_total_forward_ct
             self.spec_num_total_accepted_tokens = self.spec_num_total_forward_ct = 0
             msg += f"accept len: {spec_accept_length:.2f}, "
+        cache_hit_rate = 0.0
 
         if self.disaggregation_mode == DisaggregationMode.DECODE:
             msg += f"pre-allocated usage: {self.disagg_decode_prealloc_queue.num_tokens_pre_allocated / self.max_total_num_tokens:.2f}, "
+            msg += f"#prealloc-req: {len(self.disagg_decode_prealloc_queue.queue)}, "
+            msg += f"#transfer-req: {len(self.disagg_decode_transfer_queue.queue)}, "
             msg += f"#retracted-req: {len(self.disagg_decode_prealloc_queue.retracted_queue)}, "
 
         msg += (
-            f"cuda graph: {can_run_cuda_graph}, "
+            f"{'cuda graph' if self.device == 'cuda' else 'cpu graph'}: {can_run_cuda_graph}, "
             f"gen throughput (token/s): {self.last_gen_throughput:.2f}, "
             f"#queue-req: {len(self.waiting_queue)}, "
         )
 
         logger.info(msg)
         if self.enable_metrics:
+            # Basics
             self.stats.num_running_reqs = num_running_reqs
+            self.stats.num_running_reqs_offline_batch = num_running_reqs_offline_batch
             self.stats.num_used_tokens = num_used
-            self.stats.token_usage = round(token_usage, 2)
-            self.stats.cache_hit_rate = 0.0
+            self.stats.token_usage = token_usage
+            if self.is_hybrid:
+                self.stats.swa_token_usage = swa_token_usage
             self.stats.gen_throughput = self.last_gen_throughput
             self.stats.num_queue_reqs = len(self.waiting_queue)
             self.stats.num_grammar_queue_reqs = len(self.grammar_queue)
+            self.stats.cache_hit_rate = cache_hit_rate
             self.stats.spec_accept_length = spec_accept_length
-            self.stats.total_retracted_reqs = self.total_retracted_reqs
-            self.metrics_collector.log_stats(self.stats)
-            if self.disaggregation_mode == DisaggregationMode.DECODE:
+
+            # Retract
+            self.stats.num_retracted_reqs = self.num_retracted_reqs
+            self.stats.num_paused_reqs = self.num_paused_reqs
+            self.num_retracted_reqs = self.num_paused_reqs = 0
+
+            # PD disaggregation
+            if self.disaggregation_mode == DisaggregationMode.PREFILL:
+                self.stats.num_prefill_prealloc_queue_reqs = len(
+                    self.disagg_prefill_bootstrap_queue.queue
+                )
+                self.stats.num_prefill_inflight_queue_reqs = len(
+                    self.disagg_prefill_inflight_queue
+                )
+            elif self.disaggregation_mode == DisaggregationMode.DECODE:
                 self.stats.num_decode_prealloc_queue_reqs = len(
                     self.disagg_decode_prealloc_queue.queue
                 )
                 self.stats.num_decode_transfer_queue_reqs = len(
                     self.disagg_decode_transfer_queue.queue
                 )
+
+            # Others
+            self.calculate_utilization()
+            self.metrics_collector.log_stats(self.stats)
             self._emit_kv_metrics()
             self._publish_kv_events()
 
     def _emit_kv_metrics(self: Scheduler):
+        if not self.enable_kv_cache_events:
+            return
+
         kv_metrics = KvMetrics()
         kv_metrics.request_active_slots = self.stats.num_running_reqs
         kv_metrics.request_total_slots = self.max_running_requests
@@ -258,93 +300,24 @@ class SchedulerMetricsMixin:
         self.send_metrics_from_scheduler.send_pyobj(kv_metrics)
 
     def _publish_kv_events(self: Scheduler):
-        if self.enable_kv_cache_events:
-            events = self.tree_cache.take_events()
-            if events:
-                batch = KVEventBatch(ts=time.time(), events=events)
-                self.kv_event_publisher.publish(batch)
-
-    def maybe_update_dp_balance_data(
-        self: Scheduler, recv_req: TokenizedGenerateReqInput
-    ):
-        if (
-            self.server_args.enable_dp_attention
-            and self.server_args.load_balance_method == "minimum_tokens"
-        ):
-            self.recv_dp_balance_id_this_term.append(recv_req.dp_balance_id)
-
-    def maybe_handle_dp_balance_data(self: Scheduler):
-        if (
-            self.server_args.load_balance_method == "minimum_tokens"
-            and self.forward_ct % 40 == 0
-        ):
-            holding_tokens = self.get_load()
-
-            new_recv_dp_balance_id_list, holding_token_list = (
-                self.gather_dp_balance_info(holding_tokens)
-            )
+        if not self.enable_kv_cache_events:
+            return
 
-            self.recv_dp_balance_id_this_term.clear()
-            if self.tp_rank == 0:  # only first worker write info
-                self.write_shared_dp_balance_info(
-                    new_recv_dp_balance_id_list, holding_token_list
-                )
+        events = self.tree_cache.take_events()
+        if events:
+            batch = KVEventBatch(ts=time.time(), events=events)
+            self.kv_event_publisher.publish(batch)
 
-    def gather_dp_balance_info(
-        self: Scheduler, holding_tokens_list
-    ) -> Union[None, List[List[int]]]:
-        """gather recv_dp_balance_id_this_term and holding tokens per worker for dp balance"""
-        recv_list = self.recv_dp_balance_id_this_term
-        assert len(recv_list) <= 511, (
-            "The number of requests received this round is too large. "
-            "Please increase gather_tensor_size and onfly_info_size."
-        )
-        # The maximum size of the tensor used for gathering data from all workers.
-        gather_tensor_size = 512
-
-        # recv_tensor: | holding_tokens | len(recv_dp_balance_id) | recv_dp_balance_ids
-        recv_tensor = torch.zeros(gather_tensor_size, dtype=torch.int32)
-        recv_tensor[0] = holding_tokens_list
-        recv_tensor[1] = len(recv_list)  # The first element is the length of the list.
-        recv_tensor[2 : len(recv_list) + 2] = torch.tensor(recv_list, dtype=torch.int32)
-
-        if self.tp_rank == 0:
-            gathered_list = [
-                torch.zeros(gather_tensor_size, dtype=torch.int32)
-                for _ in range(self.balance_meta.num_workers)
-            ]
+    def calculate_utilization(self):
+        if self.disaggregation_mode == DisaggregationMode.PREFILL:
+            self.stats.utilization = -1
         else:
-            gathered_list = None
-
-        torch.distributed.gather(recv_tensor, gathered_list, group=self.tp_cpu_group)
-
-        gathered_id_list_per_worker = None
-        if self.tp_rank == 0:
-            gathered_id_list_per_worker = []
-            holding_tokens_list = []
-            for tensor in gathered_list:
-                holding_tokens_list.append(tensor[0].item())
-                list_length = tensor[1].item()
-                gathered_id_list_per_worker.append(tensor[2 : list_length + 2].tolist())
-
-        return gathered_id_list_per_worker, holding_tokens_list
-
-    def write_shared_dp_balance_info(self: Scheduler, new_recv_rid_lists, local_tokens):
-        meta = self.balance_meta
-
-        with meta.mutex:
-            onfly_list: List[Dict[int, int]] = meta.get_shared_onfly()
-            assert len(new_recv_rid_lists) == len(onfly_list), "num_worker not equal"
-            # 1. Check if the rid received by each worker this round is present in onfly.
-            # If it is, remove the corresponding onfly item.
-            worker_id = 0
-            for new_recv_rids, on_fly_reqs in zip(new_recv_rid_lists, onfly_list):
-                for new_recv_rid in new_recv_rids:
-                    assert (
-                        new_recv_rid in on_fly_reqs
-                    ), f"{new_recv_rid=} not in {worker_id=} {on_fly_reqs=}, data consistency is wrong"
-                    del on_fly_reqs[new_recv_rid]
-                worker_id += 1
-            # 2. Atomically write local_tokens and onfly into shm under the mutex
-            meta.set_shared_onfly_info(onfly_list)
-            meta.set_shared_local_tokens(local_tokens)
+            if (
+                self.stats.max_running_requests_under_SLO is not None
+                and self.stats.max_running_requests_under_SLO > 0
+            ):
+                self.stats.utilization = max(
+                    self.stats.num_running_reqs
+                    / self.stats.max_running_requests_under_SLO,
+                    self.stats.token_usage / 0.9,
+                )
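The calculate_utilization method added at the end of this hunk reports utilization as -1 for prefill-only instances and otherwise as the larger of request-slot usage (running requests over the SLO request limit) and token usage, with 90% token usage counting as full. A standalone sketch of the same arithmetic with hypothetical numbers; max_running_requests_under_SLO and the 0.9 factor come from the hunk above, everything else is illustrative:

    from typing import Optional

    def utilization(num_running_reqs: int,
                    max_running_requests_under_SLO: Optional[int],
                    token_usage: float,
                    is_prefill: bool) -> Optional[float]:
        # Mirrors the branch structure above; None stands in for
        # "leave the previous stats.utilization value unchanged".
        if is_prefill:
            return -1
        if max_running_requests_under_SLO is None or max_running_requests_under_SLO <= 0:
            return None
        return max(num_running_reqs / max_running_requests_under_SLO,
                   token_usage / 0.9)

    # Example: 48 running requests against an SLO limit of 64, 72% token usage
    # -> max(48 / 64, 0.72 / 0.9) = max(0.75, 0.80) = 0.80
    print(utilization(48, 64, 0.72, is_prefill=False))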