sglang 0.5.2rc2__py3-none-any.whl → 0.5.3.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (396)
  1. sglang/bench_one_batch.py +7 -11
  2. sglang/bench_one_batch_server.py +330 -31
  3. sglang/bench_serving.py +474 -142
  4. sglang/compile_deep_gemm.py +3 -0
  5. sglang/global_config.py +2 -2
  6. sglang/lang/backend/runtime_endpoint.py +1 -1
  7. sglang/profiler.py +2 -2
  8. sglang/srt/batch_invariant_ops/__init__.py +27 -0
  9. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +549 -0
  10. sglang/srt/configs/__init__.py +10 -0
  11. sglang/srt/configs/device_config.py +3 -1
  12. sglang/srt/configs/dots_ocr.py +64 -0
  13. sglang/srt/configs/dots_vlm.py +139 -0
  14. sglang/srt/configs/falcon_h1.py +314 -0
  15. sglang/srt/configs/load_config.py +9 -0
  16. sglang/srt/configs/mamba_utils.py +117 -0
  17. sglang/srt/configs/model_config.py +228 -92
  18. sglang/srt/configs/nemotron_h.py +286 -0
  19. sglang/srt/configs/qwen3_next.py +294 -0
  20. sglang/srt/configs/qwen3_vl.py +586 -0
  21. sglang/srt/connector/__init__.py +8 -1
  22. sglang/srt/connector/remote_instance.py +82 -0
  23. sglang/srt/constrained/base_grammar_backend.py +49 -12
  24. sglang/srt/constrained/llguidance_backend.py +0 -1
  25. sglang/srt/constrained/outlines_backend.py +0 -1
  26. sglang/srt/constrained/outlines_jump_forward.py +1 -1
  27. sglang/srt/constrained/xgrammar_backend.py +30 -9
  28. sglang/srt/custom_op.py +11 -1
  29. sglang/srt/debug_utils/dump_comparator.py +81 -44
  30. sglang/srt/debug_utils/dump_loader.py +97 -0
  31. sglang/srt/debug_utils/dumper.py +21 -6
  32. sglang/srt/debug_utils/text_comparator.py +73 -11
  33. sglang/srt/disaggregation/ascend/conn.py +2 -2
  34. sglang/srt/disaggregation/ascend/transfer_engine.py +47 -9
  35. sglang/srt/disaggregation/base/conn.py +1 -1
  36. sglang/srt/disaggregation/common/conn.py +279 -108
  37. sglang/srt/disaggregation/decode.py +78 -37
  38. sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
  39. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +29 -17
  40. sglang/srt/disaggregation/fake/conn.py +1 -1
  41. sglang/srt/disaggregation/mini_lb.py +6 -445
  42. sglang/srt/disaggregation/mooncake/conn.py +55 -537
  43. sglang/srt/disaggregation/nixl/conn.py +373 -68
  44. sglang/srt/disaggregation/prefill.py +53 -49
  45. sglang/srt/disaggregation/utils.py +40 -54
  46. sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
  47. sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
  48. sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
  49. sglang/srt/distributed/parallel_state.py +156 -80
  50. sglang/srt/entrypoints/engine.py +59 -18
  51. sglang/srt/entrypoints/grpc_request_manager.py +842 -0
  52. sglang/srt/entrypoints/grpc_server.py +950 -0
  53. sglang/srt/entrypoints/http_server.py +179 -60
  54. sglang/srt/entrypoints/openai/protocol.py +265 -29
  55. sglang/srt/entrypoints/openai/serving_base.py +65 -3
  56. sglang/srt/entrypoints/openai/serving_chat.py +213 -122
  57. sglang/srt/entrypoints/openai/serving_completions.py +14 -3
  58. sglang/srt/entrypoints/openai/serving_embedding.py +9 -3
  59. sglang/srt/entrypoints/openai/serving_rerank.py +3 -1
  60. sglang/srt/entrypoints/openai/serving_responses.py +48 -3
  61. sglang/srt/entrypoints/openai/serving_score.py +1 -0
  62. sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
  63. sglang/srt/environ.py +289 -0
  64. sglang/srt/eplb/eplb_manager.py +2 -2
  65. sglang/srt/eplb/expert_distribution.py +26 -13
  66. sglang/srt/eplb/expert_location.py +38 -8
  67. sglang/srt/eplb/expert_location_updater.py +1 -1
  68. sglang/srt/function_call/base_format_detector.py +3 -6
  69. sglang/srt/function_call/ebnf_composer.py +11 -9
  70. sglang/srt/function_call/function_call_parser.py +17 -8
  71. sglang/srt/function_call/glm4_moe_detector.py +4 -4
  72. sglang/srt/function_call/gpt_oss_detector.py +23 -0
  73. sglang/srt/function_call/json_array_parser.py +63 -0
  74. sglang/srt/function_call/kimik2_detector.py +17 -4
  75. sglang/srt/function_call/qwen3_coder_detector.py +1 -1
  76. sglang/srt/function_call/utils.py +96 -5
  77. sglang/srt/grpc/__init__.py +1 -0
  78. sglang/srt/grpc/compile_proto.py +245 -0
  79. sglang/srt/grpc/sglang_scheduler_pb2.py +119 -0
  80. sglang/srt/grpc/sglang_scheduler_pb2.pyi +492 -0
  81. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +327 -0
  82. sglang/srt/layers/activation.py +143 -9
  83. sglang/srt/layers/attention/aiter_backend.py +14 -15
  84. sglang/srt/layers/attention/ascend_backend.py +115 -9
  85. sglang/srt/layers/attention/attention_registry.py +215 -0
  86. sglang/srt/layers/attention/base_attn_backend.py +12 -3
  87. sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
  88. sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
  89. sglang/srt/layers/attention/fla/chunk.py +242 -0
  90. sglang/srt/layers/attention/fla/chunk_delta_h.py +314 -0
  91. sglang/srt/layers/attention/fla/chunk_o.py +178 -0
  92. sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +151 -0
  93. sglang/srt/layers/attention/fla/cumsum.py +300 -0
  94. sglang/srt/layers/attention/fla/fused_recurrent.py +640 -0
  95. sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +232 -0
  96. sglang/srt/layers/attention/fla/index.py +37 -0
  97. sglang/srt/layers/attention/fla/l2norm.py +150 -0
  98. sglang/srt/layers/attention/fla/layernorm_gated.py +343 -0
  99. sglang/srt/layers/attention/fla/op.py +66 -0
  100. sglang/srt/layers/attention/fla/solve_tril.py +465 -0
  101. sglang/srt/layers/attention/fla/utils.py +331 -0
  102. sglang/srt/layers/attention/fla/wy_fast.py +158 -0
  103. sglang/srt/layers/attention/flashattention_backend.py +40 -8
  104. sglang/srt/layers/attention/flashinfer_backend.py +341 -204
  105. sglang/srt/layers/attention/flashinfer_mla_backend.py +28 -28
  106. sglang/srt/layers/attention/flashmla_backend.py +7 -5
  107. sglang/srt/layers/attention/hybrid_attn_backend.py +68 -53
  108. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +708 -0
  109. sglang/srt/layers/attention/intel_amx_backend.py +3 -0
  110. sglang/srt/layers/attention/mamba/causal_conv1d.py +129 -0
  111. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +974 -0
  112. sglang/srt/layers/attention/mamba/mamba.py +577 -0
  113. sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
  114. sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
  115. sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
  116. sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
  117. sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
  118. sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +214 -0
  119. sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +562 -0
  120. sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +646 -0
  121. sglang/srt/layers/attention/mamba/ops/ssd_combined.py +262 -0
  122. sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +264 -0
  123. sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
  124. sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
  125. sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
  126. sglang/srt/layers/attention/nsa/nsa_indexer.py +761 -0
  127. sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
  128. sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
  129. sglang/srt/layers/attention/nsa/transform_index.py +144 -0
  130. sglang/srt/layers/attention/nsa/utils.py +24 -0
  131. sglang/srt/layers/attention/nsa_backend.py +887 -0
  132. sglang/srt/layers/attention/tbo_backend.py +6 -6
  133. sglang/srt/layers/attention/torch_flex_backend.py +325 -0
  134. sglang/srt/layers/attention/torch_native_backend.py +12 -6
  135. sglang/srt/layers/attention/triton_backend.py +57 -7
  136. sglang/srt/layers/attention/trtllm_mha_backend.py +5 -7
  137. sglang/srt/layers/attention/trtllm_mla_backend.py +276 -39
  138. sglang/srt/layers/attention/vision.py +58 -0
  139. sglang/srt/layers/attention/wave_backend.py +4 -4
  140. sglang/srt/layers/attention/wave_ops/decode_attention.py +2 -4
  141. sglang/srt/layers/attention/wave_ops/extend_attention.py +1 -3
  142. sglang/srt/layers/communicator.py +8 -0
  143. sglang/srt/layers/dp_attention.py +41 -2
  144. sglang/srt/layers/elementwise.py +3 -1
  145. sglang/srt/layers/layernorm.py +34 -15
  146. sglang/srt/layers/linear.py +55 -7
  147. sglang/srt/layers/logits_processor.py +180 -18
  148. sglang/srt/layers/modelopt_utils.py +11 -0
  149. sglang/srt/layers/moe/__init__.py +2 -1
  150. sglang/srt/layers/moe/cutlass_w4a8_moe.py +21 -24
  151. sglang/srt/layers/moe/ep_moe/kernels.py +33 -454
  152. sglang/srt/layers/moe/ep_moe/layer.py +248 -333
  153. sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +183 -0
  154. sglang/srt/layers/moe/fused_moe_native.py +5 -3
  155. sglang/srt/layers/moe/fused_moe_triton/configs/{triton_3_4_0/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json } +35 -35
  156. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=352,device_name=NVIDIA_RTX_5880_Ada_Generation,dtype=fp8_w8a8.json +146 -0
  157. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  158. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=512,device_name=NVIDIA_H20.json +146 -0
  159. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  160. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
  161. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
  162. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  163. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
  164. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
  165. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
  166. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  167. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
  168. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -2
  169. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +7 -3
  170. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +23 -20
  171. sglang/srt/layers/moe/fused_moe_triton/layer.py +68 -72
  172. sglang/srt/layers/moe/moe_runner/__init__.py +2 -1
  173. sglang/srt/layers/moe/moe_runner/base.py +274 -1
  174. sglang/srt/layers/moe/moe_runner/deep_gemm.py +304 -0
  175. sglang/srt/layers/moe/moe_runner/runner.py +83 -0
  176. sglang/srt/layers/moe/moe_runner/triton.py +448 -0
  177. sglang/srt/layers/moe/token_dispatcher/__init__.py +16 -4
  178. sglang/srt/layers/moe/token_dispatcher/{base_dispatcher.py → base.py} +67 -17
  179. sglang/srt/layers/moe/token_dispatcher/deepep.py +118 -56
  180. sglang/srt/layers/moe/token_dispatcher/standard.py +44 -2
  181. sglang/srt/layers/moe/topk.py +30 -9
  182. sglang/srt/layers/moe/utils.py +29 -7
  183. sglang/srt/layers/parameter.py +23 -6
  184. sglang/srt/layers/quantization/__init__.py +1 -1
  185. sglang/srt/layers/quantization/awq.py +19 -7
  186. sglang/srt/layers/quantization/base_config.py +11 -6
  187. sglang/srt/layers/quantization/blockwise_int8.py +38 -27
  188. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +1 -0
  189. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +50 -30
  190. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +2 -0
  191. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +13 -1
  192. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
  193. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +2 -10
  194. sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +27 -0
  195. sglang/srt/layers/quantization/fp8.py +155 -60
  196. sglang/srt/layers/quantization/fp8_utils.py +51 -32
  197. sglang/srt/layers/quantization/gptq.py +25 -17
  198. sglang/srt/layers/quantization/modelopt_quant.py +191 -56
  199. sglang/srt/layers/quantization/moe_wna16.py +21 -18
  200. sglang/srt/layers/quantization/mxfp4.py +74 -42
  201. sglang/srt/layers/quantization/quark/quark.py +3 -1
  202. sglang/srt/layers/quantization/quark/quark_moe.py +48 -30
  203. sglang/srt/layers/quantization/unquant.py +135 -47
  204. sglang/srt/layers/quantization/w4afp8.py +28 -33
  205. sglang/srt/layers/quantization/w8a8_fp8.py +35 -20
  206. sglang/srt/layers/quantization/w8a8_int8.py +91 -41
  207. sglang/srt/layers/rotary_embedding.py +78 -31
  208. sglang/srt/layers/sampler.py +213 -21
  209. sglang/srt/layers/utils.py +23 -0
  210. sglang/srt/lora/backend/base_backend.py +50 -8
  211. sglang/srt/lora/backend/chunked_backend.py +348 -0
  212. sglang/srt/lora/backend/triton_backend.py +99 -5
  213. sglang/srt/lora/layers.py +32 -0
  214. sglang/srt/lora/lora.py +8 -3
  215. sglang/srt/lora/lora_manager.py +44 -118
  216. sglang/srt/lora/mem_pool.py +25 -11
  217. sglang/srt/lora/triton_ops/__init__.py +4 -0
  218. sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
  219. sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +174 -0
  220. sglang/srt/lora/utils.py +22 -11
  221. sglang/srt/managers/async_dynamic_batch_tokenizer.py +170 -0
  222. sglang/srt/managers/cache_controller.py +199 -301
  223. sglang/srt/managers/data_parallel_controller.py +115 -80
  224. sglang/srt/managers/detokenizer_manager.py +19 -15
  225. sglang/srt/managers/disagg_service.py +46 -0
  226. sglang/srt/managers/io_struct.py +340 -109
  227. sglang/srt/managers/mm_utils.py +44 -6
  228. sglang/srt/managers/multi_tokenizer_mixin.py +357 -407
  229. sglang/srt/managers/multimodal_processor.py +1 -2
  230. sglang/srt/managers/overlap_utils.py +55 -0
  231. sglang/srt/managers/schedule_batch.py +343 -212
  232. sglang/srt/managers/schedule_policy.py +145 -18
  233. sglang/srt/managers/scheduler.py +653 -273
  234. sglang/srt/managers/scheduler_input_blocker.py +1 -1
  235. sglang/srt/managers/scheduler_metrics_mixin.py +99 -126
  236. sglang/srt/managers/scheduler_output_processor_mixin.py +255 -108
  237. sglang/srt/managers/scheduler_profiler_mixin.py +6 -6
  238. sglang/srt/managers/scheduler_update_weights_mixin.py +7 -0
  239. sglang/srt/managers/tokenizer_communicator_mixin.py +675 -0
  240. sglang/srt/managers/tokenizer_manager.py +579 -674
  241. sglang/srt/managers/tp_worker.py +96 -26
  242. sglang/srt/managers/utils.py +1 -45
  243. sglang/srt/mem_cache/allocator.py +21 -22
  244. sglang/srt/mem_cache/allocator_ascend.py +41 -27
  245. sglang/srt/mem_cache/base_prefix_cache.py +1 -1
  246. sglang/srt/mem_cache/chunk_cache.py +9 -2
  247. sglang/srt/mem_cache/evict_policy.py +23 -0
  248. sglang/srt/mem_cache/hicache_storage.py +43 -24
  249. sglang/srt/mem_cache/hiradix_cache.py +222 -75
  250. sglang/srt/mem_cache/memory_pool.py +651 -80
  251. sglang/srt/mem_cache/memory_pool_host.py +239 -228
  252. sglang/srt/mem_cache/radix_cache.py +227 -73
  253. sglang/srt/mem_cache/radix_cache_cpp.py +11 -8
  254. sglang/srt/mem_cache/storage/__init__.py +10 -0
  255. sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +151 -0
  256. sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +109 -0
  257. sglang/srt/mem_cache/storage/backend_factory.py +223 -0
  258. sglang/srt/mem_cache/storage/eic/eic_storage.py +778 -0
  259. sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
  260. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +164 -0
  261. sglang/srt/mem_cache/storage/hf3fs/{client_hf3fs.py → hf3fs_usrbio_client.py} +5 -1
  262. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +259 -62
  263. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +284 -0
  264. sglang/srt/mem_cache/storage/lmcache/unit_test.py +121 -0
  265. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +166 -17
  266. sglang/srt/mem_cache/swa_radix_cache.py +93 -48
  267. sglang/srt/metrics/collector.py +511 -132
  268. sglang/srt/metrics/func_timer.py +2 -7
  269. sglang/srt/metrics/startup_func_log_and_timer.py +150 -0
  270. sglang/srt/metrics/utils.py +8 -1
  271. sglang/srt/model_executor/cpu_graph_runner.py +640 -0
  272. sglang/srt/model_executor/cuda_graph_runner.py +52 -37
  273. sglang/srt/model_executor/forward_batch_info.py +74 -46
  274. sglang/srt/model_executor/model_runner.py +455 -176
  275. sglang/srt/model_executor/npu_graph_runner.py +12 -5
  276. sglang/srt/model_loader/__init__.py +10 -4
  277. sglang/srt/model_loader/loader.py +319 -10
  278. sglang/srt/model_loader/remote_instance_weight_loader_utils.py +69 -0
  279. sglang/srt/model_loader/weight_utils.py +161 -3
  280. sglang/srt/models/apertus.py +686 -0
  281. sglang/srt/models/bailing_moe.py +820 -217
  282. sglang/srt/models/bailing_moe_nextn.py +168 -0
  283. sglang/srt/models/deepseek_nextn.py +6 -1
  284. sglang/srt/models/deepseek_v2.py +607 -130
  285. sglang/srt/models/dots_ocr.py +173 -0
  286. sglang/srt/models/dots_vlm.py +174 -0
  287. sglang/srt/models/dots_vlm_vit.py +337 -0
  288. sglang/srt/models/ernie4.py +1 -1
  289. sglang/srt/models/falcon_h1.py +578 -0
  290. sglang/srt/models/gemma3_causal.py +0 -2
  291. sglang/srt/models/gemma3_mm.py +17 -1
  292. sglang/srt/models/gemma3n_mm.py +2 -2
  293. sglang/srt/models/glm4_moe.py +4 -4
  294. sglang/srt/models/glm4_moe_nextn.py +2 -2
  295. sglang/srt/models/glm4v.py +5 -3
  296. sglang/srt/models/glm4v_moe.py +4 -1
  297. sglang/srt/models/gpt_oss.py +8 -31
  298. sglang/srt/models/grok.py +5 -13
  299. sglang/srt/models/kimi_vl_moonvit.py +2 -2
  300. sglang/srt/models/llama.py +4 -0
  301. sglang/srt/models/llama4.py +9 -0
  302. sglang/srt/models/llama_eagle3.py +13 -0
  303. sglang/srt/models/longcat_flash.py +3 -3
  304. sglang/srt/models/longcat_flash_nextn.py +1 -1
  305. sglang/srt/models/mixtral.py +1 -3
  306. sglang/srt/models/mllama4.py +50 -4
  307. sglang/srt/models/nemotron_h.py +514 -0
  308. sglang/srt/models/opt.py +637 -0
  309. sglang/srt/models/qwen2_5_vl.py +29 -5
  310. sglang/srt/models/qwen2_audio.py +1 -1
  311. sglang/srt/models/qwen2_moe.py +120 -13
  312. sglang/srt/models/qwen2_vl.py +1 -1
  313. sglang/srt/models/qwen3.py +18 -3
  314. sglang/srt/models/qwen3_moe.py +32 -4
  315. sglang/srt/models/qwen3_next.py +1069 -0
  316. sglang/srt/models/qwen3_next_mtp.py +112 -0
  317. sglang/srt/models/qwen3_vl.py +787 -0
  318. sglang/srt/models/qwen3_vl_moe.py +471 -0
  319. sglang/srt/models/registry.py +15 -3
  320. sglang/srt/models/sarashina2_vision.py +269 -0
  321. sglang/srt/models/solar.py +505 -0
  322. sglang/srt/models/starcoder2.py +357 -0
  323. sglang/srt/models/step3_vl.py +1 -1
  324. sglang/srt/models/torch_native_llama.py +9 -2
  325. sglang/srt/models/utils.py +55 -0
  326. sglang/srt/multimodal/processors/base_processor.py +15 -7
  327. sglang/srt/multimodal/processors/dots_vlm.py +98 -0
  328. sglang/srt/multimodal/processors/glm4v.py +9 -9
  329. sglang/srt/multimodal/processors/internvl.py +153 -129
  330. sglang/srt/multimodal/processors/qwen_vl.py +23 -6
  331. sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
  332. sglang/srt/offloader.py +27 -3
  333. sglang/srt/parser/jinja_template_utils.py +6 -0
  334. sglang/srt/sampling/sampling_batch_info.py +49 -26
  335. sglang/srt/sampling/sampling_params.py +7 -0
  336. sglang/srt/server_args.py +1051 -285
  337. sglang/srt/server_args_config_parser.py +146 -0
  338. sglang/srt/single_batch_overlap.py +151 -0
  339. sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
  340. sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
  341. sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
  342. sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
  343. sglang/srt/speculative/cpp_ngram/param.h +125 -0
  344. sglang/srt/speculative/cpp_ngram/queue.h +71 -0
  345. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +7 -1
  346. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +13 -2
  347. sglang/srt/speculative/{eagle_utils.py → eagle_info.py} +207 -757
  348. sglang/srt/speculative/eagle_worker.py +98 -29
  349. sglang/srt/speculative/ngram_info.py +428 -0
  350. sglang/srt/speculative/ngram_worker.py +246 -0
  351. sglang/srt/speculative/spec_info.py +52 -0
  352. sglang/srt/speculative/spec_utils.py +605 -0
  353. sglang/srt/speculative/standalone_worker.py +109 -0
  354. sglang/srt/torch_memory_saver_adapter.py +5 -7
  355. sglang/srt/tracing/trace.py +578 -0
  356. sglang/srt/two_batch_overlap.py +9 -5
  357. sglang/srt/utils/__init__.py +2 -0
  358. sglang/srt/{utils.py → utils/common.py} +451 -77
  359. sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +55 -5
  360. sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
  361. sglang/srt/utils/rpd_utils.py +452 -0
  362. sglang/srt/utils/slow_rank_detector.py +71 -0
  363. sglang/srt/warmup.py +8 -4
  364. sglang/srt/weight_sync/utils.py +2 -2
  365. sglang/test/attention/test_trtllm_mla_backend.py +169 -5
  366. sglang/test/get_logits_ut.py +57 -0
  367. sglang/test/longbench_v2/__init__.py +1 -0
  368. sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
  369. sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
  370. sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
  371. sglang/test/run_eval.py +119 -11
  372. sglang/test/runners.py +5 -1
  373. sglang/test/simple_eval_common.py +5 -2
  374. sglang/test/simple_eval_longbench_v2.py +332 -0
  375. sglang/test/simple_eval_mmmu_vlm.py +441 -0
  376. sglang/test/test_block_fp8.py +2 -2
  377. sglang/test/test_cutlass_moe.py +24 -6
  378. sglang/test/test_cutlass_w4a8_moe.py +9 -19
  379. sglang/test/test_deterministic.py +313 -0
  380. sglang/test/test_deterministic_utils.py +81 -0
  381. sglang/test/test_disaggregation_utils.py +140 -0
  382. sglang/test/test_fp4_moe.py +370 -1
  383. sglang/test/test_programs.py +1 -1
  384. sglang/test/test_utils.py +407 -8
  385. sglang/utils.py +21 -1
  386. sglang/version.py +1 -1
  387. {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/METADATA +69 -124
  388. {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/RECORD +392 -251
  389. sglang/srt/disaggregation/launch_lb.py +0 -118
  390. sglang/srt/managers/tp_worker_overlap_thread.py +0 -296
  391. sglang/srt/mem_cache/lora_radix_cache.py +0 -421
  392. sglang/test/test_block_fp8_ep.py +0 -358
  393. /sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
  394. {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/WHEEL +0 -0
  395. {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/licenses/LICENSE +0 -0
  396. {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/top_level.txt +0 -0
sglang/bench_one_batch.py CHANGED
@@ -60,7 +60,6 @@ import torch.distributed as dist
60
60
  from sglang.srt.configs.model_config import ModelConfig
61
61
  from sglang.srt.distributed.parallel_state import destroy_distributed_environment
62
62
  from sglang.srt.entrypoints.engine import _set_envs_and_config
63
- from sglang.srt.hf_transformers_utils import get_tokenizer
64
63
  from sglang.srt.layers.moe import initialize_moe_config
65
64
  from sglang.srt.managers.schedule_batch import Req, ScheduleBatch
66
65
  from sglang.srt.managers.scheduler import Scheduler
@@ -78,6 +77,7 @@ from sglang.srt.utils import (
78
77
  set_gpu_proc_affinity,
79
78
  suppress_other_loggers,
80
79
  )
80
+ from sglang.srt.utils.hf_transformers_utils import get_tokenizer
81
81
 
82
82
 
83
83
  @dataclasses.dataclass
@@ -204,7 +204,6 @@ def prepare_inputs_for_correctness_test(bench_args, tokenizer, custom_prompts):
204
204
  origin_input_ids=tmp_input_ids,
205
205
  sampling_params=sampling_params,
206
206
  )
207
- req.prefix_indices = []
208
207
  req.fill_ids = req.origin_input_ids
209
208
  req.extend_input_len = len(req.fill_ids) - len(req.prefix_indices)
210
209
  req.logprob_start_len = len(req.origin_input_ids) - 1
@@ -248,7 +247,6 @@ def prepare_synthetic_inputs_for_latency_test(
248
247
  origin_input_ids=list(input_ids[i]),
249
248
  sampling_params=sampling_params,
250
249
  )
251
- req.prefix_indices = []
252
250
  req.fill_ids = req.origin_input_ids
253
251
  req.extend_input_len = len(req.fill_ids) - len(req.prefix_indices)
254
252
  req.logprob_start_len = len(req.origin_input_ids) - 1
@@ -443,11 +441,9 @@ def latency_test_run_once(
443
441
 
444
442
  if profile:
445
443
  profiler.stop()
446
- profile_filename = f"{profile_filename_prefix}_batch{batch_size}_input{input_len}_output{output_len}_prefill.trace.json.gz"
447
- _save_profile_trace_results(profiler, profile_filename)
448
- rank_print(
449
- f"torch profiler chrome trace for prefill saved to {profile_filename}"
450
- )
444
+ trace_filename = f"{profile_filename_prefix}_batch{batch_size}_input{input_len}_output{output_len}_prefill.trace.json.gz"
445
+ _save_profile_trace_results(profiler, trace_filename)
446
+ rank_print(f"torch profiler chrome trace for prefill saved to {trace_filename}")
451
447
 
452
448
  # Decode
453
449
  decode_latencies = []
@@ -479,10 +475,10 @@ def latency_test_run_once(
479
475
 
480
476
  if profile and i == output_len / 2:
481
477
  profiler.stop()
482
- profile_filename = f"{profile_filename_prefix}_batch{batch_size}_input{input_len}_output{output_len}_decode.trace.json.gz"
483
- _save_profile_trace_results(profiler, profile_filename)
478
+ trace_filename = f"{profile_filename_prefix}_batch{batch_size}_input{input_len}_output{output_len}_decode.trace.json.gz"
479
+ _save_profile_trace_results(profiler, trace_filename)
484
480
  rank_print(
485
- f"torch profiler chrome trace for decoding 1 token saved to {profile_filename}"
481
+ f"torch profiler chrome trace for decoding 1 token saved to {trace_filename}"
486
482
  )
487
483
 
488
484
  # Record decode timing from 2nd output
sglang/bench_one_batch_server.py CHANGED
@@ -9,6 +9,7 @@ python3 -m sglang.bench_one_batch_server --model meta-llama/Meta-Llama-3.1-8B --
9
9
 
10
10
  python3 -m sglang.bench_one_batch_server --model None --base-url http://localhost:30000 --batch-size 16 --input-len 1024 --output-len 8
11
11
  python3 -m sglang.bench_one_batch_server --model None --base-url http://localhost:30000 --batch-size 16 --input-len 1024 --output-len 8 --show-report --profile --profile-by-stage
12
+ python3 -m sglang.bench_one_batch_server --model None --base-url http://localhost:30000 --batch-size 16 --input-len 1024 --output-len 8 --output-path results.json --profile
12
13
  """
13
14
 
14
15
  import argparse
@@ -17,12 +18,19 @@ import itertools
17
18
  import json
18
19
  import multiprocessing
19
20
  import os
21
+ import random
20
22
  import time
21
- from typing import List, Tuple
23
+ from typing import List, Optional, Tuple
22
24
 
25
+ import numpy as np
23
26
  import requests
27
+ from pydantic import BaseModel
24
28
 
25
- from sglang.bench_serving import get_tokenizer, sample_random_requests
29
+ from sglang.bench_serving import (
30
+ get_tokenizer,
31
+ sample_mmmu_requests,
32
+ sample_random_requests,
33
+ )
26
34
  from sglang.profiler import run_profile
27
35
  from sglang.srt.entrypoints.http_server import launch_server
28
36
  from sglang.srt.server_args import ServerArgs
@@ -30,9 +38,112 @@ from sglang.srt.utils import is_blackwell, kill_process_tree
30
38
  from sglang.test.test_utils import is_in_ci, write_github_step_summary
31
39
 
32
40
 
41
+ class ProfileLinks(BaseModel):
42
+ """Pydantic model for profile trace links."""
43
+
44
+ extend: Optional[str] = None
45
+ decode: Optional[str] = None
46
+
47
+
48
+ class BenchmarkResult(BaseModel):
49
+ """Pydantic model for benchmark results table data, for a single isl and osl"""
50
+
51
+ model_path: str
52
+ run_name: str
53
+ batch_size: int
54
+ input_len: int
55
+ output_len: int
56
+ latency: float
57
+ ttft: float
58
+ input_throughput: float
59
+ output_throughput: float
60
+ overall_throughput: float
61
+ last_gen_throughput: float
62
+ acc_length: Optional[float] = None
63
+ profile_links: Optional[ProfileLinks] = None
64
+
65
+ @staticmethod
66
+ def help_str() -> str:
67
+ return f"""
68
+ Note: To view the traces through perfetto-ui, please:
69
+ 1. open with Google Chrome
70
+ 2. allow popup
71
+ """
72
+
73
+ def to_markdown_row(
74
+ self, trace_dir, base_url: str = "", relay_base: str = ""
75
+ ) -> str:
76
+ """Convert this benchmark result to a markdown table row."""
77
+ # Calculate costs (assuming H100 pricing for now)
78
+ hourly_cost_per_gpu = 2 # $2/hour for one H100
79
+ hourly_cost = hourly_cost_per_gpu * 1 # Assuming tp_size = 1 for simplicity
80
+ input_util = 0.7
81
+ accept_length = (
82
+ round(self.acc_length, 2) if self.acc_length is not None else "n/a"
83
+ )
84
+ itl = 1 / (self.output_throughput / self.batch_size) * 1000
85
+ input_cost = 1e6 / (self.input_throughput * input_util) / 3600 * hourly_cost
86
+ output_cost = 1e6 / self.output_throughput / 3600 * hourly_cost
87
+
88
+ def get_perfetto_relay_link_from_trace_file(trace_file: str):
89
+ import os
90
+ from urllib.parse import quote
91
+
92
+ rel_path = os.path.relpath(trace_file, trace_dir)
93
+ raw_file_link = f"{base_url}/{rel_path}"
94
+ relay_link = (
95
+ f"{relay_base}?src={quote(raw_file_link, safe='')}"
96
+ if relay_base and quote
97
+ else raw_file_link
98
+ )
99
+ return relay_link
100
+
101
+ # Handle profile links
102
+ profile_link = "NA | NA"
103
+ if self.profile_links:
104
+ if self.profile_links.extend or self.profile_links.decode:
105
+ # Create a combined link or use the first available one
106
+ trace_files = [self.profile_links.extend, self.profile_links.decode]
107
+ trace_files_relay_links = [
108
+ f"[trace]({get_perfetto_relay_link_from_trace_file(trace_file)})"
109
+ for trace_file in trace_files
110
+ ]
111
+
112
+ profile_link = " | ".join(trace_files_relay_links)
113
+
114
+ # Build the row
115
+ return f"| {self.batch_size} | {self.input_len} | {self.latency:.2f} | {self.input_throughput:.2f} | {self.output_throughput:.2f} | {accept_length} | {itl:.2f} | {input_cost:.2f} | {output_cost:.2f} | {profile_link} |\n"
116
+
117
+ @classmethod
118
+ def generate_markdown_report(
119
+ cls, trace_dir, results: List["BenchmarkResult"]
120
+ ) -> str:
121
+ """Generate a markdown report from a list of BenchmarkResult object from a single run."""
122
+ import os
123
+
124
+ summary = f"### {results[0].model_path}\n"
125
+
126
+ # summary += (
127
+ # f"Input lens: {result.input_len}. Output lens: {result.output_len}.\n"
128
+ # )
129
+ summary += "| batch size | input len | latency (s) | input throughput (tok/s) | output throughput (tok/s) | acc length | ITL (ms) | input cost ($/1M) | output cost ($/1M) | profile (extend) | profile (decode)|\n"
130
+ summary += "| ---------- | --------- | ----------- | ------------------------- | ------------------------- | ---------- | -------- | ----------------- | ------------------ | --------------- | -------------- |\n"
131
+
132
+ # all results should share the same isl & osl
133
+ for result in results:
134
+ base_url = os.getenv("TRACE_BASE_URL", "").rstrip("/")
135
+ relay_base = os.getenv("PERFETTO_RELAY_URL", "").rstrip("/")
136
+ relay_base = "https://docs.sglang.ai/ci-data/pages/perfetto_relay.html"
137
+ # base_url = "https://github.com/sgl-project/ci-data/traces"
138
+ summary += result.to_markdown_row(trace_dir, base_url, relay_base)
139
+
140
+ return summary
141
+
142
+
33
143
  @dataclasses.dataclass
34
144
  class BenchArgs:
35
145
  run_name: str = "default"
146
+ seed: int = 42
36
147
  batch_size: Tuple[int] = (1,)
37
148
  input_len: Tuple[int] = (1024,)
38
149
  output_len: Tuple[int] = (16,)
@@ -47,10 +158,17 @@ class BenchArgs:
47
158
  profile: bool = False
48
159
  profile_steps: int = 3
49
160
  profile_by_stage: bool = False
161
+ profile_filename_prefix: str = None
162
+ append_to_github_summary: bool = True
163
+ dataset_path: str = ""
164
+ parallel_batch: bool = False
165
+ dataset_name: str = "random"
166
+ output_path: Optional[str] = None
50
167
 
51
168
  @staticmethod
52
169
  def add_cli_args(parser: argparse.ArgumentParser):
53
170
  parser.add_argument("--run-name", type=str, default=BenchArgs.run_name)
171
+ parser.add_argument("--seed", type=int, default=BenchArgs.seed)
54
172
  parser.add_argument(
55
173
  "--batch-size", type=int, nargs="+", default=BenchArgs.batch_size
56
174
  )
@@ -61,6 +179,13 @@ class BenchArgs:
61
179
  "--output-len", type=int, nargs="+", default=BenchArgs.output_len
62
180
  )
63
181
  parser.add_argument("--temperature", type=float, default=BenchArgs.temperature)
182
+ parser.add_argument(
183
+ "--dataset-name",
184
+ type=str,
185
+ default=BenchArgs.dataset_name,
186
+ choices=["mmmu", "random"],
187
+ help="Name of the dataset to benchmark on.",
188
+ )
64
189
  parser.add_argument("--return-logprob", action="store_true")
65
190
  parser.add_argument(
66
191
  "--client-stream-interval",
@@ -83,14 +208,43 @@ class BenchArgs:
83
208
  "--profile-steps", type=int, default=BenchArgs.profile_steps
84
209
  )
85
210
  parser.add_argument("--profile-by-stage", action="store_true")
211
+ parser.add_argument(
212
+ "--dataset-path",
213
+ type=str,
214
+ default=BenchArgs.dataset_path,
215
+ help="Path to the dataset.",
216
+ )
217
+ parser.add_argument("--parallel-batch", action="store_true")
218
+ parser.add_argument(
219
+ "--profile-filename-prefix",
220
+ type=str,
221
+ default=BenchArgs.profile_filename_prefix,
222
+ )
223
+ parser.add_argument(
224
+ "--no-append-to-github-summary",
225
+ action="store_false",
226
+ dest="append_to_github_summary",
227
+ help="Disable appending the output of this run to github ci summary",
228
+ )
229
+ parser.add_argument(
230
+ "--output-path",
231
+ type=str,
232
+ default=BenchArgs.output_path,
233
+ help="Path to save benchmark results as JSON format. If not specified, results will only be saved to result-filename.",
234
+ )
86
235
 
87
236
@classmethod
def from_cli_args(cls, args: argparse.Namespace):
    """Build an instance of this dataclass from parsed command-line arguments.

    Every dataclass field is read from ``args`` by name. The value is cast
    with the type of the field's default; fields whose default is ``None``
    carry no usable type, so their value is passed through untouched.
    """
    none_type = type(None)

    def _coerce(field):
        # Cast with the default value's type, unless that default is None.
        raw = getattr(args, field.name)
        caster = type(field.default)
        return raw if caster is none_type else caster(raw)

    return cls(**{field.name: _coerce(field) for field in dataclasses.fields(cls)})
94
248
 
95
249
 
96
250
  def launch_server_internal(server_args):
@@ -135,21 +289,35 @@ def run_one_case(
135
289
  run_name: str,
136
290
  result_filename: str,
137
291
  tokenizer,
292
+ dataset_name="",
138
293
  profile: bool = False,
139
294
  profile_steps: int = 3,
140
295
  profile_by_stage: bool = False,
296
+ profile_filename_prefix: str = None,
297
+ dataset_path: str = "",
298
+ parallel_batch: bool = False,
141
299
  ):
142
300
  requests.post(url + "/flush_cache")
143
- input_requests = sample_random_requests(
144
- input_len=input_len,
145
- output_len=output_len,
146
- num_prompts=batch_size,
147
- range_ratio=1.0,
148
- tokenizer=tokenizer,
149
- dataset_path="",
150
- random_sample=True,
151
- return_text=False,
152
- )
301
+ # TODO: reuse bench_serving.get_dataset ?
302
+ if dataset_name == "mmmu":
303
+ input_requests = sample_mmmu_requests(
304
+ num_requests=batch_size,
305
+ tokenizer=tokenizer,
306
+ fixed_output_len=output_len,
307
+ apply_chat_template=True,
308
+ random_sample=False,
309
+ )
310
+ elif dataset_name == "random":
311
+ input_requests = sample_random_requests(
312
+ input_len=input_len,
313
+ output_len=output_len,
314
+ num_prompts=batch_size,
315
+ range_ratio=1.0,
316
+ tokenizer=tokenizer,
317
+ dataset_path=dataset_path,
318
+ random_sample=True,
319
+ return_text=False,
320
+ )
153
321
 
154
322
  use_structured_outputs = False
155
323
  if use_structured_outputs:
@@ -166,25 +334,48 @@ def run_one_case(
166
334
 
167
335
  profile_link = None
168
336
  if profile:
337
+ output_dir, profile_name = None, None
338
+ if profile_filename_prefix:
339
+ output_dir = os.path.dirname(profile_filename_prefix)
340
+ profile_name = os.path.basename(profile_filename_prefix)
169
341
  profile_link: str = run_profile(
170
- url, profile_steps, ["CPU", "GPU"], None, None, profile_by_stage
342
+ url,
343
+ profile_steps,
344
+ ["CPU", "GPU"],
345
+ output_dir,
346
+ profile_name,
347
+ profile_by_stage,
171
348
  )
172
349
 
173
350
  tic = time.perf_counter()
351
+
352
+ payload = {
353
+ "sampling_params": {
354
+ "temperature": temperature,
355
+ "max_new_tokens": output_len,
356
+ "ignore_eos": True,
357
+ "json_schema": json_schema,
358
+ "stream_interval": stream_interval,
359
+ },
360
+ "return_logprob": return_logprob,
361
+ "stream": True,
362
+ **({"parallel_batch": parallel_batch} if parallel_batch else {}),
363
+ }
364
+ if dataset_name == "mmmu":
365
+ # vlm
366
+ input_ids = []
367
+ for input_req in input_requests:
368
+ input_ids += [tokenizer.encode(input_req.prompt)]
369
+ payload["image_data"] = [req.image_data for req in input_requests]
370
+
371
+ else:
372
+ input_ids = [req.prompt for req in input_requests]
373
+
374
+ payload["input_ids"] = input_ids
375
+
174
376
  response = requests.post(
175
377
  url + "/generate",
176
- json={
177
- "input_ids": [req.prompt for req in input_requests],
178
- "sampling_params": {
179
- "temperature": temperature,
180
- "max_new_tokens": output_len,
181
- "ignore_eos": True,
182
- "json_schema": json_schema,
183
- "stream_interval": stream_interval,
184
- },
185
- "return_logprob": return_logprob,
186
- "stream": True,
187
- },
378
+ json=payload,
188
379
  stream=True,
189
380
  )
190
381
 
@@ -248,10 +439,100 @@ def run_one_case(
248
439
  overall_throughput,
249
440
  last_gen_throughput,
250
441
  acc_length,
251
- profile_link if profile else None,
442
+ profile_link,
252
443
  )
253
444
 
254
445
 
446
def save_results_as_json(result: List[Tuple], bench_args: BenchArgs, model: str):
    """Save benchmark results to ``bench_args.output_path`` as a JSON array.

    Each entry of ``result`` is a 9-tuple ``(batch_size, latency, ttft,
    input_throughput, output_throughput, overall_throughput,
    last_gen_throughput, acc_length, profile_link)``. Entries are paired
    positionally with the ``(batch_size, input_len, output_len)`` grid built
    from ``bench_args`` to recover the input and output lengths; entries
    beyond the shorter of the two sequences are ignored.

    Args:
        result: benchmark tuples, in the same order as the parameter grid.
        bench_args: supplies the parameter grid, ``run_name`` and
            ``output_path``.
        model: model path recorded in every JSON entry.
    """
    # Same iteration order as the grid the results are paired against.
    param_grid = itertools.product(
        bench_args.batch_size, bench_args.input_len, bench_args.output_len
    )

    json_results = []
    for (
        batch_size,
        latency,
        ttft,
        input_throughput,
        output_throughput,
        overall_throughput,
        last_gen_throughput,
        acc_length,
        profile_link,
    ), (_, input_len, output_len) in zip(result, param_grid):
        # Parse profile links if available
        profile_links = (
            parse_profile_links(profile_link, batch_size, input_len, output_len)
            if profile_link
            else None
        )

        benchmark_result = BenchmarkResult(
            model_path=model,
            run_name=bench_args.run_name,
            batch_size=batch_size,
            input_len=input_len,
            output_len=output_len,
            latency=latency,
            ttft=ttft,
            input_throughput=input_throughput,
            output_throughput=output_throughput,
            overall_throughput=overall_throughput,
            last_gen_throughput=last_gen_throughput,
            acc_length=acc_length,
            profile_links=profile_links,
        )
        json_results.append(benchmark_result.model_dump())

    # Create the parent directory first so a finished benchmark is not lost
    # to a missing output folder.
    output_dir = os.path.dirname(bench_args.output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Save to JSON file
    with open(bench_args.output_path, "w", encoding="utf-8") as f:
        json.dump(json_results, f, indent=2, ensure_ascii=False)

    print(f"Results saved as JSON to {bench_args.output_path}")
500
+
501
+
502
def parse_profile_links(
    profile_dir: str, batch_size: int, input_len: int, output_len: int
) -> "Optional[ProfileLinks]":
    """Locate the extend (prefill) and decode trace files inside ``profile_dir``.

    Only files ending in ``.trace.json`` or ``.trace.json.gz`` are considered.
    A file is an extend trace when its lower-cased name contains "extend" or
    "prefill", and a decode trace when it contains "decode". Traces whose name
    carries this run's ``_batch{batch_size}_input{input_len}_output{output_len}_``
    tag are preferred; a trace without the tag is used only when no tagged
    trace exists for that stage.

    Args:
        profile_dir: directory to scan.
        batch_size: batch size of the run, used to build the name tag.
        input_len: input length of the run, used to build the name tag.
        output_len: output length of the run, used to build the name tag.

    Returns:
        A ProfileLinks holding the paths found (either may be None), or None
        when ``profile_dir`` is empty, is not a directory, or contains no
        matching trace file.
    """
    # isdir rather than exists: a regular file would make os.listdir raise.
    if not profile_dir or not os.path.isdir(profile_dir):
        return None

    run_tag = f"_batch{batch_size}_input{input_len}_output{output_len}_"
    tagged = {}
    untagged = {}

    # os.listdir returns names in arbitrary order; sort so the pick is stable.
    for name in sorted(os.listdir(profile_dir)):
        if not name.endswith((".trace.json.gz", ".trace.json")):
            continue

        lowered = name.lower()
        if "extend" in lowered or "prefill" in lowered:
            stage = "extend"
        elif "decode" in lowered:
            stage = "decode"
        else:
            continue

        # Keep the first candidate per stage, separately for tagged/untagged.
        bucket = tagged if run_tag in name else untagged
        bucket.setdefault(stage, os.path.join(profile_dir, name))

    extend_link = tagged.get("extend") or untagged.get("extend")
    decode_link = tagged.get("decode") or untagged.get("decode")

    if extend_link or decode_link:
        return ProfileLinks(extend=extend_link, decode=decode_link)

    return None
534
+
535
+
255
536
  def get_report_summary(
256
537
  result: List[Tuple], server_args: ServerArgs, bench_args: BenchArgs
257
538
  ):
@@ -342,9 +623,12 @@ def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
342
623
  return_logprob=bench_args.return_logprob,
343
624
  stream_interval=bench_args.client_stream_interval,
344
625
  input_len_step_percentage=bench_args.input_len_step_percentage,
626
+ dataset_name=bench_args.dataset_name,
345
627
  run_name="",
346
628
  result_filename="",
347
629
  tokenizer=tokenizer,
630
+ dataset_path=bench_args.dataset_path,
631
+ parallel_batch=bench_args.parallel_batch,
348
632
  )
349
633
  print("=" * 8 + " Warmup End " + "=" * 8 + "\n")
350
634
 
@@ -366,8 +650,12 @@ def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
366
650
  stream_interval=bench_args.client_stream_interval,
367
651
  input_len_step_percentage=bench_args.input_len_step_percentage,
368
652
  run_name=bench_args.run_name,
653
+ dataset_name=bench_args.dataset_name,
369
654
  result_filename=bench_args.result_filename,
370
655
  tokenizer=tokenizer,
656
+ dataset_path=bench_args.dataset_path,
657
+ parallel_batch=bench_args.parallel_batch,
658
+ profile_filename_prefix=bench_args.profile_filename_prefix,
371
659
  )
372
660
  )
373
661
 
@@ -390,9 +678,13 @@ def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
390
678
  run_name=bench_args.run_name,
391
679
  result_filename=bench_args.result_filename,
392
680
  tokenizer=tokenizer,
681
+ dataset_name=bench_args.dataset_name,
393
682
  profile=bench_args.profile,
394
683
  profile_steps=bench_args.profile_steps,
395
684
  profile_by_stage=bench_args.profile_by_stage,
685
+ dataset_path=bench_args.dataset_path,
686
+ parallel_batch=bench_args.parallel_batch,
687
+ profile_filename_prefix=bench_args.profile_filename_prefix,
396
688
  )[-1],
397
689
  )
398
690
  )
@@ -405,13 +697,16 @@ def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
405
697
 
406
698
  print(f"\nResults are saved to {bench_args.result_filename}")
407
699
 
700
+ # Save results as JSON if output_path is specified
701
+ if bench_args.output_path:
702
+ save_results_as_json(result, bench_args, model=server_args.model_path)
703
+
408
704
  if not bench_args.show_report:
409
705
  return
410
706
 
411
707
  summary = get_report_summary(result, server_args, bench_args)
412
- print(summary)
413
708
 
414
- if is_in_ci():
709
+ if is_in_ci() and bench_args.append_to_github_summary:
415
710
  write_github_step_summary(summary)
416
711
 
417
712
 
@@ -420,6 +715,10 @@ def main():
420
715
  ServerArgs.add_cli_args(parser)
421
716
  BenchArgs.add_cli_args(parser)
422
717
  args = parser.parse_args()
718
+
719
+ random.seed(args.seed)
720
+ np.random.seed(args.seed)
721
+
423
722
  server_args = ServerArgs.from_cli_args(args)
424
723
  bench_args = BenchArgs.from_cli_args(args)
425
724