sglang 0.5.3rc0__py3-none-any.whl → 0.5.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (482) hide show
  1. sglang/bench_one_batch.py +54 -37
  2. sglang/bench_one_batch_server.py +340 -34
  3. sglang/bench_serving.py +340 -159
  4. sglang/check_env.py +1 -1
  5. sglang/compile_deep_gemm.py +6 -2
  6. sglang/global_config.py +1 -25
  7. sglang/lang/api.py +6 -0
  8. sglang/lang/backend/runtime_endpoint.py +1 -1
  9. sglang/lang/interpreter.py +1 -0
  10. sglang/lang/ir.py +13 -0
  11. sglang/launch_server.py +9 -2
  12. sglang/profiler.py +20 -3
  13. sglang/srt/_custom_ops.py +1 -1
  14. sglang/srt/batch_invariant_ops/__init__.py +27 -0
  15. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +547 -0
  16. sglang/srt/checkpoint_engine/checkpoint_engine_worker.py +142 -0
  17. sglang/srt/compilation/backend.py +437 -0
  18. sglang/srt/compilation/compilation_config.py +20 -0
  19. sglang/srt/compilation/compilation_counter.py +47 -0
  20. sglang/srt/compilation/compile.py +210 -0
  21. sglang/srt/compilation/compiler_interface.py +503 -0
  22. sglang/srt/compilation/cuda_piecewise_backend.py +228 -0
  23. sglang/srt/compilation/fix_functionalization.py +134 -0
  24. sglang/srt/compilation/fx_utils.py +83 -0
  25. sglang/srt/compilation/inductor_pass.py +140 -0
  26. sglang/srt/compilation/pass_manager.py +66 -0
  27. sglang/srt/compilation/piecewise_context_manager.py +40 -0
  28. sglang/srt/compilation/weak_ref_tensor_jit.py +16 -0
  29. sglang/srt/configs/__init__.py +8 -0
  30. sglang/srt/configs/deepseek_ocr.py +262 -0
  31. sglang/srt/configs/deepseekvl2.py +194 -96
  32. sglang/srt/configs/dots_ocr.py +64 -0
  33. sglang/srt/configs/dots_vlm.py +2 -7
  34. sglang/srt/configs/falcon_h1.py +309 -0
  35. sglang/srt/configs/load_config.py +33 -2
  36. sglang/srt/configs/mamba_utils.py +117 -0
  37. sglang/srt/configs/model_config.py +284 -118
  38. sglang/srt/configs/modelopt_config.py +30 -0
  39. sglang/srt/configs/nemotron_h.py +286 -0
  40. sglang/srt/configs/olmo3.py +105 -0
  41. sglang/srt/configs/points_v15_chat.py +29 -0
  42. sglang/srt/configs/qwen3_next.py +11 -47
  43. sglang/srt/configs/qwen3_omni.py +613 -0
  44. sglang/srt/configs/qwen3_vl.py +576 -0
  45. sglang/srt/connector/remote_instance.py +1 -1
  46. sglang/srt/constrained/base_grammar_backend.py +6 -1
  47. sglang/srt/constrained/llguidance_backend.py +5 -0
  48. sglang/srt/constrained/outlines_backend.py +1 -1
  49. sglang/srt/constrained/outlines_jump_forward.py +1 -1
  50. sglang/srt/constrained/reasoner_grammar_backend.py +9 -6
  51. sglang/srt/constrained/utils.py +12 -0
  52. sglang/srt/constrained/xgrammar_backend.py +26 -15
  53. sglang/srt/debug_utils/dumper.py +10 -3
  54. sglang/srt/disaggregation/ascend/conn.py +2 -2
  55. sglang/srt/disaggregation/ascend/transfer_engine.py +48 -10
  56. sglang/srt/disaggregation/base/conn.py +17 -4
  57. sglang/srt/disaggregation/common/conn.py +268 -98
  58. sglang/srt/disaggregation/decode.py +172 -39
  59. sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
  60. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +25 -16
  61. sglang/srt/disaggregation/fake/conn.py +11 -3
  62. sglang/srt/disaggregation/mooncake/conn.py +203 -555
  63. sglang/srt/disaggregation/nixl/conn.py +217 -63
  64. sglang/srt/disaggregation/prefill.py +113 -270
  65. sglang/srt/disaggregation/utils.py +36 -5
  66. sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
  67. sglang/srt/distributed/device_communicators/custom_all_reduce.py +6 -6
  68. sglang/srt/distributed/device_communicators/pymscclpp.py +2 -2
  69. sglang/srt/distributed/device_communicators/pynccl.py +24 -12
  70. sglang/srt/distributed/device_communicators/pynccl_allocator.py +2 -2
  71. sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
  72. sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
  73. sglang/srt/distributed/naive_distributed.py +5 -4
  74. sglang/srt/distributed/parallel_state.py +203 -97
  75. sglang/srt/elastic_ep/elastic_ep.py +74 -0
  76. sglang/srt/entrypoints/context.py +3 -2
  77. sglang/srt/entrypoints/engine.py +85 -65
  78. sglang/srt/entrypoints/grpc_server.py +632 -305
  79. sglang/srt/entrypoints/harmony_utils.py +2 -2
  80. sglang/srt/entrypoints/http_server.py +169 -17
  81. sglang/srt/entrypoints/http_server_engine.py +1 -7
  82. sglang/srt/entrypoints/openai/protocol.py +327 -34
  83. sglang/srt/entrypoints/openai/serving_base.py +74 -8
  84. sglang/srt/entrypoints/openai/serving_chat.py +202 -118
  85. sglang/srt/entrypoints/openai/serving_classify.py +204 -0
  86. sglang/srt/entrypoints/openai/serving_completions.py +20 -4
  87. sglang/srt/entrypoints/openai/serving_embedding.py +1 -0
  88. sglang/srt/entrypoints/openai/serving_responses.py +47 -2
  89. sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
  90. sglang/srt/environ.py +323 -0
  91. sglang/srt/eplb/eplb_algorithms/__init__.py +18 -1
  92. sglang/srt/eplb/eplb_algorithms/deepseek.py +0 -2
  93. sglang/srt/eplb/eplb_algorithms/elasticity_aware.py +87 -0
  94. sglang/srt/eplb/expert_distribution.py +3 -4
  95. sglang/srt/eplb/expert_location.py +30 -5
  96. sglang/srt/eplb/expert_location_dispatch.py +2 -2
  97. sglang/srt/eplb/expert_location_updater.py +2 -2
  98. sglang/srt/function_call/base_format_detector.py +17 -18
  99. sglang/srt/function_call/function_call_parser.py +21 -16
  100. sglang/srt/function_call/glm4_moe_detector.py +4 -8
  101. sglang/srt/function_call/gpt_oss_detector.py +24 -1
  102. sglang/srt/function_call/json_array_parser.py +61 -0
  103. sglang/srt/function_call/kimik2_detector.py +17 -4
  104. sglang/srt/function_call/utils.py +98 -7
  105. sglang/srt/grpc/compile_proto.py +245 -0
  106. sglang/srt/grpc/grpc_request_manager.py +915 -0
  107. sglang/srt/grpc/health_servicer.py +189 -0
  108. sglang/srt/grpc/scheduler_launcher.py +181 -0
  109. sglang/srt/grpc/sglang_scheduler_pb2.py +81 -68
  110. sglang/srt/grpc/sglang_scheduler_pb2.pyi +124 -61
  111. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +92 -1
  112. sglang/srt/layers/activation.py +11 -7
  113. sglang/srt/layers/attention/aiter_backend.py +17 -18
  114. sglang/srt/layers/attention/ascend_backend.py +125 -10
  115. sglang/srt/layers/attention/attention_registry.py +226 -0
  116. sglang/srt/layers/attention/base_attn_backend.py +32 -4
  117. sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
  118. sglang/srt/layers/attention/double_sparsity_backend.py +2 -2
  119. sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
  120. sglang/srt/layers/attention/fla/chunk.py +0 -1
  121. sglang/srt/layers/attention/fla/chunk_o.py +1 -1
  122. sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +2 -2
  123. sglang/srt/layers/attention/fla/fused_recurrent.py +4 -4
  124. sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +2 -2
  125. sglang/srt/layers/attention/fla/index.py +0 -2
  126. sglang/srt/layers/attention/fla/layernorm_gated.py +50 -32
  127. sglang/srt/layers/attention/fla/utils.py +0 -3
  128. sglang/srt/layers/attention/fla/wy_fast.py +0 -2
  129. sglang/srt/layers/attention/flashattention_backend.py +52 -15
  130. sglang/srt/layers/attention/flashinfer_backend.py +357 -212
  131. sglang/srt/layers/attention/flashinfer_mla_backend.py +31 -33
  132. sglang/srt/layers/attention/flashmla_backend.py +9 -7
  133. sglang/srt/layers/attention/hybrid_attn_backend.py +12 -4
  134. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +236 -133
  135. sglang/srt/layers/attention/intel_amx_backend.py +1 -1
  136. sglang/srt/layers/attention/mamba/causal_conv1d.py +2 -1
  137. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +24 -103
  138. sglang/srt/layers/attention/mamba/mamba.py +514 -1
  139. sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
  140. sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
  141. sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
  142. sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
  143. sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
  144. sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +214 -0
  145. sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +562 -0
  146. sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +646 -0
  147. sglang/srt/layers/attention/mamba/ops/ssd_combined.py +261 -0
  148. sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +264 -0
  149. sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
  150. sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
  151. sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
  152. sglang/srt/layers/attention/nsa/nsa_indexer.py +718 -0
  153. sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
  154. sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
  155. sglang/srt/layers/attention/nsa/transform_index.py +144 -0
  156. sglang/srt/layers/attention/nsa/triton_kernel.py +136 -0
  157. sglang/srt/layers/attention/nsa/utils.py +23 -0
  158. sglang/srt/layers/attention/nsa_backend.py +1201 -0
  159. sglang/srt/layers/attention/tbo_backend.py +6 -6
  160. sglang/srt/layers/attention/torch_flex_backend.py +325 -0
  161. sglang/srt/layers/attention/triton_backend.py +249 -42
  162. sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +2 -2
  163. sglang/srt/layers/attention/triton_ops/extend_attention.py +539 -44
  164. sglang/srt/layers/attention/trtllm_mha_backend.py +7 -9
  165. sglang/srt/layers/attention/trtllm_mla_backend.py +523 -48
  166. sglang/srt/layers/attention/utils.py +11 -7
  167. sglang/srt/layers/attention/vision.py +61 -3
  168. sglang/srt/layers/attention/wave_backend.py +4 -4
  169. sglang/srt/layers/attention/xpu_backend.py +1028 -0
  170. sglang/srt/layers/communicator.py +19 -7
  171. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/compile_utils.py +4 -8
  172. sglang/srt/layers/deep_gemm_wrapper/configurer.py +25 -0
  173. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/entrypoint.py +3 -3
  174. sglang/srt/layers/dp_attention.py +28 -1
  175. sglang/srt/layers/elementwise.py +3 -1
  176. sglang/srt/layers/layernorm.py +47 -15
  177. sglang/srt/layers/linear.py +30 -5
  178. sglang/srt/layers/logits_processor.py +161 -18
  179. sglang/srt/layers/modelopt_utils.py +11 -0
  180. sglang/srt/layers/moe/cutlass_moe.py +0 -2
  181. sglang/srt/layers/moe/cutlass_w4a8_moe.py +213 -21
  182. sglang/srt/layers/moe/ep_moe/kernels.py +36 -458
  183. sglang/srt/layers/moe/ep_moe/layer.py +243 -448
  184. sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +52 -25
  185. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  186. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_B200.json +146 -0
  187. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  188. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  189. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
  190. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +17 -5
  191. sglang/srt/layers/moe/fused_moe_triton/layer.py +86 -81
  192. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +18 -42
  193. sglang/srt/layers/moe/moe_runner/deep_gemm.py +304 -0
  194. sglang/srt/layers/moe/moe_runner/runner.py +3 -0
  195. sglang/srt/layers/moe/moe_runner/triton.py +3 -1
  196. sglang/srt/layers/moe/rocm_moe_utils.py +0 -1
  197. sglang/srt/layers/moe/router.py +51 -15
  198. sglang/srt/layers/moe/token_dispatcher/__init__.py +10 -0
  199. sglang/srt/layers/moe/token_dispatcher/base.py +1 -1
  200. sglang/srt/layers/moe/token_dispatcher/deepep.py +177 -106
  201. sglang/srt/layers/moe/token_dispatcher/mooncake.py +386 -0
  202. sglang/srt/layers/moe/token_dispatcher/standard.py +46 -0
  203. sglang/srt/layers/moe/topk.py +3 -2
  204. sglang/srt/layers/moe/utils.py +27 -1
  205. sglang/srt/layers/parameter.py +23 -6
  206. sglang/srt/layers/quantization/__init__.py +2 -53
  207. sglang/srt/layers/quantization/awq.py +183 -6
  208. sglang/srt/layers/quantization/awq_triton.py +29 -0
  209. sglang/srt/layers/quantization/base_config.py +20 -1
  210. sglang/srt/layers/quantization/compressed_tensors/__init__.py +7 -0
  211. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +21 -49
  212. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +421 -70
  213. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +5 -0
  214. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +4 -22
  215. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
  216. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +339 -0
  217. sglang/srt/layers/quantization/fp8.py +86 -20
  218. sglang/srt/layers/quantization/fp8_kernel.py +55 -10
  219. sglang/srt/layers/quantization/fp8_utils.py +43 -15
  220. sglang/srt/layers/quantization/fpgemm_fp8.py +2 -3
  221. sglang/srt/layers/quantization/gptq.py +0 -1
  222. sglang/srt/layers/quantization/int8_kernel.py +18 -2
  223. sglang/srt/layers/quantization/marlin_utils.py +12 -0
  224. sglang/srt/layers/quantization/modelopt_quant.py +141 -81
  225. sglang/srt/layers/quantization/mxfp4.py +17 -34
  226. sglang/srt/layers/quantization/petit.py +1 -1
  227. sglang/srt/layers/quantization/quark/quark.py +3 -1
  228. sglang/srt/layers/quantization/quark/quark_moe.py +18 -5
  229. sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +0 -7
  230. sglang/srt/layers/quantization/unquant.py +1 -4
  231. sglang/srt/layers/quantization/utils.py +0 -1
  232. sglang/srt/layers/quantization/w4afp8.py +51 -24
  233. sglang/srt/layers/quantization/w8a8_int8.py +45 -27
  234. sglang/srt/layers/radix_attention.py +59 -9
  235. sglang/srt/layers/rotary_embedding.py +750 -46
  236. sglang/srt/layers/sampler.py +84 -16
  237. sglang/srt/layers/sparse_pooler.py +98 -0
  238. sglang/srt/layers/utils.py +23 -1
  239. sglang/srt/layers/vocab_parallel_embedding.py +4 -1
  240. sglang/srt/lora/backend/base_backend.py +3 -3
  241. sglang/srt/lora/backend/chunked_backend.py +348 -0
  242. sglang/srt/lora/backend/triton_backend.py +9 -4
  243. sglang/srt/lora/eviction_policy.py +139 -0
  244. sglang/srt/lora/lora.py +7 -5
  245. sglang/srt/lora/lora_manager.py +33 -7
  246. sglang/srt/lora/lora_registry.py +1 -1
  247. sglang/srt/lora/mem_pool.py +41 -17
  248. sglang/srt/lora/triton_ops/__init__.py +4 -0
  249. sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
  250. sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +176 -0
  251. sglang/srt/lora/utils.py +7 -5
  252. sglang/srt/managers/cache_controller.py +83 -152
  253. sglang/srt/managers/data_parallel_controller.py +156 -87
  254. sglang/srt/managers/detokenizer_manager.py +51 -24
  255. sglang/srt/managers/io_struct.py +223 -129
  256. sglang/srt/managers/mm_utils.py +49 -10
  257. sglang/srt/managers/multi_tokenizer_mixin.py +83 -98
  258. sglang/srt/managers/multimodal_processor.py +1 -2
  259. sglang/srt/managers/overlap_utils.py +130 -0
  260. sglang/srt/managers/schedule_batch.py +340 -529
  261. sglang/srt/managers/schedule_policy.py +158 -18
  262. sglang/srt/managers/scheduler.py +665 -620
  263. sglang/srt/managers/scheduler_input_blocker.py +1 -1
  264. sglang/srt/managers/scheduler_metrics_mixin.py +150 -131
  265. sglang/srt/managers/scheduler_output_processor_mixin.py +337 -122
  266. sglang/srt/managers/scheduler_pp_mixin.py +341 -0
  267. sglang/srt/managers/scheduler_profiler_mixin.py +62 -15
  268. sglang/srt/managers/scheduler_runtime_checker_mixin.py +217 -0
  269. sglang/srt/managers/scheduler_update_weights_mixin.py +40 -14
  270. sglang/srt/managers/tokenizer_communicator_mixin.py +141 -19
  271. sglang/srt/managers/tokenizer_manager.py +462 -226
  272. sglang/srt/managers/tp_worker.py +217 -156
  273. sglang/srt/managers/utils.py +79 -47
  274. sglang/srt/mem_cache/allocator.py +21 -22
  275. sglang/srt/mem_cache/allocator_ascend.py +42 -28
  276. sglang/srt/mem_cache/base_prefix_cache.py +3 -3
  277. sglang/srt/mem_cache/chunk_cache.py +20 -2
  278. sglang/srt/mem_cache/common.py +480 -0
  279. sglang/srt/mem_cache/evict_policy.py +38 -0
  280. sglang/srt/mem_cache/hicache_storage.py +44 -2
  281. sglang/srt/mem_cache/hiradix_cache.py +134 -34
  282. sglang/srt/mem_cache/mamba_radix_cache.py +993 -0
  283. sglang/srt/mem_cache/memory_pool.py +602 -208
  284. sglang/srt/mem_cache/memory_pool_host.py +134 -183
  285. sglang/srt/mem_cache/multimodal_cache.py +0 -1
  286. sglang/srt/mem_cache/radix_cache.py +263 -78
  287. sglang/srt/mem_cache/radix_cache_cpp.py +29 -21
  288. sglang/srt/mem_cache/storage/__init__.py +10 -0
  289. sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +157 -0
  290. sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +97 -0
  291. sglang/srt/mem_cache/storage/backend_factory.py +223 -0
  292. sglang/srt/mem_cache/storage/eic/eic_storage.py +777 -0
  293. sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
  294. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +0 -1
  295. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +180 -59
  296. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +15 -9
  297. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +217 -26
  298. sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +38 -9
  299. sglang/srt/mem_cache/storage/nixl/nixl_utils.py +1 -1
  300. sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +17 -2
  301. sglang/srt/mem_cache/swa_radix_cache.py +115 -58
  302. sglang/srt/metrics/collector.py +113 -120
  303. sglang/srt/metrics/func_timer.py +3 -8
  304. sglang/srt/metrics/utils.py +8 -1
  305. sglang/srt/model_executor/cpu_graph_runner.py +2 -2
  306. sglang/srt/model_executor/cuda_graph_runner.py +81 -36
  307. sglang/srt/model_executor/forward_batch_info.py +40 -50
  308. sglang/srt/model_executor/model_runner.py +507 -319
  309. sglang/srt/model_executor/npu_graph_runner.py +11 -5
  310. sglang/srt/model_executor/piecewise_cuda_graph_runner.py +539 -0
  311. sglang/srt/model_loader/__init__.py +1 -1
  312. sglang/srt/model_loader/loader.py +438 -37
  313. sglang/srt/model_loader/utils.py +0 -1
  314. sglang/srt/model_loader/weight_utils.py +200 -27
  315. sglang/srt/models/apertus.py +2 -3
  316. sglang/srt/models/arcee.py +2 -2
  317. sglang/srt/models/bailing_moe.py +40 -56
  318. sglang/srt/models/bailing_moe_nextn.py +3 -4
  319. sglang/srt/models/bert.py +1 -1
  320. sglang/srt/models/deepseek_nextn.py +25 -4
  321. sglang/srt/models/deepseek_ocr.py +1516 -0
  322. sglang/srt/models/deepseek_v2.py +793 -235
  323. sglang/srt/models/dots_ocr.py +171 -0
  324. sglang/srt/models/dots_vlm.py +0 -1
  325. sglang/srt/models/dots_vlm_vit.py +1 -1
  326. sglang/srt/models/falcon_h1.py +570 -0
  327. sglang/srt/models/gemma3_causal.py +0 -2
  328. sglang/srt/models/gemma3_mm.py +17 -1
  329. sglang/srt/models/gemma3n_mm.py +2 -3
  330. sglang/srt/models/glm4_moe.py +17 -40
  331. sglang/srt/models/glm4_moe_nextn.py +4 -4
  332. sglang/srt/models/glm4v.py +3 -2
  333. sglang/srt/models/glm4v_moe.py +6 -6
  334. sglang/srt/models/gpt_oss.py +12 -35
  335. sglang/srt/models/grok.py +10 -23
  336. sglang/srt/models/hunyuan.py +2 -7
  337. sglang/srt/models/interns1.py +0 -1
  338. sglang/srt/models/kimi_vl.py +1 -7
  339. sglang/srt/models/kimi_vl_moonvit.py +4 -2
  340. sglang/srt/models/llama.py +6 -2
  341. sglang/srt/models/llama_eagle3.py +1 -1
  342. sglang/srt/models/longcat_flash.py +6 -23
  343. sglang/srt/models/longcat_flash_nextn.py +4 -15
  344. sglang/srt/models/mimo.py +2 -13
  345. sglang/srt/models/mimo_mtp.py +1 -2
  346. sglang/srt/models/minicpmo.py +7 -5
  347. sglang/srt/models/mixtral.py +1 -4
  348. sglang/srt/models/mllama.py +1 -1
  349. sglang/srt/models/mllama4.py +27 -6
  350. sglang/srt/models/nemotron_h.py +511 -0
  351. sglang/srt/models/olmo2.py +31 -4
  352. sglang/srt/models/opt.py +5 -5
  353. sglang/srt/models/phi.py +1 -1
  354. sglang/srt/models/phi4mm.py +1 -1
  355. sglang/srt/models/phimoe.py +0 -1
  356. sglang/srt/models/pixtral.py +0 -3
  357. sglang/srt/models/points_v15_chat.py +186 -0
  358. sglang/srt/models/qwen.py +0 -1
  359. sglang/srt/models/qwen2.py +0 -7
  360. sglang/srt/models/qwen2_5_vl.py +5 -5
  361. sglang/srt/models/qwen2_audio.py +2 -15
  362. sglang/srt/models/qwen2_moe.py +70 -4
  363. sglang/srt/models/qwen2_vl.py +6 -3
  364. sglang/srt/models/qwen3.py +18 -3
  365. sglang/srt/models/qwen3_moe.py +50 -38
  366. sglang/srt/models/qwen3_next.py +43 -21
  367. sglang/srt/models/qwen3_next_mtp.py +3 -4
  368. sglang/srt/models/qwen3_omni_moe.py +661 -0
  369. sglang/srt/models/qwen3_vl.py +791 -0
  370. sglang/srt/models/qwen3_vl_moe.py +343 -0
  371. sglang/srt/models/registry.py +15 -3
  372. sglang/srt/models/roberta.py +55 -3
  373. sglang/srt/models/sarashina2_vision.py +268 -0
  374. sglang/srt/models/solar.py +505 -0
  375. sglang/srt/models/starcoder2.py +357 -0
  376. sglang/srt/models/step3_vl.py +3 -5
  377. sglang/srt/models/torch_native_llama.py +9 -2
  378. sglang/srt/models/utils.py +61 -0
  379. sglang/srt/multimodal/processors/base_processor.py +21 -9
  380. sglang/srt/multimodal/processors/deepseek_ocr.py +37 -0
  381. sglang/srt/multimodal/processors/deepseek_vl_v2.py +0 -3
  382. sglang/srt/multimodal/processors/dots_vlm.py +2 -4
  383. sglang/srt/multimodal/processors/glm4v.py +1 -5
  384. sglang/srt/multimodal/processors/internvl.py +20 -10
  385. sglang/srt/multimodal/processors/janus_pro.py +0 -1
  386. sglang/srt/multimodal/processors/mllama4.py +0 -8
  387. sglang/srt/multimodal/processors/phi4mm.py +0 -1
  388. sglang/srt/multimodal/processors/points_v15_chat.py +52 -0
  389. sglang/srt/multimodal/processors/qwen_vl.py +83 -17
  390. sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
  391. sglang/srt/multimodal/processors/step3_vl.py +1 -1
  392. sglang/srt/parser/conversation.py +41 -0
  393. sglang/srt/parser/jinja_template_utils.py +6 -0
  394. sglang/srt/parser/reasoning_parser.py +0 -1
  395. sglang/srt/sampling/custom_logit_processor.py +77 -2
  396. sglang/srt/sampling/sampling_batch_info.py +36 -23
  397. sglang/srt/sampling/sampling_params.py +75 -0
  398. sglang/srt/server_args.py +1300 -338
  399. sglang/srt/server_args_config_parser.py +146 -0
  400. sglang/srt/single_batch_overlap.py +161 -0
  401. sglang/srt/speculative/base_spec_worker.py +34 -0
  402. sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
  403. sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
  404. sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
  405. sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
  406. sglang/srt/speculative/cpp_ngram/param.h +125 -0
  407. sglang/srt/speculative/cpp_ngram/queue.h +71 -0
  408. sglang/srt/speculative/draft_utils.py +226 -0
  409. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +26 -8
  410. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +26 -3
  411. sglang/srt/speculative/eagle_info.py +786 -0
  412. sglang/srt/speculative/eagle_info_v2.py +458 -0
  413. sglang/srt/speculative/eagle_utils.py +113 -1270
  414. sglang/srt/speculative/eagle_worker.py +120 -285
  415. sglang/srt/speculative/eagle_worker_v2.py +702 -0
  416. sglang/srt/speculative/ngram_info.py +433 -0
  417. sglang/srt/speculative/ngram_worker.py +246 -0
  418. sglang/srt/speculative/spec_info.py +49 -0
  419. sglang/srt/speculative/spec_utils.py +641 -0
  420. sglang/srt/speculative/standalone_worker.py +4 -14
  421. sglang/srt/tokenizer/tiktoken_tokenizer.py +2 -2
  422. sglang/srt/tracing/trace.py +32 -6
  423. sglang/srt/two_batch_overlap.py +35 -18
  424. sglang/srt/utils/__init__.py +2 -0
  425. sglang/srt/{bench_utils.py → utils/bench_utils.py} +4 -2
  426. sglang/srt/{utils.py → utils/common.py} +583 -113
  427. sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +86 -19
  428. sglang/srt/{host_shared_memory.py → utils/host_shared_memory.py} +0 -1
  429. sglang/srt/{offloader.py → utils/offloader.py} +4 -4
  430. sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
  431. sglang/srt/utils/profile_merger.py +199 -0
  432. sglang/srt/utils/rpd_utils.py +452 -0
  433. sglang/srt/utils/slow_rank_detector.py +71 -0
  434. sglang/srt/{torch_memory_saver_adapter.py → utils/torch_memory_saver_adapter.py} +5 -7
  435. sglang/srt/warmup.py +8 -4
  436. sglang/srt/weight_sync/utils.py +1 -1
  437. sglang/test/attention/test_flashattn_backend.py +1 -1
  438. sglang/test/attention/test_flashattn_mla_backend.py +0 -1
  439. sglang/test/attention/test_prefix_chunk_info.py +0 -2
  440. sglang/test/attention/test_trtllm_mla_backend.py +221 -53
  441. sglang/test/few_shot_gsm8k_engine.py +2 -4
  442. sglang/test/get_logits_ut.py +57 -0
  443. sglang/test/kit_matched_stop.py +157 -0
  444. sglang/test/longbench_v2/__init__.py +1 -0
  445. sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
  446. sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
  447. sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
  448. sglang/test/run_eval.py +120 -11
  449. sglang/test/runners.py +3 -1
  450. sglang/test/send_one.py +42 -7
  451. sglang/test/simple_eval_common.py +8 -2
  452. sglang/test/simple_eval_gpqa.py +0 -1
  453. sglang/test/simple_eval_humaneval.py +0 -3
  454. sglang/test/simple_eval_longbench_v2.py +344 -0
  455. sglang/test/simple_eval_mmmu_vlm.py +441 -0
  456. sglang/test/test_block_fp8.py +3 -4
  457. sglang/test/test_block_fp8_deep_gemm_blackwell.py +0 -1
  458. sglang/test/test_cutlass_moe.py +1 -2
  459. sglang/test/test_cutlass_w4a8_moe.py +10 -20
  460. sglang/test/test_deterministic.py +430 -0
  461. sglang/test/test_deterministic_utils.py +73 -0
  462. sglang/test/test_disaggregation_utils.py +93 -1
  463. sglang/test/test_marlin_moe.py +0 -1
  464. sglang/test/test_programs.py +1 -1
  465. sglang/test/test_utils.py +432 -16
  466. sglang/utils.py +10 -1
  467. sglang/version.py +1 -1
  468. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/METADATA +64 -43
  469. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/RECORD +476 -346
  470. sglang/srt/entrypoints/grpc_request_manager.py +0 -580
  471. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +0 -32
  472. sglang/srt/managers/tp_worker_overlap_thread.py +0 -319
  473. sglang/srt/mem_cache/lora_radix_cache.py +0 -421
  474. sglang/srt/speculative/build_eagle_tree.py +0 -427
  475. sglang/test/test_block_fp8_ep.py +0 -358
  476. /sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/__init__.py +0 -0
  477. /sglang/srt/{remote_instance_weight_loader_utils.py → model_loader/remote_instance_weight_loader_utils.py} +0 -0
  478. /sglang/srt/{aio_rwlock.py → utils/aio_rwlock.py} +0 -0
  479. /sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
  480. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/WHEEL +0 -0
  481. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/licenses/LICENSE +0 -0
  482. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/top_level.txt +0 -0
@@ -9,30 +9,151 @@ python3 -m sglang.bench_one_batch_server --model meta-llama/Meta-Llama-3.1-8B --
9
9
 
10
10
  python3 -m sglang.bench_one_batch_server --model None --base-url http://localhost:30000 --batch-size 16 --input-len 1024 --output-len 8
11
11
  python3 -m sglang.bench_one_batch_server --model None --base-url http://localhost:30000 --batch-size 16 --input-len 1024 --output-len 8 --show-report --profile --profile-by-stage
12
+ python3 -m sglang.bench_one_batch_server --model None --base-url http://localhost:30000 --batch-size 16 --input-len 1024 --output-len 8 --output-path results.json --profile
12
13
  """
13
14
 
14
15
  import argparse
15
16
  import dataclasses
16
17
  import itertools
17
18
  import json
19
+ import logging
18
20
  import multiprocessing
19
21
  import os
22
+ import random
20
23
  import time
21
- from typing import List, Tuple
24
+ from typing import List, Optional, Tuple
22
25
 
26
+ import numpy as np
23
27
  import requests
24
-
25
- from sglang.bench_serving import get_tokenizer, sample_random_requests
28
+ from pydantic import BaseModel
29
+ from transformers import AutoProcessor, PreTrainedTokenizer
30
+
31
+ from sglang.bench_serving import (
32
+ get_processor,
33
+ get_tokenizer,
34
+ sample_mmmu_requests,
35
+ sample_random_requests,
36
+ )
26
37
  from sglang.profiler import run_profile
27
38
  from sglang.srt.entrypoints.http_server import launch_server
28
39
  from sglang.srt.server_args import ServerArgs
29
40
  from sglang.srt.utils import is_blackwell, kill_process_tree
30
41
  from sglang.test.test_utils import is_in_ci, write_github_step_summary
31
42
 
43
+ logger = logging.getLogger(__name__)
44
+
45
+
46
+ class ProfileLinks(BaseModel):
47
+ """Pydantic model for profile trace links."""
48
+
49
+ extend: Optional[str] = None
50
+ decode: Optional[str] = None
51
+
52
+
53
+ class BenchmarkResult(BaseModel):
54
+ """Pydantic model for benchmark results table data, for a single isl and osl"""
55
+
56
+ model_path: str
57
+ run_name: str
58
+ batch_size: int
59
+ input_len: int
60
+ output_len: int
61
+ latency: float
62
+ ttft: float
63
+ input_throughput: float
64
+ output_throughput: float
65
+ overall_throughput: float
66
+ last_gen_throughput: float
67
+ acc_length: Optional[float] = None
68
+ profile_links: Optional[ProfileLinks] = None
69
+
70
+ @staticmethod
71
+ def help_str() -> str:
72
+ return f"""
73
+ Note: To view the traces through perfetto-ui, please:
74
+ 1. open with Google Chrome
75
+ 2. allow popup
76
+ """
77
+
78
+ def to_markdown_row(
79
+ self, trace_dir, base_url: str = "", relay_base: str = ""
80
+ ) -> str:
81
+ """Convert this benchmark result to a markdown table row."""
82
+ # Calculate costs (assuming H100 pricing for now)
83
+ hourly_cost_per_gpu = 2 # $2/hour for one H100
84
+ hourly_cost = hourly_cost_per_gpu * 1 # Assuming tp_size = 1 for simplicity
85
+ input_util = 0.7
86
+ accept_length = (
87
+ round(self.acc_length, 2) if self.acc_length is not None else "n/a"
88
+ )
89
+ itl = 1 / (self.output_throughput / self.batch_size) * 1000
90
+ input_cost = 1e6 / (self.input_throughput * input_util) / 3600 * hourly_cost
91
+ output_cost = 1e6 / self.output_throughput / 3600 * hourly_cost
92
+
93
+ def get_perfetto_relay_link_from_trace_file(trace_file: str):
94
+ import os
95
+ from urllib.parse import quote
96
+
97
+ rel_path = os.path.relpath(trace_file, trace_dir)
98
+ raw_file_link = f"{base_url}/{rel_path}"
99
+ relay_link = (
100
+ f"{relay_base}?src={quote(raw_file_link, safe='')}"
101
+ if relay_base and quote
102
+ else raw_file_link
103
+ )
104
+ return relay_link
105
+
106
+ # Handle profile links
107
+ profile_link = "NA | NA"
108
+ if self.profile_links:
109
+ if self.profile_links.extend or self.profile_links.decode:
110
+ # Create a combined link or use the first available one
111
+ trace_files = [self.profile_links.extend, self.profile_links.decode]
112
+ if any(trace_file is None for trace_file in trace_files):
113
+ logger.error("Some trace files are None", f"{trace_files=}")
114
+ trace_files_relay_links = [
115
+ (
116
+ f"[trace]({get_perfetto_relay_link_from_trace_file(trace_file)})"
117
+ if trace_file
118
+ else "N/A"
119
+ )
120
+ for trace_file in trace_files
121
+ ]
122
+
123
+ profile_link = " | ".join(trace_files_relay_links)
124
+
125
+ # Build the row
126
+ return f"| {self.batch_size} | {self.input_len} | {self.latency:.2f} | {self.input_throughput:.2f} | {self.output_throughput:.2f} | {accept_length} | {itl:.2f} | {input_cost:.2f} | {output_cost:.2f} | {profile_link} |\n"
127
+
128
+
129
+ def generate_markdown_report(trace_dir, results: List["BenchmarkResult"]) -> str:
130
+ """Generate a markdown report from a list of BenchmarkResult object from a single run."""
131
+ import os
132
+
133
+ summary = f"### {results[0].model_path}\n"
134
+
135
+ # summary += (
136
+ # f"Input lens: {result.input_len}. Output lens: {result.output_len}.\n"
137
+ # )
138
+ summary += "| batch size | input len | latency (s) | input throughput (tok/s) | output throughput (tok/s) | acc length | ITL (ms) | input cost ($/1M) | output cost ($/1M) | profile (extend) | profile (decode)|\n"
139
+ summary += "| ---------- | --------- | ----------- | ------------------------- | ------------------------- | ---------- | -------- | ----------------- | ------------------ | --------------- | -------------- |\n"
140
+
141
+ # all results should share the same isl & osl
142
+ for result in results:
143
+ base_url = os.getenv("TRACE_BASE_URL", "").rstrip("/")
144
+ relay_base = os.getenv(
145
+ "PERFETTO_RELAY_URL",
146
+ "",
147
+ ).rstrip("/")
148
+ summary += result.to_markdown_row(trace_dir, base_url, relay_base)
149
+
150
+ return summary
151
+
32
152
 
33
153
  @dataclasses.dataclass
34
154
  class BenchArgs:
35
155
  run_name: str = "default"
156
+ seed: int = 42
36
157
  batch_size: Tuple[int] = (1,)
37
158
  input_len: Tuple[int] = (1024,)
38
159
  output_len: Tuple[int] = (16,)
@@ -47,11 +168,17 @@ class BenchArgs:
47
168
  profile: bool = False
48
169
  profile_steps: int = 3
49
170
  profile_by_stage: bool = False
171
+ profile_filename_prefix: str = None
172
+ append_to_github_summary: bool = True
50
173
  dataset_path: str = ""
174
+ parallel_batch: bool = False
175
+ dataset_name: str = "random"
176
+ output_path: Optional[str] = None
51
177
 
52
178
  @staticmethod
53
179
  def add_cli_args(parser: argparse.ArgumentParser):
54
180
  parser.add_argument("--run-name", type=str, default=BenchArgs.run_name)
181
+ parser.add_argument("--seed", type=int, default=BenchArgs.seed)
55
182
  parser.add_argument(
56
183
  "--batch-size", type=int, nargs="+", default=BenchArgs.batch_size
57
184
  )
@@ -62,6 +189,13 @@ class BenchArgs:
62
189
  "--output-len", type=int, nargs="+", default=BenchArgs.output_len
63
190
  )
64
191
  parser.add_argument("--temperature", type=float, default=BenchArgs.temperature)
192
+ parser.add_argument(
193
+ "--dataset-name",
194
+ type=str,
195
+ default=BenchArgs.dataset_name,
196
+ choices=["mmmu", "random"],
197
+ help="Name of the dataset to benchmark on.",
198
+ )
65
199
  parser.add_argument("--return-logprob", action="store_true")
66
200
  parser.add_argument(
67
201
  "--client-stream-interval",
@@ -90,14 +224,37 @@ class BenchArgs:
90
224
  default=BenchArgs.dataset_path,
91
225
  help="Path to the dataset.",
92
226
  )
227
+ parser.add_argument("--parallel-batch", action="store_true")
228
+ parser.add_argument(
229
+ "--profile-filename-prefix",
230
+ type=str,
231
+ default=BenchArgs.profile_filename_prefix,
232
+ )
233
+ parser.add_argument(
234
+ "--no-append-to-github-summary",
235
+ action="store_false",
236
+ dest="append_to_github_summary",
237
+ help="Disable appending the output of this run to github ci summary",
238
+ )
239
+ parser.add_argument(
240
+ "--output-path",
241
+ type=str,
242
+ default=BenchArgs.output_path,
243
+ help="Path to save benchmark results as JSON format. If not specified, results will only be saved to result-filename.",
244
+ )
93
245
 
94
246
  @classmethod
95
247
  def from_cli_args(cls, args: argparse.Namespace):
96
248
  # use the default value's type to cast the args into correct types.
97
249
  attrs = [(attr.name, type(attr.default)) for attr in dataclasses.fields(cls)]
98
- return cls(
99
- **{attr: attr_type(getattr(args, attr)) for attr, attr_type in attrs}
100
- )
250
+ kwargs = {}
251
+ for attr, attr_type in attrs:
252
+ val = getattr(args, attr)
253
+ if attr_type is type(None):
254
+ kwargs[attr] = val
255
+ else:
256
+ kwargs[attr] = attr_type(val)
257
+ return cls(**kwargs)
101
258
 
102
259
 
103
260
  def launch_server_internal(server_args):
@@ -141,23 +298,35 @@ def run_one_case(
141
298
  input_len_step_percentage: float,
142
299
  run_name: str,
143
300
  result_filename: str,
144
- tokenizer,
301
+ tokenizer: PreTrainedTokenizer | AutoProcessor,
302
+ dataset_name="",
145
303
  profile: bool = False,
146
304
  profile_steps: int = 3,
147
305
  profile_by_stage: bool = False,
306
+ profile_filename_prefix: str = None,
148
307
  dataset_path: str = "",
308
+ parallel_batch: bool = False,
149
309
  ):
150
310
  requests.post(url + "/flush_cache")
151
- input_requests = sample_random_requests(
152
- input_len=input_len,
153
- output_len=output_len,
154
- num_prompts=batch_size,
155
- range_ratio=1.0,
156
- tokenizer=tokenizer,
157
- dataset_path=dataset_path,
158
- random_sample=True,
159
- return_text=False,
160
- )
311
+ # TODO: reuse bench_serving.get_dataset ?
312
+ if dataset_name == "mmmu":
313
+ input_requests = sample_mmmu_requests(
314
+ num_requests=batch_size,
315
+ processor=tokenizer,
316
+ fixed_output_len=output_len,
317
+ random_sample=False,
318
+ )
319
+ elif dataset_name == "random":
320
+ input_requests = sample_random_requests(
321
+ input_len=input_len,
322
+ output_len=output_len,
323
+ num_prompts=batch_size,
324
+ range_ratio=1.0,
325
+ tokenizer=tokenizer,
326
+ dataset_path=dataset_path,
327
+ random_sample=True,
328
+ return_text=False,
329
+ )
161
330
 
162
331
  use_structured_outputs = False
163
332
  if use_structured_outputs:
@@ -174,25 +343,50 @@ def run_one_case(
174
343
 
175
344
  profile_link = None
176
345
  if profile:
346
+ output_dir, profile_name = None, None
347
+ if profile_filename_prefix:
348
+ output_dir = os.path.dirname(profile_filename_prefix)
349
+ profile_name = os.path.basename(profile_filename_prefix)
177
350
  profile_link: str = run_profile(
178
- url, profile_steps, ["CPU", "GPU"], None, None, profile_by_stage
351
+ url,
352
+ profile_steps,
353
+ ["CPU", "GPU"],
354
+ output_dir,
355
+ profile_name,
356
+ profile_by_stage,
179
357
  )
180
358
 
181
359
  tic = time.perf_counter()
360
+
361
+ payload = {
362
+ "sampling_params": {
363
+ "temperature": temperature,
364
+ "max_new_tokens": output_len,
365
+ "ignore_eos": True,
366
+ "json_schema": json_schema,
367
+ "stream_interval": stream_interval,
368
+ },
369
+ "return_logprob": return_logprob,
370
+ "stream": True,
371
+ **({"parallel_batch": parallel_batch} if parallel_batch else {}),
372
+ }
373
+ if dataset_name == "mmmu":
374
+ # vlm
375
+ input_ids = []
376
+ # for vlms, tokenizer is an instance of AutoProcessor
377
+ tokenizer = tokenizer.tokenizer
378
+ for input_req in input_requests:
379
+ input_ids += [tokenizer.encode(input_req.prompt)]
380
+ payload["image_data"] = [req.image_data for req in input_requests]
381
+
382
+ else:
383
+ input_ids = [req.prompt for req in input_requests]
384
+
385
+ payload["input_ids"] = input_ids
386
+
182
387
  response = requests.post(
183
388
  url + "/generate",
184
- json={
185
- "input_ids": [req.prompt for req in input_requests],
186
- "sampling_params": {
187
- "temperature": temperature,
188
- "max_new_tokens": output_len,
189
- "ignore_eos": True,
190
- "json_schema": json_schema,
191
- "stream_interval": stream_interval,
192
- },
193
- "return_logprob": return_logprob,
194
- "stream": True,
195
- },
389
+ json=payload,
196
390
  stream=True,
197
391
  )
198
392
 
@@ -256,9 +450,99 @@ def run_one_case(
256
450
  overall_throughput,
257
451
  last_gen_throughput,
258
452
  acc_length,
259
- profile_link if profile else None,
453
+ profile_link,
454
+ )
455
+
456
+
457
+ def save_results_as_json(result: List[Tuple], bench_args: BenchArgs, model: str):
458
+ """Save benchmark results as JSON using Pydantic models."""
459
+ json_results = []
460
+
461
+ # Generate all parameter combinations to match with results
462
+ param_combinations = list(
463
+ itertools.product(
464
+ bench_args.batch_size, bench_args.input_len, bench_args.output_len
465
+ )
260
466
  )
261
467
 
468
+ for i, (
469
+ batch_size,
470
+ latency,
471
+ ttft,
472
+ input_throughput,
473
+ output_throughput,
474
+ overall_throughput,
475
+ last_gen_throughput,
476
+ acc_length,
477
+ profile_link,
478
+ ) in enumerate(result):
479
+ # Get the corresponding parameters for this result
480
+ bs, input_len, output_len = param_combinations[i]
481
+
482
+ # Parse profile links if available
483
+ profile_links = None
484
+ if profile_link:
485
+ profile_links = parse_profile_links(
486
+ profile_link, batch_size, input_len, output_len
487
+ )
488
+
489
+ benchmark_result = BenchmarkResult(
490
+ model_path=model,
491
+ run_name=bench_args.run_name,
492
+ batch_size=batch_size,
493
+ input_len=input_len,
494
+ output_len=output_len,
495
+ latency=latency,
496
+ ttft=ttft,
497
+ input_throughput=input_throughput,
498
+ output_throughput=output_throughput,
499
+ overall_throughput=overall_throughput,
500
+ last_gen_throughput=last_gen_throughput,
501
+ acc_length=acc_length,
502
+ profile_links=profile_links,
503
+ )
504
+ json_results.append(benchmark_result.model_dump())
505
+
506
+ # Save to JSON file
507
+ with open(bench_args.output_path, "w", encoding="utf-8") as f:
508
+ json.dump(json_results, f, indent=2, ensure_ascii=False)
509
+
510
+ print(f"Results saved as JSON to {bench_args.output_path}")
511
+
512
+
513
+ def parse_profile_links(
514
+ profile_dir: str, batch_size: int, input_len: int, output_len: int
515
+ ) -> Optional[ProfileLinks]:
516
+ """Parse profile directory to extract extend and decode trace file links."""
517
+ if not profile_dir or not os.path.exists(profile_dir):
518
+ return None
519
+
520
+ extend_link = None
521
+ decode_link = None
522
+
523
+ # Look for extend/prefill trace files
524
+ for file in os.listdir(profile_dir):
525
+ if file.endswith(".trace.json.gz") or file.endswith(".trace.json"):
526
+ if "extend" in file.lower() or "prefill" in file.lower():
527
+ extend_link = os.path.join(profile_dir, file)
528
+ elif "decode" in file.lower():
529
+ decode_link = os.path.join(profile_dir, file)
530
+
531
+ # If no specific extend/decode files found, try to find files with batch/input/output info
532
+ if not extend_link or not decode_link:
533
+ for file in os.listdir(profile_dir):
534
+ if file.endswith(".trace.json.gz") or file.endswith(".trace.json"):
535
+ if f"_batch{batch_size}_input{input_len}_output{output_len}_" in file:
536
+ if "prefill" in file.lower() or "extend" in file.lower():
537
+ extend_link = os.path.join(profile_dir, file)
538
+ elif "decode" in file.lower():
539
+ decode_link = os.path.join(profile_dir, file)
540
+
541
+ if extend_link or decode_link:
542
+ return ProfileLinks(extend=extend_link, decode=decode_link)
543
+
544
+ return None
545
+
262
546
 
263
547
  def get_report_summary(
264
548
  result: List[Tuple], server_args: ServerArgs, bench_args: BenchArgs
@@ -336,7 +620,12 @@ def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
336
620
  tokenizer_path = server_info["tokenizer_path"]
337
621
  elif "prefill" in server_info:
338
622
  tokenizer_path = server_info["prefill"][0]["tokenizer_path"]
339
- tokenizer = get_tokenizer(tokenizer_path)
623
+
624
+ if bench_args.dataset_name == "mmmu":
625
+ # mmmu implies this is a MLLM
626
+ tokenizer = get_processor(tokenizer_path)
627
+ else:
628
+ tokenizer = get_tokenizer(tokenizer_path)
340
629
 
341
630
  # warmup
342
631
  if not bench_args.skip_warmup:
@@ -350,10 +639,12 @@ def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
350
639
  return_logprob=bench_args.return_logprob,
351
640
  stream_interval=bench_args.client_stream_interval,
352
641
  input_len_step_percentage=bench_args.input_len_step_percentage,
642
+ dataset_name=bench_args.dataset_name,
353
643
  run_name="",
354
644
  result_filename="",
355
645
  tokenizer=tokenizer,
356
646
  dataset_path=bench_args.dataset_path,
647
+ parallel_batch=bench_args.parallel_batch,
357
648
  )
358
649
  print("=" * 8 + " Warmup End " + "=" * 8 + "\n")
359
650
 
@@ -375,8 +666,12 @@ def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
375
666
  stream_interval=bench_args.client_stream_interval,
376
667
  input_len_step_percentage=bench_args.input_len_step_percentage,
377
668
  run_name=bench_args.run_name,
669
+ dataset_name=bench_args.dataset_name,
378
670
  result_filename=bench_args.result_filename,
379
671
  tokenizer=tokenizer,
672
+ dataset_path=bench_args.dataset_path,
673
+ parallel_batch=bench_args.parallel_batch,
674
+ profile_filename_prefix=bench_args.profile_filename_prefix,
380
675
  )
381
676
  )
382
677
 
@@ -399,9 +694,13 @@ def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
399
694
  run_name=bench_args.run_name,
400
695
  result_filename=bench_args.result_filename,
401
696
  tokenizer=tokenizer,
697
+ dataset_name=bench_args.dataset_name,
402
698
  profile=bench_args.profile,
403
699
  profile_steps=bench_args.profile_steps,
404
700
  profile_by_stage=bench_args.profile_by_stage,
701
+ dataset_path=bench_args.dataset_path,
702
+ parallel_batch=bench_args.parallel_batch,
703
+ profile_filename_prefix=bench_args.profile_filename_prefix,
405
704
  )[-1],
406
705
  )
407
706
  )
@@ -414,13 +713,16 @@ def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
414
713
 
415
714
  print(f"\nResults are saved to {bench_args.result_filename}")
416
715
 
716
+ # Save results as JSON if output_path is specified
717
+ if bench_args.output_path:
718
+ save_results_as_json(result, bench_args, model=server_args.model_path)
719
+
417
720
  if not bench_args.show_report:
418
721
  return
419
722
 
420
723
  summary = get_report_summary(result, server_args, bench_args)
421
- print(summary)
422
724
 
423
- if is_in_ci():
725
+ if is_in_ci() and bench_args.append_to_github_summary:
424
726
  write_github_step_summary(summary)
425
727
 
426
728
 
@@ -429,6 +731,10 @@ def main():
429
731
  ServerArgs.add_cli_args(parser)
430
732
  BenchArgs.add_cli_args(parser)
431
733
  args = parser.parse_args()
734
+
735
+ random.seed(args.seed)
736
+ np.random.seed(args.seed)
737
+
432
738
  server_args = ServerArgs.from_cli_args(args)
433
739
  bench_args = BenchArgs.from_cli_args(args)
434
740