sglang 0.5.2rc1__py3-none-any.whl → 0.5.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (395)
  1. sglang/bench_one_batch.py +7 -9
  2. sglang/bench_one_batch_server.py +330 -31
  3. sglang/bench_serving.py +267 -32
  4. sglang/global_config.py +2 -2
  5. sglang/lang/backend/runtime_endpoint.py +1 -1
  6. sglang/lang/interpreter.py +1 -1
  7. sglang/launch_server.py +14 -0
  8. sglang/profiler.py +2 -2
  9. sglang/srt/batch_invariant_ops/__init__.py +27 -0
  10. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +549 -0
  11. sglang/srt/configs/__init__.py +8 -0
  12. sglang/srt/configs/device_config.py +3 -1
  13. sglang/srt/configs/dots_ocr.py +64 -0
  14. sglang/srt/configs/dots_vlm.py +139 -0
  15. sglang/srt/configs/falcon_h1.py +360 -0
  16. sglang/srt/configs/internvl.py +6 -0
  17. sglang/srt/configs/load_config.py +9 -0
  18. sglang/srt/configs/model_config.py +181 -82
  19. sglang/srt/configs/qwen3_next.py +326 -0
  20. sglang/srt/configs/qwen3_vl.py +586 -0
  21. sglang/srt/connector/__init__.py +8 -1
  22. sglang/srt/connector/remote_instance.py +82 -0
  23. sglang/srt/constrained/base_grammar_backend.py +49 -12
  24. sglang/srt/constrained/llguidance_backend.py +0 -1
  25. sglang/srt/constrained/outlines_backend.py +0 -1
  26. sglang/srt/constrained/outlines_jump_forward.py +1 -1
  27. sglang/srt/constrained/xgrammar_backend.py +30 -9
  28. sglang/srt/custom_op.py +11 -1
  29. sglang/srt/debug_utils/dump_comparator.py +81 -44
  30. sglang/srt/debug_utils/dump_loader.py +97 -0
  31. sglang/srt/debug_utils/dumper.py +21 -6
  32. sglang/srt/debug_utils/text_comparator.py +73 -11
  33. sglang/srt/disaggregation/ascend/conn.py +2 -2
  34. sglang/srt/disaggregation/ascend/transfer_engine.py +47 -9
  35. sglang/srt/disaggregation/base/conn.py +1 -1
  36. sglang/srt/disaggregation/common/conn.py +279 -108
  37. sglang/srt/disaggregation/decode.py +71 -19
  38. sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
  39. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +29 -17
  40. sglang/srt/disaggregation/fake/conn.py +1 -1
  41. sglang/srt/disaggregation/mini_lb.py +6 -445
  42. sglang/srt/disaggregation/mooncake/conn.py +55 -537
  43. sglang/srt/disaggregation/nixl/conn.py +326 -53
  44. sglang/srt/disaggregation/prefill.py +36 -17
  45. sglang/srt/disaggregation/utils.py +40 -54
  46. sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
  47. sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
  48. sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
  49. sglang/srt/distributed/parallel_state.py +192 -113
  50. sglang/srt/entrypoints/engine.py +59 -18
  51. sglang/srt/entrypoints/grpc_request_manager.py +855 -0
  52. sglang/srt/entrypoints/grpc_server.py +810 -0
  53. sglang/srt/entrypoints/http_server.py +132 -57
  54. sglang/srt/entrypoints/openai/protocol.py +115 -7
  55. sglang/srt/entrypoints/openai/serving_base.py +65 -3
  56. sglang/srt/entrypoints/openai/serving_chat.py +207 -58
  57. sglang/srt/entrypoints/openai/serving_completions.py +17 -4
  58. sglang/srt/entrypoints/openai/serving_embedding.py +10 -4
  59. sglang/srt/entrypoints/openai/serving_rerank.py +3 -1
  60. sglang/srt/entrypoints/openai/serving_responses.py +49 -4
  61. sglang/srt/entrypoints/openai/serving_score.py +1 -0
  62. sglang/srt/environ.py +285 -0
  63. sglang/srt/eplb/eplb_manager.py +2 -2
  64. sglang/srt/eplb/expert_distribution.py +26 -13
  65. sglang/srt/eplb/expert_location.py +38 -8
  66. sglang/srt/eplb/expert_location_updater.py +1 -1
  67. sglang/srt/function_call/base_format_detector.py +3 -6
  68. sglang/srt/function_call/ebnf_composer.py +11 -9
  69. sglang/srt/function_call/function_call_parser.py +9 -2
  70. sglang/srt/function_call/glm4_moe_detector.py +4 -4
  71. sglang/srt/function_call/gpt_oss_detector.py +24 -1
  72. sglang/srt/function_call/json_array_parser.py +63 -0
  73. sglang/srt/function_call/kimik2_detector.py +17 -4
  74. sglang/srt/function_call/qwen3_coder_detector.py +1 -1
  75. sglang/srt/function_call/utils.py +96 -5
  76. sglang/srt/grpc/__init__.py +1 -0
  77. sglang/srt/grpc/compile_proto.py +245 -0
  78. sglang/srt/grpc/sglang_scheduler_pb2.py +111 -0
  79. sglang/srt/grpc/sglang_scheduler_pb2.pyi +434 -0
  80. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +239 -0
  81. sglang/srt/layers/activation.py +143 -9
  82. sglang/srt/layers/attention/aiter_backend.py +106 -82
  83. sglang/srt/layers/attention/ascend_backend.py +115 -9
  84. sglang/srt/layers/attention/attention_registry.py +206 -0
  85. sglang/srt/layers/attention/base_attn_backend.py +12 -3
  86. sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
  87. sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
  88. sglang/srt/layers/attention/fla/chunk.py +242 -0
  89. sglang/srt/layers/attention/fla/chunk_delta_h.py +314 -0
  90. sglang/srt/layers/attention/fla/chunk_o.py +178 -0
  91. sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +151 -0
  92. sglang/srt/layers/attention/fla/cumsum.py +300 -0
  93. sglang/srt/layers/attention/fla/fused_recurrent.py +640 -0
  94. sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +232 -0
  95. sglang/srt/layers/attention/fla/index.py +37 -0
  96. sglang/srt/layers/attention/fla/l2norm.py +150 -0
  97. sglang/srt/layers/attention/fla/layernorm_gated.py +326 -0
  98. sglang/srt/layers/attention/fla/op.py +66 -0
  99. sglang/srt/layers/attention/fla/solve_tril.py +465 -0
  100. sglang/srt/layers/attention/fla/utils.py +331 -0
  101. sglang/srt/layers/attention/fla/wy_fast.py +158 -0
  102. sglang/srt/layers/attention/flashattention_backend.py +41 -8
  103. sglang/srt/layers/attention/flashinfer_backend.py +118 -198
  104. sglang/srt/layers/attention/flashinfer_mla_backend.py +27 -27
  105. sglang/srt/layers/attention/flashmla_backend.py +7 -5
  106. sglang/srt/layers/attention/hybrid_attn_backend.py +68 -53
  107. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +602 -0
  108. sglang/srt/layers/attention/intel_amx_backend.py +3 -0
  109. sglang/srt/layers/attention/mamba/causal_conv1d.py +129 -0
  110. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +969 -0
  111. sglang/srt/layers/attention/mamba/mamba.py +629 -0
  112. sglang/srt/layers/attention/mamba/mamba_utils.py +81 -0
  113. sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
  114. sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
  115. sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
  116. sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +264 -0
  117. sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +622 -0
  118. sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +757 -0
  119. sglang/srt/layers/attention/mamba/ops/ssd_combined.py +262 -0
  120. sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +275 -0
  121. sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
  122. sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
  123. sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
  124. sglang/srt/layers/attention/nsa/nsa_indexer.py +761 -0
  125. sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
  126. sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
  127. sglang/srt/layers/attention/nsa/transform_index.py +144 -0
  128. sglang/srt/layers/attention/nsa/utils.py +24 -0
  129. sglang/srt/layers/attention/nsa_backend.py +887 -0
  130. sglang/srt/layers/attention/tbo_backend.py +6 -6
  131. sglang/srt/layers/attention/torch_flex_backend.py +325 -0
  132. sglang/srt/layers/attention/torch_native_backend.py +12 -6
  133. sglang/srt/layers/attention/triton_backend.py +57 -7
  134. sglang/srt/layers/attention/trtllm_mha_backend.py +5 -7
  135. sglang/srt/layers/attention/trtllm_mla_backend.py +276 -39
  136. sglang/srt/layers/attention/vision.py +58 -0
  137. sglang/srt/layers/attention/wave_backend.py +4 -4
  138. sglang/srt/layers/attention/wave_ops/decode_attention.py +2 -4
  139. sglang/srt/layers/attention/wave_ops/extend_attention.py +1 -3
  140. sglang/srt/layers/communicator.py +53 -7
  141. sglang/srt/layers/dp_attention.py +41 -2
  142. sglang/srt/layers/elementwise.py +3 -1
  143. sglang/srt/layers/layernorm.py +34 -15
  144. sglang/srt/layers/linear.py +55 -7
  145. sglang/srt/layers/logits_processor.py +44 -12
  146. sglang/srt/layers/moe/__init__.py +2 -1
  147. sglang/srt/layers/moe/cutlass_w4a8_moe.py +3 -3
  148. sglang/srt/layers/moe/ep_moe/kernels.py +2 -2
  149. sglang/srt/layers/moe/ep_moe/layer.py +256 -63
  150. sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +183 -0
  151. sglang/srt/layers/moe/fused_moe_native.py +5 -3
  152. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  153. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=352,device_name=NVIDIA_RTX_5880_Ada_Generation,dtype=fp8_w8a8.json +146 -0
  154. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  155. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=512,device_name=NVIDIA_H20.json +146 -0
  156. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/{E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json } +29 -29
  157. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  158. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
  159. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
  160. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  161. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
  162. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
  163. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
  164. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  165. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
  166. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -2
  167. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +7 -3
  168. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +23 -20
  169. sglang/srt/layers/moe/fused_moe_triton/layer.py +71 -70
  170. sglang/srt/layers/moe/moe_runner/__init__.py +2 -1
  171. sglang/srt/layers/moe/moe_runner/base.py +274 -1
  172. sglang/srt/layers/moe/moe_runner/runner.py +80 -0
  173. sglang/srt/layers/moe/moe_runner/triton.py +448 -0
  174. sglang/srt/layers/moe/token_dispatcher/__init__.py +16 -4
  175. sglang/srt/layers/moe/token_dispatcher/{base_dispatcher.py → base.py} +67 -17
  176. sglang/srt/layers/moe/token_dispatcher/deepep.py +118 -56
  177. sglang/srt/layers/moe/token_dispatcher/standard.py +44 -2
  178. sglang/srt/layers/moe/topk.py +30 -9
  179. sglang/srt/layers/moe/utils.py +22 -7
  180. sglang/srt/layers/parameter.py +23 -6
  181. sglang/srt/layers/quantization/awq.py +19 -7
  182. sglang/srt/layers/quantization/base_config.py +11 -6
  183. sglang/srt/layers/quantization/blockwise_int8.py +38 -27
  184. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +1 -0
  185. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +50 -30
  186. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +2 -0
  187. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +13 -1
  188. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
  189. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +2 -10
  190. sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +27 -0
  191. sglang/srt/layers/quantization/fp8.py +78 -49
  192. sglang/srt/layers/quantization/fp8_utils.py +51 -32
  193. sglang/srt/layers/quantization/gptq.py +25 -17
  194. sglang/srt/layers/quantization/modelopt_quant.py +225 -57
  195. sglang/srt/layers/quantization/moe_wna16.py +21 -18
  196. sglang/srt/layers/quantization/mxfp4.py +77 -42
  197. sglang/srt/layers/quantization/quark/quark_moe.py +48 -30
  198. sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +49 -30
  199. sglang/srt/layers/quantization/quark/utils.py +97 -0
  200. sglang/srt/layers/quantization/rocm_mxfp4_utils.py +13 -0
  201. sglang/srt/layers/quantization/unquant.py +135 -47
  202. sglang/srt/layers/quantization/w4afp8.py +26 -17
  203. sglang/srt/layers/quantization/w8a8_fp8.py +35 -20
  204. sglang/srt/layers/quantization/w8a8_int8.py +91 -41
  205. sglang/srt/layers/rocm_linear_utils.py +44 -0
  206. sglang/srt/layers/rotary_embedding.py +78 -49
  207. sglang/srt/layers/sampler.py +213 -21
  208. sglang/srt/layers/utils.py +23 -0
  209. sglang/srt/lora/backend/base_backend.py +50 -8
  210. sglang/srt/lora/backend/chunked_backend.py +348 -0
  211. sglang/srt/lora/backend/triton_backend.py +99 -5
  212. sglang/srt/lora/layers.py +32 -0
  213. sglang/srt/lora/lora.py +8 -3
  214. sglang/srt/lora/lora_manager.py +52 -118
  215. sglang/srt/lora/mem_pool.py +25 -11
  216. sglang/srt/lora/triton_ops/__init__.py +4 -0
  217. sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
  218. sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +174 -0
  219. sglang/srt/lora/utils.py +22 -11
  220. sglang/srt/managers/async_dynamic_batch_tokenizer.py +170 -0
  221. sglang/srt/managers/cache_controller.py +215 -314
  222. sglang/srt/managers/data_parallel_controller.py +115 -80
  223. sglang/srt/managers/detokenizer_manager.py +19 -15
  224. sglang/srt/managers/disagg_service.py +46 -0
  225. sglang/srt/managers/io_struct.py +340 -109
  226. sglang/srt/managers/mm_utils.py +44 -6
  227. sglang/srt/managers/multi_tokenizer_mixin.py +358 -404
  228. sglang/srt/managers/multimodal_processor.py +1 -2
  229. sglang/srt/managers/overlap_utils.py +53 -0
  230. sglang/srt/managers/schedule_batch.py +240 -138
  231. sglang/srt/managers/schedule_policy.py +147 -19
  232. sglang/srt/managers/scheduler.py +501 -304
  233. sglang/srt/managers/scheduler_input_blocker.py +1 -1
  234. sglang/srt/managers/scheduler_metrics_mixin.py +119 -40
  235. sglang/srt/managers/scheduler_output_processor_mixin.py +75 -22
  236. sglang/srt/managers/scheduler_profiler_mixin.py +6 -6
  237. sglang/srt/managers/scheduler_update_weights_mixin.py +7 -0
  238. sglang/srt/managers/template_manager.py +3 -3
  239. sglang/srt/managers/tokenizer_communicator_mixin.py +675 -0
  240. sglang/srt/managers/tokenizer_manager.py +321 -632
  241. sglang/srt/managers/tp_worker.py +81 -22
  242. sglang/srt/managers/tp_worker_overlap_thread.py +71 -56
  243. sglang/srt/managers/utils.py +1 -45
  244. sglang/srt/mem_cache/allocator.py +15 -21
  245. sglang/srt/mem_cache/allocator_ascend.py +41 -27
  246. sglang/srt/mem_cache/base_prefix_cache.py +1 -1
  247. sglang/srt/mem_cache/chunk_cache.py +8 -1
  248. sglang/srt/mem_cache/evict_policy.py +23 -0
  249. sglang/srt/mem_cache/hicache_storage.py +58 -34
  250. sglang/srt/mem_cache/hiradix_cache.py +227 -80
  251. sglang/srt/mem_cache/memory_pool.py +535 -58
  252. sglang/srt/mem_cache/memory_pool_host.py +239 -223
  253. sglang/srt/mem_cache/radix_cache.py +222 -73
  254. sglang/srt/mem_cache/radix_cache_cpp.py +11 -8
  255. sglang/srt/mem_cache/storage/__init__.py +10 -0
  256. sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +151 -0
  257. sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +109 -0
  258. sglang/srt/mem_cache/storage/backend_factory.py +223 -0
  259. sglang/srt/mem_cache/storage/eic/eic_storage.py +778 -0
  260. sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
  261. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +164 -0
  262. sglang/srt/mem_cache/storage/hf3fs/{client_hf3fs.py → hf3fs_usrbio_client.py} +5 -1
  263. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +268 -63
  264. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +284 -0
  265. sglang/srt/mem_cache/storage/lmcache/unit_test.py +121 -0
  266. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +198 -30
  267. sglang/srt/mem_cache/storage/mooncake_store/test_mooncake_store.py +161 -0
  268. sglang/srt/mem_cache/swa_radix_cache.py +25 -36
  269. sglang/srt/metrics/collector.py +519 -132
  270. sglang/srt/metrics/func_timer.py +2 -7
  271. sglang/srt/metrics/startup_func_log_and_timer.py +150 -0
  272. sglang/srt/metrics/utils.py +55 -0
  273. sglang/srt/model_executor/cpu_graph_runner.py +640 -0
  274. sglang/srt/model_executor/cuda_graph_runner.py +52 -37
  275. sglang/srt/model_executor/forward_batch_info.py +98 -57
  276. sglang/srt/model_executor/model_runner.py +433 -158
  277. sglang/srt/model_executor/npu_graph_runner.py +12 -5
  278. sglang/srt/model_loader/__init__.py +9 -3
  279. sglang/srt/model_loader/loader.py +133 -5
  280. sglang/srt/model_loader/remote_instance_weight_loader_utils.py +69 -0
  281. sglang/srt/model_loader/weight_utils.py +158 -3
  282. sglang/srt/models/apertus.py +686 -0
  283. sglang/srt/models/bailing_moe.py +820 -217
  284. sglang/srt/models/bailing_moe_nextn.py +168 -0
  285. sglang/srt/models/deepseek_nextn.py +6 -1
  286. sglang/srt/models/deepseek_v2.py +833 -152
  287. sglang/srt/models/dots_ocr.py +173 -0
  288. sglang/srt/models/dots_vlm.py +174 -0
  289. sglang/srt/models/dots_vlm_vit.py +337 -0
  290. sglang/srt/models/ernie4.py +1 -1
  291. sglang/srt/models/falcon_h1.py +576 -0
  292. sglang/srt/models/gemma3_causal.py +0 -2
  293. sglang/srt/models/gemma3_mm.py +1 -1
  294. sglang/srt/models/gemma3n_mm.py +2 -2
  295. sglang/srt/models/glm4_moe.py +14 -5
  296. sglang/srt/models/glm4_moe_nextn.py +2 -2
  297. sglang/srt/models/glm4v.py +5 -3
  298. sglang/srt/models/glm4v_moe.py +4 -1
  299. sglang/srt/models/gpt_oss.py +8 -31
  300. sglang/srt/models/internvl.py +28 -0
  301. sglang/srt/models/kimi_vl_moonvit.py +2 -2
  302. sglang/srt/models/llama.py +4 -0
  303. sglang/srt/models/llama4.py +9 -0
  304. sglang/srt/models/llama_eagle3.py +13 -0
  305. sglang/srt/models/longcat_flash.py +3 -3
  306. sglang/srt/models/longcat_flash_nextn.py +1 -1
  307. sglang/srt/models/minicpmv.py +165 -3
  308. sglang/srt/models/mllama4.py +40 -4
  309. sglang/srt/models/opt.py +637 -0
  310. sglang/srt/models/qwen2_5_vl.py +29 -5
  311. sglang/srt/models/qwen2_audio.py +1 -1
  312. sglang/srt/models/qwen2_moe.py +124 -14
  313. sglang/srt/models/qwen2_vl.py +1 -1
  314. sglang/srt/models/qwen3.py +26 -5
  315. sglang/srt/models/qwen3_moe.py +71 -12
  316. sglang/srt/models/qwen3_next.py +1069 -0
  317. sglang/srt/models/qwen3_next_mtp.py +112 -0
  318. sglang/srt/models/qwen3_vl.py +787 -0
  319. sglang/srt/models/qwen3_vl_moe.py +471 -0
  320. sglang/srt/models/registry.py +15 -3
  321. sglang/srt/models/sarashina2_vision.py +269 -0
  322. sglang/srt/models/solar.py +505 -0
  323. sglang/srt/models/starcoder2.py +357 -0
  324. sglang/srt/models/step3_vl.py +1 -1
  325. sglang/srt/models/torch_native_llama.py +10 -3
  326. sglang/srt/models/utils.py +51 -0
  327. sglang/srt/multimodal/processors/base_processor.py +15 -7
  328. sglang/srt/multimodal/processors/dots_vlm.py +98 -0
  329. sglang/srt/multimodal/processors/glm4v.py +9 -9
  330. sglang/srt/multimodal/processors/internvl.py +153 -129
  331. sglang/srt/multimodal/processors/qwen_vl.py +23 -6
  332. sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
  333. sglang/srt/offloader.py +27 -3
  334. sglang/srt/{jinja_template_utils.py → parser/jinja_template_utils.py} +6 -0
  335. sglang/srt/{reasoning_parser.py → parser/reasoning_parser.py} +1 -1
  336. sglang/srt/sampling/sampling_batch_info.py +38 -17
  337. sglang/srt/sampling/sampling_params.py +7 -0
  338. sglang/srt/server_args.py +1030 -254
  339. sglang/srt/server_args_config_parser.py +146 -0
  340. sglang/srt/single_batch_overlap.py +151 -0
  341. sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
  342. sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
  343. sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
  344. sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
  345. sglang/srt/speculative/cpp_ngram/param.h +125 -0
  346. sglang/srt/speculative/cpp_ngram/queue.h +71 -0
  347. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +7 -1
  348. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +13 -2
  349. sglang/srt/speculative/{eagle_utils.py → eagle_info.py} +207 -757
  350. sglang/srt/speculative/eagle_worker.py +253 -136
  351. sglang/srt/speculative/ngram_utils.py +428 -0
  352. sglang/srt/speculative/ngram_worker.py +245 -0
  353. sglang/srt/speculative/spec_info.py +52 -0
  354. sglang/srt/speculative/spec_utils.py +606 -0
  355. sglang/srt/speculative/standalone_worker.py +109 -0
  356. sglang/srt/torch_memory_saver_adapter.py +5 -7
  357. sglang/srt/tracing/trace.py +578 -0
  358. sglang/srt/two_batch_overlap.py +8 -5
  359. sglang/srt/utils/__init__.py +2 -0
  360. sglang/srt/{utils.py → utils/common.py} +445 -77
  361. sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +53 -5
  362. sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
  363. sglang/srt/utils/rpd_utils.py +452 -0
  364. sglang/srt/utils/slow_rank_detector.py +71 -0
  365. sglang/srt/warmup.py +8 -4
  366. sglang/srt/weight_sync/utils.py +2 -2
  367. sglang/test/attention/test_trtllm_mla_backend.py +169 -5
  368. sglang/test/few_shot_gsm8k.py +1 -0
  369. sglang/test/get_logits_ut.py +57 -0
  370. sglang/test/run_eval.py +79 -11
  371. sglang/test/runners.py +5 -1
  372. sglang/test/simple_eval_common.py +5 -2
  373. sglang/test/simple_eval_mmmu_vlm.py +441 -0
  374. sglang/test/test_block_fp8.py +2 -2
  375. sglang/test/test_cutlass_moe.py +24 -6
  376. sglang/test/test_deterministic.py +297 -0
  377. sglang/test/test_disaggregation_utils.py +77 -0
  378. sglang/test/test_fp4_moe.py +370 -1
  379. sglang/test/test_programs.py +1 -1
  380. sglang/test/test_utils.py +383 -5
  381. sglang/utils.py +22 -1
  382. sglang/version.py +1 -1
  383. {sglang-0.5.2rc1.dist-info → sglang-0.5.3.dist-info}/METADATA +69 -124
  384. {sglang-0.5.2rc1.dist-info → sglang-0.5.3.dist-info}/RECORD +392 -258
  385. sglang/srt/disaggregation/launch_lb.py +0 -118
  386. sglang/srt/mem_cache/lora_radix_cache.py +0 -421
  387. sglang/srt/mem_cache/storage/mooncake_store/unit_test.py +0 -40
  388. /sglang/srt/{model_parallel.py → layers/model_parallel.py} +0 -0
  389. /sglang/srt/{code_completion_parser.py → parser/code_completion_parser.py} +0 -0
  390. /sglang/srt/{conversation.py → parser/conversation.py} +0 -0
  391. /sglang/srt/{harmony_parser.py → parser/harmony_parser.py} +0 -0
  392. /sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
  393. {sglang-0.5.2rc1.dist-info → sglang-0.5.3.dist-info}/WHEEL +0 -0
  394. {sglang-0.5.2rc1.dist-info → sglang-0.5.3.dist-info}/licenses/LICENSE +0 -0
  395. {sglang-0.5.2rc1.dist-info → sglang-0.5.3.dist-info}/top_level.txt +0 -0
sglang/bench_one_batch.py CHANGED
@@ -60,7 +60,6 @@ import torch.distributed as dist
60
60
  from sglang.srt.configs.model_config import ModelConfig
61
61
  from sglang.srt.distributed.parallel_state import destroy_distributed_environment
62
62
  from sglang.srt.entrypoints.engine import _set_envs_and_config
63
- from sglang.srt.hf_transformers_utils import get_tokenizer
64
63
  from sglang.srt.layers.moe import initialize_moe_config
65
64
  from sglang.srt.managers.schedule_batch import Req, ScheduleBatch
66
65
  from sglang.srt.managers.scheduler import Scheduler
@@ -78,6 +77,7 @@ from sglang.srt.utils import (
78
77
  set_gpu_proc_affinity,
79
78
  suppress_other_loggers,
80
79
  )
80
+ from sglang.srt.utils.hf_transformers_utils import get_tokenizer
81
81
 
82
82
 
83
83
  @dataclasses.dataclass
@@ -443,11 +443,9 @@ def latency_test_run_once(
443
443
 
444
444
  if profile:
445
445
  profiler.stop()
446
- profile_filename = f"{profile_filename_prefix}_batch{batch_size}_input{input_len}_output{output_len}_prefill.trace.json.gz"
447
- _save_profile_trace_results(profiler, profile_filename)
448
- rank_print(
449
- f"torch profiler chrome trace for prefill saved to {profile_filename}"
450
- )
446
+ trace_filename = f"{profile_filename_prefix}_batch{batch_size}_input{input_len}_output{output_len}_prefill.trace.json.gz"
447
+ _save_profile_trace_results(profiler, trace_filename)
448
+ rank_print(f"torch profiler chrome trace for prefill saved to {trace_filename}")
451
449
 
452
450
  # Decode
453
451
  decode_latencies = []
@@ -479,10 +477,10 @@ def latency_test_run_once(
479
477
 
480
478
  if profile and i == output_len / 2:
481
479
  profiler.stop()
482
- profile_filename = f"{profile_filename_prefix}_batch{batch_size}_input{input_len}_output{output_len}_decode.trace.json.gz"
483
- _save_profile_trace_results(profiler, profile_filename)
480
+ trace_filename = f"{profile_filename_prefix}_batch{batch_size}_input{input_len}_output{output_len}_decode.trace.json.gz"
481
+ _save_profile_trace_results(profiler, trace_filename)
484
482
  rank_print(
485
- f"torch profiler chrome trace for decoding 1 token saved to {profile_filename}"
483
+ f"torch profiler chrome trace for decoding 1 token saved to {trace_filename}"
486
484
  )
487
485
 
488
486
  # Record decode timing from 2nd output
@@ -9,6 +9,7 @@ python3 -m sglang.bench_one_batch_server --model meta-llama/Meta-Llama-3.1-8B --
9
9
 
10
10
  python3 -m sglang.bench_one_batch_server --model None --base-url http://localhost:30000 --batch-size 16 --input-len 1024 --output-len 8
11
11
  python3 -m sglang.bench_one_batch_server --model None --base-url http://localhost:30000 --batch-size 16 --input-len 1024 --output-len 8 --show-report --profile --profile-by-stage
12
+ python3 -m sglang.bench_one_batch_server --model None --base-url http://localhost:30000 --batch-size 16 --input-len 1024 --output-len 8 --output-path results.json --profile
12
13
  """
13
14
 
14
15
  import argparse
@@ -17,12 +18,19 @@ import itertools
17
18
  import json
18
19
  import multiprocessing
19
20
  import os
21
+ import random
20
22
  import time
21
- from typing import List, Tuple
23
+ from typing import List, Optional, Tuple
22
24
 
25
+ import numpy as np
23
26
  import requests
27
+ from pydantic import BaseModel
24
28
 
25
- from sglang.bench_serving import get_tokenizer, sample_random_requests
29
+ from sglang.bench_serving import (
30
+ get_tokenizer,
31
+ sample_mmmu_requests,
32
+ sample_random_requests,
33
+ )
26
34
  from sglang.profiler import run_profile
27
35
  from sglang.srt.entrypoints.http_server import launch_server
28
36
  from sglang.srt.server_args import ServerArgs
@@ -30,9 +38,112 @@ from sglang.srt.utils import is_blackwell, kill_process_tree
30
38
  from sglang.test.test_utils import is_in_ci, write_github_step_summary
31
39
 
32
40
 
41
+ class ProfileLinks(BaseModel):
42
+ """Pydantic model for profile trace links."""
43
+
44
+ extend: Optional[str] = None
45
+ decode: Optional[str] = None
46
+
47
+
48
+ class BenchmarkResult(BaseModel):
49
+ """Pydantic model for benchmark results table data, for a single isl and osl"""
50
+
51
+ model_path: str
52
+ run_name: str
53
+ batch_size: int
54
+ input_len: int
55
+ output_len: int
56
+ latency: float
57
+ ttft: float
58
+ input_throughput: float
59
+ output_throughput: float
60
+ overall_throughput: float
61
+ last_gen_throughput: float
62
+ acc_length: Optional[float] = None
63
+ profile_links: Optional[ProfileLinks] = None
64
+
65
+ @staticmethod
66
+ def help_str() -> str:
67
+ return f"""
68
+ Note: To view the traces through perfetto-ui, please:
69
+ 1. open with Google Chrome
70
+ 2. allow popup
71
+ """
72
+
73
+ def to_markdown_row(
74
+ self, trace_dir, base_url: str = "", relay_base: str = ""
75
+ ) -> str:
76
+ """Convert this benchmark result to a markdown table row."""
77
+ # Calculate costs (assuming H100 pricing for now)
78
+ hourly_cost_per_gpu = 2 # $2/hour for one H100
79
+ hourly_cost = hourly_cost_per_gpu * 1 # Assuming tp_size = 1 for simplicity
80
+ input_util = 0.7
81
+ accept_length = (
82
+ round(self.acc_length, 2) if self.acc_length is not None else "n/a"
83
+ )
84
+ itl = 1 / (self.output_throughput / self.batch_size) * 1000
85
+ input_cost = 1e6 / (self.input_throughput * input_util) / 3600 * hourly_cost
86
+ output_cost = 1e6 / self.output_throughput / 3600 * hourly_cost
87
+
88
+ def get_perfetto_relay_link_from_trace_file(trace_file: str):
89
+ import os
90
+ from urllib.parse import quote
91
+
92
+ rel_path = os.path.relpath(trace_file, trace_dir)
93
+ raw_file_link = f"{base_url}/{rel_path}"
94
+ relay_link = (
95
+ f"{relay_base}?src={quote(raw_file_link, safe='')}"
96
+ if relay_base and quote
97
+ else raw_file_link
98
+ )
99
+ return relay_link
100
+
101
+ # Handle profile links
102
+ profile_link = "NA | NA"
103
+ if self.profile_links:
104
+ if self.profile_links.extend or self.profile_links.decode:
105
+ # Create a combined link or use the first available one
106
+ trace_files = [self.profile_links.extend, self.profile_links.decode]
107
+ trace_files_relay_links = [
108
+ f"[trace]({get_perfetto_relay_link_from_trace_file(trace_file)})"
109
+ for trace_file in trace_files
110
+ ]
111
+
112
+ profile_link = " | ".join(trace_files_relay_links)
113
+
114
+ # Build the row
115
+ return f"| {self.batch_size} | {self.input_len} | {self.latency:.2f} | {self.input_throughput:.2f} | {self.output_throughput:.2f} | {accept_length} | {itl:.2f} | {input_cost:.2f} | {output_cost:.2f} | {profile_link} |\n"
116
+
117
+ @classmethod
118
+ def generate_markdown_report(
119
+ cls, trace_dir, results: List["BenchmarkResult"]
120
+ ) -> str:
121
+ """Generate a markdown report from a list of BenchmarkResult object from a single run."""
122
+ import os
123
+
124
+ summary = f"### {results[0].model_path}\n"
125
+
126
+ # summary += (
127
+ # f"Input lens: {result.input_len}. Output lens: {result.output_len}.\n"
128
+ # )
129
+ summary += "| batch size | input len | latency (s) | input throughput (tok/s) | output throughput (tok/s) | acc length | ITL (ms) | input cost ($/1M) | output cost ($/1M) | profile (extend) | profile (decode)|\n"
130
+ summary += "| ---------- | --------- | ----------- | ------------------------- | ------------------------- | ---------- | -------- | ----------------- | ------------------ | --------------- | -------------- |\n"
131
+
132
+ # all results should share the same isl & osl
133
+ for result in results:
134
+ base_url = os.getenv("TRACE_BASE_URL", "").rstrip("/")
135
+ relay_base = os.getenv("PERFETTO_RELAY_URL", "").rstrip("/")
136
+ relay_base = "https://docs.sglang.ai/ci-data/pages/perfetto_relay.html"
137
+ # base_url = "https://github.com/sgl-project/ci-data/traces"
138
+ summary += result.to_markdown_row(trace_dir, base_url, relay_base)
139
+
140
+ return summary
141
+
142
+
33
143
  @dataclasses.dataclass
34
144
  class BenchArgs:
35
145
  run_name: str = "default"
146
+ seed: int = 42
36
147
  batch_size: Tuple[int] = (1,)
37
148
  input_len: Tuple[int] = (1024,)
38
149
  output_len: Tuple[int] = (16,)
@@ -47,10 +158,17 @@ class BenchArgs:
47
158
  profile: bool = False
48
159
  profile_steps: int = 3
49
160
  profile_by_stage: bool = False
161
+ profile_filename_prefix: str = None
162
+ append_to_github_summary: bool = True
163
+ dataset_path: str = ""
164
+ parallel_batch: bool = False
165
+ dataset_name: str = "random"
166
+ output_path: Optional[str] = None
50
167
 
51
168
  @staticmethod
52
169
  def add_cli_args(parser: argparse.ArgumentParser):
53
170
  parser.add_argument("--run-name", type=str, default=BenchArgs.run_name)
171
+ parser.add_argument("--seed", type=int, default=BenchArgs.seed)
54
172
  parser.add_argument(
55
173
  "--batch-size", type=int, nargs="+", default=BenchArgs.batch_size
56
174
  )
@@ -61,6 +179,13 @@ class BenchArgs:
61
179
  "--output-len", type=int, nargs="+", default=BenchArgs.output_len
62
180
  )
63
181
  parser.add_argument("--temperature", type=float, default=BenchArgs.temperature)
182
+ parser.add_argument(
183
+ "--dataset-name",
184
+ type=str,
185
+ default=BenchArgs.dataset_name,
186
+ choices=["mmmu", "random"],
187
+ help="Name of the dataset to benchmark on.",
188
+ )
64
189
  parser.add_argument("--return-logprob", action="store_true")
65
190
  parser.add_argument(
66
191
  "--client-stream-interval",
@@ -83,14 +208,43 @@ class BenchArgs:
83
208
  "--profile-steps", type=int, default=BenchArgs.profile_steps
84
209
  )
85
210
  parser.add_argument("--profile-by-stage", action="store_true")
211
+ parser.add_argument(
212
+ "--dataset-path",
213
+ type=str,
214
+ default=BenchArgs.dataset_path,
215
+ help="Path to the dataset.",
216
+ )
217
+ parser.add_argument("--parallel-batch", action="store_true")
218
+ parser.add_argument(
219
+ "--profile-filename-prefix",
220
+ type=str,
221
+ default=BenchArgs.profile_filename_prefix,
222
+ )
223
+ parser.add_argument(
224
+ "--no-append-to-github-summary",
225
+ action="store_false",
226
+ dest="append_to_github_summary",
227
+ help="Disable appending the output of this run to github ci summary",
228
+ )
229
+ parser.add_argument(
230
+ "--output-path",
231
+ type=str,
232
+ default=BenchArgs.output_path,
233
+ help="Path to save benchmark results as JSON format. If not specified, results will only be saved to result-filename.",
234
+ )
86
235
 
87
236
  @classmethod
88
237
  def from_cli_args(cls, args: argparse.Namespace):
89
238
  # use the default value's type to cast the args into correct types.
90
239
  attrs = [(attr.name, type(attr.default)) for attr in dataclasses.fields(cls)]
91
- return cls(
92
- **{attr: attr_type(getattr(args, attr)) for attr, attr_type in attrs}
93
- )
240
+ kwargs = {}
241
+ for attr, attr_type in attrs:
242
+ val = getattr(args, attr)
243
+ if attr_type is type(None):
244
+ kwargs[attr] = val
245
+ else:
246
+ kwargs[attr] = attr_type(val)
247
+ return cls(**kwargs)
94
248
 
95
249
 
96
250
  def launch_server_internal(server_args):
@@ -135,21 +289,35 @@ def run_one_case(
135
289
  run_name: str,
136
290
  result_filename: str,
137
291
  tokenizer,
292
+ dataset_name="",
138
293
  profile: bool = False,
139
294
  profile_steps: int = 3,
140
295
  profile_by_stage: bool = False,
296
+ profile_filename_prefix: str = None,
297
+ dataset_path: str = "",
298
+ parallel_batch: bool = False,
141
299
  ):
142
300
  requests.post(url + "/flush_cache")
143
- input_requests = sample_random_requests(
144
- input_len=input_len,
145
- output_len=output_len,
146
- num_prompts=batch_size,
147
- range_ratio=1.0,
148
- tokenizer=tokenizer,
149
- dataset_path="",
150
- random_sample=True,
151
- return_text=False,
152
- )
301
+ # TODO: reuse bench_serving.get_dataset ?
302
+ if dataset_name == "mmmu":
303
+ input_requests = sample_mmmu_requests(
304
+ num_requests=batch_size,
305
+ tokenizer=tokenizer,
306
+ fixed_output_len=output_len,
307
+ apply_chat_template=True,
308
+ random_sample=False,
309
+ )
310
+ elif dataset_name == "random":
311
+ input_requests = sample_random_requests(
312
+ input_len=input_len,
313
+ output_len=output_len,
314
+ num_prompts=batch_size,
315
+ range_ratio=1.0,
316
+ tokenizer=tokenizer,
317
+ dataset_path=dataset_path,
318
+ random_sample=True,
319
+ return_text=False,
320
+ )
153
321
 
154
322
  use_structured_outputs = False
155
323
  if use_structured_outputs:
@@ -166,25 +334,48 @@ def run_one_case(
166
334
 
167
335
  profile_link = None
168
336
  if profile:
337
+ output_dir, profile_name = None, None
338
+ if profile_filename_prefix:
339
+ output_dir = os.path.dirname(profile_filename_prefix)
340
+ profile_name = os.path.basename(profile_filename_prefix)
169
341
  profile_link: str = run_profile(
170
- url, profile_steps, ["CPU", "GPU"], None, None, profile_by_stage
342
+ url,
343
+ profile_steps,
344
+ ["CPU", "GPU"],
345
+ output_dir,
346
+ profile_name,
347
+ profile_by_stage,
171
348
  )
172
349
 
173
350
  tic = time.perf_counter()
351
+
352
+ payload = {
353
+ "sampling_params": {
354
+ "temperature": temperature,
355
+ "max_new_tokens": output_len,
356
+ "ignore_eos": True,
357
+ "json_schema": json_schema,
358
+ "stream_interval": stream_interval,
359
+ },
360
+ "return_logprob": return_logprob,
361
+ "stream": True,
362
+ **({"parallel_batch": parallel_batch} if parallel_batch else {}),
363
+ }
364
+ if dataset_name == "mmmu":
365
+ # vlm
366
+ input_ids = []
367
+ for input_req in input_requests:
368
+ input_ids += [tokenizer.encode(input_req.prompt)]
369
+ payload["image_data"] = [req.image_data for req in input_requests]
370
+
371
+ else:
372
+ input_ids = [req.prompt for req in input_requests]
373
+
374
+ payload["input_ids"] = input_ids
375
+
174
376
  response = requests.post(
175
377
  url + "/generate",
176
- json={
177
- "input_ids": [req.prompt for req in input_requests],
178
- "sampling_params": {
179
- "temperature": temperature,
180
- "max_new_tokens": output_len,
181
- "ignore_eos": True,
182
- "json_schema": json_schema,
183
- "stream_interval": stream_interval,
184
- },
185
- "return_logprob": return_logprob,
186
- "stream": True,
187
- },
378
+ json=payload,
188
379
  stream=True,
189
380
  )
190
381
 
@@ -248,10 +439,100 @@ def run_one_case(
248
439
  overall_throughput,
249
440
  last_gen_throughput,
250
441
  acc_length,
251
- profile_link if profile else None,
442
+ profile_link,
252
443
  )
253
444
 
254
445
 
446
+ def save_results_as_json(result: List[Tuple], bench_args: BenchArgs, model: str):
447
+ """Save benchmark results as JSON using Pydantic models."""
448
+ json_results = []
449
+
450
+ # Generate all parameter combinations to match with results
451
+ param_combinations = list(
452
+ itertools.product(
453
+ bench_args.batch_size, bench_args.input_len, bench_args.output_len
454
+ )
455
+ )
456
+
457
+ for i, (
458
+ batch_size,
459
+ latency,
460
+ ttft,
461
+ input_throughput,
462
+ output_throughput,
463
+ overall_throughput,
464
+ last_gen_throughput,
465
+ acc_length,
466
+ profile_link,
467
+ ) in enumerate(result):
468
+ # Get the corresponding parameters for this result
469
+ bs, input_len, output_len = param_combinations[i]
470
+
471
+ # Parse profile links if available
472
+ profile_links = None
473
+ if profile_link:
474
+ profile_links = parse_profile_links(
475
+ profile_link, batch_size, input_len, output_len
476
+ )
477
+
478
+ benchmark_result = BenchmarkResult(
479
+ model_path=model,
480
+ run_name=bench_args.run_name,
481
+ batch_size=batch_size,
482
+ input_len=input_len,
483
+ output_len=output_len,
484
+ latency=latency,
485
+ ttft=ttft,
486
+ input_throughput=input_throughput,
487
+ output_throughput=output_throughput,
488
+ overall_throughput=overall_throughput,
489
+ last_gen_throughput=last_gen_throughput,
490
+ acc_length=acc_length,
491
+ profile_links=profile_links,
492
+ )
493
+ json_results.append(benchmark_result.model_dump())
494
+
495
+ # Save to JSON file
496
+ with open(bench_args.output_path, "w", encoding="utf-8") as f:
497
+ json.dump(json_results, f, indent=2, ensure_ascii=False)
498
+
499
+ print(f"Results saved as JSON to {bench_args.output_path}")
500
+
501
+
502
+ def parse_profile_links(
503
+ profile_dir: str, batch_size: int, input_len: int, output_len: int
504
+ ) -> Optional[ProfileLinks]:
505
+ """Parse profile directory to extract extend and decode trace file links."""
506
+ if not profile_dir or not os.path.exists(profile_dir):
507
+ return None
508
+
509
+ extend_link = None
510
+ decode_link = None
511
+
512
+ # Look for extend/prefill trace files
513
+ for file in os.listdir(profile_dir):
514
+ if file.endswith(".trace.json.gz") or file.endswith(".trace.json"):
515
+ if "extend" in file.lower() or "prefill" in file.lower():
516
+ extend_link = os.path.join(profile_dir, file)
517
+ elif "decode" in file.lower():
518
+ decode_link = os.path.join(profile_dir, file)
519
+
520
+ # If no specific extend/decode files found, try to find files with batch/input/output info
521
+ if not extend_link or not decode_link:
522
+ for file in os.listdir(profile_dir):
523
+ if file.endswith(".trace.json.gz") or file.endswith(".trace.json"):
524
+ if f"_batch{batch_size}_input{input_len}_output{output_len}_" in file:
525
+ if "prefill" in file.lower() or "extend" in file.lower():
526
+ extend_link = os.path.join(profile_dir, file)
527
+ elif "decode" in file.lower():
528
+ decode_link = os.path.join(profile_dir, file)
529
+
530
+ if extend_link or decode_link:
531
+ return ProfileLinks(extend=extend_link, decode=decode_link)
532
+
533
+ return None
534
+
535
+
255
536
  def get_report_summary(
256
537
  result: List[Tuple], server_args: ServerArgs, bench_args: BenchArgs
257
538
  ):
@@ -342,9 +623,12 @@ def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
342
623
  return_logprob=bench_args.return_logprob,
343
624
  stream_interval=bench_args.client_stream_interval,
344
625
  input_len_step_percentage=bench_args.input_len_step_percentage,
626
+ dataset_name=bench_args.dataset_name,
345
627
  run_name="",
346
628
  result_filename="",
347
629
  tokenizer=tokenizer,
630
+ dataset_path=bench_args.dataset_path,
631
+ parallel_batch=bench_args.parallel_batch,
348
632
  )
349
633
  print("=" * 8 + " Warmup End " + "=" * 8 + "\n")
350
634
 
@@ -366,8 +650,12 @@ def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
366
650
  stream_interval=bench_args.client_stream_interval,
367
651
  input_len_step_percentage=bench_args.input_len_step_percentage,
368
652
  run_name=bench_args.run_name,
653
+ dataset_name=bench_args.dataset_name,
369
654
  result_filename=bench_args.result_filename,
370
655
  tokenizer=tokenizer,
656
+ dataset_path=bench_args.dataset_path,
657
+ parallel_batch=bench_args.parallel_batch,
658
+ profile_filename_prefix=bench_args.profile_filename_prefix,
371
659
  )
372
660
  )
373
661
 
@@ -390,9 +678,13 @@ def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
390
678
  run_name=bench_args.run_name,
391
679
  result_filename=bench_args.result_filename,
392
680
  tokenizer=tokenizer,
681
+ dataset_name=bench_args.dataset_name,
393
682
  profile=bench_args.profile,
394
683
  profile_steps=bench_args.profile_steps,
395
684
  profile_by_stage=bench_args.profile_by_stage,
685
+ dataset_path=bench_args.dataset_path,
686
+ parallel_batch=bench_args.parallel_batch,
687
+ profile_filename_prefix=bench_args.profile_filename_prefix,
396
688
  )[-1],
397
689
  )
398
690
  )
@@ -405,13 +697,16 @@ def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
405
697
 
406
698
  print(f"\nResults are saved to {bench_args.result_filename}")
407
699
 
700
+ # Save results as JSON if output_path is specified
701
+ if bench_args.output_path:
702
+ save_results_as_json(result, bench_args, model=server_args.model_path)
703
+
408
704
  if not bench_args.show_report:
409
705
  return
410
706
 
411
707
  summary = get_report_summary(result, server_args, bench_args)
412
- print(summary)
413
708
 
414
- if is_in_ci():
709
+ if is_in_ci() and bench_args.append_to_github_summary:
415
710
  write_github_step_summary(summary)
416
711
 
417
712
 
@@ -420,6 +715,10 @@ def main():
420
715
  ServerArgs.add_cli_args(parser)
421
716
  BenchArgs.add_cli_args(parser)
422
717
  args = parser.parse_args()
718
+
719
+ random.seed(args.seed)
720
+ np.random.seed(args.seed)
721
+
423
722
  server_args = ServerArgs.from_cli_args(args)
424
723
  bench_args = BenchArgs.from_cli_args(args)
425
724