sglang 0.5.2rc2__py3-none-any.whl → 0.5.3.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (396)
  1. sglang/bench_one_batch.py +7 -11
  2. sglang/bench_one_batch_server.py +330 -31
  3. sglang/bench_serving.py +474 -142
  4. sglang/compile_deep_gemm.py +3 -0
  5. sglang/global_config.py +2 -2
  6. sglang/lang/backend/runtime_endpoint.py +1 -1
  7. sglang/profiler.py +2 -2
  8. sglang/srt/batch_invariant_ops/__init__.py +27 -0
  9. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +549 -0
  10. sglang/srt/configs/__init__.py +10 -0
  11. sglang/srt/configs/device_config.py +3 -1
  12. sglang/srt/configs/dots_ocr.py +64 -0
  13. sglang/srt/configs/dots_vlm.py +139 -0
  14. sglang/srt/configs/falcon_h1.py +314 -0
  15. sglang/srt/configs/load_config.py +9 -0
  16. sglang/srt/configs/mamba_utils.py +117 -0
  17. sglang/srt/configs/model_config.py +228 -92
  18. sglang/srt/configs/nemotron_h.py +286 -0
  19. sglang/srt/configs/qwen3_next.py +294 -0
  20. sglang/srt/configs/qwen3_vl.py +586 -0
  21. sglang/srt/connector/__init__.py +8 -1
  22. sglang/srt/connector/remote_instance.py +82 -0
  23. sglang/srt/constrained/base_grammar_backend.py +49 -12
  24. sglang/srt/constrained/llguidance_backend.py +0 -1
  25. sglang/srt/constrained/outlines_backend.py +0 -1
  26. sglang/srt/constrained/outlines_jump_forward.py +1 -1
  27. sglang/srt/constrained/xgrammar_backend.py +30 -9
  28. sglang/srt/custom_op.py +11 -1
  29. sglang/srt/debug_utils/dump_comparator.py +81 -44
  30. sglang/srt/debug_utils/dump_loader.py +97 -0
  31. sglang/srt/debug_utils/dumper.py +21 -6
  32. sglang/srt/debug_utils/text_comparator.py +73 -11
  33. sglang/srt/disaggregation/ascend/conn.py +2 -2
  34. sglang/srt/disaggregation/ascend/transfer_engine.py +47 -9
  35. sglang/srt/disaggregation/base/conn.py +1 -1
  36. sglang/srt/disaggregation/common/conn.py +279 -108
  37. sglang/srt/disaggregation/decode.py +78 -37
  38. sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
  39. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +29 -17
  40. sglang/srt/disaggregation/fake/conn.py +1 -1
  41. sglang/srt/disaggregation/mini_lb.py +6 -445
  42. sglang/srt/disaggregation/mooncake/conn.py +55 -537
  43. sglang/srt/disaggregation/nixl/conn.py +373 -68
  44. sglang/srt/disaggregation/prefill.py +53 -49
  45. sglang/srt/disaggregation/utils.py +40 -54
  46. sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
  47. sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
  48. sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
  49. sglang/srt/distributed/parallel_state.py +156 -80
  50. sglang/srt/entrypoints/engine.py +59 -18
  51. sglang/srt/entrypoints/grpc_request_manager.py +842 -0
  52. sglang/srt/entrypoints/grpc_server.py +950 -0
  53. sglang/srt/entrypoints/http_server.py +179 -60
  54. sglang/srt/entrypoints/openai/protocol.py +265 -29
  55. sglang/srt/entrypoints/openai/serving_base.py +65 -3
  56. sglang/srt/entrypoints/openai/serving_chat.py +213 -122
  57. sglang/srt/entrypoints/openai/serving_completions.py +14 -3
  58. sglang/srt/entrypoints/openai/serving_embedding.py +9 -3
  59. sglang/srt/entrypoints/openai/serving_rerank.py +3 -1
  60. sglang/srt/entrypoints/openai/serving_responses.py +48 -3
  61. sglang/srt/entrypoints/openai/serving_score.py +1 -0
  62. sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
  63. sglang/srt/environ.py +289 -0
  64. sglang/srt/eplb/eplb_manager.py +2 -2
  65. sglang/srt/eplb/expert_distribution.py +26 -13
  66. sglang/srt/eplb/expert_location.py +38 -8
  67. sglang/srt/eplb/expert_location_updater.py +1 -1
  68. sglang/srt/function_call/base_format_detector.py +3 -6
  69. sglang/srt/function_call/ebnf_composer.py +11 -9
  70. sglang/srt/function_call/function_call_parser.py +17 -8
  71. sglang/srt/function_call/glm4_moe_detector.py +4 -4
  72. sglang/srt/function_call/gpt_oss_detector.py +23 -0
  73. sglang/srt/function_call/json_array_parser.py +63 -0
  74. sglang/srt/function_call/kimik2_detector.py +17 -4
  75. sglang/srt/function_call/qwen3_coder_detector.py +1 -1
  76. sglang/srt/function_call/utils.py +96 -5
  77. sglang/srt/grpc/__init__.py +1 -0
  78. sglang/srt/grpc/compile_proto.py +245 -0
  79. sglang/srt/grpc/sglang_scheduler_pb2.py +119 -0
  80. sglang/srt/grpc/sglang_scheduler_pb2.pyi +492 -0
  81. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +327 -0
  82. sglang/srt/layers/activation.py +143 -9
  83. sglang/srt/layers/attention/aiter_backend.py +14 -15
  84. sglang/srt/layers/attention/ascend_backend.py +115 -9
  85. sglang/srt/layers/attention/attention_registry.py +215 -0
  86. sglang/srt/layers/attention/base_attn_backend.py +12 -3
  87. sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
  88. sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
  89. sglang/srt/layers/attention/fla/chunk.py +242 -0
  90. sglang/srt/layers/attention/fla/chunk_delta_h.py +314 -0
  91. sglang/srt/layers/attention/fla/chunk_o.py +178 -0
  92. sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +151 -0
  93. sglang/srt/layers/attention/fla/cumsum.py +300 -0
  94. sglang/srt/layers/attention/fla/fused_recurrent.py +640 -0
  95. sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +232 -0
  96. sglang/srt/layers/attention/fla/index.py +37 -0
  97. sglang/srt/layers/attention/fla/l2norm.py +150 -0
  98. sglang/srt/layers/attention/fla/layernorm_gated.py +343 -0
  99. sglang/srt/layers/attention/fla/op.py +66 -0
  100. sglang/srt/layers/attention/fla/solve_tril.py +465 -0
  101. sglang/srt/layers/attention/fla/utils.py +331 -0
  102. sglang/srt/layers/attention/fla/wy_fast.py +158 -0
  103. sglang/srt/layers/attention/flashattention_backend.py +40 -8
  104. sglang/srt/layers/attention/flashinfer_backend.py +341 -204
  105. sglang/srt/layers/attention/flashinfer_mla_backend.py +28 -28
  106. sglang/srt/layers/attention/flashmla_backend.py +7 -5
  107. sglang/srt/layers/attention/hybrid_attn_backend.py +68 -53
  108. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +708 -0
  109. sglang/srt/layers/attention/intel_amx_backend.py +3 -0
  110. sglang/srt/layers/attention/mamba/causal_conv1d.py +129 -0
  111. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +974 -0
  112. sglang/srt/layers/attention/mamba/mamba.py +577 -0
  113. sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
  114. sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
  115. sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
  116. sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
  117. sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
  118. sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +214 -0
  119. sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +562 -0
  120. sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +646 -0
  121. sglang/srt/layers/attention/mamba/ops/ssd_combined.py +262 -0
  122. sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +264 -0
  123. sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
  124. sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
  125. sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
  126. sglang/srt/layers/attention/nsa/nsa_indexer.py +761 -0
  127. sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
  128. sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
  129. sglang/srt/layers/attention/nsa/transform_index.py +144 -0
  130. sglang/srt/layers/attention/nsa/utils.py +24 -0
  131. sglang/srt/layers/attention/nsa_backend.py +887 -0
  132. sglang/srt/layers/attention/tbo_backend.py +6 -6
  133. sglang/srt/layers/attention/torch_flex_backend.py +325 -0
  134. sglang/srt/layers/attention/torch_native_backend.py +12 -6
  135. sglang/srt/layers/attention/triton_backend.py +57 -7
  136. sglang/srt/layers/attention/trtllm_mha_backend.py +5 -7
  137. sglang/srt/layers/attention/trtllm_mla_backend.py +276 -39
  138. sglang/srt/layers/attention/vision.py +58 -0
  139. sglang/srt/layers/attention/wave_backend.py +4 -4
  140. sglang/srt/layers/attention/wave_ops/decode_attention.py +2 -4
  141. sglang/srt/layers/attention/wave_ops/extend_attention.py +1 -3
  142. sglang/srt/layers/communicator.py +8 -0
  143. sglang/srt/layers/dp_attention.py +41 -2
  144. sglang/srt/layers/elementwise.py +3 -1
  145. sglang/srt/layers/layernorm.py +34 -15
  146. sglang/srt/layers/linear.py +55 -7
  147. sglang/srt/layers/logits_processor.py +180 -18
  148. sglang/srt/layers/modelopt_utils.py +11 -0
  149. sglang/srt/layers/moe/__init__.py +2 -1
  150. sglang/srt/layers/moe/cutlass_w4a8_moe.py +21 -24
  151. sglang/srt/layers/moe/ep_moe/kernels.py +33 -454
  152. sglang/srt/layers/moe/ep_moe/layer.py +248 -333
  153. sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +183 -0
  154. sglang/srt/layers/moe/fused_moe_native.py +5 -3
  155. sglang/srt/layers/moe/fused_moe_triton/configs/{triton_3_4_0/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json } +35 -35
  156. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=352,device_name=NVIDIA_RTX_5880_Ada_Generation,dtype=fp8_w8a8.json +146 -0
  157. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  158. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=512,device_name=NVIDIA_H20.json +146 -0
  159. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  160. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
  161. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
  162. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  163. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
  164. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
  165. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
  166. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  167. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
  168. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -2
  169. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +7 -3
  170. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +23 -20
  171. sglang/srt/layers/moe/fused_moe_triton/layer.py +68 -72
  172. sglang/srt/layers/moe/moe_runner/__init__.py +2 -1
  173. sglang/srt/layers/moe/moe_runner/base.py +274 -1
  174. sglang/srt/layers/moe/moe_runner/deep_gemm.py +304 -0
  175. sglang/srt/layers/moe/moe_runner/runner.py +83 -0
  176. sglang/srt/layers/moe/moe_runner/triton.py +448 -0
  177. sglang/srt/layers/moe/token_dispatcher/__init__.py +16 -4
  178. sglang/srt/layers/moe/token_dispatcher/{base_dispatcher.py → base.py} +67 -17
  179. sglang/srt/layers/moe/token_dispatcher/deepep.py +118 -56
  180. sglang/srt/layers/moe/token_dispatcher/standard.py +44 -2
  181. sglang/srt/layers/moe/topk.py +30 -9
  182. sglang/srt/layers/moe/utils.py +29 -7
  183. sglang/srt/layers/parameter.py +23 -6
  184. sglang/srt/layers/quantization/__init__.py +1 -1
  185. sglang/srt/layers/quantization/awq.py +19 -7
  186. sglang/srt/layers/quantization/base_config.py +11 -6
  187. sglang/srt/layers/quantization/blockwise_int8.py +38 -27
  188. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +1 -0
  189. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +50 -30
  190. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +2 -0
  191. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +13 -1
  192. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
  193. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +2 -10
  194. sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +27 -0
  195. sglang/srt/layers/quantization/fp8.py +155 -60
  196. sglang/srt/layers/quantization/fp8_utils.py +51 -32
  197. sglang/srt/layers/quantization/gptq.py +25 -17
  198. sglang/srt/layers/quantization/modelopt_quant.py +191 -56
  199. sglang/srt/layers/quantization/moe_wna16.py +21 -18
  200. sglang/srt/layers/quantization/mxfp4.py +74 -42
  201. sglang/srt/layers/quantization/quark/quark.py +3 -1
  202. sglang/srt/layers/quantization/quark/quark_moe.py +48 -30
  203. sglang/srt/layers/quantization/unquant.py +135 -47
  204. sglang/srt/layers/quantization/w4afp8.py +28 -33
  205. sglang/srt/layers/quantization/w8a8_fp8.py +35 -20
  206. sglang/srt/layers/quantization/w8a8_int8.py +91 -41
  207. sglang/srt/layers/rotary_embedding.py +78 -31
  208. sglang/srt/layers/sampler.py +213 -21
  209. sglang/srt/layers/utils.py +23 -0
  210. sglang/srt/lora/backend/base_backend.py +50 -8
  211. sglang/srt/lora/backend/chunked_backend.py +348 -0
  212. sglang/srt/lora/backend/triton_backend.py +99 -5
  213. sglang/srt/lora/layers.py +32 -0
  214. sglang/srt/lora/lora.py +8 -3
  215. sglang/srt/lora/lora_manager.py +44 -118
  216. sglang/srt/lora/mem_pool.py +25 -11
  217. sglang/srt/lora/triton_ops/__init__.py +4 -0
  218. sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
  219. sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +174 -0
  220. sglang/srt/lora/utils.py +22 -11
  221. sglang/srt/managers/async_dynamic_batch_tokenizer.py +170 -0
  222. sglang/srt/managers/cache_controller.py +199 -301
  223. sglang/srt/managers/data_parallel_controller.py +115 -80
  224. sglang/srt/managers/detokenizer_manager.py +19 -15
  225. sglang/srt/managers/disagg_service.py +46 -0
  226. sglang/srt/managers/io_struct.py +340 -109
  227. sglang/srt/managers/mm_utils.py +44 -6
  228. sglang/srt/managers/multi_tokenizer_mixin.py +357 -407
  229. sglang/srt/managers/multimodal_processor.py +1 -2
  230. sglang/srt/managers/overlap_utils.py +55 -0
  231. sglang/srt/managers/schedule_batch.py +343 -212
  232. sglang/srt/managers/schedule_policy.py +145 -18
  233. sglang/srt/managers/scheduler.py +653 -273
  234. sglang/srt/managers/scheduler_input_blocker.py +1 -1
  235. sglang/srt/managers/scheduler_metrics_mixin.py +99 -126
  236. sglang/srt/managers/scheduler_output_processor_mixin.py +255 -108
  237. sglang/srt/managers/scheduler_profiler_mixin.py +6 -6
  238. sglang/srt/managers/scheduler_update_weights_mixin.py +7 -0
  239. sglang/srt/managers/tokenizer_communicator_mixin.py +675 -0
  240. sglang/srt/managers/tokenizer_manager.py +579 -674
  241. sglang/srt/managers/tp_worker.py +96 -26
  242. sglang/srt/managers/utils.py +1 -45
  243. sglang/srt/mem_cache/allocator.py +21 -22
  244. sglang/srt/mem_cache/allocator_ascend.py +41 -27
  245. sglang/srt/mem_cache/base_prefix_cache.py +1 -1
  246. sglang/srt/mem_cache/chunk_cache.py +9 -2
  247. sglang/srt/mem_cache/evict_policy.py +23 -0
  248. sglang/srt/mem_cache/hicache_storage.py +43 -24
  249. sglang/srt/mem_cache/hiradix_cache.py +222 -75
  250. sglang/srt/mem_cache/memory_pool.py +651 -80
  251. sglang/srt/mem_cache/memory_pool_host.py +239 -228
  252. sglang/srt/mem_cache/radix_cache.py +227 -73
  253. sglang/srt/mem_cache/radix_cache_cpp.py +11 -8
  254. sglang/srt/mem_cache/storage/__init__.py +10 -0
  255. sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +151 -0
  256. sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +109 -0
  257. sglang/srt/mem_cache/storage/backend_factory.py +223 -0
  258. sglang/srt/mem_cache/storage/eic/eic_storage.py +778 -0
  259. sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
  260. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +164 -0
  261. sglang/srt/mem_cache/storage/hf3fs/{client_hf3fs.py → hf3fs_usrbio_client.py} +5 -1
  262. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +259 -62
  263. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +284 -0
  264. sglang/srt/mem_cache/storage/lmcache/unit_test.py +121 -0
  265. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +166 -17
  266. sglang/srt/mem_cache/swa_radix_cache.py +93 -48
  267. sglang/srt/metrics/collector.py +511 -132
  268. sglang/srt/metrics/func_timer.py +2 -7
  269. sglang/srt/metrics/startup_func_log_and_timer.py +150 -0
  270. sglang/srt/metrics/utils.py +8 -1
  271. sglang/srt/model_executor/cpu_graph_runner.py +640 -0
  272. sglang/srt/model_executor/cuda_graph_runner.py +52 -37
  273. sglang/srt/model_executor/forward_batch_info.py +74 -46
  274. sglang/srt/model_executor/model_runner.py +455 -176
  275. sglang/srt/model_executor/npu_graph_runner.py +12 -5
  276. sglang/srt/model_loader/__init__.py +10 -4
  277. sglang/srt/model_loader/loader.py +319 -10
  278. sglang/srt/model_loader/remote_instance_weight_loader_utils.py +69 -0
  279. sglang/srt/model_loader/weight_utils.py +161 -3
  280. sglang/srt/models/apertus.py +686 -0
  281. sglang/srt/models/bailing_moe.py +820 -217
  282. sglang/srt/models/bailing_moe_nextn.py +168 -0
  283. sglang/srt/models/deepseek_nextn.py +6 -1
  284. sglang/srt/models/deepseek_v2.py +607 -130
  285. sglang/srt/models/dots_ocr.py +173 -0
  286. sglang/srt/models/dots_vlm.py +174 -0
  287. sglang/srt/models/dots_vlm_vit.py +337 -0
  288. sglang/srt/models/ernie4.py +1 -1
  289. sglang/srt/models/falcon_h1.py +578 -0
  290. sglang/srt/models/gemma3_causal.py +0 -2
  291. sglang/srt/models/gemma3_mm.py +17 -1
  292. sglang/srt/models/gemma3n_mm.py +2 -2
  293. sglang/srt/models/glm4_moe.py +4 -4
  294. sglang/srt/models/glm4_moe_nextn.py +2 -2
  295. sglang/srt/models/glm4v.py +5 -3
  296. sglang/srt/models/glm4v_moe.py +4 -1
  297. sglang/srt/models/gpt_oss.py +8 -31
  298. sglang/srt/models/grok.py +5 -13
  299. sglang/srt/models/kimi_vl_moonvit.py +2 -2
  300. sglang/srt/models/llama.py +4 -0
  301. sglang/srt/models/llama4.py +9 -0
  302. sglang/srt/models/llama_eagle3.py +13 -0
  303. sglang/srt/models/longcat_flash.py +3 -3
  304. sglang/srt/models/longcat_flash_nextn.py +1 -1
  305. sglang/srt/models/mixtral.py +1 -3
  306. sglang/srt/models/mllama4.py +50 -4
  307. sglang/srt/models/nemotron_h.py +514 -0
  308. sglang/srt/models/opt.py +637 -0
  309. sglang/srt/models/qwen2_5_vl.py +29 -5
  310. sglang/srt/models/qwen2_audio.py +1 -1
  311. sglang/srt/models/qwen2_moe.py +120 -13
  312. sglang/srt/models/qwen2_vl.py +1 -1
  313. sglang/srt/models/qwen3.py +18 -3
  314. sglang/srt/models/qwen3_moe.py +32 -4
  315. sglang/srt/models/qwen3_next.py +1069 -0
  316. sglang/srt/models/qwen3_next_mtp.py +112 -0
  317. sglang/srt/models/qwen3_vl.py +787 -0
  318. sglang/srt/models/qwen3_vl_moe.py +471 -0
  319. sglang/srt/models/registry.py +15 -3
  320. sglang/srt/models/sarashina2_vision.py +269 -0
  321. sglang/srt/models/solar.py +505 -0
  322. sglang/srt/models/starcoder2.py +357 -0
  323. sglang/srt/models/step3_vl.py +1 -1
  324. sglang/srt/models/torch_native_llama.py +9 -2
  325. sglang/srt/models/utils.py +55 -0
  326. sglang/srt/multimodal/processors/base_processor.py +15 -7
  327. sglang/srt/multimodal/processors/dots_vlm.py +98 -0
  328. sglang/srt/multimodal/processors/glm4v.py +9 -9
  329. sglang/srt/multimodal/processors/internvl.py +153 -129
  330. sglang/srt/multimodal/processors/qwen_vl.py +23 -6
  331. sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
  332. sglang/srt/offloader.py +27 -3
  333. sglang/srt/parser/jinja_template_utils.py +6 -0
  334. sglang/srt/sampling/sampling_batch_info.py +49 -26
  335. sglang/srt/sampling/sampling_params.py +7 -0
  336. sglang/srt/server_args.py +1051 -285
  337. sglang/srt/server_args_config_parser.py +146 -0
  338. sglang/srt/single_batch_overlap.py +151 -0
  339. sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
  340. sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
  341. sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
  342. sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
  343. sglang/srt/speculative/cpp_ngram/param.h +125 -0
  344. sglang/srt/speculative/cpp_ngram/queue.h +71 -0
  345. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +7 -1
  346. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +13 -2
  347. sglang/srt/speculative/{eagle_utils.py → eagle_info.py} +207 -757
  348. sglang/srt/speculative/eagle_worker.py +98 -29
  349. sglang/srt/speculative/ngram_info.py +428 -0
  350. sglang/srt/speculative/ngram_worker.py +246 -0
  351. sglang/srt/speculative/spec_info.py +52 -0
  352. sglang/srt/speculative/spec_utils.py +605 -0
  353. sglang/srt/speculative/standalone_worker.py +109 -0
  354. sglang/srt/torch_memory_saver_adapter.py +5 -7
  355. sglang/srt/tracing/trace.py +578 -0
  356. sglang/srt/two_batch_overlap.py +9 -5
  357. sglang/srt/utils/__init__.py +2 -0
  358. sglang/srt/{utils.py → utils/common.py} +451 -77
  359. sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +55 -5
  360. sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
  361. sglang/srt/utils/rpd_utils.py +452 -0
  362. sglang/srt/utils/slow_rank_detector.py +71 -0
  363. sglang/srt/warmup.py +8 -4
  364. sglang/srt/weight_sync/utils.py +2 -2
  365. sglang/test/attention/test_trtllm_mla_backend.py +169 -5
  366. sglang/test/get_logits_ut.py +57 -0
  367. sglang/test/longbench_v2/__init__.py +1 -0
  368. sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
  369. sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
  370. sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
  371. sglang/test/run_eval.py +119 -11
  372. sglang/test/runners.py +5 -1
  373. sglang/test/simple_eval_common.py +5 -2
  374. sglang/test/simple_eval_longbench_v2.py +332 -0
  375. sglang/test/simple_eval_mmmu_vlm.py +441 -0
  376. sglang/test/test_block_fp8.py +2 -2
  377. sglang/test/test_cutlass_moe.py +24 -6
  378. sglang/test/test_cutlass_w4a8_moe.py +9 -19
  379. sglang/test/test_deterministic.py +313 -0
  380. sglang/test/test_deterministic_utils.py +81 -0
  381. sglang/test/test_disaggregation_utils.py +140 -0
  382. sglang/test/test_fp4_moe.py +370 -1
  383. sglang/test/test_programs.py +1 -1
  384. sglang/test/test_utils.py +407 -8
  385. sglang/utils.py +21 -1
  386. sglang/version.py +1 -1
  387. {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/METADATA +69 -124
  388. {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/RECORD +392 -251
  389. sglang/srt/disaggregation/launch_lb.py +0 -118
  390. sglang/srt/managers/tp_worker_overlap_thread.py +0 -296
  391. sglang/srt/mem_cache/lora_radix_cache.py +0 -421
  392. sglang/test/test_block_fp8_ep.py +0 -358
  393. /sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
  394. {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/WHEEL +0 -0
  395. {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/licenses/LICENSE +0 -0
  396. {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/top_level.txt +0 -0
sglang/bench_serving.py CHANGED
@@ -35,6 +35,7 @@ import numpy as np
 import requests
 from tqdm.asyncio import tqdm
 from transformers import (
+    AutoProcessor,
     AutoTokenizer,
     PreTrainedTokenizer,
     PreTrainedTokenizerBase,
@@ -75,6 +76,7 @@ class RequestFuncInput:
     lora_name: str
     image_data: Optional[List[str]]
     extra_request_body: Dict[str, Any]
+    timestamp: Optional[float] = None


 @dataclass
@@ -104,10 +106,13 @@ def remove_suffix(text: str, suffix: str) -> str:


 def get_auth_headers() -> Dict[str, str]:
-    api_key = os.environ.get("OPENAI_API_KEY")
-    if api_key:
-        return {"Authorization": f"Bearer {api_key}"}
+    openai_api_key = os.environ.get("OPENAI_API_KEY")
+    if openai_api_key:
+        return {"Authorization": f"Bearer {openai_api_key}"}
     else:
+        api_key = os.environ.get("API_KEY")
+        if api_key:
+            return {"Authorization": f"{api_key}"}
         return {}

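Note on the auth change above: OPENAI_API_KEY still takes precedence and is sent with the standard "Bearer " prefix, while the new API_KEY fallback is sent verbatim as the Authorization header value. A minimal standalone sketch of that lookup order (illustration only, not the packaged code):

    import os
    from typing import Dict

    def resolve_auth_headers() -> Dict[str, str]:
        # Preferred: OPENAI_API_KEY, sent as a standard Bearer token.
        openai_api_key = os.environ.get("OPENAI_API_KEY")
        if openai_api_key:
            return {"Authorization": f"Bearer {openai_api_key}"}
        # Fallback: API_KEY, sent verbatim (no "Bearer " prefix).
        api_key = os.environ.get("API_KEY")
        if api_key:
            return {"Authorization": api_key}
        return {}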
@@ -204,6 +209,15 @@ async def async_request_openai_completions(
         "ignore_eos": not args.disable_ignore_eos,
         **request_func_input.extra_request_body,
     }
+
+    # hack to accommodate different LoRA conventions between SGLang and vLLM.
+    if request_func_input.lora_name:
+        payload["model"] = request_func_input.lora_name
+        payload["lora_path"] = request_func_input.lora_name
+
+    if request_func_input.image_data:
+        payload.update({"image_data": request_func_input.image_data})
+
     headers = get_auth_headers()

     output = RequestFuncOutput.init_new(request_func_input)
@@ -314,10 +328,17 @@ async def async_request_openai_chat_completions(
         "model": request_func_input.model,
         "messages": messages,
         "temperature": 0.0,
-        "max_tokens": request_func_input.output_len,
+        "max_completion_tokens": request_func_input.output_len,
         "stream": not args.disable_stream,
+        "ignore_eos": not args.disable_ignore_eos,
         **request_func_input.extra_request_body,
     }
+
+    # hack to accommodate different LoRA conventions between SGLang and vLLM.
+    if request_func_input.lora_name:
+        payload["model"] = request_func_input.lora_name
+        payload["lora_path"] = request_func_input.lora_name
+
     headers = get_auth_headers()

     output = RequestFuncOutput.init_new(request_func_input)
@@ -627,7 +648,7 @@ def get_tokenizer(
     if pretrained_model_name_or_path.endswith(
         ".json"
     ) or pretrained_model_name_or_path.endswith(".model"):
-        from sglang.srt.hf_transformers_utils import get_tokenizer
+        from sglang.srt.utils.hf_transformers_utils import get_tokenizer

         return get_tokenizer(pretrained_model_name_or_path)

@@ -640,7 +661,30 @@ def get_tokenizer(
     )


-def get_dataset(args, tokenizer):
+def get_processor(
+    pretrained_model_name_or_path: str,
+) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
+    assert (
+        pretrained_model_name_or_path is not None
+        and pretrained_model_name_or_path != ""
+    )
+    if pretrained_model_name_or_path.endswith(
+        ".json"
+    ) or pretrained_model_name_or_path.endswith(".model"):
+        from sglang.srt.hf_transformers_utils import get_processor
+
+        return get_processor(pretrained_model_name_or_path)
+
+    if pretrained_model_name_or_path is not None and not os.path.exists(
+        pretrained_model_name_or_path
+    ):
+        pretrained_model_name_or_path = get_model(pretrained_model_name_or_path)
+    return AutoProcessor.from_pretrained(
+        pretrained_model_name_or_path, trust_remote_code=True
+    )
+
+
+def get_dataset(args, tokenizer, model_id=None):
     tokenize_prompt = getattr(args, "tokenize_prompt", False)
     if args.dataset_name == "sharegpt":
         assert not tokenize_prompt
@@ -653,7 +697,7 @@ def get_dataset(args, tokenizer):
             prompt_suffix=args.prompt_suffix,
             apply_chat_template=args.apply_chat_template,
         )
-    elif args.dataset_name.startswith("random") and args.dataset_name != "random-image":
+    elif args.dataset_name.startswith("random"):
         input_requests = sample_random_requests(
             input_len=args.random_input_len,
             output_len=args.random_output_len,
@@ -664,17 +708,18 @@ def get_dataset(args, tokenizer):
             random_sample=args.dataset_name == "random",
             return_text=not tokenize_prompt,
         )
-    elif args.dataset_name == "random-image":
-        assert not tokenize_prompt, "random-image does not support --tokenize-prompt"
-        input_requests = sample_random_image_requests(
+    elif args.dataset_name == "image":
+        processor = get_processor(model_id)
+        input_requests = sample_image_requests(
             num_requests=args.num_prompts,
-            num_images=args.random_image_num_images,
+            image_count=args.image_count,
             input_len=args.random_input_len,
             output_len=args.random_output_len,
             range_ratio=args.random_range_ratio,
-            tokenizer=tokenizer,
-            apply_chat_template=args.apply_chat_template,
-            image_resolution=args.random_image_resolution,
+            processor=processor,
+            image_content=args.image_content,
+            image_format=args.image_format,
+            image_resolution=args.image_resolution,
         )
     elif args.dataset_name == "generated-shared-prefix":
         assert not tokenize_prompt
@@ -688,14 +733,31 @@ def get_dataset(args, tokenizer):
             args=args,
         )
     elif args.dataset_name == "mmmu":
-        assert not tokenize_prompt
+        processor = get_processor(model_id)
         input_requests = sample_mmmu_requests(
             num_requests=args.num_prompts,
-            tokenizer=tokenizer,
+            processor=processor,
             fixed_output_len=args.random_output_len,
-            apply_chat_template=args.apply_chat_template,
             random_sample=True,
         )
+    elif args.dataset_name == "mooncake":
+        # For mooncake, we don't generate the prompts here.
+        # We just load the raw trace data. The async generator will handle the rest.
+        if not args.dataset_path:
+            local_path = os.path.join("/tmp", args.mooncake_workload + "_trace.jsonl")
+        else:
+            local_path = args.dataset_path
+
+        if not os.path.exists(local_path):
+            download_and_cache_file(
+                MOONCAKE_DATASET_URL[args.mooncake_workload], local_path
+            )
+
+        with open(local_path, "r") as f:
+            all_requests_data = [json.loads(line) for line in f if line.strip()]
+
+        # Limit the number of requests based on --num-prompts
+        input_requests = all_requests_data[: args.num_prompts]
     else:
         raise ValueError(f"Unknown dataset: {args.dataset_name}")
     return input_requests
@@ -720,6 +782,8 @@ ASYNC_REQUEST_FUNCS = {
 class BenchmarkMetrics:
     completed: int
     total_input: int
+    total_input_text: int
+    total_input_vision: int
     total_output: int
     total_output_retokenized: int
     request_throughput: float
@@ -750,6 +814,12 @@ class BenchmarkMetrics:


 SHAREGPT_URL = "https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json"
+MOONCAKE_DATASET_URL = {
+    "mooncake": "https://raw.githubusercontent.com/kvcache-ai/Mooncake/main/FAST25-release/arxiv-trace/mooncake_trace.jsonl",
+    "conversation": "https://raw.githubusercontent.com/kvcache-ai/Mooncake/main/FAST25-release/traces/conversation_trace.jsonl",
+    "synthetic": "https://raw.githubusercontent.com/kvcache-ai/Mooncake/main/FAST25-release/traces/synthetic_trace.jsonl",
+    "toolagent": "https://raw.githubusercontent.com/kvcache-ai/Mooncake/main/FAST25-release/traces/toolagent_trace.jsonl",
+}


 def download_and_cache_file(url: str, filename: Optional[str] = None):
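The trace files listed in MOONCAKE_DATASET_URL are JSONL: each line is one request record. The replay code added later in this diff reads only three fields from each record; a hypothetical record showing just those fields (field names come from the replay code, the values are invented):

    # Hypothetical Mooncake trace record; only these fields are read here.
    record = {
        "timestamp": 171000,      # arrival time in ms; offsets drive the replay schedule
        "hash_ids": [3, 41, 57],  # ids used to synthesize a shared-prefix prompt
        "output_length": 256,     # requested decode length (defaults to 256 if absent)
    }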
@@ -807,14 +877,95 @@ class DatasetRow:
     prompt: str
     prompt_len: int
     output_len: int
+    text_prompt_len: Optional[int] = None
+    vision_prompt_len: Optional[int] = None
     image_data: Optional[List[str]] = None
+    timestamp: Optional[float] = None
+
+    def __post_init__(self):
+        if self.text_prompt_len is None:
+            self.text_prompt_len = self.prompt_len
+        if self.vision_prompt_len is None:
+            self.vision_prompt_len = 0
+
+
+async def get_mooncake_request_over_time(
+    input_requests: List[Dict],
+    tokenizer: PreTrainedTokenizerBase,
+    slowdown_factor: float,
+    num_rounds: int,
+) -> AsyncGenerator[DatasetRow, None]:
+    """
+    An async generator that yields requests based on the timestamps in the Mooncake trace file,
+    with support for multi-round sessions.
+    """
+    if not input_requests:
+        return
+
+    input_requests.sort(key=lambda r: r["timestamp"])
+
+    start_time = time.perf_counter()
+    trace_start_time_ms = input_requests[0]["timestamp"]
+
+    for record in input_requests:
+        # Calculate when this entire session should start
+        relative_arrival_time_s = (record["timestamp"] - trace_start_time_ms) / 1000.0
+        target_arrival_time_s = relative_arrival_time_s * slowdown_factor
+
+        current_elapsed_time_s = time.perf_counter() - start_time
+        sleep_duration_s = target_arrival_time_s - current_elapsed_time_s
+        if sleep_duration_s > 0:
+            await asyncio.sleep(sleep_duration_s)
+
+        # Once the session starts, generate all rounds for it as a burst
+        # This simulates a user engaging in a multi-turn conversation
+
+        # Base user query constructed from hash_ids
+        user_query_base = ""
+        hash_ids = record.get("hash_ids", [])
+        for hash_id in hash_ids:
+            user_query_base += f"{hash_id}" + " ".join(
+                ["hi"] * 128
+            )  # Shorter for multi-round
+        user_query_base += "Tell me a story based on this context."
+
+        output_len_per_round = record.get("output_length", 256)
+        chat_history = []
+
+        for i in range(num_rounds):
+            # Add user query for the current round
+            chat_history.append(
+                {"role": "user", "content": f"Round {i+1}: {user_query_base}"}
+            )
+
+            # Form the full prompt from history
+            try:
+                full_prompt_text = tokenizer.apply_chat_template(
+                    chat_history, tokenize=False, add_generation_prompt=True
+                )
+            except Exception:
+                full_prompt_text = "\n".join(
+                    [f"{msg['role']}: {msg['content']}" for msg in chat_history]
+                )
+
+            prompt_len = len(tokenizer.encode(full_prompt_text))
+
+            yield DatasetRow(
+                prompt=full_prompt_text,
+                prompt_len=prompt_len,
+                output_len=output_len_per_round,
+            )
+
+            # Add a placeholder assistant response for the next round's context
+            # We use a placeholder because we don't know the real response
+            placeholder_response = " ".join(["story"] * output_len_per_round)
+            chat_history.append({"role": "assistant", "content": placeholder_response})


 def sample_mmmu_requests(
     num_requests: int,
-    tokenizer: PreTrainedTokenizerBase,
+    processor: AutoProcessor,
     fixed_output_len: Optional[int] = None,
-    apply_chat_template: bool = True,
     random_sample: bool = True,
 ) -> List[DatasetRow]:
     """
@@ -893,46 +1044,12 @@ def sample_mmmu_requests(
             question = example.get("question")

             # Construct the prompt
-            prompt = f"Question: {question}\n\nAnswer: "
-            if apply_chat_template:
-                try:
-                    prompt = tokenizer.apply_chat_template(
-                        [
-                            {
-                                "role": "user",
-                                "content": [
-                                    {
-                                        "type": "image_url",
-                                        "image_url": {"url": image_data},
-                                    },
-                                    {"type": "text", "text": prompt},
-                                ],
-                            }
-                        ],
-                        add_generation_prompt=True,
-                        tokenize=False,
-                    )
-                except Exception as e:
-                    # Note (Xinyuan): This is a workaround for an issue where some tokenizers do not support content as a list. (e.g. InternVL)
-                    print(
-                        f"Error applying chat template: {e}, fallback to <image> tag"
-                    )
-                    prompt = f"<image>{prompt}"
-
-            # Calculate token lengths for text only (without image data)
-            prompt_token_ids = tokenizer.encode(prompt)
-            prompt_len = len(prompt_token_ids)
-
+            text_prompt = f"Question: {question}\n\nAnswer: "
             output_len = fixed_output_len if fixed_output_len is not None else 256
-
-            filtered_dataset.append(
-                DatasetRow(
-                    prompt=prompt,
-                    prompt_len=prompt_len,
-                    output_len=output_len,
-                    image_data=[image_data],
-                )
+            data_row = create_mm_data_row(
+                text_prompt, [image], [image_data], output_len, processor
             )
+            filtered_dataset.append(data_row)

         except Exception as e:
             print(f"Error processing example {i}: {e}")
@@ -1000,7 +1117,8 @@ def sample_sharegpt_requests(
                 add_generation_prompt=True,
                 tokenize=False,
             )
-            prompt = prompt.replace(tokenizer.bos_token, "")
+            if tokenizer.bos_token:
+                prompt = prompt.replace(tokenizer.bos_token, "")

         prompt_token_ids = tokenizer.encode(prompt)
         completion = dataset[i][1]
@@ -1019,7 +1137,11 @@ def sample_sharegpt_requests(
             continue

         filtered_dataset.append(
-            DatasetRow(prompt=prompt, prompt_len=prompt_len, output_len=output_len)
+            DatasetRow(
+                prompt=prompt,
+                prompt_len=prompt_len,
+                output_len=output_len,
+            )
         )

     print(f"#Input tokens: {np.sum([x.prompt_len for x in filtered_dataset])}")
@@ -1130,7 +1252,7 @@ def sample_random_requests(
     return input_requests


-def parse_random_image_resolution(image_resolution: str) -> Tuple[int, int]:
+def parse_image_resolution(image_resolution: str) -> Tuple[int, int]:
     """Parse image resolution into (width, height).

     Supports presets '1080p', '720p', '360p' and custom 'heightxwidth' format
@@ -1155,24 +1277,79 @@ def parse_random_image_resolution(image_resolution: str) -> Tuple[int, int]:
         return (width, height)

     raise ValueError(
-        f"Unsupported random-image resolution: {image_resolution}. "
+        f"Unsupported image resolution: {image_resolution}. "
         "Choose from 4k, 1080p, 720p, 360p, or provide custom 'heightxwidth' (e.g., 1080x1920)."
     )


-def sample_random_image_requests(
+def create_mm_data_row(text_prompt, images, images_base64, output_len, processor):
+    try:
+        content_items = [
+            {"type": "image_url", "image_url": {"url": img_url}}
+            for img_url in images_base64
+        ]
+        content_items.append({"type": "text", "text": text_prompt})
+        prompt_str = processor.apply_chat_template(
+            [{"role": "user", "content": content_items}],
+            add_generation_prompt=True,
+            tokenize=False,
+        )
+    except Exception:
+        # Some tokenizers do not support list content; fall back to a placeholder in the text
+        prompt_str = f"<image>{text_prompt}"
+
+    # Calculate total tokens (text + vision)
+    prompt_len = processor(
+        text=[prompt_str],
+        images=images,
+        padding=False,
+        return_tensors="pt",
+    )["input_ids"].numel()
+
+    # Calculate text-only tokens
+    try:
+        # Create text-only version of the prompt
+        text_only_prompt = processor.apply_chat_template(
+            [{"role": "user", "content": text_prompt}],
+            add_generation_prompt=True,
+            tokenize=False,
+        )
+        text_prompt_len = processor(
+            text=[text_only_prompt],
+            padding=False,
+            return_tensors="pt",
+        )["input_ids"].numel()
+    except Exception:
+        # Fallback: just tokenize the text prompt directly
+        text_prompt_len = len(processor.tokenizer.encode(text_prompt))
+
+    # Vision tokens = total tokens - text tokens
+    vision_prompt_len = prompt_len - text_prompt_len
+
+    return DatasetRow(
+        prompt=text_prompt,
+        prompt_len=prompt_len,
+        output_len=output_len,
+        text_prompt_len=text_prompt_len,
+        vision_prompt_len=vision_prompt_len,
+        image_data=images_base64,
+    )
+
+
+def sample_image_requests(
     num_requests: int,
-    num_images: int,
+    image_count: int,
     input_len: int,
     output_len: int,
     range_ratio: float,
-    tokenizer: PreTrainedTokenizerBase,
-    apply_chat_template: bool = True,
-    image_resolution: str = "1080p",
+    processor: AutoProcessor,
+    image_content: str,
+    image_format: str,
+    image_resolution: str,
 ) -> List[DatasetRow]:
-    """Generate requests with random images.
+    """Generate requests with images.

-    - Each request includes ``num_images`` random images.
+    - Each request includes ``image_count`` images.
     - Supported resolutions: 4k (3840x2160), 1080p (1920x1080), 720p (1280x720), 360p (640x360),
       or custom 'heightxwidth' (e.g., 1080x1920).
     - Text lengths follow the 'random' dataset sampling rule. ``prompt_len``
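create_mm_data_row, added above, attributes tokens to vision by subtraction: the prompt is run through the processor once with images and once as text only, and the difference is reported as vision_prompt_len. Worked arithmetic with made-up numbers:

    # Illustrative numbers only; actual counts depend on the model's processor.
    prompt_len = 2060        # input_ids for chat-formatted text plus one 1080p image
    text_prompt_len = 60     # input_ids for the text-only chat prompt
    vision_prompt_len = prompt_len - text_prompt_len  # 2000 tokens charged to vision

These per-row numbers are what feed the new total_input_text / total_input_vision metrics below.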
@@ -1187,12 +1364,12 @@ def sample_random_image_requests(
         ) from e

     # Parse resolution (supports presets and 'heightxwidth')
-    width, height = parse_random_image_resolution(image_resolution)
+    width, height = parse_image_resolution(image_resolution)

     # Check for potentially problematic combinations and warn user
-    if width * height >= 1920 * 1080 and num_images * num_requests >= 100:
+    if width * height >= 1920 * 1080 and image_count * num_requests >= 100:
         warnings.warn(
-            f"High resolution ({width}x{height}) with {num_images * num_requests} total images "
+            f"High resolution ({width}x{height}) with {image_count * num_requests} total images "
             f"may take a long time. Consider reducing resolution or image count.",
             UserWarning,
             stacklevel=2,
@@ -1206,53 +1383,50 @@ def sample_random_image_requests(
         int(output_len * range_ratio), output_len + 1, size=num_requests
     )

-    def _gen_random_image_data_uri(width: int = width, height: int = height) -> str:
-        arr = (np.random.rand(height, width, 3) * 255).astype(np.uint8)
-        img = Image.fromarray(arr, mode="RGB")
+    def _gen_random_image_data_uri(
+        width: int = width, height: int = height
+    ) -> (Image, str, int):
+        if image_content == "blank":
+            # Generate blank white image
+            arr = np.full((height, width, 3), 255, dtype=np.uint8)
+        else:
+            # Generate random colored image
+            arr = (np.random.rand(height, width, 3) * 255).astype(np.uint8)
+        img = Image.fromarray(arr)
         buf = io.BytesIO()
-        img.save(buf, format="JPEG", quality=85)
+        img.save(buf, format=image_format, quality=85)
         encoded = pybase64.b64encode(buf.getvalue()).decode("utf-8")
-        return f"data:image/jpeg;base64,{encoded}"
+        image_data = f"data:image/{image_format};base64,{encoded}"
+        image_bytes = len(image_data.encode("utf-8"))
+        return img, image_data, image_bytes

     dataset: List[DatasetRow] = []
+    total_image_bytes = 0
     for i in range(num_requests):
         # Generate text prompt
-        text_prompt = gen_prompt(tokenizer, int(input_lens[i]))
+        text_prompt = gen_prompt(processor.tokenizer, int(input_lens[i]))

         # Generate image list
-        images = [_gen_random_image_data_uri() for _ in range(num_images)]
-
-        prompt_str = text_prompt
-        if apply_chat_template:
-            try:
-                content_items = [
-                    {"type": "image_url", "image_url": {"url": img_url}}
-                    for img_url in images
-                ]
-                content_items.append({"type": "text", "text": text_prompt})
-                prompt_str = tokenizer.apply_chat_template(
-                    [{"role": "user", "content": content_items}],
-                    add_generation_prompt=True,
-                    tokenize=False,
-                )
-            except Exception:
-                # Some tokenizers do not support list content; fall back to a placeholder in the text
-                prompt_str = f"<image>{text_prompt}"
-
-        prompt_token_ids = tokenizer.encode(prompt_str)
-        prompt_token_len = len(prompt_token_ids)
-
-        dataset.append(
-            DatasetRow(
-                prompt=prompt_str,
-                prompt_len=prompt_token_len,
-                output_len=int(output_lens[i]),
-                image_data=images,
-            )
+        images, images_base64, images_bytes = zip(
+            *[_gen_random_image_data_uri() for _ in range(image_count)]
+        )
+        total_image_bytes += sum(list(images_bytes))
+
+        data_row = create_mm_data_row(
+            text_prompt,
+            list(images),
+            list(images_base64),
+            int(output_lens[i]),
+            processor,
         )

+        dataset.append(data_row)
+
     print(f"#Input tokens: {np.sum([x.prompt_len for x in dataset])}")
     print(f"#Output tokens: {np.sum([x.output_len for x in dataset])}")
+    print(
+        f"\nCreated {len(dataset)} {image_content} {image_format} images with average {total_image_bytes//num_requests} bytes per request"
+    )
     return dataset


@@ -1324,7 +1498,9 @@ def sample_generated_shared_prefix_requests(

         input_requests.append(
             DatasetRow(
-                prompt=full_prompt, prompt_len=prompt_len, output_len=output_len
+                prompt=full_prompt,
+                prompt_len=prompt_len,
+                output_len=output_len,
             )
         )
         total_input_tokens += prompt_len
@@ -1359,19 +1535,41 @@ def sample_generated_shared_prefix_requests(
 async def get_request(
     input_requests: List[DatasetRow],
     request_rate: float,
+    use_trace_timestamps: bool = False,
+    slowdown_factor: float = 1.0,
 ) -> AsyncGenerator[DatasetRow, None]:
-    input_requests = iter(input_requests)
-    for request in input_requests:
-        yield request
+    if use_trace_timestamps:
+        print(
+            f"Using trace timestamps for request generation with slowdown factor {slowdown_factor}."
+        )
+        # Sort requests by timestamp for correct replay
+        input_requests.sort(key=lambda r: r.timestamp)

-        if request_rate == float("inf"):
-            # If the request rate is infinity, then we don't need to wait.
-            continue
+        start_time = time.perf_counter()
+        trace_start_time_ms = input_requests[0].timestamp if input_requests else 0

-        # Sample the request interval from the exponential distribution.
-        interval = np.random.exponential(1.0 / request_rate)
-        # The next request will be sent after the interval.
-        await asyncio.sleep(interval)
+        for request in input_requests:
+            trace_time_s = (request.timestamp - trace_start_time_ms) / 1000.0
+            target_arrival_time = start_time + (trace_time_s * slowdown_factor)
+
+            sleep_duration = target_arrival_time - time.perf_counter()
+            if sleep_duration > 0:
+                await asyncio.sleep(sleep_duration)
+
+            yield request
+    else:
+        input_requests_iter = iter(input_requests)
+        for request in input_requests_iter:
+            yield request
+
+            if request_rate == float("inf"):
+                # If the request rate is infinity, then we don't need to wait.
+                continue
+
+            # Sample the request interval from the exponential distribution.
+            interval = np.random.exponential(1.0 / request_rate)
+            # The next request will be sent after the interval.
+            await asyncio.sleep(interval)


 def calculate_metrics(
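The trace-replay branch of get_request converts each row's millisecond timestamp into a wall-clock deadline and sleeps off the shortfall, so a slowdown factor of 2.0 stretches the schedule to twice real time. A small sketch of the arithmetic (values are illustrative):

    import time

    start_time = time.perf_counter()            # replay start
    slowdown_factor = 2.0                       # illustrative
    trace_time_s = (176000 - 171000) / 1000.0   # row logged 5.0 s into the trace
    target_arrival_time = start_time + trace_time_s * slowdown_factor  # fire at +10.0 s
    sleep_s = max(0.0, target_arrival_time - time.perf_counter())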
@@ -1384,6 +1582,8 @@ def calculate_metrics(
     output_lens: List[int] = []
     retokenized_output_lens: List[int] = []
     total_input = 0
+    total_input_text = 0
+    total_input_vision = 0
     completed = 0
     itls: List[float] = []
     tpots: List[float] = []
@@ -1398,6 +1598,8 @@ def calculate_metrics(
             )
             retokenized_output_lens.append(retokenized_output_len)
             total_input += input_requests[i].prompt_len
+            total_input_text += input_requests[i].text_prompt_len
+            total_input_vision += input_requests[i].vision_prompt_len
             if output_len > 1:
                 tpots.append((outputs[i].latency - outputs[i].ttft) / (output_len - 1))
             itls += outputs[i].itl
@@ -1419,6 +1621,8 @@ def calculate_metrics(
     metrics = BenchmarkMetrics(
         completed=completed,
         total_input=total_input,
+        total_input_text=total_input_text,
+        total_input_vision=total_input_vision,
         total_output=sum(output_lens),
         total_output_retokenized=sum(retokenized_output_lens),
         request_throughput=completed / dur_s,
@@ -1469,6 +1673,9 @@ async def benchmark(
     pd_separated: bool = False,
     flush_cache: bool = False,
     warmup_requests: int = 1,
+    use_trace_timestamps: bool = False,
+    mooncake_slowdown_factor=1.0,
+    mooncake_num_rounds=1,
 ):
     if backend in ASYNC_REQUEST_FUNCS:
         request_func = ASYNC_REQUEST_FUNCS[backend]
@@ -1488,8 +1695,32 @@ async def benchmark(
     # Warmup
     print(f"Starting warmup with {warmup_requests} sequences...")

-    # Use the first request for all warmup iterations
-    test_request = input_requests[0]
+    # Handle the data structure difference for the warmup request
+    if args.dataset_name == "mooncake":
+        # For mooncake, input_requests is a list of dicts.
+        # We need to build a temporary DatasetRow for the warmup phase.
+        warmup_record = input_requests[0]
+
+        # Build prompt from hash_ids, just like in the async generator
+        hash_ids = warmup_record.get("hash_ids", [])
+        prompt_text = ""
+        for hash_id in hash_ids:
+            prompt_text += f"{hash_id}" + " ".join(["hi"] * 512)
+        prompt_text += "Can you tell me a detailed story in 1000 words?"
+
+        output_len = warmup_record.get("output_length", 32)
+        prompt_len = len(tokenizer.encode(prompt_text))
+
+        # Create a temporary DatasetRow object for warmup
+        test_request = DatasetRow(
+            prompt=prompt_text,
+            prompt_len=prompt_len,
+            output_len=output_len,
+            image_data=None,  # Mooncake doesn't have image data
+        )
+    else:
+        # For all other datasets, input_requests is a list of DatasetRow objects
+        test_request = input_requests[0]

     if lora_names is not None and len(lora_names) != 0:
         lora_name = lora_names[0]
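The synthetic warmup prompt above grows linearly with the record's hash_ids: each id contributes its decimal string followed by 512 repetitions of "hi". A quick size check, as a standalone sketch of the same construction:

    # Rebuild the warmup prompt for a hypothetical record with three hash_ids.
    hash_ids = [3, 41, 57]
    prompt_text = ""
    for hash_id in hash_ids:
        prompt_text += f"{hash_id}" + " ".join(["hi"] * 512)
    prompt_text += "Can you tell me a detailed story in 1000 words?"
    # Roughly 3 * 512 whitespace-separated words; the encoded token
    # count depends on the tokenizer in use.
    print(len(prompt_text.split()))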
@@ -1543,12 +1774,26 @@ async def benchmark(
     if profile_output.success:
         print("Profiler started")

-    pbar = None if disable_tqdm else tqdm(total=len(input_requests))
-
     # Run all requests
     benchmark_start_time = time.perf_counter()
     tasks: List[asyncio.Task] = []
-    async for request in get_request(input_requests, request_rate):
+    pbar_total = len(input_requests)
+    if (
+        backend == "sglang" and args.dataset_name == "mooncake"
+    ):  # Assuming mooncake is mainly for sglang or similar backends
+        print("Using time-based Mooncake request scheduler, ignoring --request-rate.")
+        request_generator = get_mooncake_request_over_time(
+            input_requests, tokenizer, mooncake_slowdown_factor, mooncake_num_rounds
+        )
+        print(
+            f"Starting Mooncake trace replay. Sessions: {len(input_requests)}, Rounds per session: {mooncake_num_rounds}. Slowdown factor: {mooncake_slowdown_factor}"
+        )
+        pbar_total *= args.mooncake_num_rounds
+    else:
+        request_generator = get_request(input_requests, request_rate)
+
+    pbar = None if disable_tqdm else tqdm(total=pbar_total)
+    async for request in request_generator:
         if lora_names is not None and len(lora_names) != 0:
             idx = random.randint(0, len(lora_names) - 1)
             lora_name = lora_names[idx]
@@ -1564,6 +1809,7 @@ async def benchmark(
             lora_name=lora_name,
             image_data=request.image_data,
             extra_request_body=extra_request_body,
+            timestamp=request.timestamp,
         )

         tasks.append(
@@ -1584,14 +1830,22 @@ async def benchmark(
         pbar.close()

     if "sglang" in backend:
-        server_info = requests.get(base_url + "/get_server_info")
+        server_info = requests.get(
+            base_url + "/get_server_info", headers=get_auth_headers()
+        )
         if server_info.status_code == 200:
             server_info_json = server_info.json()
             if "decode" in server_info_json:
                 server_info_json = server_info_json["decode"][0]
-            accept_length = server_info_json["internal_states"][0].get(
-                "avg_spec_accept_length", None
-            )
+            if (
+                "internal_states" in server_info_json
+                and server_info_json["internal_states"]
+            ):
+                accept_length = server_info_json["internal_states"][0].get(
+                    "avg_spec_accept_length", None
+                )
+            else:
+                accept_length = None
         else:
             accept_length = None
     else:
@@ -1609,7 +1863,11 @@ async def benchmark(

     print("\n{s:{c}^{n}}".format(s=" Serving Benchmark Result ", n=50, c="="))
     print("{:<40} {:<10}".format("Backend:", backend))
-    print("{:<40} {:<10}".format("Traffic request rate:", request_rate))
+    print(
+        "{:<40} {:<10}".format(
+            "Traffic request rate:", "trace" if use_trace_timestamps else request_rate
+        )
+    )
     print(
         "{:<40} {:<10}".format(
             "Max request concurrency:",
@@ -1619,6 +1877,10 @@ async def benchmark(
     print("{:<40} {:<10}".format("Successful requests:", metrics.completed))
     print("{:<40} {:<10.2f}".format("Benchmark duration (s):", benchmark_duration))
     print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input))
+    print("{:<40} {:<10}".format("Total input text tokens:", metrics.total_input_text))
+    print(
+        "{:<40} {:<10}".format("Total input vision tokens:", metrics.total_input_vision)
+    )
     print("{:<40} {:<10}".format("Total generated tokens:", metrics.total_output))
     print(
         "{:<40} {:<10}".format(
@@ -1678,7 +1940,7 @@ async def benchmark(
         # Arguments
         "backend": args.backend,
         "dataset_name": args.dataset_name,
-        "request_rate": request_rate,
+        "request_rate": "trace" if use_trace_timestamps else request_rate,
         "max_concurrency": max_concurrency,
         "sharegpt_output_len": args.sharegpt_output_len,
         "random_input_len": args.random_input_len,
@@ -1688,6 +1950,8 @@ async def benchmark(
         "duration": benchmark_duration,
         "completed": metrics.completed,
         "total_input_tokens": metrics.total_input,
+        "total_input_text_tokens": metrics.total_input_text,
+        "total_input_vision_tokens": metrics.total_input_vision,
         "total_output_tokens": metrics.total_output,
         "total_output_tokens_retokenized": metrics.total_output_retokenized,
         "request_throughput": metrics.request_throughput,
@@ -1722,16 +1986,18 @@ async def benchmark(
         output_file_name = args.output_file
     else:
         now = datetime.now().strftime("%m%d")
-        if args.dataset_name == "random-image":
+        if args.dataset_name == "image":
             output_file_name = (
                 f"{args.backend}_{now}_{args.num_prompts}_{args.random_input_len}_"
-                f"{args.random_output_len}_{args.random_image_num_images}imgs_"
-                f"{args.random_image_resolution}.jsonl"
+                f"{args.random_output_len}_{args.image_count}imgs_"
+                f"{args.image_resolution}.jsonl"
            )
         elif args.dataset_name.startswith("random"):
             output_file_name = f"{args.backend}_{now}_{args.num_prompts}_{args.random_input_len}_{args.random_output_len}.jsonl"
         else:
-            output_file_name = f"{args.backend}_{now}_{args.num_prompts}_sharegpt.jsonl"
+            output_file_name = (
+                f"{args.backend}_{now}_{args.num_prompts}_{args.dataset_name}.jsonl"
+            )

     result_details = {
         "input_lens": [output.prompt_len for output in outputs],
@@ -1786,6 +2052,17 @@ def run_benchmark(args_: argparse.Namespace):
     if not hasattr(args, "tokenize_prompt"):
         args.tokenize_prompt = False

+    if not hasattr(args, "use_trace_timestamps"):
+        args.use_trace_timestamps = False
+    if not hasattr(args, "mooncake_slowdown_factor"):
+        args.mooncake_slowdown_factor = 1.0
+
+    if not hasattr(args, "mooncake_slowdown_factor"):
+        args.mooncake_slowdown_factor = 1.0
+
+    if not hasattr(args, "mooncake_num_rounds"):
+        args.mooncake_num_rounds = 1
+
     print(f"benchmark_args={args}")

     # Set global environments
@@ -1889,6 +2166,12 @@ def run_benchmark(args_: argparse.Namespace):
             "Because when the tokenizer counts the output tokens, if there is gibberish, it might count incorrectly.\n"
         )

+    if args.dataset_name in ["image", "mmmu"]:
+        args.apply_chat_template = True
+        assert (
+            not args.tokenize_prompt
+        ), "`--tokenize-prompt` not compatible with image dataset"
+
     print(f"{args}\n")

     # Read dataset
@@ -1896,7 +2179,7 @@ def run_benchmark(args_: argparse.Namespace):
     model_id = args.model
     tokenizer_id = args.tokenizer if args.tokenizer is not None else args.model
     tokenizer = get_tokenizer(tokenizer_id)
-    input_requests = get_dataset(args, tokenizer)
+    input_requests = get_dataset(args, tokenizer, model_id)

     # compatible with SimpleNamespace
     if not hasattr(args, "flush_cache"):
@@ -1919,6 +2202,9 @@ def run_benchmark(args_: argparse.Namespace):
             pd_separated=args.pd_separated,
             flush_cache=args.flush_cache,
             warmup_requests=args.warmup_requests,
+            use_trace_timestamps=args.use_trace_timestamps,
+            mooncake_slowdown_factor=args.mooncake_slowdown_factor,
+            mooncake_num_rounds=args.mooncake_num_rounds,
         )
     )

@@ -1974,7 +2260,8 @@ if __name__ == "__main__":
             "random-ids",
             "generated-shared-prefix",
             "mmmu",
-            "random-image",
+            "image",
+            "mooncake",
         ],
         help="Name of the dataset to benchmark on.",
     )
@@ -2013,37 +2300,49 @@ if __name__ == "__main__":
         "--random-input-len",
         type=int,
         default=1024,
-        help="Number of input tokens per request, used only for random dataset.",
+        help="Number of input tokens per request, used only for random and image dataset.",
     )
     parser.add_argument(
         "--random-output-len",
         default=1024,
         type=int,
-        help="Number of output tokens per request, used only for random dataset.",
+        help="Number of output tokens per request, used only for random and image dataset.",
    )
     parser.add_argument(
         "--random-range-ratio",
         type=float,
         default=0.0,
         help="Range of sampled ratio of input/output length, "
-        "used only for random dataset.",
+        "used only for random and image dataset.",
     )
-    # random-image dataset args
+    # image dataset args
     parser.add_argument(
-        "--random-image-num-images",
+        "--image-count",
         type=int,
         default=1,
-        help="Number of images per request (only available with the random-image dataset)",
+        help="Number of images per request (only available with the image dataset)",
     )
     parser.add_argument(
-        "--random-image-resolution",
+        "--image-resolution",
         type=str,
         default="1080p",
         help=(
-            "Resolution of random images for random-image dataset. "
+            "Resolution of images for image dataset. "
             "Supports presets 4k/1080p/720p/360p or custom 'heightxwidth' (e.g., 1080x1920)."
         ),
     )
+    parser.add_argument(
+        "--image-format",
+        type=str,
+        default="jpeg",
+        help=("Format of images for image dataset. " "Supports jpeg and png."),
+    )
+    parser.add_argument(
+        "--image-content",
+        type=str,
+        default="random",
+        help=("Content for images for image dataset. " "Supports random and blank."),
+    )
     parser.add_argument(
         "--request-rate",
         type=float,
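Taken together with the dataset renames above, an image-dataset run would look something like this (hypothetical invocation; every flag shown is defined in this file, and the processor used for token accounting is resolved from --model via get_processor): python3 -m sglang.bench_serving --backend sglang --model <vlm-checkpoint> --dataset-name image --num-prompts 100 --image-count 2 --image-resolution 720p --image-format png --image-content blank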
@@ -2051,6 +2350,11 @@ if __name__ == "__main__":
         help="Number of requests per second. If this is inf, then all the requests are sent at time 0. "
         "Otherwise, we use Poisson process to synthesize the request arrival times. Default is inf.",
     )
+    parser.add_argument(
+        "--use-trace-timestamps",
+        action="store_true",
+        help="Use timestamps from the trace file for request scheduling. Only valid for 'mooncake' dataset.",
+    )
     parser.add_argument(
         "--max-concurrency",
         type=int,
@@ -2174,5 +2478,33 @@ if __name__ == "__main__":
         default=256,
         help="Target length in tokens for outputs in generated-shared-prefix dataset",
     )
+    mooncake_group = parser.add_argument_group("mooncake dataset arguments")
+    mooncake_group.add_argument(
+        "--mooncake-slowdown-factor",
+        type=float,
+        default=1.0,
+        help="Slowdown factor for replaying the mooncake trace. "
+        "A value of 2.0 means the replay is twice as slow. "
+        "NOTE: --request-rate is IGNORED in mooncake mode.",
+    )
+    mooncake_group.add_argument(
+        "--mooncake-num-rounds",
+        type=int,
+        default=1,
+        help="Number of conversation rounds for each session in the mooncake dataset. "
+        "A value > 1 will enable true multi-turn session benchmarking.",
+    )
+    mooncake_group.add_argument(
+        "--mooncake-workload",
+        type=str,
+        default="conversation",
+        choices=[
+            "mooncake",
+            "conversation",
+            "synthetic",
+            "toolagent",
+        ],
+        help="Underlying workload for the mooncake dataset.",
+    )
     args = parser.parse_args()
     run_benchmark(args)
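For reference, a hypothetical mooncake replay combining the new options (per the help text above, --request-rate is ignored in this mode and the trace's own timestamps drive scheduling): python3 -m sglang.bench_serving --backend sglang --model <checkpoint> --dataset-name mooncake --mooncake-workload conversation --mooncake-slowdown-factor 2.0 --mooncake-num-rounds 3 --num-prompts 100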