sglang 0.5.3rc2__py3-none-any.whl → 0.5.4.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (419)
  1. sglang/bench_one_batch.py +47 -28
  2. sglang/bench_one_batch_server.py +41 -25
  3. sglang/bench_serving.py +378 -160
  4. sglang/check_env.py +1 -1
  5. sglang/compile_deep_gemm.py +6 -2
  6. sglang/global_config.py +1 -25
  7. sglang/lang/api.py +6 -0
  8. sglang/lang/interpreter.py +1 -0
  9. sglang/lang/ir.py +13 -0
  10. sglang/launch_server.py +10 -15
  11. sglang/profiler.py +18 -1
  12. sglang/srt/_custom_ops.py +1 -1
  13. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +105 -10
  14. sglang/srt/checkpoint_engine/checkpoint_engine_worker.py +142 -0
  15. sglang/srt/compilation/backend.py +437 -0
  16. sglang/srt/compilation/compilation_config.py +20 -0
  17. sglang/srt/compilation/compilation_counter.py +47 -0
  18. sglang/srt/compilation/compile.py +210 -0
  19. sglang/srt/compilation/compiler_interface.py +503 -0
  20. sglang/srt/compilation/cuda_piecewise_backend.py +228 -0
  21. sglang/srt/compilation/fix_functionalization.py +134 -0
  22. sglang/srt/compilation/fx_utils.py +83 -0
  23. sglang/srt/compilation/inductor_pass.py +140 -0
  24. sglang/srt/compilation/pass_manager.py +66 -0
  25. sglang/srt/compilation/piecewise_context_manager.py +40 -0
  26. sglang/srt/compilation/weak_ref_tensor_jit.py +16 -0
  27. sglang/srt/configs/__init__.py +4 -0
  28. sglang/srt/configs/deepseek_ocr.py +262 -0
  29. sglang/srt/configs/deepseekvl2.py +194 -96
  30. sglang/srt/configs/dots_vlm.py +2 -7
  31. sglang/srt/configs/falcon_h1.py +13 -64
  32. sglang/srt/configs/load_config.py +25 -2
  33. sglang/srt/configs/mamba_utils.py +117 -0
  34. sglang/srt/configs/model_config.py +136 -25
  35. sglang/srt/configs/modelopt_config.py +30 -0
  36. sglang/srt/configs/nemotron_h.py +286 -0
  37. sglang/srt/configs/olmo3.py +105 -0
  38. sglang/srt/configs/points_v15_chat.py +29 -0
  39. sglang/srt/configs/qwen3_next.py +11 -47
  40. sglang/srt/configs/qwen3_omni.py +613 -0
  41. sglang/srt/configs/qwen3_vl.py +0 -10
  42. sglang/srt/connector/remote_instance.py +1 -1
  43. sglang/srt/constrained/base_grammar_backend.py +5 -1
  44. sglang/srt/constrained/llguidance_backend.py +5 -0
  45. sglang/srt/constrained/outlines_backend.py +1 -1
  46. sglang/srt/constrained/reasoner_grammar_backend.py +9 -6
  47. sglang/srt/constrained/utils.py +12 -0
  48. sglang/srt/constrained/xgrammar_backend.py +20 -11
  49. sglang/srt/disaggregation/ascend/transfer_engine.py +1 -1
  50. sglang/srt/disaggregation/base/conn.py +17 -4
  51. sglang/srt/disaggregation/common/conn.py +4 -2
  52. sglang/srt/disaggregation/decode.py +123 -31
  53. sglang/srt/disaggregation/decode_kvcache_offload_manager.py +1 -1
  54. sglang/srt/disaggregation/fake/conn.py +11 -3
  55. sglang/srt/disaggregation/mooncake/conn.py +157 -19
  56. sglang/srt/disaggregation/nixl/conn.py +69 -24
  57. sglang/srt/disaggregation/prefill.py +96 -270
  58. sglang/srt/distributed/device_communicators/all_reduce_utils.py +4 -4
  59. sglang/srt/distributed/device_communicators/custom_all_reduce.py +6 -6
  60. sglang/srt/distributed/device_communicators/pymscclpp.py +2 -2
  61. sglang/srt/distributed/device_communicators/pynccl.py +24 -12
  62. sglang/srt/distributed/device_communicators/pynccl_allocator.py +2 -2
  63. sglang/srt/distributed/device_communicators/symm_mem.py +1 -1
  64. sglang/srt/distributed/naive_distributed.py +5 -4
  65. sglang/srt/distributed/parallel_state.py +63 -19
  66. sglang/srt/elastic_ep/elastic_ep.py +74 -0
  67. sglang/srt/entrypoints/context.py +3 -2
  68. sglang/srt/entrypoints/engine.py +83 -80
  69. sglang/srt/entrypoints/grpc_server.py +430 -234
  70. sglang/srt/entrypoints/harmony_utils.py +2 -2
  71. sglang/srt/entrypoints/http_server.py +195 -102
  72. sglang/srt/entrypoints/http_server_engine.py +1 -7
  73. sglang/srt/entrypoints/openai/protocol.py +225 -37
  74. sglang/srt/entrypoints/openai/serving_base.py +49 -2
  75. sglang/srt/entrypoints/openai/serving_chat.py +29 -74
  76. sglang/srt/entrypoints/openai/serving_classify.py +204 -0
  77. sglang/srt/entrypoints/openai/serving_completions.py +15 -1
  78. sglang/srt/entrypoints/openai/serving_responses.py +5 -2
  79. sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
  80. sglang/srt/environ.py +58 -6
  81. sglang/srt/eplb/eplb_algorithms/__init__.py +18 -1
  82. sglang/srt/eplb/eplb_algorithms/deepseek.py +0 -2
  83. sglang/srt/eplb/eplb_algorithms/elasticity_aware.py +87 -0
  84. sglang/srt/eplb/expert_distribution.py +33 -4
  85. sglang/srt/eplb/expert_location_dispatch.py +2 -2
  86. sglang/srt/eplb/expert_location_updater.py +2 -2
  87. sglang/srt/function_call/base_format_detector.py +17 -18
  88. sglang/srt/function_call/function_call_parser.py +20 -14
  89. sglang/srt/function_call/glm4_moe_detector.py +1 -5
  90. sglang/srt/function_call/gpt_oss_detector.py +1 -1
  91. sglang/srt/function_call/json_array_parser.py +0 -2
  92. sglang/srt/function_call/minimax_m2.py +367 -0
  93. sglang/srt/function_call/utils.py +2 -2
  94. sglang/srt/grpc/compile_proto.py +3 -3
  95. sglang/srt/{entrypoints → grpc}/grpc_request_manager.py +112 -52
  96. sglang/srt/grpc/health_servicer.py +189 -0
  97. sglang/srt/grpc/scheduler_launcher.py +181 -0
  98. sglang/srt/grpc/sglang_scheduler_pb2.py +78 -70
  99. sglang/srt/grpc/sglang_scheduler_pb2.pyi +66 -10
  100. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +89 -1
  101. sglang/srt/layers/activation.py +10 -1
  102. sglang/srt/layers/attention/aiter_backend.py +3 -3
  103. sglang/srt/layers/attention/ascend_backend.py +17 -1
  104. sglang/srt/layers/attention/attention_registry.py +43 -23
  105. sglang/srt/layers/attention/base_attn_backend.py +20 -1
  106. sglang/srt/layers/attention/double_sparsity_backend.py +2 -2
  107. sglang/srt/layers/attention/fla/chunk.py +0 -1
  108. sglang/srt/layers/attention/fla/chunk_o.py +1 -1
  109. sglang/srt/layers/attention/fla/index.py +0 -2
  110. sglang/srt/layers/attention/fla/layernorm_gated.py +50 -32
  111. sglang/srt/layers/attention/fla/utils.py +0 -3
  112. sglang/srt/layers/attention/fla/wy_fast.py +0 -2
  113. sglang/srt/layers/attention/flashattention_backend.py +24 -10
  114. sglang/srt/layers/attention/flashinfer_backend.py +258 -22
  115. sglang/srt/layers/attention/flashinfer_mla_backend.py +38 -28
  116. sglang/srt/layers/attention/flashmla_backend.py +2 -2
  117. sglang/srt/layers/attention/hybrid_attn_backend.py +1 -1
  118. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +165 -62
  119. sglang/srt/layers/attention/intel_amx_backend.py +1 -1
  120. sglang/srt/layers/attention/mamba/causal_conv1d.py +1 -1
  121. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +9 -5
  122. sglang/srt/layers/attention/mamba/mamba.py +189 -241
  123. sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
  124. sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
  125. sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +0 -50
  126. sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +0 -60
  127. sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +0 -111
  128. sglang/srt/layers/attention/mamba/ops/ssd_combined.py +0 -1
  129. sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +0 -11
  130. sglang/srt/layers/attention/npu_ops/mla_preprocess.py +1 -1
  131. sglang/srt/layers/attention/nsa/nsa_indexer.py +40 -83
  132. sglang/srt/layers/attention/nsa/triton_kernel.py +136 -0
  133. sglang/srt/layers/attention/nsa/utils.py +0 -1
  134. sglang/srt/layers/attention/nsa_backend.py +404 -90
  135. sglang/srt/layers/attention/triton_backend.py +208 -34
  136. sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +2 -2
  137. sglang/srt/layers/attention/triton_ops/extend_attention.py +539 -44
  138. sglang/srt/layers/attention/trtllm_mha_backend.py +2 -2
  139. sglang/srt/layers/attention/trtllm_mla_backend.py +362 -43
  140. sglang/srt/layers/attention/utils.py +89 -7
  141. sglang/srt/layers/attention/vision.py +3 -3
  142. sglang/srt/layers/attention/xpu_backend.py +1028 -0
  143. sglang/srt/layers/communicator.py +12 -7
  144. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/compile_utils.py +5 -9
  145. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/configurer.py +4 -3
  146. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/entrypoint.py +3 -3
  147. sglang/srt/layers/dp_attention.py +17 -0
  148. sglang/srt/layers/layernorm.py +64 -19
  149. sglang/srt/layers/linear.py +9 -1
  150. sglang/srt/layers/logits_processor.py +152 -17
  151. sglang/srt/layers/modelopt_utils.py +11 -0
  152. sglang/srt/layers/moe/cutlass_moe.py +0 -2
  153. sglang/srt/layers/moe/cutlass_w4a8_moe.py +351 -21
  154. sglang/srt/layers/moe/ep_moe/kernels.py +229 -457
  155. sglang/srt/layers/moe/ep_moe/layer.py +154 -625
  156. sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +1 -1
  157. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  158. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_B200.json +146 -0
  159. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +11 -3
  160. sglang/srt/layers/moe/fused_moe_triton/layer.py +79 -73
  161. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +25 -46
  162. sglang/srt/layers/moe/moe_runner/deep_gemm.py +569 -0
  163. sglang/srt/layers/moe/moe_runner/runner.py +6 -0
  164. sglang/srt/layers/moe/moe_runner/triton.py +3 -1
  165. sglang/srt/layers/moe/moe_runner/triton_kernels.py +194 -0
  166. sglang/srt/layers/moe/rocm_moe_utils.py +0 -1
  167. sglang/srt/layers/moe/router.py +51 -15
  168. sglang/srt/layers/moe/token_dispatcher/__init__.py +14 -4
  169. sglang/srt/layers/moe/token_dispatcher/base.py +12 -6
  170. sglang/srt/layers/moe/token_dispatcher/deepep.py +127 -110
  171. sglang/srt/layers/moe/token_dispatcher/mooncake.py +386 -0
  172. sglang/srt/layers/moe/token_dispatcher/standard.py +46 -0
  173. sglang/srt/layers/moe/topk.py +7 -6
  174. sglang/srt/layers/moe/utils.py +20 -5
  175. sglang/srt/layers/quantization/__init__.py +5 -58
  176. sglang/srt/layers/quantization/awq.py +183 -9
  177. sglang/srt/layers/quantization/awq_triton.py +29 -0
  178. sglang/srt/layers/quantization/base_config.py +27 -1
  179. sglang/srt/layers/quantization/compressed_tensors/__init__.py +7 -0
  180. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +20 -49
  181. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +421 -70
  182. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +3 -0
  183. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +4 -22
  184. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +339 -0
  185. sglang/srt/layers/quantization/fp8.py +152 -81
  186. sglang/srt/layers/quantization/fp8_kernel.py +55 -10
  187. sglang/srt/layers/quantization/fp8_utils.py +42 -14
  188. sglang/srt/layers/quantization/fpgemm_fp8.py +2 -3
  189. sglang/srt/layers/quantization/gguf.py +566 -0
  190. sglang/srt/layers/quantization/gptq.py +0 -1
  191. sglang/srt/layers/quantization/int8_kernel.py +18 -2
  192. sglang/srt/layers/quantization/marlin_utils.py +12 -0
  193. sglang/srt/layers/quantization/modelopt_quant.py +125 -100
  194. sglang/srt/layers/quantization/mxfp4.py +35 -68
  195. sglang/srt/layers/quantization/petit.py +1 -1
  196. sglang/srt/layers/quantization/quark/quark.py +3 -1
  197. sglang/srt/layers/quantization/quark/quark_moe.py +3 -3
  198. sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +0 -7
  199. sglang/srt/layers/quantization/unquant.py +23 -48
  200. sglang/srt/layers/quantization/utils.py +0 -1
  201. sglang/srt/layers/quantization/w4afp8.py +87 -20
  202. sglang/srt/layers/quantization/w8a8_int8.py +30 -24
  203. sglang/srt/layers/radix_attention.py +62 -9
  204. sglang/srt/layers/rotary_embedding.py +686 -17
  205. sglang/srt/layers/sampler.py +47 -16
  206. sglang/srt/layers/sparse_pooler.py +98 -0
  207. sglang/srt/layers/utils.py +0 -1
  208. sglang/srt/layers/vocab_parallel_embedding.py +4 -1
  209. sglang/srt/lora/backend/triton_backend.py +0 -1
  210. sglang/srt/lora/eviction_policy.py +139 -0
  211. sglang/srt/lora/lora_manager.py +24 -9
  212. sglang/srt/lora/lora_registry.py +1 -1
  213. sglang/srt/lora/mem_pool.py +40 -16
  214. sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +1 -1
  215. sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +4 -2
  216. sglang/srt/managers/cache_controller.py +48 -17
  217. sglang/srt/managers/data_parallel_controller.py +146 -42
  218. sglang/srt/managers/detokenizer_manager.py +40 -13
  219. sglang/srt/managers/io_struct.py +69 -16
  220. sglang/srt/managers/mm_utils.py +20 -18
  221. sglang/srt/managers/multi_tokenizer_mixin.py +83 -82
  222. sglang/srt/managers/overlap_utils.py +96 -19
  223. sglang/srt/managers/schedule_batch.py +241 -511
  224. sglang/srt/managers/schedule_policy.py +15 -2
  225. sglang/srt/managers/scheduler.py +420 -514
  226. sglang/srt/managers/scheduler_metrics_mixin.py +73 -18
  227. sglang/srt/managers/scheduler_output_processor_mixin.py +317 -111
  228. sglang/srt/managers/scheduler_pp_mixin.py +341 -0
  229. sglang/srt/managers/scheduler_profiler_mixin.py +60 -14
  230. sglang/srt/managers/scheduler_runtime_checker_mixin.py +217 -0
  231. sglang/srt/managers/scheduler_update_weights_mixin.py +33 -14
  232. sglang/srt/managers/tokenizer_communicator_mixin.py +71 -55
  233. sglang/srt/managers/tokenizer_manager.py +375 -95
  234. sglang/srt/managers/tp_worker.py +212 -161
  235. sglang/srt/managers/utils.py +78 -2
  236. sglang/srt/mem_cache/allocator.py +7 -2
  237. sglang/srt/mem_cache/allocator_ascend.py +2 -2
  238. sglang/srt/mem_cache/base_prefix_cache.py +2 -2
  239. sglang/srt/mem_cache/chunk_cache.py +13 -2
  240. sglang/srt/mem_cache/common.py +480 -0
  241. sglang/srt/mem_cache/evict_policy.py +16 -1
  242. sglang/srt/mem_cache/hicache_storage.py +11 -2
  243. sglang/srt/mem_cache/hiradix_cache.py +16 -3
  244. sglang/srt/mem_cache/mamba_radix_cache.py +993 -0
  245. sglang/srt/mem_cache/memory_pool.py +517 -219
  246. sglang/srt/mem_cache/memory_pool_host.py +0 -1
  247. sglang/srt/mem_cache/multimodal_cache.py +0 -1
  248. sglang/srt/mem_cache/radix_cache.py +53 -19
  249. sglang/srt/mem_cache/radix_cache_cpp.py +19 -14
  250. sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +8 -2
  251. sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +1 -13
  252. sglang/srt/mem_cache/storage/backend_factory.py +2 -2
  253. sglang/srt/mem_cache/storage/eic/eic_storage.py +5 -6
  254. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +0 -1
  255. sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +3 -2
  256. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +9 -3
  257. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +5 -3
  258. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +101 -17
  259. sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +38 -9
  260. sglang/srt/mem_cache/storage/nixl/nixl_utils.py +1 -1
  261. sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +17 -2
  262. sglang/srt/mem_cache/swa_radix_cache.py +92 -26
  263. sglang/srt/metrics/collector.py +31 -0
  264. sglang/srt/metrics/func_timer.py +1 -1
  265. sglang/srt/model_executor/cuda_graph_runner.py +43 -5
  266. sglang/srt/model_executor/forward_batch_info.py +71 -25
  267. sglang/srt/model_executor/model_runner.py +362 -270
  268. sglang/srt/model_executor/npu_graph_runner.py +2 -3
  269. sglang/srt/model_executor/piecewise_cuda_graph_runner.py +549 -0
  270. sglang/srt/model_loader/__init__.py +1 -1
  271. sglang/srt/model_loader/loader.py +424 -27
  272. sglang/srt/model_loader/utils.py +0 -1
  273. sglang/srt/model_loader/weight_utils.py +47 -28
  274. sglang/srt/models/apertus.py +2 -3
  275. sglang/srt/models/arcee.py +2 -2
  276. sglang/srt/models/bailing_moe.py +13 -52
  277. sglang/srt/models/bailing_moe_nextn.py +3 -4
  278. sglang/srt/models/bert.py +1 -1
  279. sglang/srt/models/deepseek_nextn.py +19 -3
  280. sglang/srt/models/deepseek_ocr.py +1516 -0
  281. sglang/srt/models/deepseek_v2.py +418 -140
  282. sglang/srt/models/dots_ocr.py +0 -2
  283. sglang/srt/models/dots_vlm.py +0 -1
  284. sglang/srt/models/dots_vlm_vit.py +1 -1
  285. sglang/srt/models/falcon_h1.py +13 -19
  286. sglang/srt/models/gemma3_mm.py +16 -0
  287. sglang/srt/models/gemma3n_mm.py +1 -2
  288. sglang/srt/models/glm4_moe.py +327 -382
  289. sglang/srt/models/glm4_moe_nextn.py +6 -16
  290. sglang/srt/models/glm4v.py +2 -1
  291. sglang/srt/models/glm4v_moe.py +32 -199
  292. sglang/srt/models/gpt_oss.py +5 -5
  293. sglang/srt/models/grok.py +10 -23
  294. sglang/srt/models/hunyuan.py +2 -7
  295. sglang/srt/models/interns1.py +0 -1
  296. sglang/srt/models/kimi_vl.py +1 -7
  297. sglang/srt/models/kimi_vl_moonvit.py +3 -1
  298. sglang/srt/models/llama.py +2 -2
  299. sglang/srt/models/llama_eagle3.py +1 -1
  300. sglang/srt/models/longcat_flash.py +5 -22
  301. sglang/srt/models/longcat_flash_nextn.py +3 -14
  302. sglang/srt/models/mimo.py +2 -13
  303. sglang/srt/models/mimo_mtp.py +1 -2
  304. sglang/srt/models/minicpmo.py +7 -5
  305. sglang/srt/models/minimax_m2.py +922 -0
  306. sglang/srt/models/mixtral.py +1 -4
  307. sglang/srt/models/mllama.py +1 -1
  308. sglang/srt/models/mllama4.py +13 -3
  309. sglang/srt/models/nemotron_h.py +511 -0
  310. sglang/srt/models/nvila.py +355 -0
  311. sglang/srt/models/nvila_lite.py +184 -0
  312. sglang/srt/models/olmo2.py +31 -4
  313. sglang/srt/models/opt.py +5 -5
  314. sglang/srt/models/phi.py +1 -1
  315. sglang/srt/models/phi4mm.py +1 -1
  316. sglang/srt/models/phimoe.py +0 -1
  317. sglang/srt/models/pixtral.py +0 -3
  318. sglang/srt/models/points_v15_chat.py +186 -0
  319. sglang/srt/models/qwen.py +0 -1
  320. sglang/srt/models/qwen2.py +22 -1
  321. sglang/srt/models/qwen2_5_vl.py +3 -3
  322. sglang/srt/models/qwen2_audio.py +2 -15
  323. sglang/srt/models/qwen2_moe.py +15 -12
  324. sglang/srt/models/qwen2_vl.py +5 -2
  325. sglang/srt/models/qwen3.py +34 -4
  326. sglang/srt/models/qwen3_moe.py +19 -37
  327. sglang/srt/models/qwen3_next.py +7 -12
  328. sglang/srt/models/qwen3_next_mtp.py +3 -4
  329. sglang/srt/models/qwen3_omni_moe.py +661 -0
  330. sglang/srt/models/qwen3_vl.py +37 -33
  331. sglang/srt/models/qwen3_vl_moe.py +57 -185
  332. sglang/srt/models/roberta.py +55 -3
  333. sglang/srt/models/sarashina2_vision.py +0 -1
  334. sglang/srt/models/step3_vl.py +3 -5
  335. sglang/srt/models/utils.py +11 -1
  336. sglang/srt/multimodal/processors/base_processor.py +7 -2
  337. sglang/srt/multimodal/processors/deepseek_ocr.py +37 -0
  338. sglang/srt/multimodal/processors/deepseek_vl_v2.py +0 -3
  339. sglang/srt/multimodal/processors/dots_vlm.py +0 -1
  340. sglang/srt/multimodal/processors/glm4v.py +2 -6
  341. sglang/srt/multimodal/processors/internvl.py +0 -2
  342. sglang/srt/multimodal/processors/janus_pro.py +0 -1
  343. sglang/srt/multimodal/processors/mllama4.py +0 -8
  344. sglang/srt/multimodal/processors/{vila.py → nvila.py} +32 -24
  345. sglang/srt/multimodal/processors/phi4mm.py +0 -1
  346. sglang/srt/multimodal/processors/points_v15_chat.py +52 -0
  347. sglang/srt/multimodal/processors/qwen_vl.py +75 -16
  348. sglang/srt/multimodal/processors/step3_vl.py +1 -1
  349. sglang/srt/parser/conversation.py +41 -0
  350. sglang/srt/parser/reasoning_parser.py +28 -2
  351. sglang/srt/sampling/custom_logit_processor.py +77 -2
  352. sglang/srt/sampling/sampling_batch_info.py +17 -22
  353. sglang/srt/sampling/sampling_params.py +70 -2
  354. sglang/srt/server_args.py +846 -163
  355. sglang/srt/server_args_config_parser.py +1 -1
  356. sglang/srt/single_batch_overlap.py +36 -31
  357. sglang/srt/speculative/base_spec_worker.py +34 -0
  358. sglang/srt/speculative/draft_utils.py +226 -0
  359. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +24 -7
  360. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +23 -2
  361. sglang/srt/speculative/eagle_info.py +57 -18
  362. sglang/srt/speculative/eagle_info_v2.py +458 -0
  363. sglang/srt/speculative/eagle_utils.py +138 -0
  364. sglang/srt/speculative/eagle_worker.py +83 -280
  365. sglang/srt/speculative/eagle_worker_v2.py +702 -0
  366. sglang/srt/speculative/{ngram_utils.py → ngram_info.py} +14 -9
  367. sglang/srt/speculative/ngram_worker.py +12 -11
  368. sglang/srt/speculative/spec_info.py +2 -0
  369. sglang/srt/speculative/spec_utils.py +38 -3
  370. sglang/srt/speculative/standalone_worker.py +4 -14
  371. sglang/srt/tokenizer/tiktoken_tokenizer.py +2 -2
  372. sglang/srt/two_batch_overlap.py +28 -14
  373. sglang/srt/utils/__init__.py +1 -1
  374. sglang/srt/{bench_utils.py → utils/bench_utils.py} +4 -2
  375. sglang/srt/utils/common.py +272 -82
  376. sglang/srt/utils/hf_transformers_utils.py +44 -17
  377. sglang/srt/{host_shared_memory.py → utils/host_shared_memory.py} +0 -1
  378. sglang/srt/{offloader.py → utils/offloader.py} +4 -4
  379. sglang/srt/utils/profile_merger.py +199 -0
  380. sglang/test/attention/test_flashattn_backend.py +1 -1
  381. sglang/test/attention/test_flashattn_mla_backend.py +0 -1
  382. sglang/test/attention/test_prefix_chunk_info.py +0 -2
  383. sglang/test/attention/test_trtllm_mla_backend.py +221 -53
  384. sglang/test/few_shot_gsm8k_engine.py +2 -4
  385. sglang/test/kit_matched_stop.py +157 -0
  386. sglang/test/longbench_v2/__init__.py +1 -0
  387. sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
  388. sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
  389. sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
  390. sglang/test/run_eval.py +41 -0
  391. sglang/test/runners.py +2 -0
  392. sglang/test/send_one.py +42 -7
  393. sglang/test/simple_eval_common.py +3 -0
  394. sglang/test/simple_eval_gpqa.py +0 -1
  395. sglang/test/simple_eval_humaneval.py +0 -3
  396. sglang/test/simple_eval_longbench_v2.py +344 -0
  397. sglang/test/test_block_fp8.py +1 -2
  398. sglang/test/test_block_fp8_deep_gemm_blackwell.py +0 -1
  399. sglang/test/test_cutlass_moe.py +1 -2
  400. sglang/test/test_cutlass_w4a8_moe.py +10 -20
  401. sglang/test/test_deterministic.py +463 -107
  402. sglang/test/test_deterministic_utils.py +74 -0
  403. sglang/test/test_disaggregation_utils.py +81 -0
  404. sglang/test/test_marlin_moe.py +0 -1
  405. sglang/test/test_utils.py +85 -20
  406. sglang/version.py +1 -1
  407. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/METADATA +48 -35
  408. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/RECORD +414 -350
  409. sglang/srt/layers/attention/mamba/mamba_utils.py +0 -81
  410. sglang/srt/managers/tp_worker_overlap_thread.py +0 -311
  411. sglang/srt/models/vila.py +0 -306
  412. sglang/srt/speculative/build_eagle_tree.py +0 -427
  413. sglang/test/test_block_fp8_ep.py +0 -358
  414. /sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/__init__.py +0 -0
  415. /sglang/srt/{aio_rwlock.py → utils/aio_rwlock.py} +0 -0
  416. /sglang/srt/{torch_memory_saver_adapter.py → utils/torch_memory_saver_adapter.py} +0 -0
  417. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/WHEEL +0 -0
  418. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/licenses/LICENSE +0 -0
  419. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/top_level.txt +0 -0
sglang/bench_serving.py CHANGED
@@ -12,7 +12,6 @@ python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-pro

 import argparse
 import asyncio
-import base64
 import io
 import json
 import os
@@ -32,9 +31,13 @@ from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple, Union

 import aiohttp
 import numpy as np
+import pybase64
 import requests
+from datasets import load_dataset
+from PIL import Image
 from tqdm.asyncio import tqdm
 from transformers import (
+    AutoProcessor,
     AutoTokenizer,
     PreTrainedTokenizer,
     PreTrainedTokenizerBase,
@@ -85,6 +88,7 @@ class RequestFuncOutput:
     latency: float = 0.0
     ttft: float = 0.0  # Time to first token
     itl: List[float] = field(default_factory=list)  # List of inter-token latencies
+    text_chunks: List[str] = field(default_factory=list)
     prompt_len: int = 0
     error: str = ""
     output_len: int = 0
@@ -209,6 +213,11 @@ async def async_request_openai_completions(
         **request_func_input.extra_request_body,
     }

+    # hack to accommodate different LoRA conventions between SGLang and vLLM.
+    if request_func_input.lora_name:
+        payload["model"] = request_func_input.lora_name
+        payload["lora_path"] = request_func_input.lora_name
+
     if request_func_input.image_data:
         payload.update({"image_data": request_func_input.image_data})

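The LoRA change above sets both keys because vLLM's OpenAI endpoint selects an adapter via "model" while SGLang reads "lora_path"; the code's own comment calls this a hack to bridge the two conventions. A minimal sketch of the resulting request body (adapter name hypothetical):

    # Both keys name the same adapter, so either backend resolves it.
    payload = {
        "model": "my-lora-adapter",      # vLLM convention
        "lora_path": "my-lora-adapter",  # SGLang convention
        "prompt": "Hello",
        "stream": True,
    }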
@@ -250,6 +259,9 @@ async def async_request_openai_completions(

                         # Decoding phase
                         else:
+                            output.text_chunks.append(
+                                data["choices"][0]["text"]
+                            )
                             output.itl.append(timestamp - most_recent_timestamp)

                         most_recent_timestamp = timestamp
@@ -322,10 +334,17 @@ async def async_request_openai_chat_completions(
         "model": request_func_input.model,
         "messages": messages,
         "temperature": 0.0,
-        "max_tokens": request_func_input.output_len,
+        "max_completion_tokens": request_func_input.output_len,
         "stream": not args.disable_stream,
+        "ignore_eos": not args.disable_ignore_eos,
         **request_func_input.extra_request_body,
     }
+
+    # hack to accommodate different LoRA conventions between SGLang and vLLM.
+    if request_func_input.lora_name:
+        payload["model"] = request_func_input.lora_name
+        payload["lora_path"] = request_func_input.lora_name
+
     headers = get_auth_headers()

     output = RequestFuncOutput.init_new(request_func_input)
@@ -559,9 +578,8 @@ async def async_request_sglang_generate(
                             num_new_tokens = output_len - last_output_len
                             if num_new_tokens == 0:
                                 continue
-                            adjust_itl = (
-                                timestamp - most_recent_timestamp
-                            ) / num_new_tokens
+                            chunk_gap = timestamp - most_recent_timestamp
+                            adjust_itl = chunk_gap / num_new_tokens
                             output.itl.extend([adjust_itl] * num_new_tokens)

                             most_recent_timestamp = timestamp
@@ -610,6 +628,48 @@ async def async_request_profile(api_url: str) -> RequestFuncOutput:
     return output


+def _build_profile_urls(
+    profile_prefill_url: Optional[List[str]],
+    profile_decode_url: Optional[List[str]],
+) -> List[Tuple[str, str]]:
+    """Build profile URLs list from prefill/decode URL arguments.
+
+    Returns:
+        List of (worker_type, url) tuples. e.g., [("Prefill-0", "http://..."), ("Decode-0", "http://...")]
+    """
+    profile_urls = []
+    if profile_prefill_url:
+        for idx, url in enumerate(profile_prefill_url):
+            profile_urls.append((f"Prefill-{idx}", url))
+    if profile_decode_url:
+        for idx, url in enumerate(profile_decode_url):
+            profile_urls.append((f"Decode-{idx}", url))
+    return profile_urls
+
+
+async def _call_profile_pd(profile_urls: List[Tuple[str, str]], mode: str) -> None:
+    """Call profile endpoint (start/stop) on PD separated workers.
+
+    Args:
+        profile_urls: List of (worker_type, url) tuples
+        mode: "start" or "stop"
+    """
+    endpoint = "/start_profile" if mode == "start" else "/stop_profile"
+    action = "Starting" if mode == "start" else "Stopping"
+    action_past = "started" if mode == "start" else "stopped"
+
+    print(f"{action} profiler...")
+
+    for worker_type, url in profile_urls:
+        profile_output = await async_request_profile(api_url=url + endpoint)
+        if profile_output.success:
+            print(f"Profiler {action_past} for {worker_type} worker at {url}")
+        else:
+            print(
+                f"Failed to {mode} profiler for {worker_type} worker at {url}: {profile_output.error}"
+            )
+
+
 def get_model(pretrained_model_name_or_path: str) -> str:
     if os.getenv("SGLANG_USE_MODELSCOPE", "false").lower() == "true":
         import huggingface_hub.constants
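The two helpers above give each disaggregated worker a label and drive its profiler endpoints individually. A minimal sketch of how they compose (URL hypothetical):

    urls = _build_profile_urls(
        profile_prefill_url=["http://localhost:30000"],
        profile_decode_url=None,
    )
    # urls == [("Prefill-0", "http://localhost:30000")]
    # await _call_profile_pd(urls, "start")  # calls .../start_profile on each worker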
@@ -648,7 +708,30 @@ def get_tokenizer(
     )


-def get_dataset(args, tokenizer):
+def get_processor(
+    pretrained_model_name_or_path: str,
+) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
+    assert (
+        pretrained_model_name_or_path is not None
+        and pretrained_model_name_or_path != ""
+    )
+    if pretrained_model_name_or_path.endswith(
+        ".json"
+    ) or pretrained_model_name_or_path.endswith(".model"):
+        from sglang.srt.utils.hf_transformers_utils import get_processor
+
+        return get_processor(pretrained_model_name_or_path)
+
+    if pretrained_model_name_or_path is not None and not os.path.exists(
+        pretrained_model_name_or_path
+    ):
+        pretrained_model_name_or_path = get_model(pretrained_model_name_or_path)
+    return AutoProcessor.from_pretrained(
+        pretrained_model_name_or_path, trust_remote_code=True
+    )
+
+
+def get_dataset(args, tokenizer, model_id=None):
     tokenize_prompt = getattr(args, "tokenize_prompt", False)
     if args.dataset_name == "sharegpt":
         assert not tokenize_prompt
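The multimodal datasets below count image tokens, so they resolve the model's AutoProcessor from the served model id rather than relying on a bare tokenizer. A usage sketch (the model id is an arbitrary example):

    processor = get_processor("Qwen/Qwen2.5-VL-7B-Instruct")
    tokenizer = processor.tokenizer  # reused for the text part of the prompt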
@@ -661,7 +744,7 @@ def get_dataset(args, tokenizer):
             prompt_suffix=args.prompt_suffix,
             apply_chat_template=args.apply_chat_template,
         )
-    elif args.dataset_name.startswith("random") and args.dataset_name != "random-image":
+    elif args.dataset_name.startswith("random"):
         input_requests = sample_random_requests(
             input_len=args.random_input_len,
             output_len=args.random_output_len,
@@ -672,17 +755,19 @@
             random_sample=args.dataset_name == "random",
             return_text=not tokenize_prompt,
         )
-    elif args.dataset_name == "random-image":
-        assert not tokenize_prompt, "random-image does not support --tokenize-prompt"
-        input_requests = sample_random_image_requests(
+    elif args.dataset_name == "image":
+        processor = get_processor(model_id)
+        input_requests = sample_image_requests(
             num_requests=args.num_prompts,
-            num_images=args.random_image_num_images,
+            image_count=args.image_count,
             input_len=args.random_input_len,
             output_len=args.random_output_len,
             range_ratio=args.random_range_ratio,
-            tokenizer=tokenizer,
-            apply_chat_template=args.apply_chat_template,
-            image_resolution=args.random_image_resolution,
+            processor=processor,
+            image_content=args.image_content,
+            image_format=args.image_format,
+            image_resolution=args.image_resolution,
+            backend=args.backend,
         )
     elif args.dataset_name == "generated-shared-prefix":
         assert not tokenize_prompt
@@ -696,12 +781,12 @@
             args=args,
         )
     elif args.dataset_name == "mmmu":
-        assert not tokenize_prompt
+        processor = get_processor(model_id)
         input_requests = sample_mmmu_requests(
             num_requests=args.num_prompts,
-            tokenizer=tokenizer,
+            processor=processor,
+            backend=args.backend,
             fixed_output_len=args.random_output_len,
-            apply_chat_template=args.apply_chat_template,
             random_sample=True,
         )
     elif args.dataset_name == "mooncake":
@@ -746,6 +831,8 @@ ASYNC_REQUEST_FUNCS = {
 class BenchmarkMetrics:
     completed: int
     total_input: int
+    total_input_text: int
+    total_input_vision: int
     total_output: int
     total_output_retokenized: int
     request_throughput: float
@@ -839,9 +926,17 @@ class DatasetRow:
     prompt: str
     prompt_len: int
     output_len: int
+    text_prompt_len: Optional[int] = None
+    vision_prompt_len: Optional[int] = None
     image_data: Optional[List[str]] = None
     timestamp: Optional[float] = None

+    def __post_init__(self):
+        if self.text_prompt_len is None:
+            self.text_prompt_len = self.prompt_len
+        if self.vision_prompt_len is None:
+            self.vision_prompt_len = 0
+

 async def get_mooncake_request_over_time(
     input_requests: List[Dict],
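The __post_init__ hook keeps existing text-only call sites working: a row that never sets the new fields counts all prompt tokens as text. A quick sketch:

    row = DatasetRow(prompt="hello", prompt_len=3, output_len=16)
    assert row.text_prompt_len == 3 and row.vision_prompt_len == 0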
@@ -889,7 +984,7 @@
         for i in range(num_rounds):
             # Add user query for the current round
             chat_history.append(
-                {"role": "user", "content": f"Round {i+1}: {user_query_base}"}
+                {"role": "user", "content": f"Round {i + 1}: {user_query_base}"}
             )

             # Form the full prompt from history
@@ -918,9 +1013,9 @@

 def sample_mmmu_requests(
     num_requests: int,
-    tokenizer: PreTrainedTokenizerBase,
+    processor: AutoProcessor | AutoTokenizer,
+    backend: str,
     fixed_output_len: Optional[int] = None,
-    apply_chat_template: bool = True,
     random_sample: bool = True,
 ) -> List[DatasetRow]:
     """
@@ -928,22 +1023,12 @@ def sample_mmmu_requests(

     Args:
         num_requests: Number of requests to sample.
-        tokenizer: Tokenizer to use for token counting.
         fixed_output_len: If provided, use this fixed output length for all requests.
-        apply_chat_template: Whether to apply the chat template to the prompt.
         random_sample: Whether to randomly sample or take the first N.

     Returns:
         List of tuples (prompt, prompt_token_len, output_token_len).
     """
-    try:
-        import io
-
-        import pybase64
-        from datasets import load_dataset
-    except ImportError:
-        raise ImportError("Please install datasets: pip install datasets")
-
     print("Loading MMMU dataset from HuggingFace...")

     try:
@@ -999,54 +1084,12 @@
                 question = example.get("question")

                 # Construct the prompt
-                prompt = f"Question: {question}\n\nAnswer: "
-                if apply_chat_template:
-                    try:
-                        is_phi4_multimodal = (
-                            "phi-4-multimodal" in tokenizer.name_or_path.lower()
-                        )
-                        if is_phi4_multimodal:
-                            # <|endoftext10|> is the image token used in the phi-4-multimodal model.
-                            content = prompt.replace("image 1", "<|endoftext10|>")
-                        else:
-                            content = [
-                                {
-                                    "type": "image_url",
-                                    "image_url": {"url": image_data},
-                                },
-                                {"type": "text", "text": prompt},
-                            ]
-                        prompt = tokenizer.apply_chat_template(
-                            [
-                                {
-                                    "role": "user",
-                                    "content": content,
-                                }
-                            ],
-                            add_generation_prompt=True,
-                            tokenize=False,
-                        )
-                    except Exception as e:
-                        # Note (Xinyuan): This is a workaround for an issue where some tokenizers do not support content as a list. (e.g. InternVL)
-                        print(
-                            f"Error applying chat template: {e}, fallback to <image> tag"
-                        )
-                        prompt = f"<image>{prompt}"
-
-                # Calculate token lengths for text only (without image data)
-                prompt_token_ids = tokenizer.encode(prompt)
-                prompt_len = len(prompt_token_ids)
-
+                text_prompt = f"Question: {question}\n\nAnswer: "
                 output_len = fixed_output_len if fixed_output_len is not None else 256
-
-                filtered_dataset.append(
-                    DatasetRow(
-                        prompt=prompt,
-                        prompt_len=prompt_len,
-                        output_len=output_len,
-                        image_data=[image_data],
-                    )
+                data_row = create_mm_data_row(
+                    text_prompt, [image], [image_data], output_len, processor, backend
                 )
+                filtered_dataset.append(data_row)

             except Exception as e:
                 print(f"Error processing example {i}: {e}")
@@ -1134,7 +1177,11 @@
             continue

         filtered_dataset.append(
-            DatasetRow(prompt=prompt, prompt_len=prompt_len, output_len=output_len)
+            DatasetRow(
+                prompt=prompt,
+                prompt_len=prompt_len,
+                output_len=output_len,
+            )
         )

     print(f"#Input tokens: {np.sum([x.prompt_len for x in filtered_dataset])}")
@@ -1245,7 +1292,7 @@
     return input_requests


-def parse_random_image_resolution(image_resolution: str) -> Tuple[int, int]:
+def parse_image_resolution(image_resolution: str) -> Tuple[int, int]:
     """Parse image resolution into (width, height).

     Supports presets '1080p', '720p', '360p' and custom 'heightxwidth' format
@@ -1270,44 +1317,109 @@ def parse_random_image_resolution(image_resolution: str) -> Tuple[int, int]:
         return (width, height)

     raise ValueError(
-        f"Unsupported random-image resolution: {image_resolution}. "
+        f"Unsupported image resolution: {image_resolution}. "
         "Choose from 4k, 1080p, 720p, 360p, or provide custom 'heightxwidth' (e.g., 1080x1920)."
     )


-def sample_random_image_requests(
+def create_mm_data_row(
+    text_prompt, images: list, images_base64, output_len, processor, backend
+):
+    try:
+        if type(processor).__name__ == "Phi4MMProcessor":
+            # <|endoftext10|> is the image token used in the phi-4-multimodal model.
+            content_items = text_prompt.replace("image 1", "<|endoftext10|>")
+        else:
+            content_items = [
+                {"type": "image", "image": {"url": image_base64}}
+                for image_base64 in images_base64
+            ]
+            content_items.append({"type": "text", "text": text_prompt})
+        prompt_str = processor.apply_chat_template(
+            [{"role": "user", "content": content_items}],
+            add_generation_prompt=True,
+            tokenize=False,
+        )
+    except Exception as e:
+        # Note (Xinyuan): This is a workaround for an issue where some tokenizers do not support content as a list. (e.g. InternVL)
+        print(f"Error applying chat template: {e}, fallback to <image> tag")
+        # Some tokenizers do not support list content; fall back to a placeholder in the text
+        prompt_str = f"<image>{text_prompt}"

+    # Calculate total tokens (text + vision)
+    prompt_len = processor(
+        text=[prompt_str],
+        images=images,
+        padding=False,
+        return_tensors="pt",
+    )["input_ids"].numel()
+
+    # Calculate text-only tokens
+    try:
+        # Create text-only version of the prompt
+        text_only_prompt = processor.apply_chat_template(
+            [{"role": "user", "content": text_prompt}],
+            add_generation_prompt=True,
+            tokenize=False,
+        )
+        text_prompt_len = processor(
+            text=[text_only_prompt],
+            padding=False,
+            return_tensors="pt",
+        )["input_ids"].numel()
+    except Exception:
+        # Fallback: just tokenize the text prompt directly
+        text_prompt_len = len(processor.tokenizer.encode(text_prompt))
+
+    # Vision tokens = total tokens - text tokens
+    vision_prompt_len = prompt_len - text_prompt_len
+
+    use_raw_prompt = backend in [
+        "sglang-oai",
+        "sglang-oai-chat",
+        "vllm",
+        "vllm-chat",
+        "lmdeploy",
+        "lmdeploy-chat",
+    ]
+    return DatasetRow(
+        prompt=text_prompt if use_raw_prompt else prompt_str,
+        prompt_len=prompt_len,
+        output_len=output_len,
+        text_prompt_len=text_prompt_len,
+        vision_prompt_len=vision_prompt_len,
+        image_data=images_base64,
+    )
+
+
+def sample_image_requests(
     num_requests: int,
-    num_images: int,
+    image_count: int,
     input_len: int,
     output_len: int,
     range_ratio: float,
-    tokenizer: PreTrainedTokenizerBase,
-    apply_chat_template: bool = True,
-    image_resolution: str = "1080p",
+    processor: AutoProcessor,
+    image_content: str,
+    image_format: str,
+    image_resolution: str,
+    backend: str,
 ) -> List[DatasetRow]:
-    """Generate requests with random images.
+    """Generate requests with images.

-    - Each request includes ``num_images`` random images.
+    - Each request includes ``image_count`` images.
     - Supported resolutions: 4k (3840x2160), 1080p (1920x1080), 720p (1280x720), 360p (640x360),
       or custom 'heightxwidth' (e.g., 1080x1920).
     - Text lengths follow the 'random' dataset sampling rule. ``prompt_len``
       only counts text tokens and excludes image data.
     """
-    try:
-        import pybase64
-        from PIL import Image
-    except ImportError as e:
-        raise ImportError(
-            "Please install Pillow to generate random images: pip install pillow"
-        ) from e

     # Parse resolution (supports presets and 'heightxwidth')
-    width, height = parse_random_image_resolution(image_resolution)
+    width, height = parse_image_resolution(image_resolution)

     # Check for potentially problematic combinations and warn user
-    if width * height >= 1920 * 1080 and num_images * num_requests >= 100:
+    if width * height >= 1920 * 1080 and image_count * num_requests >= 100:
         warnings.warn(
-            f"High resolution ({width}x{height}) with {num_images * num_requests} total images "
+            f"High resolution ({width}x{height}) with {image_count * num_requests} total images "
             f"may take a long time. Consider reducing resolution or image count.",
             UserWarning,
             stacklevel=2,
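The accounting in create_mm_data_row is subtractive: the processor tokenizes the full prompt with images, then a text-only rendering, and the difference is attributed to vision. A worked sketch with hypothetical counts:

    # processor(text=[prompt_str], images=images) -> 1234 input ids in total;
    # the text-only prompt alone -> 210, so the images account for the rest.
    prompt_len = 1234
    text_prompt_len = 210
    vision_prompt_len = prompt_len - text_prompt_len  # 1024 vision tokens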
@@ -1321,53 +1433,51 @@ def sample_random_image_requests(
         int(output_len * range_ratio), output_len + 1, size=num_requests
     )

-    def _gen_random_image_data_uri(width: int = width, height: int = height) -> str:
-        arr = (np.random.rand(height, width, 3) * 255).astype(np.uint8)
-        img = Image.fromarray(arr, mode="RGB")
+    def _gen_random_image_data_uri(
+        width: int = width, height: int = height
+    ) -> (Image, str, int):
+        if image_content == "blank":
+            # Generate blank white image
+            arr = np.full((height, width, 3), 255, dtype=np.uint8)
+        else:
+            # Generate random colored image
+            arr = (np.random.rand(height, width, 3) * 255).astype(np.uint8)
+        img = Image.fromarray(arr)
         buf = io.BytesIO()
-        img.save(buf, format="JPEG", quality=85)
+        img.save(buf, format=image_format, quality=85)
         encoded = pybase64.b64encode(buf.getvalue()).decode("utf-8")
-        return f"data:image/jpeg;base64,{encoded}"
+        image_data = f"data:image/{image_format};base64,{encoded}"
+        image_bytes = len(image_data.encode("utf-8"))
+        return img, image_data, image_bytes

     dataset: List[DatasetRow] = []
+    total_image_bytes = 0
     for i in range(num_requests):
         # Generate text prompt
-        text_prompt = gen_prompt(tokenizer, int(input_lens[i]))
+        text_prompt = gen_prompt(processor.tokenizer, int(input_lens[i]))

         # Generate image list
-        images = [_gen_random_image_data_uri() for _ in range(num_images)]
-
-        prompt_str = text_prompt
-        if apply_chat_template:
-            try:
-                content_items = [
-                    {"type": "image_url", "image_url": {"url": img_url}}
-                    for img_url in images
-                ]
-                content_items.append({"type": "text", "text": text_prompt})
-                prompt_str = tokenizer.apply_chat_template(
-                    [{"role": "user", "content": content_items}],
-                    add_generation_prompt=True,
-                    tokenize=False,
-                )
-            except Exception:
-                # Some tokenizers do not support list content; fall back to a placeholder in the text
-                prompt_str = f"<image>{text_prompt}"
-
-        prompt_token_ids = tokenizer.encode(prompt_str)
-        prompt_token_len = len(prompt_token_ids)
-
-        dataset.append(
-            DatasetRow(
-                prompt=prompt_str,
-                prompt_len=prompt_token_len,
-                output_len=int(output_lens[i]),
-                image_data=images,
-            )
+        images, images_base64, images_bytes = zip(
+            *[_gen_random_image_data_uri() for _ in range(image_count)]
+        )
+        total_image_bytes += sum(list(images_bytes))
+
+        data_row = create_mm_data_row(
+            text_prompt,
+            list(images),
+            list(images_base64),
+            int(output_lens[i]),
+            processor,
+            backend,
         )

+        dataset.append(data_row)
+
     print(f"#Input tokens: {np.sum([x.prompt_len for x in dataset])}")
     print(f"#Output tokens: {np.sum([x.output_len for x in dataset])}")
+    print(
+        f"\nCreated {len(dataset)} {image_content} {image_format} images with average {total_image_bytes // num_requests} bytes per request"
+    )
     return dataset

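Putting the renamed knobs together, a run against the image dataset might look like this (server address, sizes, and counts are illustrative):

    python3 -m sglang.bench_serving --backend sglang --dataset-name image \
        --image-count 2 --image-resolution 720p --image-format png --image-content blank \
        --num-prompts 100 --random-input-len 1024 --random-output-len 512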
@@ -1439,7 +1549,9 @@ def sample_generated_shared_prefix_requests(

         input_requests.append(
             DatasetRow(
-                prompt=full_prompt, prompt_len=prompt_len, output_len=output_len
+                prompt=full_prompt,
+                prompt_len=prompt_len,
+                output_len=output_len,
             )
         )
         total_input_tokens += prompt_len
@@ -1517,15 +1629,26 @@ def calculate_metrics(
     dur_s: float,
     tokenizer: PreTrainedTokenizerBase,
     backend: str,
+    accept_length: Optional[float] = None,
 ) -> Tuple[BenchmarkMetrics, List[int]]:
     output_lens: List[int] = []
     retokenized_output_lens: List[int] = []
     total_input = 0
+    total_input_text = 0
+    total_input_vision = 0
     completed = 0
     itls: List[float] = []
     tpots: List[float] = []
     ttfts: List[float] = []
     e2e_latencies: List[float] = []
+    retokenized_itls: List[float] = []
+
+    use_retokenized_itl = (
+        accept_length is not None
+        and accept_length > 0
+        and backend in ("sglang-oai", "sglang-oai-chat")
+    )
+
     for i in range(len(outputs)):
         if outputs[i].success:
             output_len = outputs[i].output_len
@@ -1534,10 +1657,22 @@
                 tokenizer.encode(outputs[i].generated_text, add_special_tokens=False)
             )
             retokenized_output_lens.append(retokenized_output_len)
-            total_input += outputs[i].prompt_len
+            total_input += input_requests[i].prompt_len
+            total_input_text += input_requests[i].text_prompt_len
+            total_input_vision += input_requests[i].vision_prompt_len
             if output_len > 1:
                 tpots.append((outputs[i].latency - outputs[i].ttft) / (output_len - 1))
-            itls += outputs[i].itl
+            if use_retokenized_itl:
+                for k, itl in enumerate(outputs[i].itl):
+                    num_tokens = len(
+                        tokenizer.encode(
+                            outputs[i].text_chunks[k], add_special_tokens=False
+                        )
+                    )
+                    adjusted_itl = itl / num_tokens
+                    retokenized_itls.extend([adjusted_itl] * num_tokens)
+            else:
+                itls += outputs[i].itl
             ttfts.append(outputs[i].ttft)

             e2e_latencies.append(outputs[i].latency)
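With speculative decoding, one streamed chunk can carry several tokens, so the raw chunk gap would overstate per-token latency; the branch above retokenizes each chunk and spreads the gap evenly. A worked sketch with hypothetical numbers:

    # A chunk arrives 30 ms after the previous one and retokenizes to 3 tokens,
    # so each token is credited 10 ms of inter-token latency.
    itl_chunk, num_tokens = 0.030, 3
    retokenized_itls = [itl_chunk / num_tokens] * num_tokens  # [0.01, 0.01, 0.01]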
@@ -1553,9 +1688,13 @@
             "on the benchmark arguments.",
             stacklevel=2,
         )
+
+    itls = retokenized_itls if use_retokenized_itl else itls
     metrics = BenchmarkMetrics(
         completed=completed,
         total_input=total_input,
+        total_input_text=total_input_text,
+        total_input_vision=total_input_vision,
         total_output=sum(output_lens),
         total_output_retokenized=sum(retokenized_output_lens),
         request_throughput=completed / dur_s,
@@ -1609,6 +1748,8 @@ async def benchmark(
     use_trace_timestamps: bool = False,
     mooncake_slowdown_factor=1.0,
     mooncake_num_rounds=1,
+    profile_prefill_url: Optional[List[str]] = None,
+    profile_decode_url: Optional[List[str]] = None,
 ):
     if backend in ASYNC_REQUEST_FUNCS:
         request_func = ASYNC_REQUEST_FUNCS[backend]
@@ -1698,14 +1839,28 @@

     time.sleep(1.0)

+    # Build profile URLs for PD separated mode (do this once at the beginning)
+    pd_profile_urls = []
+    if profile and pd_separated:
+        pd_profile_urls = _build_profile_urls(profile_prefill_url, profile_decode_url)
+        if not pd_profile_urls:
+            print(
+                "Warning: PD separated mode requires --profile-prefill-url or --profile-decode-url"
+            )
+            print("Skipping profiler start. Please specify worker URLs for profiling.")
+
     # Start profiler
     if profile:
-        print("Starting profiler...")
-        profile_output = await async_request_profile(
-            api_url=base_url + "/start_profile"
-        )
-        if profile_output.success:
-            print("Profiler started")
+        if pd_separated:
+            if pd_profile_urls:
+                await _call_profile_pd(pd_profile_urls, "start")
+        else:
+            print("Starting profiler...")
+            profile_output = await async_request_profile(
+                api_url=base_url + "/start_profile"
+            )
+            if profile_output.success:
+                print("Profiler started")

     # Run all requests
     benchmark_start_time = time.perf_counter()
@@ -1754,10 +1909,16 @@

     # Stop profiler
     if profile:
-        print("Stopping profiler...")
-        profile_output = await async_request_profile(api_url=base_url + "/stop_profile")
-        if profile_output.success:
-            print("Profiler stopped")
+        if pd_separated:
+            if pd_profile_urls:
+                await _call_profile_pd(pd_profile_urls, "stop")
+        else:
+            print("Stopping profiler...")
+            profile_output = await async_request_profile(
+                api_url=base_url + "/stop_profile"
+            )
+            if profile_output.success:
+                print("Profiler stopped")

     if pbar is not None:
         pbar.close()
@@ -1770,9 +1931,15 @@
             server_info_json = server_info.json()
             if "decode" in server_info_json:
                 server_info_json = server_info_json["decode"][0]
-            accept_length = server_info_json["internal_states"][0].get(
-                "avg_spec_accept_length", None
-            )
+            if (
+                "internal_states" in server_info_json
+                and server_info_json["internal_states"]
+            ):
+                accept_length = server_info_json["internal_states"][0].get(
+                    "avg_spec_accept_length", None
+                )
+            else:
+                accept_length = None
         else:
             accept_length = None
     else:
@@ -1786,6 +1953,7 @@
         dur_s=benchmark_duration,
         tokenizer=tokenizer,
         backend=backend,
+        accept_length=accept_length,
     )

     print("\n{s:{c}^{n}}".format(s=" Serving Benchmark Result ", n=50, c="="))
@@ -1804,6 +1972,10 @@
     print("{:<40} {:<10}".format("Successful requests:", metrics.completed))
     print("{:<40} {:<10.2f}".format("Benchmark duration (s):", benchmark_duration))
     print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input))
+    print("{:<40} {:<10}".format("Total input text tokens:", metrics.total_input_text))
+    print(
+        "{:<40} {:<10}".format("Total input vision tokens:", metrics.total_input_vision)
+    )
     print("{:<40} {:<10}".format("Total generated tokens:", metrics.total_output))
     print(
         "{:<40} {:<10}".format(
@@ -1873,6 +2045,8 @@
             "duration": benchmark_duration,
             "completed": metrics.completed,
             "total_input_tokens": metrics.total_input,
+            "total_input_text_tokens": metrics.total_input_text,
+            "total_input_vision_tokens": metrics.total_input_vision,
             "total_output_tokens": metrics.total_output,
             "total_output_tokens_retokenized": metrics.total_output_retokenized,
             "request_throughput": metrics.request_throughput,
@@ -1907,11 +2081,11 @@
         output_file_name = args.output_file
     else:
         now = datetime.now().strftime("%m%d")
-        if args.dataset_name == "random-image":
+        if args.dataset_name == "image":
            output_file_name = (
                 f"{args.backend}_{now}_{args.num_prompts}_{args.random_input_len}_"
-                f"{args.random_output_len}_{args.random_image_num_images}imgs_"
-                f"{args.random_image_resolution}.jsonl"
+                f"{args.random_output_len}_{args.image_count}imgs_"
+                f"{args.image_resolution}.jsonl"
             )
         elif args.dataset_name.startswith("random"):
             output_file_name = f"{args.backend}_{now}_{args.num_prompts}_{args.random_input_len}_{args.random_output_len}.jsonl"
@@ -2087,6 +2261,12 @@ def run_benchmark(args_: argparse.Namespace):
             "Because when the tokenizer counts the output tokens, if there is gibberish, it might count incorrectly.\n"
         )

+    if args.dataset_name in ["image", "mmmu"]:
+        args.apply_chat_template = True
+        assert (
+            not args.tokenize_prompt
+        ), "`--tokenize-prompt` not compatible with image dataset"
+
     print(f"{args}\n")

     # Read dataset
@@ -2094,7 +2274,7 @@
     model_id = args.model
     tokenizer_id = args.tokenizer if args.tokenizer is not None else args.model
     tokenizer = get_tokenizer(tokenizer_id)
-    input_requests = get_dataset(args, tokenizer)
+    input_requests = get_dataset(args, tokenizer, model_id)

     # compatible with SimpleNamespace
     if not hasattr(args, "flush_cache"):
@@ -2120,6 +2300,8 @@
             use_trace_timestamps=args.use_trace_timestamps,
             mooncake_slowdown_factor=args.mooncake_slowdown_factor,
             mooncake_num_rounds=args.mooncake_num_rounds,
+            profile_prefill_url=getattr(args, "profile_prefill_url", None),
+            profile_decode_url=getattr(args, "profile_decode_url", None),
         )
     )

@@ -2175,7 +2357,7 @@
             "random-ids",
             "generated-shared-prefix",
             "mmmu",
-            "random-image",
+            "image",
             "mooncake",
         ],
         help="Name of the dataset to benchmark on.",
@@ -2215,37 +2397,49 @@
         "--random-input-len",
         type=int,
         default=1024,
-        help="Number of input tokens per request, used only for random dataset.",
+        help="Number of input tokens per request, used only for random and image dataset.",
     )
     parser.add_argument(
         "--random-output-len",
         default=1024,
         type=int,
-        help="Number of output tokens per request, used only for random dataset.",
+        help="Number of output tokens per request, used only for random and image dataset.",
     )
     parser.add_argument(
         "--random-range-ratio",
         type=float,
         default=0.0,
         help="Range of sampled ratio of input/output length, "
-        "used only for random dataset.",
+        "used only for random and image dataset.",
     )
-    # random-image dataset args
+    # image dataset args
     parser.add_argument(
-        "--random-image-num-images",
+        "--image-count",
         type=int,
         default=1,
-        help="Number of images per request (only available with the random-image dataset)",
+        help="Number of images per request (only available with the image dataset)",
    )
     parser.add_argument(
-        "--random-image-resolution",
+        "--image-resolution",
         type=str,
         default="1080p",
         help=(
-            "Resolution of random images for random-image dataset. "
+            "Resolution of images for image dataset. "
             "Supports presets 4k/1080p/720p/360p or custom 'heightxwidth' (e.g., 1080x1920)."
         ),
     )
+    parser.add_argument(
+        "--image-format",
+        type=str,
+        default="jpeg",
+        help=("Format of images for image dataset. " "Supports jpeg and png."),
+    )
+    parser.add_argument(
+        "--image-content",
+        type=str,
+        default="random",
+        help=("Content for images for image dataset. " "Supports random and blank."),
+    )
     parser.add_argument(
         "--request-rate",
         type=float,
@@ -2333,6 +2527,30 @@
         action="store_true",
         help="Benchmark PD disaggregation server",
     )
+
+    # Create a mutually exclusive group for profiling URLs
+    # In PD separated mode, prefill and decode workers must be profiled separately
+    profile_url_group = parser.add_mutually_exclusive_group()
+    profile_url_group.add_argument(
+        "--profile-prefill-url",
+        type=str,
+        nargs="*",
+        default=None,
+        help="URL(s) of the prefill worker(s) for profiling in PD separated mode. "
+        "Can specify multiple URLs: --profile-prefill-url http://localhost:30000 http://localhost:30001. "
+        "NOTE: Cannot be used together with --profile-decode-url. "
+        "In PD separated mode, prefill and decode workers must be profiled separately.",
+    )
+    profile_url_group.add_argument(
+        "--profile-decode-url",
+        type=str,
+        nargs="*",
+        default=None,
+        help="URL(s) of the decode worker(s) for profiling in PD separated mode. "
+        "Can specify multiple URLs: --profile-decode-url http://localhost:30010 http://localhost:30011. "
+        "NOTE: Cannot be used together with --profile-prefill-url. "
+        "In PD separated mode, prefill and decode workers must be profiled separately.",
+    )
     parser.add_argument(
         "--flush-cache",
         action="store_true",
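For PD disaggregation, profiling is driven per worker rather than through a single base URL; assuming the existing --profile and --pd-separated flags, a prefill-side profiling run could be launched as (ports illustrative):

    python3 -m sglang.bench_serving --backend sglang --dataset-name random \
        --pd-separated --profile \
        --profile-prefill-url http://localhost:30000 http://localhost:30001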