sglang 0.5.3rc2__py3-none-any.whl → 0.5.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (408) hide show
  1. sglang/bench_one_batch.py +47 -28
  2. sglang/bench_one_batch_server.py +41 -25
  3. sglang/bench_serving.py +330 -156
  4. sglang/check_env.py +1 -1
  5. sglang/compile_deep_gemm.py +6 -2
  6. sglang/global_config.py +1 -25
  7. sglang/lang/api.py +6 -0
  8. sglang/lang/interpreter.py +1 -0
  9. sglang/lang/ir.py +13 -0
  10. sglang/launch_server.py +8 -15
  11. sglang/profiler.py +18 -1
  12. sglang/srt/_custom_ops.py +1 -1
  13. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +4 -6
  14. sglang/srt/checkpoint_engine/checkpoint_engine_worker.py +142 -0
  15. sglang/srt/compilation/backend.py +437 -0
  16. sglang/srt/compilation/compilation_config.py +20 -0
  17. sglang/srt/compilation/compilation_counter.py +47 -0
  18. sglang/srt/compilation/compile.py +210 -0
  19. sglang/srt/compilation/compiler_interface.py +503 -0
  20. sglang/srt/compilation/cuda_piecewise_backend.py +228 -0
  21. sglang/srt/compilation/fix_functionalization.py +134 -0
  22. sglang/srt/compilation/fx_utils.py +83 -0
  23. sglang/srt/compilation/inductor_pass.py +140 -0
  24. sglang/srt/compilation/pass_manager.py +66 -0
  25. sglang/srt/compilation/piecewise_context_manager.py +40 -0
  26. sglang/srt/compilation/weak_ref_tensor_jit.py +16 -0
  27. sglang/srt/configs/__init__.py +4 -0
  28. sglang/srt/configs/deepseek_ocr.py +262 -0
  29. sglang/srt/configs/deepseekvl2.py +194 -96
  30. sglang/srt/configs/dots_vlm.py +2 -7
  31. sglang/srt/configs/falcon_h1.py +13 -64
  32. sglang/srt/configs/load_config.py +25 -2
  33. sglang/srt/configs/mamba_utils.py +117 -0
  34. sglang/srt/configs/model_config.py +134 -23
  35. sglang/srt/configs/modelopt_config.py +30 -0
  36. sglang/srt/configs/nemotron_h.py +286 -0
  37. sglang/srt/configs/olmo3.py +105 -0
  38. sglang/srt/configs/points_v15_chat.py +29 -0
  39. sglang/srt/configs/qwen3_next.py +11 -47
  40. sglang/srt/configs/qwen3_omni.py +613 -0
  41. sglang/srt/configs/qwen3_vl.py +0 -10
  42. sglang/srt/connector/remote_instance.py +1 -1
  43. sglang/srt/constrained/base_grammar_backend.py +5 -1
  44. sglang/srt/constrained/llguidance_backend.py +5 -0
  45. sglang/srt/constrained/outlines_backend.py +1 -1
  46. sglang/srt/constrained/reasoner_grammar_backend.py +9 -6
  47. sglang/srt/constrained/utils.py +12 -0
  48. sglang/srt/constrained/xgrammar_backend.py +20 -11
  49. sglang/srt/disaggregation/ascend/transfer_engine.py +1 -1
  50. sglang/srt/disaggregation/base/conn.py +17 -4
  51. sglang/srt/disaggregation/common/conn.py +4 -2
  52. sglang/srt/disaggregation/decode.py +123 -31
  53. sglang/srt/disaggregation/decode_kvcache_offload_manager.py +1 -1
  54. sglang/srt/disaggregation/fake/conn.py +11 -3
  55. sglang/srt/disaggregation/mooncake/conn.py +157 -19
  56. sglang/srt/disaggregation/nixl/conn.py +69 -24
  57. sglang/srt/disaggregation/prefill.py +96 -270
  58. sglang/srt/distributed/device_communicators/all_reduce_utils.py +4 -4
  59. sglang/srt/distributed/device_communicators/custom_all_reduce.py +6 -6
  60. sglang/srt/distributed/device_communicators/pymscclpp.py +2 -2
  61. sglang/srt/distributed/device_communicators/pynccl.py +24 -12
  62. sglang/srt/distributed/device_communicators/pynccl_allocator.py +2 -2
  63. sglang/srt/distributed/device_communicators/symm_mem.py +1 -1
  64. sglang/srt/distributed/naive_distributed.py +5 -4
  65. sglang/srt/distributed/parallel_state.py +70 -19
  66. sglang/srt/elastic_ep/elastic_ep.py +74 -0
  67. sglang/srt/entrypoints/context.py +3 -2
  68. sglang/srt/entrypoints/engine.py +66 -66
  69. sglang/srt/entrypoints/grpc_server.py +431 -234
  70. sglang/srt/entrypoints/harmony_utils.py +2 -2
  71. sglang/srt/entrypoints/http_server.py +120 -8
  72. sglang/srt/entrypoints/http_server_engine.py +1 -7
  73. sglang/srt/entrypoints/openai/protocol.py +225 -37
  74. sglang/srt/entrypoints/openai/serving_base.py +49 -2
  75. sglang/srt/entrypoints/openai/serving_chat.py +29 -74
  76. sglang/srt/entrypoints/openai/serving_classify.py +204 -0
  77. sglang/srt/entrypoints/openai/serving_completions.py +15 -1
  78. sglang/srt/entrypoints/openai/serving_responses.py +5 -2
  79. sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
  80. sglang/srt/environ.py +42 -4
  81. sglang/srt/eplb/eplb_algorithms/__init__.py +18 -1
  82. sglang/srt/eplb/eplb_algorithms/deepseek.py +0 -2
  83. sglang/srt/eplb/eplb_algorithms/elasticity_aware.py +87 -0
  84. sglang/srt/eplb/expert_distribution.py +3 -4
  85. sglang/srt/eplb/expert_location_dispatch.py +2 -2
  86. sglang/srt/eplb/expert_location_updater.py +2 -2
  87. sglang/srt/function_call/base_format_detector.py +17 -18
  88. sglang/srt/function_call/function_call_parser.py +18 -14
  89. sglang/srt/function_call/glm4_moe_detector.py +1 -5
  90. sglang/srt/function_call/gpt_oss_detector.py +1 -1
  91. sglang/srt/function_call/json_array_parser.py +0 -2
  92. sglang/srt/function_call/utils.py +2 -2
  93. sglang/srt/grpc/compile_proto.py +3 -3
  94. sglang/srt/{entrypoints → grpc}/grpc_request_manager.py +112 -52
  95. sglang/srt/grpc/health_servicer.py +189 -0
  96. sglang/srt/grpc/scheduler_launcher.py +181 -0
  97. sglang/srt/grpc/sglang_scheduler_pb2.py +78 -70
  98. sglang/srt/grpc/sglang_scheduler_pb2.pyi +66 -10
  99. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +89 -1
  100. sglang/srt/layers/activation.py +4 -1
  101. sglang/srt/layers/attention/aiter_backend.py +3 -3
  102. sglang/srt/layers/attention/ascend_backend.py +17 -1
  103. sglang/srt/layers/attention/attention_registry.py +43 -23
  104. sglang/srt/layers/attention/base_attn_backend.py +20 -1
  105. sglang/srt/layers/attention/double_sparsity_backend.py +2 -2
  106. sglang/srt/layers/attention/fla/chunk.py +0 -1
  107. sglang/srt/layers/attention/fla/chunk_o.py +1 -1
  108. sglang/srt/layers/attention/fla/index.py +0 -2
  109. sglang/srt/layers/attention/fla/layernorm_gated.py +50 -32
  110. sglang/srt/layers/attention/fla/utils.py +0 -3
  111. sglang/srt/layers/attention/fla/wy_fast.py +0 -2
  112. sglang/srt/layers/attention/flashattention_backend.py +12 -8
  113. sglang/srt/layers/attention/flashinfer_backend.py +248 -21
  114. sglang/srt/layers/attention/flashinfer_mla_backend.py +20 -18
  115. sglang/srt/layers/attention/flashmla_backend.py +2 -2
  116. sglang/srt/layers/attention/hybrid_attn_backend.py +1 -1
  117. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +165 -62
  118. sglang/srt/layers/attention/intel_amx_backend.py +1 -1
  119. sglang/srt/layers/attention/mamba/causal_conv1d.py +1 -1
  120. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +9 -5
  121. sglang/srt/layers/attention/mamba/mamba.py +189 -241
  122. sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
  123. sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
  124. sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +0 -50
  125. sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +0 -60
  126. sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +0 -111
  127. sglang/srt/layers/attention/mamba/ops/ssd_combined.py +0 -1
  128. sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +0 -11
  129. sglang/srt/layers/attention/npu_ops/mla_preprocess.py +1 -1
  130. sglang/srt/layers/attention/nsa/nsa_indexer.py +40 -83
  131. sglang/srt/layers/attention/nsa/triton_kernel.py +136 -0
  132. sglang/srt/layers/attention/nsa/utils.py +0 -1
  133. sglang/srt/layers/attention/nsa_backend.py +404 -90
  134. sglang/srt/layers/attention/triton_backend.py +208 -34
  135. sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +2 -2
  136. sglang/srt/layers/attention/triton_ops/extend_attention.py +539 -44
  137. sglang/srt/layers/attention/trtllm_mha_backend.py +2 -2
  138. sglang/srt/layers/attention/trtllm_mla_backend.py +361 -30
  139. sglang/srt/layers/attention/utils.py +11 -7
  140. sglang/srt/layers/attention/vision.py +3 -3
  141. sglang/srt/layers/attention/xpu_backend.py +1028 -0
  142. sglang/srt/layers/communicator.py +11 -7
  143. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/compile_utils.py +4 -8
  144. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/configurer.py +4 -3
  145. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/entrypoint.py +3 -3
  146. sglang/srt/layers/dp_attention.py +17 -0
  147. sglang/srt/layers/layernorm.py +45 -15
  148. sglang/srt/layers/linear.py +9 -1
  149. sglang/srt/layers/logits_processor.py +147 -17
  150. sglang/srt/layers/modelopt_utils.py +11 -0
  151. sglang/srt/layers/moe/cutlass_moe.py +0 -2
  152. sglang/srt/layers/moe/cutlass_w4a8_moe.py +213 -21
  153. sglang/srt/layers/moe/ep_moe/kernels.py +35 -457
  154. sglang/srt/layers/moe/ep_moe/layer.py +119 -397
  155. sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +1 -1
  156. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  157. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_B200.json +146 -0
  158. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +11 -3
  159. sglang/srt/layers/moe/fused_moe_triton/layer.py +76 -70
  160. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +18 -42
  161. sglang/srt/layers/moe/moe_runner/deep_gemm.py +304 -0
  162. sglang/srt/layers/moe/moe_runner/runner.py +3 -0
  163. sglang/srt/layers/moe/moe_runner/triton.py +3 -1
  164. sglang/srt/layers/moe/rocm_moe_utils.py +0 -1
  165. sglang/srt/layers/moe/router.py +51 -15
  166. sglang/srt/layers/moe/token_dispatcher/__init__.py +10 -0
  167. sglang/srt/layers/moe/token_dispatcher/base.py +1 -1
  168. sglang/srt/layers/moe/token_dispatcher/deepep.py +110 -97
  169. sglang/srt/layers/moe/token_dispatcher/mooncake.py +386 -0
  170. sglang/srt/layers/moe/token_dispatcher/standard.py +46 -0
  171. sglang/srt/layers/moe/topk.py +3 -2
  172. sglang/srt/layers/moe/utils.py +17 -1
  173. sglang/srt/layers/quantization/__init__.py +2 -53
  174. sglang/srt/layers/quantization/awq.py +183 -6
  175. sglang/srt/layers/quantization/awq_triton.py +29 -0
  176. sglang/srt/layers/quantization/base_config.py +20 -1
  177. sglang/srt/layers/quantization/compressed_tensors/__init__.py +7 -0
  178. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +20 -49
  179. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +421 -70
  180. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +3 -0
  181. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +4 -22
  182. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +339 -0
  183. sglang/srt/layers/quantization/fp8.py +84 -18
  184. sglang/srt/layers/quantization/fp8_kernel.py +55 -10
  185. sglang/srt/layers/quantization/fp8_utils.py +42 -14
  186. sglang/srt/layers/quantization/fpgemm_fp8.py +2 -3
  187. sglang/srt/layers/quantization/gptq.py +0 -1
  188. sglang/srt/layers/quantization/int8_kernel.py +18 -2
  189. sglang/srt/layers/quantization/marlin_utils.py +12 -0
  190. sglang/srt/layers/quantization/modelopt_quant.py +125 -100
  191. sglang/srt/layers/quantization/mxfp4.py +5 -30
  192. sglang/srt/layers/quantization/petit.py +1 -1
  193. sglang/srt/layers/quantization/quark/quark.py +3 -1
  194. sglang/srt/layers/quantization/quark/quark_moe.py +3 -3
  195. sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +0 -7
  196. sglang/srt/layers/quantization/unquant.py +1 -4
  197. sglang/srt/layers/quantization/utils.py +0 -1
  198. sglang/srt/layers/quantization/w4afp8.py +51 -20
  199. sglang/srt/layers/quantization/w8a8_int8.py +30 -24
  200. sglang/srt/layers/radix_attention.py +59 -9
  201. sglang/srt/layers/rotary_embedding.py +673 -16
  202. sglang/srt/layers/sampler.py +36 -16
  203. sglang/srt/layers/sparse_pooler.py +98 -0
  204. sglang/srt/layers/utils.py +0 -1
  205. sglang/srt/layers/vocab_parallel_embedding.py +4 -1
  206. sglang/srt/lora/backend/triton_backend.py +0 -1
  207. sglang/srt/lora/eviction_policy.py +139 -0
  208. sglang/srt/lora/lora_manager.py +24 -9
  209. sglang/srt/lora/lora_registry.py +1 -1
  210. sglang/srt/lora/mem_pool.py +40 -16
  211. sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +1 -1
  212. sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +4 -2
  213. sglang/srt/managers/cache_controller.py +48 -17
  214. sglang/srt/managers/data_parallel_controller.py +146 -42
  215. sglang/srt/managers/detokenizer_manager.py +40 -13
  216. sglang/srt/managers/io_struct.py +66 -16
  217. sglang/srt/managers/mm_utils.py +20 -18
  218. sglang/srt/managers/multi_tokenizer_mixin.py +66 -81
  219. sglang/srt/managers/overlap_utils.py +96 -19
  220. sglang/srt/managers/schedule_batch.py +241 -511
  221. sglang/srt/managers/schedule_policy.py +15 -2
  222. sglang/srt/managers/scheduler.py +399 -499
  223. sglang/srt/managers/scheduler_metrics_mixin.py +55 -8
  224. sglang/srt/managers/scheduler_output_processor_mixin.py +317 -111
  225. sglang/srt/managers/scheduler_pp_mixin.py +341 -0
  226. sglang/srt/managers/scheduler_profiler_mixin.py +57 -10
  227. sglang/srt/managers/scheduler_runtime_checker_mixin.py +217 -0
  228. sglang/srt/managers/scheduler_update_weights_mixin.py +33 -14
  229. sglang/srt/managers/tokenizer_communicator_mixin.py +71 -55
  230. sglang/srt/managers/tokenizer_manager.py +378 -90
  231. sglang/srt/managers/tp_worker.py +212 -161
  232. sglang/srt/managers/utils.py +78 -2
  233. sglang/srt/mem_cache/allocator.py +7 -2
  234. sglang/srt/mem_cache/allocator_ascend.py +2 -2
  235. sglang/srt/mem_cache/base_prefix_cache.py +2 -2
  236. sglang/srt/mem_cache/chunk_cache.py +13 -2
  237. sglang/srt/mem_cache/common.py +480 -0
  238. sglang/srt/mem_cache/evict_policy.py +16 -1
  239. sglang/srt/mem_cache/hicache_storage.py +4 -1
  240. sglang/srt/mem_cache/hiradix_cache.py +16 -3
  241. sglang/srt/mem_cache/mamba_radix_cache.py +993 -0
  242. sglang/srt/mem_cache/memory_pool.py +435 -219
  243. sglang/srt/mem_cache/memory_pool_host.py +0 -1
  244. sglang/srt/mem_cache/multimodal_cache.py +0 -1
  245. sglang/srt/mem_cache/radix_cache.py +53 -19
  246. sglang/srt/mem_cache/radix_cache_cpp.py +19 -14
  247. sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +8 -2
  248. sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +1 -13
  249. sglang/srt/mem_cache/storage/backend_factory.py +2 -2
  250. sglang/srt/mem_cache/storage/eic/eic_storage.py +5 -6
  251. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +0 -1
  252. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +9 -3
  253. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +5 -3
  254. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +101 -17
  255. sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +38 -9
  256. sglang/srt/mem_cache/storage/nixl/nixl_utils.py +1 -1
  257. sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +17 -2
  258. sglang/srt/mem_cache/swa_radix_cache.py +92 -26
  259. sglang/srt/metrics/collector.py +31 -0
  260. sglang/srt/metrics/func_timer.py +1 -1
  261. sglang/srt/model_executor/cuda_graph_runner.py +43 -5
  262. sglang/srt/model_executor/forward_batch_info.py +28 -23
  263. sglang/srt/model_executor/model_runner.py +379 -139
  264. sglang/srt/model_executor/npu_graph_runner.py +2 -3
  265. sglang/srt/model_executor/piecewise_cuda_graph_runner.py +539 -0
  266. sglang/srt/model_loader/__init__.py +1 -1
  267. sglang/srt/model_loader/loader.py +424 -27
  268. sglang/srt/model_loader/utils.py +0 -1
  269. sglang/srt/model_loader/weight_utils.py +47 -28
  270. sglang/srt/models/apertus.py +2 -3
  271. sglang/srt/models/arcee.py +2 -2
  272. sglang/srt/models/bailing_moe.py +13 -52
  273. sglang/srt/models/bailing_moe_nextn.py +3 -4
  274. sglang/srt/models/bert.py +1 -1
  275. sglang/srt/models/deepseek_nextn.py +19 -3
  276. sglang/srt/models/deepseek_ocr.py +1516 -0
  277. sglang/srt/models/deepseek_v2.py +273 -98
  278. sglang/srt/models/dots_ocr.py +0 -2
  279. sglang/srt/models/dots_vlm.py +0 -1
  280. sglang/srt/models/dots_vlm_vit.py +1 -1
  281. sglang/srt/models/falcon_h1.py +13 -19
  282. sglang/srt/models/gemma3_mm.py +16 -0
  283. sglang/srt/models/gemma3n_mm.py +1 -2
  284. sglang/srt/models/glm4_moe.py +14 -37
  285. sglang/srt/models/glm4_moe_nextn.py +2 -2
  286. sglang/srt/models/glm4v.py +2 -1
  287. sglang/srt/models/glm4v_moe.py +5 -5
  288. sglang/srt/models/gpt_oss.py +5 -5
  289. sglang/srt/models/grok.py +10 -23
  290. sglang/srt/models/hunyuan.py +2 -7
  291. sglang/srt/models/interns1.py +0 -1
  292. sglang/srt/models/kimi_vl.py +1 -7
  293. sglang/srt/models/kimi_vl_moonvit.py +3 -1
  294. sglang/srt/models/llama.py +2 -2
  295. sglang/srt/models/llama_eagle3.py +1 -1
  296. sglang/srt/models/longcat_flash.py +5 -22
  297. sglang/srt/models/longcat_flash_nextn.py +3 -14
  298. sglang/srt/models/mimo.py +2 -13
  299. sglang/srt/models/mimo_mtp.py +1 -2
  300. sglang/srt/models/minicpmo.py +7 -5
  301. sglang/srt/models/mixtral.py +1 -4
  302. sglang/srt/models/mllama.py +1 -1
  303. sglang/srt/models/mllama4.py +13 -3
  304. sglang/srt/models/nemotron_h.py +511 -0
  305. sglang/srt/models/olmo2.py +31 -4
  306. sglang/srt/models/opt.py +5 -5
  307. sglang/srt/models/phi.py +1 -1
  308. sglang/srt/models/phi4mm.py +1 -1
  309. sglang/srt/models/phimoe.py +0 -1
  310. sglang/srt/models/pixtral.py +0 -3
  311. sglang/srt/models/points_v15_chat.py +186 -0
  312. sglang/srt/models/qwen.py +0 -1
  313. sglang/srt/models/qwen2_5_vl.py +3 -3
  314. sglang/srt/models/qwen2_audio.py +2 -15
  315. sglang/srt/models/qwen2_moe.py +15 -12
  316. sglang/srt/models/qwen2_vl.py +5 -2
  317. sglang/srt/models/qwen3_moe.py +19 -35
  318. sglang/srt/models/qwen3_next.py +7 -12
  319. sglang/srt/models/qwen3_next_mtp.py +3 -4
  320. sglang/srt/models/qwen3_omni_moe.py +661 -0
  321. sglang/srt/models/qwen3_vl.py +37 -33
  322. sglang/srt/models/qwen3_vl_moe.py +57 -185
  323. sglang/srt/models/roberta.py +55 -3
  324. sglang/srt/models/sarashina2_vision.py +0 -1
  325. sglang/srt/models/step3_vl.py +3 -5
  326. sglang/srt/models/utils.py +11 -1
  327. sglang/srt/multimodal/processors/base_processor.py +6 -2
  328. sglang/srt/multimodal/processors/deepseek_ocr.py +37 -0
  329. sglang/srt/multimodal/processors/deepseek_vl_v2.py +0 -3
  330. sglang/srt/multimodal/processors/dots_vlm.py +0 -1
  331. sglang/srt/multimodal/processors/glm4v.py +1 -5
  332. sglang/srt/multimodal/processors/internvl.py +0 -2
  333. sglang/srt/multimodal/processors/janus_pro.py +0 -1
  334. sglang/srt/multimodal/processors/mllama4.py +0 -8
  335. sglang/srt/multimodal/processors/phi4mm.py +0 -1
  336. sglang/srt/multimodal/processors/points_v15_chat.py +52 -0
  337. sglang/srt/multimodal/processors/qwen_vl.py +75 -16
  338. sglang/srt/multimodal/processors/step3_vl.py +1 -1
  339. sglang/srt/parser/conversation.py +41 -0
  340. sglang/srt/parser/reasoning_parser.py +0 -1
  341. sglang/srt/sampling/custom_logit_processor.py +77 -2
  342. sglang/srt/sampling/sampling_batch_info.py +17 -22
  343. sglang/srt/sampling/sampling_params.py +70 -2
  344. sglang/srt/server_args.py +577 -73
  345. sglang/srt/server_args_config_parser.py +1 -1
  346. sglang/srt/single_batch_overlap.py +38 -28
  347. sglang/srt/speculative/base_spec_worker.py +34 -0
  348. sglang/srt/speculative/draft_utils.py +226 -0
  349. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +24 -7
  350. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +23 -2
  351. sglang/srt/speculative/eagle_info.py +57 -18
  352. sglang/srt/speculative/eagle_info_v2.py +458 -0
  353. sglang/srt/speculative/eagle_utils.py +138 -0
  354. sglang/srt/speculative/eagle_worker.py +83 -280
  355. sglang/srt/speculative/eagle_worker_v2.py +702 -0
  356. sglang/srt/speculative/{ngram_utils.py → ngram_info.py} +14 -9
  357. sglang/srt/speculative/ngram_worker.py +12 -11
  358. sglang/srt/speculative/spec_info.py +2 -0
  359. sglang/srt/speculative/spec_utils.py +38 -3
  360. sglang/srt/speculative/standalone_worker.py +4 -14
  361. sglang/srt/tokenizer/tiktoken_tokenizer.py +2 -2
  362. sglang/srt/two_batch_overlap.py +28 -14
  363. sglang/srt/utils/__init__.py +1 -1
  364. sglang/srt/{bench_utils.py → utils/bench_utils.py} +4 -2
  365. sglang/srt/utils/common.py +192 -47
  366. sglang/srt/utils/hf_transformers_utils.py +40 -17
  367. sglang/srt/{host_shared_memory.py → utils/host_shared_memory.py} +0 -1
  368. sglang/srt/{offloader.py → utils/offloader.py} +4 -4
  369. sglang/srt/utils/profile_merger.py +199 -0
  370. sglang/test/attention/test_flashattn_backend.py +1 -1
  371. sglang/test/attention/test_flashattn_mla_backend.py +0 -1
  372. sglang/test/attention/test_prefix_chunk_info.py +0 -2
  373. sglang/test/attention/test_trtllm_mla_backend.py +221 -53
  374. sglang/test/few_shot_gsm8k_engine.py +2 -4
  375. sglang/test/kit_matched_stop.py +157 -0
  376. sglang/test/longbench_v2/__init__.py +1 -0
  377. sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
  378. sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
  379. sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
  380. sglang/test/run_eval.py +41 -0
  381. sglang/test/runners.py +2 -0
  382. sglang/test/send_one.py +42 -7
  383. sglang/test/simple_eval_common.py +3 -0
  384. sglang/test/simple_eval_gpqa.py +0 -1
  385. sglang/test/simple_eval_humaneval.py +0 -3
  386. sglang/test/simple_eval_longbench_v2.py +344 -0
  387. sglang/test/test_block_fp8.py +1 -2
  388. sglang/test/test_block_fp8_deep_gemm_blackwell.py +0 -1
  389. sglang/test/test_cutlass_moe.py +1 -2
  390. sglang/test/test_cutlass_w4a8_moe.py +10 -20
  391. sglang/test/test_deterministic.py +232 -99
  392. sglang/test/test_deterministic_utils.py +73 -0
  393. sglang/test/test_disaggregation_utils.py +81 -0
  394. sglang/test/test_marlin_moe.py +0 -1
  395. sglang/test/test_utils.py +85 -20
  396. sglang/version.py +1 -1
  397. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.dist-info}/METADATA +45 -33
  398. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.dist-info}/RECORD +404 -345
  399. sglang/srt/layers/attention/mamba/mamba_utils.py +0 -81
  400. sglang/srt/managers/tp_worker_overlap_thread.py +0 -311
  401. sglang/srt/speculative/build_eagle_tree.py +0 -427
  402. sglang/test/test_block_fp8_ep.py +0 -358
  403. /sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/__init__.py +0 -0
  404. /sglang/srt/{aio_rwlock.py → utils/aio_rwlock.py} +0 -0
  405. /sglang/srt/{torch_memory_saver_adapter.py → utils/torch_memory_saver_adapter.py} +0 -0
  406. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.dist-info}/WHEEL +0 -0
  407. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.dist-info}/licenses/LICENSE +0 -0
  408. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.dist-info}/top_level.txt +0 -0
sglang/bench_serving.py CHANGED
@@ -12,7 +12,6 @@ python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-pro
12
12
 
13
13
  import argparse
14
14
  import asyncio
15
- import base64
16
15
  import io
17
16
  import json
18
17
  import os
@@ -32,9 +31,13 @@ from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple, Union
32
31
 
33
32
  import aiohttp
34
33
  import numpy as np
34
+ import pybase64
35
35
  import requests
36
+ from datasets import load_dataset
37
+ from PIL import Image
36
38
  from tqdm.asyncio import tqdm
37
39
  from transformers import (
40
+ AutoProcessor,
38
41
  AutoTokenizer,
39
42
  PreTrainedTokenizer,
40
43
  PreTrainedTokenizerBase,
@@ -209,6 +212,11 @@ async def async_request_openai_completions(
209
212
  **request_func_input.extra_request_body,
210
213
  }
211
214
 
215
+ # hack to accommodate different LoRA conventions between SGLang and vLLM.
216
+ if request_func_input.lora_name:
217
+ payload["model"] = request_func_input.lora_name
218
+ payload["lora_path"] = request_func_input.lora_name
219
+
212
220
  if request_func_input.image_data:
213
221
  payload.update({"image_data": request_func_input.image_data})
214
222
 
@@ -322,10 +330,17 @@ async def async_request_openai_chat_completions(
322
330
  "model": request_func_input.model,
323
331
  "messages": messages,
324
332
  "temperature": 0.0,
325
- "max_tokens": request_func_input.output_len,
333
+ "max_completion_tokens": request_func_input.output_len,
326
334
  "stream": not args.disable_stream,
335
+ "ignore_eos": not args.disable_ignore_eos,
327
336
  **request_func_input.extra_request_body,
328
337
  }
338
+
339
+ # hack to accommodate different LoRA conventions between SGLang and vLLM.
340
+ if request_func_input.lora_name:
341
+ payload["model"] = request_func_input.lora_name
342
+ payload["lora_path"] = request_func_input.lora_name
343
+
329
344
  headers = get_auth_headers()
330
345
 
331
346
  output = RequestFuncOutput.init_new(request_func_input)
@@ -610,6 +625,48 @@ async def async_request_profile(api_url: str) -> RequestFuncOutput:
610
625
  return output
611
626
 
612
627
 
628
+ def _build_profile_urls(
629
+ profile_prefill_url: Optional[List[str]],
630
+ profile_decode_url: Optional[List[str]],
631
+ ) -> List[Tuple[str, str]]:
632
+ """Build profile URLs list from prefill/decode URL arguments.
633
+
634
+ Returns:
635
+ List of (worker_type, url) tuples. e.g., [("Prefill-0", "http://..."), ("Decode-0", "http://...")]
636
+ """
637
+ profile_urls = []
638
+ if profile_prefill_url:
639
+ for idx, url in enumerate(profile_prefill_url):
640
+ profile_urls.append((f"Prefill-{idx}", url))
641
+ if profile_decode_url:
642
+ for idx, url in enumerate(profile_decode_url):
643
+ profile_urls.append((f"Decode-{idx}", url))
644
+ return profile_urls
645
+
646
+
647
+ async def _call_profile_pd(profile_urls: List[Tuple[str, str]], mode: str) -> None:
648
+ """Call profile endpoint (start/stop) on PD separated workers.
649
+
650
+ Args:
651
+ profile_urls: List of (worker_type, url) tuples
652
+ mode: "start" or "stop"
653
+ """
654
+ endpoint = "/start_profile" if mode == "start" else "/stop_profile"
655
+ action = "Starting" if mode == "start" else "Stopping"
656
+ action_past = "started" if mode == "start" else "stopped"
657
+
658
+ print(f"{action} profiler...")
659
+
660
+ for worker_type, url in profile_urls:
661
+ profile_output = await async_request_profile(api_url=url + endpoint)
662
+ if profile_output.success:
663
+ print(f"Profiler {action_past} for {worker_type} worker at {url}")
664
+ else:
665
+ print(
666
+ f"Failed to {mode} profiler for {worker_type} worker at {url}: {profile_output.error}"
667
+ )
668
+
669
+
613
670
  def get_model(pretrained_model_name_or_path: str) -> str:
614
671
  if os.getenv("SGLANG_USE_MODELSCOPE", "false").lower() == "true":
615
672
  import huggingface_hub.constants
@@ -648,7 +705,30 @@ def get_tokenizer(
648
705
  )
649
706
 
650
707
 
651
- def get_dataset(args, tokenizer):
708
+ def get_processor(
709
+ pretrained_model_name_or_path: str,
710
+ ) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
711
+ assert (
712
+ pretrained_model_name_or_path is not None
713
+ and pretrained_model_name_or_path != ""
714
+ )
715
+ if pretrained_model_name_or_path.endswith(
716
+ ".json"
717
+ ) or pretrained_model_name_or_path.endswith(".model"):
718
+ from sglang.srt.utils.hf_transformers_utils import get_processor
719
+
720
+ return get_processor(pretrained_model_name_or_path)
721
+
722
+ if pretrained_model_name_or_path is not None and not os.path.exists(
723
+ pretrained_model_name_or_path
724
+ ):
725
+ pretrained_model_name_or_path = get_model(pretrained_model_name_or_path)
726
+ return AutoProcessor.from_pretrained(
727
+ pretrained_model_name_or_path, trust_remote_code=True
728
+ )
729
+
730
+
731
+ def get_dataset(args, tokenizer, model_id=None):
652
732
  tokenize_prompt = getattr(args, "tokenize_prompt", False)
653
733
  if args.dataset_name == "sharegpt":
654
734
  assert not tokenize_prompt
@@ -661,7 +741,7 @@ def get_dataset(args, tokenizer):
661
741
  prompt_suffix=args.prompt_suffix,
662
742
  apply_chat_template=args.apply_chat_template,
663
743
  )
664
- elif args.dataset_name.startswith("random") and args.dataset_name != "random-image":
744
+ elif args.dataset_name.startswith("random"):
665
745
  input_requests = sample_random_requests(
666
746
  input_len=args.random_input_len,
667
747
  output_len=args.random_output_len,
@@ -672,17 +752,18 @@ def get_dataset(args, tokenizer):
672
752
  random_sample=args.dataset_name == "random",
673
753
  return_text=not tokenize_prompt,
674
754
  )
675
- elif args.dataset_name == "random-image":
676
- assert not tokenize_prompt, "random-image does not support --tokenize-prompt"
677
- input_requests = sample_random_image_requests(
755
+ elif args.dataset_name == "image":
756
+ processor = get_processor(model_id)
757
+ input_requests = sample_image_requests(
678
758
  num_requests=args.num_prompts,
679
- num_images=args.random_image_num_images,
759
+ image_count=args.image_count,
680
760
  input_len=args.random_input_len,
681
761
  output_len=args.random_output_len,
682
762
  range_ratio=args.random_range_ratio,
683
- tokenizer=tokenizer,
684
- apply_chat_template=args.apply_chat_template,
685
- image_resolution=args.random_image_resolution,
763
+ processor=processor,
764
+ image_content=args.image_content,
765
+ image_format=args.image_format,
766
+ image_resolution=args.image_resolution,
686
767
  )
687
768
  elif args.dataset_name == "generated-shared-prefix":
688
769
  assert not tokenize_prompt
@@ -696,12 +777,11 @@ def get_dataset(args, tokenizer):
696
777
  args=args,
697
778
  )
698
779
  elif args.dataset_name == "mmmu":
699
- assert not tokenize_prompt
780
+ processor = get_processor(model_id)
700
781
  input_requests = sample_mmmu_requests(
701
782
  num_requests=args.num_prompts,
702
- tokenizer=tokenizer,
783
+ processor=processor,
703
784
  fixed_output_len=args.random_output_len,
704
- apply_chat_template=args.apply_chat_template,
705
785
  random_sample=True,
706
786
  )
707
787
  elif args.dataset_name == "mooncake":
@@ -746,6 +826,8 @@ ASYNC_REQUEST_FUNCS = {
746
826
  class BenchmarkMetrics:
747
827
  completed: int
748
828
  total_input: int
829
+ total_input_text: int
830
+ total_input_vision: int
749
831
  total_output: int
750
832
  total_output_retokenized: int
751
833
  request_throughput: float
@@ -839,9 +921,17 @@ class DatasetRow:
839
921
  prompt: str
840
922
  prompt_len: int
841
923
  output_len: int
924
+ text_prompt_len: Optional[int] = None
925
+ vision_prompt_len: Optional[int] = None
842
926
  image_data: Optional[List[str]] = None
843
927
  timestamp: Optional[float] = None
844
928
 
929
+ def __post_init__(self):
930
+ if self.text_prompt_len is None:
931
+ self.text_prompt_len = self.prompt_len
932
+ if self.vision_prompt_len is None:
933
+ self.vision_prompt_len = 0
934
+
845
935
 
846
936
  async def get_mooncake_request_over_time(
847
937
  input_requests: List[Dict],
@@ -889,7 +979,7 @@ async def get_mooncake_request_over_time(
889
979
  for i in range(num_rounds):
890
980
  # Add user query for the current round
891
981
  chat_history.append(
892
- {"role": "user", "content": f"Round {i+1}: {user_query_base}"}
982
+ {"role": "user", "content": f"Round {i + 1}: {user_query_base}"}
893
983
  )
894
984
 
895
985
  # Form the full prompt from history
@@ -918,9 +1008,8 @@ async def get_mooncake_request_over_time(
918
1008
 
919
1009
  def sample_mmmu_requests(
920
1010
  num_requests: int,
921
- tokenizer: PreTrainedTokenizerBase,
1011
+ processor: AutoProcessor | AutoTokenizer,
922
1012
  fixed_output_len: Optional[int] = None,
923
- apply_chat_template: bool = True,
924
1013
  random_sample: bool = True,
925
1014
  ) -> List[DatasetRow]:
926
1015
  """
@@ -928,22 +1017,12 @@ def sample_mmmu_requests(
928
1017
 
929
1018
  Args:
930
1019
  num_requests: Number of requests to sample.
931
- tokenizer: Tokenizer to use for token counting.
932
1020
  fixed_output_len: If provided, use this fixed output length for all requests.
933
- apply_chat_template: Whether to apply the chat template to the prompt.
934
1021
  random_sample: Whether to randomly sample or take the first N.
935
1022
 
936
1023
  Returns:
937
1024
  List of tuples (prompt, prompt_token_len, output_token_len).
938
1025
  """
939
- try:
940
- import io
941
-
942
- import pybase64
943
- from datasets import load_dataset
944
- except ImportError:
945
- raise ImportError("Please install datasets: pip install datasets")
946
-
947
1026
  print("Loading MMMU dataset from HuggingFace...")
948
1027
 
949
1028
  try:
@@ -999,54 +1078,12 @@ def sample_mmmu_requests(
999
1078
  question = example.get("question")
1000
1079
 
1001
1080
  # Construct the prompt
1002
- prompt = f"Question: {question}\n\nAnswer: "
1003
- if apply_chat_template:
1004
- try:
1005
- is_phi4_multimodal = (
1006
- "phi-4-multimodal" in tokenizer.name_or_path.lower()
1007
- )
1008
- if is_phi4_multimodal:
1009
- # <|endoftext10|> is the image token used in the phi-4-multimodal model.
1010
- content = prompt.replace("image 1", "<|endoftext10|>")
1011
- else:
1012
- content = [
1013
- {
1014
- "type": "image_url",
1015
- "image_url": {"url": image_data},
1016
- },
1017
- {"type": "text", "text": prompt},
1018
- ]
1019
- prompt = tokenizer.apply_chat_template(
1020
- [
1021
- {
1022
- "role": "user",
1023
- "content": content,
1024
- }
1025
- ],
1026
- add_generation_prompt=True,
1027
- tokenize=False,
1028
- )
1029
- except Exception as e:
1030
- # Note (Xinyuan): This is a workaround for an issue where some tokenizers do not support content as a list. (e.g. InternVL)
1031
- print(
1032
- f"Error applying chat template: {e}, fallback to <image> tag"
1033
- )
1034
- prompt = f"<image>{prompt}"
1035
-
1036
- # Calculate token lengths for text only (without image data)
1037
- prompt_token_ids = tokenizer.encode(prompt)
1038
- prompt_len = len(prompt_token_ids)
1039
-
1081
+ text_prompt = f"Question: {question}\n\nAnswer: "
1040
1082
  output_len = fixed_output_len if fixed_output_len is not None else 256
1041
-
1042
- filtered_dataset.append(
1043
- DatasetRow(
1044
- prompt=prompt,
1045
- prompt_len=prompt_len,
1046
- output_len=output_len,
1047
- image_data=[image_data],
1048
- )
1083
+ data_row = create_mm_data_row(
1084
+ text_prompt, [image], [image_data], output_len, processor
1049
1085
  )
1086
+ filtered_dataset.append(data_row)
1050
1087
 
1051
1088
  except Exception as e:
1052
1089
  print(f"Error processing example {i}: {e}")
@@ -1134,7 +1171,11 @@ def sample_sharegpt_requests(
1134
1171
  continue
1135
1172
 
1136
1173
  filtered_dataset.append(
1137
- DatasetRow(prompt=prompt, prompt_len=prompt_len, output_len=output_len)
1174
+ DatasetRow(
1175
+ prompt=prompt,
1176
+ prompt_len=prompt_len,
1177
+ output_len=output_len,
1178
+ )
1138
1179
  )
1139
1180
 
1140
1181
  print(f"#Input tokens: {np.sum([x.prompt_len for x in filtered_dataset])}")
@@ -1245,7 +1286,7 @@ def sample_random_requests(
1245
1286
  return input_requests
1246
1287
 
1247
1288
 
1248
- def parse_random_image_resolution(image_resolution: str) -> Tuple[int, int]:
1289
+ def parse_image_resolution(image_resolution: str) -> Tuple[int, int]:
1249
1290
  """Parse image resolution into (width, height).
1250
1291
 
1251
1292
  Supports presets '1080p', '720p', '360p' and custom 'heightxwidth' format
@@ -1270,44 +1311,94 @@ def parse_random_image_resolution(image_resolution: str) -> Tuple[int, int]:
1270
1311
  return (width, height)
1271
1312
 
1272
1313
  raise ValueError(
1273
- f"Unsupported random-image resolution: {image_resolution}. "
1314
+ f"Unsupported image resolution: {image_resolution}. "
1274
1315
  "Choose from 4k, 1080p, 720p, 360p, or provide custom 'heightxwidth' (e.g., 1080x1920)."
1275
1316
  )
1276
1317
 
1277
1318
 
1278
- def sample_random_image_requests(
1319
+ def create_mm_data_row(text_prompt, images: list, images_base64, output_len, processor):
1320
+ try:
1321
+ content_items = [
1322
+ {"type": "image", "image": {"url": image_base64}}
1323
+ for image_base64 in images_base64
1324
+ ]
1325
+ content_items.append({"type": "text", "text": text_prompt})
1326
+ prompt_str = processor.apply_chat_template(
1327
+ [{"role": "user", "content": content_items}],
1328
+ add_generation_prompt=True,
1329
+ tokenize=False,
1330
+ )
1331
+ except Exception as e:
1332
+ # Note (Xinyuan): This is a workaround for an issue where some tokenizers do not support content as a list. (e.g. InternVL)
1333
+ print(f"Error applying chat template: {e}, fallback to <image> tag")
1334
+ # Some tokenizers do not support list content; fall back to a placeholder in the text
1335
+ prompt_str = f"<image>{text_prompt}"
1336
+
1337
+ # Calculate total tokens (text + vision)
1338
+ prompt_len = processor(
1339
+ text=[prompt_str],
1340
+ images=images,
1341
+ padding=False,
1342
+ return_tensors="pt",
1343
+ )["input_ids"].numel()
1344
+
1345
+ # Calculate text-only tokens
1346
+ try:
1347
+ # Create text-only version of the prompt
1348
+ text_only_prompt = processor.apply_chat_template(
1349
+ [{"role": "user", "content": text_prompt}],
1350
+ add_generation_prompt=True,
1351
+ tokenize=False,
1352
+ )
1353
+ text_prompt_len = processor(
1354
+ text=[text_only_prompt],
1355
+ padding=False,
1356
+ return_tensors="pt",
1357
+ )["input_ids"].numel()
1358
+ except Exception:
1359
+ # Fallback: just tokenize the text prompt directly
1360
+ text_prompt_len = len(processor.tokenizer.encode(text_prompt))
1361
+
1362
+ # Vision tokens = total tokens - text tokens
1363
+ vision_prompt_len = prompt_len - text_prompt_len
1364
+
1365
+ return DatasetRow(
1366
+ prompt=text_prompt,
1367
+ prompt_len=prompt_len,
1368
+ output_len=output_len,
1369
+ text_prompt_len=text_prompt_len,
1370
+ vision_prompt_len=vision_prompt_len,
1371
+ image_data=images_base64,
1372
+ )
1373
+
1374
+
1375
+ def sample_image_requests(
1279
1376
  num_requests: int,
1280
- num_images: int,
1377
+ image_count: int,
1281
1378
  input_len: int,
1282
1379
  output_len: int,
1283
1380
  range_ratio: float,
1284
- tokenizer: PreTrainedTokenizerBase,
1285
- apply_chat_template: bool = True,
1286
- image_resolution: str = "1080p",
1381
+ processor: AutoProcessor,
1382
+ image_content: str,
1383
+ image_format: str,
1384
+ image_resolution: str,
1287
1385
  ) -> List[DatasetRow]:
1288
- """Generate requests with random images.
1386
+ """Generate requests with images.
1289
1387
 
1290
- - Each request includes ``num_images`` random images.
1388
+ - Each request includes ``image_count`` images.
1291
1389
  - Supported resolutions: 4k (3840x2160), 1080p (1920x1080), 720p (1280x720), 360p (640x360),
1292
1390
  or custom 'heightxwidth' (e.g., 1080x1920).
1293
1391
  - Text lengths follow the 'random' dataset sampling rule. ``prompt_len``
1294
1392
  only counts text tokens and excludes image data.
1295
1393
  """
1296
- try:
1297
- import pybase64
1298
- from PIL import Image
1299
- except ImportError as e:
1300
- raise ImportError(
1301
- "Please install Pillow to generate random images: pip install pillow"
1302
- ) from e
1303
1394
 
1304
1395
  # Parse resolution (supports presets and 'heightxwidth')
1305
- width, height = parse_random_image_resolution(image_resolution)
1396
+ width, height = parse_image_resolution(image_resolution)
1306
1397
 
1307
1398
  # Check for potentially problematic combinations and warn user
1308
- if width * height >= 1920 * 1080 and num_images * num_requests >= 100:
1399
+ if width * height >= 1920 * 1080 and image_count * num_requests >= 100:
1309
1400
  warnings.warn(
1310
- f"High resolution ({width}x{height}) with {num_images * num_requests} total images "
1401
+ f"High resolution ({width}x{height}) with {image_count * num_requests} total images "
1311
1402
  f"may take a long time. Consider reducing resolution or image count.",
1312
1403
  UserWarning,
1313
1404
  stacklevel=2,
@@ -1321,53 +1412,50 @@ def sample_random_image_requests(
1321
1412
  int(output_len * range_ratio), output_len + 1, size=num_requests
1322
1413
  )
1323
1414
 
1324
- def _gen_random_image_data_uri(width: int = width, height: int = height) -> str:
1325
- arr = (np.random.rand(height, width, 3) * 255).astype(np.uint8)
1326
- img = Image.fromarray(arr, mode="RGB")
1415
+ def _gen_random_image_data_uri(
1416
+ width: int = width, height: int = height
1417
+ ) -> (Image, str, int):
1418
+ if image_content == "blank":
1419
+ # Generate blank white image
1420
+ arr = np.full((height, width, 3), 255, dtype=np.uint8)
1421
+ else:
1422
+ # Generate random colored image
1423
+ arr = (np.random.rand(height, width, 3) * 255).astype(np.uint8)
1424
+ img = Image.fromarray(arr)
1327
1425
  buf = io.BytesIO()
1328
- img.save(buf, format="JPEG", quality=85)
1426
+ img.save(buf, format=image_format, quality=85)
1329
1427
  encoded = pybase64.b64encode(buf.getvalue()).decode("utf-8")
1330
- return f"data:image/jpeg;base64,{encoded}"
1428
+ image_data = f"data:image/{image_format};base64,{encoded}"
1429
+ image_bytes = len(image_data.encode("utf-8"))
1430
+ return img, image_data, image_bytes
1331
1431
 
1332
1432
  dataset: List[DatasetRow] = []
1433
+ total_image_bytes = 0
1333
1434
  for i in range(num_requests):
1334
1435
  # Generate text prompt
1335
- text_prompt = gen_prompt(tokenizer, int(input_lens[i]))
1436
+ text_prompt = gen_prompt(processor.tokenizer, int(input_lens[i]))
1336
1437
 
1337
1438
  # Generate image list
1338
- images = [_gen_random_image_data_uri() for _ in range(num_images)]
1339
-
1340
- prompt_str = text_prompt
1341
- if apply_chat_template:
1342
- try:
1343
- content_items = [
1344
- {"type": "image_url", "image_url": {"url": img_url}}
1345
- for img_url in images
1346
- ]
1347
- content_items.append({"type": "text", "text": text_prompt})
1348
- prompt_str = tokenizer.apply_chat_template(
1349
- [{"role": "user", "content": content_items}],
1350
- add_generation_prompt=True,
1351
- tokenize=False,
1352
- )
1353
- except Exception:
1354
- # Some tokenizers do not support list content; fall back to a placeholder in the text
1355
- prompt_str = f"<image>{text_prompt}"
1356
-
1357
- prompt_token_ids = tokenizer.encode(prompt_str)
1358
- prompt_token_len = len(prompt_token_ids)
1359
-
1360
- dataset.append(
1361
- DatasetRow(
1362
- prompt=prompt_str,
1363
- prompt_len=prompt_token_len,
1364
- output_len=int(output_lens[i]),
1365
- image_data=images,
1366
- )
1439
+ images, images_base64, images_bytes = zip(
1440
+ *[_gen_random_image_data_uri() for _ in range(image_count)]
1441
+ )
1442
+ total_image_bytes += sum(list(images_bytes))
1443
+
1444
+ data_row = create_mm_data_row(
1445
+ text_prompt,
1446
+ list(images),
1447
+ list(images_base64),
1448
+ int(output_lens[i]),
1449
+ processor,
1367
1450
  )
1368
1451
 
1452
+ dataset.append(data_row)
1453
+
1369
1454
  print(f"#Input tokens: {np.sum([x.prompt_len for x in dataset])}")
1370
1455
  print(f"#Output tokens: {np.sum([x.output_len for x in dataset])}")
1456
+ print(
1457
+ f"\nCreated {len(dataset)} {image_content} {image_format} images with average {total_image_bytes // num_requests} bytes per request"
1458
+ )
1371
1459
  return dataset
1372
1460
 
1373
1461
 
@@ -1439,7 +1527,9 @@ def sample_generated_shared_prefix_requests(
1439
1527
 
1440
1528
  input_requests.append(
1441
1529
  DatasetRow(
1442
- prompt=full_prompt, prompt_len=prompt_len, output_len=output_len
1530
+ prompt=full_prompt,
1531
+ prompt_len=prompt_len,
1532
+ output_len=output_len,
1443
1533
  )
1444
1534
  )
1445
1535
  total_input_tokens += prompt_len
@@ -1521,6 +1611,8 @@ def calculate_metrics(
1521
1611
  output_lens: List[int] = []
1522
1612
  retokenized_output_lens: List[int] = []
1523
1613
  total_input = 0
1614
+ total_input_text = 0
1615
+ total_input_vision = 0
1524
1616
  completed = 0
1525
1617
  itls: List[float] = []
1526
1618
  tpots: List[float] = []
@@ -1534,7 +1626,9 @@ def calculate_metrics(
1534
1626
  tokenizer.encode(outputs[i].generated_text, add_special_tokens=False)
1535
1627
  )
1536
1628
  retokenized_output_lens.append(retokenized_output_len)
1537
- total_input += outputs[i].prompt_len
1629
+ total_input += input_requests[i].prompt_len
1630
+ total_input_text += input_requests[i].text_prompt_len
1631
+ total_input_vision += input_requests[i].vision_prompt_len
1538
1632
  if output_len > 1:
1539
1633
  tpots.append((outputs[i].latency - outputs[i].ttft) / (output_len - 1))
1540
1634
  itls += outputs[i].itl
@@ -1556,6 +1650,8 @@ def calculate_metrics(
1556
1650
  metrics = BenchmarkMetrics(
1557
1651
  completed=completed,
1558
1652
  total_input=total_input,
1653
+ total_input_text=total_input_text,
1654
+ total_input_vision=total_input_vision,
1559
1655
  total_output=sum(output_lens),
1560
1656
  total_output_retokenized=sum(retokenized_output_lens),
1561
1657
  request_throughput=completed / dur_s,
@@ -1609,6 +1705,8 @@ async def benchmark(
1609
1705
  use_trace_timestamps: bool = False,
1610
1706
  mooncake_slowdown_factor=1.0,
1611
1707
  mooncake_num_rounds=1,
1708
+ profile_prefill_url: Optional[List[str]] = None,
1709
+ profile_decode_url: Optional[List[str]] = None,
1612
1710
  ):
1613
1711
  if backend in ASYNC_REQUEST_FUNCS:
1614
1712
  request_func = ASYNC_REQUEST_FUNCS[backend]
@@ -1698,14 +1796,28 @@ async def benchmark(
1698
1796
 
1699
1797
  time.sleep(1.0)
1700
1798
 
1799
+ # Build profile URLs for PD separated mode (do this once at the beginning)
1800
+ pd_profile_urls = []
1801
+ if profile and pd_separated:
1802
+ pd_profile_urls = _build_profile_urls(profile_prefill_url, profile_decode_url)
1803
+ if not pd_profile_urls:
1804
+ print(
1805
+ "Warning: PD separated mode requires --profile-prefill-url or --profile-decode-url"
1806
+ )
1807
+ print("Skipping profiler start. Please specify worker URLs for profiling.")
1808
+
1701
1809
  # Start profiler
1702
1810
  if profile:
1703
- print("Starting profiler...")
1704
- profile_output = await async_request_profile(
1705
- api_url=base_url + "/start_profile"
1706
- )
1707
- if profile_output.success:
1708
- print("Profiler started")
1811
+ if pd_separated:
1812
+ if pd_profile_urls:
1813
+ await _call_profile_pd(pd_profile_urls, "start")
1814
+ else:
1815
+ print("Starting profiler...")
1816
+ profile_output = await async_request_profile(
1817
+ api_url=base_url + "/start_profile"
1818
+ )
1819
+ if profile_output.success:
1820
+ print("Profiler started")
1709
1821
 
1710
1822
  # Run all requests
1711
1823
  benchmark_start_time = time.perf_counter()
@@ -1754,10 +1866,16 @@ async def benchmark(
1754
1866
 
1755
1867
  # Stop profiler
1756
1868
  if profile:
1757
- print("Stopping profiler...")
1758
- profile_output = await async_request_profile(api_url=base_url + "/stop_profile")
1759
- if profile_output.success:
1760
- print("Profiler stopped")
1869
+ if pd_separated:
1870
+ if pd_profile_urls:
1871
+ await _call_profile_pd(pd_profile_urls, "stop")
1872
+ else:
1873
+ print("Stopping profiler...")
1874
+ profile_output = await async_request_profile(
1875
+ api_url=base_url + "/stop_profile"
1876
+ )
1877
+ if profile_output.success:
1878
+ print("Profiler stopped")
1761
1879
 
1762
1880
  if pbar is not None:
1763
1881
  pbar.close()
@@ -1770,9 +1888,15 @@ async def benchmark(
1770
1888
  server_info_json = server_info.json()
1771
1889
  if "decode" in server_info_json:
1772
1890
  server_info_json = server_info_json["decode"][0]
1773
- accept_length = server_info_json["internal_states"][0].get(
1774
- "avg_spec_accept_length", None
1775
- )
1891
+ if (
1892
+ "internal_states" in server_info_json
1893
+ and server_info_json["internal_states"]
1894
+ ):
1895
+ accept_length = server_info_json["internal_states"][0].get(
1896
+ "avg_spec_accept_length", None
1897
+ )
1898
+ else:
1899
+ accept_length = None
1776
1900
  else:
1777
1901
  accept_length = None
1778
1902
  else:
@@ -1804,6 +1928,10 @@ async def benchmark(
1804
1928
  print("{:<40} {:<10}".format("Successful requests:", metrics.completed))
1805
1929
  print("{:<40} {:<10.2f}".format("Benchmark duration (s):", benchmark_duration))
1806
1930
  print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input))
1931
+ print("{:<40} {:<10}".format("Total input text tokens:", metrics.total_input_text))
1932
+ print(
1933
+ "{:<40} {:<10}".format("Total input vision tokens:", metrics.total_input_vision)
1934
+ )
1807
1935
  print("{:<40} {:<10}".format("Total generated tokens:", metrics.total_output))
1808
1936
  print(
1809
1937
  "{:<40} {:<10}".format(
@@ -1873,6 +2001,8 @@ async def benchmark(
1873
2001
  "duration": benchmark_duration,
1874
2002
  "completed": metrics.completed,
1875
2003
  "total_input_tokens": metrics.total_input,
2004
+ "total_input_text_tokens": metrics.total_input_text,
2005
+ "total_input_vision_tokens": metrics.total_input_vision,
1876
2006
  "total_output_tokens": metrics.total_output,
1877
2007
  "total_output_tokens_retokenized": metrics.total_output_retokenized,
1878
2008
  "request_throughput": metrics.request_throughput,
@@ -1907,11 +2037,11 @@ async def benchmark(
1907
2037
  output_file_name = args.output_file
1908
2038
  else:
1909
2039
  now = datetime.now().strftime("%m%d")
1910
- if args.dataset_name == "random-image":
2040
+ if args.dataset_name == "image":
1911
2041
  output_file_name = (
1912
2042
  f"{args.backend}_{now}_{args.num_prompts}_{args.random_input_len}_"
1913
- f"{args.random_output_len}_{args.random_image_num_images}imgs_"
1914
- f"{args.random_image_resolution}.jsonl"
2043
+ f"{args.random_output_len}_{args.image_count}imgs_"
2044
+ f"{args.image_resolution}.jsonl"
1915
2045
  )
1916
2046
  elif args.dataset_name.startswith("random"):
1917
2047
  output_file_name = f"{args.backend}_{now}_{args.num_prompts}_{args.random_input_len}_{args.random_output_len}.jsonl"
@@ -2087,6 +2217,12 @@ def run_benchmark(args_: argparse.Namespace):
2087
2217
  "Because when the tokenizer counts the output tokens, if there is gibberish, it might count incorrectly.\n"
2088
2218
  )
2089
2219
 
2220
+ if args.dataset_name in ["image", "mmmu"]:
2221
+ args.apply_chat_template = True
2222
+ assert (
2223
+ not args.tokenize_prompt
2224
+ ), "`--tokenize-prompt` not compatible with image dataset"
2225
+
2090
2226
  print(f"{args}\n")
2091
2227
 
2092
2228
  # Read dataset
@@ -2094,7 +2230,7 @@ def run_benchmark(args_: argparse.Namespace):
2094
2230
  model_id = args.model
2095
2231
  tokenizer_id = args.tokenizer if args.tokenizer is not None else args.model
2096
2232
  tokenizer = get_tokenizer(tokenizer_id)
2097
- input_requests = get_dataset(args, tokenizer)
2233
+ input_requests = get_dataset(args, tokenizer, model_id)
2098
2234
 
2099
2235
  # compatible with SimpleNamespace
2100
2236
  if not hasattr(args, "flush_cache"):
@@ -2120,6 +2256,8 @@ def run_benchmark(args_: argparse.Namespace):
2120
2256
  use_trace_timestamps=args.use_trace_timestamps,
2121
2257
  mooncake_slowdown_factor=args.mooncake_slowdown_factor,
2122
2258
  mooncake_num_rounds=args.mooncake_num_rounds,
2259
+ profile_prefill_url=getattr(args, "profile_prefill_url", None),
2260
+ profile_decode_url=getattr(args, "profile_decode_url", None),
2123
2261
  )
2124
2262
  )
2125
2263
 
@@ -2175,7 +2313,7 @@ if __name__ == "__main__":
2175
2313
  "random-ids",
2176
2314
  "generated-shared-prefix",
2177
2315
  "mmmu",
2178
- "random-image",
2316
+ "image",
2179
2317
  "mooncake",
2180
2318
  ],
2181
2319
  help="Name of the dataset to benchmark on.",
@@ -2215,37 +2353,49 @@ if __name__ == "__main__":
2215
2353
  "--random-input-len",
2216
2354
  type=int,
2217
2355
  default=1024,
2218
- help="Number of input tokens per request, used only for random dataset.",
2356
+ help="Number of input tokens per request, used only for random and image dataset.",
2219
2357
  )
2220
2358
  parser.add_argument(
2221
2359
  "--random-output-len",
2222
2360
  default=1024,
2223
2361
  type=int,
2224
- help="Number of output tokens per request, used only for random dataset.",
2362
+ help="Number of output tokens per request, used only for random and image dataset.",
2225
2363
  )
2226
2364
  parser.add_argument(
2227
2365
  "--random-range-ratio",
2228
2366
  type=float,
2229
2367
  default=0.0,
2230
2368
  help="Range of sampled ratio of input/output length, "
2231
- "used only for random dataset.",
2369
+ "used only for random and image dataset.",
2232
2370
  )
2233
- # random-image dataset args
2371
+ # image dataset args
2234
2372
  parser.add_argument(
2235
- "--random-image-num-images",
2373
+ "--image-count",
2236
2374
  type=int,
2237
2375
  default=1,
2238
- help="Number of images per request (only available with the random-image dataset)",
2376
+ help="Number of images per request (only available with the image dataset)",
2239
2377
  )
2240
2378
  parser.add_argument(
2241
- "--random-image-resolution",
2379
+ "--image-resolution",
2242
2380
  type=str,
2243
2381
  default="1080p",
2244
2382
  help=(
2245
- "Resolution of random images for random-image dataset. "
2383
+ "Resolution of images for image dataset. "
2246
2384
  "Supports presets 4k/1080p/720p/360p or custom 'heightxwidth' (e.g., 1080x1920)."
2247
2385
  ),
2248
2386
  )
2387
+ parser.add_argument(
2388
+ "--image-format",
2389
+ type=str,
2390
+ default="jpeg",
2391
+ help=("Format of images for image dataset. " "Supports jpeg and png."),
2392
+ )
2393
+ parser.add_argument(
2394
+ "--image-content",
2395
+ type=str,
2396
+ default="random",
2397
+ help=("Content for images for image dataset. " "Supports random and blank."),
2398
+ )
2249
2399
  parser.add_argument(
2250
2400
  "--request-rate",
2251
2401
  type=float,
@@ -2333,6 +2483,30 @@ if __name__ == "__main__":
2333
2483
  action="store_true",
2334
2484
  help="Benchmark PD disaggregation server",
2335
2485
  )
2486
+
2487
+ # Create a mutually exclusive group for profiling URLs
2488
+ # In PD separated mode, prefill and decode workers must be profiled separately
2489
+ profile_url_group = parser.add_mutually_exclusive_group()
2490
+ profile_url_group.add_argument(
2491
+ "--profile-prefill-url",
2492
+ type=str,
2493
+ nargs="*",
2494
+ default=None,
2495
+ help="URL(s) of the prefill worker(s) for profiling in PD separated mode. "
2496
+ "Can specify multiple URLs: --profile-prefill-url http://localhost:30000 http://localhost:30001. "
2497
+ "NOTE: Cannot be used together with --profile-decode-url. "
2498
+ "In PD separated mode, prefill and decode workers must be profiled separately.",
2499
+ )
2500
+ profile_url_group.add_argument(
2501
+ "--profile-decode-url",
2502
+ type=str,
2503
+ nargs="*",
2504
+ default=None,
2505
+ help="URL(s) of the decode worker(s) for profiling in PD separated mode. "
2506
+ "Can specify multiple URLs: --profile-decode-url http://localhost:30010 http://localhost:30011. "
2507
+ "NOTE: Cannot be used together with --profile-prefill-url. "
2508
+ "In PD separated mode, prefill and decode workers must be profiled separately.",
2509
+ )
2336
2510
  parser.add_argument(
2337
2511
  "--flush-cache",
2338
2512
  action="store_true",