sglang 0.5.3rc0__py3-none-any.whl → 0.5.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (482)
  1. sglang/bench_one_batch.py +54 -37
  2. sglang/bench_one_batch_server.py +340 -34
  3. sglang/bench_serving.py +340 -159
  4. sglang/check_env.py +1 -1
  5. sglang/compile_deep_gemm.py +6 -2
  6. sglang/global_config.py +1 -25
  7. sglang/lang/api.py +6 -0
  8. sglang/lang/backend/runtime_endpoint.py +1 -1
  9. sglang/lang/interpreter.py +1 -0
  10. sglang/lang/ir.py +13 -0
  11. sglang/launch_server.py +9 -2
  12. sglang/profiler.py +20 -3
  13. sglang/srt/_custom_ops.py +1 -1
  14. sglang/srt/batch_invariant_ops/__init__.py +27 -0
  15. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +547 -0
  16. sglang/srt/checkpoint_engine/checkpoint_engine_worker.py +142 -0
  17. sglang/srt/compilation/backend.py +437 -0
  18. sglang/srt/compilation/compilation_config.py +20 -0
  19. sglang/srt/compilation/compilation_counter.py +47 -0
  20. sglang/srt/compilation/compile.py +210 -0
  21. sglang/srt/compilation/compiler_interface.py +503 -0
  22. sglang/srt/compilation/cuda_piecewise_backend.py +228 -0
  23. sglang/srt/compilation/fix_functionalization.py +134 -0
  24. sglang/srt/compilation/fx_utils.py +83 -0
  25. sglang/srt/compilation/inductor_pass.py +140 -0
  26. sglang/srt/compilation/pass_manager.py +66 -0
  27. sglang/srt/compilation/piecewise_context_manager.py +40 -0
  28. sglang/srt/compilation/weak_ref_tensor_jit.py +16 -0
  29. sglang/srt/configs/__init__.py +8 -0
  30. sglang/srt/configs/deepseek_ocr.py +262 -0
  31. sglang/srt/configs/deepseekvl2.py +194 -96
  32. sglang/srt/configs/dots_ocr.py +64 -0
  33. sglang/srt/configs/dots_vlm.py +2 -7
  34. sglang/srt/configs/falcon_h1.py +309 -0
  35. sglang/srt/configs/load_config.py +33 -2
  36. sglang/srt/configs/mamba_utils.py +117 -0
  37. sglang/srt/configs/model_config.py +284 -118
  38. sglang/srt/configs/modelopt_config.py +30 -0
  39. sglang/srt/configs/nemotron_h.py +286 -0
  40. sglang/srt/configs/olmo3.py +105 -0
  41. sglang/srt/configs/points_v15_chat.py +29 -0
  42. sglang/srt/configs/qwen3_next.py +11 -47
  43. sglang/srt/configs/qwen3_omni.py +613 -0
  44. sglang/srt/configs/qwen3_vl.py +576 -0
  45. sglang/srt/connector/remote_instance.py +1 -1
  46. sglang/srt/constrained/base_grammar_backend.py +6 -1
  47. sglang/srt/constrained/llguidance_backend.py +5 -0
  48. sglang/srt/constrained/outlines_backend.py +1 -1
  49. sglang/srt/constrained/outlines_jump_forward.py +1 -1
  50. sglang/srt/constrained/reasoner_grammar_backend.py +9 -6
  51. sglang/srt/constrained/utils.py +12 -0
  52. sglang/srt/constrained/xgrammar_backend.py +26 -15
  53. sglang/srt/debug_utils/dumper.py +10 -3
  54. sglang/srt/disaggregation/ascend/conn.py +2 -2
  55. sglang/srt/disaggregation/ascend/transfer_engine.py +48 -10
  56. sglang/srt/disaggregation/base/conn.py +17 -4
  57. sglang/srt/disaggregation/common/conn.py +268 -98
  58. sglang/srt/disaggregation/decode.py +172 -39
  59. sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
  60. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +25 -16
  61. sglang/srt/disaggregation/fake/conn.py +11 -3
  62. sglang/srt/disaggregation/mooncake/conn.py +203 -555
  63. sglang/srt/disaggregation/nixl/conn.py +217 -63
  64. sglang/srt/disaggregation/prefill.py +113 -270
  65. sglang/srt/disaggregation/utils.py +36 -5
  66. sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
  67. sglang/srt/distributed/device_communicators/custom_all_reduce.py +6 -6
  68. sglang/srt/distributed/device_communicators/pymscclpp.py +2 -2
  69. sglang/srt/distributed/device_communicators/pynccl.py +24 -12
  70. sglang/srt/distributed/device_communicators/pynccl_allocator.py +2 -2
  71. sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
  72. sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
  73. sglang/srt/distributed/naive_distributed.py +5 -4
  74. sglang/srt/distributed/parallel_state.py +203 -97
  75. sglang/srt/elastic_ep/elastic_ep.py +74 -0
  76. sglang/srt/entrypoints/context.py +3 -2
  77. sglang/srt/entrypoints/engine.py +85 -65
  78. sglang/srt/entrypoints/grpc_server.py +632 -305
  79. sglang/srt/entrypoints/harmony_utils.py +2 -2
  80. sglang/srt/entrypoints/http_server.py +169 -17
  81. sglang/srt/entrypoints/http_server_engine.py +1 -7
  82. sglang/srt/entrypoints/openai/protocol.py +327 -34
  83. sglang/srt/entrypoints/openai/serving_base.py +74 -8
  84. sglang/srt/entrypoints/openai/serving_chat.py +202 -118
  85. sglang/srt/entrypoints/openai/serving_classify.py +204 -0
  86. sglang/srt/entrypoints/openai/serving_completions.py +20 -4
  87. sglang/srt/entrypoints/openai/serving_embedding.py +1 -0
  88. sglang/srt/entrypoints/openai/serving_responses.py +47 -2
  89. sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
  90. sglang/srt/environ.py +323 -0
  91. sglang/srt/eplb/eplb_algorithms/__init__.py +18 -1
  92. sglang/srt/eplb/eplb_algorithms/deepseek.py +0 -2
  93. sglang/srt/eplb/eplb_algorithms/elasticity_aware.py +87 -0
  94. sglang/srt/eplb/expert_distribution.py +3 -4
  95. sglang/srt/eplb/expert_location.py +30 -5
  96. sglang/srt/eplb/expert_location_dispatch.py +2 -2
  97. sglang/srt/eplb/expert_location_updater.py +2 -2
  98. sglang/srt/function_call/base_format_detector.py +17 -18
  99. sglang/srt/function_call/function_call_parser.py +21 -16
  100. sglang/srt/function_call/glm4_moe_detector.py +4 -8
  101. sglang/srt/function_call/gpt_oss_detector.py +24 -1
  102. sglang/srt/function_call/json_array_parser.py +61 -0
  103. sglang/srt/function_call/kimik2_detector.py +17 -4
  104. sglang/srt/function_call/utils.py +98 -7
  105. sglang/srt/grpc/compile_proto.py +245 -0
  106. sglang/srt/grpc/grpc_request_manager.py +915 -0
  107. sglang/srt/grpc/health_servicer.py +189 -0
  108. sglang/srt/grpc/scheduler_launcher.py +181 -0
  109. sglang/srt/grpc/sglang_scheduler_pb2.py +81 -68
  110. sglang/srt/grpc/sglang_scheduler_pb2.pyi +124 -61
  111. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +92 -1
  112. sglang/srt/layers/activation.py +11 -7
  113. sglang/srt/layers/attention/aiter_backend.py +17 -18
  114. sglang/srt/layers/attention/ascend_backend.py +125 -10
  115. sglang/srt/layers/attention/attention_registry.py +226 -0
  116. sglang/srt/layers/attention/base_attn_backend.py +32 -4
  117. sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
  118. sglang/srt/layers/attention/double_sparsity_backend.py +2 -2
  119. sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
  120. sglang/srt/layers/attention/fla/chunk.py +0 -1
  121. sglang/srt/layers/attention/fla/chunk_o.py +1 -1
  122. sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +2 -2
  123. sglang/srt/layers/attention/fla/fused_recurrent.py +4 -4
  124. sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +2 -2
  125. sglang/srt/layers/attention/fla/index.py +0 -2
  126. sglang/srt/layers/attention/fla/layernorm_gated.py +50 -32
  127. sglang/srt/layers/attention/fla/utils.py +0 -3
  128. sglang/srt/layers/attention/fla/wy_fast.py +0 -2
  129. sglang/srt/layers/attention/flashattention_backend.py +52 -15
  130. sglang/srt/layers/attention/flashinfer_backend.py +357 -212
  131. sglang/srt/layers/attention/flashinfer_mla_backend.py +31 -33
  132. sglang/srt/layers/attention/flashmla_backend.py +9 -7
  133. sglang/srt/layers/attention/hybrid_attn_backend.py +12 -4
  134. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +236 -133
  135. sglang/srt/layers/attention/intel_amx_backend.py +1 -1
  136. sglang/srt/layers/attention/mamba/causal_conv1d.py +2 -1
  137. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +24 -103
  138. sglang/srt/layers/attention/mamba/mamba.py +514 -1
  139. sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
  140. sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
  141. sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
  142. sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
  143. sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
  144. sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +214 -0
  145. sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +562 -0
  146. sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +646 -0
  147. sglang/srt/layers/attention/mamba/ops/ssd_combined.py +261 -0
  148. sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +264 -0
  149. sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
  150. sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
  151. sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
  152. sglang/srt/layers/attention/nsa/nsa_indexer.py +718 -0
  153. sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
  154. sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
  155. sglang/srt/layers/attention/nsa/transform_index.py +144 -0
  156. sglang/srt/layers/attention/nsa/triton_kernel.py +136 -0
  157. sglang/srt/layers/attention/nsa/utils.py +23 -0
  158. sglang/srt/layers/attention/nsa_backend.py +1201 -0
  159. sglang/srt/layers/attention/tbo_backend.py +6 -6
  160. sglang/srt/layers/attention/torch_flex_backend.py +325 -0
  161. sglang/srt/layers/attention/triton_backend.py +249 -42
  162. sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +2 -2
  163. sglang/srt/layers/attention/triton_ops/extend_attention.py +539 -44
  164. sglang/srt/layers/attention/trtllm_mha_backend.py +7 -9
  165. sglang/srt/layers/attention/trtllm_mla_backend.py +523 -48
  166. sglang/srt/layers/attention/utils.py +11 -7
  167. sglang/srt/layers/attention/vision.py +61 -3
  168. sglang/srt/layers/attention/wave_backend.py +4 -4
  169. sglang/srt/layers/attention/xpu_backend.py +1028 -0
  170. sglang/srt/layers/communicator.py +19 -7
  171. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/compile_utils.py +4 -8
  172. sglang/srt/layers/deep_gemm_wrapper/configurer.py +25 -0
  173. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/entrypoint.py +3 -3
  174. sglang/srt/layers/dp_attention.py +28 -1
  175. sglang/srt/layers/elementwise.py +3 -1
  176. sglang/srt/layers/layernorm.py +47 -15
  177. sglang/srt/layers/linear.py +30 -5
  178. sglang/srt/layers/logits_processor.py +161 -18
  179. sglang/srt/layers/modelopt_utils.py +11 -0
  180. sglang/srt/layers/moe/cutlass_moe.py +0 -2
  181. sglang/srt/layers/moe/cutlass_w4a8_moe.py +213 -21
  182. sglang/srt/layers/moe/ep_moe/kernels.py +36 -458
  183. sglang/srt/layers/moe/ep_moe/layer.py +243 -448
  184. sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +52 -25
  185. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  186. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_B200.json +146 -0
  187. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  188. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  189. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
  190. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +17 -5
  191. sglang/srt/layers/moe/fused_moe_triton/layer.py +86 -81
  192. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +18 -42
  193. sglang/srt/layers/moe/moe_runner/deep_gemm.py +304 -0
  194. sglang/srt/layers/moe/moe_runner/runner.py +3 -0
  195. sglang/srt/layers/moe/moe_runner/triton.py +3 -1
  196. sglang/srt/layers/moe/rocm_moe_utils.py +0 -1
  197. sglang/srt/layers/moe/router.py +51 -15
  198. sglang/srt/layers/moe/token_dispatcher/__init__.py +10 -0
  199. sglang/srt/layers/moe/token_dispatcher/base.py +1 -1
  200. sglang/srt/layers/moe/token_dispatcher/deepep.py +177 -106
  201. sglang/srt/layers/moe/token_dispatcher/mooncake.py +386 -0
  202. sglang/srt/layers/moe/token_dispatcher/standard.py +46 -0
  203. sglang/srt/layers/moe/topk.py +3 -2
  204. sglang/srt/layers/moe/utils.py +27 -1
  205. sglang/srt/layers/parameter.py +23 -6
  206. sglang/srt/layers/quantization/__init__.py +2 -53
  207. sglang/srt/layers/quantization/awq.py +183 -6
  208. sglang/srt/layers/quantization/awq_triton.py +29 -0
  209. sglang/srt/layers/quantization/base_config.py +20 -1
  210. sglang/srt/layers/quantization/compressed_tensors/__init__.py +7 -0
  211. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +21 -49
  212. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +421 -70
  213. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +5 -0
  214. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +4 -22
  215. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
  216. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +339 -0
  217. sglang/srt/layers/quantization/fp8.py +86 -20
  218. sglang/srt/layers/quantization/fp8_kernel.py +55 -10
  219. sglang/srt/layers/quantization/fp8_utils.py +43 -15
  220. sglang/srt/layers/quantization/fpgemm_fp8.py +2 -3
  221. sglang/srt/layers/quantization/gptq.py +0 -1
  222. sglang/srt/layers/quantization/int8_kernel.py +18 -2
  223. sglang/srt/layers/quantization/marlin_utils.py +12 -0
  224. sglang/srt/layers/quantization/modelopt_quant.py +141 -81
  225. sglang/srt/layers/quantization/mxfp4.py +17 -34
  226. sglang/srt/layers/quantization/petit.py +1 -1
  227. sglang/srt/layers/quantization/quark/quark.py +3 -1
  228. sglang/srt/layers/quantization/quark/quark_moe.py +18 -5
  229. sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +0 -7
  230. sglang/srt/layers/quantization/unquant.py +1 -4
  231. sglang/srt/layers/quantization/utils.py +0 -1
  232. sglang/srt/layers/quantization/w4afp8.py +51 -24
  233. sglang/srt/layers/quantization/w8a8_int8.py +45 -27
  234. sglang/srt/layers/radix_attention.py +59 -9
  235. sglang/srt/layers/rotary_embedding.py +750 -46
  236. sglang/srt/layers/sampler.py +84 -16
  237. sglang/srt/layers/sparse_pooler.py +98 -0
  238. sglang/srt/layers/utils.py +23 -1
  239. sglang/srt/layers/vocab_parallel_embedding.py +4 -1
  240. sglang/srt/lora/backend/base_backend.py +3 -3
  241. sglang/srt/lora/backend/chunked_backend.py +348 -0
  242. sglang/srt/lora/backend/triton_backend.py +9 -4
  243. sglang/srt/lora/eviction_policy.py +139 -0
  244. sglang/srt/lora/lora.py +7 -5
  245. sglang/srt/lora/lora_manager.py +33 -7
  246. sglang/srt/lora/lora_registry.py +1 -1
  247. sglang/srt/lora/mem_pool.py +41 -17
  248. sglang/srt/lora/triton_ops/__init__.py +4 -0
  249. sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
  250. sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +176 -0
  251. sglang/srt/lora/utils.py +7 -5
  252. sglang/srt/managers/cache_controller.py +83 -152
  253. sglang/srt/managers/data_parallel_controller.py +156 -87
  254. sglang/srt/managers/detokenizer_manager.py +51 -24
  255. sglang/srt/managers/io_struct.py +223 -129
  256. sglang/srt/managers/mm_utils.py +49 -10
  257. sglang/srt/managers/multi_tokenizer_mixin.py +83 -98
  258. sglang/srt/managers/multimodal_processor.py +1 -2
  259. sglang/srt/managers/overlap_utils.py +130 -0
  260. sglang/srt/managers/schedule_batch.py +340 -529
  261. sglang/srt/managers/schedule_policy.py +158 -18
  262. sglang/srt/managers/scheduler.py +665 -620
  263. sglang/srt/managers/scheduler_input_blocker.py +1 -1
  264. sglang/srt/managers/scheduler_metrics_mixin.py +150 -131
  265. sglang/srt/managers/scheduler_output_processor_mixin.py +337 -122
  266. sglang/srt/managers/scheduler_pp_mixin.py +341 -0
  267. sglang/srt/managers/scheduler_profiler_mixin.py +62 -15
  268. sglang/srt/managers/scheduler_runtime_checker_mixin.py +217 -0
  269. sglang/srt/managers/scheduler_update_weights_mixin.py +40 -14
  270. sglang/srt/managers/tokenizer_communicator_mixin.py +141 -19
  271. sglang/srt/managers/tokenizer_manager.py +462 -226
  272. sglang/srt/managers/tp_worker.py +217 -156
  273. sglang/srt/managers/utils.py +79 -47
  274. sglang/srt/mem_cache/allocator.py +21 -22
  275. sglang/srt/mem_cache/allocator_ascend.py +42 -28
  276. sglang/srt/mem_cache/base_prefix_cache.py +3 -3
  277. sglang/srt/mem_cache/chunk_cache.py +20 -2
  278. sglang/srt/mem_cache/common.py +480 -0
  279. sglang/srt/mem_cache/evict_policy.py +38 -0
  280. sglang/srt/mem_cache/hicache_storage.py +44 -2
  281. sglang/srt/mem_cache/hiradix_cache.py +134 -34
  282. sglang/srt/mem_cache/mamba_radix_cache.py +993 -0
  283. sglang/srt/mem_cache/memory_pool.py +602 -208
  284. sglang/srt/mem_cache/memory_pool_host.py +134 -183
  285. sglang/srt/mem_cache/multimodal_cache.py +0 -1
  286. sglang/srt/mem_cache/radix_cache.py +263 -78
  287. sglang/srt/mem_cache/radix_cache_cpp.py +29 -21
  288. sglang/srt/mem_cache/storage/__init__.py +10 -0
  289. sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +157 -0
  290. sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +97 -0
  291. sglang/srt/mem_cache/storage/backend_factory.py +223 -0
  292. sglang/srt/mem_cache/storage/eic/eic_storage.py +777 -0
  293. sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
  294. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +0 -1
  295. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +180 -59
  296. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +15 -9
  297. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +217 -26
  298. sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +38 -9
  299. sglang/srt/mem_cache/storage/nixl/nixl_utils.py +1 -1
  300. sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +17 -2
  301. sglang/srt/mem_cache/swa_radix_cache.py +115 -58
  302. sglang/srt/metrics/collector.py +113 -120
  303. sglang/srt/metrics/func_timer.py +3 -8
  304. sglang/srt/metrics/utils.py +8 -1
  305. sglang/srt/model_executor/cpu_graph_runner.py +2 -2
  306. sglang/srt/model_executor/cuda_graph_runner.py +81 -36
  307. sglang/srt/model_executor/forward_batch_info.py +40 -50
  308. sglang/srt/model_executor/model_runner.py +507 -319
  309. sglang/srt/model_executor/npu_graph_runner.py +11 -5
  310. sglang/srt/model_executor/piecewise_cuda_graph_runner.py +539 -0
  311. sglang/srt/model_loader/__init__.py +1 -1
  312. sglang/srt/model_loader/loader.py +438 -37
  313. sglang/srt/model_loader/utils.py +0 -1
  314. sglang/srt/model_loader/weight_utils.py +200 -27
  315. sglang/srt/models/apertus.py +2 -3
  316. sglang/srt/models/arcee.py +2 -2
  317. sglang/srt/models/bailing_moe.py +40 -56
  318. sglang/srt/models/bailing_moe_nextn.py +3 -4
  319. sglang/srt/models/bert.py +1 -1
  320. sglang/srt/models/deepseek_nextn.py +25 -4
  321. sglang/srt/models/deepseek_ocr.py +1516 -0
  322. sglang/srt/models/deepseek_v2.py +793 -235
  323. sglang/srt/models/dots_ocr.py +171 -0
  324. sglang/srt/models/dots_vlm.py +0 -1
  325. sglang/srt/models/dots_vlm_vit.py +1 -1
  326. sglang/srt/models/falcon_h1.py +570 -0
  327. sglang/srt/models/gemma3_causal.py +0 -2
  328. sglang/srt/models/gemma3_mm.py +17 -1
  329. sglang/srt/models/gemma3n_mm.py +2 -3
  330. sglang/srt/models/glm4_moe.py +17 -40
  331. sglang/srt/models/glm4_moe_nextn.py +4 -4
  332. sglang/srt/models/glm4v.py +3 -2
  333. sglang/srt/models/glm4v_moe.py +6 -6
  334. sglang/srt/models/gpt_oss.py +12 -35
  335. sglang/srt/models/grok.py +10 -23
  336. sglang/srt/models/hunyuan.py +2 -7
  337. sglang/srt/models/interns1.py +0 -1
  338. sglang/srt/models/kimi_vl.py +1 -7
  339. sglang/srt/models/kimi_vl_moonvit.py +4 -2
  340. sglang/srt/models/llama.py +6 -2
  341. sglang/srt/models/llama_eagle3.py +1 -1
  342. sglang/srt/models/longcat_flash.py +6 -23
  343. sglang/srt/models/longcat_flash_nextn.py +4 -15
  344. sglang/srt/models/mimo.py +2 -13
  345. sglang/srt/models/mimo_mtp.py +1 -2
  346. sglang/srt/models/minicpmo.py +7 -5
  347. sglang/srt/models/mixtral.py +1 -4
  348. sglang/srt/models/mllama.py +1 -1
  349. sglang/srt/models/mllama4.py +27 -6
  350. sglang/srt/models/nemotron_h.py +511 -0
  351. sglang/srt/models/olmo2.py +31 -4
  352. sglang/srt/models/opt.py +5 -5
  353. sglang/srt/models/phi.py +1 -1
  354. sglang/srt/models/phi4mm.py +1 -1
  355. sglang/srt/models/phimoe.py +0 -1
  356. sglang/srt/models/pixtral.py +0 -3
  357. sglang/srt/models/points_v15_chat.py +186 -0
  358. sglang/srt/models/qwen.py +0 -1
  359. sglang/srt/models/qwen2.py +0 -7
  360. sglang/srt/models/qwen2_5_vl.py +5 -5
  361. sglang/srt/models/qwen2_audio.py +2 -15
  362. sglang/srt/models/qwen2_moe.py +70 -4
  363. sglang/srt/models/qwen2_vl.py +6 -3
  364. sglang/srt/models/qwen3.py +18 -3
  365. sglang/srt/models/qwen3_moe.py +50 -38
  366. sglang/srt/models/qwen3_next.py +43 -21
  367. sglang/srt/models/qwen3_next_mtp.py +3 -4
  368. sglang/srt/models/qwen3_omni_moe.py +661 -0
  369. sglang/srt/models/qwen3_vl.py +791 -0
  370. sglang/srt/models/qwen3_vl_moe.py +343 -0
  371. sglang/srt/models/registry.py +15 -3
  372. sglang/srt/models/roberta.py +55 -3
  373. sglang/srt/models/sarashina2_vision.py +268 -0
  374. sglang/srt/models/solar.py +505 -0
  375. sglang/srt/models/starcoder2.py +357 -0
  376. sglang/srt/models/step3_vl.py +3 -5
  377. sglang/srt/models/torch_native_llama.py +9 -2
  378. sglang/srt/models/utils.py +61 -0
  379. sglang/srt/multimodal/processors/base_processor.py +21 -9
  380. sglang/srt/multimodal/processors/deepseek_ocr.py +37 -0
  381. sglang/srt/multimodal/processors/deepseek_vl_v2.py +0 -3
  382. sglang/srt/multimodal/processors/dots_vlm.py +2 -4
  383. sglang/srt/multimodal/processors/glm4v.py +1 -5
  384. sglang/srt/multimodal/processors/internvl.py +20 -10
  385. sglang/srt/multimodal/processors/janus_pro.py +0 -1
  386. sglang/srt/multimodal/processors/mllama4.py +0 -8
  387. sglang/srt/multimodal/processors/phi4mm.py +0 -1
  388. sglang/srt/multimodal/processors/points_v15_chat.py +52 -0
  389. sglang/srt/multimodal/processors/qwen_vl.py +83 -17
  390. sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
  391. sglang/srt/multimodal/processors/step3_vl.py +1 -1
  392. sglang/srt/parser/conversation.py +41 -0
  393. sglang/srt/parser/jinja_template_utils.py +6 -0
  394. sglang/srt/parser/reasoning_parser.py +0 -1
  395. sglang/srt/sampling/custom_logit_processor.py +77 -2
  396. sglang/srt/sampling/sampling_batch_info.py +36 -23
  397. sglang/srt/sampling/sampling_params.py +75 -0
  398. sglang/srt/server_args.py +1300 -338
  399. sglang/srt/server_args_config_parser.py +146 -0
  400. sglang/srt/single_batch_overlap.py +161 -0
  401. sglang/srt/speculative/base_spec_worker.py +34 -0
  402. sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
  403. sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
  404. sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
  405. sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
  406. sglang/srt/speculative/cpp_ngram/param.h +125 -0
  407. sglang/srt/speculative/cpp_ngram/queue.h +71 -0
  408. sglang/srt/speculative/draft_utils.py +226 -0
  409. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +26 -8
  410. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +26 -3
  411. sglang/srt/speculative/eagle_info.py +786 -0
  412. sglang/srt/speculative/eagle_info_v2.py +458 -0
  413. sglang/srt/speculative/eagle_utils.py +113 -1270
  414. sglang/srt/speculative/eagle_worker.py +120 -285
  415. sglang/srt/speculative/eagle_worker_v2.py +702 -0
  416. sglang/srt/speculative/ngram_info.py +433 -0
  417. sglang/srt/speculative/ngram_worker.py +246 -0
  418. sglang/srt/speculative/spec_info.py +49 -0
  419. sglang/srt/speculative/spec_utils.py +641 -0
  420. sglang/srt/speculative/standalone_worker.py +4 -14
  421. sglang/srt/tokenizer/tiktoken_tokenizer.py +2 -2
  422. sglang/srt/tracing/trace.py +32 -6
  423. sglang/srt/two_batch_overlap.py +35 -18
  424. sglang/srt/utils/__init__.py +2 -0
  425. sglang/srt/{bench_utils.py → utils/bench_utils.py} +4 -2
  426. sglang/srt/{utils.py → utils/common.py} +583 -113
  427. sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +86 -19
  428. sglang/srt/{host_shared_memory.py → utils/host_shared_memory.py} +0 -1
  429. sglang/srt/{offloader.py → utils/offloader.py} +4 -4
  430. sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
  431. sglang/srt/utils/profile_merger.py +199 -0
  432. sglang/srt/utils/rpd_utils.py +452 -0
  433. sglang/srt/utils/slow_rank_detector.py +71 -0
  434. sglang/srt/{torch_memory_saver_adapter.py → utils/torch_memory_saver_adapter.py} +5 -7
  435. sglang/srt/warmup.py +8 -4
  436. sglang/srt/weight_sync/utils.py +1 -1
  437. sglang/test/attention/test_flashattn_backend.py +1 -1
  438. sglang/test/attention/test_flashattn_mla_backend.py +0 -1
  439. sglang/test/attention/test_prefix_chunk_info.py +0 -2
  440. sglang/test/attention/test_trtllm_mla_backend.py +221 -53
  441. sglang/test/few_shot_gsm8k_engine.py +2 -4
  442. sglang/test/get_logits_ut.py +57 -0
  443. sglang/test/kit_matched_stop.py +157 -0
  444. sglang/test/longbench_v2/__init__.py +1 -0
  445. sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
  446. sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
  447. sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
  448. sglang/test/run_eval.py +120 -11
  449. sglang/test/runners.py +3 -1
  450. sglang/test/send_one.py +42 -7
  451. sglang/test/simple_eval_common.py +8 -2
  452. sglang/test/simple_eval_gpqa.py +0 -1
  453. sglang/test/simple_eval_humaneval.py +0 -3
  454. sglang/test/simple_eval_longbench_v2.py +344 -0
  455. sglang/test/simple_eval_mmmu_vlm.py +441 -0
  456. sglang/test/test_block_fp8.py +3 -4
  457. sglang/test/test_block_fp8_deep_gemm_blackwell.py +0 -1
  458. sglang/test/test_cutlass_moe.py +1 -2
  459. sglang/test/test_cutlass_w4a8_moe.py +10 -20
  460. sglang/test/test_deterministic.py +430 -0
  461. sglang/test/test_deterministic_utils.py +73 -0
  462. sglang/test/test_disaggregation_utils.py +93 -1
  463. sglang/test/test_marlin_moe.py +0 -1
  464. sglang/test/test_programs.py +1 -1
  465. sglang/test/test_utils.py +432 -16
  466. sglang/utils.py +10 -1
  467. sglang/version.py +1 -1
  468. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/METADATA +64 -43
  469. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/RECORD +476 -346
  470. sglang/srt/entrypoints/grpc_request_manager.py +0 -580
  471. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +0 -32
  472. sglang/srt/managers/tp_worker_overlap_thread.py +0 -319
  473. sglang/srt/mem_cache/lora_radix_cache.py +0 -421
  474. sglang/srt/speculative/build_eagle_tree.py +0 -427
  475. sglang/test/test_block_fp8_ep.py +0 -358
  476. /sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/__init__.py +0 -0
  477. /sglang/srt/{remote_instance_weight_loader_utils.py → model_loader/remote_instance_weight_loader_utils.py} +0 -0
  478. /sglang/srt/{aio_rwlock.py → utils/aio_rwlock.py} +0 -0
  479. /sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
  480. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/WHEEL +0 -0
  481. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/licenses/LICENSE +0 -0
  482. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/top_level.txt +0 -0
sglang/bench_serving.py CHANGED
@@ -12,7 +12,6 @@ python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-pro
 
 import argparse
 import asyncio
-import base64
 import io
 import json
 import os
@@ -32,9 +31,13 @@ from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple, Union
 
 import aiohttp
 import numpy as np
+import pybase64
 import requests
+from datasets import load_dataset
+from PIL import Image
 from tqdm.asyncio import tqdm
 from transformers import (
+    AutoProcessor,
    AutoTokenizer,
    PreTrainedTokenizer,
    PreTrainedTokenizerBase,
@@ -208,6 +211,15 @@ async def async_request_openai_completions(
             "ignore_eos": not args.disable_ignore_eos,
             **request_func_input.extra_request_body,
         }
+
+        # hack to accommodate different LoRA conventions between SGLang and vLLM.
+        if request_func_input.lora_name:
+            payload["model"] = request_func_input.lora_name
+            payload["lora_path"] = request_func_input.lora_name
+
+        if request_func_input.image_data:
+            payload.update({"image_data": request_func_input.image_data})
+
         headers = get_auth_headers()
 
         output = RequestFuncOutput.init_new(request_func_input)
@@ -318,10 +330,17 @@ async def async_request_openai_chat_completions(
             "model": request_func_input.model,
             "messages": messages,
             "temperature": 0.0,
-            "max_tokens": request_func_input.output_len,
+            "max_completion_tokens": request_func_input.output_len,
             "stream": not args.disable_stream,
+            "ignore_eos": not args.disable_ignore_eos,
             **request_func_input.extra_request_body,
         }
+
+        # hack to accommodate different LoRA conventions between SGLang and vLLM.
+        if request_func_input.lora_name:
+            payload["model"] = request_func_input.lora_name
+            payload["lora_path"] = request_func_input.lora_name
+
         headers = get_auth_headers()
 
         output = RequestFuncOutput.init_new(request_func_input)
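Note on the LoRA lines above: vLLM's OpenAI-compatible server selects an adapter through the "model" field, while SGLang expects an explicit "lora_path" field, so setting both to the adapter name keeps one payload compatible with either backend. A sketch of the resulting request body, assuming a hypothetical adapter named "my-adapter":

    # Illustrative payload after the LoRA adjustment (adapter name hypothetical).
    payload = {
        "model": "my-adapter",      # vLLM convention: adapter picked via model name
        "lora_path": "my-adapter",  # SGLang convention: adapter picked via lora_path
        # ...sampling fields unchanged...
    }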
@@ -606,6 +625,48 @@ async def async_request_profile(api_url: str) -> RequestFuncOutput:
     return output
 
 
+def _build_profile_urls(
+    profile_prefill_url: Optional[List[str]],
+    profile_decode_url: Optional[List[str]],
+) -> List[Tuple[str, str]]:
+    """Build profile URLs list from prefill/decode URL arguments.
+
+    Returns:
+        List of (worker_type, url) tuples. e.g., [("Prefill-0", "http://..."), ("Decode-0", "http://...")]
+    """
+    profile_urls = []
+    if profile_prefill_url:
+        for idx, url in enumerate(profile_prefill_url):
+            profile_urls.append((f"Prefill-{idx}", url))
+    if profile_decode_url:
+        for idx, url in enumerate(profile_decode_url):
+            profile_urls.append((f"Decode-{idx}", url))
+    return profile_urls
+
+
+async def _call_profile_pd(profile_urls: List[Tuple[str, str]], mode: str) -> None:
+    """Call profile endpoint (start/stop) on PD separated workers.
+
+    Args:
+        profile_urls: List of (worker_type, url) tuples
+        mode: "start" or "stop"
+    """
+    endpoint = "/start_profile" if mode == "start" else "/stop_profile"
+    action = "Starting" if mode == "start" else "Stopping"
+    action_past = "started" if mode == "start" else "stopped"
+
+    print(f"{action} profiler...")
+
+    for worker_type, url in profile_urls:
+        profile_output = await async_request_profile(api_url=url + endpoint)
+        if profile_output.success:
+            print(f"Profiler {action_past} for {worker_type} worker at {url}")
+        else:
+            print(
+                f"Failed to {mode} profiler for {worker_type} worker at {url}: {profile_output.error}"
+            )
+
+
 def get_model(pretrained_model_name_or_path: str) -> str:
     if os.getenv("SGLANG_USE_MODELSCOPE", "false").lower() == "true":
         import huggingface_hub.constants
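Taken together, the two helpers let PD (prefill/decode disaggregated) benchmarking build the worker URL list once and reuse it for both the start and stop calls. A minimal sketch of the intended flow inside an async context, with illustrative worker URLs (the CLI exposes the prefill and decode URL flags as mutually exclusive, but the helper accepts either or both):

    # Hypothetical worker URLs, for illustration only.
    urls = _build_profile_urls(
        profile_prefill_url=["http://localhost:30000", "http://localhost:30001"],
        profile_decode_url=None,
    )
    # urls == [("Prefill-0", "http://localhost:30000"),
    #          ("Prefill-1", "http://localhost:30001")]
    await _call_profile_pd(urls, "start")  # hits each url + "/start_profile"
    # ... run the benchmark ...
    await _call_profile_pd(urls, "stop")   # hits each url + "/stop_profile"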
@@ -631,7 +692,7 @@ def get_tokenizer(
     if pretrained_model_name_or_path.endswith(
         ".json"
     ) or pretrained_model_name_or_path.endswith(".model"):
-        from sglang.srt.hf_transformers_utils import get_tokenizer
+        from sglang.srt.utils.hf_transformers_utils import get_tokenizer
 
         return get_tokenizer(pretrained_model_name_or_path)
 
@@ -644,7 +705,30 @@ def get_tokenizer(
     )
 
 
-def get_dataset(args, tokenizer):
+def get_processor(
+    pretrained_model_name_or_path: str,
+) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
+    assert (
+        pretrained_model_name_or_path is not None
+        and pretrained_model_name_or_path != ""
+    )
+    if pretrained_model_name_or_path.endswith(
+        ".json"
+    ) or pretrained_model_name_or_path.endswith(".model"):
+        from sglang.srt.utils.hf_transformers_utils import get_processor
+
+        return get_processor(pretrained_model_name_or_path)
+
+    if pretrained_model_name_or_path is not None and not os.path.exists(
+        pretrained_model_name_or_path
+    ):
+        pretrained_model_name_or_path = get_model(pretrained_model_name_or_path)
+    return AutoProcessor.from_pretrained(
+        pretrained_model_name_or_path, trust_remote_code=True
+    )
+
+
+def get_dataset(args, tokenizer, model_id=None):
     tokenize_prompt = getattr(args, "tokenize_prompt", False)
     if args.dataset_name == "sharegpt":
         assert not tokenize_prompt
@@ -657,7 +741,7 @@ def get_dataset(args, tokenizer):
             prompt_suffix=args.prompt_suffix,
             apply_chat_template=args.apply_chat_template,
         )
-    elif args.dataset_name.startswith("random") and args.dataset_name != "random-image":
+    elif args.dataset_name.startswith("random"):
         input_requests = sample_random_requests(
             input_len=args.random_input_len,
             output_len=args.random_output_len,
@@ -668,17 +752,18 @@ def get_dataset(args, tokenizer):
             random_sample=args.dataset_name == "random",
             return_text=not tokenize_prompt,
         )
-    elif args.dataset_name == "random-image":
-        assert not tokenize_prompt, "random-image does not support --tokenize-prompt"
-        input_requests = sample_random_image_requests(
+    elif args.dataset_name == "image":
+        processor = get_processor(model_id)
+        input_requests = sample_image_requests(
             num_requests=args.num_prompts,
-            num_images=args.random_image_num_images,
+            image_count=args.image_count,
             input_len=args.random_input_len,
             output_len=args.random_output_len,
             range_ratio=args.random_range_ratio,
-            tokenizer=tokenizer,
-            apply_chat_template=args.apply_chat_template,
-            image_resolution=args.random_image_resolution,
+            processor=processor,
+            image_content=args.image_content,
+            image_format=args.image_format,
+            image_resolution=args.image_resolution,
         )
     elif args.dataset_name == "generated-shared-prefix":
         assert not tokenize_prompt
@@ -692,12 +777,11 @@ def get_dataset(args, tokenizer):
             args=args,
         )
     elif args.dataset_name == "mmmu":
-        assert not tokenize_prompt
+        processor = get_processor(model_id)
         input_requests = sample_mmmu_requests(
             num_requests=args.num_prompts,
-            tokenizer=tokenizer,
+            processor=processor,
             fixed_output_len=args.random_output_len,
-            apply_chat_template=args.apply_chat_template,
             random_sample=True,
         )
     elif args.dataset_name == "mooncake":
@@ -742,6 +826,8 @@ ASYNC_REQUEST_FUNCS = {
 class BenchmarkMetrics:
     completed: int
     total_input: int
+    total_input_text: int
+    total_input_vision: int
     total_output: int
     total_output_retokenized: int
     request_throughput: float
@@ -835,9 +921,17 @@ class DatasetRow:
     prompt: str
     prompt_len: int
     output_len: int
+    text_prompt_len: Optional[int] = None
+    vision_prompt_len: Optional[int] = None
     image_data: Optional[List[str]] = None
     timestamp: Optional[float] = None
 
+    def __post_init__(self):
+        if self.text_prompt_len is None:
+            self.text_prompt_len = self.prompt_len
+        if self.vision_prompt_len is None:
+            self.vision_prompt_len = 0
+
 
 async def get_mooncake_request_over_time(
     input_requests: List[Dict],
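The `__post_init__` defaults keep the invariant `prompt_len == text_prompt_len + vision_prompt_len` for every row: text-only datasets leave the two new fields unset, so the text length falls back to the full prompt length and the vision length to zero. A quick sketch:

    # Text-only row: the new fields are derived automatically.
    row = DatasetRow(prompt="hello world", prompt_len=2, output_len=16)
    assert row.text_prompt_len == 2
    assert row.vision_prompt_len == 0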
@@ -885,7 +979,7 @@ async def get_mooncake_request_over_time(
         for i in range(num_rounds):
             # Add user query for the current round
             chat_history.append(
-                {"role": "user", "content": f"Round {i+1}: {user_query_base}"}
+                {"role": "user", "content": f"Round {i + 1}: {user_query_base}"}
             )
 
             # Form the full prompt from history
@@ -914,9 +1008,8 @@ async def get_mooncake_request_over_time(
 
 def sample_mmmu_requests(
     num_requests: int,
-    tokenizer: PreTrainedTokenizerBase,
+    processor: AutoProcessor | AutoTokenizer,
     fixed_output_len: Optional[int] = None,
-    apply_chat_template: bool = True,
     random_sample: bool = True,
 ) -> List[DatasetRow]:
     """
@@ -924,22 +1017,12 @@ def sample_mmmu_requests(
 
     Args:
         num_requests: Number of requests to sample.
-        tokenizer: Tokenizer to use for token counting.
         fixed_output_len: If provided, use this fixed output length for all requests.
-        apply_chat_template: Whether to apply the chat template to the prompt.
         random_sample: Whether to randomly sample or take the first N.
 
     Returns:
         List of tuples (prompt, prompt_token_len, output_token_len).
     """
-    try:
-        import io
-
-        import pybase64
-        from datasets import load_dataset
-    except ImportError:
-        raise ImportError("Please install datasets: pip install datasets")
-
     print("Loading MMMU dataset from HuggingFace...")
 
     try:
@@ -995,54 +1078,12 @@ def sample_mmmu_requests(
                 question = example.get("question")
 
                 # Construct the prompt
-                prompt = f"Question: {question}\n\nAnswer: "
-                if apply_chat_template:
-                    try:
-                        is_phi4_multimodal = (
-                            "phi-4-multimodal" in tokenizer.name_or_path.lower()
-                        )
-                        if is_phi4_multimodal:
-                            # <|endoftext10|> is the image token used in the phi-4-multimodal model.
-                            content = prompt.replace("image 1", "<|endoftext10|>")
-                        else:
-                            content = [
-                                {
-                                    "type": "image_url",
-                                    "image_url": {"url": image_data},
-                                },
-                                {"type": "text", "text": prompt},
-                            ]
-                        prompt = tokenizer.apply_chat_template(
-                            [
-                                {
-                                    "role": "user",
-                                    "content": content,
-                                }
-                            ],
-                            add_generation_prompt=True,
-                            tokenize=False,
-                        )
-                    except Exception as e:
-                        # Note (Xinyuan): This is a workaround for an issue where some tokenizers do not support content as a list. (e.g. InternVL)
-                        print(
-                            f"Error applying chat template: {e}, fallback to <image> tag"
-                        )
-                        prompt = f"<image>{prompt}"
-
-                # Calculate token lengths for text only (without image data)
-                prompt_token_ids = tokenizer.encode(prompt)
-                prompt_len = len(prompt_token_ids)
-
+                text_prompt = f"Question: {question}\n\nAnswer: "
                 output_len = fixed_output_len if fixed_output_len is not None else 256
-
-                filtered_dataset.append(
-                    DatasetRow(
-                        prompt=prompt,
-                        prompt_len=prompt_len,
-                        output_len=output_len,
-                        image_data=[image_data],
-                    )
+                data_row = create_mm_data_row(
+                    text_prompt, [image], [image_data], output_len, processor
                 )
+                filtered_dataset.append(data_row)
 
             except Exception as e:
                 print(f"Error processing example {i}: {e}")
@@ -1110,7 +1151,8 @@ def sample_sharegpt_requests(
                 add_generation_prompt=True,
                 tokenize=False,
             )
-            prompt = prompt.replace(tokenizer.bos_token, "")
+            if tokenizer.bos_token:
+                prompt = prompt.replace(tokenizer.bos_token, "")
 
         prompt_token_ids = tokenizer.encode(prompt)
         completion = dataset[i][1]
@@ -1129,7 +1171,11 @@ def sample_sharegpt_requests(
             continue
 
         filtered_dataset.append(
-            DatasetRow(prompt=prompt, prompt_len=prompt_len, output_len=output_len)
+            DatasetRow(
+                prompt=prompt,
+                prompt_len=prompt_len,
+                output_len=output_len,
+            )
         )
 
     print(f"#Input tokens: {np.sum([x.prompt_len for x in filtered_dataset])}")
@@ -1240,7 +1286,7 @@ def sample_random_requests(
     return input_requests
 
 
-def parse_random_image_resolution(image_resolution: str) -> Tuple[int, int]:
+def parse_image_resolution(image_resolution: str) -> Tuple[int, int]:
     """Parse image resolution into (width, height).
 
     Supports presets '1080p', '720p', '360p' and custom 'heightxwidth' format
@@ -1265,44 +1311,94 @@ def parse_random_image_resolution(image_resolution: str) -> Tuple[int, int]:
         return (width, height)
 
     raise ValueError(
-        f"Unsupported random-image resolution: {image_resolution}. "
+        f"Unsupported image resolution: {image_resolution}. "
         "Choose from 4k, 1080p, 720p, 360p, or provide custom 'heightxwidth' (e.g., 1080x1920)."
     )
 
 
-def sample_random_image_requests(
+def create_mm_data_row(text_prompt, images: list, images_base64, output_len, processor):
+    try:
+        content_items = [
+            {"type": "image", "image": {"url": image_base64}}
+            for image_base64 in images_base64
+        ]
+        content_items.append({"type": "text", "text": text_prompt})
+        prompt_str = processor.apply_chat_template(
+            [{"role": "user", "content": content_items}],
+            add_generation_prompt=True,
+            tokenize=False,
+        )
+    except Exception as e:
+        # Note (Xinyuan): This is a workaround for an issue where some tokenizers do not support content as a list. (e.g. InternVL)
+        print(f"Error applying chat template: {e}, fallback to <image> tag")
+        # Some tokenizers do not support list content; fall back to a placeholder in the text
+        prompt_str = f"<image>{text_prompt}"
+
+    # Calculate total tokens (text + vision)
+    prompt_len = processor(
+        text=[prompt_str],
+        images=images,
+        padding=False,
+        return_tensors="pt",
+    )["input_ids"].numel()
+
+    # Calculate text-only tokens
+    try:
+        # Create text-only version of the prompt
+        text_only_prompt = processor.apply_chat_template(
+            [{"role": "user", "content": text_prompt}],
+            add_generation_prompt=True,
+            tokenize=False,
+        )
+        text_prompt_len = processor(
+            text=[text_only_prompt],
+            padding=False,
+            return_tensors="pt",
+        )["input_ids"].numel()
+    except Exception:
+        # Fallback: just tokenize the text prompt directly
+        text_prompt_len = len(processor.tokenizer.encode(text_prompt))
+
+    # Vision tokens = total tokens - text tokens
+    vision_prompt_len = prompt_len - text_prompt_len
+
+    return DatasetRow(
+        prompt=text_prompt,
+        prompt_len=prompt_len,
+        output_len=output_len,
+        text_prompt_len=text_prompt_len,
+        vision_prompt_len=vision_prompt_len,
+        image_data=images_base64,
+    )
+
+
+def sample_image_requests(
     num_requests: int,
-    num_images: int,
+    image_count: int,
     input_len: int,
     output_len: int,
     range_ratio: float,
-    tokenizer: PreTrainedTokenizerBase,
-    apply_chat_template: bool = True,
-    image_resolution: str = "1080p",
+    processor: AutoProcessor,
+    image_content: str,
+    image_format: str,
+    image_resolution: str,
 ) -> List[DatasetRow]:
-    """Generate requests with random images.
+    """Generate requests with images.
 
-    - Each request includes ``num_images`` random images.
+    - Each request includes ``image_count`` images.
     - Supported resolutions: 4k (3840x2160), 1080p (1920x1080), 720p (1280x720), 360p (640x360),
       or custom 'heightxwidth' (e.g., 1080x1920).
     - Text lengths follow the 'random' dataset sampling rule. ``prompt_len``
      only counts text tokens and excludes image data.
    """
-    try:
-        import pybase64
-        from PIL import Image
-    except ImportError as e:
-        raise ImportError(
-            "Please install Pillow to generate random images: pip install pillow"
-        ) from e
 
     # Parse resolution (supports presets and 'heightxwidth')
-    width, height = parse_random_image_resolution(image_resolution)
+    width, height = parse_image_resolution(image_resolution)
 
     # Check for potentially problematic combinations and warn user
-    if width * height >= 1920 * 1080 and num_images * num_requests >= 100:
+    if width * height >= 1920 * 1080 and image_count * num_requests >= 100:
         warnings.warn(
-            f"High resolution ({width}x{height}) with {num_images * num_requests} total images "
+            f"High resolution ({width}x{height}) with {image_count * num_requests} total images "
             f"may take a long time. Consider reducing resolution or image count.",
            UserWarning,
            stacklevel=2,
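`create_mm_data_row` measures vision tokens by subtraction: the processor runs once on the templated prompt plus images to get the total input length, and once on a text-only templated prompt to get the text length; the difference is recorded as `vision_prompt_len`. A hedged usage sketch, assuming an AutoProcessor for some vision-language checkpoint (the model name, `pil_image`, and `image_data_uri` are illustrative):

    processor = AutoProcessor.from_pretrained(
        "Qwen/Qwen2-VL-7B-Instruct", trust_remote_code=True  # model name illustrative
    )
    row = create_mm_data_row(
        "Describe the image.", [pil_image], [image_data_uri], 64, processor
    )
    assert row.prompt_len == row.text_prompt_len + row.vision_prompt_len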
@@ -1316,53 +1412,50 @@ def sample_random_image_requests(
         int(output_len * range_ratio), output_len + 1, size=num_requests
     )
 
-    def _gen_random_image_data_uri(width: int = width, height: int = height) -> str:
-        arr = (np.random.rand(height, width, 3) * 255).astype(np.uint8)
-        img = Image.fromarray(arr, mode="RGB")
+    def _gen_random_image_data_uri(
+        width: int = width, height: int = height
+    ) -> (Image, str, int):
+        if image_content == "blank":
+            # Generate blank white image
+            arr = np.full((height, width, 3), 255, dtype=np.uint8)
+        else:
+            # Generate random colored image
+            arr = (np.random.rand(height, width, 3) * 255).astype(np.uint8)
+        img = Image.fromarray(arr)
         buf = io.BytesIO()
-        img.save(buf, format="JPEG", quality=85)
+        img.save(buf, format=image_format, quality=85)
         encoded = pybase64.b64encode(buf.getvalue()).decode("utf-8")
-        return f"data:image/jpeg;base64,{encoded}"
+        image_data = f"data:image/{image_format};base64,{encoded}"
+        image_bytes = len(image_data.encode("utf-8"))
+        return img, image_data, image_bytes
 
     dataset: List[DatasetRow] = []
+    total_image_bytes = 0
     for i in range(num_requests):
         # Generate text prompt
-        text_prompt = gen_prompt(tokenizer, int(input_lens[i]))
+        text_prompt = gen_prompt(processor.tokenizer, int(input_lens[i]))
 
         # Generate image list
-        images = [_gen_random_image_data_uri() for _ in range(num_images)]
-
-        prompt_str = text_prompt
-        if apply_chat_template:
-            try:
-                content_items = [
-                    {"type": "image_url", "image_url": {"url": img_url}}
-                    for img_url in images
-                ]
-                content_items.append({"type": "text", "text": text_prompt})
-                prompt_str = tokenizer.apply_chat_template(
-                    [{"role": "user", "content": content_items}],
-                    add_generation_prompt=True,
-                    tokenize=False,
-                )
-            except Exception:
-                # Some tokenizers do not support list content; fall back to a placeholder in the text
-                prompt_str = f"<image>{text_prompt}"
-
-        prompt_token_ids = tokenizer.encode(prompt_str)
-        prompt_token_len = len(prompt_token_ids)
-
-        dataset.append(
-            DatasetRow(
-                prompt=prompt_str,
-                prompt_len=prompt_token_len,
-                output_len=int(output_lens[i]),
-                image_data=images,
-            )
+        images, images_base64, images_bytes = zip(
+            *[_gen_random_image_data_uri() for _ in range(image_count)]
+        )
+        total_image_bytes += sum(list(images_bytes))
+
+        data_row = create_mm_data_row(
+            text_prompt,
+            list(images),
+            list(images_base64),
+            int(output_lens[i]),
+            processor,
         )
 
+        dataset.append(data_row)
+
     print(f"#Input tokens: {np.sum([x.prompt_len for x in dataset])}")
     print(f"#Output tokens: {np.sum([x.output_len for x in dataset])}")
+    print(
+        f"\nCreated {len(dataset)} {image_content} {image_format} images with average {total_image_bytes // num_requests} bytes per request"
+    )
     return dataset
 
 
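A note on the tuple plumbing above: `_gen_random_image_data_uri` now returns a `(PIL.Image, data_uri, byte_count)` triple per image, and `zip(*[...])` transposes the list of per-image triples into three parallel tuples that feed `create_mm_data_row` and the byte accounting:

    # The transposition pattern used in the loop, on toy values.
    triples = [("img0", "uri0", 10), ("img1", "uri1", 12)]
    images, uris, sizes = zip(*triples)
    # images == ("img0", "img1"); sizes == (10, 12); sum(sizes) == 22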
@@ -1434,7 +1527,9 @@ def sample_generated_shared_prefix_requests(
 
             input_requests.append(
                 DatasetRow(
-                    prompt=full_prompt, prompt_len=prompt_len, output_len=output_len
+                    prompt=full_prompt,
+                    prompt_len=prompt_len,
+                    output_len=output_len,
                 )
             )
             total_input_tokens += prompt_len
@@ -1516,6 +1611,8 @@ def calculate_metrics(
     output_lens: List[int] = []
     retokenized_output_lens: List[int] = []
     total_input = 0
+    total_input_text = 0
+    total_input_vision = 0
     completed = 0
     itls: List[float] = []
     tpots: List[float] = []
@@ -1529,7 +1626,9 @@ def calculate_metrics(
                 tokenizer.encode(outputs[i].generated_text, add_special_tokens=False)
             )
             retokenized_output_lens.append(retokenized_output_len)
-            total_input += outputs[i].prompt_len
+            total_input += input_requests[i].prompt_len
+            total_input_text += input_requests[i].text_prompt_len
+            total_input_vision += input_requests[i].vision_prompt_len
             if output_len > 1:
                 tpots.append((outputs[i].latency - outputs[i].ttft) / (output_len - 1))
             itls += outputs[i].itl
@@ -1551,6 +1650,8 @@ def calculate_metrics(
     metrics = BenchmarkMetrics(
         completed=completed,
         total_input=total_input,
+        total_input_text=total_input_text,
+        total_input_vision=total_input_vision,
         total_output=sum(output_lens),
         total_output_retokenized=sum(retokenized_output_lens),
         request_throughput=completed / dur_s,
@@ -1604,6 +1705,8 @@ async def benchmark(
     use_trace_timestamps: bool = False,
     mooncake_slowdown_factor=1.0,
     mooncake_num_rounds=1,
+    profile_prefill_url: Optional[List[str]] = None,
+    profile_decode_url: Optional[List[str]] = None,
 ):
     if backend in ASYNC_REQUEST_FUNCS:
         request_func = ASYNC_REQUEST_FUNCS[backend]
@@ -1693,14 +1796,28 @@ async def benchmark(
 
         time.sleep(1.0)
 
+    # Build profile URLs for PD separated mode (do this once at the beginning)
+    pd_profile_urls = []
+    if profile and pd_separated:
+        pd_profile_urls = _build_profile_urls(profile_prefill_url, profile_decode_url)
+        if not pd_profile_urls:
+            print(
+                "Warning: PD separated mode requires --profile-prefill-url or --profile-decode-url"
+            )
+            print("Skipping profiler start. Please specify worker URLs for profiling.")
+
     # Start profiler
     if profile:
-        print("Starting profiler...")
-        profile_output = await async_request_profile(
-            api_url=base_url + "/start_profile"
-        )
-        if profile_output.success:
-            print("Profiler started")
+        if pd_separated:
+            if pd_profile_urls:
+                await _call_profile_pd(pd_profile_urls, "start")
+        else:
+            print("Starting profiler...")
+            profile_output = await async_request_profile(
+                api_url=base_url + "/start_profile"
+            )
+            if profile_output.success:
+                print("Profiler started")
 
     # Run all requests
     benchmark_start_time = time.perf_counter()
@@ -1749,23 +1866,37 @@ async def benchmark(
 
     # Stop profiler
     if profile:
-        print("Stopping profiler...")
-        profile_output = await async_request_profile(api_url=base_url + "/stop_profile")
-        if profile_output.success:
-            print("Profiler stopped")
+        if pd_separated:
+            if pd_profile_urls:
+                await _call_profile_pd(pd_profile_urls, "stop")
+        else:
+            print("Stopping profiler...")
+            profile_output = await async_request_profile(
+                api_url=base_url + "/stop_profile"
+            )
+            if profile_output.success:
+                print("Profiler stopped")
 
     if pbar is not None:
         pbar.close()
 
     if "sglang" in backend:
-        server_info = requests.get(base_url + "/get_server_info")
+        server_info = requests.get(
+            base_url + "/get_server_info", headers=get_auth_headers()
+        )
         if server_info.status_code == 200:
             server_info_json = server_info.json()
             if "decode" in server_info_json:
                 server_info_json = server_info_json["decode"][0]
-            accept_length = server_info_json["internal_states"][0].get(
-                "avg_spec_accept_length", None
-            )
+            if (
+                "internal_states" in server_info_json
+                and server_info_json["internal_states"]
+            ):
+                accept_length = server_info_json["internal_states"][0].get(
+                    "avg_spec_accept_length", None
+                )
+            else:
+                accept_length = None
         else:
             accept_length = None
     else:
@@ -1797,6 +1928,10 @@ async def benchmark(
     print("{:<40} {:<10}".format("Successful requests:", metrics.completed))
     print("{:<40} {:<10.2f}".format("Benchmark duration (s):", benchmark_duration))
     print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input))
+    print("{:<40} {:<10}".format("Total input text tokens:", metrics.total_input_text))
+    print(
+        "{:<40} {:<10}".format("Total input vision tokens:", metrics.total_input_vision)
+    )
     print("{:<40} {:<10}".format("Total generated tokens:", metrics.total_output))
     print(
         "{:<40} {:<10}".format(
@@ -1866,6 +2001,8 @@ async def benchmark(
         "duration": benchmark_duration,
         "completed": metrics.completed,
         "total_input_tokens": metrics.total_input,
+        "total_input_text_tokens": metrics.total_input_text,
+        "total_input_vision_tokens": metrics.total_input_vision,
         "total_output_tokens": metrics.total_output,
         "total_output_tokens_retokenized": metrics.total_output_retokenized,
         "request_throughput": metrics.request_throughput,
@@ -1900,11 +2037,11 @@ async def benchmark(
         output_file_name = args.output_file
     else:
         now = datetime.now().strftime("%m%d")
-        if args.dataset_name == "random-image":
+        if args.dataset_name == "image":
             output_file_name = (
                 f"{args.backend}_{now}_{args.num_prompts}_{args.random_input_len}_"
-                f"{args.random_output_len}_{args.random_image_num_images}imgs_"
-                f"{args.random_image_resolution}.jsonl"
+                f"{args.random_output_len}_{args.image_count}imgs_"
+                f"{args.image_resolution}.jsonl"
             )
         elif args.dataset_name.startswith("random"):
             output_file_name = f"{args.backend}_{now}_{args.num_prompts}_{args.random_input_len}_{args.random_output_len}.jsonl"
@@ -2080,6 +2217,12 @@ def run_benchmark(args_: argparse.Namespace):
             "Because when the tokenizer counts the output tokens, if there is gibberish, it might count incorrectly.\n"
         )
 
+    if args.dataset_name in ["image", "mmmu"]:
+        args.apply_chat_template = True
+        assert (
+            not args.tokenize_prompt
+        ), "`--tokenize-prompt` not compatible with image dataset"
+
     print(f"{args}\n")
 
     # Read dataset
@@ -2087,7 +2230,7 @@ def run_benchmark(args_: argparse.Namespace):
     model_id = args.model
     tokenizer_id = args.tokenizer if args.tokenizer is not None else args.model
     tokenizer = get_tokenizer(tokenizer_id)
-    input_requests = get_dataset(args, tokenizer)
+    input_requests = get_dataset(args, tokenizer, model_id)
 
     # compatible with SimpleNamespace
     if not hasattr(args, "flush_cache"):
@@ -2113,6 +2256,8 @@ def run_benchmark(args_: argparse.Namespace):
             use_trace_timestamps=args.use_trace_timestamps,
             mooncake_slowdown_factor=args.mooncake_slowdown_factor,
             mooncake_num_rounds=args.mooncake_num_rounds,
+            profile_prefill_url=getattr(args, "profile_prefill_url", None),
+            profile_decode_url=getattr(args, "profile_decode_url", None),
         )
     )
 
@@ -2168,7 +2313,7 @@ if __name__ == "__main__":
             "random-ids",
             "generated-shared-prefix",
             "mmmu",
-            "random-image",
+            "image",
             "mooncake",
         ],
        help="Name of the dataset to benchmark on.",
@@ -2208,37 +2353,49 @@ if __name__ == "__main__":
         "--random-input-len",
         type=int,
         default=1024,
-        help="Number of input tokens per request, used only for random dataset.",
+        help="Number of input tokens per request, used only for random and image dataset.",
     )
     parser.add_argument(
         "--random-output-len",
         default=1024,
         type=int,
-        help="Number of output tokens per request, used only for random dataset.",
+        help="Number of output tokens per request, used only for random and image dataset.",
     )
     parser.add_argument(
         "--random-range-ratio",
         type=float,
         default=0.0,
         help="Range of sampled ratio of input/output length, "
-        "used only for random dataset.",
+        "used only for random and image dataset.",
     )
-    # random-image dataset args
+    # image dataset args
     parser.add_argument(
-        "--random-image-num-images",
+        "--image-count",
         type=int,
         default=1,
-        help="Number of images per request (only available with the random-image dataset)",
+        help="Number of images per request (only available with the image dataset)",
     )
     parser.add_argument(
-        "--random-image-resolution",
+        "--image-resolution",
         type=str,
         default="1080p",
         help=(
-            "Resolution of random images for random-image dataset. "
+            "Resolution of images for image dataset. "
             "Supports presets 4k/1080p/720p/360p or custom 'heightxwidth' (e.g., 1080x1920)."
         ),
     )
+    parser.add_argument(
+        "--image-format",
+        type=str,
+        default="jpeg",
+        help=("Format of images for image dataset. " "Supports jpeg and png."),
+    )
+    parser.add_argument(
+        "--image-content",
+        type=str,
+        default="random",
+        help=("Content for images for image dataset. " "Supports random and blank."),
+    )
     parser.add_argument(
         "--request-rate",
         type=float,
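With the renamed flags, an image-dataset run looks like the following sketch, built only from the arguments defined above (the server and model are whatever the benchmark is already pointed at):

    python3 -m sglang.bench_serving --backend sglang --dataset-name image \
        --num-prompts 100 --random-input-len 512 --random-output-len 128 \
        --image-count 2 --image-resolution 720p --image-format png --image-content blank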
@@ -2326,6 +2483,30 @@ if __name__ == "__main__":
         action="store_true",
         help="Benchmark PD disaggregation server",
     )
+
+    # Create a mutually exclusive group for profiling URLs
+    # In PD separated mode, prefill and decode workers must be profiled separately
+    profile_url_group = parser.add_mutually_exclusive_group()
+    profile_url_group.add_argument(
+        "--profile-prefill-url",
+        type=str,
+        nargs="*",
+        default=None,
+        help="URL(s) of the prefill worker(s) for profiling in PD separated mode. "
+        "Can specify multiple URLs: --profile-prefill-url http://localhost:30000 http://localhost:30001. "
+        "NOTE: Cannot be used together with --profile-decode-url. "
+        "In PD separated mode, prefill and decode workers must be profiled separately.",
+    )
+    profile_url_group.add_argument(
+        "--profile-decode-url",
+        type=str,
+        nargs="*",
+        default=None,
+        help="URL(s) of the decode worker(s) for profiling in PD separated mode. "
+        "Can specify multiple URLs: --profile-decode-url http://localhost:30010 http://localhost:30011. "
+        "NOTE: Cannot be used together with --profile-prefill-url. "
+        "In PD separated mode, prefill and decode workers must be profiled separately.",
+    )
     parser.add_argument(
         "--flush-cache",
         action="store_true",
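A sketch of how the new URL flags are meant to combine with the benchmark's existing PD-separated and profiling options (worker URLs illustrative; since the two URL flags are mutually exclusive, profile prefill and decode workers in separate runs):

    python3 -m sglang.bench_serving --backend sglang --dataset-name random \
        --pd-separated --profile \
        --profile-prefill-url http://localhost:30000 http://localhost:30001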