sglang 0.5.3__py3-none-any.whl → 0.5.3.post2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (258)
  1. sglang/bench_one_batch.py +12 -4
  2. sglang/bench_one_batch_server.py +40 -25
  3. sglang/bench_serving.py +327 -141
  4. sglang/compile_deep_gemm.py +6 -2
  5. sglang/global_config.py +1 -25
  6. sglang/lang/api.py +6 -0
  7. sglang/lang/interpreter.py +1 -0
  8. sglang/lang/ir.py +13 -0
  9. sglang/launch_server.py +8 -15
  10. sglang/profiler.py +18 -1
  11. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +4 -6
  12. sglang/srt/compilation/backend.py +431 -0
  13. sglang/srt/compilation/compilation_config.py +19 -0
  14. sglang/srt/compilation/compilation_counter.py +47 -0
  15. sglang/srt/compilation/compile.py +210 -0
  16. sglang/srt/compilation/compiler_interface.py +477 -0
  17. sglang/srt/compilation/cuda_piecewise_backend.py +228 -0
  18. sglang/srt/compilation/fix_functionalization.py +134 -0
  19. sglang/srt/compilation/fx_utils.py +83 -0
  20. sglang/srt/compilation/inductor_pass.py +140 -0
  21. sglang/srt/compilation/pass_manager.py +66 -0
  22. sglang/srt/compilation/piecewise_context_manager.py +40 -0
  23. sglang/srt/compilation/weak_ref_tensor_jit.py +16 -0
  24. sglang/srt/configs/__init__.py +2 -0
  25. sglang/srt/configs/falcon_h1.py +12 -58
  26. sglang/srt/configs/load_config.py +3 -2
  27. sglang/srt/configs/mamba_utils.py +117 -0
  28. sglang/srt/configs/model_config.py +73 -22
  29. sglang/srt/configs/nemotron_h.py +286 -0
  30. sglang/srt/configs/qwen3_next.py +11 -46
  31. sglang/srt/constrained/base_grammar_backend.py +5 -1
  32. sglang/srt/constrained/llguidance_backend.py +3 -0
  33. sglang/srt/constrained/outlines_backend.py +1 -1
  34. sglang/srt/constrained/xgrammar_backend.py +5 -1
  35. sglang/srt/disaggregation/decode.py +15 -24
  36. sglang/srt/disaggregation/decode_kvcache_offload_manager.py +1 -1
  37. sglang/srt/disaggregation/nixl/conn.py +55 -23
  38. sglang/srt/disaggregation/prefill.py +21 -34
  39. sglang/srt/distributed/device_communicators/all_reduce_utils.py +4 -4
  40. sglang/srt/distributed/device_communicators/custom_all_reduce.py +2 -2
  41. sglang/srt/distributed/device_communicators/pynccl_allocator.py +2 -2
  42. sglang/srt/distributed/parallel_state.py +37 -5
  43. sglang/srt/entrypoints/context.py +3 -1
  44. sglang/srt/entrypoints/engine.py +3 -3
  45. sglang/srt/entrypoints/grpc_server.py +384 -100
  46. sglang/srt/entrypoints/harmony_utils.py +2 -1
  47. sglang/srt/entrypoints/http_server.py +51 -2
  48. sglang/srt/entrypoints/openai/protocol.py +162 -31
  49. sglang/srt/entrypoints/openai/serving_base.py +2 -1
  50. sglang/srt/entrypoints/openai/serving_chat.py +16 -73
  51. sglang/srt/entrypoints/openai/serving_completions.py +1 -0
  52. sglang/srt/entrypoints/openai/serving_responses.py +1 -1
  53. sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
  54. sglang/srt/environ.py +14 -0
  55. sglang/srt/eplb/expert_distribution.py +3 -4
  56. sglang/srt/eplb/expert_location_dispatch.py +2 -2
  57. sglang/srt/eplb/expert_location_updater.py +2 -2
  58. sglang/srt/function_call/base_format_detector.py +17 -18
  59. sglang/srt/function_call/function_call_parser.py +8 -6
  60. sglang/srt/function_call/gpt_oss_detector.py +1 -1
  61. sglang/srt/function_call/utils.py +2 -1
  62. sglang/srt/grpc/compile_proto.py +2 -2
  63. sglang/srt/{entrypoints → grpc}/grpc_request_manager.py +93 -42
  64. sglang/srt/grpc/sglang_scheduler_pb2.py +78 -70
  65. sglang/srt/grpc/sglang_scheduler_pb2.pyi +66 -10
  66. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +89 -1
  67. sglang/srt/layers/attention/aiter_backend.py +3 -3
  68. sglang/srt/layers/attention/attention_registry.py +33 -22
  69. sglang/srt/layers/attention/base_attn_backend.py +19 -0
  70. sglang/srt/layers/attention/double_sparsity_backend.py +2 -2
  71. sglang/srt/layers/attention/fla/layernorm_gated.py +47 -30
  72. sglang/srt/layers/attention/fla/utils.py +0 -3
  73. sglang/srt/layers/attention/flashattention_backend.py +12 -8
  74. sglang/srt/layers/attention/flashinfer_backend.py +239 -19
  75. sglang/srt/layers/attention/flashinfer_mla_backend.py +10 -10
  76. sglang/srt/layers/attention/flashmla_backend.py +2 -2
  77. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +165 -59
  78. sglang/srt/layers/attention/mamba/causal_conv1d.py +1 -1
  79. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +9 -4
  80. sglang/srt/layers/attention/mamba/mamba.py +189 -241
  81. sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
  82. sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
  83. sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +0 -50
  84. sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +0 -60
  85. sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +0 -111
  86. sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +0 -11
  87. sglang/srt/layers/attention/nsa/nsa_indexer.py +10 -4
  88. sglang/srt/layers/attention/nsa/triton_kernel.py +136 -0
  89. sglang/srt/layers/attention/triton_backend.py +72 -33
  90. sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +2 -2
  91. sglang/srt/layers/attention/trtllm_mha_backend.py +2 -2
  92. sglang/srt/layers/attention/trtllm_mla_backend.py +5 -5
  93. sglang/srt/layers/attention/vision.py +3 -3
  94. sglang/srt/layers/communicator.py +8 -5
  95. sglang/srt/layers/layernorm.py +10 -5
  96. sglang/srt/layers/logits_processor.py +146 -16
  97. sglang/srt/layers/modelopt_utils.py +11 -0
  98. sglang/srt/layers/moe/cutlass_w4a8_moe.py +214 -21
  99. sglang/srt/layers/moe/ep_moe/kernels.py +31 -452
  100. sglang/srt/layers/moe/ep_moe/layer.py +32 -290
  101. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_B200.json +146 -0
  102. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +11 -3
  103. sglang/srt/layers/moe/fused_moe_triton/layer.py +6 -13
  104. sglang/srt/layers/moe/moe_runner/deep_gemm.py +304 -0
  105. sglang/srt/layers/moe/moe_runner/runner.py +3 -0
  106. sglang/srt/layers/moe/router.py +51 -15
  107. sglang/srt/layers/moe/token_dispatcher/__init__.py +8 -0
  108. sglang/srt/layers/moe/token_dispatcher/deepep.py +13 -4
  109. sglang/srt/layers/moe/token_dispatcher/mooncake.py +394 -0
  110. sglang/srt/layers/moe/utils.py +15 -1
  111. sglang/srt/layers/quantization/__init__.py +1 -1
  112. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +2 -2
  113. sglang/srt/layers/quantization/deep_gemm_wrapper/compile_utils.py +3 -5
  114. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +3 -2
  115. sglang/srt/layers/quantization/fp8.py +84 -18
  116. sglang/srt/layers/quantization/fp8_kernel.py +35 -8
  117. sglang/srt/layers/quantization/fp8_utils.py +1 -3
  118. sglang/srt/layers/quantization/int8_kernel.py +19 -3
  119. sglang/srt/layers/quantization/modelopt_quant.py +10 -20
  120. sglang/srt/layers/quantization/mxfp4.py +4 -4
  121. sglang/srt/layers/quantization/quark/quark.py +3 -1
  122. sglang/srt/layers/quantization/w4afp8.py +49 -17
  123. sglang/srt/layers/radix_attention.py +59 -9
  124. sglang/srt/layers/rotary_embedding.py +33 -9
  125. sglang/srt/layers/sampler.py +33 -13
  126. sglang/srt/lora/eviction_policy.py +139 -0
  127. sglang/srt/lora/lora_manager.py +23 -8
  128. sglang/srt/lora/lora_registry.py +1 -1
  129. sglang/srt/lora/mem_pool.py +40 -16
  130. sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +1 -1
  131. sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +4 -2
  132. sglang/srt/managers/cache_controller.py +45 -13
  133. sglang/srt/managers/data_parallel_controller.py +123 -27
  134. sglang/srt/managers/detokenizer_manager.py +3 -0
  135. sglang/srt/managers/io_struct.py +43 -3
  136. sglang/srt/managers/mm_utils.py +2 -2
  137. sglang/srt/managers/multi_tokenizer_mixin.py +17 -0
  138. sglang/srt/managers/overlap_utils.py +96 -19
  139. sglang/srt/managers/schedule_batch.py +195 -455
  140. sglang/srt/managers/schedule_policy.py +13 -1
  141. sglang/srt/managers/scheduler.py +373 -180
  142. sglang/srt/managers/scheduler_metrics_mixin.py +51 -2
  143. sglang/srt/managers/scheduler_output_processor_mixin.py +277 -96
  144. sglang/srt/managers/scheduler_profiler_mixin.py +57 -10
  145. sglang/srt/managers/scheduler_update_weights_mixin.py +19 -14
  146. sglang/srt/managers/tokenizer_communicator_mixin.py +2 -0
  147. sglang/srt/managers/tokenizer_manager.py +315 -60
  148. sglang/srt/managers/tp_worker.py +66 -37
  149. sglang/srt/mem_cache/allocator.py +7 -2
  150. sglang/srt/mem_cache/base_prefix_cache.py +1 -1
  151. sglang/srt/mem_cache/chunk_cache.py +6 -2
  152. sglang/srt/mem_cache/common.py +475 -0
  153. sglang/srt/mem_cache/hicache_storage.py +4 -1
  154. sglang/srt/mem_cache/hiradix_cache.py +16 -3
  155. sglang/srt/mem_cache/mamba_radix_cache.py +995 -0
  156. sglang/srt/mem_cache/memory_pool.py +199 -96
  157. sglang/srt/mem_cache/radix_cache.py +99 -20
  158. sglang/srt/mem_cache/radix_cache_cpp.py +19 -14
  159. sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +8 -2
  160. sglang/srt/mem_cache/storage/backend_factory.py +2 -2
  161. sglang/srt/mem_cache/storage/eic/eic_storage.py +3 -1
  162. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +9 -3
  163. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +4 -2
  164. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +44 -17
  165. sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +37 -7
  166. sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +16 -1
  167. sglang/srt/mem_cache/swa_radix_cache.py +92 -26
  168. sglang/srt/metrics/collector.py +18 -0
  169. sglang/srt/model_executor/cuda_graph_runner.py +4 -3
  170. sglang/srt/model_executor/forward_batch_info.py +13 -21
  171. sglang/srt/model_executor/model_runner.py +261 -113
  172. sglang/srt/model_executor/piecewise_cuda_graph_runner.py +527 -0
  173. sglang/srt/model_loader/__init__.py +1 -1
  174. sglang/srt/model_loader/loader.py +277 -21
  175. sglang/srt/model_loader/weight_utils.py +3 -0
  176. sglang/srt/models/apertus.py +2 -3
  177. sglang/srt/models/arcee.py +2 -2
  178. sglang/srt/models/bailing_moe.py +8 -8
  179. sglang/srt/models/bailing_moe_nextn.py +3 -4
  180. sglang/srt/models/deepseek_nextn.py +2 -2
  181. sglang/srt/models/deepseek_v2.py +49 -32
  182. sglang/srt/models/dots_vlm_vit.py +1 -1
  183. sglang/srt/models/falcon_h1.py +13 -18
  184. sglang/srt/models/gemma3_mm.py +16 -0
  185. sglang/srt/models/glm4_moe.py +8 -12
  186. sglang/srt/models/glm4_moe_nextn.py +2 -2
  187. sglang/srt/models/glm4v.py +1 -1
  188. sglang/srt/models/glm4v_moe.py +5 -5
  189. sglang/srt/models/gpt_oss.py +4 -4
  190. sglang/srt/models/grok.py +10 -23
  191. sglang/srt/models/kimi_vl.py +1 -7
  192. sglang/srt/models/kimi_vl_moonvit.py +3 -1
  193. sglang/srt/models/llama.py +2 -2
  194. sglang/srt/models/longcat_flash.py +3 -7
  195. sglang/srt/models/minicpmo.py +7 -2
  196. sglang/srt/models/mixtral.py +1 -3
  197. sglang/srt/models/mllama4.py +13 -3
  198. sglang/srt/models/nemotron_h.py +514 -0
  199. sglang/srt/models/qwen2_5_vl.py +1 -1
  200. sglang/srt/models/qwen2_moe.py +4 -4
  201. sglang/srt/models/qwen2_vl.py +1 -1
  202. sglang/srt/models/qwen3_moe.py +4 -4
  203. sglang/srt/models/qwen3_next.py +2 -2
  204. sglang/srt/models/qwen3_next_mtp.py +3 -4
  205. sglang/srt/models/qwen3_vl.py +9 -10
  206. sglang/srt/models/qwen3_vl_moe.py +6 -15
  207. sglang/srt/models/step3_vl.py +2 -3
  208. sglang/srt/models/utils.py +5 -1
  209. sglang/srt/sampling/custom_logit_processor.py +2 -1
  210. sglang/srt/sampling/sampling_batch_info.py +17 -22
  211. sglang/srt/sampling/sampling_params.py +70 -2
  212. sglang/srt/server_args.py +345 -64
  213. sglang/srt/single_batch_overlap.py +0 -1
  214. sglang/srt/speculative/draft_utils.py +210 -0
  215. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +3 -6
  216. sglang/srt/speculative/eagle_info.py +53 -17
  217. sglang/srt/speculative/eagle_info_v2.py +404 -0
  218. sglang/srt/speculative/eagle_utils.py +138 -0
  219. sglang/srt/speculative/eagle_worker.py +65 -235
  220. sglang/srt/speculative/eagle_worker_v2.py +484 -0
  221. sglang/srt/speculative/{ngram_utils.py → ngram_info.py} +14 -9
  222. sglang/srt/speculative/ngram_worker.py +12 -11
  223. sglang/srt/speculative/spec_utils.py +1 -2
  224. sglang/srt/tokenizer/tiktoken_tokenizer.py +2 -2
  225. sglang/srt/two_batch_overlap.py +23 -13
  226. sglang/srt/utils/__init__.py +1 -1
  227. sglang/srt/{bench_utils.py → utils/bench_utils.py} +4 -2
  228. sglang/srt/utils/common.py +94 -23
  229. sglang/srt/utils/hf_transformers_utils.py +4 -1
  230. sglang/srt/{offloader.py → utils/offloader.py} +4 -4
  231. sglang/srt/utils/profile_merger.py +199 -0
  232. sglang/test/longbench_v2/__init__.py +1 -0
  233. sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
  234. sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
  235. sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
  236. sglang/test/run_eval.py +41 -0
  237. sglang/test/runners.py +2 -0
  238. sglang/test/simple_eval_common.py +3 -0
  239. sglang/test/simple_eval_longbench_v2.py +344 -0
  240. sglang/test/test_cutlass_moe.py +1 -1
  241. sglang/test/test_cutlass_w4a8_moe.py +9 -19
  242. sglang/test/test_deterministic.py +20 -7
  243. sglang/test/test_deterministic_utils.py +81 -0
  244. sglang/test/test_disaggregation_utils.py +63 -0
  245. sglang/test/test_utils.py +37 -17
  246. sglang/version.py +1 -1
  247. {sglang-0.5.3.dist-info → sglang-0.5.3.post2.dist-info}/METADATA +26 -14
  248. {sglang-0.5.3.dist-info → sglang-0.5.3.post2.dist-info}/RECORD +254 -220
  249. sglang/srt/layers/attention/mamba/mamba_utils.py +0 -81
  250. sglang/srt/managers/tp_worker_overlap_thread.py +0 -311
  251. sglang/srt/speculative/build_eagle_tree.py +0 -427
  252. sglang/test/test_block_fp8_ep.py +0 -358
  253. /sglang/srt/{aio_rwlock.py → utils/aio_rwlock.py} +0 -0
  254. /sglang/srt/{host_shared_memory.py → utils/host_shared_memory.py} +0 -0
  255. /sglang/srt/{torch_memory_saver_adapter.py → utils/torch_memory_saver_adapter.py} +0 -0
  256. {sglang-0.5.3.dist-info → sglang-0.5.3.post2.dist-info}/WHEEL +0 -0
  257. {sglang-0.5.3.dist-info → sglang-0.5.3.post2.dist-info}/licenses/LICENSE +0 -0
  258. {sglang-0.5.3.dist-info → sglang-0.5.3.post2.dist-info}/top_level.txt +0 -0
sglang/bench_one_batch.py CHANGED
@@ -51,6 +51,7 @@ import logging
51
51
  import multiprocessing
52
52
  import os
53
53
  import time
54
+ from types import SimpleNamespace
54
55
  from typing import Tuple
55
56
 
56
57
  import numpy as np
@@ -204,7 +205,6 @@ def prepare_inputs_for_correctness_test(bench_args, tokenizer, custom_prompts):
204
205
  origin_input_ids=tmp_input_ids,
205
206
  sampling_params=sampling_params,
206
207
  )
207
- req.prefix_indices = []
208
208
  req.fill_ids = req.origin_input_ids
209
209
  req.extend_input_len = len(req.fill_ids) - len(req.prefix_indices)
210
210
  req.logprob_start_len = len(req.origin_input_ids) - 1
@@ -248,7 +248,6 @@ def prepare_synthetic_inputs_for_latency_test(
248
248
  origin_input_ids=list(input_ids[i]),
249
249
  sampling_params=sampling_params,
250
250
  )
251
- req.prefix_indices = []
252
251
  req.fill_ids = req.origin_input_ids
253
252
  req.extend_input_len = len(req.fill_ids) - len(req.prefix_indices)
254
253
  req.logprob_start_len = len(req.origin_input_ids) - 1
@@ -259,11 +258,18 @@ def prepare_synthetic_inputs_for_latency_test(
259
258
 
260
259
  @torch.no_grad
261
260
  def extend(reqs, model_runner):
261
+ # Create dummy tree_cache for benchmarks (no prefix caching, just allocation)
262
+ dummy_tree_cache = SimpleNamespace(
263
+ page_size=1,
264
+ device=model_runner.device,
265
+ token_to_kv_pool_allocator=model_runner.token_to_kv_pool_allocator,
266
+ )
267
+
262
268
  batch = ScheduleBatch.init_new(
263
269
  reqs=reqs,
264
270
  req_to_token_pool=model_runner.req_to_token_pool,
265
271
  token_to_kv_pool_allocator=model_runner.token_to_kv_pool_allocator,
266
- tree_cache=None,
272
+ tree_cache=dummy_tree_cache,
267
273
  model_config=model_runner.model_config,
268
274
  enable_overlap=False,
269
275
  spec_algorithm=SpeculativeAlgorithm.NONE,
@@ -512,7 +518,9 @@ def latency_test(
512
518
 
513
519
  # Set CPU affinity
514
520
  if get_bool_env_var("SGLANG_SET_CPU_AFFINITY"):
515
- set_gpu_proc_affinity(server_args.tp_size, server_args.nnodes, tp_rank)
521
+ set_gpu_proc_affinity(
522
+ server_args.pp_size, server_args.tp_size, server_args.nnodes, tp_rank
523
+ )
516
524
 
517
525
  # Configure the logger
518
526
  configure_logger(server_args, prefix=f" TP{tp_rank}")
sglang/bench_one_batch_server.py CHANGED
@@ -25,8 +25,10 @@ from typing import List, Optional, Tuple
25
25
  import numpy as np
26
26
  import requests
27
27
  from pydantic import BaseModel
28
+ from transformers import AutoProcessor, PreTrainedTokenizer
28
29
 
29
30
  from sglang.bench_serving import (
31
+ get_processor,
30
32
  get_tokenizer,
31
33
  sample_mmmu_requests,
32
34
  sample_random_requests,
@@ -104,8 +106,14 @@ Note: To view the traces through perfetto-ui, please:
104
106
  if self.profile_links.extend or self.profile_links.decode:
105
107
  # Create a combined link or use the first available one
106
108
  trace_files = [self.profile_links.extend, self.profile_links.decode]
109
+ if any(trace_file is None for trace_file in trace_files):
110
+ logger.error("Some trace files are None", f"{trace_files=}")
107
111
  trace_files_relay_links = [
108
- f"[trace]({get_perfetto_relay_link_from_trace_file(trace_file)})"
112
+ (
113
+ f"[trace]({get_perfetto_relay_link_from_trace_file(trace_file)})"
114
+ if trace_file
115
+ else "N/A"
116
+ )
109
117
  for trace_file in trace_files
110
118
  ]
111
119
 
@@ -114,30 +122,31 @@ Note: To view the traces through perfetto-ui, please:
114
122
  # Build the row
115
123
  return f"| {self.batch_size} | {self.input_len} | {self.latency:.2f} | {self.input_throughput:.2f} | {self.output_throughput:.2f} | {accept_length} | {itl:.2f} | {input_cost:.2f} | {output_cost:.2f} | {profile_link} |\n"
116
124
 
117
- @classmethod
118
- def generate_markdown_report(
119
- cls, trace_dir, results: List["BenchmarkResult"]
120
- ) -> str:
121
- """Generate a markdown report from a list of BenchmarkResult object from a single run."""
122
- import os
123
125
 
124
- summary = f"### {results[0].model_path}\n"
126
+ def generate_markdown_report(trace_dir, results: List["BenchmarkResult"]) -> str:
127
+ """Generate a markdown report from a list of BenchmarkResult object from a single run."""
128
+ import os
129
+
130
+ summary = f"### {results[0].model_path}\n"
125
131
 
126
- # summary += (
127
- # f"Input lens: {result.input_len}. Output lens: {result.output_len}.\n"
128
- # )
129
- summary += "| batch size | input len | latency (s) | input throughput (tok/s) | output throughput (tok/s) | acc length | ITL (ms) | input cost ($/1M) | output cost ($/1M) | profile (extend) | profile (decode)|\n"
130
- summary += "| ---------- | --------- | ----------- | ------------------------- | ------------------------- | ---------- | -------- | ----------------- | ------------------ | --------------- | -------------- |\n"
132
+ # summary += (
133
+ # f"Input lens: {result.input_len}. Output lens: {result.output_len}.\n"
134
+ # )
135
+ summary += "| batch size | input len | latency (s) | input throughput (tok/s) | output throughput (tok/s) | acc length | ITL (ms) | input cost ($/1M) | output cost ($/1M) | profile (extend) | profile (decode)|\n"
136
+ summary += "| ---------- | --------- | ----------- | ------------------------- | ------------------------- | ---------- | -------- | ----------------- | ------------------ | --------------- | -------------- |\n"
131
137
 
132
- # all results should share the same isl & osl
133
- for result in results:
134
- base_url = os.getenv("TRACE_BASE_URL", "").rstrip("/")
135
- relay_base = os.getenv("PERFETTO_RELAY_URL", "").rstrip("/")
136
- relay_base = "https://docs.sglang.ai/ci-data/pages/perfetto_relay.html"
137
- # base_url = "https://github.com/sgl-project/ci-data/traces"
138
- summary += result.to_markdown_row(trace_dir, base_url, relay_base)
138
+ # all results should share the same isl & osl
139
+ for result in results:
140
+ base_url = os.getenv(
141
+ "TRACE_BASE_URL", "https://github.com/sgl-project/ci-data/traces"
142
+ ).rstrip("/")
143
+ relay_base = os.getenv(
144
+ "PERFETTO_RELAY_URL",
145
+ "https://docs.sglang.ai/ci-data/pages/perfetto_relay.html",
146
+ ).rstrip("/")
147
+ summary += result.to_markdown_row(trace_dir, base_url, relay_base)
139
148
 
140
- return summary
149
+ return summary
141
150
 
142
151
 
143
152
  @dataclasses.dataclass
@@ -288,7 +297,7 @@ def run_one_case(
288
297
  input_len_step_percentage: float,
289
298
  run_name: str,
290
299
  result_filename: str,
291
- tokenizer,
300
+ tokenizer: PreTrainedTokenizer | AutoProcessor,
292
301
  dataset_name="",
293
302
  profile: bool = False,
294
303
  profile_steps: int = 3,
@@ -302,9 +311,8 @@ def run_one_case(
302
311
  if dataset_name == "mmmu":
303
312
  input_requests = sample_mmmu_requests(
304
313
  num_requests=batch_size,
305
- tokenizer=tokenizer,
314
+ processor=tokenizer,
306
315
  fixed_output_len=output_len,
307
- apply_chat_template=True,
308
316
  random_sample=False,
309
317
  )
310
318
  elif dataset_name == "random":
@@ -364,6 +372,8 @@ def run_one_case(
364
372
  if dataset_name == "mmmu":
365
373
  # vlm
366
374
  input_ids = []
375
+ # for vlms, tokenizer is an instance of AutoProcessor
376
+ tokenizer = tokenizer.tokenizer
367
377
  for input_req in input_requests:
368
378
  input_ids += [tokenizer.encode(input_req.prompt)]
369
379
  payload["image_data"] = [req.image_data for req in input_requests]
@@ -609,7 +619,12 @@ def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
609
619
  tokenizer_path = server_info["tokenizer_path"]
610
620
  elif "prefill" in server_info:
611
621
  tokenizer_path = server_info["prefill"][0]["tokenizer_path"]
612
- tokenizer = get_tokenizer(tokenizer_path)
622
+
623
+ if bench_args.dataset_name == "mmmu":
624
+ # mmmu implies this is a MLLM
625
+ tokenizer = get_processor(tokenizer_path)
626
+ else:
627
+ tokenizer = get_tokenizer(tokenizer_path)
613
628
 
614
629
  # warmup
615
630
  if not bench_args.skip_warmup: