sglang 0.5.3rc2__py3-none-any.whl → 0.5.4.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +47 -28
 - sglang/bench_one_batch_server.py +41 -25
 - sglang/bench_serving.py +378 -160
 - sglang/check_env.py +1 -1
 - sglang/compile_deep_gemm.py +6 -2
 - sglang/global_config.py +1 -25
 - sglang/lang/api.py +6 -0
 - sglang/lang/interpreter.py +1 -0
 - sglang/lang/ir.py +13 -0
 - sglang/launch_server.py +10 -15
 - sglang/profiler.py +18 -1
 - sglang/srt/_custom_ops.py +1 -1
 - sglang/srt/batch_invariant_ops/batch_invariant_ops.py +105 -10
 - sglang/srt/checkpoint_engine/checkpoint_engine_worker.py +142 -0
 - sglang/srt/compilation/backend.py +437 -0
 - sglang/srt/compilation/compilation_config.py +20 -0
 - sglang/srt/compilation/compilation_counter.py +47 -0
 - sglang/srt/compilation/compile.py +210 -0
 - sglang/srt/compilation/compiler_interface.py +503 -0
 - sglang/srt/compilation/cuda_piecewise_backend.py +228 -0
 - sglang/srt/compilation/fix_functionalization.py +134 -0
 - sglang/srt/compilation/fx_utils.py +83 -0
 - sglang/srt/compilation/inductor_pass.py +140 -0
 - sglang/srt/compilation/pass_manager.py +66 -0
 - sglang/srt/compilation/piecewise_context_manager.py +40 -0
 - sglang/srt/compilation/weak_ref_tensor_jit.py +16 -0
 - sglang/srt/configs/__init__.py +4 -0
 - sglang/srt/configs/deepseek_ocr.py +262 -0
 - sglang/srt/configs/deepseekvl2.py +194 -96
 - sglang/srt/configs/dots_vlm.py +2 -7
 - sglang/srt/configs/falcon_h1.py +13 -64
 - sglang/srt/configs/load_config.py +25 -2
 - sglang/srt/configs/mamba_utils.py +117 -0
 - sglang/srt/configs/model_config.py +136 -25
 - sglang/srt/configs/modelopt_config.py +30 -0
 - sglang/srt/configs/nemotron_h.py +286 -0
 - sglang/srt/configs/olmo3.py +105 -0
 - sglang/srt/configs/points_v15_chat.py +29 -0
 - sglang/srt/configs/qwen3_next.py +11 -47
 - sglang/srt/configs/qwen3_omni.py +613 -0
 - sglang/srt/configs/qwen3_vl.py +0 -10
 - sglang/srt/connector/remote_instance.py +1 -1
 - sglang/srt/constrained/base_grammar_backend.py +5 -1
 - sglang/srt/constrained/llguidance_backend.py +5 -0
 - sglang/srt/constrained/outlines_backend.py +1 -1
 - sglang/srt/constrained/reasoner_grammar_backend.py +9 -6
 - sglang/srt/constrained/utils.py +12 -0
 - sglang/srt/constrained/xgrammar_backend.py +20 -11
 - sglang/srt/disaggregation/ascend/transfer_engine.py +1 -1
 - sglang/srt/disaggregation/base/conn.py +17 -4
 - sglang/srt/disaggregation/common/conn.py +4 -2
 - sglang/srt/disaggregation/decode.py +123 -31
 - sglang/srt/disaggregation/decode_kvcache_offload_manager.py +1 -1
 - sglang/srt/disaggregation/fake/conn.py +11 -3
 - sglang/srt/disaggregation/mooncake/conn.py +157 -19
 - sglang/srt/disaggregation/nixl/conn.py +69 -24
 - sglang/srt/disaggregation/prefill.py +96 -270
 - sglang/srt/distributed/device_communicators/all_reduce_utils.py +4 -4
 - sglang/srt/distributed/device_communicators/custom_all_reduce.py +6 -6
 - sglang/srt/distributed/device_communicators/pymscclpp.py +2 -2
 - sglang/srt/distributed/device_communicators/pynccl.py +24 -12
 - sglang/srt/distributed/device_communicators/pynccl_allocator.py +2 -2
 - sglang/srt/distributed/device_communicators/symm_mem.py +1 -1
 - sglang/srt/distributed/naive_distributed.py +5 -4
 - sglang/srt/distributed/parallel_state.py +63 -19
 - sglang/srt/elastic_ep/elastic_ep.py +74 -0
 - sglang/srt/entrypoints/context.py +3 -2
 - sglang/srt/entrypoints/engine.py +83 -80
 - sglang/srt/entrypoints/grpc_server.py +430 -234
 - sglang/srt/entrypoints/harmony_utils.py +2 -2
 - sglang/srt/entrypoints/http_server.py +195 -102
 - sglang/srt/entrypoints/http_server_engine.py +1 -7
 - sglang/srt/entrypoints/openai/protocol.py +225 -37
 - sglang/srt/entrypoints/openai/serving_base.py +49 -2
 - sglang/srt/entrypoints/openai/serving_chat.py +29 -74
 - sglang/srt/entrypoints/openai/serving_classify.py +204 -0
 - sglang/srt/entrypoints/openai/serving_completions.py +15 -1
 - sglang/srt/entrypoints/openai/serving_responses.py +5 -2
 - sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
 - sglang/srt/environ.py +58 -6
 - sglang/srt/eplb/eplb_algorithms/__init__.py +18 -1
 - sglang/srt/eplb/eplb_algorithms/deepseek.py +0 -2
 - sglang/srt/eplb/eplb_algorithms/elasticity_aware.py +87 -0
 - sglang/srt/eplb/expert_distribution.py +33 -4
 - sglang/srt/eplb/expert_location_dispatch.py +2 -2
 - sglang/srt/eplb/expert_location_updater.py +2 -2
 - sglang/srt/function_call/base_format_detector.py +17 -18
 - sglang/srt/function_call/function_call_parser.py +20 -14
 - sglang/srt/function_call/glm4_moe_detector.py +1 -5
 - sglang/srt/function_call/gpt_oss_detector.py +1 -1
 - sglang/srt/function_call/json_array_parser.py +0 -2
 - sglang/srt/function_call/minimax_m2.py +367 -0
 - sglang/srt/function_call/utils.py +2 -2
 - sglang/srt/grpc/compile_proto.py +3 -3
 - sglang/srt/{entrypoints → grpc}/grpc_request_manager.py +112 -52
 - sglang/srt/grpc/health_servicer.py +189 -0
 - sglang/srt/grpc/scheduler_launcher.py +181 -0
 - sglang/srt/grpc/sglang_scheduler_pb2.py +78 -70
 - sglang/srt/grpc/sglang_scheduler_pb2.pyi +66 -10
 - sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +89 -1
 - sglang/srt/layers/activation.py +10 -1
 - sglang/srt/layers/attention/aiter_backend.py +3 -3
 - sglang/srt/layers/attention/ascend_backend.py +17 -1
 - sglang/srt/layers/attention/attention_registry.py +43 -23
 - sglang/srt/layers/attention/base_attn_backend.py +20 -1
 - sglang/srt/layers/attention/double_sparsity_backend.py +2 -2
 - sglang/srt/layers/attention/fla/chunk.py +0 -1
 - sglang/srt/layers/attention/fla/chunk_o.py +1 -1
 - sglang/srt/layers/attention/fla/index.py +0 -2
 - sglang/srt/layers/attention/fla/layernorm_gated.py +50 -32
 - sglang/srt/layers/attention/fla/utils.py +0 -3
 - sglang/srt/layers/attention/fla/wy_fast.py +0 -2
 - sglang/srt/layers/attention/flashattention_backend.py +24 -10
 - sglang/srt/layers/attention/flashinfer_backend.py +258 -22
 - sglang/srt/layers/attention/flashinfer_mla_backend.py +38 -28
 - sglang/srt/layers/attention/flashmla_backend.py +2 -2
 - sglang/srt/layers/attention/hybrid_attn_backend.py +1 -1
 - sglang/srt/layers/attention/hybrid_linear_attn_backend.py +165 -62
 - sglang/srt/layers/attention/intel_amx_backend.py +1 -1
 - sglang/srt/layers/attention/mamba/causal_conv1d.py +1 -1
 - sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +9 -5
 - sglang/srt/layers/attention/mamba/mamba.py +189 -241
 - sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
 - sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
 - sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +0 -50
 - sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +0 -60
 - sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +0 -111
 - sglang/srt/layers/attention/mamba/ops/ssd_combined.py +0 -1
 - sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +0 -11
 - sglang/srt/layers/attention/npu_ops/mla_preprocess.py +1 -1
 - sglang/srt/layers/attention/nsa/nsa_indexer.py +40 -83
 - sglang/srt/layers/attention/nsa/triton_kernel.py +136 -0
 - sglang/srt/layers/attention/nsa/utils.py +0 -1
 - sglang/srt/layers/attention/nsa_backend.py +404 -90
 - sglang/srt/layers/attention/triton_backend.py +208 -34
 - sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +2 -2
 - sglang/srt/layers/attention/triton_ops/extend_attention.py +539 -44
 - sglang/srt/layers/attention/trtllm_mha_backend.py +2 -2
 - sglang/srt/layers/attention/trtllm_mla_backend.py +362 -43
 - sglang/srt/layers/attention/utils.py +89 -7
 - sglang/srt/layers/attention/vision.py +3 -3
 - sglang/srt/layers/attention/xpu_backend.py +1028 -0
 - sglang/srt/layers/communicator.py +12 -7
 - sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/compile_utils.py +5 -9
 - sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/configurer.py +4 -3
 - sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/entrypoint.py +3 -3
 - sglang/srt/layers/dp_attention.py +17 -0
 - sglang/srt/layers/layernorm.py +64 -19
 - sglang/srt/layers/linear.py +9 -1
 - sglang/srt/layers/logits_processor.py +152 -17
 - sglang/srt/layers/modelopt_utils.py +11 -0
 - sglang/srt/layers/moe/cutlass_moe.py +0 -2
 - sglang/srt/layers/moe/cutlass_w4a8_moe.py +351 -21
 - sglang/srt/layers/moe/ep_moe/kernels.py +229 -457
 - sglang/srt/layers/moe/ep_moe/layer.py +154 -625
 - sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +1 -1
 - sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
 - sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_B200.json +146 -0
 - sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +11 -3
 - sglang/srt/layers/moe/fused_moe_triton/layer.py +79 -73
 - sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +25 -46
 - sglang/srt/layers/moe/moe_runner/deep_gemm.py +569 -0
 - sglang/srt/layers/moe/moe_runner/runner.py +6 -0
 - sglang/srt/layers/moe/moe_runner/triton.py +3 -1
 - sglang/srt/layers/moe/moe_runner/triton_kernels.py +194 -0
 - sglang/srt/layers/moe/rocm_moe_utils.py +0 -1
 - sglang/srt/layers/moe/router.py +51 -15
 - sglang/srt/layers/moe/token_dispatcher/__init__.py +14 -4
 - sglang/srt/layers/moe/token_dispatcher/base.py +12 -6
 - sglang/srt/layers/moe/token_dispatcher/deepep.py +127 -110
 - sglang/srt/layers/moe/token_dispatcher/mooncake.py +386 -0
 - sglang/srt/layers/moe/token_dispatcher/standard.py +46 -0
 - sglang/srt/layers/moe/topk.py +7 -6
 - sglang/srt/layers/moe/utils.py +20 -5
 - sglang/srt/layers/quantization/__init__.py +5 -58
 - sglang/srt/layers/quantization/awq.py +183 -9
 - sglang/srt/layers/quantization/awq_triton.py +29 -0
 - sglang/srt/layers/quantization/base_config.py +27 -1
 - sglang/srt/layers/quantization/compressed_tensors/__init__.py +7 -0
 - sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +20 -49
 - sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +421 -70
 - sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +3 -0
 - sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +4 -22
 - sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +339 -0
 - sglang/srt/layers/quantization/fp8.py +152 -81
 - sglang/srt/layers/quantization/fp8_kernel.py +55 -10
 - sglang/srt/layers/quantization/fp8_utils.py +42 -14
 - sglang/srt/layers/quantization/fpgemm_fp8.py +2 -3
 - sglang/srt/layers/quantization/gguf.py +566 -0
 - sglang/srt/layers/quantization/gptq.py +0 -1
 - sglang/srt/layers/quantization/int8_kernel.py +18 -2
 - sglang/srt/layers/quantization/marlin_utils.py +12 -0
 - sglang/srt/layers/quantization/modelopt_quant.py +125 -100
 - sglang/srt/layers/quantization/mxfp4.py +35 -68
 - sglang/srt/layers/quantization/petit.py +1 -1
 - sglang/srt/layers/quantization/quark/quark.py +3 -1
 - sglang/srt/layers/quantization/quark/quark_moe.py +3 -3
 - sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +0 -7
 - sglang/srt/layers/quantization/unquant.py +23 -48
 - sglang/srt/layers/quantization/utils.py +0 -1
 - sglang/srt/layers/quantization/w4afp8.py +87 -20
 - sglang/srt/layers/quantization/w8a8_int8.py +30 -24
 - sglang/srt/layers/radix_attention.py +62 -9
 - sglang/srt/layers/rotary_embedding.py +686 -17
 - sglang/srt/layers/sampler.py +47 -16
 - sglang/srt/layers/sparse_pooler.py +98 -0
 - sglang/srt/layers/utils.py +0 -1
 - sglang/srt/layers/vocab_parallel_embedding.py +4 -1
 - sglang/srt/lora/backend/triton_backend.py +0 -1
 - sglang/srt/lora/eviction_policy.py +139 -0
 - sglang/srt/lora/lora_manager.py +24 -9
 - sglang/srt/lora/lora_registry.py +1 -1
 - sglang/srt/lora/mem_pool.py +40 -16
 - sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +1 -1
 - sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +4 -2
 - sglang/srt/managers/cache_controller.py +48 -17
 - sglang/srt/managers/data_parallel_controller.py +146 -42
 - sglang/srt/managers/detokenizer_manager.py +40 -13
 - sglang/srt/managers/io_struct.py +69 -16
 - sglang/srt/managers/mm_utils.py +20 -18
 - sglang/srt/managers/multi_tokenizer_mixin.py +83 -82
 - sglang/srt/managers/overlap_utils.py +96 -19
 - sglang/srt/managers/schedule_batch.py +241 -511
 - sglang/srt/managers/schedule_policy.py +15 -2
 - sglang/srt/managers/scheduler.py +420 -514
 - sglang/srt/managers/scheduler_metrics_mixin.py +73 -18
 - sglang/srt/managers/scheduler_output_processor_mixin.py +317 -111
 - sglang/srt/managers/scheduler_pp_mixin.py +341 -0
 - sglang/srt/managers/scheduler_profiler_mixin.py +60 -14
 - sglang/srt/managers/scheduler_runtime_checker_mixin.py +217 -0
 - sglang/srt/managers/scheduler_update_weights_mixin.py +33 -14
 - sglang/srt/managers/tokenizer_communicator_mixin.py +71 -55
 - sglang/srt/managers/tokenizer_manager.py +375 -95
 - sglang/srt/managers/tp_worker.py +212 -161
 - sglang/srt/managers/utils.py +78 -2
 - sglang/srt/mem_cache/allocator.py +7 -2
 - sglang/srt/mem_cache/allocator_ascend.py +2 -2
 - sglang/srt/mem_cache/base_prefix_cache.py +2 -2
 - sglang/srt/mem_cache/chunk_cache.py +13 -2
 - sglang/srt/mem_cache/common.py +480 -0
 - sglang/srt/mem_cache/evict_policy.py +16 -1
 - sglang/srt/mem_cache/hicache_storage.py +11 -2
 - sglang/srt/mem_cache/hiradix_cache.py +16 -3
 - sglang/srt/mem_cache/mamba_radix_cache.py +993 -0
 - sglang/srt/mem_cache/memory_pool.py +517 -219
 - sglang/srt/mem_cache/memory_pool_host.py +0 -1
 - sglang/srt/mem_cache/multimodal_cache.py +0 -1
 - sglang/srt/mem_cache/radix_cache.py +53 -19
 - sglang/srt/mem_cache/radix_cache_cpp.py +19 -14
 - sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +8 -2
 - sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +1 -13
 - sglang/srt/mem_cache/storage/backend_factory.py +2 -2
 - sglang/srt/mem_cache/storage/eic/eic_storage.py +5 -6
 - sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +0 -1
 - sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +3 -2
 - sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +9 -3
 - sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +5 -3
 - sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +101 -17
 - sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +38 -9
 - sglang/srt/mem_cache/storage/nixl/nixl_utils.py +1 -1
 - sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +17 -2
 - sglang/srt/mem_cache/swa_radix_cache.py +92 -26
 - sglang/srt/metrics/collector.py +31 -0
 - sglang/srt/metrics/func_timer.py +1 -1
 - sglang/srt/model_executor/cuda_graph_runner.py +43 -5
 - sglang/srt/model_executor/forward_batch_info.py +71 -25
 - sglang/srt/model_executor/model_runner.py +362 -270
 - sglang/srt/model_executor/npu_graph_runner.py +2 -3
 - sglang/srt/model_executor/piecewise_cuda_graph_runner.py +549 -0
 - sglang/srt/model_loader/__init__.py +1 -1
 - sglang/srt/model_loader/loader.py +424 -27
 - sglang/srt/model_loader/utils.py +0 -1
 - sglang/srt/model_loader/weight_utils.py +47 -28
 - sglang/srt/models/apertus.py +2 -3
 - sglang/srt/models/arcee.py +2 -2
 - sglang/srt/models/bailing_moe.py +13 -52
 - sglang/srt/models/bailing_moe_nextn.py +3 -4
 - sglang/srt/models/bert.py +1 -1
 - sglang/srt/models/deepseek_nextn.py +19 -3
 - sglang/srt/models/deepseek_ocr.py +1516 -0
 - sglang/srt/models/deepseek_v2.py +418 -140
 - sglang/srt/models/dots_ocr.py +0 -2
 - sglang/srt/models/dots_vlm.py +0 -1
 - sglang/srt/models/dots_vlm_vit.py +1 -1
 - sglang/srt/models/falcon_h1.py +13 -19
 - sglang/srt/models/gemma3_mm.py +16 -0
 - sglang/srt/models/gemma3n_mm.py +1 -2
 - sglang/srt/models/glm4_moe.py +327 -382
 - sglang/srt/models/glm4_moe_nextn.py +6 -16
 - sglang/srt/models/glm4v.py +2 -1
 - sglang/srt/models/glm4v_moe.py +32 -199
 - sglang/srt/models/gpt_oss.py +5 -5
 - sglang/srt/models/grok.py +10 -23
 - sglang/srt/models/hunyuan.py +2 -7
 - sglang/srt/models/interns1.py +0 -1
 - sglang/srt/models/kimi_vl.py +1 -7
 - sglang/srt/models/kimi_vl_moonvit.py +3 -1
 - sglang/srt/models/llama.py +2 -2
 - sglang/srt/models/llama_eagle3.py +1 -1
 - sglang/srt/models/longcat_flash.py +5 -22
 - sglang/srt/models/longcat_flash_nextn.py +3 -14
 - sglang/srt/models/mimo.py +2 -13
 - sglang/srt/models/mimo_mtp.py +1 -2
 - sglang/srt/models/minicpmo.py +7 -5
 - sglang/srt/models/minimax_m2.py +922 -0
 - sglang/srt/models/mixtral.py +1 -4
 - sglang/srt/models/mllama.py +1 -1
 - sglang/srt/models/mllama4.py +13 -3
 - sglang/srt/models/nemotron_h.py +511 -0
 - sglang/srt/models/nvila.py +355 -0
 - sglang/srt/models/nvila_lite.py +184 -0
 - sglang/srt/models/olmo2.py +31 -4
 - sglang/srt/models/opt.py +5 -5
 - sglang/srt/models/phi.py +1 -1
 - sglang/srt/models/phi4mm.py +1 -1
 - sglang/srt/models/phimoe.py +0 -1
 - sglang/srt/models/pixtral.py +0 -3
 - sglang/srt/models/points_v15_chat.py +186 -0
 - sglang/srt/models/qwen.py +0 -1
 - sglang/srt/models/qwen2.py +22 -1
 - sglang/srt/models/qwen2_5_vl.py +3 -3
 - sglang/srt/models/qwen2_audio.py +2 -15
 - sglang/srt/models/qwen2_moe.py +15 -12
 - sglang/srt/models/qwen2_vl.py +5 -2
 - sglang/srt/models/qwen3.py +34 -4
 - sglang/srt/models/qwen3_moe.py +19 -37
 - sglang/srt/models/qwen3_next.py +7 -12
 - sglang/srt/models/qwen3_next_mtp.py +3 -4
 - sglang/srt/models/qwen3_omni_moe.py +661 -0
 - sglang/srt/models/qwen3_vl.py +37 -33
 - sglang/srt/models/qwen3_vl_moe.py +57 -185
 - sglang/srt/models/roberta.py +55 -3
 - sglang/srt/models/sarashina2_vision.py +0 -1
 - sglang/srt/models/step3_vl.py +3 -5
 - sglang/srt/models/utils.py +11 -1
 - sglang/srt/multimodal/processors/base_processor.py +7 -2
 - sglang/srt/multimodal/processors/deepseek_ocr.py +37 -0
 - sglang/srt/multimodal/processors/deepseek_vl_v2.py +0 -3
 - sglang/srt/multimodal/processors/dots_vlm.py +0 -1
 - sglang/srt/multimodal/processors/glm4v.py +2 -6
 - sglang/srt/multimodal/processors/internvl.py +0 -2
 - sglang/srt/multimodal/processors/janus_pro.py +0 -1
 - sglang/srt/multimodal/processors/mllama4.py +0 -8
 - sglang/srt/multimodal/processors/{vila.py → nvila.py} +32 -24
 - sglang/srt/multimodal/processors/phi4mm.py +0 -1
 - sglang/srt/multimodal/processors/points_v15_chat.py +52 -0
 - sglang/srt/multimodal/processors/qwen_vl.py +75 -16
 - sglang/srt/multimodal/processors/step3_vl.py +1 -1
 - sglang/srt/parser/conversation.py +41 -0
 - sglang/srt/parser/reasoning_parser.py +28 -2
 - sglang/srt/sampling/custom_logit_processor.py +77 -2
 - sglang/srt/sampling/sampling_batch_info.py +17 -22
 - sglang/srt/sampling/sampling_params.py +70 -2
 - sglang/srt/server_args.py +846 -163
 - sglang/srt/server_args_config_parser.py +1 -1
 - sglang/srt/single_batch_overlap.py +36 -31
 - sglang/srt/speculative/base_spec_worker.py +34 -0
 - sglang/srt/speculative/draft_utils.py +226 -0
 - sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +24 -7
 - sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +23 -2
 - sglang/srt/speculative/eagle_info.py +57 -18
 - sglang/srt/speculative/eagle_info_v2.py +458 -0
 - sglang/srt/speculative/eagle_utils.py +138 -0
 - sglang/srt/speculative/eagle_worker.py +83 -280
 - sglang/srt/speculative/eagle_worker_v2.py +702 -0
 - sglang/srt/speculative/{ngram_utils.py → ngram_info.py} +14 -9
 - sglang/srt/speculative/ngram_worker.py +12 -11
 - sglang/srt/speculative/spec_info.py +2 -0
 - sglang/srt/speculative/spec_utils.py +38 -3
 - sglang/srt/speculative/standalone_worker.py +4 -14
 - sglang/srt/tokenizer/tiktoken_tokenizer.py +2 -2
 - sglang/srt/two_batch_overlap.py +28 -14
 - sglang/srt/utils/__init__.py +1 -1
 - sglang/srt/{bench_utils.py → utils/bench_utils.py} +4 -2
 - sglang/srt/utils/common.py +272 -82
 - sglang/srt/utils/hf_transformers_utils.py +44 -17
 - sglang/srt/{host_shared_memory.py → utils/host_shared_memory.py} +0 -1
 - sglang/srt/{offloader.py → utils/offloader.py} +4 -4
 - sglang/srt/utils/profile_merger.py +199 -0
 - sglang/test/attention/test_flashattn_backend.py +1 -1
 - sglang/test/attention/test_flashattn_mla_backend.py +0 -1
 - sglang/test/attention/test_prefix_chunk_info.py +0 -2
 - sglang/test/attention/test_trtllm_mla_backend.py +221 -53
 - sglang/test/few_shot_gsm8k_engine.py +2 -4
 - sglang/test/kit_matched_stop.py +157 -0
 - sglang/test/longbench_v2/__init__.py +1 -0
 - sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
 - sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
 - sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
 - sglang/test/run_eval.py +41 -0
 - sglang/test/runners.py +2 -0
 - sglang/test/send_one.py +42 -7
 - sglang/test/simple_eval_common.py +3 -0
 - sglang/test/simple_eval_gpqa.py +0 -1
 - sglang/test/simple_eval_humaneval.py +0 -3
 - sglang/test/simple_eval_longbench_v2.py +344 -0
 - sglang/test/test_block_fp8.py +1 -2
 - sglang/test/test_block_fp8_deep_gemm_blackwell.py +0 -1
 - sglang/test/test_cutlass_moe.py +1 -2
 - sglang/test/test_cutlass_w4a8_moe.py +10 -20
 - sglang/test/test_deterministic.py +463 -107
 - sglang/test/test_deterministic_utils.py +74 -0
 - sglang/test/test_disaggregation_utils.py +81 -0
 - sglang/test/test_marlin_moe.py +0 -1
 - sglang/test/test_utils.py +85 -20
 - sglang/version.py +1 -1
 - {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/METADATA +48 -35
 - {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/RECORD +414 -350
 - sglang/srt/layers/attention/mamba/mamba_utils.py +0 -81
 - sglang/srt/managers/tp_worker_overlap_thread.py +0 -311
 - sglang/srt/models/vila.py +0 -306
 - sglang/srt/speculative/build_eagle_tree.py +0 -427
 - sglang/test/test_block_fp8_ep.py +0 -358
 - /sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/__init__.py +0 -0
 - /sglang/srt/{aio_rwlock.py → utils/aio_rwlock.py} +0 -0
 - /sglang/srt/{torch_memory_saver_adapter.py → utils/torch_memory_saver_adapter.py} +0 -0
 - {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/WHEEL +0 -0
 - {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/licenses/LICENSE +0 -0
 - {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/top_level.txt +0 -0
 
    
sglang/bench_serving.py  CHANGED

@@ -12,7 +12,6 @@ python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-pro
 
 import argparse
 import asyncio
-import base64
 import io
 import json
 import os
@@ -32,9 +31,13 @@ from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple, Union
 
 import aiohttp
 import numpy as np
+import pybase64
 import requests
+from datasets import load_dataset
+from PIL import Image
 from tqdm.asyncio import tqdm
 from transformers import (
+    AutoProcessor,
     AutoTokenizer,
     PreTrainedTokenizer,
     PreTrainedTokenizerBase,
@@ -85,6 +88,7 @@ class RequestFuncOutput:
     latency: float = 0.0
     ttft: float = 0.0  # Time to first token
     itl: List[float] = field(default_factory=list)  # List of inter-token latencies
+    text_chunks: List[str] = field(default_factory=list)
     prompt_len: int = 0
     error: str = ""
     output_len: int = 0
@@ -209,6 +213,11 @@ async def async_request_openai_completions(
             **request_func_input.extra_request_body,
         }
 
+        # hack to accommodate different LoRA conventions between SGLang and vLLM.
+        if request_func_input.lora_name:
+            payload["model"] = request_func_input.lora_name
+            payload["lora_path"] = request_func_input.lora_name
+
         if request_func_input.image_data:
             payload.update({"image_data": request_func_input.image_data})
 
@@ -250,6 +259,9 @@ async def async_request_openai_completions(
 
                             # Decoding phase
                             else:
+                                output.text_chunks.append(
+                                    data["choices"][0]["text"]
+                                )
                                 output.itl.append(timestamp - most_recent_timestamp)
 
                             most_recent_timestamp = timestamp
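Alongside each inter-token latency, the completions stream handler now also records the decoded text of every chunk, so the full generation can be reassembled after the stream ends. A one-line sketch of that post-processing:

    # Sketch: once streaming completes, the recorded chunks concatenate
    # back into the full generated text.
    generated_text = "".join(output.text_chunks)
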
@@ -322,10 +334,17 @@ async def async_request_openai_chat_completions(
             "model": request_func_input.model,
             "messages": messages,
             "temperature": 0.0,
-            "max_tokens": request_func_input.output_len,
+            "max_completion_tokens": request_func_input.output_len,
             "stream": not args.disable_stream,
+            "ignore_eos": not args.disable_ignore_eos,
             **request_func_input.extra_request_body,
         }
+
+        # hack to accommodate different LoRA conventions between SGLang and vLLM.
+        if request_func_input.lora_name:
+            payload["model"] = request_func_input.lora_name
+            payload["lora_path"] = request_func_input.lora_name
+
         headers = get_auth_headers()
 
         output = RequestFuncOutput.init_new(request_func_input)
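The chat payload moves from the deprecated `max_tokens` field to `max_completion_tokens`, gains an `ignore_eos` flag wired to the existing CLI switch, and mirrors the LoRA adapter name into both `model` and `lora_path`: SGLang selects adapters via `lora_path`, while vLLM's OpenAI-compatible server expects the adapter name as the `model` value, so setting both lets one payload work against either backend. A minimal sketch of the resulting payload shape, with illustrative values:

    # Illustrative payload only; "my-adapter" and 256 are made-up values.
    payload = {
        "model": "my-adapter",        # vLLM reads the adapter name from "model"
        "lora_path": "my-adapter",    # SGLang reads it from "lora_path"
        "messages": [{"role": "user", "content": "Hello"}],
        "temperature": 0.0,
        "max_completion_tokens": 256,
        "stream": True,
        "ignore_eos": True,
    }
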
@@ -559,9 +578,8 @@ async def async_request_sglang_generate(
                                 num_new_tokens = output_len - last_output_len
                                 if num_new_tokens == 0:
                                     continue
-                                adjust_itl = (
-                                    timestamp - most_recent_timestamp
-                                ) / num_new_tokens
+                                chunk_gap = timestamp - most_recent_timestamp
+                                adjust_itl = chunk_gap / num_new_tokens
                                 output.itl.extend([adjust_itl] * num_new_tokens)
 
                             most_recent_timestamp = timestamp
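When one streamed chunk carries several new tokens, the gap since the previous chunk is split evenly across them; the refactor only names the gap explicitly. A worked example with illustrative timings:

    # Illustrative numbers: a chunk arrives ~30 ms after the previous one
    # and carries 3 new tokens, so each token is charged ~10 ms of ITL.
    most_recent_timestamp = 1.000
    timestamp = 1.030
    num_new_tokens = 3
    chunk_gap = timestamp - most_recent_timestamp  # ~0.030 s
    adjust_itl = chunk_gap / num_new_tokens        # ~0.010 s per token
    itl_entries = [adjust_itl] * num_new_tokens    # three ~10 ms entries
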
@@ -610,6 +628,48 @@ async def async_request_profile(api_url: str) -> RequestFuncOutput:
     return output
 
 
+def _build_profile_urls(
+    profile_prefill_url: Optional[List[str]],
+    profile_decode_url: Optional[List[str]],
+) -> List[Tuple[str, str]]:
+    """Build profile URLs list from prefill/decode URL arguments.
+
+    Returns:
+        List of (worker_type, url) tuples. e.g., [("Prefill-0", "http://..."), ("Decode-0", "http://...")]
+    """
+    profile_urls = []
+    if profile_prefill_url:
+        for idx, url in enumerate(profile_prefill_url):
+            profile_urls.append((f"Prefill-{idx}", url))
+    if profile_decode_url:
+        for idx, url in enumerate(profile_decode_url):
+            profile_urls.append((f"Decode-{idx}", url))
+    return profile_urls
+
+
+async def _call_profile_pd(profile_urls: List[Tuple[str, str]], mode: str) -> None:
+    """Call profile endpoint (start/stop) on PD separated workers.
+
+    Args:
+        profile_urls: List of (worker_type, url) tuples
+        mode: "start" or "stop"
+    """
+    endpoint = "/start_profile" if mode == "start" else "/stop_profile"
+    action = "Starting" if mode == "start" else "Stopping"
+    action_past = "started" if mode == "start" else "stopped"
+
+    print(f"{action} profiler...")
+
+    for worker_type, url in profile_urls:
+        profile_output = await async_request_profile(api_url=url + endpoint)
+        if profile_output.success:
+            print(f"Profiler {action_past} for {worker_type} worker at {url}")
+        else:
+            print(
+                f"Failed to {mode} profiler for {worker_type} worker at {url}: {profile_output.error}"
+            )
+
+
 def get_model(pretrained_model_name_or_path: str) -> str:
     if os.getenv("SGLANG_USE_MODELSCOPE", "false").lower() == "true":
         import huggingface_hub.constants
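`_build_profile_urls` only pairs a label with each worker URL, and `_call_profile_pd` fans the start or stop call out to every labeled worker, which keeps profiling of prefill/decode (PD) disaggregated deployments out of the main benchmark loop. A hypothetical usage sketch with placeholder addresses:

    # Hypothetical PD setup; both URLs are placeholders.
    async def profile_around_benchmark():
        urls = _build_profile_urls(
            profile_prefill_url=["http://prefill-0:30000"],
            profile_decode_url=["http://decode-0:30001"],
        )
        # urls == [("Prefill-0", "http://prefill-0:30000"),
        #          ("Decode-0", "http://decode-0:30001")]
        await _call_profile_pd(urls, "start")  # hits each worker's /start_profile
        # ... run benchmark traffic here ...
        await _call_profile_pd(urls, "stop")   # hits each worker's /stop_profile
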
@@ -648,7 +708,30 @@ def get_tokenizer(
     )
 
 
-def get_dataset(args, tokenizer):
+def get_processor(
+    pretrained_model_name_or_path: str,
+) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
+    assert (
+        pretrained_model_name_or_path is not None
+        and pretrained_model_name_or_path != ""
+    )
+    if pretrained_model_name_or_path.endswith(
+        ".json"
+    ) or pretrained_model_name_or_path.endswith(".model"):
+        from sglang.srt.utils.hf_transformers_utils import get_processor
+
+        return get_processor(pretrained_model_name_or_path)
+
+    if pretrained_model_name_or_path is not None and not os.path.exists(
+        pretrained_model_name_or_path
+    ):
+        pretrained_model_name_or_path = get_model(pretrained_model_name_or_path)
+    return AutoProcessor.from_pretrained(
+        pretrained_model_name_or_path, trust_remote_code=True
+    )
+
+
+def get_dataset(args, tokenizer, model_id=None):
     tokenize_prompt = getattr(args, "tokenize_prompt", False)
     if args.dataset_name == "sharegpt":
         assert not tokenize_prompt
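`get_processor` mirrors the existing `get_tokenizer` helper but loads a HuggingFace `AutoProcessor`, and `get_dataset` now accepts the serving `model_id` so the image and mmmu branches can build that processor themselves. A hedged usage sketch with a hypothetical model id:

    # Hypothetical model id; any HF repo that ships a processor would do.
    processor = get_processor("Qwen/Qwen2.5-VL-7B-Instruct")
    token_ids = processor.tokenizer.encode("Question: What is shown?")
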
@@ -661,7 +744,7 @@ def get_dataset(args, tokenizer):
             prompt_suffix=args.prompt_suffix,
             apply_chat_template=args.apply_chat_template,
         )
-    elif args.dataset_name.startswith("random") and args.dataset_name != "random-image":
+    elif args.dataset_name.startswith("random"):
         input_requests = sample_random_requests(
             input_len=args.random_input_len,
             output_len=args.random_output_len,
@@ -672,17 +755,19 @@ def get_dataset(args, tokenizer):
             random_sample=args.dataset_name == "random",
             return_text=not tokenize_prompt,
         )
-    elif args.dataset_name == "random-image":
-        …
-        input_requests = sample_random_image_requests(
+    elif args.dataset_name == "image":
+        processor = get_processor(model_id)
+        input_requests = sample_image_requests(
             num_requests=args.num_prompts,
-            …
+            image_count=args.image_count,
             input_len=args.random_input_len,
             output_len=args.random_output_len,
             range_ratio=args.random_range_ratio,
-            …
-            …
-            …
+            processor=processor,
+            image_content=args.image_content,
+            image_format=args.image_format,
+            image_resolution=args.image_resolution,
+            backend=args.backend,
         )
     elif args.dataset_name == "generated-shared-prefix":
         assert not tokenize_prompt
@@ -696,12 +781,12 @@ def get_dataset(args, tokenizer):
             args=args,
         )
     elif args.dataset_name == "mmmu":
-        …
+        processor = get_processor(model_id)
         input_requests = sample_mmmu_requests(
             num_requests=args.num_prompts,
-            tokenizer=tokenizer,
+            processor=processor,
+            backend=args.backend,
             fixed_output_len=args.random_output_len,
-            apply_chat_template=args.apply_chat_template,
             random_sample=True,
         )
     elif args.dataset_name == "mooncake":
@@ -746,6 +831,8 @@ ASYNC_REQUEST_FUNCS = {
 class BenchmarkMetrics:
     completed: int
     total_input: int
+    total_input_text: int
+    total_input_vision: int
     total_output: int
     total_output_retokenized: int
     request_throughput: float
@@ -839,9 +926,17 @@ class DatasetRow:
     prompt: str
     prompt_len: int
     output_len: int
+    text_prompt_len: Optional[int] = None
+    vision_prompt_len: Optional[int] = None
     image_data: Optional[List[str]] = None
     timestamp: Optional[float] = None
 
+    def __post_init__(self):
+        if self.text_prompt_len is None:
+            self.text_prompt_len = self.prompt_len
+        if self.vision_prompt_len is None:
+            self.vision_prompt_len = 0
+
 
 async def get_mooncake_request_over_time(
     input_requests: List[Dict],
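`DatasetRow` gains split text/vision prompt lengths, and `__post_init__` backfills them so call sites that only set `prompt_len` keep working: the text length defaults to the full prompt length and the vision length to zero. For example:

    # Text-only row: the new fields are derived automatically.
    row = DatasetRow(prompt="Hello", prompt_len=5, output_len=16)
    assert row.text_prompt_len == 5    # defaults to prompt_len
    assert row.vision_prompt_len == 0  # no image tokens
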
@@ -889,7 +984,7 @@ async def get_mooncake_request_over_time(
         for i in range(num_rounds):
             # Add user query for the current round
             chat_history.append(
-                {"role": "user", "content": f"Round {i+1}: {user_query_base}"}
+                {"role": "user", "content": f"Round {i + 1}: {user_query_base}"}
             )
 
             # Form the full prompt from history
@@ -918,9 +1013,9 @@ async def get_mooncake_request_over_time(
 
 def sample_mmmu_requests(
     num_requests: int,
-    tokenizer: PreTrainedTokenizerBase,
+    processor: AutoProcessor | AutoTokenizer,
+    backend: str,
     fixed_output_len: Optional[int] = None,
-    apply_chat_template: bool = True,
     random_sample: bool = True,
 ) -> List[DatasetRow]:
     """
@@ -928,22 +1023,12 @@ def sample_mmmu_requests(
 
     Args:
         num_requests: Number of requests to sample.
-        tokenizer: Tokenizer to use for token counting.
         fixed_output_len: If provided, use this fixed output length for all requests.
-        apply_chat_template: Whether to apply the chat template to the prompt.
         random_sample: Whether to randomly sample or take the first N.
 
     Returns:
         List of tuples (prompt, prompt_token_len, output_token_len).
     """
-    try:
-        import io
-
-        import pybase64
-        from datasets import load_dataset
-    except ImportError:
-        raise ImportError("Please install datasets: pip install datasets")
-
     print("Loading MMMU dataset from HuggingFace...")
 
     try:
@@ -999,54 +1084,12 @@ def sample_mmmu_requests(
                 question = example.get("question")
 
                 # Construct the prompt
-                prompt = f"Question: {question}\n\nAnswer: "
-                if apply_chat_template:
-                    try:
-                        is_phi4_multimodal = (
-                            "phi-4-multimodal" in tokenizer.name_or_path.lower()
-                        )
-                        if is_phi4_multimodal:
-                            # <|endoftext10|> is the image token used in the phi-4-multimodal model.
-                            content = prompt.replace("image 1", "<|endoftext10|>")
-                        else:
-                            content = [
-                                {
-                                    "type": "image_url",
-                                    "image_url": {"url": image_data},
-                                },
-                                {"type": "text", "text": prompt},
-                            ]
-                        prompt = tokenizer.apply_chat_template(
-                            [
-                                {
-                                    "role": "user",
-                                    "content": content,
-                                }
-                            ],
-                            add_generation_prompt=True,
-                            tokenize=False,
-                        )
-                    except Exception as e:
-                        # Note (Xinyuan): This is a workaround for an issue where some tokenizers do not support content as a list. (e.g. InternVL)
-                        print(
-                            f"Error applying chat template: {e}, fallback to <image> tag"
-                        )
-                        prompt = f"<image>{prompt}"
-
-                # Calculate token lengths for text only (without image data)
-                prompt_token_ids = tokenizer.encode(prompt)
-                prompt_len = len(prompt_token_ids)
-
+                text_prompt = f"Question: {question}\n\nAnswer: "
                 output_len = fixed_output_len if fixed_output_len is not None else 256
-
-                filtered_dataset.append(
-                    DatasetRow(
-                        prompt=prompt,
-                        prompt_len=prompt_len,
-                        output_len=output_len,
-                        image_data=[image_data],
-                    )
+                data_row = create_mm_data_row(
+                    text_prompt, [image], [image_data], output_len, processor, backend
                 )
+                filtered_dataset.append(data_row)
 
             except Exception as e:
                 print(f"Error processing example {i}: {e}")
@@ -1134,7 +1177,11 @@ def sample_sharegpt_requests(
             continue
 
         filtered_dataset.append(
-            DatasetRow(prompt=prompt, prompt_len=prompt_len, output_len=output_len)
+            DatasetRow(
+                prompt=prompt,
+                prompt_len=prompt_len,
+                output_len=output_len,
+            )
         )
 
     print(f"#Input tokens: {np.sum([x.prompt_len for x in filtered_dataset])}")
@@ -1245,7 +1292,7 @@ def sample_random_requests(
     return input_requests
 
 
-def parse_random_image_resolution(image_resolution: str) -> Tuple[int, int]:
+def parse_image_resolution(image_resolution: str) -> Tuple[int, int]:
     """Parse image resolution into (width, height).
 
     Supports presets '1080p', '720p', '360p' and custom 'heightxwidth' format
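The rename drops the `random_` prefix now that the helper serves both blank and random images. A quick sanity check of the parsing contract, inferred only from the docstring and the ValueError text in the next hunk (the exact preset tuples are assumptions from the 4k/1080p/720p/360p list, not re-verified against the full source):

```python
# Hedged sketch: behavior inferred from the docstring and error message.
width, height = parse_image_resolution("720p")       # preset -> (1280, 720)
width, height = parse_image_resolution("1080x1920")  # custom input is 'heightxwidth',
                                                     # so this returns (1920, 1080)
parse_image_resolution("480p")                       # raises ValueError: unsupported
```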
@@ -1270,44 +1317,109 @@ def parse_random_image_resolution(image_resolution: str) -> Tuple[int, int]:
             return (width, height)
 
     raise ValueError(
-        f"Unsupported random image resolution: {image_resolution}. "
+        f"Unsupported image resolution: {image_resolution}. "
         "Choose from 4k, 1080p, 720p, 360p, or provide custom 'heightxwidth' (e.g., 1080x1920)."
     )
 
 
-def sample_random_image_requests(
+def create_mm_data_row(
+    text_prompt, images: list, images_base64, output_len, processor, backend
+):
+    try:
+        if type(processor).__name__ == "Phi4MMProcessor":
+            # <|endoftext10|> is the image token used in the phi-4-multimodal model.
+            content_items = text_prompt.replace("image 1", "<|endoftext10|>")
+        else:
+            content_items = [
+                {"type": "image", "image": {"url": image_base64}}
+                for image_base64 in images_base64
+            ]
+            content_items.append({"type": "text", "text": text_prompt})
+        prompt_str = processor.apply_chat_template(
+            [{"role": "user", "content": content_items}],
+            add_generation_prompt=True,
+            tokenize=False,
+        )
+    except Exception as e:
+        # Note (Xinyuan): This is a workaround for an issue where some tokenizers do not support content as a list. (e.g. InternVL)
+        print(f"Error applying chat template: {e}, fallback to <image> tag")
+        # Some tokenizers do not support list content; fall back to a placeholder in the text
+        prompt_str = f"<image>{text_prompt}"
+
+    # Calculate total tokens (text + vision)
+    prompt_len = processor(
+        text=[prompt_str],
+        images=images,
+        padding=False,
+        return_tensors="pt",
+    )["input_ids"].numel()
+
+    # Calculate text-only tokens
+    try:
+        # Create text-only version of the prompt
+        text_only_prompt = processor.apply_chat_template(
+            [{"role": "user", "content": text_prompt}],
+            add_generation_prompt=True,
+            tokenize=False,
+        )
+        text_prompt_len = processor(
+            text=[text_only_prompt],
+            padding=False,
+            return_tensors="pt",
+        )["input_ids"].numel()
+    except Exception:
+        # Fallback: just tokenize the text prompt directly
+        text_prompt_len = len(processor.tokenizer.encode(text_prompt))
+
+    # Vision tokens = total tokens - text tokens
+    vision_prompt_len = prompt_len - text_prompt_len
+
+    use_raw_prompt = backend in [
+        "sglang-oai",
+        "sglang-oai-chat",
+        "vllm",
+        "vllm-chat",
+        "lmdeploy",
+        "lmdeploy-chat",
+    ]
+    return DatasetRow(
+        prompt=text_prompt if use_raw_prompt else prompt_str,
+        prompt_len=prompt_len,
+        output_len=output_len,
+        text_prompt_len=text_prompt_len,
+        vision_prompt_len=vision_prompt_len,
+        image_data=images_base64,
+    )
+
+
+def sample_image_requests(
     num_requests: int,
-    num_images: int,
+    image_count: int,
     input_len: int,
     output_len: int,
     range_ratio: float,
-    tokenizer: PreTrainedTokenizerBase,
-    apply_chat_template: bool = True,
-    image_resolution: str = "1080p",
+    processor: AutoProcessor,
+    image_content: str,
+    image_format: str,
+    image_resolution: str,
+    backend: str,
 ) -> List[DatasetRow]:
-    """Generate requests with random images.
+    """Generate requests with images.
 
-    - Each request includes ``num_images`` random images.
+    - Each request includes ``image_count`` images.
     - Supported resolutions: 4k (3840x2160), 1080p (1920x1080), 720p (1280x720), 360p (640x360),
       or custom 'heightxwidth' (e.g., 1080x1920).
     - Text lengths follow the 'random' dataset sampling rule. ``prompt_len``
       only counts text tokens and excludes image data.
     """
-    try:
-        import pybase64
-        from PIL import Image
-    except ImportError as e:
-        raise ImportError(
-            "Please install Pillow to generate random images: pip install pillow"
-        ) from e
 
     # Parse resolution (supports presets and 'heightxwidth')
-    width, height = parse_random_image_resolution(image_resolution)
+    width, height = parse_image_resolution(image_resolution)
 
     # Check for potentially problematic combinations and warn user
-    if width * height >= 1920 * 1080 and num_images * num_requests >= 100:
+    if width * height >= 1920 * 1080 and image_count * num_requests >= 100:
         warnings.warn(
-            f"High resolution ({width}x{height}) with {num_images * num_requests} total images "
+            f"High resolution ({width}x{height}) with {image_count * num_requests} total images "
             f"may take a long time. Consider reducing resolution or image count.",
             UserWarning,
             stacklevel=2,
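The key idea in `create_mm_data_row` is measuring vision tokens by differencing: run the processor once with images and once text-only, then subtract. A minimal sketch of driving the helper directly, assuming it is in scope; the checkpoint name and the throwaway image are illustrative stand-ins, not taken from this diff:

```python
import io

import pybase64
from PIL import Image
from transformers import AutoProcessor

# Any HF vision-language processor works here; this checkpoint is only an example.
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")

# Build one tiny white image plus its data-URI form, mirroring what the
# image dataset generator below produces.
img = Image.new("RGB", (64, 64), "white")
buf = io.BytesIO()
img.save(buf, format="JPEG")
b64 = "data:image/jpeg;base64," + pybase64.b64encode(buf.getvalue()).decode("utf-8")

row = create_mm_data_row("Describe the image.", [img], [b64], 128, processor, "sglang")
# row.prompt_len counts text + vision tokens; row.vision_prompt_len is the
# difference between the processor run with and without the image.
assert row.prompt_len == row.text_prompt_len + row.vision_prompt_len
```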
@@ -1321,53 +1433,51 @@ def sample_random_image_requests(
         int(output_len * range_ratio), output_len + 1, size=num_requests
     )
 
-    def _gen_random_image_data_uri(width: int = width, height: int = height) -> str:
-        arr = (np.random.rand(height, width, 3) * 255).astype(np.uint8)
-        img = Image.fromarray(arr)
+    def _gen_random_image_data_uri(
+        width: int = width, height: int = height
+    ) -> (Image, str, int):
+        if image_content == "blank":
+            # Generate blank white image
+            arr = np.full((height, width, 3), 255, dtype=np.uint8)
+        else:
+            # Generate random colored image
+            arr = (np.random.rand(height, width, 3) * 255).astype(np.uint8)
+        img = Image.fromarray(arr)
         buf = io.BytesIO()
-        img.save(buf, format="JPEG", quality=85)
+        img.save(buf, format=image_format, quality=85)
         encoded = pybase64.b64encode(buf.getvalue()).decode("utf-8")
-        return f"data:image/jpeg;base64,{encoded}"
+        image_data = f"data:image/{image_format};base64,{encoded}"
+        image_bytes = len(image_data.encode("utf-8"))
+        return img, image_data, image_bytes
 
     dataset: List[DatasetRow] = []
+    total_image_bytes = 0
     for i in range(num_requests):
         # Generate text prompt
-        text_prompt = gen_prompt(tokenizer, int(input_lens[i]))
+        text_prompt = gen_prompt(processor.tokenizer, int(input_lens[i]))
 
         # Generate image list
-        images = [_gen_random_image_data_uri() for _ in range(num_images)]
-
-        prompt_str = text_prompt
-        if apply_chat_template:
-            try:
-                content_items = [
-                    {"type": "image_url", "image_url": {"url": image_url}}
-                    for image_url in images
-                ]
-                content_items.append({"type": "text", "text": text_prompt})
-                prompt_str = tokenizer.apply_chat_template(
-                    [{"role": "user", "content": content_items}],
-                    add_generation_prompt=True,
-                    tokenize=False,
-                )
-            except Exception:
-                # Some tokenizers do not support list content; fall back to a placeholder in the text
-                prompt_str = f"<image>{text_prompt}"
-
-        prompt_token_ids = tokenizer.encode(prompt_str)
-        prompt_token_len = len(prompt_token_ids)
-
-        dataset.append(
-            DatasetRow(
-                prompt=prompt_str,
-                prompt_len=prompt_token_len,
-                output_len=int(output_lens[i]),
-                image_data=images,
-            )
+        images, images_base64, images_bytes = zip(
+            *[_gen_random_image_data_uri() for _ in range(image_count)]
+        )
+        total_image_bytes += sum(list(images_bytes))
+
+        data_row = create_mm_data_row(
+            text_prompt,
+            list(images),
+            list(images_base64),
+            int(output_lens[i]),
+            processor,
+            backend,
         )
 
+        dataset.append(data_row)
+
     print(f"#Input tokens: {np.sum([x.prompt_len for x in dataset])}")
     print(f"#Output tokens: {np.sum([x.output_len for x in dataset])}")
+    print(
+        f"\nCreated {len(dataset)} {image_content} {image_format} images with average {total_image_bytes // num_requests} bytes per request"
+    )
     return dataset
 
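Putting the reworked generator together, a hedged usage sketch; the argument names come from the new signature above, while the processor checkpoint is again an arbitrary example:

```python
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")  # example checkpoint
rows = sample_image_requests(
    num_requests=4,
    image_count=2,           # two images attached to every request
    input_len=64,
    output_len=32,
    range_ratio=0.5,
    processor=processor,
    image_content="blank",   # "blank" -> white images; anything else -> random noise
    image_format="jpeg",
    image_resolution="360p",
    backend="sglang",
)
```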
@@ -1439,7 +1549,9 @@ def sample_generated_shared_prefix_requests(
 
             input_requests.append(
                 DatasetRow(
-                    prompt=full_prompt, prompt_len=prompt_len, output_len=output_len
+                    prompt=full_prompt,
+                    prompt_len=prompt_len,
+                    output_len=output_len,
                 )
             )
             total_input_tokens += prompt_len
@@ -1517,15 +1629,26 @@ def calculate_metrics(
     dur_s: float,
     tokenizer: PreTrainedTokenizerBase,
     backend: str,
+    accept_length: Optional[float] = None,
 ) -> Tuple[BenchmarkMetrics, List[int]]:
     output_lens: List[int] = []
     retokenized_output_lens: List[int] = []
     total_input = 0
+    total_input_text = 0
+    total_input_vision = 0
     completed = 0
     itls: List[float] = []
     tpots: List[float] = []
     ttfts: List[float] = []
     e2e_latencies: List[float] = []
+    retokenized_itls: List[float] = []
+
+    use_retokenized_itl = (
+        accept_length is not None
+        and accept_length > 0
+        and backend in ("sglang-oai", "sglang-oai-chat")
+    )
+
     for i in range(len(outputs)):
         if outputs[i].success:
             output_len = outputs[i].output_len
@@ -1534,10 +1657,22 @@ def calculate_metrics(
                 tokenizer.encode(outputs[i].generated_text, add_special_tokens=False)
             )
             retokenized_output_lens.append(retokenized_output_len)
-            total_input += input_requests[i].prompt_len
+            total_input += input_requests[i].prompt_len
+            total_input_text += input_requests[i].text_prompt_len
+            total_input_vision += input_requests[i].vision_prompt_len
             if output_len > 1:
                 tpots.append((outputs[i].latency - outputs[i].ttft) / (output_len - 1))
-            itls += outputs[i].itl
+            if use_retokenized_itl:
+                for k, itl in enumerate(outputs[i].itl):
+                    num_tokens = len(
+                        tokenizer.encode(
+                            outputs[i].text_chunks[k], add_special_tokens=False
+                        )
+                    )
+                    adjusted_itl = itl / num_tokens
+                    retokenized_itls.extend([adjusted_itl] * num_tokens)
+            else:
+                itls += outputs[i].itl
             ttfts.append(outputs[i].ttft)
 
             e2e_latencies.append(outputs[i].latency)
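The retokenization step matters because speculative decoding streams multi-token chunks, which would otherwise make inter-token latencies look artificially long. The adjustment in isolation, with toy numbers:

```python
# A chunk that arrived after 30 ms but decoded to 3 tokens contributes three
# 10 ms inter-token latencies, so ITL percentiles stay comparable across backends.
chunks = [(0.030, 3), (0.010, 1)]  # (itl_seconds, tokens_in_chunk), made-up values
retokenized_itls = []
for itl, num_tokens in chunks:
    retokenized_itls.extend([itl / num_tokens] * num_tokens)
print(retokenized_itls)  # [0.01, 0.01, 0.01, 0.01]
```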
@@ -1553,9 +1688,13 @@ def calculate_metrics(
         "on the benchmark arguments.",
         stacklevel=2,
     )
+
+    itls = retokenized_itls if use_retokenized_itl else itls
     metrics = BenchmarkMetrics(
         completed=completed,
         total_input=total_input,
+        total_input_text=total_input_text,
+        total_input_vision=total_input_vision,
         total_output=sum(output_lens),
         total_output_retokenized=sum(retokenized_output_lens),
         request_throughput=completed / dur_s,
@@ -1609,6 +1748,8 @@ async def benchmark(
     use_trace_timestamps: bool = False,
     mooncake_slowdown_factor=1.0,
     mooncake_num_rounds=1,
+    profile_prefill_url: Optional[List[str]] = None,
+    profile_decode_url: Optional[List[str]] = None,
 ):
     if backend in ASYNC_REQUEST_FUNCS:
         request_func = ASYNC_REQUEST_FUNCS[backend]
@@ -1698,14 +1839,28 @@ async def benchmark(
 
     time.sleep(1.0)
 
+    # Build profile URLs for PD separated mode (do this once at the beginning)
+    pd_profile_urls = []
+    if profile and pd_separated:
+        pd_profile_urls = _build_profile_urls(profile_prefill_url, profile_decode_url)
+        if not pd_profile_urls:
+            print(
+                "Warning: PD separated mode requires --profile-prefill-url or --profile-decode-url"
+            )
+            print("Skipping profiler start. Please specify worker URLs for profiling.")
+
     # Start profiler
     if profile:
-        print("Starting profiler...")
-        profile_output = await async_request_profile(
-            api_url=base_url + "/start_profile"
-        )
-        if profile_output.success:
-            print("Profiler started")
+        if pd_separated:
+            if pd_profile_urls:
+                await _call_profile_pd(pd_profile_urls, "start")
+        else:
+            print("Starting profiler...")
+            profile_output = await async_request_profile(
+                api_url=base_url + "/start_profile"
+            )
+            if profile_output.success:
+                print("Profiler started")
 
     # Run all requests
     benchmark_start_time = time.perf_counter()
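`_build_profile_urls` and `_call_profile_pd` are referenced here but defined outside this excerpt. One plausible reading, shown purely for illustration (the helper name, URL shape, and endpoint paths below are assumptions extrapolated from the single-server branch above): each prefill/decode worker URL receives its own `/start_profile` or `/stop_profile` request.

```python
import aiohttp

async def call_profile_pd_sketch(urls: list, action: str) -> None:
    # Hypothetical stand-in for _call_profile_pd; the endpoint naming mirrors
    # base_url + "/start_profile" / "/stop_profile" used in the non-PD branch.
    async with aiohttp.ClientSession() as session:
        for url in urls:
            async with session.post(f"{url}/{action}_profile") as resp:
                print(f"{url}: {action} -> HTTP {resp.status}")
```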
@@ -1754,10 +1909,16 @@ async def benchmark(
 
     # Stop profiler
     if profile:
-        print("Stopping profiler...")
-        profile_output = await async_request_profile(api_url=base_url + "/stop_profile")
-        if profile_output.success:
-            print("Profiler stopped")
+        if pd_separated:
+            if pd_profile_urls:
+                await _call_profile_pd(pd_profile_urls, "stop")
+        else:
+            print("Stopping profiler...")
+            profile_output = await async_request_profile(
+                api_url=base_url + "/stop_profile"
+            )
+            if profile_output.success:
+                print("Profiler stopped")
 
     if pbar is not None:
         pbar.close()
@@ -1770,9 +1931,15 @@ async def benchmark(
             server_info_json = server_info.json()
             if "decode" in server_info_json:
                 server_info_json = server_info_json["decode"][0]
-            accept_length = server_info_json["internal_states"][0].get(
-                "avg_spec_accept_length", None
-            )
+            if (
+                "internal_states" in server_info_json
+                and server_info_json["internal_states"]
+            ):
+                accept_length = server_info_json["internal_states"][0].get(
+                    "avg_spec_accept_length", None
+                )
+            else:
+                accept_length = None
         else:
             accept_length = None
     else:
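The new guard turns a missing or empty `internal_states` into `accept_length = None` instead of a `KeyError` or `IndexError`. The same logic on toy payloads:

```python
# Reproduces the guard above on three toy payloads; only the last yields a value.
for server_info_json in (
    {},
    {"internal_states": []},
    {"internal_states": [{"avg_spec_accept_length": 3.2}]},
):
    if "internal_states" in server_info_json and server_info_json["internal_states"]:
        accept_length = server_info_json["internal_states"][0].get(
            "avg_spec_accept_length", None
        )
    else:
        accept_length = None
    print(accept_length)  # None, None, 3.2
```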
@@ -1786,6 +1953,7 @@ async def benchmark(
         dur_s=benchmark_duration,
         tokenizer=tokenizer,
         backend=backend,
+        accept_length=accept_length,
     )
 
     print("\n{s:{c}^{n}}".format(s=" Serving Benchmark Result ", n=50, c="="))
@@ -1804,6 +1972,10 @@ async def benchmark(
     print("{:<40} {:<10}".format("Successful requests:", metrics.completed))
     print("{:<40} {:<10.2f}".format("Benchmark duration (s):", benchmark_duration))
     print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input))
+    print("{:<40} {:<10}".format("Total input text tokens:", metrics.total_input_text))
+    print(
+        "{:<40} {:<10}".format("Total input vision tokens:", metrics.total_input_vision)
+    )
     print("{:<40} {:<10}".format("Total generated tokens:", metrics.total_output))
     print(
         "{:<40} {:<10}".format(
@@ -1873,6 +2045,8 @@ async def benchmark(
             "duration": benchmark_duration,
             "completed": metrics.completed,
             "total_input_tokens": metrics.total_input,
+            "total_input_text_tokens": metrics.total_input_text,
+            "total_input_vision_tokens": metrics.total_input_vision,
             "total_output_tokens": metrics.total_output,
             "total_output_tokens_retokenized": metrics.total_output_retokenized,
             "request_throughput": metrics.request_throughput,
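With the two new fields, each record in the output file carries the text/vision split next to the total; by construction in `calculate_metrics`, the split sums to the total. A made-up record:

```python
record = {
    "total_input_tokens": 1200,
    "total_input_text_tokens": 400,
    "total_input_vision_tokens": 800,  # vision = total - text per calculate_metrics
}
assert record["total_input_tokens"] == (
    record["total_input_text_tokens"] + record["total_input_vision_tokens"]
)
```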
@@ -1907,11 +2081,11 @@ async def benchmark(
         output_file_name = args.output_file
     else:
         now = datetime.now().strftime("%m%d")
-        if args.dataset_name == "random-image":
+        if args.dataset_name == "image":
             output_file_name = (
                 f"{args.backend}_{now}_{args.num_prompts}_{args.random_input_len}_"
-                f"{args.random_output_len}_{args.random_image_count}imgs_"
-                f"{args.random_image_resolution}.jsonl"
+                f"{args.random_output_len}_{args.image_count}imgs_"
+                f"{args.image_resolution}.jsonl"
             )
         elif args.dataset_name.startswith("random"):
             output_file_name = f"{args.backend}_{now}_{args.num_prompts}_{args.random_input_len}_{args.random_output_len}.jsonl"
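For reference, the renamed branch produces filenames like the following (all values hypothetical):

```python
backend, now, num_prompts = "sglang", "1103", 8
random_input_len, random_output_len = 64, 32
image_count, image_resolution = 2, "720p"
print(
    f"{backend}_{now}_{num_prompts}_{random_input_len}_"
    f"{random_output_len}_{image_count}imgs_"
    f"{image_resolution}.jsonl"
)  # -> sglang_1103_8_64_32_2imgs_720p.jsonl
```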
@@ -2087,6 +2261,12 @@ def run_benchmark(args_: argparse.Namespace):
             "Because when the tokenizer counts the output tokens, if there is gibberish, it might count incorrectly.\n"
         )
 
+    if args.dataset_name in ["image", "mmmu"]:
+        args.apply_chat_template = True
+        assert (
+            not args.tokenize_prompt
+        ), "`--tokenize-prompt` not compatible with image dataset"
+
     print(f"{args}\n")
 
     # Read dataset
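The new guard forces chat-template formatting for the multimodal datasets and rejects `--tokenize-prompt` up front. A quick sketch with a stand-in `SimpleNamespace` args object (the same duck-typed args that `run_benchmark` already tolerates) shows the effect:

```python
# Sketch of the multimodal guard added above, using a stand-in args object.
from types import SimpleNamespace

args = SimpleNamespace(
    dataset_name="image", apply_chat_template=False, tokenize_prompt=False
)

if args.dataset_name in ["image", "mmmu"]:
    args.apply_chat_template = True
    assert (
        not args.tokenize_prompt
    ), "`--tokenize-prompt` not compatible with image dataset"

print(args.apply_chat_template)  # True: the chat template is forced on
```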
@@ -2094,7 +2274,7 @@ def run_benchmark(args_: argparse.Namespace):
     model_id = args.model
     tokenizer_id = args.tokenizer if args.tokenizer is not None else args.model
     tokenizer = get_tokenizer(tokenizer_id)
-    input_requests = get_dataset(args, tokenizer)
+    input_requests = get_dataset(args, tokenizer, model_id)
 
     # compatible with SimpleNamespace
     if not hasattr(args, "flush_cache"):
@@ -2120,6 +2300,8 @@ def run_benchmark(args_: argparse.Namespace):
             use_trace_timestamps=args.use_trace_timestamps,
             mooncake_slowdown_factor=args.mooncake_slowdown_factor,
             mooncake_num_rounds=args.mooncake_num_rounds,
+            profile_prefill_url=getattr(args, "profile_prefill_url", None),
+            profile_decode_url=getattr(args, "profile_decode_url", None),
         )
     )
 
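Note that the two new kwargs are read with `getattr(..., None)` rather than plain attribute access; this keeps older `SimpleNamespace`-style callers (a path the surrounding code explicitly supports) working when they predate the profiling flags. A minimal sketch of the fallback:

```python
# Why getattr with a default: programmatic callers may pass a bare
# SimpleNamespace built before the profile_* flags existed.
from types import SimpleNamespace

old_style_args = SimpleNamespace()  # no profile_* attributes at all
print(getattr(old_style_args, "profile_prefill_url", None))  # None

# Plain attribute access would raise instead:
# old_style_args.profile_prefill_url  -> AttributeError
```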
@@ -2175,7 +2357,7 @@ if __name__ == "__main__":
             "random-ids",
             "generated-shared-prefix",
             "mmmu",
-            "
+            "image",
             "mooncake",
         ],
         help="Name of the dataset to benchmark on.",
@@ -2215,37 +2397,49 @@ if __name__ == "__main__":
         "--random-input-len",
         type=int,
         default=1024,
-        help="Number of input tokens per request, used only for random dataset.",
+        help="Number of input tokens per request, used only for random and image dataset.",
     )
     parser.add_argument(
         "--random-output-len",
         default=1024,
         type=int,
-        help="Number of output tokens per request, used only for random dataset.",
+        help="Number of output tokens per request, used only for random and image dataset.",
     )
     parser.add_argument(
         "--random-range-ratio",
         type=float,
         default=0.0,
         help="Range of sampled ratio of input/output length, "
-        "used only for random dataset.",
+        "used only for random and image dataset.",
     )
-    #
+    # image dataset args
     parser.add_argument(
-        "--
+        "--image-count",
         type=int,
         default=1,
-        help="Number of images per request (only available with the
+        help="Number of images per request (only available with the image dataset)",
     )
     parser.add_argument(
-        "--
+        "--image-resolution",
         type=str,
         default="1080p",
         help=(
-            "Resolution of
+            "Resolution of images for image dataset. "
             "Supports presets 4k/1080p/720p/360p or custom 'heightxwidth' (e.g., 1080x1920)."
         ),
     )
+    parser.add_argument(
+        "--image-format",
+        type=str,
+        default="jpeg",
+        help=("Format of images for image dataset. " "Supports jpeg and png."),
+    )
+    parser.add_argument(
+        "--image-content",
+        type=str,
+        default="random",
+        help=("Content for images for image dataset. " "Supports random and blank."),
+    )
     parser.add_argument(
         "--request-rate",
         type=float,
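Taken together, the image-dataset flags above describe a synthetic image per request: `--image-resolution` accepts a preset or a custom 'heightxwidth' string, while `--image-content` and `--image-format` pick random or blank pixels encoded as jpeg or png. A hedged sketch of how such options could map to an in-memory test image; the `parse_resolution` and `make_image` helpers and the `PRESETS` table are hypothetical, not the internals of bench_serving.py (requires Pillow and NumPy):

```python
# Hypothetical helpers illustrating the image-dataset knobs; NOT the actual
# bench_serving.py implementation. Requires: pip install pillow numpy
import io

import numpy as np
from PIL import Image

# Assumed preset sizes, written as (height, width) to match 'heightxwidth'.
PRESETS = {
    "4k": (2160, 3840),
    "1080p": (1080, 1920),
    "720p": (720, 1280),
    "360p": (360, 640),
}

def parse_resolution(spec: str) -> tuple[int, int]:
    """Accept a preset name or a custom 'heightxwidth' string like '1080x1920'."""
    if spec in PRESETS:
        return PRESETS[spec]
    height, width = map(int, spec.split("x"))
    return height, width

def make_image(resolution: str, content: str = "random", fmt: str = "jpeg") -> bytes:
    """Encode one synthetic RGB image with the requested content and format."""
    height, width = parse_resolution(resolution)
    if content == "random":
        pixels = np.random.randint(0, 256, (height, width, 3), dtype=np.uint8)
    else:  # "blank"
        pixels = np.zeros((height, width, 3), dtype=np.uint8)
    buf = io.BytesIO()
    Image.fromarray(pixels).save(buf, format=fmt.upper())
    return buf.getvalue()

print(len(make_image("720p", content="blank", fmt="png")))  # encoded size in bytes
```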
@@ -2333,6 +2527,30 @@ if __name__ == "__main__":
         action="store_true",
         help="Benchmark PD disaggregation server",
     )
+
+    # Create a mutually exclusive group for profiling URLs
+    # In PD separated mode, prefill and decode workers must be profiled separately
+    profile_url_group = parser.add_mutually_exclusive_group()
+    profile_url_group.add_argument(
+        "--profile-prefill-url",
+        type=str,
+        nargs="*",
+        default=None,
+        help="URL(s) of the prefill worker(s) for profiling in PD separated mode. "
+        "Can specify multiple URLs: --profile-prefill-url http://localhost:30000 http://localhost:30001. "
+        "NOTE: Cannot be used together with --profile-decode-url. "
+        "In PD separated mode, prefill and decode workers must be profiled separately.",
+    )
+    profile_url_group.add_argument(
+        "--profile-decode-url",
+        type=str,
+        nargs="*",
+        default=None,
+        help="URL(s) of the decode worker(s) for profiling in PD separated mode. "
+        "Can specify multiple URLs: --profile-decode-url http://localhost:30010 http://localhost:30011. "
+        "NOTE: Cannot be used together with --profile-prefill-url. "
+        "In PD separated mode, prefill and decode workers must be profiled separately.",
+    )
     parser.add_argument(
         "--flush-cache",
         action="store_true",
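Here `add_mutually_exclusive_group` makes argparse itself reject runs that pass both profiling flags, and `nargs="*"` collects each flag's URLs into a list. A minimal reproduction of that behavior:

```python
# Minimal reproduction of the mutually exclusive profiling flags above.
import argparse

parser = argparse.ArgumentParser()
group = parser.add_mutually_exclusive_group()
group.add_argument("--profile-prefill-url", type=str, nargs="*", default=None)
group.add_argument("--profile-decode-url", type=str, nargs="*", default=None)

args = parser.parse_args(
    ["--profile-prefill-url", "http://localhost:30000", "http://localhost:30001"]
)
print(args.profile_prefill_url)
# ['http://localhost:30000', 'http://localhost:30001']

# Passing both flags exits with an argparse error along the lines of
# "argument --profile-decode-url: not allowed with argument --profile-prefill-url":
# parser.parse_args(["--profile-prefill-url", "u1", "--profile-decode-url", "u2"])
```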