sglang 0.5.3rc0__py3-none-any.whl → 0.5.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +54 -37
 - sglang/bench_one_batch_server.py +340 -34
 - sglang/bench_serving.py +340 -159
 - sglang/check_env.py +1 -1
 - sglang/compile_deep_gemm.py +6 -2
 - sglang/global_config.py +1 -25
 - sglang/lang/api.py +6 -0
 - sglang/lang/backend/runtime_endpoint.py +1 -1
 - sglang/lang/interpreter.py +1 -0
 - sglang/lang/ir.py +13 -0
 - sglang/launch_server.py +9 -2
 - sglang/profiler.py +20 -3
 - sglang/srt/_custom_ops.py +1 -1
 - sglang/srt/batch_invariant_ops/__init__.py +27 -0
 - sglang/srt/batch_invariant_ops/batch_invariant_ops.py +547 -0
 - sglang/srt/checkpoint_engine/checkpoint_engine_worker.py +142 -0
 - sglang/srt/compilation/backend.py +437 -0
 - sglang/srt/compilation/compilation_config.py +20 -0
 - sglang/srt/compilation/compilation_counter.py +47 -0
 - sglang/srt/compilation/compile.py +210 -0
 - sglang/srt/compilation/compiler_interface.py +503 -0
 - sglang/srt/compilation/cuda_piecewise_backend.py +228 -0
 - sglang/srt/compilation/fix_functionalization.py +134 -0
 - sglang/srt/compilation/fx_utils.py +83 -0
 - sglang/srt/compilation/inductor_pass.py +140 -0
 - sglang/srt/compilation/pass_manager.py +66 -0
 - sglang/srt/compilation/piecewise_context_manager.py +40 -0
 - sglang/srt/compilation/weak_ref_tensor_jit.py +16 -0
 - sglang/srt/configs/__init__.py +8 -0
 - sglang/srt/configs/deepseek_ocr.py +262 -0
 - sglang/srt/configs/deepseekvl2.py +194 -96
 - sglang/srt/configs/dots_ocr.py +64 -0
 - sglang/srt/configs/dots_vlm.py +2 -7
 - sglang/srt/configs/falcon_h1.py +309 -0
 - sglang/srt/configs/load_config.py +33 -2
 - sglang/srt/configs/mamba_utils.py +117 -0
 - sglang/srt/configs/model_config.py +284 -118
 - sglang/srt/configs/modelopt_config.py +30 -0
 - sglang/srt/configs/nemotron_h.py +286 -0
 - sglang/srt/configs/olmo3.py +105 -0
 - sglang/srt/configs/points_v15_chat.py +29 -0
 - sglang/srt/configs/qwen3_next.py +11 -47
 - sglang/srt/configs/qwen3_omni.py +613 -0
 - sglang/srt/configs/qwen3_vl.py +576 -0
 - sglang/srt/connector/remote_instance.py +1 -1
 - sglang/srt/constrained/base_grammar_backend.py +6 -1
 - sglang/srt/constrained/llguidance_backend.py +5 -0
 - sglang/srt/constrained/outlines_backend.py +1 -1
 - sglang/srt/constrained/outlines_jump_forward.py +1 -1
 - sglang/srt/constrained/reasoner_grammar_backend.py +9 -6
 - sglang/srt/constrained/utils.py +12 -0
 - sglang/srt/constrained/xgrammar_backend.py +26 -15
 - sglang/srt/debug_utils/dumper.py +10 -3
 - sglang/srt/disaggregation/ascend/conn.py +2 -2
 - sglang/srt/disaggregation/ascend/transfer_engine.py +48 -10
 - sglang/srt/disaggregation/base/conn.py +17 -4
 - sglang/srt/disaggregation/common/conn.py +268 -98
 - sglang/srt/disaggregation/decode.py +172 -39
 - sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
 - sglang/srt/disaggregation/decode_schedule_batch_mixin.py +25 -16
 - sglang/srt/disaggregation/fake/conn.py +11 -3
 - sglang/srt/disaggregation/mooncake/conn.py +203 -555
 - sglang/srt/disaggregation/nixl/conn.py +217 -63
 - sglang/srt/disaggregation/prefill.py +113 -270
 - sglang/srt/disaggregation/utils.py +36 -5
 - sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
 - sglang/srt/distributed/device_communicators/custom_all_reduce.py +6 -6
 - sglang/srt/distributed/device_communicators/pymscclpp.py +2 -2
 - sglang/srt/distributed/device_communicators/pynccl.py +24 -12
 - sglang/srt/distributed/device_communicators/pynccl_allocator.py +2 -2
 - sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
 - sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
 - sglang/srt/distributed/naive_distributed.py +5 -4
 - sglang/srt/distributed/parallel_state.py +203 -97
 - sglang/srt/elastic_ep/elastic_ep.py +74 -0
 - sglang/srt/entrypoints/context.py +3 -2
 - sglang/srt/entrypoints/engine.py +85 -65
 - sglang/srt/entrypoints/grpc_server.py +632 -305
 - sglang/srt/entrypoints/harmony_utils.py +2 -2
 - sglang/srt/entrypoints/http_server.py +169 -17
 - sglang/srt/entrypoints/http_server_engine.py +1 -7
 - sglang/srt/entrypoints/openai/protocol.py +327 -34
 - sglang/srt/entrypoints/openai/serving_base.py +74 -8
 - sglang/srt/entrypoints/openai/serving_chat.py +202 -118
 - sglang/srt/entrypoints/openai/serving_classify.py +204 -0
 - sglang/srt/entrypoints/openai/serving_completions.py +20 -4
 - sglang/srt/entrypoints/openai/serving_embedding.py +1 -0
 - sglang/srt/entrypoints/openai/serving_responses.py +47 -2
 - sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
 - sglang/srt/environ.py +323 -0
 - sglang/srt/eplb/eplb_algorithms/__init__.py +18 -1
 - sglang/srt/eplb/eplb_algorithms/deepseek.py +0 -2
 - sglang/srt/eplb/eplb_algorithms/elasticity_aware.py +87 -0
 - sglang/srt/eplb/expert_distribution.py +3 -4
 - sglang/srt/eplb/expert_location.py +30 -5
 - sglang/srt/eplb/expert_location_dispatch.py +2 -2
 - sglang/srt/eplb/expert_location_updater.py +2 -2
 - sglang/srt/function_call/base_format_detector.py +17 -18
 - sglang/srt/function_call/function_call_parser.py +21 -16
 - sglang/srt/function_call/glm4_moe_detector.py +4 -8
 - sglang/srt/function_call/gpt_oss_detector.py +24 -1
 - sglang/srt/function_call/json_array_parser.py +61 -0
 - sglang/srt/function_call/kimik2_detector.py +17 -4
 - sglang/srt/function_call/utils.py +98 -7
 - sglang/srt/grpc/compile_proto.py +245 -0
 - sglang/srt/grpc/grpc_request_manager.py +915 -0
 - sglang/srt/grpc/health_servicer.py +189 -0
 - sglang/srt/grpc/scheduler_launcher.py +181 -0
 - sglang/srt/grpc/sglang_scheduler_pb2.py +81 -68
 - sglang/srt/grpc/sglang_scheduler_pb2.pyi +124 -61
 - sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +92 -1
 - sglang/srt/layers/activation.py +11 -7
 - sglang/srt/layers/attention/aiter_backend.py +17 -18
 - sglang/srt/layers/attention/ascend_backend.py +125 -10
 - sglang/srt/layers/attention/attention_registry.py +226 -0
 - sglang/srt/layers/attention/base_attn_backend.py +32 -4
 - sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
 - sglang/srt/layers/attention/double_sparsity_backend.py +2 -2
 - sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
 - sglang/srt/layers/attention/fla/chunk.py +0 -1
 - sglang/srt/layers/attention/fla/chunk_o.py +1 -1
 - sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +2 -2
 - sglang/srt/layers/attention/fla/fused_recurrent.py +4 -4
 - sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +2 -2
 - sglang/srt/layers/attention/fla/index.py +0 -2
 - sglang/srt/layers/attention/fla/layernorm_gated.py +50 -32
 - sglang/srt/layers/attention/fla/utils.py +0 -3
 - sglang/srt/layers/attention/fla/wy_fast.py +0 -2
 - sglang/srt/layers/attention/flashattention_backend.py +52 -15
 - sglang/srt/layers/attention/flashinfer_backend.py +357 -212
 - sglang/srt/layers/attention/flashinfer_mla_backend.py +31 -33
 - sglang/srt/layers/attention/flashmla_backend.py +9 -7
 - sglang/srt/layers/attention/hybrid_attn_backend.py +12 -4
 - sglang/srt/layers/attention/hybrid_linear_attn_backend.py +236 -133
 - sglang/srt/layers/attention/intel_amx_backend.py +1 -1
 - sglang/srt/layers/attention/mamba/causal_conv1d.py +2 -1
 - sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +24 -103
 - sglang/srt/layers/attention/mamba/mamba.py +514 -1
 - sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
 - sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
 - sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
 - sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
 - sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
 - sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +214 -0
 - sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +562 -0
 - sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +646 -0
 - sglang/srt/layers/attention/mamba/ops/ssd_combined.py +261 -0
 - sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +264 -0
 - sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
 - sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
 - sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
 - sglang/srt/layers/attention/nsa/nsa_indexer.py +718 -0
 - sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
 - sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
 - sglang/srt/layers/attention/nsa/transform_index.py +144 -0
 - sglang/srt/layers/attention/nsa/triton_kernel.py +136 -0
 - sglang/srt/layers/attention/nsa/utils.py +23 -0
 - sglang/srt/layers/attention/nsa_backend.py +1201 -0
 - sglang/srt/layers/attention/tbo_backend.py +6 -6
 - sglang/srt/layers/attention/torch_flex_backend.py +325 -0
 - sglang/srt/layers/attention/triton_backend.py +249 -42
 - sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +2 -2
 - sglang/srt/layers/attention/triton_ops/extend_attention.py +539 -44
 - sglang/srt/layers/attention/trtllm_mha_backend.py +7 -9
 - sglang/srt/layers/attention/trtllm_mla_backend.py +523 -48
 - sglang/srt/layers/attention/utils.py +11 -7
 - sglang/srt/layers/attention/vision.py +61 -3
 - sglang/srt/layers/attention/wave_backend.py +4 -4
 - sglang/srt/layers/attention/xpu_backend.py +1028 -0
 - sglang/srt/layers/communicator.py +19 -7
 - sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/compile_utils.py +4 -8
 - sglang/srt/layers/deep_gemm_wrapper/configurer.py +25 -0
 - sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/entrypoint.py +3 -3
 - sglang/srt/layers/dp_attention.py +28 -1
 - sglang/srt/layers/elementwise.py +3 -1
 - sglang/srt/layers/layernorm.py +47 -15
 - sglang/srt/layers/linear.py +30 -5
 - sglang/srt/layers/logits_processor.py +161 -18
 - sglang/srt/layers/modelopt_utils.py +11 -0
 - sglang/srt/layers/moe/cutlass_moe.py +0 -2
 - sglang/srt/layers/moe/cutlass_w4a8_moe.py +213 -21
 - sglang/srt/layers/moe/ep_moe/kernels.py +36 -458
 - sglang/srt/layers/moe/ep_moe/layer.py +243 -448
 - sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +52 -25
 - sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
 - sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_B200.json +146 -0
 - sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
 - sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
 - sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
 - sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +17 -5
 - sglang/srt/layers/moe/fused_moe_triton/layer.py +86 -81
 - sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +18 -42
 - sglang/srt/layers/moe/moe_runner/deep_gemm.py +304 -0
 - sglang/srt/layers/moe/moe_runner/runner.py +3 -0
 - sglang/srt/layers/moe/moe_runner/triton.py +3 -1
 - sglang/srt/layers/moe/rocm_moe_utils.py +0 -1
 - sglang/srt/layers/moe/router.py +51 -15
 - sglang/srt/layers/moe/token_dispatcher/__init__.py +10 -0
 - sglang/srt/layers/moe/token_dispatcher/base.py +1 -1
 - sglang/srt/layers/moe/token_dispatcher/deepep.py +177 -106
 - sglang/srt/layers/moe/token_dispatcher/mooncake.py +386 -0
 - sglang/srt/layers/moe/token_dispatcher/standard.py +46 -0
 - sglang/srt/layers/moe/topk.py +3 -2
 - sglang/srt/layers/moe/utils.py +27 -1
 - sglang/srt/layers/parameter.py +23 -6
 - sglang/srt/layers/quantization/__init__.py +2 -53
 - sglang/srt/layers/quantization/awq.py +183 -6
 - sglang/srt/layers/quantization/awq_triton.py +29 -0
 - sglang/srt/layers/quantization/base_config.py +20 -1
 - sglang/srt/layers/quantization/compressed_tensors/__init__.py +7 -0
 - sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +21 -49
 - sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +421 -70
 - sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +5 -0
 - sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +4 -22
 - sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
 - sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +339 -0
 - sglang/srt/layers/quantization/fp8.py +86 -20
 - sglang/srt/layers/quantization/fp8_kernel.py +55 -10
 - sglang/srt/layers/quantization/fp8_utils.py +43 -15
 - sglang/srt/layers/quantization/fpgemm_fp8.py +2 -3
 - sglang/srt/layers/quantization/gptq.py +0 -1
 - sglang/srt/layers/quantization/int8_kernel.py +18 -2
 - sglang/srt/layers/quantization/marlin_utils.py +12 -0
 - sglang/srt/layers/quantization/modelopt_quant.py +141 -81
 - sglang/srt/layers/quantization/mxfp4.py +17 -34
 - sglang/srt/layers/quantization/petit.py +1 -1
 - sglang/srt/layers/quantization/quark/quark.py +3 -1
 - sglang/srt/layers/quantization/quark/quark_moe.py +18 -5
 - sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +0 -7
 - sglang/srt/layers/quantization/unquant.py +1 -4
 - sglang/srt/layers/quantization/utils.py +0 -1
 - sglang/srt/layers/quantization/w4afp8.py +51 -24
 - sglang/srt/layers/quantization/w8a8_int8.py +45 -27
 - sglang/srt/layers/radix_attention.py +59 -9
 - sglang/srt/layers/rotary_embedding.py +750 -46
 - sglang/srt/layers/sampler.py +84 -16
 - sglang/srt/layers/sparse_pooler.py +98 -0
 - sglang/srt/layers/utils.py +23 -1
 - sglang/srt/layers/vocab_parallel_embedding.py +4 -1
 - sglang/srt/lora/backend/base_backend.py +3 -3
 - sglang/srt/lora/backend/chunked_backend.py +348 -0
 - sglang/srt/lora/backend/triton_backend.py +9 -4
 - sglang/srt/lora/eviction_policy.py +139 -0
 - sglang/srt/lora/lora.py +7 -5
 - sglang/srt/lora/lora_manager.py +33 -7
 - sglang/srt/lora/lora_registry.py +1 -1
 - sglang/srt/lora/mem_pool.py +41 -17
 - sglang/srt/lora/triton_ops/__init__.py +4 -0
 - sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
 - sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +176 -0
 - sglang/srt/lora/utils.py +7 -5
 - sglang/srt/managers/cache_controller.py +83 -152
 - sglang/srt/managers/data_parallel_controller.py +156 -87
 - sglang/srt/managers/detokenizer_manager.py +51 -24
 - sglang/srt/managers/io_struct.py +223 -129
 - sglang/srt/managers/mm_utils.py +49 -10
 - sglang/srt/managers/multi_tokenizer_mixin.py +83 -98
 - sglang/srt/managers/multimodal_processor.py +1 -2
 - sglang/srt/managers/overlap_utils.py +130 -0
 - sglang/srt/managers/schedule_batch.py +340 -529
 - sglang/srt/managers/schedule_policy.py +158 -18
 - sglang/srt/managers/scheduler.py +665 -620
 - sglang/srt/managers/scheduler_input_blocker.py +1 -1
 - sglang/srt/managers/scheduler_metrics_mixin.py +150 -131
 - sglang/srt/managers/scheduler_output_processor_mixin.py +337 -122
 - sglang/srt/managers/scheduler_pp_mixin.py +341 -0
 - sglang/srt/managers/scheduler_profiler_mixin.py +62 -15
 - sglang/srt/managers/scheduler_runtime_checker_mixin.py +217 -0
 - sglang/srt/managers/scheduler_update_weights_mixin.py +40 -14
 - sglang/srt/managers/tokenizer_communicator_mixin.py +141 -19
 - sglang/srt/managers/tokenizer_manager.py +462 -226
 - sglang/srt/managers/tp_worker.py +217 -156
 - sglang/srt/managers/utils.py +79 -47
 - sglang/srt/mem_cache/allocator.py +21 -22
 - sglang/srt/mem_cache/allocator_ascend.py +42 -28
 - sglang/srt/mem_cache/base_prefix_cache.py +3 -3
 - sglang/srt/mem_cache/chunk_cache.py +20 -2
 - sglang/srt/mem_cache/common.py +480 -0
 - sglang/srt/mem_cache/evict_policy.py +38 -0
 - sglang/srt/mem_cache/hicache_storage.py +44 -2
 - sglang/srt/mem_cache/hiradix_cache.py +134 -34
 - sglang/srt/mem_cache/mamba_radix_cache.py +993 -0
 - sglang/srt/mem_cache/memory_pool.py +602 -208
 - sglang/srt/mem_cache/memory_pool_host.py +134 -183
 - sglang/srt/mem_cache/multimodal_cache.py +0 -1
 - sglang/srt/mem_cache/radix_cache.py +263 -78
 - sglang/srt/mem_cache/radix_cache_cpp.py +29 -21
 - sglang/srt/mem_cache/storage/__init__.py +10 -0
 - sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +157 -0
 - sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +97 -0
 - sglang/srt/mem_cache/storage/backend_factory.py +223 -0
 - sglang/srt/mem_cache/storage/eic/eic_storage.py +777 -0
 - sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
 - sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +0 -1
 - sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +180 -59
 - sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +15 -9
 - sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +217 -26
 - sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +38 -9
 - sglang/srt/mem_cache/storage/nixl/nixl_utils.py +1 -1
 - sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +17 -2
 - sglang/srt/mem_cache/swa_radix_cache.py +115 -58
 - sglang/srt/metrics/collector.py +113 -120
 - sglang/srt/metrics/func_timer.py +3 -8
 - sglang/srt/metrics/utils.py +8 -1
 - sglang/srt/model_executor/cpu_graph_runner.py +2 -2
 - sglang/srt/model_executor/cuda_graph_runner.py +81 -36
 - sglang/srt/model_executor/forward_batch_info.py +40 -50
 - sglang/srt/model_executor/model_runner.py +507 -319
 - sglang/srt/model_executor/npu_graph_runner.py +11 -5
 - sglang/srt/model_executor/piecewise_cuda_graph_runner.py +539 -0
 - sglang/srt/model_loader/__init__.py +1 -1
 - sglang/srt/model_loader/loader.py +438 -37
 - sglang/srt/model_loader/utils.py +0 -1
 - sglang/srt/model_loader/weight_utils.py +200 -27
 - sglang/srt/models/apertus.py +2 -3
 - sglang/srt/models/arcee.py +2 -2
 - sglang/srt/models/bailing_moe.py +40 -56
 - sglang/srt/models/bailing_moe_nextn.py +3 -4
 - sglang/srt/models/bert.py +1 -1
 - sglang/srt/models/deepseek_nextn.py +25 -4
 - sglang/srt/models/deepseek_ocr.py +1516 -0
 - sglang/srt/models/deepseek_v2.py +793 -235
 - sglang/srt/models/dots_ocr.py +171 -0
 - sglang/srt/models/dots_vlm.py +0 -1
 - sglang/srt/models/dots_vlm_vit.py +1 -1
 - sglang/srt/models/falcon_h1.py +570 -0
 - sglang/srt/models/gemma3_causal.py +0 -2
 - sglang/srt/models/gemma3_mm.py +17 -1
 - sglang/srt/models/gemma3n_mm.py +2 -3
 - sglang/srt/models/glm4_moe.py +17 -40
 - sglang/srt/models/glm4_moe_nextn.py +4 -4
 - sglang/srt/models/glm4v.py +3 -2
 - sglang/srt/models/glm4v_moe.py +6 -6
 - sglang/srt/models/gpt_oss.py +12 -35
 - sglang/srt/models/grok.py +10 -23
 - sglang/srt/models/hunyuan.py +2 -7
 - sglang/srt/models/interns1.py +0 -1
 - sglang/srt/models/kimi_vl.py +1 -7
 - sglang/srt/models/kimi_vl_moonvit.py +4 -2
 - sglang/srt/models/llama.py +6 -2
 - sglang/srt/models/llama_eagle3.py +1 -1
 - sglang/srt/models/longcat_flash.py +6 -23
 - sglang/srt/models/longcat_flash_nextn.py +4 -15
 - sglang/srt/models/mimo.py +2 -13
 - sglang/srt/models/mimo_mtp.py +1 -2
 - sglang/srt/models/minicpmo.py +7 -5
 - sglang/srt/models/mixtral.py +1 -4
 - sglang/srt/models/mllama.py +1 -1
 - sglang/srt/models/mllama4.py +27 -6
 - sglang/srt/models/nemotron_h.py +511 -0
 - sglang/srt/models/olmo2.py +31 -4
 - sglang/srt/models/opt.py +5 -5
 - sglang/srt/models/phi.py +1 -1
 - sglang/srt/models/phi4mm.py +1 -1
 - sglang/srt/models/phimoe.py +0 -1
 - sglang/srt/models/pixtral.py +0 -3
 - sglang/srt/models/points_v15_chat.py +186 -0
 - sglang/srt/models/qwen.py +0 -1
 - sglang/srt/models/qwen2.py +0 -7
 - sglang/srt/models/qwen2_5_vl.py +5 -5
 - sglang/srt/models/qwen2_audio.py +2 -15
 - sglang/srt/models/qwen2_moe.py +70 -4
 - sglang/srt/models/qwen2_vl.py +6 -3
 - sglang/srt/models/qwen3.py +18 -3
 - sglang/srt/models/qwen3_moe.py +50 -38
 - sglang/srt/models/qwen3_next.py +43 -21
 - sglang/srt/models/qwen3_next_mtp.py +3 -4
 - sglang/srt/models/qwen3_omni_moe.py +661 -0
 - sglang/srt/models/qwen3_vl.py +791 -0
 - sglang/srt/models/qwen3_vl_moe.py +343 -0
 - sglang/srt/models/registry.py +15 -3
 - sglang/srt/models/roberta.py +55 -3
 - sglang/srt/models/sarashina2_vision.py +268 -0
 - sglang/srt/models/solar.py +505 -0
 - sglang/srt/models/starcoder2.py +357 -0
 - sglang/srt/models/step3_vl.py +3 -5
 - sglang/srt/models/torch_native_llama.py +9 -2
 - sglang/srt/models/utils.py +61 -0
 - sglang/srt/multimodal/processors/base_processor.py +21 -9
 - sglang/srt/multimodal/processors/deepseek_ocr.py +37 -0
 - sglang/srt/multimodal/processors/deepseek_vl_v2.py +0 -3
 - sglang/srt/multimodal/processors/dots_vlm.py +2 -4
 - sglang/srt/multimodal/processors/glm4v.py +1 -5
 - sglang/srt/multimodal/processors/internvl.py +20 -10
 - sglang/srt/multimodal/processors/janus_pro.py +0 -1
 - sglang/srt/multimodal/processors/mllama4.py +0 -8
 - sglang/srt/multimodal/processors/phi4mm.py +0 -1
 - sglang/srt/multimodal/processors/points_v15_chat.py +52 -0
 - sglang/srt/multimodal/processors/qwen_vl.py +83 -17
 - sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
 - sglang/srt/multimodal/processors/step3_vl.py +1 -1
 - sglang/srt/parser/conversation.py +41 -0
 - sglang/srt/parser/jinja_template_utils.py +6 -0
 - sglang/srt/parser/reasoning_parser.py +0 -1
 - sglang/srt/sampling/custom_logit_processor.py +77 -2
 - sglang/srt/sampling/sampling_batch_info.py +36 -23
 - sglang/srt/sampling/sampling_params.py +75 -0
 - sglang/srt/server_args.py +1300 -338
 - sglang/srt/server_args_config_parser.py +146 -0
 - sglang/srt/single_batch_overlap.py +161 -0
 - sglang/srt/speculative/base_spec_worker.py +34 -0
 - sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
 - sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
 - sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
 - sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
 - sglang/srt/speculative/cpp_ngram/param.h +125 -0
 - sglang/srt/speculative/cpp_ngram/queue.h +71 -0
 - sglang/srt/speculative/draft_utils.py +226 -0
 - sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +26 -8
 - sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +26 -3
 - sglang/srt/speculative/eagle_info.py +786 -0
 - sglang/srt/speculative/eagle_info_v2.py +458 -0
 - sglang/srt/speculative/eagle_utils.py +113 -1270
 - sglang/srt/speculative/eagle_worker.py +120 -285
 - sglang/srt/speculative/eagle_worker_v2.py +702 -0
 - sglang/srt/speculative/ngram_info.py +433 -0
 - sglang/srt/speculative/ngram_worker.py +246 -0
 - sglang/srt/speculative/spec_info.py +49 -0
 - sglang/srt/speculative/spec_utils.py +641 -0
 - sglang/srt/speculative/standalone_worker.py +4 -14
 - sglang/srt/tokenizer/tiktoken_tokenizer.py +2 -2
 - sglang/srt/tracing/trace.py +32 -6
 - sglang/srt/two_batch_overlap.py +35 -18
 - sglang/srt/utils/__init__.py +2 -0
 - sglang/srt/{bench_utils.py → utils/bench_utils.py} +4 -2
 - sglang/srt/{utils.py → utils/common.py} +583 -113
 - sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +86 -19
 - sglang/srt/{host_shared_memory.py → utils/host_shared_memory.py} +0 -1
 - sglang/srt/{offloader.py → utils/offloader.py} +4 -4
 - sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
 - sglang/srt/utils/profile_merger.py +199 -0
 - sglang/srt/utils/rpd_utils.py +452 -0
 - sglang/srt/utils/slow_rank_detector.py +71 -0
 - sglang/srt/{torch_memory_saver_adapter.py → utils/torch_memory_saver_adapter.py} +5 -7
 - sglang/srt/warmup.py +8 -4
 - sglang/srt/weight_sync/utils.py +1 -1
 - sglang/test/attention/test_flashattn_backend.py +1 -1
 - sglang/test/attention/test_flashattn_mla_backend.py +0 -1
 - sglang/test/attention/test_prefix_chunk_info.py +0 -2
 - sglang/test/attention/test_trtllm_mla_backend.py +221 -53
 - sglang/test/few_shot_gsm8k_engine.py +2 -4
 - sglang/test/get_logits_ut.py +57 -0
 - sglang/test/kit_matched_stop.py +157 -0
 - sglang/test/longbench_v2/__init__.py +1 -0
 - sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
 - sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
 - sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
 - sglang/test/run_eval.py +120 -11
 - sglang/test/runners.py +3 -1
 - sglang/test/send_one.py +42 -7
 - sglang/test/simple_eval_common.py +8 -2
 - sglang/test/simple_eval_gpqa.py +0 -1
 - sglang/test/simple_eval_humaneval.py +0 -3
 - sglang/test/simple_eval_longbench_v2.py +344 -0
 - sglang/test/simple_eval_mmmu_vlm.py +441 -0
 - sglang/test/test_block_fp8.py +3 -4
 - sglang/test/test_block_fp8_deep_gemm_blackwell.py +0 -1
 - sglang/test/test_cutlass_moe.py +1 -2
 - sglang/test/test_cutlass_w4a8_moe.py +10 -20
 - sglang/test/test_deterministic.py +430 -0
 - sglang/test/test_deterministic_utils.py +73 -0
 - sglang/test/test_disaggregation_utils.py +93 -1
 - sglang/test/test_marlin_moe.py +0 -1
 - sglang/test/test_programs.py +1 -1
 - sglang/test/test_utils.py +432 -16
 - sglang/utils.py +10 -1
 - sglang/version.py +1 -1
 - {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/METADATA +64 -43
 - {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/RECORD +476 -346
 - sglang/srt/entrypoints/grpc_request_manager.py +0 -580
 - sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +0 -32
 - sglang/srt/managers/tp_worker_overlap_thread.py +0 -319
 - sglang/srt/mem_cache/lora_radix_cache.py +0 -421
 - sglang/srt/speculative/build_eagle_tree.py +0 -427
 - sglang/test/test_block_fp8_ep.py +0 -358
 - /sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/__init__.py +0 -0
 - /sglang/srt/{remote_instance_weight_loader_utils.py → model_loader/remote_instance_weight_loader_utils.py} +0 -0
 - /sglang/srt/{aio_rwlock.py → utils/aio_rwlock.py} +0 -0
 - /sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
 - {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/WHEEL +0 -0
 - {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/licenses/LICENSE +0 -0
 - {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/top_level.txt +0 -0
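Note the cluster of renames into sglang/srt/utils/ above (utils.py → utils/common.py, hf_transformers_utils.py → utils/hf_transformers_utils.py, and others): code that imported these modules from their 0.5.3rc0 locations must switch to the new paths. A minimal sketch of the update, mirroring the import change in the bench_serving.py diff below (the model id is a placeholder):

    # 0.5.3rc0:
    #   from sglang.srt.hf_transformers_utils import get_tokenizer
    # 0.5.4, after the move into sglang/srt/utils/:
    from sglang.srt.utils.hf_transformers_utils import get_tokenizer

    tokenizer = get_tokenizer("some-org/some-model")  # placeholder model id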
 
    
        sglang/bench_serving.py
    CHANGED
    
    | 
         @@ -12,7 +12,6 @@ python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-pro 
     | 
|
| 
       12 
12 
     | 
    
         | 
| 
       13 
13 
     | 
    
         
             
            import argparse
         
     | 
| 
       14 
14 
     | 
    
         
             
            import asyncio
         
     | 
| 
       15 
     | 
    
         
            -
            import base64
         
     | 
| 
       16 
15 
     | 
    
         
             
            import io
         
     | 
| 
       17 
16 
     | 
    
         
             
            import json
         
     | 
| 
       18 
17 
     | 
    
         
             
            import os
         
     | 
| 
         @@ -32,9 +31,13 @@ from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple, Union 
     | 
|
| 
       32 
31 
     | 
    
         | 
| 
       33 
32 
     | 
    
         
             
            import aiohttp
         
     | 
| 
       34 
33 
     | 
    
         
             
            import numpy as np
         
     | 
| 
      
 34 
     | 
    
         
            +
            import pybase64
         
     | 
| 
       35 
35 
     | 
    
         
             
            import requests
         
     | 
| 
      
 36 
     | 
    
         
            +
            from datasets import load_dataset
         
     | 
| 
      
 37 
     | 
    
         
            +
            from PIL import Image
         
     | 
| 
       36 
38 
     | 
    
         
             
            from tqdm.asyncio import tqdm
         
     | 
| 
       37 
39 
     | 
    
         
             
            from transformers import (
         
     | 
| 
      
 40 
     | 
    
         
            +
                AutoProcessor,
         
     | 
| 
       38 
41 
     | 
    
         
             
                AutoTokenizer,
         
     | 
| 
       39 
42 
     | 
    
         
             
                PreTrainedTokenizer,
         
     | 
| 
       40 
43 
     | 
    
         
             
                PreTrainedTokenizerBase,
         
     | 
| 
         @@ -208,6 +211,15 @@ async def async_request_openai_completions( 
     | 
|
| 
       208 
211 
     | 
    
         
             
                        "ignore_eos": not args.disable_ignore_eos,
         
     | 
| 
       209 
212 
     | 
    
         
             
                        **request_func_input.extra_request_body,
         
     | 
| 
       210 
213 
     | 
    
         
             
                    }
         
     | 
| 
      
 214 
     | 
    
         
            +
             
     | 
| 
      
 215 
     | 
    
         
            +
                    # hack to accommodate different LoRA conventions between SGLang and vLLM.
         
     | 
| 
      
 216 
     | 
    
         
            +
                    if request_func_input.lora_name:
         
     | 
| 
      
 217 
     | 
    
         
            +
                        payload["model"] = request_func_input.lora_name
         
     | 
| 
      
 218 
     | 
    
         
            +
                        payload["lora_path"] = request_func_input.lora_name
         
     | 
| 
      
 219 
     | 
    
         
            +
             
     | 
| 
      
 220 
     | 
    
         
            +
                    if request_func_input.image_data:
         
     | 
| 
      
 221 
     | 
    
         
            +
                        payload.update({"image_data": request_func_input.image_data})
         
     | 
| 
      
 222 
     | 
    
         
            +
             
     | 
| 
       211 
223 
     | 
    
         
             
                    headers = get_auth_headers()
         
     | 
| 
       212 
224 
     | 
    
         | 
| 
       213 
225 
     | 
    
         
             
                    output = RequestFuncOutput.init_new(request_func_input)
         
     | 
| 
         @@ -318,10 +330,17 @@ async def async_request_openai_chat_completions( 
     | 
|
| 
       318 
330 
     | 
    
         
             
                        "model": request_func_input.model,
         
     | 
| 
       319 
331 
     | 
    
         
             
                        "messages": messages,
         
     | 
| 
       320 
332 
     | 
    
         
             
                        "temperature": 0.0,
         
     | 
| 
       321 
     | 
    
         
            -
                        " 
     | 
| 
      
 333 
     | 
    
         
            +
                        "max_completion_tokens": request_func_input.output_len,
         
     | 
| 
       322 
334 
     | 
    
         
             
                        "stream": not args.disable_stream,
         
     | 
| 
      
 335 
     | 
    
         
            +
                        "ignore_eos": not args.disable_ignore_eos,
         
     | 
| 
       323 
336 
     | 
    
         
             
                        **request_func_input.extra_request_body,
         
     | 
| 
       324 
337 
     | 
    
         
             
                    }
         
     | 
| 
      
 338 
     | 
    
         
            +
             
     | 
| 
      
 339 
     | 
    
         
            +
                    # hack to accommodate different LoRA conventions between SGLang and vLLM.
         
     | 
| 
      
 340 
     | 
    
         
            +
                    if request_func_input.lora_name:
         
     | 
| 
      
 341 
     | 
    
         
            +
                        payload["model"] = request_func_input.lora_name
         
     | 
| 
      
 342 
     | 
    
         
            +
                        payload["lora_path"] = request_func_input.lora_name
         
     | 
| 
      
 343 
     | 
    
         
            +
             
     | 
| 
       325 
344 
     | 
    
         
             
                    headers = get_auth_headers()
         
     | 
| 
       326 
345 
     | 
    
         | 
| 
       327 
346 
     | 
    
         
             
                    output = RequestFuncOutput.init_new(request_func_input)
         
     | 
| 
         @@ -606,6 +625,48 @@ async def async_request_profile(api_url: str) -> RequestFuncOutput: 
     | 
|
| 
       606 
625 
     | 
    
         
             
                return output
         
     | 
| 
       607 
626 
     | 
    
         | 
| 
       608 
627 
     | 
    
         | 
| 
      
 628 
     | 
    
         
            +
            def _build_profile_urls(
         
     | 
| 
      
 629 
     | 
    
         
            +
                profile_prefill_url: Optional[List[str]],
         
     | 
| 
      
 630 
     | 
    
         
            +
                profile_decode_url: Optional[List[str]],
         
     | 
| 
      
 631 
     | 
    
         
            +
            ) -> List[Tuple[str, str]]:
         
     | 
| 
      
 632 
     | 
    
         
            +
                """Build profile URLs list from prefill/decode URL arguments.
         
     | 
| 
      
 633 
     | 
    
         
            +
             
     | 
| 
      
 634 
     | 
    
         
            +
                Returns:
         
     | 
| 
      
 635 
     | 
    
         
            +
                    List of (worker_type, url) tuples. e.g., [("Prefill-0", "http://..."), ("Decode-0", "http://...")]
         
     | 
| 
      
 636 
     | 
    
         
            +
                """
         
     | 
| 
      
 637 
     | 
    
         
            +
                profile_urls = []
         
     | 
| 
      
 638 
     | 
    
         
            +
                if profile_prefill_url:
         
     | 
| 
      
 639 
     | 
    
         
            +
                    for idx, url in enumerate(profile_prefill_url):
         
     | 
| 
      
 640 
     | 
    
         
            +
                        profile_urls.append((f"Prefill-{idx}", url))
         
     | 
| 
      
 641 
     | 
    
         
            +
                if profile_decode_url:
         
     | 
| 
      
 642 
     | 
    
         
            +
                    for idx, url in enumerate(profile_decode_url):
         
     | 
| 
      
 643 
     | 
    
         
            +
                        profile_urls.append((f"Decode-{idx}", url))
         
     | 
| 
      
 644 
     | 
    
         
            +
                return profile_urls
         
     | 
| 
      
 645 
     | 
    
         
            +
             
     | 
| 
      
 646 
     | 
    
         
            +
             
     | 
| 
      
 647 
     | 
    
         
            +
            async def _call_profile_pd(profile_urls: List[Tuple[str, str]], mode: str) -> None:
         
     | 
| 
      
 648 
     | 
    
         
            +
                """Call profile endpoint (start/stop) on PD separated workers.
         
     | 
| 
      
 649 
     | 
    
         
            +
             
     | 
| 
      
 650 
     | 
    
         
            +
                Args:
         
     | 
| 
      
 651 
     | 
    
         
            +
                    profile_urls: List of (worker_type, url) tuples
         
     | 
| 
      
 652 
     | 
    
         
            +
                    mode: "start" or "stop"
         
     | 
| 
      
 653 
     | 
    
         
            +
                """
         
     | 
| 
      
 654 
     | 
    
         
            +
                endpoint = "/start_profile" if mode == "start" else "/stop_profile"
         
     | 
| 
      
 655 
     | 
    
         
            +
                action = "Starting" if mode == "start" else "Stopping"
         
     | 
| 
      
 656 
     | 
    
         
            +
                action_past = "started" if mode == "start" else "stopped"
         
     | 
| 
      
 657 
     | 
    
         
            +
             
     | 
| 
      
 658 
     | 
    
         
            +
                print(f"{action} profiler...")
         
     | 
| 
      
 659 
     | 
    
         
            +
             
     | 
| 
      
 660 
     | 
    
         
            +
                for worker_type, url in profile_urls:
         
     | 
| 
      
 661 
     | 
    
         
            +
                    profile_output = await async_request_profile(api_url=url + endpoint)
         
     | 
| 
      
 662 
     | 
    
         
            +
                    if profile_output.success:
         
     | 
| 
      
 663 
     | 
    
         
            +
                        print(f"Profiler {action_past} for {worker_type} worker at {url}")
         
     | 
| 
      
 664 
     | 
    
         
            +
                    else:
         
     | 
| 
      
 665 
     | 
    
         
            +
                        print(
         
     | 
| 
      
 666 
     | 
    
         
            +
                            f"Failed to {mode} profiler for {worker_type} worker at {url}: {profile_output.error}"
         
     | 
| 
      
 667 
     | 
    
         
            +
                        )
         
     | 
| 
      
 668 
     | 
    
         
            +
             
     | 
| 
      
 669 
     | 
    
         
            +
             
     | 
| 
       609 
670 
     | 
    
         
             
            def get_model(pretrained_model_name_or_path: str) -> str:
         
     | 
| 
       610 
671 
     | 
    
         
             
                if os.getenv("SGLANG_USE_MODELSCOPE", "false").lower() == "true":
         
     | 
| 
       611 
672 
     | 
    
         
             
                    import huggingface_hub.constants
         
     | 
| 
         @@ -631,7 +692,7 @@ def get_tokenizer( 
     | 
|
| 
       631 
692 
     | 
    
         
             
                if pretrained_model_name_or_path.endswith(
         
     | 
| 
       632 
693 
     | 
    
         
             
                    ".json"
         
     | 
| 
       633 
694 
     | 
    
         
             
                ) or pretrained_model_name_or_path.endswith(".model"):
         
     | 
| 
       634 
     | 
    
         
            -
                    from sglang.srt.hf_transformers_utils import get_tokenizer
         
     | 
| 
      
 695 
     | 
    
         
            +
                    from sglang.srt.utils.hf_transformers_utils import get_tokenizer
         
     | 
| 
       635 
696 
     | 
    
         | 
| 
       636 
697 
     | 
    
         
             
                    return get_tokenizer(pretrained_model_name_or_path)
         
     | 
| 
       637 
698 
     | 
    
         | 
| 
         @@ -644,7 +705,30 @@ def get_tokenizer( 
     | 
|
| 
       644 
705 
     | 
    
         
             
                )
         
     | 
| 
       645 
706 
     | 
    
         | 
| 
       646 
707 
     | 
    
         | 
| 
       647 
     | 
    
         
            -
            def  
     | 
| 
      
 708 
     | 
    
         
            +
            def get_processor(
         
     | 
| 
      
 709 
     | 
    
         
            +
                pretrained_model_name_or_path: str,
         
     | 
| 
      
 710 
     | 
    
         
            +
            ) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
         
     | 
| 
      
 711 
     | 
    
         
            +
                assert (
         
     | 
| 
      
 712 
     | 
    
         
            +
                    pretrained_model_name_or_path is not None
         
     | 
| 
      
 713 
     | 
    
         
            +
                    and pretrained_model_name_or_path != ""
         
     | 
| 
      
 714 
     | 
    
         
            +
                )
         
     | 
| 
      
 715 
     | 
    
         
            +
                if pretrained_model_name_or_path.endswith(
         
     | 
| 
      
 716 
     | 
    
         
            +
                    ".json"
         
     | 
| 
      
 717 
     | 
    
         
            +
                ) or pretrained_model_name_or_path.endswith(".model"):
         
     | 
| 
      
 718 
     | 
    
         
            +
                    from sglang.srt.utils.hf_transformers_utils import get_processor
         
     | 
| 
      
 719 
     | 
    
         
            +
             
     | 
| 
      
 720 
     | 
    
         
            +
                    return get_processor(pretrained_model_name_or_path)
         
     | 
| 
      
 721 
     | 
    
         
            +
             
     | 
| 
      
 722 
     | 
    
         
            +
                if pretrained_model_name_or_path is not None and not os.path.exists(
         
     | 
| 
      
 723 
     | 
    
         
            +
                    pretrained_model_name_or_path
         
     | 
| 
      
 724 
     | 
    
         
            +
                ):
         
     | 
| 
      
 725 
     | 
    
         
            +
                    pretrained_model_name_or_path = get_model(pretrained_model_name_or_path)
         
     | 
| 
      
 726 
     | 
    
         
            +
                return AutoProcessor.from_pretrained(
         
     | 
| 
      
 727 
     | 
    
         
            +
                    pretrained_model_name_or_path, trust_remote_code=True
         
     | 
| 
      
 728 
     | 
    
         
            +
                )
         
     | 
| 
      
 729 
     | 
    
         
            +
             
     | 
| 
      
 730 
     | 
    
         
            +
             
     | 
| 
      
 731 
     | 
    
         
            +
            def get_dataset(args, tokenizer, model_id=None):
         
     | 
| 
       648 
732 
     | 
    
         
             
                tokenize_prompt = getattr(args, "tokenize_prompt", False)
         
     | 
| 
       649 
733 
     | 
    
         
             
                if args.dataset_name == "sharegpt":
         
     | 
| 
       650 
734 
     | 
    
         
             
                    assert not tokenize_prompt
         
     | 
| 
         @@ -657,7 +741,7 @@ def get_dataset(args, tokenizer): 
     | 
|
| 
       657 
741 
     | 
    
         
             
                        prompt_suffix=args.prompt_suffix,
         
     | 
| 
       658 
742 
     | 
    
         
             
                        apply_chat_template=args.apply_chat_template,
         
     | 
| 
       659 
743 
     | 
    
         
             
                    )
         
     | 
| 
       660 
     | 
    
         
            -
                elif args.dataset_name.startswith("random") 
     | 
| 
      
 744 
     | 
    
         
            +
                elif args.dataset_name.startswith("random"):
         
     | 
| 
       661 
745 
     | 
    
         
             
                    input_requests = sample_random_requests(
         
     | 
| 
       662 
746 
     | 
    
         
             
                        input_len=args.random_input_len,
         
     | 
| 
       663 
747 
     | 
    
         
             
                        output_len=args.random_output_len,
         
     | 
| 
         @@ -668,17 +752,18 @@ def get_dataset(args, tokenizer): 
     | 
|
| 
       668 
752 
     | 
    
         
             
                        random_sample=args.dataset_name == "random",
         
     | 
| 
       669 
753 
     | 
    
         
             
                        return_text=not tokenize_prompt,
         
     | 
| 
       670 
754 
     | 
    
         
             
                    )
         
     | 
| 
       671 
     | 
    
         
            -
                elif args.dataset_name == " 
     | 
| 
       672 
     | 
    
         
            -
                     
     | 
| 
       673 
     | 
    
         
            -
                    input_requests =  
     | 
| 
      
 755 
     | 
    
         
            +
                elif args.dataset_name == "image":
         
     | 
| 
      
 756 
     | 
    
         
            +
                    processor = get_processor(model_id)
         
     | 
| 
      
 757 
     | 
    
         
            +
                    input_requests = sample_image_requests(
         
     | 
| 
       674 
758 
     | 
    
         
             
                        num_requests=args.num_prompts,
         
     | 
| 
       675 
     | 
    
         
            -
                         
     | 
| 
      
 759 
     | 
    
         
            +
                        image_count=args.image_count,
         
     | 
| 
       676 
760 
     | 
    
         
             
                        input_len=args.random_input_len,
         
     | 
| 
       677 
761 
     | 
    
         
             
                        output_len=args.random_output_len,
         
     | 
| 
       678 
762 
     | 
    
         
             
                        range_ratio=args.random_range_ratio,
         
     | 
| 
       679 
     | 
    
         
            -
                         
     | 
| 
       680 
     | 
    
         
            -
                         
     | 
| 
       681 
     | 
    
         
            -
                         
     | 
| 
      
 763 
     | 
    
         
            +
                        processor=processor,
         
     | 
| 
      
 764 
     | 
    
         
            +
                        image_content=args.image_content,
         
     | 
| 
      
 765 
     | 
    
         
            +
                        image_format=args.image_format,
         
     | 
| 
      
 766 
     | 
    
         
            +
                        image_resolution=args.image_resolution,
         
     | 
| 
       682 
767 
     | 
    
         
             
                    )
         
     | 
| 
       683 
768 
     | 
    
         
             
                elif args.dataset_name == "generated-shared-prefix":
         
     | 
| 
       684 
769 
     | 
    
         
             
                    assert not tokenize_prompt
         
     | 
| 
         @@ -692,12 +777,11 @@ def get_dataset(args, tokenizer): 
     | 
|
| 
       692 
777 
     | 
    
         
             
                        args=args,
         
     | 
| 
       693 
778 
     | 
    
         
             
                    )
         
     | 
| 
       694 
779 
     | 
    
         
             
                elif args.dataset_name == "mmmu":
         
     | 
| 
       695 
     | 
    
         
            -
                     
     | 
| 
      
 780 
     | 
    
         
            +
                    processor = get_processor(model_id)
         
     | 
| 
       696 
781 
     | 
    
         
             
                    input_requests = sample_mmmu_requests(
         
     | 
| 
       697 
782 
     | 
    
         
             
                        num_requests=args.num_prompts,
         
     | 
| 
       698 
     | 
    
         
            -
                         
     | 
| 
      
 783 
     | 
    
         
            +
                        processor=processor,
         
     | 
| 
       699 
784 
     | 
    
         
             
                        fixed_output_len=args.random_output_len,
         
     | 
| 
       700 
     | 
    
         
            -
                        apply_chat_template=args.apply_chat_template,
         
     | 
| 
       701 
785 
     | 
    
         
             
                        random_sample=True,
         
     | 
| 
       702 
786 
     | 
    
         
             
                    )
         
     | 
| 
       703 
787 
     | 
    
         
             
                elif args.dataset_name == "mooncake":
         
     | 
| 
         @@ -742,6 +826,8 @@ ASYNC_REQUEST_FUNCS = { 
     | 
|
| 
       742 
826 
     | 
    
         
             
            class BenchmarkMetrics:
         
     | 
| 
       743 
827 
     | 
    
         
             
                completed: int
         
     | 
| 
       744 
828 
     | 
    
         
             
                total_input: int
         
     | 
| 
      
 829 
     | 
    
         
            +
                total_input_text: int
         
     | 
| 
      
 830 
     | 
    
         
            +
                total_input_vision: int
         
     | 
| 
       745 
831 
     | 
    
         
             
                total_output: int
         
     | 
| 
       746 
832 
     | 
    
         
             
                total_output_retokenized: int
         
     | 
| 
       747 
833 
     | 
    
         
             
                request_throughput: float
         
     | 
| 
         @@ -835,9 +921,17 @@ class DatasetRow: 
     | 
|
| 
       835 
921 
     | 
    
         
             
                prompt: str
         
     | 
| 
       836 
922 
     | 
    
         
             
                prompt_len: int
         
     | 
| 
       837 
923 
     | 
    
         
             
                output_len: int
         
     | 
| 
      
 924 
     | 
    
         
            +
                text_prompt_len: Optional[int] = None
         
     | 
| 
      
 925 
     | 
    
         
            +
                vision_prompt_len: Optional[int] = None
         
     | 
| 
       838 
926 
     | 
    
         
             
                image_data: Optional[List[str]] = None
         
     | 
| 
       839 
927 
     | 
    
         
             
                timestamp: Optional[float] = None
         
     | 
| 
       840 
928 
     | 
    
         | 
| 
      
 929 
     | 
    
         
            +
                def __post_init__(self):
         
     | 
| 
      
 930 
     | 
    
         
            +
                    if self.text_prompt_len is None:
         
     | 
| 
      
 931 
     | 
    
         
            +
                        self.text_prompt_len = self.prompt_len
         
     | 
| 
      
 932 
     | 
    
         
            +
                    if self.vision_prompt_len is None:
         
     | 
| 
      
 933 
     | 
    
         
            +
                        self.vision_prompt_len = 0
         
     | 
| 
      
 934 
     | 
    
         
            +
             
     | 
| 
       841 
935 
     | 
    
         | 
| 
       842 
936 
     | 
    
         
             
            async def get_mooncake_request_over_time(
         
     | 
| 
       843 
937 
     | 
    
         
             
                input_requests: List[Dict],
         
     | 
| 
         @@ -885,7 +979,7 @@ async def get_mooncake_request_over_time( 
     | 
|
| 
       885 
979 
     | 
    
         
             
                    for i in range(num_rounds):
         
     | 
| 
       886 
980 
     | 
    
         
             
                        # Add user query for the current round
         
     | 
| 
       887 
981 
     | 
    
         
             
                        chat_history.append(
         
     | 
| 
       888 
     | 
    
         
            -
                            {"role": "user", "content": f"Round {i+1}: {user_query_base}"}
         
     | 
| 
      
 982 
     | 
    
         
            +
                            {"role": "user", "content": f"Round {i + 1}: {user_query_base}"}
         
     | 
| 
       889 
983 
     | 
    
         
             
                        )
         
     | 
| 
       890 
984 
     | 
    
         | 
| 
       891 
985 
     | 
    
         
             
                        # Form the full prompt from history
         
     | 
| 
         @@ -914,9 +1008,8 @@ async def get_mooncake_request_over_time( 
     | 
|
| 
       914 
1008 
     | 
    
         | 
| 
       915 
1009 
     | 
    
         
             
            def sample_mmmu_requests(
         
     | 
| 
       916 
1010 
     | 
    
         
             
                num_requests: int,
         
     | 
| 
       917 
     | 
    
         
            -
                 
     | 
| 
      
 1011 
     | 
    
         
            +
                processor: AutoProcessor | AutoTokenizer,
         
     | 
| 
       918 
1012 
     | 
    
         
             
                fixed_output_len: Optional[int] = None,
         
     | 
| 
       919 
     | 
    
         
            -
                apply_chat_template: bool = True,
         
     | 
| 
       920 
1013 
     | 
    
         
             
                random_sample: bool = True,
         
     | 
| 
       921 
1014 
     | 
    
         
             
            ) -> List[DatasetRow]:
         
     | 
| 
       922 
1015 
     | 
    
         
             
                """
         
     | 
| 
         @@ -924,22 +1017,12 @@ def sample_mmmu_requests( 
     | 
|
| 
       924 
1017 
     | 
    
         | 
| 
       925 
1018 
     | 
    
         
             
                Args:
         
     | 
| 
       926 
1019 
     | 
    
         
             
                    num_requests: Number of requests to sample.
         
     | 
| 
       927 
     | 
    
         
            -
                    tokenizer: Tokenizer to use for token counting.
         
     | 
| 
       928 
1020 
     | 
    
         
             
                    fixed_output_len: If provided, use this fixed output length for all requests.
         
     | 
| 
       929 
     | 
    
         
            -
                    apply_chat_template: Whether to apply the chat template to the prompt.
         
     | 
| 
       930 
1021 
     | 
    
         
             
                    random_sample: Whether to randomly sample or take the first N.
         
     | 
| 
       931 
1022 
     | 
    
         | 
| 
       932 
1023 
     | 
    
         
             
                Returns:
         
     | 
| 
       933 
1024 
     | 
    
         
             
                    List of tuples (prompt, prompt_token_len, output_token_len).
         
     | 
| 
       934 
1025 
     | 
    
         
             
                """
         
     | 
| 
       935 
     | 
    
         
            -
                try:
         
     | 
| 
       936 
     | 
    
         
            -
                    import io
         
     | 
| 
       937 
     | 
    
         
            -
             
     | 
| 
       938 
     | 
    
         
            -
                    import pybase64
         
     | 
| 
       939 
     | 
    
         
            -
                    from datasets import load_dataset
         
     | 
| 
       940 
     | 
    
         
            -
                except ImportError:
         
     | 
| 
       941 
     | 
    
         
            -
                    raise ImportError("Please install datasets: pip install datasets")
         
     | 
| 
       942 
     | 
    
         
            -
             
     | 
| 
       943 
1026 
     | 
    
         
             
                print("Loading MMMU dataset from HuggingFace...")
         
     | 
| 
       944 
1027 
     | 
    
         | 
| 
       945 
1028 
     | 
    
         
             
                try:
         
     | 
| 
@@ -995,54 +1078,12 @@ def sample_mmmu_requests(
                 question = example.get("question")
 
                 # Construct the prompt
-
-                if apply_chat_template:
-                    try:
-                        is_phi4_multimodal = (
-                            "phi-4-multimodal" in tokenizer.name_or_path.lower()
-                        )
-                        if is_phi4_multimodal:
-                            # <|endoftext10|> is the image token used in the phi-4-multimodal model.
-                            content = prompt.replace("image 1", "<|endoftext10|>")
-                        else:
-                            content = [
-                                {
-                                    "type": "image_url",
-                                    "image_url": {"url": image_data},
-                                },
-                                {"type": "text", "text": prompt},
-                            ]
-                        prompt = tokenizer.apply_chat_template(
-                            [
-                                {
-                                    "role": "user",
-                                    "content": content,
-                                }
-                            ],
-                            add_generation_prompt=True,
-                            tokenize=False,
-                        )
-                    except Exception as e:
-                        # Note (Xinyuan): This is a workaround for an issue where some tokenizers do not support content as a list. (e.g. InternVL)
-                        print(
-                            f"Error applying chat template: {e}, fallback to <image> tag"
-                        )
-                        prompt = f"<image>{prompt}"
-
-                # Calculate token lengths for text only (without image data)
-                prompt_token_ids = tokenizer.encode(prompt)
-                prompt_len = len(prompt_token_ids)
-
+                text_prompt = f"Question: {question}\n\nAnswer: "
                 output_len = fixed_output_len if fixed_output_len is not None else 256
-
-                filtered_dataset.append(
-                    DatasetRow(
-                        prompt=prompt,
-                        prompt_len=prompt_len,
-                        output_len=output_len,
-                        image_data=[image_data],
-                    )
+                data_row = create_mm_data_row(
+                    text_prompt, [image], [image_data], output_len, processor
                 )
+                filtered_dataset.append(data_row)
 
         except Exception as e:
             print(f"Error processing example {i}: {e}")
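Note: the MMMU path above now delegates prompt assembly and token counting to the shared `create_mm_data_row` helper introduced further down in this diff, so each row carries separate text and vision token counts. A minimal sketch of the resulting row shape, using a stand-in dataclass rather than the real `DatasetRow` and purely illustrative numbers:

    from dataclasses import dataclass, field
    from typing import List


    @dataclass
    class DatasetRowSketch:  # stand-in for bench_serving.DatasetRow
        prompt: str
        prompt_len: int        # text + expanded vision tokens
        output_len: int
        text_prompt_len: int   # text-only share
        vision_prompt_len: int # prompt_len - text_prompt_len
        image_data: List[str] = field(default_factory=list)


    row = DatasetRowSketch(
        prompt="Question: What is shown in image 1?\n\nAnswer: ",
        prompt_len=812,
        output_len=256,  # fixed_output_len if given, else the 256 default
        text_prompt_len=18,
        vision_prompt_len=794,
        image_data=["data:image/png;base64,..."],
    )
    assert row.prompt_len == row.text_prompt_len + row.vision_prompt_len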
@@ -1110,7 +1151,8 @@ def sample_sharegpt_requests(
                 add_generation_prompt=True,
                 tokenize=False,
             )
-            prompt = prompt.replace(tokenizer.bos_token, "")
+            if tokenizer.bos_token:
+                prompt = prompt.replace(tokenizer.bos_token, "")
 
         prompt_token_ids = tokenizer.encode(prompt)
         completion = dataset[i][1]
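Note: chat templates usually bake a BOS token into the rendered string, and the prompt is encoded again afterwards, which would count the BOS twice; guarding on `tokenizer.bos_token` also keeps tokenizers without a BOS (where the attribute is None) from crashing `str.replace`. A toy illustration with a stub tokenizer, not the real transformers API:

    class StubTokenizer:
        bos_token = "<s>"  # set to None to mimic tokenizers without a BOS

        def apply_chat_template(self, messages, add_generation_prompt, tokenize):
            # Renders a chat-style prompt with the BOS baked in, like real templates do.
            return f"{self.bos_token or ''}[INST] {messages[0]['content']} [/INST]"


    tok = StubTokenizer()
    prompt = tok.apply_chat_template(
        [{"role": "user", "content": "hello"}], add_generation_prompt=True, tokenize=False
    )
    if tok.bos_token:  # the guarded strip added in this hunk
        prompt = prompt.replace(tok.bos_token, "")
    print(prompt)  # "[INST] hello [/INST]" -- a later encode adds BOS exactly once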
@@ -1129,7 +1171,11 @@ def sample_sharegpt_requests(
             continue
 
         filtered_dataset.append(
-            DatasetRow(prompt=prompt, prompt_len=prompt_len, output_len=output_len)
+            DatasetRow(
+                prompt=prompt,
+                prompt_len=prompt_len,
+                output_len=output_len,
+            )
         )
 
     print(f"#Input tokens: {np.sum([x.prompt_len for x in filtered_dataset])}")
@@ -1240,7 +1286,7 @@ def sample_random_requests(
     return input_requests
 
 
-def parse_random_image_resolution(image_resolution: str) -> Tuple[int, int]:
+def parse_image_resolution(image_resolution: str) -> Tuple[int, int]:
     """Parse image resolution into (width, height).
 
     Supports presets '1080p', '720p', '360p' and custom 'heightxwidth' format
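Note: only the rename is visible here; the body is unchanged. A hedged sketch of the parsing rule that the docstring and the ValueError below describe (the real implementation in bench_serving.py may differ in details):

    from typing import Tuple

    PRESETS = {
        "4k": (3840, 2160),
        "1080p": (1920, 1080),
        "720p": (1280, 720),
        "360p": (640, 360),
    }


    def parse_image_resolution_sketch(image_resolution: str) -> Tuple[int, int]:
        # Presets map directly to (width, height).
        if image_resolution in PRESETS:
            return PRESETS[image_resolution]
        # Custom format is 'heightxwidth', e.g. '1080x1920' -> (1920, 1080).
        if "x" in image_resolution:
            h_str, w_str = image_resolution.split("x", 1)
            if h_str.isdigit() and w_str.isdigit() and int(h_str) > 0 and int(w_str) > 0:
                return (int(w_str), int(h_str))
        raise ValueError(
            f"Unsupported image resolution: {image_resolution}. "
            "Choose from 4k, 1080p, 720p, 360p, or provide custom 'heightxwidth' (e.g., 1080x1920)."
        )


    assert parse_image_resolution_sketch("1080x1920") == (1920, 1080)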
@@ -1265,44 +1311,94 @@ def parse_random_image_resolution(image_resolution: str) -> Tuple[int, int]:
             return (width, height)
 
     raise ValueError(
-        f"Unsupported random image resolution: {image_resolution}. "
+        f"Unsupported image resolution: {image_resolution}. "
         "Choose from 4k, 1080p, 720p, 360p, or provide custom 'heightxwidth' (e.g., 1080x1920)."
     )
 
 
-def sample_random_image_requests(
+def create_mm_data_row(text_prompt, images: list, images_base64, output_len, processor):
+    try:
+        content_items = [
+            {"type": "image", "image": {"url": image_base64}}
+            for image_base64 in images_base64
+        ]
+        content_items.append({"type": "text", "text": text_prompt})
+        prompt_str = processor.apply_chat_template(
+            [{"role": "user", "content": content_items}],
+            add_generation_prompt=True,
+            tokenize=False,
+        )
+    except Exception as e:
+        # Note (Xinyuan): This is a workaround for an issue where some tokenizers do not support content as a list. (e.g. InternVL)
+        print(f"Error applying chat template: {e}, fallback to <image> tag")
+        # Some tokenizers do not support list content; fall back to a placeholder in the text
+        prompt_str = f"<image>{text_prompt}"
+
+    # Calculate total tokens (text + vision)
+    prompt_len = processor(
+        text=[prompt_str],
+        images=images,
+        padding=False,
+        return_tensors="pt",
+    )["input_ids"].numel()
+
+    # Calculate text-only tokens
+    try:
+        # Create text-only version of the prompt
+        text_only_prompt = processor.apply_chat_template(
+            [{"role": "user", "content": text_prompt}],
+            add_generation_prompt=True,
+            tokenize=False,
+        )
+        text_prompt_len = processor(
+            text=[text_only_prompt],
+            padding=False,
+            return_tensors="pt",
+        )["input_ids"].numel()
+    except Exception:
+        # Fallback: just tokenize the text prompt directly
+        text_prompt_len = len(processor.tokenizer.encode(text_prompt))
+
+    # Vision tokens = total tokens - text tokens
+    vision_prompt_len = prompt_len - text_prompt_len
+
+    return DatasetRow(
+        prompt=text_prompt,
+        prompt_len=prompt_len,
+        output_len=output_len,
+        text_prompt_len=text_prompt_len,
+        vision_prompt_len=vision_prompt_len,
+        image_data=images_base64,
+    )
+
+
+def sample_image_requests(
     num_requests: int,
-    num_images: int,
+    image_count: int,
     input_len: int,
     output_len: int,
     range_ratio: float,
-    tokenizer: PreTrainedTokenizerBase,
-    apply_chat_template: bool = True,
-    image_resolution: str = "1080p",
+    processor: AutoProcessor,
+    image_content: str,
+    image_format: str,
+    image_resolution: str,
 ) -> List[DatasetRow]:
-    """Generate requests with random images.
+    """Generate requests with images.
 
-    - Each request includes ``num_images`` random images.
+    - Each request includes ``image_count`` images.
     - Supported resolutions: 4k (3840x2160), 1080p (1920x1080), 720p (1280x720), 360p (640x360),
       or custom 'heightxwidth' (e.g., 1080x1920).
     - Text lengths follow the 'random' dataset sampling rule. ``prompt_len``
       only counts text tokens and excludes image data.
     """
-    try:
-        import pybase64
-        from PIL import Image
-    except ImportError as e:
-        raise ImportError(
-            "Please install Pillow to generate random images: pip install pillow"
-        ) from e
 
     # Parse resolution (supports presets and 'heightxwidth')
-    width, height = parse_random_image_resolution(image_resolution)
+    width, height = parse_image_resolution(image_resolution)
 
     # Check for potentially problematic combinations and warn user
-    if width * height >= 1920 * 1080 and num_images * num_requests >= 100:
+    if width * height >= 1920 * 1080 and image_count * num_requests >= 100:
         warnings.warn(
-            f"High resolution ({width}x{height}) with {num_images * num_requests} total images "
+            f"High resolution ({width}x{height}) with {image_count * num_requests} total images "
             f"may take a long time. Consider reducing resolution or image count.",
             UserWarning,
             stacklevel=2,
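Note: `create_mm_data_row` derives the vision share by encoding the full multimodal prompt and a text-only variant with the same processor, then differencing the counts. A standalone sketch of that idea; the model choice is illustrative only, and downloading its processor is assumed to be acceptable:

    from PIL import Image
    from transformers import AutoProcessor

    # Example multimodal processor; any chat-template-capable VLM processor works similarly.
    processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")
    image = Image.new("RGB", (224, 224), "white")

    mm_prompt = processor.apply_chat_template(
        [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": "Describe it."}]}],
        add_generation_prompt=True,
        tokenize=False,
    )
    text_prompt = processor.apply_chat_template(
        [{"role": "user", "content": "Describe it."}],
        add_generation_prompt=True,
        tokenize=False,
    )

    # Total tokens include the expanded image placeholders; text-only excludes them.
    total_len = processor(text=[mm_prompt], images=[image], return_tensors="pt")["input_ids"].numel()
    text_len = processor(text=[text_prompt], return_tensors="pt")["input_ids"].numel()
    print("text tokens:", text_len, "vision tokens:", total_len - text_len)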
@@ -1316,53 +1412,50 @@ def sample_random_image_requests(
         int(output_len * range_ratio), output_len + 1, size=num_requests
     )
 
-    def _gen_random_image_data_uri(width: int = width, height: int = height) -> str:
-        arr = (np.random.rand(height, width, 3) * 255).astype(np.uint8)
-        img = Image.fromarray(arr)
+    def _gen_random_image_data_uri(
+        width: int = width, height: int = height
+    ) -> (Image, str, int):
+        if image_content == "blank":
+            # Generate blank white image
+            arr = np.full((height, width, 3), 255, dtype=np.uint8)
+        else:
+            # Generate random colored image
+            arr = (np.random.rand(height, width, 3) * 255).astype(np.uint8)
+        img = Image.fromarray(arr)
         buf = io.BytesIO()
-        img.save(buf, format="JPEG", quality=85)
+        img.save(buf, format=image_format, quality=85)
         encoded = pybase64.b64encode(buf.getvalue()).decode("utf-8")
-        return f"data:image/jpeg;base64,{encoded}"
+        image_data = f"data:image/{image_format};base64,{encoded}"
+        image_bytes = len(image_data.encode("utf-8"))
+        return img, image_data, image_bytes
 
     dataset: List[DatasetRow] = []
+    total_image_bytes = 0
     for i in range(num_requests):
         # Generate text prompt
-        text_prompt = gen_prompt(tokenizer, int(input_lens[i]))
+        text_prompt = gen_prompt(processor.tokenizer, int(input_lens[i]))
 
         # Generate image list
-        images = [_gen_random_image_data_uri() for _ in range(num_images)]
-
-        prompt_str = text_prompt
-        if apply_chat_template:
-            try:
-                content_items = [
-                    {"type": "image_url", "image_url": {"url": img_url}}
-                    for img_url in images
-                ]
-                content_items.append({"type": "text", "text": text_prompt})
-                prompt_str = tokenizer.apply_chat_template(
-                    [{"role": "user", "content": content_items}],
-                    add_generation_prompt=True,
-                    tokenize=False,
-                )
-            except Exception:
-                # Some tokenizers do not support list content; fall back to a placeholder in the text
-                prompt_str = f"<image>{text_prompt}"
-
-        prompt_token_ids = tokenizer.encode(prompt_str)
-        prompt_token_len = len(prompt_token_ids)
-
-        dataset.append(
-            DatasetRow(
-                prompt=prompt_str,
-                prompt_len=prompt_token_len,
-                output_len=int(output_lens[i]),
-                image_data=images,
-            )
+        images, images_base64, images_bytes = zip(
+            *[_gen_random_image_data_uri() for _ in range(image_count)]
+        )
+        total_image_bytes += sum(list(images_bytes))
+
+        data_row = create_mm_data_row(
+            text_prompt,
+            list(images),
+            list(images_base64),
+            int(output_lens[i]),
+            processor,
         )
 
+        dataset.append(data_row)
+
     print(f"#Input tokens: {np.sum([x.prompt_len for x in dataset])}")
     print(f"#Output tokens: {np.sum([x.output_len for x in dataset])}")
+    print(
+        f"\nCreated {len(dataset)} {image_content} {image_format} images with average {total_image_bytes // num_requests} bytes per request"
+    )
     return dataset
 
 
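Note: `_gen_random_image_data_uri` now also returns the PIL image (for the processor) and the encoded payload size, which feeds the new bytes-per-request summary. A dependency-light sketch of the same round trip, with the stdlib `base64` module standing in for `pybase64`:

    import base64
    import io

    import numpy as np
    from PIL import Image

    width, height = 640, 360  # the '360p' preset
    arr = np.full((height, width, 3), 255, dtype=np.uint8)  # blank white image
    img = Image.fromarray(arr)

    buf = io.BytesIO()
    img.save(buf, format="JPEG", quality=85)
    encoded = base64.b64encode(buf.getvalue()).decode("utf-8")

    image_data = f"data:image/jpeg;base64,{encoded}"
    image_bytes = len(image_data.encode("utf-8"))
    print(image_bytes, "bytes in the data URI")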
@@ -1434,7 +1527,9 @@ def sample_generated_shared_prefix_requests(
 
             input_requests.append(
                 DatasetRow(
-                    prompt=full_prompt, prompt_len=prompt_len, output_len=output_len
+                    prompt=full_prompt,
+                    prompt_len=prompt_len,
+                    output_len=output_len,
                 )
             )
             total_input_tokens += prompt_len
@@ -1516,6 +1611,8 @@ def calculate_metrics(
     output_lens: List[int] = []
     retokenized_output_lens: List[int] = []
     total_input = 0
+    total_input_text = 0
+    total_input_vision = 0
     completed = 0
     itls: List[float] = []
     tpots: List[float] = []
@@ -1529,7 +1626,9 @@ def calculate_metrics(
             tokenizer.encode(outputs[i].generated_text, add_special_tokens=False)
         )
         retokenized_output_lens.append(retokenized_output_len)
-        total_input += input_requests[i].prompt_len
+        total_input += input_requests[i].prompt_len
+        total_input_text += input_requests[i].text_prompt_len
+        total_input_vision += input_requests[i].vision_prompt_len
         if output_len > 1:
             tpots.append((outputs[i].latency - outputs[i].ttft) / (output_len - 1))
         itls += outputs[i].itl
@@ -1551,6 +1650,8 @@ def calculate_metrics(
     metrics = BenchmarkMetrics(
         completed=completed,
         total_input=total_input,
+        total_input_text=total_input_text,
+        total_input_vision=total_input_vision,
         total_output=sum(output_lens),
         total_output_retokenized=sum(retokenized_output_lens),
         request_throughput=completed / dur_s,
@@ -1604,6 +1705,8 @@ async def benchmark(
     use_trace_timestamps: bool = False,
     mooncake_slowdown_factor=1.0,
     mooncake_num_rounds=1,
+    profile_prefill_url: Optional[List[str]] = None,
+    profile_decode_url: Optional[List[str]] = None,
 ):
     if backend in ASYNC_REQUEST_FUNCS:
         request_func = ASYNC_REQUEST_FUNCS[backend]
@@ -1693,14 +1796,28 @@ async def benchmark(
 
     time.sleep(1.0)
 
+    # Build profile URLs for PD separated mode (do this once at the beginning)
+    pd_profile_urls = []
+    if profile and pd_separated:
+        pd_profile_urls = _build_profile_urls(profile_prefill_url, profile_decode_url)
+        if not pd_profile_urls:
+            print(
+                "Warning: PD separated mode requires --profile-prefill-url or --profile-decode-url"
+            )
+            print("Skipping profiler start. Please specify worker URLs for profiling.")
+
     # Start profiler
     if profile:
-        print("Starting profiler...")
-        profile_output = await async_request_profile(
-            api_url=base_url + "/start_profile"
-        )
-        if profile_output.success:
-            print("Profiler started")
+        if pd_separated:
+            if pd_profile_urls:
+                await _call_profile_pd(pd_profile_urls, "start")
+        else:
+            print("Starting profiler...")
+            profile_output = await async_request_profile(
+                api_url=base_url + "/start_profile"
+            )
+            if profile_output.success:
+                print("Profiler started")
 
     # Run all requests
     benchmark_start_time = time.perf_counter()
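Note: `_build_profile_urls` and `_call_profile_pd` are helpers added elsewhere in this file; their bodies are not shown in this hunk. A hypothetical sketch of the fan-out they imply, assuming each worker URL is paired with its role before the per-worker profile requests are issued (names and shape are guesses, not the actual implementation):

    from typing import List, Optional, Tuple


    def build_profile_urls_sketch(
        profile_prefill_url: Optional[List[str]],
        profile_decode_url: Optional[List[str]],
    ) -> List[Tuple[str, str]]:
        # Tag every worker URL with its role so start/stop can be fanned out.
        urls: List[Tuple[str, str]] = []
        for url in profile_prefill_url or []:
            urls.append(("prefill", url))
        for url in profile_decode_url or []:
            urls.append(("decode", url))
        return urls


    # e.g. POST {url}/start_profile (or /stop_profile) for each pair
    print(build_profile_urls_sketch(["http://127.0.0.1:30000"], ["http://127.0.0.1:30001"]))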
@@ -1749,23 +1866,37 @@ async def benchmark(
 
     # Stop profiler
     if profile:
-        print("Stopping profiler...")
-        profile_output = await async_request_profile(api_url=base_url + "/stop_profile")
-        if profile_output.success:
-            print("Profiler stopped")
+        if pd_separated:
+            if pd_profile_urls:
+                await _call_profile_pd(pd_profile_urls, "stop")
+        else:
+            print("Stopping profiler...")
+            profile_output = await async_request_profile(
+                api_url=base_url + "/stop_profile"
+            )
+            if profile_output.success:
+                print("Profiler stopped")
 
     if pbar is not None:
         pbar.close()
 
     if "sglang" in backend:
-        server_info = requests.get(base_url + "/get_server_info")
+        server_info = requests.get(
+            base_url + "/get_server_info", headers=get_auth_headers()
+        )
         if server_info.status_code == 200:
             server_info_json = server_info.json()
             if "decode" in server_info_json:
                 server_info_json = server_info_json["decode"][0]
-            accept_length = server_info_json["internal_states"][0].get(
-                "avg_spec_accept_length", None
-            )
+            if (
+                "internal_states" in server_info_json
+                and server_info_json["internal_states"]
+            ):
+                accept_length = server_info_json["internal_states"][0].get(
+                    "avg_spec_accept_length", None
+                )
+            else:
+                accept_length = None
         else:
             accept_length = None
     else:
@@ -1797,6 +1928,10 @@ async def benchmark(
     print("{:<40} {:<10}".format("Successful requests:", metrics.completed))
     print("{:<40} {:<10.2f}".format("Benchmark duration (s):", benchmark_duration))
     print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input))
+    print("{:<40} {:<10}".format("Total input text tokens:", metrics.total_input_text))
+    print(
+        "{:<40} {:<10}".format("Total input vision tokens:", metrics.total_input_vision)
+    )
     print("{:<40} {:<10}".format("Total generated tokens:", metrics.total_output))
     print(
         "{:<40} {:<10}".format(
@@ -1866,6 +2001,8 @@ async def benchmark(
             "duration": benchmark_duration,
             "completed": metrics.completed,
             "total_input_tokens": metrics.total_input,
+            "total_input_text_tokens": metrics.total_input_text,
+            "total_input_vision_tokens": metrics.total_input_vision,
             "total_output_tokens": metrics.total_output,
             "total_output_tokens_retokenized": metrics.total_output_retokenized,
             "request_throughput": metrics.request_throughput,
@@ -1900,11 +2037,11 @@ async def benchmark(
         output_file_name = args.output_file
     else:
         now = datetime.now().strftime("%m%d")
-        if args.dataset_name == "random-image":
+        if args.dataset_name == "image":
             output_file_name = (
                 f"{args.backend}_{now}_{args.num_prompts}_{args.random_input_len}_"
-                f"{args.random_output_len}_{args.random_image_count}imgs_"
-                f"{args.random_image_resolution}.jsonl"
+                f"{args.random_output_len}_{args.image_count}imgs_"
+                f"{args.image_resolution}.jsonl"
             )
         elif args.dataset_name.startswith("random"):
             output_file_name = f"{args.backend}_{now}_{args.num_prompts}_{args.random_input_len}_{args.random_output_len}.jsonl"
@@ -2080,6 +2217,12 @@ def run_benchmark(args_: argparse.Namespace):
             "Because when the tokenizer counts the output tokens, if there is gibberish, it might count incorrectly.\n"
         )
 
+    if args.dataset_name in ["image", "mmmu"]:
+        args.apply_chat_template = True
+        assert (
+            not args.tokenize_prompt
+        ), "`--tokenize-prompt` not compatible with image dataset"
+
     print(f"{args}\n")
 
     # Read dataset
@@ -2087,7 +2230,7 @@ def run_benchmark(args_: argparse.Namespace):
     model_id = args.model
     tokenizer_id = args.tokenizer if args.tokenizer is not None else args.model
     tokenizer = get_tokenizer(tokenizer_id)
-    input_requests = get_dataset(args, tokenizer)
+    input_requests = get_dataset(args, tokenizer, model_id)
 
     # compatible with SimpleNamespace
     if not hasattr(args, "flush_cache"):
@@ -2113,6 +2256,8 @@ def run_benchmark(args_: argparse.Namespace):
             use_trace_timestamps=args.use_trace_timestamps,
             mooncake_slowdown_factor=args.mooncake_slowdown_factor,
             mooncake_num_rounds=args.mooncake_num_rounds,
+            profile_prefill_url=getattr(args, "profile_prefill_url", None),
+            profile_decode_url=getattr(args, "profile_decode_url", None),
         )
     )
@@ -2168,7 +2313,7 @@ if __name__ == "__main__":
             "random-ids",
             "generated-shared-prefix",
             "mmmu",
-            "random-image",
+            "image",
             "mooncake",
         ],
         help="Name of the dataset to benchmark on.",
```diff
@@ -2208,37 +2353,49 @@ if __name__ == "__main__":
         "--random-input-len",
         type=int,
         default=1024,
-        help="Number of input tokens per request, used only for random dataset.",
+        help="Number of input tokens per request, used only for random and image dataset.",
     )
     parser.add_argument(
         "--random-output-len",
         default=1024,
         type=int,
-        help="Number of output tokens per request, used only for random dataset.",
+        help="Number of output tokens per request, used only for random and image dataset.",
     )
     parser.add_argument(
         "--random-range-ratio",
         type=float,
         default=0.0,
         help="Range of sampled ratio of input/output length, "
-        "used only for random dataset.",
+        "used only for random and image dataset.",
     )
-    # random-image dataset args
+    # image dataset args
     parser.add_argument(
-        "--random-image-count",
+        "--image-count",
         type=int,
         default=1,
-        help="Number of images per request (only available with the random-image dataset)",
+        help="Number of images per request (only available with the image dataset)",
     )
     parser.add_argument(
-        "--random-image-resolution",
+        "--image-resolution",
         type=str,
         default="1080p",
         help=(
-            "Resolution of random images for random-image dataset. "
+            "Resolution of images for image dataset. "
             "Supports presets 4k/1080p/720p/360p or custom 'heightxwidth' (e.g., 1080x1920)."
         ),
     )
+    parser.add_argument(
+        "--image-format",
+        type=str,
+        default="jpeg",
+        help=("Format of images for image dataset. " "Supports jpeg and png."),
+    )
+    parser.add_argument(
+        "--image-content",
+        type=str,
+        default="random",
+        help=("Content for images for image dataset. " "Supports random and blank."),
+    )
     parser.add_argument(
         "--request-rate",
         type=float,
```
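The `--image-resolution` help promises presets plus a custom `'heightxwidth'` form. A sketch of how such a value can be normalized; the preset pixel table and the `parse_resolution` helper below are illustrative assumptions, not the actual `bench_serving` implementation:

```python
# Assumed preset dimensions (height, width); not taken from the source.
_RESOLUTION_PRESETS = {
    "4k": (2160, 3840),
    "1080p": (1080, 1920),
    "720p": (720, 1280),
    "360p": (360, 640),
}

def parse_resolution(value: str) -> tuple[int, int]:
    """Return (height, width) from a preset name or a 'heightxwidth' string."""
    if value in _RESOLUTION_PRESETS:
        return _RESOLUTION_PRESETS[value]
    height, width = value.lower().split("x", 1)  # e.g. "1080x1920"
    return int(height), int(width)

print(parse_resolution("720p"))       # (720, 1280)
print(parse_resolution("1080x1920"))  # (1080, 1920)
```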
```diff
@@ -2326,6 +2483,30 @@ if __name__ == "__main__":
         action="store_true",
         help="Benchmark PD disaggregation server",
     )
+
+    # Create a mutually exclusive group for profiling URLs
+    # In PD separated mode, prefill and decode workers must be profiled separately
+    profile_url_group = parser.add_mutually_exclusive_group()
+    profile_url_group.add_argument(
+        "--profile-prefill-url",
+        type=str,
+        nargs="*",
+        default=None,
+        help="URL(s) of the prefill worker(s) for profiling in PD separated mode. "
+        "Can specify multiple URLs: --profile-prefill-url http://localhost:30000 http://localhost:30001. "
+        "NOTE: Cannot be used together with --profile-decode-url. "
+        "In PD separated mode, prefill and decode workers must be profiled separately.",
+    )
+    profile_url_group.add_argument(
+        "--profile-decode-url",
+        type=str,
+        nargs="*",
+        default=None,
+        help="URL(s) of the decode worker(s) for profiling in PD separated mode. "
+        "Can specify multiple URLs: --profile-decode-url http://localhost:30010 http://localhost:30011. "
+        "NOTE: Cannot be used together with --profile-prefill-url. "
+        "In PD separated mode, prefill and decode workers must be profiled separately.",
+    )
     parser.add_argument(
         "--flush-cache",
         action="store_true",
```
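The mutual exclusion is enforced by argparse itself, so no extra validation code is needed in `run_benchmark`. A minimal stdlib-only reproduction of the two flags as declared above:

```python
import argparse

# Reproduces the mutually exclusive profiling flags: argparse rejects any
# command line that supplies both options.
parser = argparse.ArgumentParser()
group = parser.add_mutually_exclusive_group()
group.add_argument("--profile-prefill-url", type=str, nargs="*", default=None)
group.add_argument("--profile-decode-url", type=str, nargs="*", default=None)

ok = parser.parse_args(
    ["--profile-prefill-url", "http://localhost:30000", "http://localhost:30001"]
)
print(ok.profile_prefill_url)  # ['http://localhost:30000', 'http://localhost:30001']

# Supplying both flags would make argparse exit with an error like:
# "argument --profile-decode-url: not allowed with argument --profile-prefill-url"
```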