sglang-0.5.2rc2-py3-none-any.whl → sglang-0.5.3.post1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +7 -11
- sglang/bench_one_batch_server.py +330 -31
- sglang/bench_serving.py +474 -142
- sglang/compile_deep_gemm.py +3 -0
- sglang/global_config.py +2 -2
- sglang/lang/backend/runtime_endpoint.py +1 -1
- sglang/profiler.py +2 -2
- sglang/srt/batch_invariant_ops/__init__.py +27 -0
- sglang/srt/batch_invariant_ops/batch_invariant_ops.py +549 -0
- sglang/srt/configs/__init__.py +10 -0
- sglang/srt/configs/device_config.py +3 -1
- sglang/srt/configs/dots_ocr.py +64 -0
- sglang/srt/configs/dots_vlm.py +139 -0
- sglang/srt/configs/falcon_h1.py +314 -0
- sglang/srt/configs/load_config.py +9 -0
- sglang/srt/configs/mamba_utils.py +117 -0
- sglang/srt/configs/model_config.py +228 -92
- sglang/srt/configs/nemotron_h.py +286 -0
- sglang/srt/configs/qwen3_next.py +294 -0
- sglang/srt/configs/qwen3_vl.py +586 -0
- sglang/srt/connector/__init__.py +8 -1
- sglang/srt/connector/remote_instance.py +82 -0
- sglang/srt/constrained/base_grammar_backend.py +49 -12
- sglang/srt/constrained/llguidance_backend.py +0 -1
- sglang/srt/constrained/outlines_backend.py +0 -1
- sglang/srt/constrained/outlines_jump_forward.py +1 -1
- sglang/srt/constrained/xgrammar_backend.py +30 -9
- sglang/srt/custom_op.py +11 -1
- sglang/srt/debug_utils/dump_comparator.py +81 -44
- sglang/srt/debug_utils/dump_loader.py +97 -0
- sglang/srt/debug_utils/dumper.py +21 -6
- sglang/srt/debug_utils/text_comparator.py +73 -11
- sglang/srt/disaggregation/ascend/conn.py +2 -2
- sglang/srt/disaggregation/ascend/transfer_engine.py +47 -9
- sglang/srt/disaggregation/base/conn.py +1 -1
- sglang/srt/disaggregation/common/conn.py +279 -108
- sglang/srt/disaggregation/decode.py +78 -37
- sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +29 -17
- sglang/srt/disaggregation/fake/conn.py +1 -1
- sglang/srt/disaggregation/mini_lb.py +6 -445
- sglang/srt/disaggregation/mooncake/conn.py +55 -537
- sglang/srt/disaggregation/nixl/conn.py +373 -68
- sglang/srt/disaggregation/prefill.py +53 -49
- sglang/srt/disaggregation/utils.py +40 -54
- sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
- sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
- sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
- sglang/srt/distributed/parallel_state.py +156 -80
- sglang/srt/entrypoints/engine.py +59 -18
- sglang/srt/entrypoints/grpc_request_manager.py +842 -0
- sglang/srt/entrypoints/grpc_server.py +950 -0
- sglang/srt/entrypoints/http_server.py +179 -60
- sglang/srt/entrypoints/openai/protocol.py +265 -29
- sglang/srt/entrypoints/openai/serving_base.py +65 -3
- sglang/srt/entrypoints/openai/serving_chat.py +213 -122
- sglang/srt/entrypoints/openai/serving_completions.py +14 -3
- sglang/srt/entrypoints/openai/serving_embedding.py +9 -3
- sglang/srt/entrypoints/openai/serving_rerank.py +3 -1
- sglang/srt/entrypoints/openai/serving_responses.py +48 -3
- sglang/srt/entrypoints/openai/serving_score.py +1 -0
- sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
- sglang/srt/environ.py +289 -0
- sglang/srt/eplb/eplb_manager.py +2 -2
- sglang/srt/eplb/expert_distribution.py +26 -13
- sglang/srt/eplb/expert_location.py +38 -8
- sglang/srt/eplb/expert_location_updater.py +1 -1
- sglang/srt/function_call/base_format_detector.py +3 -6
- sglang/srt/function_call/ebnf_composer.py +11 -9
- sglang/srt/function_call/function_call_parser.py +17 -8
- sglang/srt/function_call/glm4_moe_detector.py +4 -4
- sglang/srt/function_call/gpt_oss_detector.py +23 -0
- sglang/srt/function_call/json_array_parser.py +63 -0
- sglang/srt/function_call/kimik2_detector.py +17 -4
- sglang/srt/function_call/qwen3_coder_detector.py +1 -1
- sglang/srt/function_call/utils.py +96 -5
- sglang/srt/grpc/__init__.py +1 -0
- sglang/srt/grpc/compile_proto.py +245 -0
- sglang/srt/grpc/sglang_scheduler_pb2.py +119 -0
- sglang/srt/grpc/sglang_scheduler_pb2.pyi +492 -0
- sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +327 -0
- sglang/srt/layers/activation.py +143 -9
- sglang/srt/layers/attention/aiter_backend.py +14 -15
- sglang/srt/layers/attention/ascend_backend.py +115 -9
- sglang/srt/layers/attention/attention_registry.py +215 -0
- sglang/srt/layers/attention/base_attn_backend.py +12 -3
- sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
- sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
- sglang/srt/layers/attention/fla/chunk.py +242 -0
- sglang/srt/layers/attention/fla/chunk_delta_h.py +314 -0
- sglang/srt/layers/attention/fla/chunk_o.py +178 -0
- sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +151 -0
- sglang/srt/layers/attention/fla/cumsum.py +300 -0
- sglang/srt/layers/attention/fla/fused_recurrent.py +640 -0
- sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +232 -0
- sglang/srt/layers/attention/fla/index.py +37 -0
- sglang/srt/layers/attention/fla/l2norm.py +150 -0
- sglang/srt/layers/attention/fla/layernorm_gated.py +343 -0
- sglang/srt/layers/attention/fla/op.py +66 -0
- sglang/srt/layers/attention/fla/solve_tril.py +465 -0
- sglang/srt/layers/attention/fla/utils.py +331 -0
- sglang/srt/layers/attention/fla/wy_fast.py +158 -0
- sglang/srt/layers/attention/flashattention_backend.py +40 -8
- sglang/srt/layers/attention/flashinfer_backend.py +341 -204
- sglang/srt/layers/attention/flashinfer_mla_backend.py +28 -28
- sglang/srt/layers/attention/flashmla_backend.py +7 -5
- sglang/srt/layers/attention/hybrid_attn_backend.py +68 -53
- sglang/srt/layers/attention/hybrid_linear_attn_backend.py +708 -0
- sglang/srt/layers/attention/intel_amx_backend.py +3 -0
- sglang/srt/layers/attention/mamba/causal_conv1d.py +129 -0
- sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +974 -0
- sglang/srt/layers/attention/mamba/mamba.py +577 -0
- sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
- sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
- sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
- sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
- sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
- sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +214 -0
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +562 -0
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +646 -0
- sglang/srt/layers/attention/mamba/ops/ssd_combined.py +262 -0
- sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +264 -0
- sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
- sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
- sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
- sglang/srt/layers/attention/nsa/nsa_indexer.py +761 -0
- sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
- sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
- sglang/srt/layers/attention/nsa/transform_index.py +144 -0
- sglang/srt/layers/attention/nsa/utils.py +24 -0
- sglang/srt/layers/attention/nsa_backend.py +887 -0
- sglang/srt/layers/attention/tbo_backend.py +6 -6
- sglang/srt/layers/attention/torch_flex_backend.py +325 -0
- sglang/srt/layers/attention/torch_native_backend.py +12 -6
- sglang/srt/layers/attention/triton_backend.py +57 -7
- sglang/srt/layers/attention/trtllm_mha_backend.py +5 -7
- sglang/srt/layers/attention/trtllm_mla_backend.py +276 -39
- sglang/srt/layers/attention/vision.py +58 -0
- sglang/srt/layers/attention/wave_backend.py +4 -4
- sglang/srt/layers/attention/wave_ops/decode_attention.py +2 -4
- sglang/srt/layers/attention/wave_ops/extend_attention.py +1 -3
- sglang/srt/layers/communicator.py +8 -0
- sglang/srt/layers/dp_attention.py +41 -2
- sglang/srt/layers/elementwise.py +3 -1
- sglang/srt/layers/layernorm.py +34 -15
- sglang/srt/layers/linear.py +55 -7
- sglang/srt/layers/logits_processor.py +180 -18
- sglang/srt/layers/modelopt_utils.py +11 -0
- sglang/srt/layers/moe/__init__.py +2 -1
- sglang/srt/layers/moe/cutlass_w4a8_moe.py +21 -24
- sglang/srt/layers/moe/ep_moe/kernels.py +33 -454
- sglang/srt/layers/moe/ep_moe/layer.py +248 -333
- sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +183 -0
- sglang/srt/layers/moe/fused_moe_native.py +5 -3
- sglang/srt/layers/moe/fused_moe_triton/configs/{triton_3_4_0/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +35 -35
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=352,device_name=NVIDIA_RTX_5880_Ada_Generation,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=512,device_name=NVIDIA_H20.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -2
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +7 -3
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +23 -20
- sglang/srt/layers/moe/fused_moe_triton/layer.py +68 -72
- sglang/srt/layers/moe/moe_runner/__init__.py +2 -1
- sglang/srt/layers/moe/moe_runner/base.py +274 -1
- sglang/srt/layers/moe/moe_runner/deep_gemm.py +304 -0
- sglang/srt/layers/moe/moe_runner/runner.py +83 -0
- sglang/srt/layers/moe/moe_runner/triton.py +448 -0
- sglang/srt/layers/moe/token_dispatcher/__init__.py +16 -4
- sglang/srt/layers/moe/token_dispatcher/{base_dispatcher.py → base.py} +67 -17
- sglang/srt/layers/moe/token_dispatcher/deepep.py +118 -56
- sglang/srt/layers/moe/token_dispatcher/standard.py +44 -2
- sglang/srt/layers/moe/topk.py +30 -9
- sglang/srt/layers/moe/utils.py +29 -7
- sglang/srt/layers/parameter.py +23 -6
- sglang/srt/layers/quantization/__init__.py +1 -1
- sglang/srt/layers/quantization/awq.py +19 -7
- sglang/srt/layers/quantization/base_config.py +11 -6
- sglang/srt/layers/quantization/blockwise_int8.py +38 -27
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +1 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +50 -30
- sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +2 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +13 -1
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
- sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +2 -10
- sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +27 -0
- sglang/srt/layers/quantization/fp8.py +155 -60
- sglang/srt/layers/quantization/fp8_utils.py +51 -32
- sglang/srt/layers/quantization/gptq.py +25 -17
- sglang/srt/layers/quantization/modelopt_quant.py +191 -56
- sglang/srt/layers/quantization/moe_wna16.py +21 -18
- sglang/srt/layers/quantization/mxfp4.py +74 -42
- sglang/srt/layers/quantization/quark/quark.py +3 -1
- sglang/srt/layers/quantization/quark/quark_moe.py +48 -30
- sglang/srt/layers/quantization/unquant.py +135 -47
- sglang/srt/layers/quantization/w4afp8.py +28 -33
- sglang/srt/layers/quantization/w8a8_fp8.py +35 -20
- sglang/srt/layers/quantization/w8a8_int8.py +91 -41
- sglang/srt/layers/rotary_embedding.py +78 -31
- sglang/srt/layers/sampler.py +213 -21
- sglang/srt/layers/utils.py +23 -0
- sglang/srt/lora/backend/base_backend.py +50 -8
- sglang/srt/lora/backend/chunked_backend.py +348 -0
- sglang/srt/lora/backend/triton_backend.py +99 -5
- sglang/srt/lora/layers.py +32 -0
- sglang/srt/lora/lora.py +8 -3
- sglang/srt/lora/lora_manager.py +44 -118
- sglang/srt/lora/mem_pool.py +25 -11
- sglang/srt/lora/triton_ops/__init__.py +4 -0
- sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
- sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +174 -0
- sglang/srt/lora/utils.py +22 -11
- sglang/srt/managers/async_dynamic_batch_tokenizer.py +170 -0
- sglang/srt/managers/cache_controller.py +199 -301
- sglang/srt/managers/data_parallel_controller.py +115 -80
- sglang/srt/managers/detokenizer_manager.py +19 -15
- sglang/srt/managers/disagg_service.py +46 -0
- sglang/srt/managers/io_struct.py +340 -109
- sglang/srt/managers/mm_utils.py +44 -6
- sglang/srt/managers/multi_tokenizer_mixin.py +357 -407
- sglang/srt/managers/multimodal_processor.py +1 -2
- sglang/srt/managers/overlap_utils.py +55 -0
- sglang/srt/managers/schedule_batch.py +343 -212
- sglang/srt/managers/schedule_policy.py +145 -18
- sglang/srt/managers/scheduler.py +653 -273
- sglang/srt/managers/scheduler_input_blocker.py +1 -1
- sglang/srt/managers/scheduler_metrics_mixin.py +99 -126
- sglang/srt/managers/scheduler_output_processor_mixin.py +255 -108
- sglang/srt/managers/scheduler_profiler_mixin.py +6 -6
- sglang/srt/managers/scheduler_update_weights_mixin.py +7 -0
- sglang/srt/managers/tokenizer_communicator_mixin.py +675 -0
- sglang/srt/managers/tokenizer_manager.py +579 -674
- sglang/srt/managers/tp_worker.py +96 -26
- sglang/srt/managers/utils.py +1 -45
- sglang/srt/mem_cache/allocator.py +21 -22
- sglang/srt/mem_cache/allocator_ascend.py +41 -27
- sglang/srt/mem_cache/base_prefix_cache.py +1 -1
- sglang/srt/mem_cache/chunk_cache.py +9 -2
- sglang/srt/mem_cache/evict_policy.py +23 -0
- sglang/srt/mem_cache/hicache_storage.py +43 -24
- sglang/srt/mem_cache/hiradix_cache.py +222 -75
- sglang/srt/mem_cache/memory_pool.py +651 -80
- sglang/srt/mem_cache/memory_pool_host.py +239 -228
- sglang/srt/mem_cache/radix_cache.py +227 -73
- sglang/srt/mem_cache/radix_cache_cpp.py +11 -8
- sglang/srt/mem_cache/storage/__init__.py +10 -0
- sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +151 -0
- sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +109 -0
- sglang/srt/mem_cache/storage/backend_factory.py +223 -0
- sglang/srt/mem_cache/storage/eic/eic_storage.py +778 -0
- sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
- sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +164 -0
- sglang/srt/mem_cache/storage/hf3fs/{client_hf3fs.py → hf3fs_usrbio_client.py} +5 -1
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +259 -62
- sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +284 -0
- sglang/srt/mem_cache/storage/lmcache/unit_test.py +121 -0
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +166 -17
- sglang/srt/mem_cache/swa_radix_cache.py +93 -48
- sglang/srt/metrics/collector.py +511 -132
- sglang/srt/metrics/func_timer.py +2 -7
- sglang/srt/metrics/startup_func_log_and_timer.py +150 -0
- sglang/srt/metrics/utils.py +8 -1
- sglang/srt/model_executor/cpu_graph_runner.py +640 -0
- sglang/srt/model_executor/cuda_graph_runner.py +52 -37
- sglang/srt/model_executor/forward_batch_info.py +74 -46
- sglang/srt/model_executor/model_runner.py +455 -176
- sglang/srt/model_executor/npu_graph_runner.py +12 -5
- sglang/srt/model_loader/__init__.py +10 -4
- sglang/srt/model_loader/loader.py +319 -10
- sglang/srt/model_loader/remote_instance_weight_loader_utils.py +69 -0
- sglang/srt/model_loader/weight_utils.py +161 -3
- sglang/srt/models/apertus.py +686 -0
- sglang/srt/models/bailing_moe.py +820 -217
- sglang/srt/models/bailing_moe_nextn.py +168 -0
- sglang/srt/models/deepseek_nextn.py +6 -1
- sglang/srt/models/deepseek_v2.py +607 -130
- sglang/srt/models/dots_ocr.py +173 -0
- sglang/srt/models/dots_vlm.py +174 -0
- sglang/srt/models/dots_vlm_vit.py +337 -0
- sglang/srt/models/ernie4.py +1 -1
- sglang/srt/models/falcon_h1.py +578 -0
- sglang/srt/models/gemma3_causal.py +0 -2
- sglang/srt/models/gemma3_mm.py +17 -1
- sglang/srt/models/gemma3n_mm.py +2 -2
- sglang/srt/models/glm4_moe.py +4 -4
- sglang/srt/models/glm4_moe_nextn.py +2 -2
- sglang/srt/models/glm4v.py +5 -3
- sglang/srt/models/glm4v_moe.py +4 -1
- sglang/srt/models/gpt_oss.py +8 -31
- sglang/srt/models/grok.py +5 -13
- sglang/srt/models/kimi_vl_moonvit.py +2 -2
- sglang/srt/models/llama.py +4 -0
- sglang/srt/models/llama4.py +9 -0
- sglang/srt/models/llama_eagle3.py +13 -0
- sglang/srt/models/longcat_flash.py +3 -3
- sglang/srt/models/longcat_flash_nextn.py +1 -1
- sglang/srt/models/mixtral.py +1 -3
- sglang/srt/models/mllama4.py +50 -4
- sglang/srt/models/nemotron_h.py +514 -0
- sglang/srt/models/opt.py +637 -0
- sglang/srt/models/qwen2_5_vl.py +29 -5
- sglang/srt/models/qwen2_audio.py +1 -1
- sglang/srt/models/qwen2_moe.py +120 -13
- sglang/srt/models/qwen2_vl.py +1 -1
- sglang/srt/models/qwen3.py +18 -3
- sglang/srt/models/qwen3_moe.py +32 -4
- sglang/srt/models/qwen3_next.py +1069 -0
- sglang/srt/models/qwen3_next_mtp.py +112 -0
- sglang/srt/models/qwen3_vl.py +787 -0
- sglang/srt/models/qwen3_vl_moe.py +471 -0
- sglang/srt/models/registry.py +15 -3
- sglang/srt/models/sarashina2_vision.py +269 -0
- sglang/srt/models/solar.py +505 -0
- sglang/srt/models/starcoder2.py +357 -0
- sglang/srt/models/step3_vl.py +1 -1
- sglang/srt/models/torch_native_llama.py +9 -2
- sglang/srt/models/utils.py +55 -0
- sglang/srt/multimodal/processors/base_processor.py +15 -7
- sglang/srt/multimodal/processors/dots_vlm.py +98 -0
- sglang/srt/multimodal/processors/glm4v.py +9 -9
- sglang/srt/multimodal/processors/internvl.py +153 -129
- sglang/srt/multimodal/processors/qwen_vl.py +23 -6
- sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
- sglang/srt/offloader.py +27 -3
- sglang/srt/parser/jinja_template_utils.py +6 -0
- sglang/srt/sampling/sampling_batch_info.py +49 -26
- sglang/srt/sampling/sampling_params.py +7 -0
- sglang/srt/server_args.py +1051 -285
- sglang/srt/server_args_config_parser.py +146 -0
- sglang/srt/single_batch_overlap.py +151 -0
- sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
- sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
- sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
- sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
- sglang/srt/speculative/cpp_ngram/param.h +125 -0
- sglang/srt/speculative/cpp_ngram/queue.h +71 -0
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +7 -1
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +13 -2
- sglang/srt/speculative/{eagle_utils.py → eagle_info.py} +207 -757
- sglang/srt/speculative/eagle_worker.py +98 -29
- sglang/srt/speculative/ngram_info.py +428 -0
- sglang/srt/speculative/ngram_worker.py +246 -0
- sglang/srt/speculative/spec_info.py +52 -0
- sglang/srt/speculative/spec_utils.py +605 -0
- sglang/srt/speculative/standalone_worker.py +109 -0
- sglang/srt/torch_memory_saver_adapter.py +5 -7
- sglang/srt/tracing/trace.py +578 -0
- sglang/srt/two_batch_overlap.py +9 -5
- sglang/srt/utils/__init__.py +2 -0
- sglang/srt/{utils.py → utils/common.py} +451 -77
- sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +55 -5
- sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
- sglang/srt/utils/rpd_utils.py +452 -0
- sglang/srt/utils/slow_rank_detector.py +71 -0
- sglang/srt/warmup.py +8 -4
- sglang/srt/weight_sync/utils.py +2 -2
- sglang/test/attention/test_trtllm_mla_backend.py +169 -5
- sglang/test/get_logits_ut.py +57 -0
- sglang/test/longbench_v2/__init__.py +1 -0
- sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
- sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
- sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
- sglang/test/run_eval.py +119 -11
- sglang/test/runners.py +5 -1
- sglang/test/simple_eval_common.py +5 -2
- sglang/test/simple_eval_longbench_v2.py +332 -0
- sglang/test/simple_eval_mmmu_vlm.py +441 -0
- sglang/test/test_block_fp8.py +2 -2
- sglang/test/test_cutlass_moe.py +24 -6
- sglang/test/test_cutlass_w4a8_moe.py +9 -19
- sglang/test/test_deterministic.py +313 -0
- sglang/test/test_deterministic_utils.py +81 -0
- sglang/test/test_disaggregation_utils.py +140 -0
- sglang/test/test_fp4_moe.py +370 -1
- sglang/test/test_programs.py +1 -1
- sglang/test/test_utils.py +407 -8
- sglang/utils.py +21 -1
- sglang/version.py +1 -1
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/METADATA +69 -124
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/RECORD +392 -251
- sglang/srt/disaggregation/launch_lb.py +0 -118
- sglang/srt/managers/tp_worker_overlap_thread.py +0 -296
- sglang/srt/mem_cache/lora_radix_cache.py +0 -421
- sglang/test/test_block_fp8_ep.py +0 -358
- /sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/WHEEL +0 -0
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/top_level.txt +0 -0
sglang/bench_serving.py
CHANGED
@@ -35,6 +35,7 @@ import numpy as np
 import requests
 from tqdm.asyncio import tqdm
 from transformers import (
+    AutoProcessor,
     AutoTokenizer,
     PreTrainedTokenizer,
     PreTrainedTokenizerBase,
@@ -75,6 +76,7 @@ class RequestFuncInput:
     lora_name: str
     image_data: Optional[List[str]]
     extra_request_body: Dict[str, Any]
+    timestamp: Optional[float] = None


 @dataclass
@@ -104,10 +106,13 @@ def remove_suffix(text: str, suffix: str) -> str:


 def get_auth_headers() -> Dict[str, str]:
-
-    if
-        return {"Authorization": f"Bearer {
+    openai_api_key = os.environ.get("OPENAI_API_KEY")
+    if openai_api_key:
+        return {"Authorization": f"Bearer {openai_api_key}"}
     else:
+        api_key = os.environ.get("API_KEY")
+        if api_key:
+            return {"Authorization": f"{api_key}"}
         return {}


@@ -204,6 +209,15 @@ async def async_request_openai_completions(
         "ignore_eos": not args.disable_ignore_eos,
         **request_func_input.extra_request_body,
     }
+
+    # hack to accommodate different LoRA conventions between SGLang and vLLM.
+    if request_func_input.lora_name:
+        payload["model"] = request_func_input.lora_name
+        payload["lora_path"] = request_func_input.lora_name
+
+    if request_func_input.image_data:
+        payload.update({"image_data": request_func_input.image_data})
+
     headers = get_auth_headers()

     output = RequestFuncOutput.init_new(request_func_input)
@@ -314,10 +328,17 @@ async def async_request_openai_chat_completions(
         "model": request_func_input.model,
         "messages": messages,
         "temperature": 0.0,
-        "
+        "max_completion_tokens": request_func_input.output_len,
         "stream": not args.disable_stream,
+        "ignore_eos": not args.disable_ignore_eos,
         **request_func_input.extra_request_body,
     }
+
+    # hack to accommodate different LoRA conventions between SGLang and vLLM.
+    if request_func_input.lora_name:
+        payload["model"] = request_func_input.lora_name
+        payload["lora_path"] = request_func_input.lora_name
+
     headers = get_auth_headers()

     output = RequestFuncOutput.init_new(request_func_input)
@@ -627,7 +648,7 @@ def get_tokenizer(
     if pretrained_model_name_or_path.endswith(
         ".json"
     ) or pretrained_model_name_or_path.endswith(".model"):
-        from sglang.srt.hf_transformers_utils import get_tokenizer
+        from sglang.srt.utils.hf_transformers_utils import get_tokenizer

         return get_tokenizer(pretrained_model_name_or_path)

@@ -640,7 +661,30 @@ def get_tokenizer(
     )


-def
+def get_processor(
+    pretrained_model_name_or_path: str,
+) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
+    assert (
+        pretrained_model_name_or_path is not None
+        and pretrained_model_name_or_path != ""
+    )
+    if pretrained_model_name_or_path.endswith(
+        ".json"
+    ) or pretrained_model_name_or_path.endswith(".model"):
+        from sglang.srt.hf_transformers_utils import get_processor
+
+        return get_processor(pretrained_model_name_or_path)
+
+    if pretrained_model_name_or_path is not None and not os.path.exists(
+        pretrained_model_name_or_path
+    ):
+        pretrained_model_name_or_path = get_model(pretrained_model_name_or_path)
+    return AutoProcessor.from_pretrained(
+        pretrained_model_name_or_path, trust_remote_code=True
+    )
+
+
+def get_dataset(args, tokenizer, model_id=None):
     tokenize_prompt = getattr(args, "tokenize_prompt", False)
     if args.dataset_name == "sharegpt":
         assert not tokenize_prompt
@@ -653,7 +697,7 @@ def get_dataset(args, tokenizer):
             prompt_suffix=args.prompt_suffix,
             apply_chat_template=args.apply_chat_template,
         )
-    elif args.dataset_name.startswith("random")
+    elif args.dataset_name.startswith("random"):
         input_requests = sample_random_requests(
             input_len=args.random_input_len,
             output_len=args.random_output_len,
@@ -664,17 +708,18 @@ def get_dataset(args, tokenizer):
             random_sample=args.dataset_name == "random",
             return_text=not tokenize_prompt,
         )
-    elif args.dataset_name == "
-
-        input_requests =
+    elif args.dataset_name == "image":
+        processor = get_processor(model_id)
+        input_requests = sample_image_requests(
             num_requests=args.num_prompts,
-
+            image_count=args.image_count,
             input_len=args.random_input_len,
             output_len=args.random_output_len,
             range_ratio=args.random_range_ratio,
-
-
-
+            processor=processor,
+            image_content=args.image_content,
+            image_format=args.image_format,
+            image_resolution=args.image_resolution,
         )
     elif args.dataset_name == "generated-shared-prefix":
         assert not tokenize_prompt
@@ -688,14 +733,31 @@ def get_dataset(args, tokenizer):
             args=args,
         )
     elif args.dataset_name == "mmmu":
-
+        processor = get_processor(model_id)
         input_requests = sample_mmmu_requests(
             num_requests=args.num_prompts,
-
+            processor=processor,
             fixed_output_len=args.random_output_len,
-            apply_chat_template=args.apply_chat_template,
             random_sample=True,
         )
+    elif args.dataset_name == "mooncake":
+        # For mooncake, we don't generate the prompts here.
+        # We just load the raw trace data. The async generator will handle the rest.
+        if not args.dataset_path:
+            local_path = os.path.join("/tmp", args.mooncake_workload + "_trace.jsonl")
+        else:
+            local_path = args.dataset_path
+
+        if not os.path.exists(local_path):
+            download_and_cache_file(
+                MOONCAKE_DATASET_URL[args.mooncake_workload], local_path
+            )
+
+        with open(local_path, "r") as f:
+            all_requests_data = [json.loads(line) for line in f if line.strip()]
+
+        # Limit the number of requests based on --num-prompts
+        input_requests = all_requests_data[: args.num_prompts]
     else:
         raise ValueError(f"Unknown dataset: {args.dataset_name}")
     return input_requests
@@ -720,6 +782,8 @@ ASYNC_REQUEST_FUNCS = {
 class BenchmarkMetrics:
     completed: int
     total_input: int
+    total_input_text: int
+    total_input_vision: int
     total_output: int
     total_output_retokenized: int
     request_throughput: float
@@ -750,6 +814,12 @@ class BenchmarkMetrics:


 SHAREGPT_URL = "https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json"
+MOONCAKE_DATASET_URL = {
+    "mooncake": "https://raw.githubusercontent.com/kvcache-ai/Mooncake/main/FAST25-release/arxiv-trace/mooncake_trace.jsonl",
+    "conversation": "https://raw.githubusercontent.com/kvcache-ai/Mooncake/main/FAST25-release/traces/conversation_trace.jsonl",
+    "synthetic": "https://raw.githubusercontent.com/kvcache-ai/Mooncake/main/FAST25-release/traces/synthetic_trace.jsonl",
+    "toolagent": "https://raw.githubusercontent.com/kvcache-ai/Mooncake/main/FAST25-release/traces/toolagent_trace.jsonl",
+}


 def download_and_cache_file(url: str, filename: Optional[str] = None):
@@ -807,14 +877,95 @@ class DatasetRow:
     prompt: str
     prompt_len: int
     output_len: int
+    text_prompt_len: Optional[int] = None
+    vision_prompt_len: Optional[int] = None
     image_data: Optional[List[str]] = None
+    timestamp: Optional[float] = None
+
+    def __post_init__(self):
+        if self.text_prompt_len is None:
+            self.text_prompt_len = self.prompt_len
+        if self.vision_prompt_len is None:
+            self.vision_prompt_len = 0
+
+
+async def get_mooncake_request_over_time(
+    input_requests: List[Dict],
+    tokenizer: PreTrainedTokenizerBase,
+    slowdown_factor: float,
+    num_rounds: int,
+) -> AsyncGenerator[DatasetRow, None]:
+    """
+    An async generator that yields requests based on the timestamps in the Mooncake trace file,
+    with support for multi-round sessions.
+    """
+    if not input_requests:
+        return
+
+    input_requests.sort(key=lambda r: r["timestamp"])
+
+    start_time = time.perf_counter()
+    trace_start_time_ms = input_requests[0]["timestamp"]
+
+    for record in input_requests:
+        # Calculate when this entire session should start
+        relative_arrival_time_s = (record["timestamp"] - trace_start_time_ms) / 1000.0
+        target_arrival_time_s = relative_arrival_time_s * slowdown_factor
+
+        current_elapsed_time_s = time.perf_counter() - start_time
+        sleep_duration_s = target_arrival_time_s - current_elapsed_time_s
+        if sleep_duration_s > 0:
+            await asyncio.sleep(sleep_duration_s)
+
+        # Once the session starts, generate all rounds for it as a burst
+        # This simulates a user engaging in a multi-turn conversation
+
+        # Base user query constructed from hash_ids
+        user_query_base = ""
+        hash_ids = record.get("hash_ids", [])
+        for hash_id in hash_ids:
+            user_query_base += f"{hash_id}" + " ".join(
+                ["hi"] * 128
+            )  # Shorter for multi-round
+        user_query_base += "Tell me a story based on this context."
+
+        output_len_per_round = record.get("output_length", 256)
+        chat_history = []
+
+        for i in range(num_rounds):
+            # Add user query for the current round
+            chat_history.append(
+                {"role": "user", "content": f"Round {i+1}: {user_query_base}"}
+            )
+
+            # Form the full prompt from history
+            try:
+                full_prompt_text = tokenizer.apply_chat_template(
+                    chat_history, tokenize=False, add_generation_prompt=True
+                )
+            except Exception:
+                full_prompt_text = "\n".join(
+                    [f"{msg['role']}: {msg['content']}" for msg in chat_history]
+                )
+
+            prompt_len = len(tokenizer.encode(full_prompt_text))
+
+            yield DatasetRow(
+                prompt=full_prompt_text,
+                prompt_len=prompt_len,
+                output_len=output_len_per_round,
+            )
+
+            # Add a placeholder assistant response for the next round's context
+            # We use a placeholder because we don't know the real response
+            placeholder_response = " ".join(["story"] * output_len_per_round)
+            chat_history.append({"role": "assistant", "content": placeholder_response})


 def sample_mmmu_requests(
     num_requests: int,
-
+    processor: AutoProcessor,
     fixed_output_len: Optional[int] = None,
-    apply_chat_template: bool = True,
     random_sample: bool = True,
 ) -> List[DatasetRow]:
     """
@@ -893,46 +1044,12 @@ def sample_mmmu_requests(
             question = example.get("question")

             # Construct the prompt
-
-            if apply_chat_template:
-                try:
-                    prompt = tokenizer.apply_chat_template(
-                        [
-                            {
-                                "role": "user",
-                                "content": [
-                                    {
-                                        "type": "image_url",
-                                        "image_url": {"url": image_data},
-                                    },
-                                    {"type": "text", "text": prompt},
-                                ],
-                            }
-                        ],
-                        add_generation_prompt=True,
-                        tokenize=False,
-                    )
-                except Exception as e:
-                    # Note (Xinyuan): This is a workaround for an issue where some tokenizers do not support content as a list. (e.g. InternVL)
-                    print(
-                        f"Error applying chat template: {e}, fallback to <image> tag"
-                    )
-                    prompt = f"<image>{prompt}"
-
-            # Calculate token lengths for text only (without image data)
-            prompt_token_ids = tokenizer.encode(prompt)
-            prompt_len = len(prompt_token_ids)
-
+            text_prompt = f"Question: {question}\n\nAnswer: "
             output_len = fixed_output_len if fixed_output_len is not None else 256
-
-
-                DatasetRow(
-                    prompt=prompt,
-                    prompt_len=prompt_len,
-                    output_len=output_len,
-                    image_data=[image_data],
-                )
+            data_row = create_mm_data_row(
+                text_prompt, [image], [image_data], output_len, processor
             )
+            filtered_dataset.append(data_row)

         except Exception as e:
             print(f"Error processing example {i}: {e}")
@@ -1000,7 +1117,8 @@ def sample_sharegpt_requests(
                 add_generation_prompt=True,
                 tokenize=False,
             )
-
+            if tokenizer.bos_token:
+                prompt = prompt.replace(tokenizer.bos_token, "")

         prompt_token_ids = tokenizer.encode(prompt)
         completion = dataset[i][1]
@@ -1019,7 +1137,11 @@ def sample_sharegpt_requests(
             continue

         filtered_dataset.append(
-            DatasetRow(
+            DatasetRow(
+                prompt=prompt,
+                prompt_len=prompt_len,
+                output_len=output_len,
+            )
         )

     print(f"#Input tokens: {np.sum([x.prompt_len for x in filtered_dataset])}")
@@ -1130,7 +1252,7 @@ def sample_random_requests(
     return input_requests


-def
+def parse_image_resolution(image_resolution: str) -> Tuple[int, int]:
     """Parse image resolution into (width, height).

     Supports presets '1080p', '720p', '360p' and custom 'heightxwidth' format
@@ -1155,24 +1277,79 @@ def parse_random_image_resolution(image_resolution: str) -> Tuple[int, int]:
         return (width, height)

     raise ValueError(
-        f"Unsupported
+        f"Unsupported image resolution: {image_resolution}. "
         "Choose from 4k, 1080p, 720p, 360p, or provide custom 'heightxwidth' (e.g., 1080x1920)."
     )


-def
+def create_mm_data_row(text_prompt, images, images_base64, output_len, processor):
+    try:
+        content_items = [
+            {"type": "image_url", "image_url": {"url": img_url}}
+            for img_url in images_base64
+        ]
+        content_items.append({"type": "text", "text": text_prompt})
+        prompt_str = processor.apply_chat_template(
+            [{"role": "user", "content": content_items}],
+            add_generation_prompt=True,
+            tokenize=False,
+        )
+    except Exception:
+        # Some tokenizers do not support list content; fall back to a placeholder in the text
+        prompt_str = f"<image>{text_prompt}"
+
+    # Calculate total tokens (text + vision)
+    prompt_len = processor(
+        text=[prompt_str],
+        images=images,
+        padding=False,
+        return_tensors="pt",
+    )["input_ids"].numel()
+
+    # Calculate text-only tokens
+    try:
+        # Create text-only version of the prompt
+        text_only_prompt = processor.apply_chat_template(
+            [{"role": "user", "content": text_prompt}],
+            add_generation_prompt=True,
+            tokenize=False,
+        )
+        text_prompt_len = processor(
+            text=[text_only_prompt],
+            padding=False,
+            return_tensors="pt",
+        )["input_ids"].numel()
+    except Exception:
+        # Fallback: just tokenize the text prompt directly
+        text_prompt_len = len(processor.tokenizer.encode(text_prompt))
+
+    # Vision tokens = total tokens - text tokens
+    vision_prompt_len = prompt_len - text_prompt_len
+
+    return DatasetRow(
+        prompt=text_prompt,
+        prompt_len=prompt_len,
+        output_len=output_len,
+        text_prompt_len=text_prompt_len,
+        vision_prompt_len=vision_prompt_len,
+        image_data=images_base64,
+    )
+
+
+def sample_image_requests(
     num_requests: int,
-
+    image_count: int,
     input_len: int,
     output_len: int,
     range_ratio: float,
-
-
-
+    processor: AutoProcessor,
+    image_content: str,
+    image_format: str,
+    image_resolution: str,
 ) -> List[DatasetRow]:
-    """Generate requests with
+    """Generate requests with images.

-    - Each request includes ``
+    - Each request includes ``image_count`` images.
     - Supported resolutions: 4k (3840x2160), 1080p (1920x1080), 720p (1280x720), 360p (640x360),
       or custom 'heightxwidth' (e.g., 1080x1920).
     - Text lengths follow the 'random' dataset sampling rule. ``prompt_len``
@@ -1187,12 +1364,12 @@ def sample_random_image_requests(
     ) from e

     # Parse resolution (supports presets and 'heightxwidth')
-    width, height =
+    width, height = parse_image_resolution(image_resolution)

     # Check for potentially problematic combinations and warn user
-    if width * height >= 1920 * 1080 and
+    if width * height >= 1920 * 1080 and image_count * num_requests >= 100:
         warnings.warn(
-            f"High resolution ({width}x{height}) with {
+            f"High resolution ({width}x{height}) with {image_count * num_requests} total images "
             f"may take a long time. Consider reducing resolution or image count.",
             UserWarning,
             stacklevel=2,
@@ -1206,53 +1383,50 @@ def sample_random_image_requests(
         int(output_len * range_ratio), output_len + 1, size=num_requests
     )

-    def _gen_random_image_data_uri(
-
-
+    def _gen_random_image_data_uri(
+        width: int = width, height: int = height
+    ) -> (Image, str, int):
+        if image_content == "blank":
+            # Generate blank white image
+            arr = np.full((height, width, 3), 255, dtype=np.uint8)
+        else:
+            # Generate random colored image
+            arr = (np.random.rand(height, width, 3) * 255).astype(np.uint8)
+        img = Image.fromarray(arr)
         buf = io.BytesIO()
-        img.save(buf, format=
+        img.save(buf, format=image_format, quality=85)
         encoded = pybase64.b64encode(buf.getvalue()).decode("utf-8")
-
+        image_data = f"data:image/{image_format};base64,{encoded}"
+        image_bytes = len(image_data.encode("utf-8"))
+        return img, image_data, image_bytes

     dataset: List[DatasetRow] = []
+    total_image_bytes = 0
     for i in range(num_requests):
         # Generate text prompt
-        text_prompt = gen_prompt(tokenizer, int(input_lens[i]))
+        text_prompt = gen_prompt(processor.tokenizer, int(input_lens[i]))

         # Generate image list
-        images
-
-
-
-
-
-
-
-
-
-
-            [{"role": "user", "content": content_items}],
-            add_generation_prompt=True,
-            tokenize=False,
-        )
-        except Exception:
-            # Some tokenizers do not support list content; fall back to a placeholder in the text
-            prompt_str = f"<image>{text_prompt}"
-
-        prompt_token_ids = tokenizer.encode(prompt_str)
-        prompt_token_len = len(prompt_token_ids)
-
-        dataset.append(
-            DatasetRow(
-                prompt=prompt_str,
-                prompt_len=prompt_token_len,
-                output_len=int(output_lens[i]),
-                image_data=images,
-            )
+        images, images_base64, images_bytes = zip(
+            *[_gen_random_image_data_uri() for _ in range(image_count)]
+        )
+        total_image_bytes += sum(list(images_bytes))
+
+        data_row = create_mm_data_row(
+            text_prompt,
+            list(images),
+            list(images_base64),
+            int(output_lens[i]),
+            processor,
         )

+        dataset.append(data_row)
+
     print(f"#Input tokens: {np.sum([x.prompt_len for x in dataset])}")
     print(f"#Output tokens: {np.sum([x.output_len for x in dataset])}")
+    print(
+        f"\nCreated {len(dataset)} {image_content} {image_format} images with average {total_image_bytes//num_requests} bytes per request"
+    )
     return dataset


@@ -1324,7 +1498,9 @@ def sample_generated_shared_prefix_requests(

         input_requests.append(
             DatasetRow(
-                prompt=full_prompt,
+                prompt=full_prompt,
+                prompt_len=prompt_len,
+                output_len=output_len,
             )
         )
         total_input_tokens += prompt_len
@@ -1359,19 +1535,41 @@ def sample_generated_shared_prefix_requests(
 async def get_request(
     input_requests: List[DatasetRow],
     request_rate: float,
+    use_trace_timestamps: bool = False,
+    slowdown_factor: float = 1.0,
 ) -> AsyncGenerator[DatasetRow, None]:
-
-
-
+    if use_trace_timestamps:
+        print(
+            f"Using trace timestamps for request generation with slowdown factor {slowdown_factor}."
+        )
+        # Sort requests by timestamp for correct replay
+        input_requests.sort(key=lambda r: r.timestamp)

-
-
-            continue
+        start_time = time.perf_counter()
+        trace_start_time_ms = input_requests[0].timestamp if input_requests else 0

-
-
-
-
+        for request in input_requests:
+            trace_time_s = (request.timestamp - trace_start_time_ms) / 1000.0
+            target_arrival_time = start_time + (trace_time_s * slowdown_factor)
+
+            sleep_duration = target_arrival_time - time.perf_counter()
+            if sleep_duration > 0:
+                await asyncio.sleep(sleep_duration)
+
+            yield request
+    else:
+        input_requests_iter = iter(input_requests)
+        for request in input_requests_iter:
+            yield request
+
+            if request_rate == float("inf"):
+                # If the request rate is infinity, then we don't need to wait.
+                continue
+
+            # Sample the request interval from the exponential distribution.
+            interval = np.random.exponential(1.0 / request_rate)
+            # The next request will be sent after the interval.
+            await asyncio.sleep(interval)


 def calculate_metrics(
@@ -1384,6 +1582,8 @@ def calculate_metrics(
     output_lens: List[int] = []
     retokenized_output_lens: List[int] = []
     total_input = 0
+    total_input_text = 0
+    total_input_vision = 0
     completed = 0
     itls: List[float] = []
     tpots: List[float] = []
@@ -1398,6 +1598,8 @@ def calculate_metrics(
             )
             retokenized_output_lens.append(retokenized_output_len)
             total_input += input_requests[i].prompt_len
+            total_input_text += input_requests[i].text_prompt_len
+            total_input_vision += input_requests[i].vision_prompt_len
             if output_len > 1:
                 tpots.append((outputs[i].latency - outputs[i].ttft) / (output_len - 1))
             itls += outputs[i].itl
@@ -1419,6 +1621,8 @@ def calculate_metrics(
     metrics = BenchmarkMetrics(
         completed=completed,
         total_input=total_input,
+        total_input_text=total_input_text,
+        total_input_vision=total_input_vision,
         total_output=sum(output_lens),
         total_output_retokenized=sum(retokenized_output_lens),
         request_throughput=completed / dur_s,
@@ -1469,6 +1673,9 @@ async def benchmark(
     pd_separated: bool = False,
     flush_cache: bool = False,
     warmup_requests: int = 1,
+    use_trace_timestamps: bool = False,
+    mooncake_slowdown_factor=1.0,
+    mooncake_num_rounds=1,
 ):
     if backend in ASYNC_REQUEST_FUNCS:
         request_func = ASYNC_REQUEST_FUNCS[backend]
@@ -1488,8 +1695,32 @@ async def benchmark(
     # Warmup
     print(f"Starting warmup with {warmup_requests} sequences...")

-    #
-
+    # Handle the data structure difference for the warmup request
+    if args.dataset_name == "mooncake":
+        # For mooncake, input_requests is a list of dicts.
+        # We need to build a temporary DatasetRow for the warmup phase.
+        warmup_record = input_requests[0]
+
+        # Build prompt from hash_ids, just like in the async generator
+        hash_ids = warmup_record.get("hash_ids", [])
+        prompt_text = ""
+        for hash_id in hash_ids:
+            prompt_text += f"{hash_id}" + " ".join(["hi"] * 512)
+        prompt_text += "Can you tell me a detailed story in 1000 words?"
+
+        output_len = warmup_record.get("output_length", 32)
+        prompt_len = len(tokenizer.encode(prompt_text))
+
+        # Create a temporary DatasetRow object for warmup
+        test_request = DatasetRow(
+            prompt=prompt_text,
+            prompt_len=prompt_len,
+            output_len=output_len,
+            image_data=None,  # Mooncake doesn't have image data
+        )
+    else:
+        # For all other datasets, input_requests is a list of DatasetRow objects
+        test_request = input_requests[0]

     if lora_names is not None and len(lora_names) != 0:
         lora_name = lora_names[0]
@@ -1543,12 +1774,26 @@ async def benchmark(
     if profile_output.success:
         print("Profiler started")

-    pbar = None if disable_tqdm else tqdm(total=len(input_requests))
-
     # Run all requests
     benchmark_start_time = time.perf_counter()
     tasks: List[asyncio.Task] = []
-
+    pbar_total = len(input_requests)
+    if (
+        backend == "sglang" and args.dataset_name == "mooncake"
+    ):  # Assuming mooncake is mainly for sglang or similar backends
+        print("Using time-based Mooncake request scheduler, ignoring --request-rate.")
+        request_generator = get_mooncake_request_over_time(
+            input_requests, tokenizer, mooncake_slowdown_factor, mooncake_num_rounds
+        )
+        print(
+            f"Starting Mooncake trace replay. Sessions: {len(input_requests)}, Rounds per session: {mooncake_num_rounds}. Slowdown factor: {mooncake_slowdown_factor}"
+        )
+        pbar_total *= args.mooncake_num_rounds
+    else:
+        request_generator = get_request(input_requests, request_rate)
+
+    pbar = None if disable_tqdm else tqdm(total=pbar_total)
+    async for request in request_generator:
         if lora_names is not None and len(lora_names) != 0:
             idx = random.randint(0, len(lora_names) - 1)
             lora_name = lora_names[idx]
@@ -1564,6 +1809,7 @@ async def benchmark(
             lora_name=lora_name,
             image_data=request.image_data,
             extra_request_body=extra_request_body,
+            timestamp=request.timestamp,
         )

         tasks.append(
@@ -1584,14 +1830,22 @@ async def benchmark(
         pbar.close()

     if "sglang" in backend:
-        server_info = requests.get(
+        server_info = requests.get(
+            base_url + "/get_server_info", headers=get_auth_headers()
+        )
         if server_info.status_code == 200:
             server_info_json = server_info.json()
             if "decode" in server_info_json:
                 server_info_json = server_info_json["decode"][0]
-
-            "
-
+            if (
+                "internal_states" in server_info_json
+                and server_info_json["internal_states"]
+            ):
+                accept_length = server_info_json["internal_states"][0].get(
+                    "avg_spec_accept_length", None
+                )
+            else:
+                accept_length = None
         else:
             accept_length = None
     else:
@@ -1609,7 +1863,11 @@ async def benchmark(

     print("\n{s:{c}^{n}}".format(s=" Serving Benchmark Result ", n=50, c="="))
     print("{:<40} {:<10}".format("Backend:", backend))
-    print(
+    print(
+        "{:<40} {:<10}".format(
+            "Traffic request rate:", "trace" if use_trace_timestamps else request_rate
+        )
+    )
     print(
         "{:<40} {:<10}".format(
             "Max request concurrency:",
@@ -1619,6 +1877,10 @@ async def benchmark(
     print("{:<40} {:<10}".format("Successful requests:", metrics.completed))
     print("{:<40} {:<10.2f}".format("Benchmark duration (s):", benchmark_duration))
     print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input))
+    print("{:<40} {:<10}".format("Total input text tokens:", metrics.total_input_text))
+    print(
+        "{:<40} {:<10}".format("Total input vision tokens:", metrics.total_input_vision)
+    )
     print("{:<40} {:<10}".format("Total generated tokens:", metrics.total_output))
     print(
         "{:<40} {:<10}".format(
@@ -1678,7 +1940,7 @@ async def benchmark(
             # Arguments
             "backend": args.backend,
             "dataset_name": args.dataset_name,
-            "request_rate": request_rate,
+            "request_rate": "trace" if use_trace_timestamps else request_rate,
             "max_concurrency": max_concurrency,
             "sharegpt_output_len": args.sharegpt_output_len,
             "random_input_len": args.random_input_len,
@@ -1688,6 +1950,8 @@ async def benchmark(
             "duration": benchmark_duration,
             "completed": metrics.completed,
             "total_input_tokens": metrics.total_input,
+            "total_input_text_tokens": metrics.total_input_text,
+            "total_input_vision_tokens": metrics.total_input_vision,
             "total_output_tokens": metrics.total_output,
             "total_output_tokens_retokenized": metrics.total_output_retokenized,
             "request_throughput": metrics.request_throughput,
@@ -1722,16 +1986,18 @@ async def benchmark(
         output_file_name = args.output_file
     else:
         now = datetime.now().strftime("%m%d")
-        if args.dataset_name == "
+        if args.dataset_name == "image":
             output_file_name = (
                 f"{args.backend}_{now}_{args.num_prompts}_{args.random_input_len}_"
-                f"{args.random_output_len}_{args.
-                f"{args.
+                f"{args.random_output_len}_{args.image_count}imgs_"
+                f"{args.image_resolution}.jsonl"
             )
         elif args.dataset_name.startswith("random"):
             output_file_name = f"{args.backend}_{now}_{args.num_prompts}_{args.random_input_len}_{args.random_output_len}.jsonl"
         else:
-            output_file_name =
+            output_file_name = (
+                f"{args.backend}_{now}_{args.num_prompts}_{args.dataset_name}.jsonl"
+            )

     result_details = {
         "input_lens": [output.prompt_len for output in outputs],
@@ -1786,6 +2052,17 @@ def run_benchmark(args_: argparse.Namespace):
     if not hasattr(args, "tokenize_prompt"):
         args.tokenize_prompt = False

+    if not hasattr(args, "use_trace_timestamps"):
+        args.use_trace_timestamps = False
+    if not hasattr(args, "mooncake_slowdown_factor"):
+        args.mooncake_slowdown_factor = 1.0
+
+    if not hasattr(args, "mooncake_slowdown_factor"):
+        args.mooncake_slowdown_factor = 1.0
+
+    if not hasattr(args, "mooncake_num_rounds"):
+        args.mooncake_num_rounds = 1
+
     print(f"benchmark_args={args}")

     # Set global environments
@@ -1889,6 +2166,12 @@ def run_benchmark(args_: argparse.Namespace):
         "Because when the tokenizer counts the output tokens, if there is gibberish, it might count incorrectly.\n"
     )

+    if args.dataset_name in ["image", "mmmu"]:
+        args.apply_chat_template = True
+        assert (
+            not args.tokenize_prompt
+        ), "`--tokenize-prompt` not compatible with image dataset"
+
     print(f"{args}\n")

     # Read dataset
@@ -1896,7 +2179,7 @@ def run_benchmark(args_: argparse.Namespace):
     model_id = args.model
     tokenizer_id = args.tokenizer if args.tokenizer is not None else args.model
     tokenizer = get_tokenizer(tokenizer_id)
-    input_requests = get_dataset(args, tokenizer)
+    input_requests = get_dataset(args, tokenizer, model_id)

     # compatible with SimpleNamespace
     if not hasattr(args, "flush_cache"):
@@ -1919,6 +2202,9 @@ def run_benchmark(args_: argparse.Namespace):
             pd_separated=args.pd_separated,
             flush_cache=args.flush_cache,
             warmup_requests=args.warmup_requests,
+            use_trace_timestamps=args.use_trace_timestamps,
+            mooncake_slowdown_factor=args.mooncake_slowdown_factor,
+            mooncake_num_rounds=args.mooncake_num_rounds,
         )
     )

@@ -1974,7 +2260,8 @@ if __name__ == "__main__":
             "random-ids",
             "generated-shared-prefix",
             "mmmu",
-            "
+            "image",
+            "mooncake",
         ],
         help="Name of the dataset to benchmark on.",
     )
@@ -2013,37 +2300,49 @@ if __name__ == "__main__":
         "--random-input-len",
         type=int,
         default=1024,
-        help="Number of input tokens per request, used only for random dataset.",
+        help="Number of input tokens per request, used only for random and image dataset.",
    )
     parser.add_argument(
         "--random-output-len",
         default=1024,
         type=int,
-        help="Number of output tokens per request, used only for random dataset.",
+        help="Number of output tokens per request, used only for random and image dataset.",
     )
     parser.add_argument(
         "--random-range-ratio",
         type=float,
         default=0.0,
         help="Range of sampled ratio of input/output length, "
-        "used only for random dataset.",
+        "used only for random and image dataset.",
     )
-    #
+    # image dataset args
     parser.add_argument(
-        "--
+        "--image-count",
         type=int,
         default=1,
-        help="Number of images per request (only available with the
+        help="Number of images per request (only available with the image dataset)",
     )
     parser.add_argument(
-        "--
+        "--image-resolution",
         type=str,
         default="1080p",
         help=(
-            "Resolution of
+            "Resolution of images for image dataset. "
             "Supports presets 4k/1080p/720p/360p or custom 'heightxwidth' (e.g., 1080x1920)."
         ),
     )
+    parser.add_argument(
+        "--image-format",
+        type=str,
+        default="jpeg",
+        help=("Format of images for image dataset. " "Supports jpeg and png."),
+    )
+    parser.add_argument(
+        "--image-content",
+        type=str,
+        default="random",
+        help=("Content for images for image dataset. " "Supports random and blank."),
+    )
     parser.add_argument(
         "--request-rate",
         type=float,
@@ -2051,6 +2350,11 @@ if __name__ == "__main__":
         help="Number of requests per second. If this is inf, then all the requests are sent at time 0. "
         "Otherwise, we use Poisson process to synthesize the request arrival times. Default is inf.",
     )
+    parser.add_argument(
+        "--use-trace-timestamps",
+        action="store_true",
+        help="Use timestamps from the trace file for request scheduling. Only valid for 'mooncake' dataset.",
+    )
     parser.add_argument(
         "--max-concurrency",
         type=int,
@@ -2174,5 +2478,33 @@ if __name__ == "__main__":
         default=256,
         help="Target length in tokens for outputs in generated-shared-prefix dataset",
     )
+    mooncake_group = parser.add_argument_group("mooncake dataset arguments")
+    mooncake_group.add_argument(
+        "--mooncake-slowdown-factor",
+        type=float,
+        default=1.0,
+        help="Slowdown factor for replaying the mooncake trace. "
+        "A value of 2.0 means the replay is twice as slow. "
+        "NOTE: --request-rate is IGNORED in mooncake mode.",
+    )
+    mooncake_group.add_argument(
+        "--mooncake-num-rounds",
+        type=int,
+        default=1,
+        help="Number of conversation rounds for each session in the mooncake dataset. "
+        "A value > 1 will enable true multi-turn session benchmarking.",
+    )
+    mooncake_group.add_argument(
+        "--mooncake-workload",
+        type=str,
+        default="conversation",
+        choices=[
+            "mooncake",
+            "conversation",
+            "synthetic",
+            "toolagent",
+        ],
+        help="Underlying workload for the mooncake dataset.",
+    )
     args = parser.parse_args()
     run_benchmark(args)
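The new "image" and "mooncake" dataset modes added above come with their own CLI flags. The following invocations are an illustrative sketch only: the flag names are taken from the argparse additions in this diff, while the entry point (the usual python3 -m sglang.bench_serving), the running server, and the specific values are assumptions and should be checked against the installed version.

    # Multimodal benchmark with synthetic images (new "image" dataset)
    python3 -m sglang.bench_serving --backend sglang --dataset-name image \
        --num-prompts 100 --image-count 2 --image-resolution 720p \
        --image-format jpeg --image-content random \
        --random-input-len 512 --random-output-len 512

    # Mooncake trace replay with multi-round sessions (new "mooncake" dataset)
    # Note: --request-rate is ignored in this mode; pacing comes from trace timestamps.
    python3 -m sglang.bench_serving --backend sglang --dataset-name mooncake \
        --mooncake-workload conversation --mooncake-num-rounds 2 \
        --mooncake-slowdown-factor 1.0 --num-prompts 200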