sglang 0.5.1.post2__py3-none-any.whl → 0.5.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +3 -0
- sglang/bench_one_batch_server.py +89 -54
- sglang/bench_serving.py +437 -40
- sglang/lang/interpreter.py +1 -1
- sglang/profiler.py +0 -1
- sglang/srt/configs/__init__.py +4 -0
- sglang/srt/configs/internvl.py +6 -0
- sglang/srt/configs/longcat_flash.py +104 -0
- sglang/srt/configs/model_config.py +37 -7
- sglang/srt/configs/qwen3_next.py +326 -0
- sglang/srt/connector/__init__.py +1 -1
- sglang/srt/connector/base_connector.py +1 -2
- sglang/srt/connector/redis.py +2 -2
- sglang/srt/connector/serde/__init__.py +1 -1
- sglang/srt/connector/serde/safe_serde.py +4 -3
- sglang/srt/custom_op.py +11 -1
- sglang/srt/debug_utils/dump_comparator.py +81 -44
- sglang/srt/debug_utils/dump_loader.py +97 -0
- sglang/srt/debug_utils/dumper.py +11 -3
- sglang/srt/debug_utils/text_comparator.py +73 -11
- sglang/srt/disaggregation/ascend/conn.py +75 -0
- sglang/srt/disaggregation/base/conn.py +1 -1
- sglang/srt/disaggregation/common/conn.py +15 -12
- sglang/srt/disaggregation/decode.py +6 -4
- sglang/srt/disaggregation/fake/conn.py +1 -1
- sglang/srt/disaggregation/mini_lb.py +6 -420
- sglang/srt/disaggregation/mooncake/conn.py +18 -10
- sglang/srt/disaggregation/nixl/conn.py +180 -16
- sglang/srt/disaggregation/prefill.py +6 -4
- sglang/srt/disaggregation/utils.py +5 -50
- sglang/srt/distributed/parallel_state.py +94 -58
- sglang/srt/entrypoints/engine.py +34 -14
- sglang/srt/entrypoints/http_server.py +172 -47
- sglang/srt/entrypoints/openai/protocol.py +90 -27
- sglang/srt/entrypoints/openai/serving_base.py +6 -2
- sglang/srt/entrypoints/openai/serving_chat.py +82 -26
- sglang/srt/entrypoints/openai/serving_completions.py +25 -4
- sglang/srt/entrypoints/openai/serving_embedding.py +8 -4
- sglang/srt/entrypoints/openai/serving_responses.py +7 -4
- sglang/srt/eplb/eplb_manager.py +28 -4
- sglang/srt/eplb/expert_distribution.py +55 -15
- sglang/srt/eplb/expert_location.py +8 -3
- sglang/srt/eplb/expert_location_updater.py +1 -1
- sglang/srt/function_call/deepseekv31_detector.py +222 -0
- sglang/srt/function_call/ebnf_composer.py +11 -9
- sglang/srt/function_call/function_call_parser.py +2 -0
- sglang/srt/function_call/glm4_moe_detector.py +1 -1
- sglang/srt/function_call/gpt_oss_detector.py +144 -256
- sglang/srt/function_call/qwen3_coder_detector.py +1 -1
- sglang/srt/hf_transformers_utils.py +28 -7
- sglang/srt/layers/activation.py +44 -9
- sglang/srt/layers/attention/aiter_backend.py +93 -68
- sglang/srt/layers/attention/ascend_backend.py +381 -136
- sglang/srt/layers/attention/fla/chunk.py +242 -0
- sglang/srt/layers/attention/fla/chunk_delta_h.py +314 -0
- sglang/srt/layers/attention/fla/chunk_o.py +178 -0
- sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +151 -0
- sglang/srt/layers/attention/fla/cumsum.py +300 -0
- sglang/srt/layers/attention/fla/fused_recurrent.py +640 -0
- sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +232 -0
- sglang/srt/layers/attention/fla/index.py +37 -0
- sglang/srt/layers/attention/fla/l2norm.py +150 -0
- sglang/srt/layers/attention/fla/layernorm_gated.py +326 -0
- sglang/srt/layers/attention/fla/op.py +66 -0
- sglang/srt/layers/attention/fla/solve_tril.py +465 -0
- sglang/srt/layers/attention/fla/utils.py +331 -0
- sglang/srt/layers/attention/fla/wy_fast.py +158 -0
- sglang/srt/layers/attention/flashattention_backend.py +241 -7
- sglang/srt/layers/attention/flashinfer_backend.py +11 -6
- sglang/srt/layers/attention/flashinfer_mla_backend.py +21 -14
- sglang/srt/layers/attention/hybrid_attn_backend.py +47 -8
- sglang/srt/layers/attention/hybrid_linear_attn_backend.py +584 -0
- sglang/srt/layers/attention/intel_amx_backend.py +3 -0
- sglang/srt/layers/attention/mamba/causal_conv1d.py +128 -0
- sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +1052 -0
- sglang/srt/layers/attention/mamba/mamba.py +64 -0
- sglang/srt/layers/attention/torch_native_backend.py +12 -6
- sglang/srt/layers/attention/trtllm_mla_backend.py +126 -36
- sglang/srt/layers/attention/wave_ops/decode_attention.py +2 -4
- sglang/srt/layers/attention/wave_ops/extend_attention.py +1 -3
- sglang/srt/layers/communicator.py +45 -8
- sglang/srt/layers/layernorm.py +54 -12
- sglang/srt/layers/logits_processor.py +10 -3
- sglang/srt/layers/moe/__init__.py +2 -1
- sglang/srt/layers/moe/cutlass_moe.py +0 -8
- sglang/srt/layers/moe/cutlass_w4a8_moe.py +4 -12
- sglang/srt/layers/moe/ep_moe/kernels.py +74 -0
- sglang/srt/layers/moe/ep_moe/layer.py +111 -56
- sglang/srt/layers/moe/fused_moe_native.py +5 -3
- sglang/srt/layers/moe/fused_moe_triton/__init__.py +5 -3
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=129,N=352,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/{E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json } +29 -29
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=257,N=64,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +9 -1049
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +212 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +799 -0
- sglang/srt/layers/moe/fused_moe_triton/layer.py +56 -45
- sglang/srt/layers/moe/fused_moe_triton/moe_align_block_size.py +87 -0
- sglang/srt/layers/moe/moe_runner/__init__.py +2 -1
- sglang/srt/layers/moe/moe_runner/base.py +274 -1
- sglang/srt/layers/moe/moe_runner/runner.py +80 -0
- sglang/srt/layers/moe/moe_runner/triton.py +448 -0
- sglang/srt/layers/moe/token_dispatcher/__init__.py +16 -4
- sglang/srt/layers/moe/token_dispatcher/{base_dispatcher.py → base.py} +67 -17
- sglang/srt/layers/moe/token_dispatcher/deepep.py +41 -38
- sglang/srt/layers/moe/token_dispatcher/standard.py +44 -2
- sglang/srt/layers/moe/topk.py +43 -12
- sglang/srt/layers/moe/utils.py +6 -5
- sglang/srt/layers/quantization/awq.py +19 -7
- sglang/srt/layers/quantization/base_config.py +11 -6
- sglang/srt/layers/quantization/blockwise_int8.py +38 -27
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +50 -30
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +13 -1
- sglang/srt/layers/quantization/deep_gemm_wrapper/compile_utils.py +141 -235
- sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +5 -10
- sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +31 -22
- sglang/srt/layers/quantization/fp8.py +78 -48
- sglang/srt/layers/quantization/fp8_kernel.py +2 -2
- sglang/srt/layers/quantization/fp8_utils.py +45 -31
- sglang/srt/layers/quantization/gptq.py +25 -17
- sglang/srt/layers/quantization/modelopt_quant.py +107 -40
- sglang/srt/layers/quantization/moe_wna16.py +21 -18
- sglang/srt/layers/quantization/mxfp4.py +93 -68
- sglang/srt/layers/quantization/mxfp4_tensor.py +3 -1
- sglang/srt/layers/quantization/quark/quark_moe.py +32 -27
- sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +49 -30
- sglang/srt/layers/quantization/quark/utils.py +97 -0
- sglang/srt/layers/quantization/rocm_mxfp4_utils.py +13 -0
- sglang/srt/layers/quantization/unquant.py +135 -47
- sglang/srt/layers/quantization/utils.py +13 -0
- sglang/srt/layers/quantization/w4afp8.py +60 -42
- sglang/srt/layers/quantization/w8a8_fp8.py +35 -20
- sglang/srt/layers/quantization/w8a8_int8.py +83 -41
- sglang/srt/layers/rocm_linear_utils.py +44 -0
- sglang/srt/layers/rotary_embedding.py +28 -19
- sglang/srt/layers/sampler.py +29 -5
- sglang/srt/layers/utils.py +0 -14
- sglang/srt/lora/backend/base_backend.py +50 -8
- sglang/srt/lora/backend/triton_backend.py +90 -2
- sglang/srt/lora/layers.py +32 -0
- sglang/srt/lora/lora.py +4 -1
- sglang/srt/lora/lora_manager.py +35 -112
- sglang/srt/lora/mem_pool.py +24 -10
- sglang/srt/lora/utils.py +18 -9
- sglang/srt/managers/cache_controller.py +396 -365
- sglang/srt/managers/data_parallel_controller.py +30 -15
- sglang/srt/managers/detokenizer_manager.py +18 -2
- sglang/srt/managers/disagg_service.py +46 -0
- sglang/srt/managers/io_struct.py +190 -11
- sglang/srt/managers/mm_utils.py +6 -1
- sglang/srt/managers/multi_tokenizer_mixin.py +579 -0
- sglang/srt/managers/schedule_batch.py +27 -44
- sglang/srt/managers/schedule_policy.py +4 -3
- sglang/srt/managers/scheduler.py +148 -122
- sglang/srt/managers/scheduler_metrics_mixin.py +114 -8
- sglang/srt/managers/scheduler_output_processor_mixin.py +29 -19
- sglang/srt/managers/scheduler_profiler_mixin.py +1 -1
- sglang/srt/managers/scheduler_update_weights_mixin.py +8 -1
- sglang/srt/managers/template_manager.py +3 -3
- sglang/srt/managers/tokenizer_communicator_mixin.py +491 -0
- sglang/srt/managers/tokenizer_manager.py +77 -480
- sglang/srt/managers/tp_worker.py +16 -4
- sglang/srt/managers/tp_worker_overlap_thread.py +8 -10
- sglang/srt/mem_cache/allocator.py +1 -1
- sglang/srt/mem_cache/chunk_cache.py +1 -1
- sglang/srt/mem_cache/hicache_storage.py +53 -40
- sglang/srt/mem_cache/hiradix_cache.py +196 -104
- sglang/srt/mem_cache/lora_radix_cache.py +1 -1
- sglang/srt/mem_cache/memory_pool.py +395 -53
- sglang/srt/mem_cache/memory_pool_host.py +27 -19
- sglang/srt/mem_cache/radix_cache.py +6 -6
- sglang/srt/mem_cache/radix_cache_cpp.py +1 -1
- sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +164 -0
- sglang/srt/mem_cache/storage/hf3fs/{client_hf3fs.py → hf3fs_usrbio_client.py} +5 -1
- sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +61 -34
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +152 -23
- sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +280 -0
- sglang/srt/mem_cache/storage/lmcache/unit_test.py +121 -0
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +154 -95
- sglang/srt/mem_cache/storage/mooncake_store/test_mooncake_store.py +161 -0
- sglang/srt/mem_cache/swa_radix_cache.py +1 -3
- sglang/srt/metrics/collector.py +484 -63
- sglang/srt/metrics/startup_func_log_and_timer.py +150 -0
- sglang/srt/metrics/utils.py +48 -0
- sglang/srt/model_executor/cpu_graph_runner.py +640 -0
- sglang/srt/model_executor/cuda_graph_runner.py +13 -5
- sglang/srt/model_executor/forward_batch_info.py +72 -18
- sglang/srt/model_executor/model_runner.py +190 -32
- sglang/srt/model_loader/__init__.py +9 -3
- sglang/srt/model_loader/loader.py +33 -28
- sglang/srt/model_loader/utils.py +12 -0
- sglang/srt/model_loader/weight_utils.py +2 -1
- sglang/srt/models/deepseek_v2.py +323 -53
- sglang/srt/models/gemma3n_mm.py +1 -1
- sglang/srt/models/glm4_moe.py +10 -1
- sglang/srt/models/glm4v.py +4 -2
- sglang/srt/models/gpt_oss.py +7 -19
- sglang/srt/models/internvl.py +28 -0
- sglang/srt/models/llama4.py +9 -0
- sglang/srt/models/llama_eagle3.py +17 -0
- sglang/srt/models/longcat_flash.py +1026 -0
- sglang/srt/models/longcat_flash_nextn.py +699 -0
- sglang/srt/models/minicpmv.py +165 -3
- sglang/srt/models/mllama4.py +25 -0
- sglang/srt/models/opt.py +637 -0
- sglang/srt/models/qwen2.py +33 -3
- sglang/srt/models/qwen2_5_vl.py +91 -42
- sglang/srt/models/qwen2_moe.py +79 -14
- sglang/srt/models/qwen3.py +8 -2
- sglang/srt/models/qwen3_moe.py +39 -8
- sglang/srt/models/qwen3_next.py +1039 -0
- sglang/srt/models/qwen3_next_mtp.py +109 -0
- sglang/srt/models/torch_native_llama.py +1 -1
- sglang/srt/models/transformers.py +1 -1
- sglang/srt/multimodal/processors/base_processor.py +4 -2
- sglang/srt/multimodal/processors/glm4v.py +9 -9
- sglang/srt/multimodal/processors/internvl.py +141 -129
- sglang/srt/{conversation.py → parser/conversation.py} +38 -5
- sglang/srt/parser/harmony_parser.py +588 -0
- sglang/srt/parser/reasoning_parser.py +309 -0
- sglang/srt/sampling/penaltylib/orchestrator.py +14 -2
- sglang/srt/sampling/sampling_batch_info.py +18 -15
- sglang/srt/server_args.py +307 -80
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +5 -0
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +10 -1
- sglang/srt/speculative/eagle_worker.py +216 -120
- sglang/srt/speculative/spec_info.py +5 -0
- sglang/srt/speculative/standalone_worker.py +109 -0
- sglang/srt/tokenizer/tiktoken_tokenizer.py +6 -1
- sglang/srt/utils.py +96 -7
- sglang/srt/weight_sync/utils.py +1 -1
- sglang/test/attention/test_trtllm_mla_backend.py +181 -8
- sglang/test/few_shot_gsm8k.py +1 -0
- sglang/test/runners.py +4 -0
- sglang/test/test_cutlass_moe.py +24 -6
- sglang/test/test_cutlass_w4a8_moe.py +24 -9
- sglang/test/test_disaggregation_utils.py +66 -0
- sglang/test/test_utils.py +25 -1
- sglang/utils.py +5 -0
- sglang/version.py +1 -1
- {sglang-0.5.1.post2.dist-info → sglang-0.5.2.dist-info}/METADATA +13 -10
- {sglang-0.5.1.post2.dist-info → sglang-0.5.2.dist-info}/RECORD +253 -201
- sglang/srt/disaggregation/launch_lb.py +0 -131
- sglang/srt/mem_cache/storage/mooncake_store/unit_test.py +0 -40
- sglang/srt/reasoning_parser.py +0 -553
- /sglang/srt/{model_parallel.py → layers/model_parallel.py} +0 -0
- /sglang/srt/{code_completion_parser.py → parser/code_completion_parser.py} +0 -0
- /sglang/srt/{jinja_template_utils.py → parser/jinja_template_utils.py} +0 -0
- {sglang-0.5.1.post2.dist-info → sglang-0.5.2.dist-info}/WHEEL +0 -0
- {sglang-0.5.1.post2.dist-info → sglang-0.5.2.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.1.post2.dist-info → sglang-0.5.2.dist-info}/top_level.txt +0 -0
sglang/bench_serving.py
CHANGED
@@ -12,6 +12,8 @@ python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-pro
 
 import argparse
 import asyncio
+import base64
+import io
 import json
 import os
 import pickle
@@ -71,8 +73,9 @@ class RequestFuncInput:
     output_len: int
     model: str
     lora_name: str
-    image_data: str
+    image_data: Optional[List[str]]
     extra_request_body: Dict[str, Any]
+    timestamp: Optional[float] = None
 
 
 @dataclass
@@ -289,16 +292,19 @@ async def async_request_openai_chat_completions(
     ), "OpenAI Chat Completions API URL must end with 'chat/completions'."
 
     if request_func_input.image_data:
+        # Build multi-image content: a list of image_url entries followed by the text
+        content_items = [
+            {
+                "type": "image_url",
+                "image_url": {"url": img_url},
+            }
+            for img_url in request_func_input.image_data
+        ]
+        content_items.append({"type": "text", "text": request_func_input.prompt})
         messages = [
             {
                 "role": "user",
-                "content": [
-                    {
-                        "type": "image_url",
-                        "image_url": {"url": request_func_input.image_data},
-                    },
-                    {"type": "text", "text": request_func_input.prompt},
-                ],
+                "content": content_items,
             },
         ]
     else:
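For reference, the request body assembled by the new multi-image branch above looks like the following. This is a hand-written illustration, not output captured from the benchmark; the data URIs and prompt text are placeholders.

# Illustrative only: shape of the OpenAI-style "messages" payload built by the
# multi-image branch, assuming two images per request. Values are made up.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image_url", "image_url": {"url": "data:image/jpeg;base64,<img-1>"}},
            {"type": "image_url", "image_url": {"url": "data:image/jpeg;base64,<img-2>"}},
            {"type": "text", "text": "Describe both images."},
        ],
    }
]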
@@ -497,7 +503,7 @@ async def async_request_sglang_generate(
         **request_func_input.extra_request_body,
     }
 
-    # Add image data if available
+    # Add image data if available (list of image urls/base64)
     if request_func_input.image_data:
         payload["image_data"] = request_func_input.image_data
 
@@ -648,7 +654,7 @@ def get_dataset(args, tokenizer):
             prompt_suffix=args.prompt_suffix,
             apply_chat_template=args.apply_chat_template,
         )
-    elif args.dataset_name.startswith("random"):
+    elif args.dataset_name.startswith("random") and args.dataset_name != "random-image":
         input_requests = sample_random_requests(
             input_len=args.random_input_len,
             output_len=args.random_output_len,
@@ -659,6 +665,18 @@ def get_dataset(args, tokenizer):
             random_sample=args.dataset_name == "random",
             return_text=not tokenize_prompt,
         )
+    elif args.dataset_name == "random-image":
+        assert not tokenize_prompt, "random-image does not support --tokenize-prompt"
+        input_requests = sample_random_image_requests(
+            num_requests=args.num_prompts,
+            num_images=args.random_image_num_images,
+            input_len=args.random_input_len,
+            output_len=args.random_output_len,
+            range_ratio=args.random_range_ratio,
+            tokenizer=tokenizer,
+            apply_chat_template=args.apply_chat_template,
+            image_resolution=args.random_image_resolution,
+        )
     elif args.dataset_name == "generated-shared-prefix":
         assert not tokenize_prompt
         input_requests = sample_generated_shared_prefix_requests(
@@ -679,6 +697,24 @@ def get_dataset(args, tokenizer):
             apply_chat_template=args.apply_chat_template,
             random_sample=True,
         )
+    elif args.dataset_name == "mooncake":
+        # For mooncake, we don't generate the prompts here.
+        # We just load the raw trace data. The async generator will handle the rest.
+        if not args.dataset_path:
+            local_path = os.path.join("/tmp", args.mooncake_workload + "_trace.jsonl")
+        else:
+            local_path = args.dataset_path
+
+        if not os.path.exists(local_path):
+            download_and_cache_file(
+                MOONCAKE_DATASET_URL[args.mooncake_workload], local_path
+            )
+
+        with open(local_path, "r") as f:
+            all_requests_data = [json.loads(line) for line in f if line.strip()]
+
+        # Limit the number of requests based on --num-prompts
+        input_requests = all_requests_data[: args.num_prompts]
     else:
         raise ValueError(f"Unknown dataset: {args.dataset_name}")
     return input_requests
@@ -733,6 +769,12 @@
 
 
 SHAREGPT_URL = "https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json"
+MOONCAKE_DATASET_URL = {
+    "mooncake": "https://raw.githubusercontent.com/kvcache-ai/Mooncake/main/FAST25-release/arxiv-trace/mooncake_trace.jsonl",
+    "conversation": "https://raw.githubusercontent.com/kvcache-ai/Mooncake/main/FAST25-release/traces/conversation_trace.jsonl",
+    "synthetic": "https://raw.githubusercontent.com/kvcache-ai/Mooncake/main/FAST25-release/traces/synthetic_trace.jsonl",
+    "toolagent": "https://raw.githubusercontent.com/kvcache-ai/Mooncake/main/FAST25-release/traces/toolagent_trace.jsonl",
+}
 
 
 def download_and_cache_file(url: str, filename: Optional[str] = None):
@@ -790,7 +832,81 @@ class DatasetRow:
     prompt: str
     prompt_len: int
     output_len: int
-    image_data: Optional[str] = None
+    image_data: Optional[List[str]] = None
+    timestamp: Optional[float] = None
+
+
+async def get_mooncake_request_over_time(
+    input_requests: List[Dict],
+    tokenizer: PreTrainedTokenizerBase,
+    slowdown_factor: float,
+    num_rounds: int,
+) -> AsyncGenerator[DatasetRow, None]:
+    """
+    An async generator that yields requests based on the timestamps in the Mooncake trace file,
+    with support for multi-round sessions.
+    """
+    if not input_requests:
+        return
+
+    input_requests.sort(key=lambda r: r["timestamp"])
+
+    start_time = time.perf_counter()
+    trace_start_time_ms = input_requests[0]["timestamp"]
+
+    for record in input_requests:
+        # Calculate when this entire session should start
+        relative_arrival_time_s = (record["timestamp"] - trace_start_time_ms) / 1000.0
+        target_arrival_time_s = relative_arrival_time_s * slowdown_factor
+
+        current_elapsed_time_s = time.perf_counter() - start_time
+        sleep_duration_s = target_arrival_time_s - current_elapsed_time_s
+        if sleep_duration_s > 0:
+            await asyncio.sleep(sleep_duration_s)
+
+        # Once the session starts, generate all rounds for it as a burst
+        # This simulates a user engaging in a multi-turn conversation
+
+        # Base user query constructed from hash_ids
+        user_query_base = ""
+        hash_ids = record.get("hash_ids", [])
+        for hash_id in hash_ids:
+            user_query_base += f"{hash_id}" + " ".join(
+                ["hi"] * 128
+            )  # Shorter for multi-round
+        user_query_base += "Tell me a story based on this context."
+
+        output_len_per_round = record.get("output_length", 256)
+        chat_history = []
+
+        for i in range(num_rounds):
+            # Add user query for the current round
+            chat_history.append(
+                {"role": "user", "content": f"Round {i+1}: {user_query_base}"}
+            )
+
+            # Form the full prompt from history
+            try:
+                full_prompt_text = tokenizer.apply_chat_template(
+                    chat_history, tokenize=False, add_generation_prompt=True
+                )
+            except Exception:
+                full_prompt_text = "\n".join(
+                    [f"{msg['role']}: {msg['content']}" for msg in chat_history]
+                )
+
+            prompt_len = len(tokenizer.encode(full_prompt_text))
+
+            yield DatasetRow(
+                prompt=full_prompt_text,
+                prompt_len=prompt_len,
+                output_len=output_len_per_round,
+            )
+
+            # Add a placeholder assistant response for the next round's context
+            # We use a placeholder because we don't know the real response
+            placeholder_response = " ".join(["story"] * output_len_per_round)
+            chat_history.append({"role": "assistant", "content": placeholder_response})
 
 
 def sample_mmmu_requests(
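The generator above relies on only three fields of each trace record. Below is a minimal sketch of one line of such a trace, inferred from the record.get(...) calls in the diff; the concrete values are invented for illustration and are not taken from the real Mooncake traces.

import json

# Hypothetical single line of a Mooncake trace file; only the fields read by
# get_mooncake_request_over_time() are shown, with made-up values.
sample_line = '{"timestamp": 1736500000000, "hash_ids": [3, 17, 42], "output_length": 256}'
record = json.loads(sample_line)

assert isinstance(record["timestamp"], int)      # arrival time in milliseconds
assert isinstance(record["hash_ids"], list)      # ids used to synthesize the prompt prefix
assert record.get("output_length", 256) == 256   # tokens generated per round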
@@ -879,17 +995,25 @@ def sample_mmmu_requests(
                 prompt = f"Question: {question}\n\nAnswer: "
                 if apply_chat_template:
                     try:
+                        is_phi4_multimodal = (
+                            "phi-4-multimodal" in tokenizer.name_or_path.lower()
+                        )
+                        if is_phi4_multimodal:
+                            # <|endoftext10|> is the image token used in the phi-4-multimodal model.
+                            content = prompt.replace("image 1", "<|endoftext10|>")
+                        else:
+                            content = [
+                                {
+                                    "type": "image_url",
+                                    "image_url": {"url": image_data},
+                                },
+                                {"type": "text", "text": prompt},
+                            ]
                         prompt = tokenizer.apply_chat_template(
                             [
                                 {
                                     "role": "user",
-                                    "content": [
-                                        {
-                                            "type": "image_url",
-                                            "image_url": {"url": image_data},
-                                        },
-                                        {"type": "text", "text": prompt},
-                                    ],
+                                    "content": content,
                                 }
                             ],
                             add_generation_prompt=True,
@@ -913,7 +1037,7 @@
                         prompt=prompt,
                         prompt_len=prompt_len,
                         output_len=output_len,
-                        image_data=image_data,
+                        image_data=[image_data],
                     )
                 )
 
@@ -1113,6 +1237,132 @@ def sample_random_requests(
     return input_requests
 
 
+def parse_random_image_resolution(image_resolution: str) -> Tuple[int, int]:
+    """Parse image resolution into (width, height).
+
+    Supports presets '1080p', '720p', '360p' and custom 'heightxwidth' format
+    (e.g., '1080x1920' means height=1080, width=1920).
+    """
+    resolution_to_size = {
+        "4k": (3840, 2160),
+        "1080p": (1920, 1080),
+        "720p": (1280, 720),
+        "360p": (640, 360),
+    }
+    if image_resolution in resolution_to_size:
+        return resolution_to_size[image_resolution]
+
+    res = image_resolution.strip().lower()
+    if "x" in res:
+        parts = res.split("x")
+        if len(parts) == 2 and parts[0].isdigit() and parts[1].isdigit():
+            height = int(parts[0])
+            width = int(parts[1])
+            if height > 0 and width > 0:
+                return (width, height)
+
+    raise ValueError(
+        f"Unsupported random-image resolution: {image_resolution}. "
+        "Choose from 4k, 1080p, 720p, 360p, or provide custom 'heightxwidth' (e.g., 1080x1920)."
+    )
+
+
+def sample_random_image_requests(
+    num_requests: int,
+    num_images: int,
+    input_len: int,
+    output_len: int,
+    range_ratio: float,
+    tokenizer: PreTrainedTokenizerBase,
+    apply_chat_template: bool = True,
+    image_resolution: str = "1080p",
+) -> List[DatasetRow]:
+    """Generate requests with random images.
+
+    - Each request includes ``num_images`` random images.
+    - Supported resolutions: 4k (3840x2160), 1080p (1920x1080), 720p (1280x720), 360p (640x360),
+      or custom 'heightxwidth' (e.g., 1080x1920).
+    - Text lengths follow the 'random' dataset sampling rule. ``prompt_len``
+      only counts text tokens and excludes image data.
+    """
+    try:
+        import pybase64
+        from PIL import Image
+    except ImportError as e:
+        raise ImportError(
+            "Please install Pillow to generate random images: pip install pillow"
+        ) from e
+
+    # Parse resolution (supports presets and 'heightxwidth')
+    width, height = parse_random_image_resolution(image_resolution)
+
+    # Check for potentially problematic combinations and warn user
+    if width * height >= 1920 * 1080 and num_images * num_requests >= 100:
+        warnings.warn(
+            f"High resolution ({width}x{height}) with {num_images * num_requests} total images "
+            f"may take a long time. Consider reducing resolution or image count.",
+            UserWarning,
+            stacklevel=2,
+        )
+
+    # Sample text lengths
+    input_lens = np.random.randint(
+        max(int(input_len * range_ratio), 1), input_len + 1, size=num_requests
+    )
+    output_lens = np.random.randint(
+        int(output_len * range_ratio), output_len + 1, size=num_requests
+    )
+
+    def _gen_random_image_data_uri(width: int = width, height: int = height) -> str:
+        arr = (np.random.rand(height, width, 3) * 255).astype(np.uint8)
+        img = Image.fromarray(arr, mode="RGB")
+        buf = io.BytesIO()
+        img.save(buf, format="JPEG", quality=85)
+        encoded = pybase64.b64encode(buf.getvalue()).decode("utf-8")
+        return f"data:image/jpeg;base64,{encoded}"
+
+    dataset: List[DatasetRow] = []
+    for i in range(num_requests):
+        # Generate text prompt
+        text_prompt = gen_prompt(tokenizer, int(input_lens[i]))
+
+        # Generate image list
+        images = [_gen_random_image_data_uri() for _ in range(num_images)]
+
+        prompt_str = text_prompt
+        if apply_chat_template:
+            try:
+                content_items = [
+                    {"type": "image_url", "image_url": {"url": img_url}}
+                    for img_url in images
+                ]
+                content_items.append({"type": "text", "text": text_prompt})
+                prompt_str = tokenizer.apply_chat_template(
+                    [{"role": "user", "content": content_items}],
+                    add_generation_prompt=True,
+                    tokenize=False,
+                )
+            except Exception:
+                # Some tokenizers do not support list content; fall back to a placeholder in the text
+                prompt_str = f"<image>{text_prompt}"
+
+        prompt_token_ids = tokenizer.encode(prompt_str)
+        prompt_token_len = len(prompt_token_ids)
+
+        dataset.append(
+            DatasetRow(
+                prompt=prompt_str,
+                prompt_len=prompt_token_len,
+                output_len=int(output_lens[i]),
+                image_data=images,
+            )
+        )
+
+    print(f"#Input tokens: {np.sum([x.prompt_len for x in dataset])}")
+    print(f"#Output tokens: {np.sum([x.output_len for x in dataset])}")
+    return dataset
+
+
 def gen_prompt(tokenizer, token_num):
     """Generate a random prompt of specified token length using tokenizer vocabulary."""
     all_available_tokens = list(tokenizer.get_vocab().values())
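As a sanity check of the payload format produced by _gen_random_image_data_uri above, the sketch below builds one random JPEG data URI with the standard-library base64 module (the diff itself uses pybase64) and decodes it back into a PIL image. It assumes numpy and Pillow are installed and is not part of the package diff.

import base64
import io

import numpy as np
from PIL import Image

# Encode one random 640x360 frame as a JPEG data URI, mirroring the approach above.
arr = (np.random.rand(360, 640, 3) * 255).astype(np.uint8)
buf = io.BytesIO()
Image.fromarray(arr, mode="RGB").save(buf, format="JPEG", quality=85)
data_uri = "data:image/jpeg;base64," + base64.b64encode(buf.getvalue()).decode("utf-8")

# Round-trip: strip the header and decode the payload back into an image.
_, payload = data_uri.split(",", 1)
img = Image.open(io.BytesIO(base64.b64decode(payload)))
assert img.size == (640, 360)  # PIL reports (width, height)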
@@ -1216,19 +1466,41 @@ def sample_generated_shared_prefix_requests(
 async def get_request(
     input_requests: List[DatasetRow],
     request_rate: float,
+    use_trace_timestamps: bool = False,
+    slowdown_factor: float = 1.0,
 ) -> AsyncGenerator[DatasetRow, None]:
-    input_requests = iter(input_requests)
-    for request in input_requests:
-        yield request
+    if use_trace_timestamps:
+        print(
+            f"Using trace timestamps for request generation with slowdown factor {slowdown_factor}."
+        )
+        # Sort requests by timestamp for correct replay
+        input_requests.sort(key=lambda r: r.timestamp)
 
-        if request_rate == float("inf"):
-            # If the request rate is infinity, then we don't need to wait.
-            continue
+        start_time = time.perf_counter()
+        trace_start_time_ms = input_requests[0].timestamp if input_requests else 0
+
+        for request in input_requests:
+            trace_time_s = (request.timestamp - trace_start_time_ms) / 1000.0
+            target_arrival_time = start_time + (trace_time_s * slowdown_factor)
+
+            sleep_duration = target_arrival_time - time.perf_counter()
+            if sleep_duration > 0:
+                await asyncio.sleep(sleep_duration)
+
+            yield request
+    else:
+        input_requests_iter = iter(input_requests)
+        for request in input_requests_iter:
+            yield request
+
+            if request_rate == float("inf"):
+                # If the request rate is infinity, then we don't need to wait.
+                continue
 
-        # Sample the request interval from the exponential distribution.
-        interval = np.random.exponential(1.0 / request_rate)
-        # The next request will be sent after the interval.
-        await asyncio.sleep(interval)
+            # Sample the request interval from the exponential distribution.
+            interval = np.random.exponential(1.0 / request_rate)
+            # The next request will be sent after the interval.
+            await asyncio.sleep(interval)
 
 
 def calculate_metrics(
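A worked example of the replay pacing used above: with trace timestamps in milliseconds and a slowdown factor of 2.0, a record stamped 1.5 s after the first record is released 3.0 s after the benchmark starts. The numbers below are chosen purely for illustration.

# Same arithmetic as the trace-timestamp branch above, on made-up numbers.
trace_start_time_ms = 1_000.0
timestamp_ms = 2_500.0
slowdown_factor = 2.0

trace_time_s = (timestamp_ms - trace_start_time_ms) / 1000.0   # 1.5 s into the trace
target_offset_s = trace_time_s * slowdown_factor               # 3.0 s of wall-clock delay
assert target_offset_s == 3.0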
@@ -1254,7 +1526,7 @@ def calculate_metrics(
             tokenizer.encode(outputs[i].generated_text, add_special_tokens=False)
         )
         retokenized_output_lens.append(retokenized_output_len)
-        total_input += input_requests[i].prompt_len
+        total_input += outputs[i].prompt_len
         if output_len > 1:
             tpots.append((outputs[i].latency - outputs[i].ttft) / (output_len - 1))
         itls += outputs[i].itl
@@ -1326,6 +1598,9 @@ async def benchmark(
     pd_separated: bool = False,
     flush_cache: bool = False,
     warmup_requests: int = 1,
+    use_trace_timestamps: bool = False,
+    mooncake_slowdown_factor=1.0,
+    mooncake_num_rounds=1,
 ):
     if backend in ASYNC_REQUEST_FUNCS:
         request_func = ASYNC_REQUEST_FUNCS[backend]
@@ -1345,8 +1620,32 @@ async def benchmark(
     # Warmup
     print(f"Starting warmup with {warmup_requests} sequences...")
 
-    # Use the first request for all warmup iterations
-    test_request = input_requests[0]
+    # Handle the data structure difference for the warmup request
+    if args.dataset_name == "mooncake":
+        # For mooncake, input_requests is a list of dicts.
+        # We need to build a temporary DatasetRow for the warmup phase.
+        warmup_record = input_requests[0]
+
+        # Build prompt from hash_ids, just like in the async generator
+        hash_ids = warmup_record.get("hash_ids", [])
+        prompt_text = ""
+        for hash_id in hash_ids:
+            prompt_text += f"{hash_id}" + " ".join(["hi"] * 512)
+        prompt_text += "Can you tell me a detailed story in 1000 words?"
+
+        output_len = warmup_record.get("output_length", 32)
+        prompt_len = len(tokenizer.encode(prompt_text))
+
+        # Create a temporary DatasetRow object for warmup
+        test_request = DatasetRow(
+            prompt=prompt_text,
+            prompt_len=prompt_len,
+            output_len=output_len,
+            image_data=None,  # Mooncake doesn't have image data
+        )
+    else:
+        # For all other datasets, input_requests is a list of DatasetRow objects
+        test_request = input_requests[0]
 
     if lora_names is not None and len(lora_names) != 0:
         lora_name = lora_names[0]
@@ -1400,12 +1699,26 @@ async def benchmark(
         if profile_output.success:
             print("Profiler started")
 
-    pbar = None if disable_tqdm else tqdm(total=len(input_requests))
-
     # Run all requests
     benchmark_start_time = time.perf_counter()
     tasks: List[asyncio.Task] = []
-    async for request in get_request(input_requests, request_rate):
+    pbar_total = len(input_requests)
+    if (
+        backend == "sglang" and args.dataset_name == "mooncake"
+    ):  # Assuming mooncake is mainly for sglang or similar backends
+        print("Using time-based Mooncake request scheduler, ignoring --request-rate.")
+        request_generator = get_mooncake_request_over_time(
+            input_requests, tokenizer, mooncake_slowdown_factor, mooncake_num_rounds
+        )
+        print(
+            f"Starting Mooncake trace replay. Sessions: {len(input_requests)}, Rounds per session: {mooncake_num_rounds}. Slowdown factor: {mooncake_slowdown_factor}"
+        )
+        pbar_total *= args.mooncake_num_rounds
+    else:
+        request_generator = get_request(input_requests, request_rate)
+
+    pbar = None if disable_tqdm else tqdm(total=pbar_total)
+    async for request in request_generator:
         if lora_names is not None and len(lora_names) != 0:
             idx = random.randint(0, len(lora_names) - 1)
             lora_name = lora_names[idx]
@@ -1421,6 +1734,7 @@ async def benchmark(
            lora_name=lora_name,
            image_data=request.image_data,
            extra_request_body=extra_request_body,
+           timestamp=request.timestamp,
        )
 
        tasks.append(
@@ -1466,7 +1780,11 @@ async def benchmark(
 
    print("\n{s:{c}^{n}}".format(s=" Serving Benchmark Result ", n=50, c="="))
    print("{:<40} {:<10}".format("Backend:", backend))
-   print("{:<40} {:<10}".format("Traffic request rate:", request_rate))
+   print(
+       "{:<40} {:<10}".format(
+           "Traffic request rate:", "trace" if use_trace_timestamps else request_rate
+       )
+   )
    print(
        "{:<40} {:<10}".format(
            "Max request concurrency:",
@@ -1535,7 +1853,7 @@ async def benchmark(
        # Arguments
        "backend": args.backend,
        "dataset_name": args.dataset_name,
-       "request_rate": request_rate,
+       "request_rate": "trace" if use_trace_timestamps else request_rate,
        "max_concurrency": max_concurrency,
        "sharegpt_output_len": args.sharegpt_output_len,
        "random_input_len": args.random_input_len,
@@ -1579,10 +1897,18 @@ async def benchmark(
        output_file_name = args.output_file
    else:
        now = datetime.now().strftime("%m%d")
-       if args.dataset_name.startswith("random"):
+       if args.dataset_name == "random-image":
+           output_file_name = (
+               f"{args.backend}_{now}_{args.num_prompts}_{args.random_input_len}_"
+               f"{args.random_output_len}_{args.random_image_num_images}imgs_"
+               f"{args.random_image_resolution}.jsonl"
+           )
+       elif args.dataset_name.startswith("random"):
            output_file_name = f"{args.backend}_{now}_{args.num_prompts}_{args.random_input_len}_{args.random_output_len}.jsonl"
        else:
-           output_file_name = f"{args.backend}_{now}_{args.num_prompts}_{args.dataset_name}.jsonl"
+           output_file_name = (
+               f"{args.backend}_{now}_{args.num_prompts}_{args.dataset_name}.jsonl"
+           )
 
    result_details = {
        "input_lens": [output.prompt_len for output in outputs],
@@ -1637,6 +1963,17 @@ def run_benchmark(args_: argparse.Namespace):
    if not hasattr(args, "tokenize_prompt"):
        args.tokenize_prompt = False
 
+   if not hasattr(args, "use_trace_timestamps"):
+       args.use_trace_timestamps = False
+   if not hasattr(args, "mooncake_slowdown_factor"):
+       args.mooncake_slowdown_factor = 1.0
+
+   if not hasattr(args, "mooncake_slowdown_factor"):
+       args.mooncake_slowdown_factor = 1.0
+
+   if not hasattr(args, "mooncake_num_rounds"):
+       args.mooncake_num_rounds = 1
+
    print(f"benchmark_args={args}")
 
    # Set global environments
@@ -1770,6 +2107,9 @@ def run_benchmark(args_: argparse.Namespace):
            pd_separated=args.pd_separated,
            flush_cache=args.flush_cache,
            warmup_requests=args.warmup_requests,
+           use_trace_timestamps=args.use_trace_timestamps,
+           mooncake_slowdown_factor=args.mooncake_slowdown_factor,
+           mooncake_num_rounds=args.mooncake_num_rounds,
        )
    )
 
@@ -1819,7 +2159,15 @@ if __name__ == "__main__":
        "--dataset-name",
        type=str,
        default="sharegpt",
-       choices=["sharegpt", "random", "random-ids", "generated-shared-prefix", "mmmu"],
+       choices=[
+           "sharegpt",
+           "random",
+           "random-ids",
+           "generated-shared-prefix",
+           "mmmu",
+           "random-image",
+           "mooncake",
+       ],
        help="Name of the dataset to benchmark on.",
    )
    parser.add_argument(
@@ -1872,6 +2220,22 @@
        help="Range of sampled ratio of input/output length, "
        "used only for random dataset.",
    )
+   # random-image dataset args
+   parser.add_argument(
+       "--random-image-num-images",
+       type=int,
+       default=1,
+       help="Number of images per request (only available with the random-image dataset)",
+   )
+   parser.add_argument(
+       "--random-image-resolution",
+       type=str,
+       default="1080p",
+       help=(
+           "Resolution of random images for random-image dataset. "
+           "Supports presets 4k/1080p/720p/360p or custom 'heightxwidth' (e.g., 1080x1920)."
+       ),
+   )
    parser.add_argument(
        "--request-rate",
        type=float,
@@ -1879,6 +2243,11 @@
        help="Number of requests per second. If this is inf, then all the requests are sent at time 0. "
        "Otherwise, we use Poisson process to synthesize the request arrival times. Default is inf.",
    )
+   parser.add_argument(
+       "--use-trace-timestamps",
+       action="store_true",
+       help="Use timestamps from the trace file for request scheduling. Only valid for 'mooncake' dataset.",
+   )
    parser.add_argument(
        "--max-concurrency",
        type=int,
@@ -2002,5 +2371,33 @@
        default=256,
        help="Target length in tokens for outputs in generated-shared-prefix dataset",
    )
+   mooncake_group = parser.add_argument_group("mooncake dataset arguments")
+   mooncake_group.add_argument(
+       "--mooncake-slowdown-factor",
+       type=float,
+       default=1.0,
+       help="Slowdown factor for replaying the mooncake trace. "
+       "A value of 2.0 means the replay is twice as slow. "
+       "NOTE: --request-rate is IGNORED in mooncake mode.",
+   )
+   mooncake_group.add_argument(
+       "--mooncake-num-rounds",
+       type=int,
+       default=1,
+       help="Number of conversation rounds for each session in the mooncake dataset. "
+       "A value > 1 will enable true multi-turn session benchmarking.",
+   )
+   mooncake_group.add_argument(
+       "--mooncake-workload",
+       type=str,
+       default="conversation",
+       choices=[
+           "mooncake",
+           "conversation",
+           "synthetic",
+           "toolagent",
+       ],
+       help="Underlying workload for the mooncake dataset.",
+   )
    args = parser.parse_args()
    run_benchmark(args)
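Putting the new options together, here is a hypothetical invocation of the extended benchmark. Flag names are taken from the argparse changes above; the backend must already be serving a model, and the request counts, resolutions, and round counts are placeholder values, not recommendations.

import subprocess

# Random multi-image benchmark against a running sglang server (illustrative values).
subprocess.run(
    [
        "python3", "-m", "sglang.bench_serving",
        "--backend", "sglang",
        "--dataset-name", "random-image",
        "--random-image-num-images", "2",
        "--random-image-resolution", "720p",
        "--num-prompts", "64",
        "--random-input-len", "512",
        "--random-output-len", "128",
    ],
    check=True,
)

# Mooncake trace replay with the recorded timestamps (also illustrative).
subprocess.run(
    [
        "python3", "-m", "sglang.bench_serving",
        "--backend", "sglang",
        "--dataset-name", "mooncake",
        "--mooncake-workload", "conversation",
        "--mooncake-num-rounds", "2",
        "--use-trace-timestamps",
    ],
    check=True,
)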
sglang/lang/interpreter.py
CHANGED
@@ -740,7 +740,7 @@ class StreamExecutor:
        # Execute the stored lazy generation calls
        self.backend.role_end_generate(self)
 
-       from sglang.srt.reasoning_parser import ReasoningParser
+       from sglang.srt.parser.reasoning_parser import ReasoningParser
 
        reasoning_parser = ReasoningParser(expr.model_type)
        other = expr.expr