sglang 0.5.1.post2__py3-none-any.whl → 0.5.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (256)
  1. sglang/bench_one_batch.py +3 -0
  2. sglang/bench_one_batch_server.py +89 -54
  3. sglang/bench_serving.py +437 -40
  4. sglang/lang/interpreter.py +1 -1
  5. sglang/profiler.py +0 -1
  6. sglang/srt/configs/__init__.py +4 -0
  7. sglang/srt/configs/internvl.py +6 -0
  8. sglang/srt/configs/longcat_flash.py +104 -0
  9. sglang/srt/configs/model_config.py +37 -7
  10. sglang/srt/configs/qwen3_next.py +326 -0
  11. sglang/srt/connector/__init__.py +1 -1
  12. sglang/srt/connector/base_connector.py +1 -2
  13. sglang/srt/connector/redis.py +2 -2
  14. sglang/srt/connector/serde/__init__.py +1 -1
  15. sglang/srt/connector/serde/safe_serde.py +4 -3
  16. sglang/srt/custom_op.py +11 -1
  17. sglang/srt/debug_utils/dump_comparator.py +81 -44
  18. sglang/srt/debug_utils/dump_loader.py +97 -0
  19. sglang/srt/debug_utils/dumper.py +11 -3
  20. sglang/srt/debug_utils/text_comparator.py +73 -11
  21. sglang/srt/disaggregation/ascend/conn.py +75 -0
  22. sglang/srt/disaggregation/base/conn.py +1 -1
  23. sglang/srt/disaggregation/common/conn.py +15 -12
  24. sglang/srt/disaggregation/decode.py +6 -4
  25. sglang/srt/disaggregation/fake/conn.py +1 -1
  26. sglang/srt/disaggregation/mini_lb.py +6 -420
  27. sglang/srt/disaggregation/mooncake/conn.py +18 -10
  28. sglang/srt/disaggregation/nixl/conn.py +180 -16
  29. sglang/srt/disaggregation/prefill.py +6 -4
  30. sglang/srt/disaggregation/utils.py +5 -50
  31. sglang/srt/distributed/parallel_state.py +94 -58
  32. sglang/srt/entrypoints/engine.py +34 -14
  33. sglang/srt/entrypoints/http_server.py +172 -47
  34. sglang/srt/entrypoints/openai/protocol.py +90 -27
  35. sglang/srt/entrypoints/openai/serving_base.py +6 -2
  36. sglang/srt/entrypoints/openai/serving_chat.py +82 -26
  37. sglang/srt/entrypoints/openai/serving_completions.py +25 -4
  38. sglang/srt/entrypoints/openai/serving_embedding.py +8 -4
  39. sglang/srt/entrypoints/openai/serving_responses.py +7 -4
  40. sglang/srt/eplb/eplb_manager.py +28 -4
  41. sglang/srt/eplb/expert_distribution.py +55 -15
  42. sglang/srt/eplb/expert_location.py +8 -3
  43. sglang/srt/eplb/expert_location_updater.py +1 -1
  44. sglang/srt/function_call/deepseekv31_detector.py +222 -0
  45. sglang/srt/function_call/ebnf_composer.py +11 -9
  46. sglang/srt/function_call/function_call_parser.py +2 -0
  47. sglang/srt/function_call/glm4_moe_detector.py +1 -1
  48. sglang/srt/function_call/gpt_oss_detector.py +144 -256
  49. sglang/srt/function_call/qwen3_coder_detector.py +1 -1
  50. sglang/srt/hf_transformers_utils.py +28 -7
  51. sglang/srt/layers/activation.py +44 -9
  52. sglang/srt/layers/attention/aiter_backend.py +93 -68
  53. sglang/srt/layers/attention/ascend_backend.py +381 -136
  54. sglang/srt/layers/attention/fla/chunk.py +242 -0
  55. sglang/srt/layers/attention/fla/chunk_delta_h.py +314 -0
  56. sglang/srt/layers/attention/fla/chunk_o.py +178 -0
  57. sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +151 -0
  58. sglang/srt/layers/attention/fla/cumsum.py +300 -0
  59. sglang/srt/layers/attention/fla/fused_recurrent.py +640 -0
  60. sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +232 -0
  61. sglang/srt/layers/attention/fla/index.py +37 -0
  62. sglang/srt/layers/attention/fla/l2norm.py +150 -0
  63. sglang/srt/layers/attention/fla/layernorm_gated.py +326 -0
  64. sglang/srt/layers/attention/fla/op.py +66 -0
  65. sglang/srt/layers/attention/fla/solve_tril.py +465 -0
  66. sglang/srt/layers/attention/fla/utils.py +331 -0
  67. sglang/srt/layers/attention/fla/wy_fast.py +158 -0
  68. sglang/srt/layers/attention/flashattention_backend.py +241 -7
  69. sglang/srt/layers/attention/flashinfer_backend.py +11 -6
  70. sglang/srt/layers/attention/flashinfer_mla_backend.py +21 -14
  71. sglang/srt/layers/attention/hybrid_attn_backend.py +47 -8
  72. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +584 -0
  73. sglang/srt/layers/attention/intel_amx_backend.py +3 -0
  74. sglang/srt/layers/attention/mamba/causal_conv1d.py +128 -0
  75. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +1052 -0
  76. sglang/srt/layers/attention/mamba/mamba.py +64 -0
  77. sglang/srt/layers/attention/torch_native_backend.py +12 -6
  78. sglang/srt/layers/attention/trtllm_mla_backend.py +126 -36
  79. sglang/srt/layers/attention/wave_ops/decode_attention.py +2 -4
  80. sglang/srt/layers/attention/wave_ops/extend_attention.py +1 -3
  81. sglang/srt/layers/communicator.py +45 -8
  82. sglang/srt/layers/layernorm.py +54 -12
  83. sglang/srt/layers/logits_processor.py +10 -3
  84. sglang/srt/layers/moe/__init__.py +2 -1
  85. sglang/srt/layers/moe/cutlass_moe.py +0 -8
  86. sglang/srt/layers/moe/cutlass_w4a8_moe.py +4 -12
  87. sglang/srt/layers/moe/ep_moe/kernels.py +74 -0
  88. sglang/srt/layers/moe/ep_moe/layer.py +111 -56
  89. sglang/srt/layers/moe/fused_moe_native.py +5 -3
  90. sglang/srt/layers/moe/fused_moe_triton/__init__.py +5 -3
  91. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  92. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=129,N=352,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +146 -0
  93. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/{E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json } +29 -29
  94. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=257,N=64,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  95. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  96. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
  97. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
  98. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  99. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +9 -1049
  100. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +212 -0
  101. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +799 -0
  102. sglang/srt/layers/moe/fused_moe_triton/layer.py +56 -45
  103. sglang/srt/layers/moe/fused_moe_triton/moe_align_block_size.py +87 -0
  104. sglang/srt/layers/moe/moe_runner/__init__.py +2 -1
  105. sglang/srt/layers/moe/moe_runner/base.py +274 -1
  106. sglang/srt/layers/moe/moe_runner/runner.py +80 -0
  107. sglang/srt/layers/moe/moe_runner/triton.py +448 -0
  108. sglang/srt/layers/moe/token_dispatcher/__init__.py +16 -4
  109. sglang/srt/layers/moe/token_dispatcher/{base_dispatcher.py → base.py} +67 -17
  110. sglang/srt/layers/moe/token_dispatcher/deepep.py +41 -38
  111. sglang/srt/layers/moe/token_dispatcher/standard.py +44 -2
  112. sglang/srt/layers/moe/topk.py +43 -12
  113. sglang/srt/layers/moe/utils.py +6 -5
  114. sglang/srt/layers/quantization/awq.py +19 -7
  115. sglang/srt/layers/quantization/base_config.py +11 -6
  116. sglang/srt/layers/quantization/blockwise_int8.py +38 -27
  117. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +50 -30
  118. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +13 -1
  119. sglang/srt/layers/quantization/deep_gemm_wrapper/compile_utils.py +141 -235
  120. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +5 -10
  121. sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +31 -22
  122. sglang/srt/layers/quantization/fp8.py +78 -48
  123. sglang/srt/layers/quantization/fp8_kernel.py +2 -2
  124. sglang/srt/layers/quantization/fp8_utils.py +45 -31
  125. sglang/srt/layers/quantization/gptq.py +25 -17
  126. sglang/srt/layers/quantization/modelopt_quant.py +107 -40
  127. sglang/srt/layers/quantization/moe_wna16.py +21 -18
  128. sglang/srt/layers/quantization/mxfp4.py +93 -68
  129. sglang/srt/layers/quantization/mxfp4_tensor.py +3 -1
  130. sglang/srt/layers/quantization/quark/quark_moe.py +32 -27
  131. sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +49 -30
  132. sglang/srt/layers/quantization/quark/utils.py +97 -0
  133. sglang/srt/layers/quantization/rocm_mxfp4_utils.py +13 -0
  134. sglang/srt/layers/quantization/unquant.py +135 -47
  135. sglang/srt/layers/quantization/utils.py +13 -0
  136. sglang/srt/layers/quantization/w4afp8.py +60 -42
  137. sglang/srt/layers/quantization/w8a8_fp8.py +35 -20
  138. sglang/srt/layers/quantization/w8a8_int8.py +83 -41
  139. sglang/srt/layers/rocm_linear_utils.py +44 -0
  140. sglang/srt/layers/rotary_embedding.py +28 -19
  141. sglang/srt/layers/sampler.py +29 -5
  142. sglang/srt/layers/utils.py +0 -14
  143. sglang/srt/lora/backend/base_backend.py +50 -8
  144. sglang/srt/lora/backend/triton_backend.py +90 -2
  145. sglang/srt/lora/layers.py +32 -0
  146. sglang/srt/lora/lora.py +4 -1
  147. sglang/srt/lora/lora_manager.py +35 -112
  148. sglang/srt/lora/mem_pool.py +24 -10
  149. sglang/srt/lora/utils.py +18 -9
  150. sglang/srt/managers/cache_controller.py +396 -365
  151. sglang/srt/managers/data_parallel_controller.py +30 -15
  152. sglang/srt/managers/detokenizer_manager.py +18 -2
  153. sglang/srt/managers/disagg_service.py +46 -0
  154. sglang/srt/managers/io_struct.py +190 -11
  155. sglang/srt/managers/mm_utils.py +6 -1
  156. sglang/srt/managers/multi_tokenizer_mixin.py +579 -0
  157. sglang/srt/managers/schedule_batch.py +27 -44
  158. sglang/srt/managers/schedule_policy.py +4 -3
  159. sglang/srt/managers/scheduler.py +148 -122
  160. sglang/srt/managers/scheduler_metrics_mixin.py +114 -8
  161. sglang/srt/managers/scheduler_output_processor_mixin.py +29 -19
  162. sglang/srt/managers/scheduler_profiler_mixin.py +1 -1
  163. sglang/srt/managers/scheduler_update_weights_mixin.py +8 -1
  164. sglang/srt/managers/template_manager.py +3 -3
  165. sglang/srt/managers/tokenizer_communicator_mixin.py +491 -0
  166. sglang/srt/managers/tokenizer_manager.py +77 -480
  167. sglang/srt/managers/tp_worker.py +16 -4
  168. sglang/srt/managers/tp_worker_overlap_thread.py +8 -10
  169. sglang/srt/mem_cache/allocator.py +1 -1
  170. sglang/srt/mem_cache/chunk_cache.py +1 -1
  171. sglang/srt/mem_cache/hicache_storage.py +53 -40
  172. sglang/srt/mem_cache/hiradix_cache.py +196 -104
  173. sglang/srt/mem_cache/lora_radix_cache.py +1 -1
  174. sglang/srt/mem_cache/memory_pool.py +395 -53
  175. sglang/srt/mem_cache/memory_pool_host.py +27 -19
  176. sglang/srt/mem_cache/radix_cache.py +6 -6
  177. sglang/srt/mem_cache/radix_cache_cpp.py +1 -1
  178. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +164 -0
  179. sglang/srt/mem_cache/storage/hf3fs/{client_hf3fs.py → hf3fs_usrbio_client.py} +5 -1
  180. sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +61 -34
  181. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +152 -23
  182. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +280 -0
  183. sglang/srt/mem_cache/storage/lmcache/unit_test.py +121 -0
  184. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +154 -95
  185. sglang/srt/mem_cache/storage/mooncake_store/test_mooncake_store.py +161 -0
  186. sglang/srt/mem_cache/swa_radix_cache.py +1 -3
  187. sglang/srt/metrics/collector.py +484 -63
  188. sglang/srt/metrics/startup_func_log_and_timer.py +150 -0
  189. sglang/srt/metrics/utils.py +48 -0
  190. sglang/srt/model_executor/cpu_graph_runner.py +640 -0
  191. sglang/srt/model_executor/cuda_graph_runner.py +13 -5
  192. sglang/srt/model_executor/forward_batch_info.py +72 -18
  193. sglang/srt/model_executor/model_runner.py +190 -32
  194. sglang/srt/model_loader/__init__.py +9 -3
  195. sglang/srt/model_loader/loader.py +33 -28
  196. sglang/srt/model_loader/utils.py +12 -0
  197. sglang/srt/model_loader/weight_utils.py +2 -1
  198. sglang/srt/models/deepseek_v2.py +323 -53
  199. sglang/srt/models/gemma3n_mm.py +1 -1
  200. sglang/srt/models/glm4_moe.py +10 -1
  201. sglang/srt/models/glm4v.py +4 -2
  202. sglang/srt/models/gpt_oss.py +7 -19
  203. sglang/srt/models/internvl.py +28 -0
  204. sglang/srt/models/llama4.py +9 -0
  205. sglang/srt/models/llama_eagle3.py +17 -0
  206. sglang/srt/models/longcat_flash.py +1026 -0
  207. sglang/srt/models/longcat_flash_nextn.py +699 -0
  208. sglang/srt/models/minicpmv.py +165 -3
  209. sglang/srt/models/mllama4.py +25 -0
  210. sglang/srt/models/opt.py +637 -0
  211. sglang/srt/models/qwen2.py +33 -3
  212. sglang/srt/models/qwen2_5_vl.py +91 -42
  213. sglang/srt/models/qwen2_moe.py +79 -14
  214. sglang/srt/models/qwen3.py +8 -2
  215. sglang/srt/models/qwen3_moe.py +39 -8
  216. sglang/srt/models/qwen3_next.py +1039 -0
  217. sglang/srt/models/qwen3_next_mtp.py +109 -0
  218. sglang/srt/models/torch_native_llama.py +1 -1
  219. sglang/srt/models/transformers.py +1 -1
  220. sglang/srt/multimodal/processors/base_processor.py +4 -2
  221. sglang/srt/multimodal/processors/glm4v.py +9 -9
  222. sglang/srt/multimodal/processors/internvl.py +141 -129
  223. sglang/srt/{conversation.py → parser/conversation.py} +38 -5
  224. sglang/srt/parser/harmony_parser.py +588 -0
  225. sglang/srt/parser/reasoning_parser.py +309 -0
  226. sglang/srt/sampling/penaltylib/orchestrator.py +14 -2
  227. sglang/srt/sampling/sampling_batch_info.py +18 -15
  228. sglang/srt/server_args.py +307 -80
  229. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +5 -0
  230. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +10 -1
  231. sglang/srt/speculative/eagle_worker.py +216 -120
  232. sglang/srt/speculative/spec_info.py +5 -0
  233. sglang/srt/speculative/standalone_worker.py +109 -0
  234. sglang/srt/tokenizer/tiktoken_tokenizer.py +6 -1
  235. sglang/srt/utils.py +96 -7
  236. sglang/srt/weight_sync/utils.py +1 -1
  237. sglang/test/attention/test_trtllm_mla_backend.py +181 -8
  238. sglang/test/few_shot_gsm8k.py +1 -0
  239. sglang/test/runners.py +4 -0
  240. sglang/test/test_cutlass_moe.py +24 -6
  241. sglang/test/test_cutlass_w4a8_moe.py +24 -9
  242. sglang/test/test_disaggregation_utils.py +66 -0
  243. sglang/test/test_utils.py +25 -1
  244. sglang/utils.py +5 -0
  245. sglang/version.py +1 -1
  246. {sglang-0.5.1.post2.dist-info → sglang-0.5.2.dist-info}/METADATA +13 -10
  247. {sglang-0.5.1.post2.dist-info → sglang-0.5.2.dist-info}/RECORD +253 -201
  248. sglang/srt/disaggregation/launch_lb.py +0 -131
  249. sglang/srt/mem_cache/storage/mooncake_store/unit_test.py +0 -40
  250. sglang/srt/reasoning_parser.py +0 -553
  251. /sglang/srt/{model_parallel.py → layers/model_parallel.py} +0 -0
  252. /sglang/srt/{code_completion_parser.py → parser/code_completion_parser.py} +0 -0
  253. /sglang/srt/{jinja_template_utils.py → parser/jinja_template_utils.py} +0 -0
  254. {sglang-0.5.1.post2.dist-info → sglang-0.5.2.dist-info}/WHEEL +0 -0
  255. {sglang-0.5.1.post2.dist-info → sglang-0.5.2.dist-info}/licenses/LICENSE +0 -0
  256. {sglang-0.5.1.post2.dist-info → sglang-0.5.2.dist-info}/top_level.txt +0 -0
sglang/bench_serving.py CHANGED
@@ -12,6 +12,8 @@ python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-pro
 
 import argparse
 import asyncio
+import base64
+import io
 import json
 import os
 import pickle
@@ -71,8 +73,9 @@ class RequestFuncInput:
     output_len: int
     model: str
     lora_name: str
-    image_data: str
+    image_data: Optional[List[str]]
     extra_request_body: Dict[str, Any]
+    timestamp: Optional[float] = None
 
 
 @dataclass
@@ -289,16 +292,19 @@ async def async_request_openai_chat_completions(
     ), "OpenAI Chat Completions API URL must end with 'chat/completions'."
 
     if request_func_input.image_data:
+        # Build multi-image content: a list of image_url entries followed by the text
+        content_items = [
+            {
+                "type": "image_url",
+                "image_url": {"url": img_url},
+            }
+            for img_url in request_func_input.image_data
+        ]
+        content_items.append({"type": "text", "text": request_func_input.prompt})
         messages = [
             {
                 "role": "user",
-                "content": [
-                    {
-                        "type": "image_url",
-                        "image_url": {"url": request_func_input.image_data},
-                    },
-                    {"type": "text", "text": request_func_input.prompt},
-                ],
+                "content": content_items,
             },
         ]
     else:
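
For illustration, not part of the package diff: with image_data now a list of URLs or data URIs, the chat request body built by the hunk above takes roughly the shape below. This is a sketch only; the data-URI strings and prompt are placeholders.

# Sketch of the multi-image chat payload built above; values are placeholders.
image_data = ["data:image/jpeg;base64,<img1>", "data:image/jpeg;base64,<img2>"]
prompt = "Describe both images."

content_items = [
    {"type": "image_url", "image_url": {"url": img_url}} for img_url in image_data
]
content_items.append({"type": "text", "text": prompt})
messages = [{"role": "user", "content": content_items}]
# "messages" is then sent as the chat/completions request body.
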
@@ -497,7 +503,7 @@ async def async_request_sglang_generate(
         **request_func_input.extra_request_body,
     }
 
-    # Add image data if available
+    # Add image data if available (list of image urls/base64)
    if request_func_input.image_data:
        payload["image_data"] = request_func_input.image_data
 
@@ -648,7 +654,7 @@ def get_dataset(args, tokenizer):
             prompt_suffix=args.prompt_suffix,
             apply_chat_template=args.apply_chat_template,
         )
-    elif args.dataset_name.startswith("random"):
+    elif args.dataset_name.startswith("random") and args.dataset_name != "random-image":
         input_requests = sample_random_requests(
             input_len=args.random_input_len,
             output_len=args.random_output_len,
@@ -659,6 +665,18 @@ def get_dataset(args, tokenizer):
             random_sample=args.dataset_name == "random",
             return_text=not tokenize_prompt,
         )
+    elif args.dataset_name == "random-image":
+        assert not tokenize_prompt, "random-image does not support --tokenize-prompt"
+        input_requests = sample_random_image_requests(
+            num_requests=args.num_prompts,
+            num_images=args.random_image_num_images,
+            input_len=args.random_input_len,
+            output_len=args.random_output_len,
+            range_ratio=args.random_range_ratio,
+            tokenizer=tokenizer,
+            apply_chat_template=args.apply_chat_template,
+            image_resolution=args.random_image_resolution,
+        )
     elif args.dataset_name == "generated-shared-prefix":
         assert not tokenize_prompt
         input_requests = sample_generated_shared_prefix_requests(
@@ -679,6 +697,24 @@ def get_dataset(args, tokenizer):
             apply_chat_template=args.apply_chat_template,
             random_sample=True,
         )
+    elif args.dataset_name == "mooncake":
+        # For mooncake, we don't generate the prompts here.
+        # We just load the raw trace data. The async generator will handle the rest.
+        if not args.dataset_path:
+            local_path = os.path.join("/tmp", args.mooncake_workload + "_trace.jsonl")
+        else:
+            local_path = args.dataset_path
+
+        if not os.path.exists(local_path):
+            download_and_cache_file(
+                MOONCAKE_DATASET_URL[args.mooncake_workload], local_path
+            )
+
+        with open(local_path, "r") as f:
+            all_requests_data = [json.loads(line) for line in f if line.strip()]
+
+        # Limit the number of requests based on --num-prompts
+        input_requests = all_requests_data[: args.num_prompts]
     else:
         raise ValueError(f"Unknown dataset: {args.dataset_name}")
     return input_requests
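
For illustration, not part of the package diff: the mooncake branch above expects a JSONL trace where each line is one JSON object, and the replay code reads the "timestamp" (milliseconds), "hash_ids", and "output_length" keys from each record. The values in the sketch below are invented.

# Hypothetical trace lines in the shape the loader above consumes; values are invented.
import json

raw_trace = """\
{"timestamp": 0, "hash_ids": [1, 2, 3], "output_length": 128}
{"timestamp": 250, "hash_ids": [1, 2, 4], "output_length": 64}
"""

all_requests_data = [json.loads(line) for line in raw_trace.splitlines() if line.strip()]
num_prompts = 1
input_requests = all_requests_data[:num_prompts]  # same truncation as --num-prompts
print(input_requests[0]["hash_ids"])  # [1, 2, 3]
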
@@ -733,6 +769,12 @@ class BenchmarkMetrics:
 
 
 SHAREGPT_URL = "https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json"
+MOONCAKE_DATASET_URL = {
+    "mooncake": "https://raw.githubusercontent.com/kvcache-ai/Mooncake/main/FAST25-release/arxiv-trace/mooncake_trace.jsonl",
+    "conversation": "https://raw.githubusercontent.com/kvcache-ai/Mooncake/main/FAST25-release/traces/conversation_trace.jsonl",
+    "synthetic": "https://raw.githubusercontent.com/kvcache-ai/Mooncake/main/FAST25-release/traces/synthetic_trace.jsonl",
+    "toolagent": "https://raw.githubusercontent.com/kvcache-ai/Mooncake/main/FAST25-release/traces/toolagent_trace.jsonl",
+}
 
 
 def download_and_cache_file(url: str, filename: Optional[str] = None):
@@ -790,7 +832,81 @@ class DatasetRow:
     prompt: str
     prompt_len: int
     output_len: int
-    image_data: Optional[str] = None
+    image_data: Optional[List[str]] = None
+    timestamp: Optional[float] = None
+
+
+async def get_mooncake_request_over_time(
+    input_requests: List[Dict],
+    tokenizer: PreTrainedTokenizerBase,
+    slowdown_factor: float,
+    num_rounds: int,
+) -> AsyncGenerator[DatasetRow, None]:
+    """
+    An async generator that yields requests based on the timestamps in the Mooncake trace file,
+    with support for multi-round sessions.
+    """
+    if not input_requests:
+        return
+
+    input_requests.sort(key=lambda r: r["timestamp"])
+
+    start_time = time.perf_counter()
+    trace_start_time_ms = input_requests[0]["timestamp"]
+
+    for record in input_requests:
+        # Calculate when this entire session should start
+        relative_arrival_time_s = (record["timestamp"] - trace_start_time_ms) / 1000.0
+        target_arrival_time_s = relative_arrival_time_s * slowdown_factor
+
+        current_elapsed_time_s = time.perf_counter() - start_time
+        sleep_duration_s = target_arrival_time_s - current_elapsed_time_s
+        if sleep_duration_s > 0:
+            await asyncio.sleep(sleep_duration_s)
+
+        # Once the session starts, generate all rounds for it as a burst
+        # This simulates a user engaging in a multi-turn conversation
+
+        # Base user query constructed from hash_ids
+        user_query_base = ""
+        hash_ids = record.get("hash_ids", [])
+        for hash_id in hash_ids:
+            user_query_base += f"{hash_id}" + " ".join(
+                ["hi"] * 128
+            )  # Shorter for multi-round
+        user_query_base += "Tell me a story based on this context."
+
+        output_len_per_round = record.get("output_length", 256)
+        chat_history = []
+
+        for i in range(num_rounds):
+            # Add user query for the current round
+            chat_history.append(
+                {"role": "user", "content": f"Round {i+1}: {user_query_base}"}
+            )
+
+            # Form the full prompt from history
+            try:
+                full_prompt_text = tokenizer.apply_chat_template(
+                    chat_history, tokenize=False, add_generation_prompt=True
+                )
+            except Exception:
+                full_prompt_text = "\n".join(
+                    [f"{msg['role']}: {msg['content']}" for msg in chat_history]
+                )
+
+            prompt_len = len(tokenizer.encode(full_prompt_text))
+
+            yield DatasetRow(
+                prompt=full_prompt_text,
+                prompt_len=prompt_len,
+                output_len=output_len_per_round,
+            )
+
+            # Add a placeholder assistant response for the next round's context
+            # We use a placeholder because we don't know the real response
+            placeholder_response = " ".join(["story"] * output_len_per_round)
+            chat_history.append({"role": "assistant", "content": placeholder_response})
 
 
 def sample_mmmu_requests(
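
For illustration, not part of the package diff: the pacing rule in get_mooncake_request_over_time turns trace timestamps (milliseconds) into send times relative to the start of the replay, stretched by the slowdown factor. A standalone sketch with invented timestamps:

# Invented timestamps (ms); a slowdown factor of 2.0 doubles every gap in the trace.
timestamps_ms = [0, 500, 1500]
slowdown_factor = 2.0
trace_start_time_ms = timestamps_ms[0]

for ts in timestamps_ms:
    relative_arrival_time_s = (ts - trace_start_time_ms) / 1000.0
    target_arrival_time_s = relative_arrival_time_s * slowdown_factor
    print(f"trace t={ts} ms -> dispatch at +{target_arrival_time_s:.1f} s")
# trace t=0 ms -> dispatch at +0.0 s
# trace t=500 ms -> dispatch at +1.0 s
# trace t=1500 ms -> dispatch at +3.0 s
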
@@ -879,17 +995,25 @@ def sample_mmmu_requests(
             prompt = f"Question: {question}\n\nAnswer: "
             if apply_chat_template:
                 try:
+                    is_phi4_multimodal = (
+                        "phi-4-multimodal" in tokenizer.name_or_path.lower()
+                    )
+                    if is_phi4_multimodal:
+                        # <|endoftext10|> is the image token used in the phi-4-multimodal model.
+                        content = prompt.replace("image 1", "<|endoftext10|>")
+                    else:
+                        content = [
+                            {
+                                "type": "image_url",
+                                "image_url": {"url": image_data},
+                            },
+                            {"type": "text", "text": prompt},
+                        ]
                     prompt = tokenizer.apply_chat_template(
                         [
                             {
                                 "role": "user",
-                                "content": [
-                                    {
-                                        "type": "image_url",
-                                        "image_url": {"url": image_data},
-                                    },
-                                    {"type": "text", "text": prompt},
-                                ],
+                                "content": content,
                             }
                         ],
                         add_generation_prompt=True,
@@ -913,7 +1037,7 @@ def sample_mmmu_requests(
                     prompt=prompt,
                     prompt_len=prompt_len,
                     output_len=output_len,
-                    image_data=image_data,
+                    image_data=[image_data],
                 )
             )
 
@@ -1113,6 +1237,132 @@ def sample_random_requests(
     return input_requests
 
 
+def parse_random_image_resolution(image_resolution: str) -> Tuple[int, int]:
+    """Parse image resolution into (width, height).
+
+    Supports presets '1080p', '720p', '360p' and custom 'heightxwidth' format
+    (e.g., '1080x1920' means height=1080, width=1920).
+    """
+    resolution_to_size = {
+        "4k": (3840, 2160),
+        "1080p": (1920, 1080),
+        "720p": (1280, 720),
+        "360p": (640, 360),
+    }
+    if image_resolution in resolution_to_size:
+        return resolution_to_size[image_resolution]
+
+    res = image_resolution.strip().lower()
+    if "x" in res:
+        parts = res.split("x")
+        if len(parts) == 2 and parts[0].isdigit() and parts[1].isdigit():
+            height = int(parts[0])
+            width = int(parts[1])
+            if height > 0 and width > 0:
+                return (width, height)
+
+    raise ValueError(
+        f"Unsupported random-image resolution: {image_resolution}. "
+        "Choose from 4k, 1080p, 720p, 360p, or provide custom 'heightxwidth' (e.g., 1080x1920)."
+    )
+
+
+def sample_random_image_requests(
+    num_requests: int,
+    num_images: int,
+    input_len: int,
+    output_len: int,
+    range_ratio: float,
+    tokenizer: PreTrainedTokenizerBase,
+    apply_chat_template: bool = True,
+    image_resolution: str = "1080p",
+) -> List[DatasetRow]:
+    """Generate requests with random images.
+
+    - Each request includes ``num_images`` random images.
+    - Supported resolutions: 4k (3840x2160), 1080p (1920x1080), 720p (1280x720), 360p (640x360),
+      or custom 'heightxwidth' (e.g., 1080x1920).
+    - Text lengths follow the 'random' dataset sampling rule. ``prompt_len``
+      only counts text tokens and excludes image data.
+    """
+    try:
+        import pybase64
+        from PIL import Image
+    except ImportError as e:
+        raise ImportError(
+            "Please install Pillow to generate random images: pip install pillow"
+        ) from e
+
+    # Parse resolution (supports presets and 'heightxwidth')
+    width, height = parse_random_image_resolution(image_resolution)
+
+    # Check for potentially problematic combinations and warn user
+    if width * height >= 1920 * 1080 and num_images * num_requests >= 100:
+        warnings.warn(
+            f"High resolution ({width}x{height}) with {num_images * num_requests} total images "
+            f"may take a long time. Consider reducing resolution or image count.",
+            UserWarning,
+            stacklevel=2,
+        )
+
+    # Sample text lengths
+    input_lens = np.random.randint(
+        max(int(input_len * range_ratio), 1), input_len + 1, size=num_requests
+    )
+    output_lens = np.random.randint(
+        int(output_len * range_ratio), output_len + 1, size=num_requests
+    )
+
+    def _gen_random_image_data_uri(width: int = width, height: int = height) -> str:
+        arr = (np.random.rand(height, width, 3) * 255).astype(np.uint8)
+        img = Image.fromarray(arr, mode="RGB")
+        buf = io.BytesIO()
+        img.save(buf, format="JPEG", quality=85)
+        encoded = pybase64.b64encode(buf.getvalue()).decode("utf-8")
+        return f"data:image/jpeg;base64,{encoded}"
+
+    dataset: List[DatasetRow] = []
+    for i in range(num_requests):
+        # Generate text prompt
+        text_prompt = gen_prompt(tokenizer, int(input_lens[i]))
+
+        # Generate image list
+        images = [_gen_random_image_data_uri() for _ in range(num_images)]
+
+        prompt_str = text_prompt
+        if apply_chat_template:
+            try:
+                content_items = [
+                    {"type": "image_url", "image_url": {"url": img_url}}
+                    for img_url in images
+                ]
+                content_items.append({"type": "text", "text": text_prompt})
+                prompt_str = tokenizer.apply_chat_template(
+                    [{"role": "user", "content": content_items}],
+                    add_generation_prompt=True,
+                    tokenize=False,
+                )
+            except Exception:
+                # Some tokenizers do not support list content; fall back to a placeholder in the text
+                prompt_str = f"<image>{text_prompt}"
+
+        prompt_token_ids = tokenizer.encode(prompt_str)
+        prompt_token_len = len(prompt_token_ids)
+
+        dataset.append(
+            DatasetRow(
+                prompt=prompt_str,
+                prompt_len=prompt_token_len,
+                output_len=int(output_lens[i]),
+                image_data=images,
+            )
+        )
+
+    print(f"#Input tokens: {np.sum([x.prompt_len for x in dataset])}")
+    print(f"#Output tokens: {np.sum([x.output_len for x in dataset])}")
+    return dataset
+
+
 def gen_prompt(tokenizer, token_num):
     """Generate a random prompt of specified token length using tokenizer vocabulary."""
     all_available_tokens = list(tokenizer.get_vocab().values())
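
For illustration, not part of the package diff: a rough usage sketch of the two helpers added above, assuming sglang, transformers, numpy, Pillow, and pybase64 are installed and that "gpt2" is an acceptable stand-in tokenizer (any Hugging Face tokenizer should work).

# Illustrative only; the tokenizer choice and argument values are assumptions.
from transformers import AutoTokenizer

from sglang.bench_serving import parse_random_image_resolution, sample_random_image_requests

print(parse_random_image_resolution("720p"))       # (1280, 720)
print(parse_random_image_resolution("1080x1920"))  # (1920, 1080), parsed as 'heightxwidth'

tokenizer = AutoTokenizer.from_pretrained("gpt2")
rows = sample_random_image_requests(
    num_requests=2,
    num_images=1,
    input_len=64,
    output_len=32,
    range_ratio=0.5,
    tokenizer=tokenizer,
    apply_chat_template=False,  # gpt2 has no chat template
    image_resolution="360p",
)
print(rows[0].prompt_len, len(rows[0].image_data))  # text-only token count, 1 image
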
@@ -1216,19 +1466,41 @@ def sample_generated_shared_prefix_requests(
 async def get_request(
     input_requests: List[DatasetRow],
     request_rate: float,
+    use_trace_timestamps: bool = False,
+    slowdown_factor: float = 1.0,
 ) -> AsyncGenerator[DatasetRow, None]:
-    input_requests = iter(input_requests)
-    for request in input_requests:
-        yield request
+    if use_trace_timestamps:
+        print(
+            f"Using trace timestamps for request generation with slowdown factor {slowdown_factor}."
+        )
+        # Sort requests by timestamp for correct replay
+        input_requests.sort(key=lambda r: r.timestamp)
 
-        if request_rate == float("inf"):
-            # If the request rate is infinity, then we don't need to wait.
-            continue
+        start_time = time.perf_counter()
+        trace_start_time_ms = input_requests[0].timestamp if input_requests else 0
+
+        for request in input_requests:
+            trace_time_s = (request.timestamp - trace_start_time_ms) / 1000.0
+            target_arrival_time = start_time + (trace_time_s * slowdown_factor)
+
+            sleep_duration = target_arrival_time - time.perf_counter()
+            if sleep_duration > 0:
+                await asyncio.sleep(sleep_duration)
+
+            yield request
+    else:
+        input_requests_iter = iter(input_requests)
+        for request in input_requests_iter:
+            yield request
+
+            if request_rate == float("inf"):
+                # If the request rate is infinity, then we don't need to wait.
+                continue
 
-        # Sample the request interval from the exponential distribution.
-        interval = np.random.exponential(1.0 / request_rate)
-        # The next request will be sent after the interval.
-        await asyncio.sleep(interval)
+            # Sample the request interval from the exponential distribution.
+            interval = np.random.exponential(1.0 / request_rate)
+            # The next request will be sent after the interval.
+            await asyncio.sleep(interval)
 
 
 def calculate_metrics(
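
For illustration, not part of the package diff: a minimal driver for the reworked get_request, assuming it and DatasetRow are importable from sglang.bench_serving. With use_trace_timestamps=True the rows are replayed according to their millisecond timestamps; otherwise the original Poisson / infinite-rate path is taken.

# Minimal illustrative driver; prompts and timestamps are made up.
import asyncio

from sglang.bench_serving import DatasetRow, get_request

rows = [
    DatasetRow(prompt="a", prompt_len=1, output_len=8, timestamp=0),
    DatasetRow(prompt="b", prompt_len=1, output_len=8, timestamp=400),
]

async def main():
    # The second row is yielded roughly 0.4 s after the first.
    async for row in get_request(rows, request_rate=float("inf"), use_trace_timestamps=True):
        print("dispatch", row.prompt)

asyncio.run(main())
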
@@ -1254,7 +1526,7 @@ def calculate_metrics(
             tokenizer.encode(outputs[i].generated_text, add_special_tokens=False)
         )
         retokenized_output_lens.append(retokenized_output_len)
-        total_input += input_requests[i].prompt_len
+        total_input += outputs[i].prompt_len
         if output_len > 1:
             tpots.append((outputs[i].latency - outputs[i].ttft) / (output_len - 1))
             itls += outputs[i].itl
@@ -1326,6 +1598,9 @@ async def benchmark(
     pd_separated: bool = False,
     flush_cache: bool = False,
     warmup_requests: int = 1,
+    use_trace_timestamps: bool = False,
+    mooncake_slowdown_factor=1.0,
+    mooncake_num_rounds=1,
 ):
     if backend in ASYNC_REQUEST_FUNCS:
         request_func = ASYNC_REQUEST_FUNCS[backend]
@@ -1345,8 +1620,32 @@ async def benchmark(
     # Warmup
     print(f"Starting warmup with {warmup_requests} sequences...")
 
-    # Use the first request for all warmup iterations
-    test_request = input_requests[0]
+    # Handle the data structure difference for the warmup request
+    if args.dataset_name == "mooncake":
+        # For mooncake, input_requests is a list of dicts.
+        # We need to build a temporary DatasetRow for the warmup phase.
+        warmup_record = input_requests[0]
+
+        # Build prompt from hash_ids, just like in the async generator
+        hash_ids = warmup_record.get("hash_ids", [])
+        prompt_text = ""
+        for hash_id in hash_ids:
+            prompt_text += f"{hash_id}" + " ".join(["hi"] * 512)
+        prompt_text += "Can you tell me a detailed story in 1000 words?"
+
+        output_len = warmup_record.get("output_length", 32)
+        prompt_len = len(tokenizer.encode(prompt_text))
+
+        # Create a temporary DatasetRow object for warmup
+        test_request = DatasetRow(
+            prompt=prompt_text,
+            prompt_len=prompt_len,
+            output_len=output_len,
+            image_data=None,  # Mooncake doesn't have image data
+        )
+    else:
+        # For all other datasets, input_requests is a list of DatasetRow objects
+        test_request = input_requests[0]
 
     if lora_names is not None and len(lora_names) != 0:
         lora_name = lora_names[0]
@@ -1400,12 +1699,26 @@ async def benchmark(
         if profile_output.success:
             print("Profiler started")
 
-    pbar = None if disable_tqdm else tqdm(total=len(input_requests))
-
     # Run all requests
     benchmark_start_time = time.perf_counter()
     tasks: List[asyncio.Task] = []
-    async for request in get_request(input_requests, request_rate):
+    pbar_total = len(input_requests)
+    if (
+        backend == "sglang" and args.dataset_name == "mooncake"
+    ):  # Assuming mooncake is mainly for sglang or similar backends
+        print("Using time-based Mooncake request scheduler, ignoring --request-rate.")
+        request_generator = get_mooncake_request_over_time(
+            input_requests, tokenizer, mooncake_slowdown_factor, mooncake_num_rounds
+        )
+        print(
+            f"Starting Mooncake trace replay. Sessions: {len(input_requests)}, Rounds per session: {mooncake_num_rounds}. Slowdown factor: {mooncake_slowdown_factor}"
+        )
+        pbar_total *= args.mooncake_num_rounds
+    else:
+        request_generator = get_request(input_requests, request_rate)
+
+    pbar = None if disable_tqdm else tqdm(total=pbar_total)
+    async for request in request_generator:
         if lora_names is not None and len(lora_names) != 0:
             idx = random.randint(0, len(lora_names) - 1)
             lora_name = lora_names[idx]
@@ -1421,6 +1734,7 @@ async def benchmark(
             lora_name=lora_name,
             image_data=request.image_data,
             extra_request_body=extra_request_body,
+            timestamp=request.timestamp,
         )
 
         tasks.append(
@@ -1466,7 +1780,11 @@ async def benchmark(
 
     print("\n{s:{c}^{n}}".format(s=" Serving Benchmark Result ", n=50, c="="))
     print("{:<40} {:<10}".format("Backend:", backend))
-    print("{:<40} {:<10}".format("Traffic request rate:", request_rate))
+    print(
+        "{:<40} {:<10}".format(
+            "Traffic request rate:", "trace" if use_trace_timestamps else request_rate
+        )
+    )
     print(
         "{:<40} {:<10}".format(
             "Max request concurrency:",
@@ -1535,7 +1853,7 @@ async def benchmark(
         # Arguments
         "backend": args.backend,
         "dataset_name": args.dataset_name,
-        "request_rate": request_rate,
+        "request_rate": "trace" if use_trace_timestamps else request_rate,
         "max_concurrency": max_concurrency,
         "sharegpt_output_len": args.sharegpt_output_len,
         "random_input_len": args.random_input_len,
@@ -1579,10 +1897,18 @@ async def benchmark(
         output_file_name = args.output_file
     else:
         now = datetime.now().strftime("%m%d")
-        if args.dataset_name.startswith("random"):
+        if args.dataset_name == "random-image":
+            output_file_name = (
+                f"{args.backend}_{now}_{args.num_prompts}_{args.random_input_len}_"
+                f"{args.random_output_len}_{args.random_image_num_images}imgs_"
+                f"{args.random_image_resolution}.jsonl"
+            )
+        elif args.dataset_name.startswith("random"):
             output_file_name = f"{args.backend}_{now}_{args.num_prompts}_{args.random_input_len}_{args.random_output_len}.jsonl"
         else:
-            output_file_name = f"{args.backend}_{now}_{args.num_prompts}_sharegpt.jsonl"
+            output_file_name = (
+                f"{args.backend}_{now}_{args.num_prompts}_{args.dataset_name}.jsonl"
+            )
 
     result_details = {
         "input_lens": [output.prompt_len for output in outputs],
@@ -1637,6 +1963,17 @@ def run_benchmark(args_: argparse.Namespace):
     if not hasattr(args, "tokenize_prompt"):
         args.tokenize_prompt = False
 
+    if not hasattr(args, "use_trace_timestamps"):
+        args.use_trace_timestamps = False
+    if not hasattr(args, "mooncake_slowdown_factor"):
+        args.mooncake_slowdown_factor = 1.0
+
+    if not hasattr(args, "mooncake_slowdown_factor"):
+        args.mooncake_slowdown_factor = 1.0
+
+    if not hasattr(args, "mooncake_num_rounds"):
+        args.mooncake_num_rounds = 1
+
     print(f"benchmark_args={args}")
 
     # Set global environments
@@ -1770,6 +2107,9 @@ def run_benchmark(args_: argparse.Namespace):
             pd_separated=args.pd_separated,
             flush_cache=args.flush_cache,
             warmup_requests=args.warmup_requests,
+            use_trace_timestamps=args.use_trace_timestamps,
+            mooncake_slowdown_factor=args.mooncake_slowdown_factor,
+            mooncake_num_rounds=args.mooncake_num_rounds,
         )
     )
 
@@ -1819,7 +2159,15 @@ if __name__ == "__main__":
         "--dataset-name",
         type=str,
         default="sharegpt",
-        choices=["sharegpt", "random", "random-ids", "generated-shared-prefix", "mmmu"],
+        choices=[
+            "sharegpt",
+            "random",
+            "random-ids",
+            "generated-shared-prefix",
+            "mmmu",
+            "random-image",
+            "mooncake",
+        ],
         help="Name of the dataset to benchmark on.",
     )
     parser.add_argument(
@@ -1872,6 +2220,22 @@ if __name__ == "__main__":
         help="Range of sampled ratio of input/output length, "
         "used only for random dataset.",
     )
+    # random-image dataset args
+    parser.add_argument(
+        "--random-image-num-images",
+        type=int,
+        default=1,
+        help="Number of images per request (only available with the random-image dataset)",
+    )
+    parser.add_argument(
+        "--random-image-resolution",
+        type=str,
+        default="1080p",
+        help=(
+            "Resolution of random images for random-image dataset. "
+            "Supports presets 4k/1080p/720p/360p or custom 'heightxwidth' (e.g., 1080x1920)."
+        ),
+    )
     parser.add_argument(
         "--request-rate",
         type=float,
@@ -1879,6 +2243,11 @@ if __name__ == "__main__":
         help="Number of requests per second. If this is inf, then all the requests are sent at time 0. "
         "Otherwise, we use Poisson process to synthesize the request arrival times. Default is inf.",
     )
+    parser.add_argument(
+        "--use-trace-timestamps",
+        action="store_true",
+        help="Use timestamps from the trace file for request scheduling. Only valid for 'mooncake' dataset.",
+    )
     parser.add_argument(
         "--max-concurrency",
         type=int,
@@ -2002,5 +2371,33 @@ if __name__ == "__main__":
         default=256,
         help="Target length in tokens for outputs in generated-shared-prefix dataset",
     )
+    mooncake_group = parser.add_argument_group("mooncake dataset arguments")
+    mooncake_group.add_argument(
+        "--mooncake-slowdown-factor",
+        type=float,
+        default=1.0,
+        help="Slowdown factor for replaying the mooncake trace. "
+        "A value of 2.0 means the replay is twice as slow. "
+        "NOTE: --request-rate is IGNORED in mooncake mode.",
+    )
+    mooncake_group.add_argument(
+        "--mooncake-num-rounds",
+        type=int,
+        default=1,
+        help="Number of conversation rounds for each session in the mooncake dataset. "
+        "A value > 1 will enable true multi-turn session benchmarking.",
+    )
+    mooncake_group.add_argument(
+        "--mooncake-workload",
+        type=str,
+        default="conversation",
+        choices=[
+            "mooncake",
+            "conversation",
+            "synthetic",
+            "toolagent",
+        ],
+        help="Underlying workload for the mooncake dataset.",
+    )
     args = parser.parse_args()
     run_benchmark(args)
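
For illustration, not part of the package diff: the new mooncake options can be exercised end to end through the CLI flags wired up above. The sketch below shells out with assumed values and presumes an SGLang server is already running at the default host and port.

# Illustrative invocation only; flag values are assumptions, and a server must be up.
import subprocess

subprocess.run(
    [
        "python3", "-m", "sglang.bench_serving",
        "--backend", "sglang",
        "--dataset-name", "mooncake",
        "--mooncake-workload", "conversation",
        "--mooncake-slowdown-factor", "2.0",
        "--mooncake-num-rounds", "2",
        "--num-prompts", "50",
    ],
    check=True,
)
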
sglang/lang/interpreter.py CHANGED
@@ -740,7 +740,7 @@ class StreamExecutor:
         # Execute the stored lazy generation calls
         self.backend.role_end_generate(self)
 
-        from sglang.srt.reasoning_parser import ReasoningParser
+        from sglang.srt.parser.reasoning_parser import ReasoningParser
 
         reasoning_parser = ReasoningParser(expr.model_type)
         other = expr.expr
sglang/profiler.py CHANGED
@@ -9,7 +9,6 @@ import argparse
 import json
 import os
 import time
-import urllib.parse
 from argparse import ArgumentParser
 from pathlib import Path
 from typing import List, Optional