sglang-0.5.2rc2-py3-none-any.whl → sglang-0.5.3rc2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (377)
  1. sglang/bench_one_batch.py +7 -9
  2. sglang/bench_one_batch_server.py +330 -31
  3. sglang/bench_serving.py +267 -32
  4. sglang/global_config.py +2 -2
  5. sglang/lang/backend/runtime_endpoint.py +1 -1
  6. sglang/launch_server.py +14 -0
  7. sglang/profiler.py +2 -2
  8. sglang/srt/batch_invariant_ops/__init__.py +27 -0
  9. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +549 -0
  10. sglang/srt/configs/__init__.py +8 -0
  11. sglang/srt/configs/device_config.py +3 -1
  12. sglang/srt/configs/dots_ocr.py +64 -0
  13. sglang/srt/configs/dots_vlm.py +139 -0
  14. sglang/srt/configs/falcon_h1.py +360 -0
  15. sglang/srt/configs/load_config.py +9 -0
  16. sglang/srt/configs/model_config.py +181 -82
  17. sglang/srt/configs/qwen3_next.py +326 -0
  18. sglang/srt/configs/qwen3_vl.py +586 -0
  19. sglang/srt/connector/__init__.py +8 -1
  20. sglang/srt/connector/remote_instance.py +82 -0
  21. sglang/srt/constrained/base_grammar_backend.py +49 -12
  22. sglang/srt/constrained/llguidance_backend.py +0 -1
  23. sglang/srt/constrained/outlines_backend.py +0 -1
  24. sglang/srt/constrained/outlines_jump_forward.py +1 -1
  25. sglang/srt/constrained/xgrammar_backend.py +30 -9
  26. sglang/srt/custom_op.py +11 -1
  27. sglang/srt/debug_utils/dump_comparator.py +81 -44
  28. sglang/srt/debug_utils/dump_loader.py +97 -0
  29. sglang/srt/debug_utils/dumper.py +21 -6
  30. sglang/srt/debug_utils/text_comparator.py +73 -11
  31. sglang/srt/disaggregation/ascend/conn.py +2 -2
  32. sglang/srt/disaggregation/ascend/transfer_engine.py +47 -9
  33. sglang/srt/disaggregation/base/conn.py +1 -1
  34. sglang/srt/disaggregation/common/conn.py +279 -108
  35. sglang/srt/disaggregation/decode.py +71 -19
  36. sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
  37. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +29 -17
  38. sglang/srt/disaggregation/fake/conn.py +1 -1
  39. sglang/srt/disaggregation/mini_lb.py +6 -445
  40. sglang/srt/disaggregation/mooncake/conn.py +55 -537
  41. sglang/srt/disaggregation/nixl/conn.py +326 -53
  42. sglang/srt/disaggregation/prefill.py +36 -17
  43. sglang/srt/disaggregation/utils.py +40 -54
  44. sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
  45. sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
  46. sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
  47. sglang/srt/distributed/parallel_state.py +156 -80
  48. sglang/srt/entrypoints/engine.py +59 -18
  49. sglang/srt/entrypoints/grpc_request_manager.py +855 -0
  50. sglang/srt/entrypoints/grpc_server.py +810 -0
  51. sglang/srt/entrypoints/http_server.py +130 -59
  52. sglang/srt/entrypoints/openai/protocol.py +112 -4
  53. sglang/srt/entrypoints/openai/serving_base.py +65 -3
  54. sglang/srt/entrypoints/openai/serving_chat.py +204 -55
  55. sglang/srt/entrypoints/openai/serving_completions.py +14 -3
  56. sglang/srt/entrypoints/openai/serving_embedding.py +9 -3
  57. sglang/srt/entrypoints/openai/serving_rerank.py +3 -1
  58. sglang/srt/entrypoints/openai/serving_responses.py +48 -3
  59. sglang/srt/entrypoints/openai/serving_score.py +1 -0
  60. sglang/srt/environ.py +285 -0
  61. sglang/srt/eplb/eplb_manager.py +2 -2
  62. sglang/srt/eplb/expert_distribution.py +26 -13
  63. sglang/srt/eplb/expert_location.py +38 -8
  64. sglang/srt/eplb/expert_location_updater.py +1 -1
  65. sglang/srt/function_call/base_format_detector.py +3 -6
  66. sglang/srt/function_call/ebnf_composer.py +11 -9
  67. sglang/srt/function_call/function_call_parser.py +9 -2
  68. sglang/srt/function_call/glm4_moe_detector.py +4 -4
  69. sglang/srt/function_call/gpt_oss_detector.py +23 -0
  70. sglang/srt/function_call/json_array_parser.py +63 -0
  71. sglang/srt/function_call/kimik2_detector.py +17 -4
  72. sglang/srt/function_call/qwen3_coder_detector.py +1 -1
  73. sglang/srt/function_call/utils.py +96 -5
  74. sglang/srt/grpc/__init__.py +1 -0
  75. sglang/srt/grpc/compile_proto.py +245 -0
  76. sglang/srt/grpc/sglang_scheduler_pb2.py +111 -0
  77. sglang/srt/grpc/sglang_scheduler_pb2.pyi +434 -0
  78. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +239 -0
  79. sglang/srt/layers/activation.py +143 -9
  80. sglang/srt/layers/attention/aiter_backend.py +14 -15
  81. sglang/srt/layers/attention/ascend_backend.py +115 -9
  82. sglang/srt/layers/attention/attention_registry.py +206 -0
  83. sglang/srt/layers/attention/base_attn_backend.py +12 -3
  84. sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
  85. sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
  86. sglang/srt/layers/attention/fla/chunk.py +242 -0
  87. sglang/srt/layers/attention/fla/chunk_delta_h.py +314 -0
  88. sglang/srt/layers/attention/fla/chunk_o.py +178 -0
  89. sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +151 -0
  90. sglang/srt/layers/attention/fla/cumsum.py +300 -0
  91. sglang/srt/layers/attention/fla/fused_recurrent.py +640 -0
  92. sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +232 -0
  93. sglang/srt/layers/attention/fla/index.py +37 -0
  94. sglang/srt/layers/attention/fla/l2norm.py +150 -0
  95. sglang/srt/layers/attention/fla/layernorm_gated.py +326 -0
  96. sglang/srt/layers/attention/fla/op.py +66 -0
  97. sglang/srt/layers/attention/fla/solve_tril.py +465 -0
  98. sglang/srt/layers/attention/fla/utils.py +331 -0
  99. sglang/srt/layers/attention/fla/wy_fast.py +158 -0
  100. sglang/srt/layers/attention/flashattention_backend.py +41 -8
  101. sglang/srt/layers/attention/flashinfer_backend.py +118 -198
  102. sglang/srt/layers/attention/flashinfer_mla_backend.py +27 -27
  103. sglang/srt/layers/attention/flashmla_backend.py +7 -5
  104. sglang/srt/layers/attention/hybrid_attn_backend.py +68 -53
  105. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +602 -0
  106. sglang/srt/layers/attention/intel_amx_backend.py +3 -0
  107. sglang/srt/layers/attention/mamba/causal_conv1d.py +129 -0
  108. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +969 -0
  109. sglang/srt/layers/attention/mamba/mamba.py +629 -0
  110. sglang/srt/layers/attention/mamba/mamba_utils.py +81 -0
  111. sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
  112. sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
  113. sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
  114. sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +264 -0
  115. sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +622 -0
  116. sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +757 -0
  117. sglang/srt/layers/attention/mamba/ops/ssd_combined.py +262 -0
  118. sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +275 -0
  119. sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
  120. sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
  121. sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
  122. sglang/srt/layers/attention/nsa/nsa_indexer.py +761 -0
  123. sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
  124. sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
  125. sglang/srt/layers/attention/nsa/transform_index.py +144 -0
  126. sglang/srt/layers/attention/nsa/utils.py +24 -0
  127. sglang/srt/layers/attention/nsa_backend.py +887 -0
  128. sglang/srt/layers/attention/tbo_backend.py +6 -6
  129. sglang/srt/layers/attention/torch_flex_backend.py +325 -0
  130. sglang/srt/layers/attention/torch_native_backend.py +12 -6
  131. sglang/srt/layers/attention/triton_backend.py +57 -7
  132. sglang/srt/layers/attention/trtllm_mha_backend.py +5 -7
  133. sglang/srt/layers/attention/trtllm_mla_backend.py +276 -39
  134. sglang/srt/layers/attention/vision.py +58 -0
  135. sglang/srt/layers/attention/wave_backend.py +4 -4
  136. sglang/srt/layers/attention/wave_ops/decode_attention.py +2 -4
  137. sglang/srt/layers/attention/wave_ops/extend_attention.py +1 -3
  138. sglang/srt/layers/communicator.py +8 -0
  139. sglang/srt/layers/dp_attention.py +41 -2
  140. sglang/srt/layers/elementwise.py +3 -1
  141. sglang/srt/layers/layernorm.py +34 -15
  142. sglang/srt/layers/linear.py +55 -7
  143. sglang/srt/layers/logits_processor.py +44 -12
  144. sglang/srt/layers/moe/__init__.py +2 -1
  145. sglang/srt/layers/moe/cutlass_w4a8_moe.py +3 -3
  146. sglang/srt/layers/moe/ep_moe/kernels.py +2 -2
  147. sglang/srt/layers/moe/ep_moe/layer.py +256 -63
  148. sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +183 -0
  149. sglang/srt/layers/moe/fused_moe_native.py +5 -3
  150. sglang/srt/layers/moe/fused_moe_triton/configs/{triton_3_4_0/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json } +35 -35
  151. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=352,device_name=NVIDIA_RTX_5880_Ada_Generation,dtype=fp8_w8a8.json +146 -0
  152. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  153. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=512,device_name=NVIDIA_H20.json +146 -0
  154. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  155. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
  156. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
  157. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  158. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
  159. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
  160. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
  161. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  162. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
  163. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -2
  164. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +7 -3
  165. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +23 -20
  166. sglang/srt/layers/moe/fused_moe_triton/layer.py +71 -70
  167. sglang/srt/layers/moe/moe_runner/__init__.py +2 -1
  168. sglang/srt/layers/moe/moe_runner/base.py +274 -1
  169. sglang/srt/layers/moe/moe_runner/runner.py +80 -0
  170. sglang/srt/layers/moe/moe_runner/triton.py +448 -0
  171. sglang/srt/layers/moe/token_dispatcher/__init__.py +16 -4
  172. sglang/srt/layers/moe/token_dispatcher/{base_dispatcher.py → base.py} +67 -17
  173. sglang/srt/layers/moe/token_dispatcher/deepep.py +118 -56
  174. sglang/srt/layers/moe/token_dispatcher/standard.py +44 -2
  175. sglang/srt/layers/moe/topk.py +30 -9
  176. sglang/srt/layers/moe/utils.py +22 -6
  177. sglang/srt/layers/parameter.py +23 -6
  178. sglang/srt/layers/quantization/awq.py +19 -7
  179. sglang/srt/layers/quantization/base_config.py +11 -6
  180. sglang/srt/layers/quantization/blockwise_int8.py +38 -27
  181. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +1 -0
  182. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +50 -30
  183. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +2 -0
  184. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +13 -1
  185. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
  186. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +2 -10
  187. sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +27 -0
  188. sglang/srt/layers/quantization/fp8.py +78 -49
  189. sglang/srt/layers/quantization/fp8_utils.py +51 -32
  190. sglang/srt/layers/quantization/gptq.py +25 -17
  191. sglang/srt/layers/quantization/modelopt_quant.py +190 -55
  192. sglang/srt/layers/quantization/moe_wna16.py +21 -18
  193. sglang/srt/layers/quantization/mxfp4.py +74 -42
  194. sglang/srt/layers/quantization/quark/quark_moe.py +48 -30
  195. sglang/srt/layers/quantization/unquant.py +135 -47
  196. sglang/srt/layers/quantization/w4afp8.py +26 -17
  197. sglang/srt/layers/quantization/w8a8_fp8.py +35 -20
  198. sglang/srt/layers/quantization/w8a8_int8.py +91 -41
  199. sglang/srt/layers/rotary_embedding.py +78 -31
  200. sglang/srt/layers/sampler.py +213 -21
  201. sglang/srt/layers/utils.py +23 -0
  202. sglang/srt/lora/backend/base_backend.py +50 -8
  203. sglang/srt/lora/backend/chunked_backend.py +348 -0
  204. sglang/srt/lora/backend/triton_backend.py +99 -5
  205. sglang/srt/lora/layers.py +32 -0
  206. sglang/srt/lora/lora.py +8 -3
  207. sglang/srt/lora/lora_manager.py +52 -118
  208. sglang/srt/lora/mem_pool.py +25 -11
  209. sglang/srt/lora/triton_ops/__init__.py +4 -0
  210. sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
  211. sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +174 -0
  212. sglang/srt/lora/utils.py +22 -11
  213. sglang/srt/managers/async_dynamic_batch_tokenizer.py +170 -0
  214. sglang/srt/managers/cache_controller.py +199 -301
  215. sglang/srt/managers/data_parallel_controller.py +115 -80
  216. sglang/srt/managers/detokenizer_manager.py +19 -15
  217. sglang/srt/managers/disagg_service.py +46 -0
  218. sglang/srt/managers/io_struct.py +340 -109
  219. sglang/srt/managers/mm_utils.py +44 -6
  220. sglang/srt/managers/multi_tokenizer_mixin.py +357 -407
  221. sglang/srt/managers/multimodal_processor.py +1 -2
  222. sglang/srt/managers/overlap_utils.py +53 -0
  223. sglang/srt/managers/schedule_batch.py +240 -138
  224. sglang/srt/managers/schedule_policy.py +144 -17
  225. sglang/srt/managers/scheduler.py +502 -209
  226. sglang/srt/managers/scheduler_input_blocker.py +1 -1
  227. sglang/srt/managers/scheduler_metrics_mixin.py +99 -126
  228. sglang/srt/managers/scheduler_output_processor_mixin.py +75 -22
  229. sglang/srt/managers/scheduler_profiler_mixin.py +6 -6
  230. sglang/srt/managers/scheduler_update_weights_mixin.py +7 -0
  231. sglang/srt/managers/tokenizer_communicator_mixin.py +675 -0
  232. sglang/srt/managers/tokenizer_manager.py +320 -632
  233. sglang/srt/managers/tp_worker.py +81 -22
  234. sglang/srt/managers/tp_worker_overlap_thread.py +71 -56
  235. sglang/srt/managers/utils.py +1 -45
  236. sglang/srt/mem_cache/allocator.py +14 -20
  237. sglang/srt/mem_cache/allocator_ascend.py +41 -27
  238. sglang/srt/mem_cache/base_prefix_cache.py +1 -1
  239. sglang/srt/mem_cache/chunk_cache.py +8 -1
  240. sglang/srt/mem_cache/evict_policy.py +23 -0
  241. sglang/srt/mem_cache/hicache_storage.py +43 -24
  242. sglang/srt/mem_cache/hiradix_cache.py +222 -75
  243. sglang/srt/mem_cache/memory_pool.py +535 -58
  244. sglang/srt/mem_cache/memory_pool_host.py +239 -228
  245. sglang/srt/mem_cache/radix_cache.py +222 -73
  246. sglang/srt/mem_cache/radix_cache_cpp.py +11 -8
  247. sglang/srt/mem_cache/storage/__init__.py +10 -0
  248. sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +151 -0
  249. sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +109 -0
  250. sglang/srt/mem_cache/storage/backend_factory.py +223 -0
  251. sglang/srt/mem_cache/storage/eic/eic_storage.py +778 -0
  252. sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
  253. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +164 -0
  254. sglang/srt/mem_cache/storage/hf3fs/{client_hf3fs.py → hf3fs_usrbio_client.py} +5 -1
  255. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +259 -62
  256. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +284 -0
  257. sglang/srt/mem_cache/storage/lmcache/unit_test.py +121 -0
  258. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +166 -17
  259. sglang/srt/mem_cache/swa_radix_cache.py +25 -36
  260. sglang/srt/metrics/collector.py +511 -132
  261. sglang/srt/metrics/func_timer.py +2 -7
  262. sglang/srt/metrics/startup_func_log_and_timer.py +150 -0
  263. sglang/srt/metrics/utils.py +8 -1
  264. sglang/srt/model_executor/cpu_graph_runner.py +640 -0
  265. sglang/srt/model_executor/cuda_graph_runner.py +52 -37
  266. sglang/srt/model_executor/forward_batch_info.py +82 -40
  267. sglang/srt/model_executor/model_runner.py +432 -157
  268. sglang/srt/model_executor/npu_graph_runner.py +12 -5
  269. sglang/srt/model_loader/__init__.py +9 -3
  270. sglang/srt/model_loader/loader.py +133 -5
  271. sglang/srt/model_loader/remote_instance_weight_loader_utils.py +69 -0
  272. sglang/srt/model_loader/weight_utils.py +158 -3
  273. sglang/srt/models/apertus.py +686 -0
  274. sglang/srt/models/bailing_moe.py +820 -217
  275. sglang/srt/models/bailing_moe_nextn.py +168 -0
  276. sglang/srt/models/deepseek_nextn.py +6 -1
  277. sglang/srt/models/deepseek_v2.py +607 -130
  278. sglang/srt/models/dots_ocr.py +173 -0
  279. sglang/srt/models/dots_vlm.py +174 -0
  280. sglang/srt/models/dots_vlm_vit.py +337 -0
  281. sglang/srt/models/ernie4.py +1 -1
  282. sglang/srt/models/falcon_h1.py +576 -0
  283. sglang/srt/models/gemma3_causal.py +0 -2
  284. sglang/srt/models/gemma3_mm.py +1 -1
  285. sglang/srt/models/gemma3n_mm.py +2 -2
  286. sglang/srt/models/glm4_moe.py +4 -4
  287. sglang/srt/models/glm4_moe_nextn.py +2 -2
  288. sglang/srt/models/glm4v.py +5 -3
  289. sglang/srt/models/glm4v_moe.py +4 -1
  290. sglang/srt/models/gpt_oss.py +8 -31
  291. sglang/srt/models/kimi_vl_moonvit.py +2 -2
  292. sglang/srt/models/llama.py +4 -0
  293. sglang/srt/models/llama4.py +9 -0
  294. sglang/srt/models/llama_eagle3.py +13 -0
  295. sglang/srt/models/longcat_flash.py +3 -3
  296. sglang/srt/models/longcat_flash_nextn.py +1 -1
  297. sglang/srt/models/mllama4.py +40 -4
  298. sglang/srt/models/opt.py +637 -0
  299. sglang/srt/models/qwen2_5_vl.py +29 -5
  300. sglang/srt/models/qwen2_audio.py +1 -1
  301. sglang/srt/models/qwen2_moe.py +120 -13
  302. sglang/srt/models/qwen2_vl.py +1 -1
  303. sglang/srt/models/qwen3.py +18 -3
  304. sglang/srt/models/qwen3_moe.py +32 -4
  305. sglang/srt/models/qwen3_next.py +1069 -0
  306. sglang/srt/models/qwen3_next_mtp.py +112 -0
  307. sglang/srt/models/qwen3_vl.py +787 -0
  308. sglang/srt/models/qwen3_vl_moe.py +471 -0
  309. sglang/srt/models/registry.py +15 -3
  310. sglang/srt/models/sarashina2_vision.py +269 -0
  311. sglang/srt/models/solar.py +505 -0
  312. sglang/srt/models/starcoder2.py +357 -0
  313. sglang/srt/models/step3_vl.py +1 -1
  314. sglang/srt/models/torch_native_llama.py +9 -2
  315. sglang/srt/models/utils.py +51 -0
  316. sglang/srt/multimodal/processors/base_processor.py +15 -7
  317. sglang/srt/multimodal/processors/dots_vlm.py +98 -0
  318. sglang/srt/multimodal/processors/glm4v.py +9 -9
  319. sglang/srt/multimodal/processors/internvl.py +153 -129
  320. sglang/srt/multimodal/processors/qwen_vl.py +23 -6
  321. sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
  322. sglang/srt/offloader.py +27 -3
  323. sglang/srt/parser/jinja_template_utils.py +6 -0
  324. sglang/srt/sampling/sampling_batch_info.py +38 -17
  325. sglang/srt/sampling/sampling_params.py +7 -0
  326. sglang/srt/server_args.py +966 -267
  327. sglang/srt/server_args_config_parser.py +146 -0
  328. sglang/srt/single_batch_overlap.py +151 -0
  329. sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
  330. sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
  331. sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
  332. sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
  333. sglang/srt/speculative/cpp_ngram/param.h +125 -0
  334. sglang/srt/speculative/cpp_ngram/queue.h +71 -0
  335. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +7 -1
  336. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +13 -2
  337. sglang/srt/speculative/{eagle_utils.py → eagle_info.py} +207 -757
  338. sglang/srt/speculative/eagle_worker.py +99 -28
  339. sglang/srt/speculative/ngram_utils.py +428 -0
  340. sglang/srt/speculative/ngram_worker.py +245 -0
  341. sglang/srt/speculative/spec_info.py +52 -0
  342. sglang/srt/speculative/spec_utils.py +606 -0
  343. sglang/srt/speculative/standalone_worker.py +109 -0
  344. sglang/srt/torch_memory_saver_adapter.py +5 -7
  345. sglang/srt/tracing/trace.py +578 -0
  346. sglang/srt/two_batch_overlap.py +8 -5
  347. sglang/srt/utils/__init__.py +2 -0
  348. sglang/srt/{utils.py → utils/common.py} +433 -77
  349. sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +53 -5
  350. sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
  351. sglang/srt/utils/rpd_utils.py +452 -0
  352. sglang/srt/utils/slow_rank_detector.py +71 -0
  353. sglang/srt/warmup.py +8 -4
  354. sglang/srt/weight_sync/utils.py +2 -2
  355. sglang/test/attention/test_trtllm_mla_backend.py +169 -5
  356. sglang/test/get_logits_ut.py +57 -0
  357. sglang/test/run_eval.py +79 -11
  358. sglang/test/runners.py +5 -1
  359. sglang/test/simple_eval_common.py +5 -2
  360. sglang/test/simple_eval_mmmu_vlm.py +441 -0
  361. sglang/test/test_block_fp8.py +2 -2
  362. sglang/test/test_cutlass_moe.py +24 -6
  363. sglang/test/test_deterministic.py +297 -0
  364. sglang/test/test_disaggregation_utils.py +77 -0
  365. sglang/test/test_fp4_moe.py +370 -1
  366. sglang/test/test_programs.py +1 -1
  367. sglang/test/test_utils.py +383 -5
  368. sglang/utils.py +21 -1
  369. sglang/version.py +1 -1
  370. {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc2.dist-info}/METADATA +69 -124
  371. {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc2.dist-info}/RECORD +375 -245
  372. sglang/srt/disaggregation/launch_lb.py +0 -118
  373. sglang/srt/mem_cache/lora_radix_cache.py +0 -421
  374. /sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
  375. {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc2.dist-info}/WHEEL +0 -0
  376. {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc2.dist-info}/licenses/LICENSE +0 -0
  377. {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc2.dist-info}/top_level.txt +0 -0
sglang/bench_serving.py CHANGED
@@ -75,6 +75,7 @@ class RequestFuncInput:
     lora_name: str
     image_data: Optional[List[str]]
     extra_request_body: Dict[str, Any]
+    timestamp: Optional[float] = None
 
 
 @dataclass
@@ -104,10 +105,13 @@ def remove_suffix(text: str, suffix: str) -> str:
 
 
 def get_auth_headers() -> Dict[str, str]:
-    api_key = os.environ.get("OPENAI_API_KEY")
-    if api_key:
-        return {"Authorization": f"Bearer {api_key}"}
+    openai_api_key = os.environ.get("OPENAI_API_KEY")
+    if openai_api_key:
+        return {"Authorization": f"Bearer {openai_api_key}"}
     else:
+        api_key = os.environ.get("API_KEY")
+        if api_key:
+            return {"Authorization": f"{api_key}"}
         return {}
 
 
@@ -204,6 +208,10 @@ async def async_request_openai_completions(
         "ignore_eos": not args.disable_ignore_eos,
         **request_func_input.extra_request_body,
     }
+
+    if request_func_input.image_data:
+        payload.update({"image_data": request_func_input.image_data})
+
     headers = get_auth_headers()
 
     output = RequestFuncOutput.init_new(request_func_input)
@@ -627,7 +635,7 @@ def get_tokenizer(
     if pretrained_model_name_or_path.endswith(
         ".json"
     ) or pretrained_model_name_or_path.endswith(".model"):
-        from sglang.srt.hf_transformers_utils import get_tokenizer
+        from sglang.srt.utils.hf_transformers_utils import get_tokenizer
 
         return get_tokenizer(pretrained_model_name_or_path)
 
@@ -696,6 +704,24 @@ def get_dataset(args, tokenizer):
             apply_chat_template=args.apply_chat_template,
             random_sample=True,
         )
+    elif args.dataset_name == "mooncake":
+        # For mooncake, we don't generate the prompts here.
+        # We just load the raw trace data. The async generator will handle the rest.
+        if not args.dataset_path:
+            local_path = os.path.join("/tmp", args.mooncake_workload + "_trace.jsonl")
+        else:
+            local_path = args.dataset_path
+
+        if not os.path.exists(local_path):
+            download_and_cache_file(
+                MOONCAKE_DATASET_URL[args.mooncake_workload], local_path
+            )
+
+        with open(local_path, "r") as f:
+            all_requests_data = [json.loads(line) for line in f if line.strip()]
+
+        # Limit the number of requests based on --num-prompts
+        input_requests = all_requests_data[: args.num_prompts]
     else:
         raise ValueError(f"Unknown dataset: {args.dataset_name}")
     return input_requests
@@ -750,6 +776,12 @@ class BenchmarkMetrics:
 
 
 SHAREGPT_URL = "https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json"
+MOONCAKE_DATASET_URL = {
+    "mooncake": "https://raw.githubusercontent.com/kvcache-ai/Mooncake/main/FAST25-release/arxiv-trace/mooncake_trace.jsonl",
+    "conversation": "https://raw.githubusercontent.com/kvcache-ai/Mooncake/main/FAST25-release/traces/conversation_trace.jsonl",
+    "synthetic": "https://raw.githubusercontent.com/kvcache-ai/Mooncake/main/FAST25-release/traces/synthetic_trace.jsonl",
+    "toolagent": "https://raw.githubusercontent.com/kvcache-ai/Mooncake/main/FAST25-release/traces/toolagent_trace.jsonl",
+}
 
 
 def download_and_cache_file(url: str, filename: Optional[str] = None):
@@ -808,6 +840,80 @@ class DatasetRow:
     prompt_len: int
     output_len: int
     image_data: Optional[List[str]] = None
+    timestamp: Optional[float] = None
+
+
+async def get_mooncake_request_over_time(
+    input_requests: List[Dict],
+    tokenizer: PreTrainedTokenizerBase,
+    slowdown_factor: float,
+    num_rounds: int,
+) -> AsyncGenerator[DatasetRow, None]:
+    """
+    An async generator that yields requests based on the timestamps in the Mooncake trace file,
+    with support for multi-round sessions.
+    """
+    if not input_requests:
+        return
+
+    input_requests.sort(key=lambda r: r["timestamp"])
+
+    start_time = time.perf_counter()
+    trace_start_time_ms = input_requests[0]["timestamp"]
+
+    for record in input_requests:
+        # Calculate when this entire session should start
+        relative_arrival_time_s = (record["timestamp"] - trace_start_time_ms) / 1000.0
+        target_arrival_time_s = relative_arrival_time_s * slowdown_factor
+
+        current_elapsed_time_s = time.perf_counter() - start_time
+        sleep_duration_s = target_arrival_time_s - current_elapsed_time_s
+        if sleep_duration_s > 0:
+            await asyncio.sleep(sleep_duration_s)
+
+        # Once the session starts, generate all rounds for it as a burst
+        # This simulates a user engaging in a multi-turn conversation
+
+        # Base user query constructed from hash_ids
+        user_query_base = ""
+        hash_ids = record.get("hash_ids", [])
+        for hash_id in hash_ids:
+            user_query_base += f"{hash_id}" + " ".join(
+                ["hi"] * 128
+            )  # Shorter for multi-round
+        user_query_base += "Tell me a story based on this context."
+
+        output_len_per_round = record.get("output_length", 256)
+        chat_history = []
+
+        for i in range(num_rounds):
+            # Add user query for the current round
+            chat_history.append(
+                {"role": "user", "content": f"Round {i+1}: {user_query_base}"}
+            )
+
+            # Form the full prompt from history
+            try:
+                full_prompt_text = tokenizer.apply_chat_template(
+                    chat_history, tokenize=False, add_generation_prompt=True
+                )
+            except Exception:
+                full_prompt_text = "\n".join(
+                    [f"{msg['role']}: {msg['content']}" for msg in chat_history]
+                )
+
+            prompt_len = len(tokenizer.encode(full_prompt_text))
+
+            yield DatasetRow(
+                prompt=full_prompt_text,
+                prompt_len=prompt_len,
+                output_len=output_len_per_round,
+            )
+
+            # Add a placeholder assistant response for the next round's context
+            # We use a placeholder because we don't know the real response
+            placeholder_response = " ".join(["story"] * output_len_per_round)
+            chat_history.append({"role": "assistant", "content": placeholder_response})
 
 
 def sample_mmmu_requests(
@@ -896,17 +1002,25 @@ def sample_mmmu_requests(
         prompt = f"Question: {question}\n\nAnswer: "
         if apply_chat_template:
             try:
+                is_phi4_multimodal = (
+                    "phi-4-multimodal" in tokenizer.name_or_path.lower()
+                )
+                if is_phi4_multimodal:
+                    # <|endoftext10|> is the image token used in the phi-4-multimodal model.
+                    content = prompt.replace("image 1", "<|endoftext10|>")
+                else:
+                    content = [
+                        {
+                            "type": "image_url",
+                            "image_url": {"url": image_data},
+                        },
+                        {"type": "text", "text": prompt},
+                    ]
                 prompt = tokenizer.apply_chat_template(
                     [
                         {
                             "role": "user",
-                            "content": [
-                                {
-                                    "type": "image_url",
-                                    "image_url": {"url": image_data},
-                                },
-                                {"type": "text", "text": prompt},
-                            ],
+                            "content": content,
                         }
                     ],
                     add_generation_prompt=True,
@@ -1000,7 +1114,8 @@ def sample_sharegpt_requests(
                 add_generation_prompt=True,
                 tokenize=False,
             )
-            prompt = prompt.replace(tokenizer.bos_token, "")
+            if tokenizer.bos_token:
+                prompt = prompt.replace(tokenizer.bos_token, "")
 
         prompt_token_ids = tokenizer.encode(prompt)
         completion = dataset[i][1]
@@ -1359,19 +1474,41 @@
 async def get_request(
     input_requests: List[DatasetRow],
     request_rate: float,
+    use_trace_timestamps: bool = False,
+    slowdown_factor: float = 1.0,
 ) -> AsyncGenerator[DatasetRow, None]:
-    input_requests = iter(input_requests)
-    for request in input_requests:
-        yield request
+    if use_trace_timestamps:
+        print(
+            f"Using trace timestamps for request generation with slowdown factor {slowdown_factor}."
+        )
+        # Sort requests by timestamp for correct replay
+        input_requests.sort(key=lambda r: r.timestamp)
 
-        if request_rate == float("inf"):
-            # If the request rate is infinity, then we don't need to wait.
-            continue
+        start_time = time.perf_counter()
+        trace_start_time_ms = input_requests[0].timestamp if input_requests else 0
+
+        for request in input_requests:
+            trace_time_s = (request.timestamp - trace_start_time_ms) / 1000.0
+            target_arrival_time = start_time + (trace_time_s * slowdown_factor)
+
+            sleep_duration = target_arrival_time - time.perf_counter()
+            if sleep_duration > 0:
+                await asyncio.sleep(sleep_duration)
+
+            yield request
+    else:
+        input_requests_iter = iter(input_requests)
+        for request in input_requests_iter:
+            yield request
+
+            if request_rate == float("inf"):
+                # If the request rate is infinity, then we don't need to wait.
+                continue
 
-        # Sample the request interval from the exponential distribution.
-        interval = np.random.exponential(1.0 / request_rate)
-        # The next request will be sent after the interval.
-        await asyncio.sleep(interval)
+            # Sample the request interval from the exponential distribution.
+            interval = np.random.exponential(1.0 / request_rate)
+            # The next request will be sent after the interval.
+            await asyncio.sleep(interval)
 
 
 def calculate_metrics(
@@ -1397,7 +1534,7 @@ def calculate_metrics(
             tokenizer.encode(outputs[i].generated_text, add_special_tokens=False)
         )
         retokenized_output_lens.append(retokenized_output_len)
-        total_input += input_requests[i].prompt_len
+        total_input += outputs[i].prompt_len
         if output_len > 1:
             tpots.append((outputs[i].latency - outputs[i].ttft) / (output_len - 1))
         itls += outputs[i].itl
@@ -1469,6 +1606,9 @@ async def benchmark(
     pd_separated: bool = False,
     flush_cache: bool = False,
    warmup_requests: int = 1,
+    use_trace_timestamps: bool = False,
+    mooncake_slowdown_factor=1.0,
+    mooncake_num_rounds=1,
 ):
     if backend in ASYNC_REQUEST_FUNCS:
         request_func = ASYNC_REQUEST_FUNCS[backend]
@@ -1488,8 +1628,32 @@
     # Warmup
     print(f"Starting warmup with {warmup_requests} sequences...")
 
-    # Use the first request for all warmup iterations
-    test_request = input_requests[0]
+    # Handle the data structure difference for the warmup request
+    if args.dataset_name == "mooncake":
+        # For mooncake, input_requests is a list of dicts.
+        # We need to build a temporary DatasetRow for the warmup phase.
+        warmup_record = input_requests[0]
+
+        # Build prompt from hash_ids, just like in the async generator
+        hash_ids = warmup_record.get("hash_ids", [])
+        prompt_text = ""
+        for hash_id in hash_ids:
+            prompt_text += f"{hash_id}" + " ".join(["hi"] * 512)
+        prompt_text += "Can you tell me a detailed story in 1000 words?"
+
+        output_len = warmup_record.get("output_length", 32)
+        prompt_len = len(tokenizer.encode(prompt_text))
+
+        # Create a temporary DatasetRow object for warmup
+        test_request = DatasetRow(
+            prompt=prompt_text,
+            prompt_len=prompt_len,
+            output_len=output_len,
+            image_data=None,  # Mooncake doesn't have image data
+        )
+    else:
+        # For all other datasets, input_requests is a list of DatasetRow objects
+        test_request = input_requests[0]
 
     if lora_names is not None and len(lora_names) != 0:
         lora_name = lora_names[0]
@@ -1543,12 +1707,26 @@
     if profile_output.success:
         print("Profiler started")
 
-    pbar = None if disable_tqdm else tqdm(total=len(input_requests))
-
     # Run all requests
     benchmark_start_time = time.perf_counter()
     tasks: List[asyncio.Task] = []
-    async for request in get_request(input_requests, request_rate):
+    pbar_total = len(input_requests)
+    if (
+        backend == "sglang" and args.dataset_name == "mooncake"
+    ):  # Assuming mooncake is mainly for sglang or similar backends
+        print("Using time-based Mooncake request scheduler, ignoring --request-rate.")
+        request_generator = get_mooncake_request_over_time(
+            input_requests, tokenizer, mooncake_slowdown_factor, mooncake_num_rounds
+        )
+        print(
+            f"Starting Mooncake trace replay. Sessions: {len(input_requests)}, Rounds per session: {mooncake_num_rounds}. Slowdown factor: {mooncake_slowdown_factor}"
+        )
+        pbar_total *= args.mooncake_num_rounds
+    else:
+        request_generator = get_request(input_requests, request_rate)
+
+    pbar = None if disable_tqdm else tqdm(total=pbar_total)
+    async for request in request_generator:
         if lora_names is not None and len(lora_names) != 0:
             idx = random.randint(0, len(lora_names) - 1)
             lora_name = lora_names[idx]
@@ -1564,6 +1742,7 @@
             lora_name=lora_name,
             image_data=request.image_data,
             extra_request_body=extra_request_body,
+            timestamp=request.timestamp,
         )
 
         tasks.append(
@@ -1584,7 +1763,9 @@
         pbar.close()
 
     if "sglang" in backend:
-        server_info = requests.get(base_url + "/get_server_info")
+        server_info = requests.get(
+            base_url + "/get_server_info", headers=get_auth_headers()
+        )
         if server_info.status_code == 200:
            server_info_json = server_info.json()
            if "decode" in server_info_json:
@@ -1609,7 +1790,11 @@
 
     print("\n{s:{c}^{n}}".format(s=" Serving Benchmark Result ", n=50, c="="))
     print("{:<40} {:<10}".format("Backend:", backend))
-    print("{:<40} {:<10}".format("Traffic request rate:", request_rate))
+    print(
+        "{:<40} {:<10}".format(
+            "Traffic request rate:", "trace" if use_trace_timestamps else request_rate
+        )
+    )
     print(
         "{:<40} {:<10}".format(
             "Max request concurrency:",
@@ -1678,7 +1863,7 @@
         # Arguments
         "backend": args.backend,
         "dataset_name": args.dataset_name,
-        "request_rate": request_rate,
+        "request_rate": "trace" if use_trace_timestamps else request_rate,
         "max_concurrency": max_concurrency,
         "sharegpt_output_len": args.sharegpt_output_len,
         "random_input_len": args.random_input_len,
@@ -1731,7 +1916,9 @@
     elif args.dataset_name.startswith("random"):
         output_file_name = f"{args.backend}_{now}_{args.num_prompts}_{args.random_input_len}_{args.random_output_len}.jsonl"
     else:
-        output_file_name = f"{args.backend}_{now}_{args.num_prompts}_sharegpt.jsonl"
+        output_file_name = (
+            f"{args.backend}_{now}_{args.num_prompts}_{args.dataset_name}.jsonl"
+        )
 
     result_details = {
         "input_lens": [output.prompt_len for output in outputs],
@@ -1786,6 +1973,17 @@ def run_benchmark(args_: argparse.Namespace):
     if not hasattr(args, "tokenize_prompt"):
         args.tokenize_prompt = False
 
+    if not hasattr(args, "use_trace_timestamps"):
+        args.use_trace_timestamps = False
+    if not hasattr(args, "mooncake_slowdown_factor"):
+        args.mooncake_slowdown_factor = 1.0
+
+    if not hasattr(args, "mooncake_slowdown_factor"):
+        args.mooncake_slowdown_factor = 1.0
+
+    if not hasattr(args, "mooncake_num_rounds"):
+        args.mooncake_num_rounds = 1
+
     print(f"benchmark_args={args}")
 
     # Set global environments
@@ -1919,6 +2117,9 @@
             pd_separated=args.pd_separated,
             flush_cache=args.flush_cache,
             warmup_requests=args.warmup_requests,
+            use_trace_timestamps=args.use_trace_timestamps,
+            mooncake_slowdown_factor=args.mooncake_slowdown_factor,
+            mooncake_num_rounds=args.mooncake_num_rounds,
         )
     )
 
@@ -1975,6 +2176,7 @@ if __name__ == "__main__":
             "generated-shared-prefix",
             "mmmu",
             "random-image",
+            "mooncake",
         ],
         help="Name of the dataset to benchmark on.",
     )
@@ -2051,6 +2253,11 @@
         help="Number of requests per second. If this is inf, then all the requests are sent at time 0. "
         "Otherwise, we use Poisson process to synthesize the request arrival times. Default is inf.",
     )
+    parser.add_argument(
+        "--use-trace-timestamps",
+        action="store_true",
+        help="Use timestamps from the trace file for request scheduling. Only valid for 'mooncake' dataset.",
+    )
     parser.add_argument(
         "--max-concurrency",
         type=int,
@@ -2174,5 +2381,33 @@
         default=256,
         help="Target length in tokens for outputs in generated-shared-prefix dataset",
     )
+    mooncake_group = parser.add_argument_group("mooncake dataset arguments")
+    mooncake_group.add_argument(
+        "--mooncake-slowdown-factor",
+        type=float,
+        default=1.0,
+        help="Slowdown factor for replaying the mooncake trace. "
+        "A value of 2.0 means the replay is twice as slow. "
+        "NOTE: --request-rate is IGNORED in mooncake mode.",
+    )
+    mooncake_group.add_argument(
+        "--mooncake-num-rounds",
+        type=int,
+        default=1,
+        help="Number of conversation rounds for each session in the mooncake dataset. "
+        "A value > 1 will enable true multi-turn session benchmarking.",
+    )
+    mooncake_group.add_argument(
+        "--mooncake-workload",
+        type=str,
+        default="conversation",
+        choices=[
+            "mooncake",
+            "conversation",
+            "synthetic",
+            "toolagent",
+        ],
+        help="Underlying workload for the mooncake dataset.",
+    )
     args = parser.parse_args()
     run_benchmark(args)
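The core of the new Mooncake support is the timestamp-driven pacing shared by get_request() and get_mooncake_request_over_time() above: trace timestamps are in milliseconds, each arrival is scheduled relative to the first timestamp, and the slowdown factor stretches the schedule (2.0 replays at half speed). A minimal, self-contained sketch of that pacing loop, distilled from the hunks above (the row dicts and the print are illustrative only):

    import asyncio
    import time


    async def replay(rows, slowdown_factor=1.0):
        # rows carry a millisecond "timestamp" key, as in the Mooncake trace files
        rows.sort(key=lambda r: r["timestamp"])
        start = time.perf_counter()
        t0 = rows[0]["timestamp"]
        for row in rows:
            # trace offset converted to seconds, then stretched by the slowdown factor
            target_s = (row["timestamp"] - t0) / 1000.0 * slowdown_factor
            delay = target_s - (time.perf_counter() - start)
            if delay > 0:
                await asyncio.sleep(delay)
            print(f"dispatch at +{time.perf_counter() - start:.2f}s")


    asyncio.run(replay([{"timestamp": 0}, {"timestamp": 500}, {"timestamp": 1500}]))

With the flags added above, a trace replay would be invoked along the lines of: python3 -m sglang.bench_serving --backend sglang --dataset-name mooncake --mooncake-workload conversation --mooncake-num-rounds 2 --mooncake-slowdown-factor 2.0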
sglang/global_config.py CHANGED
@@ -37,8 +37,8 @@ class GlobalConfig:
         )
         # Runtime constants: others
         self.retract_decode_steps = 20
-        self.flashinfer_workspace_size = os.environ.get(
-            "FLASHINFER_WORKSPACE_SIZE", 384 * 1024 * 1024
+        self.flashinfer_workspace_size = int(
+            os.environ.get("FLASHINFER_WORKSPACE_SIZE", 384 * 1024 * 1024)
         )
 
         # Output tokenization configs
sglang/lang/backend/runtime_endpoint.py CHANGED
@@ -433,7 +433,7 @@ class Runtime:
         self.endpoint.cache_prefix(prefix)
 
     def get_tokenizer(self):
-        from sglang.srt.hf_transformers_utils import get_tokenizer
+        from sglang.srt.utils.hf_transformers_utils import get_tokenizer
 
         return get_tokenizer(
             self.server_args.tokenizer_path,
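The global_config.py change fixes a latent type bug: os.environ.get() returns a str whenever the variable is set, so the workspace size was an int only when the default was used. A quick illustration of both behaviors:

    import os

    os.environ["FLASHINFER_WORKSPACE_SIZE"] = "134217728"

    # Old behavior: the type depends on whether the env var is set
    raw = os.environ.get("FLASHINFER_WORKSPACE_SIZE", 384 * 1024 * 1024)
    print(type(raw))  # <class 'str'>

    # New behavior: always an int
    size = int(os.environ.get("FLASHINFER_WORKSPACE_SIZE", 384 * 1024 * 1024))
    print(type(size), size)  # <class 'int'> 134217728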
sglang/launch_server.py CHANGED
@@ -7,9 +7,23 @@ from sglang.srt.entrypoints.http_server import launch_server
 from sglang.srt.server_args import prepare_server_args
 from sglang.srt.utils import kill_process_tree
 
+MOVE_ENVS_WARN = """
+########################################################################
+# For contributors and developers:                                     #
+# Please move environment variable definitions to sglang.srt.environ   #
+# using the following pattern:                                         #
+#     SGLANG_XXX = EnvBool(False)                                      #
+#                                                                      #
+########################################################################
+"""
+
 if __name__ == "__main__":
     server_args = prepare_server_args(sys.argv[1:])
 
+    from sglang.srt.server_args import print_deprecated_warning
+
+    print_deprecated_warning(MOVE_ENVS_WARN)
+
     try:
         launch_server(server_args)
     finally:
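The banner points contributors at the new sglang.srt.environ module (entry 60 in the file list, +285 lines), where the real EnvBool lives. The following is only a hypothetical sketch of the "SGLANG_XXX = EnvBool(False)" pattern the banner describes, assuming the env var shares the attribute's name; the actual implementation may differ:

    import os


    class EnvBool:
        # Hypothetical descriptor illustrating the pattern; not the real
        # sglang.srt.environ implementation.
        def __init__(self, default: bool):
            self.default = default

        def __set_name__(self, owner, name):
            self.name = name  # attribute name doubles as the env var name

        def __get__(self, obj, objtype=None):
            raw = os.environ.get(self.name)
            if raw is None:
                return self.default
            return raw.lower() in ("1", "true", "yes", "on")


    class Envs:
        SGLANG_TEST_FLAG = EnvBool(False)  # hypothetical variable name


    print(Envs().SGLANG_TEST_FLAG)  # False unless SGLANG_TEST_FLAG is set truthy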
sglang/profiler.py CHANGED
@@ -15,7 +15,7 @@ from typing import List, Optional
 
 import requests
 
-PARENT_FOLDER = "/tmp/sglang-profile"
+PROFILER_DIR = os.getenv("SGLANG_TORCH_PROFILER_DIR", "/tmp")
 
 
 def _run_profile(
@@ -27,7 +27,7 @@ def _run_profile(
     profile_by_stage: bool = False,
 ) -> str:
     if output_dir is None:
-        output_dir = PARENT_FOLDER
+        output_dir = PROFILER_DIR
 
     output_dir = os.path.normpath(output_dir)
     output_dir = os.path.abspath(output_dir)
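profiler.py now resolves its default output directory from the environment instead of the hard-coded /tmp/sglang-profile, and the resolution happens once at import time. A tiny illustration (the directory value is an example):

    import os

    # SGLANG_TORCH_PROFILER_DIR wins; otherwise /tmp
    os.environ["SGLANG_TORCH_PROFILER_DIR"] = "/data/profiles"  # example value
    print(os.getenv("SGLANG_TORCH_PROFILER_DIR", "/tmp"))  # /data/profiles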
sglang/srt/batch_invariant_ops/__init__.py ADDED
@@ -0,0 +1,27 @@
+# Adapted from https://github.com/thinking-machines-lab/batch_invariant_ops/blob/main/batch_invariant_ops/__init__.py
+
+from .batch_invariant_ops import (
+    AttentionBlockSize,
+    disable_batch_invariant_mode,
+    enable_batch_invariant_mode,
+    get_batch_invariant_attention_block_size,
+    is_batch_invariant_mode_enabled,
+    log_softmax,
+    matmul_persistent,
+    mean_dim,
+    set_batch_invariant_mode,
+)
+
+__version__ = "0.1.0"
+
+__all__ = [
+    "set_batch_invariant_mode",
+    "is_batch_invariant_mode_enabled",
+    "disable_batch_invariant_mode",
+    "enable_batch_invariant_mode",
+    "matmul_persistent",
+    "log_softmax",
+    "mean_dim",
+    "get_batch_invariant_attention_block_size",
+    "AttentionBlockSize",
+]
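For context, the upstream batch_invariant_ops project this module is adapted from exposes set_batch_invariant_mode as a context manager that swaps in batch-size-independent kernels, so a result computed at batch size N matches the same row computed at batch size 1. A usage sketch under that assumption (the sglang copy may differ in details; a CUDA device is required):

    import torch

    from sglang.srt.batch_invariant_ops import (
        is_batch_invariant_mode_enabled,
        set_batch_invariant_mode,
    )

    a = torch.randn(8, 16, device="cuda")
    b = torch.randn(16, 32, device="cuda")

    with set_batch_invariant_mode():
        assert is_batch_invariant_mode_enabled()
        full = a @ b     # routed through the batch-invariant matmul
        row = a[:1] @ b  # same first row computed at batch size 1
        # the point of the mode: results do not depend on batch size
        assert torch.equal(full[:1], row)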