sglang 0.4.3.post1__py3-none-any.whl → 0.4.3.post3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (219)
  1. sglang/api.py +1 -1
  2. sglang/bench_offline_throughput.py +19 -0
  3. sglang/bench_one_batch.py +2 -2
  4. sglang/bench_serving.py +123 -79
  5. sglang/global_config.py +8 -3
  6. sglang/lang/backend/runtime_endpoint.py +1 -1
  7. sglang/lang/ir.py +1 -1
  8. sglang/srt/_custom_ops.py +83 -91
  9. sglang/srt/configs/load_config.py +4 -1
  10. sglang/srt/configs/model_config.py +48 -2
  11. sglang/srt/configs/qwen2_5_vl_config.py +5 -2
  12. sglang/srt/constrained/base_grammar_backend.py +117 -15
  13. sglang/srt/constrained/llguidance_backend.py +151 -0
  14. sglang/srt/constrained/outlines_backend.py +24 -33
  15. sglang/srt/constrained/xgrammar_backend.py +69 -38
  16. sglang/srt/distributed/device_communicators/custom_all_reduce.py +225 -80
  17. sglang/srt/distributed/parallel_state.py +48 -3
  18. sglang/srt/entrypoints/engine.py +67 -9
  19. sglang/srt/entrypoints/http_server.py +190 -41
  20. sglang/srt/entrypoints/verl_engine.py +147 -0
  21. sglang/srt/function_call_parser.py +0 -1
  22. sglang/srt/layers/activation.py +11 -0
  23. sglang/srt/layers/attention/{__init__.py → base_attn_backend.py} +14 -6
  24. sglang/srt/layers/attention/double_sparsity_backend.py +1 -1
  25. sglang/srt/layers/attention/flashinfer_backend.py +208 -295
  26. sglang/srt/layers/attention/flashinfer_mla_backend.py +582 -0
  27. sglang/srt/layers/attention/torch_native_backend.py +1 -1
  28. sglang/srt/layers/attention/triton_backend.py +9 -6
  29. sglang/srt/layers/attention/triton_ops/decode_attention.py +3 -0
  30. sglang/srt/layers/attention/triton_ops/extend_attention.py +20 -4
  31. sglang/srt/layers/attention/triton_ops/rocm_mla_decode_rope.py +439 -0
  32. sglang/srt/layers/attention/utils.py +39 -0
  33. sglang/srt/layers/attention/vision.py +60 -63
  34. sglang/srt/layers/dp_attention.py +142 -1
  35. sglang/srt/layers/layernorm.py +1 -1
  36. sglang/srt/layers/linear.py +3 -1
  37. sglang/srt/layers/logits_processor.py +281 -45
  38. sglang/srt/layers/moe/ep_moe/kernels.py +126 -8
  39. sglang/srt/layers/moe/ep_moe/layer.py +140 -28
  40. sglang/srt/layers/moe/fused_moe_native.py +2 -0
  41. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  42. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  43. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +50 -50
  44. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json +18 -18
  45. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X.json +18 -18
  46. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Radeon_Graphics.json +18 -18
  47. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json +18 -18
  48. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X.json +18 -18
  49. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Radeon_Graphics.json +18 -18
  50. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +18 -18
  51. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X.json +18 -18
  52. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Radeon_Graphics.json +18 -18
  53. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +16 -16
  54. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +16 -16
  55. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +16 -16
  56. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json +18 -18
  57. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X.json +18 -18
  58. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Radeon_Graphics.json +18 -18
  59. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +15 -15
  60. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +15 -15
  61. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +15 -15
  62. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +88 -20
  63. sglang/srt/layers/moe/fused_moe_triton/layer.py +34 -13
  64. sglang/srt/layers/moe/topk.py +13 -4
  65. sglang/srt/layers/quantization/__init__.py +111 -7
  66. sglang/srt/layers/quantization/blockwise_int8.py +409 -0
  67. sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  68. sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
  69. sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  70. sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
  71. sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  72. sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
  73. sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  74. sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
  75. sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  76. sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  77. sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  78. sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  79. sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
  80. sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  81. sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
  82. sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  83. sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  84. sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  85. sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  86. sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
  87. sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  88. sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
  89. sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  90. sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
  91. sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  92. sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
  93. sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  94. sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
  95. sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  96. sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  97. sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  98. sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  99. sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
  100. sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  101. sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
  102. sglang/srt/layers/quantization/fp8.py +69 -28
  103. sglang/srt/layers/quantization/fp8_utils.py +17 -1
  104. sglang/srt/layers/quantization/gptq.py +416 -0
  105. sglang/srt/layers/quantization/int8_kernel.py +327 -0
  106. sglang/srt/layers/quantization/int8_utils.py +73 -0
  107. sglang/srt/layers/quantization/modelopt_quant.py +18 -1
  108. sglang/srt/layers/radix_attention.py +1 -0
  109. sglang/srt/layers/rotary_embedding.py +0 -1
  110. sglang/srt/layers/sampler.py +76 -31
  111. sglang/srt/layers/vocab_parallel_embedding.py +14 -13
  112. sglang/srt/lora/lora.py +17 -1
  113. sglang/srt/lora/lora_config.py +5 -0
  114. sglang/srt/lora/lora_manager.py +1 -3
  115. sglang/srt/managers/cache_controller.py +193 -62
  116. sglang/srt/managers/configure_logging.py +2 -1
  117. sglang/srt/managers/data_parallel_controller.py +6 -2
  118. sglang/srt/managers/detokenizer_manager.py +124 -102
  119. sglang/srt/managers/image_processor.py +2 -1
  120. sglang/srt/managers/io_struct.py +143 -6
  121. sglang/srt/managers/schedule_batch.py +238 -197
  122. sglang/srt/managers/schedule_policy.py +29 -29
  123. sglang/srt/managers/scheduler.py +681 -259
  124. sglang/srt/managers/session_controller.py +6 -2
  125. sglang/srt/managers/tokenizer_manager.py +224 -68
  126. sglang/srt/managers/tp_worker.py +15 -4
  127. sglang/srt/managers/tp_worker_overlap_thread.py +3 -4
  128. sglang/srt/mem_cache/chunk_cache.py +18 -11
  129. sglang/srt/mem_cache/hiradix_cache.py +394 -0
  130. sglang/srt/mem_cache/memory_pool.py +44 -18
  131. sglang/srt/mem_cache/radix_cache.py +58 -47
  132. sglang/srt/metrics/collector.py +94 -36
  133. sglang/srt/model_executor/cuda_graph_runner.py +55 -24
  134. sglang/srt/model_executor/forward_batch_info.py +49 -16
  135. sglang/srt/model_executor/model_runner.py +209 -28
  136. sglang/srt/model_loader/loader.py +3 -3
  137. sglang/srt/model_loader/weight_utils.py +36 -14
  138. sglang/srt/models/baichuan.py +31 -6
  139. sglang/srt/models/chatglm.py +39 -7
  140. sglang/srt/models/commandr.py +29 -5
  141. sglang/srt/models/dbrx.py +31 -5
  142. sglang/srt/models/deepseek.py +43 -6
  143. sglang/srt/models/deepseek_nextn.py +32 -19
  144. sglang/srt/models/deepseek_v2.py +265 -29
  145. sglang/srt/models/exaone.py +19 -9
  146. sglang/srt/models/gemma.py +22 -8
  147. sglang/srt/models/gemma2.py +25 -12
  148. sglang/srt/models/gemma2_reward.py +5 -1
  149. sglang/srt/models/gpt2.py +28 -13
  150. sglang/srt/models/gpt_bigcode.py +27 -5
  151. sglang/srt/models/granite.py +21 -9
  152. sglang/srt/models/grok.py +21 -4
  153. sglang/srt/models/internlm2.py +36 -6
  154. sglang/srt/models/internlm2_reward.py +5 -1
  155. sglang/srt/models/llama.py +26 -9
  156. sglang/srt/models/llama_classification.py +5 -1
  157. sglang/srt/models/llama_eagle.py +17 -4
  158. sglang/srt/models/llama_embedding.py +5 -1
  159. sglang/srt/models/llama_reward.py +7 -2
  160. sglang/srt/models/llava.py +19 -3
  161. sglang/srt/models/llavavid.py +10 -1
  162. sglang/srt/models/minicpm.py +26 -2
  163. sglang/srt/models/minicpm3.py +39 -3
  164. sglang/srt/models/minicpmv.py +45 -14
  165. sglang/srt/models/mixtral.py +20 -9
  166. sglang/srt/models/mixtral_quant.py +50 -8
  167. sglang/srt/models/mllama.py +57 -11
  168. sglang/srt/models/olmo.py +34 -6
  169. sglang/srt/models/olmo2.py +34 -13
  170. sglang/srt/models/olmoe.py +26 -4
  171. sglang/srt/models/phi3_small.py +29 -10
  172. sglang/srt/models/qwen.py +26 -3
  173. sglang/srt/models/qwen2.py +26 -4
  174. sglang/srt/models/qwen2_5_vl.py +46 -8
  175. sglang/srt/models/qwen2_eagle.py +17 -5
  176. sglang/srt/models/qwen2_moe.py +44 -6
  177. sglang/srt/models/qwen2_rm.py +78 -0
  178. sglang/srt/models/qwen2_vl.py +39 -8
  179. sglang/srt/models/stablelm.py +32 -5
  180. sglang/srt/models/torch_native_llama.py +5 -2
  181. sglang/srt/models/xverse.py +21 -9
  182. sglang/srt/models/xverse_moe.py +45 -7
  183. sglang/srt/models/yivl.py +2 -1
  184. sglang/srt/openai_api/adapter.py +109 -24
  185. sglang/srt/openai_api/protocol.py +17 -1
  186. sglang/srt/reasoning_parser.py +154 -0
  187. sglang/srt/sampling/penaltylib/__init__.py +4 -6
  188. sglang/srt/sampling/penaltylib/frequency_penalty.py +66 -0
  189. sglang/srt/sampling/penaltylib/{penalizers/min_new_tokens.py → min_new_tokens.py} +15 -23
  190. sglang/srt/sampling/penaltylib/orchestrator.py +39 -188
  191. sglang/srt/sampling/penaltylib/presence_penalty.py +66 -0
  192. sglang/srt/sampling/sampling_batch_info.py +79 -157
  193. sglang/srt/sampling/sampling_params.py +16 -13
  194. sglang/srt/server_args.py +136 -52
  195. sglang/srt/speculative/build_eagle_tree.py +2 -8
  196. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +0 -1
  197. sglang/srt/speculative/eagle_utils.py +92 -58
  198. sglang/srt/speculative/eagle_worker.py +186 -94
  199. sglang/srt/speculative/spec_info.py +1 -13
  200. sglang/srt/utils.py +43 -17
  201. sglang/srt/warmup.py +47 -0
  202. sglang/test/few_shot_gsm8k.py +4 -1
  203. sglang/test/runners.py +389 -126
  204. sglang/test/send_one.py +88 -0
  205. sglang/test/test_block_fp8_ep.py +361 -0
  206. sglang/test/test_programs.py +1 -1
  207. sglang/test/test_utils.py +138 -84
  208. sglang/utils.py +50 -60
  209. sglang/version.py +1 -1
  210. {sglang-0.4.3.post1.dist-info → sglang-0.4.3.post3.dist-info}/METADATA +21 -15
  211. {sglang-0.4.3.post1.dist-info → sglang-0.4.3.post3.dist-info}/RECORD +214 -166
  212. {sglang-0.4.3.post1.dist-info → sglang-0.4.3.post3.dist-info}/WHEEL +1 -1
  213. sglang/bench_latency.py +0 -1
  214. sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +0 -75
  215. sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +0 -74
  216. sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +0 -85
  217. sglang/test/srt/sampling/penaltylib/utils.py +0 -344
  218. {sglang-0.4.3.post1.dist-info → sglang-0.4.3.post3.dist-info}/LICENSE +0 -0
  219. {sglang-0.4.3.post1.dist-info → sglang-0.4.3.post3.dist-info}/top_level.txt +0 -0
sglang/api.py CHANGED
@@ -94,7 +94,7 @@ def gen(
     regex: Optional[str] = None,
     json_schema: Optional[str] = None,
 ):
-    """Call the model to generate. See the meaning of the arguments in docs/sampling_params.md"""
+    """Call the model to generate. See the meaning of the arguments in docs/backend/sampling_params.md"""
 
     if choices:
         return SglSelect(
sglang/bench_offline_throughput.py CHANGED
@@ -56,6 +56,7 @@ class BenchArgs:
     profile: bool = False
     skip_warmup: bool = False
     do_not_exit: bool = False
+    prompt_suffix: str = ""
 
     @staticmethod
     def add_cli_args(parser: argparse.ArgumentParser):
@@ -177,6 +178,12 @@ class BenchArgs:
             action="store_true",
             help="Do not exit the program. This is useful for nsys profile with --duration and --delay.",
         )
+        parser.add_argument(
+            "--prompt-suffix",
+            type=str,
+            default="",
+            help="Suffix applied to the end of all user prompts, followed by assistant prompt suffix.",
+        )
 
     @classmethod
     def from_cli_args(cls, args: argparse.Namespace):
@@ -216,6 +223,10 @@ def throughput_test_once(
     ]
 
     if profile:
+        assert (
+            "SGLANG_TORCH_PROFILER_DIR" in os.environ
+        ), "Please set SGLANG_TORCH_PROFILER_DIR."
+        os.makedirs(os.environ["SGLANG_TORCH_PROFILER_DIR"], exist_ok=True)
         backend.start_profile()
 
     st = time.perf_counter()
@@ -229,6 +240,8 @@
     if backend_name == "runtime":
         gen_out = json.loads(gen_out)
 
+    server_info = backend.get_server_info()
+
     measurement_results["total_latency"] = latency
     measurement_results["total_output_tokens"] = sum(
         o["meta_info"]["completion_tokens"] for o in gen_out
@@ -246,6 +259,7 @@
         measurement_results["total_input_tokens"]
         + measurement_results["total_output_tokens"]
     ) / latency
+    measurement_results["last_gen_throughput"] = server_info["last_gen_throughput"]
 
     return measurement_results
 
@@ -361,6 +375,11 @@
     print(
         "{:<40} {:<10}".format("Total generated tokens:", result["total_output_tokens"])
     )
+    print(
+        "{:<40} {:<10.2f}".format(
+            "Last generation throughput (tok/s):", result["last_gen_throughput"]
+        )
+    )
     print(
         "{:<40} {:<10.2f}".format(
             "Request throughput (req/s):", result["request_throughput"]
sglang/bench_one_batch.py CHANGED
@@ -230,7 +230,7 @@ def extend(reqs, model_runner):
     batch = ScheduleBatch.init_new(
         reqs=reqs,
         req_to_token_pool=model_runner.req_to_token_pool,
-        token_to_kv_pool=model_runner.token_to_kv_pool,
+        token_to_kv_pool_allocator=model_runner.token_to_kv_pool_allocator,
         tree_cache=None,
         model_config=model_runner.model_config,
         enable_overlap=False,
@@ -326,7 +326,7 @@ def latency_test_run_once(
 
     # Clear the pools.
     model_runner.req_to_token_pool.clear()
-    model_runner.token_to_kv_pool.clear()
+    model_runner.token_to_kv_pool_allocator.clear()
 
     measurement_results = {
         "run_name": run_name,
sglang/bench_serving.py CHANGED
@@ -8,7 +8,6 @@ Usage:
    python3 -m sglang.bench_serving --backend sglang --num-prompt 10
 
    python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompts 3000 --random-input 1024 --random-output 1024 --random-range-ratio 0.5
-   python3 -m sglang.bench_serving --backend sglang --dataset-name random --request-rate-range 1,2,4,8,16,32 --random-input 4096 --random-output 1024 --random-range-ratio 0.125 --multi
 """
 
 import argparse
@@ -40,6 +39,7 @@ from transformers import (
 )
 
 AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60)
+ASSISTANT_SUFFIX = "Assistant:"
 
 global args
 
@@ -71,7 +71,19 @@ def remove_prefix(text: str, prefix: str) -> str:
     return text[len(prefix) :] if text.startswith(prefix) else text
 
 
-# trt llm not support ignore_eos
+def remove_suffix(text: str, suffix: str) -> str:
+    return text[: -len(suffix)] if text.endswith(suffix) else text
+
+
+def get_auth_headers() -> Dict[str, str]:
+    api_key = os.environ.get("OPENAI_API_KEY")
+    if api_key:
+        return {"Authorization": f"Bearer {api_key}"}
+    else:
+        return {}
+
+
+# trt llm does not support ignore_eos
 # https://github.com/triton-inference-server/tensorrtllm_backend/issues/505
 async def async_request_trt_llm(
     request_func_input: RequestFuncInput,
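Note on get_auth_headers() above: the OpenAI-style request functions previously always sent "Authorization: Bearer None" when OPENAI_API_KEY was unset; the helper now omits the header entirely. A small sketch of both branches (assumes the helper is importable from sglang.bench_serving):

    import os
    from sglang.bench_serving import get_auth_headers

    os.environ.pop("OPENAI_API_KEY", None)
    assert get_auth_headers() == {}  # no key -> no Authorization header at all

    os.environ["OPENAI_API_KEY"] = "sk-example"  # made-up key
    assert get_auth_headers() == {"Authorization": "Bearer sk-example"}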
@@ -165,12 +177,13 @@ async def async_request_openai_completions(
         "ignore_eos": not args.disable_ignore_eos,
         **request_func_input.extra_request_body,
     }
-    headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}
+    headers = get_auth_headers()
 
     output = RequestFuncOutput()
     output.prompt_len = request_func_input.prompt_len
 
     generated_text = ""
+    output_len = request_func_input.output_len
     ttft = 0.0
     st = time.perf_counter()
     most_recent_timestamp = st
@@ -207,11 +220,14 @@
 
                         most_recent_timestamp = timestamp
                         generated_text += data["choices"][0]["text"]
+                        output_len = data.get("usage", {}).get(
+                            "completion_tokens", output_len
+                        )
 
                 output.generated_text = generated_text
                 output.success = True
                 output.latency = latency
-                output.output_len = request_func_input.output_len
+                output.output_len = output_len
             else:
                 output.error = response.reason or ""
                 output.success = False
@@ -244,7 +260,7 @@ async def async_request_truss(
         "ignore_eos": not args.disable_ignore_eos,
         **request_func_input.extra_request_body,
     }
-    headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}
+    headers = get_auth_headers()
 
     output = RequestFuncOutput()
     output.prompt_len = request_func_input.prompt_len
@@ -325,15 +341,17 @@ async def async_request_sglang_generate(
         "logprob_start_len": -1,
         **request_func_input.extra_request_body,
     }
-    headers = {}
+    headers = get_auth_headers()
 
     output = RequestFuncOutput()
     output.prompt_len = request_func_input.prompt_len
 
     generated_text = ""
+    output_len = request_func_input.output_len
     ttft = 0.0
     st = time.perf_counter()
     most_recent_timestamp = st
+    last_output_len = 0
     try:
         async with session.post(
             url=api_url, json=payload, headers=headers
@@ -357,6 +375,9 @@
                     # want to check a token was generated
                     if data["text"]:
                         timestamp = time.perf_counter()
+                        generated_text = data["text"]
+                        output_len = data["meta_info"]["completion_tokens"]
+
                         # First token
                         if ttft == 0.0:
                             ttft = time.perf_counter() - st
@@ -364,15 +385,21 @@
 
                         # Decoding phase
                         else:
-                            output.itl.append(timestamp - most_recent_timestamp)
+                            num_new_tokens = output_len - last_output_len
+                            if num_new_tokens == 0:
+                                continue
+                            adjust_itl = (
+                                timestamp - most_recent_timestamp
+                            ) / num_new_tokens
+                            output.itl.extend([adjust_itl] * num_new_tokens)
 
                         most_recent_timestamp = timestamp
-                        generated_text = data["text"]
+                        last_output_len = output_len
 
                 output.generated_text = generated_text
                 output.success = True
                 output.latency = latency
-                output.output_len = request_func_input.output_len
+                output.output_len = output_len
             else:
                 output.error = response.reason or ""
                 output.success = False
@@ -380,6 +407,7 @@
         output.success = False
         exc_info = sys.exc_info()
         output.error = "".join(traceback.format_exception(*exc_info))
+        print(f"{output.error=}")
 
     if pbar:
         pbar.update(1)
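Note on the ITL change above: when one streamed chunk advances completion_tokens by more than one, the elapsed gap is now split evenly across the new tokens instead of being recorded as a single large inter-token latency. A self-contained sketch with made-up timestamps:

    # One chunk arrives 30 ms after the previous one and carries 3 new tokens.
    timestamp, most_recent_timestamp = 0.230, 0.200  # seconds (made-up values)
    output_len, last_output_len = 7, 4  # completion_tokens advanced by 3

    itl = []
    num_new_tokens = output_len - last_output_len
    adjust_itl = (timestamp - most_recent_timestamp) / num_new_tokens
    itl.extend([adjust_itl] * num_new_tokens)
    print(itl)  # three ~0.010 s samples instead of one 0.030 s sample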
@@ -453,6 +481,7 @@ def get_dataset(args, tokenizer):
             tokenizer=tokenizer,
             fixed_output_len=args.sharegpt_output_len,
             context_len=args.sharegpt_context_len,
+            prompt_suffix=args.prompt_suffix,
             apply_chat_template=args.apply_chat_template,
         )
     elif args.dataset_name == "random":
@@ -513,7 +542,9 @@ class BenchmarkMetrics:
     mean_itl_ms: float
     median_itl_ms: float
     std_itl_ms: float
+    p95_itl_ms: float
     p99_itl_ms: float
+    max_itl_ms: float
     mean_e2e_latency_ms: float
     median_e2e_latency_ms: float
     std_e2e_latency_ms: float
@@ -564,6 +595,7 @@ def sample_sharegpt_requests(
     tokenizer: PreTrainedTokenizerBase,
     fixed_output_len: Optional[int] = None,
     context_len: Optional[int] = None,
+    prompt_suffix: Optional[str] = "",
     apply_chat_template=False,
 ) -> List[Tuple[str, int, int]]:
     if fixed_output_len is not None and fixed_output_len < 4:
@@ -576,11 +608,19 @@
     # Load the dataset.
     with open(dataset_path) as f:
         dataset = json.load(f)
+
     # Filter out the conversations with less than 2 turns.
-    dataset = [data for data in dataset if len(data["conversations"]) >= 2]
+    dataset = [
+        data
+        for data in dataset
+        if len(data.get("conversations", data.get("conversation", []))) >= 2
+    ]
     # Only keep the first two turns of each conversation.
     dataset = [
-        (data["conversations"][0]["value"], data["conversations"][1]["value"])
+        (
+            data.get("conversations", data.get("conversation", []))[0]["value"],
+            data.get("conversations", data.get("conversation", []))[1]["value"],
+        )
         for data in dataset
     ]
 
@@ -595,6 +635,12 @@
 
         # Tokenize the prompts and completions.
         prompt = dataset[i][0]
+        if prompt_suffix:
+            prompt = (
+                remove_suffix(prompt, ASSISTANT_SUFFIX)
+                + prompt_suffix
+                + ASSISTANT_SUFFIX
+            )
 
         if apply_chat_template:
             prompt = tokenizer.apply_chat_template(
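Note on the prompt_suffix handling above: the suffix is spliced in before a trailing "Assistant:" marker rather than appended after it, so the assistant cue stays last. A quick illustration with a made-up ShareGPT-style prompt:

    ASSISTANT_SUFFIX = "Assistant:"

    def remove_suffix(text: str, suffix: str) -> str:
        return text[: -len(suffix)] if text.endswith(suffix) else text

    prompt = "Human: What is 2 + 2?\n\nAssistant:"  # made-up example
    prompt_suffix = " Answer in one word."
    prompt = remove_suffix(prompt, ASSISTANT_SUFFIX) + prompt_suffix + ASSISTANT_SUFFIX
    print(prompt)  # ends with " Answer in one word.Assistant:"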
@@ -658,10 +704,17 @@
         with open(dataset_path) as f:
             dataset = json.load(f)
         # Filter out the conversations with less than 2 turns.
-        dataset = [data for data in dataset if len(data["conversations"]) >= 2]
+        dataset = [
+            data
+            for data in dataset
+            if len(data.get("conversations", data.get("conversation", []))) >= 2
+        ]
         # Only keep the first two turns of each conversation.
         dataset = [
-            (data["conversations"][0]["value"], data["conversations"][1]["value"])
+            (
+                data.get("conversations", data.get("conversation", []))[0]["value"],
+                data.get("conversations", data.get("conversation", []))[1]["value"],
+            )
             for data in dataset
         ]
         # Shuffle the dataset.
@@ -887,7 +940,9 @@ def calculate_metrics(
         mean_itl_ms=np.mean(itls or 0) * 1000,
         median_itl_ms=np.median(itls or 0) * 1000,
         std_itl_ms=np.std(itls or 0) * 1000,
+        p95_itl_ms=np.percentile(itls or 0, 95) * 1000,
         p99_itl_ms=np.percentile(itls or 0, 99) * 1000,
+        max_itl_ms=np.max(itls or 0) * 1000,
         mean_e2e_latency_ms=np.mean(e2e_latencies) * 1000,
         median_e2e_latency_ms=np.median(e2e_latencies) * 1000,
         std_e2e_latency_ms=np.std(e2e_latencies) * 1000,
@@ -911,6 +966,7 @@
     lora_name: str,
     extra_request_body: Dict[str, Any],
     profile: bool,
+    pd_seperated: bool = False,
 ):
     if backend in ASYNC_REQUEST_FUNCS:
         request_func = ASYNC_REQUEST_FUNCS[backend]
@@ -996,6 +1052,17 @@
     if pbar is not None:
         pbar.close()
 
+    if "sglang" in backend:
+        server_info = requests.get(base_url + "/get_server_info")
+        if pd_seperated:
+            accept_length = server_info.json()["decode"][0].get(
+                "avg_spec_accept_length", None
+            )
+        else:
+            accept_length = server_info.json().get("avg_spec_accept_length", None)
+    else:
+        accept_length = None
+
     # Compute metrics and print results
     benchmark_duration = time.perf_counter() - benchmark_start_time
     metrics, output_lens = calculate_metrics(
@@ -1045,6 +1112,8 @@
         )
     )
     print("{:<40} {:<10.2f}".format("Concurrency:", metrics.concurrency))
+    if accept_length:
+        print("{:<40} {:<10.2f}".format("Accept length:", accept_length))
     print("{s:{c}^{n}}".format(s="End-to-End Latency", n=50, c="-"))
     print(
         "{:<40} {:<10.2f}".format("Mean E2E Latency (ms):", metrics.mean_e2e_latency_ms)
@@ -1058,16 +1127,12 @@
     print("{:<40} {:<10.2f}".format("Mean TTFT (ms):", metrics.mean_ttft_ms))
     print("{:<40} {:<10.2f}".format("Median TTFT (ms):", metrics.median_ttft_ms))
     print("{:<40} {:<10.2f}".format("P99 TTFT (ms):", metrics.p99_ttft_ms))
-    print(
-        "{s:{c}^{n}}".format(s="Time per Output Token (excl. 1st token)", n=50, c="-")
-    )
-    print("{:<40} {:<10.2f}".format("Mean TPOT (ms):", metrics.mean_tpot_ms))
-    print("{:<40} {:<10.2f}".format("Median TPOT (ms):", metrics.median_tpot_ms))
-    print("{:<40} {:<10.2f}".format("P99 TPOT (ms):", metrics.p99_tpot_ms))
-    print("{s:{c}^{n}}".format(s="Inter-token Latency", n=50, c="-"))
+    print("{s:{c}^{n}}".format(s="Inter-Token Latency", n=50, c="-"))
     print("{:<40} {:<10.2f}".format("Mean ITL (ms):", metrics.mean_itl_ms))
     print("{:<40} {:<10.2f}".format("Median ITL (ms):", metrics.median_itl_ms))
+    print("{:<40} {:<10.2f}".format("P95 ITL (ms):", metrics.p95_itl_ms))
     print("{:<40} {:<10.2f}".format("P99 ITL (ms):", metrics.p99_itl_ms))
+    print("{:<40} {:<10.2f}".format("Max ITL (ms):", metrics.max_itl_ms))
     print("=" * 50)
 
     if (
@@ -1109,8 +1174,10 @@
             "mean_itl_ms": metrics.mean_itl_ms,
             "median_itl_ms": metrics.median_itl_ms,
             "std_itl_ms": metrics.std_itl_ms,
+            "p95_itl_ms": metrics.p95_itl_ms,
             "p99_itl_ms": metrics.p99_itl_ms,
             "concurrency": metrics.concurrency,
+            "accept_length": accept_length,
         }
     else:
         print(f"Error running benchmark for request rate: {request_rate}")
@@ -1143,14 +1210,6 @@
     return result
 
 
-def parse_request_rate_range(request_rate_range):
-    if len(request_rate_range.split(",")) == 3:
-        start, stop, step = map(int, request_rate_range.split(","))
-        return list(range(start, stop, step))
-    else:
-        return list(map(int, request_rate_range.split(",")))
-
-
 def check_chat_template(model_path):
     try:
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
@@ -1160,6 +1219,12 @@
         return False
 
 
+def set_global_args(args_: argparse.Namespace):
+    """Set the global args."""
+    global args
+    args = args_
+
+
 def run_benchmark(args_: argparse.Namespace):
     global args
     args = args_
@@ -1168,6 +1233,8 @@
     if not hasattr(args, "max_concurrency"):
         args.max_concurrency = None
 
+    print(f"benchmark_args={args}")
+
     # Set global environments
     set_ulimit()
     random.seed(args.seed)
@@ -1238,7 +1305,7 @@
             )
             sys.exit(1)
         try:
-            response = requests.get(model_url)
+            response = requests.get(model_url, headers=get_auth_headers())
             model_list = response.json().get("data", [])
             args.model = model_list[0]["id"] if model_list else None
         except Exception as e:
@@ -1264,49 +1331,26 @@
     backend = args.backend
     model_id = args.model
     tokenizer_id = args.tokenizer if args.tokenizer is not None else args.model
-
     tokenizer = get_tokenizer(tokenizer_id)
-
     input_requests = get_dataset(args, tokenizer)
 
-    if not args.multi:
-        return asyncio.run(
-            benchmark(
-                backend=backend,
-                api_url=api_url,
-                base_url=base_url,
-                model_id=model_id,
-                tokenizer=tokenizer,
-                input_requests=input_requests,
-                request_rate=args.request_rate,
-                max_concurrency=args.max_concurrency,
-                disable_tqdm=args.disable_tqdm,
-                lora_name=args.lora_name,
-                extra_request_body=extra_request_body,
-                profile=args.profile,
-            )
+    return asyncio.run(
+        benchmark(
+            backend=backend,
+            api_url=api_url,
+            base_url=base_url,
+            model_id=model_id,
+            tokenizer=tokenizer,
+            input_requests=input_requests,
+            request_rate=args.request_rate,
+            max_concurrency=args.max_concurrency,
+            disable_tqdm=args.disable_tqdm,
+            lora_name=args.lora_name,
+            extra_request_body=extra_request_body,
+            profile=args.profile,
+            pd_seperated=args.pd_seperated,
         )
-    else:
-        # Benchmark multiple rps. TODO: use a fixed duration to compute num_prompts
-        request_rates = parse_request_rate_range(args.request_rate_range)
-
-        for rate in request_rates:
-            asyncio.run(
-                benchmark(
-                    backend=backend,
-                    api_url=api_url,
-                    base_url=base_url,
-                    model_id=model_id,
-                    tokenizer=tokenizer,
-                    input_requests=input_requests,
-                    request_rate=rate,
-                    max_concurrency=args.max_concurrency,
-                    disable_tqdm=args.disable_tqdm,
-                    lora_name=args.lora_name,
-                    extra_request_body=extra_request_body,
-                    profile=args.profile,
-                )
-            )
+    )
 
 
 def set_ulimit(target_soft_limit=65535):
@@ -1420,17 +1464,6 @@ if __name__ == "__main__":
         "actual request rate may be lower than specified with --request-rate, "
         "if the server is not processing requests fast enough to keep up.",
     )
-    parser.add_argument(
-        "--multi",
-        action="store_true",
-        help="Use request rate range rather than single value.",
-    )
-    parser.add_argument(
-        "--request-rate-range",
-        type=str,
-        default="2,34,2",
-        help="Range of request rates in the format start,stop,step. Default is 2,34,2. It also supports a list of request rates, requiring the parameters to not equal three.",
-    )
     parser.add_argument("--output-file", type=str, help="Output JSONL file name.")
     parser.add_argument(
         "--disable-tqdm",
@@ -1477,6 +1510,17 @@
         default=None,
         help="The name of LoRA adapter",
     )
+    parser.add_argument(
+        "--prompt-suffix",
+        type=str,
+        default="",
+        help="Suffix applied to the end of all user prompts, followed by assistant prompt suffix.",
+    )
+    parser.add_argument(
+        "--pd-seperated",
+        action="store_true",
+        help="Benchmark PD disaggregation server",
+    )
 
     group = parser.add_argument_group("generated-shared-prefix dataset arguments")
     group.add_argument(
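Note on the removed --multi / --request-rate-range flags: a caller that still wants a request-rate sweep can loop over run_benchmark itself. A hedged sketch (base_args stands in for an argparse.Namespace built from this parser; it is not a name defined in the module):

    import copy

    from sglang.bench_serving import run_benchmark

    for rate in (1, 2, 4, 8, 16, 32):  # example sweep, like the old --multi list form
        sweep_args = copy.deepcopy(base_args)  # base_args: assumed Namespace
        sweep_args.request_rate = rate
        run_benchmark(sweep_args)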
sglang/global_config.py CHANGED
@@ -4,6 +4,13 @@ import os
 
 
 class GlobalConfig:
+    """
+    Store some global constants.
+
+    See also python/sglang/srt/managers/schedule_batch.py::global_server_args_dict, which stores
+    many global runtime arguments as well.
+    """
+
     def __init__(self):
         # Verbosity level
         # 0: do not output anything
@@ -34,11 +41,9 @@ class GlobalConfig:
         self.skip_special_tokens_in_output = True
         self.spaces_between_special_tokens_in_out = True
 
-        # Interpreter optimization configs
+        # Language frontend interpreter optimization configs
         self.enable_precache_with_tracing = True
         self.enable_parallel_encoding = True
 
-        self.enable_flashinfer_mla = False
-
 
 global_config = GlobalConfig()
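Note on the GlobalConfig docstring above: the module exposes one shared instance, so callers mutate attributes on global_config rather than constructing their own. A minimal sketch using a flag visible in this diff:

    from sglang.global_config import global_config

    # Flip one of the language frontend interpreter optimization flags shown above.
    global_config.enable_precache_with_tracing = False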
sglang/lang/backend/runtime_endpoint.py CHANGED
@@ -336,7 +336,7 @@ class Runtime:
     """
     A wrapper for the HTTP server.
     This is used for launching the server in a python program without
-    using the commond line interface.
+    using the command line interface.
 
     It is mainly used for the frontend language.
     You should use the Engine class if you want to do normal offline processing without the frontend language.
sglang/lang/ir.py CHANGED
@@ -457,7 +457,7 @@ class SglGen(SglExpr):
         regex: Optional[str] = None,
         json_schema: Optional[str] = None,
     ):
-        """Call the model to generate. See the meaning of the arguments in docs/sampling_params.md"""
+        """Call the model to generate. See the meaning of the arguments in docs/backend/sampling_params.md"""
         super().__init__()
         self.name = name
         self.sampling_params = SglSamplingParams(