sglang 0.4.6.post4__py3-none-any.whl → 0.4.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (358)
  1. sglang/bench_offline_throughput.py +16 -10
  2. sglang/bench_one_batch.py +5 -4
  3. sglang/bench_one_batch_server.py +86 -22
  4. sglang/bench_serving.py +197 -110
  5. sglang/compile_deep_gemm.py +4 -4
  6. sglang/lang/backend/runtime_endpoint.py +24 -1
  7. sglang/profiler.py +167 -0
  8. sglang/srt/_custom_ops.py +34 -0
  9. sglang/srt/configs/internvl.py +8 -12
  10. sglang/srt/configs/model_config.py +66 -29
  11. sglang/srt/constrained/base_grammar_backend.py +5 -2
  12. sglang/srt/constrained/llguidance_backend.py +9 -8
  13. sglang/srt/constrained/outlines_backend.py +5 -4
  14. sglang/srt/constrained/xgrammar_backend.py +18 -18
  15. sglang/srt/conversation.py +47 -9
  16. sglang/srt/custom_op.py +38 -3
  17. sglang/srt/debug_utils.py +74 -0
  18. sglang/srt/disaggregation/common/__init__.py +1 -0
  19. sglang/srt/disaggregation/common/conn.py +407 -0
  20. sglang/srt/disaggregation/decode.py +187 -134
  21. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +142 -0
  22. sglang/srt/disaggregation/fake/conn.py +4 -13
  23. sglang/srt/disaggregation/kv_events.py +412 -0
  24. sglang/srt/disaggregation/launch_lb.py +140 -0
  25. sglang/srt/disaggregation/mini_lb.py +84 -70
  26. sglang/srt/disaggregation/mooncake/conn.py +441 -140
  27. sglang/srt/disaggregation/mooncake/transfer_engine.py +31 -14
  28. sglang/srt/disaggregation/nixl/conn.py +124 -442
  29. sglang/srt/disaggregation/prefill.py +128 -44
  30. sglang/srt/disaggregation/utils.py +154 -6
  31. sglang/srt/distributed/device_communicators/pymscclpp.py +315 -0
  32. sglang/srt/distributed/parallel_state.py +52 -5
  33. sglang/srt/distributed/utils.py +3 -3
  34. sglang/srt/entrypoints/EngineBase.py +11 -0
  35. sglang/srt/entrypoints/engine.py +129 -12
  36. sglang/srt/entrypoints/http_server.py +21 -6
  37. sglang/srt/entrypoints/http_server_engine.py +5 -2
  38. sglang/srt/function_call/base_format_detector.py +302 -0
  39. sglang/srt/function_call/core_types.py +34 -0
  40. sglang/srt/function_call/deepseekv3_detector.py +205 -0
  41. sglang/srt/function_call/ebnf_composer.py +248 -0
  42. sglang/srt/function_call/function_call_parser.py +202 -0
  43. sglang/srt/function_call/llama32_detector.py +93 -0
  44. sglang/srt/function_call/mistral_detector.py +131 -0
  45. sglang/srt/function_call/pythonic_detector.py +229 -0
  46. sglang/srt/function_call/qwen25_detector.py +121 -0
  47. sglang/srt/function_call/utils.py +52 -0
  48. sglang/srt/hf_transformers_utils.py +50 -7
  49. sglang/srt/layers/attention/aiter_backend.py +878 -0
  50. sglang/srt/layers/attention/base_attn_backend.py +4 -0
  51. sglang/srt/layers/attention/cutlass_mla_backend.py +2 -19
  52. sglang/srt/layers/attention/flashattention_backend.py +166 -35
  53. sglang/srt/layers/attention/flashinfer_backend.py +45 -1
  54. sglang/srt/layers/attention/flashinfer_mla_backend.py +45 -5
  55. sglang/srt/layers/attention/flashmla_backend.py +340 -78
  56. sglang/srt/layers/attention/intel_amx_backend.py +128 -0
  57. sglang/srt/layers/attention/tbo_backend.py +232 -0
  58. sglang/srt/layers/attention/torch_native_backend.py +3 -0
  59. sglang/srt/layers/attention/triton_backend.py +247 -5
  60. sglang/srt/layers/attention/triton_ops/extend_attention.py +12 -4
  61. sglang/srt/layers/attention/utils.py +2 -2
  62. sglang/srt/layers/attention/vision.py +1 -1
  63. sglang/srt/layers/communicator.py +517 -0
  64. sglang/srt/layers/dp_attention.py +6 -15
  65. sglang/srt/layers/layernorm.py +30 -19
  66. sglang/srt/layers/moe/cutlass_moe.py +370 -0
  67. sglang/srt/layers/moe/cutlass_moe_params.py +169 -0
  68. sglang/srt/layers/moe/ep_moe/kernels.py +60 -17
  69. sglang/srt/layers/moe/ep_moe/layer.py +195 -87
  70. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +88 -8
  71. sglang/srt/layers/moe/fused_moe_native.py +4 -0
  72. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  73. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  74. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  75. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  76. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  77. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  78. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  79. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  80. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +220 -25
  81. sglang/srt/layers/moe/fused_moe_triton/layer.py +48 -4
  82. sglang/srt/layers/moe/topk.py +107 -24
  83. sglang/srt/layers/multimodal.py +70 -0
  84. sglang/srt/layers/quantization/__init__.py +10 -4
  85. sglang/srt/layers/quantization/blockwise_int8.py +3 -0
  86. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +5 -0
  87. sglang/srt/layers/quantization/deep_gemm.py +60 -59
  88. sglang/srt/layers/quantization/fp8.py +113 -18
  89. sglang/srt/layers/quantization/fp8_kernel.py +118 -66
  90. sglang/srt/layers/quantization/fp8_utils.py +165 -43
  91. sglang/srt/layers/quantization/gptq.py +298 -6
  92. sglang/srt/layers/quantization/int8_kernel.py +18 -5
  93. sglang/srt/layers/quantization/modelopt_quant.py +334 -7
  94. sglang/srt/layers/quantization/moe_wna16.py +3 -0
  95. sglang/srt/layers/quantization/qoq.py +244 -0
  96. sglang/srt/layers/quantization/w8a8_fp8.py +3 -0
  97. sglang/srt/layers/quantization/w8a8_int8.py +3 -0
  98. sglang/srt/layers/rotary_embedding.py +6 -12
  99. sglang/srt/layers/sampler.py +80 -79
  100. sglang/srt/layers/utils.py +6 -0
  101. sglang/srt/lora/layers.py +12 -15
  102. sglang/srt/lora/lora.py +49 -5
  103. sglang/srt/lora/lora_manager.py +20 -8
  104. sglang/srt/lora/mem_pool.py +24 -16
  105. sglang/srt/lora/utils.py +17 -13
  106. sglang/srt/managers/data_parallel_controller.py +13 -5
  107. sglang/srt/managers/eplb_algorithms/__init__.py +63 -0
  108. sglang/srt/managers/eplb_algorithms/deepseek.py +223 -0
  109. sglang/srt/managers/eplb_algorithms/deepseek_vec.py +276 -0
  110. sglang/srt/managers/eplb_manager.py +96 -0
  111. sglang/srt/managers/expert_distribution.py +878 -56
  112. sglang/srt/managers/expert_location.py +448 -0
  113. sglang/srt/managers/expert_location_dispatch.py +108 -0
  114. sglang/srt/managers/io_struct.py +29 -5
  115. sglang/srt/managers/mm_utils.py +355 -151
  116. sglang/srt/managers/multimodal_processors/base_processor.py +299 -42
  117. sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +6 -1
  118. sglang/srt/managers/multimodal_processors/gemma3.py +15 -17
  119. sglang/srt/managers/multimodal_processors/internvl.py +18 -5
  120. sglang/srt/managers/multimodal_processors/janus_pro.py +7 -1
  121. sglang/srt/managers/multimodal_processors/kimi_vl.py +14 -32
  122. sglang/srt/managers/multimodal_processors/llava.py +3 -3
  123. sglang/srt/managers/multimodal_processors/minicpm.py +27 -32
  124. sglang/srt/managers/multimodal_processors/mllama4.py +6 -0
  125. sglang/srt/managers/multimodal_processors/phi4mm.py +87 -0
  126. sglang/srt/managers/multimodal_processors/pixtral.py +9 -9
  127. sglang/srt/managers/multimodal_processors/qwen_vl.py +35 -35
  128. sglang/srt/managers/schedule_batch.py +185 -55
  129. sglang/srt/managers/schedule_policy.py +4 -5
  130. sglang/srt/managers/scheduler.py +389 -154
  131. sglang/srt/managers/session_controller.py +1 -1
  132. sglang/srt/managers/tokenizer_manager.py +231 -39
  133. sglang/srt/managers/utils.py +0 -4
  134. sglang/srt/mem_cache/base_prefix_cache.py +3 -0
  135. sglang/srt/mem_cache/chunk_cache.py +3 -1
  136. sglang/srt/mem_cache/hiradix_cache.py +4 -4
  137. sglang/srt/mem_cache/memory_pool.py +74 -52
  138. sglang/srt/mem_cache/multimodal_cache.py +45 -0
  139. sglang/srt/mem_cache/radix_cache.py +58 -5
  140. sglang/srt/metrics/collector.py +11 -2
  141. sglang/srt/mm_utils.py +10 -0
  142. sglang/srt/model_executor/cuda_graph_runner.py +87 -65
  143. sglang/srt/model_executor/expert_location_updater.py +557 -0
  144. sglang/srt/model_executor/forward_batch_info.py +39 -14
  145. sglang/srt/model_executor/model_runner.py +231 -101
  146. sglang/srt/model_loader/loader.py +10 -6
  147. sglang/srt/model_loader/utils.py +67 -1
  148. sglang/srt/models/clip.py +5 -1
  149. sglang/srt/models/deepseek_nextn.py +1 -1
  150. sglang/srt/models/deepseek_v2.py +732 -403
  151. sglang/srt/models/exaone.py +8 -3
  152. sglang/srt/models/gemma3_causal.py +7 -0
  153. sglang/srt/models/gemma3_mm.py +75 -33
  154. sglang/srt/models/idefics2.py +342 -0
  155. sglang/srt/models/kimi_vl.py +4 -4
  156. sglang/srt/models/llama.py +1 -1
  157. sglang/srt/models/llama4.py +10 -2
  158. sglang/srt/models/llava.py +26 -18
  159. sglang/srt/models/mimo_mtp.py +220 -0
  160. sglang/srt/models/minicpmo.py +7 -17
  161. sglang/srt/models/minicpmv.py +3 -295
  162. sglang/srt/models/mistral.py +71 -1
  163. sglang/srt/models/mllama.py +3 -3
  164. sglang/srt/models/phi4mm.py +512 -0
  165. sglang/srt/models/qwen2.py +133 -35
  166. sglang/srt/models/qwen2_5_vl.py +5 -3
  167. sglang/srt/models/qwen2_eagle.py +4 -1
  168. sglang/srt/models/qwen2_moe.py +206 -69
  169. sglang/srt/models/qwen2_vl.py +3 -3
  170. sglang/srt/models/qwen3.py +92 -19
  171. sglang/srt/models/qwen3_moe.py +457 -55
  172. sglang/srt/models/registry.py +9 -1
  173. sglang/srt/models/siglip.py +294 -0
  174. sglang/srt/models/transformers.py +291 -0
  175. sglang/srt/openai_api/adapter.py +114 -40
  176. sglang/srt/openai_api/protocol.py +37 -2
  177. sglang/srt/openai_api/utils.py +172 -0
  178. sglang/srt/operations.py +189 -0
  179. sglang/srt/operations_strategy.py +207 -0
  180. sglang/srt/sampling/sampling_batch_info.py +13 -1
  181. sglang/srt/sampling/sampling_params.py +2 -1
  182. sglang/srt/server_args.py +235 -38
  183. sglang/srt/speculative/build_eagle_tree.py +8 -8
  184. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +8 -11
  185. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +253 -0
  186. sglang/srt/speculative/eagle_utils.py +181 -90
  187. sglang/srt/speculative/eagle_worker.py +146 -21
  188. sglang/srt/two_batch_overlap.py +635 -0
  189. sglang/srt/utils.py +197 -19
  190. sglang/test/runners.py +16 -7
  191. sglang/test/send_one.py +4 -0
  192. sglang/test/test_cutlass_moe.py +278 -0
  193. sglang/test/test_fp4_moe.py +248 -0
  194. sglang/test/test_utils.py +81 -42
  195. sglang/utils.py +2 -2
  196. sglang/version.py +1 -1
  197. {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/METADATA +31 -19
  198. sglang-0.4.7.dist-info/RECORD +699 -0
  199. {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/WHEEL +1 -1
  200. sglang/srt/function_call_parser.py +0 -858
  201. sglang/srt/platforms/interface.py +0 -371
  202. sglang-0.4.6.post4.dist-info/RECORD +0 -646
  203. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  204. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  205. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  206. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  207. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  208. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  209. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  210. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  211. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  212. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  213. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  214. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  215. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  216. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H200.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json} +0 -0
  217. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  218. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  219. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  220. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  221. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  222. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  223. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  224. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  225. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  226. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  227. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  228. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  229. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  230. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  231. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  232. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  233. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  234. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  235. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  236. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  237. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  238. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  239. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  240. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  241. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  242. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  243. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  244. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  245. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json} +0 -0
  246. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  247. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  248. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  249. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  250. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  251. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  252. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  253. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  254. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  255. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json} +0 -0
  256. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json} +0 -0
  257. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  258. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  259. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  260. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  261. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  262. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  263. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json} +0 -0
  264. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  265. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  266. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json} +0 -0
  267. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  268. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  269. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  270. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json} +0 -0
  271. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  272. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  273. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  274. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
  275. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  276. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  277. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  278. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json} +0 -0
  279. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json} +0 -0
  280. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json} +0 -0
  281. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json} +0 -0
  282. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  283. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  284. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json} +0 -0
  285. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json} +0 -0
  286. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json} +0 -0
  287. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json} +0 -0
  288. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  289. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  290. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  291. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  292. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json} +0 -0
  293. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  294. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  295. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  296. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  297. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json} +0 -0
  298. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json} +0 -0
  299. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json} +0 -0
  300. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json} +0 -0
  301. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  302. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  303. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
  304. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  305. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  306. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  307. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json} +0 -0
  308. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_L40S.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json} +0 -0
  309. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
  310. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
  311. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
  312. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  313. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  314. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  315. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  316. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json} +0 -0
  317. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json} +0 -0
  318. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json} +0 -0
  319. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json} +0 -0
  320. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  321. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  322. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  323. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  324. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json} +0 -0
  325. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
  326. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
  327. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
  328. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  329. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  330. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  331. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  332. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json} +0 -0
  333. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json} +0 -0
  334. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  335. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  336. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json} +0 -0
  337. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  338. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json} +0 -0
  339. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  340. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  341. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  342. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  343. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json} +0 -0
  344. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  345. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json} +0 -0
  346. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=96,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json} +0 -0
  347. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  348. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  349. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  350. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  351. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  352. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  353. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  354. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  355. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  356. /sglang/srt/models/{xiaomi_mimo.py → mimo.py} +0 -0
  357. {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/licenses/LICENSE +0 -0
  358. {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/top_level.txt +0 -0
sglang/bench_serving.py CHANGED
@@ -24,6 +24,7 @@ import warnings
24
24
  from argparse import ArgumentParser
25
25
  from dataclasses import dataclass, field
26
26
  from datetime import datetime
27
+ from json import JSONDecodeError
27
28
  from pathlib import Path
28
29
  from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple, Union
29
30
 
@@ -38,7 +39,6 @@ from transformers import (
38
39
  PreTrainedTokenizerFast,
39
40
  )
40
41
 
41
- AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60)
42
42
  ASSISTANT_SUFFIX = "Assistant:"
43
43
 
44
44
  global args
@@ -50,6 +50,19 @@ def _get_bool_env_var(name: str, default: str = "false") -> bool:
50
50
  return value.lower() in ("true", "1")
51
51
 
52
52
 
53
+ def _create_bench_client_session():
54
+ # When the pressure is big, the read buffer could be full before aio thread read
55
+ # the content. We increase the read_bufsize from 64K to 10M.
56
+ # Define constants for timeout and buffer size for clarity and maintainability
57
+ BENCH_AIOHTTP_TIMEOUT_SECONDS = 6 * 60 * 60 # 6 hours
58
+ BENCH_AIOHTTP_READ_BUFSIZE_BYTES = 10 * 1024**2 # 10 MB
59
+
60
+ aiohttp_timeout = aiohttp.ClientTimeout(total=BENCH_AIOHTTP_TIMEOUT_SECONDS)
61
+ return aiohttp.ClientSession(
62
+ timeout=aiohttp_timeout, read_bufsize=BENCH_AIOHTTP_READ_BUFSIZE_BYTES
63
+ )
64
+
65
+
53
66
  @dataclass
54
67
  class RequestFuncInput:
55
68
  prompt: str
@@ -73,6 +86,12 @@ class RequestFuncOutput:
73
86
  error: str = ""
74
87
  output_len: int = 0
75
88
 
89
+ @staticmethod
90
+ def init_new(request_func_input: RequestFuncInput):
91
+ output = RequestFuncOutput()
92
+ output.prompt_len = request_func_input.prompt_len
93
+ return output
94
+
76
95
 
77
96
  def remove_prefix(text: str, prefix: str) -> str:
78
97
  return text[len(prefix) :] if text.startswith(prefix) else text
@@ -99,7 +118,7 @@ async def async_request_trt_llm(
99
118
  api_url = request_func_input.api_url
100
119
  assert api_url.endswith("generate_stream")
101
120
 
102
- async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
121
+ async with _create_bench_client_session() as session:
103
122
  payload = {
104
123
  "accumulate_tokens": True,
105
124
  "text_input": request_func_input.prompt,
@@ -114,8 +133,7 @@ async def async_request_trt_llm(
114
133
  if args.disable_ignore_eos:
115
134
  del payload["min_length"]
116
135
  del payload["end_id"]
117
- output = RequestFuncOutput()
118
- output.prompt_len = request_func_input.prompt_len
136
+ output = RequestFuncOutput.init_new(request_func_input)
119
137
 
120
138
  ttft = 0.0
121
139
  st = time.perf_counter()
@@ -173,7 +191,7 @@ async def async_request_openai_completions(
173
191
 
174
192
  prompt = request_func_input.prompt
175
193
 
176
- async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
194
+ async with _create_bench_client_session() as session:
177
195
  payload = {
178
196
  "model": request_func_input.model,
179
197
  "prompt": prompt,
@@ -186,8 +204,7 @@ async def async_request_openai_completions(
186
204
  }
187
205
  headers = get_auth_headers()
188
206
 
189
- output = RequestFuncOutput()
190
- output.prompt_len = request_func_input.prompt_len
207
+ output = RequestFuncOutput.init_new(request_func_input)
191
208
 
192
209
  generated_text = ""
193
210
  output_len = request_func_input.output_len
@@ -256,7 +273,7 @@ async def async_request_truss(
256
273
 
257
274
  prompt = request_func_input.prompt
258
275
 
259
- async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
276
+ async with _create_bench_client_session() as session:
260
277
  payload = {
261
278
  "model": request_func_input.model,
262
279
  "prompt": prompt,
@@ -269,8 +286,7 @@ async def async_request_truss(
269
286
  }
270
287
  headers = get_auth_headers()
271
288
 
272
- output = RequestFuncOutput()
273
- output.prompt_len = request_func_input.prompt_len
289
+ output = RequestFuncOutput.init_new(request_func_input)
274
290
 
275
291
  generated_text = ""
276
292
  ttft = 0.0
@@ -334,9 +350,9 @@ async def async_request_sglang_generate(
334
350
  api_url = request_func_input.api_url
335
351
  prompt = request_func_input.prompt
336
352
 
337
- async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
353
+ async with _create_bench_client_session() as session:
338
354
  payload = {
339
- "text": prompt,
355
+ ("text" if isinstance(prompt, str) else "input_ids"): prompt,
340
356
  "sampling_params": {
341
357
  "temperature": 0.0,
342
358
  "max_new_tokens": request_func_input.output_len,
@@ -355,8 +371,7 @@ async def async_request_sglang_generate(
355
371
 
356
372
  headers = get_auth_headers()
357
373
 
358
- output = RequestFuncOutput()
359
- output.prompt_len = request_func_input.prompt_len
374
+ output = RequestFuncOutput.init_new(request_func_input)
360
375
 
361
376
  generated_text = ""
362
377
  output_len = request_func_input.output_len
@@ -373,7 +388,6 @@ async def async_request_sglang_generate(
373
388
  chunk_bytes = chunk_bytes.strip()
374
389
  if not chunk_bytes:
375
390
  continue
376
- # print(chunk_bytes)
377
391
 
378
392
  chunk = remove_prefix(chunk_bytes.decode("utf-8"), "data: ")
379
393
  latency = time.perf_counter() - st
@@ -434,7 +448,7 @@ async def async_request_gserver(
434
448
 
435
449
 
436
450
  async def async_request_profile(api_url: str) -> RequestFuncOutput:
437
- async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
451
+ async with _create_bench_client_session() as session:
438
452
  output = RequestFuncOutput()
439
453
  try:
440
454
  async with session.post(url=api_url) as response:
@@ -469,6 +483,10 @@ def get_model(pretrained_model_name_or_path: str) -> str:
469
483
  def get_tokenizer(
470
484
  pretrained_model_name_or_path: str,
471
485
  ) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
486
+ assert (
487
+ pretrained_model_name_or_path is not None
488
+ and pretrained_model_name_or_path != ""
489
+ )
472
490
  if pretrained_model_name_or_path.endswith(
473
491
  ".json"
474
492
  ) or pretrained_model_name_or_path.endswith(".model"):
@@ -486,7 +504,9 @@ def get_tokenizer(
486
504
 
487
505
 
488
506
  def get_dataset(args, tokenizer):
507
+ tokenize_prompt = getattr(args, "tokenize_prompt", False)
489
508
  if args.dataset_name == "sharegpt":
509
+ assert not tokenize_prompt
490
510
  input_requests = sample_sharegpt_requests(
491
511
  dataset_path=args.dataset_path,
492
512
  num_requests=args.num_prompts,
@@ -505,8 +525,10 @@ def get_dataset(args, tokenizer):
505
525
  tokenizer=tokenizer,
506
526
  dataset_path=args.dataset_path,
507
527
  random_sample=args.dataset_name == "random",
528
+ return_text=not tokenize_prompt,
508
529
  )
509
530
  elif args.dataset_name == "generated-shared-prefix":
531
+ assert not tokenize_prompt
510
532
  input_requests = sample_generated_shared_prefix_requests(
511
533
  num_groups=args.gsp_num_groups,
512
534
  prompts_per_group=args.gsp_prompts_per_group,
@@ -517,6 +539,7 @@ def get_dataset(args, tokenizer):
517
539
  args=args,
518
540
  )
519
541
  elif args.dataset_name == "mmmu":
542
+ assert not tokenize_prompt
520
543
  input_requests = sample_mmmu_requests(
521
544
  num_requests=args.num_prompts,
522
545
  tokenizer=tokenizer,
@@ -582,7 +605,7 @@ def download_and_cache_file(url: str, filename: Optional[str] = None):
582
605
  filename = os.path.join("/tmp", url.split("/")[-1])
583
606
 
584
607
  # Check if the cache file already exists
585
- if os.path.exists(filename):
608
+ if is_file_valid_json(filename):
586
609
  return filename
587
610
 
588
611
  print(f"Downloading from {url} to {filename}")
@@ -610,12 +633,36 @@ def download_and_cache_file(url: str, filename: Optional[str] = None):
610
633
  return filename
611
634
 
612
635
 
636
+ def is_file_valid_json(path):
637
+ if not os.path.isfile(path):
638
+ return False
639
+
640
+ # TODO can fuse into the real file open later
641
+ try:
642
+ with open(path) as f:
643
+ json.load(f)
644
+ return True
645
+ except JSONDecodeError as e:
646
+ print(
647
+ f"{path} exists but json loading fails ({e=}), thus treat as invalid file"
648
+ )
649
+ return False
650
+
651
+
652
+ @dataclass
653
+ class DatasetRow:
654
+ prompt: str
655
+ prompt_len: int
656
+ output_len: int
657
+ image_data: Optional[str] = None
658
+
659
+
613
660
  def sample_mmmu_requests(
614
661
  num_requests: int,
615
662
  tokenizer: PreTrainedTokenizerBase,
616
663
  fixed_output_len: Optional[int] = None,
617
664
  random_sample: bool = True,
618
- ) -> List[Tuple[str, int, int]]:
665
+ ) -> List[DatasetRow]:
619
666
  """
620
667
  Sample requests from the MMMU dataset using HuggingFace datasets.
621
668
 
@@ -683,40 +730,52 @@ def sample_mmmu_requests(
683
730
  buffered = io.BytesIO()
684
731
  image.save(buffered, format="JPEG")
685
732
  img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")
686
- image_path = f"data:image/jpeg;base64,{img_str}"
733
+ image_data = f"data:image/jpeg;base64,{img_str}"
687
734
  else:
688
735
  continue
689
736
 
690
737
  # Extract the question
691
738
  question = example.get("question")
692
739
 
693
- # Create the prompt with image, question
740
+ # Construct the prompt
694
741
  prompt = f"Question: {question}\n\nAnswer: "
695
- prompt = tokenizer.apply_chat_template(
696
- [
697
- {
698
- "role": "user",
699
- "content": [
700
- {"type": "image_url", "image_url": {"url": image_path}},
701
- {"type": "text", "text": prompt},
702
- ],
703
- }
704
- ],
705
- add_generation_prompt=True,
706
- tokenize=False,
707
- )
708
- prompt = f"<image>{image_path}</image>{prompt}"
709
742
 
710
- # Calculate token lengths
711
- # Note: This is approximate since we're not rendering the actual image tokens
743
+ try:
744
+ prompt = tokenizer.apply_chat_template(
745
+ [
746
+ {
747
+ "role": "user",
748
+ "content": [
749
+ {
750
+ "type": "image_url",
751
+ "image_url": {"url": image_data},
752
+ },
753
+ {"type": "text", "text": prompt},
754
+ ],
755
+ }
756
+ ],
757
+ add_generation_prompt=True,
758
+ tokenize=False,
759
+ )
760
+ except Exception as e:
761
+ # Note (Xinyuan): This is a workaround for an issue where some tokenizers do not support content as a list. (e.g. InternVL)
762
+ print(f"Error applying chat template: {e}, fallback to <image> tag")
763
+ prompt = f"<image>{prompt}"
764
+
765
+ # Calculate token lengths for text only (without image data)
712
766
  prompt_token_ids = tokenizer.encode(prompt)
713
- prompt_len = (
714
- len(prompt_token_ids) + 512
715
- ) # Add estimate for image tokens
767
+ prompt_len = len(prompt_token_ids)
716
768
 
717
769
  output_len = fixed_output_len if fixed_output_len is not None else 256
718
770
 
719
- filtered_dataset.append((prompt, prompt_len, output_len))
771
+ filtered_dataset.append(
772
+ DatasetRow(
773
+ prompt=prompt,
774
+ prompt_len=prompt_len,
775
+ output_len=output_len,
776
+ image_data=image_data,
777
+ )
778
+ )
720
779
 
721
780
  except Exception as e:
722
781
  print(f"Error processing example {i}: {e}")
@@ -733,12 +792,12 @@ def sample_sharegpt_requests(
733
792
  context_len: Optional[int] = None,
734
793
  prompt_suffix: Optional[str] = "",
735
794
  apply_chat_template=False,
736
- ) -> List[Tuple[str, int, int]]:
795
+ ) -> List[DatasetRow]:
737
796
  if fixed_output_len is not None and fixed_output_len < 4:
738
797
  raise ValueError("output_len too small")
739
798
 
740
799
  # Download sharegpt if necessary
741
- if not os.path.isfile(dataset_path) and dataset_path == "":
800
+ if not is_file_valid_json(dataset_path) and dataset_path == "":
742
801
  dataset_path = download_and_cache_file(SHAREGPT_URL)
743
802
 
744
803
  # Load the dataset.
@@ -764,7 +823,7 @@ def sample_sharegpt_requests(
764
823
  random.shuffle(dataset)
765
824
 
766
825
  # Filter out sequences that are too long or too short
767
- filtered_dataset: List[Tuple[str, int, int]] = []
826
+ filtered_dataset: List[DatasetRow] = []
768
827
  for i in range(len(dataset)):
769
828
  if len(filtered_dataset) == num_requests:
770
829
  break
@@ -802,10 +861,12 @@ def sample_sharegpt_requests(
802
861
  # Prune too long sequences.
803
862
  continue
804
863
 
805
- filtered_dataset.append((prompt, prompt_len, output_len))
864
+ filtered_dataset.append(
865
+ DatasetRow(prompt=prompt, prompt_len=prompt_len, output_len=output_len)
866
+ )
806
867
 
807
- print(f"#Input tokens: {np.sum([x[1] for x in filtered_dataset])}")
808
- print(f"#Output tokens: {np.sum([x[2] for x in filtered_dataset])}")
868
+ print(f"#Input tokens: {np.sum([x.prompt_len for x in filtered_dataset])}")
869
+ print(f"#Output tokens: {np.sum([x.output_len for x in filtered_dataset])}")
809
870
  return filtered_dataset
810
871
 
811
872
 
@@ -817,7 +878,8 @@ def sample_random_requests(
817
878
  tokenizer: PreTrainedTokenizerBase,
818
879
  dataset_path: str,
819
880
  random_sample: bool = True,
820
- ) -> List[Tuple[str, int, int]]:
881
+ return_text: bool = True,
882
+ ) -> List[DatasetRow]:
821
883
  input_lens = np.random.randint(
822
884
  max(int(input_len * range_ratio), 1),
823
885
  input_len + 1,
@@ -833,7 +895,7 @@ def sample_random_requests(
833
895
  # Sample token ids from ShareGPT and repeat/truncate them to satisfy the input_lens
834
896
 
835
897
  # Download sharegpt if necessary
836
- if not os.path.isfile(dataset_path):
898
+ if not is_file_valid_json(dataset_path):
837
899
  dataset_path = download_and_cache_file(SHAREGPT_URL)
838
900
 
839
901
  # Load the dataset.
@@ -857,7 +919,7 @@ def sample_random_requests(
857
919
  random.shuffle(dataset)
858
920
 
859
921
  # Filter out sequences that are too long or too short
860
- input_requests: List[Tuple[str, int, int]] = []
922
+ input_requests: List[DatasetRow] = []
861
923
  for data in dataset:
862
924
  i = len(input_requests)
863
925
  if i == num_prompts:
@@ -877,20 +939,34 @@ def sample_random_requests(
877
939
  else:
878
940
  ratio = (input_lens[i] + prompt_len - 1) // prompt_len
879
941
  input_ids = (prompt_token_ids * ratio)[: input_lens[i]]
880
- prompt = tokenizer.decode(input_ids)
881
- input_requests.append((prompt, int(input_lens[i]), int(output_lens[i])))
942
+ input_content = input_ids
943
+ if return_text:
944
+ input_content = tokenizer.decode(input_content)
945
+ input_requests.append(
946
+ DatasetRow(
947
+ prompt=input_content,
948
+ prompt_len=int(input_lens[i]),
949
+ output_len=int(output_lens[i]),
950
+ )
951
+ )
882
952
  else:
883
953
  # Sample token ids from random integers. This can cause some NaN issues.
884
954
  offsets = np.random.randint(0, tokenizer.vocab_size, size=num_prompts)
885
955
  input_requests = []
886
956
  for i in range(num_prompts):
887
- prompt = tokenizer.decode(
888
- [
889
- (offsets[i] + i + j) % tokenizer.vocab_size
890
- for j in range(input_lens[i])
891
- ]
957
+ input_content = [
958
+ (offsets[i] + i + j) % tokenizer.vocab_size
959
+ for j in range(input_lens[i])
960
+ ]
961
+ if return_text:
962
+ input_content = tokenizer.decode(input_content)
963
+ input_requests.append(
964
+ DatasetRow(
965
+ prompt=input_content,
966
+ prompt_len=int(input_lens[i]),
967
+ output_len=int(output_lens[i]),
968
+ )
892
969
  )
893
- input_requests.append((prompt, int(input_lens[i]), int(output_lens[i])))
894
970
 
895
971
  print(f"#Input tokens: {np.sum(input_lens)}")
896
972
  print(f"#Output tokens: {np.sum(output_lens)}")
@@ -925,7 +1001,7 @@ def sample_generated_shared_prefix_requests(
925
1001
  output_len: int,
926
1002
  tokenizer: PreTrainedTokenizerBase,
927
1003
  args: argparse.Namespace,
928
- ) -> List[Tuple[str, int, int]]:
1004
+ ) -> List[DatasetRow]:
929
1005
  """Generate benchmark requests with shared system prompts using random tokens and caching."""
930
1006
  cache_path = get_gen_prefix_cache_path(args, tokenizer)
931
1007
 
@@ -963,7 +1039,11 @@ def sample_generated_shared_prefix_requests(
963
1039
  full_prompt = f"{system_prompt}\n\n{question}"
964
1040
  prompt_len = len(tokenizer.encode(full_prompt))
965
1041
 
966
- input_requests.append((full_prompt, prompt_len, output_len))
1042
+ input_requests.append(
1043
+ DatasetRow(
1044
+ prompt=full_prompt, prompt_len=prompt_len, output_len=output_len
1045
+ )
1046
+ )
967
1047
  total_input_tokens += prompt_len
968
1048
  total_output_tokens += output_len
969
1049
 
@@ -994,9 +1074,9 @@ def sample_generated_shared_prefix_requests(
994
1074
 
995
1075
 
996
1076
  async def get_request(
997
- input_requests: List[Tuple[str, int, int]],
1077
+ input_requests: List[DatasetRow],
998
1078
  request_rate: float,
999
- ) -> AsyncGenerator[Tuple[str, int, int], None]:
1079
+ ) -> AsyncGenerator[DatasetRow, None]:
1000
1080
  input_requests = iter(input_requests)
1001
1081
  for request in input_requests:
1002
1082
  yield request
@@ -1012,7 +1092,7 @@ async def get_request(
1012
1092
 
1013
1093
 
1014
1094
  def calculate_metrics(
1015
- input_requests: List[Tuple[str, int, int]],
1095
+ input_requests: List[DatasetRow],
1016
1096
  outputs: List[RequestFuncOutput],
1017
1097
  dur_s: float,
1018
1098
  tokenizer: PreTrainedTokenizerBase,
@@ -1034,7 +1114,7 @@ def calculate_metrics(
1034
1114
  tokenizer.encode(outputs[i].generated_text, add_special_tokens=False)
1035
1115
  )
1036
1116
  retokenized_output_lens.append(retokenized_output_len)
1037
- total_input += input_requests[i][1]
1117
+ total_input += input_requests[i].prompt_len
1038
1118
  if output_len > 1:
1039
1119
  tpots.append((outputs[i].latency - outputs[i].ttft) / (output_len - 1))
1040
1120
  itls += outputs[i].itl
@@ -1096,7 +1176,7 @@ async def benchmark(
1096
1176
  base_url: str,
1097
1177
  model_id: str,
1098
1178
  tokenizer: PreTrainedTokenizerBase,
1099
- input_requests: List[Tuple[str, int, int]],
1179
+ input_requests: List[DatasetRow],
1100
1180
  request_rate: float,
1101
1181
  max_concurrency: Optional[int],
1102
1182
  disable_tqdm: bool,
@@ -1126,30 +1206,22 @@ async def benchmark(
1126
1206
  print(f"Starting warmup with {warmup_requests} sequences...")
1127
1207
 
1128
1208
  # Use the first request for all warmup iterations
1129
- test_prompt, test_prompt_len, test_output_len = input_requests[0]
1209
+ test_request = input_requests[0]
1210
+
1130
1211
  if lora_names is not None and len(lora_names) != 0:
1131
1212
  lora_name = lora_names[0]
1132
1213
  else:
1133
1214
  lora_name = None
1134
1215
 
1135
- if "<image>" in test_prompt:
1136
- import re
1137
-
1138
- image_match = re.search(r"<image>(.*?)</image>(.*)", test_prompt)
1139
- image_data = image_match.group(1) if image_match else None
1140
- test_prompt = image_match.group(2) if image_match else test_prompt
1141
- else:
1142
- image_data = None
1143
-
1144
1216
  # Create the test input once
1145
1217
  test_input = RequestFuncInput(
1146
1218
  model=model_id,
1147
- prompt=test_prompt,
1219
+ prompt=test_request.prompt,
1148
1220
  api_url=api_url,
1149
- prompt_len=test_prompt_len,
1150
- output_len=min(test_output_len, 32),
1221
+ prompt_len=test_request.prompt_len,
1222
+ output_len=min(test_request.output_len, 32),
1151
1223
  lora_name=lora_name,
1152
- image_data=image_data,
1224
+ image_data=test_request.image_data,
1153
1225
  extra_request_body=extra_request_body,
1154
1226
  )
1155
1227
 
@@ -1194,32 +1266,23 @@ async def benchmark(
1194
1266
  benchmark_start_time = time.perf_counter()
1195
1267
  tasks: List[asyncio.Task] = []
1196
1268
  async for request in get_request(input_requests, request_rate):
1197
- prompt, prompt_len, output_len = request
1198
1269
  if lora_names is not None and len(lora_names) != 0:
1199
1270
  idx = random.randint(0, len(lora_names) - 1)
1200
1271
  lora_name = lora_names[idx]
1201
1272
  else:
1202
1273
  lora_name = None
1203
1274
 
1204
- if "<image>" in prompt:
1205
- import re
1206
-
1207
- image_match = re.search(r"<image>(.*?)</image>(.*)", prompt)
1208
- image_data = image_match.group(1) if image_match else None
1209
- prompt = image_match.group(2) if image_match else prompt
1210
- else:
1211
- image_data = None
1212
-
1213
1275
  request_func_input = RequestFuncInput(
1214
1276
  model=model_id,
1215
- prompt=prompt,
1277
+ prompt=request.prompt,
1216
1278
  api_url=api_url,
1217
- prompt_len=prompt_len,
1218
- output_len=output_len,
1279
+ prompt_len=request.prompt_len,
1280
+ output_len=request.output_len,
1219
1281
  lora_name=lora_name,
1220
- image_data=image_data,
1282
+ image_data=request.image_data,
1221
1283
  extra_request_body=extra_request_body,
1222
1284
  )
1285
+
1223
1286
  tasks.append(
1224
1287
  asyncio.create_task(
1225
1288
  limited_request_func(request_func_input=request_func_input, pbar=pbar)
@@ -1239,14 +1302,15 @@ async def benchmark(
1239
1302
 
1240
1303
  if "sglang" in backend:
1241
1304
  server_info = requests.get(base_url + "/get_server_info")
1242
- if pd_separated:
1243
- accept_length = server_info.json()["decode"][0]["internal_states"][0].get(
1305
+ if server_info.status_code == 200:
1306
+ server_info_json = server_info.json()
1307
+ if "decode" in server_info_json:
1308
+ server_info_json = server_info_json["decode"][0]
1309
+ accept_length = server_info_json["internal_states"][0].get(
1244
1310
  "avg_spec_accept_length", None
1245
1311
  )
1246
1312
  else:
1247
- accept_length = server_info.json()["internal_states"][0].get(
1248
- "avg_spec_accept_length", None
1249
- )
1313
+ accept_length = None
1250
1314
  else:
1251
1315
  accept_length = None
1252
1316
 
@@ -1380,21 +1444,24 @@ async def benchmark(
1380
1444
  else:
1381
1445
  output_file_name = f"{args.backend}_{now}_{args.num_prompts}_sharegpt.jsonl"
1382
1446
 
1447
+ result_details = {
1448
+ "input_lens": [output.prompt_len for output in outputs],
1449
+ "output_lens": output_lens,
1450
+ "ttfts": [output.ttft for output in outputs],
1451
+ "itls": [output.itl for output in outputs],
1452
+ "generated_texts": [output.generated_text for output in outputs],
1453
+ "errors": [output.error for output in outputs],
1454
+ }
1455
+
1383
1456
  # Append results to a JSONL file
1384
1457
  with open(output_file_name, "a") as file:
1385
- file.write(json.dumps(result) + "\n")
1386
-
1387
- result.update(
1388
- {
1389
- "input_lens": [output.prompt_len for output in outputs],
1390
- "output_lens": output_lens,
1391
- "ttfts": [output.ttft for output in outputs],
1392
- "itls": [output.itl for output in outputs],
1393
- "generated_texts": [output.generated_text for output in outputs],
1394
- "errors": [output.error for output in outputs],
1395
- }
1396
- )
1397
- return result
1458
+ if args.output_details:
1459
+ result_for_dump = result | result_details
1460
+ else:
1461
+ result_for_dump = result
1462
+ file.write(json.dumps(result_for_dump) + "\n")
1463
+
1464
+ return result | result_details
1398
1465
 
1399
1466
 
1400
1467
  def check_chat_template(model_path):
@@ -1424,6 +1491,12 @@ def run_benchmark(args_: argparse.Namespace):
1424
1491
  if not hasattr(args, "warmup_requests"):
1425
1492
  args.warmup_requests = 1
1426
1493
 
1494
+ if not hasattr(args, "output_details"):
1495
+ args.output_details = False
1496
+
1497
+ if not hasattr(args, "tokenize_prompt"):
1498
+ args.tokenize_prompt = False
1499
+
1427
1500
  print(f"benchmark_args={args}")
1428
1501
 
1429
1502
  # Set global environments
@@ -1435,6 +1508,11 @@ def run_benchmark(args_: argparse.Namespace):
1435
1508
  if args.extra_request_body:
1436
1509
  extra_request_body = json.loads(args.extra_request_body)
1437
1510
 
1511
+ if args.tokenize_prompt:
1512
+ assert (
1513
+ args.backend == "sglang"
1514
+ ), "`--tokenize-prompt` only compatible with `--backend sglang` currently"
1515
+
1438
1516
  # Set url
1439
1517
  if args.port is None:
1440
1518
  args.port = {
@@ -1545,6 +1623,7 @@ def run_benchmark(args_: argparse.Namespace):
1545
1623
  profile=args.profile,
1546
1624
  pd_separated=args.pd_separated,
1547
1625
  flush_cache=args.flush_cache,
1626
+ warmup_requests=args.warmup_requests,
1548
1627
  )
1549
1628
  )
1550
1629
 
@@ -1668,6 +1747,9 @@ if __name__ == "__main__":
1668
1747
  "if the server is not processing requests fast enough to keep up.",
1669
1748
  )
1670
1749
  parser.add_argument("--output-file", type=str, help="Output JSONL file name.")
1750
+ parser.add_argument(
1751
+ "--output-details", action="store_true", help="Output details of benchmarking."
1752
+ )
1671
1753
  parser.add_argument(
1672
1754
  "--disable-tqdm",
1673
1755
  action="store_true",
@@ -1737,6 +1819,11 @@ if __name__ == "__main__":
1737
1819
  default=1,
1738
1820
  help="Number of warmup requests to run before the benchmark",
1739
1821
  )
1822
+ parser.add_argument(
1823
+ "--tokenize-prompt",
1824
+ action="store_true",
1825
+ help="Use integer ids instead of string for inputs. Useful to control prompt lengths accurately",
1826
+ )
1740
1827
 
1741
1828
  group = parser.add_argument_group("generated-shared-prefix dataset arguments")
1742
1829
  group.add_argument(
@@ -82,8 +82,8 @@ def launch_server_process_and_send_one_request(
82
82
  base_url = f"http://{server_args.host}:{server_args.port}"
83
83
  timeout = compile_args.timeout
84
84
 
85
- start_time = time.time()
86
- while time.time() - start_time < timeout:
85
+ start_time = time.perf_counter()
86
+ while time.perf_counter() - start_time < timeout:
87
87
  try:
88
88
  headers = {
89
89
  "Content-Type": "application/json; charset=utf-8",
@@ -112,9 +112,9 @@ def launch_server_process_and_send_one_request(
112
112
  raise RuntimeError(f"Sync request failed: {error}")
113
113
  # Other nodes should wait for the exit signal from Rank-0 node.
114
114
  else:
115
- start_time_waiting = time.time()
115
+ start_time_waiting = time.perf_counter()
116
116
  while proc.is_alive():
117
- if time.time() - start_time_waiting < timeout:
117
+ if time.perf_counter() - start_time_waiting < timeout:
118
118
  time.sleep(10)
119
119
  else:
120
120
  raise TimeoutError("Waiting for main node timeout!")