sglang 0.4.6.post5__py3-none-any.whl → 0.4.7.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (359)
  1. sglang/__init__.py +2 -0
  2. sglang/api.py +7 -0
  3. sglang/bench_offline_throughput.py +10 -4
  4. sglang/bench_one_batch_server.py +67 -11
  5. sglang/bench_serving.py +86 -75
  6. sglang/lang/backend/runtime_endpoint.py +24 -1
  7. sglang/lang/interpreter.py +40 -1
  8. sglang/lang/ir.py +27 -0
  9. sglang/math_utils.py +8 -0
  10. sglang/profiler.py +167 -0
  11. sglang/srt/_custom_ops.py +34 -0
  12. sglang/srt/configs/internvl.py +8 -12
  13. sglang/srt/configs/model_config.py +33 -1
  14. sglang/srt/constrained/base_grammar_backend.py +5 -2
  15. sglang/srt/constrained/llguidance_backend.py +9 -8
  16. sglang/srt/constrained/outlines_backend.py +5 -4
  17. sglang/srt/constrained/xgrammar_backend.py +18 -18
  18. sglang/srt/conversation.py +52 -8
  19. sglang/srt/custom_op.py +38 -3
  20. sglang/srt/debug_utils.py +74 -0
  21. sglang/srt/disaggregation/base/__init__.py +1 -1
  22. sglang/srt/disaggregation/base/conn.py +25 -11
  23. sglang/srt/disaggregation/common/__init__.py +5 -0
  24. sglang/srt/disaggregation/common/conn.py +407 -0
  25. sglang/srt/disaggregation/common/utils.py +42 -0
  26. sglang/srt/disaggregation/decode.py +261 -52
  27. sglang/srt/disaggregation/fake/__init__.py +1 -1
  28. sglang/srt/disaggregation/fake/conn.py +16 -9
  29. sglang/srt/disaggregation/kv_events.py +60 -5
  30. sglang/srt/disaggregation/launch_lb.py +140 -0
  31. sglang/srt/disaggregation/mini_lb.py +29 -48
  32. sglang/srt/disaggregation/mooncake/__init__.py +1 -1
  33. sglang/srt/disaggregation/mooncake/conn.py +446 -149
  34. sglang/srt/disaggregation/mooncake/transfer_engine.py +32 -16
  35. sglang/srt/disaggregation/nixl/__init__.py +6 -1
  36. sglang/srt/disaggregation/nixl/conn.py +134 -437
  37. sglang/srt/disaggregation/prefill.py +130 -43
  38. sglang/srt/disaggregation/utils.py +127 -86
  39. sglang/srt/distributed/device_communicators/pymscclpp.py +315 -0
  40. sglang/srt/distributed/parallel_state.py +52 -5
  41. sglang/srt/entrypoints/EngineBase.py +6 -0
  42. sglang/srt/entrypoints/engine.py +116 -5
  43. sglang/srt/entrypoints/http_server.py +28 -4
  44. sglang/srt/eplb_simulator/__init__.py +1 -0
  45. sglang/srt/eplb_simulator/reader.py +51 -0
  46. sglang/srt/function_call/base_format_detector.py +138 -86
  47. sglang/srt/function_call/deepseekv3_detector.py +54 -6
  48. sglang/srt/function_call/ebnf_composer.py +33 -19
  49. sglang/srt/function_call/function_call_parser.py +27 -0
  50. sglang/srt/function_call/llama32_detector.py +33 -14
  51. sglang/srt/function_call/mistral_detector.py +73 -26
  52. sglang/srt/function_call/pythonic_detector.py +86 -20
  53. sglang/srt/function_call/qwen25_detector.py +64 -10
  54. sglang/srt/function_call/utils.py +17 -0
  55. sglang/srt/hf_transformers_utils.py +4 -0
  56. sglang/srt/layers/activation.py +19 -0
  57. sglang/srt/layers/attention/aiter_backend.py +503 -125
  58. sglang/srt/layers/attention/base_attn_backend.py +4 -0
  59. sglang/srt/layers/attention/cutlass_mla_backend.py +40 -34
  60. sglang/srt/layers/attention/flashattention_backend.py +137 -63
  61. sglang/srt/layers/attention/flashinfer_backend.py +46 -3
  62. sglang/srt/layers/attention/flashinfer_mla_backend.py +59 -25
  63. sglang/srt/layers/attention/flashmla_backend.py +2 -10
  64. sglang/srt/layers/attention/intel_amx_backend.py +128 -0
  65. sglang/srt/layers/attention/tbo_backend.py +232 -0
  66. sglang/srt/layers/attention/torch_native_backend.py +3 -0
  67. sglang/srt/layers/attention/triton_backend.py +304 -65
  68. sglang/srt/layers/attention/triton_ops/decode_attention.py +2 -7
  69. sglang/srt/layers/attention/triton_ops/extend_attention.py +12 -4
  70. sglang/srt/layers/attention/vision.py +51 -24
  71. sglang/srt/layers/communicator.py +281 -197
  72. sglang/srt/layers/dp_attention.py +6 -5
  73. sglang/srt/layers/layernorm.py +30 -19
  74. sglang/srt/layers/linear.py +0 -4
  75. sglang/srt/layers/logits_processor.py +0 -12
  76. sglang/srt/layers/moe/cutlass_moe.py +170 -7
  77. sglang/srt/layers/moe/cutlass_moe_params.py +169 -0
  78. sglang/srt/layers/moe/ep_moe/kernels.py +33 -11
  79. sglang/srt/layers/moe/ep_moe/layer.py +136 -72
  80. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +24 -45
  81. sglang/srt/layers/moe/fused_moe_native.py +4 -0
  82. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  83. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  84. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  85. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  86. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  87. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  88. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  89. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  90. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +221 -29
  91. sglang/srt/layers/moe/fused_moe_triton/layer.py +34 -4
  92. sglang/srt/layers/moe/topk.py +60 -26
  93. sglang/srt/layers/multimodal.py +3 -3
  94. sglang/srt/layers/pooler.py +56 -0
  95. sglang/srt/layers/quantization/__init__.py +3 -2
  96. sglang/srt/layers/quantization/blockwise_int8.py +3 -0
  97. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +5 -0
  98. sglang/srt/layers/quantization/deep_gemm_wrapper/__init__.py +1 -0
  99. sglang/srt/layers/quantization/{deep_gemm.py → deep_gemm_wrapper/compile_utils.py} +69 -127
  100. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +32 -0
  101. sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +110 -0
  102. sglang/srt/layers/quantization/fp8.py +28 -23
  103. sglang/srt/layers/quantization/fp8_kernel.py +156 -75
  104. sglang/srt/layers/quantization/fp8_utils.py +250 -69
  105. sglang/srt/layers/quantization/modelopt_quant.py +334 -7
  106. sglang/srt/layers/quantization/moe_wna16.py +3 -0
  107. sglang/srt/layers/quantization/w8a8_fp8.py +3 -0
  108. sglang/srt/layers/quantization/w8a8_int8.py +3 -0
  109. sglang/srt/layers/radix_attention.py +2 -3
  110. sglang/srt/layers/rotary_embedding.py +6 -12
  111. sglang/srt/layers/sampler.py +80 -79
  112. sglang/srt/layers/utils.py +6 -0
  113. sglang/srt/lora/layers.py +12 -15
  114. sglang/srt/lora/lora.py +49 -5
  115. sglang/srt/lora/lora_manager.py +98 -39
  116. sglang/srt/lora/mem_pool.py +28 -21
  117. sglang/srt/lora/utils.py +17 -13
  118. sglang/srt/managers/cache_controller.py +2 -1
  119. sglang/srt/managers/data_parallel_controller.py +13 -5
  120. sglang/srt/managers/eplb_algorithms/__init__.py +63 -0
  121. sglang/srt/managers/eplb_algorithms/deepseek.py +223 -0
  122. sglang/srt/managers/{deepseek_eplb.py → eplb_algorithms/deepseek_vec.py} +5 -7
  123. sglang/srt/managers/eplb_manager.py +55 -14
  124. sglang/srt/managers/expert_distribution.py +220 -46
  125. sglang/srt/managers/expert_location.py +110 -56
  126. sglang/srt/managers/expert_location_dispatch.py +23 -6
  127. sglang/srt/managers/io_struct.py +43 -8
  128. sglang/srt/managers/mm_utils.py +88 -38
  129. sglang/srt/managers/multimodal_processors/base_processor.py +190 -18
  130. sglang/srt/managers/multimodal_processors/gemma3.py +4 -31
  131. sglang/srt/managers/multimodal_processors/internvl.py +4 -0
  132. sglang/srt/managers/multimodal_processors/kimi_vl.py +15 -34
  133. sglang/srt/managers/multimodal_processors/minicpm.py +2 -1
  134. sglang/srt/managers/multimodal_processors/phi4mm.py +87 -0
  135. sglang/srt/managers/multimodal_processors/qwen_vl.py +22 -64
  136. sglang/srt/managers/multimodal_processors/vila.py +85 -0
  137. sglang/srt/managers/schedule_batch.py +173 -38
  138. sglang/srt/managers/scheduler.py +376 -127
  139. sglang/srt/managers/tokenizer_manager.py +163 -19
  140. sglang/srt/managers/utils.py +0 -4
  141. sglang/srt/mem_cache/chunk_cache.py +1 -0
  142. sglang/srt/mem_cache/hiradix_cache.py +4 -2
  143. sglang/srt/mem_cache/memory_pool.py +111 -407
  144. sglang/srt/mem_cache/memory_pool_host.py +380 -0
  145. sglang/srt/mem_cache/radix_cache.py +36 -12
  146. sglang/srt/metrics/collector.py +9 -0
  147. sglang/srt/model_executor/cuda_graph_runner.py +191 -113
  148. sglang/srt/model_executor/expert_location_updater.py +157 -22
  149. sglang/srt/model_executor/forward_batch_info.py +52 -22
  150. sglang/srt/model_executor/model_runner.py +102 -62
  151. sglang/srt/model_loader/loader.py +8 -1
  152. sglang/srt/model_loader/utils.py +67 -1
  153. sglang/srt/models/bert.py +113 -13
  154. sglang/srt/models/deepseek_nextn.py +1 -1
  155. sglang/srt/models/deepseek_v2.py +623 -290
  156. sglang/srt/models/gemma3_causal.py +7 -0
  157. sglang/srt/models/gemma3_mm.py +19 -14
  158. sglang/srt/models/idefics2.py +342 -0
  159. sglang/srt/models/internvl.py +46 -102
  160. sglang/srt/models/kimi_vl.py +4 -4
  161. sglang/srt/models/llama.py +1 -1
  162. sglang/srt/models/minicpmo.py +2 -5
  163. sglang/srt/models/minicpmv.py +3 -295
  164. sglang/srt/models/phi4mm.py +512 -0
  165. sglang/srt/models/qwen2.py +38 -9
  166. sglang/srt/models/qwen2_5_vl.py +3 -9
  167. sglang/srt/models/qwen2_eagle.py +4 -1
  168. sglang/srt/models/qwen2_moe.py +58 -191
  169. sglang/srt/models/qwen2_vl.py +3 -9
  170. sglang/srt/models/qwen3.py +41 -10
  171. sglang/srt/models/qwen3_moe.py +230 -191
  172. sglang/srt/models/registry.py +9 -1
  173. sglang/srt/models/roberta.py +117 -9
  174. sglang/srt/models/transformers.py +291 -0
  175. sglang/srt/models/vila.py +305 -0
  176. sglang/srt/openai_api/adapter.py +248 -28
  177. sglang/srt/openai_api/protocol.py +68 -3
  178. sglang/srt/openai_api/utils.py +172 -0
  179. sglang/srt/operations.py +37 -2
  180. sglang/srt/operations_strategy.py +200 -24
  181. sglang/srt/sampling/sampling_batch_info.py +37 -1
  182. sglang/srt/sampling/sampling_params.py +4 -1
  183. sglang/srt/server_args.py +381 -209
  184. sglang/srt/speculative/build_eagle_tree.py +9 -9
  185. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +12 -14
  186. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +256 -0
  187. sglang/srt/speculative/eagle_utils.py +440 -200
  188. sglang/srt/speculative/eagle_worker.py +234 -63
  189. sglang/srt/two_batch_overlap.py +637 -0
  190. sglang/srt/utils.py +187 -7
  191. sglang/test/attention/test_prefix_chunk_info.py +2 -0
  192. sglang/test/runners.py +54 -10
  193. sglang/test/send_one.py +4 -0
  194. sglang/test/test_block_fp8.py +1 -0
  195. sglang/test/test_block_fp8_deep_gemm_blackwell.py +252 -0
  196. sglang/test/test_block_fp8_ep.py +1 -0
  197. sglang/test/test_cutlass_moe.py +3 -3
  198. sglang/test/test_fp4_moe.py +248 -0
  199. sglang/test/test_utils.py +82 -7
  200. sglang/utils.py +9 -0
  201. sglang/version.py +1 -1
  202. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/METADATA +17 -14
  203. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/RECORD +359 -321
  204. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/WHEEL +1 -1
  205. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  206. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  207. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  208. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  209. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  210. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  211. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  212. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  213. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  214. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  215. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  216. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  217. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  218. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H200.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json} +0 -0
  219. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  220. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  221. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  222. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  223. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  224. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  225. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  226. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  227. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  228. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  229. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  230. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  231. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  232. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  233. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  234. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  235. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  236. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  237. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  238. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  239. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  240. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  241. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  242. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  243. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  244. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  245. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  246. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  247. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json} +0 -0
  248. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  249. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  250. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  251. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  252. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  253. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  254. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  255. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  256. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  257. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json} +0 -0
  258. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json} +0 -0
  259. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  260. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  261. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  262. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  263. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  264. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  265. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json} +0 -0
  266. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  267. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  268. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json} +0 -0
  269. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  270. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  271. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  272. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json} +0 -0
  273. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  274. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  275. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  276. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
  277. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  278. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  279. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  280. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json} +0 -0
  281. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json} +0 -0
  282. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json} +0 -0
  283. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json} +0 -0
  284. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  285. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  286. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json} +0 -0
  287. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json} +0 -0
  288. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json} +0 -0
  289. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json} +0 -0
  290. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  291. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  292. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  293. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  294. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json} +0 -0
  295. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  296. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  297. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  298. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  299. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json} +0 -0
  300. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json} +0 -0
  301. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json} +0 -0
  302. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json} +0 -0
  303. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  304. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  305. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
  306. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  307. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  308. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  309. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json} +0 -0
  310. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_L40S.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json} +0 -0
  311. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
  312. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
  313. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
  314. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  315. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  316. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  317. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  318. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json} +0 -0
  319. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json} +0 -0
  320. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json} +0 -0
  321. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json} +0 -0
  322. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  323. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  324. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  325. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  326. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json} +0 -0
  327. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
  328. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
  329. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
  330. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  331. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  332. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  333. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  334. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json} +0 -0
  335. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json} +0 -0
  336. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  337. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  338. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json} +0 -0
  339. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  340. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json} +0 -0
  341. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  342. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  343. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  344. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  345. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json} +0 -0
  346. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  347. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json} +0 -0
  348. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=96,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json} +0 -0
  349. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  350. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  351. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  352. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  353. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  354. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  355. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  356. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  357. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  358. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/licenses/LICENSE +0 -0
  359. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/top_level.txt +0 -0
@@ -24,6 +24,7 @@ from collections import defaultdict, deque
 from concurrent import futures
 from dataclasses import dataclass
 from http import HTTPStatus
+from pathlib import Path
 from types import SimpleNamespace
 from typing import Dict, List, Optional, Tuple, Union
 
@@ -35,7 +36,10 @@ from torch.distributed import barrier
 
 from sglang.global_config import global_config
 from sglang.srt.configs.model_config import ModelConfig
-from sglang.srt.constrained.base_grammar_backend import create_grammar_backend
+from sglang.srt.constrained.base_grammar_backend import (
+    INVALID_GRAMMAR_OBJ,
+    create_grammar_backend,
+)
 from sglang.srt.disaggregation.decode import (
     DecodePreallocQueue,
     DecodeTransferQueue,
@@ -62,7 +66,6 @@ from sglang.srt.hf_transformers_utils import (
 from sglang.srt.layers.dp_attention import compute_dp_attention_world_info
 from sglang.srt.layers.logits_processor import LogitsProcessorOutput
 from sglang.srt.managers.expert_distribution import (
-    ExpertDistributionRecorder,
     get_global_expert_distribution_recorder,
 )
 from sglang.srt.managers.io_struct import (
@@ -132,11 +135,14 @@ from sglang.srt.reasoning_parser import ReasoningParser
 from sglang.srt.server_args import PortArgs, ServerArgs
 from sglang.srt.speculative.spec_info import SpeculativeAlgorithm
 from sglang.srt.torch_memory_saver_adapter import TorchMemorySaverAdapter
+from sglang.srt.two_batch_overlap import TboDPAttentionPreparer
 from sglang.srt.utils import (
+    DeepEPMode,
     DynamicGradMode,
     broadcast_pyobj,
     configure_logger,
     disable_request_logging,
+    get_available_gpu_memory,
     get_bool_env_var,
     get_zmq_socket,
     kill_itself_when_parent_died,
@@ -173,6 +179,27 @@ class EmbeddingBatchResult:
     bid: int
 
 
+class IdleSleeper:
+    """
+    In setups which have long inactivity periods it is desirable to reduce
+    system power consumption when sglang does nothing. This would lead not only
+    to power savings, but also to more CPU thermal headroom when a request
+    eventually comes. This is important in cases when multiple GPUs are connected
+    as each GPU would otherwise pin one thread at 100% CPU usage.
+
+    The simplest solution is to use zmq.Poller on all sockets that may receive
+    data that needs handling immediately.
+    """
+
+    def __init__(self, sockets):
+        self.poller = zmq.Poller()
+        for s in sockets:
+            self.poller.register(s, zmq.POLLIN)
+
+    def maybe_sleep(self):
+        self.poller.poll(1000)
+
+
 class Scheduler(
     SchedulerOutputProcessorMixin,
     SchedulerDisaggregationDecodeMixin,
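For context, the zmq.Poller pattern that the IdleSleeper docstring above describes looks like this in isolation. This is a minimal standalone sketch, not the scheduler's actual wiring; the endpoint and names are illustrative. Blocking in poll() parks the thread in the kernel until a socket becomes readable or the timeout expires, instead of spinning on non-blocking receives:

    import zmq

    # Minimal sketch of the IdleSleeper idea: park the thread in poll()
    # rather than busy-waiting on recv. The endpoint is hypothetical.
    ctx = zmq.Context()
    sock = ctx.socket(zmq.PULL)
    sock.bind("tcp://127.0.0.1:5555")

    poller = zmq.Poller()
    poller.register(sock, zmq.POLLIN)

    while True:
        # Blocks for up to 1000 ms and returns early when data arrives,
        # so idle iterations cost almost no CPU instead of pinning a core.
        events = dict(poller.poll(1000))
        if sock in events:
            print("received:", sock.recv())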
@@ -210,7 +237,6 @@ class Scheduler(
         self.gpu_id = gpu_id
         self.enable_hierarchical_cache = server_args.enable_hierarchical_cache
         self.page_size = server_args.page_size
-        # Distributed rank info
         self.dp_size = server_args.dp_size
         self.attn_tp_rank, self.attn_tp_size, self.attn_dp_rank = (
             compute_dp_attention_world_info(
@@ -223,6 +249,8 @@ class Scheduler(
 
         # Init inter-process communication
         context = zmq.Context(2)
+        self.idle_sleeper = None
+
         if self.pp_rank == 0 and self.attn_tp_rank == 0:
             self.recv_from_tokenizer = get_zmq_socket(
                 context, zmq.PULL, port_args.scheduler_input_ipc_name, False
@@ -245,6 +273,13 @@ class Scheduler(
             self.recv_from_rpc = get_zmq_socket(
                 context, zmq.DEALER, port_args.rpc_ipc_name, False
             )
+            if self.server_args.sleep_on_idle:
+                self.idle_sleeper = IdleSleeper(
+                    [
+                        self.recv_from_tokenizer,
+                        self.recv_from_rpc,
+                    ]
+                )
         else:
             self.recv_from_tokenizer = None
             self.recv_from_rpc = None
@@ -330,12 +365,16 @@ class Scheduler(
 
         # Print debug info
         if tp_rank == 0:
+            avail_mem = get_available_gpu_memory(
+                self.device, self.gpu_id, empty_cache=False
+            )
             logger.info(
                 f"max_total_num_tokens={self.max_total_num_tokens}, "
                 f"chunked_prefill_size={server_args.chunked_prefill_size}, "
                 f"max_prefill_tokens={self.max_prefill_tokens}, "
                 f"max_running_requests={self.max_running_requests}, "
-                f"context_len={self.model_config.context_len}"
+                f"context_len={self.model_config.context_len}, "
+                f"available_gpu_mem={avail_mem:.2f} GB"
             )
 
         # Init memory pool and cache
@@ -352,13 +391,14 @@ class Scheduler(
         self.forward_ct = 0
         self.forward_ct_decode = 0
         self.num_generated_tokens = 0
-        self.num_prefill_tokens = 0
+        self.last_prefill_tokens = 0
         self.last_decode_stats_tic = time.perf_counter()
         self.last_prefill_stats_tic = time.perf_counter()
         self.return_health_check_ct = 0
         self.current_stream = torch.get_device_module(self.device).current_stream()
         if self.device == "cpu":
             self.current_stream.synchronize = lambda: None  # No-op for CPU
+        self.forward_sleep_time = None
 
         # Init session info
         self.sessions: Dict[str, Session] = {}
@@ -420,10 +460,16 @@ class Scheduler(
         self.torch_profiler = None
         self.torch_profiler_output_dir: Optional[str] = None
         self.profiler_activities: Optional[List[str]] = None
-        self.profiler_id: Optional[str] = None
+        self.profile_id: Optional[str] = None
         self.profiler_target_forward_ct: Optional[int] = None
-
-        self.forward_sleep_time = None
+        self.profiler_target_prefill_ct: Optional[int] = None
+        self.profiler_target_decode_ct: Optional[int] = None
+        self.profiler_prefill_ct: Optional[int] = None
+        self.profiler_decode_ct: Optional[int] = None
+        self.profile_by_stage: bool = False
+        self.profile_steps: Optional[int] = None
+        self.profile_in_progress: bool = False
+        self.rpd_profiler = None
 
         # Init metrics stats
         self.init_metrics()
@@ -462,6 +508,10 @@ class Scheduler(
             )
             self.init_disaggregation()
 
+    def maybe_sleep_on_idle(self):
+        if self.idle_sleeper is not None:
+            self.idle_sleeper.maybe_sleep()
+
     def init_tokenizer(self):
         server_args = self.server_args
 
@@ -556,7 +606,9 @@ class Scheduler(
 
     def init_kv_events(self, kv_events_config: Optional[str]):
         if self.enable_kv_cache_events:
-            self.kv_event_publisher = EventPublisherFactory.create(kv_events_config)
+            self.kv_event_publisher = EventPublisherFactory.create(
+                kv_events_config, self.attn_dp_rank
+            )
 
     def init_disaggregation(self):
         self.transfer_backend = TransferBackend(
@@ -567,7 +619,7 @@ class Scheduler(
             self.disaggregation_mode == DisaggregationMode.DECODE
         ):  # *2 for the headroom.
             buffer_size = (self.req_to_token_pool.size) * 2
-            req_to_metadata_buffer_idx_allocator = ReqToMetadataIdxAllocator(
+            self.req_to_metadata_buffer_idx_allocator = ReqToMetadataIdxAllocator(
                 buffer_size
             )
             self.disagg_metadata_buffers = MetadataBuffers(buffer_size)
@@ -575,7 +627,8 @@ class Scheduler(
             # The decode requests polling kv cache
             self.disagg_decode_transfer_queue = DecodeTransferQueue(
                 gloo_group=self.attn_tp_cpu_group,
-                req_to_metadata_buffer_idx_allocator=req_to_metadata_buffer_idx_allocator,
+                req_to_metadata_buffer_idx_allocator=self.req_to_metadata_buffer_idx_allocator,
+                tp_rank=self.tp_rank,
                 metadata_buffers=self.disagg_metadata_buffers,
                 scheduler=self,
                 tree_cache=self.tree_cache,
@@ -590,7 +643,7 @@ class Scheduler(
                     if self.draft_worker is None
                     else self.draft_worker.model_runner.token_to_kv_pool
                 ),
-                req_to_metadata_buffer_idx_allocator=req_to_metadata_buffer_idx_allocator,
+                req_to_metadata_buffer_idx_allocator=self.req_to_metadata_buffer_idx_allocator,
                 metadata_buffers=self.disagg_metadata_buffers,
                 scheduler=self,
                 transfer_queue=self.disagg_decode_transfer_queue,
@@ -598,7 +651,12 @@ class Scheduler(
                 gloo_group=self.attn_tp_cpu_group,
                 tp_rank=self.tp_rank,
                 tp_size=self.tp_size,
+                dp_size=self.server_args.dp_size,
+                gpu_id=self.gpu_id,
                 bootstrap_port=self.server_args.disaggregation_bootstrap_port,
+                max_total_num_tokens=self.max_total_num_tokens,
+                prefill_pp_size=self.server_args.disaggregation_prefill_pp,
+                num_reserved_decode_tokens=self.server_args.num_reserved_decode_tokens,
                 transfer_backend=self.transfer_backend,
             )
 
@@ -608,7 +666,7 @@ class Scheduler(
         elif self.disaggregation_mode == DisaggregationMode.PREFILL:
             # *2 for the headroom.
             buffer_size = self.max_running_requests * 2
-            req_to_metadata_buffer_idx_allocator = ReqToMetadataIdxAllocator(
+            self.req_to_metadata_buffer_idx_allocator = ReqToMetadataIdxAllocator(
                 buffer_size
             )
             self.disagg_metadata_buffers = MetadataBuffers(buffer_size)
@@ -620,14 +678,20 @@ class Scheduler(
                     if self.draft_worker is None
                     else self.draft_worker.model_runner.token_to_kv_pool
                 ),
-                req_to_metadata_buffer_idx_allocator=req_to_metadata_buffer_idx_allocator,
+                req_to_metadata_buffer_idx_allocator=self.req_to_metadata_buffer_idx_allocator,
                 metadata_buffers=self.disagg_metadata_buffers,
                 tp_rank=self.tp_rank,
                 tp_size=self.tp_size,
+                gpu_id=self.gpu_id,
                 bootstrap_port=self.server_args.disaggregation_bootstrap_port,
                 gloo_group=self.attn_tp_cpu_group,
-                transfer_backend=self.transfer_backend,
+                max_total_num_tokens=self.max_total_num_tokens,
+                decode_tp_size=self.server_args.disaggregation_decode_tp,
+                decode_dp_size=self.server_args.disaggregation_decode_dp,
                 scheduler=self,
+                pp_rank=self.pp_rank,
+                pp_size=self.pp_size,
+                transfer_backend=self.transfer_backend,
             )
             # The prefill requests that are in the middle of kv sending
             self.disagg_prefill_inflight_queue: List[Req] = []
@@ -649,6 +713,7 @@ class Scheduler(
                 # When the server is idle, do self-check and re-init some states
                 self.check_memory()
                 self.new_token_ratio = self.init_new_token_ratio
+                self.maybe_sleep_on_idle()
 
             self.last_batch = batch
 
@@ -693,6 +758,7 @@ class Scheduler(
                 # When the server is idle, do self-check and re-init some states
                 self.check_memory()
                 self.new_token_ratio = self.init_new_token_ratio
+                self.maybe_sleep_on_idle()
 
             self.last_batch = batch
 
@@ -798,6 +864,7 @@ class Scheduler(
             if server_is_idle:
                 self.check_memory()
                 self.new_token_ratio = self.init_new_token_ratio
+                self.maybe_sleep_on_idle()
 
     def recv_requests(self) -> List[Req]:
         """Receive results at tp_rank = 0 and broadcast it to all other TP ranks."""
@@ -931,18 +998,19 @@ class Scheduler(
                 bootstrap_host=recv_req.bootstrap_host,
                 bootstrap_port=recv_req.bootstrap_port,
                 bootstrap_room=recv_req.bootstrap_room,
+                data_parallel_rank=recv_req.data_parallel_rank,
             )
         req.tokenizer = self.tokenizer
 
         if self.disaggregation_mode != DisaggregationMode.NULL:
             # Invalid request for disaggregated mode
             if recv_req.bootstrap_room is None:
-                error_message = (
+                error_msg = (
                     f"Invalid request: Disaggregated request received without "
                     f"boostrap room id. {req.rid=}"
                 )
-                logger.error(error_message)
-                prepare_abort(req, error_message)
+                logger.error(error_msg)
+                prepare_abort(req, error_msg)
                 self.stream_output([req], req.return_logprob)
                 return
 
@@ -973,29 +1041,23 @@ class Scheduler(
                 req.extend_image_inputs(image_inputs)
 
             if len(req.origin_input_ids) >= self.max_req_input_len:
-                error_msg = (
-                    "Multimodal prompt is too long after expanding multimodal tokens. "
-                    f"After expanding {len(req.origin_input_ids_unpadded)=} => {len(req.origin_input_ids)} >= {self.max_req_input_len}."
-                )
-                logger.error(error_msg)
-                req.origin_input_ids = [0]
-                req.multimodal_inputs = None
-                req.sampling_params.max_new_tokens = 0
-                req.finished_reason = FINISH_ABORT(
-                    error_msg, HTTPStatus.BAD_REQUEST, "BadRequestError"
+                req.set_finish_with_abort(
+                    error_msg=(
+                        "Multimodal prompt is too long after expanding multimodal tokens. "
+                        f"After expanding {len(req.origin_input_ids_unpadded)=} => {len(req.origin_input_ids)} >= {self.max_req_input_len}."
+                    )
                 )
                 self._add_request_to_queue(req)
                 return
 
-        # Validate prompts length
+        # Validate prompt length
         error_msg = validate_input_length(
             req,
             self.max_req_input_len,
             self.server_args.allow_auto_truncate,
         )
         if error_msg:
-            req.origin_input_ids = [0]
-            req.sampling_params.max_new_tokens = 0
+            req.set_finish_with_abort(error_msg)
            self._add_request_to_queue(req)
            return
 
@@ -1007,12 +1069,9 @@ class Scheduler(
             req.logprob_start_len = recv_req.logprob_start_len
 
         if req.logprob_start_len >= len(req.origin_input_ids):
-            req.finished_reason = FINISH_ABORT(
-                f"logprob_start_len, ({req.logprob_start_len}) is higher than the number of input tokens ({len(req.origin_input_ids)}). Request with a lower logprob_start_len.",
-                HTTPStatus.BAD_REQUEST,
-                "BadRequestError",
-            )
+            error_msg = f"{req.logprob_start_len=} is higher than the number of input tokens {len(req.origin_input_ids)=}. Please use a smaller logprob_start_len."
             req.logprob_start_len = len(req.origin_input_ids) - 1
+            req.set_finish_with_abort(error_msg)
             self._add_request_to_queue(req)
             return
 
@@ -1049,6 +1108,10 @@ class Scheduler(
  if not cache_hit:
  req.grammar_key = key
  add_to_grammar_queue = True
+ else:
+ if value is INVALID_GRAMMAR_OBJ:  # We hit a cached invalid grammar.
+ error_msg = f"Invalid grammar request with cache hit: {key=}"
+ req.set_finish_with_abort(error_msg)

  if add_to_grammar_queue:
  req.queue_time_start = time.perf_counter()
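`INVALID_GRAMMAR_OBJ` works as a negative-cache sentinel: a grammar that failed to compile is cached like a success, so repeated requests for the same key fail fast instead of recompiling. A self-contained sketch of the pattern (the `GrammarCache` class here is illustrative, not sglang's backend):

```python
# Sketch of sentinel-based negative caching, as used for invalid grammars.
# INVALID_GRAMMAR_OBJ mirrors the diff; GrammarCache is illustrative.
INVALID_GRAMMAR_OBJ = object()  # unique sentinel, compared with `is`


class GrammarCache:
    def __init__(self):
        self._cache = {}

    def set_cache(self, key, value):
        self._cache[key] = value

    def lookup(self, key):
        # Returns (value, cache_hit). A hit may be the invalid sentinel.
        if key in self._cache:
            return self._cache[key], True
        return None, False


cache = GrammarCache()
cache.set_cache(("regex", "(unbalanced"), INVALID_GRAMMAR_OBJ)
value, hit = cache.lookup(("regex", "(unbalanced"))
assert hit and value is INVALID_GRAMMAR_OBJ  # fail fast, no recompilation
```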
@@ -1059,18 +1122,22 @@ class Scheduler(
  def _add_request_to_queue(self, req: Req):
  req.queue_time_start = time.perf_counter()
  if self.disaggregation_mode == DisaggregationMode.PREFILL:
- self.disagg_prefill_bootstrap_queue.add(req)
+ self.disagg_prefill_bootstrap_queue.add(
+ req, self.model_config.num_key_value_heads
+ )
  elif self.disaggregation_mode == DisaggregationMode.DECODE:
  self.disagg_decode_prealloc_queue.add(req)
  else:
  self.waiting_queue.append(req)

- def _extend_requests_to_queue(self, reqs: List[Req]):
+ def _extend_requests_to_queue(self, reqs: List[Req], is_retracted: bool = False):
  if self.disaggregation_mode == DisaggregationMode.PREFILL:
- self.disagg_prefill_bootstrap_queue.extend(reqs)
+ self.disagg_prefill_bootstrap_queue.extend(
+ reqs, self.model_config.num_key_value_heads
+ )
  elif self.disaggregation_mode == DisaggregationMode.DECODE:
  # If this is a decode server, we put the request to the decode pending prealloc queue
- self.disagg_decode_prealloc_queue.extend(reqs)
+ self.disagg_decode_prealloc_queue.extend(reqs, is_retracted)
  else:
  self.waiting_queue.extend(reqs)

@@ -1083,6 +1150,7 @@ class Scheduler(
  recv_req.input_text,
  recv_req.input_ids,
  recv_req.sampling_params,
+ token_type_ids=recv_req.token_type_ids,
  )
  req.tokenizer = self.tokenizer

@@ -1096,19 +1164,13 @@ class Scheduler(
  req.extend_image_inputs(image_inputs)

  if len(req.origin_input_ids) >= self.max_req_input_len:
- error_msg = (
- "Multimodal prompt is too long after expanding multimodal tokens. "
- f"After expanding {len(req.origin_input_ids_unpadded)=} => {len(req.origin_input_ids)} >= {self.max_req_input_len}."
- )
- logger.error(error_msg)
- req.origin_input_ids = [0]
- req.multimodal_inputs = None
- req.sampling_params.max_new_tokens = 0
- req.finished_reason = FINISH_ABORT(
- error_msg, HTTPStatus.BAD_REQUEST, "BadRequestError"
+ req.set_finish_with_abort(
+ error_msg=(
+ "Multimodal prompt is too long after expanding multimodal tokens. "
+ f"After expanding {len(req.origin_input_ids_unpadded)=} => {len(req.origin_input_ids)} >= {self.max_req_input_len}."
+ )
  )
- req.queue_time_start = time.perf_counter()
- self.waiting_queue.append(req)
+ self._add_request_to_queue(req)
  return

  # Validate prompts length
@@ -1133,8 +1195,8 @@ class Scheduler(
  ):
  gap_latency = time.perf_counter() - self.last_prefill_stats_tic
  self.last_prefill_stats_tic = time.perf_counter()
- self.last_input_throughput = self.num_prefill_tokens / gap_latency
- self.num_prefill_tokens = 0
+ self.last_input_throughput = self.last_prefill_tokens / gap_latency
+ self.last_prefill_tokens = adder.log_input_tokens

  num_used = self.max_total_num_tokens - (
  self.token_to_kv_pool_allocator.available_size()
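The new bookkeeping divides the previous interval's token count by the elapsed gap and then records the current batch's input tokens for the next interval. A standalone sketch of this rolling-interval throughput pattern (class and names are illustrative):

```python
# Illustrative rolling-interval throughput, mirroring the bookkeeping above
# (last_prefill_tokens measured over gap_latency). Names are ours.
import time


class ThroughputMeter:
    def __init__(self):
        self.last_tic = time.perf_counter()
        self.last_tokens = 0  # tokens counted in the previous interval

    def log(self, new_input_tokens: int) -> float:
        now = time.perf_counter()
        gap = now - self.last_tic
        self.last_tic = now
        throughput = self.last_tokens / gap if gap > 0 else 0.0
        self.last_tokens = new_input_tokens  # carried into the next interval
        return throughput


meter = ThroughputMeter()
meter.log(4096)          # first call: previous interval was empty -> 0.0
time.sleep(0.1)
print(meter.log(2048))   # ~40960 tokens/s for the 4096-token interval
```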
@@ -1148,14 +1210,15 @@ class Scheduler(
  f"#new-token: {adder.log_input_tokens}, "
  f"#cached-token: {adder.log_hit_tokens}, "
  f"token usage: {num_used / self.max_total_num_tokens:.2f}, "
- f"#running-req: {running_bs}, "
  )

  if self.disaggregation_mode == DisaggregationMode.PREFILL:
  f += f"#unbootstrapped-req: {len(self.disagg_prefill_bootstrap_queue.queue)}, "
  f += f"#queue-req: {len(self.waiting_queue)}, "
- f += f"#transferring-req: {len(self.disagg_prefill_inflight_queue)} "
+ f += f"#transferring-req: {len(self.disagg_prefill_inflight_queue)}, "
+ f += f"input throughput (token/s): {self.last_input_throughput:.2f} "
  else:
+ f += f"#running-req: {running_bs}, "
  f += f"#queue-req: {len(self.waiting_queue)}"

  logger.info(f)
@@ -1218,6 +1281,7 @@ class Scheduler(

  if self.disaggregation_mode == DisaggregationMode.DECODE:
  msg += f"pre-allocated usage: {self.num_tokens_pre_allocated / self.max_total_num_tokens:.2f}, "
+ msg += f"#retracted-req: {len(self.disagg_decode_prealloc_queue.retracted_queue)}, "

  msg += (
  f"cuda graph: {can_run_cuda_graph}, "
@@ -1515,11 +1579,11 @@ class Scheduler(
  self.new_token_ratio = new_token_ratio

  logger.info(
- "Decode out of memory happened. "
+ "KV cache pool is full. Retract requests. "
  f"#retracted_reqs: {len(retracted_reqs)}, "
  f"#new_token_ratio: {old_ratio:.4f} -> {self.new_token_ratio:.4f}"
  )
- self._extend_requests_to_queue(retracted_reqs)
+ self._extend_requests_to_queue(retracted_reqs, is_retracted=True)
  else:
  self.new_token_ratio = max(
  self.new_token_ratio - self.new_token_ratio_decay,
@@ -1539,13 +1603,8 @@ class Scheduler(
  """Run a batch."""
  self.forward_ct += 1

- # Check profiler
- if (
- self.profiler_target_forward_ct
- and self.profiler_target_forward_ct <= self.forward_ct
- ):
- self.send_to_tokenizer.send_pyobj(self.stop_profile())
-
+ # Whether to run the profiler
+ self._profile_batch_predicate(batch)
  if self.forward_sleep_time is not None:
  logger.info(f"Scheduler.run_batch sleep {self.forward_sleep_time}s")
  time.sleep(self.forward_sleep_time)
@@ -1571,10 +1630,9 @@ class Scheduler(
  num_accepted_tokens,
  can_run_cuda_graph,
  ) = self.draft_worker.forward_batch_speculative_generation(batch)
- self.spec_num_total_accepted_tokens += (
- num_accepted_tokens + batch.batch_size()
- )
- self.spec_num_total_forward_ct += batch.batch_size()
+ bs = batch.batch_size()
+ self.spec_num_total_accepted_tokens += num_accepted_tokens + bs
+ self.spec_num_total_forward_ct += bs
  self.num_generated_tokens += num_accepted_tokens

  if self.pp_group.is_last_rank:
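These counters drive the speculative-decoding acceptance metric: each forward pass contributes the batch size (one verified token per request) plus the accepted draft tokens, so the mean acceptance length works out to `spec_num_total_accepted_tokens / spec_num_total_forward_ct`. A worked example under that accounting:

```python
# Worked example of the acceptance-length accounting in the hunk above.
# Per forward pass: accepted += num_accepted_tokens + batch_size,
# forwards += batch_size; mean acceptance length = accepted / forwards.
spec_num_total_accepted_tokens = 0
spec_num_total_forward_ct = 0

# (batch_size, num_accepted_tokens) for three hypothetical forward passes
for bs, num_accepted_tokens in [(8, 14), (8, 10), (4, 9)]:
    spec_num_total_accepted_tokens += num_accepted_tokens + bs
    spec_num_total_forward_ct += bs

avg_accept_length = spec_num_total_accepted_tokens / spec_num_total_forward_ct
print(f"{avg_accept_length:.2f}")  # (14+8 + 10+8 + 9+4) / 20 = 2.65
```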
@@ -1648,6 +1706,9 @@ class Scheduler(
  disable_cuda_graph=self.server_args.disable_cuda_graph,
  spec_algorithm=self.spec_algorithm,
  speculative_num_draft_tokens=self.server_args.speculative_num_draft_tokens,
+ enable_two_batch_overlap=self.server_args.enable_two_batch_overlap,
+ enable_deepep_moe=self.server_args.enable_deepep_moe,
+ deepep_mode=DeepEPMode[self.server_args.deepep_mode],
  )

  @staticmethod
@@ -1661,6 +1722,9 @@ class Scheduler(
  disable_cuda_graph: bool,
  spec_algorithm,
  speculative_num_draft_tokens,
+ enable_two_batch_overlap: bool,
+ enable_deepep_moe: bool,
+ deepep_mode: DeepEPMode,
  ):
  # Check if other DP workers have running batches
  if local_batch is None:
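`DeepEPMode[self.server_args.deepep_mode]` converts the server-arg string into an enum member via `Enum` name indexing. A minimal sketch of the conversion; the member names below are assumptions for illustration, not necessarily the exact `DeepEPMode` set:

```python
# Minimal sketch of string -> enum conversion via name indexing.
# The member names here are illustrative assumptions about DeepEPMode.
import enum


class DeepEPMode(enum.Enum):
    normal = "normal"
    low_latency = "low_latency"
    auto = "auto"


server_args_deepep_mode = "auto"            # e.g. parsed from --deepep-mode
mode = DeepEPMode[server_args_deepep_mode]  # Enum.__getitem__ looks up by name
assert mode is DeepEPMode.auto
```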
@@ -1696,17 +1760,26 @@ class Scheduler(
  is_extend_in_batch = (
  local_batch.forward_mode.is_extend() if local_batch else False
  )
+
+ tbo_preparer = TboDPAttentionPreparer()
+
  local_info = torch.tensor(
  [
  num_tokens,
  can_cuda_graph,
  num_tokens_for_logprob,
  is_extend_in_batch,
+ *tbo_preparer.prepare_all_gather(
+ local_batch,
+ deepep_mode,
+ enable_deepep_moe,
+ enable_two_batch_overlap,
+ ),
  ],
  dtype=torch.int64,
  )
  global_info = torch.empty(
- (dp_size, attn_tp_size, 4),
+ (dp_size, attn_tp_size, 6),
  dtype=torch.int64,
  )
  torch.distributed.all_gather_into_tensor(
@@ -1719,6 +1792,10 @@ class Scheduler(
  global_num_tokens_for_logprob = global_info[:, 0, 2].tolist()
  is_extend_in_batch = global_info[:, 0, 3].tolist()

+ tbo_split_seq_index, global_forward_mode = tbo_preparer.compute_output(
+ global_info[:, :, 4:6]
+ )
+
  if local_batch is None and max(global_num_tokens) > 0:
  local_batch = get_idle_batch()

@@ -1732,6 +1809,8 @@ class Scheduler(
  local_batch.global_num_tokens_for_logprob = (
  global_num_tokens_for_logprob
  )
+ local_batch.tbo_split_seq_index = tbo_split_seq_index
+ local_batch.global_forward_mode = global_forward_mode

  # Check forward mode for cuda graph
  if not disable_cuda_graph:
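The widened all-gather carries the two extra TBO fields in the same collective: each rank packs its scalars into one `int64` vector, the gathered tensor grows from 4 to 6 columns, and the preparer later reads back columns `4:6`. A single-process sketch of the packing scheme (the gloo group and the stand-in `prepare_all_gather` are illustrative):

```python
# Sketch of packing extra per-rank scalars into one all-gather, mirroring the
# 4 -> 6 column growth above. Single-process gloo group so it runs standalone.
import os
import torch
import torch.distributed as dist

os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
os.environ.setdefault("MASTER_PORT", "29501")
dist.init_process_group("gloo", rank=0, world_size=1)


def prepare_all_gather():
    # Illustrative stand-in for TboDPAttentionPreparer.prepare_all_gather:
    # two extra scalars appended to the existing four.
    return [1, 0]  # e.g. (tbo-eligible flag, forward-mode code)


num_tokens, can_cuda_graph = 512, 1
num_tokens_for_logprob, is_extend_in_batch = 512, 1
local_info = torch.tensor(
    [num_tokens, can_cuda_graph, num_tokens_for_logprob, is_extend_in_batch,
     *prepare_all_gather()],
    dtype=torch.int64,
)
world_size = dist.get_world_size()
global_info = torch.empty((world_size, 6), dtype=torch.int64)
dist.all_gather_into_tensor(global_info, local_info)

tbo_fields = global_info[:, 4:6]  # each consumer slices its own columns
print(tbo_fields)
dist.destroy_process_group()
```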
@@ -1757,17 +1836,25 @@ class Scheduler(
  """Move requests whose grammar objects are ready from grammar_queue to waiting_queue."""

  num_ready_reqs = 0
- num_abort_reqs = 0
+ num_timeout_reqs = 0
  for req in self.grammar_queue:
  try:
+ if req.finished():  # It is aborted by AbortReq
+ num_ready_reqs += 1
+ continue
  req.grammar = req.grammar.result(timeout=0.03)
- if req.grammar:
- self.grammar_backend.set_cache(req.grammar_key, req.grammar.copy())
+ self.grammar_backend.set_cache(req.grammar_key, req.grammar.copy())
+ if req.grammar is INVALID_GRAMMAR_OBJ:
+ req.set_finish_with_abort(
+ f"Invalid grammar request: {req.grammar_key=}"
+ )
  num_ready_reqs += 1
  except futures._base.TimeoutError:
  req.grammar_wait_ct += 1
+ # NOTE(lianmin): this timeout is the waiting time of the above line. It is
+ # not the waiting time from it enters the grammar queue.
  if req.grammar_wait_ct > GRAMMAR_TIMEOUT / 0.03:
- num_abort_reqs = 1
+ num_timeout_reqs = 1
  break

  if self.server_args.enable_dp_attention:
@@ -1779,28 +1866,33 @@ class Scheduler(

  if tp_size > 1:
  # Sync across TP ranks to make sure they have the same number of ready requests
- tensor = torch.tensor([num_ready_reqs, num_abort_reqs], dtype=torch.int32)
+ tensor = torch.tensor([num_ready_reqs, num_timeout_reqs], dtype=torch.int32)
  torch.distributed.all_reduce(
  tensor, op=torch.distributed.ReduceOp.MAX, group=tp_group
  )
- num_ready_reqs_max, num_abort_reqs_max = tensor.tolist()
+ num_ready_reqs_max, num_timeout_reqs_max = tensor.tolist()

  for i in range(num_ready_reqs, num_ready_reqs_max):
  req = self.grammar_queue[i]
+ if req.finished():  # It is aborted by AbortReq
+ continue
  req.grammar = req.grammar.result()
- if req.grammar:
- self.grammar_backend.set_cache(req.grammar_key, req.grammar.copy())
+ self.grammar_backend.set_cache(req.grammar_key, req.grammar.copy())
+ if req.grammar is INVALID_GRAMMAR_OBJ:
+ req.set_finish_with_abort(
+ f"Invalid grammar request: {req.grammar_key=}"
+ )
+ else:
+ num_ready_reqs_max = num_ready_reqs
+ num_timeout_reqs_max = num_timeout_reqs

- for i in range(num_ready_reqs, num_ready_reqs + num_abort_reqs_max):
- req = self.grammar_queue[i]
- req.grammar.cancel()
- req.grammar = None
- error_msg = f"Grammar preprocessing timed out for {req.grammar_key=}"
- logger.error(error_msg)
- req.finished_reason = FINISH_ABORT(
- error_msg, HTTPStatus.BAD_REQUEST, "BadRequestError"
- )
- num_ready_reqs = num_ready_reqs_max + num_abort_reqs_max
+ for i in range(num_ready_reqs, num_ready_reqs + num_timeout_reqs_max):
+ req = self.grammar_queue[i]
+ req.grammar.cancel()
+ error_msg = f"Grammar preprocessing timed out for {req.grammar_key=}"
+ req.set_finish_with_abort(error_msg)
+ self.grammar_backend.set_cache(req.grammar_key, INVALID_GRAMMAR_OBJ)
+ num_ready_reqs = num_ready_reqs_max + num_timeout_reqs_max

  self._extend_requests_to_queue(self.grammar_queue[:num_ready_reqs])
  self.grammar_queue = self.grammar_queue[num_ready_reqs:]
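Grammar compilation runs in a background executor, and the scheduler polls each future with a 0.03 s timeout, counting consecutive timeouts against a total budget (`GRAMMAR_TIMEOUT`). A compact sketch of that poll-with-budget pattern using `concurrent.futures` (the constants and the slow compile task are stand-ins):

```python
# Sketch of the poll-with-timeout-budget pattern used for grammar futures.
# GRAMMAR_TIMEOUT and the slow compile task are illustrative stand-ins.
import time
from concurrent import futures

GRAMMAR_TIMEOUT = 0.3   # total budget in seconds
POLL_INTERVAL = 0.03    # per-poll timeout, as in the scheduler


def slow_compile():
    time.sleep(1.0)      # pretend grammar compilation is slow
    return "compiled-grammar"


with futures.ThreadPoolExecutor(max_workers=1) as pool:
    fut = pool.submit(slow_compile)
    wait_ct = 0
    while True:
        try:
            grammar = fut.result(timeout=POLL_INTERVAL)
            print("ready:", grammar)
            break
        except futures.TimeoutError:
            wait_ct += 1      # each poll only waits POLL_INTERVAL
            if wait_ct > GRAMMAR_TIMEOUT / POLL_INTERVAL:
                fut.cancel()  # may be a no-op if already running
                print("timed out; marking grammar invalid")
                break
```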
@@ -1887,6 +1979,27 @@ class Scheduler(
  if_success = False
  return if_success

+ def get_load(self):
+ # TODO(lsyin): use dynamically maintained num_waiting_tokens
+ load = (
+ self.max_total_num_tokens
+ - self.token_to_kv_pool_allocator.available_size()
+ - self.tree_cache.evictable_size()
+ )
+ load += sum(len(req.origin_input_ids) for req in self.waiting_queue)
+ if self.disaggregation_mode == DisaggregationMode.PREFILL:
+ load += sum(
+ len(req.origin_input_ids)
+ for req in self.disagg_prefill_bootstrap_queue.queue
+ )
+ elif self.disaggregation_mode == DisaggregationMode.DECODE:
+ load += sum(
+ len(req.req.origin_input_ids)
+ for req in self.disagg_decode_prealloc_queue.queue
+ )
+
+ return load
+
  def get_internal_state(self, recv_req: GetInternalStateReq):
  ret = dict(global_server_args_dict)
  ret["last_gen_throughput"] = self.last_gen_throughput
@@ -1896,9 +2009,10 @@ class Scheduler(
  )
  if RECORD_STEP_TIME:
  ret["step_time_dict"] = self.step_time_dict
- return GetInternalStateReqOutput(
- internal_state=ret,
- )
+
+ ret["load"] = self.get_load()
+
+ return GetInternalStateReqOutput(internal_state=ret)

  def set_internal_state(self, recv_req: SetInternalStateReq):
  server_args_dict = recv_req.server_args
@@ -1932,7 +2046,7 @@ class Scheduler(
  self.cum_spec_accept_length = self.cum_spec_accept_count = 0
  for k, v in server_args_dict.items():
  global_server_args_dict[k] = v
- logger.info(f"Global server args updated! " f"{global_server_args_dict=}")
+ logger.info(f"Global server args updated! {global_server_args_dict=}")
  return SetInternalStateReqOutput(
  updated=True,
  server_args=global_server_args_dict,
@@ -1974,8 +2088,6 @@ class Scheduler(
  )

  def abort_request(self, recv_req: AbortReq):
- # TODO(lmzheng): abort the requests in the grammar queue.
-
  # Delete requests in the waiting queue
  to_del = []
  for i, req in enumerate(self.waiting_queue):
@@ -1984,10 +2096,24 @@ class Scheduler(

  # Sort in reverse order to avoid index issues when deleting
  for i in reversed(to_del):
+ # Abort method 1: directly pop from the queue
+ # This only works for requests that have not started anything.
+ # We still need to send something back to TokenizerManager to clean up the state.
  req = self.waiting_queue.pop(i)
  self.send_to_tokenizer.send_pyobj(AbortReq(req.rid))
  logger.debug(f"Abort queued request. {req.rid=}")

+ # Delete the requests in the grammar queue
+ for req in self.grammar_queue:
+ # Abort method 2: call `set_finish_with_abort`
+ # The request will still run one prefill forward pass.
+ # In this case, we change the input_ids to be only one token to make this prefill cheap.
+ if req.rid.startswith(recv_req.rid):
+ logger.debug(f"Abort grammar queue request. {req.rid=}")
+ if req.grammar:
+ req.grammar.cancel()
+ req.set_finish_with_abort("Aborted by AbortReq.")
+
  # Delete requests in the running batch
  if self.cur_batch is self.running_batch or self.cur_batch is None:
  reqs = self.running_batch.reqs
@@ -1996,6 +2122,9 @@ class Scheduler(

  for req in reqs:
  if req.rid.startswith(recv_req.rid) and not req.finished():
+ # Abort method 3: set `to_abort=True`
+ # The request will still run one decode forward pass.
+ # Then we reuse all existing code to clean up the KV cache allocation.
  logger.debug(f"Abort running request. {req.rid=}")
  req.to_abort = True

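All three abort paths match requests by rid prefix, so one `AbortReq` can cancel every sub-request fanned out from a single client call. A tiny sketch of the matching rule; the rid naming scheme below is illustrative, not sglang's actual format:

```python
# Sketch of rid-prefix abort matching used by all three abort methods.
# The rid naming scheme below is illustrative.
queued_rids = [
    "req-42-0",   # sub-requests fanned out from client request "req-42"
    "req-42-1",
    "req-43-0",   # unrelated request
]

abort_rid = "req-42"
to_abort = [rid for rid in queued_rids if rid.startswith(abort_rid)]
assert to_abort == ["req-42-0", "req-42-1"]
```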
@@ -2075,46 +2204,86 @@ class Scheduler(

  def profile(self, recv_req: ProfileReq):
  if recv_req.type == ProfileReqType.START_PROFILE:
- return self.start_profile(
- recv_req.output_dir,
- recv_req.num_steps,
- recv_req.activities,
- recv_req.with_stack,
- recv_req.record_shapes,
- recv_req.profile_id,
- )
+ if recv_req.profile_by_stage:
+ return self.init_profile(
+ recv_req.output_dir,
+ recv_req.num_steps,
+ recv_req.activities,
+ recv_req.with_stack,
+ recv_req.record_shapes,
+ recv_req.profile_by_stage,
+ recv_req.profile_id,
+ )
+ else:
+ self.init_profile(
+ recv_req.output_dir,
+ recv_req.num_steps,
+ recv_req.activities,
+ recv_req.with_stack,
+ recv_req.record_shapes,
+ recv_req.profile_by_stage,
+ recv_req.profile_id,
+ )
+ return self.start_profile(True)
  else:
  return self.stop_profile()

- def start_profile(
+ def init_profile(
  self,
  output_dir: Optional[str],
  num_steps: Optional[int],
  activities: Optional[List[str]],
  with_stack: Optional[bool],
  record_shapes: Optional[bool],
- profile_id: Optional[str],
- ) -> None:
- if self.profiler_activities:
+ profile_by_stage: bool,
+ profile_id: str,
+ ) -> ProfileReqOutput:
+ if self.profile_in_progress:
  return ProfileReqOutput(
  success=False,
  message="Profiling is already in progress. Call /stop_profile first.",
  )

+ self.profile_by_stage = profile_by_stage
+
  if output_dir is None:
  output_dir = os.getenv("SGLANG_TORCH_PROFILER_DIR", "/tmp")
  if activities is None:
  activities = ["CPU", "GPU"]

  self.torch_profiler_output_dir = output_dir
+ self.torch_profiler_with_stack = with_stack
+ self.torch_profiler_record_shapes = record_shapes
  self.profiler_activities = activities
- self.profiler_id = profile_id
+ self.profile_id = profile_id
+
+ if num_steps:
+ self.profile_steps = num_steps
+ if self.profile_by_stage:
+ self.profiler_target_prefill_ct = num_steps
+ self.profiler_target_decode_ct = num_steps
+ self.profiler_prefill_ct = 0
+ self.profiler_decode_ct = 0
+ else:
+ self.profiler_target_forward_ct = self.forward_ct + num_steps
+ # The caller will be notified when reaching profiler_target_forward_ct
+ else:
+ self.profiler_target_forward_ct = None
+
+ return ProfileReqOutput(success=True, message="Succeeded")
+
+ def start_profile(
+ self, stage: Optional[ForwardMode] = None
+ ) -> ProfileReqOutput | None:
+ stage_str = f" for {stage.__str__()}" if stage else ""
  logger.info(
- "Profiling starts. Traces will be saved to: %s (with id %s)",
- self.torch_profiler_output_dir,
- self.profiler_id,
+ f"Profiling starts{stage_str}. Traces will be saved to: {self.torch_profiler_output_dir} (with profile id: {self.profile_id})",
  )

+ activities = self.profiler_activities
+ with_stack = self.torch_profiler_with_stack
+ record_shapes = self.torch_profiler_record_shapes
+
  activity_map = {
  "CPU": torch.profiler.ProfilerActivity.CPU,
  "GPU": torch.profiler.ProfilerActivity.CUDA,
@@ -2123,48 +2292,100 @@ class Scheduler(
  activity_map[a] for a in activities if a in activity_map
  ]

- if torchprof_activities:
+ if "RPD" in activities:
+ from rpdTracerControl import rpdTracerControl
+
+ rpdTracerControl.skipCreate()
+
+ self.rpd_profile_path = os.path.join(
+ self.torch_profiler_output_dir,
+ "rpd-" + str(time.time()) + f"-TP-{self.tp_rank}" + ".trace.json.gz",
+ )
+
+ if self.tp_rank == 0:
+ import sqlite3
+
+ from rocpd.schema import RocpdSchema
+
+ if os.path.exists("trace.rpd"):
+ os.unlink("trace.rpd")
+ schema = RocpdSchema()
+ connection = sqlite3.connect("trace.rpd")
+ schema.writeSchema(connection)
+ connection.commit()
+ del connection
+ torch.distributed.barrier(self.tp_cpu_group)
+
+ self.rpd_profiler = rpdTracerControl()
+ self.rpd_profiler.setPythonTrace(True)
+ self.rpd_profiler.start()
+ self.rpd_profiler.rangePush("", "rpd profile range", "")
+ self.profile_in_progress = True
+ elif torchprof_activities:
  self.torch_profiler = torch.profiler.profile(
  activities=torchprof_activities,
  with_stack=with_stack if with_stack is not None else True,
  record_shapes=record_shapes if record_shapes is not None else False,
  )
  self.torch_profiler.start()
+ self.profile_in_progress = True

  if "MEM" in activities:
  torch.cuda.memory._record_memory_history(max_entries=100000)
+ self.profile_in_progress = True

  if "CUDA_PROFILER" in activities:
  torch.cuda.cudart().cudaProfilerStart()

- if num_steps:
- self.profiler_target_forward_ct = self.forward_ct + num_steps
- # The caller will be notified when reaching profiler_target_forward_ct
- else:
- self.profiler_target_forward_ct = None
- return ProfileReqOutput(success=True, message="Succeeded")
+ return ProfileReqOutput(success=True, message="Succeeded")

- def stop_profile(self) -> None:
- if self.profiler_activities is None:
+ def stop_profile(
+ self, stage: Optional[ForwardMode] = None
+ ) -> ProfileReqOutput | None:
+ if not self.profile_in_progress:
  return ProfileReqOutput(
  success=False,
  message="Profiling is not in progress. Call /start_profile first.",
  )

- logger.info("Stop profiling...")
+ if not Path(self.torch_profiler_output_dir).exists():
+ Path(self.torch_profiler_output_dir).mkdir(parents=True, exist_ok=True)
+
+ stage_suffix = f"-{stage.__str__()}" if stage else ""
+ logger.info("Stop profiling" + stage_suffix + "...")
  if self.torch_profiler is not None:
  self.torch_profiler.stop()
  self.torch_profiler.export_chrome_trace(
  os.path.join(
  self.torch_profiler_output_dir,
- self.profiler_id + f"-TP-{self.tp_rank}" + ".trace.json.gz",
+ self.profile_id
+ + f"-TP-{self.tp_rank}"
+ + stage_suffix
+ + ".trace.json.gz",
  )
  )
+ torch.distributed.barrier(self.tp_cpu_group)
+
+ if self.rpd_profiler is not None:
+ self.rpd_profiler.rangePop()
+ self.rpd_profiler.stop()
+ self.rpd_profiler.flush()
+
+ torch.distributed.barrier(self.tp_cpu_group)
+ if self.tp_rank == 0:
+ from sglang.srt.utils import rpd_to_chrome_trace
+
+ rpd_to_chrome_trace("trace.rpd", self.rpd_profile_path)
+ self.rpd_profiler = None
+ self.rpd_profiler_path = None

- if "MEM" in self.profiler_activities:
+ if self.profiler_activities is not None and "MEM" in self.profiler_activities:
  memory_profile_path = os.path.join(
  self.torch_profiler_output_dir,
- self.profiler_id + f"-TP-{self.tp_rank}-memory" + ".pickle",
+ str(time.time())
+ + f"-TP-{self.tp_rank}-memory"
+ + stage_suffix
+ + ".pickle",
  )
  torch.cuda.memory._dump_snapshot(memory_profile_path)
  torch.cuda.memory._record_memory_history(enabled=None)
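The torch-profiler branch follows the standard construct/start/stop/export lifecycle. A minimal self-contained round trip using only documented `torch.profiler` APIs, mirroring the calls in the hunks above (the path and workload are placeholders):

```python
# Minimal torch.profiler round trip matching the scheduler's usage:
# construct with activities, start, run work, stop, export a Chrome trace.
import torch

prof = torch.profiler.profile(
    activities=[torch.profiler.ProfilerActivity.CPU],
    with_stack=True,
    record_shapes=False,
)
prof.start()
for _ in range(3):
    torch.mm(torch.randn(256, 256), torch.randn(256, 256))  # profiled work
prof.stop()
prof.export_chrome_trace("/tmp/example-TP-0.trace.json.gz")
```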
@@ -2177,10 +2398,38 @@ class Scheduler(
  self.torch_profiler_output_dir,
  )
  self.torch_profiler = None
- self.torch_profiler_output_dir = None
- self.profiler_activities = None
-
- return ProfileReqOutput(success=True, message="Succeeded")
+ self.profile_in_progress = False
+
+ return ProfileReqOutput(success=True, message="Succeeded.")
+
+ def _profile_batch_predicate(self, batch):
+ if self.profile_by_stage:
+ if batch.forward_mode.is_prefill():
+ if self.profiler_prefill_ct == 0:
+ self.start_profile(batch.forward_mode)
+ self.profiler_prefill_ct += 1
+ if self.profiler_prefill_ct > self.profiler_target_prefill_ct:
+ if self.profile_in_progress:
+ self.stop_profile(stage=ForwardMode.EXTEND)
+ elif batch.forward_mode.is_decode():
+ if self.profiler_decode_ct == 0:
+ if self.profile_in_progress:
+ # force trace flush
+ self.stop_profile(ForwardMode.EXTEND)
+ self.start_profile(batch.forward_mode)
+ self.profiler_decode_ct += 1
+ if self.profiler_decode_ct > self.profiler_target_decode_ct:
+ if self.profile_in_progress:
+ self.stop_profile(stage=ForwardMode.DECODE)
+ else:
+ raise RuntimeError("unsupported profile stage")
+ else:
+ # Check profiler
+ if (
+ self.profiler_target_forward_ct
+ and self.profiler_target_forward_ct <= self.forward_ct
+ ):
+ self.stop_profile()

  def expert_distribution_handle(self, recv_req: ExpertDistributionReq):
  if recv_req == ExpertDistributionReq.START_RECORD:
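`_profile_batch_predicate` turns per-batch forward modes into start/stop edges: the first prefill batch opens the trace, crossing the prefill target closes it, and decode gets its own window (flushing any still-open prefill trace first). A simplified, self-contained re-creation of that predicate logic with stand-in enums and prints instead of real profiler calls:

```python
# Simplified re-creation of the stage-profiling predicate above; the enum
# and profiler calls are stand-ins, not the actual sglang implementation.
import enum


class ForwardMode(enum.Enum):
    EXTEND = "extend"   # prefill
    DECODE = "decode"


class StageProfiler:
    def __init__(self, target_ct: int):
        self.target_prefill_ct = self.target_decode_ct = target_ct
        self.prefill_ct = self.decode_ct = 0
        self.in_progress = False

    def start(self, stage):
        self.in_progress = True
        print(f"start profiling {stage.value}")

    def stop(self, stage):
        self.in_progress = False
        print(f"stop profiling {stage.value}")

    def on_batch(self, mode: ForwardMode):
        if mode is ForwardMode.EXTEND:
            if self.prefill_ct == 0:
                self.start(mode)
            self.prefill_ct += 1
            if self.prefill_ct > self.target_prefill_ct and self.in_progress:
                self.stop(ForwardMode.EXTEND)
        elif mode is ForwardMode.DECODE:
            if self.decode_ct == 0:
                if self.in_progress:          # force trace flush
                    self.stop(ForwardMode.EXTEND)
                self.start(mode)
            self.decode_ct += 1
            if self.decode_ct > self.target_decode_ct and self.in_progress:
                self.stop(ForwardMode.DECODE)


p = StageProfiler(target_ct=2)
for m in [ForwardMode.EXTEND] * 3 + [ForwardMode.DECODE] * 3:
    p.on_batch(m)
```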