sglang 0.4.6.post4__py3-none-any.whl → 0.4.7__py3-none-any.whl

This diff compares the contents of two publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in their respective public registries.
Files changed (358)
  1. sglang/bench_offline_throughput.py +16 -10
  2. sglang/bench_one_batch.py +5 -4
  3. sglang/bench_one_batch_server.py +86 -22
  4. sglang/bench_serving.py +197 -110
  5. sglang/compile_deep_gemm.py +4 -4
  6. sglang/lang/backend/runtime_endpoint.py +24 -1
  7. sglang/profiler.py +167 -0
  8. sglang/srt/_custom_ops.py +34 -0
  9. sglang/srt/configs/internvl.py +8 -12
  10. sglang/srt/configs/model_config.py +66 -29
  11. sglang/srt/constrained/base_grammar_backend.py +5 -2
  12. sglang/srt/constrained/llguidance_backend.py +9 -8
  13. sglang/srt/constrained/outlines_backend.py +5 -4
  14. sglang/srt/constrained/xgrammar_backend.py +18 -18
  15. sglang/srt/conversation.py +47 -9
  16. sglang/srt/custom_op.py +38 -3
  17. sglang/srt/debug_utils.py +74 -0
  18. sglang/srt/disaggregation/common/__init__.py +1 -0
  19. sglang/srt/disaggregation/common/conn.py +407 -0
  20. sglang/srt/disaggregation/decode.py +187 -134
  21. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +142 -0
  22. sglang/srt/disaggregation/fake/conn.py +4 -13
  23. sglang/srt/disaggregation/kv_events.py +412 -0
  24. sglang/srt/disaggregation/launch_lb.py +140 -0
  25. sglang/srt/disaggregation/mini_lb.py +84 -70
  26. sglang/srt/disaggregation/mooncake/conn.py +441 -140
  27. sglang/srt/disaggregation/mooncake/transfer_engine.py +31 -14
  28. sglang/srt/disaggregation/nixl/conn.py +124 -442
  29. sglang/srt/disaggregation/prefill.py +128 -44
  30. sglang/srt/disaggregation/utils.py +154 -6
  31. sglang/srt/distributed/device_communicators/pymscclpp.py +315 -0
  32. sglang/srt/distributed/parallel_state.py +52 -5
  33. sglang/srt/distributed/utils.py +3 -3
  34. sglang/srt/entrypoints/EngineBase.py +11 -0
  35. sglang/srt/entrypoints/engine.py +129 -12
  36. sglang/srt/entrypoints/http_server.py +21 -6
  37. sglang/srt/entrypoints/http_server_engine.py +5 -2
  38. sglang/srt/function_call/base_format_detector.py +302 -0
  39. sglang/srt/function_call/core_types.py +34 -0
  40. sglang/srt/function_call/deepseekv3_detector.py +205 -0
  41. sglang/srt/function_call/ebnf_composer.py +248 -0
  42. sglang/srt/function_call/function_call_parser.py +202 -0
  43. sglang/srt/function_call/llama32_detector.py +93 -0
  44. sglang/srt/function_call/mistral_detector.py +131 -0
  45. sglang/srt/function_call/pythonic_detector.py +229 -0
  46. sglang/srt/function_call/qwen25_detector.py +121 -0
  47. sglang/srt/function_call/utils.py +52 -0
  48. sglang/srt/hf_transformers_utils.py +50 -7
  49. sglang/srt/layers/attention/aiter_backend.py +878 -0
  50. sglang/srt/layers/attention/base_attn_backend.py +4 -0
  51. sglang/srt/layers/attention/cutlass_mla_backend.py +2 -19
  52. sglang/srt/layers/attention/flashattention_backend.py +166 -35
  53. sglang/srt/layers/attention/flashinfer_backend.py +45 -1
  54. sglang/srt/layers/attention/flashinfer_mla_backend.py +45 -5
  55. sglang/srt/layers/attention/flashmla_backend.py +340 -78
  56. sglang/srt/layers/attention/intel_amx_backend.py +128 -0
  57. sglang/srt/layers/attention/tbo_backend.py +232 -0
  58. sglang/srt/layers/attention/torch_native_backend.py +3 -0
  59. sglang/srt/layers/attention/triton_backend.py +247 -5
  60. sglang/srt/layers/attention/triton_ops/extend_attention.py +12 -4
  61. sglang/srt/layers/attention/utils.py +2 -2
  62. sglang/srt/layers/attention/vision.py +1 -1
  63. sglang/srt/layers/communicator.py +517 -0
  64. sglang/srt/layers/dp_attention.py +6 -15
  65. sglang/srt/layers/layernorm.py +30 -19
  66. sglang/srt/layers/moe/cutlass_moe.py +370 -0
  67. sglang/srt/layers/moe/cutlass_moe_params.py +169 -0
  68. sglang/srt/layers/moe/ep_moe/kernels.py +60 -17
  69. sglang/srt/layers/moe/ep_moe/layer.py +195 -87
  70. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +88 -8
  71. sglang/srt/layers/moe/fused_moe_native.py +4 -0
  72. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  73. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  74. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  75. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  76. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  77. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  78. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  79. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  80. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +220 -25
  81. sglang/srt/layers/moe/fused_moe_triton/layer.py +48 -4
  82. sglang/srt/layers/moe/topk.py +107 -24
  83. sglang/srt/layers/multimodal.py +70 -0
  84. sglang/srt/layers/quantization/__init__.py +10 -4
  85. sglang/srt/layers/quantization/blockwise_int8.py +3 -0
  86. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +5 -0
  87. sglang/srt/layers/quantization/deep_gemm.py +60 -59
  88. sglang/srt/layers/quantization/fp8.py +113 -18
  89. sglang/srt/layers/quantization/fp8_kernel.py +118 -66
  90. sglang/srt/layers/quantization/fp8_utils.py +165 -43
  91. sglang/srt/layers/quantization/gptq.py +298 -6
  92. sglang/srt/layers/quantization/int8_kernel.py +18 -5
  93. sglang/srt/layers/quantization/modelopt_quant.py +334 -7
  94. sglang/srt/layers/quantization/moe_wna16.py +3 -0
  95. sglang/srt/layers/quantization/qoq.py +244 -0
  96. sglang/srt/layers/quantization/w8a8_fp8.py +3 -0
  97. sglang/srt/layers/quantization/w8a8_int8.py +3 -0
  98. sglang/srt/layers/rotary_embedding.py +6 -12
  99. sglang/srt/layers/sampler.py +80 -79
  100. sglang/srt/layers/utils.py +6 -0
  101. sglang/srt/lora/layers.py +12 -15
  102. sglang/srt/lora/lora.py +49 -5
  103. sglang/srt/lora/lora_manager.py +20 -8
  104. sglang/srt/lora/mem_pool.py +24 -16
  105. sglang/srt/lora/utils.py +17 -13
  106. sglang/srt/managers/data_parallel_controller.py +13 -5
  107. sglang/srt/managers/eplb_algorithms/__init__.py +63 -0
  108. sglang/srt/managers/eplb_algorithms/deepseek.py +223 -0
  109. sglang/srt/managers/eplb_algorithms/deepseek_vec.py +276 -0
  110. sglang/srt/managers/eplb_manager.py +96 -0
  111. sglang/srt/managers/expert_distribution.py +878 -56
  112. sglang/srt/managers/expert_location.py +448 -0
  113. sglang/srt/managers/expert_location_dispatch.py +108 -0
  114. sglang/srt/managers/io_struct.py +29 -5
  115. sglang/srt/managers/mm_utils.py +355 -151
  116. sglang/srt/managers/multimodal_processors/base_processor.py +299 -42
  117. sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +6 -1
  118. sglang/srt/managers/multimodal_processors/gemma3.py +15 -17
  119. sglang/srt/managers/multimodal_processors/internvl.py +18 -5
  120. sglang/srt/managers/multimodal_processors/janus_pro.py +7 -1
  121. sglang/srt/managers/multimodal_processors/kimi_vl.py +14 -32
  122. sglang/srt/managers/multimodal_processors/llava.py +3 -3
  123. sglang/srt/managers/multimodal_processors/minicpm.py +27 -32
  124. sglang/srt/managers/multimodal_processors/mllama4.py +6 -0
  125. sglang/srt/managers/multimodal_processors/phi4mm.py +87 -0
  126. sglang/srt/managers/multimodal_processors/pixtral.py +9 -9
  127. sglang/srt/managers/multimodal_processors/qwen_vl.py +35 -35
  128. sglang/srt/managers/schedule_batch.py +185 -55
  129. sglang/srt/managers/schedule_policy.py +4 -5
  130. sglang/srt/managers/scheduler.py +389 -154
  131. sglang/srt/managers/session_controller.py +1 -1
  132. sglang/srt/managers/tokenizer_manager.py +231 -39
  133. sglang/srt/managers/utils.py +0 -4
  134. sglang/srt/mem_cache/base_prefix_cache.py +3 -0
  135. sglang/srt/mem_cache/chunk_cache.py +3 -1
  136. sglang/srt/mem_cache/hiradix_cache.py +4 -4
  137. sglang/srt/mem_cache/memory_pool.py +74 -52
  138. sglang/srt/mem_cache/multimodal_cache.py +45 -0
  139. sglang/srt/mem_cache/radix_cache.py +58 -5
  140. sglang/srt/metrics/collector.py +11 -2
  141. sglang/srt/mm_utils.py +10 -0
  142. sglang/srt/model_executor/cuda_graph_runner.py +87 -65
  143. sglang/srt/model_executor/expert_location_updater.py +557 -0
  144. sglang/srt/model_executor/forward_batch_info.py +39 -14
  145. sglang/srt/model_executor/model_runner.py +231 -101
  146. sglang/srt/model_loader/loader.py +10 -6
  147. sglang/srt/model_loader/utils.py +67 -1
  148. sglang/srt/models/clip.py +5 -1
  149. sglang/srt/models/deepseek_nextn.py +1 -1
  150. sglang/srt/models/deepseek_v2.py +732 -403
  151. sglang/srt/models/exaone.py +8 -3
  152. sglang/srt/models/gemma3_causal.py +7 -0
  153. sglang/srt/models/gemma3_mm.py +75 -33
  154. sglang/srt/models/idefics2.py +342 -0
  155. sglang/srt/models/kimi_vl.py +4 -4
  156. sglang/srt/models/llama.py +1 -1
  157. sglang/srt/models/llama4.py +10 -2
  158. sglang/srt/models/llava.py +26 -18
  159. sglang/srt/models/mimo_mtp.py +220 -0
  160. sglang/srt/models/minicpmo.py +7 -17
  161. sglang/srt/models/minicpmv.py +3 -295
  162. sglang/srt/models/mistral.py +71 -1
  163. sglang/srt/models/mllama.py +3 -3
  164. sglang/srt/models/phi4mm.py +512 -0
  165. sglang/srt/models/qwen2.py +133 -35
  166. sglang/srt/models/qwen2_5_vl.py +5 -3
  167. sglang/srt/models/qwen2_eagle.py +4 -1
  168. sglang/srt/models/qwen2_moe.py +206 -69
  169. sglang/srt/models/qwen2_vl.py +3 -3
  170. sglang/srt/models/qwen3.py +92 -19
  171. sglang/srt/models/qwen3_moe.py +457 -55
  172. sglang/srt/models/registry.py +9 -1
  173. sglang/srt/models/siglip.py +294 -0
  174. sglang/srt/models/transformers.py +291 -0
  175. sglang/srt/openai_api/adapter.py +114 -40
  176. sglang/srt/openai_api/protocol.py +37 -2
  177. sglang/srt/openai_api/utils.py +172 -0
  178. sglang/srt/operations.py +189 -0
  179. sglang/srt/operations_strategy.py +207 -0
  180. sglang/srt/sampling/sampling_batch_info.py +13 -1
  181. sglang/srt/sampling/sampling_params.py +2 -1
  182. sglang/srt/server_args.py +235 -38
  183. sglang/srt/speculative/build_eagle_tree.py +8 -8
  184. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +8 -11
  185. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +253 -0
  186. sglang/srt/speculative/eagle_utils.py +181 -90
  187. sglang/srt/speculative/eagle_worker.py +146 -21
  188. sglang/srt/two_batch_overlap.py +635 -0
  189. sglang/srt/utils.py +197 -19
  190. sglang/test/runners.py +16 -7
  191. sglang/test/send_one.py +4 -0
  192. sglang/test/test_cutlass_moe.py +278 -0
  193. sglang/test/test_fp4_moe.py +248 -0
  194. sglang/test/test_utils.py +81 -42
  195. sglang/utils.py +2 -2
  196. sglang/version.py +1 -1
  197. {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/METADATA +31 -19
  198. sglang-0.4.7.dist-info/RECORD +699 -0
  199. {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/WHEEL +1 -1
  200. sglang/srt/function_call_parser.py +0 -858
  201. sglang/srt/platforms/interface.py +0 -371
  202. sglang-0.4.6.post4.dist-info/RECORD +0 -646
  203. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  204. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  205. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  206. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  207. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  208. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  209. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  210. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  211. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  212. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  213. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  214. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  215. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  216. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H200.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json} +0 -0
  217. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  218. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  219. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  220. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  221. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  222. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  223. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  224. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  225. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  226. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  227. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  228. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  229. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  230. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  231. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  232. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  233. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  234. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  235. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  236. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  237. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  238. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  239. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  240. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  241. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  242. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  243. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  244. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  245. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json} +0 -0
  246. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  247. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  248. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  249. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  250. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  251. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  252. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  253. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  254. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  255. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json} +0 -0
  256. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json} +0 -0
  257. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  258. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  259. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  260. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  261. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  262. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  263. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json} +0 -0
  264. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  265. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  266. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json} +0 -0
  267. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  268. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  269. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  270. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json} +0 -0
  271. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  272. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  273. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  274. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
  275. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  276. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  277. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  278. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json} +0 -0
  279. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json} +0 -0
  280. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json} +0 -0
  281. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json} +0 -0
  282. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  283. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  284. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json} +0 -0
  285. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json} +0 -0
  286. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json} +0 -0
  287. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json} +0 -0
  288. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  289. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  290. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  291. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  292. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json} +0 -0
  293. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  294. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  295. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  296. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  297. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json} +0 -0
  298. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json} +0 -0
  299. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json} +0 -0
  300. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json} +0 -0
  301. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  302. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  303. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
  304. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  305. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  306. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  307. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json} +0 -0
  308. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_L40S.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json} +0 -0
  309. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
  310. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
  311. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
  312. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  313. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  314. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  315. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  316. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json} +0 -0
  317. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json} +0 -0
  318. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json} +0 -0
  319. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json} +0 -0
  320. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  321. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  322. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  323. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  324. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json} +0 -0
  325. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
  326. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
  327. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
  328. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  329. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  330. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  331. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  332. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json} +0 -0
  333. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json} +0 -0
  334. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  335. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  336. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json} +0 -0
  337. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  338. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json} +0 -0
  339. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  340. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  341. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  342. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  343. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json} +0 -0
  344. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  345. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json} +0 -0
  346. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=96,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json} +0 -0
  347. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  348. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  349. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  350. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  351. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  352. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  353. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  354. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  355. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  356. /sglang/srt/models/{xiaomi_mimo.py → mimo.py} +0 -0
  357. {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/licenses/LICENSE +0 -0
  358. {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/top_level.txt +0 -0
sglang/srt/managers/scheduler.py
@@ -24,6 +24,7 @@ from collections import defaultdict, deque
 from concurrent import futures
 from dataclasses import dataclass
 from http import HTTPStatus
+from pathlib import Path
 from types import SimpleNamespace
 from typing import Dict, List, Optional, Tuple, Union
 
@@ -35,20 +36,26 @@ from torch.distributed import barrier
 
 from sglang.global_config import global_config
 from sglang.srt.configs.model_config import ModelConfig
-from sglang.srt.constrained.base_grammar_backend import create_grammar_backend
+from sglang.srt.constrained.base_grammar_backend import (
+    INVALID_GRAMMAR_OBJ,
+    create_grammar_backend,
+)
 from sglang.srt.disaggregation.decode import (
     DecodePreallocQueue,
     DecodeTransferQueue,
     SchedulerDisaggregationDecodeMixin,
 )
+from sglang.srt.disaggregation.kv_events import EventPublisherFactory, KVEventBatch
 from sglang.srt.disaggregation.prefill import (
     PrefillBootstrapQueue,
     SchedulerDisaggregationPrefillMixin,
 )
 from sglang.srt.disaggregation.utils import (
     DisaggregationMode,
+    MetadataBuffers,
     ReqToMetadataIdxAllocator,
     TransferBackend,
+    prepare_abort,
 )
 from sglang.srt.distributed import get_pp_group, get_world_group
 from sglang.srt.hf_transformers_utils import (
@@ -58,7 +65,9 @@ from sglang.srt.hf_transformers_utils import (
 )
 from sglang.srt.layers.dp_attention import compute_dp_attention_world_info
 from sglang.srt.layers.logits_processor import LogitsProcessorOutput
-from sglang.srt.managers.expert_distribution import ExpertDistributionRecorder
+from sglang.srt.managers.expert_distribution import (
+    get_global_expert_distribution_recorder,
+)
 from sglang.srt.managers.io_struct import (
     AbortReq,
     CloseSessionReqInput,
@@ -97,6 +106,7 @@ from sglang.srt.managers.io_struct import (
     UpdateWeightsFromTensorReqInput,
     UpdateWeightsFromTensorReqOutput,
 )
+from sglang.srt.managers.mm_utils import init_embedding_cache
 from sglang.srt.managers.schedule_batch import (
     FINISH_ABORT,
     MultimodalInputs,
@@ -125,12 +135,14 @@ from sglang.srt.reasoning_parser import ReasoningParser
 from sglang.srt.server_args import PortArgs, ServerArgs
 from sglang.srt.speculative.spec_info import SpeculativeAlgorithm
 from sglang.srt.torch_memory_saver_adapter import TorchMemorySaverAdapter
+from sglang.srt.two_batch_overlap import TboDPAttentionPreparer
 from sglang.srt.utils import (
+    DeepEPMode,
     DynamicGradMode,
     broadcast_pyobj,
     configure_logger,
-    crash_on_warnings,
     disable_request_logging,
+    get_available_gpu_memory,
     get_bool_env_var,
     get_zmq_socket,
     kill_itself_when_parent_died,
@@ -142,8 +154,6 @@ from sglang.srt.utils import (
 )
 from sglang.utils import TypeBasedDispatcher, get_exception_traceback
 
-expert_distribution_recorder = ExpertDistributionRecorder()
-
 logger = logging.getLogger(__name__)
 
 # Test retract decode for debugging purposes
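
Note on the hunk above: the module-level `expert_distribution_recorder = ExpertDistributionRecorder()` singleton is dropped in favor of the `get_global_expert_distribution_recorder()` accessor imported earlier. A minimal sketch of that accessor pattern, with illustrative names (not the actual sglang implementation in `expert_distribution.py`):

    from typing import Optional

    class _ExpertDistributionRecorder:
        """Illustrative stand-in for the real recorder class."""
        def __init__(self) -> None:
            self.records: list = []

    _global_recorder: Optional[_ExpertDistributionRecorder] = None

    def get_global_expert_distribution_recorder() -> _ExpertDistributionRecorder:
        # Lazy creation: importing the module no longer constructs the
        # recorder as a side effect, and all callers share one instance.
        global _global_recorder
        if _global_recorder is None:
            _global_recorder = _ExpertDistributionRecorder()
        return _global_recorder
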
@@ -198,6 +208,7 @@ class Scheduler(
         self.enable_overlap = not server_args.disable_overlap_schedule
         self.skip_tokenizer_init = server_args.skip_tokenizer_init
         self.enable_metrics = server_args.enable_metrics
+        self.enable_kv_cache_events = server_args.kv_events_config is not None
         self.stream_interval = server_args.stream_interval
         self.spec_algorithm = SpeculativeAlgorithm.from_string(
             server_args.speculative_algorithm
@@ -205,8 +216,6 @@ class Scheduler(
         self.gpu_id = gpu_id
         self.enable_hierarchical_cache = server_args.enable_hierarchical_cache
         self.page_size = server_args.page_size
-
-        # Distributed rank info
         self.dp_size = server_args.dp_size
         self.attn_tp_rank, self.attn_tp_size, self.attn_dp_rank = (
             compute_dp_attention_world_info(
@@ -326,12 +335,16 @@ class Scheduler(
 
         # Print debug info
         if tp_rank == 0:
+            avail_mem = get_available_gpu_memory(
+                self.device, self.gpu_id, empty_cache=False
+            )
             logger.info(
                 f"max_total_num_tokens={self.max_total_num_tokens}, "
                 f"chunked_prefill_size={server_args.chunked_prefill_size}, "
                 f"max_prefill_tokens={self.max_prefill_tokens}, "
                 f"max_running_requests={self.max_running_requests}, "
-                f"context_len={self.model_config.context_len}"
+                f"context_len={self.model_config.context_len}, "
+                f"available_gpu_mem={avail_mem:.2f} GB"
             )
 
         # Init memory pool and cache
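
The startup log now also reports free GPU memory, fetched with `empty_cache=False` so the report does not perturb the allocator. A hedged sketch of what such a helper computes on CUDA (the real `get_available_gpu_memory` lives in `sglang/srt/utils.py` and also covers non-CUDA devices):

    import torch

    def available_gpu_memory_gb(gpu_id: int, empty_cache: bool = False) -> float:
        # Free device memory in GB, as reported by the CUDA driver.
        if empty_cache:
            torch.cuda.empty_cache()  # optionally release cached allocator blocks first
        free_bytes, _total_bytes = torch.cuda.mem_get_info(gpu_id)
        return free_bytes / (1 << 30)
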
@@ -349,12 +362,13 @@ class Scheduler(
         self.forward_ct_decode = 0
         self.num_generated_tokens = 0
         self.num_prefill_tokens = 0
-        self.last_decode_stats_tic = time.time()
-        self.last_prefill_stats_tic = time.time()
+        self.last_decode_stats_tic = time.perf_counter()
+        self.last_prefill_stats_tic = time.perf_counter()
         self.return_health_check_ct = 0
         self.current_stream = torch.get_device_module(self.device).current_stream()
         if self.device == "cpu":
             self.current_stream.synchronize = lambda: None  # No-op for CPU
+        self.forward_sleep_time = None
 
         # Init session info
         self.sessions: Dict[str, Session] = {}
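
The stats timestamps above (and several more below) move from `time.time()` to `time.perf_counter()`. The latter is a monotonic, high-resolution clock, so computed intervals cannot go negative or jump when the wall clock is adjusted (for example by NTP). A self-contained illustration of the interval pattern used here:

    import time

    last_tic = time.perf_counter()
    time.sleep(0.05)  # stand-in for a batch of forward passes
    gap_latency = time.perf_counter() - last_tic  # safe: monotonic clock
    throughput = 1000 / gap_latency  # e.g. tokens generated in the gap, per second
    print(f"gap={gap_latency:.4f}s, throughput={throughput:.0f} tok/s")
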
@@ -416,13 +430,20 @@ class Scheduler(
         self.torch_profiler = None
         self.torch_profiler_output_dir: Optional[str] = None
         self.profiler_activities: Optional[List[str]] = None
-        self.profiler_id: Optional[str] = None
+        self.profile_id: Optional[str] = None
         self.profiler_target_forward_ct: Optional[int] = None
-
-        self.forward_sleep_time = None
+        self.profiler_target_prefill_ct: Optional[int] = None
+        self.profiler_target_decode_ct: Optional[int] = None
+        self.profiler_prefill_ct: Optional[int] = None
+        self.profiler_decode_ct: Optional[int] = None
+        self.profile_by_stage: bool = False
+        self.profile_steps: Optional[int] = None
+        self.profile_in_progress: bool = False
+        self.rpd_profiler = None
 
         # Init metrics stats
         self.init_metrics()
+        self.init_kv_events(server_args.kv_events_config)
 
         # Init request dispatcher
         self._request_dispatcher = TypeBasedDispatcher(
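
`TypeBasedDispatcher` (imported from `sglang.utils` in an earlier hunk) routes each incoming request object to a handler keyed by the object's type. A minimal sketch of how such a dispatcher can be built; the shape is assumed from the call site, not copied from sglang:

    from typing import Any, Callable, List, Tuple, Type

    class TypeBasedDispatcher:
        def __init__(self, mapping: List[Tuple[Type, Callable]]) -> None:
            self._mapping = mapping

        def __call__(self, obj: Any) -> Any:
            # A linear scan is fine: the mapping is small and fixed at startup.
            for ty, fn in self._mapping:
                if isinstance(obj, ty):
                    return fn(obj)
            raise ValueError(f"no handler registered for {type(obj)}")
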
@@ -516,6 +537,7 @@ class Scheduler(
                 token_to_kv_pool_allocator=self.token_to_kv_pool_allocator,
                 page_size=self.page_size,
                 disable=server_args.disable_radix_cache,
+                enable_kv_cache_events=self.enable_kv_cache_events,
             )
 
         self.decode_mem_cache_buf_multiplier = (
@@ -548,6 +570,12 @@ class Scheduler(
             },
         )
 
+    def init_kv_events(self, kv_events_config: Optional[str]):
+        if self.enable_kv_cache_events:
+            self.kv_event_publisher = EventPublisherFactory.create(
+                kv_events_config, self.attn_dp_rank
+            )
+
     def init_disaggregation(self):
         self.transfer_backend = TransferBackend(
             self.server_args.disaggregation_transfer_backend
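
`init_kv_events` only builds a publisher when `kv_events_config` is set, so KV-cache event reporting costs nothing when disabled. A hedged sketch of the factory shape implied by the call site; the config keys and publisher classes below are assumptions, not the sglang `EventPublisherFactory`:

    import json
    from typing import Optional

    class NullKVEventPublisher:
        def publish(self, batch) -> None:
            pass  # events are silently dropped when the feature is off

    class ZmqKVEventPublisher:
        def __init__(self, endpoint: str, dp_rank: int) -> None:
            self.endpoint = endpoint
            self.dp_rank = dp_rank  # lets consumers tell DP ranks apart

        def publish(self, batch) -> None:
            ...  # serialize the KVEventBatch and send it out

    def create_kv_event_publisher(config: Optional[str], dp_rank: int):
        if config is None:
            return NullKVEventPublisher()
        cfg = json.loads(config)  # assumes a JSON config string
        return ZmqKVEventPublisher(cfg["endpoint"], dp_rank)
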
@@ -560,29 +588,28 @@ class Scheduler(
             req_to_metadata_buffer_idx_allocator = ReqToMetadataIdxAllocator(
                 buffer_size
             )
-            aux_dtype = torch.int32
-            # A list of metadata buffers. The shape is (b, metadata_size) where
-            # b corresponds to a max running requests. The last shape * dtype.itemsize
-            # should be larger than 64 bytes to work with RDMA, so we pad it.
-            output_id_buffer = torch.zeros(
-                (buffer_size, 16), dtype=aux_dtype, device="cpu"
-            )
-            metadata_buffers = [output_id_buffer]
+            self.disagg_metadata_buffers = MetadataBuffers(buffer_size)
 
             # The decode requests polling kv cache
             self.disagg_decode_transfer_queue = DecodeTransferQueue(
                 gloo_group=self.attn_tp_cpu_group,
                 req_to_metadata_buffer_idx_allocator=req_to_metadata_buffer_idx_allocator,
-                metadata_buffers=metadata_buffers,
+                metadata_buffers=self.disagg_metadata_buffers,
+                scheduler=self,
+                tree_cache=self.tree_cache,
             )
 
             # The decode requests pending for pre-allocation
             self.disagg_decode_prealloc_queue = DecodePreallocQueue(
                 req_to_token_pool=self.req_to_token_pool,
                 token_to_kv_pool_allocator=self.token_to_kv_pool_allocator,
+                draft_token_to_kv_pool=(
+                    None
+                    if self.draft_worker is None
+                    else self.draft_worker.model_runner.token_to_kv_pool
+                ),
                 req_to_metadata_buffer_idx_allocator=req_to_metadata_buffer_idx_allocator,
-                metadata_buffers=metadata_buffers,
-                aux_dtype=aux_dtype,
+                metadata_buffers=self.disagg_metadata_buffers,
                 scheduler=self,
                 transfer_queue=self.disagg_decode_transfer_queue,
                 tree_cache=self.tree_cache,
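
The removed comment documents the constraint the new `MetadataBuffers` class has to keep satisfying: each per-request row, times the dtype's item size, must reach 64 bytes for RDMA transfers, which is why the old buffer was padded to 16 int32 entries (16 * 4 = 64 bytes). A sketch of that sizing rule under the same assumption (illustrative helper, not the `MetadataBuffers` implementation):

    import torch

    RDMA_MIN_ROW_BYTES = 64

    def make_metadata_buffer(buffer_size: int, payload_elems: int = 1) -> torch.Tensor:
        # Round the row length up so row_elems * itemsize >= 64 bytes.
        itemsize = torch.empty((), dtype=torch.int32).element_size()  # 4 bytes
        min_elems = -(-RDMA_MIN_ROW_BYTES // itemsize)  # ceil division -> 16
        row_elems = max(payload_elems, min_elems)
        return torch.zeros((buffer_size, row_elems), dtype=torch.int32, device="cpu")

    # Matches the shape of the removed output_id_buffer:
    buf = make_metadata_buffer(buffer_size=128)  # -> torch.Size([128, 16])
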
@@ -602,20 +629,17 @@ class Scheduler(
             req_to_metadata_buffer_idx_allocator = ReqToMetadataIdxAllocator(
                 buffer_size
             )
-            aux_dtype = torch.int32
-            # A list of metadata buffers. The shape is (b, metadata_size) where
-            # b corresponds to a max running requests. The last shape * dtype.itemsize
-            # should be larger than 64 bytes to work with RDMA, so we pad it.
-            output_id_buffer = torch.zeros(
-                (buffer_size, 16), dtype=aux_dtype, device="cpu"
-            )
-            metadata_buffers = [output_id_buffer]
+            self.disagg_metadata_buffers = MetadataBuffers(buffer_size)
 
             self.disagg_prefill_bootstrap_queue = PrefillBootstrapQueue(
                 token_to_kv_pool=self.token_to_kv_pool_allocator.get_kvcache(),
+                draft_token_to_kv_pool=(
+                    None
+                    if self.draft_worker is None
+                    else self.draft_worker.model_runner.token_to_kv_pool
+                ),
                 req_to_metadata_buffer_idx_allocator=req_to_metadata_buffer_idx_allocator,
-                metadata_buffers=metadata_buffers,
-                aux_dtype=aux_dtype,
+                metadata_buffers=self.disagg_metadata_buffers,
                 tp_rank=self.tp_rank,
                 tp_size=self.tp_size,
                 bootstrap_port=self.server_args.disaggregation_bootstrap_port,
@@ -925,9 +949,22 @@ class Scheduler(
             bootstrap_host=recv_req.bootstrap_host,
             bootstrap_port=recv_req.bootstrap_port,
             bootstrap_room=recv_req.bootstrap_room,
+            data_parallel_rank=recv_req.data_parallel_rank,
         )
         req.tokenizer = self.tokenizer
 
+        if self.disaggregation_mode != DisaggregationMode.NULL:
+            # Invalid request for disaggregated mode
+            if recv_req.bootstrap_room is None:
+                error_msg = (
+                    f"Invalid request: Disaggregated request received without "
+                    f"boostrap room id. {req.rid=}"
+                )
+                logger.error(error_msg)
+                prepare_abort(req, error_msg)
+                self.stream_output([req], req.return_logprob)
+                return
+
         if (
             recv_req.session_params is not None
             and recv_req.session_params.id is not None
@@ -955,29 +992,23 @@ class Scheduler(
             req.extend_image_inputs(image_inputs)
 
         if len(req.origin_input_ids) >= self.max_req_input_len:
-            error_msg = (
-                "Multimodal prompt is too long after expanding multimodal tokens. "
-                f"After expanding {len(req.origin_input_ids_unpadded)=} => {len(req.origin_input_ids)} >= {self.max_req_input_len}."
-            )
-            logger.error(error_msg)
-            req.origin_input_ids = [0]
-            req.multimodal_inputs = None
-            req.sampling_params.max_new_tokens = 0
-            req.finished_reason = FINISH_ABORT(
-                error_msg, HTTPStatus.BAD_REQUEST, "BadRequestError"
+            req.set_finish_with_abort(
+                error_msg=(
+                    "Multimodal prompt is too long after expanding multimodal tokens. "
+                    f"After expanding {len(req.origin_input_ids_unpadded)=} => {len(req.origin_input_ids)} >= {self.max_req_input_len}."
+                )
             )
             self._add_request_to_queue(req)
             return
 
-        # Validate prompts length
+        # Validate prompt length
         error_msg = validate_input_length(
             req,
             self.max_req_input_len,
             self.server_args.allow_auto_truncate,
         )
         if error_msg:
-            req.origin_input_ids = [0]
-            req.sampling_params.max_new_tokens = 0
+            req.set_finish_with_abort(error_msg)
             self._add_request_to_queue(req)
             return
 
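
Both error paths above now route through `req.set_finish_with_abort(...)` instead of mutating several fields by hand. Based on the removed lines, the helper plausibly centralizes bookkeeping like the following (a sketch, not the actual method in `schedule_batch.py`):

    from dataclasses import dataclass, field
    from http import HTTPStatus
    import logging

    logger = logging.getLogger(__name__)

    @dataclass
    class FinishAbort:  # stand-in for sglang's FINISH_ABORT
        message: str
        status_code: HTTPStatus = HTTPStatus.BAD_REQUEST
        err_type: str = "BadRequestError"

    @dataclass
    class Req:  # reduced to the fields the removed call sites touched
        origin_input_ids: list = field(default_factory=list)
        multimodal_inputs: object = None
        max_new_tokens: int = 0
        finished_reason: object = None

        def set_finish_with_abort(self, error_msg: str) -> None:
            # One place for the abort bookkeeping the old call sites repeated.
            logger.error(error_msg)
            self.origin_input_ids = [0]  # dummy input keeps downstream code safe
            self.multimodal_inputs = None
            self.max_new_tokens = 0
            self.finished_reason = FinishAbort(error_msg)
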
@@ -989,12 +1020,9 @@ class Scheduler(
         req.logprob_start_len = recv_req.logprob_start_len
 
         if req.logprob_start_len >= len(req.origin_input_ids):
-            req.finished_reason = FINISH_ABORT(
-                f"logprob_start_len, ({req.logprob_start_len}) is higher than the number of input tokens ({len(req.origin_input_ids)}). Request with a lower logprob_start_len.",
-                HTTPStatus.BAD_REQUEST,
-                "BadRequestError",
-            )
+            error_msg = f"{req.logprob_start_len=} is higher than the number of input tokens {len(req.origin_input_ids)=}. Please use a smaller logprob_start_len."
             req.logprob_start_len = len(req.origin_input_ids) - 1
+            req.set_finish_with_abort(error_msg)
             self._add_request_to_queue(req)
             return
 
@@ -1031,15 +1059,19 @@ class Scheduler(
             if not cache_hit:
                 req.grammar_key = key
                 add_to_grammar_queue = True
+            else:
+                if value is INVALID_GRAMMAR_OBJ:  # We hit a cached invalid grammar.
+                    error_msg = f"Invalid grammar request with cache hit: {key=}"
+                    req.set_finish_with_abort(error_msg)
 
         if add_to_grammar_queue:
-            req.queue_time_start = time.time()
+            req.queue_time_start = time.perf_counter()
             self.grammar_queue.append(req)
         else:
             self._add_request_to_queue(req)
 
     def _add_request_to_queue(self, req: Req):
-        req.queue_time_start = time.time()
+        req.queue_time_start = time.perf_counter()
         if self.disaggregation_mode == DisaggregationMode.PREFILL:
             self.disagg_prefill_bootstrap_queue.add(req)
         elif self.disaggregation_mode == DisaggregationMode.DECODE:
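
`INVALID_GRAMMAR_OBJ` works as a negative-cache sentinel here: once a grammar fails to compile, the failure itself is cached, and later requests with the same key are aborted immediately via the identity check `value is INVALID_GRAMMAR_OBJ` instead of recompiling. A generic sketch of the pattern:

    INVALID = object()  # identity-compared sentinel, like INVALID_GRAMMAR_OBJ
    _cache: dict = {}

    def compile_grammar(key: str):
        raise ValueError("bad grammar")  # placeholder for a real compiler

    def get_grammar(key: str):
        if key in _cache:
            value = _cache[key]
            if value is INVALID:  # cached failure: reject without recompiling
                raise ValueError(f"invalid grammar (cache hit): {key=}")
            return value
        try:
            value = compile_grammar(key)
        except ValueError:
            _cache[key] = INVALID  # remember the failure, not just successes
            raise
        _cache[key] = value
        return value
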
@@ -1047,8 +1079,11 @@ class Scheduler(
         else:
             self.waiting_queue.append(req)
 
-    def _extend_requests_to_queue(self, reqs: List[Req], is_retracted: bool = False):
-        if self.disaggregation_mode == DisaggregationMode.DECODE:
+    def _extend_requests_to_queue(self, reqs: List[Req]):
+        if self.disaggregation_mode == DisaggregationMode.PREFILL:
+            self.disagg_prefill_bootstrap_queue.extend(reqs)
+        elif self.disaggregation_mode == DisaggregationMode.DECODE:
+            # If this is a decode server, we put the request to the decode pending prealloc queue
             self.disagg_decode_prealloc_queue.extend(reqs)
         else:
             self.waiting_queue.extend(reqs)
@@ -1075,19 +1110,13 @@ class Scheduler(
             req.extend_image_inputs(image_inputs)

             if len(req.origin_input_ids) >= self.max_req_input_len:
-                error_msg = (
-                    "Multimodal prompt is too long after expanding multimodal tokens. "
-                    f"After expanding {len(req.origin_input_ids_unpadded)=} => {len(req.origin_input_ids)} >= {self.max_req_input_len}."
-                )
-                logger.error(error_msg)
-                req.origin_input_ids = [0]
-                req.multimodal_inputs = None
-                req.sampling_params.max_new_tokens = 0
-                req.finished_reason = FINISH_ABORT(
-                    error_msg, HTTPStatus.BAD_REQUEST, "BadRequestError"
+                req.set_finish_with_abort(
+                    error_msg=(
+                        "Multimodal prompt is too long after expanding multimodal tokens. "
+                        f"After expanding {len(req.origin_input_ids_unpadded)=} => {len(req.origin_input_ids)} >= {self.max_req_input_len}."
+                    )
                 )
-                req.queue_time_start = time.time()
-                self.waiting_queue.append(req)
+                self._add_request_to_queue(req)
                 return

             # Validate prompts length
@@ -1110,8 +1139,8 @@ class Scheduler(
         can_run_list: List[Req],
         running_bs: int,
     ):
-        gap_latency = time.time() - self.last_prefill_stats_tic
-        self.last_prefill_stats_tic = time.time()
+        gap_latency = time.perf_counter() - self.last_prefill_stats_tic
+        self.last_prefill_stats_tic = time.perf_counter()
         self.last_input_throughput = self.num_prefill_tokens / gap_latency
         self.num_prefill_tokens = 0

@@ -1133,7 +1162,8 @@ class Scheduler(
         if self.disaggregation_mode == DisaggregationMode.PREFILL:
             f += f"#unbootstrapped-req: {len(self.disagg_prefill_bootstrap_queue.queue)}, "
             f += f"#queue-req: {len(self.waiting_queue)}, "
-            f += f"#transferring-req: {len(self.disagg_prefill_inflight_queue)} "
+            f += f"#transferring-req: {len(self.disagg_prefill_inflight_queue)}, "
+            f += f"time: {gap_latency:.2f} "
         else:
             f += f"#queue-req: {len(self.waiting_queue)}"

@@ -1155,14 +1185,15 @@ class Scheduler(
             self.stats.avg_request_queue_latency = total_queue_latency / num_new_seq

         self.metrics_collector.log_stats(self.stats)
+        self._publish_kv_events()

     def log_decode_stats(
         self, can_run_cuda_graph: bool, running_batch: ScheduleBatch = None
     ):
         batch = running_batch or self.running_batch

-        gap_latency = time.time() - self.last_decode_stats_tic
-        self.last_decode_stats_tic = time.time()
+        gap_latency = time.perf_counter() - self.last_decode_stats_tic
+        self.last_decode_stats_tic = time.perf_counter()
         self.last_gen_throughput = self.num_generated_tokens / gap_latency
         self.num_generated_tokens = 0
         num_running_reqs = len(batch.reqs)
@@ -1214,6 +1245,7 @@ class Scheduler(
         self.stats.num_grammar_queue_reqs = len(self.grammar_queue)
         self.stats.spec_accept_length = spec_accept_length
         self.metrics_collector.log_stats(self.stats)
+        self._publish_kv_events()

     def check_memory(self):
         available_size = (
@@ -1246,7 +1278,7 @@ class Scheduler(
         if (
             self.enable_metrics
             and self.attn_tp_rank == 0
-            and time.time() > self.metrics_collector.last_log_time + 30
+            and time.perf_counter() > self.metrics_collector.last_log_time + 30
         ):
             # During idle time, also collect metrics every 30 seconds.
             num_used = self.max_total_num_tokens - (
@@ -1261,6 +1293,7 @@ class Scheduler(
             self.stats.num_queue_reqs = len(self.waiting_queue)
             self.stats.num_grammar_queue_reqs = len(self.grammar_queue)
             self.metrics_collector.log_stats(self.stats)
+            self._publish_kv_events()

     def get_next_batch_to_run(self) -> Optional[ScheduleBatch]:
         # Merge the prefill batch into the running batch
@@ -1383,6 +1416,13 @@ class Scheduler(
                 self.running_batch.batch_is_full = True
                 break

+            if self.disaggregation_mode == DisaggregationMode.PREFILL:
+                # In prefill mode, the prealloc queue and the transfer queue can also
+                # hold memory, so check against the actual available size of the pool.
+                if len(adder.can_run_list) >= self.req_to_token_pool.available_size():
+                    self.running_batch.batch_is_full = True
+                    break
+
             req.init_next_round_input(
                 None if prefix_computed else self.tree_cache,
                 self.enable_hierarchical_cache,
@@ -1411,7 +1451,7 @@ class Scheduler(
         if self.enable_metrics:
             # only record queue time when enable_metrics is True to avoid overhead
             for req in can_run_list:
-                req.queue_time_end = time.time()
+                req.queue_time_end = time.perf_counter()

         self.waiting_queue = [
             x for x in self.waiting_queue if x not in set(can_run_list)
@@ -1484,7 +1524,7 @@ class Scheduler(
         self.new_token_ratio = new_token_ratio

         logger.info(
-            "Decode out of memory happened. "
+            "KV cache pool is full. Retract requests. "
             f"#retracted_reqs: {len(retracted_reqs)}, "
             f"#new_token_ratio: {old_ratio:.4f} -> {self.new_token_ratio:.4f}"
         )
@@ -1508,13 +1548,8 @@ class Scheduler(
         """Run a batch."""
         self.forward_ct += 1

-        # Check profiler
-        if (
-            self.profiler_target_forward_ct
-            and self.profiler_target_forward_ct <= self.forward_ct
-        ):
-            self.stop_profile()
-
+        # Decide whether to start/stop the profiler for this batch
+        self._profile_batch_predicate(batch)
         if self.forward_sleep_time is not None:
             logger.info(f"Scheduler.run_batch sleep {self.forward_sleep_time}s")
             time.sleep(self.forward_sleep_time)
@@ -1540,10 +1575,9 @@ class Scheduler(
                 num_accepted_tokens,
                 can_run_cuda_graph,
             ) = self.draft_worker.forward_batch_speculative_generation(batch)
-            self.spec_num_total_accepted_tokens += (
-                num_accepted_tokens + batch.batch_size()
-            )
-            self.spec_num_total_forward_ct += batch.batch_size()
+            bs = batch.batch_size()
+            self.spec_num_total_accepted_tokens += num_accepted_tokens + bs
+            self.spec_num_total_forward_ct += bs
             self.num_generated_tokens += num_accepted_tokens

         if self.pp_group.is_last_rank:
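The refactor above only hoists `batch.batch_size()` into a local, but the accounting it feeds is worth spelling out: each verified forward pass contributes one base token per request plus the accepted draft tokens. A toy sketch using the counter names from the diff:

    # Sketch: how the speculative-decoding counters relate to accept length.
    spec_num_total_accepted_tokens = 0
    spec_num_total_forward_ct = 0

    for num_accepted_tokens, bs in [(3, 2), (5, 2)]:   # toy forward passes
        spec_num_total_accepted_tokens += num_accepted_tokens + bs
        spec_num_total_forward_ct += bs

    avg_accept_length = spec_num_total_accepted_tokens / spec_num_total_forward_ct
    print(avg_accept_length)  # 3.0 tokens emitted per request per forward pass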
@@ -1617,6 +1651,9 @@ class Scheduler(
             disable_cuda_graph=self.server_args.disable_cuda_graph,
             spec_algorithm=self.spec_algorithm,
             speculative_num_draft_tokens=self.server_args.speculative_num_draft_tokens,
+            enable_two_batch_overlap=self.server_args.enable_two_batch_overlap,
+            enable_deepep_moe=self.server_args.enable_deepep_moe,
+            deepep_mode=DeepEPMode[self.server_args.deepep_mode],
         )

     @staticmethod
@@ -1630,6 +1667,9 @@ class Scheduler(
         disable_cuda_graph: bool,
         spec_algorithm,
         speculative_num_draft_tokens,
+        enable_two_batch_overlap: bool,
+        enable_deepep_moe: bool,
+        deepep_mode: DeepEPMode,
     ):
         # Check if other DP workers have running batches
         if local_batch is None:
@@ -1665,17 +1705,26 @@ class Scheduler(
         is_extend_in_batch = (
             local_batch.forward_mode.is_extend() if local_batch else False
         )
+
+        tbo_preparer = TboDPAttentionPreparer()
+
         local_info = torch.tensor(
             [
                 num_tokens,
                 can_cuda_graph,
                 num_tokens_for_logprob,
                 is_extend_in_batch,
+                *tbo_preparer.prepare_all_gather(
+                    local_batch,
+                    deepep_mode,
+                    enable_deepep_moe,
+                    enable_two_batch_overlap,
+                ),
             ],
             dtype=torch.int64,
         )
         global_info = torch.empty(
-            (dp_size, attn_tp_size, 4),
+            (dp_size, attn_tp_size, 6),
             dtype=torch.int64,
         )
         torch.distributed.all_gather_into_tensor(
@@ -1688,6 +1737,10 @@ class Scheduler(
         global_num_tokens_for_logprob = global_info[:, 0, 2].tolist()
         is_extend_in_batch = global_info[:, 0, 3].tolist()

+        tbo_split_seq_index, global_forward_mode = tbo_preparer.compute_output(
+            global_info[:, :, 4:6]
+        )
+
         if local_batch is None and max(global_num_tokens) > 0:
             local_batch = get_idle_batch()

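The widened all-gather (4 to 6 int64 fields per rank) piggybacks the two-batch-overlap metadata on the existing synchronization instead of issuing a second collective. A self-contained sketch of the pattern, assuming an initialized process group; `tbo_a`/`tbo_b` are placeholder names, not sglang identifiers:

    import torch
    import torch.distributed as dist

    def gather_scheduler_info(num_tokens: int, tbo_a: int, tbo_b: int,
                              world_size: int) -> torch.Tensor:
        # Each rank contributes one fixed-width int64 row; the extra fields ride
        # along in the same collective, and receivers slice them out by column.
        local_info = torch.tensor([num_tokens, tbo_a, tbo_b], dtype=torch.int64)
        global_info = torch.empty((world_size, 3), dtype=torch.int64)
        dist.all_gather_into_tensor(global_info, local_info)
        return global_info[:, 1:3]  # the piggybacked columns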
@@ -1701,6 +1754,8 @@ class Scheduler(
             local_batch.global_num_tokens_for_logprob = (
                 global_num_tokens_for_logprob
             )
+            local_batch.tbo_split_seq_index = tbo_split_seq_index
+            local_batch.global_forward_mode = global_forward_mode

         # Check forward mode for cuda graph
         if not disable_cuda_graph:
@@ -1726,17 +1781,25 @@ class Scheduler(
         """Move requests whose grammar objects are ready from grammar_queue to waiting_queue."""

         num_ready_reqs = 0
-        num_abort_reqs = 0
+        num_timeout_reqs = 0
         for req in self.grammar_queue:
             try:
+                if req.finished():  # It was aborted by AbortReq
+                    num_ready_reqs += 1
+                    continue
                 req.grammar = req.grammar.result(timeout=0.03)
-                if req.grammar:
-                    self.grammar_backend.set_cache(req.grammar_key, req.grammar.copy())
+                self.grammar_backend.set_cache(req.grammar_key, req.grammar.copy())
+                if req.grammar is INVALID_GRAMMAR_OBJ:
+                    req.set_finish_with_abort(
+                        f"Invalid grammar request: {req.grammar_key=}"
+                    )
                 num_ready_reqs += 1
             except futures._base.TimeoutError:
                 req.grammar_wait_ct += 1
+                # NOTE(lianmin): this timeout is the waiting time of the line above;
+                # it is not the waiting time since the request entered the grammar queue.
                 if req.grammar_wait_ct > GRAMMAR_TIMEOUT / 0.03:
-                    num_abort_reqs = 1
+                    num_timeout_reqs = 1
                 break

         if self.server_args.enable_dp_attention:
@@ -1748,28 +1811,33 @@ class Scheduler(

         if tp_size > 1:
             # Sync across TP ranks to make sure they have the same number of ready requests
-            tensor = torch.tensor([num_ready_reqs, num_abort_reqs], dtype=torch.int32)
+            tensor = torch.tensor([num_ready_reqs, num_timeout_reqs], dtype=torch.int32)
             torch.distributed.all_reduce(
                 tensor, op=torch.distributed.ReduceOp.MAX, group=tp_group
             )
-            num_ready_reqs_max, num_abort_reqs_max = tensor.tolist()
+            num_ready_reqs_max, num_timeout_reqs_max = tensor.tolist()

             for i in range(num_ready_reqs, num_ready_reqs_max):
                 req = self.grammar_queue[i]
+                if req.finished():  # It was aborted by AbortReq
+                    continue
                 req.grammar = req.grammar.result()
-                if req.grammar:
-                    self.grammar_backend.set_cache(req.grammar_key, req.grammar.copy())
+                self.grammar_backend.set_cache(req.grammar_key, req.grammar.copy())
+                if req.grammar is INVALID_GRAMMAR_OBJ:
+                    req.set_finish_with_abort(
+                        f"Invalid grammar request: {req.grammar_key=}"
+                    )
+        else:
+            num_ready_reqs_max = num_ready_reqs
+            num_timeout_reqs_max = num_timeout_reqs

-        for i in range(num_ready_reqs, num_ready_reqs + num_abort_reqs_max):
-            req = self.grammar_queue[i]
-            req.grammar.cancel()
-            req.grammar = None
-            error_msg = f"Grammar preprocessing timed out for {req.grammar_key=}"
-            logger.error(error_msg)
-            req.finished_reason = FINISH_ABORT(
-                error_msg, HTTPStatus.BAD_REQUEST, "BadRequestError"
-            )
-        num_ready_reqs = num_ready_reqs_max + num_abort_reqs_max
+        for i in range(num_ready_reqs, num_ready_reqs + num_timeout_reqs_max):
+            req = self.grammar_queue[i]
+            req.grammar.cancel()
+            error_msg = f"Grammar preprocessing timed out for {req.grammar_key=}"
+            req.set_finish_with_abort(error_msg)
+            self.grammar_backend.set_cache(req.grammar_key, INVALID_GRAMMAR_OBJ)
+        num_ready_reqs = num_ready_reqs_max + num_timeout_reqs_max

         self._extend_requests_to_queue(self.grammar_queue[:num_ready_reqs])
         self.grammar_queue = self.grammar_queue[num_ready_reqs:]
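Grammar compilation runs asynchronously, and the loop above polls each Future with a 30 ms budget, counting consecutive timeouts until GRAMMAR_TIMEOUT is exceeded. The core pattern, using only the public concurrent.futures API (the executor and the compile lambda are stand-ins, not sglang code):

    from concurrent import futures

    executor = futures.ThreadPoolExecutor(max_workers=1)
    fut = executor.submit(lambda: "compiled-grammar")   # stand-in for compilation

    wait_ct = 0
    try:
        grammar = fut.result(timeout=0.03)   # wait at most 30 ms per pass
    except futures.TimeoutError:
        wait_ct += 1                         # retried on a later scheduler pass
        if wait_ct > 10 / 0.03:              # roughly GRAMMAR_TIMEOUT seconds total
            fut.cancel()                     # give up and abort the request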
@@ -1784,10 +1852,10 @@ class Scheduler(
     def watchdog_thread(self):
         """A watch dog thread that will try to kill the server itself if one forward batch takes too long."""
         self.watchdog_last_forward_ct = 0
-        self.watchdog_last_time = time.time()
+        self.watchdog_last_time = time.perf_counter()

         while True:
-            current = time.time()
+            current = time.perf_counter()
             if self.cur_batch is not None:
                 if self.watchdog_last_forward_ct == self.forward_ct:
                     if current > self.watchdog_last_time + self.watchdog_timeout:
@@ -1856,6 +1924,27 @@ class Scheduler(
             if_success = False
         return if_success

+    def get_load(self):
+        # TODO(lsyin): use dynamically maintained num_waiting_tokens
+        load = (
+            self.max_total_num_tokens
+            - self.token_to_kv_pool_allocator.available_size()
+            - self.tree_cache.evictable_size()
+        )
+        load += sum(len(req.origin_input_ids) for req in self.waiting_queue)
+        if self.disaggregation_mode == DisaggregationMode.PREFILL:
+            load += sum(
+                len(req.origin_input_ids)
+                for req in self.disagg_prefill_bootstrap_queue.queue
+            )
+        elif self.disaggregation_mode == DisaggregationMode.DECODE:
+            load += sum(
+                len(req.req.origin_input_ids)
+                for req in self.disagg_decode_prealloc_queue.queue
+            )
+
+        return load
+
     def get_internal_state(self, recv_req: GetInternalStateReq):
         ret = dict(global_server_args_dict)
         ret["last_gen_throughput"] = self.last_gen_throughput
@@ -1865,9 +1954,10 @@ class Scheduler(
         )
         if RECORD_STEP_TIME:
             ret["step_time_dict"] = self.step_time_dict
-        return GetInternalStateReqOutput(
-            internal_state=ret,
-        )
+
+        ret["load"] = self.get_load()
+
+        return GetInternalStateReqOutput(internal_state=ret)

     def set_internal_state(self, recv_req: SetInternalStateReq):
         server_args_dict = recv_req.server_args
@@ -1901,7 +1991,7 @@ class Scheduler(
         self.cum_spec_accept_length = self.cum_spec_accept_count = 0
         for k, v in server_args_dict.items():
             global_server_args_dict[k] = v
-        logger.info(f"Global server args updated! " f"{global_server_args_dict=}")
+        logger.info(f"Global server args updated! {global_server_args_dict=}")
         return SetInternalStateReqOutput(
             updated=True,
             server_args=global_server_args_dict,
@@ -1943,8 +2033,6 @@ class Scheduler(
         )

     def abort_request(self, recv_req: AbortReq):
-        # TODO(lmzheng): abort the requests in the grammar queue.
-
         # Delete requests in the waiting queue
         to_del = []
         for i, req in enumerate(self.waiting_queue):
@@ -1953,10 +2041,23 @@ class Scheduler(

         # Sort in reverse order to avoid index issues when deleting
         for i in reversed(to_del):
+            # Abort method 1: directly pop from the queue
+            # This only works for requests that have not started anything.
+            # We still need to send something back to TokenizerManager to clean up the state.
             req = self.waiting_queue.pop(i)
             self.send_to_tokenizer.send_pyobj(AbortReq(req.rid))
             logger.debug(f"Abort queued request. {req.rid=}")

+        # Delete the requests in the grammar queue
+        for req in self.grammar_queue:
+            # Abort method 2: call `set_finish_with_abort`
+            # The request will still run one prefill forward pass.
+            # In this case, we change the input_ids to be only one token to make this prefill cheap.
+            if req.rid.startswith(recv_req.rid):
+                logger.debug(f"Abort grammar queue request. {req.rid=}")
+                req.grammar.cancel()
+                req.set_finish_with_abort("Aborted by AbortReq.")
+
         # Delete requests in the running batch
         if self.cur_batch is self.running_batch or self.cur_batch is None:
             reqs = self.running_batch.reqs
@@ -1965,6 +2066,9 @@ class Scheduler(

         for req in reqs:
             if req.rid.startswith(recv_req.rid) and not req.finished():
+                # Abort method 3: set `to_abort=True`
+                # The request will still run one decode forward pass.
+                # Then we reuse all existing code to clean up the KV cache allocation.
                 logger.debug(f"Abort running request. {req.rid=}")
                 req.to_abort = True

@@ -2044,46 +2148,86 @@ class Scheduler(

     def profile(self, recv_req: ProfileReq):
         if recv_req.type == ProfileReqType.START_PROFILE:
-            return self.start_profile(
-                recv_req.output_dir,
-                recv_req.num_steps,
-                recv_req.activities,
-                recv_req.with_stack,
-                recv_req.record_shapes,
-                recv_req.profile_id,
-            )
+            if recv_req.profile_by_stage:
+                return self.init_profile(
+                    recv_req.output_dir,
+                    recv_req.num_steps,
+                    recv_req.activities,
+                    recv_req.with_stack,
+                    recv_req.record_shapes,
+                    recv_req.profile_by_stage,
+                    recv_req.profile_id,
+                )
+            else:
+                self.init_profile(
+                    recv_req.output_dir,
+                    recv_req.num_steps,
+                    recv_req.activities,
+                    recv_req.with_stack,
+                    recv_req.record_shapes,
+                    recv_req.profile_by_stage,
+                    recv_req.profile_id,
+                )
+                return self.start_profile(True)
         else:
             return self.stop_profile()

-    def start_profile(
+    def init_profile(
         self,
         output_dir: Optional[str],
         num_steps: Optional[int],
         activities: Optional[List[str]],
         with_stack: Optional[bool],
         record_shapes: Optional[bool],
-        profile_id: Optional[str],
-    ) -> None:
-        if self.profiler_activities:
+        profile_by_stage: bool,
+        profile_id: str,
+    ) -> ProfileReqOutput:
+        if self.profile_in_progress:
             return ProfileReqOutput(
                 success=False,
                 message="Profiling is already in progress. Call /stop_profile first.",
             )

+        self.profile_by_stage = profile_by_stage
+
         if output_dir is None:
             output_dir = os.getenv("SGLANG_TORCH_PROFILER_DIR", "/tmp")
         if activities is None:
             activities = ["CPU", "GPU"]

         self.torch_profiler_output_dir = output_dir
+        self.torch_profiler_with_stack = with_stack
+        self.torch_profiler_record_shapes = record_shapes
         self.profiler_activities = activities
-        self.profiler_id = profile_id
+        self.profile_id = profile_id
+
+        if num_steps:
+            self.profile_steps = num_steps
+            if self.profile_by_stage:
+                self.profiler_target_prefill_ct = num_steps
+                self.profiler_target_decode_ct = num_steps
+                self.profiler_prefill_ct = 0
+                self.profiler_decode_ct = 0
+            else:
+                self.profiler_target_forward_ct = self.forward_ct + num_steps
+                # The caller will be notified when reaching profiler_target_forward_ct
+        else:
+            self.profiler_target_forward_ct = None
+
+        return ProfileReqOutput(success=True, message="Succeeded")
+
+    def start_profile(
+        self, stage: Optional[ForwardMode] = None
+    ) -> ProfileReqOutput | None:
+        stage_str = f" for {stage.__str__()}" if stage else ""
         logger.info(
-            "Profiling starts. Traces will be saved to: %s (with id %s)",
-            self.torch_profiler_output_dir,
-            self.profiler_id,
+            f"Profiling starts{stage_str}. Traces will be saved to: {self.torch_profiler_output_dir} (with profile id: {self.profile_id})",
         )

+        activities = self.profiler_activities
+        with_stack = self.torch_profiler_with_stack
+        record_shapes = self.torch_profiler_record_shapes
+
         activity_map = {
             "CPU": torch.profiler.ProfilerActivity.CPU,
             "GPU": torch.profiler.ProfilerActivity.CUDA,
@@ -2092,45 +2236,100 @@ class Scheduler(
             activity_map[a] for a in activities if a in activity_map
         ]

-        if torchprof_activities:
+        if "RPD" in activities:
+            from rpdTracerControl import rpdTracerControl
+
+            rpdTracerControl.skipCreate()
+
+            self.rpd_profile_path = os.path.join(
+                self.torch_profiler_output_dir,
+                "rpd-" + str(time.time()) + f"-TP-{self.tp_rank}" + ".trace.json.gz",
+            )
+
+            if self.tp_rank == 0:
+                import sqlite3
+
+                from rocpd.schema import RocpdSchema
+
+                if os.path.exists("trace.rpd"):
+                    os.unlink("trace.rpd")
+                schema = RocpdSchema()
+                connection = sqlite3.connect("trace.rpd")
+                schema.writeSchema(connection)
+                connection.commit()
+                del connection
+            torch.distributed.barrier(self.tp_cpu_group)
+
+            self.rpd_profiler = rpdTracerControl()
+            self.rpd_profiler.setPythonTrace(True)
+            self.rpd_profiler.start()
+            self.rpd_profiler.rangePush("", "rpd profile range", "")
+            self.profile_in_progress = True
+        elif torchprof_activities:
             self.torch_profiler = torch.profiler.profile(
                 activities=torchprof_activities,
                 with_stack=with_stack if with_stack is not None else True,
                 record_shapes=record_shapes if record_shapes is not None else False,
             )
             self.torch_profiler.start()
+            self.profile_in_progress = True

         if "MEM" in activities:
             torch.cuda.memory._record_memory_history(max_entries=100000)
+            self.profile_in_progress = True

         if "CUDA_PROFILER" in activities:
             torch.cuda.cudart().cudaProfilerStart()

-        if num_steps:
-            self.profiler_target_forward_ct = self.forward_ct + num_steps
-            # The caller will be notified when reaching profiler_target_forward_ct
-        else:
-            self.profiler_target_forward_ct = None
-            return ProfileReqOutput(success=True, message="Succeeded")
+        return ProfileReqOutput(success=True, message="Succeeded")

-    def stop_profile(self) -> None:
-        if self.profiler_activities is None:
-            return
+    def stop_profile(
+        self, stage: Optional[ForwardMode] = None
+    ) -> ProfileReqOutput | None:
+        if not self.profile_in_progress:
+            return ProfileReqOutput(
+                success=False,
+                message="Profiling is not in progress. Call /start_profile first.",
+            )

-        logger.info("Stop profiling...")
+        if not Path(self.torch_profiler_output_dir).exists():
+            Path(self.torch_profiler_output_dir).mkdir(parents=True, exist_ok=True)
+
+        stage_suffix = f"-{stage.__str__()}" if stage else ""
+        logger.info("Stop profiling" + stage_suffix + "...")
         if self.torch_profiler is not None:
             self.torch_profiler.stop()
             self.torch_profiler.export_chrome_trace(
                 os.path.join(
                     self.torch_profiler_output_dir,
-                    self.profiler_id + f"-TP-{self.tp_rank}" + ".trace.json.gz",
+                    self.profile_id
+                    + f"-TP-{self.tp_rank}"
+                    + stage_suffix
+                    + ".trace.json.gz",
                 )
             )
+            torch.distributed.barrier(self.tp_cpu_group)
+
+        if self.rpd_profiler is not None:
+            self.rpd_profiler.rangePop()
+            self.rpd_profiler.stop()
+            self.rpd_profiler.flush()

-        if "MEM" in self.profiler_activities:
+            torch.distributed.barrier(self.tp_cpu_group)
+            if self.tp_rank == 0:
+                from sglang.srt.utils import rpd_to_chrome_trace
+
+                rpd_to_chrome_trace("trace.rpd", self.rpd_profile_path)
+            self.rpd_profiler = None
+            self.rpd_profiler_path = None
+
+        if self.profiler_activities is not None and "MEM" in self.profiler_activities:
             memory_profile_path = os.path.join(
                 self.torch_profiler_output_dir,
-                self.profiler_id + f"-TP-{self.tp_rank}-memory" + ".pickle",
+                str(time.time())
+                + f"-TP-{self.tp_rank}-memory"
+                + stage_suffix
+                + ".pickle",
             )
             torch.cuda.memory._dump_snapshot(memory_profile_path)
             torch.cuda.memory._record_memory_history(enabled=None)
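The "MEM" activity uses PyTorch's private but long-standing CUDA memory-history hooks, the same three calls visible in the hunk. End to end they look like this (requires a CUDA build of PyTorch; paths are placeholders):

    import torch

    torch.cuda.memory._record_memory_history(max_entries=100000)  # start recording
    x = torch.randn(1024, 1024, device="cuda")                    # some allocations
    torch.cuda.memory._dump_snapshot("/tmp/example-memory.pickle")
    torch.cuda.memory._record_memory_history(enabled=None)        # stop recording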
@@ -2143,21 +2342,46 @@ class Scheduler(
                 self.torch_profiler_output_dir,
             )
         self.torch_profiler = None
-        self.torch_profiler_output_dir = None
-        self.profiler_activities = None
-
-        if self.profiler_target_forward_ct:
-            self.send_to_tokenizer.send_pyobj(
-                ProfileReqOutput(success=True, message="Succeeded.")
-            )
+        self.profile_in_progress = False
+
+        return ProfileReqOutput(success=True, message="Succeeded.")
+
+    def _profile_batch_predicate(self, batch):
+        if self.profile_by_stage:
+            if batch.forward_mode.is_prefill():
+                if self.profiler_prefill_ct == 0:
+                    self.start_profile(batch.forward_mode)
+                self.profiler_prefill_ct += 1
+                if self.profiler_prefill_ct > self.profiler_target_prefill_ct:
+                    if self.profile_in_progress:
+                        self.stop_profile(stage=ForwardMode.EXTEND)
+            elif batch.forward_mode.is_decode():
+                if self.profiler_decode_ct == 0:
+                    if self.profile_in_progress:
+                        # force trace flush
+                        self.stop_profile(ForwardMode.EXTEND)
+                    self.start_profile(batch.forward_mode)
+                self.profiler_decode_ct += 1
+                if self.profiler_decode_ct > self.profiler_target_decode_ct:
+                    if self.profile_in_progress:
+                        self.stop_profile(stage=ForwardMode.DECODE)
+            else:
+                raise RuntimeError("unsupported profile stage")
+        else:
+            # Check profiler
+            if (
+                self.profiler_target_forward_ct
+                and self.profiler_target_forward_ct <= self.forward_ct
+            ):
+                self.stop_profile()

     def expert_distribution_handle(self, recv_req: ExpertDistributionReq):
         if recv_req == ExpertDistributionReq.START_RECORD:
-            expert_distribution_recorder.start_record()
+            get_global_expert_distribution_recorder().start_record()
         elif recv_req == ExpertDistributionReq.STOP_RECORD:
-            expert_distribution_recorder.stop_record()
+            get_global_expert_distribution_recorder().stop_record()
         elif recv_req == ExpertDistributionReq.DUMP_RECORD:
-            expert_distribution_recorder.dump_record()
+            get_global_expert_distribution_recorder().dump_record()
         else:
             raise ValueError("Unrecognized ExpertDistributionReq value")
         return ExpertDistributionReqOutput()
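With `_profile_batch_predicate` in place, stage-based profiling is driven from the HTTP server. A hypothetical client-side session against sglang's /start_profile endpoint; the JSON field names follow the ProfileReq fields in this diff, but the exact request schema is an assumption:

    import requests

    base = "http://localhost:30000"   # assumed server address
    requests.post(f"{base}/start_profile",
                  json={"num_steps": 5, "profile_by_stage": True})
    # ... send generation requests; prefill and decode are traced separately ...
    requests.post(f"{base}/stop_profile")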
@@ -2195,6 +2419,13 @@ class Scheduler(
             prefix += f" PP{self.pp_rank}"
         return prefix

+    def _publish_kv_events(self):
+        if self.enable_kv_cache_events:
+            events = self.tree_cache.take_events()
+            if events:
+                batch = KVEventBatch(ts=time.time(), events=events)
+                self.kv_event_publisher.publish(batch)
+

 def is_health_check_generate_req(recv_req):
     return getattr(recv_req, "rid", "").startswith("HEALTH_CHECK")
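`_publish_kv_events` pushes KVEventBatch objects (from sglang.srt.disaggregation.kv_events, added in this release) whenever the radix tree reports cache changes. A minimal consumer-side sketch; the handler below is illustrative, not part of the sglang API:

    def handle_kv_events(batch):  # batch: KVEventBatch
        # Inspect cache-change events, e.g. to mirror prefix-cache state
        # in an external router.
        for event in batch.events:
            print(batch.ts, type(event).__name__)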
@@ -2250,6 +2481,10 @@ def run_scheduler_process(
     if get_bool_env_var("SGLANG_SET_CPU_AFFINITY"):
         set_gpu_proc_affinity(server_args.tp_size, server_args.nnodes, gpu_id)

+    embedding_cache_size = 100
+    if "SGLANG_VLM_CACHE_SIZE_MB" in os.environ:
+        embedding_cache_size = int(os.environ["SGLANG_VLM_CACHE_SIZE_MB"])
+    init_embedding_cache(embedding_cache_size * 1024 * 1024)
     # Create a scheduler and run the event loop
     try:
         scheduler = Scheduler(server_args, port_args, gpu_id, tp_rank, pp_rank, dp_rank)
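The new SGLANG_VLM_CACHE_SIZE_MB knob sizes the multimodal embedding cache in megabytes (default 100 MB) and must be set in the environment before the scheduler process starts, for example:

    import os

    os.environ["SGLANG_VLM_CACHE_SIZE_MB"] = "256"  # 256 MB embedding cache
    # ... then launch the server process as usual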