sglang 0.4.6.post5__py3-none-any.whl → 0.4.7__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (318)
  1. sglang/bench_offline_throughput.py +10 -4
  2. sglang/bench_one_batch_server.py +67 -11
  3. sglang/bench_serving.py +85 -74
  4. sglang/lang/backend/runtime_endpoint.py +24 -1
  5. sglang/profiler.py +167 -0
  6. sglang/srt/_custom_ops.py +34 -0
  7. sglang/srt/configs/internvl.py +8 -12
  8. sglang/srt/configs/model_config.py +27 -1
  9. sglang/srt/constrained/base_grammar_backend.py +5 -2
  10. sglang/srt/constrained/llguidance_backend.py +9 -8
  11. sglang/srt/constrained/outlines_backend.py +5 -4
  12. sglang/srt/constrained/xgrammar_backend.py +18 -18
  13. sglang/srt/conversation.py +46 -8
  14. sglang/srt/custom_op.py +38 -3
  15. sglang/srt/debug_utils.py +74 -0
  16. sglang/srt/disaggregation/common/__init__.py +1 -0
  17. sglang/srt/disaggregation/common/conn.py +407 -0
  18. sglang/srt/disaggregation/decode.py +67 -3
  19. sglang/srt/disaggregation/fake/conn.py +1 -0
  20. sglang/srt/disaggregation/kv_events.py +60 -5
  21. sglang/srt/disaggregation/launch_lb.py +140 -0
  22. sglang/srt/disaggregation/mini_lb.py +29 -48
  23. sglang/srt/disaggregation/mooncake/conn.py +432 -140
  24. sglang/srt/disaggregation/mooncake/transfer_engine.py +32 -16
  25. sglang/srt/disaggregation/nixl/conn.py +124 -432
  26. sglang/srt/disaggregation/prefill.py +2 -0
  27. sglang/srt/disaggregation/utils.py +38 -1
  28. sglang/srt/distributed/device_communicators/pymscclpp.py +315 -0
  29. sglang/srt/distributed/parallel_state.py +52 -5
  30. sglang/srt/entrypoints/EngineBase.py +6 -0
  31. sglang/srt/entrypoints/engine.py +102 -5
  32. sglang/srt/entrypoints/http_server.py +15 -2
  33. sglang/srt/function_call/base_format_detector.py +138 -86
  34. sglang/srt/function_call/deepseekv3_detector.py +54 -6
  35. sglang/srt/function_call/ebnf_composer.py +33 -19
  36. sglang/srt/function_call/function_call_parser.py +27 -0
  37. sglang/srt/function_call/llama32_detector.py +33 -14
  38. sglang/srt/function_call/mistral_detector.py +73 -26
  39. sglang/srt/function_call/pythonic_detector.py +86 -20
  40. sglang/srt/function_call/qwen25_detector.py +64 -10
  41. sglang/srt/function_call/utils.py +17 -0
  42. sglang/srt/hf_transformers_utils.py +4 -0
  43. sglang/srt/layers/attention/aiter_backend.py +488 -123
  44. sglang/srt/layers/attention/base_attn_backend.py +4 -0
  45. sglang/srt/layers/attention/cutlass_mla_backend.py +2 -19
  46. sglang/srt/layers/attention/flashattention_backend.py +103 -18
  47. sglang/srt/layers/attention/flashinfer_backend.py +45 -1
  48. sglang/srt/layers/attention/flashinfer_mla_backend.py +37 -1
  49. sglang/srt/layers/attention/intel_amx_backend.py +128 -0
  50. sglang/srt/layers/attention/tbo_backend.py +232 -0
  51. sglang/srt/layers/attention/torch_native_backend.py +3 -0
  52. sglang/srt/layers/attention/triton_backend.py +244 -5
  53. sglang/srt/layers/attention/triton_ops/extend_attention.py +12 -4
  54. sglang/srt/layers/communicator.py +260 -194
  55. sglang/srt/layers/dp_attention.py +6 -5
  56. sglang/srt/layers/layernorm.py +30 -19
  57. sglang/srt/layers/moe/cutlass_moe.py +170 -7
  58. sglang/srt/layers/moe/cutlass_moe_params.py +169 -0
  59. sglang/srt/layers/moe/ep_moe/kernels.py +27 -6
  60. sglang/srt/layers/moe/ep_moe/layer.py +94 -40
  61. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +13 -8
  62. sglang/srt/layers/moe/fused_moe_native.py +4 -0
  63. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  64. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  65. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  66. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  67. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  68. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  69. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  70. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  71. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +220 -25
  72. sglang/srt/layers/moe/fused_moe_triton/layer.py +34 -4
  73. sglang/srt/layers/moe/topk.py +44 -18
  74. sglang/srt/layers/multimodal.py +3 -3
  75. sglang/srt/layers/quantization/__init__.py +3 -2
  76. sglang/srt/layers/quantization/blockwise_int8.py +3 -0
  77. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +5 -0
  78. sglang/srt/layers/quantization/deep_gemm.py +55 -56
  79. sglang/srt/layers/quantization/fp8.py +28 -23
  80. sglang/srt/layers/quantization/fp8_kernel.py +118 -66
  81. sglang/srt/layers/quantization/fp8_utils.py +165 -49
  82. sglang/srt/layers/quantization/modelopt_quant.py +334 -7
  83. sglang/srt/layers/quantization/moe_wna16.py +3 -0
  84. sglang/srt/layers/quantization/w8a8_fp8.py +3 -0
  85. sglang/srt/layers/quantization/w8a8_int8.py +3 -0
  86. sglang/srt/layers/rotary_embedding.py +6 -12
  87. sglang/srt/layers/sampler.py +80 -79
  88. sglang/srt/layers/utils.py +6 -0
  89. sglang/srt/lora/layers.py +12 -15
  90. sglang/srt/lora/lora.py +49 -5
  91. sglang/srt/lora/lora_manager.py +19 -5
  92. sglang/srt/lora/mem_pool.py +24 -16
  93. sglang/srt/lora/utils.py +17 -13
  94. sglang/srt/managers/data_parallel_controller.py +13 -5
  95. sglang/srt/managers/eplb_algorithms/__init__.py +63 -0
  96. sglang/srt/managers/eplb_algorithms/deepseek.py +223 -0
  97. sglang/srt/managers/{deepseek_eplb.py → eplb_algorithms/deepseek_vec.py} +5 -7
  98. sglang/srt/managers/eplb_manager.py +55 -14
  99. sglang/srt/managers/expert_distribution.py +220 -46
  100. sglang/srt/managers/expert_location.py +110 -56
  101. sglang/srt/managers/expert_location_dispatch.py +23 -6
  102. sglang/srt/managers/io_struct.py +15 -4
  103. sglang/srt/managers/mm_utils.py +88 -38
  104. sglang/srt/managers/multimodal_processors/base_processor.py +188 -16
  105. sglang/srt/managers/multimodal_processors/gemma3.py +4 -31
  106. sglang/srt/managers/multimodal_processors/internvl.py +4 -0
  107. sglang/srt/managers/multimodal_processors/kimi_vl.py +15 -34
  108. sglang/srt/managers/multimodal_processors/minicpm.py +2 -1
  109. sglang/srt/managers/multimodal_processors/phi4mm.py +87 -0
  110. sglang/srt/managers/multimodal_processors/qwen_vl.py +22 -64
  111. sglang/srt/managers/schedule_batch.py +140 -38
  112. sglang/srt/managers/scheduler.py +305 -112
  113. sglang/srt/managers/tokenizer_manager.py +134 -17
  114. sglang/srt/managers/utils.py +0 -4
  115. sglang/srt/metrics/collector.py +9 -0
  116. sglang/srt/model_executor/cuda_graph_runner.py +72 -61
  117. sglang/srt/model_executor/expert_location_updater.py +157 -22
  118. sglang/srt/model_executor/forward_batch_info.py +38 -17
  119. sglang/srt/model_executor/model_runner.py +96 -56
  120. sglang/srt/model_loader/utils.py +67 -1
  121. sglang/srt/models/deepseek_nextn.py +1 -1
  122. sglang/srt/models/deepseek_v2.py +609 -234
  123. sglang/srt/models/gemma3_causal.py +7 -0
  124. sglang/srt/models/gemma3_mm.py +19 -14
  125. sglang/srt/models/idefics2.py +342 -0
  126. sglang/srt/models/kimi_vl.py +4 -4
  127. sglang/srt/models/llama.py +1 -1
  128. sglang/srt/models/minicpmo.py +2 -5
  129. sglang/srt/models/minicpmv.py +3 -295
  130. sglang/srt/models/phi4mm.py +512 -0
  131. sglang/srt/models/qwen2.py +38 -9
  132. sglang/srt/models/qwen2_5_vl.py +3 -9
  133. sglang/srt/models/qwen2_eagle.py +4 -1
  134. sglang/srt/models/qwen2_moe.py +58 -191
  135. sglang/srt/models/qwen2_vl.py +3 -9
  136. sglang/srt/models/qwen3.py +41 -10
  137. sglang/srt/models/qwen3_moe.py +230 -191
  138. sglang/srt/models/registry.py +9 -1
  139. sglang/srt/models/transformers.py +291 -0
  140. sglang/srt/openai_api/adapter.py +86 -24
  141. sglang/srt/openai_api/protocol.py +31 -2
  142. sglang/srt/openai_api/utils.py +172 -0
  143. sglang/srt/operations.py +37 -2
  144. sglang/srt/operations_strategy.py +200 -24
  145. sglang/srt/sampling/sampling_batch_info.py +13 -1
  146. sglang/srt/sampling/sampling_params.py +2 -1
  147. sglang/srt/server_args.py +114 -27
  148. sglang/srt/speculative/build_eagle_tree.py +8 -8
  149. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +8 -11
  150. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +253 -0
  151. sglang/srt/speculative/eagle_utils.py +51 -91
  152. sglang/srt/speculative/eagle_worker.py +101 -21
  153. sglang/srt/two_batch_overlap.py +635 -0
  154. sglang/srt/utils.py +129 -7
  155. sglang/test/runners.py +16 -7
  156. sglang/test/send_one.py +4 -0
  157. sglang/test/test_cutlass_moe.py +3 -3
  158. sglang/test/test_fp4_moe.py +248 -0
  159. sglang/test/test_utils.py +79 -6
  160. sglang/version.py +1 -1
  161. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/METADATA +14 -11
  162. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/RECORD +318 -291
  163. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/WHEEL +1 -1
  164. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  165. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  166. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  167. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  168. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  169. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  170. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  171. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  172. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  173. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  174. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  175. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  176. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  177. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H200.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json} +0 -0
  178. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  179. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  180. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  181. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  182. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  183. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  184. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  185. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  186. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  187. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  188. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  189. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  190. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  191. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  192. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  193. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  194. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  195. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  196. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  197. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  198. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  199. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  200. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  201. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  202. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  203. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  204. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  205. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  206. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json} +0 -0
  207. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  208. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  209. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  210. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  211. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  212. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  213. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  214. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  215. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  216. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json} +0 -0
  217. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json} +0 -0
  218. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  219. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  220. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  221. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  222. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  223. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  224. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json} +0 -0
  225. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  226. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  227. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json} +0 -0
  228. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  229. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  230. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  231. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json} +0 -0
  232. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  233. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  234. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  235. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
  236. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  237. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  238. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  239. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json} +0 -0
  240. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json} +0 -0
  241. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json} +0 -0
  242. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json} +0 -0
  243. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  244. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  245. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json} +0 -0
  246. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json} +0 -0
  247. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json} +0 -0
  248. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json} +0 -0
  249. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  250. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  251. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  252. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  253. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json} +0 -0
  254. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  255. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  256. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  257. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  258. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json} +0 -0
  259. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json} +0 -0
  260. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json} +0 -0
  261. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json} +0 -0
  262. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  263. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  264. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
  265. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  266. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  267. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  268. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json} +0 -0
  269. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_L40S.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json} +0 -0
  270. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
  271. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
  272. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
  273. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  274. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  275. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  276. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  277. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json} +0 -0
  278. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json} +0 -0
  279. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json} +0 -0
  280. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json} +0 -0
  281. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  282. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  283. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  284. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  285. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json} +0 -0
  286. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
  287. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
  288. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
  289. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  290. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  291. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  292. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  293. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json} +0 -0
  294. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json} +0 -0
  295. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  296. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  297. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json} +0 -0
  298. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  299. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json} +0 -0
  300. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  301. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  302. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  303. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  304. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json} +0 -0
  305. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  306. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json} +0 -0
  307. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=96,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json} +0 -0
  308. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  309. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  310. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  311. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  312. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  313. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  314. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  315. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  316. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  317. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/licenses/LICENSE +0 -0
  318. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/top_level.txt +0 -0
@@ -24,6 +24,7 @@ from collections import defaultdict, deque
24
24
  from concurrent import futures
25
25
  from dataclasses import dataclass
26
26
  from http import HTTPStatus
27
+ from pathlib import Path
27
28
  from types import SimpleNamespace
28
29
  from typing import Dict, List, Optional, Tuple, Union
29
30
 
@@ -35,7 +36,10 @@ from torch.distributed import barrier
35
36
 
36
37
  from sglang.global_config import global_config
37
38
  from sglang.srt.configs.model_config import ModelConfig
38
- from sglang.srt.constrained.base_grammar_backend import create_grammar_backend
39
+ from sglang.srt.constrained.base_grammar_backend import (
40
+ INVALID_GRAMMAR_OBJ,
41
+ create_grammar_backend,
42
+ )
39
43
  from sglang.srt.disaggregation.decode import (
40
44
  DecodePreallocQueue,
41
45
  DecodeTransferQueue,
@@ -62,7 +66,6 @@ from sglang.srt.hf_transformers_utils import (
62
66
  from sglang.srt.layers.dp_attention import compute_dp_attention_world_info
63
67
  from sglang.srt.layers.logits_processor import LogitsProcessorOutput
64
68
  from sglang.srt.managers.expert_distribution import (
65
- ExpertDistributionRecorder,
66
69
  get_global_expert_distribution_recorder,
67
70
  )
68
71
  from sglang.srt.managers.io_struct import (
@@ -132,11 +135,14 @@ from sglang.srt.reasoning_parser import ReasoningParser
132
135
  from sglang.srt.server_args import PortArgs, ServerArgs
133
136
  from sglang.srt.speculative.spec_info import SpeculativeAlgorithm
134
137
  from sglang.srt.torch_memory_saver_adapter import TorchMemorySaverAdapter
138
+ from sglang.srt.two_batch_overlap import TboDPAttentionPreparer
135
139
  from sglang.srt.utils import (
140
+ DeepEPMode,
136
141
  DynamicGradMode,
137
142
  broadcast_pyobj,
138
143
  configure_logger,
139
144
  disable_request_logging,
145
+ get_available_gpu_memory,
140
146
  get_bool_env_var,
141
147
  get_zmq_socket,
142
148
  kill_itself_when_parent_died,
@@ -210,7 +216,6 @@ class Scheduler(
210
216
  self.gpu_id = gpu_id
211
217
  self.enable_hierarchical_cache = server_args.enable_hierarchical_cache
212
218
  self.page_size = server_args.page_size
213
- # Distributed rank info
214
219
  self.dp_size = server_args.dp_size
215
220
  self.attn_tp_rank, self.attn_tp_size, self.attn_dp_rank = (
216
221
  compute_dp_attention_world_info(
@@ -330,12 +335,16 @@ class Scheduler(
330
335
 
331
336
  # Print debug info
332
337
  if tp_rank == 0:
338
+ avail_mem = get_available_gpu_memory(
339
+ self.device, self.gpu_id, empty_cache=False
340
+ )
333
341
  logger.info(
334
342
  f"max_total_num_tokens={self.max_total_num_tokens}, "
335
343
  f"chunked_prefill_size={server_args.chunked_prefill_size}, "
336
344
  f"max_prefill_tokens={self.max_prefill_tokens}, "
337
345
  f"max_running_requests={self.max_running_requests}, "
338
- f"context_len={self.model_config.context_len}"
346
+ f"context_len={self.model_config.context_len}, "
347
+ f"available_gpu_mem={avail_mem:.2f} GB"
339
348
  )
340
349
 
341
350
  # Init memory pool and cache
@@ -359,6 +368,7 @@ class Scheduler(
359
368
  self.current_stream = torch.get_device_module(self.device).current_stream()
360
369
  if self.device == "cpu":
361
370
  self.current_stream.synchronize = lambda: None # No-op for CPU
371
+ self.forward_sleep_time = None
362
372
 
363
373
  # Init session info
364
374
  self.sessions: Dict[str, Session] = {}
@@ -420,10 +430,16 @@ class Scheduler(
420
430
  self.torch_profiler = None
421
431
  self.torch_profiler_output_dir: Optional[str] = None
422
432
  self.profiler_activities: Optional[List[str]] = None
423
- self.profiler_id: Optional[str] = None
433
+ self.profile_id: Optional[str] = None
424
434
  self.profiler_target_forward_ct: Optional[int] = None
425
-
426
- self.forward_sleep_time = None
435
+ self.profiler_target_prefill_ct: Optional[int] = None
436
+ self.profiler_target_decode_ct: Optional[int] = None
437
+ self.profiler_prefill_ct: Optional[int] = None
438
+ self.profiler_decode_ct: Optional[int] = None
439
+ self.profile_by_stage: bool = False
440
+ self.profile_steps: Optional[int] = None
441
+ self.profile_in_progress: bool = False
442
+ self.rpd_profiler = None
427
443
 
428
444
  # Init metrics stats
429
445
  self.init_metrics()
@@ -556,7 +572,9 @@ class Scheduler(
556
572
 
557
573
  def init_kv_events(self, kv_events_config: Optional[str]):
558
574
  if self.enable_kv_cache_events:
559
- self.kv_event_publisher = EventPublisherFactory.create(kv_events_config)
575
+ self.kv_event_publisher = EventPublisherFactory.create(
576
+ kv_events_config, self.attn_dp_rank
577
+ )
560
578
 
561
579
  def init_disaggregation(self):
562
580
  self.transfer_backend = TransferBackend(
@@ -931,18 +949,19 @@ class Scheduler(
931
949
  bootstrap_host=recv_req.bootstrap_host,
932
950
  bootstrap_port=recv_req.bootstrap_port,
933
951
  bootstrap_room=recv_req.bootstrap_room,
952
+ data_parallel_rank=recv_req.data_parallel_rank,
934
953
  )
935
954
  req.tokenizer = self.tokenizer
936
955
 
937
956
  if self.disaggregation_mode != DisaggregationMode.NULL:
938
957
  # Invalid request for disaggregated mode
939
958
  if recv_req.bootstrap_room is None:
940
- error_message = (
959
+ error_msg = (
941
960
  f"Invalid request: Disaggregated request received without "
942
961
  f"boostrap room id. {req.rid=}"
943
962
  )
944
- logger.error(error_message)
945
- prepare_abort(req, error_message)
963
+ logger.error(error_msg)
964
+ prepare_abort(req, error_msg)
946
965
  self.stream_output([req], req.return_logprob)
947
966
  return
948
967
 
@@ -973,29 +992,23 @@ class Scheduler(
973
992
  req.extend_image_inputs(image_inputs)
974
993
 
975
994
  if len(req.origin_input_ids) >= self.max_req_input_len:
976
- error_msg = (
977
- "Multimodal prompt is too long after expanding multimodal tokens. "
978
- f"After expanding {len(req.origin_input_ids_unpadded)=} => {len(req.origin_input_ids)} >= {self.max_req_input_len}."
979
- )
980
- logger.error(error_msg)
981
- req.origin_input_ids = [0]
982
- req.multimodal_inputs = None
983
- req.sampling_params.max_new_tokens = 0
984
- req.finished_reason = FINISH_ABORT(
985
- error_msg, HTTPStatus.BAD_REQUEST, "BadRequestError"
995
+ req.set_finish_with_abort(
996
+ error_msg=(
997
+ "Multimodal prompt is too long after expanding multimodal tokens. "
998
+ f"After expanding {len(req.origin_input_ids_unpadded)=} => {len(req.origin_input_ids)} >= {self.max_req_input_len}."
999
+ )
986
1000
  )
987
1001
  self._add_request_to_queue(req)
988
1002
  return
989
1003
 
990
- # Validate prompts length
1004
+ # Validate prompt length
991
1005
  error_msg = validate_input_length(
992
1006
  req,
993
1007
  self.max_req_input_len,
994
1008
  self.server_args.allow_auto_truncate,
995
1009
  )
996
1010
  if error_msg:
997
- req.origin_input_ids = [0]
998
- req.sampling_params.max_new_tokens = 0
1011
+ req.set_finish_with_abort(error_msg)
999
1012
  self._add_request_to_queue(req)
1000
1013
  return
1001
1014
 
@@ -1007,12 +1020,9 @@ class Scheduler(
1007
1020
  req.logprob_start_len = recv_req.logprob_start_len
1008
1021
 
1009
1022
  if req.logprob_start_len >= len(req.origin_input_ids):
1010
- req.finished_reason = FINISH_ABORT(
1011
- f"logprob_start_len, ({req.logprob_start_len}) is higher than the number of input tokens ({len(req.origin_input_ids)}). Request with a lower logprob_start_len.",
1012
- HTTPStatus.BAD_REQUEST,
1013
- "BadRequestError",
1014
- )
1023
+ error_msg = f"{req.logprob_start_len=} is higher than the number of input tokens {len(req.origin_input_ids)=}. Please use a smaller logprob_start_len."
1015
1024
  req.logprob_start_len = len(req.origin_input_ids) - 1
1025
+ req.set_finish_with_abort(error_msg)
1016
1026
  self._add_request_to_queue(req)
1017
1027
  return
1018
1028
 
@@ -1049,6 +1059,10 @@ class Scheduler(
1049
1059
  if not cache_hit:
1050
1060
  req.grammar_key = key
1051
1061
  add_to_grammar_queue = True
1062
+ else:
1063
+ if value is INVALID_GRAMMAR_OBJ: # We hit a cached invalid grammar.
1064
+ error_msg = f"Invalid grammar request with cache hit: {key=}"
1065
+ req.set_finish_with_abort(error_msg)
1052
1066
 
1053
1067
  if add_to_grammar_queue:
1054
1068
  req.queue_time_start = time.perf_counter()
@@ -1096,19 +1110,13 @@ class Scheduler(
1096
1110
  req.extend_image_inputs(image_inputs)
1097
1111
 
1098
1112
  if len(req.origin_input_ids) >= self.max_req_input_len:
1099
- error_msg = (
1100
- "Multimodal prompt is too long after expanding multimodal tokens. "
1101
- f"After expanding {len(req.origin_input_ids_unpadded)=} => {len(req.origin_input_ids)} >= {self.max_req_input_len}."
1102
- )
1103
- logger.error(error_msg)
1104
- req.origin_input_ids = [0]
1105
- req.multimodal_inputs = None
1106
- req.sampling_params.max_new_tokens = 0
1107
- req.finished_reason = FINISH_ABORT(
1108
- error_msg, HTTPStatus.BAD_REQUEST, "BadRequestError"
1113
+ req.set_finish_with_abort(
1114
+ error_msg=(
1115
+ "Multimodal prompt is too long after expanding multimodal tokens. "
1116
+ f"After expanding {len(req.origin_input_ids_unpadded)=} => {len(req.origin_input_ids)} >= {self.max_req_input_len}."
1117
+ )
1109
1118
  )
1110
- req.queue_time_start = time.perf_counter()
1111
- self.waiting_queue.append(req)
1119
+ self._add_request_to_queue(req)
1112
1120
  return
1113
1121
 
1114
1122
  # Validate prompts length
@@ -1154,7 +1162,8 @@ class Scheduler(
1154
1162
  if self.disaggregation_mode == DisaggregationMode.PREFILL:
1155
1163
  f += f"#unbootstrapped-req: {len(self.disagg_prefill_bootstrap_queue.queue)}, "
1156
1164
  f += f"#queue-req: {len(self.waiting_queue)}, "
1157
- f += f"#transferring-req: {len(self.disagg_prefill_inflight_queue)} "
1165
+ f += f"#transferring-req: {len(self.disagg_prefill_inflight_queue)}, "
1166
+ f += f"time: {gap_latency:.2f} "
1158
1167
  else:
1159
1168
  f += f"#queue-req: {len(self.waiting_queue)}"
1160
1169
 
@@ -1515,7 +1524,7 @@ class Scheduler(
1515
1524
  self.new_token_ratio = new_token_ratio
1516
1525
 
1517
1526
  logger.info(
1518
- "Decode out of memory happened. "
1527
+ "KV cache pool is full. Retract requests. "
1519
1528
  f"#retracted_reqs: {len(retracted_reqs)}, "
1520
1529
  f"#new_token_ratio: {old_ratio:.4f} -> {self.new_token_ratio:.4f}"
1521
1530
  )
@@ -1539,13 +1548,8 @@ class Scheduler(
1539
1548
  """Run a batch."""
1540
1549
  self.forward_ct += 1
1541
1550
 
1542
- # Check profiler
1543
- if (
1544
- self.profiler_target_forward_ct
1545
- and self.profiler_target_forward_ct <= self.forward_ct
1546
- ):
1547
- self.send_to_tokenizer.send_pyobj(self.stop_profile())
1548
-
1551
+ # Whether to run the profiler
1552
+ self._profile_batch_predicate(batch)
1549
1553
  if self.forward_sleep_time is not None:
1550
1554
  logger.info(f"Scheduler.run_batch sleep {self.forward_sleep_time}s")
1551
1555
  time.sleep(self.forward_sleep_time)
@@ -1571,10 +1575,9 @@ class Scheduler(
1571
1575
  num_accepted_tokens,
1572
1576
  can_run_cuda_graph,
1573
1577
  ) = self.draft_worker.forward_batch_speculative_generation(batch)
1574
- self.spec_num_total_accepted_tokens += (
1575
- num_accepted_tokens + batch.batch_size()
1576
- )
1577
- self.spec_num_total_forward_ct += batch.batch_size()
1578
+ bs = batch.batch_size()
1579
+ self.spec_num_total_accepted_tokens += num_accepted_tokens + bs
1580
+ self.spec_num_total_forward_ct += bs
1578
1581
  self.num_generated_tokens += num_accepted_tokens
1579
1582
 
1580
1583
  if self.pp_group.is_last_rank:
@@ -1648,6 +1651,9 @@ class Scheduler(
1648
1651
  disable_cuda_graph=self.server_args.disable_cuda_graph,
1649
1652
  spec_algorithm=self.spec_algorithm,
1650
1653
  speculative_num_draft_tokens=self.server_args.speculative_num_draft_tokens,
1654
+ enable_two_batch_overlap=self.server_args.enable_two_batch_overlap,
1655
+ enable_deepep_moe=self.server_args.enable_deepep_moe,
1656
+ deepep_mode=DeepEPMode[self.server_args.deepep_mode],
1651
1657
  )
1652
1658
 
1653
1659
  @staticmethod
@@ -1661,6 +1667,9 @@ class Scheduler(
1661
1667
  disable_cuda_graph: bool,
1662
1668
  spec_algorithm,
1663
1669
  speculative_num_draft_tokens,
1670
+ enable_two_batch_overlap: bool,
1671
+ enable_deepep_moe: bool,
1672
+ deepep_mode: DeepEPMode,
1664
1673
  ):
1665
1674
  # Check if other DP workers have running batches
1666
1675
  if local_batch is None:
@@ -1696,17 +1705,26 @@ class Scheduler(
1696
1705
  is_extend_in_batch = (
1697
1706
  local_batch.forward_mode.is_extend() if local_batch else False
1698
1707
  )
1708
+
1709
+ tbo_preparer = TboDPAttentionPreparer()
1710
+
1699
1711
  local_info = torch.tensor(
1700
1712
  [
1701
1713
  num_tokens,
1702
1714
  can_cuda_graph,
1703
1715
  num_tokens_for_logprob,
1704
1716
  is_extend_in_batch,
1717
+ *tbo_preparer.prepare_all_gather(
1718
+ local_batch,
1719
+ deepep_mode,
1720
+ enable_deepep_moe,
1721
+ enable_two_batch_overlap,
1722
+ ),
1705
1723
  ],
1706
1724
  dtype=torch.int64,
1707
1725
  )
1708
1726
  global_info = torch.empty(
1709
- (dp_size, attn_tp_size, 4),
1727
+ (dp_size, attn_tp_size, 6),
1710
1728
  dtype=torch.int64,
1711
1729
  )
1712
1730
  torch.distributed.all_gather_into_tensor(
@@ -1719,6 +1737,10 @@ class Scheduler(
1719
1737
  global_num_tokens_for_logprob = global_info[:, 0, 2].tolist()
1720
1738
  is_extend_in_batch = global_info[:, 0, 3].tolist()
1721
1739
 
1740
+ tbo_split_seq_index, global_forward_mode = tbo_preparer.compute_output(
1741
+ global_info[:, :, 4:6]
1742
+ )
1743
+
1722
1744
  if local_batch is None and max(global_num_tokens) > 0:
1723
1745
  local_batch = get_idle_batch()
1724
1746
 
@@ -1732,6 +1754,8 @@ class Scheduler(
1732
1754
  local_batch.global_num_tokens_for_logprob = (
1733
1755
  global_num_tokens_for_logprob
1734
1756
  )
1757
+ local_batch.tbo_split_seq_index = tbo_split_seq_index
1758
+ local_batch.global_forward_mode = global_forward_mode
1735
1759
 
1736
1760
  # Check forward mode for cuda graph
1737
1761
  if not disable_cuda_graph:
@@ -1757,17 +1781,25 @@ class Scheduler(
1757
1781
  """Move requests whose grammar objects are ready from grammar_queue to waiting_queue."""
1758
1782
 
1759
1783
  num_ready_reqs = 0
1760
- num_abort_reqs = 0
1784
+ num_timeout_reqs = 0
1761
1785
  for req in self.grammar_queue:
1762
1786
  try:
1787
+ if req.finished(): # It is aborted by AbortReq
1788
+ num_ready_reqs += 1
1789
+ continue
1763
1790
  req.grammar = req.grammar.result(timeout=0.03)
1764
- if req.grammar:
1765
- self.grammar_backend.set_cache(req.grammar_key, req.grammar.copy())
1791
+ self.grammar_backend.set_cache(req.grammar_key, req.grammar.copy())
1792
+ if req.grammar is INVALID_GRAMMAR_OBJ:
1793
+ req.set_finish_with_abort(
1794
+ f"Invalid grammar request: {req.grammar_key=}"
1795
+ )
1766
1796
  num_ready_reqs += 1
1767
1797
  except futures._base.TimeoutError:
1768
1798
  req.grammar_wait_ct += 1
1799
+ # NOTE(lianmin): this timeout is the waiting time of the above line. It is
1800
+ # not the waiting time from it enters the grammar queue.
1769
1801
  if req.grammar_wait_ct > GRAMMAR_TIMEOUT / 0.03:
1770
- num_abort_reqs = 1
1802
+ num_timeout_reqs = 1
1771
1803
  break
1772
1804
 
1773
1805
  if self.server_args.enable_dp_attention:
@@ -1779,28 +1811,33 @@ class Scheduler(
1779
1811
 
1780
1812
  if tp_size > 1:
1781
1813
  # Sync across TP ranks to make sure they have the same number of ready requests
1782
- tensor = torch.tensor([num_ready_reqs, num_abort_reqs], dtype=torch.int32)
1814
+ tensor = torch.tensor([num_ready_reqs, num_timeout_reqs], dtype=torch.int32)
1783
1815
  torch.distributed.all_reduce(
1784
1816
  tensor, op=torch.distributed.ReduceOp.MAX, group=tp_group
1785
1817
  )
1786
- num_ready_reqs_max, num_abort_reqs_max = tensor.tolist()
1818
+ num_ready_reqs_max, num_timeout_reqs_max = tensor.tolist()
1787
1819
 
1788
1820
  for i in range(num_ready_reqs, num_ready_reqs_max):
1789
1821
  req = self.grammar_queue[i]
1822
+ if req.finished(): # It is aborted by AbortReq
1823
+ continue
1790
1824
  req.grammar = req.grammar.result()
1791
- if req.grammar:
1792
- self.grammar_backend.set_cache(req.grammar_key, req.grammar.copy())
1825
+ self.grammar_backend.set_cache(req.grammar_key, req.grammar.copy())
1826
+ if req.grammar is INVALID_GRAMMAR_OBJ:
1827
+ req.set_finish_with_abort(
1828
+ f"Invalid grammar request: {req.grammar_key=}"
1829
+ )
1830
+ else:
1831
+ num_ready_reqs_max = num_ready_reqs
1832
+ num_timeout_reqs_max = num_timeout_reqs
1793
1833
 
1794
- for i in range(num_ready_reqs, num_ready_reqs + num_abort_reqs_max):
1795
- req = self.grammar_queue[i]
1796
- req.grammar.cancel()
1797
- req.grammar = None
1798
- error_msg = f"Grammar preprocessing timed out for {req.grammar_key=}"
1799
- logger.error(error_msg)
1800
- req.finished_reason = FINISH_ABORT(
1801
- error_msg, HTTPStatus.BAD_REQUEST, "BadRequestError"
1802
- )
1803
- num_ready_reqs = num_ready_reqs_max + num_abort_reqs_max
1834
+ for i in range(num_ready_reqs, num_ready_reqs + num_timeout_reqs_max):
1835
+ req = self.grammar_queue[i]
1836
+ req.grammar.cancel()
1837
+ error_msg = f"Grammar preprocessing timed out for {req.grammar_key=}"
1838
+ req.set_finish_with_abort(error_msg)
1839
+ self.grammar_backend.set_cache(req.grammar_key, INVALID_GRAMMAR_OBJ)
1840
+ num_ready_reqs = num_ready_reqs_max + num_timeout_reqs_max
1804
1841
 
1805
1842
  self._extend_requests_to_queue(self.grammar_queue[:num_ready_reqs])
1806
1843
  self.grammar_queue = self.grammar_queue[num_ready_reqs:]
@@ -1887,6 +1924,27 @@ class Scheduler(
1887
1924
  if_success = False
1888
1925
  return if_success
1889
1926
 
1927
+ def get_load(self):
1928
+ # TODO(lsyin): use dynamically maintained num_waiting_tokens
1929
+ load = (
1930
+ self.max_total_num_tokens
1931
+ - self.token_to_kv_pool_allocator.available_size()
1932
+ - self.tree_cache.evictable_size()
1933
+ )
1934
+ load += sum(len(req.origin_input_ids) for req in self.waiting_queue)
1935
+ if self.disaggregation_mode == DisaggregationMode.PREFILL:
1936
+ load += sum(
1937
+ len(req.origin_input_ids)
1938
+ for req in self.disagg_prefill_bootstrap_queue.queue
1939
+ )
1940
+ elif self.disaggregation_mode == DisaggregationMode.DECODE:
1941
+ load += sum(
1942
+ len(req.req.origin_input_ids)
1943
+ for req in self.disagg_decode_prealloc_queue.queue
1944
+ )
1945
+
1946
+ return load
1947
+
1890
1948
  def get_internal_state(self, recv_req: GetInternalStateReq):
1891
1949
  ret = dict(global_server_args_dict)
1892
1950
  ret["last_gen_throughput"] = self.last_gen_throughput
@@ -1896,9 +1954,10 @@ class Scheduler(
1896
1954
  )
1897
1955
  if RECORD_STEP_TIME:
1898
1956
  ret["step_time_dict"] = self.step_time_dict
1899
- return GetInternalStateReqOutput(
1900
- internal_state=ret,
1901
- )
1957
+
1958
+ ret["load"] = self.get_load()
1959
+
1960
+ return GetInternalStateReqOutput(internal_state=ret)
1902
1961
 
1903
1962
  def set_internal_state(self, recv_req: SetInternalStateReq):
1904
1963
  server_args_dict = recv_req.server_args
@@ -1932,7 +1991,7 @@ class Scheduler(
1932
1991
  self.cum_spec_accept_length = self.cum_spec_accept_count = 0
1933
1992
  for k, v in server_args_dict.items():
1934
1993
  global_server_args_dict[k] = v
1935
- logger.info(f"Global server args updated! " f"{global_server_args_dict=}")
1994
+ logger.info(f"Global server args updated! {global_server_args_dict=}")
1936
1995
  return SetInternalStateReqOutput(
1937
1996
  updated=True,
1938
1997
  server_args=global_server_args_dict,
@@ -1974,8 +2033,6 @@ class Scheduler(
1974
2033
  )
1975
2034
 
1976
2035
  def abort_request(self, recv_req: AbortReq):
1977
- # TODO(lmzheng): abort the requests in the grammar queue.
1978
-
1979
2036
  # Delete requests in the waiting queue
1980
2037
  to_del = []
1981
2038
  for i, req in enumerate(self.waiting_queue):
@@ -1984,10 +2041,23 @@ class Scheduler(
1984
2041
 
1985
2042
  # Sort in reverse order to avoid index issues when deleting
1986
2043
  for i in reversed(to_del):
2044
+ # Abort method 1: directly pop from the queue
2045
+ # This only works for requests that have not started anything.
2046
+ # We still need to send something back to TokenizerManager to clean up the state.
1987
2047
  req = self.waiting_queue.pop(i)
1988
2048
  self.send_to_tokenizer.send_pyobj(AbortReq(req.rid))
1989
2049
  logger.debug(f"Abort queued request. {req.rid=}")
1990
2050
 
2051
+ # Delete the requests in the grammar queue
2052
+ for req in self.grammar_queue:
2053
+ # Abort method 2: call `set_finish_with_abort`
2054
+ # The request will still run one prefill forward pass.
2055
+ # In this case, we change the input_ids to be only one token to make this prefill cheap.
2056
+ if req.rid.startswith(recv_req.rid):
2057
+ logger.debug(f"Abort grammar queue request. {req.rid=}")
2058
+ req.grammar.cancel()
2059
+ req.set_finish_with_abort("Aborted by AbortReq.")
2060
+
1991
2061
  # Delete requests in the running batch
1992
2062
  if self.cur_batch is self.running_batch or self.cur_batch is None:
1993
2063
  reqs = self.running_batch.reqs
@@ -1996,6 +2066,9 @@ class Scheduler(
1996
2066
 
1997
2067
  for req in reqs:
1998
2068
  if req.rid.startswith(recv_req.rid) and not req.finished():
2069
+ # Abort method 3: set `to_abort=True`
2070
+ # The request will still run one decode forward pass.
2071
+ # Then we reuse all existing code to clean up the KV cache allocation.
1999
2072
  logger.debug(f"Abort running request. {req.rid=}")
2000
2073
  req.to_abort = True
2001
2074
 
@@ -2075,46 +2148,86 @@ class Scheduler(
2075
2148
 
2076
2149
  def profile(self, recv_req: ProfileReq):
2077
2150
  if recv_req.type == ProfileReqType.START_PROFILE:
2078
- return self.start_profile(
2079
- recv_req.output_dir,
2080
- recv_req.num_steps,
2081
- recv_req.activities,
2082
- recv_req.with_stack,
2083
- recv_req.record_shapes,
2084
- recv_req.profile_id,
2085
- )
2151
+ if recv_req.profile_by_stage:
2152
+ return self.init_profile(
2153
+ recv_req.output_dir,
2154
+ recv_req.num_steps,
2155
+ recv_req.activities,
2156
+ recv_req.with_stack,
2157
+ recv_req.record_shapes,
2158
+ recv_req.profile_by_stage,
2159
+ recv_req.profile_id,
2160
+ )
2161
+ else:
2162
+ self.init_profile(
2163
+ recv_req.output_dir,
2164
+ recv_req.num_steps,
2165
+ recv_req.activities,
2166
+ recv_req.with_stack,
2167
+ recv_req.record_shapes,
2168
+ recv_req.profile_by_stage,
2169
+ recv_req.profile_id,
2170
+ )
2171
+ return self.start_profile(True)
2086
2172
  else:
2087
2173
  return self.stop_profile()
2088
2174
 
2089
- def start_profile(
2175
+ def init_profile(
2090
2176
  self,
2091
2177
  output_dir: Optional[str],
2092
2178
  num_steps: Optional[int],
2093
2179
  activities: Optional[List[str]],
2094
2180
  with_stack: Optional[bool],
2095
2181
  record_shapes: Optional[bool],
2096
- profile_id: Optional[str],
2097
- ) -> None:
2098
- if self.profiler_activities:
2182
+ profile_by_stage: bool,
2183
+ profile_id: str,
2184
+ ) -> ProfileReqOutput:
2185
+ if self.profile_in_progress:
2099
2186
  return ProfileReqOutput(
2100
2187
  success=False,
2101
2188
  message="Profiling is already in progress. Call /stop_profile first.",
2102
2189
  )
2103
2190
 
2191
+ self.profile_by_stage = profile_by_stage
2192
+
2104
2193
  if output_dir is None:
2105
2194
  output_dir = os.getenv("SGLANG_TORCH_PROFILER_DIR", "/tmp")
2106
2195
  if activities is None:
2107
2196
  activities = ["CPU", "GPU"]
2108
2197
 
2109
2198
  self.torch_profiler_output_dir = output_dir
2199
+ self.torch_profiler_with_stack = with_stack
2200
+ self.torch_profiler_record_shapes = record_shapes
2110
2201
  self.profiler_activities = activities
2111
- self.profiler_id = profile_id
2202
+ self.profile_id = profile_id
2203
+
2204
+ if num_steps:
2205
+ self.profile_steps = num_steps
2206
+ if self.profile_by_stage:
2207
+ self.profiler_target_prefill_ct = num_steps
2208
+ self.profiler_target_decode_ct = num_steps
2209
+ self.profiler_prefill_ct = 0
2210
+ self.profiler_decode_ct = 0
2211
+ else:
2212
+ self.profiler_target_forward_ct = self.forward_ct + num_steps
2213
+ # The caller will be notified when reaching profiler_target_forward_ct
2214
+ else:
2215
+ self.profiler_target_forward_ct = None
2216
+
2217
+ return ProfileReqOutput(success=True, message="Succeeded")
2218
+
2219
def start_profile(
    self, stage: Optional[ForwardMode] = None
) -> ProfileReqOutput | None:
    """Start every profiler backend listed in ``self.profiler_activities``.

    The configuration is read from attributes already stored on the
    instance: ``profiler_activities``, ``torch_profiler_with_stack``,
    ``torch_profiler_record_shapes``, ``torch_profiler_output_dir`` and
    ``profile_id``.

    Backends, keyed by activity name:

    * ``"RPD"``: starts an ``rpdTracerControl`` tracer. When present, the
      torch profiler is not started (the two share one ``if``/``elif``).
    * ``"CPU"`` / ``"GPU"``: starts ``torch.profiler.profile`` with the
      mapped activities.
    * ``"MEM"``: enables CUDA memory-history recording.
    * ``"CUDA_PROFILER"``: calls ``cudaProfilerStart``.

    Every backend that is started marks ``self.profile_in_progress`` so
    that ``stop_profile`` does not reject the later stop request.

    Args:
        stage: Optional forward mode. It is only used to annotate the
            log message emitted at start.

    Returns:
        A ``ProfileReqOutput`` with ``success=True``.
    """
    stage_str = f" for {str(stage)}" if stage else ""
    logger.info(
        f"Profiling starts{stage_str}. Traces will be saved to: {self.torch_profiler_output_dir} (with profile id: {self.profile_id})",
    )

    activities = self.profiler_activities
    with_stack = self.torch_profiler_with_stack
    record_shapes = self.torch_profiler_record_shapes

    activity_map = {
        "CPU": torch.profiler.ProfilerActivity.CPU,
        "GPU": torch.profiler.ProfilerActivity.CUDA,
    }
    # Only names present in activity_map are handed to torch.profiler;
    # "RPD", "MEM" and "CUDA_PROFILER" are handled separately below.
    torchprof_activities = [
        activity_map[a] for a in activities if a in activity_map
    ]

    if "RPD" in activities:
        # Imported here rather than at module level, so the package is
        # only required when the "RPD" activity is requested.
        from rpdTracerControl import rpdTracerControl

        rpdTracerControl.skipCreate()

        self.rpd_profile_path = os.path.join(
            self.torch_profiler_output_dir,
            "rpd-" + str(time.time()) + f"-TP-{self.tp_rank}" + ".trace.json.gz",
        )

        if self.tp_rank == 0:
            import sqlite3

            from rocpd.schema import RocpdSchema

            # Rank 0 recreates an empty "trace.rpd" database containing
            # only the schema before any rank starts tracing.
            if os.path.exists("trace.rpd"):
                os.unlink("trace.rpd")
            schema = RocpdSchema()
            connection = sqlite3.connect("trace.rpd")
            try:
                schema.writeSchema(connection)
                connection.commit()
            finally:
                # Close explicitly instead of relying on garbage
                # collection to release the database handle.
                connection.close()
        # All ranks wait here until rank 0 has prepared the database.
        torch.distributed.barrier(self.tp_cpu_group)

        self.rpd_profiler = rpdTracerControl()
        self.rpd_profiler.setPythonTrace(True)
        self.rpd_profiler.start()
        self.rpd_profiler.rangePush("", "rpd profile range", "")
        self.profile_in_progress = True
    elif torchprof_activities:
        self.torch_profiler = torch.profiler.profile(
            activities=torchprof_activities,
            with_stack=with_stack if with_stack is not None else True,
            record_shapes=record_shapes if record_shapes is not None else False,
        )
        self.torch_profiler.start()
        self.profile_in_progress = True

    if "MEM" in activities:
        torch.cuda.memory._record_memory_history(max_entries=100000)
        self.profile_in_progress = True

    if "CUDA_PROFILER" in activities:
        torch.cuda.cudart().cudaProfilerStart()
        # Without this flag a CUDA_PROFILER-only session could never be
        # stopped: stop_profile returns early when nothing is in progress.
        self.profile_in_progress = True

    return ProfileReqOutput(success=True, message="Succeeded")
2146
2285
 
2147
- def stop_profile(self) -> None:
2148
- if self.profiler_activities is None:
2286
+ def stop_profile(
2287
+ self, stage: Optional[ForwardMode] = None
2288
+ ) -> ProfileReqOutput | None:
2289
+ if not self.profile_in_progress:
2149
2290
  return ProfileReqOutput(
2150
2291
  success=False,
2151
2292
  message="Profiling is not in progress. Call /start_profile first.",
2152
2293
  )
2153
2294
 
2154
- logger.info("Stop profiling...")
2295
+ if not Path(self.torch_profiler_output_dir).exists():
2296
+ Path(self.torch_profiler_output_dir).mkdir(parents=True, exist_ok=True)
2297
+
2298
+ stage_suffix = f"-{stage.__str__()}" if stage else ""
2299
+ logger.info("Stop profiling" + stage_suffix + "...")
2155
2300
  if self.torch_profiler is not None:
2156
2301
  self.torch_profiler.stop()
2157
2302
  self.torch_profiler.export_chrome_trace(
2158
2303
  os.path.join(
2159
2304
  self.torch_profiler_output_dir,
2160
- self.profiler_id + f"-TP-{self.tp_rank}" + ".trace.json.gz",
2305
+ self.profile_id
2306
+ + f"-TP-{self.tp_rank}"
2307
+ + stage_suffix
2308
+ + ".trace.json.gz",
2161
2309
  )
2162
2310
  )
2311
+ torch.distributed.barrier(self.tp_cpu_group)
2312
+
2313
+ if self.rpd_profiler is not None:
2314
+ self.rpd_profiler.rangePop()
2315
+ self.rpd_profiler.stop()
2316
+ self.rpd_profiler.flush()
2317
+
2318
+ torch.distributed.barrier(self.tp_cpu_group)
2319
+ if self.tp_rank == 0:
2320
+ from sglang.srt.utils import rpd_to_chrome_trace
2163
2321
 
2164
- if "MEM" in self.profiler_activities:
2322
+ rpd_to_chrome_trace("trace.rpd", self.rpd_profile_path)
2323
+ self.rpd_profiler = None
2324
+ self.rpd_profiler_path = None
2325
+
2326
+ if self.profiler_activities is not None and "MEM" in self.profiler_activities:
2165
2327
  memory_profile_path = os.path.join(
2166
2328
  self.torch_profiler_output_dir,
2167
- self.profiler_id + f"-TP-{self.tp_rank}-memory" + ".pickle",
2329
+ str(time.time())
2330
+ + f"-TP-{self.tp_rank}-memory"
2331
+ + stage_suffix
2332
+ + ".pickle",
2168
2333
  )
2169
2334
  torch.cuda.memory._dump_snapshot(memory_profile_path)
2170
2335
  torch.cuda.memory._record_memory_history(enabled=None)
@@ -2177,10 +2342,38 @@ class Scheduler(
2177
2342
  self.torch_profiler_output_dir,
2178
2343
  )
2179
2344
  self.torch_profiler = None
2180
- self.torch_profiler_output_dir = None
2181
- self.profiler_activities = None
2182
-
2183
- return ProfileReqOutput(success=True, message="Succeeded")
2345
+ self.profile_in_progress = False
2346
+
2347
+ return ProfileReqOutput(success=True, message="Succeeded.")
2348
+
2349
def _profile_batch_predicate(self, batch):
    """Start or stop profiling based on the batch about to be processed.

    Without per-stage profiling, profiling is stopped once
    ``self.forward_ct`` has reached ``self.profiler_target_forward_ct``
    (when such a target is set).

    With per-stage profiling (``self.profile_by_stage``), prefill and
    decode batches are counted separately. The first batch of a stage
    starts a profile for that stage, and the profile is stopped once the
    stage counter exceeds its target. A profile still running when the
    first decode batch arrives is stopped first.

    Raises:
        RuntimeError: per-stage profiling is enabled and the batch is
            neither a prefill nor a decode batch.
    """
    if not self.profile_by_stage:
        # Check profiler
        target_ct = self.profiler_target_forward_ct
        if target_ct and target_ct <= self.forward_ct:
            self.stop_profile()
        return

    mode = batch.forward_mode

    if mode.is_prefill():
        if self.profiler_prefill_ct == 0:
            self.start_profile(mode)
        self.profiler_prefill_ct += 1
        if (
            self.profiler_prefill_ct > self.profiler_target_prefill_ct
            and self.profile_in_progress
        ):
            self.stop_profile(stage=ForwardMode.EXTEND)
        return

    if mode.is_decode():
        if self.profiler_decode_ct == 0:
            if self.profile_in_progress:
                # force trace flush
                self.stop_profile(ForwardMode.EXTEND)
            self.start_profile(mode)
        self.profiler_decode_ct += 1
        if (
            self.profiler_decode_ct > self.profiler_target_decode_ct
            and self.profile_in_progress
        ):
            self.stop_profile(stage=ForwardMode.DECODE)
        return

    raise RuntimeError("unsupported profile stage")
2184
2377
 
2185
2378
  def expert_distribution_handle(self, recv_req: ExpertDistributionReq):
2186
2379
  if recv_req == ExpertDistributionReq.START_RECORD: