sglang 0.4.6.post5__py3-none-any.whl → 0.4.7.post1__py3-none-any.whl

This diff covers publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in their respective public registries.
Files changed (359)
  1. sglang/__init__.py +2 -0
  2. sglang/api.py +7 -0
  3. sglang/bench_offline_throughput.py +10 -4
  4. sglang/bench_one_batch_server.py +67 -11
  5. sglang/bench_serving.py +86 -75
  6. sglang/lang/backend/runtime_endpoint.py +24 -1
  7. sglang/lang/interpreter.py +40 -1
  8. sglang/lang/ir.py +27 -0
  9. sglang/math_utils.py +8 -0
  10. sglang/profiler.py +167 -0
  11. sglang/srt/_custom_ops.py +34 -0
  12. sglang/srt/configs/internvl.py +8 -12
  13. sglang/srt/configs/model_config.py +33 -1
  14. sglang/srt/constrained/base_grammar_backend.py +5 -2
  15. sglang/srt/constrained/llguidance_backend.py +9 -8
  16. sglang/srt/constrained/outlines_backend.py +5 -4
  17. sglang/srt/constrained/xgrammar_backend.py +18 -18
  18. sglang/srt/conversation.py +52 -8
  19. sglang/srt/custom_op.py +38 -3
  20. sglang/srt/debug_utils.py +74 -0
  21. sglang/srt/disaggregation/base/__init__.py +1 -1
  22. sglang/srt/disaggregation/base/conn.py +25 -11
  23. sglang/srt/disaggregation/common/__init__.py +5 -0
  24. sglang/srt/disaggregation/common/conn.py +407 -0
  25. sglang/srt/disaggregation/common/utils.py +42 -0
  26. sglang/srt/disaggregation/decode.py +261 -52
  27. sglang/srt/disaggregation/fake/__init__.py +1 -1
  28. sglang/srt/disaggregation/fake/conn.py +16 -9
  29. sglang/srt/disaggregation/kv_events.py +60 -5
  30. sglang/srt/disaggregation/launch_lb.py +140 -0
  31. sglang/srt/disaggregation/mini_lb.py +29 -48
  32. sglang/srt/disaggregation/mooncake/__init__.py +1 -1
  33. sglang/srt/disaggregation/mooncake/conn.py +446 -149
  34. sglang/srt/disaggregation/mooncake/transfer_engine.py +32 -16
  35. sglang/srt/disaggregation/nixl/__init__.py +6 -1
  36. sglang/srt/disaggregation/nixl/conn.py +134 -437
  37. sglang/srt/disaggregation/prefill.py +130 -43
  38. sglang/srt/disaggregation/utils.py +127 -86
  39. sglang/srt/distributed/device_communicators/pymscclpp.py +315 -0
  40. sglang/srt/distributed/parallel_state.py +52 -5
  41. sglang/srt/entrypoints/EngineBase.py +6 -0
  42. sglang/srt/entrypoints/engine.py +116 -5
  43. sglang/srt/entrypoints/http_server.py +28 -4
  44. sglang/srt/eplb_simulator/__init__.py +1 -0
  45. sglang/srt/eplb_simulator/reader.py +51 -0
  46. sglang/srt/function_call/base_format_detector.py +138 -86
  47. sglang/srt/function_call/deepseekv3_detector.py +54 -6
  48. sglang/srt/function_call/ebnf_composer.py +33 -19
  49. sglang/srt/function_call/function_call_parser.py +27 -0
  50. sglang/srt/function_call/llama32_detector.py +33 -14
  51. sglang/srt/function_call/mistral_detector.py +73 -26
  52. sglang/srt/function_call/pythonic_detector.py +86 -20
  53. sglang/srt/function_call/qwen25_detector.py +64 -10
  54. sglang/srt/function_call/utils.py +17 -0
  55. sglang/srt/hf_transformers_utils.py +4 -0
  56. sglang/srt/layers/activation.py +19 -0
  57. sglang/srt/layers/attention/aiter_backend.py +503 -125
  58. sglang/srt/layers/attention/base_attn_backend.py +4 -0
  59. sglang/srt/layers/attention/cutlass_mla_backend.py +40 -34
  60. sglang/srt/layers/attention/flashattention_backend.py +137 -63
  61. sglang/srt/layers/attention/flashinfer_backend.py +46 -3
  62. sglang/srt/layers/attention/flashinfer_mla_backend.py +59 -25
  63. sglang/srt/layers/attention/flashmla_backend.py +2 -10
  64. sglang/srt/layers/attention/intel_amx_backend.py +128 -0
  65. sglang/srt/layers/attention/tbo_backend.py +232 -0
  66. sglang/srt/layers/attention/torch_native_backend.py +3 -0
  67. sglang/srt/layers/attention/triton_backend.py +304 -65
  68. sglang/srt/layers/attention/triton_ops/decode_attention.py +2 -7
  69. sglang/srt/layers/attention/triton_ops/extend_attention.py +12 -4
  70. sglang/srt/layers/attention/vision.py +51 -24
  71. sglang/srt/layers/communicator.py +281 -197
  72. sglang/srt/layers/dp_attention.py +6 -5
  73. sglang/srt/layers/layernorm.py +30 -19
  74. sglang/srt/layers/linear.py +0 -4
  75. sglang/srt/layers/logits_processor.py +0 -12
  76. sglang/srt/layers/moe/cutlass_moe.py +170 -7
  77. sglang/srt/layers/moe/cutlass_moe_params.py +169 -0
  78. sglang/srt/layers/moe/ep_moe/kernels.py +33 -11
  79. sglang/srt/layers/moe/ep_moe/layer.py +136 -72
  80. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +24 -45
  81. sglang/srt/layers/moe/fused_moe_native.py +4 -0
  82. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  83. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  84. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  85. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  86. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  87. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  88. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  89. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  90. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +221 -29
  91. sglang/srt/layers/moe/fused_moe_triton/layer.py +34 -4
  92. sglang/srt/layers/moe/topk.py +60 -26
  93. sglang/srt/layers/multimodal.py +3 -3
  94. sglang/srt/layers/pooler.py +56 -0
  95. sglang/srt/layers/quantization/__init__.py +3 -2
  96. sglang/srt/layers/quantization/blockwise_int8.py +3 -0
  97. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +5 -0
  98. sglang/srt/layers/quantization/deep_gemm_wrapper/__init__.py +1 -0
  99. sglang/srt/layers/quantization/{deep_gemm.py → deep_gemm_wrapper/compile_utils.py} +69 -127
  100. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +32 -0
  101. sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +110 -0
  102. sglang/srt/layers/quantization/fp8.py +28 -23
  103. sglang/srt/layers/quantization/fp8_kernel.py +156 -75
  104. sglang/srt/layers/quantization/fp8_utils.py +250 -69
  105. sglang/srt/layers/quantization/modelopt_quant.py +334 -7
  106. sglang/srt/layers/quantization/moe_wna16.py +3 -0
  107. sglang/srt/layers/quantization/w8a8_fp8.py +3 -0
  108. sglang/srt/layers/quantization/w8a8_int8.py +3 -0
  109. sglang/srt/layers/radix_attention.py +2 -3
  110. sglang/srt/layers/rotary_embedding.py +6 -12
  111. sglang/srt/layers/sampler.py +80 -79
  112. sglang/srt/layers/utils.py +6 -0
  113. sglang/srt/lora/layers.py +12 -15
  114. sglang/srt/lora/lora.py +49 -5
  115. sglang/srt/lora/lora_manager.py +98 -39
  116. sglang/srt/lora/mem_pool.py +28 -21
  117. sglang/srt/lora/utils.py +17 -13
  118. sglang/srt/managers/cache_controller.py +2 -1
  119. sglang/srt/managers/data_parallel_controller.py +13 -5
  120. sglang/srt/managers/eplb_algorithms/__init__.py +63 -0
  121. sglang/srt/managers/eplb_algorithms/deepseek.py +223 -0
  122. sglang/srt/managers/{deepseek_eplb.py → eplb_algorithms/deepseek_vec.py} +5 -7
  123. sglang/srt/managers/eplb_manager.py +55 -14
  124. sglang/srt/managers/expert_distribution.py +220 -46
  125. sglang/srt/managers/expert_location.py +110 -56
  126. sglang/srt/managers/expert_location_dispatch.py +23 -6
  127. sglang/srt/managers/io_struct.py +43 -8
  128. sglang/srt/managers/mm_utils.py +88 -38
  129. sglang/srt/managers/multimodal_processors/base_processor.py +190 -18
  130. sglang/srt/managers/multimodal_processors/gemma3.py +4 -31
  131. sglang/srt/managers/multimodal_processors/internvl.py +4 -0
  132. sglang/srt/managers/multimodal_processors/kimi_vl.py +15 -34
  133. sglang/srt/managers/multimodal_processors/minicpm.py +2 -1
  134. sglang/srt/managers/multimodal_processors/phi4mm.py +87 -0
  135. sglang/srt/managers/multimodal_processors/qwen_vl.py +22 -64
  136. sglang/srt/managers/multimodal_processors/vila.py +85 -0
  137. sglang/srt/managers/schedule_batch.py +173 -38
  138. sglang/srt/managers/scheduler.py +376 -127
  139. sglang/srt/managers/tokenizer_manager.py +163 -19
  140. sglang/srt/managers/utils.py +0 -4
  141. sglang/srt/mem_cache/chunk_cache.py +1 -0
  142. sglang/srt/mem_cache/hiradix_cache.py +4 -2
  143. sglang/srt/mem_cache/memory_pool.py +111 -407
  144. sglang/srt/mem_cache/memory_pool_host.py +380 -0
  145. sglang/srt/mem_cache/radix_cache.py +36 -12
  146. sglang/srt/metrics/collector.py +9 -0
  147. sglang/srt/model_executor/cuda_graph_runner.py +191 -113
  148. sglang/srt/model_executor/expert_location_updater.py +157 -22
  149. sglang/srt/model_executor/forward_batch_info.py +52 -22
  150. sglang/srt/model_executor/model_runner.py +102 -62
  151. sglang/srt/model_loader/loader.py +8 -1
  152. sglang/srt/model_loader/utils.py +67 -1
  153. sglang/srt/models/bert.py +113 -13
  154. sglang/srt/models/deepseek_nextn.py +1 -1
  155. sglang/srt/models/deepseek_v2.py +623 -290
  156. sglang/srt/models/gemma3_causal.py +7 -0
  157. sglang/srt/models/gemma3_mm.py +19 -14
  158. sglang/srt/models/idefics2.py +342 -0
  159. sglang/srt/models/internvl.py +46 -102
  160. sglang/srt/models/kimi_vl.py +4 -4
  161. sglang/srt/models/llama.py +1 -1
  162. sglang/srt/models/minicpmo.py +2 -5
  163. sglang/srt/models/minicpmv.py +3 -295
  164. sglang/srt/models/phi4mm.py +512 -0
  165. sglang/srt/models/qwen2.py +38 -9
  166. sglang/srt/models/qwen2_5_vl.py +3 -9
  167. sglang/srt/models/qwen2_eagle.py +4 -1
  168. sglang/srt/models/qwen2_moe.py +58 -191
  169. sglang/srt/models/qwen2_vl.py +3 -9
  170. sglang/srt/models/qwen3.py +41 -10
  171. sglang/srt/models/qwen3_moe.py +230 -191
  172. sglang/srt/models/registry.py +9 -1
  173. sglang/srt/models/roberta.py +117 -9
  174. sglang/srt/models/transformers.py +291 -0
  175. sglang/srt/models/vila.py +305 -0
  176. sglang/srt/openai_api/adapter.py +248 -28
  177. sglang/srt/openai_api/protocol.py +68 -3
  178. sglang/srt/openai_api/utils.py +172 -0
  179. sglang/srt/operations.py +37 -2
  180. sglang/srt/operations_strategy.py +200 -24
  181. sglang/srt/sampling/sampling_batch_info.py +37 -1
  182. sglang/srt/sampling/sampling_params.py +4 -1
  183. sglang/srt/server_args.py +381 -209
  184. sglang/srt/speculative/build_eagle_tree.py +9 -9
  185. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +12 -14
  186. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +256 -0
  187. sglang/srt/speculative/eagle_utils.py +440 -200
  188. sglang/srt/speculative/eagle_worker.py +234 -63
  189. sglang/srt/two_batch_overlap.py +637 -0
  190. sglang/srt/utils.py +187 -7
  191. sglang/test/attention/test_prefix_chunk_info.py +2 -0
  192. sglang/test/runners.py +54 -10
  193. sglang/test/send_one.py +4 -0
  194. sglang/test/test_block_fp8.py +1 -0
  195. sglang/test/test_block_fp8_deep_gemm_blackwell.py +252 -0
  196. sglang/test/test_block_fp8_ep.py +1 -0
  197. sglang/test/test_cutlass_moe.py +3 -3
  198. sglang/test/test_fp4_moe.py +248 -0
  199. sglang/test/test_utils.py +82 -7
  200. sglang/utils.py +9 -0
  201. sglang/version.py +1 -1
  202. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/METADATA +17 -14
  203. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/RECORD +359 -321
  204. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/WHEEL +1 -1
  205. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  206. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  207. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  208. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  209. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  210. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  211. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  212. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  213. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  214. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  215. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  216. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  217. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  218. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H200.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json} +0 -0
  219. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  220. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  221. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  222. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  223. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  224. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  225. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  226. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  227. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  228. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  229. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  230. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  231. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  232. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  233. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  234. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  235. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  236. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  237. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  238. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  239. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  240. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  241. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  242. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  243. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  244. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  245. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  246. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  247. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json} +0 -0
  248. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  249. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  250. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  251. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  252. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  253. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  254. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  255. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  256. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  257. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json} +0 -0
  258. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json} +0 -0
  259. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  260. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  261. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  262. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  263. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  264. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  265. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json} +0 -0
  266. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  267. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  268. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json} +0 -0
  269. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  270. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  271. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  272. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json} +0 -0
  273. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  274. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  275. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  276. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
  277. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  278. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  279. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  280. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json} +0 -0
  281. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json} +0 -0
  282. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json} +0 -0
  283. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json} +0 -0
  284. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  285. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  286. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json} +0 -0
  287. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json} +0 -0
  288. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json} +0 -0
  289. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json} +0 -0
  290. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  291. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  292. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  293. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  294. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json} +0 -0
  295. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  296. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  297. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  298. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  299. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json} +0 -0
  300. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json} +0 -0
  301. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json} +0 -0
  302. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json} +0 -0
  303. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  304. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  305. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
  306. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  307. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  308. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  309. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json} +0 -0
  310. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_L40S.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json} +0 -0
  311. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
  312. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
  313. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
  314. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  315. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  316. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  317. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  318. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json} +0 -0
  319. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json} +0 -0
  320. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json} +0 -0
  321. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json} +0 -0
  322. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  323. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  324. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  325. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  326. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json} +0 -0
  327. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
  328. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
  329. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
  330. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  331. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  332. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  333. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  334. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json} +0 -0
  335. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json} +0 -0
  336. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  337. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  338. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json} +0 -0
  339. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  340. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json} +0 -0
  341. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  342. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  343. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  344. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  345. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json} +0 -0
  346. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  347. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json} +0 -0
  348. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=96,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json} +0 -0
  349. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  350. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  351. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  352. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  353. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  354. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  355. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  356. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  357. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  358. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/licenses/LICENSE +0 -0
  359. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/top_level.txt +0 -0
sglang/srt/disaggregation/prefill.py

@@ -25,12 +25,13 @@ from collections import deque
 from http import HTTPStatus
 from typing import TYPE_CHECKING, List, Optional
 
+import numpy as np
 import torch
 
-from sglang.srt.disaggregation.base import BaseKVManager, KVArgs, KVPoll
+from sglang.srt.disaggregation.base import BaseKVManager, KVPoll
 from sglang.srt.disaggregation.utils import (
+    FAKE_BOOTSTRAP_HOST,
     DisaggregationMode,
-    FakeBootstrapHost,
     KVClassType,
     MetadataBuffers,
     ReqToMetadataIdxAllocator,
@@ -51,7 +52,6 @@ if TYPE_CHECKING:
     from sglang.srt.managers.scheduler import GenerationBatchResult, Scheduler
     from sglang.srt.mem_cache.memory_pool import KVCache
 
-
 logger = logging.getLogger(__name__)
 
 
@@ -68,35 +68,45 @@ class PrefillBootstrapQueue:
         metadata_buffers: MetadataBuffers,
         tp_rank: int,
         tp_size: int,
+        gpu_id: int,
         bootstrap_port: int,
         gloo_group: ProcessGroup,
-        transfer_backend: TransferBackend,
+        max_total_num_tokens: int,
+        decode_tp_size: int,
+        decode_dp_size: int,
         scheduler: Scheduler,
+        pp_rank: int,
+        pp_size: int,
+        transfer_backend: TransferBackend,
     ):
         self.token_to_kv_pool = token_to_kv_pool
         self.draft_token_to_kv_pool = draft_token_to_kv_pool
-
         self.is_mla_backend = is_mla_backend(token_to_kv_pool)
-
         self.metadata_buffers = metadata_buffers
         self.req_to_metadata_buffer_idx_allocator = req_to_metadata_buffer_idx_allocator
         self.tp_rank = tp_rank
         self.tp_size = tp_size
-        self.transfer_backend = transfer_backend
-        self.scheduler = scheduler
-        self.kv_manager = self._init_kv_manager()
+        self.decode_tp_size = decode_tp_size
+        self.decode_dp_size = decode_dp_size
+        self.pp_rank = pp_rank
+        self.pp_size = pp_size
+        self.gpu_id = gpu_id
+        self.bootstrap_port = bootstrap_port
         self.queue: List[Req] = []
+        self.pp_rank = pp_rank
+        self.pp_size = pp_size
         self.gloo_group = gloo_group
-        self.bootstrap_port = bootstrap_port
-
-    def store_prefill_results(self, idx: int, token_id: int):
-        assert token_id >= 0, f"token_id: {token_id} is negative"
-        output_id_buffer = self.metadata_buffers[0]
-        output_id_buffer[idx] = token_id
+        self.max_total_num_tokens = max_total_num_tokens
+        self.scheduler = scheduler
+        self.transfer_backend = transfer_backend
+        self.kv_manager = self._init_kv_manager()
 
     def _init_kv_manager(self) -> BaseKVManager:
-        kv_args = KVArgs()
+        kv_args_class = get_kv_class(self.transfer_backend, KVClassType.KVARGS)
+        kv_args = kv_args_class()
         kv_args.engine_rank = self.tp_rank
+        kv_args.decode_tp_size = self.decode_tp_size // self.decode_dp_size
+        kv_args.prefill_pp_size = self.pp_size
         kv_data_ptrs, kv_data_lens, kv_item_lens = (
             self.token_to_kv_pool.get_contiguous_buf_infos()
         )
@@ -115,12 +125,12 @@ class PrefillBootstrapQueue:
         kv_args.kv_data_lens = kv_data_lens
         kv_args.kv_item_lens = kv_item_lens
 
-        # Define req -> input ids buffer
         kv_args.aux_data_ptrs, kv_args.aux_data_lens, kv_args.aux_item_lens = (
             self.metadata_buffers.get_buf_infos()
         )
         kv_args.ib_device = self.scheduler.server_args.disaggregation_ib_device
         kv_args.gpu_id = self.scheduler.gpu_id
+
        kv_manager_class = get_kv_class(self.transfer_backend, KVClassType.MANAGER)
         kv_manager = kv_manager_class(
             kv_args,
@@ -130,23 +140,39 @@ class PrefillBootstrapQueue:
         )
         return kv_manager
 
-    def add(self, req: Req) -> None:
-        if req.bootstrap_host == FakeBootstrapHost:
-            # Fake transfer for warmup reqs
+    def add(self, req: Req, num_kv_heads: int) -> None:
+        if self._check_if_req_exceed_kv_capacity(req):
+            return
+
+        if req.bootstrap_host == FAKE_BOOTSTRAP_HOST:
             kv_sender_class = get_kv_class(TransferBackend.FAKE, KVClassType.SENDER)
         else:
             kv_sender_class = get_kv_class(self.transfer_backend, KVClassType.SENDER)
+
+        dest_tp_ranks = [self.tp_rank]
+
         req.disagg_kv_sender = kv_sender_class(
             mgr=self.kv_manager,
             bootstrap_addr=f"{req.bootstrap_host}:{self.bootstrap_port}",
             bootstrap_room=req.bootstrap_room,
+            dest_tp_ranks=dest_tp_ranks,
+            pp_rank=self.pp_rank,
         )
         self._process_req(req)
         self.queue.append(req)
 
-    def extend(self, reqs: List[Req]) -> None:
+    def extend(self, reqs: List[Req], num_kv_heads: int) -> None:
         for req in reqs:
-            self.add(req)
+            self.add(req, num_kv_heads)
+
+    def _check_if_req_exceed_kv_capacity(self, req: Req) -> bool:
+        if len(req.origin_input_ids) > self.max_total_num_tokens:
+            message = f"Request {req.rid} exceeds the maximum number of tokens: {len(req.origin_input_ids)} > {self.max_total_num_tokens}"
+            logger.error(message)
+            prepare_abort(req, message)
+            self.scheduler.stream_output([req], req.return_logprob)
+            return True
+        return False
 
     def _process_req(self, req: Req) -> None:
         """
@@ -154,19 +180,40 @@ class PrefillBootstrapQueue:
         """
         req.sampling_params.max_new_tokens = 1
 
-    def pop_bootstrapped(self) -> List[Req]:
-        """pop the reqs which has finished bootstrapping"""
+    def pop_bootstrapped(
+        self,
+        return_failed_reqs: bool = False,
+        rids_to_check: Optional[List[str]] = None,
+    ) -> List[Req]:
+        """
+        pop the reqs which has finished bootstrapping
+
+        return_failed_reqs: For PP, on rank 0, also return the failed reqs to notify the next rank
+        rids_to_check: For PP, on rank > 0, check the rids from the previous rank has consensus with the current rank.
+        """
+
         bootstrapped_reqs = []
+        failed_reqs = []
         indices_to_remove = set()
 
         if len(self.queue) == 0:
-            return []
+            if return_failed_reqs is False:
+                return []
+            else:
+                return [], []
 
         polls = poll_and_all_reduce(
             [req.disagg_kv_sender for req in self.queue], self.gloo_group
         )
-
         for i, (req, poll) in enumerate(zip(self.queue, polls)):
+
+            if rids_to_check is not None:
+                # if req not in reqs_info_to_check, skip
+                if req.rid not in rids_to_check:
+                    continue
+                # Either waiting for input or failed
+                assert poll == KVPoll.WaitingForInput or poll == KVPoll.Failed
+
             if poll == KVPoll.Bootstrapping:
                 continue
             elif poll == KVPoll.Failed:
@@ -181,9 +228,10 @@ class PrefillBootstrapQueue:
                 )
                 self.scheduler.stream_output([req], req.return_logprob)
                 indices_to_remove.add(i)
+                failed_reqs.append(req)
                 continue
 
-            # KV.WaitingForInput
+            # KV.WaitingForInput - init here
             num_kv_indices = len(req.origin_input_ids)
             if self.req_to_metadata_buffer_idx_allocator.available_size() == 0:
                 break
@@ -192,9 +240,9 @@ class PrefillBootstrapQueue:
                 self.req_to_metadata_buffer_idx_allocator.alloc()
             )
             assert req.metadata_buffer_index is not None
+
             num_pages = kv_to_page_num(num_kv_indices, self.token_to_kv_pool.page_size)
             req.disagg_kv_sender.init(num_pages, req.metadata_buffer_index)
-
             bootstrapped_reqs.append(req)
             indices_to_remove.add(i)
 
@@ -202,7 +250,10 @@ class PrefillBootstrapQueue:
             entry for i, entry in enumerate(self.queue) if i not in indices_to_remove
         ]
 
-        return bootstrapped_reqs
+        if return_failed_reqs is False:
+            return bootstrapped_reqs
+        else:
+            return bootstrapped_reqs, failed_reqs
 
 
 class SchedulerDisaggregationPrefillMixin:
@@ -211,7 +262,7 @@ class SchedulerDisaggregationPrefillMixin:
     """
 
     @torch.no_grad()
-    def event_loop_normal_disagg_prefill(self: Scheduler):
+    def event_loop_normal_disagg_prefill(self: Scheduler) -> None:
         """A normal scheduler loop for prefill worker in disaggregation mode."""
 
         while True:
@@ -229,7 +280,6 @@ class SchedulerDisaggregationPrefillMixin:
                 or self.server_args.enable_sp_layernorm
             ):
                 batch, _ = self.prepare_dp_attn_batch(batch)
-
             self.cur_batch = batch
 
             if batch:
@@ -242,6 +292,7 @@ class SchedulerDisaggregationPrefillMixin:
             if batch is None and len(self.disagg_prefill_inflight_queue) == 0:
                 self.check_memory()
                 self.new_token_ratio = self.init_new_token_ratio
+                self.maybe_sleep_on_idle()
 
             self.last_batch = batch
             # HACK (byronhsu): reset the batch_is_full flag because we never enter update_running_batch which resets it
@@ -249,7 +300,7 @@ class SchedulerDisaggregationPrefillMixin:
             self.running_batch.batch_is_full = False
 
     @torch.no_grad()
-    def event_loop_overlap_disagg_prefill(self: Scheduler):
+    def event_loop_overlap_disagg_prefill(self: Scheduler) -> None:
         self.result_queue = deque()
 
         while True:
@@ -267,9 +318,7 @@ class SchedulerDisaggregationPrefillMixin:
                 or self.server_args.enable_sp_layernorm
             ):
                 batch, _ = self.prepare_dp_attn_batch(batch)
-
             self.cur_batch = batch
-
             if batch:
                 result = self.run_batch(batch)
                 self.result_queue.append((batch.copy(), result))
@@ -286,6 +335,9 @@ class SchedulerDisaggregationPrefillMixin:
 
             if self.last_batch:
                 tmp_batch, tmp_result = self.result_queue.popleft()
+                tmp_batch.next_batch_sampling_info = (
+                    self.tp_worker.cur_sampling_info if batch else None
+                )
                 self.process_batch_result_disagg_prefill(tmp_batch, tmp_result)
 
             if len(self.disagg_prefill_inflight_queue) > 0:
@@ -294,6 +346,7 @@ class SchedulerDisaggregationPrefillMixin:
             if batch is None and len(self.disagg_prefill_inflight_queue) == 0:
                 self.check_memory()
                 self.new_token_ratio = self.init_new_token_ratio
+                self.maybe_sleep_on_idle()
 
             self.last_batch = batch
             # HACK (byronhsu): reset the batch_is_full flag because we never enter update_running_batch which resets it
@@ -307,7 +360,7 @@ class SchedulerDisaggregationPrefillMixin:
         launch_done: Optional[threading.Event] = None,
     ) -> None:
         """
-        Transfer kv for prefill completed requests and add it into disagg_prefill_infight_queue
+        Transfer kv for prefill completed requests and add it into disagg_prefill_inflight_queue
         Adapted from process_batch_result_prefill
         """
         (
@@ -323,7 +376,7 @@ class SchedulerDisaggregationPrefillMixin:
         )
 
         logprob_pt = 0
-        # Transfer kv for prefill completed requests and add it into disagg_prefill_infight_queue
+        # Transfer kv for prefill completed requests and add it into disagg_prefill_inflight_queue
         if self.enable_overlap:
             # wait
             logits_output, next_token_ids, _ = self.tp_worker.resolve_last_batch_result(
@@ -395,11 +448,15 @@ class SchedulerDisaggregationPrefillMixin:
         # We need to remove the sync in the following function for overlap schedule.
         self.set_next_batch_sampling_info_done(batch)
 
-    def process_disagg_prefill_inflight_queue(self: Scheduler) -> None:
+    def process_disagg_prefill_inflight_queue(
+        self: Scheduler, rids_to_check: Optional[List[str]] = None
+    ) -> List[Req]:
         """
         Poll the requests in the middle of transfer. If done, return the request.
+        rids_to_check: For PP, on rank > 0, check the rids from the previous rank has consensus with the current rank.
         """
-        assert len(self.disagg_prefill_inflight_queue) > 0
+        if len(self.disagg_prefill_inflight_queue) == 0:
+            return []
 
         done_reqs = []
 
@@ -411,12 +468,22 @@ class SchedulerDisaggregationPrefillMixin:
         undone_reqs: List[Req] = []
         # Check .poll() for the reqs in disagg_prefill_inflight_queue. If Success, respond to the client and remove it from the queue
         for req, poll in zip(self.disagg_prefill_inflight_queue, polls):
+
+            if rids_to_check is not None:
+                if req.rid not in rids_to_check:
+                    undone_reqs.append(req)
+                    continue
+
+                assert poll == KVPoll.Success or poll == KVPoll.Failed
+
             if poll in [KVPoll.WaitingForInput, KVPoll.Transferring]:
                 undone_reqs.append(req)
             elif poll == KVPoll.Success:  # transfer done
                 self.tree_cache.cache_finished_req(req)  # unlock the tree
                 req.finished_reason = FINISH_LENGTH(length=0)
                 # FIXME: clean up req's data in transfer engine
+                if hasattr(req.disagg_kv_sender, "clear"):
+                    req.disagg_kv_sender.clear()
                 done_reqs.append(req)
             elif poll == KVPoll.Failed:
                 error_message = f"Prefill transfer failed for request rank={self.tp_rank} {req.rid=} {req.bootstrap_room=}"
@@ -430,11 +497,8 @@ class SchedulerDisaggregationPrefillMixin:
                     req, error_message, status_code=HTTPStatus.INTERNAL_SERVER_ERROR
                 )
                 done_reqs.append(req)
-
-        for req in done_reqs:
-            self.disagg_prefill_bootstrap_queue.req_to_metadata_buffer_idx_allocator.free(
-                req.metadata_buffer_index
-            )
+            else:
+                assert False, f"Unexpected polling state {poll=}"
 
         # Stream requests which have finished transfer
         self.stream_output(
@@ -442,9 +506,32 @@ class SchedulerDisaggregationPrefillMixin:
             any(req.return_logprob for req in done_reqs),
             None,
         )
+        for req in done_reqs:
+            req: Req
+            self.req_to_metadata_buffer_idx_allocator.free(req.metadata_buffer_index)
+            req.metadata_buffer_index = -1
 
         self.disagg_prefill_inflight_queue = undone_reqs
 
+        return done_reqs
+
+    def get_transferred_rids(self: Scheduler) -> List[str]:
+        """
+        Used by PP, get the transferred rids but **do not pop**
+        """
+        polls = poll_and_all_reduce(
+            [req.disagg_kv_sender for req in self.disagg_prefill_inflight_queue],
+            self.tp_worker.get_tp_group().cpu_group,
+        )
+
+        transferred_rids: List[str] = []
+
+        for req, poll in zip(self.disagg_prefill_inflight_queue, polls):
+            if poll == KVPoll.Success or poll == KVPoll.Failed:
+                transferred_rids.append(req.rid)
+
+        return transferred_rids
+
     def process_prefill_chunk(self: Scheduler) -> None:
         if self.last_batch and self.last_batch.forward_mode.is_extend():
             if self.chunked_req:
@@ -3,6 +3,7 @@ from __future__ import annotations
3
3
  import dataclasses
4
4
  import os
5
5
  import random
6
+ import threading
6
7
  import warnings
7
8
  from collections import deque
8
9
  from enum import Enum
@@ -18,10 +19,10 @@ from sglang.srt.utils import get_ip
18
19
  if TYPE_CHECKING:
19
20
  from sglang.srt.managers.schedule_batch import Req
20
21
 
21
- FakeBootstrapHost = "2.2.2.2"
22
-
23
- # env var for testing failure, convert to float explicitly
24
- FAILURE_PROB = float(os.getenv("DISAGGREGATION_TEST_FAILURE_PROB", 0))
22
+ #########################
23
+ # Constants & Enums
24
+ #########################
25
+ FAKE_BOOTSTRAP_HOST = "2.2.2.2"
25
26
 
26
27
 
27
28
  class DisaggregationMode(Enum):
@@ -30,6 +31,14 @@ class DisaggregationMode(Enum):
30
31
  DECODE = "decode"
31
32
 
32
33
 
34
+ #########################
35
+ # Synchronization
36
+ #########################
37
+
38
+ # env var for testing failure, convert to float explicitly
39
+ FAILURE_PROB = float(os.getenv("DISAGGREGATION_TEST_FAILURE_PROB", 0))
40
+
41
+
33
42
  def poll_and_all_reduce(pollers, gloo_group):
34
43
  # at a certain prob, the poll is failed to simulate failure
35
44
  if FAILURE_PROB > 0:
@@ -46,6 +55,11 @@ def poll_and_all_reduce(pollers, gloo_group):
46
55
  return tensor_to_reduce.tolist()
47
56
 
48
57
 
58
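poll_and_all_reduce is what keeps the per-rank views of a transfer consistent: each rank polls its local senders, packs the states into an int tensor, and all-reduces it over the TP CPU (gloo) group so every rank acts on the same state. A minimal sketch of that shape, assuming a MIN reduction (the most conservative state wins) and ignoring the FAILURE_PROB test hook; it is not necessarily identical to the shipped implementation.

    import torch
    import torch.distributed as dist

    def poll_and_all_reduce_sketch(pollers, gloo_group):
        # Each poller reports its local KVPoll state as an integer.
        local_states = [int(poller.poll()) for poller in pollers]
        tensor_to_reduce = torch.tensor(local_states, dtype=torch.int32)
        # MIN across ranks keeps the most conservative state, so no rank
        # advances a request ahead of its peers (assumption about the ReduceOp).
        dist.all_reduce(tensor_to_reduce, op=dist.ReduceOp.MIN, group=gloo_group)
        return tensor_to_reduce.tolist()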
+ #########################
59
+ # Metadata Buffers
60
+ #########################
61
+
62
+
49
63
  class ReqToMetadataIdxAllocator:
50
64
  """A memory pool that maps a request to its first output token location."""
51
65
 
@@ -69,6 +83,91 @@ class ReqToMetadataIdxAllocator:
69
83
  self.free_slots.append(free_index)
70
84
 
71
85
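Only the tail of ReqToMetadataIdxAllocator is visible in this hunk, but its role is simple: a fixed-size free list that hands each in-flight request one metadata-buffer slot and reclaims the slot once the transfer is streamed out. A minimal sketch of that shape (not the exact shipped class):

    from collections import deque
    from typing import Optional

    class FreeSlotAllocatorSketch:
        """Hands out integer slot indices from a fixed-size pool."""

        def __init__(self, size: int):
            self.free_slots = deque(range(size))

        def alloc(self) -> Optional[int]:
            # Return None when the pool is exhausted so the caller can back off.
            return self.free_slots.popleft() if self.free_slots else None

        def free(self, free_index: int) -> None:
            self.free_slots.append(free_index)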
 
86
+ class MetadataBuffers:
87
+ def __init__(self, size: int, max_top_logprobs_num: int = 128):
88
+ # TODO: abort top_logprobs_num > 128 in PD
89
+
90
+ # We transfer the metadata of the first output token to the decode side
91
+ # The minimal RDMA message size is 64 bytes, so we pad each row to at least 64 bytes
92
+ self.output_ids = torch.zeros((size, 16), dtype=torch.int32, device="cpu")
93
+ self.output_token_logprobs_val = torch.zeros(
94
+ (size, 16), dtype=torch.float32, device="cpu"
95
+ )
96
+ self.output_token_logprobs_idx = torch.zeros(
97
+ (size, 16), dtype=torch.int32, device="cpu"
98
+ )
99
+ self.output_top_logprobs_val = torch.zeros(
100
+ (size, max_top_logprobs_num), dtype=torch.float32, device="cpu"
101
+ )
102
+ self.output_top_logprobs_idx = torch.zeros(
103
+ (size, max_top_logprobs_num), dtype=torch.int32, device="cpu"
104
+ )
105
+
106
+ def get_buf_infos(self):
107
+ ptrs = [
108
+ self.output_ids.data_ptr(),
109
+ self.output_token_logprobs_val.data_ptr(),
110
+ self.output_token_logprobs_idx.data_ptr(),
111
+ self.output_top_logprobs_val.data_ptr(),
112
+ self.output_top_logprobs_idx.data_ptr(),
113
+ ]
114
+ data_lens = [
115
+ self.output_ids.nbytes,
116
+ self.output_token_logprobs_val.nbytes,
117
+ self.output_token_logprobs_idx.nbytes,
118
+ self.output_top_logprobs_val.nbytes,
119
+ self.output_top_logprobs_idx.nbytes,
120
+ ]
121
+ item_lens = [
122
+ self.output_ids[0].nbytes,
123
+ self.output_token_logprobs_val[0].nbytes,
124
+ self.output_token_logprobs_idx[0].nbytes,
125
+ self.output_top_logprobs_val[0].nbytes,
126
+ self.output_top_logprobs_idx[0].nbytes,
127
+ ]
128
+ return ptrs, data_lens, item_lens
129
+
130
+ def get_buf(self, idx: int):
131
+ return (
132
+ self.output_ids[idx],
133
+ self.output_token_logprobs_val[idx],
134
+ self.output_token_logprobs_idx[idx],
135
+ self.output_top_logprobs_val[idx],
136
+ self.output_top_logprobs_idx[idx],
137
+ )
138
+
139
+ def set_buf(self, req: Req):
140
+
141
+ self.output_ids[req.metadata_buffer_index][0] = req.output_ids[0]
142
+ if req.return_logprob:
143
+ if req.output_token_logprobs_val: # not none or empty list
144
+ self.output_token_logprobs_val[req.metadata_buffer_index][0] = (
145
+ req.output_token_logprobs_val[0]
146
+ )
147
+ if req.output_token_logprobs_idx: # not none or empty list
148
+ self.output_token_logprobs_idx[req.metadata_buffer_index][0] = (
149
+ req.output_token_logprobs_idx[0]
150
+ )
151
+
152
+ if req.output_top_logprobs_val: # not none or empty list
153
+ self.output_top_logprobs_val[req.metadata_buffer_index][
154
+ : len(req.output_top_logprobs_val[0])
155
+ ] = torch.tensor(
156
+ req.output_top_logprobs_val[0], dtype=torch.float32, device="cpu"
157
+ )
158
+ if req.output_top_logprobs_idx: # not none or empty list
159
+ self.output_top_logprobs_idx[req.metadata_buffer_index][
160
+ : len(req.output_top_logprobs_idx[0])
161
+ ] = torch.tensor(
162
+ req.output_top_logprobs_idx[0], dtype=torch.int32, device="cpu"
163
+ )
164
+
165
+
166
+ #########################
167
+ # Transfer Backend
168
+ #########################
169
+
170
+
72
171
  class TransferBackend(Enum):
73
172
  MOONCAKE = "mooncake"
74
173
  NIXL = "nixl"
@@ -76,6 +175,7 @@ class TransferBackend(Enum):
76
175
 
77
176
 
78
177
  class KVClassType(Enum):
178
+ KVARGS = "kvargs"
79
179
  MANAGER = "manager"
80
180
  SENDER = "sender"
81
181
  RECEIVER = "receiver"
@@ -86,6 +186,7 @@ def get_kv_class(transfer_backend: TransferBackend, class_type: KVClassType):
86
186
  from sglang.srt.disaggregation.fake import FakeKVReceiver, FakeKVSender
87
187
 
88
188
  if transfer_backend == TransferBackend.MOONCAKE:
189
+ from sglang.srt.disaggregation.base import KVArgs
89
190
  from sglang.srt.disaggregation.mooncake import (
90
191
  MooncakeKVBootstrapServer,
91
192
  MooncakeKVManager,
@@ -94,13 +195,15 @@ def get_kv_class(transfer_backend: TransferBackend, class_type: KVClassType):
94
195
  )
95
196
 
96
197
  class_mapping = {
198
+ KVClassType.KVARGS: KVArgs,
97
199
  KVClassType.MANAGER: MooncakeKVManager,
98
200
  KVClassType.SENDER: MooncakeKVSender,
99
201
  KVClassType.RECEIVER: (MooncakeKVReceiver),
100
202
  KVClassType.BOOTSTRAP_SERVER: MooncakeKVBootstrapServer,
101
203
  }
102
204
  return class_mapping.get(class_type)
103
- if transfer_backend == TransferBackend.NIXL:
205
+ elif transfer_backend == TransferBackend.NIXL:
206
+ from sglang.srt.disaggregation.base import KVArgs
104
207
  from sglang.srt.disaggregation.nixl import (
105
208
  NixlKVBootstrapServer,
106
209
  NixlKVManager,
@@ -109,16 +212,19 @@ def get_kv_class(transfer_backend: TransferBackend, class_type: KVClassType):
109
212
  )
110
213
 
111
214
  class_mapping = {
215
+ KVClassType.KVARGS: KVArgs,
112
216
  KVClassType.MANAGER: NixlKVManager,
113
217
  KVClassType.SENDER: NixlKVSender,
114
218
  KVClassType.RECEIVER: (NixlKVReceiver),
115
219
  KVClassType.BOOTSTRAP_SERVER: NixlKVBootstrapServer,
116
220
  }
117
221
  return class_mapping.get(class_type)
118
- if transfer_backend == TransferBackend.FAKE:
222
+ elif transfer_backend == TransferBackend.FAKE:
223
+ from sglang.srt.disaggregation.base import KVArgs
119
224
  from sglang.srt.disaggregation.fake import FakeKVReceiver, FakeKVSender
120
225
 
121
226
  class_mapping = {
227
+ KVClassType.KVARGS: KVArgs,
122
228
  KVClassType.SENDER: FakeKVSender,
123
229
  KVClassType.RECEIVER: (FakeKVReceiver),
124
230
  }
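With the new KVARGS entry, every backend-specific piece a scheduler needs (the args container, manager, sender/receiver, bootstrap server) is resolved through the same lookup instead of per-backend imports. A minimal usage sketch; the backend choice is assumed to come from server configuration:

    from sglang.srt.disaggregation.utils import (
        KVClassType,
        TransferBackend,
        get_kv_class,
    )

    backend = TransferBackend.MOONCAKE  # e.g. picked from server args
    KVArgsCls = get_kv_class(backend, KVClassType.KVARGS)
    SenderCls = get_kv_class(backend, KVClassType.SENDER)
    ManagerCls = get_kv_class(backend, KVClassType.MANAGER)
    # The scheduler then wires these together; constructor arguments (engine
    # config, bootstrap address, room id, ...) remain backend-specific.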
@@ -127,6 +233,11 @@ def get_kv_class(transfer_backend: TransferBackend, class_type: KVClassType):
127
233
  raise ValueError(f"Unsupported transfer backend: {transfer_backend}")
128
234
 
129
235
 
236
+ #########################
237
+ # KV Pages
238
+ #########################
239
+
240
+
130
241
  def kv_to_page_indices(kv_indices: np.ndarray, page_size: int):
131
242
  # 1. The page is guaranteed to be full except the last page.
132
243
  # 2. page index = kv_index // page_size
@@ -142,6 +253,11 @@ def kv_to_page_num(num_kv_indices: int, page_size: int):
142
253
  return (num_kv_indices + page_size - 1) // page_size
143
254
 
144
255
 
256
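The two comments above state the invariant these helpers rely on: a request's KV indices fill whole pages except possibly the last, and a page index is just kv_index // page_size. Under that invariant, sampling one kv index per page is enough to recover the page list; the snippet below is a sketch consistent with those comments and with the ceil division shown for kv_to_page_num, not a verbatim copy of the shipped functions.

    import numpy as np

    def kv_to_page_indices_sketch(kv_indices: np.ndarray, page_size: int) -> np.ndarray:
        if page_size == 1:
            return kv_indices
        # Every page is full except (possibly) the last, so one kv index per
        # page-sized chunk identifies all touched pages.
        return kv_indices[::page_size] // page_size

    def kv_to_page_num_sketch(num_kv_indices: int, page_size: int) -> int:
        return (num_kv_indices + page_size - 1) // page_size  # ceil division

    assert kv_to_page_num_sketch(10, 4) == 3
    assert np.array_equal(
        kv_to_page_indices_sketch(np.arange(8, 18), 4), np.array([2, 3, 4])
    )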
+ #########################
257
+ # PDLB Registry
258
+ #########################
259
+
260
+
145
261
  @dataclasses.dataclass
146
262
  class PDRegistryRequest:
147
263
  """A request to register a machine itself to the LB."""
@@ -180,6 +296,11 @@ def register_disaggregation_server(
180
296
  )
181
297
 
182
298
 
299
+ #########################
300
+ # Misc
301
+ #########################
302
+
303
+
183
304
  def is_mla_backend(target_kv_pool) -> bool:
184
305
  from sglang.srt.mem_cache.memory_pool import MLATokenToKVPool
185
306
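The visible context only shows that is_mla_backend imports MLATokenToKVPool locally, so the check presumably reduces to an isinstance test against that pool class; a sketch under that assumption:

    def is_mla_backend_sketch(target_kv_pool) -> bool:
        # Assumed behavior: MLA models use a dedicated token-to-KV pool class.
        from sglang.srt.mem_cache.memory_pool import MLATokenToKVPool
        return isinstance(target_kv_pool, MLATokenToKVPool)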
 
@@ -199,83 +320,3 @@ def prepare_abort(req: Req, error_message: str, status_code=None):
199
320
  req.input_top_logprobs_idx = []
200
321
  req.input_token_ids_logprobs_val = []
201
322
  req.input_token_ids_logprobs_idx = []
202
-
203
-
204
- class MetadataBuffers:
205
- def __init__(self, size: int, max_top_logprobs_num: int = 128):
206
- # TODO: abort top_logprobs_num > 128 in PD
207
-
208
- # We transfer the metadata of first output token to decode
209
- # The minimal size for RDMA is 64Bytes, so we pad it to > 64Bytes
210
- self.output_ids = torch.zeros((size, 16), dtype=torch.int32, device="cpu")
211
- self.output_token_logprobs_val = torch.zeros(
212
- (size, 16), dtype=torch.float32, device="cpu"
213
- )
214
- self.output_token_logprobs_idx = torch.zeros(
215
- (size, 16), dtype=torch.int32, device="cpu"
216
- )
217
- self.output_top_logprobs_val = torch.zeros(
218
- (size, max_top_logprobs_num), dtype=torch.float32, device="cpu"
219
- )
220
- self.output_top_logprobs_idx = torch.zeros(
221
- (size, max_top_logprobs_num), dtype=torch.int32, device="cpu"
222
- )
223
-
224
- def get_buf_infos(self):
225
- ptrs = [
226
- self.output_ids.data_ptr(),
227
- self.output_token_logprobs_val.data_ptr(),
228
- self.output_token_logprobs_idx.data_ptr(),
229
- self.output_top_logprobs_val.data_ptr(),
230
- self.output_top_logprobs_idx.data_ptr(),
231
- ]
232
- data_lens = [
233
- self.output_ids.nbytes,
234
- self.output_token_logprobs_val.nbytes,
235
- self.output_token_logprobs_idx.nbytes,
236
- self.output_top_logprobs_val.nbytes,
237
- self.output_top_logprobs_idx.nbytes,
238
- ]
239
- item_lens = [
240
- self.output_ids[0].nbytes,
241
- self.output_token_logprobs_val[0].nbytes,
242
- self.output_token_logprobs_idx[0].nbytes,
243
- self.output_top_logprobs_val[0].nbytes,
244
- self.output_top_logprobs_idx[0].nbytes,
245
- ]
246
- return ptrs, data_lens, item_lens
247
-
248
- def get_buf(self, idx: int):
249
- return (
250
- self.output_ids[idx],
251
- self.output_token_logprobs_val[idx],
252
- self.output_token_logprobs_idx[idx],
253
- self.output_top_logprobs_val[idx],
254
- self.output_top_logprobs_idx[idx],
255
- )
256
-
257
- def set_buf(self, req: Req):
258
-
259
- self.output_ids[req.metadata_buffer_index][0] = req.output_ids[0]
260
- if req.return_logprob:
261
- if req.output_token_logprobs_val: # not none or empty list
262
- self.output_token_logprobs_val[req.metadata_buffer_index][0] = (
263
- req.output_token_logprobs_val[0]
264
- )
265
- if req.output_token_logprobs_idx: # not none or empty list
266
- self.output_token_logprobs_idx[req.metadata_buffer_index][0] = (
267
- req.output_token_logprobs_idx[0]
268
- )
269
-
270
- if req.output_top_logprobs_val: # not none or empty list
271
- self.output_top_logprobs_val[req.metadata_buffer_index][
272
- : len(req.output_top_logprobs_val[0])
273
- ] = torch.tensor(
274
- req.output_top_logprobs_val[0], dtype=torch.float32, device="cpu"
275
- )
276
- if req.output_top_logprobs_idx: # not none or empty list
277
- self.output_top_logprobs_idx[req.metadata_buffer_index][
278
- : len(req.output_top_logprobs_idx[0])
279
- ] = torch.tensor(
280
- req.output_top_logprobs_idx[0], dtype=torch.int32, device="cpu"
281
- )