sglang 0.4.6.post4__py3-none-any.whl → 0.4.7__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
Files changed (358)
  1. sglang/bench_offline_throughput.py +16 -10
  2. sglang/bench_one_batch.py +5 -4
  3. sglang/bench_one_batch_server.py +86 -22
  4. sglang/bench_serving.py +197 -110
  5. sglang/compile_deep_gemm.py +4 -4
  6. sglang/lang/backend/runtime_endpoint.py +24 -1
  7. sglang/profiler.py +167 -0
  8. sglang/srt/_custom_ops.py +34 -0
  9. sglang/srt/configs/internvl.py +8 -12
  10. sglang/srt/configs/model_config.py +66 -29
  11. sglang/srt/constrained/base_grammar_backend.py +5 -2
  12. sglang/srt/constrained/llguidance_backend.py +9 -8
  13. sglang/srt/constrained/outlines_backend.py +5 -4
  14. sglang/srt/constrained/xgrammar_backend.py +18 -18
  15. sglang/srt/conversation.py +47 -9
  16. sglang/srt/custom_op.py +38 -3
  17. sglang/srt/debug_utils.py +74 -0
  18. sglang/srt/disaggregation/common/__init__.py +1 -0
  19. sglang/srt/disaggregation/common/conn.py +407 -0
  20. sglang/srt/disaggregation/decode.py +187 -134
  21. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +142 -0
  22. sglang/srt/disaggregation/fake/conn.py +4 -13
  23. sglang/srt/disaggregation/kv_events.py +412 -0
  24. sglang/srt/disaggregation/launch_lb.py +140 -0
  25. sglang/srt/disaggregation/mini_lb.py +84 -70
  26. sglang/srt/disaggregation/mooncake/conn.py +441 -140
  27. sglang/srt/disaggregation/mooncake/transfer_engine.py +31 -14
  28. sglang/srt/disaggregation/nixl/conn.py +124 -442
  29. sglang/srt/disaggregation/prefill.py +128 -44
  30. sglang/srt/disaggregation/utils.py +154 -6
  31. sglang/srt/distributed/device_communicators/pymscclpp.py +315 -0
  32. sglang/srt/distributed/parallel_state.py +52 -5
  33. sglang/srt/distributed/utils.py +3 -3
  34. sglang/srt/entrypoints/EngineBase.py +11 -0
  35. sglang/srt/entrypoints/engine.py +129 -12
  36. sglang/srt/entrypoints/http_server.py +21 -6
  37. sglang/srt/entrypoints/http_server_engine.py +5 -2
  38. sglang/srt/function_call/base_format_detector.py +302 -0
  39. sglang/srt/function_call/core_types.py +34 -0
  40. sglang/srt/function_call/deepseekv3_detector.py +205 -0
  41. sglang/srt/function_call/ebnf_composer.py +248 -0
  42. sglang/srt/function_call/function_call_parser.py +202 -0
  43. sglang/srt/function_call/llama32_detector.py +93 -0
  44. sglang/srt/function_call/mistral_detector.py +131 -0
  45. sglang/srt/function_call/pythonic_detector.py +229 -0
  46. sglang/srt/function_call/qwen25_detector.py +121 -0
  47. sglang/srt/function_call/utils.py +52 -0
  48. sglang/srt/hf_transformers_utils.py +50 -7
  49. sglang/srt/layers/attention/aiter_backend.py +878 -0
  50. sglang/srt/layers/attention/base_attn_backend.py +4 -0
  51. sglang/srt/layers/attention/cutlass_mla_backend.py +2 -19
  52. sglang/srt/layers/attention/flashattention_backend.py +166 -35
  53. sglang/srt/layers/attention/flashinfer_backend.py +45 -1
  54. sglang/srt/layers/attention/flashinfer_mla_backend.py +45 -5
  55. sglang/srt/layers/attention/flashmla_backend.py +340 -78
  56. sglang/srt/layers/attention/intel_amx_backend.py +128 -0
  57. sglang/srt/layers/attention/tbo_backend.py +232 -0
  58. sglang/srt/layers/attention/torch_native_backend.py +3 -0
  59. sglang/srt/layers/attention/triton_backend.py +247 -5
  60. sglang/srt/layers/attention/triton_ops/extend_attention.py +12 -4
  61. sglang/srt/layers/attention/utils.py +2 -2
  62. sglang/srt/layers/attention/vision.py +1 -1
  63. sglang/srt/layers/communicator.py +517 -0
  64. sglang/srt/layers/dp_attention.py +6 -15
  65. sglang/srt/layers/layernorm.py +30 -19
  66. sglang/srt/layers/moe/cutlass_moe.py +370 -0
  67. sglang/srt/layers/moe/cutlass_moe_params.py +169 -0
  68. sglang/srt/layers/moe/ep_moe/kernels.py +60 -17
  69. sglang/srt/layers/moe/ep_moe/layer.py +195 -87
  70. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +88 -8
  71. sglang/srt/layers/moe/fused_moe_native.py +4 -0
  72. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  73. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  74. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  75. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  76. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  77. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  78. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  79. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  80. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +220 -25
  81. sglang/srt/layers/moe/fused_moe_triton/layer.py +48 -4
  82. sglang/srt/layers/moe/topk.py +107 -24
  83. sglang/srt/layers/multimodal.py +70 -0
  84. sglang/srt/layers/quantization/__init__.py +10 -4
  85. sglang/srt/layers/quantization/blockwise_int8.py +3 -0
  86. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +5 -0
  87. sglang/srt/layers/quantization/deep_gemm.py +60 -59
  88. sglang/srt/layers/quantization/fp8.py +113 -18
  89. sglang/srt/layers/quantization/fp8_kernel.py +118 -66
  90. sglang/srt/layers/quantization/fp8_utils.py +165 -43
  91. sglang/srt/layers/quantization/gptq.py +298 -6
  92. sglang/srt/layers/quantization/int8_kernel.py +18 -5
  93. sglang/srt/layers/quantization/modelopt_quant.py +334 -7
  94. sglang/srt/layers/quantization/moe_wna16.py +3 -0
  95. sglang/srt/layers/quantization/qoq.py +244 -0
  96. sglang/srt/layers/quantization/w8a8_fp8.py +3 -0
  97. sglang/srt/layers/quantization/w8a8_int8.py +3 -0
  98. sglang/srt/layers/rotary_embedding.py +6 -12
  99. sglang/srt/layers/sampler.py +80 -79
  100. sglang/srt/layers/utils.py +6 -0
  101. sglang/srt/lora/layers.py +12 -15
  102. sglang/srt/lora/lora.py +49 -5
  103. sglang/srt/lora/lora_manager.py +20 -8
  104. sglang/srt/lora/mem_pool.py +24 -16
  105. sglang/srt/lora/utils.py +17 -13
  106. sglang/srt/managers/data_parallel_controller.py +13 -5
  107. sglang/srt/managers/eplb_algorithms/__init__.py +63 -0
  108. sglang/srt/managers/eplb_algorithms/deepseek.py +223 -0
  109. sglang/srt/managers/eplb_algorithms/deepseek_vec.py +276 -0
  110. sglang/srt/managers/eplb_manager.py +96 -0
  111. sglang/srt/managers/expert_distribution.py +878 -56
  112. sglang/srt/managers/expert_location.py +448 -0
  113. sglang/srt/managers/expert_location_dispatch.py +108 -0
  114. sglang/srt/managers/io_struct.py +29 -5
  115. sglang/srt/managers/mm_utils.py +355 -151
  116. sglang/srt/managers/multimodal_processors/base_processor.py +299 -42
  117. sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +6 -1
  118. sglang/srt/managers/multimodal_processors/gemma3.py +15 -17
  119. sglang/srt/managers/multimodal_processors/internvl.py +18 -5
  120. sglang/srt/managers/multimodal_processors/janus_pro.py +7 -1
  121. sglang/srt/managers/multimodal_processors/kimi_vl.py +14 -32
  122. sglang/srt/managers/multimodal_processors/llava.py +3 -3
  123. sglang/srt/managers/multimodal_processors/minicpm.py +27 -32
  124. sglang/srt/managers/multimodal_processors/mllama4.py +6 -0
  125. sglang/srt/managers/multimodal_processors/phi4mm.py +87 -0
  126. sglang/srt/managers/multimodal_processors/pixtral.py +9 -9
  127. sglang/srt/managers/multimodal_processors/qwen_vl.py +35 -35
  128. sglang/srt/managers/schedule_batch.py +185 -55
  129. sglang/srt/managers/schedule_policy.py +4 -5
  130. sglang/srt/managers/scheduler.py +389 -154
  131. sglang/srt/managers/session_controller.py +1 -1
  132. sglang/srt/managers/tokenizer_manager.py +231 -39
  133. sglang/srt/managers/utils.py +0 -4
  134. sglang/srt/mem_cache/base_prefix_cache.py +3 -0
  135. sglang/srt/mem_cache/chunk_cache.py +3 -1
  136. sglang/srt/mem_cache/hiradix_cache.py +4 -4
  137. sglang/srt/mem_cache/memory_pool.py +74 -52
  138. sglang/srt/mem_cache/multimodal_cache.py +45 -0
  139. sglang/srt/mem_cache/radix_cache.py +58 -5
  140. sglang/srt/metrics/collector.py +11 -2
  141. sglang/srt/mm_utils.py +10 -0
  142. sglang/srt/model_executor/cuda_graph_runner.py +87 -65
  143. sglang/srt/model_executor/expert_location_updater.py +557 -0
  144. sglang/srt/model_executor/forward_batch_info.py +39 -14
  145. sglang/srt/model_executor/model_runner.py +231 -101
  146. sglang/srt/model_loader/loader.py +10 -6
  147. sglang/srt/model_loader/utils.py +67 -1
  148. sglang/srt/models/clip.py +5 -1
  149. sglang/srt/models/deepseek_nextn.py +1 -1
  150. sglang/srt/models/deepseek_v2.py +732 -403
  151. sglang/srt/models/exaone.py +8 -3
  152. sglang/srt/models/gemma3_causal.py +7 -0
  153. sglang/srt/models/gemma3_mm.py +75 -33
  154. sglang/srt/models/idefics2.py +342 -0
  155. sglang/srt/models/kimi_vl.py +4 -4
  156. sglang/srt/models/llama.py +1 -1
  157. sglang/srt/models/llama4.py +10 -2
  158. sglang/srt/models/llava.py +26 -18
  159. sglang/srt/models/mimo_mtp.py +220 -0
  160. sglang/srt/models/minicpmo.py +7 -17
  161. sglang/srt/models/minicpmv.py +3 -295
  162. sglang/srt/models/mistral.py +71 -1
  163. sglang/srt/models/mllama.py +3 -3
  164. sglang/srt/models/phi4mm.py +512 -0
  165. sglang/srt/models/qwen2.py +133 -35
  166. sglang/srt/models/qwen2_5_vl.py +5 -3
  167. sglang/srt/models/qwen2_eagle.py +4 -1
  168. sglang/srt/models/qwen2_moe.py +206 -69
  169. sglang/srt/models/qwen2_vl.py +3 -3
  170. sglang/srt/models/qwen3.py +92 -19
  171. sglang/srt/models/qwen3_moe.py +457 -55
  172. sglang/srt/models/registry.py +9 -1
  173. sglang/srt/models/siglip.py +294 -0
  174. sglang/srt/models/transformers.py +291 -0
  175. sglang/srt/openai_api/adapter.py +114 -40
  176. sglang/srt/openai_api/protocol.py +37 -2
  177. sglang/srt/openai_api/utils.py +172 -0
  178. sglang/srt/operations.py +189 -0
  179. sglang/srt/operations_strategy.py +207 -0
  180. sglang/srt/sampling/sampling_batch_info.py +13 -1
  181. sglang/srt/sampling/sampling_params.py +2 -1
  182. sglang/srt/server_args.py +235 -38
  183. sglang/srt/speculative/build_eagle_tree.py +8 -8
  184. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +8 -11
  185. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +253 -0
  186. sglang/srt/speculative/eagle_utils.py +181 -90
  187. sglang/srt/speculative/eagle_worker.py +146 -21
  188. sglang/srt/two_batch_overlap.py +635 -0
  189. sglang/srt/utils.py +197 -19
  190. sglang/test/runners.py +16 -7
  191. sglang/test/send_one.py +4 -0
  192. sglang/test/test_cutlass_moe.py +278 -0
  193. sglang/test/test_fp4_moe.py +248 -0
  194. sglang/test/test_utils.py +81 -42
  195. sglang/utils.py +2 -2
  196. sglang/version.py +1 -1
  197. {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/METADATA +31 -19
  198. sglang-0.4.7.dist-info/RECORD +699 -0
  199. {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/WHEEL +1 -1
  200. sglang/srt/function_call_parser.py +0 -858
  201. sglang/srt/platforms/interface.py +0 -371
  202. sglang-0.4.6.post4.dist-info/RECORD +0 -646
  203. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  204. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  205. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  206. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  207. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  208. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  209. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  210. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  211. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  212. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  213. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  214. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  215. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  216. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H200.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json} +0 -0
  217. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  218. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  219. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  220. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  221. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  222. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  223. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  224. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  225. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  226. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  227. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  228. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  229. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  230. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  231. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  232. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  233. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  234. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  235. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  236. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  237. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  238. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  239. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  240. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  241. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  242. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  243. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  244. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  245. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json} +0 -0
  246. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  247. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  248. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  249. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  250. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  251. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  252. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  253. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  254. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  255. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json} +0 -0
  256. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json} +0 -0
  257. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  258. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  259. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  260. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  261. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  262. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  263. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json} +0 -0
  264. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  265. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  266. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json} +0 -0
  267. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  268. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  269. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  270. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json} +0 -0
  271. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  272. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  273. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  274. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
  275. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  276. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  277. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  278. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json} +0 -0
  279. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json} +0 -0
  280. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json} +0 -0
  281. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json} +0 -0
  282. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  283. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  284. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json} +0 -0
  285. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json} +0 -0
  286. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json} +0 -0
  287. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json} +0 -0
  288. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  289. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  290. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  291. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  292. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json} +0 -0
  293. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  294. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  295. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  296. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  297. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json} +0 -0
  298. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json} +0 -0
  299. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json} +0 -0
  300. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json} +0 -0
  301. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  302. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  303. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
  304. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  305. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  306. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  307. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json} +0 -0
  308. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_L40S.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json} +0 -0
  309. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
  310. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
  311. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
  312. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  313. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  314. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  315. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  316. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json} +0 -0
  317. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json} +0 -0
  318. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json} +0 -0
  319. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json} +0 -0
  320. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  321. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  322. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  323. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  324. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json} +0 -0
  325. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
  326. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
  327. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
  328. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  329. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  330. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  331. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  332. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json} +0 -0
  333. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json} +0 -0
  334. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  335. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  336. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json} +0 -0
  337. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  338. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json} +0 -0
  339. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  340. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  341. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  342. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  343. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json} +0 -0
  344. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  345. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json} +0 -0
  346. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=96,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json} +0 -0
  347. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  348. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  349. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  350. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  351. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  352. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  353. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  354. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  355. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  356. /sglang/srt/models/{xiaomi_mimo.py → mimo.py} +0 -0
  357. {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/licenses/LICENSE +0 -0
  358. {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/top_level.txt +0 -0
@@ -24,7 +24,8 @@ import logging
 import os
 from collections import deque
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, List, Optional, Tuple
+from http import HTTPStatus
+from typing import TYPE_CHECKING, List, Optional, Tuple, Union
 
 import numpy as np
 import torch
@@ -35,25 +36,87 @@ from sglang.srt.disaggregation.utils import (
     DisaggregationMode,
     FakeBootstrapHost,
     KVClassType,
+    MetadataBuffers,
     ReqToMetadataIdxAllocator,
     TransferBackend,
     get_kv_class,
     is_mla_backend,
     kv_to_page_indices,
     poll_and_all_reduce,
+    prepare_abort,
 )
+from sglang.srt.managers.schedule_batch import FINISH_ABORT, ScheduleBatch
 from sglang.srt.mem_cache.base_prefix_cache import BasePrefixCache
 from sglang.srt.mem_cache.memory_pool import ReqToTokenPool, TokenToKVPoolAllocator
 from sglang.srt.model_executor.forward_batch_info import ForwardMode
-from sglang.srt.sampling.sampling_batch_info import SamplingBatchInfo
+from sglang.srt.torch_memory_saver_adapter import TorchMemorySaverAdapter
 
 logger = logging.getLogger(__name__)
 
 if TYPE_CHECKING:
-    from sglang.srt.configs.model_config import ModelConfig
-    from sglang.srt.managers.schedule_batch import Req, ScheduleBatch
+    from sglang.srt.managers.schedule_batch import Req
     from sglang.srt.managers.scheduler import Scheduler
-    from sglang.srt.server_args import ServerArgs
+
+
+class DecodeReqToTokenPool:
+    """
+    The difference of DecodeReqToTokenPool and ReqToTokenPool is that
+    DecodeReqToTokenPool subscribes memory for pre-allocated requests.
+
+    In ReqToTokenPool, if `--max-running-requests` is 8,
+    #pre-allocated + #transfer + #running <= 8, but there are in fact more memory can carry pre-allocated requests.
+
+    In DecodeReqToTokenPool, if `--max-running-requests` is 8,
+    #running <= 8, #pre-allocated + #transfer <= pre_alloc_size, so we can use the free memory to pre-allocate requests to unblock prefill.
+    """
+
+    def __init__(
+        self,
+        size: int,
+        max_context_len: int,
+        device: str,
+        enable_memory_saver: bool,
+        pre_alloc_size: int,
+    ):
+        memory_saver_adapter = TorchMemorySaverAdapter.create(
+            enable=enable_memory_saver
+        )
+
+        self.size = size
+        self.max_context_len = max_context_len
+        self.device = device
+        self.pre_alloc_size = pre_alloc_size
+        with memory_saver_adapter.region():
+            self.req_to_token = torch.zeros(
+                (size + pre_alloc_size, max_context_len),
+                dtype=torch.int32,
+                device=device,
+            )
+
+        self.free_slots = list(range(size + pre_alloc_size))
+
+    def write(self, indices, values):
+        self.req_to_token[indices] = values
+
+    def available_size(self):
+        return len(self.free_slots)
+
+    def alloc(self, need_size: int) -> List[int]:
+        if need_size > len(self.free_slots):
+            return None
+
+        select_index = self.free_slots[:need_size]
+        self.free_slots = self.free_slots[need_size:]
+        return select_index
+
+    def free(self, free_index: Union[int, List[int]]):
+        if isinstance(free_index, (int,)):
+            self.free_slots.append(free_index)
+        else:
+            self.free_slots.extend(free_index)
+
+    def clear(self):
+        self.free_slots = list(range(self.size + self.pre_alloc_size))
 
 
 @dataclass
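The `DecodeReqToTokenPool` docstring above is the key design note in this hunk: the token table is sized to `size + pre_alloc_size`, so pre-allocated and in-transfer requests draw from a separate slot budget than the `--max-running-requests` limit. Below is a minimal standalone sketch of that free-slot accounting; it only mirrors the list-based alloc/free logic visible in the diff (the class name and the example sizes are illustrative, and the torch-backed `req_to_token` table is omitted).

```python
# Standalone sketch of the free-slot accounting introduced by DecodeReqToTokenPool.
# This is NOT the class from the diff; it only mirrors the alloc/free list logic
# so the size-vs-pre_alloc_size budget is easy to see.
from typing import List, Optional, Union


class SlotPoolSketch:
    def __init__(self, size: int, pre_alloc_size: int):
        self.size = size
        self.pre_alloc_size = pre_alloc_size
        # One slot per request; running + pre-allocated share size + pre_alloc_size slots.
        self.free_slots = list(range(size + pre_alloc_size))

    def alloc(self, need_size: int) -> Optional[List[int]]:
        if need_size > len(self.free_slots):
            return None  # caller must retry later, same as the real pool
        selected = self.free_slots[:need_size]
        self.free_slots = self.free_slots[need_size:]
        return selected

    def free(self, free_index: Union[int, List[int]]) -> None:
        if isinstance(free_index, int):
            self.free_slots.append(free_index)
        else:
            self.free_slots.extend(free_index)


pool = SlotPoolSketch(size=8, pre_alloc_size=32)
running = pool.alloc(8)          # the --max-running-requests budget
preallocated = pool.alloc(16)    # extra slots that keep prefill unblocked
assert pool.alloc(40) is None    # over budget: allocation is refused, not an error
pool.free(preallocated)
```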
@@ -73,9 +136,9 @@ class DecodePreallocQueue:
         self,
         req_to_token_pool: ReqToTokenPool,
         token_to_kv_pool_allocator: TokenToKVPoolAllocator,
+        draft_token_to_kv_pool: Optional[KVCache],
         req_to_metadata_buffer_idx_allocator: ReqToMetadataIdxAllocator,
-        metadata_buffers: List[torch.Tensor],
-        aux_dtype: torch.dtype,
+        metadata_buffers: MetadataBuffers,
         scheduler: Scheduler,
         transfer_queue: DecodeTransferQueue,
         tree_cache: BasePrefixCache,
@@ -88,8 +151,8 @@ class DecodePreallocQueue:
         self.req_to_token_pool = req_to_token_pool
         self.token_to_kv_pool_allocator = token_to_kv_pool_allocator
         self.token_to_kv_pool = token_to_kv_pool_allocator.get_kvcache()
+        self.draft_token_to_kv_pool = draft_token_to_kv_pool
         self.is_mla_backend = is_mla_backend(self.token_to_kv_pool)
-        self.aux_dtype = aux_dtype
         self.metadata_buffers = metadata_buffers
         self.req_to_metadata_buffer_idx_allocator = req_to_metadata_buffer_idx_allocator
         self.scheduler = scheduler
@@ -116,19 +179,21 @@ class DecodePreallocQueue:
             self.token_to_kv_pool.get_contiguous_buf_infos()
         )
 
+        if self.draft_token_to_kv_pool is not None:
+            draft_kv_data_ptrs, draft_kv_data_lens, draft_kv_item_lens = (
+                self.draft_token_to_kv_pool.get_contiguous_buf_infos()
+            )
+            kv_data_ptrs += draft_kv_data_ptrs
+            kv_data_lens += draft_kv_data_lens
+            kv_item_lens += draft_kv_item_lens
+
         kv_args.kv_data_ptrs = kv_data_ptrs
         kv_args.kv_data_lens = kv_data_lens
         kv_args.kv_item_lens = kv_item_lens
 
-        kv_args.aux_data_ptrs = [
-            output_id_tensor.data_ptr() for output_id_tensor in self.metadata_buffers
-        ]
-        kv_args.aux_data_lens = [
-            metadata_buffer.nbytes for metadata_buffer in self.metadata_buffers
-        ]
-        kv_args.aux_item_lens = [
-            metadata_buffer[0].nbytes for metadata_buffer in self.metadata_buffers
-        ]
+        kv_args.aux_data_ptrs, kv_args.aux_data_lens, kv_args.aux_item_lens = (
+            self.metadata_buffers.get_buf_infos()
+        )
         kv_args.ib_device = self.scheduler.server_args.disaggregation_ib_device
         kv_args.gpu_id = self.scheduler.gpu_id
         kv_manager_class = get_kv_class(self.transfer_backend, KVClassType.MANAGER)
@@ -153,6 +218,7 @@ class DecodePreallocQueue:
             mgr=self.kv_manager,
             bootstrap_addr=f"{req.bootstrap_host}:{req.bootstrap_port}",
             bootstrap_room=req.bootstrap_room,
+            data_parallel_rank=req.data_parallel_rank,
         )
         self.queue.append(DecodeRequest(req=req, kv_receiver=kv_receiver))
 
@@ -178,7 +244,17 @@
             elif poll == KVPoll.WaitingForInput:
                 decode_req.waiting_for_input = True
             elif poll == KVPoll.Failed:
-                raise Exception("Handshake failed")
+                error_message = f"Decode handshake failed for request rank={self.tp_rank} {decode_req.req.rid=} {decode_req.req.bootstrap_room=}"
+                try:
+                    decode_req.kv_receiver.failure_exception()
+                except Exception as e:
+                    error_message += f" with exception {e}"
+                logger.error(error_message)
+                prepare_abort(
+                    decode_req.req,
+                    error_message,
+                    status_code=HTTPStatus.INTERNAL_SERVER_ERROR,
+                )
 
     def pop_preallocated(self) -> List[DecodeRequest]:
         """Pop the preallocated requests from the pending queue (FIFO)."""
@@ -188,7 +264,18 @@
         indices_to_remove = set()
         allocatable_tokens = self._allocatable_tokens()
 
+        # First, remove all failed requests from the queue
         for i, decode_req in enumerate(self.queue):
+            if isinstance(decode_req.req.finished_reason, FINISH_ABORT):
+                self.scheduler.stream_output(
+                    [decode_req.req], decode_req.req.return_logprob
+                )
+                indices_to_remove.add(i)
+
+        for i, decode_req in enumerate(self.queue):
+            if i in indices_to_remove:
+                continue
+
             if not decode_req.waiting_for_input:
                 continue
 
@@ -308,18 +395,22 @@ class DecodeTransferQueue:
         self,
         gloo_group: ProcessGroup,
         req_to_metadata_buffer_idx_allocator: ReqToMetadataIdxAllocator,
-        metadata_buffers: torch.Tensor,
+        metadata_buffers: MetadataBuffers,
+        scheduler: Scheduler,
+        tree_cache: BasePrefixCache,
     ):
         self.queue: List[DecodeRequest] = []
         self.gloo_group = gloo_group
         self.req_to_metadata_buffer_idx_allocator = req_to_metadata_buffer_idx_allocator
         self.metadata_buffers = metadata_buffers
+        self.scheduler = scheduler
+        self.tree_cache = tree_cache
 
-    def add(self, req_conn: DecodeRequest) -> None:
-        self.queue.append(req_conn)
+    def add(self, decode_req: DecodeRequest) -> None:
+        self.queue.append(decode_req)
 
-    def extend(self, req_conns) -> None:
-        self.queue.extend(req_conns)
+    def extend(self, decode_reqs: List[DecodeRequest]) -> None:
+        self.queue.extend(decode_reqs)
 
     def pop_transferred(self) -> List[DecodeRequest]:
         if not self.queue:
@@ -333,18 +424,57 @@
         indices_to_remove = set()
         for i, (decode_req, poll) in enumerate(zip(self.queue, polls)):
             if poll == KVPoll.Failed:
-                raise Exception("Transfer failed")
+                error_message = f"Decode transfer failed for request rank={self.scheduler.tp_rank} {decode_req.req.rid=} {decode_req.req.bootstrap_room=}"
+                try:
+                    decode_req.kv_receiver.failure_exception()
+                except Exception as e:
+                    error_message += f" with exception {e}"
+                logger.error(error_message)
+                prepare_abort(
+                    decode_req.req,
+                    error_message,
+                    status_code=HTTPStatus.INTERNAL_SERVER_ERROR,
+                )
+                self.scheduler.stream_output(
+                    [decode_req.req], decode_req.req.return_logprob
+                )
+                # unlock the kv cache or it will have memory leak
+                self.tree_cache.cache_finished_req(decode_req.req)
+                indices_to_remove.add(i)
+                continue
             elif poll == KVPoll.Success:
-                # pop and push it to waiting queue
+
                 idx = decode_req.metadata_buffer_index
-                assert len(decode_req.req.output_ids) == 0
-                output_id_buffer = self.metadata_buffers[0]
-                # the last dimension is padded by the same values.
-                output_id = output_id_buffer[idx][0].item()
-                assert len(decode_req.req.output_ids) == 0
-                assert decode_req.req.transferred_output_id is None
-                decode_req.req.transferred_output_id = output_id
-                transferred_reqs.append(decode_req)
+                (
+                    output_id,
+                    output_token_logprobs_val,
+                    output_token_logprobs_idx,
+                    output_top_logprobs_val,
+                    output_top_logprobs_idx,
+                ) = self.metadata_buffers.get_buf(idx)
+
+                decode_req.req.output_ids.append(output_id[0].item())
+
+                if decode_req.req.return_logprob:
+                    decode_req.req.output_token_logprobs_val.append(
+                        output_token_logprobs_val[0].item()
+                    )
+                    decode_req.req.output_token_logprobs_idx.append(
+                        output_token_logprobs_idx[0].item()
+                    )
+                    decode_req.req.output_top_logprobs_val.append(
+                        output_top_logprobs_val[
+                            : decode_req.req.top_logprobs_num
+                        ].tolist()
+                    )
+                    decode_req.req.output_top_logprobs_idx.append(
+                        output_top_logprobs_idx[
+                            : decode_req.req.top_logprobs_num
+                        ].tolist()
+                    )
+                if hasattr(decode_req.kv_receiver, "clear"):
+                    decode_req.kv_receiver.clear()
+                transferred_reqs.append(decode_req.req)
                 indices_to_remove.add(i)
             elif poll in [
                 KVPoll.Bootstrapping,
367
497
  return transferred_reqs
368
498
 
369
499
 
370
- class ScheduleBatchDisaggregationDecodeMixin:
371
-
372
- def prepare_for_prebuilt_extend(self: ScheduleBatch):
373
- """
374
- Prepare a prebuilt extend by populate metadata
375
- Adapted from .prepare_for_extend().
376
- """
377
-
378
- self.forward_mode = ForwardMode.EXTEND
379
- reqs = self.reqs
380
- input_ids = [r.fill_ids[len(r.prefix_indices) :] for r in reqs]
381
- extend_num_tokens = sum(len(ids) for ids in input_ids)
382
- seq_lens = []
383
- pre_lens = []
384
- req_pool_indices = []
385
-
386
- # Pre-calculate total size
387
- total_size = sum(req.extend_input_len for req in reqs)
388
- out_cache_loc = torch.empty(total_size, dtype=torch.int64, device=self.device)
389
-
390
- # Fill the tensor in one pass
391
- offset = 0
392
- for i, req in enumerate(reqs):
393
- req_pool_indices.append(req.req_pool_idx)
394
-
395
- chunk = self.req_to_token_pool.req_to_token[req.req_pool_idx][
396
- : req.extend_input_len
397
- ]
398
- assert (
399
- offset + req.extend_input_len <= total_size
400
- ), f"Exceeds total size: offset={offset}, req.extend_input_len={req.extend_input_len}, total_size={total_size}"
401
- out_cache_loc[offset : offset + req.extend_input_len] = chunk
402
- offset += req.extend_input_len
403
-
404
- pre_len = len(req.prefix_indices)
405
- seq_len = len(req.origin_input_ids) + max(0, len(req.output_ids) - 1)
406
- seq_lens.append(seq_len)
407
- if len(req.output_ids) == 0:
408
- assert (
409
- seq_len - pre_len == req.extend_input_len
410
- ), f"seq_len={seq_len}, pre_len={pre_len}, req.extend_input_len={req.extend_input_len}"
411
-
412
- req.cached_tokens += pre_len - req.already_computed
413
- req.already_computed = seq_len
414
- req.is_retracted = False
415
- pre_lens.append(pre_len)
416
- req.extend_logprob_start_len = 0
417
-
418
- extend_input_logprob_token_ids = None
419
-
420
- # Set fields
421
- self.input_ids = torch.tensor(
422
- sum(input_ids, []), dtype=torch.int32, device=self.device
423
- )
424
- self.req_pool_indices = torch.tensor(
425
- req_pool_indices, dtype=torch.int64, device=self.device
426
- )
427
- self.seq_lens = torch.tensor(seq_lens, dtype=torch.int64, device=self.device)
428
- self.out_cache_loc = out_cache_loc
429
- self.seq_lens_sum = sum(seq_lens)
430
- self.extend_num_tokens = extend_num_tokens
431
- self.prefix_lens = [len(r.prefix_indices) for r in reqs]
432
- self.extend_lens = [r.extend_input_len for r in reqs]
433
- self.extend_logprob_start_lens = [r.extend_logprob_start_len for r in reqs]
434
- self.extend_input_logprob_token_ids = extend_input_logprob_token_ids
435
-
436
- # Build sampling info
437
- self.sampling_info = SamplingBatchInfo.from_schedule_batch(
438
- self,
439
- self.model_config.vocab_size,
440
- )
441
-
442
- def process_prebuilt_extend(
443
- self: ScheduleBatch, server_args: ServerArgs, model_config: ModelConfig
444
- ):
445
- """Assign the buffered last input id to schedule batch"""
446
- self.output_ids = []
447
- for req in self.reqs:
448
- if req.output_ids and len(req.output_ids) > 0:
449
- # resumed retracted req
450
- self.output_ids.append(req.output_ids[-1])
451
- else:
452
- assert req.transferred_output_id is not None
453
- req.output_ids.append(req.transferred_output_id)
454
- self.output_ids.append(req.transferred_output_id)
455
- self.tree_cache.cache_unfinished_req(req)
456
- self.output_ids = torch.tensor(self.output_ids, device=self.device)
457
-
458
-
459
500
  class SchedulerDisaggregationDecodeMixin:
460
501
 
461
502
  def _prepare_idle_batch_and_run(self, batch, delay_process=False):
@@ -488,7 +529,9 @@ class SchedulerDisaggregationDecodeMixin:
                 # Generate fake extend output.
                 if batch.forward_mode.is_extend():
                     # Note: Logprobs should be handled on the prefill engine.
-                    self.stream_output(batch.reqs, False)
+                    self.stream_output(
+                        batch.reqs, any(req.return_logprob for req in batch.reqs)
+                    )
                     if prepare_dp_attn_flag:
                         self._prepare_idle_batch_and_run(None)
                 else:
@@ -534,7 +577,9 @@ class SchedulerDisaggregationDecodeMixin:
                 # Generate fake extend output.
                 if batch.forward_mode.is_extend():
                     # Note: Logprobs should be handled on the prefill engine.
-                    self.stream_output(batch.reqs, False)
+                    self.stream_output(
+                        batch.reqs, any(req.return_logprob for req in batch.reqs)
+                    )
                     if prepare_dp_attn_flag:
                         batch_, result = self._prepare_idle_batch_and_run(
                             None, delay_process=True
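Both event loops above now derive the streaming flag for the fake extend output from the requests instead of hard-coding False. A minimal, self-contained sketch of that flag computation; FakeReq is a toy stand-in, not sglang's real Req class:

from dataclasses import dataclass
from typing import List


@dataclass
class FakeReq:
    # Toy stand-in for the scheduler's Req; only the field used here.
    return_logprob: bool


def fake_extend_return_logprob(reqs: List[FakeReq]) -> bool:
    # Mirrors `any(req.return_logprob for req in batch.reqs)` in the diff:
    # stream logprobs for the fake extend output iff any request asked for them.
    return any(r.return_logprob for r in reqs)


if __name__ == "__main__":
    print(fake_extend_return_logprob([FakeReq(False), FakeReq(True)]))   # True
    print(fake_extend_return_logprob([FakeReq(False), FakeReq(False)]))  # False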
@@ -547,7 +592,18 @@ class SchedulerDisaggregationDecodeMixin:
                         self.prepare_dp_attn_batch(batch)
                     result = self.run_batch(batch)
                     result_queue.append((batch.copy(), result))
+
+                    if (self.last_batch is None) or (not self.last_batch_in_queue):
+                        # Create a dummy first batch to start the pipeline for overlap schedule.
+                        # It is now used for triggering the sampling_info_done event.
+                        tmp_batch = ScheduleBatch(
+                            reqs=None,
+                            forward_mode=ForwardMode.DUMMY_FIRST,
+                            next_batch_sampling_info=self.tp_worker.cur_sampling_info,
+                        )
+                        self.set_next_batch_sampling_info_done(tmp_batch)
                     last_batch_in_queue = True
+
             elif prepare_dp_attn_flag:
                 batch, result = self._prepare_idle_batch_and_run(
                     None, delay_process=True
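The DUMMY_FIRST batch added here exists only to fire the sampling_info_done event when there is no previous batch in the queue, so the overlap pipeline never blocks on a signal that nothing would set. A self-contained sketch of that handshake with threading.Event; the class and function below are illustrative stand-ins, not sglang's real types:

import threading


class SamplingInfo:
    """Toy stand-in: the consumer blocks until the producer marks it done."""

    def __init__(self) -> None:
        self.sampling_info_done = threading.Event()


def signal_sampling_info_done(info: SamplingInfo) -> None:
    # What the dummy first batch effectively triggers: unblock the waiter.
    info.sampling_info_done.set()


def consumer(info: SamplingInfo) -> None:
    info.sampling_info_done.wait()  # would wait forever without the dummy signal
    print("sampling info ready, processing previous batch results")


if __name__ == "__main__":
    info = SamplingInfo()
    t = threading.Thread(target=consumer, args=(info,))
    t.start()
    signal_sampling_info_done(info)  # the dummy-first-batch trigger
    t.join()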
@@ -559,6 +615,9 @@ class SchedulerDisaggregationDecodeMixin:
             # Process the results of the previous batch but skip if the last batch is extend
             if self.last_batch and self.last_batch_in_queue:
                 tmp_batch, tmp_result = result_queue.popleft()
+                tmp_batch.next_batch_sampling_info = (
+                    self.tp_worker.cur_sampling_info if batch else None
+                )
                 self.process_batch_result(tmp_batch, tmp_result)
 
             if batch is None and (
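This hunk keeps the overlap loop's one-batch delay: a launched batch's result is queued and only consumed on the next iteration, after it has been pointed at the current sampling info. A runnable toy of that delayed-processing shape, using plain strings instead of ScheduleBatch:

from collections import deque


def run_batch(batch: str) -> str:
    return f"result({batch})"


def process_batch_result(batch: str, result: str) -> None:
    print(f"processed {batch} -> {result}")


result_queue: deque = deque()
last_batch = None

for batch in ["b0", "b1", "b2"]:
    # Launch the current batch; its result is only processed next iteration.
    result_queue.append((batch, run_batch(batch)))
    if last_batch is not None:
        prev_batch, prev_result = result_queue.popleft()
        process_batch_result(prev_batch, prev_result)
    last_batch = batch

# Drain the final pending result.
process_batch_result(*result_queue.popleft())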
@@ -607,6 +666,9 @@ class SchedulerDisaggregationDecodeMixin:
 
     def get_new_prebuilt_batch(self: Scheduler) -> Optional[ScheduleBatch]:
         """Create a schedulebatch for fake completed prefill"""
+        if self.grammar_queue:
+            self.move_ready_grammar_requests()
+
         if len(self.waiting_queue) == 0:
             return None
 
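get_new_prebuilt_batch now drains grammar-constrained requests whose grammar has finished compiling before inspecting the waiting queue. A toy version of that move-when-ready pattern; ToyScheduler and its fields are hypothetical, only the control flow mirrors the diff:

from dataclasses import dataclass, field
from typing import List


@dataclass
class PendingReq:
    rid: str
    grammar_ready: bool = False


@dataclass
class ToyScheduler:
    grammar_queue: List[PendingReq] = field(default_factory=list)
    waiting_queue: List[PendingReq] = field(default_factory=list)

    def move_ready_grammar_requests(self) -> None:
        # Move requests whose grammar finished compiling into the waiting queue;
        # keep the rest queued for a later attempt.
        still_waiting: List[PendingReq] = []
        for req in self.grammar_queue:
            (self.waiting_queue if req.grammar_ready else still_waiting).append(req)
        self.grammar_queue = still_waiting


if __name__ == "__main__":
    s = ToyScheduler(grammar_queue=[PendingReq("a", True), PendingReq("b", False)])
    if s.grammar_queue:
        s.move_ready_grammar_requests()
    print([r.rid for r in s.waiting_queue], [r.rid for r in s.grammar_queue])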
@@ -632,8 +694,6 @@ class SchedulerDisaggregationDecodeMixin:
         self.waiting_queue = waiting_queue
         if len(can_run_list) == 0:
             return None
-        # local import to avoid circular import
-        from sglang.srt.managers.schedule_batch import ScheduleBatch
 
         # construct a schedule batch with those requests and mark as decode
         new_batch = ScheduleBatch.init_new(
@@ -655,15 +715,8 @@ class SchedulerDisaggregationDecodeMixin:
 
     def process_decode_queue(self: Scheduler):
         req_conns = self.disagg_decode_prealloc_queue.pop_preallocated()
-
-        def _num_pre_alloc(req):
-            return len(req.req.origin_input_ids) + max(len(req.req.output_ids) - 1, 0)
-
-        self.num_tokens_pre_allocated += sum(_num_pre_alloc(req) for req in req_conns)
         self.disagg_decode_transfer_queue.extend(req_conns)
         alloc_reqs = (
             self.disagg_decode_transfer_queue.pop_transferred()
         ) # the requests which kv has arrived
-        self.num_tokens_pre_allocated -= sum(_num_pre_alloc(req) for req in alloc_reqs)
-
-        self.waiting_queue.extend([req.req for req in alloc_reqs])
+        self.waiting_queue.extend(alloc_reqs)
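process_decode_queue is reduced to moving preallocated requests into the transfer queue and promoting requests whose KV cache has arrived straight into the waiting queue. A small, runnable sketch of that two-stage flow with plain lists standing in for the real queues (the class below is a toy, not sglang's):

from dataclasses import dataclass
from typing import List


@dataclass
class DecodeReq:
    rid: str
    kv_arrived: bool = False


class ToyDecodeQueues:
    def __init__(self) -> None:
        self.prealloc: List[DecodeReq] = []
        self.transfer: List[DecodeReq] = []
        self.waiting: List[DecodeReq] = []

    def pop_preallocated(self) -> List[DecodeReq]:
        popped, self.prealloc = self.prealloc, []
        return popped

    def pop_transferred(self) -> List[DecodeReq]:
        done = [r for r in self.transfer if r.kv_arrived]
        self.transfer = [r for r in self.transfer if not r.kv_arrived]
        return done

    def process_decode_queue(self) -> None:
        # Mirrors the simplified loop in the diff: preallocated requests join the
        # transfer queue, and requests whose KV cache has arrived become schedulable.
        self.transfer.extend(self.pop_preallocated())
        self.waiting.extend(self.pop_transferred())


if __name__ == "__main__":
    q = ToyDecodeQueues()
    q.prealloc = [DecodeReq("a", kv_arrived=True), DecodeReq("b")]
    q.process_decode_queue()
    print([r.rid for r in q.waiting])  # ['a']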
@@ -0,0 +1,142 @@
+from __future__ import annotations
+
+import logging
+from typing import TYPE_CHECKING
+
+import torch
+
+from sglang.srt.model_executor.forward_batch_info import CaptureHiddenMode, ForwardMode
+from sglang.srt.sampling.sampling_batch_info import SamplingBatchInfo
+
+logger = logging.getLogger(__name__)
+
+if TYPE_CHECKING:
+    from sglang.srt.configs.model_config import ModelConfig
+    from sglang.srt.managers.schedule_batch import ScheduleBatch
+    from sglang.srt.server_args import ServerArgs
+
+
+class ScheduleBatchDisaggregationDecodeMixin:
+
+    def prepare_for_prebuilt_extend(self: ScheduleBatch):
+        """
+        Prepare a prebuilt extend by populate metadata
+        Adapted from .prepare_for_extend().
+        """
+
+        self.forward_mode = ForwardMode.EXTEND
+        reqs = self.reqs
+        input_ids = [r.fill_ids[len(r.prefix_indices) :] for r in reqs]
+        extend_num_tokens = sum(len(ids) for ids in input_ids)
+        seq_lens = []
+        pre_lens = []
+        req_pool_indices = []
+
+        # Pre-calculate total size
+        total_size = sum(req.extend_input_len for req in reqs)
+        out_cache_loc = torch.empty(total_size, dtype=torch.int64, device=self.device)
+
+        # Fill the tensor in one pass
+        offset = 0
+        for i, req in enumerate(reqs):
+            req_pool_indices.append(req.req_pool_idx)
+
+            chunk = self.req_to_token_pool.req_to_token[req.req_pool_idx][
+                : req.extend_input_len
+            ]
+            assert (
+                offset + req.extend_input_len <= total_size
+            ), f"Exceeds total size: offset={offset}, req.extend_input_len={req.extend_input_len}, total_size={total_size}"
+            out_cache_loc[offset : offset + req.extend_input_len] = chunk
+            offset += req.extend_input_len
+
+            pre_len = len(req.prefix_indices)
+            seq_len = len(req.origin_input_ids) + max(0, len(req.output_ids) - 1)
+            seq_lens.append(seq_len)
+            if len(req.output_ids) == 0:
+                assert (
+                    seq_len - pre_len == req.extend_input_len
+                ), f"seq_len={seq_len}, pre_len={pre_len}, req.extend_input_len={req.extend_input_len}"
+
+            req.cached_tokens += pre_len - req.already_computed
+            req.already_computed = seq_len
+            req.is_retracted = False
+            pre_lens.append(pre_len)
+            req.extend_logprob_start_len = 0
+
+        extend_input_logprob_token_ids = None
+
+        # Set fields
+        self.input_ids = torch.tensor(
+            sum(input_ids, []), dtype=torch.int32, device=self.device
+        )
+        self.req_pool_indices = torch.tensor(
+            req_pool_indices, dtype=torch.int64, device=self.device
+        )
+        self.seq_lens = torch.tensor(seq_lens, dtype=torch.int64, device=self.device)
+        self.out_cache_loc = out_cache_loc
+        self.seq_lens_sum = sum(seq_lens)
+
+        if self.return_logprob:
+            self.top_logprobs_nums = [r.top_logprobs_num for r in reqs]
+            self.token_ids_logprobs = [r.token_ids_logprob for r in reqs]
+
+        self.extend_num_tokens = extend_num_tokens
+        self.prefix_lens = [len(r.prefix_indices) for r in reqs]
+        self.extend_lens = [r.extend_input_len for r in reqs]
+        self.extend_logprob_start_lens = [r.extend_logprob_start_len for r in reqs]
+        self.extend_input_logprob_token_ids = extend_input_logprob_token_ids
+
+        # Build sampling info
+        self.sampling_info = SamplingBatchInfo.from_schedule_batch(
+            self,
+            self.model_config.vocab_size,
+        )
+
+    def process_prebuilt_extend(
+        self: ScheduleBatch, server_args: ServerArgs, model_config: ModelConfig
+    ):
+        """Assign the buffered last input id to schedule batch"""
+        self.output_ids = []
+        for req in self.reqs:
+            self.output_ids.append(req.output_ids[-1])
+            self.tree_cache.cache_unfinished_req(req)
+            if req.grammar is not None:
+                req.grammar.accept_token(req.output_ids[-1])
+                req.grammar.finished = req.finished()
+        self.output_ids = torch.tensor(self.output_ids, device=self.device)
+
+        # Simulate the eagle run. We add mock data to hidden states for the
+        # ease of implementation now meaning the first token will have acc rate
+        # of 0.
+        if not self.spec_algorithm.is_none():
+
+            b = len(self.reqs)
+            topk_p = torch.arange(
+                b * server_args.speculative_eagle_topk,
+                0,
+                -1,
+                device=self.device,
+                dtype=torch.float32,
+            )
+            topk_p = topk_p.reshape(b, server_args.speculative_eagle_topk)
+            topk_p /= b * server_args.speculative_eagle_topk
+            topk_index = torch.arange(
+                b * server_args.speculative_eagle_topk, device=self.device
+            )
+            topk_index = topk_index.reshape(b, server_args.speculative_eagle_topk)
+
+            # local import to avoid circular import
+            from sglang.srt.speculative.eagle_utils import EagleDraftInput
+
+            spec_info = EagleDraftInput(
+                topk_p=topk_p,
+                topk_index=topk_index,
+                hidden_states=torch.ones(
+                    (b, model_config.hidden_size), device=self.device
+                ),
+                verified_id=self.output_ids,
+            )
+            spec_info.prepare_for_extend(self)
+            spec_info.capture_hidden_mode = CaptureHiddenMode.LAST
+            self.spec_info = spec_info
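The heart of prepare_for_prebuilt_extend above is the single-pass packing of each request's reserved token slots into one preallocated out_cache_loc tensor, plus the seq_len = len(origin_input_ids) + max(0, len(output_ids) - 1) bookkeeping. A self-contained approximation with toy request objects (not sglang's Req), assuming only torch:

import torch
from dataclasses import dataclass, field
from typing import List


@dataclass
class ToyReq:
    # Token slots already reserved for this request, a toy version of
    # req_to_token_pool.req_to_token[req.req_pool_idx].
    token_slots: torch.Tensor
    extend_input_len: int
    origin_input_ids: List[int] = field(default_factory=list)
    output_ids: List[int] = field(default_factory=list)


def pack_out_cache_loc(reqs: List[ToyReq]) -> torch.Tensor:
    # Pre-calculate the total size, then fill the tensor in one pass,
    # the same shape as the loop in prepare_for_prebuilt_extend.
    total_size = sum(r.extend_input_len for r in reqs)
    out_cache_loc = torch.empty(total_size, dtype=torch.int64)
    offset = 0
    for r in reqs:
        chunk = r.token_slots[: r.extend_input_len]
        out_cache_loc[offset : offset + r.extend_input_len] = chunk
        offset += r.extend_input_len
    return out_cache_loc


def seq_len(r: ToyReq) -> int:
    # Mirrors the bookkeeping in the diff: the most recent output token is
    # treated as the next input rather than part of the cached sequence.
    return len(r.origin_input_ids) + max(0, len(r.output_ids) - 1)


if __name__ == "__main__":
    reqs = [
        ToyReq(torch.arange(0, 8), extend_input_len=5, origin_input_ids=[1] * 5),
        ToyReq(torch.arange(8, 16), extend_input_len=3, origin_input_ids=[2] * 2,
               output_ids=[7, 8]),
    ]
    print(pack_out_cache_loc(reqs))    # tensor of 8 packed slot indices
    print([seq_len(r) for r in reqs])  # [5, 3]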
@@ -33,28 +33,18 @@ class FakeKVSender(BaseKVSender):
         self,
         kv_indices: list[int],
         aux_index: Optional[int] = None,
-        dest_ranks: Optional[list[int]] = None,
     ):
         logger.info(
-            f"FakeKVSender init with kv_indices: {kv_indices}, aux_index: {aux_index}, dest_ranks: {dest_ranks}"
+            f"FakeKVSender init with kv_indices: {kv_indices}, aux_index: {aux_index}"
         )
         pass
 
     def send(
         self,
         kv_indices: npt.NDArray[np.int64],
-        index_slice: slice,
-        is_last: bool,
     ):
-        logger.info(
-            f"FakeKVSender send with kv_indices: {kv_indices}, index_slice: {index_slice}, is_last: {is_last}"
-        )
-        if is_last:
-            self.has_sent = True
-            logger.info(f"FakeKVSender send success")
-        else:
-            self.has_sent = False
-            logger.info(f"FakeKVSender send fake transferring")
+        self.has_sent = True
+        logger.info(f"FakeKVSender send with kv_indices: {kv_indices}")
 
     def failure_exception(self):
         raise Exception("Fake KVSender Exception")
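After this change FakeKVSender.send drops the index_slice/is_last parameters and marks every transfer as sent immediately. A stand-alone test-double sketch with the same always-succeeds behaviour; ToyFakeKVSender is hypothetical, not the class shipped in sglang:

import logging
from typing import List, Optional

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("fake_kv")


class ToyFakeKVSender:
    """No-op KV sender: every send is treated as an immediately completed transfer."""

    def __init__(self, kv_indices: List[int], aux_index: Optional[int] = None):
        self.has_sent = False
        logger.info("init with kv_indices=%s aux_index=%s", kv_indices, aux_index)

    def send(self, kv_indices: List[int]) -> None:
        self.has_sent = True
        logger.info("send with kv_indices=%s", kv_indices)

    def failure_exception(self) -> None:
        raise Exception("Fake KVSender Exception")


if __name__ == "__main__":
    sender = ToyFakeKVSender([0, 1, 2])
    sender.send([0, 1, 2])
    assert sender.has_sent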
@@ -66,6 +56,7 @@ class FakeKVReceiver(BaseKVReceiver):
         mgr: BaseKVManager,
         bootstrap_addr: str,
         bootstrap_room: Optional[int] = None,
+        data_parallel_rank: Optional[int] = None,
     ):
         self.has_init = False