sglang 0.4.6.post5__py3-none-any.whl → 0.4.7.post1__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (359)
  1. sglang/__init__.py +2 -0
  2. sglang/api.py +7 -0
  3. sglang/bench_offline_throughput.py +10 -4
  4. sglang/bench_one_batch_server.py +67 -11
  5. sglang/bench_serving.py +86 -75
  6. sglang/lang/backend/runtime_endpoint.py +24 -1
  7. sglang/lang/interpreter.py +40 -1
  8. sglang/lang/ir.py +27 -0
  9. sglang/math_utils.py +8 -0
  10. sglang/profiler.py +167 -0
  11. sglang/srt/_custom_ops.py +34 -0
  12. sglang/srt/configs/internvl.py +8 -12
  13. sglang/srt/configs/model_config.py +33 -1
  14. sglang/srt/constrained/base_grammar_backend.py +5 -2
  15. sglang/srt/constrained/llguidance_backend.py +9 -8
  16. sglang/srt/constrained/outlines_backend.py +5 -4
  17. sglang/srt/constrained/xgrammar_backend.py +18 -18
  18. sglang/srt/conversation.py +52 -8
  19. sglang/srt/custom_op.py +38 -3
  20. sglang/srt/debug_utils.py +74 -0
  21. sglang/srt/disaggregation/base/__init__.py +1 -1
  22. sglang/srt/disaggregation/base/conn.py +25 -11
  23. sglang/srt/disaggregation/common/__init__.py +5 -0
  24. sglang/srt/disaggregation/common/conn.py +407 -0
  25. sglang/srt/disaggregation/common/utils.py +42 -0
  26. sglang/srt/disaggregation/decode.py +261 -52
  27. sglang/srt/disaggregation/fake/__init__.py +1 -1
  28. sglang/srt/disaggregation/fake/conn.py +16 -9
  29. sglang/srt/disaggregation/kv_events.py +60 -5
  30. sglang/srt/disaggregation/launch_lb.py +140 -0
  31. sglang/srt/disaggregation/mini_lb.py +29 -48
  32. sglang/srt/disaggregation/mooncake/__init__.py +1 -1
  33. sglang/srt/disaggregation/mooncake/conn.py +446 -149
  34. sglang/srt/disaggregation/mooncake/transfer_engine.py +32 -16
  35. sglang/srt/disaggregation/nixl/__init__.py +6 -1
  36. sglang/srt/disaggregation/nixl/conn.py +134 -437
  37. sglang/srt/disaggregation/prefill.py +130 -43
  38. sglang/srt/disaggregation/utils.py +127 -86
  39. sglang/srt/distributed/device_communicators/pymscclpp.py +315 -0
  40. sglang/srt/distributed/parallel_state.py +52 -5
  41. sglang/srt/entrypoints/EngineBase.py +6 -0
  42. sglang/srt/entrypoints/engine.py +116 -5
  43. sglang/srt/entrypoints/http_server.py +28 -4
  44. sglang/srt/eplb_simulator/__init__.py +1 -0
  45. sglang/srt/eplb_simulator/reader.py +51 -0
  46. sglang/srt/function_call/base_format_detector.py +138 -86
  47. sglang/srt/function_call/deepseekv3_detector.py +54 -6
  48. sglang/srt/function_call/ebnf_composer.py +33 -19
  49. sglang/srt/function_call/function_call_parser.py +27 -0
  50. sglang/srt/function_call/llama32_detector.py +33 -14
  51. sglang/srt/function_call/mistral_detector.py +73 -26
  52. sglang/srt/function_call/pythonic_detector.py +86 -20
  53. sglang/srt/function_call/qwen25_detector.py +64 -10
  54. sglang/srt/function_call/utils.py +17 -0
  55. sglang/srt/hf_transformers_utils.py +4 -0
  56. sglang/srt/layers/activation.py +19 -0
  57. sglang/srt/layers/attention/aiter_backend.py +503 -125
  58. sglang/srt/layers/attention/base_attn_backend.py +4 -0
  59. sglang/srt/layers/attention/cutlass_mla_backend.py +40 -34
  60. sglang/srt/layers/attention/flashattention_backend.py +137 -63
  61. sglang/srt/layers/attention/flashinfer_backend.py +46 -3
  62. sglang/srt/layers/attention/flashinfer_mla_backend.py +59 -25
  63. sglang/srt/layers/attention/flashmla_backend.py +2 -10
  64. sglang/srt/layers/attention/intel_amx_backend.py +128 -0
  65. sglang/srt/layers/attention/tbo_backend.py +232 -0
  66. sglang/srt/layers/attention/torch_native_backend.py +3 -0
  67. sglang/srt/layers/attention/triton_backend.py +304 -65
  68. sglang/srt/layers/attention/triton_ops/decode_attention.py +2 -7
  69. sglang/srt/layers/attention/triton_ops/extend_attention.py +12 -4
  70. sglang/srt/layers/attention/vision.py +51 -24
  71. sglang/srt/layers/communicator.py +281 -197
  72. sglang/srt/layers/dp_attention.py +6 -5
  73. sglang/srt/layers/layernorm.py +30 -19
  74. sglang/srt/layers/linear.py +0 -4
  75. sglang/srt/layers/logits_processor.py +0 -12
  76. sglang/srt/layers/moe/cutlass_moe.py +170 -7
  77. sglang/srt/layers/moe/cutlass_moe_params.py +169 -0
  78. sglang/srt/layers/moe/ep_moe/kernels.py +33 -11
  79. sglang/srt/layers/moe/ep_moe/layer.py +136 -72
  80. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +24 -45
  81. sglang/srt/layers/moe/fused_moe_native.py +4 -0
  82. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  83. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  84. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  85. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  86. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  87. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  88. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  89. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  90. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +221 -29
  91. sglang/srt/layers/moe/fused_moe_triton/layer.py +34 -4
  92. sglang/srt/layers/moe/topk.py +60 -26
  93. sglang/srt/layers/multimodal.py +3 -3
  94. sglang/srt/layers/pooler.py +56 -0
  95. sglang/srt/layers/quantization/__init__.py +3 -2
  96. sglang/srt/layers/quantization/blockwise_int8.py +3 -0
  97. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +5 -0
  98. sglang/srt/layers/quantization/deep_gemm_wrapper/__init__.py +1 -0
  99. sglang/srt/layers/quantization/{deep_gemm.py → deep_gemm_wrapper/compile_utils.py} +69 -127
  100. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +32 -0
  101. sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +110 -0
  102. sglang/srt/layers/quantization/fp8.py +28 -23
  103. sglang/srt/layers/quantization/fp8_kernel.py +156 -75
  104. sglang/srt/layers/quantization/fp8_utils.py +250 -69
  105. sglang/srt/layers/quantization/modelopt_quant.py +334 -7
  106. sglang/srt/layers/quantization/moe_wna16.py +3 -0
  107. sglang/srt/layers/quantization/w8a8_fp8.py +3 -0
  108. sglang/srt/layers/quantization/w8a8_int8.py +3 -0
  109. sglang/srt/layers/radix_attention.py +2 -3
  110. sglang/srt/layers/rotary_embedding.py +6 -12
  111. sglang/srt/layers/sampler.py +80 -79
  112. sglang/srt/layers/utils.py +6 -0
  113. sglang/srt/lora/layers.py +12 -15
  114. sglang/srt/lora/lora.py +49 -5
  115. sglang/srt/lora/lora_manager.py +98 -39
  116. sglang/srt/lora/mem_pool.py +28 -21
  117. sglang/srt/lora/utils.py +17 -13
  118. sglang/srt/managers/cache_controller.py +2 -1
  119. sglang/srt/managers/data_parallel_controller.py +13 -5
  120. sglang/srt/managers/eplb_algorithms/__init__.py +63 -0
  121. sglang/srt/managers/eplb_algorithms/deepseek.py +223 -0
  122. sglang/srt/managers/{deepseek_eplb.py → eplb_algorithms/deepseek_vec.py} +5 -7
  123. sglang/srt/managers/eplb_manager.py +55 -14
  124. sglang/srt/managers/expert_distribution.py +220 -46
  125. sglang/srt/managers/expert_location.py +110 -56
  126. sglang/srt/managers/expert_location_dispatch.py +23 -6
  127. sglang/srt/managers/io_struct.py +43 -8
  128. sglang/srt/managers/mm_utils.py +88 -38
  129. sglang/srt/managers/multimodal_processors/base_processor.py +190 -18
  130. sglang/srt/managers/multimodal_processors/gemma3.py +4 -31
  131. sglang/srt/managers/multimodal_processors/internvl.py +4 -0
  132. sglang/srt/managers/multimodal_processors/kimi_vl.py +15 -34
  133. sglang/srt/managers/multimodal_processors/minicpm.py +2 -1
  134. sglang/srt/managers/multimodal_processors/phi4mm.py +87 -0
  135. sglang/srt/managers/multimodal_processors/qwen_vl.py +22 -64
  136. sglang/srt/managers/multimodal_processors/vila.py +85 -0
  137. sglang/srt/managers/schedule_batch.py +173 -38
  138. sglang/srt/managers/scheduler.py +376 -127
  139. sglang/srt/managers/tokenizer_manager.py +163 -19
  140. sglang/srt/managers/utils.py +0 -4
  141. sglang/srt/mem_cache/chunk_cache.py +1 -0
  142. sglang/srt/mem_cache/hiradix_cache.py +4 -2
  143. sglang/srt/mem_cache/memory_pool.py +111 -407
  144. sglang/srt/mem_cache/memory_pool_host.py +380 -0
  145. sglang/srt/mem_cache/radix_cache.py +36 -12
  146. sglang/srt/metrics/collector.py +9 -0
  147. sglang/srt/model_executor/cuda_graph_runner.py +191 -113
  148. sglang/srt/model_executor/expert_location_updater.py +157 -22
  149. sglang/srt/model_executor/forward_batch_info.py +52 -22
  150. sglang/srt/model_executor/model_runner.py +102 -62
  151. sglang/srt/model_loader/loader.py +8 -1
  152. sglang/srt/model_loader/utils.py +67 -1
  153. sglang/srt/models/bert.py +113 -13
  154. sglang/srt/models/deepseek_nextn.py +1 -1
  155. sglang/srt/models/deepseek_v2.py +623 -290
  156. sglang/srt/models/gemma3_causal.py +7 -0
  157. sglang/srt/models/gemma3_mm.py +19 -14
  158. sglang/srt/models/idefics2.py +342 -0
  159. sglang/srt/models/internvl.py +46 -102
  160. sglang/srt/models/kimi_vl.py +4 -4
  161. sglang/srt/models/llama.py +1 -1
  162. sglang/srt/models/minicpmo.py +2 -5
  163. sglang/srt/models/minicpmv.py +3 -295
  164. sglang/srt/models/phi4mm.py +512 -0
  165. sglang/srt/models/qwen2.py +38 -9
  166. sglang/srt/models/qwen2_5_vl.py +3 -9
  167. sglang/srt/models/qwen2_eagle.py +4 -1
  168. sglang/srt/models/qwen2_moe.py +58 -191
  169. sglang/srt/models/qwen2_vl.py +3 -9
  170. sglang/srt/models/qwen3.py +41 -10
  171. sglang/srt/models/qwen3_moe.py +230 -191
  172. sglang/srt/models/registry.py +9 -1
  173. sglang/srt/models/roberta.py +117 -9
  174. sglang/srt/models/transformers.py +291 -0
  175. sglang/srt/models/vila.py +305 -0
  176. sglang/srt/openai_api/adapter.py +248 -28
  177. sglang/srt/openai_api/protocol.py +68 -3
  178. sglang/srt/openai_api/utils.py +172 -0
  179. sglang/srt/operations.py +37 -2
  180. sglang/srt/operations_strategy.py +200 -24
  181. sglang/srt/sampling/sampling_batch_info.py +37 -1
  182. sglang/srt/sampling/sampling_params.py +4 -1
  183. sglang/srt/server_args.py +381 -209
  184. sglang/srt/speculative/build_eagle_tree.py +9 -9
  185. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +12 -14
  186. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +256 -0
  187. sglang/srt/speculative/eagle_utils.py +440 -200
  188. sglang/srt/speculative/eagle_worker.py +234 -63
  189. sglang/srt/two_batch_overlap.py +637 -0
  190. sglang/srt/utils.py +187 -7
  191. sglang/test/attention/test_prefix_chunk_info.py +2 -0
  192. sglang/test/runners.py +54 -10
  193. sglang/test/send_one.py +4 -0
  194. sglang/test/test_block_fp8.py +1 -0
  195. sglang/test/test_block_fp8_deep_gemm_blackwell.py +252 -0
  196. sglang/test/test_block_fp8_ep.py +1 -0
  197. sglang/test/test_cutlass_moe.py +3 -3
  198. sglang/test/test_fp4_moe.py +248 -0
  199. sglang/test/test_utils.py +82 -7
  200. sglang/utils.py +9 -0
  201. sglang/version.py +1 -1
  202. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/METADATA +17 -14
  203. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/RECORD +359 -321
  204. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/WHEEL +1 -1
  205. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  206. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  207. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  208. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  209. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  210. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  211. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  212. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  213. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  214. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  215. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  216. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  217. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  218. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H200.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json} +0 -0
  219. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  220. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  221. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  222. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  223. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  224. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  225. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  226. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  227. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  228. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  229. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  230. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  231. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  232. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  233. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  234. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  235. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  236. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  237. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  238. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  239. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  240. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  241. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  242. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  243. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  244. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  245. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  246. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  247. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json} +0 -0
  248. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  249. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  250. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  251. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  252. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  253. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  254. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  255. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  256. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  257. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json} +0 -0
  258. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json} +0 -0
  259. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  260. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  261. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  262. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  263. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  264. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  265. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json} +0 -0
  266. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  267. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  268. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json} +0 -0
  269. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  270. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  271. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  272. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json} +0 -0
  273. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  274. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  275. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  276. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
  277. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  278. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  279. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  280. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json} +0 -0
  281. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json} +0 -0
  282. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json} +0 -0
  283. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json} +0 -0
  284. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  285. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  286. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json} +0 -0
  287. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json} +0 -0
  288. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json} +0 -0
  289. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json} +0 -0
  290. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  291. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  292. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  293. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  294. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json} +0 -0
  295. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  296. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  297. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  298. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  299. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json} +0 -0
  300. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json} +0 -0
  301. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json} +0 -0
  302. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json} +0 -0
  303. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  304. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  305. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
  306. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  307. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  308. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  309. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json} +0 -0
  310. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_L40S.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json} +0 -0
  311. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
  312. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
  313. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
  314. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  315. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  316. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  317. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  318. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json} +0 -0
  319. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json} +0 -0
  320. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json} +0 -0
  321. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json} +0 -0
  322. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  323. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  324. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  325. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  326. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json} +0 -0
  327. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
  328. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
  329. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
  330. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  331. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  332. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  333. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  334. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json} +0 -0
  335. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json} +0 -0
  336. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  337. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  338. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json} +0 -0
  339. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  340. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json} +0 -0
  341. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  342. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  343. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  344. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  345. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json} +0 -0
  346. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  347. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json} +0 -0
  348. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=96,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json} +0 -0
  349. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  350. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  351. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  352. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  353. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  354. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  355. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  356. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  357. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  358. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/licenses/LICENSE +0 -0
  359. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,315 @@
+ import bisect
+ import logging
+ import math
+ import os
+ from contextlib import contextmanager
+ from enum import IntEnum
+ from typing import Any, Callable, List, Optional, TypeVar, Union
+
+ import torch
+ import torch.distributed as dist
+ from torch.distributed import ProcessGroup, ReduceOp
+
+ from sglang.srt import _custom_ops as ops
+ from sglang.srt.utils import is_cuda, is_hip
+
+ logger = logging.getLogger(__name__)
+
+ _is_cuda = is_cuda()
+ _is_hip = is_hip()
+
+ mscclpp_is_available = False
+ if _is_hip:
+     # TODO(zyksir): mscclpp is untested on AMD and therefore disabled.
+     mscclpp_is_available = False
+ if _is_cuda:
+     try:
+         import sgl_kernel
+
+         mscclpp_is_available = True
+     except:
+         mscclpp_is_available = False
+
+
+ class MscclContextSelection(IntEnum):
+     MSCCL1SHOT1NODELL = 1
+     MSCCL1SHOT2NODELL = 2
+
+
+ def mscclpp_is_weak_contiguous(inp: torch.Tensor):
+     return inp.is_contiguous() or (
+         inp.storage().nbytes() - inp.storage_offset() * inp.element_size()
+         == inp.numel() * inp.element_size()
+     )
+
+
+ def mscclpp_convert_to_bytes(size_str):
+     """
+     Converts a human-readable size string (e.g., "1MB", "2.5kb", "3 GB")
+     into the equivalent number of bytes using binary units.
+
+     Args:
+         size_str (str): A string representing size with unit (KB, MB, GB).
+
+     Returns:
+         int: Number of bytes.
+     """
+     size_str = size_str.strip().lower()
+
+     if not size_str:
+         raise ValueError("Empty input string")
+
+     # Extract numeric part and unit
+     for i in range(len(size_str)):
+         if not size_str[i].isdigit() and size_str[i] != ".":
+             break
+     num_str = size_str[:i]
+     unit = size_str[i:].strip()
+
+     try:
+         num = float(num_str)
+     except ValueError:
+         raise ValueError(f"Invalid numeric value in '{size_str}'")
+
+     # Conversion factors
+     if unit == "b":
+         return int(num)
+     elif unit == "kb":
+         return int(num * 1024)
+     elif unit == "mb":
+         return int(num * 1024 * 1024)
+     elif unit == "gb":
+         return int(num * 1024 * 1024 * 1024)
+     else:
+         raise ValueError(f"Unsupported unit: {unit}, support B, KB, MB, GB only")
+
+
+ def mscclpp_bench_time(func, test_niter: int = 10, warmup_niter: int = 2):
+     # warmup
+     for _ in range(warmup_niter):
+         func()
+     start_event = torch.cuda.Event(enable_timing=True)
+     end_event = torch.cuda.Event(enable_timing=True)
+     torch.cuda.synchronize()
+     dist.barrier()
+     start_event.record()
+     for _ in range(test_niter):
+         func()
+     end_event.record()
+     end_event.synchronize()
+     func_cost_us = start_event.elapsed_time(end_event) / test_niter * 1000
+     return func_cost_us
+
+
+ class PyMscclppCommunicator:
+     _SUPPORTED_WORLD_SIZES = [8, 16]
+     _MAX_BYTES = mscclpp_convert_to_bytes(os.getenv("SGLANG_MSCCLPP_MAX_BYTES", "1MB"))
+     _SUPPORTED_DTYPE = [torch.float, torch.float16, torch.bfloat16]
+
+     # max_bytes: max supported mscclpp allreduce size
+     # in A100 mscclpp is faster than nccl only under condition of msg size smaller than1MB
+     def __init__(
+         self,
+         group: ProcessGroup,
+         device: Union[int, str, torch.device],
+         max_bytes=_MAX_BYTES,
+     ) -> None:
+         """
+         Args:
+             group: the process group to work on. If None, it will use the
+                 default process group.
+             device: the device to bind the CustomAllreduce to. If None,
+                 it will be bind to f"cuda:{local_rank}".
+             It is the caller's responsibility to make sure each communicator
+             is bind to a unique device, and all communicators in this group
+             are in the same node.
+         """
+         self._IS_CAPTURING = False
+         self.disabled = True
+
+         if not mscclpp_is_available:
+             # disable because of missing mscclpp library
+             # e.g. in a non-cuda environment
+             return
+
+         self.group = group
+
+         assert (
+             dist.get_backend(group) != dist.Backend.NCCL
+         ), "CustomAllreduce should be attached to a non-NCCL group."
+
+         rank = dist.get_rank(group=self.group)
+         world_size = dist.get_world_size(group=self.group)
+         if world_size == 1:
+             # No need to initialize mscclpp for single GPU case.
+             return
+
+         if world_size not in PyMscclppCommunicator._SUPPORTED_WORLD_SIZES:
+             logger.warning(
+                 "PyMscclpp is disabled due to an unsupported world"
+                 " size: %d. Supported world sizes: %s. To silence this "
+                 "warning, specify disable_mscclpp=True explicitly.",
+                 world_size,
+                 str(PyMscclppCommunicator._SUPPORTED_WORLD_SIZES),
+             )
+             return
+
+         self.ranks = torch.distributed.get_process_group_ranks(group)
+         self.nranks_per_node = torch.cuda.device_count()
+         # for now mscclpp with stride in the communicator is not tested
+         if not (abs(self.ranks[-1] - self.ranks[0]) == world_size - 1):
+             logger.warning(
+                 "PyMscclpp is disabled due to an unsupported group %s."
+                 "Please ensure all ranks in the group are consecutive."
+                 "To silence this warning, specify disable_mscclpp=True explicitly.",
+                 str(self.ranks),
+             )
+             return
+
+         if isinstance(device, int):
+             device = torch.device(f"cuda:{device}")
+         elif isinstance(device, str):
+             device = torch.device(device)
+         # now `device` is a `torch.device` object
+         assert isinstance(device, torch.device)
+         self.device = device
+
+         self.max_bytes = max_bytes
+         self.rank = rank
+         self.world_size = world_size
+
+         if dist.get_rank(group) == 0:
+             unique_id = [ops.mscclpp_generate_unique_id()]
+         else:
+             unique_id = [None]
+         dist.broadcast_object_list(unique_id, src=self.ranks[0], group=self.group)
+         self.unique_id = unique_id[0]
+         self.rank_to_node, self.rank_to_ib = list(range(world_size)), list(
+             range(world_size)
+         )
+         for r in range(world_size):
+             self.rank_to_node[r] = r // 8
+             self.rank_to_ib[r] = self.rank % 8
+
+         self._context = None
+         self.context_selection = None
+         self.msg_size_for_finetune = [
+             2**i for i in range(10, math.floor(math.log2(self.max_bytes)) + 1)
+         ]
+         self.msg_size2best_config = {}
+         if world_size == 8:
+             self.context_selection = MscclContextSelection.MSCCL1SHOT1NODELL
+         elif world_size == 16:
+             self.context_selection = MscclContextSelection.MSCCL1SHOT2NODELL
+         if not _is_hip:
+             self.scratch = torch.empty(
+                 self.max_bytes * 8,
+                 dtype=torch.uint8,
+                 device=self.device,
+             )
+             self.put_buffer = torch.empty(
+                 self.max_bytes * 8 // self.nranks_per_node,
+                 dtype=torch.uint8,
+                 device=self.device,
+             )
+             self._context = ops.mscclpp_init_context(
+                 self.unique_id,
+                 self.rank,
+                 self.world_size,
+                 self.scratch,
+                 self.put_buffer,
+                 self.nranks_per_node,
+                 self.rank_to_node,
+                 self.rank_to_ib,
+                 int(self.context_selection),
+             )
+         else:
+             raise NotImplementedError("HIP Mscclpp is not supported yet.")
+
+         self.msg_size2best_config = {}
+         self.pre_tune_config()
+         if dist.get_rank(group) == 0:
+             msg_size2best_config = [self.msg_size2best_config]
+         else:
+             msg_size2best_config = [None]
+         dist.broadcast_object_list(
+             msg_size2best_config, src=self.ranks[0], group=self.group
+         )
+         self.msg_size2best_config = msg_size2best_config[0]
+
+         # PyMscclpp is enabled only in cuda graph
+         self.disabled = True
+
+     def pre_tune_config(self, dtype=torch.bfloat16) -> bool:
+         logger.debug(f"start to pre-tune configs for rank {self.rank}")
+         nthreads_to_try = [256, 512, 1024]
+         nblocks_to_try = [21, 42, 84]
+         inp_randn = torch.ones(
+             self.msg_size_for_finetune[-1] // dtype.itemsize, dtype=dtype, device="cuda"
+         )
+         oup_randn = torch.empty_like(inp_randn)
+         for msg_size in self.msg_size_for_finetune:
+             mock_inp, mock_outp = (
+                 inp_randn[: msg_size // dtype.itemsize],
+                 oup_randn[: msg_size // dtype.itemsize],
+             )
+             best_config, best_time = None, None
+             for nthreads in nthreads_to_try:
+                 for nblocks in nblocks_to_try:
+                     cur_cost = mscclpp_bench_time(
+                         lambda: ops.mscclpp_allreduce(
+                             self._context, mock_inp, mock_outp, nthreads, nblocks
+                         )
+                     )
+                     if best_time is None or cur_cost < best_time:
+                         best_config = (nthreads, nblocks)
+                         best_time = cur_cost
+             self.msg_size2best_config[msg_size] = best_config
+             if self.rank == 0:
+                 logger.debug(
+                     f"for msg_size {msg_size}, best_config: {best_config}, best_time: {best_time}us"
+                 )
+
+     def should_mscclpp_allreduce(
+         self, inp: torch.Tensor, op: ReduceOp = ReduceOp.SUM
+     ) -> bool:
+         if self.disabled or self._context is None:
+             return False
+         if inp.dtype not in PyMscclppCommunicator._SUPPORTED_DTYPE:
+             return False
+         if not mscclpp_is_weak_contiguous(inp):
+             return False
+         # only support sum op
+         if op != ReduceOp.SUM:
+             return False
+         if inp.numel() * inp.element_size() > self.max_bytes:
+             return False
+         return True
+
+     def all_reduce(self, tensor: torch.Tensor, op: ReduceOp = ReduceOp.SUM):
+         if self._IS_CAPTURING:
+             if torch.cuda.is_current_stream_capturing():
+                 self.graph_input_set.add((tensor.dtype, tensor.numel()))
+         msg_size = tensor.numel() * tensor.itemsize
+         index = bisect.bisect_left(self.msg_size_for_finetune, msg_size)
+         msg_size_finetune = self.msg_size_for_finetune[index]
+         nthreads, nblocks = self.msg_size2best_config[msg_size_finetune]
+         result = torch.empty_like(tensor)
+         ops.mscclpp_allreduce(self._context, tensor, result, nthreads, nblocks)
+         return result
+
+     @contextmanager
+     def change_state(
+         self,
+         enable: Optional[bool] = None,
+     ):
+         if enable is None:
+             # guess a default value when not specified
+             enable = self.available
+
+         old_disable = self.disabled
+         self.disabled = not enable
+
+         yield
+
+         self.disabled = old_disable
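The new file above registers a PyMscclppCommunicator that only handles small, contiguous sum all-reduces (at most SGLANG_MSCCLPP_MAX_BYTES, default 1MB) on world sizes of 8 or 16. The following is a minimal usage sketch, not part of the diff; it assumes a torchrun-style multi-GPU launch (RANK/WORLD_SIZE/LOCAL_RANK set) and a non-NCCL gloo group, as required by the assert in __init__:

    import os

    import torch
    import torch.distributed as dist

    from sglang.srt.distributed.device_communicators.pymscclpp import (
        PyMscclppCommunicator,
    )

    # Illustrative setup only: torchrun exports RANK/WORLD_SIZE/LOCAL_RANK.
    dist.init_process_group(backend="nccl")
    local_rank = int(os.environ["LOCAL_RANK"])
    torch.cuda.set_device(local_rank)

    # The communicator must be attached to a non-NCCL group (see the assert above).
    cpu_group = dist.new_group(backend="gloo")

    comm = PyMscclppCommunicator(group=cpu_group, device=local_rank)
    x = torch.ones(4096, dtype=torch.bfloat16, device="cuda")
    # The communicator starts disabled; GroupCoordinator enables it only around
    # CUDA graph capture via change_state(enable=True).
    with comm.change_state(enable=True):
        if comm.should_mscclpp_allreduce(x):
            y = comm.all_reduce(x)  # out-of-place sum across all ranks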
@@ -41,6 +41,7 @@ from torch.distributed import Backend, ProcessGroup
41
41
 
42
42
  from sglang.srt.utils import (
43
43
  direct_register_custom_op,
44
+ get_bool_env_var,
44
45
  is_cuda_alike,
45
46
  is_npu,
46
47
  supports_custom_op,
@@ -189,6 +190,7 @@ class GroupCoordinator:
189
190
  cpu_group: ProcessGroup # group for CPU communication
190
191
  device_group: ProcessGroup # group for device communication
191
192
  use_pynccl: bool # a hint of whether to use PyNccl
193
+ use_pymscclpp: bool # a hint of whether to use PyMsccl
192
194
  use_custom_allreduce: bool # a hint of whether to use CustomAllreduce
193
195
  use_message_queue_broadcaster: (
194
196
  bool # a hint of whether to use message queue broadcaster
@@ -204,6 +206,7 @@ class GroupCoordinator:
204
206
  local_rank: int,
205
207
  torch_distributed_backend: Union[str, Backend],
206
208
  use_pynccl: bool,
209
+ use_pymscclpp: bool,
207
210
  use_custom_allreduce: bool,
208
211
  use_hpu_communicator: bool,
209
212
  use_xpu_communicator: bool,
@@ -243,6 +246,7 @@ class GroupCoordinator:
243
246
  self.device = torch.device("cpu")
244
247
 
245
248
  self.use_pynccl = use_pynccl
249
+ self.use_pymscclpp = use_pymscclpp
246
250
  self.use_custom_allreduce = use_custom_allreduce
247
251
  self.use_hpu_communicator = use_hpu_communicator
248
252
  self.use_xpu_communicator = use_xpu_communicator
@@ -264,6 +268,17 @@ class GroupCoordinator:
264
268
  device=self.device,
265
269
  )
266
270
 
271
+ from sglang.srt.distributed.device_communicators.pymscclpp import (
272
+ PyMscclppCommunicator,
273
+ )
274
+
275
+ self.pymscclpp_comm: Optional[PyMscclppCommunicator] = None
276
+ if use_pymscclpp and self.world_size > 1:
277
+ self.pymscclpp_comm = PyMscclppCommunicator(
278
+ group=self.cpu_group,
279
+ device=self.device,
280
+ )
281
+
267
282
  self.ca_comm: Optional[CustomAllreduce] = None
268
283
  if use_custom_allreduce and self.world_size > 1:
269
284
  # Initialize a custom fast all-reduce implementation.
@@ -372,11 +387,15 @@ class GroupCoordinator:
372
387
  # --------------------------------------------
373
388
  # custom allreduce | enabled | enabled |
374
389
  # PyNccl | disabled| enabled |
390
+ # PyMscclpp | disabled| enabled |
375
391
  # torch.distributed | enabled | disabled|
376
392
  #
377
393
  # Note that custom allreduce will have a runtime check, if the
378
394
  # tensor size is too large, it will fallback to the next
379
395
  # available option.
396
+ # Note that the PyMsccl needs to register the tensor in ahead,
397
+ # which will introduce large overhead in the eager case,
398
+ # therefore it is only supported in the graph case.
380
399
  # In summary: When using CUDA graph, we use
381
400
  # either custom all-reduce kernel or pynccl. When not using
382
401
  # CUDA graph, we use either custom all-reduce kernel or
@@ -391,7 +410,14 @@ class GroupCoordinator:
391
410
  maybe_pynccl_context = pynccl_comm.change_state(
392
411
  enable=True, stream=torch.cuda.current_stream()
393
412
  )
394
- with maybe_pynccl_context:
413
+
414
+ pymscclpp_comm = self.pymscclpp_comm
415
+ maybe_pymscclpp_context: Any
416
+ if not pymscclpp_comm:
417
+ maybe_pymscclpp_context = nullcontext()
418
+ else:
419
+ maybe_pymscclpp_context = pymscclpp_comm.change_state(enable=True)
420
+ with maybe_pynccl_context, maybe_pymscclpp_context:
395
421
  yield graph_capture_context
396
422
 
397
423
  def all_reduce(self, input_: torch.Tensor) -> torch.Tensor:
@@ -436,6 +462,10 @@ class GroupCoordinator:
436
462
  self.ca_comm is not None
437
463
  and not self.ca_comm.disabled
438
464
  and self.ca_comm.should_custom_ar(input_)
465
+ ) or (
466
+ self.pymscclpp_comm is not None
467
+ and not self.pymscclpp_comm.disabled
468
+ and self.pymscclpp_comm.should_mscclpp_allreduce(input_)
439
469
  ):
440
470
  return torch.ops.sglang.outplace_all_reduce(
441
471
  input_, group_name=self.unique_name
@@ -446,9 +476,13 @@ class GroupCoordinator:
446
476
 
447
477
  def _all_reduce_out_place(self, input_: torch.Tensor) -> torch.Tensor:
448
478
  ca_comm = self.ca_comm
449
- assert ca_comm is not None
450
- assert not ca_comm.disabled
451
- out = ca_comm.custom_all_reduce(input_)
479
+ pymscclpp_comm = self.pymscclpp_comm
480
+ assert ca_comm is not None or pymscclpp_comm is not None
481
+ if ca_comm is not None and not ca_comm.disabled:
482
+ out = ca_comm.custom_all_reduce(input_)
483
+ else:
484
+ assert not pymscclpp_comm.disabled
485
+ out = pymscclpp_comm.all_reduce(input_)
452
486
  assert out is not None
453
487
  return out
454
488
 
@@ -957,6 +991,7 @@ def init_world_group(
         local_rank=local_rank,
         torch_distributed_backend=backend,
         use_pynccl=False,
+        use_pymscclpp=False,
         use_custom_allreduce=False,
         use_hpu_communicator=False,
         use_xpu_communicator=False,
@@ -972,14 +1007,18 @@ def init_model_parallel_group(
     use_custom_allreduce: Optional[bool] = None,
     use_message_queue_broadcaster: bool = False,
     group_name: Optional[str] = None,
+    use_mscclpp_allreduce: Optional[bool] = None,
 ) -> GroupCoordinator:
     if use_custom_allreduce is None:
         use_custom_allreduce = _ENABLE_CUSTOM_ALL_REDUCE
+    if use_mscclpp_allreduce is None:
+        use_mscclpp_allreduce = _ENABLE_MSCCLPP_ALL_REDUCE
     return GroupCoordinator(
         group_ranks=group_ranks,
         local_rank=local_rank,
         torch_distributed_backend=backend,
         use_pynccl=not is_npu(),
+        use_pymscclpp=use_mscclpp_allreduce,
         use_custom_allreduce=use_custom_allreduce,
         use_hpu_communicator=True,
         use_xpu_communicator=True,
@@ -1036,6 +1075,7 @@ def graph_capture():
 logger = logging.getLogger(__name__)
 
 _ENABLE_CUSTOM_ALL_REDUCE = True
+_ENABLE_MSCCLPP_ALL_REDUCE = False
 
 
 def set_custom_all_reduce(enable: bool):
@@ -1043,6 +1083,11 @@ def set_custom_all_reduce(enable: bool):
     _ENABLE_CUSTOM_ALL_REDUCE = enable
 
 
+def set_mscclpp_all_reduce(enable: bool):
+    global _ENABLE_MSCCLPP_ALL_REDUCE
+    _ENABLE_MSCCLPP_ALL_REDUCE = enable
+
+
 def init_distributed_environment(
     world_size: int = -1,
     rank: int = -1,
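`set_mscclpp_all_reduce` mirrors `set_custom_all_reduce`: it flips a module-level default that `init_model_parallel_group` consults whenever `use_mscclpp_allreduce` is left as `None`, so it must run before the groups are created. A standalone sketch of this default-resolution pattern (names below are illustrative, not sglang's):

```python
from typing import Optional

_ENABLE_FEATURE = False  # module-level default, analogous to _ENABLE_MSCCLPP_ALL_REDUCE


def set_feature(enable: bool) -> None:
    global _ENABLE_FEATURE
    _ENABLE_FEATURE = enable


def init_group(use_feature: Optional[bool] = None) -> bool:
    # None means "defer to whatever the process-wide setter chose".
    if use_feature is None:
        use_feature = _ENABLE_FEATURE
    return use_feature


set_feature(True)         # must happen before init_group() is called
print(init_group())       # -> True (picked up the module-level default)
print(init_group(False))  # -> False (an explicit argument still wins)
```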
@@ -1153,7 +1198,9 @@ def initialize_model_parallel(
         group_ranks,
         get_world_group().local_rank,
         backend,
-        use_message_queue_broadcaster=True,
+        use_message_queue_broadcaster=get_bool_env_var(
+            "SGLANG_USE_MESSAGE_QUEUE_BROADCASTER", "true"
+        ),
         group_name="tp",
     )
 
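With this hunk the TP group's message-queue broadcaster is controlled by the `SGLANG_USE_MESSAGE_QUEUE_BROADCASTER` environment variable (defaulting to true) rather than a hard-coded value. A sketch of how such a boolean env-var helper typically behaves; this is an assumed implementation, not sglang's actual `get_bool_env_var`:

```python
import os


def get_bool_env_var_sketch(name: str, default: str = "false") -> bool:
    # Treat common truthy spellings as True; everything else as False.
    value = os.getenv(name, default).strip().lower()
    return value in ("1", "true", "yes", "on")


os.environ["SGLANG_USE_MESSAGE_QUEUE_BROADCASTER"] = "false"
print(get_bool_env_var_sketch("SGLANG_USE_MESSAGE_QUEUE_BROADCASTER", "true"))  # -> False
```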
@@ -23,6 +23,12 @@ class EngineBase(ABC):
         token_ids_logprob: Optional[Union[List[List[int]], List[int]]] = None,
         lora_path: Optional[Union[List[Optional[str]], Optional[str]]] = None,
         custom_logit_processor: Optional[Union[List[str], str]] = None,
+        return_hidden_states: Optional[bool] = None,
+        stream: Optional[bool] = None,
+        bootstrap_host: Optional[Union[List[str], str]] = None,
+        bootstrap_port: Optional[Union[List[int], int]] = None,
+        bootstrap_room: Optional[Union[List[int], int]] = None,
+        data_parallel_rank: Optional[int] = None,
     ) -> Union[Dict, Iterator[Dict]]:
         """Generate outputs based on given inputs."""
         pass
@@ -167,11 +167,22 @@ class Engine(EngineBase):
         bootstrap_host: Optional[Union[List[str], str]] = None,
         bootstrap_port: Optional[Union[List[int], int]] = None,
         bootstrap_room: Optional[Union[List[int], int]] = None,
+        data_parallel_rank: Optional[int] = None,
     ) -> Union[Dict, Iterator[Dict]]:
         """
         The arguments of this function are the same as `sglang/srt/managers/io_struct.py::GenerateReqInput`.
         Please refer to `GenerateReqInput` for the documentation.
         """
+        if self.server_args.enable_dp_attention:
+            if data_parallel_rank is None:
+                logger.info("data_parallel_rank not provided, using default dispatch")
+            elif data_parallel_rank < 0:
+                raise ValueError("data_parallel_rank must be non-negative")
+            elif data_parallel_rank >= self.server_args.dp_size:
+                raise ValueError(
+                    f"data_parallel_rank must be less than dp_size: {self.server_args.dp_size}"
+                )
+
         obj = GenerateReqInput(
             text=prompt,
             input_ids=input_ids,
@@ -188,6 +199,7 @@ class Engine(EngineBase):
             bootstrap_host=bootstrap_host,
             bootstrap_port=bootstrap_port,
             bootstrap_room=bootstrap_room,
+            data_parallel_rank=data_parallel_rank,
         )
         loop = asyncio.get_event_loop()
         generator = self.tokenizer_manager.generate_request(obj, None)
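Together with the validation above, this lets a caller pin a request to a specific data-parallel replica when DP attention is enabled. A hypothetical usage sketch (the model path and server arguments are placeholders, not taken from this diff):

```python
import sglang as sgl

# Assumes an engine launched with DP attention; both flags map to ServerArgs options.
engine = sgl.Engine(
    model_path="meta-llama/Llama-3.1-8B-Instruct",  # placeholder model
    enable_dp_attention=True,
    dp_size=2,
)

out = engine.generate(
    prompt="The capital of France is",
    sampling_params={"max_new_tokens": 8},
    data_parallel_rank=1,  # must satisfy 0 <= rank < dp_size
)
print(out["text"])
engine.shutdown()
```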
@@ -237,11 +249,24 @@ class Engine(EngineBase):
         bootstrap_host: Optional[Union[List[str], str]] = None,
         bootstrap_port: Optional[Union[List[int], int]] = None,
         bootstrap_room: Optional[Union[List[int], int]] = None,
+        data_parallel_rank: Optional[int] = None,
     ) -> Union[Dict, AsyncIterator[Dict]]:
         """
         The arguments of this function are the same as `sglang/srt/managers/io_struct.py::GenerateReqInput`.
         Please refer to `GenerateReqInput` for the documentation.
         """
+
+        if self.server_args.enable_dp_attention:
+            if data_parallel_rank is None:
+                logger.info("data_parallel_rank not provided, using default dispatch")
+            elif data_parallel_rank < 0:
+                raise ValueError("data_parallel_rank must be non-negative")
+            elif data_parallel_rank >= self.server_args.dp_size:
+                raise ValueError(
+                    f"data_parallel_rank must be in range [0, {self.server_args.dp_size-1}]"
+                )
+
+        logger.info(f"data_parallel_rank: {data_parallel_rank}")
         obj = GenerateReqInput(
             text=prompt,
             input_ids=input_ids,
@@ -257,6 +282,7 @@ class Engine(EngineBase):
             bootstrap_host=bootstrap_host,
             bootstrap_port=bootstrap_port,
             bootstrap_room=bootstrap_room,
+            data_parallel_rank=data_parallel_rank,
         )
         generator = self.tokenizer_manager.generate_request(obj, None)
 
@@ -301,6 +327,20 @@ class Engine(EngineBase):
         generator = self.tokenizer_manager.generate_request(obj, None)
         return await generator.__anext__()
 
+    def rerank(
+        self,
+        prompt: List[List[str]],
+    ) -> Dict:
+        """
+        The arguments of this function are the same as `sglang/srt/managers/io_struct.py::EmbeddingReqInput`.
+        Please refer to `EmbeddingReqInput` for the documentation.
+        """
+        obj = EmbeddingReqInput(text=prompt, is_cross_encoder_request=True)
+        loop = asyncio.get_event_loop()
+        generator = self.tokenizer_manager.generate_request(obj, None)
+        ret = loop.run_until_complete(generator.__anext__())
+        return ret
+
     def shutdown(self):
         """Shutdown the engine"""
         kill_process_tree(os.getpid(), include_parent=False)
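A hypothetical call to the new `rerank` entry point, assuming the engine serves a cross-encoder model and that each inner list is a (query, document) pair; the model name and the exact shape of the returned value are assumptions, not taken from this diff:

```python
import sglang as sgl

# Placeholder cross-encoder reranker; any model sglang can serve for embedding-style
# requests would do here.
engine = sgl.Engine(model_path="BAAI/bge-reranker-v2-m3", is_embedding=True)

pairs = [
    ["what is the capital of France?", "Paris is the capital of France."],
    ["what is the capital of France?", "Berlin is the capital of Germany."],
]
result = engine.rerank(prompt=pairs)
print(result)  # assumed to contain a relevance score per (query, document) pair
engine.shutdown()
```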
@@ -472,6 +512,79 @@ class Engine(EngineBase):
     def save_sharded_model(self, **kwargs):
         self.collective_rpc("save_sharded_model", **kwargs)
 
+    def score(
+        self,
+        query: Optional[Union[str, List[int]]] = None,
+        items: Optional[Union[str, List[str], List[List[int]]]] = None,
+        label_token_ids: Optional[List[int]] = None,
+        apply_softmax: bool = False,
+        item_first: bool = False,
+    ) -> List[List[float]]:
+        """
+        Score the probability of specified token IDs appearing after the given (query + item) pair. For example:
+            query = "<|user|>Is the following city the capital of France? "
+            items = ["Paris <|assistant|>", "London <|assistant|>", "Berlin <|assistant|>"]
+            label_token_ids = [2332, 1223]  # Token IDs for "Yes" and "No"
+            item_first = False
+
+        This would pass the following prompts to the model:
+            "<|user|>Is the following city the capital of France? Paris <|assistant|>"
+            "<|user|>Is the following city the capital of France? London <|assistant|>"
+            "<|user|>Is the following city the capital of France? Berlin <|assistant|>"
+        The API would then return the probabilities of the model producing "Yes" and "No" as the next token.
+        The output would look like:
+            [[0.9, 0.1], [0.2, 0.8], [0.1, 0.9]]
+
+        Args:
+            query: The query text or pre-tokenized query token IDs. Must be provided.
+            items: The item text(s) or pre-tokenized item token IDs. Must be provided.
+            label_token_ids: List of token IDs to compute probabilities for. If None, no token probabilities will be computed.
+            apply_softmax: Whether to normalize probabilities using softmax.
+            item_first: If True, prepend items to the query. Otherwise, append items to the query.
+
+        Returns:
+            A list of lists of probabilities, one inner list per item, ordered to match label_token_ids.
+
+        Raises:
+            ValueError: If query or items is not provided, if token IDs are out of vocabulary,
+                or if logprobs are not available for the specified tokens.
+        """
+        loop = asyncio.get_event_loop()
+        return loop.run_until_complete(
+            self.tokenizer_manager.score_request(
+                query=query,
+                items=items,
+                label_token_ids=label_token_ids,
+                apply_softmax=apply_softmax,
+                item_first=item_first,
+                request=None,
+            )
+        )
+
+    async def async_score(
+        self,
+        query: Optional[Union[str, List[int]]] = None,
+        items: Optional[Union[str, List[str], List[List[int]]]] = None,
+        label_token_ids: Optional[List[int]] = None,
+        apply_softmax: bool = False,
+        item_first: bool = False,
+    ) -> List[List[float]]:
+        """
+        Asynchronous version of the score() method.
+
+        See score() for detailed documentation.
+        """
+        return await self.tokenizer_manager.score_request(
+            query=query,
+            items=items,
+            label_token_ids=label_token_ids,
+            apply_softmax=apply_softmax,
+            item_first=item_first,
+            request=None,
+        )
+
 
 def _set_envs_and_config(server_args: ServerArgs):
     # Set global environments
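Based on the docstring above, a usage sketch of the new scoring API; the model path and the label token IDs are placeholders that depend on the deployed model and its tokenizer:

```python
import sglang as sgl

engine = sgl.Engine(model_path="meta-llama/Llama-3.1-8B-Instruct")  # placeholder model

# Placeholder label token IDs; in practice, look up the IDs of "Yes"/"No"
# with the model's tokenizer before calling score().
probs = engine.score(
    query="<|user|>Is the following city the capital of France? ",
    items=["Paris <|assistant|>", "London <|assistant|>", "Berlin <|assistant|>"],
    label_token_ids=[2332, 1223],
    apply_softmax=True,
)
print(probs)  # e.g. [[0.9, 0.1], [0.2, 0.8], [0.1, 0.9]]
engine.shutdown()
```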
@@ -498,7 +611,7 @@ def _set_envs_and_config(server_args: ServerArgs):
     if server_args.attention_backend == "flashinfer":
         assert_pkg_version(
             "flashinfer_python",
-            "0.2.5",
+            "0.2.6.post1",
             "Please uninstall the old version and "
             "reinstall the latest version by following the instructions "
             "at https://docs.flashinfer.ai/installation.html.",
@@ -506,7 +619,7 @@ def _set_envs_and_config(server_args: ServerArgs):
     if _is_cuda:
         assert_pkg_version(
             "sgl-kernel",
-            "0.1.4",
+            "0.1.9",
             "Please reinstall the latest version with `pip install sgl-kernel --force-reinstall`",
         )
 
@@ -514,9 +627,7 @@ def _set_envs_and_config(server_args: ServerArgs):
         pid, exitcode = os.waitpid(0, os.WNOHANG)
         if exitcode != 0:
             logger.warning(
-                "Child process unexpectedly failed with an exit code %d. pid=%d",
-                exitcode,
-                pid,
+                f"Child process unexpectedly failed with {exitcode=}. {pid=}"
             )
 
     signal.signal(signal.SIGCHLD, sigchld_handler)