sglang 0.4.6.post5__py3-none-any.whl → 0.4.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (318)
  1. sglang/bench_offline_throughput.py +10 -4
  2. sglang/bench_one_batch_server.py +67 -11
  3. sglang/bench_serving.py +85 -74
  4. sglang/lang/backend/runtime_endpoint.py +24 -1
  5. sglang/profiler.py +167 -0
  6. sglang/srt/_custom_ops.py +34 -0
  7. sglang/srt/configs/internvl.py +8 -12
  8. sglang/srt/configs/model_config.py +27 -1
  9. sglang/srt/constrained/base_grammar_backend.py +5 -2
  10. sglang/srt/constrained/llguidance_backend.py +9 -8
  11. sglang/srt/constrained/outlines_backend.py +5 -4
  12. sglang/srt/constrained/xgrammar_backend.py +18 -18
  13. sglang/srt/conversation.py +46 -8
  14. sglang/srt/custom_op.py +38 -3
  15. sglang/srt/debug_utils.py +74 -0
  16. sglang/srt/disaggregation/common/__init__.py +1 -0
  17. sglang/srt/disaggregation/common/conn.py +407 -0
  18. sglang/srt/disaggregation/decode.py +67 -3
  19. sglang/srt/disaggregation/fake/conn.py +1 -0
  20. sglang/srt/disaggregation/kv_events.py +60 -5
  21. sglang/srt/disaggregation/launch_lb.py +140 -0
  22. sglang/srt/disaggregation/mini_lb.py +29 -48
  23. sglang/srt/disaggregation/mooncake/conn.py +432 -140
  24. sglang/srt/disaggregation/mooncake/transfer_engine.py +32 -16
  25. sglang/srt/disaggregation/nixl/conn.py +124 -432
  26. sglang/srt/disaggregation/prefill.py +2 -0
  27. sglang/srt/disaggregation/utils.py +38 -1
  28. sglang/srt/distributed/device_communicators/pymscclpp.py +315 -0
  29. sglang/srt/distributed/parallel_state.py +52 -5
  30. sglang/srt/entrypoints/EngineBase.py +6 -0
  31. sglang/srt/entrypoints/engine.py +102 -5
  32. sglang/srt/entrypoints/http_server.py +15 -2
  33. sglang/srt/function_call/base_format_detector.py +138 -86
  34. sglang/srt/function_call/deepseekv3_detector.py +54 -6
  35. sglang/srt/function_call/ebnf_composer.py +33 -19
  36. sglang/srt/function_call/function_call_parser.py +27 -0
  37. sglang/srt/function_call/llama32_detector.py +33 -14
  38. sglang/srt/function_call/mistral_detector.py +73 -26
  39. sglang/srt/function_call/pythonic_detector.py +86 -20
  40. sglang/srt/function_call/qwen25_detector.py +64 -10
  41. sglang/srt/function_call/utils.py +17 -0
  42. sglang/srt/hf_transformers_utils.py +4 -0
  43. sglang/srt/layers/attention/aiter_backend.py +488 -123
  44. sglang/srt/layers/attention/base_attn_backend.py +4 -0
  45. sglang/srt/layers/attention/cutlass_mla_backend.py +2 -19
  46. sglang/srt/layers/attention/flashattention_backend.py +103 -18
  47. sglang/srt/layers/attention/flashinfer_backend.py +45 -1
  48. sglang/srt/layers/attention/flashinfer_mla_backend.py +37 -1
  49. sglang/srt/layers/attention/intel_amx_backend.py +128 -0
  50. sglang/srt/layers/attention/tbo_backend.py +232 -0
  51. sglang/srt/layers/attention/torch_native_backend.py +3 -0
  52. sglang/srt/layers/attention/triton_backend.py +244 -5
  53. sglang/srt/layers/attention/triton_ops/extend_attention.py +12 -4
  54. sglang/srt/layers/communicator.py +260 -194
  55. sglang/srt/layers/dp_attention.py +6 -5
  56. sglang/srt/layers/layernorm.py +30 -19
  57. sglang/srt/layers/moe/cutlass_moe.py +170 -7
  58. sglang/srt/layers/moe/cutlass_moe_params.py +169 -0
  59. sglang/srt/layers/moe/ep_moe/kernels.py +27 -6
  60. sglang/srt/layers/moe/ep_moe/layer.py +94 -40
  61. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +13 -8
  62. sglang/srt/layers/moe/fused_moe_native.py +4 -0
  63. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  64. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  65. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  66. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  67. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  68. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  69. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  70. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  71. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +220 -25
  72. sglang/srt/layers/moe/fused_moe_triton/layer.py +34 -4
  73. sglang/srt/layers/moe/topk.py +44 -18
  74. sglang/srt/layers/multimodal.py +3 -3
  75. sglang/srt/layers/quantization/__init__.py +3 -2
  76. sglang/srt/layers/quantization/blockwise_int8.py +3 -0
  77. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +5 -0
  78. sglang/srt/layers/quantization/deep_gemm.py +55 -56
  79. sglang/srt/layers/quantization/fp8.py +28 -23
  80. sglang/srt/layers/quantization/fp8_kernel.py +118 -66
  81. sglang/srt/layers/quantization/fp8_utils.py +165 -49
  82. sglang/srt/layers/quantization/modelopt_quant.py +334 -7
  83. sglang/srt/layers/quantization/moe_wna16.py +3 -0
  84. sglang/srt/layers/quantization/w8a8_fp8.py +3 -0
  85. sglang/srt/layers/quantization/w8a8_int8.py +3 -0
  86. sglang/srt/layers/rotary_embedding.py +6 -12
  87. sglang/srt/layers/sampler.py +80 -79
  88. sglang/srt/layers/utils.py +6 -0
  89. sglang/srt/lora/layers.py +12 -15
  90. sglang/srt/lora/lora.py +49 -5
  91. sglang/srt/lora/lora_manager.py +19 -5
  92. sglang/srt/lora/mem_pool.py +24 -16
  93. sglang/srt/lora/utils.py +17 -13
  94. sglang/srt/managers/data_parallel_controller.py +13 -5
  95. sglang/srt/managers/eplb_algorithms/__init__.py +63 -0
  96. sglang/srt/managers/eplb_algorithms/deepseek.py +223 -0
  97. sglang/srt/managers/{deepseek_eplb.py → eplb_algorithms/deepseek_vec.py} +5 -7
  98. sglang/srt/managers/eplb_manager.py +55 -14
  99. sglang/srt/managers/expert_distribution.py +220 -46
  100. sglang/srt/managers/expert_location.py +110 -56
  101. sglang/srt/managers/expert_location_dispatch.py +23 -6
  102. sglang/srt/managers/io_struct.py +15 -4
  103. sglang/srt/managers/mm_utils.py +88 -38
  104. sglang/srt/managers/multimodal_processors/base_processor.py +188 -16
  105. sglang/srt/managers/multimodal_processors/gemma3.py +4 -31
  106. sglang/srt/managers/multimodal_processors/internvl.py +4 -0
  107. sglang/srt/managers/multimodal_processors/kimi_vl.py +15 -34
  108. sglang/srt/managers/multimodal_processors/minicpm.py +2 -1
  109. sglang/srt/managers/multimodal_processors/phi4mm.py +87 -0
  110. sglang/srt/managers/multimodal_processors/qwen_vl.py +22 -64
  111. sglang/srt/managers/schedule_batch.py +140 -38
  112. sglang/srt/managers/scheduler.py +305 -112
  113. sglang/srt/managers/tokenizer_manager.py +134 -17
  114. sglang/srt/managers/utils.py +0 -4
  115. sglang/srt/metrics/collector.py +9 -0
  116. sglang/srt/model_executor/cuda_graph_runner.py +72 -61
  117. sglang/srt/model_executor/expert_location_updater.py +157 -22
  118. sglang/srt/model_executor/forward_batch_info.py +38 -17
  119. sglang/srt/model_executor/model_runner.py +96 -56
  120. sglang/srt/model_loader/utils.py +67 -1
  121. sglang/srt/models/deepseek_nextn.py +1 -1
  122. sglang/srt/models/deepseek_v2.py +609 -234
  123. sglang/srt/models/gemma3_causal.py +7 -0
  124. sglang/srt/models/gemma3_mm.py +19 -14
  125. sglang/srt/models/idefics2.py +342 -0
  126. sglang/srt/models/kimi_vl.py +4 -4
  127. sglang/srt/models/llama.py +1 -1
  128. sglang/srt/models/minicpmo.py +2 -5
  129. sglang/srt/models/minicpmv.py +3 -295
  130. sglang/srt/models/phi4mm.py +512 -0
  131. sglang/srt/models/qwen2.py +38 -9
  132. sglang/srt/models/qwen2_5_vl.py +3 -9
  133. sglang/srt/models/qwen2_eagle.py +4 -1
  134. sglang/srt/models/qwen2_moe.py +58 -191
  135. sglang/srt/models/qwen2_vl.py +3 -9
  136. sglang/srt/models/qwen3.py +41 -10
  137. sglang/srt/models/qwen3_moe.py +230 -191
  138. sglang/srt/models/registry.py +9 -1
  139. sglang/srt/models/transformers.py +291 -0
  140. sglang/srt/openai_api/adapter.py +86 -24
  141. sglang/srt/openai_api/protocol.py +31 -2
  142. sglang/srt/openai_api/utils.py +172 -0
  143. sglang/srt/operations.py +37 -2
  144. sglang/srt/operations_strategy.py +200 -24
  145. sglang/srt/sampling/sampling_batch_info.py +13 -1
  146. sglang/srt/sampling/sampling_params.py +2 -1
  147. sglang/srt/server_args.py +114 -27
  148. sglang/srt/speculative/build_eagle_tree.py +8 -8
  149. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +8 -11
  150. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +253 -0
  151. sglang/srt/speculative/eagle_utils.py +51 -91
  152. sglang/srt/speculative/eagle_worker.py +101 -21
  153. sglang/srt/two_batch_overlap.py +635 -0
  154. sglang/srt/utils.py +129 -7
  155. sglang/test/runners.py +16 -7
  156. sglang/test/send_one.py +4 -0
  157. sglang/test/test_cutlass_moe.py +3 -3
  158. sglang/test/test_fp4_moe.py +248 -0
  159. sglang/test/test_utils.py +79 -6
  160. sglang/version.py +1 -1
  161. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/METADATA +14 -11
  162. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/RECORD +318 -291
  163. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/WHEEL +1 -1
  164. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  165. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  166. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  167. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  168. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  169. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  170. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  171. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  172. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  173. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  174. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  175. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  176. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  177. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H200.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json} +0 -0
  178. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  179. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  180. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  181. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  182. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  183. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  184. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  185. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  186. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  187. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  188. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  189. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  190. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  191. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  192. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  193. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  194. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  195. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  196. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  197. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  198. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  199. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  200. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  201. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  202. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  203. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  204. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  205. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  206. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json} +0 -0
  207. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  208. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  209. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  210. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  211. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  212. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  213. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  214. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  215. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  216. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json} +0 -0
  217. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json} +0 -0
  218. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  219. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  220. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  221. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  222. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  223. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  224. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json} +0 -0
  225. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  226. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  227. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json} +0 -0
  228. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  229. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  230. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  231. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json} +0 -0
  232. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  233. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  234. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  235. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
  236. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  237. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  238. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  239. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json} +0 -0
  240. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json} +0 -0
  241. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json} +0 -0
  242. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json} +0 -0
  243. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  244. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  245. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json} +0 -0
  246. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json} +0 -0
  247. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json} +0 -0
  248. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json} +0 -0
  249. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  250. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  251. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  252. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  253. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json} +0 -0
  254. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  255. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  256. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  257. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  258. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json} +0 -0
  259. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json} +0 -0
  260. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json} +0 -0
  261. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json} +0 -0
  262. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  263. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  264. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
  265. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  266. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  267. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  268. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json} +0 -0
  269. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_L40S.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json} +0 -0
  270. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
  271. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
  272. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
  273. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  274. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  275. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  276. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  277. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json} +0 -0
  278. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json} +0 -0
  279. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json} +0 -0
  280. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json} +0 -0
  281. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  282. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  283. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  284. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  285. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json} +0 -0
  286. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
  287. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
  288. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
  289. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  290. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  291. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  292. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  293. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json} +0 -0
  294. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json} +0 -0
  295. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  296. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  297. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json} +0 -0
  298. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  299. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json} +0 -0
  300. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  301. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  302. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  303. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  304. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json} +0 -0
  305. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  306. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json} +0 -0
  307. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=96,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json} +0 -0
  308. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  309. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  310. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  311. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  312. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  313. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  314. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  315. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  316. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  317. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/licenses/LICENSE +0 -0
  318. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/top_level.txt +0 -0
sglang/srt/operations.py CHANGED
@@ -12,7 +12,7 @@ if _ENABLE_PROFILE:
 
 
 def execute_operations(inputs, operations):
-    stages = _convert_operations_to_stages(decorate_operations(operations))
+    stages = _convert_operations_to_stages(operations)
     executor = _StageExecutor("primary", stages, inputs=inputs)
     for _ in range(executor.num_stages):
         executor.next()
@@ -20,6 +20,37 @@ def execute_operations(inputs, operations):
     return executor.output
 
 
+def execute_overlapped_operations(
+    inputs_arr: Sequence,
+    operations_arr: Sequence,
+    delta_stages: Sequence[int],
+) -> Sequence:
+    # Make it explicit for clarity; if we need multi-batch overlap, this can be generalized
+    inputs_a, inputs_b = inputs_arr
+    operations_a, operations_b = operations_arr
+    delta_stage_a, delta_stage_b = delta_stages
+    assert delta_stage_a == 0
+    delta_stage = delta_stage_b
+
+    stages_a = _convert_operations_to_stages(operations_a)
+    stages_b = _convert_operations_to_stages(operations_b)
+    executor_a = _StageExecutor("a", stages_a, inputs=inputs_a)
+    executor_b = _StageExecutor("b", stages_b, inputs=inputs_b)
+
+    for _ in range(delta_stage):
+        executor_a.next()
+
+    for _ in range(executor_a.num_stages - delta_stage):
+        executor_a.next()
+        executor_b.next()
+
+    for _ in range(delta_stage):
+        executor_b.next()
+
+    assert executor_a.done and executor_b.done
+    return [executor_a.output, executor_b.output]
+
+
 class YieldOperation:
     pass
 
@@ -109,6 +140,9 @@ class _StateDict:
         for k, v in values.items():
             setattr(self, k, v)
 
+    def get(self, item):
+        return self._data.get(item)
+
     def clear(self, expect_keys: Sequence[str]):
         if set(self._data.keys()) != set(expect_keys):
             raise Exception(
@@ -119,6 +153,7 @@ class _StateDict:
 
 
 def _convert_operations_to_stages(operations: List[Operation]) -> List[Stage]:
+    operations = _decorate_operations(operations)
     operation_chunks = list(
         _chunk_by_separator(operations, lambda op: isinstance(op, YieldOperation))
     )
@@ -140,7 +175,7 @@ def _chunk_by_separator(
     yield pending_items
 
 
-def decorate_operations(operations: List[Operation], debug_name_prefix: str = ""):
+def _decorate_operations(operations: List[Operation], debug_name_prefix: str = ""):
     return [_decorate_operation(op, debug_name_prefix) for op in operations]
 
 
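The new execute_overlapped_operations above is the heart of two-batch overlap: micro-batch "b" starts delta_stage stages behind micro-batch "a", so while one batch sits in a communication stage the other keeps computing. A minimal, self-contained sketch of that control flow (ToyExecutor and the stage names are illustrative stand-ins, not the real _StageExecutor):

# Sketch of the staggered execution pattern used by execute_overlapped_operations.
from typing import List


class ToyExecutor:
    """Runs a list of named stages one step at a time, like _StageExecutor."""

    def __init__(self, name: str, stages: List[str]):
        self.name = name
        self.stages = stages
        self.index = 0

    @property
    def num_stages(self) -> int:
        return len(self.stages)

    @property
    def done(self) -> bool:
        return self.index >= self.num_stages

    def next(self):
        print(f"{self.name}: {self.stages[self.index]}")
        self.index += 1


def run_overlapped(stages_a: List[str], stages_b: List[str], delta_stage: int):
    # Same control flow as execute_overlapped_operations: batch B trails
    # batch A by `delta_stage` stages so their compute and communication overlap.
    a, b = ToyExecutor("a", stages_a), ToyExecutor("b", stages_b)
    for _ in range(delta_stage):
        a.next()
    for _ in range(a.num_stages - delta_stage):
        a.next()
        b.next()
    for _ in range(delta_stage):
        b.next()
    assert a.done and b.done


run_overlapped(["attn", "dispatch", "experts", "combine"],
               ["attn", "dispatch", "experts", "combine"], delta_stage=2)
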
sglang/srt/operations_strategy.py CHANGED
@@ -1,31 +1,207 @@
+from dataclasses import dataclass
+from typing import List, Optional
+
 import torch
 
+from sglang.srt import operations
+from sglang.srt.layers.moe.ep_moe.token_dispatcher import DeepEPConfig
+from sglang.srt.model_executor.forward_batch_info import ForwardMode
+from sglang.srt.operations import Operation
+
+
+@dataclass
+class OperationsStrategy:
+    operations: List[Operation]
+    deep_gemm_num_sms: Optional[int] = None
+    tbo_delta_stages: Optional[int] = None
+
+    @classmethod
+    def concat(cls, items: List["OperationsStrategy"]) -> "OperationsStrategy":
+        return OperationsStrategy(
+            operations=[x for item in items for x in item.operations],
+            deep_gemm_num_sms=_assert_all_same(
+                [item.deep_gemm_num_sms for item in items]
+            ),
+            tbo_delta_stages=_assert_all_same(
+                [item.tbo_delta_stages for item in items]
+            ),
+        )
+
+    @staticmethod
+    def init_new_tbo(
+        layers: torch.nn.ModuleList,
+        forward_mode: ForwardMode,
+    ) -> "OperationsStrategy":
+        layer_name = layers[0].__class__.__name__
+        if layer_name == "DeepseekV2DecoderLayer":
+            return OperationsStrategy.concat(
+                [
+                    _compute_moe_deepseek_layer_operations_strategy_tbo(
+                        layer, forward_mode
+                    )
+                    for layer in layers
+                ]
+            )
+        elif layer_name == "Qwen3MoeDecoderLayer":
+            return OperationsStrategy.concat(
+                [
+                    _compute_moe_qwen3_layer_operations_strategy_tbo(
+                        layer, forward_mode
+                    )
+                    for layer in layers
+                ]
+            )
+        else:
+            raise NotImplementedError
+
+
+def _assert_all_same(items: List):
+    assert all(item == items[0] for item in items)
+    return items[0]
+
+
+# -------------------------------- Strategy for DeepSeek ---------------------------------------
+
+
+# TODO can refactor to make it more fancy if we have more complex strategies
+def _compute_moe_deepseek_layer_operations_strategy_tbo(
+    layer: torch.nn.Module,
+    forward_mode: ForwardMode,
+) -> OperationsStrategy:
+    assert layer.is_layer_sparse, "dense layer TBO not yet implemented"
+    if forward_mode == ForwardMode.EXTEND:
+        return _compute_moe_deepseek_blog_prefill(layer)
+    elif forward_mode == ForwardMode.DECODE:
+        return _compute_moe_deepseek_blog_decode(layer)
+    else:
+        raise NotImplementedError(f"Unsupported {forward_mode=}")
+
+
+def _compute_moe_deepseek_blog_prefill(layer):
+    device_properties = torch.cuda.get_device_properties(device="cuda")
+    total_num_sms = device_properties.multi_processor_count
+    deep_gemm_num_sms = total_num_sms - DeepEPConfig.get_instance().num_sms
+
+    return OperationsStrategy(
+        deep_gemm_num_sms=deep_gemm_num_sms,
+        tbo_delta_stages=0,
+        operations=[
+            layer.op_comm_prepare_attn,
+            layer.self_attn.op_prepare,
+            layer.self_attn.op_core,
+            layer.op_comm_prepare_mlp,
+            layer.mlp.op_gate,
+            layer.mlp.op_select_experts,
+            layer.mlp.op_dispatch_a,
+            operations.YieldOperation(),
+            layer.mlp.op_dispatch_b,
+            layer.mlp.op_experts,
+            layer.mlp.op_combine_a,
+            operations.YieldOperation(),
+            layer.mlp.op_shared_experts,
+            layer.mlp.op_combine_b,
+            layer.mlp.op_output,
+            layer.op_comm_postprocess_layer,
+        ],
+    )
+
+
+def _compute_moe_deepseek_blog_decode(layer):
+    return OperationsStrategy(
+        deep_gemm_num_sms=None,
+        tbo_delta_stages=2,
+        operations=[
+            layer.op_comm_prepare_attn,
+            layer.self_attn.op_prepare,
+            operations.YieldOperation(),
+            layer.self_attn.op_core,
+            layer.op_comm_prepare_mlp,
+            layer.mlp.op_gate,
+            layer.mlp.op_select_experts,
+            operations.YieldOperation(),
+            layer.mlp.op_dispatch_a,
+            layer.mlp.op_shared_experts,
+            operations.YieldOperation(),
+            layer.mlp.op_dispatch_b,
+            layer.mlp.op_experts,
+            layer.mlp.op_combine_a,
+            operations.YieldOperation(),
+            layer.mlp.op_combine_b,
+            operations.YieldOperation(),
+            layer.mlp.op_output,
+            layer.op_comm_postprocess_layer,
+        ],
+    )
+
+
+# -------------------------------- Strategy for Qwen3 ---------------------------------------
+
 
-def compute_layer_operations(
+# TODO: unstable, current strategy is almost the same as DeepSeek, keep redundant code here for
+# convenience to adjust strategy
+def _compute_moe_qwen3_layer_operations_strategy_tbo(
     layer: torch.nn.Module,
-):
-    if not layer.is_layer_sparse:
-        return [
+    forward_mode: ForwardMode,
+) -> OperationsStrategy:
+    assert layer.is_layer_sparse, "qwen3 moe only support sparse layers"
+    if forward_mode == ForwardMode.EXTEND:
+        return _compute_moe_qwen3_prefill(layer)
+    elif forward_mode == ForwardMode.DECODE:
+        return _compute_moe_qwen3_decode(layer)
+    else:
+        raise NotImplementedError(f"Unsupported {forward_mode=}")
+
+
+def _compute_moe_qwen3_prefill(layer):
+    device_properties = torch.cuda.get_device_properties(device="cuda")
+    total_num_sms = device_properties.multi_processor_count
+    deep_gemm_num_sms = total_num_sms - DeepEPConfig.get_instance().num_sms
+
+    return OperationsStrategy(
+        deep_gemm_num_sms=deep_gemm_num_sms,
+        tbo_delta_stages=0,
+        operations=[
+            layer.op_comm_prepare_attn,
+            layer.self_attn.op_prepare,
+            layer.self_attn.op_core,
+            layer.op_comm_prepare_mlp,
+            layer.mlp.op_gate,
+            layer.mlp.op_select_experts,
+            layer.mlp.op_dispatch_a,
+            operations.YieldOperation(),
+            layer.mlp.op_dispatch_b,
+            layer.mlp.op_experts,
+            layer.mlp.op_combine_a,
+            operations.YieldOperation(),
+            layer.mlp.op_combine_b,
+            layer.mlp.op_output,
+            layer.op_comm_postprocess_layer,
+        ],
+    )
+
+
+def _compute_moe_qwen3_decode(layer):
+    return OperationsStrategy(
+        deep_gemm_num_sms=None,
+        tbo_delta_stages=2,
+        operations=[
             layer.op_comm_prepare_attn,
-            layer.op_attn,
+            layer.self_attn.op_prepare,
+            operations.YieldOperation(),
+            layer.self_attn.op_core,
             layer.op_comm_prepare_mlp,
-            layer.op_mlp,
+            layer.mlp.op_gate,
+            layer.mlp.op_select_experts,
+            operations.YieldOperation(),
+            layer.mlp.op_dispatch_a,
+            operations.YieldOperation(),
+            layer.mlp.op_dispatch_b,
+            layer.mlp.op_experts,
+            layer.mlp.op_combine_a,
+            operations.YieldOperation(),
+            layer.mlp.op_combine_b,
+            layer.mlp.op_output,
             layer.op_comm_postprocess_layer,
-        ]
-
-    # Will add TBO operation orders here
-    return [
-        layer.op_comm_prepare_attn,
-        layer.op_attn,
-        layer.op_comm_prepare_mlp,
-        layer.mlp.op_gate,
-        layer.mlp.op_shared_experts,
-        layer.mlp.op_select_experts,
-        layer.mlp.op_dispatch_a,
-        layer.mlp.op_dispatch_b,
-        layer.mlp.op_experts,
-        layer.mlp.op_combine_a,
-        layer.mlp.op_combine_b,
-        layer.mlp.op_output,
-        layer.op_comm_postprocess_layer,
-    ]
+            operations.YieldOperation(),
+        ],
+    )
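The strategies above are flat operation lists punctuated by operations.YieldOperation() markers; _convert_operations_to_stages splits them into stages at each marker, and tbo_delta_stages tells the runner how many stages apart the two micro-batches should run. A rough sketch of that chunking step (YieldMarker and chunk_by_separator are simplified stand-ins for the real helpers):

# Sketch of splitting an operation list into stages at YieldOperation markers.
from typing import Callable, Iterable, List


class YieldMarker:
    pass


def chunk_by_separator(items: Iterable, is_separator: Callable[[object], bool]) -> List[list]:
    chunks, pending = [], []
    for item in items:
        if is_separator(item):
            chunks.append(pending)
            pending = []
        else:
            pending.append(item)
    if pending:
        chunks.append(pending)
    return chunks


ops = ["op_attn", "op_dispatch_a", YieldMarker(), "op_experts", YieldMarker(), "op_combine_b"]
stages = chunk_by_separator(ops, lambda op: isinstance(op, YieldMarker))
print(stages)  # [['op_attn', 'op_dispatch_a'], ['op_experts'], ['op_combine_b']]
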
sglang/srt/sampling/sampling_batch_info.py CHANGED
@@ -9,10 +9,12 @@ import torch
 
 import sglang.srt.sampling.penaltylib as penaltylib
 from sglang.srt.sampling.custom_logit_processor import CustomLogitProcessor
+from sglang.srt.sampling.sampling_params import TOP_K_ALL
 
 if TYPE_CHECKING:
     from sglang.srt.managers.schedule_batch import ScheduleBatch
 
+
 logger = logging.getLogger(__name__)
 
 
@@ -27,6 +29,12 @@ class SamplingBatchInfo:
     # Whether all requests use greedy sampling
     is_all_greedy: bool
 
+    # Whether any requests use top_p sampling
+    need_top_p_sampling: bool
+
+    # Whether any requests use top_k sampling
+    need_top_k_sampling: bool
+
     # Whether any request needs min_p sampling
     need_min_p_sampling: bool
 
@@ -133,6 +141,8 @@ class SamplingBatchInfo:
            top_ks=top_ks,
            min_ps=min_ps,
            is_all_greedy=all(r.sampling_params.top_k <= 1 for r in reqs),
+           need_top_p_sampling=any(r.sampling_params.top_p != 1.0 for r in reqs),
+           need_top_k_sampling=any(r.sampling_params.top_k != TOP_K_ALL for r in reqs),
            need_min_p_sampling=any(r.sampling_params.min_p > 0 for r in reqs),
            vocab_size=vocab_size,
            penalizer_orchestrator=penalizer_orchestrator,
@@ -167,7 +177,7 @@
 
        # Apply the mask
        for i, grammar in enumerate(self.grammars):
-           if grammar and not grammar.finished:
+           if grammar and not grammar.finished and not grammar.is_terminated():
                grammar.fill_vocab_mask(self.vocab_mask, i)
 
        # Move the mask to the device if needed
@@ -308,4 +318,6 @@
            setattr(self, item, torch.cat([self_val, other_val]))
 
        self.is_all_greedy &= other.is_all_greedy
+       self.need_top_p_sampling |= other.need_top_p_sampling
+       self.need_top_k_sampling |= other.need_top_k_sampling
        self.need_min_p_sampling |= other.need_min_p_sampling
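When two running batches are merged, the new flags are OR-ed exactly like need_min_p_sampling, so the merged batch keeps needing top-p/top-k filtering if either side did. A toy illustration of that semantics (plain dicts instead of the real dataclass):

# Once any request in either batch needs top-p/top-k, the merged batch does too.
a = {"need_top_p_sampling": False, "need_top_k_sampling": True}
b = {"need_top_p_sampling": True, "need_top_k_sampling": False}
merged = {k: a[k] or b[k] for k in a}
print(merged)  # {'need_top_p_sampling': True, 'need_top_k_sampling': True}
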
sglang/srt/sampling/sampling_params.py CHANGED
@@ -16,6 +16,7 @@
 from typing import Any, Dict, List, Optional, Union
 
 _SAMPLING_EPS = 1e-6
+TOP_K_ALL = 1 << 30
 
 
 class SamplingParams:
@@ -84,7 +85,7 @@ class SamplingParams:
            self.temperature = 1.0
            self.top_k = 1
        if self.top_k == -1:
-           self.top_k = 1 << 30  # whole vocabulary
+           self.top_k = TOP_K_ALL  # whole vocabulary
 
    def verify(self):
        if self.temperature < 0.0:
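TOP_K_ALL is a sentinel meaning "no top-k truncation" (top_k=-1 is normalized to it), which is what lets SamplingBatchInfo decide per batch whether top-p/top-k filtering is needed at all. A small sketch of how those flags fall out of per-request parameters (a simplified stand-in for the real from_schedule_batch path):

# Sketch: batch-level sampling flags derived from per-request params.
from dataclasses import dataclass
from typing import List

TOP_K_ALL = 1 << 30  # sentinel: "keep the whole vocabulary"


@dataclass
class Params:
    top_p: float = 1.0
    top_k: int = TOP_K_ALL
    min_p: float = 0.0


def batch_flags(reqs: List[Params]):
    return {
        "need_top_p_sampling": any(r.top_p != 1.0 for r in reqs),
        "need_top_k_sampling": any(r.top_k != TOP_K_ALL for r in reqs),
        "need_min_p_sampling": any(r.min_p > 0 for r in reqs),
    }


print(batch_flags([Params(), Params(top_p=0.9)]))
# {'need_top_p_sampling': True, 'need_top_k_sampling': False, 'need_min_p_sampling': False}
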
sglang/srt/server_args.py CHANGED
@@ -28,6 +28,7 @@ from sglang.srt.utils import (
     configure_ipv6,
     get_device,
     get_device_memory_capacity,
+    is_cuda,
     is_flashinfer_available,
     is_hip,
     is_port_available,
@@ -60,6 +61,7 @@ class ServerArgs:
     is_embedding: bool = False
     enable_multimodal: Optional[bool] = None
     revision: Optional[str] = None
+    impl: str = "auto"
 
     # Port for the HTTP server
     host: str = "127.0.0.1"
@@ -163,20 +165,24 @@ class ServerArgs:
     enable_tokenizer_batch_encode: bool = False
     disable_outlines_disk_cache: bool = False
     disable_custom_all_reduce: bool = False
+    enable_mscclpp: bool = False
     disable_overlap_schedule: bool = False
     enable_mixed_chunk: bool = False
     enable_dp_attention: bool = False
     enable_dp_lm_head: bool = False
+    enable_two_batch_overlap: bool = False
     enable_ep_moe: bool = False
     enable_deepep_moe: bool = False
     deepep_mode: Optional[Literal["auto", "normal", "low_latency"]] = "auto"
     ep_num_redundant_experts: int = 0
-    ep_dispatch_algorithm: Optional[Literal["static", "dynamic"]] = None
+    ep_dispatch_algorithm: Optional[Literal["static", "dynamic", "fake"]] = None
     init_expert_location: str = "trivial"
     enable_eplb: bool = False
+    eplb_algorithm: str = "auto"
     eplb_rebalance_num_iterations: int = 1000
+    eplb_rebalance_layers_per_chunk: Optional[int] = None
     expert_distribution_recorder_mode: Optional[
-        Literal["stat", "per_pass", "per_token"]
+        Literal["stat", "stat_approx", "per_pass", "per_token"]
     ] = None
     expert_distribution_recorder_buffer_size: Optional[int] = None
     enable_expert_distribution_metrics: bool = False
@@ -203,7 +209,7 @@ class ServerArgs:
     flashinfer_mla_disable_ragged: bool = False
     warmups: Optional[str] = None
     moe_dense_tp_size: Optional[int] = None
-    n_share_experts_fusion: int = 0
+    disable_shared_experts_fusion: bool = False
     disable_chunked_prefix_cache: bool = False
     disable_fast_image_processor: bool = False
     mm_attention_backend: Optional[str] = None
@@ -259,17 +265,28 @@ class ServerArgs:
                self.mem_fraction_static = 0.88
            else:
                self.mem_fraction_static = 0.88
-           if gpu_mem is not None and gpu_mem > 96 * 1024:
+           if gpu_mem is not None and gpu_mem > 180 * 1000 and is_cuda():
+               self.mem_fraction_static = 0.79
+           elif gpu_mem is not None and gpu_mem > 96 * 1024:
                mem_fraction = self.mem_fraction_static
+               # 15 GB + additional 3GB for cuda graph
+               reserve_mem = 1024 * 18
+               # need reserve more memory for spec cuda graph
+               if self.speculative_algorithm is not None:
+                   reserve_mem = 1024 * 20
                self.mem_fraction_static = min(
                    mem_fraction + 48 * 1024 * (1 - mem_fraction) / gpu_mem,
-                   (gpu_mem - 1024 * 18)
-                   / gpu_mem,  # 15 GB + additional 3GB for cuda graph
+                   (gpu_mem - reserve_mem) / gpu_mem,
                )
+           else:
+               if self.speculative_algorithm is not None:
+                   self.mem_fraction_static *= 0.95
 
        # Set chunked prefill size, which depends on the gpu memory capacity
        if self.chunked_prefill_size is None:
-           if gpu_mem is not None and gpu_mem < 25_000:
+           if gpu_mem is not None and gpu_mem > 180_000:
+               self.chunked_prefill_size = 16384
+           elif gpu_mem is not None and gpu_mem < 25_000:
                self.chunked_prefill_size = 2048
            elif self.disaggregation_mode != "null":
                self.chunked_prefill_size = 16384
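A quick worked example of the new memory-fraction formula above, assuming gpu_mem is reported in MB and a roughly 141 GB device (illustrative numbers only, not taken from the diff):

# Hypothetical 141 GB card: falls into the "> 96 GiB" branch, not the "> 180 GB" one.
gpu_mem = 141 * 1024           # MB (assumed unit)
mem_fraction = 0.88
reserve_mem = 1024 * 18        # 15 GB + 3 GB for cuda graphs
spec_reserve_mem = 1024 * 20   # larger reserve when speculative decoding is enabled

base = min(mem_fraction + 48 * 1024 * (1 - mem_fraction) / gpu_mem,
           (gpu_mem - reserve_mem) / gpu_mem)
spec = min(mem_fraction + 48 * 1024 * (1 - mem_fraction) / gpu_mem,
           (gpu_mem - spec_reserve_mem) / gpu_mem)
print(round(base, 3), round(spec, 3))  # 0.872 0.858
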
@@ -309,6 +326,11 @@ class ServerArgs:
309
326
  self.sampling_backend = "pytorch"
310
327
 
311
328
  # Set kernel backends
329
+ if self.device == "cpu":
330
+ if self.attention_backend is None:
331
+ self.attention_backend = "intel_amx"
332
+ self.sampling_backend = "pytorch"
333
+
312
334
  if self.sampling_backend is None:
313
335
  self.sampling_backend = (
314
336
  "flashinfer" if is_flashinfer_available() else "pytorch"
@@ -365,12 +387,28 @@ class ServerArgs:
                 "Pipeline parallelism is incompatible with overlap schedule."
             )
 
+        if self.enable_eplb and (self.expert_distribution_recorder_mode is None):
+            self.expert_distribution_recorder_mode = "stat"
+            logger.info(
+                f"EPLB is enabled. The expert_distribution_recorder_mode is automatically set."
+            )
+
+        if (self.enable_eplb or (self.init_expert_location is not None)) and (
+            self.ep_dispatch_algorithm is None
+        ):
+            self.ep_dispatch_algorithm = "static"
+            logger.info(
+                f"EPLB is enabled or init_expert_location is provided. ep_dispatch_algorithm is configured."
+            )
+
+        if self.enable_expert_distribution_metrics and (
+            self.expert_distribution_recorder_mode is None
+        ):
+            self.expert_distribution_recorder_mode = "stat"
+
         if self.expert_distribution_recorder_buffer_size is None:
-            # TODO pr-chain: enable this later
-            # if (x := self.eplb_rebalance_num_iterations) is not None:
-            #     self.expert_distribution_recorder_buffer_size = x
-            if False:
-                pass
+            if (x := self.eplb_rebalance_num_iterations) is not None:
+                self.expert_distribution_recorder_buffer_size = x
             elif self.expert_distribution_recorder_mode is not None:
                 self.expert_distribution_recorder_buffer_size = 1000
 
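Enabling EPLB now pulls several other settings along with it. The cascade can be summarized as a standalone function; the field names match ServerArgs, but the function itself is only an illustration of the logic above:

# Illustration only: the EPLB-related defaulting cascade from __post_init__.
from typing import Optional

def resolve_eplb_defaults(
    enable_eplb: bool,
    init_expert_location: Optional[str],
    expert_distribution_recorder_mode: Optional[str],
    ep_dispatch_algorithm: Optional[str],
    enable_expert_distribution_metrics: bool,
    eplb_rebalance_num_iterations: Optional[int],
    expert_distribution_recorder_buffer_size: Optional[int],
):
    if enable_eplb and expert_distribution_recorder_mode is None:
        expert_distribution_recorder_mode = "stat"
    if (enable_eplb or init_expert_location is not None) and ep_dispatch_algorithm is None:
        ep_dispatch_algorithm = "static"
    if enable_expert_distribution_metrics and expert_distribution_recorder_mode is None:
        expert_distribution_recorder_mode = "stat"
    if expert_distribution_recorder_buffer_size is None:
        if eplb_rebalance_num_iterations is not None:
            expert_distribution_recorder_buffer_size = eplb_rebalance_num_iterations
        elif expert_distribution_recorder_mode is not None:
            expert_distribution_recorder_buffer_size = 1000
    return (
        expert_distribution_recorder_mode,
        ep_dispatch_algorithm,
        expert_distribution_recorder_buffer_size,
    )

print(resolve_eplb_defaults(True, None, None, None, False, 1000, None))
# -> ('stat', 'static', 1000)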
@@ -387,6 +425,12 @@ class ServerArgs:
                     "Overlap scheduler is disabled because of using "
                     "eagle speculative decoding."
                 )
+            if self.enable_mixed_chunk:
+                self.enable_mixed_chunk = False
+                logger.warning(
+                    "Mixed chunked prefill is disabled because of using "
+                    "eagle speculative decoding."
+                )
 
         model_arch = get_model_arch(self)
 
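Together with the overlap-scheduler warning in the surrounding context, EAGLE speculative decoding now also switches mixed chunked prefill off. A compact sketch of the two forced resets; the enclosing EAGLE check is implied by the warning text rather than shown in this hunk, and the parameter names are mine:

# Illustration only: the two knobs forced off under EAGLE speculative decoding.
def apply_eagle_constraints(enable_overlap: bool, enable_mixed_chunk: bool, speculative_algorithm: str):
    if speculative_algorithm == "EAGLE":      # assumed enclosing condition (not shown in the hunk)
        if enable_overlap:
            enable_overlap = False            # "Overlap scheduler is disabled ..."
        if enable_mixed_chunk:
            enable_mixed_chunk = False        # "Mixed chunked prefill is disabled ..."
    return enable_overlap, enable_mixed_chunk

print(apply_eagle_constraints(True, True, "EAGLE"))   # (False, False)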
@@ -409,7 +453,7 @@ class ServerArgs:
                 self.speculative_num_steps,
                 self.speculative_eagle_topk,
                 self.speculative_num_draft_tokens,
-            ) = auto_choose_speculative_params(model_arch)
+            ) = auto_choose_speculative_params(self)
 
             if self.page_size > 1 and self.speculative_eagle_topk > 1:
                 self.speculative_eagle_topk = 1
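The helper is now handed the whole ServerArgs instance (so it can load the HF config itself, as the final hunk of this file shows), and its result unpacks into (speculative_num_steps, speculative_eagle_topk, speculative_num_draft_tokens). A hedged summary of the per-architecture defaults, including the tightened DeepSeek values from the later hunk, plus a manual override via the existing speculative flags (flag names assumed unchanged in this release):

# Defaults as listed in auto_choose_speculative_params after this change.
SPEC_DEFAULTS = {
    "LlamaForCausalLM": (5, 4, 8),
    "DeepseekV3ForCausalLM": (3, 1, 4),   # tightened for DeepSeek V3/V2, see the final hunk
    "DeepseekV2ForCausalLM": (3, 1, 4),
    "Grok1ForCausalLM": (5, 4, 8),
    "Grok1VForCausalLM": (5, 4, 8),
}

steps, topk, draft = SPEC_DEFAULTS["DeepseekV3ForCausalLM"]
# Hedged: replaying the defaults explicitly through the existing CLI flags.
override = [
    "--speculative-algorithm", "EAGLE",
    "--speculative-num-steps", str(steps),
    "--speculative-eagle-topk", str(topk),
    "--speculative-num-draft-tokens", str(draft),
]
print(override)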
@@ -691,6 +735,18 @@ class ServerArgs:
             default=ServerArgs.page_size,
             help="The number of tokens in a page.",
         )
+        parser.add_argument(
+            "--impl",
+            type=str,
+            default=ServerArgs.impl,
+            help="Which implementation of the model to use.\n\n"
+            '* "auto" will try to use the SGLang implementation if it exists '
+            "and fall back to the Transformers implementation if no SGLang "
+            "implementation is available.\n"
+            '* "sglang" will use the SGLang model implementation.\n'
+            '* "transformers" will use the Transformers model '
+            "implementation.\n",
+        )
 
         # Other runtime options
         parser.add_argument(
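A hedged usage example for the new --impl flag, driving the same parser this method builds; it assumes ServerArgs.add_cli_args and ServerArgs.from_cli_args keep their existing signatures, and the model path is a placeholder:

# Hedged usage example (not from the diff): exercise the new --impl flag via the CLI parser.
import argparse
from sglang.srt.server_args import ServerArgs

parser = argparse.ArgumentParser()
ServerArgs.add_cli_args(parser)
args = parser.parse_args(
    ["--model-path", "Qwen/Qwen2.5-7B-Instruct", "--impl", "transformers"]
)
server_args = ServerArgs.from_cli_args(args)
print(server_args.impl)   # "transformers": force the Transformers fallback implementation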
@@ -957,12 +1013,13 @@ class ServerArgs:
             type=str,
             choices=[
                 "aiter",
-                "flashinfer",
-                "triton",
-                "torch_native",
+                "cutlass_mla",
                 "fa3",
+                "flashinfer",
                 "flashmla",
-                "cutlass_mla",
+                "intel_amx",
+                "torch_native",
+                "triton",
             ],
             default=ServerArgs.attention_backend,
             help="Choose the kernels for attention layers.",
@@ -1119,6 +1176,11 @@ class ServerArgs:
             action="store_true",
             help="Disable the custom all-reduce kernel and fall back to NCCL.",
         )
+        parser.add_argument(
+            "--enable-mscclpp",
+            action="store_true",
+            help="Enable using mscclpp for small messages for all-reduce kernel and fall back to NCCL.",
+        )
         parser.add_argument(
             "--disable-overlap-schedule",
             action="store_true",
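A hedged example of opting in to the new MSCCL++ small-message all-reduce path; only the flag name comes from this diff, and the rest of the command is a placeholder:

# Hedged launch sketch: enable mscclpp for small all-reduce messages, NCCL otherwise.
cli_args = [
    "--model-path", "meta-llama/Llama-3.1-8B-Instruct",   # placeholder model
    "--tp-size", "8",
    "--enable-mscclpp",
]
print(" ".join(["python", "-m", "sglang.launch_server", *cli_args]))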
@@ -1144,6 +1206,11 @@ class ServerArgs:
             action="store_true",
             help="Enabling expert parallelism for moe. The ep size is equal to the tp size.",
         )
+        parser.add_argument(
+            "--enable-two-batch-overlap",
+            action="store_true",
+            help="Enabling two micro batches to overlap.",
+        )
         parser.add_argument(
             "--enable-torch-compile",
             action="store_true",
@@ -1295,12 +1362,24 @@ class ServerArgs:
             action="store_true",
             help="Enable EPLB algorithm",
         )
+        parser.add_argument(
+            "--eplb-algorithm",
+            type=str,
+            default=ServerArgs.eplb_algorithm,
+            help="Chosen EPLB algorithm",
+        )
         parser.add_argument(
             "--eplb-rebalance-num-iterations",
             type=int,
             default=ServerArgs.eplb_rebalance_num_iterations,
             help="Number of iterations to automatically trigger a EPLB re-balance.",
         )
+        parser.add_argument(
+            "--eplb-rebalance-layers-per-chunk",
+            type=int,
+            default=ServerArgs.eplb_rebalance_layers_per_chunk,
+            help="Number of layers to rebalance per forward pass.",
+        )
         parser.add_argument(
             "--expert-distribution-recorder-mode",
             type=str,
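Putting the EPLB flags together, a hedged launch example; the flag names follow the fields and arguments in this diff, while the model path and values are placeholders:

# Hedged launch sketch combining the new EPLB-related flags.
cli_args = [
    "--model-path", "deepseek-ai/DeepSeek-V3",            # placeholder model
    "--enable-eplb",
    "--eplb-algorithm", "auto",
    "--eplb-rebalance-num-iterations", "1000",
    "--eplb-rebalance-layers-per-chunk", "4",
    "--expert-distribution-recorder-mode", "stat",
]
print(" ".join(["python", "-m", "sglang.launch_server", *cli_args]))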
@@ -1322,15 +1401,12 @@ class ServerArgs:
             "--deepep-config",
             type=str,
             default=ServerArgs.deepep_config,
-            help="Tuned DeepEP config suitable for your own cluster.",
+            help="Tuned DeepEP config suitable for your own cluster. It can be either a string with JSON content or a file path.",
         )
-
         parser.add_argument(
-            "--n-share-experts-fusion",
-            type=int,
-            default=0,
-            help="The number of shared_experts need to be replicated to fuse with normal experts in deepseek v3/r1, "
-            "set it to tp_size can get best optimized performance. Note that for architectures with SM==90, we have enabled the shared experts fusion optimization by default for DeepSeek V3/R1, with n_share_experts_fusion automatically set to the TP size.",
+            "--disable-shared-experts-fusion",
+            action="store_true",
+            help="Disable shared experts fusion optimization for deepseek v3/r1.",
         )
         parser.add_argument(
             "--disable-chunked-prefix-cache",
@@ -1451,7 +1527,7 @@ class ServerArgs:
             self.max_loras_per_batch > 0
             # FIXME
             and (self.lora_paths is None or self.disable_radix_cache)
-        ), "compatibility of lora and cuda graph and radix attention is in progress"
+        ), "compatibility of lora and radix attention is in progress"
         assert self.base_gpu_id >= 0, "base_gpu_id must be non-negative"
         assert self.gpu_id_step >= 1, "gpu_id_step must be positive"
 
@@ -1585,18 +1661,29 @@ def get_model_arch(args: ServerArgs):
     return hf_config.architectures[0]
 
 
-def auto_choose_speculative_params(arch: str):
+def auto_choose_speculative_params(self: ServerArgs):
     """
     Automatically choose the parameters for speculative decoding.
 
     You can tune them on your own models and prompts with scripts/playground/bench_speculative.py
     """
+    kwargs = {}
+
+    hf_config = get_config(
+        self.model_path,
+        trust_remote_code=self.trust_remote_code,
+        revision=self.revision,
+        model_override_args=json.loads(self.json_model_override_args),
+        **kwargs,
+    )
+    arch = hf_config.architectures[0]
+
     if arch in ["LlamaForCausalLM"]:
         # The default value for llama
         return (5, 4, 8)
     elif arch in ["DeepseekV3ForCausalLM", "DeepseekV2ForCausalLM"]:
         # The default value for deepseek
-        return (5, 4, 8)
+        return (3, 1, 4)
     elif arch in ["Grok1ForCausalLM", "Grok1VForCausalLM"]:
         return (5, 4, 8)
     else: