sglang 0.4.6.post5__py3-none-any.whl → 0.4.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (318)
  1. sglang/bench_offline_throughput.py +10 -4
  2. sglang/bench_one_batch_server.py +67 -11
  3. sglang/bench_serving.py +85 -74
  4. sglang/lang/backend/runtime_endpoint.py +24 -1
  5. sglang/profiler.py +167 -0
  6. sglang/srt/_custom_ops.py +34 -0
  7. sglang/srt/configs/internvl.py +8 -12
  8. sglang/srt/configs/model_config.py +27 -1
  9. sglang/srt/constrained/base_grammar_backend.py +5 -2
  10. sglang/srt/constrained/llguidance_backend.py +9 -8
  11. sglang/srt/constrained/outlines_backend.py +5 -4
  12. sglang/srt/constrained/xgrammar_backend.py +18 -18
  13. sglang/srt/conversation.py +46 -8
  14. sglang/srt/custom_op.py +38 -3
  15. sglang/srt/debug_utils.py +74 -0
  16. sglang/srt/disaggregation/common/__init__.py +1 -0
  17. sglang/srt/disaggregation/common/conn.py +407 -0
  18. sglang/srt/disaggregation/decode.py +67 -3
  19. sglang/srt/disaggregation/fake/conn.py +1 -0
  20. sglang/srt/disaggregation/kv_events.py +60 -5
  21. sglang/srt/disaggregation/launch_lb.py +140 -0
  22. sglang/srt/disaggregation/mini_lb.py +29 -48
  23. sglang/srt/disaggregation/mooncake/conn.py +432 -140
  24. sglang/srt/disaggregation/mooncake/transfer_engine.py +32 -16
  25. sglang/srt/disaggregation/nixl/conn.py +124 -432
  26. sglang/srt/disaggregation/prefill.py +2 -0
  27. sglang/srt/disaggregation/utils.py +38 -1
  28. sglang/srt/distributed/device_communicators/pymscclpp.py +315 -0
  29. sglang/srt/distributed/parallel_state.py +52 -5
  30. sglang/srt/entrypoints/EngineBase.py +6 -0
  31. sglang/srt/entrypoints/engine.py +102 -5
  32. sglang/srt/entrypoints/http_server.py +15 -2
  33. sglang/srt/function_call/base_format_detector.py +138 -86
  34. sglang/srt/function_call/deepseekv3_detector.py +54 -6
  35. sglang/srt/function_call/ebnf_composer.py +33 -19
  36. sglang/srt/function_call/function_call_parser.py +27 -0
  37. sglang/srt/function_call/llama32_detector.py +33 -14
  38. sglang/srt/function_call/mistral_detector.py +73 -26
  39. sglang/srt/function_call/pythonic_detector.py +86 -20
  40. sglang/srt/function_call/qwen25_detector.py +64 -10
  41. sglang/srt/function_call/utils.py +17 -0
  42. sglang/srt/hf_transformers_utils.py +4 -0
  43. sglang/srt/layers/attention/aiter_backend.py +488 -123
  44. sglang/srt/layers/attention/base_attn_backend.py +4 -0
  45. sglang/srt/layers/attention/cutlass_mla_backend.py +2 -19
  46. sglang/srt/layers/attention/flashattention_backend.py +103 -18
  47. sglang/srt/layers/attention/flashinfer_backend.py +45 -1
  48. sglang/srt/layers/attention/flashinfer_mla_backend.py +37 -1
  49. sglang/srt/layers/attention/intel_amx_backend.py +128 -0
  50. sglang/srt/layers/attention/tbo_backend.py +232 -0
  51. sglang/srt/layers/attention/torch_native_backend.py +3 -0
  52. sglang/srt/layers/attention/triton_backend.py +244 -5
  53. sglang/srt/layers/attention/triton_ops/extend_attention.py +12 -4
  54. sglang/srt/layers/communicator.py +260 -194
  55. sglang/srt/layers/dp_attention.py +6 -5
  56. sglang/srt/layers/layernorm.py +30 -19
  57. sglang/srt/layers/moe/cutlass_moe.py +170 -7
  58. sglang/srt/layers/moe/cutlass_moe_params.py +169 -0
  59. sglang/srt/layers/moe/ep_moe/kernels.py +27 -6
  60. sglang/srt/layers/moe/ep_moe/layer.py +94 -40
  61. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +13 -8
  62. sglang/srt/layers/moe/fused_moe_native.py +4 -0
  63. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  64. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  65. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  66. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  67. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  68. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  69. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  70. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  71. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +220 -25
  72. sglang/srt/layers/moe/fused_moe_triton/layer.py +34 -4
  73. sglang/srt/layers/moe/topk.py +44 -18
  74. sglang/srt/layers/multimodal.py +3 -3
  75. sglang/srt/layers/quantization/__init__.py +3 -2
  76. sglang/srt/layers/quantization/blockwise_int8.py +3 -0
  77. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +5 -0
  78. sglang/srt/layers/quantization/deep_gemm.py +55 -56
  79. sglang/srt/layers/quantization/fp8.py +28 -23
  80. sglang/srt/layers/quantization/fp8_kernel.py +118 -66
  81. sglang/srt/layers/quantization/fp8_utils.py +165 -49
  82. sglang/srt/layers/quantization/modelopt_quant.py +334 -7
  83. sglang/srt/layers/quantization/moe_wna16.py +3 -0
  84. sglang/srt/layers/quantization/w8a8_fp8.py +3 -0
  85. sglang/srt/layers/quantization/w8a8_int8.py +3 -0
  86. sglang/srt/layers/rotary_embedding.py +6 -12
  87. sglang/srt/layers/sampler.py +80 -79
  88. sglang/srt/layers/utils.py +6 -0
  89. sglang/srt/lora/layers.py +12 -15
  90. sglang/srt/lora/lora.py +49 -5
  91. sglang/srt/lora/lora_manager.py +19 -5
  92. sglang/srt/lora/mem_pool.py +24 -16
  93. sglang/srt/lora/utils.py +17 -13
  94. sglang/srt/managers/data_parallel_controller.py +13 -5
  95. sglang/srt/managers/eplb_algorithms/__init__.py +63 -0
  96. sglang/srt/managers/eplb_algorithms/deepseek.py +223 -0
  97. sglang/srt/managers/{deepseek_eplb.py → eplb_algorithms/deepseek_vec.py} +5 -7
  98. sglang/srt/managers/eplb_manager.py +55 -14
  99. sglang/srt/managers/expert_distribution.py +220 -46
  100. sglang/srt/managers/expert_location.py +110 -56
  101. sglang/srt/managers/expert_location_dispatch.py +23 -6
  102. sglang/srt/managers/io_struct.py +15 -4
  103. sglang/srt/managers/mm_utils.py +88 -38
  104. sglang/srt/managers/multimodal_processors/base_processor.py +188 -16
  105. sglang/srt/managers/multimodal_processors/gemma3.py +4 -31
  106. sglang/srt/managers/multimodal_processors/internvl.py +4 -0
  107. sglang/srt/managers/multimodal_processors/kimi_vl.py +15 -34
  108. sglang/srt/managers/multimodal_processors/minicpm.py +2 -1
  109. sglang/srt/managers/multimodal_processors/phi4mm.py +87 -0
  110. sglang/srt/managers/multimodal_processors/qwen_vl.py +22 -64
  111. sglang/srt/managers/schedule_batch.py +140 -38
  112. sglang/srt/managers/scheduler.py +305 -112
  113. sglang/srt/managers/tokenizer_manager.py +134 -17
  114. sglang/srt/managers/utils.py +0 -4
  115. sglang/srt/metrics/collector.py +9 -0
  116. sglang/srt/model_executor/cuda_graph_runner.py +72 -61
  117. sglang/srt/model_executor/expert_location_updater.py +157 -22
  118. sglang/srt/model_executor/forward_batch_info.py +38 -17
  119. sglang/srt/model_executor/model_runner.py +96 -56
  120. sglang/srt/model_loader/utils.py +67 -1
  121. sglang/srt/models/deepseek_nextn.py +1 -1
  122. sglang/srt/models/deepseek_v2.py +609 -234
  123. sglang/srt/models/gemma3_causal.py +7 -0
  124. sglang/srt/models/gemma3_mm.py +19 -14
  125. sglang/srt/models/idefics2.py +342 -0
  126. sglang/srt/models/kimi_vl.py +4 -4
  127. sglang/srt/models/llama.py +1 -1
  128. sglang/srt/models/minicpmo.py +2 -5
  129. sglang/srt/models/minicpmv.py +3 -295
  130. sglang/srt/models/phi4mm.py +512 -0
  131. sglang/srt/models/qwen2.py +38 -9
  132. sglang/srt/models/qwen2_5_vl.py +3 -9
  133. sglang/srt/models/qwen2_eagle.py +4 -1
  134. sglang/srt/models/qwen2_moe.py +58 -191
  135. sglang/srt/models/qwen2_vl.py +3 -9
  136. sglang/srt/models/qwen3.py +41 -10
  137. sglang/srt/models/qwen3_moe.py +230 -191
  138. sglang/srt/models/registry.py +9 -1
  139. sglang/srt/models/transformers.py +291 -0
  140. sglang/srt/openai_api/adapter.py +86 -24
  141. sglang/srt/openai_api/protocol.py +31 -2
  142. sglang/srt/openai_api/utils.py +172 -0
  143. sglang/srt/operations.py +37 -2
  144. sglang/srt/operations_strategy.py +200 -24
  145. sglang/srt/sampling/sampling_batch_info.py +13 -1
  146. sglang/srt/sampling/sampling_params.py +2 -1
  147. sglang/srt/server_args.py +114 -27
  148. sglang/srt/speculative/build_eagle_tree.py +8 -8
  149. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +8 -11
  150. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +253 -0
  151. sglang/srt/speculative/eagle_utils.py +51 -91
  152. sglang/srt/speculative/eagle_worker.py +101 -21
  153. sglang/srt/two_batch_overlap.py +635 -0
  154. sglang/srt/utils.py +129 -7
  155. sglang/test/runners.py +16 -7
  156. sglang/test/send_one.py +4 -0
  157. sglang/test/test_cutlass_moe.py +3 -3
  158. sglang/test/test_fp4_moe.py +248 -0
  159. sglang/test/test_utils.py +79 -6
  160. sglang/version.py +1 -1
  161. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/METADATA +14 -11
  162. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/RECORD +318 -291
  163. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/WHEEL +1 -1
  164. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  165. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  166. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  167. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  168. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  169. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  170. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  171. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  172. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  173. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  174. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  175. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  176. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  177. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H200.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json} +0 -0
  178. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  179. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  180. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  181. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  182. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  183. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  184. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  185. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  186. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  187. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  188. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  189. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  190. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  191. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  192. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  193. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  194. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  195. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  196. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  197. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  198. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  199. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  200. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  201. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  202. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  203. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  204. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  205. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  206. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json} +0 -0
  207. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  208. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  209. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  210. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  211. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  212. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  213. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  214. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  215. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  216. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json} +0 -0
  217. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json} +0 -0
  218. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  219. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  220. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  221. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  222. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  223. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  224. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json} +0 -0
  225. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  226. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  227. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json} +0 -0
  228. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  229. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  230. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  231. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json} +0 -0
  232. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  233. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  234. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  235. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
  236. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  237. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  238. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  239. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json} +0 -0
  240. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json} +0 -0
  241. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json} +0 -0
  242. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json} +0 -0
  243. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  244. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  245. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json} +0 -0
  246. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json} +0 -0
  247. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json} +0 -0
  248. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json} +0 -0
  249. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  250. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  251. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  252. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  253. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json} +0 -0
  254. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  255. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  256. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  257. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  258. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json} +0 -0
  259. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json} +0 -0
  260. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json} +0 -0
  261. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json} +0 -0
  262. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  263. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  264. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
  265. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  266. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  267. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  268. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json} +0 -0
  269. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_L40S.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json} +0 -0
  270. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
  271. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
  272. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
  273. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  274. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  275. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  276. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  277. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json} +0 -0
  278. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json} +0 -0
  279. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json} +0 -0
  280. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json} +0 -0
  281. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  282. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  283. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  284. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  285. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json} +0 -0
  286. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
  287. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
  288. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
  289. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  290. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  291. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  292. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  293. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json} +0 -0
  294. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json} +0 -0
  295. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  296. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  297. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json} +0 -0
  298. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  299. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json} +0 -0
  300. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  301. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  302. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  303. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  304. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json} +0 -0
  305. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  306. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json} +0 -0
  307. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=96,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json} +0 -0
  308. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  309. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  310. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  311. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  312. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  313. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  314. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  315. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  316. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  317. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/licenses/LICENSE +0 -0
  318. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/top_level.txt +0 -0
sglang/srt/managers/tokenizer_manager.py

@@ -18,6 +18,7 @@ import copy
  import dataclasses
  import json
  import logging
+ import math
  import os
  import pickle
  import signal
@@ -42,6 +43,7 @@ from typing import (
  )

  import fastapi
+ import torch
  import uvloop
  import zmq
  import zmq.asyncio
@@ -114,6 +116,7 @@ from sglang.srt.sampling.sampling_params import SamplingParams
  from sglang.srt.server_args import PortArgs, ServerArgs
  from sglang.srt.utils import (
  dataclass_to_string_truncated,
+ get_bool_env_var,
  get_zmq_socket,
  kill_process_tree,
  )
@@ -221,7 +224,7 @@ class TokenizerManager:
  self.tokenizer = get_tokenizer_from_processor(self.processor)
  os.environ["TOKENIZERS_PARALLELISM"] = "false"
  else:
- self.mm_processor = get_dummy_processor()
+ self.mm_processor = None

  if server_args.skip_tokenizer_init:
  self.tokenizer = self.processor = None
@@ -395,6 +398,9 @@ class TokenizerManager:
  self.server_args.disaggregation_bootstrap_port
  )

+ self.current_load = 0
+ self.current_load_lock = asyncio.Lock()
+
  async def generate_request(
  self,
  obj: Union[GenerateReqInput, EmbeddingReqInput],
@@ -422,8 +428,8 @@ class TokenizerManager:
  is_single = obj.is_single
  if is_single:
  tokenized_obj = await self._tokenize_one_request(obj)
- self._send_one_request(obj, tokenized_obj, created_time)
- async for response in self._wait_one_response(obj, request):
+ state = self._send_one_request(obj, tokenized_obj, created_time)
+ async for response in self._wait_one_response(obj, state, request):
  yield response
  else:
  async for response in self._handle_batch_request(
@@ -459,8 +465,7 @@ class TokenizerManager:
  )
  input_ids = self.tokenizer.encode(input_text)

- image_inputs: Optional[Dict] = None
- if obj.contains_mm_input():
+ if self.mm_processor and obj.contains_mm_input():
  image_inputs = await self.mm_processor.process_mm_data_async(
  image_data=obj.image_data,
  input_text=input_text or input_ids,
@@ -469,6 +474,8 @@ class TokenizerManager:
  )
  if image_inputs and "input_ids" in image_inputs:
  input_ids = image_inputs["input_ids"]
+ else:
+ image_inputs: Optional[Dict] = None

  self._validate_token_len(obj, input_ids)
  return self._create_tokenized_object(
@@ -563,6 +570,7 @@ class TokenizerManager:
  session_params=session_params,
  custom_logit_processor=obj.custom_logit_processor,
  return_hidden_states=obj.return_hidden_states,
+ data_parallel_rank=obj.data_parallel_rank,
  )
  elif isinstance(obj, EmbeddingReqInput):
  tokenized_obj = TokenizedEmbeddingReqInput(
@@ -628,15 +636,15 @@ class TokenizerManager:
  self.send_to_scheduler.send_pyobj(tokenized_obj)
  state = ReqState([], False, asyncio.Event(), obj, created_time=created_time)
  self.rid_to_state[obj.rid] = state
+ return state

  async def _wait_one_response(
  self,
  obj: Union[GenerateReqInput, EmbeddingReqInput],
+ state: ReqState,
  request: Optional[fastapi.Request] = None,
  ):
  """Wait for the response of one request."""
- state = self.rid_to_state[obj.rid]
-
  while True:
  try:
  await asyncio.wait_for(state.event.wait(), timeout=4)
@@ -706,16 +714,16 @@ class TokenizerManager:

  for i, tokenized_obj in enumerate(tokenized_objs):
  tmp_obj = obj[i]
- self._send_one_request(tmp_obj, tokenized_obj, created_time)
- generators.append(self._wait_one_response(tmp_obj, request))
+ state = self._send_one_request(tmp_obj, tokenized_obj, created_time)
+ generators.append(self._wait_one_response(tmp_obj, state, request))
  rids.append(tmp_obj.rid)
  else:
  # Sequential tokenization and processing
  for i in range(batch_size):
  tmp_obj = obj[i]
  tokenized_obj = await self._tokenize_one_request(tmp_obj)
- self._send_one_request(tmp_obj, tokenized_obj, created_time)
- generators.append(self._wait_one_response(tmp_obj, request))
+ state = self._send_one_request(tmp_obj, tokenized_obj, created_time)
+ generators.append(self._wait_one_response(tmp_obj, state, request))
  rids.append(tmp_obj.rid)
  else:
  # FIXME: When using batch and parallel_sample_num together, the perf is not optimal.
@@ -740,8 +748,8 @@
  tokenized_obj.sampling_params = copy.copy(tokenized_obj.sampling_params)
  tokenized_obj.sampling_params.max_new_tokens = 0
  tokenized_obj.stream = False
- self._send_one_request(tmp_obj, tokenized_obj, created_time)
- await self._wait_one_response(tmp_obj, request).__anext__()
+ state = self._send_one_request(tmp_obj, tokenized_obj, created_time)
+ await self._wait_one_response(tmp_obj, state, request).__anext__()

  # Expand requests, assign new rids for them, and send them
  for i in range(batch_size):
@@ -749,8 +757,8 @@
  tmp_obj = copy.copy(objs[i])
  tokenized_obj = copy.copy(tokenized_objs[i])
  tokenized_obj.rid = tmp_obj.regenerate_rid()
- self._send_one_request(tmp_obj, tokenized_obj, created_time)
- generators.append(self._wait_one_response(tmp_obj, request))
+ state = self._send_one_request(tmp_obj, tokenized_obj, created_time)
+ generators.append(self._wait_one_response(tmp_obj, state, request))
  rids.append(tmp_obj.rid)

  # Wait for all requests
@@ -786,6 +794,9 @@
  req = AbortReq(rid)
  self.send_to_scheduler.send_pyobj(req)

+ if self.enable_metrics:
+ self.metrics_collector.observe_one_aborted_request()
+
  async def start_profile(
  self,
  output_dir: Optional[str] = None,
@@ -793,8 +804,11 @@
  activities: Optional[List[str]] = None,
  with_stack: Optional[bool] = None,
  record_shapes: Optional[bool] = None,
+ profile_by_stage: bool = False,
  ):
  self.auto_create_handle_loop()
+ env_with_stack: bool = get_bool_env_var("SGLANG_PROFILE_WITH_STACK", "true")
+ with_stack = False if with_stack is False or env_with_stack is False else True
  req = ProfileReq(
  type=ProfileReqType.START_PROFILE,
  output_dir=output_dir,
@@ -802,6 +816,7 @@
  activities=activities,
  with_stack=with_stack,
  record_shapes=record_shapes,
+ profile_by_stage=profile_by_stage,
  profile_id=str(time.time()),
  )
  return await self._execute_profile(req)
@@ -841,7 +856,7 @@
  obj.load_format = self.server_args.load_format
  logger.info("Start update_weights. Load format=%s", obj.load_format)

- if True:
+ if True: # Keep this redundant check to simplify some internal code sync
  # Hold the lock if it is not async. This means that weight sync
  # cannot run while requests are in progress.
  async with self.model_update_lock.writer_lock:
@@ -983,6 +998,14 @@
  # Many DP ranks
  return [res.internal_state for res in responses]

+ async def get_load(self) -> dict:
+ # TODO(lsyin): fake load report server
+ if not self.current_load_lock.locked():
+ async with self.current_load_lock:
+ internal_state = await self.get_internal_state()
+ self.current_load = internal_state[0]["load"]
+ return {"load": self.current_load}
+
  async def set_internal_state(
  self, obj: SetInternalStateReq
  ) -> SetInternalStateReqOutput:
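
Note on the new get_load() helper above: it caches the last reported load in self.current_load and refreshes it via get_internal_state() only when no other coroutine already holds current_load_lock, so concurrent pollers reuse the cached value instead of each querying the scheduler. A minimal standalone sketch of that pattern, where fetch_load() is a hypothetical stand-in for the internal-state query:

```python
import asyncio
import random


class LoadReporter:
    """Illustrative sketch of the cached-load pattern used by get_load()."""

    def __init__(self):
        self.current_load = 0
        self.current_load_lock = asyncio.Lock()

    async def fetch_load(self) -> int:
        # Hypothetical stand-in for querying the scheduler's internal state.
        await asyncio.sleep(0.05)
        return random.randint(0, 100)

    async def get_load(self) -> dict:
        # Refresh only if no refresh is already in flight; otherwise return
        # the cached value immediately.
        if not self.current_load_lock.locked():
            async with self.current_load_lock:
                self.current_load = await self.fetch_load()
        return {"load": self.current_load}


async def main():
    reporter = LoadReporter()
    results = await asyncio.gather(*(reporter.get_load() for _ in range(5)))
    print(results)


if __name__ == "__main__":
    asyncio.run(main())
```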
@@ -1400,7 +1423,7 @@
  asyncio.create_task(asyncio.to_thread(background_task))

  def _handle_abort_req(self, recv_obj):
- self.rid_to_state.pop(recv_obj.rid)
+ self.rid_to_state.pop(recv_obj.rid, None)

  def _handle_open_session_req_output(self, recv_obj):
  self.session_futures[recv_obj.session_id].set_result(
@@ -1416,6 +1439,100 @@
  if len(self.model_update_tmp) == self.server_args.dp_size:
  self.model_update_result.set_result(self.model_update_tmp)

+ async def score_request(
+ self,
+ query: Optional[Union[str, List[int]]] = None,
+ items: Optional[Union[str, List[str], List[List[int]]]] = None,
+ label_token_ids: Optional[List[int]] = None,
+ apply_softmax: bool = False,
+ item_first: bool = False,
+ request: Optional[Any] = None,
+ ) -> List[List[float]]:
+ """
+ See Engine.score() for more details.
+ """
+ if label_token_ids is None:
+ raise ValueError("label_token_ids must be provided")
+
+ if self.tokenizer is not None:
+ vocab_size = self.tokenizer.vocab_size
+ for token_id in label_token_ids:
+ if token_id >= vocab_size:
+ raise ValueError(
+ f"Token ID {token_id} is out of vocabulary (vocab size: {vocab_size})"
+ )
+
+ # Handle string or tokenized query/items
+ if isinstance(query, str) and (
+ isinstance(items, str)
+ or (isinstance(items, list) and (not items or isinstance(items[0], str)))
+ ):
+ # Both query and items are text
+ items_list = [items] if isinstance(items, str) else items
+ if item_first:
+ prompts = [f"{item}{query}" for item in items_list]
+ else:
+ prompts = [f"{query}{item}" for item in items_list]
+ batch_request = GenerateReqInput(
+ text=prompts,
+ return_logprob=True,
+ token_ids_logprob=label_token_ids,
+ stream=False,
+ sampling_params={"max_new_tokens": 1},
+ )
+ elif (
+ isinstance(query, list)
+ and isinstance(items, list)
+ and items
+ and isinstance(items[0], list)
+ ):
+ # Both query and items are token IDs
+ if item_first:
+ input_ids_list = [item + query for item in items]
+ else:
+ input_ids_list = [query + item for item in items]
+ batch_request = GenerateReqInput(
+ input_ids=input_ids_list,
+ return_logprob=True,
+ token_ids_logprob=label_token_ids,
+ stream=False,
+ sampling_params={"max_new_tokens": 1},
+ )
+ else:
+ raise ValueError(
+ "Invalid combination of query/items types for score_request."
+ )
+
+ results = await self.generate_request(batch_request, request).__anext__()
+ scores = []
+
+ for result in results:
+ # Get logprobs for each token
+ logprobs = {}
+ for logprob, token_id, _ in result["meta_info"].get(
+ "output_token_ids_logprobs", []
+ )[0]:
+ if token_id in label_token_ids:
+ logprobs[token_id] = logprob
+
+ # Get scores in order of label_token_ids
+ score_list = [
+ logprobs.get(token_id, float("-inf")) for token_id in label_token_ids
+ ]
+
+ # Apply softmax to logprobs if needed
+ if apply_softmax:
+ score_list = torch.softmax(torch.tensor(score_list), dim=0).tolist()
+ else:
+ # Convert logprobs to probabilities if not using softmax
+ score_list = [
+ math.exp(x) if x != float("-inf") else 0.0 for x in score_list
+ ]
+
+ scores.append(score_list)
+
+ return scores
+

  async def print_exception_wrapper(func):
  """
sglang/srt/managers/utils.py

@@ -35,10 +35,6 @@ def validate_input_length(
  f"the maximum allowed length ({max_req_input_len} tokens). "
  f"Use a shorter input or enable --allow-auto-truncate."
  )
- logger.error(error_msg)
- req.finished_reason = FINISH_ABORT(
- error_msg, HTTPStatus.BAD_REQUEST, "BadRequestError"
- )
  return error_msg

  return None
sglang/srt/metrics/collector.py

@@ -402,6 +402,12 @@ class TokenizerMetricsCollector:
  labelnames=labels.keys(),
  )

+ self.num_aborted_requests_total = Counter(
+ name="sglang:num_aborted_requests",
+ documentation="Number of requests aborted.",
+ labelnames=labels.keys(),
+ )
+
  if bucket_time_to_first_token is None:
  bucket_time_to_first_token = [
  0.1,
@@ -533,3 +539,6 @@ class TokenizerMetricsCollector:
  if adjusted_interval <= bound:
  his._buckets[i].inc(num_new_tokens)
  break
+
+ def observe_one_aborted_request(self):
+ self.num_aborted_requests_total.labels(**self.labels).inc(1)
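
Note: the abort counter added above follows the same prometheus_client pattern as the collector's existing metrics: a labelled Counter that the TokenizerManager increments whenever it aborts a request (see the AbortReq hunk earlier). A minimal standalone sketch of registering, incrementing, and exposing such a counter; the label values and port below are illustrative choices, not sglang defaults:

```python
import time

from prometheus_client import Counter, start_http_server

# Label set identifying this server instance (illustrative values).
labels = {"model_name": "example-model"}

num_aborted_requests_total = Counter(
    name="sglang:num_aborted_requests",
    documentation="Number of requests aborted.",
    labelnames=labels.keys(),
)


def observe_one_aborted_request() -> None:
    # Same increment pattern as the collector method above.
    num_aborted_requests_total.labels(**labels).inc(1)


if __name__ == "__main__":
    start_http_server(8000)  # metrics served at http://localhost:8000/metrics
    for _ in range(3):
        observe_one_aborted_request()
    time.sleep(60)  # keep the process alive long enough to scrape
```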
sglang/srt/model_executor/cuda_graph_runner.py

@@ -28,7 +28,6 @@ from sglang.srt.custom_op import CustomOp
  from sglang.srt.distributed import get_tensor_model_parallel_rank
  from sglang.srt.distributed.parallel_state import GroupCoordinator, graph_capture
  from sglang.srt.layers.logits_processor import LogitsProcessorOutput
- from sglang.srt.layers.moe.fused_moe_native import fused_moe_forward_native
  from sglang.srt.layers.torchao_utils import save_gemlite_cache
  from sglang.srt.managers.schedule_batch import global_server_args_dict
  from sglang.srt.model_executor.forward_batch_info import (
@@ -36,8 +35,10 @@ from sglang.srt.model_executor.forward_batch_info import (
  ForwardBatch,
  ForwardMode,
  PPProxyTensors,
+ enable_num_token_non_padded,
  )
  from sglang.srt.patch_torch import monkey_patch_torch_compile
+ from sglang.srt.two_batch_overlap import TboCudaGraphRunnerPlugin
  from sglang.srt.utils import (
  get_available_gpu_memory,
  get_device_memory_capacity,
@@ -55,22 +56,23 @@ def get_is_capture_mode():
  return is_capture_mode


+ @contextmanager
+ def model_capture_mode():
+ global is_capture_mode
+ is_capture_mode = True
+
+ yield
+
+ is_capture_mode = False
+
+
  def _to_torch(model: torch.nn.Module, reverse: bool, num_tokens: int):
  for sub in model._modules.values():
  if isinstance(sub, CustomOp):
  if reverse:
- sub._forward_method = sub.forward_cuda
- setattr(sub, "is_torch_compile", False)
+ sub.leave_torch_compile()
  else:
- # NOTE: Temporarily workaround MoE
- if "FusedMoE" in sub.__class__.__name__:
- if num_tokens == 1:
- # The performance of torch.compile on this layer is not always good when bs > 1,
- # so we decide to only use torch.compile when bs =1
- sub._forward_method = fused_moe_forward_native
- else:
- sub._forward_method = sub.forward_native
- setattr(sub, "is_torch_compile", True)
+ sub.enter_torch_compile(num_tokens=num_tokens)
  if isinstance(sub, torch.nn.Module):
  _to_torch(sub, reverse, num_tokens)

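
Note: the _to_torch() rewrite above moves the per-op switching behind new CustomOp hooks (enter_torch_compile / leave_torch_compile; sglang/srt/custom_op.py changes by +38 -3 in this release). The actual implementation is not shown in this diff; the removed branch only indicates what the hooks must accomplish, so the following is a hypothetical sketch reconstructed from that removed logic, not the real custom_op.py code:

```python
import torch


class CustomOp(torch.nn.Module):
    """Hypothetical sketch of the hooks _to_torch() now calls."""

    def __init__(self):
        super().__init__()
        # Default dispatch target outside of torch.compile capture.
        self._forward_method = self.forward_cuda
        self.is_torch_compile = False

    def forward(self, *args, **kwargs):
        return self._forward_method(*args, **kwargs)

    def forward_native(self, *args, **kwargs):
        raise NotImplementedError

    def forward_cuda(self, *args, **kwargs):
        raise NotImplementedError

    def enter_torch_compile(self, num_tokens: int):
        # Route to a torch.compile-friendly implementation; the removed code
        # additionally special-cased FusedMoE so the fused native path was
        # only used when num_tokens == 1.
        self._forward_method = self.forward_native
        self.is_torch_compile = True

    def leave_torch_compile(self):
        self._forward_method = self.forward_cuda
        self.is_torch_compile = False
```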
@@ -131,26 +133,32 @@ def get_batch_sizes_to_capture(model_runner: ModelRunner):
  if capture_bs is None:
  if server_args.speculative_algorithm is None:
  if server_args.disable_cuda_graph_padding:
- capture_bs = list(range(1, 33)) + list(range(40, 161, 16))
+ capture_bs = list(range(1, 33)) + list(range(48, 161, 16))
  else:
  capture_bs = [1, 2, 4, 8] + list(range(16, 161, 8))
  else:
  # Since speculative decoding requires more cuda graph memory, we
  # capture less.
  capture_bs = (
- list(range(1, 9)) + list(range(10, 33, 2)) + list(range(40, 161, 16))
+ list(range(1, 9))
+ + list(range(10, 33, 2))
+ + list(range(40, 64, 8))
+ + list(range(80, 161, 16))
  )

  gpu_mem = get_device_memory_capacity()
  if gpu_mem is not None and gpu_mem > 96 * 1024:
  capture_bs += list(range(160, 257, 8))
+ if gpu_mem is not None and gpu_mem > 180 * 1000:
+ capture_bs += list(range(256, 513, 16))

  if max(capture_bs) > model_runner.req_to_token_pool.size:
- # In some case (e.g., with a small GPU or --max-running-requests), the #max-running-requests
+ # In some cases (e.g., with a small GPU or --max-running-requests), the #max-running-requests
  # is very small. We add more values here to make sure we capture the maximum bs.
- capture_bs += [model_runner.req_to_token_pool.size - 1] + [
- model_runner.req_to_token_pool.size
- ]
+ capture_bs += [model_runner.req_to_token_pool.size]
+
+ if server_args.enable_two_batch_overlap:
+ capture_bs = [bs for bs in capture_bs if bs >= 2]

  if server_args.cuda_graph_max_bs:
  capture_bs = [bs for bs in capture_bs if bs <= server_args.cuda_graph_max_bs]
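
Note: to make the new capture schedule concrete, the sketch below restates the batch-size arithmetic from the hunk above as a standalone function, including the large-GPU extensions (gpu_mem > 96 * 1024 and > 180 * 1000, in whatever unit get_device_memory_capacity() reports) and the two-batch-overlap filter. It only mirrors the list construction; the flag and parameter names are simplified stand-ins for the real server args:

```python
from typing import List, Optional


def capture_batch_sizes(
    speculative: bool = False,
    disable_cuda_graph_padding: bool = False,
    gpu_mem: Optional[int] = None,
    enable_two_batch_overlap: bool = False,
) -> List[int]:
    # Mirrors the capture_bs construction in get_batch_sizes_to_capture().
    if not speculative:
        if disable_cuda_graph_padding:
            bs = list(range(1, 33)) + list(range(48, 161, 16))
        else:
            bs = [1, 2, 4, 8] + list(range(16, 161, 8))
    else:
        # Speculative decoding captures fewer, but now denser, batch sizes.
        bs = (
            list(range(1, 9))
            + list(range(10, 33, 2))
            + list(range(40, 64, 8))
            + list(range(80, 161, 16))
        )

    if gpu_mem is not None and gpu_mem > 96 * 1024:
        bs += list(range(160, 257, 8))
    if gpu_mem is not None and gpu_mem > 180 * 1000:
        bs += list(range(256, 513, 16))

    if enable_two_batch_overlap:
        bs = [b for b in bs if b >= 2]

    return sorted(set(bs))


print(capture_batch_sizes())                  # default decode schedule
print(capture_batch_sizes(speculative=True))  # speculative schedule
```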
@@ -160,7 +168,7 @@ def get_batch_sizes_to_capture(model_runner: ModelRunner):
     )
     capture_bs = [bs for bs in capture_bs if bs <= model_runner.req_to_token_pool.size]
     capture_bs = list(sorted(set(capture_bs)))
-    assert len(capture_bs) > 0 and capture_bs[0] > 0
+    assert len(capture_bs) > 0 and capture_bs[0] > 0, f"{capture_bs=}"
     compile_bs = (
         [bs for bs in capture_bs if bs <= server_args.torch_compile_max_bs]
         if server_args.enable_torch_compile
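
The strengthened assertion uses the f-string `=` specifier (Python 3.8+), so a failure now reports the offending list instead of a bare AssertionError:

# The `=` specifier expands to "name=value" in the assertion message.
capture_bs = []
try:
    assert len(capture_bs) > 0 and capture_bs[0] > 0, f"{capture_bs=}"
except AssertionError as e:
    print(e)  # capture_bs=[]
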
@@ -195,6 +203,9 @@ class CudaGraphRunner:
         self.is_encoder_decoder = model_runner.model_config.is_encoder_decoder
         self.enable_dp_attention = model_runner.server_args.enable_dp_attention
         self.enable_sp_layernorm = model_runner.server_args.enable_sp_layernorm
+        self.enable_two_batch_overlap = (
+            model_runner.server_args.enable_two_batch_overlap
+        )
         self.speculative_algorithm = model_runner.server_args.speculative_algorithm
         self.tp_size = model_runner.server_args.tp_size
         self.dp_size = model_runner.server_args.dp_size
@@ -248,6 +259,7 @@ class CudaGraphRunner:
         self.positions = torch.zeros((self.max_num_token,), dtype=torch.int64)
         self.mrope_positions = torch.zeros((3, self.max_bs), dtype=torch.int64)
         self.num_token_non_padded = torch.zeros((1,), dtype=torch.int32)
+        self.tbo_plugin = TboCudaGraphRunnerPlugin()

         # pipeline parallelism
         if self.pp_size > 1:
@@ -263,23 +275,8 @@ class CudaGraphRunner:
             }

         # Speculative_inference
-        if (
-            model_runner.spec_algorithm.is_eagle3()
-            and not model_runner.is_draft_worker
-        ):
-            self.hidden_states = torch.zeros(
-                (
-                    self.max_num_token,
-                    3 * self.model_runner.model_config.hidden_size,
-                ),
-                dtype=self.model_runner.dtype,
-            )
+        if model_runner.spec_algorithm.is_eagle3():
             self.model_runner.model.set_eagle3_layers_to_capture()
-        elif model_runner.spec_algorithm.is_eagle():
-            self.hidden_states = torch.zeros(
-                (self.max_num_token, self.model_runner.model_config.hidden_size),
-                dtype=self.model_runner.dtype,
-            )

         if self.is_encoder_decoder:
             # NOTE: encoder_lens can influence the full_text_row_masked_out_mask tensor when doing mixed batch
@@ -288,6 +285,7 @@ class CudaGraphRunner:
             )
         else:
             self.encoder_lens = None
+
         if self.enable_dp_attention or self.enable_sp_layernorm:
             # TODO(ch-wan): SP layernorm should use a different logic to manage gathered_buffer
             self.gathered_buffer = torch.zeros(
@@ -303,28 +301,13 @@

         # Capture
         try:
-            with self.model_capture_mode():
+            with model_capture_mode():
                 self.capture()
         except RuntimeError as e:
             raise Exception(
-                f"Capture CUDA graph failed: {e}\n"
-                "Possible solutions:\n"
-                "1. set --mem-fraction-static to a smaller value (e.g., 0.8 or 0.7)\n"
-                "2. set --cuda-graph-max-bs to a smaller value (e.g., 16)\n"
-                "3. disable torch compile by not using --enable-torch-compile\n"
-                "4. disable CUDA graph by --disable-cuda-graph. (Not recommended. Huge performance loss)\n"
-                "Open an issue on GitHub https://github.com/sgl-project/sglang/issues/new/choose \n"
+                f"Capture cuda graph failed: {e}\n{CUDA_GRAPH_CAPTURE_FAILED_MSG}"
             )

-    @contextmanager
-    def model_capture_mode(self):
-        global is_capture_mode
-        is_capture_mode = True
-
-        yield
-
-        is_capture_mode = False
-
     def can_run(self, forward_batch: ForwardBatch):
         if self.enable_dp_attention or self.enable_sp_layernorm:
             total_global_tokens = sum(forward_batch.global_num_tokens_cpu)
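
The long remediation text is factored out into the module-level CUDA_GRAPH_CAPTURE_FAILED_MSG constant defined in the last hunk of this file, and the handler appends it to the re-raised error. A standalone sketch of the same raise-with-hint pattern, with a stand-in capture() that always fails:

# The constant text is copied from the last hunk of this diff; capture() is a stand-in.
CUDA_GRAPH_CAPTURE_FAILED_MSG = (
    "Possible solutions:\n"
    "1. set --mem-fraction-static to a smaller value (e.g., 0.8 or 0.7)\n"
    "2. set --cuda-graph-max-bs to a smaller value (e.g., 16)\n"
    "3. disable torch compile by not using --enable-torch-compile\n"
    "4. disable CUDA graph by --disable-cuda-graph. (Not recommended. Huge performance loss)\n"
    "Open an issue on GitHub https://github.com/sgl-project/sglang/issues/new/choose \n"
)


def capture():
    raise RuntimeError("out of memory")  # stand-in for a failed graph capture


try:
    capture()
except RuntimeError as e:
    print(f"Capture cuda graph failed: {e}\n{CUDA_GRAPH_CAPTURE_FAILED_MSG}")
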
@@ -349,7 +332,12 @@ class CudaGraphRunner:
             if self.is_encoder_decoder
             else True
         )
-        return is_bs_supported and is_encoder_lens_supported
+
+        is_tbo_supported = (
+            forward_batch.can_run_tbo if self.enable_two_batch_overlap else True
+        )
+
+        return is_bs_supported and is_encoder_lens_supported and is_tbo_supported

     def capture(self):
         with graph_capture() as graph_capture_context:
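
can_run() now also requires forward_batch.can_run_tbo whenever two-batch overlap is enabled, on top of the existing batch-size and encoder-lens checks. A minimal sketch of the extended eligibility test, with made-up types and names:

# Each feature contributes a predicate; replay is allowed only if all of them hold.
from dataclasses import dataclass


@dataclass
class FakeBatch:
    batch_size: int
    can_run_tbo: bool


def can_run(batch: FakeBatch, graphed_bs: set[int], enable_two_batch_overlap: bool) -> bool:
    is_bs_supported = batch.batch_size in graphed_bs
    is_tbo_supported = batch.can_run_tbo if enable_two_batch_overlap else True
    return is_bs_supported and is_tbo_supported


print(can_run(FakeBatch(8, False), {1, 2, 4, 8, 16}, enable_two_batch_overlap=False))  # True
print(can_run(FakeBatch(8, False), {1, 2, 4, 8, 16}, enable_two_batch_overlap=True))   # False
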
@@ -436,6 +424,7 @@ class CudaGraphRunner:
         self.capture_hidden_mode = (
             spec_info.capture_hidden_mode if spec_info else CaptureHiddenMode.NULL
         )
+
         if self.model_runner.server_args.lora_paths is not None:
             # Currently, if the lora_path in `lora_paths` is None, the lora backend will use a
             # different logic to handle lora, so we need to set `lora_paths` to a list of non-None
@@ -464,9 +453,11 @@ class CudaGraphRunner:
             spec_algorithm=self.model_runner.spec_algorithm,
             spec_info=spec_info,
             capture_hidden_mode=self.capture_hidden_mode,
-            lora_paths=lora_paths,
             num_token_non_padded=self.num_token_non_padded,
+            global_forward_mode=self.capture_forward_mode,
+            lora_paths=lora_paths,
         )
+        self.tbo_plugin.capture_one_batch_size(forward_batch, num_tokens=num_tokens)

         if lora_paths is not None:
             self.model_runner.lora_manager.prepare_lora_batch(forward_batch)
@@ -492,7 +483,9 @@ class CudaGraphRunner:
                 self.pp_size > 1
                 and "pp_proxy_tensors" in inspect.signature(forward).parameters
             ):
-                kwargs["pp_proxy_tensors"] = pp_proxy_tensors
+                kwargs["pp_proxy_tensors"] = PPProxyTensors(
+                    {k: v.clone() for k, v in pp_proxy_tensors.tensors.items()}
+                )

             logits_output_or_pp_proxy_tensors = forward(
                 input_ids,
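
During capture, the pipeline-parallel proxy tensors are now wrapped in a fresh PPProxyTensors whose tensors are cloned rather than passed by reference. A plausible reason (my reading, not stated in the diff) is that cloning breaks aliasing with the caller's buffers, which matters once a CUDA graph bakes in the storage it reads. A plain-torch sketch, on CPU for brevity:

# A cloned dict of tensors no longer aliases the originals, so later writes to
# the originals do not change what the cloned copy holds.
import torch

original = {"hidden_states": torch.zeros(4), "residual": torch.zeros(4)}

aliased = dict(original)                              # same underlying storage
cloned = {k: v.clone() for k, v in original.items()}  # new storage

original["hidden_states"].fill_(1.0)

print(aliased["hidden_states"])  # tensor([1., 1., 1., 1.]) - follows the original
print(cloned["hidden_states"])   # tensor([0., 0., 0., 0.]) - unaffected
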
@@ -561,7 +554,7 @@ class CudaGraphRunner:
         self.seq_lens[:raw_bs].copy_(forward_batch.seq_lens)
         self.out_cache_loc[:raw_num_token].copy_(forward_batch.out_cache_loc)
         self.positions[:raw_num_token].copy_(forward_batch.positions)
-        self.num_token_non_padded[...] = len(forward_batch.input_ids)
+
         if forward_batch.seq_lens_cpu is not None:
             if bs != raw_bs:
                 self.seq_lens_cpu.fill_(1)
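
These replay-time writes follow the usual CUDA-graph discipline: the graph was captured against fixed buffers, so fresh inputs are copied into those buffers in place (slice assignment, copy_) instead of rebinding names; the unconditional num_token_non_padded write removed here moves behind a feature check in the next hunk. A small sketch of the pattern with made-up shapes:

# New inputs must be written into the static buffers in place; assigning a new
# tensor to the Python name would leave the graph reading the old buffer.
import torch

max_bs = 8
seq_lens = torch.zeros(max_bs, dtype=torch.int32)   # static buffer baked into the graph
positions = torch.zeros(max_bs, dtype=torch.int64)


def replay_prepare(batch_seq_lens: torch.Tensor, batch_positions: torch.Tensor):
    raw_bs = batch_seq_lens.numel()
    seq_lens[:raw_bs].copy_(batch_seq_lens)   # in-place: the graph sees the new values
    positions[:raw_bs].copy_(batch_positions)


replay_prepare(torch.tensor([3, 5], dtype=torch.int32), torch.tensor([2, 4]))
print(seq_lens)  # tensor([3, 5, 0, 0, 0, 0, 0, 0], dtype=torch.int32)
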
@@ -578,9 +571,14 @@ class CudaGraphRunner:
             self.mrope_positions[:, :raw_bs].copy_(forward_batch.mrope_positions)
         if self.enable_dp_attention or self.enable_sp_layernorm:
             self.global_num_tokens_gpu.copy_(forward_batch.global_num_tokens_gpu)
-
-        if hasattr(forward_batch.spec_info, "hidden_states"):
-            self.hidden_states[:raw_num_token] = forward_batch.spec_info.hidden_states
+        if enable_num_token_non_padded(self.model_runner.server_args):
+            self.num_token_non_padded.copy_(forward_batch.num_token_non_padded)
+        if self.enable_two_batch_overlap:
+            self.tbo_plugin.replay_prepare(
+                forward_mode=forward_batch.forward_mode,
+                bs=bs,
+                num_token_non_padded=len(forward_batch.input_ids),
+            )

         # Attention backend
         self.model_runner.attn_backend.init_forward_metadata_replay_cuda_graph(
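
Two-batch-overlap bookkeeping is delegated to the plugin created in __init__, with one hook at capture time (capture_one_batch_size, earlier hunk) and one right before replay (replay_prepare, this hunk). The sketch below shows only the hook shape inferred from these call sites; the real TboCudaGraphRunnerPlugin in sglang.srt.two_batch_overlap certainly does more.

# Hypothetical minimal plugin interface, inferred from the two call sites in this diff.
class NoopCudaGraphPlugin:
    def capture_one_batch_size(self, forward_batch, num_tokens: int) -> None:
        # Record whatever per-batch-size state the feature needs while the graph
        # for `num_tokens` tokens is being captured.
        pass

    def replay_prepare(self, forward_mode, bs: int, num_token_non_padded: int) -> None:
        # Refresh that state right before the padded graph of size `bs` is replayed.
        pass
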
@@ -639,7 +637,7 @@ class CudaGraphRunner:
         else:
             spec_info = EagleVerifyInput(
                 draft_token=None,
-                custom_mask=torch.zeros(
+                custom_mask=torch.ones(
                     (num_tokens * self.model_runner.model_config.context_len),
                     dtype=torch.bool,
                     device="cuda",
@@ -649,9 +647,22 @@ class CudaGraphRunner:
                 retrive_next_token=None,
                 retrive_next_sibling=None,
                 retrive_cum_len=None,
-                draft_token_num=self.model_runner.server_args.speculative_num_draft_tokens,
                 spec_steps=self.model_runner.server_args.speculative_num_steps,
+                topk=self.model_runner.server_args.speculative_eagle_topk,
+                draft_token_num=self.model_runner.server_args.speculative_num_draft_tokens,
                 capture_hidden_mode=CaptureHiddenMode.FULL,
+                seq_lens_sum=None,
+                seq_lens_cpu=None,
             )

         return spec_info
+
+
+CUDA_GRAPH_CAPTURE_FAILED_MSG = (
+    "Possible solutions:\n"
+    "1. set --mem-fraction-static to a smaller value (e.g., 0.8 or 0.7)\n"
+    "2. set --cuda-graph-max-bs to a smaller value (e.g., 16)\n"
+    "3. disable torch compile by not using --enable-torch-compile\n"
+    "4. disable CUDA graph by --disable-cuda-graph. (Not recommended. Huge performance loss)\n"
+    "Open an issue on GitHub https://github.com/sgl-project/sglang/issues/new/choose \n"
+)