sglang 0.4.6.post5__py3-none-any.whl → 0.4.7.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (359)
  1. sglang/__init__.py +2 -0
  2. sglang/api.py +7 -0
  3. sglang/bench_offline_throughput.py +10 -4
  4. sglang/bench_one_batch_server.py +67 -11
  5. sglang/bench_serving.py +86 -75
  6. sglang/lang/backend/runtime_endpoint.py +24 -1
  7. sglang/lang/interpreter.py +40 -1
  8. sglang/lang/ir.py +27 -0
  9. sglang/math_utils.py +8 -0
  10. sglang/profiler.py +167 -0
  11. sglang/srt/_custom_ops.py +34 -0
  12. sglang/srt/configs/internvl.py +8 -12
  13. sglang/srt/configs/model_config.py +33 -1
  14. sglang/srt/constrained/base_grammar_backend.py +5 -2
  15. sglang/srt/constrained/llguidance_backend.py +9 -8
  16. sglang/srt/constrained/outlines_backend.py +5 -4
  17. sglang/srt/constrained/xgrammar_backend.py +18 -18
  18. sglang/srt/conversation.py +52 -8
  19. sglang/srt/custom_op.py +38 -3
  20. sglang/srt/debug_utils.py +74 -0
  21. sglang/srt/disaggregation/base/__init__.py +1 -1
  22. sglang/srt/disaggregation/base/conn.py +25 -11
  23. sglang/srt/disaggregation/common/__init__.py +5 -0
  24. sglang/srt/disaggregation/common/conn.py +407 -0
  25. sglang/srt/disaggregation/common/utils.py +42 -0
  26. sglang/srt/disaggregation/decode.py +261 -52
  27. sglang/srt/disaggregation/fake/__init__.py +1 -1
  28. sglang/srt/disaggregation/fake/conn.py +16 -9
  29. sglang/srt/disaggregation/kv_events.py +60 -5
  30. sglang/srt/disaggregation/launch_lb.py +140 -0
  31. sglang/srt/disaggregation/mini_lb.py +29 -48
  32. sglang/srt/disaggregation/mooncake/__init__.py +1 -1
  33. sglang/srt/disaggregation/mooncake/conn.py +446 -149
  34. sglang/srt/disaggregation/mooncake/transfer_engine.py +32 -16
  35. sglang/srt/disaggregation/nixl/__init__.py +6 -1
  36. sglang/srt/disaggregation/nixl/conn.py +134 -437
  37. sglang/srt/disaggregation/prefill.py +130 -43
  38. sglang/srt/disaggregation/utils.py +127 -86
  39. sglang/srt/distributed/device_communicators/pymscclpp.py +315 -0
  40. sglang/srt/distributed/parallel_state.py +52 -5
  41. sglang/srt/entrypoints/EngineBase.py +6 -0
  42. sglang/srt/entrypoints/engine.py +116 -5
  43. sglang/srt/entrypoints/http_server.py +28 -4
  44. sglang/srt/eplb_simulator/__init__.py +1 -0
  45. sglang/srt/eplb_simulator/reader.py +51 -0
  46. sglang/srt/function_call/base_format_detector.py +138 -86
  47. sglang/srt/function_call/deepseekv3_detector.py +54 -6
  48. sglang/srt/function_call/ebnf_composer.py +33 -19
  49. sglang/srt/function_call/function_call_parser.py +27 -0
  50. sglang/srt/function_call/llama32_detector.py +33 -14
  51. sglang/srt/function_call/mistral_detector.py +73 -26
  52. sglang/srt/function_call/pythonic_detector.py +86 -20
  53. sglang/srt/function_call/qwen25_detector.py +64 -10
  54. sglang/srt/function_call/utils.py +17 -0
  55. sglang/srt/hf_transformers_utils.py +4 -0
  56. sglang/srt/layers/activation.py +19 -0
  57. sglang/srt/layers/attention/aiter_backend.py +503 -125
  58. sglang/srt/layers/attention/base_attn_backend.py +4 -0
  59. sglang/srt/layers/attention/cutlass_mla_backend.py +40 -34
  60. sglang/srt/layers/attention/flashattention_backend.py +137 -63
  61. sglang/srt/layers/attention/flashinfer_backend.py +46 -3
  62. sglang/srt/layers/attention/flashinfer_mla_backend.py +59 -25
  63. sglang/srt/layers/attention/flashmla_backend.py +2 -10
  64. sglang/srt/layers/attention/intel_amx_backend.py +128 -0
  65. sglang/srt/layers/attention/tbo_backend.py +232 -0
  66. sglang/srt/layers/attention/torch_native_backend.py +3 -0
  67. sglang/srt/layers/attention/triton_backend.py +304 -65
  68. sglang/srt/layers/attention/triton_ops/decode_attention.py +2 -7
  69. sglang/srt/layers/attention/triton_ops/extend_attention.py +12 -4
  70. sglang/srt/layers/attention/vision.py +51 -24
  71. sglang/srt/layers/communicator.py +281 -197
  72. sglang/srt/layers/dp_attention.py +6 -5
  73. sglang/srt/layers/layernorm.py +30 -19
  74. sglang/srt/layers/linear.py +0 -4
  75. sglang/srt/layers/logits_processor.py +0 -12
  76. sglang/srt/layers/moe/cutlass_moe.py +170 -7
  77. sglang/srt/layers/moe/cutlass_moe_params.py +169 -0
  78. sglang/srt/layers/moe/ep_moe/kernels.py +33 -11
  79. sglang/srt/layers/moe/ep_moe/layer.py +136 -72
  80. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +24 -45
  81. sglang/srt/layers/moe/fused_moe_native.py +4 -0
  82. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  83. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  84. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  85. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  86. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  87. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  88. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  89. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  90. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +221 -29
  91. sglang/srt/layers/moe/fused_moe_triton/layer.py +34 -4
  92. sglang/srt/layers/moe/topk.py +60 -26
  93. sglang/srt/layers/multimodal.py +3 -3
  94. sglang/srt/layers/pooler.py +56 -0
  95. sglang/srt/layers/quantization/__init__.py +3 -2
  96. sglang/srt/layers/quantization/blockwise_int8.py +3 -0
  97. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +5 -0
  98. sglang/srt/layers/quantization/deep_gemm_wrapper/__init__.py +1 -0
  99. sglang/srt/layers/quantization/{deep_gemm.py → deep_gemm_wrapper/compile_utils.py} +69 -127
  100. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +32 -0
  101. sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +110 -0
  102. sglang/srt/layers/quantization/fp8.py +28 -23
  103. sglang/srt/layers/quantization/fp8_kernel.py +156 -75
  104. sglang/srt/layers/quantization/fp8_utils.py +250 -69
  105. sglang/srt/layers/quantization/modelopt_quant.py +334 -7
  106. sglang/srt/layers/quantization/moe_wna16.py +3 -0
  107. sglang/srt/layers/quantization/w8a8_fp8.py +3 -0
  108. sglang/srt/layers/quantization/w8a8_int8.py +3 -0
  109. sglang/srt/layers/radix_attention.py +2 -3
  110. sglang/srt/layers/rotary_embedding.py +6 -12
  111. sglang/srt/layers/sampler.py +80 -79
  112. sglang/srt/layers/utils.py +6 -0
  113. sglang/srt/lora/layers.py +12 -15
  114. sglang/srt/lora/lora.py +49 -5
  115. sglang/srt/lora/lora_manager.py +98 -39
  116. sglang/srt/lora/mem_pool.py +28 -21
  117. sglang/srt/lora/utils.py +17 -13
  118. sglang/srt/managers/cache_controller.py +2 -1
  119. sglang/srt/managers/data_parallel_controller.py +13 -5
  120. sglang/srt/managers/eplb_algorithms/__init__.py +63 -0
  121. sglang/srt/managers/eplb_algorithms/deepseek.py +223 -0
  122. sglang/srt/managers/{deepseek_eplb.py → eplb_algorithms/deepseek_vec.py} +5 -7
  123. sglang/srt/managers/eplb_manager.py +55 -14
  124. sglang/srt/managers/expert_distribution.py +220 -46
  125. sglang/srt/managers/expert_location.py +110 -56
  126. sglang/srt/managers/expert_location_dispatch.py +23 -6
  127. sglang/srt/managers/io_struct.py +43 -8
  128. sglang/srt/managers/mm_utils.py +88 -38
  129. sglang/srt/managers/multimodal_processors/base_processor.py +190 -18
  130. sglang/srt/managers/multimodal_processors/gemma3.py +4 -31
  131. sglang/srt/managers/multimodal_processors/internvl.py +4 -0
  132. sglang/srt/managers/multimodal_processors/kimi_vl.py +15 -34
  133. sglang/srt/managers/multimodal_processors/minicpm.py +2 -1
  134. sglang/srt/managers/multimodal_processors/phi4mm.py +87 -0
  135. sglang/srt/managers/multimodal_processors/qwen_vl.py +22 -64
  136. sglang/srt/managers/multimodal_processors/vila.py +85 -0
  137. sglang/srt/managers/schedule_batch.py +173 -38
  138. sglang/srt/managers/scheduler.py +376 -127
  139. sglang/srt/managers/tokenizer_manager.py +163 -19
  140. sglang/srt/managers/utils.py +0 -4
  141. sglang/srt/mem_cache/chunk_cache.py +1 -0
  142. sglang/srt/mem_cache/hiradix_cache.py +4 -2
  143. sglang/srt/mem_cache/memory_pool.py +111 -407
  144. sglang/srt/mem_cache/memory_pool_host.py +380 -0
  145. sglang/srt/mem_cache/radix_cache.py +36 -12
  146. sglang/srt/metrics/collector.py +9 -0
  147. sglang/srt/model_executor/cuda_graph_runner.py +191 -113
  148. sglang/srt/model_executor/expert_location_updater.py +157 -22
  149. sglang/srt/model_executor/forward_batch_info.py +52 -22
  150. sglang/srt/model_executor/model_runner.py +102 -62
  151. sglang/srt/model_loader/loader.py +8 -1
  152. sglang/srt/model_loader/utils.py +67 -1
  153. sglang/srt/models/bert.py +113 -13
  154. sglang/srt/models/deepseek_nextn.py +1 -1
  155. sglang/srt/models/deepseek_v2.py +623 -290
  156. sglang/srt/models/gemma3_causal.py +7 -0
  157. sglang/srt/models/gemma3_mm.py +19 -14
  158. sglang/srt/models/idefics2.py +342 -0
  159. sglang/srt/models/internvl.py +46 -102
  160. sglang/srt/models/kimi_vl.py +4 -4
  161. sglang/srt/models/llama.py +1 -1
  162. sglang/srt/models/minicpmo.py +2 -5
  163. sglang/srt/models/minicpmv.py +3 -295
  164. sglang/srt/models/phi4mm.py +512 -0
  165. sglang/srt/models/qwen2.py +38 -9
  166. sglang/srt/models/qwen2_5_vl.py +3 -9
  167. sglang/srt/models/qwen2_eagle.py +4 -1
  168. sglang/srt/models/qwen2_moe.py +58 -191
  169. sglang/srt/models/qwen2_vl.py +3 -9
  170. sglang/srt/models/qwen3.py +41 -10
  171. sglang/srt/models/qwen3_moe.py +230 -191
  172. sglang/srt/models/registry.py +9 -1
  173. sglang/srt/models/roberta.py +117 -9
  174. sglang/srt/models/transformers.py +291 -0
  175. sglang/srt/models/vila.py +305 -0
  176. sglang/srt/openai_api/adapter.py +248 -28
  177. sglang/srt/openai_api/protocol.py +68 -3
  178. sglang/srt/openai_api/utils.py +172 -0
  179. sglang/srt/operations.py +37 -2
  180. sglang/srt/operations_strategy.py +200 -24
  181. sglang/srt/sampling/sampling_batch_info.py +37 -1
  182. sglang/srt/sampling/sampling_params.py +4 -1
  183. sglang/srt/server_args.py +381 -209
  184. sglang/srt/speculative/build_eagle_tree.py +9 -9
  185. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +12 -14
  186. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +256 -0
  187. sglang/srt/speculative/eagle_utils.py +440 -200
  188. sglang/srt/speculative/eagle_worker.py +234 -63
  189. sglang/srt/two_batch_overlap.py +637 -0
  190. sglang/srt/utils.py +187 -7
  191. sglang/test/attention/test_prefix_chunk_info.py +2 -0
  192. sglang/test/runners.py +54 -10
  193. sglang/test/send_one.py +4 -0
  194. sglang/test/test_block_fp8.py +1 -0
  195. sglang/test/test_block_fp8_deep_gemm_blackwell.py +252 -0
  196. sglang/test/test_block_fp8_ep.py +1 -0
  197. sglang/test/test_cutlass_moe.py +3 -3
  198. sglang/test/test_fp4_moe.py +248 -0
  199. sglang/test/test_utils.py +82 -7
  200. sglang/utils.py +9 -0
  201. sglang/version.py +1 -1
  202. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/METADATA +17 -14
  203. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/RECORD +359 -321
  204. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/WHEEL +1 -1
  205. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  206. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  207. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  208. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  209. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  210. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  211. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  212. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  213. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  214. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  215. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  216. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  217. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  218. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H200.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json} +0 -0
  219. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  220. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  221. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  222. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  223. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  224. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  225. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  226. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  227. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  228. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  229. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  230. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  231. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  232. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  233. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  234. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  235. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  236. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  237. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  238. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  239. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  240. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  241. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  242. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  243. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  244. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  245. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  246. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  247. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json} +0 -0
  248. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  249. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  250. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  251. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  252. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  253. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  254. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  255. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  256. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  257. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json} +0 -0
  258. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json} +0 -0
  259. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  260. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  261. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  262. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  263. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  264. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  265. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json} +0 -0
  266. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  267. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  268. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json} +0 -0
  269. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  270. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  271. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  272. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json} +0 -0
  273. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  274. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  275. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  276. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
  277. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  278. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  279. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  280. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json} +0 -0
  281. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json} +0 -0
  282. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json} +0 -0
  283. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json} +0 -0
  284. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  285. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  286. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json} +0 -0
  287. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json} +0 -0
  288. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json} +0 -0
  289. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json} +0 -0
  290. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  291. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  292. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  293. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  294. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json} +0 -0
  295. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  296. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  297. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  298. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  299. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json} +0 -0
  300. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json} +0 -0
  301. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json} +0 -0
  302. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json} +0 -0
  303. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  304. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  305. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
  306. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  307. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  308. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  309. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json} +0 -0
  310. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_L40S.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json} +0 -0
  311. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
  312. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
  313. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
  314. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  315. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  316. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  317. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  318. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json} +0 -0
  319. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json} +0 -0
  320. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json} +0 -0
  321. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json} +0 -0
  322. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  323. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  324. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  325. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  326. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json} +0 -0
  327. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
  328. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
  329. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
  330. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  331. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  332. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  333. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  334. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json} +0 -0
  335. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json} +0 -0
  336. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  337. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  338. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json} +0 -0
  339. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  340. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json} +0 -0
  341. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  342. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  343. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  344. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  345. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json} +0 -0
  346. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  347. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json} +0 -0
  348. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=96,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json} +0 -0
  349. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  350. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  351. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  352. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  353. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  354. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  355. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  356. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  357. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  358. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/licenses/LICENSE +0 -0
  359. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/top_level.txt +0 -0
sglang/srt/utils.py CHANGED
@@ -17,6 +17,7 @@ import base64
 import builtins
 import ctypes
 import dataclasses
+import functools
 import importlib
 import io
 import ipaddress
@@ -25,6 +26,7 @@ import json
 import logging
 import os
 import pickle
+import platform
 import random
 import re
 import resource
@@ -44,6 +46,7 @@ from functools import lru_cache
 from importlib.metadata import PackageNotFoundError, version
 from importlib.util import find_spec
 from io import BytesIO
+from json import JSONDecodeError
 from multiprocessing.reduction import ForkingPickler
 from pathlib import Path
 from typing import (
@@ -157,6 +160,15 @@ def is_npu() -> bool:
     return hasattr(torch, "npu") and torch.npu.is_available()


+def is_cpu() -> bool:
+    machine = platform.machine().lower()
+    return (
+        machine in ("x86_64", "amd64", "i386", "i686")
+        and hasattr(torch, "cpu")
+        and torch.cpu.is_available()
+    )
+
+
 def is_flashinfer_available():
     """
     Check whether flashinfer is available.
@@ -826,6 +838,7 @@ class CustomCacheManager(FileCacheManager):


 def set_ulimit(target_soft_limit=65535):
+    # number of open files
     resource_type = resource.RLIMIT_NOFILE
     current_soft, current_hard = resource.getrlimit(resource_type)

@@ -835,6 +848,18 @@ def set_ulimit(target_soft_limit=65535):
     except ValueError as e:
         logger.warning(f"Fail to set RLIMIT_NOFILE: {e}")

+    # stack size
+    resource_type = resource.RLIMIT_STACK
+    current_soft, current_hard = resource.getrlimit(resource_type)
+    target_soft_limit_stack_size = 1024 * target_soft_limit
+    if current_soft < target_soft_limit_stack_size:
+        try:
+            resource.setrlimit(
+                resource_type, (target_soft_limit_stack_size, current_hard)
+            )
+        except ValueError as e:
+            logger.warning(f"Fail to set RLIMIT_STACK: {e}")
+

 def add_api_key_middleware(app, api_key: str):
     @app.middleware("http")
@@ -1362,6 +1387,11 @@ def print_warning_once(msg: str) -> None:
     logger.warning(msg, stacklevel=2)


+@functools.lru_cache(None)
+def print_info_once(msg: str) -> None:
+    logger.info(msg)
+
+
 def get_device_name(device_id: int = 0) -> str:
     if hasattr(torch, "cuda") and torch.cuda.is_available():
         return torch.cuda.get_device_name(device_id)
@@ -1917,16 +1947,18 @@ def next_power_of_2(n: int):
     setattr(triton, "next_power_of_2", next_power_of_2)


-@contextmanager
-def empty_context(*args, **kwargs):
-    try:
-        # Setup code goes here
-        yield
-    finally:
-        # Cleanup code goes here
+class EmptyContextManager:
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback):
         pass


+def empty_context(*args, **kwargs):
+    return EmptyContextManager()
+
+
 def add_prefix(name: str, prefix: str) -> str:
     """Add a weight path prefix to a module name.

@@ -2025,6 +2057,14 @@ class DeepEPMode(Enum):
         return DeepEPMode.normal


+def is_non_idle_and_non_empty(forward_mode, hidden_states):
+    return (
+        (forward_mode is not None)
+        and not forward_mode.is_idle()
+        and hidden_states.shape[0] > 0
+    )
+
+
 def fast_topk(values, topk, dim):
     if topk == 1:
         # Use max along the specified dimension to get both value and index
@@ -2046,6 +2086,12 @@ is_ampere_with_cuda_12_3 = lambda: _check(8)
 is_hopper_with_cuda_12_3 = lambda: _check(9)


+def is_blackwell():
+    if not is_cuda():
+        return False
+    return torch.cuda.get_device_capability()[0] == 10
+
+
 def get_free_port():
     # try ipv4
     try:
@@ -2068,6 +2114,14 @@ def get_local_ip_by_remote() -> str:
     except Exception:
         pass

+    try:
+        hostname = socket.gethostname()
+        ip = socket.gethostbyname(hostname)
+        if ip and ip != "127.0.0.1" and ip != "0.0.0.0":
+            return ip
+    except Exception:
+        pass
+
     # try ipv6
     try:
         s = socket.socket(socket.AF_INET6, socket.SOCK_DGRAM)
@@ -2160,3 +2214,129 @@ class Withable(Generic[T]):
         finally:
             assert self._value is new_value
             self._value = None
+
+
+def merge_bias_tensor(
+    lhs: Optional[torch.Tensor],
+    rhs: Optional[torch.Tensor],
+    bs1: int,
+    bs2: int,
+    device: str,
+    default: float,
+):
+    """Merge two bias tensors for batch merging.
+
+    Args:
+        lhs: Left-hand side tensor
+        rhs: Right-hand side tensor
+        bs1: Batch size of left-hand side tensor
+        bs2: Batch size of right-hand side tensor
+        device: Device to place the merged tensor on
+        default: Default value for missing tensor elements
+
+    Returns:
+        Merged tensor or None if both inputs are None
+    """
+    if lhs is None and rhs is None:
+        return None
+
+    if lhs is not None and rhs is not None:
+        return torch.cat([lhs, rhs])
+    else:
+        if lhs is not None:
+            shape, dtype = lhs.shape[1:], lhs.dtype
+        else:
+            shape, dtype = rhs.shape[1:], rhs.dtype
+
+        if lhs is None:
+            lhs = torch.empty((bs1, *shape), device=device, dtype=dtype).fill_(default)
+        if rhs is None:
+            rhs = torch.empty((bs2, *shape), device=device, dtype=dtype).fill_(default)
+        return torch.cat([lhs, rhs])
+
+
+def find_local_repo_dir(repo_id: str, revision: Optional[str] = None) -> Optional[str]:
+    import huggingface_hub as hf
+
+    # Build cache path
+    cache_path = os.path.join(
+        hf.constants.HF_HUB_CACHE,
+        hf.constants.REPO_ID_SEPARATOR.join(["models", *repo_id.split("/")]),
+    )
+
+    # Get revision from main ref if not specified
+    if not revision:
+        ref_path = os.path.join(cache_path, "refs", "main")
+        if os.path.isfile(ref_path):
+            with open(ref_path) as f:
+                revision = f.read().strip()
+
+    # List files from revision directory
+    if revision:
+        rev_dir = os.path.join(cache_path, "snapshots", revision)
+        if os.path.isdir(rev_dir):
+            return rev_dir
+
+    return None
+
+
+def read_system_prompt_from_file(model_name: str) -> str:
+    """Read system prompt from a file in the HuggingFace cache directory.
+
+    Args:
+        model_name: The model name to construct the file path
+
+    Returns:
+        The system prompt content from the file, or empty string if file not found
+    """
+    try:
+        local_repo_dir = find_local_repo_dir(model_name)
+        if local_repo_dir:
+            system_prompt_file = os.path.join(local_repo_dir, "SYSTEM_PROMPT.txt")
+            if os.path.exists(system_prompt_file):
+                with open(system_prompt_file, "r", encoding="utf-8") as f:
+                    return f.read()
+
+        return ""
+    except Exception:
+        # If anything fails, return empty string
+        return ""
+
+
+def bind_or_assign(target, source):
+    if target is not None:
+        target.copy_(source)
+        return target
+    else:
+        return source
+
+
+def support_triton(backend: str) -> bool:
+    return backend not in ["torch_native", "intel_amx"]
+
+
+try:
+    import sgl_kernel
+
+    is_intel_amx_backend_available = hasattr(
+        torch.ops.sgl_kernel, "convert_weight_packed"
+    )
+except:
+    is_intel_amx_backend_available = False
+
+
+def cpu_has_amx_support():
+    return torch._C._cpu._is_amx_tile_supported() and is_intel_amx_backend_available
+
+
+class LazyValue:
+    def __init__(self, creator: Callable):
+        self._creator = creator
+        self._value = None
+
+    @property
+    def value(self):
+        if self._creator is not None:
+            self._value = self._creator()
+            self._creator = None
+        return self._value
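
Note (not part of the diff): a minimal sketch of how the new merge_bias_tensor helper behaves when only one side carries a bias. The shapes, device, and default fill value below are chosen purely for illustration.

    import torch
    from sglang.srt.utils import merge_bias_tensor

    lhs = torch.ones(2, 4)   # batch of 2 that has a bias tensor
    rhs = None               # batch of 3 without one
    merged = merge_bias_tensor(lhs, rhs, bs1=2, bs2=3, device="cpu", default=0.0)
    print(merged.shape)      # torch.Size([5, 4])
    print(merged[2:])        # rows for the missing side are filled with 0.0
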
sglang/test/attention/test_prefix_chunk_info.py CHANGED
@@ -2,6 +2,8 @@ import unittest

 import torch

+from sglang.srt.layers.attention.flashattention_backend import FlashAttentionBackend
+from sglang.srt.layers.radix_attention import RadixAttention
 from sglang.srt.mem_cache.memory_pool import MLATokenToKVPool
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode
 from sglang.test.test_utils import CustomTestCase
sglang/test/runners.py CHANGED
@@ -26,6 +26,7 @@ from transformers import (
     AutoModelForCausalLM,
     AutoModelForVision2Seq,
     AutoProcessor,
+    GenerationConfig,
 )

 from sglang.srt.entrypoints.engine import Engine
@@ -41,6 +42,21 @@ DEFAULT_PROMPTS = [
     # the output of gemma-2-2b from SRT is unstable on the commented prompt
     # "The capital of France is",
 ]
+TEST_RERANK_QUERY_DOCS = [
+    {
+        "query": "How many people live in Berlin?",
+        "documents": [
+            "Berlin is well known for its museums.",
+        ],
+    },
+    {
+        "query": "How many people live in Berlin?",
+        "documents": [
+            "Berlin had a population of 3,520,031 registered inhabitants in an area of 891.82 square kilometers.",
+            "Berlin is well known for its museums.",
+        ],
+    },
+]

 dirpath = os.path.dirname(__file__)
 with open(os.path.join(dirpath, "long_prompt.txt"), "r") as f:
@@ -240,7 +256,7 @@ class HFRunner:
             self.model = _get_sentence_transformer_embedding_model(
                 model_path, torch_dtype
             )
-        elif self.model_type == "reward":
+        elif self.model_type == "reward" or self.model_type == "cross_encoder":
             from transformers import AutoModelForSequenceClassification

             self.model = AutoModelForSequenceClassification.from_pretrained(
@@ -302,6 +318,15 @@ class HFRunner:
                     else:
                         logits = self.model.encode(prompts).tolist()
                     out_queue.put(ModelOutput(embed_logits=logits))
+                elif self.model_type == "cross_encoder":
+                    inputs = self.tokenizer(
+                        prompts, padding=True, return_tensors="pt"
+                    ).to("cuda")
+                    scores = self.model(**inputs).logits
+                    scores = scores.squeeze().tolist()
+                    if not isinstance(scores, list):
+                        scores = [scores]
+                    out_queue.put(ModelOutput(scores=scores))

                 elif self.model_type == "reward":
                     scores = []
@@ -321,7 +346,9 @@

     def forward(
         self,
-        prompts: Union[List[str], List[torch.Tensor]] = DEFAULT_PROMPTS,
+        prompts: Union[
+            List[List[str]], List[str], List[torch.Tensor]
+        ] = DEFAULT_PROMPTS,
         image_data: Optional[List[str]] = None,
         max_new_tokens: int = 8,
         lora_paths: Optional[List[str]] = None,
@@ -382,13 +409,17 @@ class HFRunner:
             model = base_model

         outputs = model.generate(
-            input_ids,
-            do_sample=False,
-            temperature=None,
-            top_p=None,
-            max_new_tokens=max_new_tokens,
-            return_dict_in_generate=True,
-            output_scores=(not output_str_only),
+            input_ids=input_ids,
+            generation_config=GenerationConfig(
+                do_sample=False,
+                temperature=None,
+                top_p=None,
+                max_new_tokens=max_new_tokens,
+                return_dict_in_generate=True,
+                output_scores=(not output_str_only),
+                # make sure to disable compile
+                disable_compile=True,
+            ),
         )

         text = tokenizer.decode(
@@ -450,6 +481,7 @@ class SRTRunner:
         torch_dtype: torch.dtype,
         model_type: str,
         tp_size: int = 1,
+        impl: str = "auto",
         port: int = DEFAULT_PORT_FOR_SRT_TEST_RUNNER,
         lora_paths: List[str] = None,
         max_loras_per_batch: int = 4,
@@ -470,6 +502,7 @@
         speculative_num_draft_tokens: Optional[int] = None,
         disable_overlap_schedule: bool = False,
         disable_custom_all_reduce: bool = False,
+        torchao_config: Optional[str] = None,
     ):
         self.model_type = model_type
         self.is_generation = model_type == "generation"
@@ -488,6 +521,8 @@
            tp_size=tp_size,
            dtype=get_dtype_str(torch_dtype),
            port=port,
+           impl=impl,
+           torchao_config=torchao_config,
            mem_fraction_static=mem_fraction_static,
            trust_remote_code=trust_remote_code,
            is_embedding=not self.is_generation,
@@ -517,7 +552,9 @@

     def forward(
         self,
-        prompts: Union[List[str], List[torch.Tensor]] = DEFAULT_PROMPTS,
+        prompts: Union[
+            List[List[str]], List[str], List[torch.Tensor]
+        ] = DEFAULT_PROMPTS,
         image_data: Optional[List[str]] = None,
         max_new_tokens: int = 8,
         lora_paths: Optional[List[str]] = None,
@@ -543,6 +580,13 @@
                else:
                    logits = [response["embedding"]]
                return ModelOutput(embed_logits=logits)
+           # cross encoder model
+           elif self.model_type == "cross_encoder":
+               response = self.engine.rerank(prompts)
+               if not isinstance(response, list):
+                   response = [response]
+               scores = [x["embedding"] for x in response]
+               return ModelOutput(scores=scores)
            # reward model
            else:
                response = self.engine.encode(prompts)
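
Note (not part of the diff): a hedged sketch of how the new cross_encoder path in SRTRunner could be driven with the TEST_RERANK_QUERY_DOCS fixture added above. The reranker checkpoint name is an assumption for illustration; per the diff, forward() routes to engine.rerank() and returns the scores via ModelOutput.scores, one per (query, document) pair.

    import torch
    from sglang.test.runners import SRTRunner, TEST_RERANK_QUERY_DOCS

    # Flatten the fixture into [query, document] pairs, matching the new
    # List[List[str]] prompt type accepted by forward().
    pairs = [
        [item["query"], doc]
        for item in TEST_RERANK_QUERY_DOCS
        for doc in item["documents"]
    ]

    runner = SRTRunner(
        "BAAI/bge-reranker-v2-m3",  # assumed cross-encoder/reranker checkpoint
        torch_dtype=torch.float16,
        model_type="cross_encoder",
    )
    out = runner.forward(pairs)     # internally calls engine.rerank(...)
    print(out.scores)               # one relevance score per pair
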
sglang/test/send_one.py CHANGED
@@ -127,6 +127,10 @@ def send_one_prompt(args):
     if args.batch_size > 1:
         ret = ret[0]

+    if response.status_code != 200:
+        print(ret)
+        return 0, 0
+
     latency = ret["meta_info"]["e2e_latency"]

     if "spec_verify_ct" in ret["meta_info"]:
sglang/test/test_block_fp8.py CHANGED
@@ -343,6 +343,7 @@ class TestW8A8BlockFP8Matmul(CustomTestCase):
     OUT_DTYPES = [torch.bfloat16]
     M = [64, 128, 512, 1024, 4096]
     NKs = [
+        (2112, 7168),
         (1536, 7168),
         (3072, 1536),
         (24576, 7168),
@@ -0,0 +1,252 @@
+import itertools
+import os
+import unittest
+from typing import List, Tuple
+
+import torch
+from deep_gemm import fp8_gemm_nt
+
+from sglang.test.test_utils import CustomTestCase
+
+_is_cuda = torch.cuda.is_available() and torch.version.cuda
+
+
+# Modified from DeepGEMM Blackwell
+def ceil_div(x: int, y: int) -> int:
+    return (x + y - 1) // y
+
+
+def align(x: int, y: int) -> int:
+    return ceil_div(x, y) * y
+
+
+def per_token_group_quant_fp8(x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+    assert x.dim() == 2 and x.size(1) % 128 == 0
+    m, n = x.shape
+    x_view = x.view(m, -1, 128)
+    x_amax = x_view.abs().float().amax(dim=2).view(m, -1).clamp(1e-4)
+    sf = x_amax / 448.0
+    return (x_view * (1.0 / sf.unsqueeze(2))).to(torch.float8_e4m3fn).view(m, n), sf
+
+
+def per_block_quant_fp8(x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+    assert x.dim() == 2
+    m, n = x.shape
+    x_padded = torch.zeros(
+        (align(m, 128), align(n, 128)), dtype=x.dtype, device=x.device
+    )
+    x_padded[:m, :n] = x
+    x_view = x_padded.view(-1, 128, x_padded.size(1) // 128, 128)
+    x_amax = x_view.abs().float().amax(dim=(1, 3), keepdim=True).clamp(1e-4)
+    sf = x_amax / 448.0
+    x_scaled = (x_view * (1.0 / sf)).to(torch.float8_e4m3fn)
+    return x_scaled.view_as(x_padded)[:m, :n].contiguous(), sf.view(
+        x_view.size(0), x_view.size(2)
+    )
+
+
+def ceil_to_ue8m0(x: torch.Tensor):
+    assert x.view(-1).amax().item() > 0
+    return torch.pow(2.0, torch.ceil(torch.log2(x.abs())))
+
+
+def per_token_group_quant_mxfp8(x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+    assert x.dim() == 2 and x.size(1) % 128 == 0
+    m, n = x.shape
+    x_view = x.view(m, -1, 128)
+    x_amax = x_view.abs().float().amax(dim=2).view(m, -1).clamp(1e-4)
+    sf = ceil_to_ue8m0(x_amax / 448.0)
+    return (x_view * (1.0 / sf.unsqueeze(2))).to(torch.float8_e4m3fn).view(m, n), sf
+
+
+def per_block_quant_mxfp8(x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+    assert x.dim() == 2
+    m, n = x.shape
+    x_padded = torch.zeros(
+        (align(m, 128), align(n, 128)), dtype=x.dtype, device=x.device
+    )
+    x_padded[:m, :n] = x
+    x_view = x_padded.view(-1, 128, x_padded.size(1) // 128, 128)
+    x_amax = x_view.abs().float().amax(dim=(1, 3), keepdim=True).clamp(1e-4)
+    sf = ceil_to_ue8m0(x_amax / 448.0)
+    x_scaled = (x_view * (1.0 / sf)).to(torch.float8_e4m3fn)
+    return x_scaled.view_as(x_padded)[:m, :n].contiguous(), sf.view(
+        x_view.size(0), x_view.size(2)
+    )
+
+
+# For test
+def native_w8a8_block_fp8_matmul(A, B, As, Bs, block_size, output_dtype=torch.float16):
+    """This function performs matrix multiplication with block-wise quantization using native torch.
+
+    It takes two input tensors `A` and `B` with scales `As` and `Bs`.
+    The output is returned in the specified `output_dtype`.
+    """
+
+    A = A.to(torch.float32)
+    B = B.to(torch.float32)
+    assert A.shape[-1] == B.shape[-1]
+    assert B.ndim == 2 and B.is_contiguous() and Bs.ndim == 2
+    assert len(block_size) == 2
+    block_n, block_k = block_size[0], block_size[1]
+    assert (A.shape[-1] + block_k - 1) // block_k == As.shape[-1]
+    assert A.shape[:-1] == As.shape[:-1]
+
+    M = A.numel() // A.shape[-1]
+    N, K = B.shape
+    origin_C_shape = A.shape[:-1] + (N,)
+    A = A.reshape(M, A.shape[-1])
+    As = As.reshape(M, As.shape[-1])
+    n_tiles = (N + block_n - 1) // block_n
+    k_tiles = (K + block_k - 1) // block_k
+    assert n_tiles == Bs.shape[0]
+    assert k_tiles == Bs.shape[1]
+
+    C_shape = (M, N)
+    C = torch.zeros(C_shape, dtype=torch.float32, device=A.device)
+
+    A_tiles = [A[:, i * block_k : min((i + 1) * block_k, K)] for i in range(k_tiles)]
+    B_tiles = [
+        [
+            B[
+                j * block_n : min((j + 1) * block_n, N),
+                i * block_k : min((i + 1) * block_k, K),
+            ]
+            for i in range(k_tiles)
+        ]
+        for j in range(n_tiles)
+    ]
+    C_tiles = [C[:, j * block_n : min((j + 1) * block_n, N)] for j in range(n_tiles)]
+    As_tiles = [As[:, i : i + 1] for i in range(k_tiles)]
+
+    for i in range(k_tiles):
+        for j in range(n_tiles):
+            a = A_tiles[i]
+            b = B_tiles[j][i]
+            c = C_tiles[j]
+            s = As_tiles[i] * Bs[j][i]
+            c[:, :] += torch.matmul(a, b.t()) * s
+
+    C = C.reshape(origin_C_shape).to(output_dtype)
+    return C
+
+
+def block_quant_dequant(
+    x_q_block: torch.Tensor,
+    x_s: torch.Tensor,
+    block_size: List[int],
+    dtype: torch.dtype,
+) -> torch.Tensor:
+    """This function converts block-wise quantization to unquantized.
+    The inputs are block-wise quantization tensor `x_q_block`, block-wise quantization scale
+    and the block size.
+    The output is an unquantized tensor with dtype.
+    """
+    block_n, block_k = block_size[0], block_size[1]
+    n, k = x_q_block.shape
+    n_tiles = (n + block_n - 1) // block_n
+    k_tiles = (k + block_k - 1) // block_k
+    assert n_tiles == x_s.shape[0]
+    assert k_tiles == x_s.shape[1]
+
+    x_dq_block = torch.empty_like(x_q_block, dtype=dtype)
+
+    for j in range(n_tiles):
+        for i in range(k_tiles):
+            x_q_block_tile = x_q_block[
+                j * block_n : min((j + 1) * block_n, n),
+                i * block_k : min((i + 1) * block_k, k),
+            ]
+            x_dq_block_tile = x_dq_block[
+                j * block_n : min((j + 1) * block_n, n),
+                i * block_k : min((i + 1) * block_k, k),
+            ]
+            x_dq_block_tile[:, :] = x_q_block_tile.to(torch.float32) * x_s[j][i]
+
+    return x_dq_block
+
+
+class TestDeepGemmBlackwell(CustomTestCase):
+
+    if not _is_cuda:
+        OUT_DTYPES = [torch.float32, torch.half, torch.bfloat16]
+        M = [1, 7, 83, 512, 2048]
+        NKs = [
+            (N, K)
+            for N in [128, 512, 1024, 4096, 7748, 13824]
+            for K in [256, 4096, 5120, 3884, 13824]
+        ]
+        # BLOCK_SIZE = [[64, 64], [64, 128], [128, 64], [128, 128]]
+        BLOCK_SIZE = [[128, 128]]
+        SEEDS = [0]
+    else:
+        # use practical shape in DeepSeek V3 for test
+        OUT_DTYPES = [torch.bfloat16]
+        M = [64, 128, 512, 1024, 4096]
+        NKs = [
+            (2112, 7168),
+            (1536, 7168),
+            # (3072, 1536),
+            # (24576, 7168),
+            # (4096, 512),
+            # (7168, 2048),
+            # (4608, 7168),
+            # (512, 7168),
+            # (7168, 2304),
+            # (7168, 512),
+        ]
+        BLOCK_SIZE = [[128, 128]]
+        SEEDS = [0]
+
+    @classmethod
+    def setUpClass(cls):
+        if not torch.cuda.is_available():
+            raise unittest.SkipTest("CUDA is not available")
+        torch.set_default_device("cuda")
+
+    def _test_deep_gemm_blackwell(self, M, NK, block_size, out_dtype, seed):
+        N, K = NK
+        torch.manual_seed(seed)
+
+        A = torch.empty((M, K), dtype=torch.bfloat16).normal_(0, 0.2)
+        B = torch.empty((N, K), dtype=torch.bfloat16).normal_(0, 0.2)
+
+        A_q, A_s = per_token_group_quant_fp8(A)
+        B_q, B_s = per_block_quant_fp8(B)
+
+        A_dq = block_quant_dequant(A_q, A_s, [1, block_size[1]], out_dtype)
+        B_dq = block_quant_dequant(B_q, B_s, block_size, out_dtype)
+
+        A_qu = per_token_group_quant_mxfp8(A_dq)
+        B_qu = per_block_quant_mxfp8(B_dq)
+        out = None
+
+        with torch.inference_mode():
+            ref_out = native_w8a8_block_fp8_matmul(
+                A_q, B_q, A_s, B_s, block_size, out_dtype
+            )
+            out = torch.empty_like(ref_out)
+            fp8_gemm_nt(A_qu, B_qu, out)
+
+        torch.testing.assert_close(out, ref_out, atol=1e-1, rtol=1e-2)
+
+    def test_deep_gemm_blackwell(self):
+        for params in itertools.product(
+            self.M,
+            self.NKs,
+            self.BLOCK_SIZE,
+            self.OUT_DTYPES,
+            self.SEEDS,
+        ):
+            with self.subTest(
+                M=params[0],
+                NKs=params[1],
+                block_size=params[2],
+                out_dtype=params[3],
+                seed=params[4],
+            ):
+                self._test_deep_gemm_blackwell(*params)
+
+
+if __name__ == "__main__":
+    unittest.main(verbosity=2)
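To make the reference path in this new test concrete, here is a small round trip through the helpers it defines, checking the block-quantized matmul against an unquantized one. The shapes are chosen only to satisfy the 128-divisibility asserts, and the float8_e4m3fn casts assume a reasonably recent PyTorch build:

# Tiny worked example of the block-wise FP8 reference matmul defined above;
# shapes are illustrative and float8_e4m3fn support assumes a recent PyTorch.
import torch

M, N, K = 4, 128, 256                       # N and K are multiples of 128
A = torch.randn(M, K, dtype=torch.bfloat16) * 0.2
B = torch.randn(N, K, dtype=torch.bfloat16) * 0.2

A_q, A_s = per_token_group_quant_fp8(A)     # per-token 128-wide groups -> scales (M, K // 128)
B_q, B_s = per_block_quant_fp8(B)           # 128x128 blocks -> scales (N // 128, K // 128)

ref = native_w8a8_block_fp8_matmul(A_q, B_q, A_s, B_s, [128, 128], torch.float32)
exact = A.float() @ B.float().t()
print((ref - exact).abs().max())            # small, bounded by the FP8 quantization error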
@@ -84,6 +84,7 @@ def ep_moe(
         top_k,
         hidden_states.shape[1],
         BLOCK_SIZE=512,
+        use_per_token_if_dynamic=True,
     )
 
     seg_indptr_cur_rank = seg_indptr[start_expert_id : end_expert_id + 2]