sglang 0.4.6.post5__py3-none-any.whl → 0.4.7.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (359)
  1. sglang/__init__.py +2 -0
  2. sglang/api.py +7 -0
  3. sglang/bench_offline_throughput.py +10 -4
  4. sglang/bench_one_batch_server.py +67 -11
  5. sglang/bench_serving.py +86 -75
  6. sglang/lang/backend/runtime_endpoint.py +24 -1
  7. sglang/lang/interpreter.py +40 -1
  8. sglang/lang/ir.py +27 -0
  9. sglang/math_utils.py +8 -0
  10. sglang/profiler.py +167 -0
  11. sglang/srt/_custom_ops.py +34 -0
  12. sglang/srt/configs/internvl.py +8 -12
  13. sglang/srt/configs/model_config.py +33 -1
  14. sglang/srt/constrained/base_grammar_backend.py +5 -2
  15. sglang/srt/constrained/llguidance_backend.py +9 -8
  16. sglang/srt/constrained/outlines_backend.py +5 -4
  17. sglang/srt/constrained/xgrammar_backend.py +18 -18
  18. sglang/srt/conversation.py +52 -8
  19. sglang/srt/custom_op.py +38 -3
  20. sglang/srt/debug_utils.py +74 -0
  21. sglang/srt/disaggregation/base/__init__.py +1 -1
  22. sglang/srt/disaggregation/base/conn.py +25 -11
  23. sglang/srt/disaggregation/common/__init__.py +5 -0
  24. sglang/srt/disaggregation/common/conn.py +407 -0
  25. sglang/srt/disaggregation/common/utils.py +42 -0
  26. sglang/srt/disaggregation/decode.py +261 -52
  27. sglang/srt/disaggregation/fake/__init__.py +1 -1
  28. sglang/srt/disaggregation/fake/conn.py +16 -9
  29. sglang/srt/disaggregation/kv_events.py +60 -5
  30. sglang/srt/disaggregation/launch_lb.py +140 -0
  31. sglang/srt/disaggregation/mini_lb.py +29 -48
  32. sglang/srt/disaggregation/mooncake/__init__.py +1 -1
  33. sglang/srt/disaggregation/mooncake/conn.py +446 -149
  34. sglang/srt/disaggregation/mooncake/transfer_engine.py +32 -16
  35. sglang/srt/disaggregation/nixl/__init__.py +6 -1
  36. sglang/srt/disaggregation/nixl/conn.py +134 -437
  37. sglang/srt/disaggregation/prefill.py +130 -43
  38. sglang/srt/disaggregation/utils.py +127 -86
  39. sglang/srt/distributed/device_communicators/pymscclpp.py +315 -0
  40. sglang/srt/distributed/parallel_state.py +52 -5
  41. sglang/srt/entrypoints/EngineBase.py +6 -0
  42. sglang/srt/entrypoints/engine.py +116 -5
  43. sglang/srt/entrypoints/http_server.py +28 -4
  44. sglang/srt/eplb_simulator/__init__.py +1 -0
  45. sglang/srt/eplb_simulator/reader.py +51 -0
  46. sglang/srt/function_call/base_format_detector.py +138 -86
  47. sglang/srt/function_call/deepseekv3_detector.py +54 -6
  48. sglang/srt/function_call/ebnf_composer.py +33 -19
  49. sglang/srt/function_call/function_call_parser.py +27 -0
  50. sglang/srt/function_call/llama32_detector.py +33 -14
  51. sglang/srt/function_call/mistral_detector.py +73 -26
  52. sglang/srt/function_call/pythonic_detector.py +86 -20
  53. sglang/srt/function_call/qwen25_detector.py +64 -10
  54. sglang/srt/function_call/utils.py +17 -0
  55. sglang/srt/hf_transformers_utils.py +4 -0
  56. sglang/srt/layers/activation.py +19 -0
  57. sglang/srt/layers/attention/aiter_backend.py +503 -125
  58. sglang/srt/layers/attention/base_attn_backend.py +4 -0
  59. sglang/srt/layers/attention/cutlass_mla_backend.py +40 -34
  60. sglang/srt/layers/attention/flashattention_backend.py +137 -63
  61. sglang/srt/layers/attention/flashinfer_backend.py +46 -3
  62. sglang/srt/layers/attention/flashinfer_mla_backend.py +59 -25
  63. sglang/srt/layers/attention/flashmla_backend.py +2 -10
  64. sglang/srt/layers/attention/intel_amx_backend.py +128 -0
  65. sglang/srt/layers/attention/tbo_backend.py +232 -0
  66. sglang/srt/layers/attention/torch_native_backend.py +3 -0
  67. sglang/srt/layers/attention/triton_backend.py +304 -65
  68. sglang/srt/layers/attention/triton_ops/decode_attention.py +2 -7
  69. sglang/srt/layers/attention/triton_ops/extend_attention.py +12 -4
  70. sglang/srt/layers/attention/vision.py +51 -24
  71. sglang/srt/layers/communicator.py +281 -197
  72. sglang/srt/layers/dp_attention.py +6 -5
  73. sglang/srt/layers/layernorm.py +30 -19
  74. sglang/srt/layers/linear.py +0 -4
  75. sglang/srt/layers/logits_processor.py +0 -12
  76. sglang/srt/layers/moe/cutlass_moe.py +170 -7
  77. sglang/srt/layers/moe/cutlass_moe_params.py +169 -0
  78. sglang/srt/layers/moe/ep_moe/kernels.py +33 -11
  79. sglang/srt/layers/moe/ep_moe/layer.py +136 -72
  80. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +24 -45
  81. sglang/srt/layers/moe/fused_moe_native.py +4 -0
  82. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  83. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  84. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  85. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  86. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  87. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  88. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  89. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  90. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +221 -29
  91. sglang/srt/layers/moe/fused_moe_triton/layer.py +34 -4
  92. sglang/srt/layers/moe/topk.py +60 -26
  93. sglang/srt/layers/multimodal.py +3 -3
  94. sglang/srt/layers/pooler.py +56 -0
  95. sglang/srt/layers/quantization/__init__.py +3 -2
  96. sglang/srt/layers/quantization/blockwise_int8.py +3 -0
  97. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +5 -0
  98. sglang/srt/layers/quantization/deep_gemm_wrapper/__init__.py +1 -0
  99. sglang/srt/layers/quantization/{deep_gemm.py → deep_gemm_wrapper/compile_utils.py} +69 -127
  100. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +32 -0
  101. sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +110 -0
  102. sglang/srt/layers/quantization/fp8.py +28 -23
  103. sglang/srt/layers/quantization/fp8_kernel.py +156 -75
  104. sglang/srt/layers/quantization/fp8_utils.py +250 -69
  105. sglang/srt/layers/quantization/modelopt_quant.py +334 -7
  106. sglang/srt/layers/quantization/moe_wna16.py +3 -0
  107. sglang/srt/layers/quantization/w8a8_fp8.py +3 -0
  108. sglang/srt/layers/quantization/w8a8_int8.py +3 -0
  109. sglang/srt/layers/radix_attention.py +2 -3
  110. sglang/srt/layers/rotary_embedding.py +6 -12
  111. sglang/srt/layers/sampler.py +80 -79
  112. sglang/srt/layers/utils.py +6 -0
  113. sglang/srt/lora/layers.py +12 -15
  114. sglang/srt/lora/lora.py +49 -5
  115. sglang/srt/lora/lora_manager.py +98 -39
  116. sglang/srt/lora/mem_pool.py +28 -21
  117. sglang/srt/lora/utils.py +17 -13
  118. sglang/srt/managers/cache_controller.py +2 -1
  119. sglang/srt/managers/data_parallel_controller.py +13 -5
  120. sglang/srt/managers/eplb_algorithms/__init__.py +63 -0
  121. sglang/srt/managers/eplb_algorithms/deepseek.py +223 -0
  122. sglang/srt/managers/{deepseek_eplb.py → eplb_algorithms/deepseek_vec.py} +5 -7
  123. sglang/srt/managers/eplb_manager.py +55 -14
  124. sglang/srt/managers/expert_distribution.py +220 -46
  125. sglang/srt/managers/expert_location.py +110 -56
  126. sglang/srt/managers/expert_location_dispatch.py +23 -6
  127. sglang/srt/managers/io_struct.py +43 -8
  128. sglang/srt/managers/mm_utils.py +88 -38
  129. sglang/srt/managers/multimodal_processors/base_processor.py +190 -18
  130. sglang/srt/managers/multimodal_processors/gemma3.py +4 -31
  131. sglang/srt/managers/multimodal_processors/internvl.py +4 -0
  132. sglang/srt/managers/multimodal_processors/kimi_vl.py +15 -34
  133. sglang/srt/managers/multimodal_processors/minicpm.py +2 -1
  134. sglang/srt/managers/multimodal_processors/phi4mm.py +87 -0
  135. sglang/srt/managers/multimodal_processors/qwen_vl.py +22 -64
  136. sglang/srt/managers/multimodal_processors/vila.py +85 -0
  137. sglang/srt/managers/schedule_batch.py +173 -38
  138. sglang/srt/managers/scheduler.py +376 -127
  139. sglang/srt/managers/tokenizer_manager.py +163 -19
  140. sglang/srt/managers/utils.py +0 -4
  141. sglang/srt/mem_cache/chunk_cache.py +1 -0
  142. sglang/srt/mem_cache/hiradix_cache.py +4 -2
  143. sglang/srt/mem_cache/memory_pool.py +111 -407
  144. sglang/srt/mem_cache/memory_pool_host.py +380 -0
  145. sglang/srt/mem_cache/radix_cache.py +36 -12
  146. sglang/srt/metrics/collector.py +9 -0
  147. sglang/srt/model_executor/cuda_graph_runner.py +191 -113
  148. sglang/srt/model_executor/expert_location_updater.py +157 -22
  149. sglang/srt/model_executor/forward_batch_info.py +52 -22
  150. sglang/srt/model_executor/model_runner.py +102 -62
  151. sglang/srt/model_loader/loader.py +8 -1
  152. sglang/srt/model_loader/utils.py +67 -1
  153. sglang/srt/models/bert.py +113 -13
  154. sglang/srt/models/deepseek_nextn.py +1 -1
  155. sglang/srt/models/deepseek_v2.py +623 -290
  156. sglang/srt/models/gemma3_causal.py +7 -0
  157. sglang/srt/models/gemma3_mm.py +19 -14
  158. sglang/srt/models/idefics2.py +342 -0
  159. sglang/srt/models/internvl.py +46 -102
  160. sglang/srt/models/kimi_vl.py +4 -4
  161. sglang/srt/models/llama.py +1 -1
  162. sglang/srt/models/minicpmo.py +2 -5
  163. sglang/srt/models/minicpmv.py +3 -295
  164. sglang/srt/models/phi4mm.py +512 -0
  165. sglang/srt/models/qwen2.py +38 -9
  166. sglang/srt/models/qwen2_5_vl.py +3 -9
  167. sglang/srt/models/qwen2_eagle.py +4 -1
  168. sglang/srt/models/qwen2_moe.py +58 -191
  169. sglang/srt/models/qwen2_vl.py +3 -9
  170. sglang/srt/models/qwen3.py +41 -10
  171. sglang/srt/models/qwen3_moe.py +230 -191
  172. sglang/srt/models/registry.py +9 -1
  173. sglang/srt/models/roberta.py +117 -9
  174. sglang/srt/models/transformers.py +291 -0
  175. sglang/srt/models/vila.py +305 -0
  176. sglang/srt/openai_api/adapter.py +248 -28
  177. sglang/srt/openai_api/protocol.py +68 -3
  178. sglang/srt/openai_api/utils.py +172 -0
  179. sglang/srt/operations.py +37 -2
  180. sglang/srt/operations_strategy.py +200 -24
  181. sglang/srt/sampling/sampling_batch_info.py +37 -1
  182. sglang/srt/sampling/sampling_params.py +4 -1
  183. sglang/srt/server_args.py +381 -209
  184. sglang/srt/speculative/build_eagle_tree.py +9 -9
  185. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +12 -14
  186. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +256 -0
  187. sglang/srt/speculative/eagle_utils.py +440 -200
  188. sglang/srt/speculative/eagle_worker.py +234 -63
  189. sglang/srt/two_batch_overlap.py +637 -0
  190. sglang/srt/utils.py +187 -7
  191. sglang/test/attention/test_prefix_chunk_info.py +2 -0
  192. sglang/test/runners.py +54 -10
  193. sglang/test/send_one.py +4 -0
  194. sglang/test/test_block_fp8.py +1 -0
  195. sglang/test/test_block_fp8_deep_gemm_blackwell.py +252 -0
  196. sglang/test/test_block_fp8_ep.py +1 -0
  197. sglang/test/test_cutlass_moe.py +3 -3
  198. sglang/test/test_fp4_moe.py +248 -0
  199. sglang/test/test_utils.py +82 -7
  200. sglang/utils.py +9 -0
  201. sglang/version.py +1 -1
  202. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/METADATA +17 -14
  203. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/RECORD +359 -321
  204. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/WHEEL +1 -1
  205. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  206. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  207. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  208. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  209. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  210. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  211. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  212. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  213. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  214. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  215. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  216. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  217. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  218. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H200.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json} +0 -0
  219. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  220. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  221. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  222. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  223. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  224. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  225. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  226. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  227. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  228. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  229. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  230. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  231. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  232. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  233. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  234. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  235. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  236. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  237. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  238. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  239. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  240. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  241. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  242. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  243. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  244. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  245. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  246. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  247. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json} +0 -0
  248. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  249. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  250. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  251. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  252. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  253. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  254. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  255. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  256. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  257. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json} +0 -0
  258. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json} +0 -0
  259. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  260. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  261. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  262. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  263. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  264. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  265. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json} +0 -0
  266. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  267. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  268. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json} +0 -0
  269. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  270. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  271. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  272. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json} +0 -0
  273. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  274. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  275. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  276. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
  277. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  278. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  279. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  280. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json} +0 -0
  281. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json} +0 -0
  282. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json} +0 -0
  283. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json} +0 -0
  284. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  285. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  286. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json} +0 -0
  287. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json} +0 -0
  288. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json} +0 -0
  289. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json} +0 -0
  290. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  291. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  292. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  293. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  294. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json} +0 -0
  295. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  296. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  297. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  298. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  299. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json} +0 -0
  300. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json} +0 -0
  301. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json} +0 -0
  302. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json} +0 -0
  303. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  304. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  305. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
  306. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  307. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  308. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  309. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json} +0 -0
  310. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_L40S.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json} +0 -0
  311. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
  312. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
  313. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
  314. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  315. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  316. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  317. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  318. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json} +0 -0
  319. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json} +0 -0
  320. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json} +0 -0
  321. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json} +0 -0
  322. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  323. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  324. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  325. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  326. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json} +0 -0
  327. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
  328. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
  329. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
  330. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  331. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  332. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  333. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  334. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json} +0 -0
  335. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json} +0 -0
  336. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  337. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  338. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json} +0 -0
  339. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  340. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json} +0 -0
  341. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  342. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  343. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  344. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  345. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json} +0 -0
  346. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  347. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json} +0 -0
  348. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=96,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json} +0 -0
  349. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  350. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  351. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  352. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  353. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  354. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  355. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  356. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  357. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  358. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/licenses/LICENSE +0 -0
  359. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/top_level.txt +0 -0
sglang/srt/layers/sampler.py CHANGED
@@ -5,7 +5,7 @@ import torch
 import torch.distributed as dist
 from torch import nn
 
-from sglang.srt.distributed import get_tensor_model_parallel_group
+from sglang.srt.distributed import get_tp_group
 from sglang.srt.layers.dp_attention import get_attention_tp_group
 from sglang.srt.layers.logits_processor import LogitsProcessorOutput
 from sglang.srt.managers.schedule_batch import global_server_args_dict
@@ -30,7 +30,7 @@ class Sampler(nn.Module):
     def __init__(self):
         super().__init__()
         self.use_nan_detection = global_server_args_dict["enable_nan_detection"]
-        self.tp_sync_group = get_tensor_model_parallel_group().device_group
+        self.tp_sync_group = get_tp_group().device_group
 
         if global_server_args_dict["enable_dp_attention"]:
             self.tp_sync_group = get_attention_tp_group().device_group
@@ -59,7 +59,7 @@ class Sampler(nn.Module):
 
         # Apply the custom logit processors if registered in the sampling info.
         if sampling_info.has_custom_logit_processor:
-            self._apply_custom_logit_processor(logits, sampling_info)
+            apply_custom_logit_processor(logits, sampling_info)
 
         if self.use_nan_detection and torch.any(torch.isnan(logits)):
             logger.warning("Detected errors during sampling! NaN in the logits.")
@@ -81,54 +81,39 @@ class Sampler(nn.Module):
             probs = logits
             del logits
 
-            if global_server_args_dict["sampling_backend"] == "flashinfer":
-                if return_logprob:
-                    # NOTE: the top_p_renorm_prob from flashinfer has numerical problems,
-                    # https://github.com/flashinfer-ai/flashinfer/issues/708
-                    # so we use the torch implementation.
-
-                    # clamp to avoid -inf
-                    logprobs = torch.log(
-                        top_p_normalize_probs_torch(probs, sampling_info.top_ps)
-                    ).clamp(min=torch.finfo(probs.dtype).min)
-
-                max_top_k_round, batch_size = 32, probs.shape[0]
-                if sampling_info.need_min_p_sampling:
-                    probs = top_k_renorm_prob(probs, sampling_info.top_ks)
-                    probs = top_p_renorm_prob(probs, sampling_info.top_ps)
-                    batch_next_token_ids = min_p_sampling_from_probs(
-                        probs, sampling_info.min_ps
-                    )
-                else:
-                    # Check Nan will throw exception, only check when crash_on_warnings is True
-                    check_nan = self.use_nan_detection and crash_on_warnings()
-                    batch_next_token_ids = top_k_top_p_sampling_from_probs(
+            if True:  # Keep this redundant check to simplify some internal code sync
+                if global_server_args_dict["sampling_backend"] == "flashinfer":
+                    if sampling_info.need_min_p_sampling:
+                        probs = top_k_renorm_prob(probs, sampling_info.top_ks)
+                        probs = top_p_renorm_prob(probs, sampling_info.top_ps)
+                        batch_next_token_ids = min_p_sampling_from_probs(
+                            probs, sampling_info.min_ps
+                        )
+                    else:
+                        batch_next_token_ids = top_k_top_p_sampling_from_probs(
+                            probs,
+                            sampling_info.top_ks,
+                            sampling_info.top_ps,
+                            filter_apply_order="joint",
+                            check_nan=self.use_nan_detection,
+                        )
+                elif global_server_args_dict["sampling_backend"] == "pytorch":
+                    # A slower fallback implementation with torch native operations.
+                    batch_next_token_ids = top_k_top_p_min_p_sampling_from_probs_torch(
                         probs,
                         sampling_info.top_ks,
                         sampling_info.top_ps,
-                        filter_apply_order="joint",
-                        check_nan=check_nan,
+                        sampling_info.min_ps,
+                        sampling_info.need_min_p_sampling,
+                    )
+                else:
+                    raise ValueError(
+                        f"Invalid sampling backend: {global_server_args_dict['sampling_backend']}"
                     )
 
-            elif global_server_args_dict["sampling_backend"] == "pytorch":
-                # A slower fallback implementation with torch native operations.
-                batch_next_token_ids = top_k_top_p_min_p_sampling_from_probs_torch(
-                    probs,
-                    sampling_info.top_ks,
-                    sampling_info.top_ps,
-                    sampling_info.min_ps,
-                    sampling_info.need_min_p_sampling,
-                )
-
-                if return_logprob:
-                    # clamp to avoid -inf
-                    logprobs = torch.log(
-                        top_p_normalize_probs_torch(probs, sampling_info.top_ps)
-                    ).clamp(min=torch.finfo(probs.dtype).min)
-            else:
-                raise ValueError(
-                    f"Invalid sampling backend: {global_server_args_dict['sampling_backend']}"
-                )
+            if return_logprob:
+                # clamp to avoid -inf
+                logprobs = torch.log(probs).clamp(min=torch.finfo(probs.dtype).min)
 
         # Attach logprobs to logits_output (in-place modification)
         if return_logprob:
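Note: with this refactor, both backends now compute return logprobs directly as torch.log(probs) clamped to the dtype minimum, instead of renormalizing by top-p first. A standalone sketch of that clamping behavior (the probability values are illustrative, not from the diff):

    import torch

    # log(0) would be -inf; clamping to the dtype minimum keeps the value finite
    # so downstream reductions and serialization stay well-defined.
    probs = torch.tensor([[0.7, 0.3, 0.0]], dtype=torch.float32)
    logprobs = torch.log(probs).clamp(min=torch.finfo(probs.dtype).min)
    # logprobs[0, 2] is about -3.4e38 instead of -inf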
@@ -165,39 +150,6 @@ class Sampler(nn.Module):
 
         return batch_next_token_ids
 
-    def _apply_custom_logit_processor(
-        self, logits: torch.Tensor, sampling_batch_info: SamplingBatchInfo
-    ):
-        """Apply custom logit processors to the logits.
-        This function will modify the logits in-place."""
-
-        assert logits.shape[0] == len(sampling_batch_info), (
-            f"The batch size of logits ({logits.shape[0]}) does not match the batch size of "
-            f"sampling_batch_info ({len(sampling_batch_info)})"
-        )
-
-        for _, (
-            processor,
-            batch_mask,
-        ) in sampling_batch_info.custom_logit_processor.items():
-            # Get the batch indices that need to be processed
-            batch_indices = batch_mask.nonzero(as_tuple=True)[0]
-
-            assert batch_mask.shape[0] == len(sampling_batch_info), (
-                f"The number of batch mask ({batch_mask.shape[0]}) does not match the number of "
-                f"sampling_batch_info ({len(sampling_batch_info)})"
-            )
-
-            # Apply the processor to the logits
-            logits[batch_mask] = processor(
-                logits[batch_mask],
-                [sampling_batch_info.custom_params[i] for i in batch_indices],
-            )
-
-            logger.debug(
-                f"Custom logit processor {processor.__class__.__name__} is applied."
-            )
-
 
 def top_k_top_p_min_p_sampling_from_probs_torch(
     probs: torch.Tensor,
@@ -226,6 +178,14 @@ def top_k_top_p_min_p_sampling_from_probs_torch(
     return batch_next_token_ids
 
 
+def sampling_from_probs_torch(probs: torch.Tensor):
+    """A sampling implementation with native pytorch operations, without
+    top-k, top-p, or min-p filtering."""
+    sampled_index = torch.multinomial(probs, num_samples=1)
+    batch_next_token_ids = sampled_index.view(-1).to(torch.int32)
+    return batch_next_token_ids
+
+
 def top_p_normalize_probs_torch(
     probs: torch.Tensor,
     top_ps: torch.Tensor,
@@ -264,3 +224,44 @@ def get_token_ids_logprobs(logprobs: torch.Tensor, token_ids_logprobs: List[List
             output_token_ids_logprobs_idx.append([])
 
     return output_token_ids_logprobs_val, output_token_ids_logprobs_idx
+
+
+def apply_custom_logit_processor(
+    logits: torch.Tensor,
+    sampling_batch_info: SamplingBatchInfo,
+    num_tokens_in_batch: int = 1,
+):
+    """Apply custom logit processors to the logits.
+    This function will modify the logits in-place.
+    num_tokens_in_batch is needed to support spec decoding, where each batch can contain multiple
+    tokens. By default, we assume each batch contains only 1 token.
+    """
+
+    assert logits.shape[0] == len(sampling_batch_info) * num_tokens_in_batch, (
+        f"The batch size of logits ({logits.shape[0]}) does not match the batch size of "
+        f"sampling_batch_info ({len(sampling_batch_info)}) x num_tokens_in_batch "
+        f"({num_tokens_in_batch})"
+    )
+
+    for _, (
+        processor,
+        batch_mask,
+    ) in sampling_batch_info.custom_logit_processor.items():
+        # Get the batch indices that need to be processed
+        batch_indices = batch_mask.nonzero(as_tuple=True)[0]
+
+        assert batch_mask.shape[0] == len(sampling_batch_info), (
+            f"The number of batch mask ({batch_mask.shape[0]}) does not match the number of "
+            f"sampling_batch_info ({len(sampling_batch_info)})"
+        )
+        batch_mask = torch.repeat_interleave(batch_mask, num_tokens_in_batch)
+
+        # Apply the processor to the logits
+        logits[batch_mask] = processor(
+            logits[batch_mask],
+            [sampling_batch_info.custom_params[i] for i in batch_indices],
+        )
+
+        logger.debug(
+            f"Custom logit processor {processor.__class__.__name__} is applied."
+        )
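Note: apply_custom_logit_processor is now a module-level function and takes num_tokens_in_batch so that a per-request mask can be expanded to a per-token mask for speculative decoding. A minimal sketch of that mask expansion (the shapes and the toy processor below are assumptions for illustration, not part of the diff):

    import torch

    batch_mask = torch.tensor([True, False, True])  # requests 0 and 2 use a processor
    num_tokens_in_batch = 2                         # e.g. two draft tokens per request
    vocab_size = 8

    # Logits are laid out as [num_requests * num_tokens_in_batch, vocab_size],
    # so the request-level mask is repeated once per token before indexing.
    token_mask = torch.repeat_interleave(batch_mask, num_tokens_in_batch)
    logits = torch.randn(batch_mask.numel() * num_tokens_in_batch, vocab_size)
    logits[token_mask] = logits[token_mask] * 0.5   # stand-in for processor(...)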
sglang/srt/layers/utils.py CHANGED
@@ -33,3 +33,9 @@ class PPMissingLayer(torch.nn.Identity):
         """
         input = args[0] if args else next(iter(kwargs.values()))
         return (input,) if self.return_tuple else input
+
+
+def is_sm100_supported(device=None) -> bool:
+    return (torch.cuda.get_device_capability(device)[0] == 10) and (
+        torch.version.cuda >= "12.8"
+    )
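Note: the new is_sm100_supported helper gates Blackwell (SM 10.0) code paths on both the device capability and CUDA >= 12.8. An illustrative usage sketch, assuming the helper lives in sglang.srt.layers.utils as the file list suggests; the function and backend names below are placeholders, not APIs from the diff:

    import torch
    from sglang.srt.layers.utils import is_sm100_supported

    def pick_fp8_backend() -> str:
        # Prefer SM100-specific kernels only when the GPU and toolkit allow it.
        if torch.cuda.is_available() and is_sm100_supported():
            return "sm100"
        return "generic"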
sglang/srt/lora/layers.py CHANGED
@@ -137,7 +137,7 @@ class MergedColumnParallelLinearWithLoRA(ColumnParallelLinearWithLoRA):
         self.A_buffer_gate_up = A_buffer
         if self.lora_backend.fuse_stacked_lora_b:
             # B_buffer_gate_up: (num_lora, 2 * output_dim, r)
-            if not hasattr(self, "B_buffer_gate_up") or self.B_buffer_gate_up is None:
+            if getattr(self, "B_buffer_gate_up", None) is None:
                 self.B_buffer_gate_up = torch.empty(
                     (
                         B_buffer[0].shape[0],
@@ -202,7 +202,7 @@ class QKVParallelLinearWithLoRA(ColumnParallelLinearWithLoRA):
             output_dim_q, output_dim_kv = B_buffer_q.shape[-2], B_buffer_kv.shape[-2]
 
             # B_buffer_qkv: (num_lora, output_dim_q + 2 * output_dim_kv, r)
-            if not hasattr(self, "B_buffer_qkv") or self.B_buffer_qkv is None:
+            if getattr(self, "B_buffer_qkv", None) is None:
                 self.B_buffer_qkv = torch.empty(
                     (
                         B_buffer_q[0].shape[0],
@@ -221,20 +221,17 @@ class QKVParallelLinearWithLoRA(ColumnParallelLinearWithLoRA):
                 )
 
             # Offsets of q/k/v in output dimension
-            if not hasattr(self, "output_offset") or self.output_offset is None:
-                self.output_offset = torch.empty(
-                    4, dtype=torch.int32, device=B_buffer_q.device
+            if getattr(self, "output_offset", None) is None:
+                self.output_offset = torch.tensor(
+                    [
+                        0,
+                        output_dim_q,
+                        output_dim_q + output_dim_kv,
+                        output_dim_q + 2 * output_dim_kv,
+                    ],
+                    dtype=torch.int32,
+                    device=B_buffer_q.device,
                 )
-                self.output_offset[:4] = torch.tensor(
-                    [
-                        0,
-                        output_dim_q,
-                        output_dim_q + output_dim_kv,
-                        output_dim_q + 2 * output_dim_kv,
-                    ],
-                    dtype=torch.int32,
-                    device=B_buffer_q.device,
-                )
             # For computing number of launched blocks
             self.max_qkv_out_dim = max(output_dim_q, output_dim_kv)
         else:
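Note: the QKV LoRA path now materializes output_offset with a single torch.tensor call instead of allocating an empty buffer and copying into it. The offsets are simply the slice boundaries of q/k/v in the fused output dimension; a small sketch with assumed head counts (32 query heads, 8 KV heads, head_dim 128; numbers are illustrative only):

    import torch

    output_dim_q = 32 * 128   # 4096
    output_dim_kv = 8 * 128   # 1024
    output_offset = torch.tensor(
        [0, output_dim_q, output_dim_q + output_dim_kv, output_dim_q + 2 * output_dim_kv],
        dtype=torch.int32,
    )
    # tensor([   0, 4096, 5120, 6144], dtype=torch.int32)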
sglang/srt/lora/lora.py CHANGED
@@ -92,11 +92,12 @@ class LoRAAdapter(nn.Module):
         for i in range(self.base_hf_config.num_hidden_layers):
             layer = self.layers[i]
             weight_names = [name for name, _ in layer.weights.items()]
-            self.stack_qkv_proj(weight_names, layer.weights)
-            self.stack_gate_up_proj(weight_names, layer.weights)
-
-    def stack_qkv_proj(self, weight_names: List[str], weights: Dict[str, torch.Tensor]):
+            self.normalize_qkv_proj(weight_names, layer.weights)
+            self.normalize_gate_up_proj(weight_names, layer.weights)
 
+    def normalize_qkv_proj(
+        self, weight_names: List[str], weights: Dict[str, torch.Tensor]
+    ):
         # Collect target q/k/v modules. This process is necessary since there might be no lora attached to k_proj
         target_module = set()
         for weight_name in weight_names:
@@ -106,6 +107,8 @@ class LoRAAdapter(nn.Module):
                 target_module.add("q_proj")
             if "v_proj" in weight_name:
                 target_module.add("v_proj")
+            if "qkv_proj" in weight_name:
+                target_module.add("qkv_proj")
         if len(target_module) == 0:
             return
 
@@ -148,8 +151,35 @@ class LoRAAdapter(nn.Module):
                 if "k_proj" in target_module:
                     weights.pop(k_name)
                     weights.pop(v_name)
+            elif "qkv_proj" in weight_name:
+                # If qkv_proj is already stacked, we normalize it following the SGL convention.
+                qkv_name = weight_name
+                q_name = weight_name.replace("qkv_proj", "q_proj")
+                k_name = weight_name.replace("qkv_proj", "k_proj")
+                v_name = weight_name.replace("qkv_proj", "v_proj")
+                kv_name = weight_name.replace("qkv_proj", "kv_proj")
+                if "lora_A" in weight_name:
+                    weights[qkv_name] = weights[qkv_name].repeat(3, 1)
+                else:
+                    head_size = (
+                        self.base_hf_config.hidden_size
+                        // self.base_hf_config.num_attention_heads
+                    )
+                    weights[q_name], k_proj_weight, v_proj_weight = torch.split(
+                        weights[qkv_name],
+                        [
+                            head_size * self.base_hf_config.num_attention_heads,
+                            head_size * self.base_hf_config.num_key_value_heads,
+                            head_size * self.base_hf_config.num_key_value_heads,
+                        ],
+                        dim=0,
+                    )
+                    weights[kv_name] = torch.stack(
+                        [k_proj_weight, v_proj_weight],
+                        dim=0,
+                    )
 
-    def stack_gate_up_proj(
+    def normalize_gate_up_proj(
         self, weight_names: List[str], weights: Dict[str, torch.Tensor]
     ):
         for weight_name in weight_names:
@@ -179,3 +209,17 @@ class LoRAAdapter(nn.Module):
                     weights.pop(weight_name)
                 if up_name in weights:
                     weights.pop(up_name)
+            elif "gate_up_proj" in weight_name:
+                # If gate_up_proj is already stacked, we normalize it following the SGL convention
+                gate_up_name = weight_name
+                if "lora_A" in weight_name:
+                    weights[gate_up_name] = weights[gate_up_name].repeat(2, 1)
+                else:
+                    output_dim = weights[gate_up_name].shape[0] // 2
+                    weights[gate_up_name] = torch.stack(
+                        [
+                            weights[gate_up_name][:output_dim, :],
+                            weights[gate_up_name][output_dim:, :],
+                        ],
+                        dim=0,
+                    )
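Note: normalize_qkv_proj above splits an adapter that ships a pre-stacked qkv_proj lora_B weight into a q part plus a stacked kv part, matching the layout the SGL LoRA buffers expect. A shape-only sketch under assumed model dimensions (hidden_size 4096, 32 attention heads, 8 KV heads, rank 16; the numbers are illustrative, not from the diff):

    import torch

    hidden_size, num_heads, num_kv_heads, r = 4096, 32, 8, 16
    head_size = hidden_size // num_heads  # 128

    # Rows of the stacked lora_B cover q, then k, then v.
    qkv_lora_b = torch.randn(head_size * (num_heads + 2 * num_kv_heads), r)
    q_w, k_w, v_w = torch.split(
        qkv_lora_b,
        [head_size * num_heads, head_size * num_kv_heads, head_size * num_kv_heads],
        dim=0,
    )
    kv_w = torch.stack([k_w, v_w], dim=0)
    # q_w: [4096, 16], kv_w: [2, 1024, 16]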
@@ -32,7 +32,7 @@ from sglang.srt.lora.utils import (
    LoRAType,
    get_customized_names_from_hf_names,
    get_layer_id,
-    get_stacked_name,
+    get_normalized_lora_weight_names,
    get_weight_name,
)
from sglang.srt.model_executor.forward_batch_info import ForwardBatch
@@ -81,7 +81,7 @@ class LoRAManager:
            seg_indptr=torch.zeros(
                self.max_bs_in_cuda_graph + 1, dtype=torch.int32
            ),
-            max_len=0,
+            max_len=1,
            weight_indices=torch.zeros(
                self.max_bs_in_cuda_graph, dtype=torch.int32
            ),
@@ -89,6 +89,17 @@ class LoRAManager:
            scalings=torch.zeros(self.max_loras_per_batch, dtype=torch.float),
        )

+        # Initialize seg_lens and seg_indptr for CUDA graph as they remain constant
+        # across batches.
+        self.cuda_graph_batch_info.seg_lens[: self.max_bs_in_cuda_graph].fill_(1)
+        torch.cumsum(
+            self.cuda_graph_batch_info.seg_lens[: self.max_bs_in_cuda_graph],
+            dim=0,
+            out=self.cuda_graph_batch_info.seg_indptr[
+                1 : self.max_bs_in_cuda_graph + 1
+            ],
+        )
+
    def init_loras(self):
        # Config of each LoRA adapter
        self.configs: Dict[str, LoRAConfig] = {}
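CUDA-graph batches always decode exactly one token per sequence, so seg_lens and seg_indptr never change; that is why the hunk above fills them once at initialization instead of on every forward call. A minimal sketch of that invariant (max_bs_in_cuda_graph is an arbitrary example value):

import torch

max_bs_in_cuda_graph = 8
seg_lens = torch.zeros(max_bs_in_cuda_graph, dtype=torch.int32)
seg_indptr = torch.zeros(max_bs_in_cuda_graph + 1, dtype=torch.int32)

# One token per sequence -> seg_lens is all ones and the prefix sum is 0..max_bs.
seg_lens.fill_(1)
torch.cumsum(seg_lens, dim=0, out=seg_indptr[1:])
print(seg_indptr.tolist())  # [0, 1, 2, 3, 4, 5, 6, 7, 8]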
@@ -101,10 +112,13 @@ class LoRAManager:
            self.hf_target_names.update(self.configs[name].target_modules)

        # Target lora weight names for lora_a and lora_b modules respectively.
-        # e.g., {("qkv_proj", "q_proj"), ("qkv_proj", "kv_proj")}
-        self.lora_weight_names: Set[Tuple[str]] = set(
-            [get_stacked_name(module) for module in self.hf_target_names]
-        )
+        weights_A: List[str] = []
+        weights_B: List[str] = []
+        for module in self.hf_target_names:
+            lora_A, lora_B = get_normalized_lora_weight_names(module)
+            weights_A += lora_A
+            weights_B += lora_B
+        self.lora_weight_names: Tuple[Set[str]] = set(weights_A), set(weights_B)

        # load all weights to cpu
        self.loras: Dict[str, LoRAAdapter] = {}
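With get_normalized_lora_weight_names the manager now tracks two separate name sets, one for lora_A buffers and one for lora_B buffers, instead of a set of (A, B) name pairs. The sketch below shows only the aggregation loop, with a hypothetical stand-in for the helper; the real mapping lives in sglang.srt.lora.utils and may differ in detail.

from typing import Dict, List, Set, Tuple

def normalized_names(module: str) -> Tuple[List[str], List[str]]:
    # Illustrative stand-in for get_normalized_lora_weight_names: attention and MLP
    # projections collapse onto their fused buffer names on the A side.
    mapping: Dict[str, Tuple[List[str], List[str]]] = {
        "q_proj": (["qkv_proj"], ["q_proj"]),
        "k_proj": (["qkv_proj"], ["kv_proj"]),
        "v_proj": (["qkv_proj"], ["kv_proj"]),
        "gate_proj": (["gate_up_proj"], ["gate_up_proj"]),
        "up_proj": (["gate_up_proj"], ["gate_up_proj"]),
    }
    return mapping.get(module, ([module], [module]))

hf_target_names = {"q_proj", "k_proj", "v_proj", "o_proj"}
weights_A: List[str] = []
weights_B: List[str] = []
for module in hf_target_names:
    lora_A, lora_B = normalized_names(module)
    weights_A += lora_A
    weights_B += lora_B
lora_weight_names: Tuple[Set[str], Set[str]] = (set(weights_A), set(weights_B))
print(lora_weight_names)  # ({'qkv_proj', 'o_proj'}, {'q_proj', 'kv_proj', 'o_proj'})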
@@ -156,6 +170,45 @@ class LoRAManager:
        # set up batch info shared by all lora modules
        bs = forward_batch.batch_size

+        def transfer_adapter_info(
+            weight_indices_out: torch.Tensor,
+            lora_ranks_out: torch.Tensor,
+            scalings_out: torch.Tensor,
+        ):
+            """
+            Transfer adapter metadata (weight indices, LoRA rank, scalings) from host
+            to device (CUDA) asynchronously.
+            """
+            weight_indices = [0] * len(forward_batch.lora_paths)
+            lora_ranks = [0] * self.max_loras_per_batch
+            scalings = [0] * self.max_loras_per_batch
+            for i, lora_path in enumerate(forward_batch.lora_paths):
+                weight_indices[i] = self.memory_pool.get_buffer_id(lora_path)
+                if lora_path is not None:
+                    lora = self.loras[lora_path]
+                    lora_ranks[weight_indices[i]] = lora.config.hf_config["r"]
+                    scalings[weight_indices[i]] = lora.scaling
+
+            # Use pinned memory to avoid synchronizations during host-to-device transfer
+            weight_indices_tensor = torch.tensor(
+                weight_indices, dtype=torch.int32, pin_memory=True, device="cpu"
+            )
+            lora_ranks_tensor = torch.tensor(
+                lora_ranks, dtype=torch.int32, pin_memory=True, device="cpu"
+            )
+            scalings_tensor = torch.tensor(
+                scalings, dtype=torch.float, pin_memory=True, device="cpu"
+            )
+
+            # Copy to device tensors asynchronously
+            weight_indices_out[:bs].copy_(weight_indices_tensor, non_blocking=True)
+            lora_ranks_out[: self.max_loras_per_batch].copy_(
+                lora_ranks_tensor, non_blocking=True
+            )
+            scalings_out[: self.max_loras_per_batch].copy_(
+                scalings_tensor, non_blocking=True
+            )
+
        if (
            hasattr(self, "max_bs_in_cuda_graph")
            and bs <= self.max_bs_in_cuda_graph
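transfer_adapter_info stages the per-batch adapter metadata in pinned (page-locked) CPU tensors so the subsequent copy_(..., non_blocking=True) calls can overlap with GPU work instead of blocking the launch thread. A minimal, self-contained version of that pattern (async_h2d is an illustrative helper name, not part of the diff):

import torch

def async_h2d(values, dtype, device="cuda"):
    # Stage in pinned host memory, then issue an asynchronous host-to-device copy.
    staging = torch.tensor(values, dtype=dtype, pin_memory=True, device="cpu")
    out = torch.empty(len(values), dtype=dtype, device=device)
    out.copy_(staging, non_blocking=True)
    return out

if torch.cuda.is_available():
    weight_indices = async_h2d([0, 1, 1, 0], dtype=torch.int32)
    scalings = async_h2d([2.0, 0.5, 0.0, 0.0], dtype=torch.float)
    torch.cuda.synchronize()  # only needed before reading the results back on host
    print(weight_indices, scalings)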
@@ -163,51 +216,46 @@ class LoRAManager:
        ):
            # Do in-place updates when CUDA graph is enabled and the batch forward mode
            # could use CUDA graph.
-            self.cuda_graph_batch_info.bs = bs
-            self.cuda_graph_batch_info.seg_lens[:bs].fill_(1)
-            torch.cumsum(
-                self.cuda_graph_batch_info.seg_lens[:bs],
-                dim=0,
-                out=self.cuda_graph_batch_info.seg_indptr[1 : bs + 1],
+
+            transfer_adapter_info(
+                self.cuda_graph_batch_info.weight_indices,
+                self.cuda_graph_batch_info.lora_ranks,
+                self.cuda_graph_batch_info.scalings,
            )
-            self.cuda_graph_batch_info.max_len = 1

-            for i, lora_path in enumerate(forward_batch.lora_paths):
-                self.cuda_graph_batch_info.weight_indices[i] = (
-                    self.memory_pool.get_buffer_id(lora_path)
-                )
-                if lora_path is not None:
-                    lora = self.loras[lora_path]
-                    self.cuda_graph_batch_info.lora_ranks[
-                        self.cuda_graph_batch_info.weight_indices[i]
-                    ] = lora.config.hf_config["r"]
-                    self.cuda_graph_batch_info.scalings[
-                        self.cuda_graph_batch_info.weight_indices[i]
-                    ] = lora.scaling
+            self.cuda_graph_batch_info.bs = bs
+            self.cuda_graph_batch_info.max_len = 1
            batch_info = self.cuda_graph_batch_info
        else:
+            weight_indices = torch.empty((bs,), dtype=torch.int32, device=self.device)
+            lora_ranks = torch.zeros(
+                (self.max_loras_per_batch,), dtype=torch.int64, device=self.device
+            )
+            scalings = torch.zeros(
+                (self.max_loras_per_batch,), dtype=torch.float, device=self.device
+            )
+            transfer_adapter_info(
+                weight_indices,
+                lora_ranks,
+                scalings,
+            )
+
            seg_lens = (
                forward_batch.extend_seq_lens
                if forward_batch.forward_mode.is_extend()
                else torch.ones(bs, device=self.device)
            )
+
+            max_len = (
+                # Calculate max_len from the CPU copy to avoid D2H transfer.
+                max(forward_batch.extend_seq_lens_cpu)
+                if forward_batch.forward_mode.is_extend()
+                else 1
+            )
+
            seg_indptr = torch.zeros((bs + 1,), dtype=torch.int32, device=self.device)
            seg_indptr[1:] = torch.cumsum(seg_lens, dim=0)
-            max_len = int(torch.max(seg_lens))
-            weight_indices = torch.empty((bs,), dtype=torch.int64, device=self.device)

-            lora_ranks = torch.zeros(
-                (self.max_loras_per_batch,), dtype=torch.int64, device="cuda"
-            )
-            scalings = torch.zeros(
-                (self.max_loras_per_batch,), dtype=torch.float, device="cuda"
-            )
-            for i, lora_path in enumerate(forward_batch.lora_paths):
-                weight_indices[i] = self.memory_pool.get_buffer_id(lora_path)
-                if lora_path is not None:
-                    lora = self.loras[lora_path]
-                    lora_ranks[weight_indices[i]] = lora.config.hf_config["r"]
-                    scalings[weight_indices[i]] = lora.scaling
            batch_info = LoRABatchInfo(
                bs=bs,
                seg_lens=seg_lens,
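The rewritten non-graph path above also computes max_len from forward_batch.extend_seq_lens_cpu rather than from the CUDA tensor: int(torch.max(...)) on a device tensor forces a device-to-host synchronization, while taking the max of the host-side list does not. A small sketch of the difference (the sequence lengths are made-up example values):

import torch

extend_seq_lens_cpu = [5, 3, 9, 1]  # host copy of the per-request extend lengths

# New path: pure host arithmetic, no synchronization.
max_len = max(extend_seq_lens_cpu)

# Old path (for comparison): reading back a CUDA scalar blocks until the value is ready.
if torch.cuda.is_available():
    seg_lens = torch.tensor(extend_seq_lens_cpu, dtype=torch.int32, device="cuda")
    assert int(torch.max(seg_lens)) == max_len

print(max_len)  # 9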
@@ -263,7 +311,18 @@ class LoRAManager:
        self.lora_modules: Dict[int, List[Tuple[str, BaseLayerWithLoRA]]] = {
            i: [] for i in range(self.base_hf_config.num_hidden_layers)
        }
+
        for module_name, module in self.base_model.named_modules():
+            # TODO (lifuhuang): in the future, we should consider generalizing the
+            # should_apply_lora function to support mapping by full module name instead
+            # of just the last part (e.g., "qkv_proj") to support scenarios with multiple
+            # attention stacks (e.g., multimodal models).
+            # See: https://github.com/sgl-project/sglang/issues/6608
+            if getattr(
+                self.base_model, "should_apply_lora", None
+            ) and not self.base_model.should_apply_lora(module_name):
+                continue
+
            # The module should be converted if it is included in target_names
            if module_name.split(".")[-1] in customized_target_names:
                layer_id = get_layer_id(module_name)
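The should_apply_lora hook above is optional: getattr returns None for models that do not define it, so existing models keep their behavior, while models such as multimodal stacks can opt specific submodules out of LoRA injection. A toy illustration (ToyModel and its module names are invented for the example):

from torch import nn

class ToyModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.language_model = nn.Linear(8, 8)
        self.vision_tower = nn.Linear(8, 8)

    def should_apply_lora(self, module_name: str) -> bool:
        # Keep LoRA off the vision tower in this toy example.
        return not module_name.startswith("vision_tower")

model = ToyModel()
for module_name, module in model.named_modules():
    hook = getattr(model, "should_apply_lora", None)
    if hook is not None and not hook(module_name):
        continue
    print("candidate for LoRA:", module_name or "<root>")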
@@ -91,18 +91,16 @@ class LoRAMemoryPool:

    def init_buffers(
        self,
-        lora_weight_names: Set[Tuple[str]],
+        lora_weight_names: Tuple[Set[str]],
        base_model: torch.nn.Module,
    ):

        # lora_weight_names is a set of name pairs indicating each pair of lora modules to load
        # e.g., {("qkv_proj", "q_proj"), ("qkv_proj", "kv_proj"), ("o_proj", "o_proj")}
-        self.lora_weight_names: Set[Tuple[str]] = lora_weight_names
+        self.lora_weight_names: Tuple[Set[str]] = lora_weight_names
        device = next(base_model.parameters()).device
-        lora_module_A_names = set([name[0] for name in lora_weight_names])
-        lora_module_B_names = set([name[1] for name in lora_weight_names])
        # Init A tensor, column_major=False
-        for module_A in lora_module_A_names:
+        for module_A in lora_weight_names[0]:
            lora_A_shape = self.get_lora_A_shape(module_A, base_model)
            self.A_buffer[module_A] = [
                torch.empty(
@@ -110,10 +108,10 @@ class LoRAMemoryPool:
                    dtype=self.dtype,
                    device=device,
                )
-                for i in range(self.num_layer)
+                for _ in range(self.num_layer)
            ]
        # Init B tensor, column_major=True
-        for module_B in lora_module_B_names:
+        for module_B in lora_weight_names[1]:
            lora_B_shape = self.get_lora_B_shape(module_B, base_model)
            self.B_buffer[module_B] = [
                torch.empty(
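With the (A-names, B-names) tuple, init_buffers simply iterates each set and preallocates one tensor per transformer layer and module name. The sketch below only shows that container layout; the dummy shape is a placeholder for get_lora_A_shape / get_lora_B_shape, whose real outputs depend on the base model.

import torch

lora_weight_names = ({"qkv_proj", "o_proj"}, {"q_proj", "kv_proj", "o_proj"})
num_layer, dummy_shape = 2, (4, 16, 64)  # layer count and a placeholder buffer shape

A_buffer = {
    module_A: [torch.empty(dummy_shape) for _ in range(num_layer)]
    for module_A in lora_weight_names[0]
}
B_buffer = {
    module_B: [torch.empty(dummy_shape) for _ in range(num_layer)]
    for module_B in lora_weight_names[1]
}
print(sorted(A_buffer), sorted(B_buffer))  # buffers keyed by the normalized module names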
@@ -134,12 +132,13 @@ class LoRAMemoryPool:
            for buffer_id in range(self.max_loras_per_batch):
                # Prioritize empty slots
                if self.buffer_id_to_uid[buffer_id] == "":
-                    return buffer_id, ""
+                    return buffer_id

            for buffer_id in range(self.max_loras_per_batch):
                # Evict unneeded lora
                if self.buffer_id_to_uid[buffer_id] not in cur_uids:
-                    return buffer_id, self.buffer_id_to_uid[buffer_id]
+                    self.uid_to_buffer_id.pop(self.buffer_id_to_uid[buffer_id])
+                    return buffer_id

            raise ValueError(
                "No available buffer slots found. Please ensure the number of active loras is less than max_loras_per_batch."
@@ -147,9 +146,7 @@ class LoRAMemoryPool:

        for uid in cur_uids:
            if uid not in self.uid_to_buffer_id:
-                buffer_id, evicted_lora_uid = get_available_buffer_slot()
-                if evicted_lora_uid != "":
-                    self.uid_to_buffer_id.pop(evicted_lora_uid)
+                buffer_id = get_available_buffer_slot()
                self.load_lora_weight_to_buffer(
                    uid, buffer_id, lora_adapters.get(uid, None)
                )
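After this change, get_available_buffer_slot owns the eviction bookkeeping (popping the evicted uid) and returns only a buffer id, so the caller no longer has to unpack a (buffer_id, evicted_uid) tuple. A standalone sketch with local containers standing in for the pool's attributes:

max_loras_per_batch = 2
buffer_id_to_uid = ["adapter-a", "adapter-b"]
uid_to_buffer_id = {"adapter-a": 0, "adapter-b": 1}
cur_uids = {"adapter-a", "adapter-c"}

def get_available_buffer_slot() -> int:
    for buffer_id in range(max_loras_per_batch):
        if buffer_id_to_uid[buffer_id] == "":            # prefer empty slots
            return buffer_id
    for buffer_id in range(max_loras_per_batch):
        if buffer_id_to_uid[buffer_id] not in cur_uids:  # evict an adapter not needed by this batch
            uid_to_buffer_id.pop(buffer_id_to_uid[buffer_id])
            return buffer_id
    raise ValueError("No available buffer slots found.")

slot = get_available_buffer_slot()
print(slot, uid_to_buffer_id)  # 1 {'adapter-a': 0} -- 'adapter-b' was evicted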
@@ -159,6 +156,10 @@ class LoRAMemoryPool:
    def load_lora_weight_to_buffer(
        self, uid: str, buffer_id: int, lora_adapter: LoRAAdapter = None
    ):
+        def check_lora_weight_shape(buffer_view: torch.Tensor, weight: torch.Tensor):
+            assert (
+                buffer_view.shape == weight.shape
+            ), f"LoRA buffer shape {buffer_view.shape} does not match weight shape {weight.shape}."

        if uid is None:
            for i in range(self.num_layer):
@@ -210,21 +211,27 @@ class LoRAMemoryPool:

        for name, weights in temp_A_buffer.items():
            c = get_stacked_multiply(name)
-            self.A_buffer[name][layer_id][buffer_id][: lora_rank * c, :].copy_(
-                weights
-            )
+            buffer_view = self.A_buffer[name][layer_id][buffer_id][
+                : lora_rank * c, :
+            ]
+            check_lora_weight_shape(buffer_view, weights)
+            buffer_view.copy_(weights)

        for name, weights in temp_B_buffer.items():
            c = get_stacked_multiply(name)
            if c > 1:
                for stacked_id in range(c):
-                    self.B_buffer[name][layer_id][stacked_id][buffer_id][
-                        :, :lora_rank
-                    ].copy_(weights[stacked_id])
+                    buffer_view = self.B_buffer[name][layer_id][stacked_id][
+                        buffer_id
+                    ][:, :lora_rank]
+                    check_lora_weight_shape(buffer_view, weights[stacked_id])
+                    buffer_view.copy_(weights[stacked_id])
            else:
-                self.B_buffer[name][layer_id][0][buffer_id][:, :lora_rank].copy_(
-                    weights
-                )
+                buffer_view = self.B_buffer[name][layer_id][0][buffer_id][
+                    :, :lora_rank
+                ]
+                check_lora_weight_shape(buffer_view, weights)
+                buffer_view.copy_(weights)

    def get_tensor(
        self, weight_name: str, layer_id: int, lora_type: LoRAType