sglang 0.4.6.post5__py3-none-any.whl → 0.4.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (318)
  1. sglang/bench_offline_throughput.py +10 -4
  2. sglang/bench_one_batch_server.py +67 -11
  3. sglang/bench_serving.py +85 -74
  4. sglang/lang/backend/runtime_endpoint.py +24 -1
  5. sglang/profiler.py +167 -0
  6. sglang/srt/_custom_ops.py +34 -0
  7. sglang/srt/configs/internvl.py +8 -12
  8. sglang/srt/configs/model_config.py +27 -1
  9. sglang/srt/constrained/base_grammar_backend.py +5 -2
  10. sglang/srt/constrained/llguidance_backend.py +9 -8
  11. sglang/srt/constrained/outlines_backend.py +5 -4
  12. sglang/srt/constrained/xgrammar_backend.py +18 -18
  13. sglang/srt/conversation.py +46 -8
  14. sglang/srt/custom_op.py +38 -3
  15. sglang/srt/debug_utils.py +74 -0
  16. sglang/srt/disaggregation/common/__init__.py +1 -0
  17. sglang/srt/disaggregation/common/conn.py +407 -0
  18. sglang/srt/disaggregation/decode.py +67 -3
  19. sglang/srt/disaggregation/fake/conn.py +1 -0
  20. sglang/srt/disaggregation/kv_events.py +60 -5
  21. sglang/srt/disaggregation/launch_lb.py +140 -0
  22. sglang/srt/disaggregation/mini_lb.py +29 -48
  23. sglang/srt/disaggregation/mooncake/conn.py +432 -140
  24. sglang/srt/disaggregation/mooncake/transfer_engine.py +32 -16
  25. sglang/srt/disaggregation/nixl/conn.py +124 -432
  26. sglang/srt/disaggregation/prefill.py +2 -0
  27. sglang/srt/disaggregation/utils.py +38 -1
  28. sglang/srt/distributed/device_communicators/pymscclpp.py +315 -0
  29. sglang/srt/distributed/parallel_state.py +52 -5
  30. sglang/srt/entrypoints/EngineBase.py +6 -0
  31. sglang/srt/entrypoints/engine.py +102 -5
  32. sglang/srt/entrypoints/http_server.py +15 -2
  33. sglang/srt/function_call/base_format_detector.py +138 -86
  34. sglang/srt/function_call/deepseekv3_detector.py +54 -6
  35. sglang/srt/function_call/ebnf_composer.py +33 -19
  36. sglang/srt/function_call/function_call_parser.py +27 -0
  37. sglang/srt/function_call/llama32_detector.py +33 -14
  38. sglang/srt/function_call/mistral_detector.py +73 -26
  39. sglang/srt/function_call/pythonic_detector.py +86 -20
  40. sglang/srt/function_call/qwen25_detector.py +64 -10
  41. sglang/srt/function_call/utils.py +17 -0
  42. sglang/srt/hf_transformers_utils.py +4 -0
  43. sglang/srt/layers/attention/aiter_backend.py +488 -123
  44. sglang/srt/layers/attention/base_attn_backend.py +4 -0
  45. sglang/srt/layers/attention/cutlass_mla_backend.py +2 -19
  46. sglang/srt/layers/attention/flashattention_backend.py +103 -18
  47. sglang/srt/layers/attention/flashinfer_backend.py +45 -1
  48. sglang/srt/layers/attention/flashinfer_mla_backend.py +37 -1
  49. sglang/srt/layers/attention/intel_amx_backend.py +128 -0
  50. sglang/srt/layers/attention/tbo_backend.py +232 -0
  51. sglang/srt/layers/attention/torch_native_backend.py +3 -0
  52. sglang/srt/layers/attention/triton_backend.py +244 -5
  53. sglang/srt/layers/attention/triton_ops/extend_attention.py +12 -4
  54. sglang/srt/layers/communicator.py +260 -194
  55. sglang/srt/layers/dp_attention.py +6 -5
  56. sglang/srt/layers/layernorm.py +30 -19
  57. sglang/srt/layers/moe/cutlass_moe.py +170 -7
  58. sglang/srt/layers/moe/cutlass_moe_params.py +169 -0
  59. sglang/srt/layers/moe/ep_moe/kernels.py +27 -6
  60. sglang/srt/layers/moe/ep_moe/layer.py +94 -40
  61. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +13 -8
  62. sglang/srt/layers/moe/fused_moe_native.py +4 -0
  63. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  64. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  65. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  66. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  67. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  68. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  69. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  70. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  71. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +220 -25
  72. sglang/srt/layers/moe/fused_moe_triton/layer.py +34 -4
  73. sglang/srt/layers/moe/topk.py +44 -18
  74. sglang/srt/layers/multimodal.py +3 -3
  75. sglang/srt/layers/quantization/__init__.py +3 -2
  76. sglang/srt/layers/quantization/blockwise_int8.py +3 -0
  77. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +5 -0
  78. sglang/srt/layers/quantization/deep_gemm.py +55 -56
  79. sglang/srt/layers/quantization/fp8.py +28 -23
  80. sglang/srt/layers/quantization/fp8_kernel.py +118 -66
  81. sglang/srt/layers/quantization/fp8_utils.py +165 -49
  82. sglang/srt/layers/quantization/modelopt_quant.py +334 -7
  83. sglang/srt/layers/quantization/moe_wna16.py +3 -0
  84. sglang/srt/layers/quantization/w8a8_fp8.py +3 -0
  85. sglang/srt/layers/quantization/w8a8_int8.py +3 -0
  86. sglang/srt/layers/rotary_embedding.py +6 -12
  87. sglang/srt/layers/sampler.py +80 -79
  88. sglang/srt/layers/utils.py +6 -0
  89. sglang/srt/lora/layers.py +12 -15
  90. sglang/srt/lora/lora.py +49 -5
  91. sglang/srt/lora/lora_manager.py +19 -5
  92. sglang/srt/lora/mem_pool.py +24 -16
  93. sglang/srt/lora/utils.py +17 -13
  94. sglang/srt/managers/data_parallel_controller.py +13 -5
  95. sglang/srt/managers/eplb_algorithms/__init__.py +63 -0
  96. sglang/srt/managers/eplb_algorithms/deepseek.py +223 -0
  97. sglang/srt/managers/{deepseek_eplb.py → eplb_algorithms/deepseek_vec.py} +5 -7
  98. sglang/srt/managers/eplb_manager.py +55 -14
  99. sglang/srt/managers/expert_distribution.py +220 -46
  100. sglang/srt/managers/expert_location.py +110 -56
  101. sglang/srt/managers/expert_location_dispatch.py +23 -6
  102. sglang/srt/managers/io_struct.py +15 -4
  103. sglang/srt/managers/mm_utils.py +88 -38
  104. sglang/srt/managers/multimodal_processors/base_processor.py +188 -16
  105. sglang/srt/managers/multimodal_processors/gemma3.py +4 -31
  106. sglang/srt/managers/multimodal_processors/internvl.py +4 -0
  107. sglang/srt/managers/multimodal_processors/kimi_vl.py +15 -34
  108. sglang/srt/managers/multimodal_processors/minicpm.py +2 -1
  109. sglang/srt/managers/multimodal_processors/phi4mm.py +87 -0
  110. sglang/srt/managers/multimodal_processors/qwen_vl.py +22 -64
  111. sglang/srt/managers/schedule_batch.py +140 -38
  112. sglang/srt/managers/scheduler.py +305 -112
  113. sglang/srt/managers/tokenizer_manager.py +134 -17
  114. sglang/srt/managers/utils.py +0 -4
  115. sglang/srt/metrics/collector.py +9 -0
  116. sglang/srt/model_executor/cuda_graph_runner.py +72 -61
  117. sglang/srt/model_executor/expert_location_updater.py +157 -22
  118. sglang/srt/model_executor/forward_batch_info.py +38 -17
  119. sglang/srt/model_executor/model_runner.py +96 -56
  120. sglang/srt/model_loader/utils.py +67 -1
  121. sglang/srt/models/deepseek_nextn.py +1 -1
  122. sglang/srt/models/deepseek_v2.py +609 -234
  123. sglang/srt/models/gemma3_causal.py +7 -0
  124. sglang/srt/models/gemma3_mm.py +19 -14
  125. sglang/srt/models/idefics2.py +342 -0
  126. sglang/srt/models/kimi_vl.py +4 -4
  127. sglang/srt/models/llama.py +1 -1
  128. sglang/srt/models/minicpmo.py +2 -5
  129. sglang/srt/models/minicpmv.py +3 -295
  130. sglang/srt/models/phi4mm.py +512 -0
  131. sglang/srt/models/qwen2.py +38 -9
  132. sglang/srt/models/qwen2_5_vl.py +3 -9
  133. sglang/srt/models/qwen2_eagle.py +4 -1
  134. sglang/srt/models/qwen2_moe.py +58 -191
  135. sglang/srt/models/qwen2_vl.py +3 -9
  136. sglang/srt/models/qwen3.py +41 -10
  137. sglang/srt/models/qwen3_moe.py +230 -191
  138. sglang/srt/models/registry.py +9 -1
  139. sglang/srt/models/transformers.py +291 -0
  140. sglang/srt/openai_api/adapter.py +86 -24
  141. sglang/srt/openai_api/protocol.py +31 -2
  142. sglang/srt/openai_api/utils.py +172 -0
  143. sglang/srt/operations.py +37 -2
  144. sglang/srt/operations_strategy.py +200 -24
  145. sglang/srt/sampling/sampling_batch_info.py +13 -1
  146. sglang/srt/sampling/sampling_params.py +2 -1
  147. sglang/srt/server_args.py +114 -27
  148. sglang/srt/speculative/build_eagle_tree.py +8 -8
  149. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +8 -11
  150. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +253 -0
  151. sglang/srt/speculative/eagle_utils.py +51 -91
  152. sglang/srt/speculative/eagle_worker.py +101 -21
  153. sglang/srt/two_batch_overlap.py +635 -0
  154. sglang/srt/utils.py +129 -7
  155. sglang/test/runners.py +16 -7
  156. sglang/test/send_one.py +4 -0
  157. sglang/test/test_cutlass_moe.py +3 -3
  158. sglang/test/test_fp4_moe.py +248 -0
  159. sglang/test/test_utils.py +79 -6
  160. sglang/version.py +1 -1
  161. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/METADATA +14 -11
  162. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/RECORD +318 -291
  163. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/WHEEL +1 -1
  164. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  165. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  166. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  167. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  168. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  169. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  170. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  171. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  172. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  173. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  174. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  175. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  176. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  177. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H200.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json} +0 -0
  178. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  179. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  180. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  181. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  182. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  183. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  184. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  185. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  186. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  187. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  188. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  189. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  190. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  191. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  192. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  193. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  194. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  195. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  196. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  197. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  198. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  199. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  200. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  201. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  202. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  203. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  204. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  205. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  206. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json} +0 -0
  207. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  208. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  209. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  210. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  211. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  212. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  213. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  214. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  215. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  216. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json} +0 -0
  217. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json} +0 -0
  218. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  219. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  220. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  221. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  222. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  223. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  224. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json} +0 -0
  225. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  226. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  227. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json} +0 -0
  228. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  229. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  230. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  231. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json} +0 -0
  232. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  233. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  234. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  235. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
  236. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  237. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  238. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  239. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json} +0 -0
  240. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json} +0 -0
  241. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json} +0 -0
  242. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json} +0 -0
  243. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  244. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  245. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json} +0 -0
  246. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json} +0 -0
  247. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json} +0 -0
  248. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json} +0 -0
  249. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  250. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  251. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  252. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  253. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json} +0 -0
  254. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  255. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  256. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  257. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  258. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json} +0 -0
  259. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json} +0 -0
  260. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json} +0 -0
  261. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json} +0 -0
  262. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  263. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  264. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
  265. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  266. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  267. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  268. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json} +0 -0
  269. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_L40S.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json} +0 -0
  270. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
  271. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
  272. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
  273. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  274. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  275. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  276. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  277. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json} +0 -0
  278. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json} +0 -0
  279. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json} +0 -0
  280. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json} +0 -0
  281. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  282. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  283. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  284. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  285. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json} +0 -0
  286. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
  287. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
  288. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
  289. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  290. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  291. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  292. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  293. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json} +0 -0
  294. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json} +0 -0
  295. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  296. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  297. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json} +0 -0
  298. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  299. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json} +0 -0
  300. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  301. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  302. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  303. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  304. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json} +0 -0
  305. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  306. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json} +0 -0
  307. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=96,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json} +0 -0
  308. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  309. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  310. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  311. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  312. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  313. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  314. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  315. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  316. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  317. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/licenses/LICENSE +0 -0
  318. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.dist-info}/top_level.txt +0 -0
sglang/srt/layers/sampler.py CHANGED
@@ -5,7 +5,7 @@ import torch
  import torch.distributed as dist
  from torch import nn

- from sglang.srt.distributed import get_tensor_model_parallel_group
+ from sglang.srt.distributed import get_tp_group
  from sglang.srt.layers.dp_attention import get_attention_tp_group
  from sglang.srt.layers.logits_processor import LogitsProcessorOutput
  from sglang.srt.managers.schedule_batch import global_server_args_dict
@@ -30,7 +30,7 @@ class Sampler(nn.Module):
  def __init__(self):
  super().__init__()
  self.use_nan_detection = global_server_args_dict["enable_nan_detection"]
- self.tp_sync_group = get_tensor_model_parallel_group().device_group
+ self.tp_sync_group = get_tp_group().device_group

  if global_server_args_dict["enable_dp_attention"]:
  self.tp_sync_group = get_attention_tp_group().device_group
@@ -59,7 +59,7 @@ class Sampler(nn.Module):

  # Apply the custom logit processors if registered in the sampling info.
  if sampling_info.has_custom_logit_processor:
- self._apply_custom_logit_processor(logits, sampling_info)
+ apply_custom_logit_processor(logits, sampling_info)

  if self.use_nan_detection and torch.any(torch.isnan(logits)):
  logger.warning("Detected errors during sampling! NaN in the logits.")
@@ -81,54 +81,39 @@ class Sampler(nn.Module):
  probs = logits
  del logits

- if global_server_args_dict["sampling_backend"] == "flashinfer":
- if return_logprob:
- # NOTE: the top_p_renorm_prob from flashinfer has numerical problems,
- # https://github.com/flashinfer-ai/flashinfer/issues/708
- # so we use the torch implementation.
-
- # clamp to avoid -inf
- logprobs = torch.log(
- top_p_normalize_probs_torch(probs, sampling_info.top_ps)
- ).clamp(min=torch.finfo(probs.dtype).min)
-
- max_top_k_round, batch_size = 32, probs.shape[0]
- if sampling_info.need_min_p_sampling:
- probs = top_k_renorm_prob(probs, sampling_info.top_ks)
- probs = top_p_renorm_prob(probs, sampling_info.top_ps)
- batch_next_token_ids = min_p_sampling_from_probs(
- probs, sampling_info.min_ps
- )
- else:
- # Check Nan will throw exception, only check when crash_on_warnings is True
- check_nan = self.use_nan_detection and crash_on_warnings()
- batch_next_token_ids = top_k_top_p_sampling_from_probs(
+ if True: # Keep this redundant check to simplify some internal code sync
+ if global_server_args_dict["sampling_backend"] == "flashinfer":
+ if sampling_info.need_min_p_sampling:
+ probs = top_k_renorm_prob(probs, sampling_info.top_ks)
+ probs = top_p_renorm_prob(probs, sampling_info.top_ps)
+ batch_next_token_ids = min_p_sampling_from_probs(
+ probs, sampling_info.min_ps
+ )
+ else:
+ batch_next_token_ids = top_k_top_p_sampling_from_probs(
+ probs,
+ sampling_info.top_ks,
+ sampling_info.top_ps,
+ filter_apply_order="joint",
+ check_nan=self.use_nan_detection,
+ )
+ elif global_server_args_dict["sampling_backend"] == "pytorch":
+ # A slower fallback implementation with torch native operations.
+ batch_next_token_ids = top_k_top_p_min_p_sampling_from_probs_torch(
  probs,
  sampling_info.top_ks,
  sampling_info.top_ps,
- filter_apply_order="joint",
- check_nan=check_nan,
+ sampling_info.min_ps,
+ sampling_info.need_min_p_sampling,
+ )
+ else:
+ raise ValueError(
+ f"Invalid sampling backend: {global_server_args_dict['sampling_backend']}"
  )

- elif global_server_args_dict["sampling_backend"] == "pytorch":
- # A slower fallback implementation with torch native operations.
- batch_next_token_ids = top_k_top_p_min_p_sampling_from_probs_torch(
- probs,
- sampling_info.top_ks,
- sampling_info.top_ps,
- sampling_info.min_ps,
- sampling_info.need_min_p_sampling,
- )
-
- if return_logprob:
- # clamp to avoid -inf
- logprobs = torch.log(
- top_p_normalize_probs_torch(probs, sampling_info.top_ps)
- ).clamp(min=torch.finfo(probs.dtype).min)
- else:
- raise ValueError(
- f"Invalid sampling backend: {global_server_args_dict['sampling_backend']}"
- )
+ if return_logprob:
+ # clamp to avoid -inf
+ logprobs = torch.log(probs).clamp(min=torch.finfo(probs.dtype).min)

  # Attach logprobs to logits_output (in-place modification)
  if return_logprob:
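
Note on the logprob change above: log-probabilities are now computed directly as torch.log over the already-filtered probabilities and clamped so that zeroed-out entries do not produce -inf. A tiny standalone illustration of that clamp (the values here are made up):

    import torch

    probs = torch.tensor([0.0, 0.25, 0.75])
    # log(0) would be -inf; clamping pins it to the most negative finite
    # float32 value (about -3.4e38) so downstream logprob math stays finite.
    logprobs = torch.log(probs).clamp(min=torch.finfo(probs.dtype).min)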
@@ -165,39 +150,6 @@ class Sampler(nn.Module):

  return batch_next_token_ids

- def _apply_custom_logit_processor(
- self, logits: torch.Tensor, sampling_batch_info: SamplingBatchInfo
- ):
- """Apply custom logit processors to the logits.
- This function will modify the logits in-place."""
-
- assert logits.shape[0] == len(sampling_batch_info), (
- f"The batch size of logits ({logits.shape[0]}) does not match the batch size of "
- f"sampling_batch_info ({len(sampling_batch_info)})"
- )
-
- for _, (
- processor,
- batch_mask,
- ) in sampling_batch_info.custom_logit_processor.items():
- # Get the batch indices that need to be processed
- batch_indices = batch_mask.nonzero(as_tuple=True)[0]
-
- assert batch_mask.shape[0] == len(sampling_batch_info), (
- f"The number of batch mask ({batch_mask.shape[0]}) does not match the number of "
- f"sampling_batch_info ({len(sampling_batch_info)})"
- )
-
- # Apply the processor to the logits
- logits[batch_mask] = processor(
- logits[batch_mask],
- [sampling_batch_info.custom_params[i] for i in batch_indices],
- )
-
- logger.debug(
- f"Custom logit processor {processor.__class__.__name__} is applied."
- )
-

  def top_k_top_p_min_p_sampling_from_probs_torch(
  probs: torch.Tensor,
@@ -226,6 +178,14 @@ def top_k_top_p_min_p_sampling_from_probs_torch(
  return batch_next_token_ids


+ def sampling_from_probs_torch(probs: torch.Tensor):
+ """A sampling implementation with native pytorch operations, without
+ top-k, top-p, or min-p filtering."""
+ sampled_index = torch.multinomial(probs, num_samples=1)
+ batch_next_token_ids = sampled_index.view(-1).to(torch.int32)
+ return batch_next_token_ids
+
+
  def top_p_normalize_probs_torch(
  probs: torch.Tensor,
  top_ps: torch.Tensor,
@@ -264,3 +224,44 @@ def get_token_ids_logprobs(logprobs: torch.Tensor, token_ids_logprobs: List[List
  output_token_ids_logprobs_idx.append([])

  return output_token_ids_logprobs_val, output_token_ids_logprobs_idx
+
+
+ def apply_custom_logit_processor(
+ logits: torch.Tensor,
+ sampling_batch_info: SamplingBatchInfo,
+ num_tokens_in_batch: int = 1,
+ ):
+ """Apply custom logit processors to the logits.
+ This function will modify the logits in-place.
+ num_tokens_in_batch is needed to support spec decoding, where each batch can contain multiple
+ tokens. By default, we assume each batch contains only 1 token.
+ """
+
+ assert logits.shape[0] == len(sampling_batch_info) * num_tokens_in_batch, (
+ f"The batch size of logits ({logits.shape[0]}) does not match the batch size of "
+ f"sampling_batch_info ({len(sampling_batch_info)}) x num_tokens_in_batch "
+ f"({num_tokens_in_batch})"
+ )
+
+ for _, (
+ processor,
+ batch_mask,
+ ) in sampling_batch_info.custom_logit_processor.items():
+ # Get the batch indices that need to be processed
+ batch_indices = batch_mask.nonzero(as_tuple=True)[0]
+
+ assert batch_mask.shape[0] == len(sampling_batch_info), (
+ f"The number of batch mask ({batch_mask.shape[0]}) does not match the number of "
+ f"sampling_batch_info ({len(sampling_batch_info)})"
+ )
+ batch_mask = torch.repeat_interleave(batch_mask, num_tokens_in_batch)
+
+ # Apply the processor to the logits
+ logits[batch_mask] = processor(
+ logits[batch_mask],
+ [sampling_batch_info.custom_params[i] for i in batch_indices],
+ )
+
+ logger.debug(
+ f"Custom logit processor {processor.__class__.__name__} is applied."
+ )
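
For readers tracking this refactor: _apply_custom_logit_processor moves out of Sampler into the module-level apply_custom_logit_processor above, and gains num_tokens_in_batch so speculative decoding (multiple draft tokens per request) can reuse it. Each registered processor is a callable that receives the masked logits rows plus one custom_params entry per selected request and returns a tensor of the same shape. A minimal hypothetical processor sketched against that contract (ban_token_processor and the "banned_token_id" key are illustrative, not part of SGLang):

    import torch

    def ban_token_processor(logits: torch.Tensor, custom_params: list) -> torch.Tensor:
        # With spec decoding there are num_tokens_in_batch contiguous rows per
        # request, so apply each request's params to its block of rows.
        rows_per_req = logits.shape[0] // max(len(custom_params), 1)
        for i, params in enumerate(custom_params):
            banned = params.get("banned_token_id")
            if banned is not None:
                logits[i * rows_per_req : (i + 1) * rows_per_req, banned] = float("-inf")
        return logits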
sglang/srt/layers/utils.py CHANGED
@@ -33,3 +33,9 @@ class PPMissingLayer(torch.nn.Identity):
  """
  input = args[0] if args else next(iter(kwargs.values()))
  return (input,) if self.return_tuple else input
+
+
+ def is_sm100_supported(device=None) -> bool:
+ return (torch.cuda.get_device_capability(device)[0] == 10) and (
+ torch.version.cuda >= "12.8"
+ )
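
The new is_sm100_supported helper gates code paths on a Blackwell-class GPU (compute capability 10.x) plus CUDA 12.8 or newer. A hedged usage sketch, assuming the helper lives in sglang.srt.layers.utils as the hunk above suggests (the backend names are invented for illustration):

    import torch
    from sglang.srt.layers.utils import is_sm100_supported

    def pick_gemm_backend() -> str:
        # Prefer an SM100-only kernel when supported, otherwise fall back.
        if torch.cuda.is_available() and is_sm100_supported():
            return "sm100_kernel"
        return "generic_fallback"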
sglang/srt/lora/layers.py CHANGED
@@ -137,7 +137,7 @@ class MergedColumnParallelLinearWithLoRA(ColumnParallelLinearWithLoRA):
  self.A_buffer_gate_up = A_buffer
  if self.lora_backend.fuse_stacked_lora_b:
  # B_buffer_gate_up: (num_lora, 2 * output_dim, r)
- if not hasattr(self, "B_buffer_gate_up") or self.B_buffer_gate_up is None:
+ if getattr(self, "B_buffer_gate_up", None) is None:
  self.B_buffer_gate_up = torch.empty(
  (
  B_buffer[0].shape[0],
@@ -202,7 +202,7 @@ class QKVParallelLinearWithLoRA(ColumnParallelLinearWithLoRA):
  output_dim_q, output_dim_kv = B_buffer_q.shape[-2], B_buffer_kv.shape[-2]

  # B_buffer_qkv: (num_lora, output_dim_q + 2 * output_dim_kv, r)
- if not hasattr(self, "B_buffer_qkv") or self.B_buffer_qkv is None:
+ if getattr(self, "B_buffer_qkv", None) is None:
  self.B_buffer_qkv = torch.empty(
  (
  B_buffer_q[0].shape[0],
@@ -221,20 +221,17 @@ class QKVParallelLinearWithLoRA(ColumnParallelLinearWithLoRA):
  )

  # Offsets of q/k/v in output dimension
- if not hasattr(self, "output_offset") or self.output_offset is None:
- self.output_offset = torch.empty(
- 4, dtype=torch.int32, device=B_buffer_q.device
+ if getattr(self, "output_offset", None) is None:
+ self.output_offset = torch.tensor(
+ [
+ 0,
+ output_dim_q,
+ output_dim_q + output_dim_kv,
+ output_dim_q + 2 * output_dim_kv,
+ ],
+ dtype=torch.int32,
+ device=B_buffer_q.device,
  )
- self.output_offset[:4] = torch.tensor(
- [
- 0,
- output_dim_q,
- output_dim_q + output_dim_kv,
- output_dim_q + 2 * output_dim_kv,
- ],
- dtype=torch.int32,
- device=B_buffer_q.device,
- )
  # For computing number of launched blocks
  self.max_qkv_out_dim = max(output_dim_q, output_dim_kv)
  else:
sglang/srt/lora/lora.py CHANGED
@@ -92,11 +92,12 @@ class LoRAAdapter(nn.Module):
  for i in range(self.base_hf_config.num_hidden_layers):
  layer = self.layers[i]
  weight_names = [name for name, _ in layer.weights.items()]
- self.stack_qkv_proj(weight_names, layer.weights)
- self.stack_gate_up_proj(weight_names, layer.weights)
-
- def stack_qkv_proj(self, weight_names: List[str], weights: Dict[str, torch.Tensor]):
+ self.normalize_qkv_proj(weight_names, layer.weights)
+ self.normalize_gate_up_proj(weight_names, layer.weights)

+ def normalize_qkv_proj(
+ self, weight_names: List[str], weights: Dict[str, torch.Tensor]
+ ):
  # Collect target q/k/v modules. This process is necessary since there might be no lora attached to k_proj
  target_module = set()
  for weight_name in weight_names:
@@ -106,6 +107,8 @@ class LoRAAdapter(nn.Module):
  target_module.add("q_proj")
  if "v_proj" in weight_name:
  target_module.add("v_proj")
+ if "qkv_proj" in weight_name:
+ target_module.add("qkv_proj")
  if len(target_module) == 0:
  return

@@ -148,8 +151,35 @@ class LoRAAdapter(nn.Module):
  if "k_proj" in target_module:
  weights.pop(k_name)
  weights.pop(v_name)
+ elif "qkv_proj" in weight_name:
+ # If qkv_proj is already stacked, we normalize it following the SGL convention.
+ qkv_name = weight_name
+ q_name = weight_name.replace("qkv_proj", "q_proj")
+ k_name = weight_name.replace("qkv_proj", "k_proj")
+ v_name = weight_name.replace("qkv_proj", "v_proj")
+ kv_name = weight_name.replace("qkv_proj", "kv_proj")
+ if "lora_A" in weight_name:
+ weights[qkv_name] = weights[qkv_name].repeat(3, 1)
+ else:
+ head_size = (
+ self.base_hf_config.hidden_size
+ // self.base_hf_config.num_attention_heads
+ )
+ weights[q_name], k_proj_weight, v_proj_weight = torch.split(
+ weights[qkv_name],
+ [
+ head_size * self.base_hf_config.num_attention_heads,
+ head_size * self.base_hf_config.num_key_value_heads,
+ head_size * self.base_hf_config.num_key_value_heads,
+ ],
+ dim=0,
+ )
+ weights[kv_name] = torch.stack(
+ [k_proj_weight, v_proj_weight],
+ dim=0,
+ )

- def stack_gate_up_proj(
+ def normalize_gate_up_proj(
  self, weight_names: List[str], weights: Dict[str, torch.Tensor]
  ):
  for weight_name in weight_names:
@@ -179,3 +209,17 @@ class LoRAAdapter(nn.Module):
  weights.pop(weight_name)
  if up_name in weights:
  weights.pop(up_name)
+ elif "gate_up_proj" in weight_name:
+ # If gate_up_proj is already stacked, we normalize it following the SGL convention
+ gate_up_name = weight_name
+ if "lora_A" in weight_name:
+ weights[gate_up_name] = weights[gate_up_name].repeat(2, 1)
+ else:
+ output_dim = weights[gate_up_name].shape[0] // 2
+ weights[gate_up_name] = torch.stack(
+ [
+ weights[gate_up_name][:output_dim, :],
+ weights[gate_up_name][output_dim:, :],
+ ],
+ dim=0,
+ )
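
To make the qkv_proj normalization above concrete, here is a worked split with illustrative config values (hidden_size, head counts, and rank are hypothetical, not taken from the diff): with hidden_size=4096, 32 attention heads, and 8 KV heads, head_size is 128, so a stacked lora_B weight splits into 4096 q rows and 1024 rows each for k and v, and k/v are then stacked into the SGL "kv_proj" layout.

    import torch

    hidden_size, num_attention_heads, num_key_value_heads, rank = 4096, 32, 8, 16
    head_size = hidden_size // num_attention_heads  # 128

    # Stacked lora_B for qkv_proj: q, k, v rows concatenated along dim 0.
    qkv_lora_b = torch.randn(head_size * (num_attention_heads + 2 * num_key_value_heads), rank)

    q_w, k_w, v_w = torch.split(
        qkv_lora_b,
        [
            head_size * num_attention_heads,   # 4096 rows for q_proj
            head_size * num_key_value_heads,   # 1024 rows for k_proj
            head_size * num_key_value_heads,   # 1024 rows for v_proj
        ],
        dim=0,
    )
    kv_w = torch.stack([k_w, v_w], dim=0)  # (2, 1024, rank), the stacked kv_proj layout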
sglang/srt/lora/lora_manager.py CHANGED
@@ -32,7 +32,7 @@ from sglang.srt.lora.utils import (
  LoRAType,
  get_customized_names_from_hf_names,
  get_layer_id,
- get_stacked_name,
+ get_normalized_lora_weight_names,
  get_weight_name,
  )
  from sglang.srt.model_executor.forward_batch_info import ForwardBatch
@@ -101,10 +101,13 @@ class LoRAManager:
  self.hf_target_names.update(self.configs[name].target_modules)

  # Target lora weight names for lora_a and lora_b modules respectively.
- # e.g., {("qkv_proj", "q_proj"), ("qkv_proj", "kv_proj")}
- self.lora_weight_names: Set[Tuple[str]] = set(
- [get_stacked_name(module) for module in self.hf_target_names]
- )
+ weights_A: List[str] = []
+ weights_B: List[str] = []
+ for module in self.hf_target_names:
+ lora_A, lora_B = get_normalized_lora_weight_names(module)
+ weights_A += lora_A
+ weights_B += lora_B
+ self.lora_weight_names: Tuple[Set[str]] = set(weights_A), set(weights_B)

  # load all weights to cpu
  self.loras: Dict[str, LoRAAdapter] = {}
@@ -263,7 +266,18 @@ class LoRAManager:
  self.lora_modules: Dict[int, List[Tuple[str, BaseLayerWithLoRA]]] = {
  i: [] for i in range(self.base_hf_config.num_hidden_layers)
  }
+
  for module_name, module in self.base_model.named_modules():
+ # TODO (lifuhuang): in the future, we should consider generalizing the
+ # should_apply_lora function to support mapping by full module name instead
+ # of just the last part (e.g., "qkv_proj") to support scenarios with multiple
+ # attention stacks (e.g., multimodal models).
+ # See: https://github.com/sgl-project/sglang/issues/6608
+ if getattr(
+ self.base_model, "should_apply_lora", None
+ ) and not self.base_model.should_apply_lora(module_name):
+ continue
+
  # The module should be converted if it is included in target_names
  if module_name.split(".")[-1] in customized_target_names:
  layer_id = get_layer_id(module_name)
sglang/srt/lora/mem_pool.py CHANGED
@@ -91,18 +91,16 @@ class LoRAMemoryPool:

  def init_buffers(
  self,
- lora_weight_names: Set[Tuple[str]],
+ lora_weight_names: Tuple[Set[str]],
  base_model: torch.nn.Module,
  ):

  # lora_weight_names is a set of name pairs indicating each pair of lora modules to load
  # e.g., {("qkv_proj", "q_proj"), ("qkv_proj", "kv_proj"), ("o_proj", "o_proj")}
- self.lora_weight_names: Set[Tuple[str]] = lora_weight_names
+ self.lora_weight_names: Tuple[Set[str]] = lora_weight_names
  device = next(base_model.parameters()).device
- lora_module_A_names = set([name[0] for name in lora_weight_names])
- lora_module_B_names = set([name[1] for name in lora_weight_names])
  # Init A tensor, column_major=False
- for module_A in lora_module_A_names:
+ for module_A in lora_weight_names[0]:
  lora_A_shape = self.get_lora_A_shape(module_A, base_model)
  self.A_buffer[module_A] = [
  torch.empty(
@@ -110,10 +108,10 @@ class LoRAMemoryPool:
  dtype=self.dtype,
  device=device,
  )
- for i in range(self.num_layer)
+ for _ in range(self.num_layer)
  ]
  # Init B tensor, column_major=True
- for module_B in lora_module_B_names:
+ for module_B in lora_weight_names[1]:
  lora_B_shape = self.get_lora_B_shape(module_B, base_model)
  self.B_buffer[module_B] = [
  torch.empty(
@@ -159,6 +157,10 @@ class LoRAMemoryPool:
     def load_lora_weight_to_buffer(
         self, uid: str, buffer_id: int, lora_adapter: LoRAAdapter = None
     ):
+        def check_lora_weight_shape(buffer_view: torch.Tensor, weight: torch.Tensor):
+            assert (
+                buffer_view.shape == weight.shape
+            ), f"LoRA buffer shape {buffer_view.shape} does not match weight shape {weight.shape}."
 
         if uid is None:
             for i in range(self.num_layer):
@@ -210,21 +212,27 @@ class LoRAMemoryPool:
 
         for name, weights in temp_A_buffer.items():
             c = get_stacked_multiply(name)
-            self.A_buffer[name][layer_id][buffer_id][: lora_rank * c, :].copy_(
-                weights
-            )
+            buffer_view = self.A_buffer[name][layer_id][buffer_id][
+                : lora_rank * c, :
+            ]
+            check_lora_weight_shape(buffer_view, weights)
+            buffer_view.copy_(weights)
 
         for name, weights in temp_B_buffer.items():
             c = get_stacked_multiply(name)
             if c > 1:
                 for stacked_id in range(c):
-                    self.B_buffer[name][layer_id][stacked_id][buffer_id][
-                        :, :lora_rank
-                    ].copy_(weights[stacked_id])
+                    buffer_view = self.B_buffer[name][layer_id][stacked_id][
+                        buffer_id
+                    ][:, :lora_rank]
+                    check_lora_weight_shape(buffer_view, weights[stacked_id])
+                    buffer_view.copy_(weights[stacked_id])
             else:
-                self.B_buffer[name][layer_id][0][buffer_id][:, :lora_rank].copy_(
-                    weights
-                )
+                buffer_view = self.B_buffer[name][layer_id][0][buffer_id][
+                    :, :lora_rank
+                ]
+                check_lora_weight_shape(buffer_view, weights)
+                buffer_view.copy_(weights)
 
     def get_tensor(
         self, weight_name: str, layer_id: int, lora_type: LoRAType
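Note: the refactor names the sliced pool region (buffer_view) so its shape can be validated before the in-place copy. A self-contained sketch of that pattern with dummy shapes (values are illustrative, not from sglang):

import torch

def check_lora_weight_shape(buffer_view: torch.Tensor, weight: torch.Tensor):
    assert (
        buffer_view.shape == weight.shape
    ), f"LoRA buffer shape {buffer_view.shape} does not match weight shape {weight.shape}."

max_rank, input_dim, lora_rank = 32, 64, 16
A_buffer = torch.empty(max_rank, input_dim)   # pre-allocated pool slot
weights = torch.randn(lora_rank, input_dim)   # adapter weights for one layer

buffer_view = A_buffer[:lora_rank, :]         # slice the region this adapter uses
check_lora_weight_shape(buffer_view, weights) # fail fast on rank/dim mismatches
buffer_view.copy_(weights)                    # in-place copy into the pool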
sglang/srt/lora/utils.py CHANGED
@@ -1,7 +1,7 @@
 import re
 from dataclasses import dataclass
 from enum import Enum
-from typing import Optional, Set, Tuple
+from typing import List, Optional, Set, Tuple
 
 import torch
 
@@ -106,18 +106,22 @@ def get_hidden_dim(
     raise NotImplementedError()
 
 
-def get_stacked_name(name: str) -> Tuple[str]:
+def get_normalized_lora_weight_names(name: str) -> Tuple[List[str], List[str]]:
     """
-    Mapping a target module name to (stacked name for Lora A, stacked name for Lora B)
+    Mapping a target module name to names of the normalized LoRA weights.
+    Returned tuple contains (names for LoRA A, names for LoRA B).
     """
     params_mapping = {
-        "q_proj": ("qkv_proj", "q_proj"),
-        "k_proj": ("qkv_proj", "kv_proj"),
-        "v_proj": ("qkv_proj", "kv_proj"),
-        "gate_proj": ("gate_up_proj", "gate_up_proj"),
-        "up_proj": ("gate_up_proj", "gate_up_proj"),
+        "q_proj": (["qkv_proj"], ["q_proj"]),
+        "k_proj": (["qkv_proj"], ["kv_proj"]),
+        "v_proj": (["qkv_proj"], ["kv_proj"]),
+        "gate_proj": (["gate_up_proj"], ["gate_up_proj"]),
+        "up_proj": (["gate_up_proj"], ["gate_up_proj"]),
+        "qkv_proj": (["qkv_proj"], ["q_proj", "kv_proj"]),
+        "gate_up_proj": (["gate_up_proj"], ["gate_up_proj"]),
     }
-    return params_mapping.get(name, (name, name))
+    stacked = params_mapping.get(name, ([name], [name]))
+    return stacked
 
 
 def get_stacked_multiply(module_name: str) -> int:
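Note: with sglang 0.4.7 installed, the new helper can be exercised directly; the expected results below follow from the mapping shown in this hunk:

from sglang.srt.lora.utils import get_normalized_lora_weight_names

assert get_normalized_lora_weight_names("q_proj") == (["qkv_proj"], ["q_proj"])
assert get_normalized_lora_weight_names("qkv_proj") == (["qkv_proj"], ["q_proj", "kv_proj"])
# Unmapped modules fall through unchanged.
assert get_normalized_lora_weight_names("down_proj") == (["down_proj"], ["down_proj"])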
@@ -133,7 +137,7 @@ def get_stacked_multiply(module_name: str) -> int:
 
 
 def get_weight_name(
-    target_name: str, lora_weight_names: Set[Tuple[str]], lora_type: LoRAType
+    target_name: str, lora_weight_names: Tuple[Set[str]], lora_type: LoRAType
 ) -> Optional[str]:
     """
     target_name is name of a given module,
@@ -142,9 +146,9 @@
     Else raise ValueError.
     """
     idx = 0 if lora_type == LoRAType.LORA_A else 1
-    for weight_name_pair in lora_weight_names:
-        if weight_name_pair[idx] in target_name:
-            return weight_name_pair[idx]
+    for weight_name in lora_weight_names[idx]:
+        if weight_name in target_name:
+            return weight_name
     raise ValueError(
         f"Cannot find weight name for {target_name} in {lora_weight_names}"
     )
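Note: lookups now index the (A-names, B-names) tuple by LoRA type before substring-matching the module name. A usage sketch against sglang 0.4.7 (the module path string and name sets are illustrative):

from sglang.srt.lora.utils import LoRAType, get_weight_name

# Illustrative (A-names, B-names) tuple, e.g. as built by LoRAManager.
lora_weight_names = ({"qkv_proj", "o_proj"}, {"q_proj", "kv_proj", "o_proj"})
target = "model.layers.0.self_attn.qkv_proj"

print(get_weight_name(target, lora_weight_names, LoRAType.LORA_A))  # qkv_proj
print(get_weight_name(target, lora_weight_names, LoRAType.LORA_B))  # kv_proj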
@@ -248,12 +248,20 @@ class DataParallelController:
 
     def round_robin_scheduler(self, req: Req):
         if self.server_args.disaggregation_mode == "null":
-            self.workers[self.round_robin_counter].send_pyobj(req)
-            self.round_robin_counter = (self.round_robin_counter + 1) % len(
-                self.workers
-            )
+            if req.data_parallel_rank is not None:
+                logger.debug(f"Direct routing to DP rank {req.data_parallel_rank}")
+                self.workers[req.data_parallel_rank].send_pyobj(req)
+            else:
+                self.workers[self.round_robin_counter].send_pyobj(req)
+                self.round_robin_counter = (self.round_robin_counter + 1) % len(
+                    self.workers
+                )
         else:
-            self.workers[req.bootstrap_room % len(self.workers)].send_pyobj(req)
+            if req.data_parallel_rank is not None:
+                logger.debug(f"Direct routing to DP rank {req.data_parallel_rank}")
+                self.workers[req.data_parallel_rank].send_pyobj(req)
+            else:
+                self.workers[req.bootstrap_room % len(self.workers)].send_pyobj(req)
 
     def shortest_queue_scheduler(self, input_requests):
         raise NotImplementedError()
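Note: in both branches an explicit req.data_parallel_rank now takes precedence; otherwise the old behavior applies (round-robin when not disaggregated, bootstrap-room hashing when disaggregated). A standalone sketch of just the selection rule (FakeReq, the worker count, and the flag are stand-ins, not sglang types):

from dataclasses import dataclass
from typing import Optional, Tuple

@dataclass
class FakeReq:
    data_parallel_rank: Optional[int] = None
    bootstrap_room: int = 0

def pick_worker(
    req: FakeReq, num_workers: int, rr_counter: int, disaggregated: bool
) -> Tuple[int, int]:
    """Return (chosen worker index, updated round-robin counter)."""
    if req.data_parallel_rank is not None:
        return req.data_parallel_rank, rr_counter  # explicit DP rank wins
    if not disaggregated:
        return rr_counter, (rr_counter + 1) % num_workers  # round-robin
    return req.bootstrap_room % num_workers, rr_counter  # hash by bootstrap room

print(pick_worker(FakeReq(data_parallel_rank=2), 4, 0, False))  # (2, 0)
print(pick_worker(FakeReq(), 4, 0, False))                      # (0, 1)
print(pick_worker(FakeReq(bootstrap_room=7), 4, 0, True))       # (3, 0)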
@@ -0,0 +1,63 @@
+from enum import Enum, auto
+from typing import Optional
+
+import torch
+
+from sglang.srt.managers.eplb_algorithms import deepseek, deepseek_vec
+
+
+class EplbAlgorithm(Enum):
+    deepseek = auto()
+    deepseek_hierarchical = auto()
+    deepseek_vec = auto()
+    deepseek_vec_hierarchical = auto()
+    # TODO may have more algorithm later
+
+
+def rebalance_experts(
+    tokens_per_expert: torch.Tensor,
+    num_physical_experts: int,
+    num_local_physical_experts: int,
+    num_groups: Optional[int],
+    num_nodes: int,
+    algorithm: EplbAlgorithm,
+):
+    if algorithm in [EplbAlgorithm.deepseek, EplbAlgorithm.deepseek_hierarchical]:
+        return deepseek.rebalance_experts(
+            weight=tokens_per_expert.sum(dim=0),
+            num_replicas=num_physical_experts,
+            num_groups=num_groups,
+            num_nodes=num_nodes,
+            num_gpus=num_physical_experts // num_local_physical_experts,
+            enable_hierarchical=algorithm == EplbAlgorithm.deepseek_hierarchical,
+        )
+
+    if algorithm in [
+        EplbAlgorithm.deepseek_vec,
+        EplbAlgorithm.deepseek_vec_hierarchical,
+    ]:
+        return deepseek_vec.rebalance_experts(
+            tokens_per_expert=tokens_per_expert,
+            num_physical_experts=num_physical_experts,
+            num_local_physical_experts=num_local_physical_experts,
+            num_groups=num_groups,
+            num_nodes=num_nodes,
+            enable_hierarchical=algorithm == EplbAlgorithm.deepseek_vec_hierarchical,
+        )
+
+
+    raise NotImplementedError
+
+
+def compute_algorithm(
+    raw_algorithm: str,
+    num_groups: Optional[int],
+    num_nodes: int,
+) -> EplbAlgorithm:
+    if raw_algorithm != "auto":
+        return EplbAlgorithm[raw_algorithm]
+    # TODO test on real scenarios and know which ones perform better
+    if (num_groups is not None) and (num_groups % num_nodes == 0):
+        return EplbAlgorithm.deepseek_hierarchical
+    else:
+        return EplbAlgorithm.deepseek
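Note: assuming this new file is the eplb_algorithms package __init__ (its own imports suggest the sglang.srt.managers.eplb_algorithms path), the "auto" selection can be exercised directly; the expected values follow from compute_algorithm above:

from sglang.srt.managers.eplb_algorithms import EplbAlgorithm, compute_algorithm

# "auto" prefers the hierarchical variant only when expert groups divide evenly across nodes.
print(compute_algorithm("auto", num_groups=8, num_nodes=2))     # EplbAlgorithm.deepseek_hierarchical
print(compute_algorithm("auto", num_groups=None, num_nodes=2))  # EplbAlgorithm.deepseek
print(compute_algorithm("deepseek_vec", num_groups=None, num_nodes=1))  # EplbAlgorithm.deepseek_vec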