sglang 0.4.6.post4__py3-none-any.whl → 0.4.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (358)
  1. sglang/bench_offline_throughput.py +16 -10
  2. sglang/bench_one_batch.py +5 -4
  3. sglang/bench_one_batch_server.py +86 -22
  4. sglang/bench_serving.py +197 -110
  5. sglang/compile_deep_gemm.py +4 -4
  6. sglang/lang/backend/runtime_endpoint.py +24 -1
  7. sglang/profiler.py +167 -0
  8. sglang/srt/_custom_ops.py +34 -0
  9. sglang/srt/configs/internvl.py +8 -12
  10. sglang/srt/configs/model_config.py +66 -29
  11. sglang/srt/constrained/base_grammar_backend.py +5 -2
  12. sglang/srt/constrained/llguidance_backend.py +9 -8
  13. sglang/srt/constrained/outlines_backend.py +5 -4
  14. sglang/srt/constrained/xgrammar_backend.py +18 -18
  15. sglang/srt/conversation.py +47 -9
  16. sglang/srt/custom_op.py +38 -3
  17. sglang/srt/debug_utils.py +74 -0
  18. sglang/srt/disaggregation/common/__init__.py +1 -0
  19. sglang/srt/disaggregation/common/conn.py +407 -0
  20. sglang/srt/disaggregation/decode.py +187 -134
  21. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +142 -0
  22. sglang/srt/disaggregation/fake/conn.py +4 -13
  23. sglang/srt/disaggregation/kv_events.py +412 -0
  24. sglang/srt/disaggregation/launch_lb.py +140 -0
  25. sglang/srt/disaggregation/mini_lb.py +84 -70
  26. sglang/srt/disaggregation/mooncake/conn.py +441 -140
  27. sglang/srt/disaggregation/mooncake/transfer_engine.py +31 -14
  28. sglang/srt/disaggregation/nixl/conn.py +124 -442
  29. sglang/srt/disaggregation/prefill.py +128 -44
  30. sglang/srt/disaggregation/utils.py +154 -6
  31. sglang/srt/distributed/device_communicators/pymscclpp.py +315 -0
  32. sglang/srt/distributed/parallel_state.py +52 -5
  33. sglang/srt/distributed/utils.py +3 -3
  34. sglang/srt/entrypoints/EngineBase.py +11 -0
  35. sglang/srt/entrypoints/engine.py +129 -12
  36. sglang/srt/entrypoints/http_server.py +21 -6
  37. sglang/srt/entrypoints/http_server_engine.py +5 -2
  38. sglang/srt/function_call/base_format_detector.py +302 -0
  39. sglang/srt/function_call/core_types.py +34 -0
  40. sglang/srt/function_call/deepseekv3_detector.py +205 -0
  41. sglang/srt/function_call/ebnf_composer.py +248 -0
  42. sglang/srt/function_call/function_call_parser.py +202 -0
  43. sglang/srt/function_call/llama32_detector.py +93 -0
  44. sglang/srt/function_call/mistral_detector.py +131 -0
  45. sglang/srt/function_call/pythonic_detector.py +229 -0
  46. sglang/srt/function_call/qwen25_detector.py +121 -0
  47. sglang/srt/function_call/utils.py +52 -0
  48. sglang/srt/hf_transformers_utils.py +50 -7
  49. sglang/srt/layers/attention/aiter_backend.py +878 -0
  50. sglang/srt/layers/attention/base_attn_backend.py +4 -0
  51. sglang/srt/layers/attention/cutlass_mla_backend.py +2 -19
  52. sglang/srt/layers/attention/flashattention_backend.py +166 -35
  53. sglang/srt/layers/attention/flashinfer_backend.py +45 -1
  54. sglang/srt/layers/attention/flashinfer_mla_backend.py +45 -5
  55. sglang/srt/layers/attention/flashmla_backend.py +340 -78
  56. sglang/srt/layers/attention/intel_amx_backend.py +128 -0
  57. sglang/srt/layers/attention/tbo_backend.py +232 -0
  58. sglang/srt/layers/attention/torch_native_backend.py +3 -0
  59. sglang/srt/layers/attention/triton_backend.py +247 -5
  60. sglang/srt/layers/attention/triton_ops/extend_attention.py +12 -4
  61. sglang/srt/layers/attention/utils.py +2 -2
  62. sglang/srt/layers/attention/vision.py +1 -1
  63. sglang/srt/layers/communicator.py +517 -0
  64. sglang/srt/layers/dp_attention.py +6 -15
  65. sglang/srt/layers/layernorm.py +30 -19
  66. sglang/srt/layers/moe/cutlass_moe.py +370 -0
  67. sglang/srt/layers/moe/cutlass_moe_params.py +169 -0
  68. sglang/srt/layers/moe/ep_moe/kernels.py +60 -17
  69. sglang/srt/layers/moe/ep_moe/layer.py +195 -87
  70. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +88 -8
  71. sglang/srt/layers/moe/fused_moe_native.py +4 -0
  72. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  73. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  74. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  75. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  76. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  77. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  78. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  79. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  80. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +220 -25
  81. sglang/srt/layers/moe/fused_moe_triton/layer.py +48 -4
  82. sglang/srt/layers/moe/topk.py +107 -24
  83. sglang/srt/layers/multimodal.py +70 -0
  84. sglang/srt/layers/quantization/__init__.py +10 -4
  85. sglang/srt/layers/quantization/blockwise_int8.py +3 -0
  86. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +5 -0
  87. sglang/srt/layers/quantization/deep_gemm.py +60 -59
  88. sglang/srt/layers/quantization/fp8.py +113 -18
  89. sglang/srt/layers/quantization/fp8_kernel.py +118 -66
  90. sglang/srt/layers/quantization/fp8_utils.py +165 -43
  91. sglang/srt/layers/quantization/gptq.py +298 -6
  92. sglang/srt/layers/quantization/int8_kernel.py +18 -5
  93. sglang/srt/layers/quantization/modelopt_quant.py +334 -7
  94. sglang/srt/layers/quantization/moe_wna16.py +3 -0
  95. sglang/srt/layers/quantization/qoq.py +244 -0
  96. sglang/srt/layers/quantization/w8a8_fp8.py +3 -0
  97. sglang/srt/layers/quantization/w8a8_int8.py +3 -0
  98. sglang/srt/layers/rotary_embedding.py +6 -12
  99. sglang/srt/layers/sampler.py +80 -79
  100. sglang/srt/layers/utils.py +6 -0
  101. sglang/srt/lora/layers.py +12 -15
  102. sglang/srt/lora/lora.py +49 -5
  103. sglang/srt/lora/lora_manager.py +20 -8
  104. sglang/srt/lora/mem_pool.py +24 -16
  105. sglang/srt/lora/utils.py +17 -13
  106. sglang/srt/managers/data_parallel_controller.py +13 -5
  107. sglang/srt/managers/eplb_algorithms/__init__.py +63 -0
  108. sglang/srt/managers/eplb_algorithms/deepseek.py +223 -0
  109. sglang/srt/managers/eplb_algorithms/deepseek_vec.py +276 -0
  110. sglang/srt/managers/eplb_manager.py +96 -0
  111. sglang/srt/managers/expert_distribution.py +878 -56
  112. sglang/srt/managers/expert_location.py +448 -0
  113. sglang/srt/managers/expert_location_dispatch.py +108 -0
  114. sglang/srt/managers/io_struct.py +29 -5
  115. sglang/srt/managers/mm_utils.py +355 -151
  116. sglang/srt/managers/multimodal_processors/base_processor.py +299 -42
  117. sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +6 -1
  118. sglang/srt/managers/multimodal_processors/gemma3.py +15 -17
  119. sglang/srt/managers/multimodal_processors/internvl.py +18 -5
  120. sglang/srt/managers/multimodal_processors/janus_pro.py +7 -1
  121. sglang/srt/managers/multimodal_processors/kimi_vl.py +14 -32
  122. sglang/srt/managers/multimodal_processors/llava.py +3 -3
  123. sglang/srt/managers/multimodal_processors/minicpm.py +27 -32
  124. sglang/srt/managers/multimodal_processors/mllama4.py +6 -0
  125. sglang/srt/managers/multimodal_processors/phi4mm.py +87 -0
  126. sglang/srt/managers/multimodal_processors/pixtral.py +9 -9
  127. sglang/srt/managers/multimodal_processors/qwen_vl.py +35 -35
  128. sglang/srt/managers/schedule_batch.py +185 -55
  129. sglang/srt/managers/schedule_policy.py +4 -5
  130. sglang/srt/managers/scheduler.py +389 -154
  131. sglang/srt/managers/session_controller.py +1 -1
  132. sglang/srt/managers/tokenizer_manager.py +231 -39
  133. sglang/srt/managers/utils.py +0 -4
  134. sglang/srt/mem_cache/base_prefix_cache.py +3 -0
  135. sglang/srt/mem_cache/chunk_cache.py +3 -1
  136. sglang/srt/mem_cache/hiradix_cache.py +4 -4
  137. sglang/srt/mem_cache/memory_pool.py +74 -52
  138. sglang/srt/mem_cache/multimodal_cache.py +45 -0
  139. sglang/srt/mem_cache/radix_cache.py +58 -5
  140. sglang/srt/metrics/collector.py +11 -2
  141. sglang/srt/mm_utils.py +10 -0
  142. sglang/srt/model_executor/cuda_graph_runner.py +87 -65
  143. sglang/srt/model_executor/expert_location_updater.py +557 -0
  144. sglang/srt/model_executor/forward_batch_info.py +39 -14
  145. sglang/srt/model_executor/model_runner.py +231 -101
  146. sglang/srt/model_loader/loader.py +10 -6
  147. sglang/srt/model_loader/utils.py +67 -1
  148. sglang/srt/models/clip.py +5 -1
  149. sglang/srt/models/deepseek_nextn.py +1 -1
  150. sglang/srt/models/deepseek_v2.py +732 -403
  151. sglang/srt/models/exaone.py +8 -3
  152. sglang/srt/models/gemma3_causal.py +7 -0
  153. sglang/srt/models/gemma3_mm.py +75 -33
  154. sglang/srt/models/idefics2.py +342 -0
  155. sglang/srt/models/kimi_vl.py +4 -4
  156. sglang/srt/models/llama.py +1 -1
  157. sglang/srt/models/llama4.py +10 -2
  158. sglang/srt/models/llava.py +26 -18
  159. sglang/srt/models/mimo_mtp.py +220 -0
  160. sglang/srt/models/minicpmo.py +7 -17
  161. sglang/srt/models/minicpmv.py +3 -295
  162. sglang/srt/models/mistral.py +71 -1
  163. sglang/srt/models/mllama.py +3 -3
  164. sglang/srt/models/phi4mm.py +512 -0
  165. sglang/srt/models/qwen2.py +133 -35
  166. sglang/srt/models/qwen2_5_vl.py +5 -3
  167. sglang/srt/models/qwen2_eagle.py +4 -1
  168. sglang/srt/models/qwen2_moe.py +206 -69
  169. sglang/srt/models/qwen2_vl.py +3 -3
  170. sglang/srt/models/qwen3.py +92 -19
  171. sglang/srt/models/qwen3_moe.py +457 -55
  172. sglang/srt/models/registry.py +9 -1
  173. sglang/srt/models/siglip.py +294 -0
  174. sglang/srt/models/transformers.py +291 -0
  175. sglang/srt/openai_api/adapter.py +114 -40
  176. sglang/srt/openai_api/protocol.py +37 -2
  177. sglang/srt/openai_api/utils.py +172 -0
  178. sglang/srt/operations.py +189 -0
  179. sglang/srt/operations_strategy.py +207 -0
  180. sglang/srt/sampling/sampling_batch_info.py +13 -1
  181. sglang/srt/sampling/sampling_params.py +2 -1
  182. sglang/srt/server_args.py +235 -38
  183. sglang/srt/speculative/build_eagle_tree.py +8 -8
  184. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +8 -11
  185. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +253 -0
  186. sglang/srt/speculative/eagle_utils.py +181 -90
  187. sglang/srt/speculative/eagle_worker.py +146 -21
  188. sglang/srt/two_batch_overlap.py +635 -0
  189. sglang/srt/utils.py +197 -19
  190. sglang/test/runners.py +16 -7
  191. sglang/test/send_one.py +4 -0
  192. sglang/test/test_cutlass_moe.py +278 -0
  193. sglang/test/test_fp4_moe.py +248 -0
  194. sglang/test/test_utils.py +81 -42
  195. sglang/utils.py +2 -2
  196. sglang/version.py +1 -1
  197. {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/METADATA +31 -19
  198. sglang-0.4.7.dist-info/RECORD +699 -0
  199. {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/WHEEL +1 -1
  200. sglang/srt/function_call_parser.py +0 -858
  201. sglang/srt/platforms/interface.py +0 -371
  202. sglang-0.4.6.post4.dist-info/RECORD +0 -646
  203. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  204. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  205. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  206. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  207. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  208. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  209. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  210. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  211. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  212. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  213. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  214. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  215. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  216. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H200.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json} +0 -0
  217. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  218. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  219. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  220. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  221. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  222. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  223. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  224. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  225. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  226. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  227. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  228. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  229. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  230. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  231. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  232. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  233. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  234. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  235. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  236. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  237. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  238. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  239. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  240. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  241. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  242. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  243. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  244. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  245. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json} +0 -0
  246. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  247. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  248. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  249. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  250. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  251. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  252. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  253. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  254. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  255. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json} +0 -0
  256. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json} +0 -0
  257. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  258. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  259. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  260. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  261. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  262. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  263. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json} +0 -0
  264. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  265. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  266. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json} +0 -0
  267. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  268. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  269. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  270. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json} +0 -0
  271. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  272. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  273. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  274. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
  275. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  276. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  277. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  278. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json} +0 -0
  279. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json} +0 -0
  280. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json} +0 -0
  281. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json} +0 -0
  282. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  283. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  284. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json} +0 -0
  285. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json} +0 -0
  286. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json} +0 -0
  287. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json} +0 -0
  288. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  289. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  290. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  291. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  292. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json} +0 -0
  293. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  294. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  295. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  296. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  297. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json} +0 -0
  298. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json} +0 -0
  299. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json} +0 -0
  300. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json} +0 -0
  301. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  302. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  303. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
  304. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  305. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  306. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  307. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json} +0 -0
  308. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_L40S.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json} +0 -0
  309. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
  310. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
  311. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
  312. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  313. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  314. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  315. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  316. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json} +0 -0
  317. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json} +0 -0
  318. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json} +0 -0
  319. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json} +0 -0
  320. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  321. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  322. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  323. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  324. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json} +0 -0
  325. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
  326. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
  327. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
  328. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  329. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  330. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  331. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  332. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json} +0 -0
  333. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json} +0 -0
  334. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  335. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  336. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json} +0 -0
  337. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  338. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json} +0 -0
  339. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  340. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  341. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  342. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  343. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json} +0 -0
  344. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  345. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json} +0 -0
  346. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=96,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json} +0 -0
  347. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  348. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  349. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  350. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  351. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  352. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  353. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  354. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  355. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  356. /sglang/srt/models/{xiaomi_mimo.py → mimo.py} +0 -0
  357. {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/licenses/LICENSE +0 -0
  358. {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/top_level.txt +0 -0
sglang/srt/models/llava.py

@@ -135,7 +135,6 @@ class LlavaBaseForCausalLM(nn.Module):
         """
         image_outputs = self.vision_tower(pixel_values, output_hidden_states=True)
         # NOTE: This is not memory efficient. (output_hidden_states=True) will save all the hidden stated.
-
         selected_image_feature = image_outputs.hidden_states[self.vision_feature_layer]
         if self.vision_feature_select_strategy in ["default", "patch"]:
             selected_image_feature = selected_image_feature[:, 1:]
@@ -146,7 +145,6 @@ class LlavaBaseForCausalLM(nn.Module):
                 f"Unexpected select feature strategy: {self.config.vision_feature_select_strategy}"
             )
         image_features = self.multi_modal_projector(selected_image_feature)
-
         return image_features

     @torch.no_grad()
@@ -613,6 +611,10 @@ class LlavaForConditionalGeneration(LlavaBaseForCausalLM):

     MULTIMODAL_PROJECTOR_TYPE = LlavaMultiModalProjector

+    @property
+    def dtype(self):
+        return self.torch_dtype
+
     def pad_input_ids(self, input_ids: List[int], image_inputs: MultimodalInputs):
         if hasattr(self.vision_tower, "pad_input_ids"):
             return self.vision_tower.pad_input_ids(input_ids, image_inputs)
@@ -672,11 +674,17 @@ class LlavaForConditionalGeneration(LlavaBaseForCausalLM):
         assert hasattr(config, "text_config")
         assert hasattr(config, "vision_config")
         self.config = config
-        self.text_config = config.text_config
-        self.vision_config = config.vision_config
+        self.text_config = self.config.text_config
+        self.vision_config = self.config.vision_config
+        self.torch_dtype = getattr(self.config, "torch_dtype")
+
+        if not getattr(self.text_config, "torch_dtype"):
+            self.text_config.torch_dtype = self.torch_dtype
+        if not getattr(self.vision_config, "torch_dtype"):
+            self.vision_config.torch_dtype = self.torch_dtype

         if not hasattr(self.config, "vocab_size"):
-            self.config.vocab_size = self.config.text_config.vocab_size
+            self.config.vocab_size = self.text_config.vocab_size
         if not hasattr(self.config, "image_aspect_ratio"):
             self.config.image_aspect_ratio = "anyres"
         if not hasattr(self.config, "image_grid_pinpoints"):
@@ -697,39 +705,39 @@ class LlavaForConditionalGeneration(LlavaBaseForCausalLM):
         if not hasattr(self.config, "projector_hidden_act"):
             self.config.projector_hidden_act = "gelu"

-        self.vision_feature_layer = getattr(config, "vision_feature_layer", -1)
+        self.vision_feature_layer = getattr(self.config, "vision_feature_layer", -1)
         self.vision_feature_select_strategy = getattr(
-            config, "vision_feature_select_strategy", "full"
+            self.config, "vision_feature_select_strategy", "full"
         )
-        self.image_size = self.config.vision_config.image_size
-        self.patch_size = self.config.vision_config.patch_size
+        self.image_size = self.vision_config.image_size
+        self.patch_size = self.vision_config.patch_size

-        self.mm_patch_merge_type = config.mm_patch_merge_type
-        self.image_aspect_ratio = config.image_aspect_ratio
-        self.image_grid_pinpoints = config.image_grid_pinpoints
+        self.mm_patch_merge_type = self.config.mm_patch_merge_type
+        self.image_aspect_ratio = self.config.image_aspect_ratio
+        self.image_grid_pinpoints = self.config.image_grid_pinpoints

         self.image_feature_len = int((self.image_size // self.patch_size) ** 2)

         self.multi_modal_projector = self.MULTIMODAL_PROJECTOR_TYPE(config)

         language_model_cls = self._get_sgl_model_cls(
-            config.text_config, AutoModelForCausalLM
+            self.text_config, AutoModelForCausalLM
         )
-        vision_model_cls = self._get_sgl_model_cls(config.vision_config, AutoModel)
+        vision_model_cls = self._get_sgl_model_cls(self.vision_config, AutoModel)
         self.language_model = language_model_cls(
-            config.text_config,
+            self.text_config,
             quant_config=quant_config,
             prefix=add_prefix("language_model", prefix),
         )
         self.vision_tower = vision_model_cls(
-            config.vision_config,
+            self.vision_config,
             quant_config=quant_config,
             prefix=add_prefix("vision_tower", prefix),
         )

-        if "unpad" in getattr(config, "mm_patch_merge_type", ""):
+        if "unpad" in getattr(self.config, "mm_patch_merge_type", ""):
             self.language_model.model.image_newline = nn.Parameter(
-                torch.empty(config.text_config.hidden_size, dtype=torch.float16)
+                torch.empty(self.text_config.hidden_size, dtype=self.torch_dtype)
             )

     def get_image_feature(self, items: List[MultimodalDataItem]) -> torch.Tensor:
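Note: the llava.py changes above replace the hard-coded torch.float16 for image_newline with a torch_dtype read from the top-level HF config, propagate it into the text and vision sub-configs, and expose it via a new dtype property. A minimal sketch of that propagation pattern outside sglang follows; the checkpoint name is an illustrative assumption, not part of this diff.

from transformers import AutoConfig

config = AutoConfig.from_pretrained("llava-hf/llava-1.5-7b-hf")  # illustrative checkpoint
torch_dtype = getattr(config, "torch_dtype", None)

# Fall back to the top-level dtype when a sub-config does not declare its own.
for sub_config in (config.text_config, config.vision_config):
    if getattr(sub_config, "torch_dtype", None) is None:
        sub_config.torch_dtype = torch_dtype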
sglang/srt/models/mimo_mtp.py (new file)

@@ -0,0 +1,220 @@
+# Adapted from https://github.com/vllm-project/vllm/pull/17433/files and deepseek_nextn.py
+
+from functools import partial
+from typing import Any, Dict, Iterable, Optional, Tuple
+
+import torch
+from torch import nn
+from transformers import PretrainedConfig
+
+from sglang.srt.distributed import (
+    get_tensor_model_parallel_rank,
+    get_tensor_model_parallel_world_size,
+    split_tensor_along_last_dim,
+    tensor_model_parallel_all_gather,
+)
+from sglang.srt.layers.layernorm import RMSNorm
+from sglang.srt.layers.linear import QKVParallelLinear, RowParallelLinear
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.pooler import Pooler, PoolingType
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.radix_attention import RadixAttention
+from sglang.srt.layers.rotary_embedding import get_rope
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.models.mimo import MiMoForCausalLM
+from sglang.srt.models.qwen2 import (
+    Qwen2Attention,
+    Qwen2DecoderLayer,
+    Qwen2MLP,
+    Qwen2Model,
+)
+from sglang.srt.utils import add_prefix
+
+
+class MiMoMultiTokenPredictorLayer(nn.Module):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        prefix: str,
+        quant_config: Optional[QuantizationConfig] = None,
+    ) -> None:
+        super().__init__()
+
+        self.embed_tokens = VocabParallelEmbedding(
+            config.vocab_size,
+            config.hidden_size,
+        )
+        self.token_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.hidden_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.input_proj = nn.Linear(
+            config.hidden_size * 2, config.hidden_size, bias=False
+        )
+        self.mtp_block = Qwen2DecoderLayer(
+            config=config, quant_config=quant_config, prefix=prefix
+        )
+        self.final_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+    ) -> torch.Tensor:
+
+        if input_embeds is None:
+            hidden_states = self.embed_tokens(input_ids)
+        else:
+            hidden_states = input_embeds
+        # masking inputs at position 0, as not needed by MTP
+        hidden_states[positions == 0] = 0
+
+        hidden_states = self.input_proj(
+            torch.cat(
+                (
+                    self.hidden_layernorm(forward_batch.spec_info.hidden_states),
+                    self.token_layernorm(hidden_states),
+                ),
+                dim=-1,
+            )
+        )
+
+        hidden_states, residual = self.mtp_block(
+            positions=positions,
+            hidden_states=hidden_states,
+            forward_batch=forward_batch,
+            residual=None,
+        )
+        hidden_states = residual + hidden_states
+        hidden_states = self.final_layernorm(hidden_states)
+        return hidden_states
+
+
+class MiMoMTP(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        nn.Module.__init__(self)
+        self.config = config
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.quant_config = quant_config
+
+        self.model = MiMoMultiTokenPredictorLayer(
+            config,
+            prefix,
+            quant_config,
+        )
+        self.lm_head = ParallelLMHead(
+            config.vocab_size,
+            config.hidden_size,
+            quant_config=quant_config,
+        )
+        self.logits_processor = LogitsProcessor(config)
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+    ) -> torch.Tensor:
+        hidden_states = self.model(input_ids, positions, forward_batch)
+        return self.logits_processor(
+            input_ids, hidden_states, self.lm_head, forward_batch
+        )
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+
+        params_dict = dict(self.named_parameters())
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq" in name or "projector" in name:
+                continue
+            if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name:
+                # Models trained using ColossalAI may include these tensors in
+                # the checkpoint. Skip them.
+                continue
+            if self.config.tie_word_embeddings and "lm_head.weight" in name:
+                continue
+            if name.startswith("model.vision_tower") and name not in params_dict:
+                continue
+            name = self.map_model_name_to_mtp_param_name(name)
+
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+                if weight_name not in name:
+                    continue
+                if "mtp_block" not in name:
+                    break
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                if "mtp_block" not in name and (
+                    "embed_tokens" not in name
+                    and "lm_head" not in name
+                    and "token_layernorm" not in name
+                    and "hidden_layernorm" not in name
+                    and "input_proj" not in name
+                    and "final_layernorm" not in name
+                ):
+                    continue
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+
+    def map_model_name_to_mtp_param_name(self, name: str) -> str:
+        import re
+
+        name_without_prefix = [
+            "token_layernorm",
+            "hidden_layernorm",
+            "input_proj",
+            "final_layernorm",
+        ]
+        pattern = r"model.mtp_layers.(\d+)."
+        group = re.match(pattern, name)
+        if group is not None:
+            for sub_name in name_without_prefix:
+                if sub_name in name:
+                    name = name.replace(group.group(), "model.")
+                    return name
+            name = name.replace(group.group(), "model.mtp_block.")
+        return name
+
+    def get_embed_and_head(self):
+        return self.model.embed_tokens.weight, self.lm_head.weight
+
+    def set_embed_and_head(self, embed, head):
+        del self.model.embed_tokens.weight
+        del self.lm_head.weight
+        self.model.embed_tokens.weight = embed
+        self.lm_head.weight = head
+        torch.cuda.empty_cache()
+        torch.cuda.synchronize()
+
+
+EntryClass = MiMoMTP
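Note: the new mimo_mtp.py above rewrites checkpoint parameter names before looking them up in params_dict: the MTP-level layer norms and input projection keep a plain "model." prefix, while everything else under an mtp_layers.<i> block is routed into the single "model.mtp_block." submodule. A standalone restatement of that mapping follows; the example checkpoint names are illustrative assumptions, not taken from a real MiMo checkpoint.

import re

def map_mtp_name(name: str) -> str:
    # Same logic as MiMoMTP.map_model_name_to_mtp_param_name in the diff above.
    name_without_prefix = [
        "token_layernorm",
        "hidden_layernorm",
        "input_proj",
        "final_layernorm",
    ]
    group = re.match(r"model.mtp_layers.(\d+).", name)
    if group is None:
        return name
    if any(sub_name in name for sub_name in name_without_prefix):
        return name.replace(group.group(), "model.")
    return name.replace(group.group(), "model.mtp_block.")

print(map_mtp_name("model.mtp_layers.0.token_layernorm.weight"))   # -> model.token_layernorm.weight
print(map_mtp_name("model.mtp_layers.0.self_attn.q_proj.weight"))  # -> model.mtp_block.self_attn.q_proj.weight
print(map_mtp_name("lm_head.weight"))                               # -> lm_head.weight (unchanged)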
sglang/srt/models/minicpmo.py

@@ -51,11 +51,8 @@ from sglang.srt.managers.schedule_batch import (
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
 from sglang.srt.model_loader.utils import set_default_torch_dtype
 from sglang.srt.model_loader.weight_utils import default_weight_loader
-from sglang.srt.models.minicpmv import (
-    Idefics2VisionTransformer,
-    MiniCPMBaseModel,
-    Resampler2_5,
-)
+from sglang.srt.models.idefics2 import Idefics2VisionTransformer
+from sglang.srt.models.minicpmv import MiniCPMBaseModel, Resampler2_5
 from sglang.srt.models.qwen2 import Qwen2ForCausalLM
 from sglang.srt.utils import logger

@@ -1520,12 +1517,15 @@ class MiniCPMO(MiniCPMBaseModel):
         slice_start_id: int = mm_input.slice_start_id
         slice_end_id: int = mm_input.slice_end_id

-        media_token_pairs = [
+        data_token_pairs = [
             (im_start_id, im_end_id),
             (slice_start_id, slice_end_id),
             (mm_input.audio_start_id, mm_input.audio_end_id),
         ]
-        pattern = MultiModalityDataPaddingPatternTokenPairs(media_token_pairs)
+        data_start_token_ids = [im_start_id, mm_input.audio_start_id]
+        pattern = MultiModalityDataPaddingPatternTokenPairs(
+            data_token_pairs=data_token_pairs, data_start_token_ids=data_start_token_ids
+        )

         return pattern.pad_input_tokens(input_ids, mm_input)

@@ -1823,22 +1823,12 @@ class MiniCPMO(MiniCPMBaseModel):
         **kwargs: Any,
     ) -> torch.Tensor:

-        mm_input = forward_batch.merge_mm_inputs()
-        placeholder_token_ids = (
-            ([mm_input.im_token_id] + [item.pad_value for item in mm_input.mm_items])
-            if forward_batch.contains_mm_inputs()
-            else []
-        )
         hidden_states = general_mm_embed_routine(
             input_ids=input_ids,
             forward_batch=forward_batch,
             language_model=self.llm,
             image_data_embedding_func=self.get_image_feature,
             audio_data_embedding_func=self.get_audio_feature,
-            placeholder_tokens={
-                Modality.IMAGE: placeholder_token_ids,
-                Modality.AUDIO: placeholder_token_ids,
-            },
             positions=positions,
         )
         return hidden_states
@@ -20,6 +20,7 @@
  # See the License for the specific language governing permissions and
  # limitations under the License.
  """Inference-only MiniCPM-V model compatible with HuggingFace weights."""
+
  from functools import partial
  from typing import (
  Any,
@@ -41,13 +42,7 @@ from torch import nn
  from torch.nn.init import trunc_normal_
  from transformers import PretrainedConfig

- from sglang.srt.layers.activation import get_act_fn
- from sglang.srt.layers.attention.vision import VisionAttention
- from sglang.srt.layers.linear import (
- ColumnParallelLinear,
- ReplicatedLinear,
- RowParallelLinear,
- )
+ from sglang.srt.layers.linear import ReplicatedLinear
  from sglang.srt.layers.logits_processor import LogitsProcessor
  from sglang.srt.layers.quantization.base_config import QuantizationConfig
  from sglang.srt.managers.mm_utils import (
@@ -58,6 +53,7 @@ from sglang.srt.managers.schedule_batch import MultimodalDataItem, MultimodalInp
  from sglang.srt.model_executor.forward_batch_info import ForwardBatch
  from sglang.srt.model_loader.utils import set_default_torch_dtype
  from sglang.srt.model_loader.weight_utils import default_weight_loader
+ from sglang.srt.models.idefics2 import Idefics2VisionTransformer
  from sglang.srt.models.qwen2 import Qwen2Config, Qwen2ForCausalLM
  from sglang.srt.utils import add_prefix, flatten_nested_list

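The import hunks above replace the in-file Idefics2 vision classes with `from sglang.srt.models.idefics2 import Idefics2VisionTransformer`; the classes themselves are deleted from `minicpmv.py` in the next hunk. One detail worth a worked example is how that vision tower turns per-image patch grids into flash-attention-style cumulative sequence lengths. A small, self-contained reproduction of the `compute_cu_seqlens` helper shown below (plain torch, no sglang imports; the example sizes are made up):

import torch

def compute_cu_seqlens(tgt_sizes: torch.Tensor) -> torch.Tensor:
    # One sequence per image; its length is patches_h * patches_w.
    patch_len = tgt_sizes[:, 0] * tgt_sizes[:, 1]  # shape: (batch_size,)
    return torch.cat(
        [
            torch.zeros(1, dtype=torch.int32),
            torch.cumsum(patch_len, dim=0, dtype=torch.int32),
        ]
    )

# Two images with 2x3 and 4x4 patch grids -> sequence boundaries at 0, 6, 22.
print(compute_cu_seqlens(torch.tensor([[2, 3], [4, 4]])))  # tensor([ 0,  6, 22], dtype=torch.int32)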
@@ -146,294 +142,6 @@ def get_2d_sincos_pos_embed(
  return pos_embed


- class Idefics2VisionMLP(nn.Module):
-
- def __init__(
- self,
- config: PretrainedConfig,
- quant_config: Optional[QuantizationConfig] = None,
- prefix: str = "",
- ) -> None:
- super().__init__()
- self.config = config
- self.activation_fn = get_act_fn(config.hidden_act)
- self.fc1 = ColumnParallelLinear(
- config.hidden_size,
- config.intermediate_size,
- bias=True,
- quant_config=quant_config,
- prefix=add_prefix("fc1", prefix),
- )
- self.fc2 = RowParallelLinear(
- config.intermediate_size,
- config.hidden_size,
- bias=True,
- quant_config=quant_config,
- prefix=add_prefix("fc2", prefix),
- )
-
- def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
- hidden_states, _ = self.fc1(hidden_states)
- hidden_states = self.activation_fn(hidden_states)
- hidden_states, _ = self.fc2(hidden_states)
- return hidden_states
-
-
- class Idefics2EncoderLayer(nn.Module):
-
- def __init__(
- self,
- config: PretrainedConfig,
- quant_config: Optional[QuantizationConfig] = None,
- prefix: str = "",
- ) -> None:
- super().__init__()
- self.embed_dim = config.hidden_size
- self.num_heads = config.num_attention_heads
- self.self_attn = VisionAttention(
- embed_dim=config.hidden_size,
- num_heads=self.num_heads,
- projection_size=config.intermediate_size,
- use_qkv_parallel=True,
- quant_config=quant_config,
- dropout=config.attention_dropout,
- qkv_backend="sdpa",
- softmax_in_single_precision=True,
- flatten_batch=False,
- prefix=add_prefix("self_attn", prefix),
- )
- self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
- self.mlp = Idefics2VisionMLP(
- config,
- quant_config=quant_config,
- prefix=add_prefix("mlp", prefix),
- )
- self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
-
- def forward(
- self,
- hidden_states: torch.Tensor,
- cu_seqlens: torch.Tensor,
- ) -> torch.Tensor:
- """
- Args:
- hidden_states (`torch.FloatTensor`):
- Input to the layer of shape `(batch, seq_len, embed_dim)`.
-
- """
- residual = hidden_states
- hidden_states = self.layer_norm1(hidden_states)
- hidden_states = self.self_attn(hidden_states, cu_seqlens=cu_seqlens)
-
- hidden_states = residual + hidden_states
- residual = hidden_states
- hidden_states = self.layer_norm2(hidden_states)
- hidden_states = self.mlp(hidden_states)
- hidden_states = residual + hidden_states
- return hidden_states
-
-
- class Idefics2Encoder(nn.Module):
- """
- Transformer encoder consisting of `config.num_hidden_layers` self attention
- layers. Each layer is a
- [`Idefics2EncoderLayer`].
-
- Args:
- config: Idefics2Config
- """
-
- def __init__(
- self,
- config: PretrainedConfig,
- quant_config: Optional[QuantizationConfig] = None,
- prefix: str = "",
- ) -> None:
- super().__init__()
-
- self.config = config
- self.layers = nn.ModuleList(
- [
- Idefics2EncoderLayer(
- config,
- quant_config=quant_config,
- prefix=add_prefix(f"layers.{i}", prefix),
- )
- for i in range(config.num_hidden_layers)
- ]
- )
-
- def forward(
- self,
- inputs_embeds: torch.Tensor,
- cu_seqlens: torch.Tensor,
- ) -> torch.Tensor:
- r"""
- Args:
- inputs_embeds (torch.Tensor):
- Optionally, instead of passing `input_ids` you can choose to
- directly pass an embedded representation.
- This is useful if you want more control over how to convert
- `input_ids` indices into associated vectorsthan the model's
- internal embedding lookup matrix.
- """
- hidden_states = inputs_embeds
- for encoder_layer in self.layers:
- layer_outputs = encoder_layer(
- hidden_states,
- cu_seqlens=cu_seqlens,
- )
- hidden_states = layer_outputs
- return hidden_states
-
-
- class Idefics2VisionEmbeddings(nn.Module):
- """
- This is a modified version of `siglip.modelign_siglip.SiglipVisionEmbeddings
- ` to enable images of variable
- resolution.
-
- The modifications are adapted from [Patch n' Pack: NaViT, a Vision
- Transformer for any Aspect Ratio and Resolution](https://arxiv.org/abs/2307.06304)
- which allows treating images in their native aspect ratio and without the
- need to resize them to the same fixed size. In particular, we start from the
- original pre-trained SigLIP model(which uses images of fixed-size square
- images) and adapt it by training on images of variable resolutions.
- """
-
- def __init__(self, config: PretrainedConfig):
- super().__init__()
- self.embed_dim = config.hidden_size
- self.image_size = config.image_size
- self.patch_size = config.patch_size
- self.patch_embedding = nn.Conv2d(
- in_channels=config.num_channels,
- out_channels=self.embed_dim,
- kernel_size=self.patch_size,
- stride=self.patch_size,
- padding="valid",
- )
- self.num_patches_per_side = self.image_size // self.patch_size
- self.num_patches = self.num_patches_per_side**2
- self.num_positions = self.num_patches
- self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
-
- def get_position_ids(
- self,
- pixel_values: torch.FloatTensor,
- patch_attention_mask: torch.BoolTensor,
- tgt_sizes: Optional[torch.IntTensor] = None,
- ):
- batch_size, _, max_im_h, max_im_w = pixel_values.shape
-
- max_nb_patches_h, max_nb_patches_w = (
- max_im_h // self.patch_size,
- max_im_w // self.patch_size,
- )
- boundaries = torch.arange(
- 1 / self.num_patches_per_side, 1.0, 1 / self.num_patches_per_side
- )
- position_ids = torch.full(
- size=(batch_size, max_nb_patches_h * max_nb_patches_w), fill_value=0
- )
-
- for batch_idx, p_attn_mask in enumerate(patch_attention_mask):
-
- if tgt_sizes is not None:
- nb_patches_h = tgt_sizes[batch_idx][0]
- nb_patches_w = tgt_sizes[batch_idx][1]
- else:
- nb_patches_h = p_attn_mask[:, 0].sum()
- nb_patches_w = p_attn_mask[0].sum()
- fractional_coords_h = torch.arange(0, 1 - 1e-6, 1 / nb_patches_h)
- fractional_coords_w = torch.arange(0, 1 - 1e-6, 1 / nb_patches_w)
- bucket_coords_h = torch.bucketize(
- fractional_coords_h, boundaries, right=True
- )
- bucket_coords_w = torch.bucketize(
- fractional_coords_w, boundaries, right=True
- )
- pos_ids = (
- bucket_coords_h[:, None] * self.num_patches_per_side + bucket_coords_w
- ).flatten()
- position_ids[batch_idx][p_attn_mask.view(-1).cpu()] = pos_ids
- position_ids = position_ids.to(self.position_embedding.weight.device)
- return position_ids
-
- def forward(
- self,
- pixel_values: torch.FloatTensor,
- patch_attention_mask: torch.BoolTensor,
- tgt_sizes: Optional[torch.IntTensor] = None,
- ) -> torch.Tensor:
- target_dtype = self.patch_embedding.weight.dtype
- pixel_values = pixel_values.to(
- device=self.patch_embedding.weight.device, dtype=target_dtype
- )
- patch_embeds = self.patch_embedding(pixel_values)
- embeddings = patch_embeds.flatten(2).transpose(1, 2)
- position_ids = self.get_position_ids(
- pixel_values, patch_attention_mask, tgt_sizes
- )
-
- embeddings = embeddings + self.position_embedding(position_ids)
- return embeddings
-
-
- class Idefics2VisionTransformer(nn.Module):
-
- def __init__(
- self,
- config: PretrainedConfig,
- quant_config: Optional[QuantizationConfig] = None,
- prefix: str = "",
- ) -> None:
- super().__init__()
-
- embed_dim = config.hidden_size
- self.config = config
- self.embeddings = Idefics2VisionEmbeddings(config)
- self.encoder = Idefics2Encoder(
- config=config,
- quant_config=quant_config,
- prefix=add_prefix("encoder", prefix),
- )
- self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
-
- def get_input_embeddings(self) -> nn.Embedding:
- return self.embeddings
-
- def compute_cu_seqlens(self, tgt_sizes: torch.Tensor) -> torch.Tensor:
- patch_len = tgt_sizes[:, 0] * tgt_sizes[:, 1] # shape: (batch_size,)
- cu_seqlens = torch.cat(
- [
- torch.tensor([0], device=patch_len.device, dtype=torch.int32),
- torch.cumsum(patch_len, dim=0, dtype=torch.int32),
- ],
- dim=0,
- ).to(tgt_sizes.device)
- return cu_seqlens
-
- def forward(
- self,
- pixel_values,
- patch_attention_mask: Optional[torch.BoolTensor] = None,
- tgt_sizes: Optional[torch.IntTensor] = None,
- ) -> torch.Tensor:
- hidden_states = self.embeddings(
- pixel_values=pixel_values,
- patch_attention_mask=patch_attention_mask,
- tgt_sizes=tgt_sizes,
- )
- cu_seqlens = self.compute_cu_seqlens(tgt_sizes)
- encoder_outputs = self.encoder(
- hidden_states,
- cu_seqlens=cu_seqlens,
- )
- last_hidden_state = self.post_layernorm(encoder_outputs)
- return last_hidden_state
-
-
  class MiniCPMVImagePixelInputs(TypedDict):
  type: Literal["pixel_values"]
  data: List[torch.Tensor]