sglang 0.4.6.post4__py3-none-any.whl → 0.4.7__py3-none-any.whl

This diff represents the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
Files changed (358)
  1. sglang/bench_offline_throughput.py +16 -10
  2. sglang/bench_one_batch.py +5 -4
  3. sglang/bench_one_batch_server.py +86 -22
  4. sglang/bench_serving.py +197 -110
  5. sglang/compile_deep_gemm.py +4 -4
  6. sglang/lang/backend/runtime_endpoint.py +24 -1
  7. sglang/profiler.py +167 -0
  8. sglang/srt/_custom_ops.py +34 -0
  9. sglang/srt/configs/internvl.py +8 -12
  10. sglang/srt/configs/model_config.py +66 -29
  11. sglang/srt/constrained/base_grammar_backend.py +5 -2
  12. sglang/srt/constrained/llguidance_backend.py +9 -8
  13. sglang/srt/constrained/outlines_backend.py +5 -4
  14. sglang/srt/constrained/xgrammar_backend.py +18 -18
  15. sglang/srt/conversation.py +47 -9
  16. sglang/srt/custom_op.py +38 -3
  17. sglang/srt/debug_utils.py +74 -0
  18. sglang/srt/disaggregation/common/__init__.py +1 -0
  19. sglang/srt/disaggregation/common/conn.py +407 -0
  20. sglang/srt/disaggregation/decode.py +187 -134
  21. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +142 -0
  22. sglang/srt/disaggregation/fake/conn.py +4 -13
  23. sglang/srt/disaggregation/kv_events.py +412 -0
  24. sglang/srt/disaggregation/launch_lb.py +140 -0
  25. sglang/srt/disaggregation/mini_lb.py +84 -70
  26. sglang/srt/disaggregation/mooncake/conn.py +441 -140
  27. sglang/srt/disaggregation/mooncake/transfer_engine.py +31 -14
  28. sglang/srt/disaggregation/nixl/conn.py +124 -442
  29. sglang/srt/disaggregation/prefill.py +128 -44
  30. sglang/srt/disaggregation/utils.py +154 -6
  31. sglang/srt/distributed/device_communicators/pymscclpp.py +315 -0
  32. sglang/srt/distributed/parallel_state.py +52 -5
  33. sglang/srt/distributed/utils.py +3 -3
  34. sglang/srt/entrypoints/EngineBase.py +11 -0
  35. sglang/srt/entrypoints/engine.py +129 -12
  36. sglang/srt/entrypoints/http_server.py +21 -6
  37. sglang/srt/entrypoints/http_server_engine.py +5 -2
  38. sglang/srt/function_call/base_format_detector.py +302 -0
  39. sglang/srt/function_call/core_types.py +34 -0
  40. sglang/srt/function_call/deepseekv3_detector.py +205 -0
  41. sglang/srt/function_call/ebnf_composer.py +248 -0
  42. sglang/srt/function_call/function_call_parser.py +202 -0
  43. sglang/srt/function_call/llama32_detector.py +93 -0
  44. sglang/srt/function_call/mistral_detector.py +131 -0
  45. sglang/srt/function_call/pythonic_detector.py +229 -0
  46. sglang/srt/function_call/qwen25_detector.py +121 -0
  47. sglang/srt/function_call/utils.py +52 -0
  48. sglang/srt/hf_transformers_utils.py +50 -7
  49. sglang/srt/layers/attention/aiter_backend.py +878 -0
  50. sglang/srt/layers/attention/base_attn_backend.py +4 -0
  51. sglang/srt/layers/attention/cutlass_mla_backend.py +2 -19
  52. sglang/srt/layers/attention/flashattention_backend.py +166 -35
  53. sglang/srt/layers/attention/flashinfer_backend.py +45 -1
  54. sglang/srt/layers/attention/flashinfer_mla_backend.py +45 -5
  55. sglang/srt/layers/attention/flashmla_backend.py +340 -78
  56. sglang/srt/layers/attention/intel_amx_backend.py +128 -0
  57. sglang/srt/layers/attention/tbo_backend.py +232 -0
  58. sglang/srt/layers/attention/torch_native_backend.py +3 -0
  59. sglang/srt/layers/attention/triton_backend.py +247 -5
  60. sglang/srt/layers/attention/triton_ops/extend_attention.py +12 -4
  61. sglang/srt/layers/attention/utils.py +2 -2
  62. sglang/srt/layers/attention/vision.py +1 -1
  63. sglang/srt/layers/communicator.py +517 -0
  64. sglang/srt/layers/dp_attention.py +6 -15
  65. sglang/srt/layers/layernorm.py +30 -19
  66. sglang/srt/layers/moe/cutlass_moe.py +370 -0
  67. sglang/srt/layers/moe/cutlass_moe_params.py +169 -0
  68. sglang/srt/layers/moe/ep_moe/kernels.py +60 -17
  69. sglang/srt/layers/moe/ep_moe/layer.py +195 -87
  70. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +88 -8
  71. sglang/srt/layers/moe/fused_moe_native.py +4 -0
  72. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  73. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  74. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  75. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  76. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  77. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  78. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  79. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  80. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +220 -25
  81. sglang/srt/layers/moe/fused_moe_triton/layer.py +48 -4
  82. sglang/srt/layers/moe/topk.py +107 -24
  83. sglang/srt/layers/multimodal.py +70 -0
  84. sglang/srt/layers/quantization/__init__.py +10 -4
  85. sglang/srt/layers/quantization/blockwise_int8.py +3 -0
  86. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +5 -0
  87. sglang/srt/layers/quantization/deep_gemm.py +60 -59
  88. sglang/srt/layers/quantization/fp8.py +113 -18
  89. sglang/srt/layers/quantization/fp8_kernel.py +118 -66
  90. sglang/srt/layers/quantization/fp8_utils.py +165 -43
  91. sglang/srt/layers/quantization/gptq.py +298 -6
  92. sglang/srt/layers/quantization/int8_kernel.py +18 -5
  93. sglang/srt/layers/quantization/modelopt_quant.py +334 -7
  94. sglang/srt/layers/quantization/moe_wna16.py +3 -0
  95. sglang/srt/layers/quantization/qoq.py +244 -0
  96. sglang/srt/layers/quantization/w8a8_fp8.py +3 -0
  97. sglang/srt/layers/quantization/w8a8_int8.py +3 -0
  98. sglang/srt/layers/rotary_embedding.py +6 -12
  99. sglang/srt/layers/sampler.py +80 -79
  100. sglang/srt/layers/utils.py +6 -0
  101. sglang/srt/lora/layers.py +12 -15
  102. sglang/srt/lora/lora.py +49 -5
  103. sglang/srt/lora/lora_manager.py +20 -8
  104. sglang/srt/lora/mem_pool.py +24 -16
  105. sglang/srt/lora/utils.py +17 -13
  106. sglang/srt/managers/data_parallel_controller.py +13 -5
  107. sglang/srt/managers/eplb_algorithms/__init__.py +63 -0
  108. sglang/srt/managers/eplb_algorithms/deepseek.py +223 -0
  109. sglang/srt/managers/eplb_algorithms/deepseek_vec.py +276 -0
  110. sglang/srt/managers/eplb_manager.py +96 -0
  111. sglang/srt/managers/expert_distribution.py +878 -56
  112. sglang/srt/managers/expert_location.py +448 -0
  113. sglang/srt/managers/expert_location_dispatch.py +108 -0
  114. sglang/srt/managers/io_struct.py +29 -5
  115. sglang/srt/managers/mm_utils.py +355 -151
  116. sglang/srt/managers/multimodal_processors/base_processor.py +299 -42
  117. sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +6 -1
  118. sglang/srt/managers/multimodal_processors/gemma3.py +15 -17
  119. sglang/srt/managers/multimodal_processors/internvl.py +18 -5
  120. sglang/srt/managers/multimodal_processors/janus_pro.py +7 -1
  121. sglang/srt/managers/multimodal_processors/kimi_vl.py +14 -32
  122. sglang/srt/managers/multimodal_processors/llava.py +3 -3
  123. sglang/srt/managers/multimodal_processors/minicpm.py +27 -32
  124. sglang/srt/managers/multimodal_processors/mllama4.py +6 -0
  125. sglang/srt/managers/multimodal_processors/phi4mm.py +87 -0
  126. sglang/srt/managers/multimodal_processors/pixtral.py +9 -9
  127. sglang/srt/managers/multimodal_processors/qwen_vl.py +35 -35
  128. sglang/srt/managers/schedule_batch.py +185 -55
  129. sglang/srt/managers/schedule_policy.py +4 -5
  130. sglang/srt/managers/scheduler.py +389 -154
  131. sglang/srt/managers/session_controller.py +1 -1
  132. sglang/srt/managers/tokenizer_manager.py +231 -39
  133. sglang/srt/managers/utils.py +0 -4
  134. sglang/srt/mem_cache/base_prefix_cache.py +3 -0
  135. sglang/srt/mem_cache/chunk_cache.py +3 -1
  136. sglang/srt/mem_cache/hiradix_cache.py +4 -4
  137. sglang/srt/mem_cache/memory_pool.py +74 -52
  138. sglang/srt/mem_cache/multimodal_cache.py +45 -0
  139. sglang/srt/mem_cache/radix_cache.py +58 -5
  140. sglang/srt/metrics/collector.py +11 -2
  141. sglang/srt/mm_utils.py +10 -0
  142. sglang/srt/model_executor/cuda_graph_runner.py +87 -65
  143. sglang/srt/model_executor/expert_location_updater.py +557 -0
  144. sglang/srt/model_executor/forward_batch_info.py +39 -14
  145. sglang/srt/model_executor/model_runner.py +231 -101
  146. sglang/srt/model_loader/loader.py +10 -6
  147. sglang/srt/model_loader/utils.py +67 -1
  148. sglang/srt/models/clip.py +5 -1
  149. sglang/srt/models/deepseek_nextn.py +1 -1
  150. sglang/srt/models/deepseek_v2.py +732 -403
  151. sglang/srt/models/exaone.py +8 -3
  152. sglang/srt/models/gemma3_causal.py +7 -0
  153. sglang/srt/models/gemma3_mm.py +75 -33
  154. sglang/srt/models/idefics2.py +342 -0
  155. sglang/srt/models/kimi_vl.py +4 -4
  156. sglang/srt/models/llama.py +1 -1
  157. sglang/srt/models/llama4.py +10 -2
  158. sglang/srt/models/llava.py +26 -18
  159. sglang/srt/models/mimo_mtp.py +220 -0
  160. sglang/srt/models/minicpmo.py +7 -17
  161. sglang/srt/models/minicpmv.py +3 -295
  162. sglang/srt/models/mistral.py +71 -1
  163. sglang/srt/models/mllama.py +3 -3
  164. sglang/srt/models/phi4mm.py +512 -0
  165. sglang/srt/models/qwen2.py +133 -35
  166. sglang/srt/models/qwen2_5_vl.py +5 -3
  167. sglang/srt/models/qwen2_eagle.py +4 -1
  168. sglang/srt/models/qwen2_moe.py +206 -69
  169. sglang/srt/models/qwen2_vl.py +3 -3
  170. sglang/srt/models/qwen3.py +92 -19
  171. sglang/srt/models/qwen3_moe.py +457 -55
  172. sglang/srt/models/registry.py +9 -1
  173. sglang/srt/models/siglip.py +294 -0
  174. sglang/srt/models/transformers.py +291 -0
  175. sglang/srt/openai_api/adapter.py +114 -40
  176. sglang/srt/openai_api/protocol.py +37 -2
  177. sglang/srt/openai_api/utils.py +172 -0
  178. sglang/srt/operations.py +189 -0
  179. sglang/srt/operations_strategy.py +207 -0
  180. sglang/srt/sampling/sampling_batch_info.py +13 -1
  181. sglang/srt/sampling/sampling_params.py +2 -1
  182. sglang/srt/server_args.py +235 -38
  183. sglang/srt/speculative/build_eagle_tree.py +8 -8
  184. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +8 -11
  185. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +253 -0
  186. sglang/srt/speculative/eagle_utils.py +181 -90
  187. sglang/srt/speculative/eagle_worker.py +146 -21
  188. sglang/srt/two_batch_overlap.py +635 -0
  189. sglang/srt/utils.py +197 -19
  190. sglang/test/runners.py +16 -7
  191. sglang/test/send_one.py +4 -0
  192. sglang/test/test_cutlass_moe.py +278 -0
  193. sglang/test/test_fp4_moe.py +248 -0
  194. sglang/test/test_utils.py +81 -42
  195. sglang/utils.py +2 -2
  196. sglang/version.py +1 -1
  197. {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/METADATA +31 -19
  198. sglang-0.4.7.dist-info/RECORD +699 -0
  199. {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/WHEEL +1 -1
  200. sglang/srt/function_call_parser.py +0 -858
  201. sglang/srt/platforms/interface.py +0 -371
  202. sglang-0.4.6.post4.dist-info/RECORD +0 -646
  203. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  204. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  205. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  206. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  207. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  208. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  209. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  210. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  211. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  212. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  213. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  214. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  215. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  216. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H200.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json} +0 -0
  217. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  218. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  219. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  220. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  221. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  222. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  223. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  224. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  225. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  226. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  227. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  228. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  229. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  230. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  231. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  232. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  233. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  234. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  235. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  236. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  237. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  238. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  239. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  240. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  241. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  242. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  243. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  244. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  245. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json} +0 -0
  246. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  247. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  248. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  249. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  250. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  251. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  252. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  253. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  254. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  255. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json} +0 -0
  256. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json} +0 -0
  257. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  258. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  259. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  260. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  261. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  262. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  263. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json} +0 -0
  264. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  265. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  266. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json} +0 -0
  267. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  268. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  269. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  270. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json} +0 -0
  271. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  272. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  273. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  274. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
  275. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  276. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  277. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  278. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json} +0 -0
  279. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json} +0 -0
  280. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json} +0 -0
  281. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json} +0 -0
  282. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  283. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  284. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json} +0 -0
  285. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json} +0 -0
  286. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json} +0 -0
  287. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json} +0 -0
  288. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  289. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  290. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  291. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  292. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json} +0 -0
  293. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  294. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  295. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  296. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  297. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json} +0 -0
  298. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json} +0 -0
  299. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json} +0 -0
  300. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json} +0 -0
  301. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  302. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  303. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
  304. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  305. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  306. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  307. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json} +0 -0
  308. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_L40S.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json} +0 -0
  309. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
  310. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
  311. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
  312. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  313. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  314. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  315. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  316. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json} +0 -0
  317. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json} +0 -0
  318. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json} +0 -0
  319. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json} +0 -0
  320. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  321. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  322. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  323. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  324. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json} +0 -0
  325. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
  326. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
  327. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
  328. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  329. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  330. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  331. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  332. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json} +0 -0
  333. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json} +0 -0
  334. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  335. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  336. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json} +0 -0
  337. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  338. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json} +0 -0
  339. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  340. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  341. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  342. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  343. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json} +0 -0
  344. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  345. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json} +0 -0
  346. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=96,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json} +0 -0
  347. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  348. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  349. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  350. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  351. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  352. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  353. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  354. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  355. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  356. /sglang/srt/models/{xiaomi_mimo.py → mimo.py} +0 -0
  357. {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/licenses/LICENSE +0 -0
  358. {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/top_level.txt +0 -0
sglang/srt/managers/multimodal_processors/base_processor.py

@@ -3,32 +3,41 @@ import concurrent.futures
 import dataclasses
 import multiprocessing as mp
 import os
+import re
 from abc import ABC, abstractmethod
-from typing import List, Optional
+from enum import Enum
+from typing import Any, Dict, List, Optional, Tuple, Union
 
 import numpy as np
-import PIL
 import torch
 from PIL import Image
 from transformers import BaseImageProcessorFast
 
-from sglang.srt.managers.schedule_batch import Modality
+from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
 from sglang.srt.utils import encode_video, load_audio, load_image
 
 
+class MultimodalInputFormat(Enum):
+    """Enum for different multimodal input formats."""
+
+    RAW_IMAGES = "raw_images"
+    PRECOMPUTED_FEATURES = "precomputed_features"
+    PIXEL_VALUES = "pixel_values"
+
+
 @dataclasses.dataclass
 class BaseMultiModalProcessorOutput:
     # input_text, with each frame of video/image represented with a image_token
     input_text: str
 
     # frames loaded from image and video, in given order
-    images: Optional[list[PIL.Image]] = None
+    images: Optional[list[Union[Image.Image, dict]]] = None
 
     # audios
-    audios: Optional[list[np.ndarray]] = None
+    audios: Optional[list[Union[np.ndarray, dict]]] = None
 
     def normalize(self):
-        for field_name in ["image_sizes", "images", "audios"]:
+        for field_name in ["images", "audios"]:
             field = getattr(self, field_name, None)
             if field is not None and isinstance(field, list) and len(field) == 0:
                 setattr(self, field_name, None)
@@ -36,16 +45,48 @@ class BaseMultiModalProcessorOutput:
 
 @dataclasses.dataclass
 class MultimodalSpecialTokens:
-    image_token: Optional[str] = None
-    video_token: Optional[str] = None
-    audio_token: Optional[str] = None
-
-    def collect(self) -> list[str]:
-        return [
-            token
-            for token in [self.image_token, self.video_token, self.audio_token]
-            if token
+    image_token: Optional[Union[int, str, List[str]]] = None
+    video_token: Optional[Union[int, str, List[str]]] = None
+    audio_token: Optional[Union[int, str, List[str]]] = None
+
+    def convert_to_str(self, token: Union[str, int], processor) -> str:
+        if token is None:
+            return token
+        if isinstance(token, str):
+            return token
+        return processor.tokenizer.convert_ids_to_tokens([token])[0]
+
+    def convert_to_strs(self, processor):
+        self.image_token = self.convert_to_str(self.image_token, processor)
+        self.video_token = self.convert_to_str(self.video_token, processor)
+        self.audio_token = self.convert_to_str(self.audio_token, processor)
+
+    image_token_regex: Optional[re.Pattern] = None
+    video_token_regex: Optional[re.Pattern] = None
+    audio_token_regex: Optional[re.Pattern] = None
+
+    def __post_init__(self):
+        if self.image_token_regex is None and self.image_token is not None:
+            self.image_token_regex = re.compile(re.escape(self.image_token))
+        if self.video_token_regex is None and self.video_token is not None:
+            self.video_token_regex = re.compile(re.escape(self.video_token))
+        if self.audio_token_regex is None and self.audio_token is not None:
+            self.audio_token_regex = re.compile(re.escape(self.audio_token))
+
+    def collect(self) -> re.Pattern:
+        tokens = [
+            self.image_token_regex,
+            self.video_token_regex,
+            self.audio_token_regex,
         ]
+        patterns = []
+        flags = 0
+        for t in tokens:
+            if t is not None:
+                patterns.append(t.pattern)
+                flags |= t.flags
+        combined = "(" + "|".join(f"(?:{p})" for p in patterns) + ")"
+        return re.compile(combined, flags)
 
 
 class BaseMultimodalProcessor(ABC):
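As an aside, a minimal sketch of how the combined pattern built by the new MultimodalSpecialTokens.collect() is meant to behave: because the alternation is wrapped in a single capturing group, re.split keeps the matched special tokens in the result, so a prompt decomposes into alternating plain-text and token parts. The token strings below are placeholders, not tied to any particular model.

    import re

    # Hypothetical special tokens; real values come from the model's processor.
    image_token = "<image>"
    audio_token = "<audio>"

    # Same construction as MultimodalSpecialTokens.collect(): one capturing
    # group wrapping the alternation of per-modality patterns.
    patterns = [re.escape(image_token), re.escape(audio_token)]
    combined = re.compile("(" + "|".join(f"(?:{p})" for p in patterns) + ")")

    prompt = "Describe <image> and transcribe <audio> please."
    print(re.split(combined, prompt))
    # ['Describe ', '<image>', ' and transcribe ', '<audio>', ' please.']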
@@ -54,6 +95,7 @@ class BaseMultimodalProcessor(ABC):
     def __init__(self, hf_config, server_args, _processor):
         self.hf_config = hf_config
         self._processor = _processor
+        self.arch = hf_config.architectures[0]
         self.server_args = server_args
         # FIXME: not accurate, model and image specific
         self.NUM_TOKEN_PER_FRAME = 330
@@ -136,6 +178,8 @@ class BaseMultimodalProcessor(ABC):
         data, is_video, is_audio, frame_count_limit=None, discard_alpha_channel=True
     ):
         """Static method that can be pickled for multiprocessing"""
+        if isinstance(data, dict):
+            return data
         try:
             if is_audio:
                 return load_audio(data)
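The early dict return above is what lets callers hand the processor already-processed inputs instead of URLs or raw bytes. A small sketch of the idea, assuming illustrative tensor shapes and a stand-in helper name (the dict keys are the ones later checked by process_and_combine_mm_data):

    import torch

    # Hypothetical pre-processed entries passed in place of a URL or raw bytes.
    precomputed_item = {"precomputed_features": torch.randn(1, 256, 1152)}  # made-up shape
    pixel_values_item = {"pixel_values": torch.randn(1, 3, 384, 384)}       # made-up shape

    def load_single_item(data):
        # Mirrors the new early return in _load_single_item: dicts are treated
        # as already-loaded inputs and bypass load_image/load_audio entirely.
        if isinstance(data, dict):
            return data
        raise NotImplementedError("raw paths/bytes still go through the loaders")

    assert load_single_item(precomputed_item) is precomputed_item
    assert load_single_item(pixel_values_item) is pixel_values_item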
@@ -175,7 +219,10 @@ class BaseMultimodalProcessor(ABC):
         image_index, audio_index = 0, 0
 
         for text_part in text_parts:
-            if text_part == multimodal_tokens.image_token:
+            if (
+                multimodal_tokens.image_token_regex
+                and multimodal_tokens.image_token_regex.match(text_part)
+            ):
                 data = image_data[image_index]
                 is_video = isinstance(data, str) and data.startswith("video:")
                 estimated_frames = estimated_frames_list[image_index]
@@ -192,7 +239,10 @@ class BaseMultimodalProcessor(ABC):
                 )
                 task_info.append((Modality.IMAGE, data, frame_count_limit))
                 image_index += 1
-            elif text_part == multimodal_tokens.audio_token:
+            elif (
+                multimodal_tokens.audio_token_regex
+                and multimodal_tokens.audio_token_regex.match(text_part)
+            ):
                 data = audio_data[audio_index]
                 futures.append(
                     self.io_executor.submit(
@@ -228,17 +278,13 @@ class BaseMultimodalProcessor(ABC):
             discard_alpha_channel: if True, discards the alpha channel in the returned images
 
         """
-
+        if not return_text:
+            raise NotImplementedError()
         if image_data is None:
             image_data = []
-        if isinstance(multimodal_tokens.image_token, int):
-            multimodal_tokens.image_token = (
-                self._processor.tokenizer.convert_ids_to_tokens(
-                    multimodal_tokens.image_token
-                )
-            )
-        else:
-            multimodal_tokens.image_token = multimodal_tokens.image_token
+
+        multimodal_tokens.convert_to_strs(self._processor)
+        multimodal_tokens_pattern = multimodal_tokens.collect()
 
         if isinstance(prompt, list) and return_text:
             assert len(prompt) and isinstance(prompt[0], int)
@@ -247,16 +293,8 @@ class BaseMultimodalProcessor(ABC):
             prompt = prompt
 
         assert isinstance(prompt, str)
-        if return_text:
-            import re
-
-            pattern = (
-                "("
-                + "|".join(re.escape(sep) for sep in multimodal_tokens.collect())
-                + ")"
-            )
-            # split text into list of normal text and special tokens
-            text_parts = re.split(pattern, prompt)
+        # split text into list of normal text and special tokens
+        text_parts = re.split(multimodal_tokens_pattern, prompt)
 
         futures, task_info = self.submit_data_loading_tasks(
             text_parts=text_parts,
@@ -266,34 +304,253 @@ class BaseMultimodalProcessor(ABC):
266
304
  discard_alpha_channel=discard_alpha_channel,
267
305
  )
268
306
  # Process results
269
- image_sizes, images, audios = [], [], []
307
+ images, audios = [], []
270
308
  new_text = ""
271
309
  task_ptr = 0
272
310
 
273
311
  for text_part in text_parts:
274
- if text_part in multimodal_tokens.collect():
312
+ if multimodal_tokens_pattern.match(text_part):
275
313
  task_type, data, frame_limit = task_info[task_ptr]
276
314
  result = futures[task_ptr].result()
277
315
  task_ptr += 1
278
316
 
279
317
  if task_type == Modality.IMAGE:
318
+ # If data is already processed it will be a
319
+ # dictionary. In this case we want to keep the
320
+ # expanded tokens in text_part. Otherwise, we will
321
+ # call the processor code, so keep only a single image
322
+ # token.
323
+ mm_tokens = (
324
+ text_part
325
+ if isinstance(data, dict)
326
+ else multimodal_tokens.image_token
327
+ )
280
328
  frames = [result] if not isinstance(result, list) else result
281
329
  if frames:
282
- image_sizes += frames[0].size * len(frames)
283
330
  images += frames
284
- new_text += multimodal_tokens.image_token * len(frames)
331
+ new_text += mm_tokens * len(frames)
285
332
  elif task_type == Modality.AUDIO:
286
333
  # audio
334
+ mm_tokens = (
335
+ text_part
336
+ if isinstance(data, dict)
337
+ else multimodal_tokens.audio_token
338
+ )
287
339
  audios.append(result)
288
- new_text += multimodal_tokens.audio_token
340
+ new_text += mm_tokens
289
341
  # TODO: handle video
290
342
  else:
291
343
  new_text += text_part
292
344
 
293
345
  out = BaseMultiModalProcessorOutput(
346
+ input_text=new_text,
294
347
  images=images,
295
348
  audios=audios,
296
- input_text=new_text,
297
349
  )
298
350
  out.normalize()
299
351
  return out
352
+
353
+ @staticmethod
354
+ def get_mm_items_offset(
355
+ input_ids: torch.Tensor, mm_token_id: int
356
+ ) -> List[Tuple[int, int]]:
357
+ """
358
+ Get a set of range for mm_items from input_ids
359
+ Example:
360
+ input_ids = [1, 2, 3, 3, 3, 4, 3, 3]
361
+ mm_token_id = 3
362
+ return result = [(2,4),(6,7)]
363
+ """
364
+ mask = input_ids == mm_token_id
365
+
366
+ start_positions = (mask & ~torch.roll(mask, 1)).nonzero(as_tuple=True)[0]
367
+ end_positions = (mask & ~torch.roll(mask, -1)).nonzero(as_tuple=True)[0]
368
+
369
+ return list(zip(start_positions.tolist(), end_positions.tolist()))
370
+
371
+ @staticmethod
372
+ def get_mm_items_offset_by_pair(
373
+ input_ids: torch.Tensor, mm_start_id: int, mm_end_id: int
374
+ ) -> List[Tuple[int, int]]:
375
+ indices_start = (input_ids == mm_start_id).nonzero(as_tuple=True)[0] + 1
376
+ indices_end = (input_ids == mm_end_id).nonzero(as_tuple=True)[0] - 1
377
+
378
+ return list(zip(indices_start.tolist(), indices_end.tolist()))
379
+
380
+ @staticmethod
381
+ def _extract_processor_features(
382
+ items: List[dict], attr_name: str
383
+ ) -> Optional[torch.Tensor]:
384
+ """
385
+ Helper function to concat extracted attributes from processor output.
386
+ """
387
+ values = [value for item in items if (value := item.get(attr_name)) is not None]
388
+ return torch.cat(values) if values else None
389
+
390
+ # When we assume that all the items have the same attributes
391
+ def _extract_processor_features_from_all_attributes(
392
+ self, items: List[dict]
393
+ ) -> dict:
394
+ values = {}
395
+ # Verify all items have the same keys
396
+ first_keys = set(items[0].keys())
397
+ for item in items[1:]:
398
+ if set(item.keys()) != first_keys:
399
+ raise ValueError(
400
+ f"All items must have the same attributes. "
401
+ f"First item has {first_keys}, but found {set(item.keys())}"
402
+ )
403
+
404
+ # Process each attribute
405
+ for k, v in items[0].items():
406
+ if isinstance(v, list):
407
+ values[k] = self._extract_processor_features(items, k)
408
+ else:
409
+ # Verify all items have the same value for non-list attributes
410
+ for item in items[1:]:
411
+ if item[k] != v:
412
+ raise ValueError(
413
+ f"All items must have the same value for attribute {k}. "
414
+ f"First item has {v}, but found {item[k]}"
415
+ )
416
+ values[k] = v
417
+ return values
418
+
419
+ def process_and_combine_mm_data(
420
+ self, base_output: BaseMultiModalProcessorOutput
421
+ ) -> Tuple[Optional[MultimodalDataItem], torch.Tensor]:
422
+ """
423
+ Process multimodal data and return the combined multimodal item and input_ids.
424
+ Handles all three input formats at the same abstraction level.
425
+
426
+ Returns:
427
+ Tuple of (combined_mm_item, input_ids)
428
+ """
429
+
430
+ def tokenize_text(input_text: str) -> torch.Tensor:
431
+ """Tokenize input text."""
432
+ return self._processor.tokenizer(
433
+ input_text,
434
+ return_tensors="pt",
435
+ add_special_tokens=True,
436
+ ).input_ids.flatten()
437
+
438
+ def categorize_mm_inputs(mm_inputs: List) -> MultimodalInputFormat:
439
+ """Categorize multimodal inputs and validate consistency."""
440
+ try:
441
+ has_image = False
442
+ has_pixel_values = False
443
+ has_precomputed_features = False
444
+
445
+ for mm_input in mm_inputs:
446
+ if isinstance(mm_input, Image.Image):
447
+ has_image = True
448
+ elif isinstance(mm_input, dict):
449
+ if mm_input.get("precomputed_features", None) is not None:
450
+ has_precomputed_features = True
451
+ elif mm_input.get("pixel_values", None) is not None:
452
+ has_pixel_values = True
453
+ else:
454
+ raise ValueError(
455
+ f"Invalid multimodal input: {mm_input}, expected dict with pixel_values or precomputed_features"
456
+ )
457
+ else:
458
+ raise ValueError(
459
+ f"Invalid multimodal input: {mm_input}, expected Image.Image or dict"
460
+ )
461
+
462
+ # Validate format consistency
463
+ format_count = sum(
464
+ [has_image, has_pixel_values, has_precomputed_features]
465
+ )
466
+ if format_count > 1:
467
+ raise ValueError(
468
+ "Unsupported: mixture of multimodal input formats. "
469
+ f"Found formats: image={has_image}, pixel_values={has_pixel_values}, "
470
+ f"precomputed_features={has_precomputed_features}"
471
+ )
472
+
473
+ if has_image:
474
+ return MultimodalInputFormat.RAW_IMAGES
475
+ elif has_precomputed_features:
476
+ return MultimodalInputFormat.PRECOMPUTED_FEATURES
477
+ elif has_pixel_values:
478
+ return MultimodalInputFormat.PIXEL_VALUES
479
+ else:
480
+ raise ValueError("No valid multimodal input format found")
481
+ except Exception as e:
482
+ raise ValueError(f"Failed to categorize inputs: {e}")
483
+
484
+ def process_raw_images(
485
+ base_output: BaseMultiModalProcessorOutput,
486
+ ) -> Tuple[MultimodalDataItem, torch.Tensor]:
487
+ """Process raw Image.Image objects using transformers processor."""
488
+ ret = self.process_mm_data(
489
+ input_text=base_output.input_text,
490
+ images=base_output.images,
491
+ )
492
+ combined_mm_item = MultimodalDataItem(modality=Modality.IMAGE)
493
+
494
+ # Copy all fields from processor output except input_ids
495
+ for key, value in ret.items():
496
+ if key != "input_ids" and hasattr(combined_mm_item, key):
497
+ setattr(combined_mm_item, key, value)
498
+
499
+ input_ids = ret["input_ids"].flatten()
500
+ return combined_mm_item, input_ids
501
+
502
+ def process_precomputed_features(
503
+ base_output: BaseMultiModalProcessorOutput,
504
+ ) -> Tuple[MultimodalDataItem, torch.Tensor]:
505
+ """Process inputs with precomputed features."""
506
+ combined_mm_item = MultimodalDataItem(modality=Modality.IMAGE)
507
+ combined_mm_item.precomputed_features = self._extract_processor_features(
508
+ base_output.images, "precomputed_features"
509
+ )
510
+ input_ids = tokenize_text(base_output.input_text)
511
+ return combined_mm_item, input_ids
512
+
513
+ def process_pixel_values(
514
+ base_output: BaseMultiModalProcessorOutput,
515
+ ) -> Tuple[MultimodalDataItem, torch.Tensor]:
516
+ """Process inputs with pixel values."""
517
+ values = self._extract_processor_features_from_all_attributes(
518
+ base_output.images
519
+ )
520
+ combined_mm_item = MultimodalDataItem.from_dict(values)
521
+ input_ids = tokenize_text(base_output.input_text)
522
+ return combined_mm_item, input_ids
523
+
524
+ def finalize_mm_item(
525
+ combined_mm_item: MultimodalDataItem, input_ids: torch.Tensor
526
+ ) -> MultimodalDataItem:
527
+ """Apply common post-processing to the multimodal item."""
528
+ combined_mm_item.image_offsets = self.get_mm_items_offset(
529
+ input_ids=input_ids,
530
+ mm_token_id=self.IM_TOKEN_ID,
531
+ )
532
+ return combined_mm_item
533
+
534
+ # Main logic
535
+ mm_inputs = base_output.images
536
+ if not mm_inputs:
537
+ # Return text-only case
538
+ input_ids = tokenize_text(base_output.input_text)
539
+ return None, input_ids
540
+
541
+ # Categorize input formats
542
+ input_format = categorize_mm_inputs(mm_inputs)
543
+
544
+ # Process based on format
545
+ if input_format == MultimodalInputFormat.RAW_IMAGES:
546
+ combined_mm_item, input_ids = process_raw_images(base_output)
547
+ elif input_format == MultimodalInputFormat.PRECOMPUTED_FEATURES:
548
+ combined_mm_item, input_ids = process_precomputed_features(base_output)
549
+ elif input_format == MultimodalInputFormat.PIXEL_VALUES:
550
+ combined_mm_item, input_ids = process_pixel_values(base_output)
551
+ else:
552
+ raise ValueError(f"Unknown input format: {input_format}")
553
+
554
+ # Finalize with common processing
555
+ combined_mm_item = finalize_mm_item(combined_mm_item, input_ids)
556
+ return combined_mm_item, input_ids
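Illustrative note (not part of the package diff): the helper added above is what the model-specific processors further down now delegate to. A minimal sketch of the calling pattern, modeled on the Gemma3 and KimiVL changes in this diff; the DummySGLangImageProcessor class, its "<image>" token, the image_token_index config field, and the **kwargs passthrough are assumptions for the sketch, not sglang API guarantees.

# Sketch only: a hypothetical processor delegating to process_and_combine_mm_data().
from sglang.srt.managers.multimodal_processors.base_processor import (
    BaseMultimodalProcessor as SGLangBaseProcessor,
    MultimodalSpecialTokens,  # assumed to live in the same module
)


class DummySGLangImageProcessor(SGLangBaseProcessor):
    def __init__(self, hf_config, server_args, _processor):
        super().__init__(hf_config, server_args, _processor)
        # IM_TOKEN_ID is what finalize_mm_item() uses to locate image offsets.
        self.IM_TOKEN_ID = hf_config.image_token_index  # assumed config field

    async def process_mm_data_async(
        self, image_data, input_text, request_obj, max_req_input_len, **kwargs
    ):
        base_output = self.load_mm_data(
            prompt=input_text,
            image_data=image_data,
            multimodal_tokens=MultimodalSpecialTokens(image_token="<image>"),
            max_req_input_len=max_req_input_len,
        )
        combined_mm_item, input_ids = self.process_and_combine_mm_data(base_output)
        return {
            "input_ids": input_ids.tolist(),
            "mm_items": [combined_mm_item] if combined_mm_item is not None else [],
            "im_token_id": self.IM_TOKEN_ID,
        }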
@@ -70,8 +70,13 @@ class DeepseekVL2ImageProcessor(BaseMultimodalProcessor):
         batched_images_spatial_crop = torch.stack(batched_images_spatial_crop, dim=0)
 
         items = []
+        input_ids = res["input_ids"]
+        image_offsets = self.get_mm_items_offset(
+            input_ids=input_ids, mm_token_id=self._processor.image_token_id
+        )
         item = MultimodalDataItem(
             pixel_values=res["images"],
+            image_offsets=image_offsets,
             modality=Modality.IMAGE,
             image_emb_mask=images_seq_mask,
             image_spatial_crop=batched_images_spatial_crop,
@@ -80,6 +85,6 @@ class DeepseekVL2ImageProcessor(BaseMultimodalProcessor):
 
         return {
             "mm_items": items,
-            "input_ids": res["input_ids"].tolist(),
+            "input_ids": input_ids.tolist(),
             "im_token_id": self._processor.image_token_id,
         }
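Illustrative note (not part of the package diff): several hunks in this section now attach image_offsets computed from the prompt's input_ids. The sketch below is a rough standalone approximation of what such an offset helper produces (contiguous runs of the image token id); it is not sglang's get_mm_items_offset implementation.

import torch


def mm_token_offsets(input_ids: torch.Tensor, mm_token_id: int):
    """Return (start, end) index pairs for each contiguous run of mm_token_id."""
    positions = (input_ids == mm_token_id).nonzero(as_tuple=True)[0].tolist()
    offsets = []
    for pos in positions:
        if offsets and pos == offsets[-1][1] + 1:
            offsets[-1] = (offsets[-1][0], pos)  # extend the current run
        else:
            offsets.append((pos, pos))  # start a new run
    return offsets


# Token id 7 stands in for an image token id here.
print(mm_token_offsets(torch.tensor([1, 7, 7, 7, 2, 7, 7, 3]), 7))  # [(1, 3), (5, 6)]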
@@ -1,4 +1,5 @@
-from typing import List, Union
+import re
+from typing import Dict, List, Union
 
 from sglang.srt.managers.multimodal_processor import (
     BaseMultimodalProcessor as SGLangBaseProcessor,
@@ -18,13 +19,19 @@ class Gemma3SGLangImageProcessor(SGLangBaseProcessor):
 
     def __init__(self, hf_config, server_args, _processor):
         super().__init__(hf_config, server_args, _processor)
+        # The single, pre-expanded image token.
         self.IMAGE_TOKEN = "<start_of_image>"
+        # The regex that matches expanded image tokens.
+        self.IMAGE_TOKEN_REGEX = re.compile(
+            r"<start_of_image>(?:(?:<image_soft_token>)*<end_of_image>)?"
+        )
         self.IM_START_TOKEN_ID = hf_config.boi_token_index
         self.IM_END_TOKEN_ID = hf_config.eoi_token_index
+        self.IM_TOKEN_ID = hf_config.image_token_index
 
     async def process_mm_data_async(
         self,
-        image_data: List[Union[str, bytes]],
+        image_data: List[Union[str, bytes, Dict]],
         input_text,
         request_obj,
         max_req_input_len,
@@ -36,30 +43,21 @@ class Gemma3SGLangImageProcessor(SGLangBaseProcessor):
         if isinstance(image_data, str):
             image_data = [image_data]
 
-        image_token = self.IMAGE_TOKEN
         base_output = self.load_mm_data(
             prompt=input_text,
             image_data=image_data,
-            multimodal_tokens=MultimodalSpecialTokens(image_token=image_token),
+            multimodal_tokens=MultimodalSpecialTokens(
+                image_token=self.IMAGE_TOKEN, image_token_regex=self.IMAGE_TOKEN_REGEX
+            ),
             max_req_input_len=max_req_input_len,
             discard_alpha_channel=True,
        )
 
-        ret = self.process_mm_data(
-            input_text=base_output.input_text, images=base_output.images
-        )
-
-        items = []
-        for i, image in enumerate(base_output.images):
-            item = MultimodalDataItem(
-                pixel_values=ret["pixel_values"][i],
-                modality=Modality.IMAGE,
-            )
-            items += [item]
+        combined_mm_item, input_ids = self.process_and_combine_mm_data(base_output)
 
         return {
-            "mm_items": items,
-            "input_ids": ret["input_ids"].flatten().tolist(),
+            "input_ids": input_ids.tolist(),
+            "mm_items": [combined_mm_item] if combined_mm_item is not None else [],
            "im_start_id": self.IM_START_TOKEN_ID,
            "im_end_id": self.IM_END_TOKEN_ID,
        }
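Illustrative note (not part of the package diff): the new IMAGE_TOKEN_REGEX lets Gemma3 treat either a bare <start_of_image> marker or an already expanded image block as a single placeholder when splitting the prompt. A quick check of that behavior; the prompt strings below are made up for illustration.

import re

IMAGE_TOKEN_REGEX = re.compile(
    r"<start_of_image>(?:(?:<image_soft_token>)*<end_of_image>)?"
)

bare = "caption this: <start_of_image>"
expanded = (
    "caption this: <start_of_image>"
    "<image_soft_token><image_soft_token><end_of_image>"
)

print(IMAGE_TOKEN_REGEX.findall(bare))      # ['<start_of_image>']
print(IMAGE_TOKEN_REGEX.findall(expanded))  # one match covering the whole expanded block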
@@ -3,7 +3,6 @@
 import numpy as np
 import torch
 from decord import VideoReader, cpu
-from numpy.distutils.cpuinfo import cpu
 from PIL import Image
 
 from sglang.srt.managers.multimodal_processors.base_processor import (
@@ -176,6 +175,10 @@ class InternVLImageProcessor(BaseMultimodalProcessor):
         if not image_data:
             return None
 
+        # Ensure image_data is a list
+        if isinstance(image_data, str):
+            image_data = [image_data]
+
         base_output = self.load_mm_data(
             prompt=input_text,
             image_data=image_data,
@@ -210,7 +213,6 @@ class InternVLImageProcessor(BaseMultimodalProcessor):
             return None
 
         pixel_values = torch.cat(pixel_values, dim=0)
-        items = [MultimodalDataItem(pixel_values=pixel_values, modality=Modality.IMAGE)]
 
         for idx, num_patches in enumerate(num_patches_list):
             image_tokens = (
@@ -221,10 +223,21 @@ class InternVLImageProcessor(BaseMultimodalProcessor):
             input_text = input_text.replace("<image>", image_tokens, 1)
 
         tokenizer = self._processor
+        input_ids = tokenizer(input_text, return_tensors="pt")["input_ids"].flatten()
+        image_offsets = self.get_mm_items_offset(
+            input_ids=input_ids,
+            mm_token_id=self.img_context_token_id,
+        )
+        items = [
+            MultimodalDataItem(
+                pixel_values=pixel_values,
+                modality=Modality.IMAGE,
+                image_offsets=image_offsets,
+            )
+        ]
+
         return {
-            "input_ids": tokenizer(input_text, return_tensors="pt")["input_ids"]
-            .flatten()
-            .tolist(),
+            "input_ids": input_ids.tolist(),
             "mm_items": items,
             "im_start_id": self.img_start_token_id,
             "im_end_id": self.img_end_token_id,
@@ -45,15 +45,21 @@ class JanusProImageProcessor(BaseMultimodalProcessor):
             prompt=base_out.input_text,
             images=images,
         )
+
+        input_ids = res["input_ids"].flatten()
+        image_offsets = self.get_mm_items_offset(
+            input_ids=input_ids, mm_token_id=processor.image_id
+        )
         return {
             "mm_items": [
                 MultimodalDataItem(
                     pixel_values=res["pixel_values"],
                     image_emb_mask=res["images_emb_mask"],
+                    image_offsets=image_offsets,
                     modality=Modality.IMAGE,
                 )
             ],
-            "input_ids": res["input_ids"].flatten().tolist(),
+            "input_ids": input_ids.tolist(),
             "im_start_id": processor.image_start_id,
             "im_end_id": processor.image_end_id,
             "im_token_id": processor.image_id,
@@ -1,9 +1,7 @@
-import asyncio
-import math
-from typing import List, Union
+import re
+from typing import Any, Dict, List, Optional, Union
 
 import torch
-from PIL import Image
 
 from sglang.srt.managers.multimodal_processors.base_processor import (
     BaseMultimodalProcessor as SGLangBaseProcessor,
@@ -22,20 +20,12 @@ class KimiVLImageProcessor(SGLangBaseProcessor):
     def __init__(self, hf_config, server_args, _processor):
         super().__init__(hf_config, server_args, _processor)
         self.IMAGE_TOKEN = "<|media_pad|>"
-        self.im_token_id = _processor.tokenizer.convert_tokens_to_ids(self.IMAGE_TOKEN)
-
-        self.im_start = "<|media_start|>"
-        self.im_start_id = _processor.tokenizer.convert_tokens_to_ids(self.im_start)
-
-        self.im_end = "<|media_end|>"
-        self.im_end_id = _processor.tokenizer.convert_tokens_to_ids(self.im_end)
-
-        self.im_content = "<|media_content|>"
-        self.im_content_id = _processor.tokenizer.convert_tokens_to_ids(self.im_content)
+        self.IMAGE_TOKEN_REGEX = re.compile(r"(?:<\|media_pad\|>)+")
+        self.IM_TOKEN_ID = _processor.tokenizer.convert_tokens_to_ids(self.IMAGE_TOKEN)
 
     async def process_mm_data_async(
         self,
-        image_data: List[Union[str, bytes]],
+        image_data: List[Union[str, bytes, Dict]],
         input_text,
         request_obj,
         max_req_input_len,
@@ -50,24 +40,16 @@ class KimiVLImageProcessor(SGLangBaseProcessor):
         base_output = self.load_mm_data(
             prompt=input_text,
             image_data=image_data,
-            multimodal_tokens=MultimodalSpecialTokens(image_token=self.IMAGE_TOKEN),
+            multimodal_tokens=MultimodalSpecialTokens(
+                image_token=self.IMAGE_TOKEN, image_token_regex=self.IMAGE_TOKEN_REGEX
+            ),
             max_req_input_len=max_req_input_len,
         )
-        ret = self.process_mm_data(
-            input_text=base_output.input_text,
-            images=base_output.images,
-        )
+
+        combined_mm_item, input_ids = self.process_and_combine_mm_data(base_output)
+
         return {
-            "input_ids": ret["input_ids"].flatten().tolist(),
-            "mm_items": [
-                MultimodalDataItem(
-                    pixel_values=ret["pixel_values"],
-                    image_grid_thws=ret["image_grid_hws"],
-                    modality=Modality.IMAGE,
-                )
-            ],
-            "im_token_id": self.im_token_id,
-            "im_start_id": self.im_start_id,
-            "im_end_id": self.im_end_id,
-            "im_content_id": self.im_content_id,
+            "input_ids": input_ids.tolist(),
+            "mm_items": [combined_mm_item] if combined_mm_item is not None else [],
+            "im_token_id": self.IM_TOKEN_ID,
         }
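Illustrative note (not part of the package diff): KimiVL's regex collapses a run of <|media_pad|> tokens into one placeholder match, which is what lets an already expanded image region map to a single multimodal item. A small check of that behavior; the prompt string below is made up for illustration.

import re

IMAGE_TOKEN_REGEX = re.compile(r"(?:<\|media_pad\|>)+")

prompt = "<|media_start|>" + "<|media_pad|>" * 3 + "<|media_end|> describe this"
print(IMAGE_TOKEN_REGEX.findall(prompt))
# ['<|media_pad|><|media_pad|><|media_pad|>']  -> one run, not three separate matches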