sglang 0.4.6.post5__py3-none-any.whl → 0.4.7.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (359)
  1. sglang/__init__.py +2 -0
  2. sglang/api.py +7 -0
  3. sglang/bench_offline_throughput.py +10 -4
  4. sglang/bench_one_batch_server.py +67 -11
  5. sglang/bench_serving.py +86 -75
  6. sglang/lang/backend/runtime_endpoint.py +24 -1
  7. sglang/lang/interpreter.py +40 -1
  8. sglang/lang/ir.py +27 -0
  9. sglang/math_utils.py +8 -0
  10. sglang/profiler.py +167 -0
  11. sglang/srt/_custom_ops.py +34 -0
  12. sglang/srt/configs/internvl.py +8 -12
  13. sglang/srt/configs/model_config.py +33 -1
  14. sglang/srt/constrained/base_grammar_backend.py +5 -2
  15. sglang/srt/constrained/llguidance_backend.py +9 -8
  16. sglang/srt/constrained/outlines_backend.py +5 -4
  17. sglang/srt/constrained/xgrammar_backend.py +18 -18
  18. sglang/srt/conversation.py +52 -8
  19. sglang/srt/custom_op.py +38 -3
  20. sglang/srt/debug_utils.py +74 -0
  21. sglang/srt/disaggregation/base/__init__.py +1 -1
  22. sglang/srt/disaggregation/base/conn.py +25 -11
  23. sglang/srt/disaggregation/common/__init__.py +5 -0
  24. sglang/srt/disaggregation/common/conn.py +407 -0
  25. sglang/srt/disaggregation/common/utils.py +42 -0
  26. sglang/srt/disaggregation/decode.py +261 -52
  27. sglang/srt/disaggregation/fake/__init__.py +1 -1
  28. sglang/srt/disaggregation/fake/conn.py +16 -9
  29. sglang/srt/disaggregation/kv_events.py +60 -5
  30. sglang/srt/disaggregation/launch_lb.py +140 -0
  31. sglang/srt/disaggregation/mini_lb.py +29 -48
  32. sglang/srt/disaggregation/mooncake/__init__.py +1 -1
  33. sglang/srt/disaggregation/mooncake/conn.py +446 -149
  34. sglang/srt/disaggregation/mooncake/transfer_engine.py +32 -16
  35. sglang/srt/disaggregation/nixl/__init__.py +6 -1
  36. sglang/srt/disaggregation/nixl/conn.py +134 -437
  37. sglang/srt/disaggregation/prefill.py +130 -43
  38. sglang/srt/disaggregation/utils.py +127 -86
  39. sglang/srt/distributed/device_communicators/pymscclpp.py +315 -0
  40. sglang/srt/distributed/parallel_state.py +52 -5
  41. sglang/srt/entrypoints/EngineBase.py +6 -0
  42. sglang/srt/entrypoints/engine.py +116 -5
  43. sglang/srt/entrypoints/http_server.py +28 -4
  44. sglang/srt/eplb_simulator/__init__.py +1 -0
  45. sglang/srt/eplb_simulator/reader.py +51 -0
  46. sglang/srt/function_call/base_format_detector.py +138 -86
  47. sglang/srt/function_call/deepseekv3_detector.py +54 -6
  48. sglang/srt/function_call/ebnf_composer.py +33 -19
  49. sglang/srt/function_call/function_call_parser.py +27 -0
  50. sglang/srt/function_call/llama32_detector.py +33 -14
  51. sglang/srt/function_call/mistral_detector.py +73 -26
  52. sglang/srt/function_call/pythonic_detector.py +86 -20
  53. sglang/srt/function_call/qwen25_detector.py +64 -10
  54. sglang/srt/function_call/utils.py +17 -0
  55. sglang/srt/hf_transformers_utils.py +4 -0
  56. sglang/srt/layers/activation.py +19 -0
  57. sglang/srt/layers/attention/aiter_backend.py +503 -125
  58. sglang/srt/layers/attention/base_attn_backend.py +4 -0
  59. sglang/srt/layers/attention/cutlass_mla_backend.py +40 -34
  60. sglang/srt/layers/attention/flashattention_backend.py +137 -63
  61. sglang/srt/layers/attention/flashinfer_backend.py +46 -3
  62. sglang/srt/layers/attention/flashinfer_mla_backend.py +59 -25
  63. sglang/srt/layers/attention/flashmla_backend.py +2 -10
  64. sglang/srt/layers/attention/intel_amx_backend.py +128 -0
  65. sglang/srt/layers/attention/tbo_backend.py +232 -0
  66. sglang/srt/layers/attention/torch_native_backend.py +3 -0
  67. sglang/srt/layers/attention/triton_backend.py +304 -65
  68. sglang/srt/layers/attention/triton_ops/decode_attention.py +2 -7
  69. sglang/srt/layers/attention/triton_ops/extend_attention.py +12 -4
  70. sglang/srt/layers/attention/vision.py +51 -24
  71. sglang/srt/layers/communicator.py +281 -197
  72. sglang/srt/layers/dp_attention.py +6 -5
  73. sglang/srt/layers/layernorm.py +30 -19
  74. sglang/srt/layers/linear.py +0 -4
  75. sglang/srt/layers/logits_processor.py +0 -12
  76. sglang/srt/layers/moe/cutlass_moe.py +170 -7
  77. sglang/srt/layers/moe/cutlass_moe_params.py +169 -0
  78. sglang/srt/layers/moe/ep_moe/kernels.py +33 -11
  79. sglang/srt/layers/moe/ep_moe/layer.py +136 -72
  80. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +24 -45
  81. sglang/srt/layers/moe/fused_moe_native.py +4 -0
  82. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  83. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  84. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  85. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  86. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  87. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  88. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  89. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  90. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +221 -29
  91. sglang/srt/layers/moe/fused_moe_triton/layer.py +34 -4
  92. sglang/srt/layers/moe/topk.py +60 -26
  93. sglang/srt/layers/multimodal.py +3 -3
  94. sglang/srt/layers/pooler.py +56 -0
  95. sglang/srt/layers/quantization/__init__.py +3 -2
  96. sglang/srt/layers/quantization/blockwise_int8.py +3 -0
  97. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +5 -0
  98. sglang/srt/layers/quantization/deep_gemm_wrapper/__init__.py +1 -0
  99. sglang/srt/layers/quantization/{deep_gemm.py → deep_gemm_wrapper/compile_utils.py} +69 -127
  100. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +32 -0
  101. sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +110 -0
  102. sglang/srt/layers/quantization/fp8.py +28 -23
  103. sglang/srt/layers/quantization/fp8_kernel.py +156 -75
  104. sglang/srt/layers/quantization/fp8_utils.py +250 -69
  105. sglang/srt/layers/quantization/modelopt_quant.py +334 -7
  106. sglang/srt/layers/quantization/moe_wna16.py +3 -0
  107. sglang/srt/layers/quantization/w8a8_fp8.py +3 -0
  108. sglang/srt/layers/quantization/w8a8_int8.py +3 -0
  109. sglang/srt/layers/radix_attention.py +2 -3
  110. sglang/srt/layers/rotary_embedding.py +6 -12
  111. sglang/srt/layers/sampler.py +80 -79
  112. sglang/srt/layers/utils.py +6 -0
  113. sglang/srt/lora/layers.py +12 -15
  114. sglang/srt/lora/lora.py +49 -5
  115. sglang/srt/lora/lora_manager.py +98 -39
  116. sglang/srt/lora/mem_pool.py +28 -21
  117. sglang/srt/lora/utils.py +17 -13
  118. sglang/srt/managers/cache_controller.py +2 -1
  119. sglang/srt/managers/data_parallel_controller.py +13 -5
  120. sglang/srt/managers/eplb_algorithms/__init__.py +63 -0
  121. sglang/srt/managers/eplb_algorithms/deepseek.py +223 -0
  122. sglang/srt/managers/{deepseek_eplb.py → eplb_algorithms/deepseek_vec.py} +5 -7
  123. sglang/srt/managers/eplb_manager.py +55 -14
  124. sglang/srt/managers/expert_distribution.py +220 -46
  125. sglang/srt/managers/expert_location.py +110 -56
  126. sglang/srt/managers/expert_location_dispatch.py +23 -6
  127. sglang/srt/managers/io_struct.py +43 -8
  128. sglang/srt/managers/mm_utils.py +88 -38
  129. sglang/srt/managers/multimodal_processors/base_processor.py +190 -18
  130. sglang/srt/managers/multimodal_processors/gemma3.py +4 -31
  131. sglang/srt/managers/multimodal_processors/internvl.py +4 -0
  132. sglang/srt/managers/multimodal_processors/kimi_vl.py +15 -34
  133. sglang/srt/managers/multimodal_processors/minicpm.py +2 -1
  134. sglang/srt/managers/multimodal_processors/phi4mm.py +87 -0
  135. sglang/srt/managers/multimodal_processors/qwen_vl.py +22 -64
  136. sglang/srt/managers/multimodal_processors/vila.py +85 -0
  137. sglang/srt/managers/schedule_batch.py +173 -38
  138. sglang/srt/managers/scheduler.py +376 -127
  139. sglang/srt/managers/tokenizer_manager.py +163 -19
  140. sglang/srt/managers/utils.py +0 -4
  141. sglang/srt/mem_cache/chunk_cache.py +1 -0
  142. sglang/srt/mem_cache/hiradix_cache.py +4 -2
  143. sglang/srt/mem_cache/memory_pool.py +111 -407
  144. sglang/srt/mem_cache/memory_pool_host.py +380 -0
  145. sglang/srt/mem_cache/radix_cache.py +36 -12
  146. sglang/srt/metrics/collector.py +9 -0
  147. sglang/srt/model_executor/cuda_graph_runner.py +191 -113
  148. sglang/srt/model_executor/expert_location_updater.py +157 -22
  149. sglang/srt/model_executor/forward_batch_info.py +52 -22
  150. sglang/srt/model_executor/model_runner.py +102 -62
  151. sglang/srt/model_loader/loader.py +8 -1
  152. sglang/srt/model_loader/utils.py +67 -1
  153. sglang/srt/models/bert.py +113 -13
  154. sglang/srt/models/deepseek_nextn.py +1 -1
  155. sglang/srt/models/deepseek_v2.py +623 -290
  156. sglang/srt/models/gemma3_causal.py +7 -0
  157. sglang/srt/models/gemma3_mm.py +19 -14
  158. sglang/srt/models/idefics2.py +342 -0
  159. sglang/srt/models/internvl.py +46 -102
  160. sglang/srt/models/kimi_vl.py +4 -4
  161. sglang/srt/models/llama.py +1 -1
  162. sglang/srt/models/minicpmo.py +2 -5
  163. sglang/srt/models/minicpmv.py +3 -295
  164. sglang/srt/models/phi4mm.py +512 -0
  165. sglang/srt/models/qwen2.py +38 -9
  166. sglang/srt/models/qwen2_5_vl.py +3 -9
  167. sglang/srt/models/qwen2_eagle.py +4 -1
  168. sglang/srt/models/qwen2_moe.py +58 -191
  169. sglang/srt/models/qwen2_vl.py +3 -9
  170. sglang/srt/models/qwen3.py +41 -10
  171. sglang/srt/models/qwen3_moe.py +230 -191
  172. sglang/srt/models/registry.py +9 -1
  173. sglang/srt/models/roberta.py +117 -9
  174. sglang/srt/models/transformers.py +291 -0
  175. sglang/srt/models/vila.py +305 -0
  176. sglang/srt/openai_api/adapter.py +248 -28
  177. sglang/srt/openai_api/protocol.py +68 -3
  178. sglang/srt/openai_api/utils.py +172 -0
  179. sglang/srt/operations.py +37 -2
  180. sglang/srt/operations_strategy.py +200 -24
  181. sglang/srt/sampling/sampling_batch_info.py +37 -1
  182. sglang/srt/sampling/sampling_params.py +4 -1
  183. sglang/srt/server_args.py +381 -209
  184. sglang/srt/speculative/build_eagle_tree.py +9 -9
  185. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +12 -14
  186. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +256 -0
  187. sglang/srt/speculative/eagle_utils.py +440 -200
  188. sglang/srt/speculative/eagle_worker.py +234 -63
  189. sglang/srt/two_batch_overlap.py +637 -0
  190. sglang/srt/utils.py +187 -7
  191. sglang/test/attention/test_prefix_chunk_info.py +2 -0
  192. sglang/test/runners.py +54 -10
  193. sglang/test/send_one.py +4 -0
  194. sglang/test/test_block_fp8.py +1 -0
  195. sglang/test/test_block_fp8_deep_gemm_blackwell.py +252 -0
  196. sglang/test/test_block_fp8_ep.py +1 -0
  197. sglang/test/test_cutlass_moe.py +3 -3
  198. sglang/test/test_fp4_moe.py +248 -0
  199. sglang/test/test_utils.py +82 -7
  200. sglang/utils.py +9 -0
  201. sglang/version.py +1 -1
  202. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/METADATA +17 -14
  203. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/RECORD +359 -321
  204. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/WHEEL +1 -1
  205. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  206. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  207. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  208. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  209. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  210. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  211. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  212. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  213. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  214. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  215. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  216. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  217. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  218. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H200.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json} +0 -0
  219. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  220. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  221. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  222. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  223. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  224. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  225. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  226. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  227. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  228. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  229. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  230. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  231. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  232. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  233. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  234. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  235. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  236. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  237. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  238. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  239. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  240. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  241. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  242. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  243. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  244. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  245. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  246. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  247. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json} +0 -0
  248. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  249. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  250. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  251. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  252. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  253. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  254. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  255. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  256. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  257. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json} +0 -0
  258. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json} +0 -0
  259. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  260. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  261. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  262. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  263. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  264. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  265. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json} +0 -0
  266. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  267. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  268. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json} +0 -0
  269. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  270. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  271. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  272. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json} +0 -0
  273. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  274. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  275. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  276. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
  277. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  278. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  279. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  280. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json} +0 -0
  281. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json} +0 -0
  282. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json} +0 -0
  283. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json} +0 -0
  284. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  285. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  286. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json} +0 -0
  287. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json} +0 -0
  288. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json} +0 -0
  289. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json} +0 -0
  290. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  291. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  292. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  293. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  294. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json} +0 -0
  295. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  296. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  297. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  298. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  299. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json} +0 -0
  300. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json} +0 -0
  301. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json} +0 -0
  302. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json} +0 -0
  303. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  304. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  305. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
  306. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  307. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  308. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  309. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json} +0 -0
  310. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_L40S.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json} +0 -0
  311. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
  312. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
  313. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
  314. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  315. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  316. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  317. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  318. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json} +0 -0
  319. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json} +0 -0
  320. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json} +0 -0
  321. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json} +0 -0
  322. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  323. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  324. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  325. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  326. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json} +0 -0
  327. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
  328. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
  329. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
  330. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  331. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  332. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  333. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  334. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json} +0 -0
  335. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json} +0 -0
  336. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  337. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  338. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json} +0 -0
  339. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  340. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json} +0 -0
  341. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  342. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  343. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  344. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  345. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json} +0 -0
  346. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  347. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json} +0 -0
  348. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=96,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json} +0 -0
  349. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  350. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  351. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  352. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  353. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  354. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  355. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  356. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  357. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  358. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/licenses/LICENSE +0 -0
  359. {sglang-0.4.6.post5.dist-info → sglang-0.4.7.post1.dist-info}/top_level.txt +0 -0
sglang/srt/managers/expert_location_dispatch.py

@@ -25,7 +25,7 @@ from sglang.srt.managers.schedule_batch import global_server_args_dict
 class ExpertLocationDispatchInfo:
     ep_dispatch_algorithm: Literal["static", "random"]
     # (num_logical_experts,)
-    partial_logical_to_rank_dispatch_physical_map: torch.Tensor
+    partial_logical_to_rank_dispatch_physical_map: Optional[torch.Tensor]
     # (num_logical_experts, X)
     partial_logical_to_all_physical_map: torch.Tensor
     # (num_logical_experts,)
@@ -42,9 +42,14 @@ class ExpertLocationDispatchInfo:

         return cls(
             ep_dispatch_algorithm=ep_dispatch_algorithm,
-            partial_logical_to_rank_dispatch_physical_map=expert_location_metadata.logical_to_rank_dispatch_physical_map[
-                layer_id, :
-            ],
+            partial_logical_to_rank_dispatch_physical_map=(
+                expert_location_metadata.logical_to_rank_dispatch_physical_map[
+                    layer_id, :
+                ]
+                if expert_location_metadata.logical_to_rank_dispatch_physical_map
+                is not None
+                else None
+            ),
             partial_logical_to_all_physical_map=expert_location_metadata.logical_to_all_physical_map[
                 layer_id, :
             ],
@@ -55,6 +60,18 @@ class ExpertLocationDispatchInfo:
         )


+def transform_select_experts_inputs(
+    router_logits: torch.Tensor,
+    correction_bias: Optional[torch.Tensor],
+    info: Optional[ExpertLocationDispatchInfo],
+):
+    if (info is not None) and (info.ep_dispatch_algorithm == "fake"):
+        router_logits = torch.randn_like(router_logits)
+        if correction_bias is not None:
+            correction_bias = torch.zeros_like(correction_bias)
+    return router_logits, correction_bias
+
+
 def topk_ids_logical_to_physical(
     topk_ids: torch.Tensor, info: Optional[ExpertLocationDispatchInfo]
 ) -> torch.Tensor:
@@ -63,9 +80,9 @@ def topk_ids_logical_to_physical(

     if info.ep_dispatch_algorithm == "static":
         return _topk_ids_logical_to_physical_static(topk_ids, info)
-    if info.ep_dispatch_algorithm == "dynamic":
+    if info.ep_dispatch_algorithm in ["dynamic", "fake"]:
         return _topk_ids_logical_to_physical_dynamic(topk_ids, info)
-    raise NotImplementedError
+    raise NotImplementedError(f"Unknown algorithm {info.ep_dispatch_algorithm}")


 def _topk_ids_logical_to_physical_static(
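For orientation, a minimal sketch (not part of the package diff) of what the new "fake" dispatch path does. The standalone helper and tensor shapes below are illustrative stand-ins; in the package, `transform_select_experts_inputs` reads the algorithm from an `ExpertLocationDispatchInfo` object instead of a string argument.

```python
import torch

# Hypothetical sizes: 4 tokens routed over 8 logical experts, top-2 routing.
router_logits = torch.randn(4, 8)
correction_bias = torch.zeros(8)

def fake_dispatch(router_logits, correction_bias):
    # Mirrors the "fake" branch above: replace the real router logits with
    # random noise and zero the correction bias, so expert selection becomes
    # uniform-random (useful for load-balancing / EPLB experiments).
    router_logits = torch.randn_like(router_logits)
    if correction_bias is not None:
        correction_bias = torch.zeros_like(correction_bias)
    return router_logits, correction_bias

logits, bias = fake_dispatch(router_logits, correction_bias)
topk_ids = logits.topk(k=2, dim=-1).indices  # logical ids, later mapped to physical experts
```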
sglang/srt/managers/io_struct.py

@@ -20,7 +20,7 @@ import copy
 import uuid
 from dataclasses import dataclass, field
 from enum import Enum
-from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, Union
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union

 from sglang.srt.mm_utils import has_valid_data

@@ -30,7 +30,7 @@ if TYPE_CHECKING:
 else:
     Image = Any

-from sglang.srt.managers.schedule_batch import BaseFinishReason, flatten_nested_list
+from sglang.srt.managers.schedule_batch import BaseFinishReason
 from sglang.srt.sampling.sampling_params import SamplingParams

@@ -87,7 +87,7 @@ class GenerateReqInput:

     # The modalities of the image data [image, multi-images, video]
     modalities: Optional[List[str]] = None
-    # LoRA related
+    # The path to the LoRA
     lora_path: Optional[Union[List[Optional[str]], Optional[str]]] = None

     # Session info for continual prompting
@@ -99,13 +99,16 @@ class GenerateReqInput:
     custom_logit_processor: Optional[Union[List[Optional[str]], str]] = None

     # Whether to return hidden states
-    return_hidden_states: bool = False
+    return_hidden_states: Union[List[bool], bool] = False

     # For disaggregated inference
     bootstrap_host: Optional[Union[List[str], str]] = None
-    bootstrap_port: Optional[Union[List[int], int]] = None
+    bootstrap_port: Optional[Union[List[Optional[int]], int]] = None
     bootstrap_room: Optional[Union[List[int], int]] = None

+    # For data parallel rank routing
+    data_parallel_rank: Optional[int] = None
+
     def contains_mm_input(self) -> bool:
         return has_valid_data(self.image_data) or has_valid_data(self.audio_data)

@@ -406,7 +409,11 @@ class GenerateReqInput:
                 if self.custom_logit_processor is not None
                 else None
             ),
-            return_hidden_states=self.return_hidden_states,
+            return_hidden_states=(
+                self.return_hidden_states[i]
+                if isinstance(self.return_hidden_states, list)
+                else self.return_hidden_states
+            ),
             # if `__getitem__` is called, the bootstrap_host, bootstrap_port, bootstrap_room must be a list
             bootstrap_host=(
                 self.bootstrap_host[i] if self.bootstrap_host is not None else None
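Illustrative only (not from the diff): with `return_hidden_states` widened to `Union[List[bool], bool]` and the new `data_parallel_rank` field added above, a batched request can set both per batch. The field names exist in the diff; the prompts and sampling params below are placeholders.

```python
from sglang.srt.managers.io_struct import GenerateReqInput

batch = GenerateReqInput(
    text=["What is the capital of France?", "Write a haiku about rain."],
    sampling_params=[{"max_new_tokens": 32}, {"max_new_tokens": 32}],
    return_hidden_states=[True, False],  # only the first request returns hidden states
    data_parallel_rank=0,                # route the request to data-parallel rank 0
)
# Once the batch is normalized, __getitem__ (shown in this hunk and the next)
# propagates the i-th flag, so batch[0] keeps True and batch[1] keeps False.
```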
@@ -417,6 +424,9 @@
             bootstrap_room=(
                 self.bootstrap_room[i] if self.bootstrap_room is not None else None
             ),
+            data_parallel_rank=(
+                self.data_parallel_rank if self.data_parallel_rank is not None else None
+            ),
         )


@@ -464,11 +474,14 @@ class TokenizedGenerateReqInput:
     bootstrap_port: Optional[int] = None
     bootstrap_room: Optional[int] = None

+    # For data parallel rank routing
+    data_parallel_rank: Optional[int] = None
+

 @dataclass
 class EmbeddingReqInput:
     # The input prompt. It can be a single prompt or a batch of prompts.
-    text: Optional[Union[List[str], str]] = None
+    text: Optional[Union[List[List[str]], List[str], str]] = None
     # The image input. It can be an image instance, file name, URL, or base64 encoded string.
     # Can be formatted as:
     # - Single image for a single request
@@ -492,6 +505,8 @@ class EmbeddingReqInput:
     log_metrics: bool = True
     # The modalities of the image data [image, multi-images, video]
     modalities: Optional[List[str]] = None
+    # For cross-encoder requests
+    is_cross_encoder_request: bool = False

     def contains_mm_input(self) -> bool:
         return has_valid_data(self.image_data) or has_valid_data(self.audio_data)
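A toy illustration (token ids invented) of what the new cross-encoder plumbing implies downstream: the `token_type_ids` field added to `TokenizedEmbeddingReqInput` further below presumably carries the usual BERT-style segment ids, so a reranker can tell the query from the document in a single packed sequence.

```python
# Query and document token ids are made up for illustration.
query_ids = [101, 2054, 2003, 23435, 102]  # e.g. "[CLS] what is sglang [SEP]"
doc_ids = [23435, 2003, 1037, 4118, 102]   # e.g. "sglang is a framework [SEP]"

input_ids = query_ids + doc_ids
token_type_ids = [0] * len(query_ids) + [1] * len(doc_ids)  # 0 = query segment, 1 = document segment
assert len(input_ids) == len(token_type_ids)
```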
@@ -551,6 +566,16 @@ class EmbeddingReqInput:
         return self.rid

     def __getitem__(self, i):
+        if self.is_cross_encoder_request:
+            return EmbeddingReqInput(
+                text=[self.text[i]] if self.text is not None else None,
+                input_ids=None,
+                image_data=None,
+                sampling_params=self.sampling_params[i],
+                rid=self.rid[i],
+                is_cross_encoder_request=True,
+            )
+
         return EmbeddingReqInput(
             text=self.text[i] if self.text is not None else None,
             input_ids=self.input_ids[i] if self.input_ids is not None else None,
@@ -570,6 +595,8 @@ class TokenizedEmbeddingReqInput:
     input_ids: List[int]
     # The image inputs
     image_inputs: dict
+    # The token type ids
+    token_type_ids: List[int]
     # Dummy sampling params for compatibility
     sampling_params: SamplingParams

@@ -834,6 +861,12 @@ class SetInternalStateReq:
     server_args: Dict[str, Any]


+@dataclass
+class V1RerankReqInput:
+    query: str
+    documents: List[str]
+
+
 @dataclass
 class SetInternalStateReqOutput:
     updated: bool
@@ -848,7 +881,8 @@ class ProfileReqInput:
     # If it is set, profiling is automatically stopped after this step, and
     # the caller doesn't need to run stop_profile.
     num_steps: Optional[int] = None
-    activities: Optional[List[Literal["CPU", "GPU", "MEM", "CUDA_PROFILER"]]] = None
+    activities: Optional[List[str]] = None
+    profile_by_stage: bool = False
     with_stack: Optional[bool] = None
     record_shapes: Optional[bool] = None

@@ -875,6 +909,7 @@ class ProfileReq:
     output_dir: Optional[str] = None
     num_steps: Optional[int] = None
     activities: Optional[List[str]] = None
+    profile_by_stage: bool = False
     with_stack: Optional[bool] = None
     record_shapes: Optional[bool] = None
     profile_id: Optional[str] = None
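Construction sketch for the new `V1RerankReqInput` dataclass (fields exactly as in the hunk above); how the server scores the pairs is outside this excerpt, and the example strings are placeholders.

```python
from sglang.srt.managers.io_struct import V1RerankReqInput

req = V1RerankReqInput(
    query="how to serve an LLM with sglang",
    documents=[
        "SGLang is a fast serving framework for large language models.",
        "A recipe for sourdough bread.",
    ],
)
# A reranker scores each (query, document) pair; the request itself is just these two fields.
```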
sglang/srt/managers/mm_utils.py

@@ -252,40 +252,36 @@ def get_embedding_chunk(
     return embedding_chunk, start_index, end_index


-def get_embedding_and_mask(
+def _get_precomputed_embedding(
+    items: List[MultimodalDataItem],
+) -> Optional[torch.Tensor]:
+    """
+    If all items have precomputed_features, return their concatenation.
+    If some but not all have precomputed_features, raise NotImplementedError.
+    If none have precomputed_features, return None.
+    """
+    precomputed_features = [item.precomputed_features for item in items]
+    if any(feature is not None for feature in precomputed_features):
+        if not all(feature is not None for feature in precomputed_features):
+            raise NotImplementedError(
+                "MM inputs where only some items are precomputed."
+            )
+        result = torch.concat(precomputed_features)
+        # some models embedding is 3-dim, reshape it to 2-dim (similar to get_embedding_chunk)
+        result = result.reshape(-1, result.shape[-1])
+        return result
+    return None
+
+
+def _get_chunked_prefill_embedding(
     data_embedding_func: Callable[[List[MultimodalDataItem]], torch.Tensor],
     embedding_items: List[MultimodalDataItem],
-    placeholder_tensor: torch.Tensor,
-    input_ids: torch.Tensor,
     items_size: List[int],
     prefix_length: List[int],
     extend_length: List[int],
     items_offset_list: List[List[Tuple[int, int]]],
-) -> Tuple[torch.Tensor, torch.Tensor]:
-    """
-    Generate multimodal embeddings and create a mask for identifying their positions in the input sequence.
-
-    Args:
-        data_embedding_func: Function that generates embeddings for multimodal items
-        embedding_items: List of multimodal items to embed
-        placeholder_tensor: Tensor containing token IDs that serve as placeholders for multimodal content
-        input_ids: The input token IDs tensor
-        items_size: Cumulative sizes of multimodal items per request
-        prefix_length: Prefix lengths for each request
-        extend_length: Sequence lengths for each request
-        items_offset_list: List of offset ranges for multimodal items in each request
-
-    Returns:
-        A tuple containing:
-        - The generated embeddings tensor
-        - A boolean mask tensor indicating where these embeddings should be placed
-
-    Raises:
-        AssertionError: If the number of multimodal tokens in input_ids doesn't match
-            the number of tokens in the generated embeddings
-    """
-    # 1. Get the embedding
-    # Calculate embedding for each request, try to get it from cache to avoid repeated calculation
+) -> Optional[torch.Tensor]:
+    # Calculate embedding for each request, try to get it from cache to avoid repeated calculation
     embedding_list = []
     for i in range(len(items_size) - 1):
         if items_size[i] == items_size[i + 1]:
@@ -321,21 +317,28 @@ def get_embedding_and_mask(
         embedding_cache.free(embedding_items_hash)
         embedding_list.append(embedding_per_req_chunk)
     if len(embedding_list) == 0:
-        return None, None
-    embedding = torch.concat(embedding_list, dim=0)
-    # 2. Check the embedding
-    num_mm_tokens_in_embedding = embedding.shape[0]
-    special_multimodal_mask = torch.isin(
-        input_ids,
-        placeholder_tensor,
-    ).unsqueeze(-1)
+        return None
+    return torch.concat(embedding_list, dim=0)
+
+
+def _get_multimodal_mask(
+    input_ids: torch.Tensor, placeholder_tensor: torch.Tensor
+) -> torch.Tensor:
+    return torch.isin(input_ids, placeholder_tensor).unsqueeze(-1)

-    num_mm_tokens_in_input_ids = special_multimodal_mask.sum().item()
+
+def _adjust_embedding_length(
+    embedding: torch.Tensor,
+    mask: torch.Tensor,
+    logger,
+) -> torch.Tensor:
+    num_mm_tokens_in_embedding = embedding.shape[0]
+    num_mm_tokens_in_input_ids = mask.sum().item()
     if num_mm_tokens_in_input_ids != num_mm_tokens_in_embedding:
         logger.warning(
             f"Number of tokens in multimodal embedding does not match those in the input text. "
             f"Got {num_mm_tokens_in_input_ids} tokens in the text but {num_mm_tokens_in_embedding} "
-            "tokens from multimodal embeddings."
+            f"tokens from multimodal embeddings."
         )
     if num_mm_tokens_in_input_ids < num_mm_tokens_in_embedding:
         chunked_prefill_size = global_server_args_dict["chunked_prefill_size"]
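To make the refactor concrete, a tiny standalone run of the mask step: `_get_multimodal_mask` is just the `torch.isin(...).unsqueeze(-1)` expression above, and the placeholder id 32000 here is invented for the example.

```python
import torch

input_ids = torch.tensor([1, 5, 32000, 32000, 7, 32000, 2])  # 32000 = hypothetical image placeholder id
placeholder_tensor = torch.tensor([32000])

mask = torch.isin(input_ids, placeholder_tensor).unsqueeze(-1)  # same expression as _get_multimodal_mask
print(mask.squeeze(-1).tolist())  # [False, False, True, True, False, True, False]
print(int(mask.sum()))            # 3 placeholder slots to be filled with multimodal embeddings
```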
@@ -353,7 +356,54 @@ def get_embedding_and_mask(
353
356
  raise RuntimeError(
354
357
  f"Insufficient multimodal embedding length: {num_mm_tokens_in_input_ids=} vs {num_mm_tokens_in_embedding=}. This is an internal error"
355
358
  )
359
+ return embedding
360
+
361
+
362
+ def get_embedding_and_mask(
363
+ data_embedding_func: Callable[[List[MultimodalDataItem]], torch.Tensor],
364
+ embedding_items: List[MultimodalDataItem],
365
+ placeholder_tensor: torch.Tensor,
366
+ input_ids: torch.Tensor,
367
+ items_size: List[int],
368
+ prefix_length: List[int],
369
+ extend_length: List[int],
370
+ items_offset_list: List[List[Tuple[int, int]]],
371
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
372
+ """
373
+ Generate multimodal embeddings and create a mask for identifying their positions in the input sequence.
356
374
 
375
+ Args:
376
+ data_embedding_func: Function that generates embeddings for multimodal items
377
+ embedding_items: List of multimodal items to embed
378
+ placeholder_tensor: Tensor containing token IDs that serve as placeholders for multimodal content
379
+ input_ids: The input token IDs tensor
380
+ items_size: Cumulative sizes of multimodal items per request
381
+ prefix_length: Prefix lengths for each request
382
+ extend_length: Sequence lengths for each request
383
+ items_offset_list: List of offset ranges for multimodal items in each request
384
+
385
+ Returns:
386
+ A tuple containing:
387
+ - The generated embeddings tensor
388
+ - A boolean mask tensor indicating where these embeddings should be placed
389
+ """
390
+ # 1. Get embedding
391
+ embedding = _get_precomputed_embedding(embedding_items)
392
+ if embedding is None:
393
+ embedding = _get_chunked_prefill_embedding(
394
+ data_embedding_func,
395
+ embedding_items,
396
+ items_size,
397
+ prefix_length,
398
+ extend_length,
399
+ items_offset_list,
400
+ )
401
+ if embedding is None:
402
+ return None, None
403
+ # 2. Get mask
404
+ special_multimodal_mask = _get_multimodal_mask(input_ids, placeholder_tensor)
405
+ # 3. Adjust embedding length if needed
406
+ embedding = _adjust_embedding_length(embedding, special_multimodal_mask, logger)
357
407
  return embedding, special_multimodal_mask
@@ -5,7 +5,8 @@ import multiprocessing as mp
 import os
 import re
 from abc import ABC, abstractmethod
-from typing import List, Optional, Tuple, Union
+from enum import Enum
+from typing import Any, Dict, List, Optional, Tuple, Union
 
 import numpy as np
 import torch
@@ -16,16 +17,24 @@ from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
 from sglang.srt.utils import encode_video, load_audio, load_image
 
 
+class MultimodalInputFormat(Enum):
+    """Enum for different multimodal input formats."""
+
+    RAW_IMAGES = "raw_images"
+    PRECOMPUTED_FEATURES = "precomputed_features"
+    PIXEL_VALUES = "pixel_values"
+
+
 @dataclasses.dataclass
 class BaseMultiModalProcessorOutput:
     # input_text, with each frame of video/image represented with a image_token
     input_text: str
 
     # frames loaded from image and video, in given order
-    images: Optional[list[Union[Image.Image, MultimodalDataItem]]] = None
+    images: Optional[list[Union[Image.Image, dict]]] = None
 
     # audios
-    audios: Optional[list[Union[np.ndarray, MultimodalDataItem]]] = None
+    audios: Optional[list[Union[np.ndarray, dict]]] = None
 
     def normalize(self):
         for field_name in ["images", "audios"]:
@@ -137,7 +146,7 @@ class BaseMultimodalProcessor(ABC):
         request_obj,
         max_req_input_len,
         **kwargs,
-    ):
+    ) -> Optional[Dict[str, Any]]:
         pass
 
     def get_estimated_frames_list(self, image_data):
@@ -170,8 +179,6 @@ class BaseMultimodalProcessor(ABC):
     ):
         """Static method that can be pickled for multiprocessing"""
         if isinstance(data, dict):
-            return MultimodalDataItem.from_dict(data)
-        if isinstance(data, MultimodalDataItem):
             return data
         try:
             if is_audio:
@@ -254,7 +261,7 @@ class BaseMultimodalProcessor(ABC):
 
     def load_mm_data(
         self,
-        prompt: str,
+        prompt: str | List[int],
         multimodal_tokens: MultimodalSpecialTokens,
         max_req_input_len: int,
         image_data: Optional[list] = None,
@@ -370,15 +377,180 @@ class BaseMultimodalProcessor(ABC):
 
         return list(zip(indices_start.tolist(), indices_end.tolist()))
 
-    def mm_inputs_are_preprocessed(self, mm_inputs: Optional[list]):
-        """Returns true if all images are preprocessed, false if all are not, and error otherwise."""
-        if not mm_inputs:
-            return True
-        ret = any(isinstance(mm_input, MultimodalDataItem) for mm_input in mm_inputs)
-        if ret and not all(
-            isinstance(mm_input, MultimodalDataItem) for mm_input in mm_inputs
-        ):
-            raise ValueError(
-                "Unsupported: mixture of multimodal inputs where some but not all are preprocessed."
+    @staticmethod
+    def _extract_processor_features(
+        items: List[dict], attr_name: str
+    ) -> Optional[torch.Tensor]:
+        """
+        Helper function to concat extracted attributes from processor output.
+        """
+        values = [value for item in items if (value := item.get(attr_name)) is not None]
+        return torch.cat(values) if values else None
+
+    # When we assume that all the items have the same attributes
+    def _extract_processor_features_from_all_attributes(
+        self, items: List[dict]
+    ) -> dict:
+        values = {}
+        # Verify all items have the same keys
+        first_keys = set(items[0].keys())
+        for item in items[1:]:
+            if set(item.keys()) != first_keys:
+                raise ValueError(
+                    f"All items must have the same attributes. "
+                    f"First item has {first_keys}, but found {set(item.keys())}"
+                )
+
+        # Process each attribute
+        for k, v in items[0].items():
+            if isinstance(v, list):
+                values[k] = self._extract_processor_features(items, k)
+            else:
+                # Verify all items have the same value for non-list attributes
+                for item in items[1:]:
+                    if item[k] != v:
+                        raise ValueError(
+                            f"All items must have the same value for attribute {k}. "
+                            f"First item has {v}, but found {item[k]}"
+                        )
+                values[k] = v
+        return values
+
+    def process_and_combine_mm_data(
+        self, base_output: BaseMultiModalProcessorOutput
+    ) -> Tuple[Optional[MultimodalDataItem], torch.Tensor]:
+        """
+        Process multimodal data and return the combined multimodal item and input_ids.
+        Handles all three input formats at the same abstraction level.
+
+        Returns:
+            Tuple of (combined_mm_item, input_ids)
+        """
+
+        def tokenize_text(input_text: str) -> torch.Tensor:
+            """Tokenize input text."""
+            return self._processor.tokenizer(
+                input_text,
+                return_tensors="pt",
+                add_special_tokens=True,
+            ).input_ids.flatten()
+
+        def categorize_mm_inputs(mm_inputs: List) -> MultimodalInputFormat:
+            """Categorize multimodal inputs and validate consistency."""
+            try:
+                has_image = False
+                has_pixel_values = False
+                has_precomputed_features = False
+
+                for mm_input in mm_inputs:
+                    if isinstance(mm_input, Image.Image):
+                        has_image = True
+                    elif isinstance(mm_input, dict):
+                        if mm_input.get("precomputed_features", None) is not None:
+                            has_precomputed_features = True
+                        elif mm_input.get("pixel_values", None) is not None:
+                            has_pixel_values = True
+                        else:
+                            raise ValueError(
+                                f"Invalid multimodal input: {mm_input}, expected dict with pixel_values or precomputed_features"
+                            )
+                    else:
+                        raise ValueError(
+                            f"Invalid multimodal input: {mm_input}, expected Image.Image or dict"
+                        )
+
+                # Validate format consistency
+                format_count = sum(
+                    [has_image, has_pixel_values, has_precomputed_features]
+                )
+                if format_count > 1:
+                    raise ValueError(
+                        "Unsupported: mixture of multimodal input formats. "
+                        f"Found formats: image={has_image}, pixel_values={has_pixel_values}, "
+                        f"precomputed_features={has_precomputed_features}"
+                    )
+
+                if has_image:
+                    return MultimodalInputFormat.RAW_IMAGES
+                elif has_precomputed_features:
+                    return MultimodalInputFormat.PRECOMPUTED_FEATURES
+                elif has_pixel_values:
+                    return MultimodalInputFormat.PIXEL_VALUES
+                else:
+                    raise ValueError("No valid multimodal input format found")
+            except Exception as e:
+                raise ValueError(f"Failed to categorize inputs: {e}")
+
+        def process_raw_images(
+            base_output: BaseMultiModalProcessorOutput,
+        ) -> Tuple[MultimodalDataItem, torch.Tensor]:
+            """Process raw Image.Image objects using transformers processor."""
+            ret = self.process_mm_data(
+                input_text=base_output.input_text,
+                images=base_output.images,
+            )
+            combined_mm_item = MultimodalDataItem(modality=Modality.IMAGE)
+
+            # Copy all fields from processor output except input_ids
+            for key, value in ret.items():
+                if key != "input_ids" and hasattr(combined_mm_item, key):
+                    setattr(combined_mm_item, key, value)
+
+            input_ids = ret["input_ids"].flatten()
+            return combined_mm_item, input_ids
+
+        def process_precomputed_features(
+            base_output: BaseMultiModalProcessorOutput,
+        ) -> Tuple[MultimodalDataItem, torch.Tensor]:
+            """Process inputs with precomputed features."""
+            combined_mm_item = MultimodalDataItem(modality=Modality.IMAGE)
+            combined_mm_item.precomputed_features = self._extract_processor_features(
+                base_output.images, "precomputed_features"
             )
-        return ret
+            input_ids = tokenize_text(base_output.input_text)
+            return combined_mm_item, input_ids
+
+        def process_pixel_values(
+            base_output: BaseMultiModalProcessorOutput,
+        ) -> Tuple[MultimodalDataItem, torch.Tensor]:
+            """Process inputs with pixel values."""
+            values = self._extract_processor_features_from_all_attributes(
+                base_output.images
+            )
+            combined_mm_item = MultimodalDataItem.from_dict(values)
+            input_ids = tokenize_text(base_output.input_text)
+            return combined_mm_item, input_ids
+
+        def finalize_mm_item(
+            combined_mm_item: MultimodalDataItem, input_ids: torch.Tensor
+        ) -> MultimodalDataItem:
+            """Apply common post-processing to the multimodal item."""
+            combined_mm_item.image_offsets = self.get_mm_items_offset(
+                input_ids=input_ids,
+                mm_token_id=self.IM_TOKEN_ID,
+            )
+            return combined_mm_item
+
+        # Main logic
+        mm_inputs = base_output.images
+        if not mm_inputs:
+            # Return text-only case
+            input_ids = tokenize_text(base_output.input_text)
+            return None, input_ids
+
+        # Categorize input formats
+        input_format = categorize_mm_inputs(mm_inputs)
+
+        # Process based on format
+        if input_format == MultimodalInputFormat.RAW_IMAGES:
+            combined_mm_item, input_ids = process_raw_images(base_output)
+        elif input_format == MultimodalInputFormat.PRECOMPUTED_FEATURES:
+            combined_mm_item, input_ids = process_precomputed_features(base_output)
+        elif input_format == MultimodalInputFormat.PIXEL_VALUES:
+            combined_mm_item, input_ids = process_pixel_values(base_output)
+        else:
+            raise ValueError(f"Unknown input format: {input_format}")
+
+        # Finalize with common processing
+        combined_mm_item = finalize_mm_item(combined_mm_item, input_ids)
+        return combined_mm_item, input_ids
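For reference, `process_and_combine_mm_data` accepts three shapes of `base_output.images`, matching the `MultimodalInputFormat` enum introduced above. A small sketch of the accepted forms; the tensor shapes are illustrative assumptions, not values required by the code:

```python
import torch
from PIL import Image

# Each request must use exactly one of these forms; mixing them in a single
# request makes categorize_mm_inputs raise a ValueError.
raw_images = [Image.new("RGB", (224, 224))]                        # RAW_IMAGES
pixel_values = [{"pixel_values": torch.zeros(1, 3, 224, 224)}]     # PIXEL_VALUES
precomputed = [{"precomputed_features": torch.zeros(256, 1024)}]   # PRECOMPUTED_FEATURES
```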
@@ -27,6 +27,7 @@ class Gemma3SGLangImageProcessor(SGLangBaseProcessor):
         )
         self.IM_START_TOKEN_ID = hf_config.boi_token_index
         self.IM_END_TOKEN_ID = hf_config.eoi_token_index
+        self.IM_TOKEN_ID = hf_config.image_token_index
 
     async def process_mm_data_async(
         self,
@@ -42,49 +43,21 @@ class Gemma3SGLangImageProcessor(SGLangBaseProcessor):
         if isinstance(image_data, str):
             image_data = [image_data]
 
-        image_token = self.IMAGE_TOKEN
-        image_token_regex = self.IMAGE_TOKEN_REGEX
         base_output = self.load_mm_data(
             prompt=input_text,
             image_data=image_data,
             multimodal_tokens=MultimodalSpecialTokens(
-                image_token=image_token, image_token_regex=image_token_regex
+                image_token=self.IMAGE_TOKEN, image_token_regex=self.IMAGE_TOKEN_REGEX
             ),
             max_req_input_len=max_req_input_len,
            discard_alpha_channel=True,
         )
 
-        images_are_preprocessed = self.mm_inputs_are_preprocessed(base_output.images)
-        ret = self.process_mm_data(
-            input_text=base_output.input_text,
-            images=None if images_are_preprocessed else base_output.images,
-        )
-
-        items = []
-        input_ids = ret["input_ids"].flatten()
-        image_offsets = self.get_mm_items_offset(
-            input_ids=input_ids,
-            mm_token_id=self.hf_config.image_token_index,
-        )
-        for i, image in enumerate(base_output.images):
-            if images_are_preprocessed:
-                pixel_values = image.pixel_values
-                precomputed_features = image.precomputed_features
-            else:
-                pixel_values = ret["pixel_values"][i]
-                precomputed_features = None
-
-            item = MultimodalDataItem(
-                pixel_values=pixel_values,
-                precomputed_features=precomputed_features,
-                modality=Modality.IMAGE,
-                image_offsets=image_offsets[i],
-            )
-            items += [item]
+        combined_mm_item, input_ids = self.process_and_combine_mm_data(base_output)
 
         return {
-            "mm_items": items,
             "input_ids": input_ids.tolist(),
+            "mm_items": [combined_mm_item] if combined_mm_item is not None else [],
             "im_start_id": self.IM_START_TOKEN_ID,
             "im_end_id": self.IM_END_TOKEN_ID,
         }
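The Gemma3 change above shows the intended adoption pattern for the new helper: a processor loads its data, delegates to `process_and_combine_mm_data`, and wraps the single combined item. A minimal sketch of that pattern for another processor; the import path and the `MyImageProcessor` name are assumptions for illustration, not part of this diff:

```python
from sglang.srt.managers.multimodal_processors.base_processor import (  # path assumed
    BaseMultimodalProcessor,
    MultimodalSpecialTokens,
)


class MyImageProcessor(BaseMultimodalProcessor):  # hypothetical adopter
    async def process_mm_data_async(
        self, image_data, input_text, request_obj, max_req_input_len, **kwargs
    ):
        if isinstance(image_data, str):
            image_data = [image_data]
        base_output = self.load_mm_data(
            prompt=input_text,
            image_data=image_data,
            multimodal_tokens=MultimodalSpecialTokens(image_token=self.IMAGE_TOKEN),
            max_req_input_len=max_req_input_len,
        )
        # One combined item (or None for a text-only request) plus the token ids.
        combined_mm_item, input_ids = self.process_and_combine_mm_data(base_output)
        return {
            "input_ids": input_ids.tolist(),
            "mm_items": [combined_mm_item] if combined_mm_item is not None else [],
        }
```

The returned dict mirrors what the Gemma3 processor produces above; model-specific extras such as `im_start_id` and `im_end_id` can be added alongside these keys.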