sglang 0.4.6.post4__py3-none-any.whl → 0.4.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (358)
  1. sglang/bench_offline_throughput.py +16 -10
  2. sglang/bench_one_batch.py +5 -4
  3. sglang/bench_one_batch_server.py +86 -22
  4. sglang/bench_serving.py +197 -110
  5. sglang/compile_deep_gemm.py +4 -4
  6. sglang/lang/backend/runtime_endpoint.py +24 -1
  7. sglang/profiler.py +167 -0
  8. sglang/srt/_custom_ops.py +34 -0
  9. sglang/srt/configs/internvl.py +8 -12
  10. sglang/srt/configs/model_config.py +66 -29
  11. sglang/srt/constrained/base_grammar_backend.py +5 -2
  12. sglang/srt/constrained/llguidance_backend.py +9 -8
  13. sglang/srt/constrained/outlines_backend.py +5 -4
  14. sglang/srt/constrained/xgrammar_backend.py +18 -18
  15. sglang/srt/conversation.py +47 -9
  16. sglang/srt/custom_op.py +38 -3
  17. sglang/srt/debug_utils.py +74 -0
  18. sglang/srt/disaggregation/common/__init__.py +1 -0
  19. sglang/srt/disaggregation/common/conn.py +407 -0
  20. sglang/srt/disaggregation/decode.py +187 -134
  21. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +142 -0
  22. sglang/srt/disaggregation/fake/conn.py +4 -13
  23. sglang/srt/disaggregation/kv_events.py +412 -0
  24. sglang/srt/disaggregation/launch_lb.py +140 -0
  25. sglang/srt/disaggregation/mini_lb.py +84 -70
  26. sglang/srt/disaggregation/mooncake/conn.py +441 -140
  27. sglang/srt/disaggregation/mooncake/transfer_engine.py +31 -14
  28. sglang/srt/disaggregation/nixl/conn.py +124 -442
  29. sglang/srt/disaggregation/prefill.py +128 -44
  30. sglang/srt/disaggregation/utils.py +154 -6
  31. sglang/srt/distributed/device_communicators/pymscclpp.py +315 -0
  32. sglang/srt/distributed/parallel_state.py +52 -5
  33. sglang/srt/distributed/utils.py +3 -3
  34. sglang/srt/entrypoints/EngineBase.py +11 -0
  35. sglang/srt/entrypoints/engine.py +129 -12
  36. sglang/srt/entrypoints/http_server.py +21 -6
  37. sglang/srt/entrypoints/http_server_engine.py +5 -2
  38. sglang/srt/function_call/base_format_detector.py +302 -0
  39. sglang/srt/function_call/core_types.py +34 -0
  40. sglang/srt/function_call/deepseekv3_detector.py +205 -0
  41. sglang/srt/function_call/ebnf_composer.py +248 -0
  42. sglang/srt/function_call/function_call_parser.py +202 -0
  43. sglang/srt/function_call/llama32_detector.py +93 -0
  44. sglang/srt/function_call/mistral_detector.py +131 -0
  45. sglang/srt/function_call/pythonic_detector.py +229 -0
  46. sglang/srt/function_call/qwen25_detector.py +121 -0
  47. sglang/srt/function_call/utils.py +52 -0
  48. sglang/srt/hf_transformers_utils.py +50 -7
  49. sglang/srt/layers/attention/aiter_backend.py +878 -0
  50. sglang/srt/layers/attention/base_attn_backend.py +4 -0
  51. sglang/srt/layers/attention/cutlass_mla_backend.py +2 -19
  52. sglang/srt/layers/attention/flashattention_backend.py +166 -35
  53. sglang/srt/layers/attention/flashinfer_backend.py +45 -1
  54. sglang/srt/layers/attention/flashinfer_mla_backend.py +45 -5
  55. sglang/srt/layers/attention/flashmla_backend.py +340 -78
  56. sglang/srt/layers/attention/intel_amx_backend.py +128 -0
  57. sglang/srt/layers/attention/tbo_backend.py +232 -0
  58. sglang/srt/layers/attention/torch_native_backend.py +3 -0
  59. sglang/srt/layers/attention/triton_backend.py +247 -5
  60. sglang/srt/layers/attention/triton_ops/extend_attention.py +12 -4
  61. sglang/srt/layers/attention/utils.py +2 -2
  62. sglang/srt/layers/attention/vision.py +1 -1
  63. sglang/srt/layers/communicator.py +517 -0
  64. sglang/srt/layers/dp_attention.py +6 -15
  65. sglang/srt/layers/layernorm.py +30 -19
  66. sglang/srt/layers/moe/cutlass_moe.py +370 -0
  67. sglang/srt/layers/moe/cutlass_moe_params.py +169 -0
  68. sglang/srt/layers/moe/ep_moe/kernels.py +60 -17
  69. sglang/srt/layers/moe/ep_moe/layer.py +195 -87
  70. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +88 -8
  71. sglang/srt/layers/moe/fused_moe_native.py +4 -0
  72. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  73. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  74. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  75. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  76. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  77. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  78. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  79. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  80. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +220 -25
  81. sglang/srt/layers/moe/fused_moe_triton/layer.py +48 -4
  82. sglang/srt/layers/moe/topk.py +107 -24
  83. sglang/srt/layers/multimodal.py +70 -0
  84. sglang/srt/layers/quantization/__init__.py +10 -4
  85. sglang/srt/layers/quantization/blockwise_int8.py +3 -0
  86. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +5 -0
  87. sglang/srt/layers/quantization/deep_gemm.py +60 -59
  88. sglang/srt/layers/quantization/fp8.py +113 -18
  89. sglang/srt/layers/quantization/fp8_kernel.py +118 -66
  90. sglang/srt/layers/quantization/fp8_utils.py +165 -43
  91. sglang/srt/layers/quantization/gptq.py +298 -6
  92. sglang/srt/layers/quantization/int8_kernel.py +18 -5
  93. sglang/srt/layers/quantization/modelopt_quant.py +334 -7
  94. sglang/srt/layers/quantization/moe_wna16.py +3 -0
  95. sglang/srt/layers/quantization/qoq.py +244 -0
  96. sglang/srt/layers/quantization/w8a8_fp8.py +3 -0
  97. sglang/srt/layers/quantization/w8a8_int8.py +3 -0
  98. sglang/srt/layers/rotary_embedding.py +6 -12
  99. sglang/srt/layers/sampler.py +80 -79
  100. sglang/srt/layers/utils.py +6 -0
  101. sglang/srt/lora/layers.py +12 -15
  102. sglang/srt/lora/lora.py +49 -5
  103. sglang/srt/lora/lora_manager.py +20 -8
  104. sglang/srt/lora/mem_pool.py +24 -16
  105. sglang/srt/lora/utils.py +17 -13
  106. sglang/srt/managers/data_parallel_controller.py +13 -5
  107. sglang/srt/managers/eplb_algorithms/__init__.py +63 -0
  108. sglang/srt/managers/eplb_algorithms/deepseek.py +223 -0
  109. sglang/srt/managers/eplb_algorithms/deepseek_vec.py +276 -0
  110. sglang/srt/managers/eplb_manager.py +96 -0
  111. sglang/srt/managers/expert_distribution.py +878 -56
  112. sglang/srt/managers/expert_location.py +448 -0
  113. sglang/srt/managers/expert_location_dispatch.py +108 -0
  114. sglang/srt/managers/io_struct.py +29 -5
  115. sglang/srt/managers/mm_utils.py +355 -151
  116. sglang/srt/managers/multimodal_processors/base_processor.py +299 -42
  117. sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +6 -1
  118. sglang/srt/managers/multimodal_processors/gemma3.py +15 -17
  119. sglang/srt/managers/multimodal_processors/internvl.py +18 -5
  120. sglang/srt/managers/multimodal_processors/janus_pro.py +7 -1
  121. sglang/srt/managers/multimodal_processors/kimi_vl.py +14 -32
  122. sglang/srt/managers/multimodal_processors/llava.py +3 -3
  123. sglang/srt/managers/multimodal_processors/minicpm.py +27 -32
  124. sglang/srt/managers/multimodal_processors/mllama4.py +6 -0
  125. sglang/srt/managers/multimodal_processors/phi4mm.py +87 -0
  126. sglang/srt/managers/multimodal_processors/pixtral.py +9 -9
  127. sglang/srt/managers/multimodal_processors/qwen_vl.py +35 -35
  128. sglang/srt/managers/schedule_batch.py +185 -55
  129. sglang/srt/managers/schedule_policy.py +4 -5
  130. sglang/srt/managers/scheduler.py +389 -154
  131. sglang/srt/managers/session_controller.py +1 -1
  132. sglang/srt/managers/tokenizer_manager.py +231 -39
  133. sglang/srt/managers/utils.py +0 -4
  134. sglang/srt/mem_cache/base_prefix_cache.py +3 -0
  135. sglang/srt/mem_cache/chunk_cache.py +3 -1
  136. sglang/srt/mem_cache/hiradix_cache.py +4 -4
  137. sglang/srt/mem_cache/memory_pool.py +74 -52
  138. sglang/srt/mem_cache/multimodal_cache.py +45 -0
  139. sglang/srt/mem_cache/radix_cache.py +58 -5
  140. sglang/srt/metrics/collector.py +11 -2
  141. sglang/srt/mm_utils.py +10 -0
  142. sglang/srt/model_executor/cuda_graph_runner.py +87 -65
  143. sglang/srt/model_executor/expert_location_updater.py +557 -0
  144. sglang/srt/model_executor/forward_batch_info.py +39 -14
  145. sglang/srt/model_executor/model_runner.py +231 -101
  146. sglang/srt/model_loader/loader.py +10 -6
  147. sglang/srt/model_loader/utils.py +67 -1
  148. sglang/srt/models/clip.py +5 -1
  149. sglang/srt/models/deepseek_nextn.py +1 -1
  150. sglang/srt/models/deepseek_v2.py +732 -403
  151. sglang/srt/models/exaone.py +8 -3
  152. sglang/srt/models/gemma3_causal.py +7 -0
  153. sglang/srt/models/gemma3_mm.py +75 -33
  154. sglang/srt/models/idefics2.py +342 -0
  155. sglang/srt/models/kimi_vl.py +4 -4
  156. sglang/srt/models/llama.py +1 -1
  157. sglang/srt/models/llama4.py +10 -2
  158. sglang/srt/models/llava.py +26 -18
  159. sglang/srt/models/mimo_mtp.py +220 -0
  160. sglang/srt/models/minicpmo.py +7 -17
  161. sglang/srt/models/minicpmv.py +3 -295
  162. sglang/srt/models/mistral.py +71 -1
  163. sglang/srt/models/mllama.py +3 -3
  164. sglang/srt/models/phi4mm.py +512 -0
  165. sglang/srt/models/qwen2.py +133 -35
  166. sglang/srt/models/qwen2_5_vl.py +5 -3
  167. sglang/srt/models/qwen2_eagle.py +4 -1
  168. sglang/srt/models/qwen2_moe.py +206 -69
  169. sglang/srt/models/qwen2_vl.py +3 -3
  170. sglang/srt/models/qwen3.py +92 -19
  171. sglang/srt/models/qwen3_moe.py +457 -55
  172. sglang/srt/models/registry.py +9 -1
  173. sglang/srt/models/siglip.py +294 -0
  174. sglang/srt/models/transformers.py +291 -0
  175. sglang/srt/openai_api/adapter.py +114 -40
  176. sglang/srt/openai_api/protocol.py +37 -2
  177. sglang/srt/openai_api/utils.py +172 -0
  178. sglang/srt/operations.py +189 -0
  179. sglang/srt/operations_strategy.py +207 -0
  180. sglang/srt/sampling/sampling_batch_info.py +13 -1
  181. sglang/srt/sampling/sampling_params.py +2 -1
  182. sglang/srt/server_args.py +235 -38
  183. sglang/srt/speculative/build_eagle_tree.py +8 -8
  184. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +8 -11
  185. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +253 -0
  186. sglang/srt/speculative/eagle_utils.py +181 -90
  187. sglang/srt/speculative/eagle_worker.py +146 -21
  188. sglang/srt/two_batch_overlap.py +635 -0
  189. sglang/srt/utils.py +197 -19
  190. sglang/test/runners.py +16 -7
  191. sglang/test/send_one.py +4 -0
  192. sglang/test/test_cutlass_moe.py +278 -0
  193. sglang/test/test_fp4_moe.py +248 -0
  194. sglang/test/test_utils.py +81 -42
  195. sglang/utils.py +2 -2
  196. sglang/version.py +1 -1
  197. {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/METADATA +31 -19
  198. sglang-0.4.7.dist-info/RECORD +699 -0
  199. {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/WHEEL +1 -1
  200. sglang/srt/function_call_parser.py +0 -858
  201. sglang/srt/platforms/interface.py +0 -371
  202. sglang-0.4.6.post4.dist-info/RECORD +0 -646
  203. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  204. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  205. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  206. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  207. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  208. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  209. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  210. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  211. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  212. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  213. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  214. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=144,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  215. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  216. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1024,device_name=NVIDIA_H200.json → triton_3_1_0/E=16,N=1024,device_name=NVIDIA_H200.json} +0 -0
  217. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  218. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  219. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  220. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  221. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  222. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  223. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  224. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  225. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  226. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  227. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  228. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  229. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  230. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  231. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  232. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  233. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json} +0 -0
  234. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  235. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json → triton_3_1_0/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json} +0 -0
  236. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  237. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=160,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  238. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=20,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  239. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=24,N=1024,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  240. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  241. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  242. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  243. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  244. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  245. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,block_shape=[128, 128].json} +0 -0
  246. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  247. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  248. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  249. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  250. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  251. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  252. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H20,dtype=int8_w8a8,block_shape=[128, 128].json} +0 -0
  253. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  254. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  255. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L20,dtype=int8_w8a8.json} +0 -0
  256. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json → triton_3_1_0/E=256,N=64,device_name=NVIDIA_L40S,dtype=int8_w8a8.json} +0 -0
  257. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  258. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  259. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  260. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  261. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  262. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  263. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=1280,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=1280,device_name=NVIDIA_H200.json} +0 -0
  264. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  265. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  266. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=2560,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=2560,device_name=NVIDIA_H200.json} +0 -0
  267. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  268. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  269. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  270. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=320,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=320,device_name=NVIDIA_H200.json} +0 -0
  271. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_1_0/E=64,N=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  272. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  273. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  274. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
  275. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  276. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  277. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  278. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=64,N=640,device_name=NVIDIA_H200.json → triton_3_1_0/E=64,N=640,device_name=NVIDIA_H200.json} +0 -0
  279. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI300X.json} +0 -0
  280. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Instinct_MI325X.json} +0 -0
  281. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=14336,device_name=AMD_Radeon_Graphics.json} +0 -0
  282. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  283. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  284. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=14336,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=14336,device_name=NVIDIA_H200.json} +0 -0
  285. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI300X.json} +0 -0
  286. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Instinct_MI325X.json} +0 -0
  287. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=1792,device_name=AMD_Radeon_Graphics.json} +0 -0
  288. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  289. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  290. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  291. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  292. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=1792,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=1792,device_name=NVIDIA_H200.json} +0 -0
  293. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  294. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  295. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  296. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  297. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=2048,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=2048,device_name=NVIDIA_H200.json} +0 -0
  298. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI300X.json} +0 -0
  299. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Instinct_MI325X.json} +0 -0
  300. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=3584,device_name=AMD_Radeon_Graphics.json} +0 -0
  301. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json} +0 -0
  302. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  303. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json} +0 -0
  304. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  305. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  306. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  307. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_H200.json} +0 -0
  308. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=3584,device_name=NVIDIA_L40S.json → triton_3_1_0/E=8,N=3584,device_name=NVIDIA_L40S.json} +0 -0
  309. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
  310. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
  311. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
  312. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  313. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  314. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  315. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  316. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=4096,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=4096,device_name=NVIDIA_H200.json} +0 -0
  317. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI300X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI300X.json} +0 -0
  318. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Instinct_MI325X.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Instinct_MI325X.json} +0 -0
  319. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=AMD_Radeon_Graphics.json → triton_3_1_0/E=8,N=7168,device_name=AMD_Radeon_Graphics.json} +0 -0
  320. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json} +0 -0
  321. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  322. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  323. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  324. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=7168,device_name=NVIDIA_H200.json → triton_3_1_0/E=8,N=7168,device_name=NVIDIA_H200.json} +0 -0
  325. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json} +0 -0
  326. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json} +0 -0
  327. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json} +0 -0
  328. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json} +0 -0
  329. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json → triton_3_1_0/E=8,N=8192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json} +0 -0
  330. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  331. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  332. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H20.json} +0 -0
  333. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=192,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=192,device_name=NVIDIA_H200.json} +0 -0
  334. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  335. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  336. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H20.json} +0 -0
  337. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  338. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=384,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=384,device_name=NVIDIA_H200.json} +0 -0
  339. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=512,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  340. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  341. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H100_80GB_HBM3.json} +0 -0
  342. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  343. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H20.json} +0 -0
  344. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  345. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=768,device_name=NVIDIA_H200.json → triton_3_2_0/E=128,N=768,device_name=NVIDIA_H200.json} +0 -0
  346. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=128,N=96,device_name=NVIDIA_H20.json → triton_3_2_0/E=128,N=96,device_name=NVIDIA_H20.json} +0 -0
  347. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=264,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  348. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  349. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  350. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  351. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json} +0 -0
  352. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  353. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_2_0/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +0 -0
  354. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=272,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  355. /sglang/srt/layers/moe/fused_moe_triton/configs/{E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json → triton_3_2_0/E=288,N=64,device_name=NVIDIA_A800-SXM4-80GB.json} +0 -0
  356. /sglang/srt/models/{xiaomi_mimo.py → mimo.py} +0 -0
  357. {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/licenses/LICENSE +0 -0
  358. {sglang-0.4.6.post4.dist-info → sglang-0.4.7.dist-info}/top_level.txt +0 -0
sglang/lang/backend/runtime_endpoint.py CHANGED
@@ -85,6 +85,22 @@ class RuntimeEndpoint(BaseBackend):
         )
         self._assert_success(res)
 
+    def start_profile(self):
+        res = http_request(
+            self.base_url + "/start_profile",
+            api_key=self.api_key,
+            verify=self.verify,
+        )
+        self._assert_success(res)
+
+    def stop_profile(self):
+        res = http_request(
+            self.base_url + "/stop_profile",
+            api_key=self.api_key,
+            verify=self.verify,
+        )
+        self._assert_success(res)
+
     def commit_lazy_operations(self, s: StreamExecutor):
         data = {"text": s.text_, "sampling_params": {"max_new_tokens": 0}}
         self._add_images(s, data)
@@ -374,7 +390,8 @@ class Runtime:
         self.pid = None
         pipe_reader, pipe_writer = multiprocessing.Pipe(duplex=False)
 
-        proc = multiprocessing.Process(
+        ctx = multiprocessing.get_context("spawn")
+        proc = ctx.Process(
             target=launch_server,
             args=(self.server_args, pipe_writer),
         )
@@ -406,6 +423,12 @@ class Runtime:
         kill_process_tree(self.pid)
         self.pid = None
 
+    def start_profile(self):
+        self.endpoint.start_profile()
+
+    def stop_profile(self):
+        self.endpoint.stop_profile()
+
     def cache_prefix(self, prefix: str):
         self.endpoint.cache_prefix(prefix)
 
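For orientation, a minimal usage sketch of the new `Runtime.start_profile()` / `Runtime.stop_profile()` hooks; the model path and the workload are illustrative placeholders, not part of this diff:

```python
# Hypothetical usage of the profiling hooks added above; the model path is an
# example value and any generation workload can go between the two calls.
import sglang as sgl

runtime = sgl.Runtime(model_path="meta-llama/Llama-3.1-8B-Instruct")

runtime.start_profile()   # forwards to POST /start_profile on the server
# ... issue generation requests against the runtime here ...
runtime.stop_profile()    # forwards to POST /stop_profile

runtime.shutdown()
```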
sglang/profiler.py ADDED
@@ -0,0 +1,167 @@
+"""
+Run live profiling.
+
+Usage:
+python3 -m sglang.profiler
+"""
+
+import argparse
+import json
+import os
+import time
+import urllib.parse
+from argparse import ArgumentParser
+from pathlib import Path
+from typing import List, Optional
+
+import requests
+
+PARENT_FOLDER = "/tmp/sglang-profile"
+
+
+def _run_profile(
+    url: Optional[str],
+    num_steps: int,
+    activities: List[str],
+    output_dir: Optional[str] = None,
+    profile_name: Optional[str] = None,
+    profile_by_stage: bool = False,
+) -> str:
+    if output_dir is None:
+        output_dir = PARENT_FOLDER
+
+    output_dir = os.path.normpath(output_dir)
+    output_dir = os.path.abspath(output_dir)
+    output_dir = Path(output_dir)
+
+    # Add "profile_name/timestamp" to the path.
+    if profile_name:
+        output_dir = output_dir / profile_name
+    output_dir = output_dir / str(time.time())
+    output_dir.mkdir(exist_ok=True, parents=True)
+
+    print(f"Dump profiling traces to {output_dir}")
+    print(
+        f"Waiting for {num_steps} steps and the trace to be flushed.... ({profile_by_stage=})"
+    )
+
+    # Dump server args.
+    file_path = Path(output_dir) / "server_args.json"
+    if not file_path.exists():
+        response = requests.get(url + "/get_server_info")
+        response.raise_for_status()
+        server_args_data = response.json()
+        with open(file_path, "w") as file:
+            file.write(json.dumps(server_args_data))
+
+    # Start profiler. The API replies when all steps are processed
+    # and files are generated.
+    json_data = {
+        "output_dir": str(output_dir),
+        "num_steps": str(num_steps),
+        "activities": activities,
+        "profile_by_stage": profile_by_stage,
+    }
+
+    response = requests.post(url=url + "/start_profile", json=json_data)
+    response.raise_for_status()
+
+    trace_link = str(output_dir)
+    return trace_link
+
+
+def run_profile(
+    url: Optional[str],
+    num_steps: int,
+    activities: List[str],
+    output_dir: Optional[str] = None,
+    profile_name: Optional[str] = None,
+    profile_by_stage: bool = False,
+):
+    # step based profile will self terminate on num_steps constraints
+    link = _run_profile(
+        url, num_steps, activities, output_dir, profile_name, profile_by_stage
+    )
+    return link
+
+
+if __name__ == "__main__":
+    parser = ArgumentParser(description="Benchmark the online serving throughput.")
+    parser.add_argument(
+        "--url",
+        type=str,
+        default="http://localhost:30000",
+        help="Server or API base url if not using http host and port.",
+    )
+    parser.add_argument(
+        "--output-dir",
+        type=str,
+        default=None,
+        help="Profile directory to dump profile traces.",
+    )
+    parser.add_argument(
+        "--profile-name",
+        type=str,
+        default=None,
+        help="The name of this profile run.",
+    )
+    parser.add_argument(
+        "--num-steps",
+        type=int,
+        default=5,
+        help="The number of forward steps to profile.",
+    )
+    parser.add_argument(
+        "--profile-by-stage",
+        action=argparse.BooleanOptionalAction,
+        type=bool,
+        default=False,
+        help="The number of forward steps to profile.",
+    )
+    parser.add_argument(
+        "--cpu",
+        action=argparse.BooleanOptionalAction,
+        type=bool,
+        default=True,
+        help="Whether to profile CPU activity",
+    )
+    parser.add_argument(
+        "--gpu",
+        action=argparse.BooleanOptionalAction,
+        type=bool,
+        default=True,
+        help="Whether to profile GPU activity",
+    )
+    parser.add_argument(
+        "--mem",
+        action=argparse.BooleanOptionalAction,
+        type=bool,
+        default=False,
+        help="Whether to memory usage (https://pytorch.org/memory_viz)",
+    )
+    parser.add_argument(
+        "--rpd",
+        action=argparse.BooleanOptionalAction,
+        type=bool,
+        default=False,
+        help="Whether to use rpd profiler (https://github.com/ROCm/rocmProfileData)",
+    )
+
+    args = parser.parse_args()
+    activities = []
+    if args.cpu:
+        activities.append("CPU")
+    if args.gpu:
+        activities.append("GPU")
+    if args.mem:
+        activities.append("MEM")
+    if args.rpd:
+        activities.append("RPD")
+    run_profile(
+        args.url,
+        args.num_steps,
+        activities,
+        args.output_dir,
+        args.profile_name,
+        args.profile_by_stage,
+    )
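As a usage sketch, the new module can be invoked as `python3 -m sglang.profiler` or driven programmatically; the server URL below is the parser default shown above, and the profile name is an arbitrary example:

```python
# Programmatic equivalent of `python3 -m sglang.profiler --num-steps 10`;
# assumes an SGLang server is already listening on localhost:30000.
from sglang.profiler import run_profile

trace_dir = run_profile(
    url="http://localhost:30000",
    num_steps=10,
    activities=["CPU", "GPU"],   # same strings the CLI flags produce
    output_dir=None,             # falls back to /tmp/sglang-profile
    profile_name="example-run",  # arbitrary example name
    profile_by_stage=False,
)
print(f"Traces written under {trace_dir}")
```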
sglang/srt/_custom_ops.py CHANGED
@@ -113,3 +113,37 @@ else:
 
     def get_meta_buffer_ipc_handle(inp: torch.Tensor) -> torch.Tensor:
         return sgl_kernel.allreduce.get_meta_buffer_ipc_handle(inp)
+
+
+    def mscclpp_generate_unique_id() -> bytes:
+        return sgl_kernel.allreduce.mscclpp_generate_unique_id()
+
+
+    def mscclpp_init_context(
+        unique_id: bytes,
+        rank: int,
+        world_size: int,
+        scratch: torch.Tensor,
+        put_buffer: torch.Tensor,
+        nranks_per_node: int,
+        rank_to_node: List[int],
+        rank_to_ib: List[int],
+        context_selection: int,
+    ) -> int:
+        return sgl_kernel.allreduce.mscclpp_init_context(
+            unique_id,
+            rank,
+            world_size,
+            scratch,
+            put_buffer,
+            nranks_per_node,
+            rank_to_node,
+            rank_to_ib,
+            context_selection,
+        )
+
+
+    def mscclpp_allreduce(
+        context: int, inp: torch.Tensor, out: torch.Tensor, nthreads: int, nblocks: int
+    ) -> None:
+        return sgl_kernel.allreduce.mscclpp_allreduce(context, inp, out, nthreads, nblocks)
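A rough, hypothetical sketch of how these MSCCL++ bindings chain together on one rank; the buffer sizes, launch parameters, context selection, and rank-to-node/IB mappings are illustrative assumptions, and in practice the unique id must be broadcast to every rank:

```python
# Illustrative only: shapes, nthreads/nblocks, and context_selection are guesses.
import torch
from sglang.srt import _custom_ops as ops

rank, world_size, nranks_per_node = 0, 8, 8
unique_id = ops.mscclpp_generate_unique_id()   # rank 0 generates, others receive it
scratch = torch.empty(8 << 20, dtype=torch.uint8, device="cuda")
put_buffer = torch.empty(8 << 20, dtype=torch.uint8, device="cuda")

ctx = ops.mscclpp_init_context(
    unique_id,
    rank,
    world_size,
    scratch,
    put_buffer,
    nranks_per_node,
    [r // nranks_per_node for r in range(world_size)],  # rank_to_node
    [r % nranks_per_node for r in range(world_size)],   # rank_to_ib
    0,                                                  # context_selection
)

inp = torch.randn(4096, dtype=torch.float16, device="cuda")
out = torch.empty_like(inp)
ops.mscclpp_allreduce(ctx, inp, out, nthreads=512, nblocks=32)
```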
sglang/srt/configs/internvl.py CHANGED
@@ -7,11 +7,8 @@ import sentencepiece as spm
 from transformers import (
     TOKENIZER_MAPPING,
     LlamaConfig,
-    Phi3Config,
     PretrainedConfig,
     PreTrainedTokenizer,
-    PreTrainedTokenizerFast,
-    Qwen2Config,
 )
 
 from sglang.utils import logger
@@ -302,24 +299,23 @@ class InternVLChatConfig(PretrainedConfig):
         )
 
         if llm_config is None:
-            # TODO: There might still be a bug in transformers version 4.44 and above.
-            llm_config = {"architectures": [""]}
+            llm_config = {"architectures": ["InternLM2ForCausalLM"]}
             logger.info(
                 "llm_config is None. Initializing the LlamaConfig config with default values (`LlamaConfig`)."
             )
+
         self.vision_config = InternVisionConfig(**vision_config)
-        if llm_config["architectures"][0] == "LlamaForCausalLM":
+        if llm_config.get("architectures")[0] == "LlamaForCausalLM":
             self.llm_config = LlamaConfig(**llm_config)
-        elif llm_config["architectures"][0] == "InternLM2ForCausalLM":
+        elif llm_config.get("architectures")[0] == "InternLM2ForCausalLM":
             self.llm_config = InternLM2Config(**llm_config)
-        elif llm_config["architectures"][0] == "Phi3ForCausalLM":
-            self.llm_config = Phi3Config(**llm_config)
-        elif llm_config["architectures"][0] == "Qwen2ForCausalLM":
-            self.llm_config = Qwen2Config(**llm_config)
         else:
             raise ValueError(
-                "Unsupported architecture: {}".format(llm_config["architectures"][0])
+                "Unsupported architecture: {}".format(
+                    llm_config.get("architectures")[0]
+                )
             )
+
         self.use_backbone_lora = use_backbone_lora
         self.use_llm_lora = use_llm_lora
         self.pad2square = pad2square
@@ -16,13 +16,17 @@ import json
16
16
  import logging
17
17
  import math
18
18
  import os
19
- from enum import IntEnum, auto
19
+ from enum import Enum, IntEnum, auto
20
20
  from typing import List, Optional, Set, Union
21
21
 
22
22
  import torch
23
23
  from transformers import PretrainedConfig
24
24
 
25
- from sglang.srt.hf_transformers_utils import get_config, get_context_length
25
+ from sglang.srt.hf_transformers_utils import (
26
+ get_config,
27
+ get_context_length,
28
+ get_hf_text_config,
29
+ )
26
30
  from sglang.srt.layers.quantization import QUANTIZATION_METHODS
27
31
  from sglang.srt.server_args import ServerArgs
28
32
  from sglang.srt.utils import get_bool_env_var, is_hip
@@ -35,6 +39,12 @@ class AttentionArch(IntEnum):
35
39
  MHA = auto()
36
40
 
37
41
 
42
+ class ModelImpl(str, Enum):
43
+ AUTO = "auto"
44
+ SGLANG = "sglang"
45
+ TRANSFORMERS = "transformers"
46
+
47
+
38
48
  class ModelConfig:
39
49
  def __init__(
40
50
  self,
@@ -49,11 +59,13 @@ class ModelConfig:
49
59
  quantization: Optional[str] = None,
50
60
  override_config_file: Optional[str] = None,
51
61
  is_draft_model: bool = False,
62
+ impl: Union[str, ModelImpl] = ModelImpl.AUTO,
52
63
  ) -> None:
53
64
 
54
65
  self.model_path = model_path
55
66
  self.revision = revision
56
67
  self.quantization = quantization
68
+ self.impl = impl
57
69
 
58
70
  # Parse args
59
71
  self.maybe_pull_model_tokenizer_from_remote()
@@ -69,6 +81,7 @@ class ModelConfig:
69
81
  model_override_args=self.model_override_args,
70
82
  **kwargs,
71
83
  )
84
+
72
85
  self.hf_text_config = get_hf_text_config(self.hf_config)
73
86
  self.attention_chunk_size = getattr(
74
87
  self.hf_text_config, "attention_chunk_size", None
@@ -93,6 +106,8 @@ class ModelConfig:
93
106
  ):
94
107
  self.hf_config.architectures[0] = "DeepseekV3ForCausalLMNextN"
95
108
 
109
+ if is_draft_model and self.hf_config.architectures[0] == "MiMoForCausalLM":
110
+ self.hf_config.architectures[0] = "MiMoMTP"
96
111
  # Check model type
97
112
  self.is_generation = is_generation_model(
98
113
  self.hf_config.architectures, is_embedding
@@ -109,6 +124,10 @@ class ModelConfig:
109
124
  self.is_audio_model = enable_multimodal and is_audio_model(
110
125
  self.hf_config.architectures
111
126
  )
127
+ self.is_multimodal_chunked_prefill_supported = (
128
+ enable_multimodal
129
+ and is_multimodal_chunked_prefill_supported(self.hf_config.architectures)
130
+ )
112
131
  self.is_encoder_decoder = is_encoder_decoder_model(self.hf_config.architectures)
113
132
  self.dtype = _get_and_verify_dtype(self.hf_text_config, dtype)
114
133
 
@@ -185,6 +204,22 @@ class ModelConfig:
185
204
  self.v_head_dim = self.hf_text_config.v_head_dim
186
205
  self.qk_nope_head_dim = self.hf_text_config.qk_nope_head_dim
187
206
  else:
207
+ if (
208
+ "MistralModel" in self.hf_config.architectures
209
+ or "MixtralForCausalLM" in self.hf_config.architectures
210
+ or "MistralForCausalLM" in self.hf_config.architectures
211
+ ):
212
+ if getattr(self, "head_dim", None) is None:
213
+ self.head_dim = (
214
+ self.hf_config.hidden_size // self.hf_config.num_attention_heads
215
+ )
216
+ # In transformers==4.52.3, the head_dim is null in MistralConfig
217
+ if (
218
+ not hasattr(self.hf_text_config, "head_dim")
219
+ or self.hf_text_config.head_dim is None
220
+ ):
221
+ setattr(self.hf_text_config, "head_dim", self.head_dim)
222
+
188
223
  self.attention_arch = AttentionArch.MHA
189
224
 
190
225
  self.num_attention_heads = self.hf_text_config.num_attention_heads
@@ -209,7 +244,13 @@ class ModelConfig:
209
244
 
210
245
  # Cache attributes
211
246
  self.hf_eos_token_id = self.get_hf_eos_token_id()
212
- self.image_token_id = getattr(self.hf_config, "image_token_id", None)
247
+
248
+ config = self.hf_config
249
+
250
+ # multimodal
251
+ self.image_token_id = getattr(config, "image_token_id", None) or getattr(
252
+ config, "image_token_index", None
253
+ )
213
254
 
214
255
  @staticmethod
215
256
  def from_server_args(server_args: ServerArgs, model_path: str = None, **kwargs):
@@ -223,6 +264,7 @@ class ModelConfig:
223
264
  enable_multimodal=server_args.enable_multimodal,
224
265
  dtype=server_args.dtype,
225
266
  quantization=server_args.quantization,
267
+ impl=server_args.impl,
226
268
  **kwargs,
227
269
  )
228
270
 
@@ -332,6 +374,7 @@ class ModelConfig:
332
374
  "w8a8_int8",
333
375
  "w8a8_fp8",
334
376
  "moe_wna16",
377
+ "qoq",
335
378
  ]
336
379
  compatible_quantization_methods = {
337
380
  "modelopt_fp4": ["modelopt"],
@@ -423,31 +466,6 @@ class ModelConfig:
423
466
  self.model_path = client.get_local_dir()
424
467
 
425
468
 
426
- def get_hf_text_config(config: PretrainedConfig):
427
- """Get the "sub" config relevant to llm for multi modal models.
428
- No op for pure text models.
429
- """
430
- class_name = config.architectures[0]
431
- if class_name.startswith("Llava") and class_name.endswith("ForCausalLM"):
432
- # We support non-hf version of llava models, so we do not want to
433
- # read the wrong values from the unused default text_config.
434
- # NOTE(HandH1998): We set `torch_dtype` of config to `torch.float16` for the weights, as
435
- # `torch.float16` is default used for image features in `python/sglang/srt/models/llava.py`.
436
- setattr(config, "torch_dtype", torch.float16)
437
- return config
438
-
439
- if hasattr(config, "text_config"):
440
- # The code operates under the assumption that text_config should have
441
- # `num_attention_heads` (among others). Assert here to fail early
442
- # if transformers config doesn't align with this assumption.
443
- assert hasattr(config.text_config, "num_attention_heads")
444
- return config.text_config
445
- if hasattr(config, "language_config"):
446
- return config.language_config
447
- else:
448
- return config
449
-
450
-
451
469
  # adapted from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/config.py
452
470
  _STR_DTYPE_TO_TORCH_DTYPE = {
453
471
  "half": torch.float16,
@@ -466,6 +484,8 @@ def _get_and_verify_dtype(
      # NOTE: getattr(config, "torch_dtype", torch.float32) is not correct
      # because config.torch_dtype can be None.
      config_dtype = getattr(config, "torch_dtype", None)
+     if isinstance(config_dtype, str):
+         config_dtype = _STR_DTYPE_TO_TORCH_DTYPE.get(config_dtype, None)
      if config_dtype is None:
          config_dtype = torch.float32
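A minimal sketch (not part of the diff) of the string-to-dtype fallback added here; the dict below is an abbreviated stand-in for the module's _STR_DTYPE_TO_TORCH_DTYPE table:

    import torch

    # Abbreviated stand-in for _STR_DTYPE_TO_TORCH_DTYPE (the real table has more entries).
    _STR_DTYPE_TO_TORCH_DTYPE = {
        "half": torch.float16,
        "float16": torch.float16,
        "bfloat16": torch.bfloat16,
        "float32": torch.float32,
    }

    config_dtype = "bfloat16"  # e.g. a HF config whose torch_dtype was serialized as a string
    if isinstance(config_dtype, str):
        config_dtype = _STR_DTYPE_TO_TORCH_DTYPE.get(config_dtype, None)
    if config_dtype is None:
        config_dtype = torch.float32

    assert config_dtype is torch.bfloat16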
 
@@ -537,6 +557,7 @@ def is_generation_model(model_architectures: List[str], is_embedding: bool = Fal


  multimodal_model_archs = [
+     "CLIPModel",
      "DeepseekVL2ForCausalLM",
      "Gemma3ForConditionalGeneration",
      "Grok1VForCausalLM",
@@ -549,13 +570,14 @@ multimodal_model_archs = [
      "LlavaVidForCausalLM",
      "MiniCPMO",
      "MiniCPMV",
+     "Mistral3ForConditionalGeneration",
      "MultiModalityCausalLM",
      "MllamaForConditionalGeneration",
      "Qwen2VLForConditionalGeneration",
      "Qwen2_5_VLForConditionalGeneration",
-     "CLIPModel",
      "KimiVLForConditionalGeneration",
      "InternVLChatModel",
+     "Phi4MMForCausalLM",
  ]


@@ -585,6 +607,21 @@ def is_encoder_decoder_model(model_architectures: List[str]):
      return "MllamaForConditionalGeneration" in model_architectures


+ def is_multimodal_chunked_prefill_supported(model_architectures: List[str]):
+     """Check if chunked prefill is supported for a MultiModal model."""
+     unsupported = [
+         "Grok1VForCausalLM",
+         "Grok1AForCausalLM",
+         "LlavaLlamaForCausalLM",
+         "MllamaForConditionalGeneration",
+         "CLIPModel",
+     ]
+     if any(multi_model_arch in unsupported for multi_model_arch in model_architectures):
+         return False
+     else:
+         return True
+
+
  def yarn_get_mscale(scale: float = 1, mscale: float = 1) -> float:
      if scale <= 1:
          return 1.0
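For illustration, a hedged sketch of how the new helper behaves; the body is mirrored from the hunk above rather than imported from the package:

    from typing import List

    def is_multimodal_chunked_prefill_supported(model_architectures: List[str]) -> bool:
        # Mirrored from the hunk above so the example is runnable on its own.
        unsupported = [
            "Grok1VForCausalLM",
            "Grok1AForCausalLM",
            "LlavaLlamaForCausalLM",
            "MllamaForConditionalGeneration",
            "CLIPModel",
        ]
        return not any(arch in unsupported for arch in model_architectures)

    assert is_multimodal_chunked_prefill_supported(["Qwen2VLForConditionalGeneration"])
    assert not is_multimodal_chunked_prefill_supported(["MllamaForConditionalGeneration"])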
@@ -60,7 +60,7 @@ class BaseGrammarObject:
          raise NotImplementedError()

      def copy(self) -> "BaseGrammarObject":
-         raise NotImplementedError()
+         return self

      @property
      def finished(self):
@@ -99,9 +99,12 @@ class BaseGrammarObject:
          raise NotImplementedError()


+ INVALID_GRAMMAR_OBJ = BaseGrammarObject()
+
+
  @dataclass
  class CacheEntry:
-     value: Optional[BaseGrammarObject]
+     value: BaseGrammarObject
      event: Event

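A minimal sketch (not from the package) of the sentinel pattern the remaining hunks converge on: dispatch methods log an error and return a shared module-level object instead of None, so callers check identity rather than handling Optional results:

    import logging

    class BaseGrammarObject:
        """Stand-in for the class defined in base_grammar_backend.py."""

    # Shared sentinel, mirroring INVALID_GRAMMAR_OBJ above.
    INVALID_GRAMMAR_OBJ = BaseGrammarObject()

    def dispatch_regex(key_string: str) -> BaseGrammarObject:
        # Hypothetical compile step standing in for a real grammar compiler.
        try:
            if not key_string:
                raise RuntimeError("empty pattern")
            return BaseGrammarObject()
        except RuntimeError as e:
            logging.error(f"Hit invalid regex: {key_string=}, {e=}")
            return INVALID_GRAMMAR_OBJ  # never None

    grammar = dispatch_regex("")
    if grammar is INVALID_GRAMMAR_OBJ:
        pass  # the caller can reject the request instead of special-casing None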
 
@@ -28,6 +28,7 @@ from llguidance.torch import (
  )

  from sglang.srt.constrained.base_grammar_backend import (
+     INVALID_GRAMMAR_OBJ,
      BaseGrammarBackend,
      BaseGrammarObject,
  )
@@ -126,8 +127,8 @@ class GuidanceBackend(BaseGrammarBackend):
                  serialized_grammar=serialized_grammar,
              )
          except Exception as e:
-             logger.warning(f"Skip invalid grammar: {serialized_grammar}, {e=}")
-             return None
+             logger.error(f"Hit invalid grammar: {serialized_grammar=}, {e=}")
+             return INVALID_GRAMMAR_OBJ

      def dispatch_json(self, key_string: str) -> Optional[GuidanceGrammar]:
          try:
@@ -138,8 +139,8 @@ class GuidanceBackend(BaseGrammarBackend):
                  },
              )
          except Exception as e:
-             logger.warning(f"Skip invalid grammar: {key_string=}, {e=}")
-             return None
+             logger.error(f"Hit invalid json_schema: {key_string=}, {e=}")
+             return INVALID_GRAMMAR_OBJ
          return self._from_serialized(serialized_grammar)

      def dispatch_regex(self, key_string: str) -> Optional[GuidanceGrammar]:
@@ -151,8 +152,8 @@ class GuidanceBackend(BaseGrammarBackend):
              serialized_grammar = grammar_from("ebnf", key_string)
              return self._from_serialized(serialized_grammar)
          except ValueError as e:
-             logger.warning(f"Skip invalid ebnf: regex={key_string}, {e=}")
-             return None
+             logger.error(f"Hit invalid ebnf: {key_string=}, {e=}")
+             return INVALID_GRAMMAR_OBJ

      def dispatch_structural_tag(self, key_string: str) -> Optional[GuidanceGrammar]:
          try:
@@ -169,5 +170,5 @@ class GuidanceBackend(BaseGrammarBackend):
              g = StructTag.to_grammar(tags)
              return self._from_serialized(g)
          except Exception as e:
-             logging.warning(f"Skip invalid structural_tag: {key_string}, {e=}")
-             return None
+             logging.error(f"Hit invalid structural_tag: {key_string=}, {e=}")
+             return INVALID_GRAMMAR_OBJ
@@ -24,6 +24,7 @@ from outlines.models.transformers import TransformerTokenizer
  from pydantic import BaseModel

  from sglang.srt.constrained.base_grammar_backend import (
+     INVALID_GRAMMAR_OBJ,
      BaseGrammarBackend,
      BaseGrammarObject,
  )
@@ -151,8 +152,8 @@ class OutlinesGrammarBackend(BaseGrammarBackend):
              # outlines <= 0.0.46
              guide = RegexGuide(regex, self.outlines_tokenizer)
          except interegular.patterns.InvalidSyntax as e:
-             logger.warning(f"skip invalid regex schema: {regex=}, {e=}")
-             return None
+             logger.error(f"Hit invalid regex schema: {regex=}, {e=}")
+             return INVALID_GRAMMAR_OBJ

          jump_forward_map = None
          return OutlinesGrammar(guide, jump_forward_map)
@@ -170,8 +171,8 @@ class OutlinesGrammarBackend(BaseGrammarBackend):
                  whitespace_pattern=self.whitespace_pattern,
              )
          except (NotImplementedError, json.decoder.JSONDecodeError, ValueError) as e:
-             logger.warning(f"Skip invalid json_schema: {key_string=}, {e=}")
-             return None
+             logger.error(f"Hit invalid json_schema: {key_string=}, {e=}")
+             return INVALID_GRAMMAR_OBJ
          return self._compile_regex(regex)

      def dispatch_regex(self, key_string: str):
@@ -28,6 +28,7 @@ from xgrammar import (
  )

  from sglang.srt.constrained.base_grammar_backend import (
+     INVALID_GRAMMAR_OBJ,
      BaseGrammarBackend,
      BaseGrammarObject,
  )
@@ -152,10 +153,11 @@ class XGrammarGrammarBackend(BaseGrammarBackend):
      ):
          super().__init__()

-         tokenizer_info = TokenizerInfo.from_huggingface(
-             tokenizer, vocab_size=vocab_size
-         )
-         override_stop_tokens = None
+         if True:
+             tokenizer_info = TokenizerInfo.from_huggingface(
+                 tokenizer, vocab_size=vocab_size
+             )
+             override_stop_tokens = None

          self.grammar_compiler = GrammarCompiler(tokenizer_info=tokenizer_info)
          self.vocab_size = vocab_size
@@ -178,25 +180,26 @@ class XGrammarGrammarBackend(BaseGrammarBackend):
                  ctx = self.grammar_compiler.compile_builtin_json_grammar()
              else:
                  ctx = self.grammar_compiler.compile_json_schema(schema=key_string)
-         except RuntimeError as e:
-             logging.warning(f"Skip invalid json_schema: json_schema={key_string}, {e=}")
-             return None
+
+         except (RuntimeError, json.decoder.JSONDecodeError) as e:
+             logging.error(f"Hit invalid json_schema: {key_string=}, {e=}")
+             return INVALID_GRAMMAR_OBJ
          return self._from_context(ctx, key_string)

      def dispatch_ebnf(self, key_string: str) -> Optional[XGrammarGrammar]:
          try:
              ctx = self.grammar_compiler.compile_grammar(key_string)
          except RuntimeError as e:
-             logging.warning(f"Skip invalid ebnf: ebnf={key_string}, {e=}")
-             return None
+             logging.error(f"Hit invalid ebnf: {key_string=}, {e=}")
+             return INVALID_GRAMMAR_OBJ
          return self._from_context(ctx, key_string)

      def dispatch_regex(self, key_string: str) -> Optional[XGrammarGrammar]:
          try:
              ctx = self.grammar_compiler.compile_regex(key_string)
          except RuntimeError as e:
-             logging.warning(f"Skip invalid regex: regex={key_string}, {e=}")
-             return None
+             logging.error(f"Hit invalid regex: {key_string=}, {e=}")
+             return INVALID_GRAMMAR_OBJ
          return self._from_context(ctx, key_string)

      def dispatch_structural_tag(self, key_string: str) -> Optional[XGrammarGrammar]:
@@ -213,13 +216,10 @@ class XGrammarGrammarBackend(BaseGrammarBackend):
              ctx = self.grammar_compiler.compile_structural_tag(
                  tags, structural_tag["triggers"]
              )
-         except RuntimeError as e:
-             logging.warning(
-                 f"Skip invalid structural_tag: structural_tag={key_string}, {e=}"
-             )
-             return None
+         except (RuntimeError, json.decoder.JSONDecodeError) as e:
+             logging.error(f"Hit invalid structural_tag: {key_string=}, {e=}")
+             return INVALID_GRAMMAR_OBJ
          return self._from_context(ctx, key_string)

      def reset(self):
-         if self.grammar_compiler:
-             self.grammar_compiler.clear_cache()
+         self.grammar_compiler.clear_cache()