sglang 0.5.3rc2__py3-none-any.whl → 0.5.4.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (419)
  1. sglang/bench_one_batch.py +47 -28
  2. sglang/bench_one_batch_server.py +41 -25
  3. sglang/bench_serving.py +378 -160
  4. sglang/check_env.py +1 -1
  5. sglang/compile_deep_gemm.py +6 -2
  6. sglang/global_config.py +1 -25
  7. sglang/lang/api.py +6 -0
  8. sglang/lang/interpreter.py +1 -0
  9. sglang/lang/ir.py +13 -0
  10. sglang/launch_server.py +10 -15
  11. sglang/profiler.py +18 -1
  12. sglang/srt/_custom_ops.py +1 -1
  13. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +105 -10
  14. sglang/srt/checkpoint_engine/checkpoint_engine_worker.py +142 -0
  15. sglang/srt/compilation/backend.py +437 -0
  16. sglang/srt/compilation/compilation_config.py +20 -0
  17. sglang/srt/compilation/compilation_counter.py +47 -0
  18. sglang/srt/compilation/compile.py +210 -0
  19. sglang/srt/compilation/compiler_interface.py +503 -0
  20. sglang/srt/compilation/cuda_piecewise_backend.py +228 -0
  21. sglang/srt/compilation/fix_functionalization.py +134 -0
  22. sglang/srt/compilation/fx_utils.py +83 -0
  23. sglang/srt/compilation/inductor_pass.py +140 -0
  24. sglang/srt/compilation/pass_manager.py +66 -0
  25. sglang/srt/compilation/piecewise_context_manager.py +40 -0
  26. sglang/srt/compilation/weak_ref_tensor_jit.py +16 -0
  27. sglang/srt/configs/__init__.py +4 -0
  28. sglang/srt/configs/deepseek_ocr.py +262 -0
  29. sglang/srt/configs/deepseekvl2.py +194 -96
  30. sglang/srt/configs/dots_vlm.py +2 -7
  31. sglang/srt/configs/falcon_h1.py +13 -64
  32. sglang/srt/configs/load_config.py +25 -2
  33. sglang/srt/configs/mamba_utils.py +117 -0
  34. sglang/srt/configs/model_config.py +136 -25
  35. sglang/srt/configs/modelopt_config.py +30 -0
  36. sglang/srt/configs/nemotron_h.py +286 -0
  37. sglang/srt/configs/olmo3.py +105 -0
  38. sglang/srt/configs/points_v15_chat.py +29 -0
  39. sglang/srt/configs/qwen3_next.py +11 -47
  40. sglang/srt/configs/qwen3_omni.py +613 -0
  41. sglang/srt/configs/qwen3_vl.py +0 -10
  42. sglang/srt/connector/remote_instance.py +1 -1
  43. sglang/srt/constrained/base_grammar_backend.py +5 -1
  44. sglang/srt/constrained/llguidance_backend.py +5 -0
  45. sglang/srt/constrained/outlines_backend.py +1 -1
  46. sglang/srt/constrained/reasoner_grammar_backend.py +9 -6
  47. sglang/srt/constrained/utils.py +12 -0
  48. sglang/srt/constrained/xgrammar_backend.py +20 -11
  49. sglang/srt/disaggregation/ascend/transfer_engine.py +1 -1
  50. sglang/srt/disaggregation/base/conn.py +17 -4
  51. sglang/srt/disaggregation/common/conn.py +4 -2
  52. sglang/srt/disaggregation/decode.py +123 -31
  53. sglang/srt/disaggregation/decode_kvcache_offload_manager.py +1 -1
  54. sglang/srt/disaggregation/fake/conn.py +11 -3
  55. sglang/srt/disaggregation/mooncake/conn.py +157 -19
  56. sglang/srt/disaggregation/nixl/conn.py +69 -24
  57. sglang/srt/disaggregation/prefill.py +96 -270
  58. sglang/srt/distributed/device_communicators/all_reduce_utils.py +4 -4
  59. sglang/srt/distributed/device_communicators/custom_all_reduce.py +6 -6
  60. sglang/srt/distributed/device_communicators/pymscclpp.py +2 -2
  61. sglang/srt/distributed/device_communicators/pynccl.py +24 -12
  62. sglang/srt/distributed/device_communicators/pynccl_allocator.py +2 -2
  63. sglang/srt/distributed/device_communicators/symm_mem.py +1 -1
  64. sglang/srt/distributed/naive_distributed.py +5 -4
  65. sglang/srt/distributed/parallel_state.py +63 -19
  66. sglang/srt/elastic_ep/elastic_ep.py +74 -0
  67. sglang/srt/entrypoints/context.py +3 -2
  68. sglang/srt/entrypoints/engine.py +83 -80
  69. sglang/srt/entrypoints/grpc_server.py +430 -234
  70. sglang/srt/entrypoints/harmony_utils.py +2 -2
  71. sglang/srt/entrypoints/http_server.py +195 -102
  72. sglang/srt/entrypoints/http_server_engine.py +1 -7
  73. sglang/srt/entrypoints/openai/protocol.py +225 -37
  74. sglang/srt/entrypoints/openai/serving_base.py +49 -2
  75. sglang/srt/entrypoints/openai/serving_chat.py +29 -74
  76. sglang/srt/entrypoints/openai/serving_classify.py +204 -0
  77. sglang/srt/entrypoints/openai/serving_completions.py +15 -1
  78. sglang/srt/entrypoints/openai/serving_responses.py +5 -2
  79. sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
  80. sglang/srt/environ.py +58 -6
  81. sglang/srt/eplb/eplb_algorithms/__init__.py +18 -1
  82. sglang/srt/eplb/eplb_algorithms/deepseek.py +0 -2
  83. sglang/srt/eplb/eplb_algorithms/elasticity_aware.py +87 -0
  84. sglang/srt/eplb/expert_distribution.py +33 -4
  85. sglang/srt/eplb/expert_location_dispatch.py +2 -2
  86. sglang/srt/eplb/expert_location_updater.py +2 -2
  87. sglang/srt/function_call/base_format_detector.py +17 -18
  88. sglang/srt/function_call/function_call_parser.py +20 -14
  89. sglang/srt/function_call/glm4_moe_detector.py +1 -5
  90. sglang/srt/function_call/gpt_oss_detector.py +1 -1
  91. sglang/srt/function_call/json_array_parser.py +0 -2
  92. sglang/srt/function_call/minimax_m2.py +367 -0
  93. sglang/srt/function_call/utils.py +2 -2
  94. sglang/srt/grpc/compile_proto.py +3 -3
  95. sglang/srt/{entrypoints → grpc}/grpc_request_manager.py +112 -52
  96. sglang/srt/grpc/health_servicer.py +189 -0
  97. sglang/srt/grpc/scheduler_launcher.py +181 -0
  98. sglang/srt/grpc/sglang_scheduler_pb2.py +78 -70
  99. sglang/srt/grpc/sglang_scheduler_pb2.pyi +66 -10
  100. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +89 -1
  101. sglang/srt/layers/activation.py +10 -1
  102. sglang/srt/layers/attention/aiter_backend.py +3 -3
  103. sglang/srt/layers/attention/ascend_backend.py +17 -1
  104. sglang/srt/layers/attention/attention_registry.py +43 -23
  105. sglang/srt/layers/attention/base_attn_backend.py +20 -1
  106. sglang/srt/layers/attention/double_sparsity_backend.py +2 -2
  107. sglang/srt/layers/attention/fla/chunk.py +0 -1
  108. sglang/srt/layers/attention/fla/chunk_o.py +1 -1
  109. sglang/srt/layers/attention/fla/index.py +0 -2
  110. sglang/srt/layers/attention/fla/layernorm_gated.py +50 -32
  111. sglang/srt/layers/attention/fla/utils.py +0 -3
  112. sglang/srt/layers/attention/fla/wy_fast.py +0 -2
  113. sglang/srt/layers/attention/flashattention_backend.py +24 -10
  114. sglang/srt/layers/attention/flashinfer_backend.py +258 -22
  115. sglang/srt/layers/attention/flashinfer_mla_backend.py +38 -28
  116. sglang/srt/layers/attention/flashmla_backend.py +2 -2
  117. sglang/srt/layers/attention/hybrid_attn_backend.py +1 -1
  118. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +165 -62
  119. sglang/srt/layers/attention/intel_amx_backend.py +1 -1
  120. sglang/srt/layers/attention/mamba/causal_conv1d.py +1 -1
  121. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +9 -5
  122. sglang/srt/layers/attention/mamba/mamba.py +189 -241
  123. sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
  124. sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
  125. sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +0 -50
  126. sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +0 -60
  127. sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +0 -111
  128. sglang/srt/layers/attention/mamba/ops/ssd_combined.py +0 -1
  129. sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +0 -11
  130. sglang/srt/layers/attention/npu_ops/mla_preprocess.py +1 -1
  131. sglang/srt/layers/attention/nsa/nsa_indexer.py +40 -83
  132. sglang/srt/layers/attention/nsa/triton_kernel.py +136 -0
  133. sglang/srt/layers/attention/nsa/utils.py +0 -1
  134. sglang/srt/layers/attention/nsa_backend.py +404 -90
  135. sglang/srt/layers/attention/triton_backend.py +208 -34
  136. sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +2 -2
  137. sglang/srt/layers/attention/triton_ops/extend_attention.py +539 -44
  138. sglang/srt/layers/attention/trtllm_mha_backend.py +2 -2
  139. sglang/srt/layers/attention/trtllm_mla_backend.py +362 -43
  140. sglang/srt/layers/attention/utils.py +89 -7
  141. sglang/srt/layers/attention/vision.py +3 -3
  142. sglang/srt/layers/attention/xpu_backend.py +1028 -0
  143. sglang/srt/layers/communicator.py +12 -7
  144. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/compile_utils.py +5 -9
  145. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/configurer.py +4 -3
  146. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/entrypoint.py +3 -3
  147. sglang/srt/layers/dp_attention.py +17 -0
  148. sglang/srt/layers/layernorm.py +64 -19
  149. sglang/srt/layers/linear.py +9 -1
  150. sglang/srt/layers/logits_processor.py +152 -17
  151. sglang/srt/layers/modelopt_utils.py +11 -0
  152. sglang/srt/layers/moe/cutlass_moe.py +0 -2
  153. sglang/srt/layers/moe/cutlass_w4a8_moe.py +351 -21
  154. sglang/srt/layers/moe/ep_moe/kernels.py +229 -457
  155. sglang/srt/layers/moe/ep_moe/layer.py +154 -625
  156. sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +1 -1
  157. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  158. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_B200.json +146 -0
  159. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +11 -3
  160. sglang/srt/layers/moe/fused_moe_triton/layer.py +79 -73
  161. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +25 -46
  162. sglang/srt/layers/moe/moe_runner/deep_gemm.py +569 -0
  163. sglang/srt/layers/moe/moe_runner/runner.py +6 -0
  164. sglang/srt/layers/moe/moe_runner/triton.py +3 -1
  165. sglang/srt/layers/moe/moe_runner/triton_kernels.py +194 -0
  166. sglang/srt/layers/moe/rocm_moe_utils.py +0 -1
  167. sglang/srt/layers/moe/router.py +51 -15
  168. sglang/srt/layers/moe/token_dispatcher/__init__.py +14 -4
  169. sglang/srt/layers/moe/token_dispatcher/base.py +12 -6
  170. sglang/srt/layers/moe/token_dispatcher/deepep.py +127 -110
  171. sglang/srt/layers/moe/token_dispatcher/mooncake.py +386 -0
  172. sglang/srt/layers/moe/token_dispatcher/standard.py +46 -0
  173. sglang/srt/layers/moe/topk.py +7 -6
  174. sglang/srt/layers/moe/utils.py +20 -5
  175. sglang/srt/layers/quantization/__init__.py +5 -58
  176. sglang/srt/layers/quantization/awq.py +183 -9
  177. sglang/srt/layers/quantization/awq_triton.py +29 -0
  178. sglang/srt/layers/quantization/base_config.py +27 -1
  179. sglang/srt/layers/quantization/compressed_tensors/__init__.py +7 -0
  180. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +20 -49
  181. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +421 -70
  182. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +3 -0
  183. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +4 -22
  184. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +339 -0
  185. sglang/srt/layers/quantization/fp8.py +152 -81
  186. sglang/srt/layers/quantization/fp8_kernel.py +55 -10
  187. sglang/srt/layers/quantization/fp8_utils.py +42 -14
  188. sglang/srt/layers/quantization/fpgemm_fp8.py +2 -3
  189. sglang/srt/layers/quantization/gguf.py +566 -0
  190. sglang/srt/layers/quantization/gptq.py +0 -1
  191. sglang/srt/layers/quantization/int8_kernel.py +18 -2
  192. sglang/srt/layers/quantization/marlin_utils.py +12 -0
  193. sglang/srt/layers/quantization/modelopt_quant.py +125 -100
  194. sglang/srt/layers/quantization/mxfp4.py +35 -68
  195. sglang/srt/layers/quantization/petit.py +1 -1
  196. sglang/srt/layers/quantization/quark/quark.py +3 -1
  197. sglang/srt/layers/quantization/quark/quark_moe.py +3 -3
  198. sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +0 -7
  199. sglang/srt/layers/quantization/unquant.py +23 -48
  200. sglang/srt/layers/quantization/utils.py +0 -1
  201. sglang/srt/layers/quantization/w4afp8.py +87 -20
  202. sglang/srt/layers/quantization/w8a8_int8.py +30 -24
  203. sglang/srt/layers/radix_attention.py +62 -9
  204. sglang/srt/layers/rotary_embedding.py +686 -17
  205. sglang/srt/layers/sampler.py +47 -16
  206. sglang/srt/layers/sparse_pooler.py +98 -0
  207. sglang/srt/layers/utils.py +0 -1
  208. sglang/srt/layers/vocab_parallel_embedding.py +4 -1
  209. sglang/srt/lora/backend/triton_backend.py +0 -1
  210. sglang/srt/lora/eviction_policy.py +139 -0
  211. sglang/srt/lora/lora_manager.py +24 -9
  212. sglang/srt/lora/lora_registry.py +1 -1
  213. sglang/srt/lora/mem_pool.py +40 -16
  214. sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +1 -1
  215. sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +4 -2
  216. sglang/srt/managers/cache_controller.py +48 -17
  217. sglang/srt/managers/data_parallel_controller.py +146 -42
  218. sglang/srt/managers/detokenizer_manager.py +40 -13
  219. sglang/srt/managers/io_struct.py +69 -16
  220. sglang/srt/managers/mm_utils.py +20 -18
  221. sglang/srt/managers/multi_tokenizer_mixin.py +83 -82
  222. sglang/srt/managers/overlap_utils.py +96 -19
  223. sglang/srt/managers/schedule_batch.py +241 -511
  224. sglang/srt/managers/schedule_policy.py +15 -2
  225. sglang/srt/managers/scheduler.py +420 -514
  226. sglang/srt/managers/scheduler_metrics_mixin.py +73 -18
  227. sglang/srt/managers/scheduler_output_processor_mixin.py +317 -111
  228. sglang/srt/managers/scheduler_pp_mixin.py +341 -0
  229. sglang/srt/managers/scheduler_profiler_mixin.py +60 -14
  230. sglang/srt/managers/scheduler_runtime_checker_mixin.py +217 -0
  231. sglang/srt/managers/scheduler_update_weights_mixin.py +33 -14
  232. sglang/srt/managers/tokenizer_communicator_mixin.py +71 -55
  233. sglang/srt/managers/tokenizer_manager.py +375 -95
  234. sglang/srt/managers/tp_worker.py +212 -161
  235. sglang/srt/managers/utils.py +78 -2
  236. sglang/srt/mem_cache/allocator.py +7 -2
  237. sglang/srt/mem_cache/allocator_ascend.py +2 -2
  238. sglang/srt/mem_cache/base_prefix_cache.py +2 -2
  239. sglang/srt/mem_cache/chunk_cache.py +13 -2
  240. sglang/srt/mem_cache/common.py +480 -0
  241. sglang/srt/mem_cache/evict_policy.py +16 -1
  242. sglang/srt/mem_cache/hicache_storage.py +11 -2
  243. sglang/srt/mem_cache/hiradix_cache.py +16 -3
  244. sglang/srt/mem_cache/mamba_radix_cache.py +993 -0
  245. sglang/srt/mem_cache/memory_pool.py +517 -219
  246. sglang/srt/mem_cache/memory_pool_host.py +0 -1
  247. sglang/srt/mem_cache/multimodal_cache.py +0 -1
  248. sglang/srt/mem_cache/radix_cache.py +53 -19
  249. sglang/srt/mem_cache/radix_cache_cpp.py +19 -14
  250. sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +8 -2
  251. sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +1 -13
  252. sglang/srt/mem_cache/storage/backend_factory.py +2 -2
  253. sglang/srt/mem_cache/storage/eic/eic_storage.py +5 -6
  254. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +0 -1
  255. sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +3 -2
  256. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +9 -3
  257. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +5 -3
  258. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +101 -17
  259. sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +38 -9
  260. sglang/srt/mem_cache/storage/nixl/nixl_utils.py +1 -1
  261. sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +17 -2
  262. sglang/srt/mem_cache/swa_radix_cache.py +92 -26
  263. sglang/srt/metrics/collector.py +31 -0
  264. sglang/srt/metrics/func_timer.py +1 -1
  265. sglang/srt/model_executor/cuda_graph_runner.py +43 -5
  266. sglang/srt/model_executor/forward_batch_info.py +71 -25
  267. sglang/srt/model_executor/model_runner.py +362 -270
  268. sglang/srt/model_executor/npu_graph_runner.py +2 -3
  269. sglang/srt/model_executor/piecewise_cuda_graph_runner.py +549 -0
  270. sglang/srt/model_loader/__init__.py +1 -1
  271. sglang/srt/model_loader/loader.py +424 -27
  272. sglang/srt/model_loader/utils.py +0 -1
  273. sglang/srt/model_loader/weight_utils.py +47 -28
  274. sglang/srt/models/apertus.py +2 -3
  275. sglang/srt/models/arcee.py +2 -2
  276. sglang/srt/models/bailing_moe.py +13 -52
  277. sglang/srt/models/bailing_moe_nextn.py +3 -4
  278. sglang/srt/models/bert.py +1 -1
  279. sglang/srt/models/deepseek_nextn.py +19 -3
  280. sglang/srt/models/deepseek_ocr.py +1516 -0
  281. sglang/srt/models/deepseek_v2.py +418 -140
  282. sglang/srt/models/dots_ocr.py +0 -2
  283. sglang/srt/models/dots_vlm.py +0 -1
  284. sglang/srt/models/dots_vlm_vit.py +1 -1
  285. sglang/srt/models/falcon_h1.py +13 -19
  286. sglang/srt/models/gemma3_mm.py +16 -0
  287. sglang/srt/models/gemma3n_mm.py +1 -2
  288. sglang/srt/models/glm4_moe.py +327 -382
  289. sglang/srt/models/glm4_moe_nextn.py +6 -16
  290. sglang/srt/models/glm4v.py +2 -1
  291. sglang/srt/models/glm4v_moe.py +32 -199
  292. sglang/srt/models/gpt_oss.py +5 -5
  293. sglang/srt/models/grok.py +10 -23
  294. sglang/srt/models/hunyuan.py +2 -7
  295. sglang/srt/models/interns1.py +0 -1
  296. sglang/srt/models/kimi_vl.py +1 -7
  297. sglang/srt/models/kimi_vl_moonvit.py +3 -1
  298. sglang/srt/models/llama.py +2 -2
  299. sglang/srt/models/llama_eagle3.py +1 -1
  300. sglang/srt/models/longcat_flash.py +5 -22
  301. sglang/srt/models/longcat_flash_nextn.py +3 -14
  302. sglang/srt/models/mimo.py +2 -13
  303. sglang/srt/models/mimo_mtp.py +1 -2
  304. sglang/srt/models/minicpmo.py +7 -5
  305. sglang/srt/models/minimax_m2.py +922 -0
  306. sglang/srt/models/mixtral.py +1 -4
  307. sglang/srt/models/mllama.py +1 -1
  308. sglang/srt/models/mllama4.py +13 -3
  309. sglang/srt/models/nemotron_h.py +511 -0
  310. sglang/srt/models/nvila.py +355 -0
  311. sglang/srt/models/nvila_lite.py +184 -0
  312. sglang/srt/models/olmo2.py +31 -4
  313. sglang/srt/models/opt.py +5 -5
  314. sglang/srt/models/phi.py +1 -1
  315. sglang/srt/models/phi4mm.py +1 -1
  316. sglang/srt/models/phimoe.py +0 -1
  317. sglang/srt/models/pixtral.py +0 -3
  318. sglang/srt/models/points_v15_chat.py +186 -0
  319. sglang/srt/models/qwen.py +0 -1
  320. sglang/srt/models/qwen2.py +22 -1
  321. sglang/srt/models/qwen2_5_vl.py +3 -3
  322. sglang/srt/models/qwen2_audio.py +2 -15
  323. sglang/srt/models/qwen2_moe.py +15 -12
  324. sglang/srt/models/qwen2_vl.py +5 -2
  325. sglang/srt/models/qwen3.py +34 -4
  326. sglang/srt/models/qwen3_moe.py +19 -37
  327. sglang/srt/models/qwen3_next.py +7 -12
  328. sglang/srt/models/qwen3_next_mtp.py +3 -4
  329. sglang/srt/models/qwen3_omni_moe.py +661 -0
  330. sglang/srt/models/qwen3_vl.py +37 -33
  331. sglang/srt/models/qwen3_vl_moe.py +57 -185
  332. sglang/srt/models/roberta.py +55 -3
  333. sglang/srt/models/sarashina2_vision.py +0 -1
  334. sglang/srt/models/step3_vl.py +3 -5
  335. sglang/srt/models/utils.py +11 -1
  336. sglang/srt/multimodal/processors/base_processor.py +7 -2
  337. sglang/srt/multimodal/processors/deepseek_ocr.py +37 -0
  338. sglang/srt/multimodal/processors/deepseek_vl_v2.py +0 -3
  339. sglang/srt/multimodal/processors/dots_vlm.py +0 -1
  340. sglang/srt/multimodal/processors/glm4v.py +2 -6
  341. sglang/srt/multimodal/processors/internvl.py +0 -2
  342. sglang/srt/multimodal/processors/janus_pro.py +0 -1
  343. sglang/srt/multimodal/processors/mllama4.py +0 -8
  344. sglang/srt/multimodal/processors/{vila.py → nvila.py} +32 -24
  345. sglang/srt/multimodal/processors/phi4mm.py +0 -1
  346. sglang/srt/multimodal/processors/points_v15_chat.py +52 -0
  347. sglang/srt/multimodal/processors/qwen_vl.py +75 -16
  348. sglang/srt/multimodal/processors/step3_vl.py +1 -1
  349. sglang/srt/parser/conversation.py +41 -0
  350. sglang/srt/parser/reasoning_parser.py +28 -2
  351. sglang/srt/sampling/custom_logit_processor.py +77 -2
  352. sglang/srt/sampling/sampling_batch_info.py +17 -22
  353. sglang/srt/sampling/sampling_params.py +70 -2
  354. sglang/srt/server_args.py +846 -163
  355. sglang/srt/server_args_config_parser.py +1 -1
  356. sglang/srt/single_batch_overlap.py +36 -31
  357. sglang/srt/speculative/base_spec_worker.py +34 -0
  358. sglang/srt/speculative/draft_utils.py +226 -0
  359. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +24 -7
  360. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +23 -2
  361. sglang/srt/speculative/eagle_info.py +57 -18
  362. sglang/srt/speculative/eagle_info_v2.py +458 -0
  363. sglang/srt/speculative/eagle_utils.py +138 -0
  364. sglang/srt/speculative/eagle_worker.py +83 -280
  365. sglang/srt/speculative/eagle_worker_v2.py +702 -0
  366. sglang/srt/speculative/{ngram_utils.py → ngram_info.py} +14 -9
  367. sglang/srt/speculative/ngram_worker.py +12 -11
  368. sglang/srt/speculative/spec_info.py +2 -0
  369. sglang/srt/speculative/spec_utils.py +38 -3
  370. sglang/srt/speculative/standalone_worker.py +4 -14
  371. sglang/srt/tokenizer/tiktoken_tokenizer.py +2 -2
  372. sglang/srt/two_batch_overlap.py +28 -14
  373. sglang/srt/utils/__init__.py +1 -1
  374. sglang/srt/{bench_utils.py → utils/bench_utils.py} +4 -2
  375. sglang/srt/utils/common.py +272 -82
  376. sglang/srt/utils/hf_transformers_utils.py +44 -17
  377. sglang/srt/{host_shared_memory.py → utils/host_shared_memory.py} +0 -1
  378. sglang/srt/{offloader.py → utils/offloader.py} +4 -4
  379. sglang/srt/utils/profile_merger.py +199 -0
  380. sglang/test/attention/test_flashattn_backend.py +1 -1
  381. sglang/test/attention/test_flashattn_mla_backend.py +0 -1
  382. sglang/test/attention/test_prefix_chunk_info.py +0 -2
  383. sglang/test/attention/test_trtllm_mla_backend.py +221 -53
  384. sglang/test/few_shot_gsm8k_engine.py +2 -4
  385. sglang/test/kit_matched_stop.py +157 -0
  386. sglang/test/longbench_v2/__init__.py +1 -0
  387. sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
  388. sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
  389. sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
  390. sglang/test/run_eval.py +41 -0
  391. sglang/test/runners.py +2 -0
  392. sglang/test/send_one.py +42 -7
  393. sglang/test/simple_eval_common.py +3 -0
  394. sglang/test/simple_eval_gpqa.py +0 -1
  395. sglang/test/simple_eval_humaneval.py +0 -3
  396. sglang/test/simple_eval_longbench_v2.py +344 -0
  397. sglang/test/test_block_fp8.py +1 -2
  398. sglang/test/test_block_fp8_deep_gemm_blackwell.py +0 -1
  399. sglang/test/test_cutlass_moe.py +1 -2
  400. sglang/test/test_cutlass_w4a8_moe.py +10 -20
  401. sglang/test/test_deterministic.py +463 -107
  402. sglang/test/test_deterministic_utils.py +74 -0
  403. sglang/test/test_disaggregation_utils.py +81 -0
  404. sglang/test/test_marlin_moe.py +0 -1
  405. sglang/test/test_utils.py +85 -20
  406. sglang/version.py +1 -1
  407. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/METADATA +48 -35
  408. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/RECORD +414 -350
  409. sglang/srt/layers/attention/mamba/mamba_utils.py +0 -81
  410. sglang/srt/managers/tp_worker_overlap_thread.py +0 -311
  411. sglang/srt/models/vila.py +0 -306
  412. sglang/srt/speculative/build_eagle_tree.py +0 -427
  413. sglang/test/test_block_fp8_ep.py +0 -358
  414. /sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/__init__.py +0 -0
  415. /sglang/srt/{aio_rwlock.py → utils/aio_rwlock.py} +0 -0
  416. /sglang/srt/{torch_memory_saver_adapter.py → utils/torch_memory_saver_adapter.py} +0 -0
  417. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/WHEEL +0 -0
  418. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/licenses/LICENSE +0 -0
  419. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/top_level.txt +0 -0
@@ -27,7 +27,11 @@ if _is_cuda:
 
 def enable_fused_set_kv_buffer(forward_batch: ForwardBatch):
     """Enable fused set_kv_buffer only on CUDA with bfloat16 KV cache."""
-    return _is_cuda and forward_batch.token_to_kv_pool.dtype == torch.bfloat16
+    return (
+        _is_cuda
+        and hasattr(forward_batch.token_to_kv_pool, "dtype")
+        and forward_batch.token_to_kv_pool.dtype == torch.bfloat16
+    )
 
 
 def create_fused_set_kv_buffer_arg(
@@ -49,3 +53,9 @@ def create_fused_set_kv_buffer_arg(
         v_scale=layer.v_scale,
         cache_loc=forward_batch.out_cache_loc,
     )
+
+
+def permute_inv(perm: torch.Tensor) -> torch.Tensor:
+    inv_perm = torch.empty_like(perm)
+    inv_perm[perm] = torch.arange(perm.numel(), device=perm.device, dtype=perm.dtype)
+    return inv_perm
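The new `permute_inv` helper builds the inverse of an index permutation by scattering each position to where the permutation sent it. A minimal standalone check (the helper body is copied from the hunk above; the test tensors are illustrative):

import torch

def permute_inv(perm: torch.Tensor) -> torch.Tensor:
    # Scatter positions so that inv_perm[perm[i]] == i.
    inv_perm = torch.empty_like(perm)
    inv_perm[perm] = torch.arange(perm.numel(), device=perm.device, dtype=perm.dtype)
    return inv_perm

perm = torch.randperm(6)
x = torch.arange(6)
# Indexing with perm shuffles; indexing again with its inverse restores the order.
assert torch.equal(x[perm][permute_inv(perm)], x)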
@@ -155,7 +155,6 @@ class BaseMultimodalProcessor(ABC):
     ):
         self.hf_config = hf_config
         self._processor = _processor
-        self.arch = hf_config.architectures[0]
         self.server_args = server_args
         self.transport_mode = transport_mode
 
@@ -179,18 +178,21 @@ class BaseMultimodalProcessor(ABC):
         "image_attention_mask": Modality.IMAGE,
         "image_emb_mask": Modality.IMAGE,
         "images_spatial_crop": Modality.IMAGE,
+        "images_crop": Modality.IMAGE,
         "tgt_size": Modality.IMAGE,
         "image_grid_hws": Modality.IMAGE,
         "aspect_ratio_ids": Modality.IMAGE,
         "aspect_ratio_mask": Modality.IMAGE,
         "num_patches": Modality.IMAGE,
         "patch_pixel_values": Modality.IMAGE,
+        "block_sizes": Modality.IMAGE,
         # Audio-related attributes
         "audio_features": Modality.AUDIO,
         "audio_feature_lens": Modality.AUDIO,
         "input_features": Modality.AUDIO,
         "input_features_mask": Modality.AUDIO,
         "audio_attention_mask": Modality.AUDIO,
+        "feature_attention_mask": Modality.AUDIO,
         # Video-related attributes
         "pixel_values_videos": Modality.VIDEO,
         "second_per_grid_ts": Modality.VIDEO,
@@ -222,6 +224,7 @@ class BaseMultimodalProcessor(ABC):
         if self._processor.__class__.__name__ in {
             "Gemma3nProcessor",
             "Qwen2AudioProcessor",
+            "Qwen3OmniMoeProcessor",
         }:
             # Note(Xinyuan): for gemma3n, ref: https://github.com/huggingface/transformers/blob/ccf2ca162e33f381e454cdb74bf4b41a51ab976d/src/transformers/models/gemma3n/processing_gemma3n.py#L107
             kwargs["audio"] = audios
@@ -312,7 +315,9 @@ class BaseMultimodalProcessor(ABC):
         try:
             if modality == Modality.IMAGE:
                 img, _ = load_image(data)
-                return img.convert("RGB") if discard_alpha_channel else img
+                if discard_alpha_channel and img.mode != "RGB":
+                    img = img.convert("RGB")
+                return img
             elif modality == Modality.VIDEO:
                 return load_video(data, frame_count_limit)
             elif modality == Modality.AUDIO:
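`Image.convert("RGB")` always allocates a new image, so the rewritten branch above converts only when the source is not already in RGB mode instead of converting unconditionally. A small sketch of the same guard in isolation (the sample image is illustrative):

from PIL import Image

img = Image.new("RGBA", (4, 4))  # illustrative input that still has an alpha channel
discard_alpha_channel = True
if discard_alpha_channel and img.mode != "RGB":
    img = img.convert("RGB")  # drops the alpha channel; skipped entirely for RGB inputs
assert img.mode == "RGB"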
@@ -0,0 +1,37 @@
+from typing import List, Union
+
+from sglang.srt.models.deepseek_ocr import DeepseekOCRForCausalLM
+from sglang.srt.multimodal.processors.base_processor import (
+    BaseMultimodalProcessor,
+    MultimodalSpecialTokens,
+)
+
+
+class DeepseekOCRProcessor(BaseMultimodalProcessor):
+    models = [DeepseekOCRForCausalLM]
+
+    def __init__(self, hf_config, server_args, _processor, *args, **kwargs):
+        _processor.image_size = 640
+        super().__init__(hf_config, server_args, _processor, *args, **kwargs)
+        self.mm_tokens = MultimodalSpecialTokens(
+            image_token="<image>", image_token_id=self._processor.image_token_id
+        ).build(_processor)
+
+    async def process_mm_data_async(
+        self, image_data: List[Union[str, bytes]], input_text, *args, **kwargs
+    ):
+        base_output = self.load_mm_data(
+            prompt=input_text,
+            multimodal_tokens=self.mm_tokens,
+            image_data=image_data,
+        )
+
+        mm_items, input_ids, _ = self.process_and_combine_mm_data(
+            base_output, self.mm_tokens
+        )
+
+        return {
+            "input_ids": input_ids.tolist(),
+            "mm_items": mm_items,
+            "im_token_id": self.mm_tokens.image_token_id,
+        }
@@ -18,9 +18,6 @@
 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 from typing import List, Union
 
-import torch
-
-from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
 from sglang.srt.models.deepseek_vl2 import DeepseekVL2ForCausalLM
 from sglang.srt.multimodal.processors.base_processor import (
     BaseMultimodalProcessor,
@@ -1,5 +1,4 @@
 import asyncio
-import math
 import re
 from typing import Dict, List, Union
 
@@ -1,4 +1,3 @@
-import re
 from typing import List, Union
 
 from decord import VideoReader
@@ -9,10 +8,7 @@ from sglang.srt.models.glm4v_moe import Glm4vMoeForConditionalGeneration
 from sglang.srt.multimodal.processors.base_processor import (
     BaseMultimodalProcessor as SGLangBaseProcessor,
 )
-from sglang.srt.multimodal.processors.base_processor import (
-    BaseMultiModalProcessorOutput,
-    MultimodalSpecialTokens,
-)
+from sglang.srt.multimodal.processors.base_processor import MultimodalSpecialTokens
 
 
 class Glm4vImageProcessor(SGLangBaseProcessor):
@@ -21,7 +17,7 @@ class Glm4vImageProcessor(SGLangBaseProcessor):
     def __init__(self, hf_config, server_args, _processor, *args, **kwargs):
         super().__init__(hf_config, server_args, _processor, *args, **kwargs)
 
-        # GLM-4.1V and GLM-4.5V specific tokens
+        # GLM-V specific tokens
         self.IMAGE_TOKEN = "<|image|>"
         self.VIDEO_TOKEN = "<|video|>"
         self.IMAGE_START_TOKEN = "<|begin_of_image|>"
@@ -4,10 +4,8 @@ from functools import lru_cache
 
 import numpy as np
 import torch
-import torchvision.transforms as T
 from decord import VideoReader, cpu, gpu
 from PIL import Image
-from torchvision.transforms import InterpolationMode
 
 from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
 from sglang.srt.models.interns1 import InternS1ForConditionalGeneration
@@ -1,6 +1,5 @@
 from typing import List, Union
 
-from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
 from sglang.srt.models.deepseek_janus_pro import MultiModalityCausalLM
 from sglang.srt.multimodal.processors.base_processor import (
     BaseMultimodalProcessor,
@@ -1,13 +1,5 @@
 from typing import List, Union
 
-import torch
-from transformers.image_utils import SizeDict
-from transformers.models.llama4.image_processing_llama4_fast import (
-    find_supported_resolutions,
-    get_best_fit,
-)
-
-from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
 from sglang.srt.models.mllama4 import Llama4ForConditionalGeneration
 from sglang.srt.multimodal.processors.base_processor import (
     BaseMultimodalProcessor,
@@ -1,64 +1,72 @@
-from typing import Any, Dict, List, Optional, Type
+from typing import Any
 
 import torch.nn as nn
 from transformers.configuration_utils import PretrainedConfig
 from transformers.processing_utils import ProcessorMixin
 from transformers.tokenization_utils_base import PreTrainedTokenizerBase
 
-from sglang.srt.managers.io_struct import (
-    EmbeddingReqInput,
-    GenerateReqInput,
-    ImageDataInputItem,
-)
-from sglang.srt.models.vila import VILAForConditionalGeneration
+from sglang.srt.managers.io_struct import GenerateReqInput
+from sglang.srt.models.nvila import NVILAForConditionalGeneration
+from sglang.srt.models.nvila_lite import NVILALiteForConditionalGeneration
 from sglang.srt.multimodal.processors.base_processor import (
     BaseMultimodalProcessor,
     MultimodalSpecialTokens,
 )
 from sglang.srt.server_args import ServerArgs
 
+NUM_VIDEO_FRAMES = 8
 
-class VILAProcessor(ProcessorMixin):
-    """A stub class for the VILA processor."""
-
-    tokenizer: PreTrainedTokenizerBase
-
-
-class VILAMultimodalProcessor(BaseMultimodalProcessor):
-    models: List[Type[nn.Module]] = [VILAForConditionalGeneration]
 
-    _processor: VILAProcessor
+class NVILAMultimodalProcessor(BaseMultimodalProcessor):
+    models: list[type[nn.Module]] = [
+        NVILAForConditionalGeneration,
+        NVILALiteForConditionalGeneration,
+    ]
 
     def __init__(
         self,
         hf_config: PretrainedConfig,
         server_args: ServerArgs,
-        _processor: VILAProcessor,
+        _processor: ProcessorMixin,
         *args,
         **kwargs,
     ) -> None:
         super().__init__(hf_config, server_args, _processor, *args, **kwargs)
+
+        self._processor: ProcessorMixin
+
+        tokenizer: PreTrainedTokenizerBase = getattr(self._processor, "tokenizer")
+
         self.mm_tokens = MultimodalSpecialTokens(
-            image_token=self._processor.tokenizer.image_token,
+            image_token=tokenizer.image_token,
             image_token_id=hf_config.image_token_id,
+            video_token=tokenizer.video_token,
             video_token_id=hf_config.video_token_id,
         ).build(_processor)
 
     async def process_mm_data_async(
         self,
-        image_data: Optional[ImageDataInputItem | List[ImageDataInputItem]],
-        input_text: str | List[int],
-        request_obj: GenerateReqInput | EmbeddingReqInput,
+        image_data,
+        audio_data,
+        input_text,
+        request_obj: GenerateReqInput,
         **kwargs,
-    ) -> Optional[Dict[str, Any]]:
+    ) -> dict[str, Any] | None:
         base_output = self.load_mm_data(
             prompt=input_text,
             multimodal_tokens=self.mm_tokens,
-            image_data=image_data,
+            image_data=request_obj.image_data,  # type: ignore
+            video_data=request_obj.video_data,  # type: ignore
         )
 
+        for i, video in enumerate(base_output.videos):  # type: ignore
+            base_output.videos[i] = [x.asnumpy() for x in video]  # type: ignore
+
         mm_items, input_ids, _ = self.process_and_combine_mm_data(
-            base_output, self.mm_tokens
+            base_output,
+            self.mm_tokens,
+            do_sample_frames=True,
+            num_frames=NUM_VIDEO_FRAMES,
         )
 
         return {
@@ -3,7 +3,6 @@ from typing import List, Union
 
 from transformers.processing_utils import ProcessorMixin
 
-from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
 from sglang.srt.models.phi4mm import Phi4MMForCausalLM
 from sglang.srt.multimodal.processors.base_processor import (
     BaseMultimodalProcessor,
@@ -0,0 +1,52 @@
+# Copy from qwen_vl.py, adapted for points-v15-chat
+
+import asyncio
+from typing import List, Union
+
+from PIL import Image
+
+from sglang.srt.models.points_v15_chat import POINTSV15ChatModel
+from sglang.srt.multimodal.processors.qwen_vl import (
+    QwenVLImageProcessor,
+    resize_image_async,
+)
+
+
+class POINTSV15ChatProcessor(QwenVLImageProcessor):
+    models = [POINTSV15ChatModel]
+
+    def __init__(self, hf_config, server_args, _processor, *args, **kwargs):
+        # Compatible with POINTSV15Chat
+        hf_config.vision_start_token_id = None
+        hf_config.vision_end_token_id = None
+        hf_config.video_token_id = None
+
+        super().__init__(hf_config, server_args, _processor, *args, **kwargs)
+
+    async def process_mm_data_async(
+        self,
+        image_data: List[Union[str, bytes]],
+        input_text,
+        request_obj,
+        *args,
+        **kwargs,
+    ):
+        base_output = self.load_mm_data(
+            prompt=input_text,
+            image_data=image_data,
+            multimodal_tokens=self.mm_tokens,
+        )
+
+        if base_output.images and isinstance(base_output.images[0], Image.Image):
+            resize_tasks = [resize_image_async(image) for image in base_output.images]
+            base_output.images = await asyncio.gather(*resize_tasks)
+
+        mm_items, input_ids, _ = self.process_and_combine_mm_data(
+            base_output, self.mm_tokens
+        )
+
+        return {
+            "input_ids": input_ids.tolist(),
+            "mm_items": mm_items,
+            "im_token_id": self.mm_tokens.image_token_id,
+        }
@@ -9,9 +9,11 @@ import torchvision
 from PIL import Image
 from torchvision.transforms import InterpolationMode
 
+from sglang.srt.environ import envs
 from sglang.srt.layers.rotary_embedding import MRotaryEmbedding
 from sglang.srt.models.qwen2_5_vl import Qwen2_5_VLForConditionalGeneration
 from sglang.srt.models.qwen2_vl import Qwen2VLForConditionalGeneration
+from sglang.srt.models.qwen3_omni_moe import Qwen3OmniMoeForConditionalGeneration
 from sglang.srt.models.qwen3_vl import Qwen3VLForConditionalGeneration
 from sglang.srt.models.qwen3_vl_moe import Qwen3VLMoeForConditionalGeneration
 from sglang.srt.multimodal.processors.base_processor import (
@@ -22,8 +24,14 @@ from sglang.utils import logger
 
 IMAGE_FACTOR = 28
 MIN_PIXELS = 4 * 28 * 28
-MAX_PIXELS = 16384 * 28 * 28
+MAX_PIXELS = envs.SGLANG_IMAGE_MAX_PIXELS.get()
 MAX_RATIO = 200
+RESIZE_RESAMPLE = getattr(Image, envs.SGLANG_RESIZE_RESAMPLE.get(), None)
+if envs.SGLANG_RESIZE_RESAMPLE.is_set() and RESIZE_RESAMPLE is None:
+    logger.warning(
+        f"Invalid RESIZE_RESAMPLE value: '{envs.SGLANG_RESIZE_RESAMPLE.get()}'. "
+        f"Ignoring and using default."
+    )
 VIDEO_TOTAL_PIXELS = int(
     float(os.environ.get("VIDEO_MAX_PIXELS", 128000 * 28 * 28 * 0.9))
 )
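The `RESIZE_RESAMPLE` lookup above resolves the configured name against `PIL.Image` and falls back to `None` (letting `Image.resize` pick its default filter) when the name is unknown, which is what the warning reports. A short sketch of that resolution rule (the filter names are illustrative):

from PIL import Image

# A known attribute name resolves to a Pillow resampling constant ...
assert getattr(Image, "BICUBIC", None) is not None
# ... while an unknown name yields None, so resize() falls back to its default filter.
assert getattr(Image, "NOT_A_FILTER", None) is None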
@@ -85,7 +93,7 @@ def resize_image(
         min_pixels=min_pixels,
         max_pixels=max_pixels,
     )
-    image = image.resize((resized_width, resized_height))
+    image = image.resize((resized_width, resized_height), resample=RESIZE_RESAMPLE)
     return image
 
 
@@ -206,25 +214,41 @@ async def preprocess_video(
         interpolation=InterpolationMode.BICUBIC,
         antialias=True,
     ).float()
-    return video
-
-
-# Compatible with Qwen2VL and Qwen2_5VL
-class Qwen2_5VLImageProcessor(SGLangBaseProcessor):
+    video_metadata = {
+        "fps": video_fps,
+        "duration": total_frames / video_fps,
+        "total_num_frames": total_frames,
+        "frames_indices": idx,
+        "video_backend": "torchvision",
+    }
+    return video, video_metadata
+
+
+# Compatible with Qwen-VL & Qwen-Omni Series
+class QwenVLImageProcessor(SGLangBaseProcessor):
     models = [
         Qwen2VLForConditionalGeneration,
         Qwen2_5_VLForConditionalGeneration,
         Qwen3VLForConditionalGeneration,
         Qwen3VLMoeForConditionalGeneration,
+        Qwen3OmniMoeForConditionalGeneration,
     ]
 
     def __init__(self, hf_config, server_args, _processor, *args, **kwargs):
+        self.model_type = hf_config.model_type
+        if hf_config.model_type == "qwen3_omni_moe":
+            hf_config = hf_config.thinker_config
+
         super().__init__(hf_config, server_args, _processor, *args, **kwargs)
-        # The regex that matches expanded image tokens.
+
         self.IM_START_TOKEN_ID = hf_config.vision_start_token_id
         self.IM_END_TOKEN_ID = hf_config.vision_end_token_id
         self.vision_start_token_id = hf_config.vision_start_token_id
         self.vision_end_token_id = hf_config.vision_end_token_id
+
+        self.audio_start_token_id = getattr(hf_config, "audio_start_token_id", None)
+        self.audio_token_id = getattr(hf_config, "audio_token_id", None)
+
         self.NUM_TOKEN_PER_FRAME = 770
         self.IMAGE_FACTOR = 28
         self.MIN_PIXELS = 4 * 28 * 28
@@ -233,10 +257,12 @@ class Qwen2_5VLImageProcessor(SGLangBaseProcessor):
         self.mm_tokens = MultimodalSpecialTokens(
             image_token="<|vision_start|><|image_pad|><|vision_end|>",
             image_token_id=hf_config.image_token_id,
+            # The regex that matches expanded image tokens.
             image_token_regex=re.compile(
                 r"<\|vision_start\|>(?:<\|image_pad\|>)+<\|vision_end\|>"
             ),
             video_token_id=hf_config.video_token_id,
+            audio_token_id=self.audio_token_id,
         ).build(_processor)
 
     async def process_mm_data_async(
@@ -247,11 +273,11 @@ class Qwen2_5VLImageProcessor(SGLangBaseProcessor):
         *args,
         **kwargs,
     ):
-
         base_output = self.load_mm_data(
             prompt=input_text,
             image_data=image_data,
             video_data=request_obj.video_data,
+            audio_data=request_obj.audio_data,
             multimodal_tokens=self.mm_tokens,
         )
 
@@ -260,29 +286,61 @@ class Qwen2_5VLImageProcessor(SGLangBaseProcessor):
             resize_tasks = [resize_image_async(image) for image in base_output.images]
             base_output.images = await asyncio.gather(*resize_tasks)
 
+        video_metadata = None
         if base_output.videos:
-            base_output.videos = [
-                await preprocess_video(video) for video in base_output.videos
-            ]
+            video_results = await asyncio.gather(
+                *[preprocess_video(video) for video in base_output.videos]
+            )
+            base_output.videos, video_metadata = map(list, zip(*video_results))
+
+        # NOTE: for qwen3-vl, video_meta need to be passed in, since do_sample_frames is already done in preprocess_video
+        if self.hf_config.model_type in ("qwen3_vl", "qwen3_vl_moe"):
+            mm_items, input_ids, ret = self.process_and_combine_mm_data(
+                base_output,
+                self.mm_tokens,
+                video_metadata=video_metadata,
+                do_sample_frames=False,
+            )
+        else:
+            mm_items, input_ids, ret = self.process_and_combine_mm_data(
+                base_output, self.mm_tokens
+            )
+
+        audio_feature_lengths = None
 
-        mm_items, input_ids, ret = self.process_and_combine_mm_data(
-            base_output, self.mm_tokens
+        if self.model_type == "qwen3_omni_moe":
+            audio_item = next((mm for mm in mm_items if mm.is_audio()), None)
+            if audio_item:
+                audio_feature_lengths = torch.sum(
+                    audio_item.feature_attention_mask, dim=1
+                )
+
+        second_per_grid_ts = getattr(ret, "second_per_grid_ts", None) or getattr(
+            ret, "video_second_per_grid", None
         )
 
         input_ids = input_ids.flatten()
+
         mrope_positions, mrope_position_delta = MRotaryEmbedding.get_rope_index(
             spatial_merge_size=self.hf_config.vision_config.spatial_merge_size,
             image_token_id=self.mm_tokens.image_token_id,
             video_token_id=self.mm_tokens.video_token_id,
             vision_start_token_id=self.vision_start_token_id,
-            model_type=self.hf_config.model_type,
+            model_type=self.model_type,
             tokens_per_second=getattr(
                 self.hf_config.vision_config, "tokens_per_second", None
             ),
             input_ids=input_ids.unsqueeze(0),
             image_grid_thw=getattr(ret, "image_grid_thw", None),
             video_grid_thw=getattr(ret, "video_grid_thw", None),
-            second_per_grid_ts=getattr(ret, "second_per_grid_ts", None),
+            second_per_grid_ts=second_per_grid_ts,
+            use_audio_in_video=False,
+            audio_seqlens=audio_feature_lengths,
+            audio_token_id=getattr(self.hf_config, "audio_token_id", None),
+            audio_start_token_id=self.audio_start_token_id,
+            position_id_per_seconds=getattr(
+                self.hf_config, "position_id_per_seconds", None
+            ),
         )
         mrope_positions = mrope_positions.squeeze(1)
 
@@ -293,6 +351,7 @@ class Qwen2_5VLImageProcessor(SGLangBaseProcessor):
             "im_end_id": self.IM_END_TOKEN_ID,
             "im_token_id": self.mm_tokens.image_token_id,
             "video_token_id": self.mm_tokens.video_token_id,
+            "audio_token_id": self.mm_tokens.audio_token_id,
            "mrope_positions": mrope_positions,
            "mrope_position_delta": mrope_position_delta,
        }
@@ -1,7 +1,7 @@
 import math
 import re
 from itertools import product
-from typing import List, Literal, Optional, TypedDict, Union
+from typing import List, Optional, Union
 
 import numpy as np
 import torch
@@ -838,6 +838,19 @@ register_conv_template(
     )
 )
 
+register_conv_template(
+    Conversation(
+        name="deepseek-ocr",
+        system_message="",
+        system_template="",
+        roles=("", ""),
+        sep="",
+        sep_style=SeparatorStyle.NO_COLON_SINGLE,
+        stop_str=["<|end▁of▁sentence|>"],
+        image_token="<image>",
+    )
+)
+
 register_conv_template(
     Conversation(
         name="deepseek-vl2",
@@ -960,6 +973,19 @@ register_conv_template(
     )
 )
 
+register_conv_template(
+    Conversation(
+        name="points-v15-chat",
+        system_message="",
+        system_template="",
+        roles=("<|im_start|>user", "<|im_start|>assistant"),
+        sep="<|im_end|>\n",
+        sep_style=SeparatorStyle.ADD_NEW_LINE_SINGLE,
+        stop_str=["<|im_end|>"],
+        image_token="<|vision_start|><|image_pad|><|vision_end|>",
+        video_token="<|vision_start|><|video_pad|><|vision_end|>",
+    )
+)
 
 MODEL_TYPE_TO_TEMPLATE = {
     "internvl_chat": "internvl-2-5",
@@ -968,9 +994,16 @@ MODEL_TYPE_TO_TEMPLATE = {
     "phi4mm": "phi-4-mm",
     "minicpmv": "minicpmv",
     "minicpmo": "minicpmo",
+    "deepseek-ocr": "deepseek-ocr",
 }
 
 
+@register_conv_template_matching_function
+def match_points_v15_chat(model_path: str):
+    if re.search(r"points", model_path, re.IGNORECASE):
+        return "points-v15-chat"
+
+
 def get_model_type(model_path: str) -> Optional[str]:
     config_path = os.path.join(model_path, "config.json")
     if not os.path.exists(config_path):
@@ -1038,3 +1071,11 @@ def match_phi_4_mm(model_path: str):
         return "phi-4-mm"
     model_type = get_model_type(model_path)
     return MODEL_TYPE_TO_TEMPLATE.get(model_type)
+
+
+@register_conv_template_matching_function
+def match_deepseek_ocr(model_path: str):
+    if "deepseek-ocr" in model_path.lower():
+        return "deepseek-ocr"
+    model_type = get_model_type(model_path)
+    return MODEL_TYPE_TO_TEMPLATE.get(model_type)
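The new matching functions keep the existing precedence: a substring match on the model path wins, otherwise the `model_type` read from `config.json` is looked up in `MODEL_TYPE_TO_TEMPLATE`. A hypothetical standalone version of the deepseek-ocr rule, with `model_type` passed in directly and a stub table instead of the real module-level mapping:

from typing import Optional

MODEL_TYPE_TO_TEMPLATE = {"deepseek-ocr": "deepseek-ocr"}  # stub of the real table

def match_deepseek_ocr(model_path: str, model_type: Optional[str]) -> Optional[str]:
    # Path-based match first, then fall back to the model_type lookup.
    if "deepseek-ocr" in model_path.lower():
        return "deepseek-ocr"
    return MODEL_TYPE_TO_TEMPLATE.get(model_type)

assert match_deepseek_ocr("org/DeepSeek-OCR", None) == "deepseek-ocr"
assert match_deepseek_ocr("/models/other", "deepseek-ocr") == "deepseek-ocr"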
@@ -1,4 +1,3 @@
-import re
 from typing import Dict, Optional, Tuple, Type
 
 from sglang.srt.parser.harmony_parser import HarmonyParser
@@ -250,6 +249,31 @@ class GptOssDetector(BaseReasoningFormatDetector):
         )
 
 
+class MiniMaxAppendThinkDetector(BaseReasoningFormatDetector):
+    """
+    Append `<think>` token to the beginning of the text.
+    """
+
+    def __init__(self, stream_reasoning: bool = True, force_reasoning: bool = False):
+        # scheduler.py need `reasoning_parser.detector.think_end_token`
+        super().__init__(
+            "<think>",
+            "</think>",
+            force_reasoning=force_reasoning,
+            stream_reasoning=stream_reasoning,
+        )
+        self.is_first_chunk = False
+
+    def parse_streaming_increment(self, new_text: str) -> StreamingParseResult:
+        if not self.is_first_chunk:
+            self.is_first_chunk = True
+            new_text = self.think_start_token + new_text
+        return StreamingParseResult(normal_text=new_text)
+
+    def detect_and_parse(self, text: str) -> StreamingParseResult:
+        return StreamingParseResult(normal_text=self.think_start_token + text)
+
+
 class ReasoningParser:
     """
     Parser that handles both streaming and non-streaming scenarios for extracting
@@ -269,6 +293,8 @@ class ReasoningParser:
         "kimi": KimiDetector,
         "qwen3": Qwen3Detector,
         "qwen3-thinking": Qwen3Detector,
+        "minimax": Qwen3Detector,
+        "minimax-append-think": MiniMaxAppendThinkDetector,
         "step3": DeepSeekR1Detector,
     }
 
@@ -286,7 +312,7 @@ class ReasoningParser:
             raise ValueError(f"Unsupported model type: {model_type}")
 
         # Special cases where we override force_reasoning
-        if model_type.lower() in {"qwen3-thinking", "gpt-oss"}:
+        if model_type.lower() in {"qwen3-thinking", "gpt-oss", "minimax"}:
             force_reasoning = True
 
         # Only pass force_reasoning if explicitly set, let detectors use their defaults
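For reference, the new `minimax-append-think` detector only prepends `<think>` to the first streamed chunk (or to the whole text in the non-streaming path); later chunks pass through untouched. A minimal re-statement of that rule outside the detector class (the class and method names below are hypothetical):

class ThinkPrepender:
    # Stand-in for MiniMaxAppendThinkDetector's first-chunk behaviour.
    def __init__(self) -> None:
        self.is_first_chunk = False

    def feed(self, new_text: str) -> str:
        if not self.is_first_chunk:
            self.is_first_chunk = True
            new_text = "<think>" + new_text
        return new_text

p = ThinkPrepender()
assert p.feed("reasoning...") == "<think>reasoning..."
assert p.feed(" answer") == " answer"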