sglang 0.5.3rc0__py3-none-any.whl → 0.5.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (482)
  1. sglang/bench_one_batch.py +54 -37
  2. sglang/bench_one_batch_server.py +340 -34
  3. sglang/bench_serving.py +340 -159
  4. sglang/check_env.py +1 -1
  5. sglang/compile_deep_gemm.py +6 -2
  6. sglang/global_config.py +1 -25
  7. sglang/lang/api.py +6 -0
  8. sglang/lang/backend/runtime_endpoint.py +1 -1
  9. sglang/lang/interpreter.py +1 -0
  10. sglang/lang/ir.py +13 -0
  11. sglang/launch_server.py +9 -2
  12. sglang/profiler.py +20 -3
  13. sglang/srt/_custom_ops.py +1 -1
  14. sglang/srt/batch_invariant_ops/__init__.py +27 -0
  15. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +547 -0
  16. sglang/srt/checkpoint_engine/checkpoint_engine_worker.py +142 -0
  17. sglang/srt/compilation/backend.py +437 -0
  18. sglang/srt/compilation/compilation_config.py +20 -0
  19. sglang/srt/compilation/compilation_counter.py +47 -0
  20. sglang/srt/compilation/compile.py +210 -0
  21. sglang/srt/compilation/compiler_interface.py +503 -0
  22. sglang/srt/compilation/cuda_piecewise_backend.py +228 -0
  23. sglang/srt/compilation/fix_functionalization.py +134 -0
  24. sglang/srt/compilation/fx_utils.py +83 -0
  25. sglang/srt/compilation/inductor_pass.py +140 -0
  26. sglang/srt/compilation/pass_manager.py +66 -0
  27. sglang/srt/compilation/piecewise_context_manager.py +40 -0
  28. sglang/srt/compilation/weak_ref_tensor_jit.py +16 -0
  29. sglang/srt/configs/__init__.py +8 -0
  30. sglang/srt/configs/deepseek_ocr.py +262 -0
  31. sglang/srt/configs/deepseekvl2.py +194 -96
  32. sglang/srt/configs/dots_ocr.py +64 -0
  33. sglang/srt/configs/dots_vlm.py +2 -7
  34. sglang/srt/configs/falcon_h1.py +309 -0
  35. sglang/srt/configs/load_config.py +33 -2
  36. sglang/srt/configs/mamba_utils.py +117 -0
  37. sglang/srt/configs/model_config.py +284 -118
  38. sglang/srt/configs/modelopt_config.py +30 -0
  39. sglang/srt/configs/nemotron_h.py +286 -0
  40. sglang/srt/configs/olmo3.py +105 -0
  41. sglang/srt/configs/points_v15_chat.py +29 -0
  42. sglang/srt/configs/qwen3_next.py +11 -47
  43. sglang/srt/configs/qwen3_omni.py +613 -0
  44. sglang/srt/configs/qwen3_vl.py +576 -0
  45. sglang/srt/connector/remote_instance.py +1 -1
  46. sglang/srt/constrained/base_grammar_backend.py +6 -1
  47. sglang/srt/constrained/llguidance_backend.py +5 -0
  48. sglang/srt/constrained/outlines_backend.py +1 -1
  49. sglang/srt/constrained/outlines_jump_forward.py +1 -1
  50. sglang/srt/constrained/reasoner_grammar_backend.py +9 -6
  51. sglang/srt/constrained/utils.py +12 -0
  52. sglang/srt/constrained/xgrammar_backend.py +26 -15
  53. sglang/srt/debug_utils/dumper.py +10 -3
  54. sglang/srt/disaggregation/ascend/conn.py +2 -2
  55. sglang/srt/disaggregation/ascend/transfer_engine.py +48 -10
  56. sglang/srt/disaggregation/base/conn.py +17 -4
  57. sglang/srt/disaggregation/common/conn.py +268 -98
  58. sglang/srt/disaggregation/decode.py +172 -39
  59. sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
  60. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +25 -16
  61. sglang/srt/disaggregation/fake/conn.py +11 -3
  62. sglang/srt/disaggregation/mooncake/conn.py +203 -555
  63. sglang/srt/disaggregation/nixl/conn.py +217 -63
  64. sglang/srt/disaggregation/prefill.py +113 -270
  65. sglang/srt/disaggregation/utils.py +36 -5
  66. sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
  67. sglang/srt/distributed/device_communicators/custom_all_reduce.py +6 -6
  68. sglang/srt/distributed/device_communicators/pymscclpp.py +2 -2
  69. sglang/srt/distributed/device_communicators/pynccl.py +24 -12
  70. sglang/srt/distributed/device_communicators/pynccl_allocator.py +2 -2
  71. sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
  72. sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
  73. sglang/srt/distributed/naive_distributed.py +5 -4
  74. sglang/srt/distributed/parallel_state.py +203 -97
  75. sglang/srt/elastic_ep/elastic_ep.py +74 -0
  76. sglang/srt/entrypoints/context.py +3 -2
  77. sglang/srt/entrypoints/engine.py +85 -65
  78. sglang/srt/entrypoints/grpc_server.py +632 -305
  79. sglang/srt/entrypoints/harmony_utils.py +2 -2
  80. sglang/srt/entrypoints/http_server.py +169 -17
  81. sglang/srt/entrypoints/http_server_engine.py +1 -7
  82. sglang/srt/entrypoints/openai/protocol.py +327 -34
  83. sglang/srt/entrypoints/openai/serving_base.py +74 -8
  84. sglang/srt/entrypoints/openai/serving_chat.py +202 -118
  85. sglang/srt/entrypoints/openai/serving_classify.py +204 -0
  86. sglang/srt/entrypoints/openai/serving_completions.py +20 -4
  87. sglang/srt/entrypoints/openai/serving_embedding.py +1 -0
  88. sglang/srt/entrypoints/openai/serving_responses.py +47 -2
  89. sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
  90. sglang/srt/environ.py +323 -0
  91. sglang/srt/eplb/eplb_algorithms/__init__.py +18 -1
  92. sglang/srt/eplb/eplb_algorithms/deepseek.py +0 -2
  93. sglang/srt/eplb/eplb_algorithms/elasticity_aware.py +87 -0
  94. sglang/srt/eplb/expert_distribution.py +3 -4
  95. sglang/srt/eplb/expert_location.py +30 -5
  96. sglang/srt/eplb/expert_location_dispatch.py +2 -2
  97. sglang/srt/eplb/expert_location_updater.py +2 -2
  98. sglang/srt/function_call/base_format_detector.py +17 -18
  99. sglang/srt/function_call/function_call_parser.py +21 -16
  100. sglang/srt/function_call/glm4_moe_detector.py +4 -8
  101. sglang/srt/function_call/gpt_oss_detector.py +24 -1
  102. sglang/srt/function_call/json_array_parser.py +61 -0
  103. sglang/srt/function_call/kimik2_detector.py +17 -4
  104. sglang/srt/function_call/utils.py +98 -7
  105. sglang/srt/grpc/compile_proto.py +245 -0
  106. sglang/srt/grpc/grpc_request_manager.py +915 -0
  107. sglang/srt/grpc/health_servicer.py +189 -0
  108. sglang/srt/grpc/scheduler_launcher.py +181 -0
  109. sglang/srt/grpc/sglang_scheduler_pb2.py +81 -68
  110. sglang/srt/grpc/sglang_scheduler_pb2.pyi +124 -61
  111. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +92 -1
  112. sglang/srt/layers/activation.py +11 -7
  113. sglang/srt/layers/attention/aiter_backend.py +17 -18
  114. sglang/srt/layers/attention/ascend_backend.py +125 -10
  115. sglang/srt/layers/attention/attention_registry.py +226 -0
  116. sglang/srt/layers/attention/base_attn_backend.py +32 -4
  117. sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
  118. sglang/srt/layers/attention/double_sparsity_backend.py +2 -2
  119. sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
  120. sglang/srt/layers/attention/fla/chunk.py +0 -1
  121. sglang/srt/layers/attention/fla/chunk_o.py +1 -1
  122. sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +2 -2
  123. sglang/srt/layers/attention/fla/fused_recurrent.py +4 -4
  124. sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +2 -2
  125. sglang/srt/layers/attention/fla/index.py +0 -2
  126. sglang/srt/layers/attention/fla/layernorm_gated.py +50 -32
  127. sglang/srt/layers/attention/fla/utils.py +0 -3
  128. sglang/srt/layers/attention/fla/wy_fast.py +0 -2
  129. sglang/srt/layers/attention/flashattention_backend.py +52 -15
  130. sglang/srt/layers/attention/flashinfer_backend.py +357 -212
  131. sglang/srt/layers/attention/flashinfer_mla_backend.py +31 -33
  132. sglang/srt/layers/attention/flashmla_backend.py +9 -7
  133. sglang/srt/layers/attention/hybrid_attn_backend.py +12 -4
  134. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +236 -133
  135. sglang/srt/layers/attention/intel_amx_backend.py +1 -1
  136. sglang/srt/layers/attention/mamba/causal_conv1d.py +2 -1
  137. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +24 -103
  138. sglang/srt/layers/attention/mamba/mamba.py +514 -1
  139. sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
  140. sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
  141. sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
  142. sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
  143. sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
  144. sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +214 -0
  145. sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +562 -0
  146. sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +646 -0
  147. sglang/srt/layers/attention/mamba/ops/ssd_combined.py +261 -0
  148. sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +264 -0
  149. sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
  150. sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
  151. sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
  152. sglang/srt/layers/attention/nsa/nsa_indexer.py +718 -0
  153. sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
  154. sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
  155. sglang/srt/layers/attention/nsa/transform_index.py +144 -0
  156. sglang/srt/layers/attention/nsa/triton_kernel.py +136 -0
  157. sglang/srt/layers/attention/nsa/utils.py +23 -0
  158. sglang/srt/layers/attention/nsa_backend.py +1201 -0
  159. sglang/srt/layers/attention/tbo_backend.py +6 -6
  160. sglang/srt/layers/attention/torch_flex_backend.py +325 -0
  161. sglang/srt/layers/attention/triton_backend.py +249 -42
  162. sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +2 -2
  163. sglang/srt/layers/attention/triton_ops/extend_attention.py +539 -44
  164. sglang/srt/layers/attention/trtllm_mha_backend.py +7 -9
  165. sglang/srt/layers/attention/trtllm_mla_backend.py +523 -48
  166. sglang/srt/layers/attention/utils.py +11 -7
  167. sglang/srt/layers/attention/vision.py +61 -3
  168. sglang/srt/layers/attention/wave_backend.py +4 -4
  169. sglang/srt/layers/attention/xpu_backend.py +1028 -0
  170. sglang/srt/layers/communicator.py +19 -7
  171. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/compile_utils.py +4 -8
  172. sglang/srt/layers/deep_gemm_wrapper/configurer.py +25 -0
  173. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/entrypoint.py +3 -3
  174. sglang/srt/layers/dp_attention.py +28 -1
  175. sglang/srt/layers/elementwise.py +3 -1
  176. sglang/srt/layers/layernorm.py +47 -15
  177. sglang/srt/layers/linear.py +30 -5
  178. sglang/srt/layers/logits_processor.py +161 -18
  179. sglang/srt/layers/modelopt_utils.py +11 -0
  180. sglang/srt/layers/moe/cutlass_moe.py +0 -2
  181. sglang/srt/layers/moe/cutlass_w4a8_moe.py +213 -21
  182. sglang/srt/layers/moe/ep_moe/kernels.py +36 -458
  183. sglang/srt/layers/moe/ep_moe/layer.py +243 -448
  184. sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +52 -25
  185. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  186. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_B200.json +146 -0
  187. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  188. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  189. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
  190. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +17 -5
  191. sglang/srt/layers/moe/fused_moe_triton/layer.py +86 -81
  192. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +18 -42
  193. sglang/srt/layers/moe/moe_runner/deep_gemm.py +304 -0
  194. sglang/srt/layers/moe/moe_runner/runner.py +3 -0
  195. sglang/srt/layers/moe/moe_runner/triton.py +3 -1
  196. sglang/srt/layers/moe/rocm_moe_utils.py +0 -1
  197. sglang/srt/layers/moe/router.py +51 -15
  198. sglang/srt/layers/moe/token_dispatcher/__init__.py +10 -0
  199. sglang/srt/layers/moe/token_dispatcher/base.py +1 -1
  200. sglang/srt/layers/moe/token_dispatcher/deepep.py +177 -106
  201. sglang/srt/layers/moe/token_dispatcher/mooncake.py +386 -0
  202. sglang/srt/layers/moe/token_dispatcher/standard.py +46 -0
  203. sglang/srt/layers/moe/topk.py +3 -2
  204. sglang/srt/layers/moe/utils.py +27 -1
  205. sglang/srt/layers/parameter.py +23 -6
  206. sglang/srt/layers/quantization/__init__.py +2 -53
  207. sglang/srt/layers/quantization/awq.py +183 -6
  208. sglang/srt/layers/quantization/awq_triton.py +29 -0
  209. sglang/srt/layers/quantization/base_config.py +20 -1
  210. sglang/srt/layers/quantization/compressed_tensors/__init__.py +7 -0
  211. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +21 -49
  212. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +421 -70
  213. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +5 -0
  214. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +4 -22
  215. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
  216. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +339 -0
  217. sglang/srt/layers/quantization/fp8.py +86 -20
  218. sglang/srt/layers/quantization/fp8_kernel.py +55 -10
  219. sglang/srt/layers/quantization/fp8_utils.py +43 -15
  220. sglang/srt/layers/quantization/fpgemm_fp8.py +2 -3
  221. sglang/srt/layers/quantization/gptq.py +0 -1
  222. sglang/srt/layers/quantization/int8_kernel.py +18 -2
  223. sglang/srt/layers/quantization/marlin_utils.py +12 -0
  224. sglang/srt/layers/quantization/modelopt_quant.py +141 -81
  225. sglang/srt/layers/quantization/mxfp4.py +17 -34
  226. sglang/srt/layers/quantization/petit.py +1 -1
  227. sglang/srt/layers/quantization/quark/quark.py +3 -1
  228. sglang/srt/layers/quantization/quark/quark_moe.py +18 -5
  229. sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +0 -7
  230. sglang/srt/layers/quantization/unquant.py +1 -4
  231. sglang/srt/layers/quantization/utils.py +0 -1
  232. sglang/srt/layers/quantization/w4afp8.py +51 -24
  233. sglang/srt/layers/quantization/w8a8_int8.py +45 -27
  234. sglang/srt/layers/radix_attention.py +59 -9
  235. sglang/srt/layers/rotary_embedding.py +750 -46
  236. sglang/srt/layers/sampler.py +84 -16
  237. sglang/srt/layers/sparse_pooler.py +98 -0
  238. sglang/srt/layers/utils.py +23 -1
  239. sglang/srt/layers/vocab_parallel_embedding.py +4 -1
  240. sglang/srt/lora/backend/base_backend.py +3 -3
  241. sglang/srt/lora/backend/chunked_backend.py +348 -0
  242. sglang/srt/lora/backend/triton_backend.py +9 -4
  243. sglang/srt/lora/eviction_policy.py +139 -0
  244. sglang/srt/lora/lora.py +7 -5
  245. sglang/srt/lora/lora_manager.py +33 -7
  246. sglang/srt/lora/lora_registry.py +1 -1
  247. sglang/srt/lora/mem_pool.py +41 -17
  248. sglang/srt/lora/triton_ops/__init__.py +4 -0
  249. sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
  250. sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +176 -0
  251. sglang/srt/lora/utils.py +7 -5
  252. sglang/srt/managers/cache_controller.py +83 -152
  253. sglang/srt/managers/data_parallel_controller.py +156 -87
  254. sglang/srt/managers/detokenizer_manager.py +51 -24
  255. sglang/srt/managers/io_struct.py +223 -129
  256. sglang/srt/managers/mm_utils.py +49 -10
  257. sglang/srt/managers/multi_tokenizer_mixin.py +83 -98
  258. sglang/srt/managers/multimodal_processor.py +1 -2
  259. sglang/srt/managers/overlap_utils.py +130 -0
  260. sglang/srt/managers/schedule_batch.py +340 -529
  261. sglang/srt/managers/schedule_policy.py +158 -18
  262. sglang/srt/managers/scheduler.py +665 -620
  263. sglang/srt/managers/scheduler_input_blocker.py +1 -1
  264. sglang/srt/managers/scheduler_metrics_mixin.py +150 -131
  265. sglang/srt/managers/scheduler_output_processor_mixin.py +337 -122
  266. sglang/srt/managers/scheduler_pp_mixin.py +341 -0
  267. sglang/srt/managers/scheduler_profiler_mixin.py +62 -15
  268. sglang/srt/managers/scheduler_runtime_checker_mixin.py +217 -0
  269. sglang/srt/managers/scheduler_update_weights_mixin.py +40 -14
  270. sglang/srt/managers/tokenizer_communicator_mixin.py +141 -19
  271. sglang/srt/managers/tokenizer_manager.py +462 -226
  272. sglang/srt/managers/tp_worker.py +217 -156
  273. sglang/srt/managers/utils.py +79 -47
  274. sglang/srt/mem_cache/allocator.py +21 -22
  275. sglang/srt/mem_cache/allocator_ascend.py +42 -28
  276. sglang/srt/mem_cache/base_prefix_cache.py +3 -3
  277. sglang/srt/mem_cache/chunk_cache.py +20 -2
  278. sglang/srt/mem_cache/common.py +480 -0
  279. sglang/srt/mem_cache/evict_policy.py +38 -0
  280. sglang/srt/mem_cache/hicache_storage.py +44 -2
  281. sglang/srt/mem_cache/hiradix_cache.py +134 -34
  282. sglang/srt/mem_cache/mamba_radix_cache.py +993 -0
  283. sglang/srt/mem_cache/memory_pool.py +602 -208
  284. sglang/srt/mem_cache/memory_pool_host.py +134 -183
  285. sglang/srt/mem_cache/multimodal_cache.py +0 -1
  286. sglang/srt/mem_cache/radix_cache.py +263 -78
  287. sglang/srt/mem_cache/radix_cache_cpp.py +29 -21
  288. sglang/srt/mem_cache/storage/__init__.py +10 -0
  289. sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +157 -0
  290. sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +97 -0
  291. sglang/srt/mem_cache/storage/backend_factory.py +223 -0
  292. sglang/srt/mem_cache/storage/eic/eic_storage.py +777 -0
  293. sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
  294. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +0 -1
  295. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +180 -59
  296. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +15 -9
  297. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +217 -26
  298. sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +38 -9
  299. sglang/srt/mem_cache/storage/nixl/nixl_utils.py +1 -1
  300. sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +17 -2
  301. sglang/srt/mem_cache/swa_radix_cache.py +115 -58
  302. sglang/srt/metrics/collector.py +113 -120
  303. sglang/srt/metrics/func_timer.py +3 -8
  304. sglang/srt/metrics/utils.py +8 -1
  305. sglang/srt/model_executor/cpu_graph_runner.py +2 -2
  306. sglang/srt/model_executor/cuda_graph_runner.py +81 -36
  307. sglang/srt/model_executor/forward_batch_info.py +40 -50
  308. sglang/srt/model_executor/model_runner.py +507 -319
  309. sglang/srt/model_executor/npu_graph_runner.py +11 -5
  310. sglang/srt/model_executor/piecewise_cuda_graph_runner.py +539 -0
  311. sglang/srt/model_loader/__init__.py +1 -1
  312. sglang/srt/model_loader/loader.py +438 -37
  313. sglang/srt/model_loader/utils.py +0 -1
  314. sglang/srt/model_loader/weight_utils.py +200 -27
  315. sglang/srt/models/apertus.py +2 -3
  316. sglang/srt/models/arcee.py +2 -2
  317. sglang/srt/models/bailing_moe.py +40 -56
  318. sglang/srt/models/bailing_moe_nextn.py +3 -4
  319. sglang/srt/models/bert.py +1 -1
  320. sglang/srt/models/deepseek_nextn.py +25 -4
  321. sglang/srt/models/deepseek_ocr.py +1516 -0
  322. sglang/srt/models/deepseek_v2.py +793 -235
  323. sglang/srt/models/dots_ocr.py +171 -0
  324. sglang/srt/models/dots_vlm.py +0 -1
  325. sglang/srt/models/dots_vlm_vit.py +1 -1
  326. sglang/srt/models/falcon_h1.py +570 -0
  327. sglang/srt/models/gemma3_causal.py +0 -2
  328. sglang/srt/models/gemma3_mm.py +17 -1
  329. sglang/srt/models/gemma3n_mm.py +2 -3
  330. sglang/srt/models/glm4_moe.py +17 -40
  331. sglang/srt/models/glm4_moe_nextn.py +4 -4
  332. sglang/srt/models/glm4v.py +3 -2
  333. sglang/srt/models/glm4v_moe.py +6 -6
  334. sglang/srt/models/gpt_oss.py +12 -35
  335. sglang/srt/models/grok.py +10 -23
  336. sglang/srt/models/hunyuan.py +2 -7
  337. sglang/srt/models/interns1.py +0 -1
  338. sglang/srt/models/kimi_vl.py +1 -7
  339. sglang/srt/models/kimi_vl_moonvit.py +4 -2
  340. sglang/srt/models/llama.py +6 -2
  341. sglang/srt/models/llama_eagle3.py +1 -1
  342. sglang/srt/models/longcat_flash.py +6 -23
  343. sglang/srt/models/longcat_flash_nextn.py +4 -15
  344. sglang/srt/models/mimo.py +2 -13
  345. sglang/srt/models/mimo_mtp.py +1 -2
  346. sglang/srt/models/minicpmo.py +7 -5
  347. sglang/srt/models/mixtral.py +1 -4
  348. sglang/srt/models/mllama.py +1 -1
  349. sglang/srt/models/mllama4.py +27 -6
  350. sglang/srt/models/nemotron_h.py +511 -0
  351. sglang/srt/models/olmo2.py +31 -4
  352. sglang/srt/models/opt.py +5 -5
  353. sglang/srt/models/phi.py +1 -1
  354. sglang/srt/models/phi4mm.py +1 -1
  355. sglang/srt/models/phimoe.py +0 -1
  356. sglang/srt/models/pixtral.py +0 -3
  357. sglang/srt/models/points_v15_chat.py +186 -0
  358. sglang/srt/models/qwen.py +0 -1
  359. sglang/srt/models/qwen2.py +0 -7
  360. sglang/srt/models/qwen2_5_vl.py +5 -5
  361. sglang/srt/models/qwen2_audio.py +2 -15
  362. sglang/srt/models/qwen2_moe.py +70 -4
  363. sglang/srt/models/qwen2_vl.py +6 -3
  364. sglang/srt/models/qwen3.py +18 -3
  365. sglang/srt/models/qwen3_moe.py +50 -38
  366. sglang/srt/models/qwen3_next.py +43 -21
  367. sglang/srt/models/qwen3_next_mtp.py +3 -4
  368. sglang/srt/models/qwen3_omni_moe.py +661 -0
  369. sglang/srt/models/qwen3_vl.py +791 -0
  370. sglang/srt/models/qwen3_vl_moe.py +343 -0
  371. sglang/srt/models/registry.py +15 -3
  372. sglang/srt/models/roberta.py +55 -3
  373. sglang/srt/models/sarashina2_vision.py +268 -0
  374. sglang/srt/models/solar.py +505 -0
  375. sglang/srt/models/starcoder2.py +357 -0
  376. sglang/srt/models/step3_vl.py +3 -5
  377. sglang/srt/models/torch_native_llama.py +9 -2
  378. sglang/srt/models/utils.py +61 -0
  379. sglang/srt/multimodal/processors/base_processor.py +21 -9
  380. sglang/srt/multimodal/processors/deepseek_ocr.py +37 -0
  381. sglang/srt/multimodal/processors/deepseek_vl_v2.py +0 -3
  382. sglang/srt/multimodal/processors/dots_vlm.py +2 -4
  383. sglang/srt/multimodal/processors/glm4v.py +1 -5
  384. sglang/srt/multimodal/processors/internvl.py +20 -10
  385. sglang/srt/multimodal/processors/janus_pro.py +0 -1
  386. sglang/srt/multimodal/processors/mllama4.py +0 -8
  387. sglang/srt/multimodal/processors/phi4mm.py +0 -1
  388. sglang/srt/multimodal/processors/points_v15_chat.py +52 -0
  389. sglang/srt/multimodal/processors/qwen_vl.py +83 -17
  390. sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
  391. sglang/srt/multimodal/processors/step3_vl.py +1 -1
  392. sglang/srt/parser/conversation.py +41 -0
  393. sglang/srt/parser/jinja_template_utils.py +6 -0
  394. sglang/srt/parser/reasoning_parser.py +0 -1
  395. sglang/srt/sampling/custom_logit_processor.py +77 -2
  396. sglang/srt/sampling/sampling_batch_info.py +36 -23
  397. sglang/srt/sampling/sampling_params.py +75 -0
  398. sglang/srt/server_args.py +1300 -338
  399. sglang/srt/server_args_config_parser.py +146 -0
  400. sglang/srt/single_batch_overlap.py +161 -0
  401. sglang/srt/speculative/base_spec_worker.py +34 -0
  402. sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
  403. sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
  404. sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
  405. sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
  406. sglang/srt/speculative/cpp_ngram/param.h +125 -0
  407. sglang/srt/speculative/cpp_ngram/queue.h +71 -0
  408. sglang/srt/speculative/draft_utils.py +226 -0
  409. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +26 -8
  410. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +26 -3
  411. sglang/srt/speculative/eagle_info.py +786 -0
  412. sglang/srt/speculative/eagle_info_v2.py +458 -0
  413. sglang/srt/speculative/eagle_utils.py +113 -1270
  414. sglang/srt/speculative/eagle_worker.py +120 -285
  415. sglang/srt/speculative/eagle_worker_v2.py +702 -0
  416. sglang/srt/speculative/ngram_info.py +433 -0
  417. sglang/srt/speculative/ngram_worker.py +246 -0
  418. sglang/srt/speculative/spec_info.py +49 -0
  419. sglang/srt/speculative/spec_utils.py +641 -0
  420. sglang/srt/speculative/standalone_worker.py +4 -14
  421. sglang/srt/tokenizer/tiktoken_tokenizer.py +2 -2
  422. sglang/srt/tracing/trace.py +32 -6
  423. sglang/srt/two_batch_overlap.py +35 -18
  424. sglang/srt/utils/__init__.py +2 -0
  425. sglang/srt/{bench_utils.py → utils/bench_utils.py} +4 -2
  426. sglang/srt/{utils.py → utils/common.py} +583 -113
  427. sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +86 -19
  428. sglang/srt/{host_shared_memory.py → utils/host_shared_memory.py} +0 -1
  429. sglang/srt/{offloader.py → utils/offloader.py} +4 -4
  430. sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
  431. sglang/srt/utils/profile_merger.py +199 -0
  432. sglang/srt/utils/rpd_utils.py +452 -0
  433. sglang/srt/utils/slow_rank_detector.py +71 -0
  434. sglang/srt/{torch_memory_saver_adapter.py → utils/torch_memory_saver_adapter.py} +5 -7
  435. sglang/srt/warmup.py +8 -4
  436. sglang/srt/weight_sync/utils.py +1 -1
  437. sglang/test/attention/test_flashattn_backend.py +1 -1
  438. sglang/test/attention/test_flashattn_mla_backend.py +0 -1
  439. sglang/test/attention/test_prefix_chunk_info.py +0 -2
  440. sglang/test/attention/test_trtllm_mla_backend.py +221 -53
  441. sglang/test/few_shot_gsm8k_engine.py +2 -4
  442. sglang/test/get_logits_ut.py +57 -0
  443. sglang/test/kit_matched_stop.py +157 -0
  444. sglang/test/longbench_v2/__init__.py +1 -0
  445. sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
  446. sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
  447. sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
  448. sglang/test/run_eval.py +120 -11
  449. sglang/test/runners.py +3 -1
  450. sglang/test/send_one.py +42 -7
  451. sglang/test/simple_eval_common.py +8 -2
  452. sglang/test/simple_eval_gpqa.py +0 -1
  453. sglang/test/simple_eval_humaneval.py +0 -3
  454. sglang/test/simple_eval_longbench_v2.py +344 -0
  455. sglang/test/simple_eval_mmmu_vlm.py +441 -0
  456. sglang/test/test_block_fp8.py +3 -4
  457. sglang/test/test_block_fp8_deep_gemm_blackwell.py +0 -1
  458. sglang/test/test_cutlass_moe.py +1 -2
  459. sglang/test/test_cutlass_w4a8_moe.py +10 -20
  460. sglang/test/test_deterministic.py +430 -0
  461. sglang/test/test_deterministic_utils.py +73 -0
  462. sglang/test/test_disaggregation_utils.py +93 -1
  463. sglang/test/test_marlin_moe.py +0 -1
  464. sglang/test/test_programs.py +1 -1
  465. sglang/test/test_utils.py +432 -16
  466. sglang/utils.py +10 -1
  467. sglang/version.py +1 -1
  468. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/METADATA +64 -43
  469. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/RECORD +476 -346
  470. sglang/srt/entrypoints/grpc_request_manager.py +0 -580
  471. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +0 -32
  472. sglang/srt/managers/tp_worker_overlap_thread.py +0 -319
  473. sglang/srt/mem_cache/lora_radix_cache.py +0 -421
  474. sglang/srt/speculative/build_eagle_tree.py +0 -427
  475. sglang/test/test_block_fp8_ep.py +0 -358
  476. /sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/__init__.py +0 -0
  477. /sglang/srt/{remote_instance_weight_loader_utils.py → model_loader/remote_instance_weight_loader_utils.py} +0 -0
  478. /sglang/srt/{aio_rwlock.py → utils/aio_rwlock.py} +0 -0
  479. /sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
  480. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/WHEEL +0 -0
  481. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/licenses/LICENSE +0 -0
  482. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/top_level.txt +0 -0

sglang/srt/multimodal/processors/internvl.py

@@ -1,11 +1,11 @@
 # Adapted from https://huggingface.co/OpenGVLab/InternVL2-4B/blob/main/modeling_intern_vit.py
 
+from functools import lru_cache
+
 import numpy as np
 import torch
-import torchvision.transforms as T
 from decord import VideoReader, cpu, gpu
 from PIL import Image
-from torchvision.transforms import InterpolationMode
 
 from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
 from sglang.srt.models.interns1 import InternS1ForConditionalGeneration
@@ -19,6 +19,20 @@ from sglang.srt.multimodal.processors.base_processor import (
 class InternVLImageProcessor(BaseMultimodalProcessor):
     models = [InternVLChatModel, InternS1ForConditionalGeneration]
 
+    IMAGENET_MEAN = [0.485, 0.456, 0.406]
+    IMAGENET_STD = [0.229, 0.224, 0.225]
+
+    @staticmethod
+    @lru_cache(maxsize=1)
+    def _get_normalize_tensors(device="cuda", dtype=torch.float32):
+        mean = torch.tensor(
+            InternVLImageProcessor.IMAGENET_MEAN, device=device, dtype=dtype
+        ).view(-1, 1, 1)
+        std = torch.tensor(
+            InternVLImageProcessor.IMAGENET_STD, device=device, dtype=dtype
+        ).view(-1, 1, 1)
+        return mean, std
+
     def __init__(self, hf_config, server_args, _image_processor, *args, **kwargs):
         super().__init__(hf_config, server_args, _image_processor, *args, **kwargs)
         image_size = (
@@ -88,6 +102,8 @@ class InternVLImageProcessor(BaseMultimodalProcessor):
             bound, fps, max_frame, first_idx=0, num_segments=num_segments
         )
 
+        mean, std = InternVLImageProcessor._get_normalize_tensors(device="cuda")
+
         for frame_index in frame_indices:
             # Load frame
             frame = vr[frame_index]
@@ -97,10 +113,6 @@ class InternVLImageProcessor(BaseMultimodalProcessor):
             img_np = frame.asnumpy()
             img = torch.from_numpy(img_np).permute(2, 0, 1).cuda().float() / 255.0
 
-            # Using the mean and variance of the ImageNet dataset for all input images can lead to accuracy issues, while using the mean and variance of each input image is a more accurate choice.
-            mean = img.mean(dim=[1, 2], keepdim=True)
-            # Prevent division by zero; clamp to minimum value of 1e-6
-            std = img.std(dim=[1, 2], keepdim=True).clamp(min=1e-6)
             img = (img - mean) / std
 
             tiles = InternVLImageProcessor.dynamic_preprocess(
@@ -188,6 +200,8 @@ class InternVLImageProcessor(BaseMultimodalProcessor):
         num_patches_list = []
         pixel_values = []
 
+        mean, std = InternVLImageProcessor._get_normalize_tensors(device="cuda")
+
         # Process each input with allocated frames
         for image_index, image in enumerate(base_output.images):
             try:
@@ -201,10 +215,6 @@ class InternVLImageProcessor(BaseMultimodalProcessor):
                 else:
                     tensor = image.cuda()  # assume already tensor
 
-                # Using the mean and variance of the ImageNet dataset for all input images can lead to accuracy issues, while using the mean and variance of each input image is a more accurate choice.
-                mean = tensor.mean(dim=[1, 2], keepdim=True)
-                # Prevent division by zero; clamp to minimum value of 1e-6
-                std = tensor.std(dim=[1, 2], keepdim=True).clamp(min=1e-6)
                 tensor = (tensor - mean) / std
                 tiles = self.dynamic_preprocess(
                     tensor, image_size=448, max_num=12, use_thumbnail=True
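
The internvl.py hunks replace per-image mean/std (recomputed for every frame and tile) with fixed ImageNet statistics built once and cached. A minimal standalone sketch of that cached-normalization pattern, using CPU tensors in place of the diff's CUDA ones:

# Minimal CPU sketch of the cached-normalization pattern adopted above.
from functools import lru_cache

import torch

IMAGENET_MEAN = [0.485, 0.456, 0.406]
IMAGENET_STD = [0.229, 0.224, 0.225]

@lru_cache(maxsize=1)
def get_normalize_tensors(device="cpu", dtype=torch.float32):
    # Built once per (device, dtype) and reused for every frame and tile.
    mean = torch.tensor(IMAGENET_MEAN, device=device, dtype=dtype).view(-1, 1, 1)
    std = torch.tensor(IMAGENET_STD, device=device, dtype=dtype).view(-1, 1, 1)
    return mean, std

img = torch.rand(3, 448, 448)  # fake RGB image already scaled to [0, 1]
mean, std = get_normalize_tensors()
img = (img - mean) / std  # same broadcasted normalization as the diff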

sglang/srt/multimodal/processors/janus_pro.py

@@ -1,6 +1,5 @@
 from typing import List, Union
 
-from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
 from sglang.srt.models.deepseek_janus_pro import MultiModalityCausalLM
 from sglang.srt.multimodal.processors.base_processor import (
     BaseMultimodalProcessor,

sglang/srt/multimodal/processors/mllama4.py

@@ -1,13 +1,5 @@
 from typing import List, Union
 
-import torch
-from transformers.image_utils import SizeDict
-from transformers.models.llama4.image_processing_llama4_fast import (
-    find_supported_resolutions,
-    get_best_fit,
-)
-
-from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
 from sglang.srt.models.mllama4 import Llama4ForConditionalGeneration
 from sglang.srt.multimodal.processors.base_processor import (
     BaseMultimodalProcessor,

sglang/srt/multimodal/processors/phi4mm.py

@@ -3,7 +3,6 @@ from typing import List, Union
 
 from transformers.processing_utils import ProcessorMixin
 
-from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
 from sglang.srt.models.phi4mm import Phi4MMForCausalLM
 from sglang.srt.multimodal.processors.base_processor import (
     BaseMultimodalProcessor,

sglang/srt/multimodal/processors/points_v15_chat.py (new file)

@@ -0,0 +1,52 @@
+# Copy from qwen_vl.py, adapted for points-v15-chat
+
+import asyncio
+from typing import List, Union
+
+from PIL import Image
+
+from sglang.srt.models.points_v15_chat import POINTSV15ChatModel
+from sglang.srt.multimodal.processors.qwen_vl import (
+    Qwen2_5VLImageProcessor,
+    resize_image_async,
+)
+
+
+class POINTSV15ChatProcessor(Qwen2_5VLImageProcessor):
+    models = [POINTSV15ChatModel]
+
+    def __init__(self, hf_config, server_args, _processor, *args, **kwargs):
+        # Compatible with POINTSV15Chat
+        hf_config.vision_start_token_id = None
+        hf_config.vision_end_token_id = None
+        hf_config.video_token_id = None
+
+        super().__init__(hf_config, server_args, _processor, *args, **kwargs)
+
+    async def process_mm_data_async(
+        self,
+        image_data: List[Union[str, bytes]],
+        input_text,
+        request_obj,
+        *args,
+        **kwargs,
+    ):
+        base_output = self.load_mm_data(
+            prompt=input_text,
+            image_data=image_data,
+            multimodal_tokens=self.mm_tokens,
+        )
+
+        if base_output.images and isinstance(base_output.images[0], Image.Image):
+            resize_tasks = [resize_image_async(image) for image in base_output.images]
+            base_output.images = await asyncio.gather(*resize_tasks)
+
+        mm_items, input_ids, _ = self.process_and_combine_mm_data(
+            base_output, self.mm_tokens
+        )
+
+        return {
+            "input_ids": input_ids.tolist(),
+            "mm_items": mm_items,
+            "im_token_id": self.mm_tokens.image_token_id,
+        }

sglang/srt/multimodal/processors/qwen_vl.py

@@ -9,9 +9,13 @@ import torchvision
 from PIL import Image
 from torchvision.transforms import InterpolationMode
 
+from sglang.srt.environ import envs
 from sglang.srt.layers.rotary_embedding import MRotaryEmbedding
 from sglang.srt.models.qwen2_5_vl import Qwen2_5_VLForConditionalGeneration
 from sglang.srt.models.qwen2_vl import Qwen2VLForConditionalGeneration
+from sglang.srt.models.qwen3_omni_moe import Qwen3OmniMoeForConditionalGeneration
+from sglang.srt.models.qwen3_vl import Qwen3VLForConditionalGeneration
+from sglang.srt.models.qwen3_vl_moe import Qwen3VLMoeForConditionalGeneration
 from sglang.srt.multimodal.processors.base_processor import (
     BaseMultimodalProcessor as SGLangBaseProcessor,
 )
@@ -20,8 +24,14 @@ from sglang.utils import logger
 
 IMAGE_FACTOR = 28
 MIN_PIXELS = 4 * 28 * 28
-MAX_PIXELS = 16384 * 28 * 28
+MAX_PIXELS = envs.SGLANG_IMAGE_MAX_PIXELS.get()
 MAX_RATIO = 200
+RESIZE_RESAMPLE = getattr(Image, envs.SGLANG_RESIZE_RESAMPLE.get(), None)
+if envs.SGLANG_RESIZE_RESAMPLE.is_set() and RESIZE_RESAMPLE is None:
+    logger.warning(
+        f"Invalid RESIZE_RESAMPLE value: '{envs.SGLANG_RESIZE_RESAMPLE.get()}'. "
+        f"Ignoring and using default."
+    )
 VIDEO_TOTAL_PIXELS = int(
     float(os.environ.get("VIDEO_MAX_PIXELS", 128000 * 28 * 28 * 0.9))
 )
@@ -83,7 +93,7 @@
         min_pixels=min_pixels,
         max_pixels=max_pixels,
    )
-    image = image.resize((resized_width, resized_height))
+    image = image.resize((resized_width, resized_height), resample=RESIZE_RESAMPLE)
    return image
 
 
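
The constants hunk above routes MAX_PIXELS and the resize filter through environment variables. A standalone sketch of the resample lookup that reads os.environ directly rather than sglang's envs accessor (assuming the variable is named after the accessor; the rest is illustrative):

# Standalone sketch of the resample lookup, reading os.environ directly.
import os

from PIL import Image

# Assumption: the env var behind envs.SGLANG_RESIZE_RESAMPLE shares its name.
name = os.environ.get("SGLANG_RESIZE_RESAMPLE", "")  # e.g. "BICUBIC", "LANCZOS"
RESIZE_RESAMPLE = getattr(Image, name, None)
if name and RESIZE_RESAMPLE is None:
    print(f"Invalid RESIZE_RESAMPLE value: '{name}'. Ignoring and using default.")

img = Image.new("RGB", (640, 480))
# Pillow treats resample=None as "use the library default" filter,
# which is why an unset or invalid value can be passed straight through.
img = img.resize((448, 448), resample=RESIZE_RESAMPLE)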

sglang/srt/multimodal/processors/qwen_vl.py (continued)

@@ -204,20 +214,41 @@ async def preprocess_video(
         interpolation=InterpolationMode.BICUBIC,
         antialias=True,
     ).float()
-    return video
-
-
-# Compatible with Qwen2VL and Qwen2_5VL
-class Qwen2_5VLImageProcessor(SGLangBaseProcessor):
-    models = [Qwen2VLForConditionalGeneration, Qwen2_5_VLForConditionalGeneration]
+    video_metadata = {
+        "fps": video_fps,
+        "duration": total_frames / video_fps,
+        "total_num_frames": total_frames,
+        "frames_indices": idx,
+        "video_backend": "torchvision",
+    }
+    return video, video_metadata
+
+
+# Compatible with Qwen-VL & Qwen-Omni Series
+class QwenVLImageProcessor(SGLangBaseProcessor):
+    models = [
+        Qwen2VLForConditionalGeneration,
+        Qwen2_5_VLForConditionalGeneration,
+        Qwen3VLForConditionalGeneration,
+        Qwen3VLMoeForConditionalGeneration,
+        Qwen3OmniMoeForConditionalGeneration,
+    ]
 
     def __init__(self, hf_config, server_args, _processor, *args, **kwargs):
+        self.model_type = hf_config.model_type
+        if hf_config.model_type == "qwen3_omni_moe":
+            hf_config = hf_config.thinker_config
+
         super().__init__(hf_config, server_args, _processor, *args, **kwargs)
-        # The regex that matches expanded image tokens.
+
         self.IM_START_TOKEN_ID = hf_config.vision_start_token_id
         self.IM_END_TOKEN_ID = hf_config.vision_end_token_id
         self.vision_start_token_id = hf_config.vision_start_token_id
         self.vision_end_token_id = hf_config.vision_end_token_id
+
+        self.audio_start_token_id = getattr(hf_config, "audio_start_token_id", None)
+        self.audio_token_id = getattr(hf_config, "audio_token_id", None)
+
         self.NUM_TOKEN_PER_FRAME = 770
         self.IMAGE_FACTOR = 28
         self.MIN_PIXELS = 4 * 28 * 28
@@ -226,10 +257,12 @@ class Qwen2_5VLImageProcessor(SGLangBaseProcessor):
         self.mm_tokens = MultimodalSpecialTokens(
             image_token="<|vision_start|><|image_pad|><|vision_end|>",
             image_token_id=hf_config.image_token_id,
+            # The regex that matches expanded image tokens.
             image_token_regex=re.compile(
                 r"<\|vision_start\|>(?:<\|image_pad\|>)+<\|vision_end\|>"
             ),
             video_token_id=hf_config.video_token_id,
+            audio_token_id=self.audio_token_id,
         ).build(_processor)
 
     async def process_mm_data_async(
@@ -240,11 +273,11 @@ class Qwen2_5VLImageProcessor(SGLangBaseProcessor):
         *args,
         **kwargs,
     ):
-
         base_output = self.load_mm_data(
             prompt=input_text,
             image_data=image_data,
             video_data=request_obj.video_data,
+            audio_data=request_obj.audio_data,
             multimodal_tokens=self.mm_tokens,
         )
 
@@ -253,29 +286,61 @@ class Qwen2_5VLImageProcessor(SGLangBaseProcessor):
             resize_tasks = [resize_image_async(image) for image in base_output.images]
             base_output.images = await asyncio.gather(*resize_tasks)
 
+        video_metadata = None
         if base_output.videos:
-            base_output.videos = [
-                await preprocess_video(video) for video in base_output.videos
-            ]
+            video_results = await asyncio.gather(
+                *[preprocess_video(video) for video in base_output.videos]
+            )
+            base_output.videos, video_metadata = map(list, zip(*video_results))
+
+        # NOTE: for qwen3-vl, video_meta need to be passed in, since do_sample_frames is already done in preprocess_video
+        if self.hf_config.model_type in ("qwen3_vl", "qwen3_vl_moe"):
+            mm_items, input_ids, ret = self.process_and_combine_mm_data(
+                base_output,
+                self.mm_tokens,
+                video_metadata=video_metadata,
+                do_sample_frames=False,
+            )
+        else:
+            mm_items, input_ids, ret = self.process_and_combine_mm_data(
+                base_output, self.mm_tokens
+            )
+
+        audio_feature_lengths = None
 
-        mm_items, input_ids, ret = self.process_and_combine_mm_data(
-            base_output, self.mm_tokens
+        if self.model_type == "qwen3_omni_moe":
+            audio_item = next((mm for mm in mm_items if mm.is_audio()), None)
+            if audio_item:
+                audio_feature_lengths = torch.sum(
+                    audio_item.feature_attention_mask, dim=1
+                )
+
+        second_per_grid_ts = getattr(ret, "second_per_grid_ts", None) or getattr(
+            ret, "video_second_per_grid", None
         )
 
         input_ids = input_ids.flatten()
+
         mrope_positions, mrope_position_delta = MRotaryEmbedding.get_rope_index(
             spatial_merge_size=self.hf_config.vision_config.spatial_merge_size,
             image_token_id=self.mm_tokens.image_token_id,
             video_token_id=self.mm_tokens.video_token_id,
             vision_start_token_id=self.vision_start_token_id,
-            model_type=self.hf_config.model_type,
+            model_type=self.model_type,
             tokens_per_second=getattr(
                 self.hf_config.vision_config, "tokens_per_second", None
             ),
             input_ids=input_ids.unsqueeze(0),
             image_grid_thw=getattr(ret, "image_grid_thw", None),
             video_grid_thw=getattr(ret, "video_grid_thw", None),
-            second_per_grid_ts=getattr(ret, "second_per_grid_ts", None),
+            second_per_grid_ts=second_per_grid_ts,
+            use_audio_in_video=False,
+            audio_seqlens=audio_feature_lengths,
+            audio_token_id=getattr(self.hf_config, "audio_token_id", None),
+            audio_start_token_id=self.audio_start_token_id,
+            position_id_per_seconds=getattr(
+                self.hf_config, "position_id_per_seconds", None
+            ),
         )
         mrope_positions = mrope_positions.squeeze(1)
 
@@ -286,6 +351,7 @@ class Qwen2_5VLImageProcessor(SGLangBaseProcessor):
             "im_end_id": self.IM_END_TOKEN_ID,
             "im_token_id": self.mm_tokens.image_token_id,
             "video_token_id": self.mm_tokens.video_token_id,
+            "audio_token_id": self.mm_tokens.audio_token_id,
             "mrope_positions": mrope_positions,
             "mrope_position_delta": mrope_position_delta,
         }
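
preprocess_video now returns a (tensor, metadata) pair, and the caller runs all videos concurrently before splitting the pairs apart. A runnable sketch of that gather-then-unzip idiom with stand-in coroutines:

# Runnable sketch of the gather-then-unzip idiom with stand-in coroutines.
import asyncio

async def preprocess(video: str):
    # Stand-in for preprocess_video: returns a (tensor, metadata) pair.
    return f"tensor({video})", {"video_backend": "torchvision", "source": video}

async def main():
    videos = ["a.mp4", "b.mp4"]
    # All videos are preprocessed concurrently instead of awaited one by one.
    results = await asyncio.gather(*[preprocess(v) for v in videos])
    # zip(*pairs) splits [(t1, m1), (t2, m2)] into tensor and metadata lists.
    tensors, metadata = map(list, zip(*results))
    assert tensors == ["tensor(a.mp4)", "tensor(b.mp4)"]
    assert metadata[0]["video_backend"] == "torchvision"

asyncio.run(main())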

sglang/srt/multimodal/processors/sarashina2_vision.py (new file)

@@ -0,0 +1,81 @@
+from typing import List, Union
+
+from sglang.srt.models.sarashina2_vision import Sarashina2VisionForCausalLM
+from sglang.srt.multimodal.processors.base_processor import (
+    BaseMultimodalProcessor,
+    MultimodalSpecialTokens,
+)
+
+
+class Sarashina2VisionProcessor(BaseMultimodalProcessor):
+    models = [Sarashina2VisionForCausalLM]
+
+    def __init__(self, hf_config, server_args, _processor, *args, **kwargs):
+        super().__init__(hf_config, server_args, _processor, *args, **kwargs)
+
+        # Sarashina2Vision specific tokens (default is <|file|>)
+        self.IMAGE_TOKEN = "<|file|>"
+        self.IM_TOKEN_ID = getattr(hf_config, "image_token_index", 14)
+        self.IM_START_ID = getattr(hf_config, "start_image_token_index", 102397)
+        self.IM_END_ID = getattr(hf_config, "end_image_token_index", 102398)
+
+        self.mm_tokens = MultimodalSpecialTokens(
+            image_token=self.IMAGE_TOKEN,
+            image_token_id=self.IM_TOKEN_ID,
+        ).build(_processor)
+
+        # Patch the processor's image processor to handle parameter compatibility
+        if hasattr(_processor, "image_processor") and hasattr(
+            _processor.image_processor, "_preprocess"
+        ):
+            original_preprocess = _processor.image_processor._preprocess
+
+            def patched_preprocess(*args, **kwargs):
+                # Filter kwargs to only include parameters that the custom _preprocess method accepts
+                # Based on Sarashina2VisionImageProcessor._preprocess signature
+                allowed_params = {
+                    "do_resize",
+                    "resample",
+                    "do_rescale",
+                    "rescale_factor",
+                    "do_normalize",
+                    "image_mean",
+                    "image_std",
+                    "do_convert_rgb",
+                    "data_format",
+                    "input_data_format",
+                }
+                filtered_kwargs = {
+                    k: v for k, v in kwargs.items() if k in allowed_params
+                }
+                return original_preprocess(*args, **filtered_kwargs)
+
+            _processor.image_processor._preprocess = patched_preprocess
+
+    async def process_mm_data_async(
+        self,
+        image_data: List[Union[str, bytes]],
+        input_text,
+        request_obj,
+        *args,
+        **kwargs,
+    ):
+        """Process image data for Sarashina2Vision model using standard SGLang pattern."""
+        base_output = self.load_mm_data(
+            prompt=input_text,
+            image_data=image_data,
+            multimodal_tokens=self.mm_tokens,
+        )
+
+        mm_items, input_ids, ret = self.process_and_combine_mm_data(
+            base_output=base_output,
+            mm_tokens=self.mm_tokens,
+        )
+
+        return {
+            "mm_items": mm_items,
+            "input_ids": input_ids.tolist(),
+            "im_token_id": self.mm_tokens.image_token_id,
+            "im_start_id": self.IM_START_ID,
+            "im_end_id": self.IM_END_ID,
+        }
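
The constructor above monkey-patches the underlying image processor's _preprocess so that keyword arguments outside its signature are dropped before the call. A standalone sketch of the same technique; where the diff hardcodes the allowed names, this variant derives them from the wrapped signature with inspect:

# Standalone sketch of the kwarg-filtering patch; the allowed set is derived
# here instead of hardcoded, as an alternative design.
import inspect

class Processor:
    def _preprocess(self, do_resize=True, resample=None):
        return {"do_resize": do_resize, "resample": resample}

proc = Processor()
original = proc._preprocess
allowed = set(inspect.signature(original).parameters)

def patched(*args, **kwargs):
    # Unknown kwargs such as return_tensors are silently dropped.
    return original(*args, **{k: v for k, v in kwargs.items() if k in allowed})

proc._preprocess = patched
assert proc._preprocess(do_resize=False, return_tensors="pt") == {
    "do_resize": False,
    "resample": None,
}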

sglang/srt/multimodal/processors/step3_vl.py

@@ -1,7 +1,7 @@
 import math
 import re
 from itertools import product
-from typing import List, Literal, Optional, TypedDict, Union
+from typing import List, Optional, Union
 
 import numpy as np
 import torch

sglang/srt/parser/conversation.py

@@ -838,6 +838,19 @@ register_conv_template(
     )
 )
 
+register_conv_template(
+    Conversation(
+        name="deepseek-ocr",
+        system_message="",
+        system_template="",
+        roles=("", ""),
+        sep="",
+        sep_style=SeparatorStyle.NO_COLON_SINGLE,
+        stop_str=["<|end▁of▁sentence|>"],
+        image_token="<image>",
+    )
+)
+
 register_conv_template(
     Conversation(
         name="deepseek-vl2",
@@ -960,6 +973,19 @@ register_conv_template(
     )
 )
 
+register_conv_template(
+    Conversation(
+        name="points-v15-chat",
+        system_message="",
+        system_template="",
+        roles=("<|im_start|>user", "<|im_start|>assistant"),
+        sep="<|im_end|>\n",
+        sep_style=SeparatorStyle.ADD_NEW_LINE_SINGLE,
+        stop_str=["<|im_end|>"],
+        image_token="<|vision_start|><|image_pad|><|vision_end|>",
+        video_token="<|vision_start|><|video_pad|><|vision_end|>",
+    )
+)
 
 MODEL_TYPE_TO_TEMPLATE = {
     "internvl_chat": "internvl-2-5",
@@ -968,9 +994,16 @@ MODEL_TYPE_TO_TEMPLATE = {
     "phi4mm": "phi-4-mm",
     "minicpmv": "minicpmv",
     "minicpmo": "minicpmo",
+    "deepseek-ocr": "deepseek-ocr",
 }
 
 
+@register_conv_template_matching_function
+def match_points_v15_chat(model_path: str):
+    if re.search(r"points", model_path, re.IGNORECASE):
+        return "points-v15-chat"
+
+
 def get_model_type(model_path: str) -> Optional[str]:
     config_path = os.path.join(model_path, "config.json")
     if not os.path.exists(config_path):
@@ -1038,3 +1071,11 @@ def match_phi_4_mm(model_path: str):
         return "phi-4-mm"
     model_type = get_model_type(model_path)
     return MODEL_TYPE_TO_TEMPLATE.get(model_type)
+
+
+@register_conv_template_matching_function
+def match_deepseek_ocr(model_path: str):
+    if "deepseek-ocr" in model_path.lower():
+        return "deepseek-ocr"
+    model_type = get_model_type(model_path)
+    return MODEL_TYPE_TO_TEMPLATE.get(model_type)
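
conversation.py pairs a name-keyed template registry with matcher functions that are tried against the model path. A self-contained sketch of that registry/matcher pattern (simplified stand-ins, not sglang's actual Conversation API; the model path is hypothetical):

# Self-contained sketch of the registry/matcher pattern (not sglang's API).
import re
from typing import Callable, Optional

TEMPLATES: dict = {}
MATCHERS: list = []

def register_conv_template(name: str, **fields) -> None:
    # Store the template under its name, as register_conv_template does.
    TEMPLATES[name] = fields

def register_conv_template_matching_function(fn: Callable) -> Callable:
    MATCHERS.append(fn)
    return fn

register_conv_template("points-v15-chat", stop_str=["<|im_end|>"])

@register_conv_template_matching_function
def match_points_v15_chat(model_path: str) -> Optional[str]:
    if re.search(r"points", model_path, re.IGNORECASE):
        return "points-v15-chat"
    return None

def resolve_template(model_path: str) -> Optional[dict]:
    # Matchers are tried in registration order; the first hit wins.
    for match in MATCHERS:
        name = match(model_path)
        if name is not None:
            return TEMPLATES[name]
    return None

assert resolve_template("my-org/points-v15-chat-demo") == {"stop_str": ["<|im_end|>"]}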

sglang/srt/parser/jinja_template_utils.py

@@ -89,6 +89,12 @@ def detect_jinja_template_content_format(chat_template: str) -> str:
     - If template has loops like {%- for content in message['content'] -%} → 'openai'
     - Otherwise → 'string'
     """
+    # Shortcut for multimodal templates
+    if any(
+        keyword in chat_template for keyword in ["image", "audio", "video", "vision"]
+    ):
+        return "openai"
+
     jinja_ast = _try_extract_ast(chat_template)
     if jinja_ast is None:
         return "string"

sglang/srt/parser/reasoning_parser.py

@@ -1,4 +1,3 @@
-import re
 from typing import Dict, Optional, Tuple, Type
 
 from sglang.srt.parser.harmony_parser import HarmonyParser

sglang/srt/sampling/custom_logit_processor.py

@@ -1,18 +1,22 @@
 import json
 from abc import ABC, abstractmethod
 from functools import lru_cache
-from typing import Any, Dict, List, Optional
+from typing import TYPE_CHECKING, Any, Dict, List, Optional
 
 import dill
+import orjson
 import torch
 
+if TYPE_CHECKING:
+    from sglang.srt.managers.schedule_batch import Req
+
 
 @lru_cache(maxsize=None)
 def _cache_from_str(json_str: str):
     """Deserialize a json string to a Callable object.
     This function is cached to avoid redundant deserialization.
     """
-    data = json.loads(json_str)
+    data = orjson.loads(json_str)
     return dill.loads(bytes.fromhex(data["callable"]))
 
 
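
_cache_from_str is the receiving half of a round-trip in which a callable is dill-pickled, hex-encoded, and wrapped in JSON; lru_cache makes repeated requests with the same payload free. A runnable sketch of that path (the to_str helper here mirrors the producer side and is illustrative):

# Runnable sketch of the serialize/deserialize path above.
from functools import lru_cache

import dill
import orjson

def to_str(fn) -> str:
    # dill handles lambdas/closures that the stdlib pickle rejects.
    return orjson.dumps({"callable": dill.dumps(fn).hex()}).decode()

@lru_cache(maxsize=None)
def from_str(json_str: str):
    data = orjson.loads(json_str)
    return dill.loads(bytes.fromhex(data["callable"]))

payload = to_str(lambda x: x * 2)
assert from_str(payload)(21) == 42
assert from_str(payload) is from_str(payload)  # second call is a cache hit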

sglang/srt/sampling/custom_logit_processor.py (continued)

@@ -51,3 +55,74 @@ class DisallowedTokensLogitsProcessor(CustomLogitProcessor):
         ), f"{custom_param_list=}"
         logits[..., disallowed_token_ids] = -float("inf")
         return logits
+
+
+class ThinkingBudgetLogitProcessor(CustomLogitProcessor):
+    """A logit processor that controls the length of thinking."""
+
+    THINKING_START_TOKEN_ID: int
+    THINKING_END_TOKEN_ID: int
+    NEW_LINE_TOKEN_ID: int
+
+    def __call__(self, logits, custom_param_list: list[dict[str, Any]]):
+        if custom_param_list is None or not custom_param_list:
+            return logits
+        for i, param_dict in enumerate(custom_param_list):
+            if param_dict is None:
+                continue
+
+            thinking_budget: int | None = param_dict.get("thinking_budget")
+
+            # Skip if thinking_budget is unset, or not an integer, or negative
+            if (
+                thinking_budget is None
+                or not isinstance(thinking_budget, int)
+                or thinking_budget < 0
+            ):
+                continue
+            req: Req = param_dict.get("__req__")
+            cur_ids: list[int] = [*req.origin_input_ids, *req.output_ids]
+
+            # Check if out of thinking stage
+            if (
+                self.THINKING_START_TOKEN_ID not in cur_ids
+                or self.THINKING_END_TOKEN_ID in cur_ids
+            ):
+                continue
+
+            # Find the index of the thinking start token
+            start_index = cur_ids.index(self.THINKING_START_TOKEN_ID)
+
+            # Count the number of tokens after the thinking start token
+            num_tokens_after_start = len(cur_ids) - start_index - 1
+
+            if num_tokens_after_start < thinking_budget:
+                continue
+
+            # Ensure new line token before thinking end token
+            if not req.output_ids or req.output_ids[-1] != self.NEW_LINE_TOKEN_ID:
+                logits[i, :] = -float("inf")
+                logits[i, self.NEW_LINE_TOKEN_ID] = 0.0
+                continue
+
+            # Assign highest probability to the thinking end token
+            logits[i, :] = -float("inf")
+            logits[i, self.THINKING_END_TOKEN_ID] = 0.0
+
+        return logits
+
+
+class Qwen3ThinkingBudgetLogitProcessor(ThinkingBudgetLogitProcessor):
+    """A logit processor that controls the length of thinking for Qwen3 models."""
+
+    THINKING_START_TOKEN_ID: int = 151667
+    THINKING_END_TOKEN_ID: int = 151668
+    NEW_LINE_TOKEN_ID: int = 198
+
+
+class DeepSeekR1ThinkingBudgetLogitProcessor(ThinkingBudgetLogitProcessor):
+    """A logit processor that controls the length of thinking for DeepSeek-R1 models."""
+
+    THINKING_START_TOKEN_ID: int = 128798
+    THINKING_END_TOKEN_ID: int = 128799
+    NEW_LINE_TOKEN_ID: int = 201
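
Once a request's thinking budget is exhausted, the processor masks every logit except a forced newline, then on the following step forces the think-end token. A standalone sketch of one forcing step, with hypothetical token ids and a stand-in for sglang's Req:

# Standalone sketch of one forcing step; token ids and FakeReq are stand-ins.
from dataclasses import dataclass, field

import torch

START, END, NL = 1000, 1001, 198  # hypothetical think-start/end and newline ids

@dataclass
class FakeReq:
    origin_input_ids: list
    output_ids: list = field(default_factory=list)

# Three tokens have been emitted since START, the budget is three, and the
# last token is not a newline, so the newline must be forced first.
req = FakeReq(origin_input_ids=[5, START], output_ids=[7, 8, 9])
logits = torch.zeros(1, 32000)
budget = 3

cur_ids = [*req.origin_input_ids, *req.output_ids]
if START in cur_ids and END not in cur_ids:
    emitted = len(cur_ids) - cur_ids.index(START) - 1
    if emitted >= budget:
        forced = NL if (not req.output_ids or req.output_ids[-1] != NL) else END
        logits[0, :] = -float("inf")
        logits[0, forced] = 0.0

assert logits[0, NL] == 0.0 and torch.isinf(logits[0, END])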