sglang-0.5.3rc2-py3-none-any.whl → sglang-0.5.4-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (408)
  1. sglang/bench_one_batch.py +47 -28
  2. sglang/bench_one_batch_server.py +41 -25
  3. sglang/bench_serving.py +330 -156
  4. sglang/check_env.py +1 -1
  5. sglang/compile_deep_gemm.py +6 -2
  6. sglang/global_config.py +1 -25
  7. sglang/lang/api.py +6 -0
  8. sglang/lang/interpreter.py +1 -0
  9. sglang/lang/ir.py +13 -0
  10. sglang/launch_server.py +8 -15
  11. sglang/profiler.py +18 -1
  12. sglang/srt/_custom_ops.py +1 -1
  13. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +4 -6
  14. sglang/srt/checkpoint_engine/checkpoint_engine_worker.py +142 -0
  15. sglang/srt/compilation/backend.py +437 -0
  16. sglang/srt/compilation/compilation_config.py +20 -0
  17. sglang/srt/compilation/compilation_counter.py +47 -0
  18. sglang/srt/compilation/compile.py +210 -0
  19. sglang/srt/compilation/compiler_interface.py +503 -0
  20. sglang/srt/compilation/cuda_piecewise_backend.py +228 -0
  21. sglang/srt/compilation/fix_functionalization.py +134 -0
  22. sglang/srt/compilation/fx_utils.py +83 -0
  23. sglang/srt/compilation/inductor_pass.py +140 -0
  24. sglang/srt/compilation/pass_manager.py +66 -0
  25. sglang/srt/compilation/piecewise_context_manager.py +40 -0
  26. sglang/srt/compilation/weak_ref_tensor_jit.py +16 -0
  27. sglang/srt/configs/__init__.py +4 -0
  28. sglang/srt/configs/deepseek_ocr.py +262 -0
  29. sglang/srt/configs/deepseekvl2.py +194 -96
  30. sglang/srt/configs/dots_vlm.py +2 -7
  31. sglang/srt/configs/falcon_h1.py +13 -64
  32. sglang/srt/configs/load_config.py +25 -2
  33. sglang/srt/configs/mamba_utils.py +117 -0
  34. sglang/srt/configs/model_config.py +134 -23
  35. sglang/srt/configs/modelopt_config.py +30 -0
  36. sglang/srt/configs/nemotron_h.py +286 -0
  37. sglang/srt/configs/olmo3.py +105 -0
  38. sglang/srt/configs/points_v15_chat.py +29 -0
  39. sglang/srt/configs/qwen3_next.py +11 -47
  40. sglang/srt/configs/qwen3_omni.py +613 -0
  41. sglang/srt/configs/qwen3_vl.py +0 -10
  42. sglang/srt/connector/remote_instance.py +1 -1
  43. sglang/srt/constrained/base_grammar_backend.py +5 -1
  44. sglang/srt/constrained/llguidance_backend.py +5 -0
  45. sglang/srt/constrained/outlines_backend.py +1 -1
  46. sglang/srt/constrained/reasoner_grammar_backend.py +9 -6
  47. sglang/srt/constrained/utils.py +12 -0
  48. sglang/srt/constrained/xgrammar_backend.py +20 -11
  49. sglang/srt/disaggregation/ascend/transfer_engine.py +1 -1
  50. sglang/srt/disaggregation/base/conn.py +17 -4
  51. sglang/srt/disaggregation/common/conn.py +4 -2
  52. sglang/srt/disaggregation/decode.py +123 -31
  53. sglang/srt/disaggregation/decode_kvcache_offload_manager.py +1 -1
  54. sglang/srt/disaggregation/fake/conn.py +11 -3
  55. sglang/srt/disaggregation/mooncake/conn.py +157 -19
  56. sglang/srt/disaggregation/nixl/conn.py +69 -24
  57. sglang/srt/disaggregation/prefill.py +96 -270
  58. sglang/srt/distributed/device_communicators/all_reduce_utils.py +4 -4
  59. sglang/srt/distributed/device_communicators/custom_all_reduce.py +6 -6
  60. sglang/srt/distributed/device_communicators/pymscclpp.py +2 -2
  61. sglang/srt/distributed/device_communicators/pynccl.py +24 -12
  62. sglang/srt/distributed/device_communicators/pynccl_allocator.py +2 -2
  63. sglang/srt/distributed/device_communicators/symm_mem.py +1 -1
  64. sglang/srt/distributed/naive_distributed.py +5 -4
  65. sglang/srt/distributed/parallel_state.py +70 -19
  66. sglang/srt/elastic_ep/elastic_ep.py +74 -0
  67. sglang/srt/entrypoints/context.py +3 -2
  68. sglang/srt/entrypoints/engine.py +66 -66
  69. sglang/srt/entrypoints/grpc_server.py +431 -234
  70. sglang/srt/entrypoints/harmony_utils.py +2 -2
  71. sglang/srt/entrypoints/http_server.py +120 -8
  72. sglang/srt/entrypoints/http_server_engine.py +1 -7
  73. sglang/srt/entrypoints/openai/protocol.py +225 -37
  74. sglang/srt/entrypoints/openai/serving_base.py +49 -2
  75. sglang/srt/entrypoints/openai/serving_chat.py +29 -74
  76. sglang/srt/entrypoints/openai/serving_classify.py +204 -0
  77. sglang/srt/entrypoints/openai/serving_completions.py +15 -1
  78. sglang/srt/entrypoints/openai/serving_responses.py +5 -2
  79. sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
  80. sglang/srt/environ.py +42 -4
  81. sglang/srt/eplb/eplb_algorithms/__init__.py +18 -1
  82. sglang/srt/eplb/eplb_algorithms/deepseek.py +0 -2
  83. sglang/srt/eplb/eplb_algorithms/elasticity_aware.py +87 -0
  84. sglang/srt/eplb/expert_distribution.py +3 -4
  85. sglang/srt/eplb/expert_location_dispatch.py +2 -2
  86. sglang/srt/eplb/expert_location_updater.py +2 -2
  87. sglang/srt/function_call/base_format_detector.py +17 -18
  88. sglang/srt/function_call/function_call_parser.py +18 -14
  89. sglang/srt/function_call/glm4_moe_detector.py +1 -5
  90. sglang/srt/function_call/gpt_oss_detector.py +1 -1
  91. sglang/srt/function_call/json_array_parser.py +0 -2
  92. sglang/srt/function_call/utils.py +2 -2
  93. sglang/srt/grpc/compile_proto.py +3 -3
  94. sglang/srt/{entrypoints → grpc}/grpc_request_manager.py +112 -52
  95. sglang/srt/grpc/health_servicer.py +189 -0
  96. sglang/srt/grpc/scheduler_launcher.py +181 -0
  97. sglang/srt/grpc/sglang_scheduler_pb2.py +78 -70
  98. sglang/srt/grpc/sglang_scheduler_pb2.pyi +66 -10
  99. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +89 -1
  100. sglang/srt/layers/activation.py +4 -1
  101. sglang/srt/layers/attention/aiter_backend.py +3 -3
  102. sglang/srt/layers/attention/ascend_backend.py +17 -1
  103. sglang/srt/layers/attention/attention_registry.py +43 -23
  104. sglang/srt/layers/attention/base_attn_backend.py +20 -1
  105. sglang/srt/layers/attention/double_sparsity_backend.py +2 -2
  106. sglang/srt/layers/attention/fla/chunk.py +0 -1
  107. sglang/srt/layers/attention/fla/chunk_o.py +1 -1
  108. sglang/srt/layers/attention/fla/index.py +0 -2
  109. sglang/srt/layers/attention/fla/layernorm_gated.py +50 -32
  110. sglang/srt/layers/attention/fla/utils.py +0 -3
  111. sglang/srt/layers/attention/fla/wy_fast.py +0 -2
  112. sglang/srt/layers/attention/flashattention_backend.py +12 -8
  113. sglang/srt/layers/attention/flashinfer_backend.py +248 -21
  114. sglang/srt/layers/attention/flashinfer_mla_backend.py +20 -18
  115. sglang/srt/layers/attention/flashmla_backend.py +2 -2
  116. sglang/srt/layers/attention/hybrid_attn_backend.py +1 -1
  117. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +165 -62
  118. sglang/srt/layers/attention/intel_amx_backend.py +1 -1
  119. sglang/srt/layers/attention/mamba/causal_conv1d.py +1 -1
  120. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +9 -5
  121. sglang/srt/layers/attention/mamba/mamba.py +189 -241
  122. sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
  123. sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
  124. sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +0 -50
  125. sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +0 -60
  126. sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +0 -111
  127. sglang/srt/layers/attention/mamba/ops/ssd_combined.py +0 -1
  128. sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +0 -11
  129. sglang/srt/layers/attention/npu_ops/mla_preprocess.py +1 -1
  130. sglang/srt/layers/attention/nsa/nsa_indexer.py +40 -83
  131. sglang/srt/layers/attention/nsa/triton_kernel.py +136 -0
  132. sglang/srt/layers/attention/nsa/utils.py +0 -1
  133. sglang/srt/layers/attention/nsa_backend.py +404 -90
  134. sglang/srt/layers/attention/triton_backend.py +208 -34
  135. sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +2 -2
  136. sglang/srt/layers/attention/triton_ops/extend_attention.py +539 -44
  137. sglang/srt/layers/attention/trtllm_mha_backend.py +2 -2
  138. sglang/srt/layers/attention/trtllm_mla_backend.py +361 -30
  139. sglang/srt/layers/attention/utils.py +11 -7
  140. sglang/srt/layers/attention/vision.py +3 -3
  141. sglang/srt/layers/attention/xpu_backend.py +1028 -0
  142. sglang/srt/layers/communicator.py +11 -7
  143. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/compile_utils.py +4 -8
  144. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/configurer.py +4 -3
  145. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/entrypoint.py +3 -3
  146. sglang/srt/layers/dp_attention.py +17 -0
  147. sglang/srt/layers/layernorm.py +45 -15
  148. sglang/srt/layers/linear.py +9 -1
  149. sglang/srt/layers/logits_processor.py +147 -17
  150. sglang/srt/layers/modelopt_utils.py +11 -0
  151. sglang/srt/layers/moe/cutlass_moe.py +0 -2
  152. sglang/srt/layers/moe/cutlass_w4a8_moe.py +213 -21
  153. sglang/srt/layers/moe/ep_moe/kernels.py +35 -457
  154. sglang/srt/layers/moe/ep_moe/layer.py +119 -397
  155. sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +1 -1
  156. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  157. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_B200.json +146 -0
  158. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +11 -3
  159. sglang/srt/layers/moe/fused_moe_triton/layer.py +76 -70
  160. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +18 -42
  161. sglang/srt/layers/moe/moe_runner/deep_gemm.py +304 -0
  162. sglang/srt/layers/moe/moe_runner/runner.py +3 -0
  163. sglang/srt/layers/moe/moe_runner/triton.py +3 -1
  164. sglang/srt/layers/moe/rocm_moe_utils.py +0 -1
  165. sglang/srt/layers/moe/router.py +51 -15
  166. sglang/srt/layers/moe/token_dispatcher/__init__.py +10 -0
  167. sglang/srt/layers/moe/token_dispatcher/base.py +1 -1
  168. sglang/srt/layers/moe/token_dispatcher/deepep.py +110 -97
  169. sglang/srt/layers/moe/token_dispatcher/mooncake.py +386 -0
  170. sglang/srt/layers/moe/token_dispatcher/standard.py +46 -0
  171. sglang/srt/layers/moe/topk.py +3 -2
  172. sglang/srt/layers/moe/utils.py +17 -1
  173. sglang/srt/layers/quantization/__init__.py +2 -53
  174. sglang/srt/layers/quantization/awq.py +183 -6
  175. sglang/srt/layers/quantization/awq_triton.py +29 -0
  176. sglang/srt/layers/quantization/base_config.py +20 -1
  177. sglang/srt/layers/quantization/compressed_tensors/__init__.py +7 -0
  178. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +20 -49
  179. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +421 -70
  180. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +3 -0
  181. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +4 -22
  182. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +339 -0
  183. sglang/srt/layers/quantization/fp8.py +84 -18
  184. sglang/srt/layers/quantization/fp8_kernel.py +55 -10
  185. sglang/srt/layers/quantization/fp8_utils.py +42 -14
  186. sglang/srt/layers/quantization/fpgemm_fp8.py +2 -3
  187. sglang/srt/layers/quantization/gptq.py +0 -1
  188. sglang/srt/layers/quantization/int8_kernel.py +18 -2
  189. sglang/srt/layers/quantization/marlin_utils.py +12 -0
  190. sglang/srt/layers/quantization/modelopt_quant.py +125 -100
  191. sglang/srt/layers/quantization/mxfp4.py +5 -30
  192. sglang/srt/layers/quantization/petit.py +1 -1
  193. sglang/srt/layers/quantization/quark/quark.py +3 -1
  194. sglang/srt/layers/quantization/quark/quark_moe.py +3 -3
  195. sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +0 -7
  196. sglang/srt/layers/quantization/unquant.py +1 -4
  197. sglang/srt/layers/quantization/utils.py +0 -1
  198. sglang/srt/layers/quantization/w4afp8.py +51 -20
  199. sglang/srt/layers/quantization/w8a8_int8.py +30 -24
  200. sglang/srt/layers/radix_attention.py +59 -9
  201. sglang/srt/layers/rotary_embedding.py +673 -16
  202. sglang/srt/layers/sampler.py +36 -16
  203. sglang/srt/layers/sparse_pooler.py +98 -0
  204. sglang/srt/layers/utils.py +0 -1
  205. sglang/srt/layers/vocab_parallel_embedding.py +4 -1
  206. sglang/srt/lora/backend/triton_backend.py +0 -1
  207. sglang/srt/lora/eviction_policy.py +139 -0
  208. sglang/srt/lora/lora_manager.py +24 -9
  209. sglang/srt/lora/lora_registry.py +1 -1
  210. sglang/srt/lora/mem_pool.py +40 -16
  211. sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +1 -1
  212. sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +4 -2
  213. sglang/srt/managers/cache_controller.py +48 -17
  214. sglang/srt/managers/data_parallel_controller.py +146 -42
  215. sglang/srt/managers/detokenizer_manager.py +40 -13
  216. sglang/srt/managers/io_struct.py +66 -16
  217. sglang/srt/managers/mm_utils.py +20 -18
  218. sglang/srt/managers/multi_tokenizer_mixin.py +66 -81
  219. sglang/srt/managers/overlap_utils.py +96 -19
  220. sglang/srt/managers/schedule_batch.py +241 -511
  221. sglang/srt/managers/schedule_policy.py +15 -2
  222. sglang/srt/managers/scheduler.py +399 -499
  223. sglang/srt/managers/scheduler_metrics_mixin.py +55 -8
  224. sglang/srt/managers/scheduler_output_processor_mixin.py +317 -111
  225. sglang/srt/managers/scheduler_pp_mixin.py +341 -0
  226. sglang/srt/managers/scheduler_profiler_mixin.py +57 -10
  227. sglang/srt/managers/scheduler_runtime_checker_mixin.py +217 -0
  228. sglang/srt/managers/scheduler_update_weights_mixin.py +33 -14
  229. sglang/srt/managers/tokenizer_communicator_mixin.py +71 -55
  230. sglang/srt/managers/tokenizer_manager.py +378 -90
  231. sglang/srt/managers/tp_worker.py +212 -161
  232. sglang/srt/managers/utils.py +78 -2
  233. sglang/srt/mem_cache/allocator.py +7 -2
  234. sglang/srt/mem_cache/allocator_ascend.py +2 -2
  235. sglang/srt/mem_cache/base_prefix_cache.py +2 -2
  236. sglang/srt/mem_cache/chunk_cache.py +13 -2
  237. sglang/srt/mem_cache/common.py +480 -0
  238. sglang/srt/mem_cache/evict_policy.py +16 -1
  239. sglang/srt/mem_cache/hicache_storage.py +4 -1
  240. sglang/srt/mem_cache/hiradix_cache.py +16 -3
  241. sglang/srt/mem_cache/mamba_radix_cache.py +993 -0
  242. sglang/srt/mem_cache/memory_pool.py +435 -219
  243. sglang/srt/mem_cache/memory_pool_host.py +0 -1
  244. sglang/srt/mem_cache/multimodal_cache.py +0 -1
  245. sglang/srt/mem_cache/radix_cache.py +53 -19
  246. sglang/srt/mem_cache/radix_cache_cpp.py +19 -14
  247. sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +8 -2
  248. sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +1 -13
  249. sglang/srt/mem_cache/storage/backend_factory.py +2 -2
  250. sglang/srt/mem_cache/storage/eic/eic_storage.py +5 -6
  251. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +0 -1
  252. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +9 -3
  253. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +5 -3
  254. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +101 -17
  255. sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +38 -9
  256. sglang/srt/mem_cache/storage/nixl/nixl_utils.py +1 -1
  257. sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +17 -2
  258. sglang/srt/mem_cache/swa_radix_cache.py +92 -26
  259. sglang/srt/metrics/collector.py +31 -0
  260. sglang/srt/metrics/func_timer.py +1 -1
  261. sglang/srt/model_executor/cuda_graph_runner.py +43 -5
  262. sglang/srt/model_executor/forward_batch_info.py +28 -23
  263. sglang/srt/model_executor/model_runner.py +379 -139
  264. sglang/srt/model_executor/npu_graph_runner.py +2 -3
  265. sglang/srt/model_executor/piecewise_cuda_graph_runner.py +539 -0
  266. sglang/srt/model_loader/__init__.py +1 -1
  267. sglang/srt/model_loader/loader.py +424 -27
  268. sglang/srt/model_loader/utils.py +0 -1
  269. sglang/srt/model_loader/weight_utils.py +47 -28
  270. sglang/srt/models/apertus.py +2 -3
  271. sglang/srt/models/arcee.py +2 -2
  272. sglang/srt/models/bailing_moe.py +13 -52
  273. sglang/srt/models/bailing_moe_nextn.py +3 -4
  274. sglang/srt/models/bert.py +1 -1
  275. sglang/srt/models/deepseek_nextn.py +19 -3
  276. sglang/srt/models/deepseek_ocr.py +1516 -0
  277. sglang/srt/models/deepseek_v2.py +273 -98
  278. sglang/srt/models/dots_ocr.py +0 -2
  279. sglang/srt/models/dots_vlm.py +0 -1
  280. sglang/srt/models/dots_vlm_vit.py +1 -1
  281. sglang/srt/models/falcon_h1.py +13 -19
  282. sglang/srt/models/gemma3_mm.py +16 -0
  283. sglang/srt/models/gemma3n_mm.py +1 -2
  284. sglang/srt/models/glm4_moe.py +14 -37
  285. sglang/srt/models/glm4_moe_nextn.py +2 -2
  286. sglang/srt/models/glm4v.py +2 -1
  287. sglang/srt/models/glm4v_moe.py +5 -5
  288. sglang/srt/models/gpt_oss.py +5 -5
  289. sglang/srt/models/grok.py +10 -23
  290. sglang/srt/models/hunyuan.py +2 -7
  291. sglang/srt/models/interns1.py +0 -1
  292. sglang/srt/models/kimi_vl.py +1 -7
  293. sglang/srt/models/kimi_vl_moonvit.py +3 -1
  294. sglang/srt/models/llama.py +2 -2
  295. sglang/srt/models/llama_eagle3.py +1 -1
  296. sglang/srt/models/longcat_flash.py +5 -22
  297. sglang/srt/models/longcat_flash_nextn.py +3 -14
  298. sglang/srt/models/mimo.py +2 -13
  299. sglang/srt/models/mimo_mtp.py +1 -2
  300. sglang/srt/models/minicpmo.py +7 -5
  301. sglang/srt/models/mixtral.py +1 -4
  302. sglang/srt/models/mllama.py +1 -1
  303. sglang/srt/models/mllama4.py +13 -3
  304. sglang/srt/models/nemotron_h.py +511 -0
  305. sglang/srt/models/olmo2.py +31 -4
  306. sglang/srt/models/opt.py +5 -5
  307. sglang/srt/models/phi.py +1 -1
  308. sglang/srt/models/phi4mm.py +1 -1
  309. sglang/srt/models/phimoe.py +0 -1
  310. sglang/srt/models/pixtral.py +0 -3
  311. sglang/srt/models/points_v15_chat.py +186 -0
  312. sglang/srt/models/qwen.py +0 -1
  313. sglang/srt/models/qwen2_5_vl.py +3 -3
  314. sglang/srt/models/qwen2_audio.py +2 -15
  315. sglang/srt/models/qwen2_moe.py +15 -12
  316. sglang/srt/models/qwen2_vl.py +5 -2
  317. sglang/srt/models/qwen3_moe.py +19 -35
  318. sglang/srt/models/qwen3_next.py +7 -12
  319. sglang/srt/models/qwen3_next_mtp.py +3 -4
  320. sglang/srt/models/qwen3_omni_moe.py +661 -0
  321. sglang/srt/models/qwen3_vl.py +37 -33
  322. sglang/srt/models/qwen3_vl_moe.py +57 -185
  323. sglang/srt/models/roberta.py +55 -3
  324. sglang/srt/models/sarashina2_vision.py +0 -1
  325. sglang/srt/models/step3_vl.py +3 -5
  326. sglang/srt/models/utils.py +11 -1
  327. sglang/srt/multimodal/processors/base_processor.py +6 -2
  328. sglang/srt/multimodal/processors/deepseek_ocr.py +37 -0
  329. sglang/srt/multimodal/processors/deepseek_vl_v2.py +0 -3
  330. sglang/srt/multimodal/processors/dots_vlm.py +0 -1
  331. sglang/srt/multimodal/processors/glm4v.py +1 -5
  332. sglang/srt/multimodal/processors/internvl.py +0 -2
  333. sglang/srt/multimodal/processors/janus_pro.py +0 -1
  334. sglang/srt/multimodal/processors/mllama4.py +0 -8
  335. sglang/srt/multimodal/processors/phi4mm.py +0 -1
  336. sglang/srt/multimodal/processors/points_v15_chat.py +52 -0
  337. sglang/srt/multimodal/processors/qwen_vl.py +75 -16
  338. sglang/srt/multimodal/processors/step3_vl.py +1 -1
  339. sglang/srt/parser/conversation.py +41 -0
  340. sglang/srt/parser/reasoning_parser.py +0 -1
  341. sglang/srt/sampling/custom_logit_processor.py +77 -2
  342. sglang/srt/sampling/sampling_batch_info.py +17 -22
  343. sglang/srt/sampling/sampling_params.py +70 -2
  344. sglang/srt/server_args.py +577 -73
  345. sglang/srt/server_args_config_parser.py +1 -1
  346. sglang/srt/single_batch_overlap.py +38 -28
  347. sglang/srt/speculative/base_spec_worker.py +34 -0
  348. sglang/srt/speculative/draft_utils.py +226 -0
  349. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +24 -7
  350. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +23 -2
  351. sglang/srt/speculative/eagle_info.py +57 -18
  352. sglang/srt/speculative/eagle_info_v2.py +458 -0
  353. sglang/srt/speculative/eagle_utils.py +138 -0
  354. sglang/srt/speculative/eagle_worker.py +83 -280
  355. sglang/srt/speculative/eagle_worker_v2.py +702 -0
  356. sglang/srt/speculative/{ngram_utils.py → ngram_info.py} +14 -9
  357. sglang/srt/speculative/ngram_worker.py +12 -11
  358. sglang/srt/speculative/spec_info.py +2 -0
  359. sglang/srt/speculative/spec_utils.py +38 -3
  360. sglang/srt/speculative/standalone_worker.py +4 -14
  361. sglang/srt/tokenizer/tiktoken_tokenizer.py +2 -2
  362. sglang/srt/two_batch_overlap.py +28 -14
  363. sglang/srt/utils/__init__.py +1 -1
  364. sglang/srt/{bench_utils.py → utils/bench_utils.py} +4 -2
  365. sglang/srt/utils/common.py +192 -47
  366. sglang/srt/utils/hf_transformers_utils.py +40 -17
  367. sglang/srt/{host_shared_memory.py → utils/host_shared_memory.py} +0 -1
  368. sglang/srt/{offloader.py → utils/offloader.py} +4 -4
  369. sglang/srt/utils/profile_merger.py +199 -0
  370. sglang/test/attention/test_flashattn_backend.py +1 -1
  371. sglang/test/attention/test_flashattn_mla_backend.py +0 -1
  372. sglang/test/attention/test_prefix_chunk_info.py +0 -2
  373. sglang/test/attention/test_trtllm_mla_backend.py +221 -53
  374. sglang/test/few_shot_gsm8k_engine.py +2 -4
  375. sglang/test/kit_matched_stop.py +157 -0
  376. sglang/test/longbench_v2/__init__.py +1 -0
  377. sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
  378. sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
  379. sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
  380. sglang/test/run_eval.py +41 -0
  381. sglang/test/runners.py +2 -0
  382. sglang/test/send_one.py +42 -7
  383. sglang/test/simple_eval_common.py +3 -0
  384. sglang/test/simple_eval_gpqa.py +0 -1
  385. sglang/test/simple_eval_humaneval.py +0 -3
  386. sglang/test/simple_eval_longbench_v2.py +344 -0
  387. sglang/test/test_block_fp8.py +1 -2
  388. sglang/test/test_block_fp8_deep_gemm_blackwell.py +0 -1
  389. sglang/test/test_cutlass_moe.py +1 -2
  390. sglang/test/test_cutlass_w4a8_moe.py +10 -20
  391. sglang/test/test_deterministic.py +232 -99
  392. sglang/test/test_deterministic_utils.py +73 -0
  393. sglang/test/test_disaggregation_utils.py +81 -0
  394. sglang/test/test_marlin_moe.py +0 -1
  395. sglang/test/test_utils.py +85 -20
  396. sglang/version.py +1 -1
  397. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.dist-info}/METADATA +45 -33
  398. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.dist-info}/RECORD +404 -345
  399. sglang/srt/layers/attention/mamba/mamba_utils.py +0 -81
  400. sglang/srt/managers/tp_worker_overlap_thread.py +0 -311
  401. sglang/srt/speculative/build_eagle_tree.py +0 -427
  402. sglang/test/test_block_fp8_ep.py +0 -358
  403. /sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/__init__.py +0 -0
  404. /sglang/srt/{aio_rwlock.py → utils/aio_rwlock.py} +0 -0
  405. /sglang/srt/{torch_memory_saver_adapter.py → utils/torch_memory_saver_adapter.py} +0 -0
  406. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.dist-info}/WHEEL +0 -0
  407. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.dist-info}/licenses/LICENSE +0 -0
  408. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.dist-info}/top_level.txt +0 -0
sglang/srt/models/utils.py
@@ -27,7 +27,11 @@ if _is_cuda:
 
 def enable_fused_set_kv_buffer(forward_batch: ForwardBatch):
     """Enable fused set_kv_buffer only on CUDA with bfloat16 KV cache."""
-    return _is_cuda and forward_batch.token_to_kv_pool.dtype == torch.bfloat16
+    return (
+        _is_cuda
+        and hasattr(forward_batch.token_to_kv_pool, "dtype")
+        and forward_batch.token_to_kv_pool.dtype == torch.bfloat16
+    )
 
 
 def create_fused_set_kv_buffer_arg(
@@ -49,3 +53,9 @@ def create_fused_set_kv_buffer_arg(
         v_scale=layer.v_scale,
         cache_loc=forward_batch.out_cache_loc,
     )
+
+
+def permute_inv(perm: torch.Tensor) -> torch.Tensor:
+    inv_perm = torch.empty_like(perm)
+    inv_perm[perm] = torch.arange(perm.numel(), device=perm.device, dtype=perm.dtype)
+    return inv_perm
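
The new `permute_inv` helper builds the inverse of a permutation by scatter-assignment: since `perm` sends position `i` to `perm[i]`, writing `arange` into `inv_perm[perm]` yields `inv_perm[perm[i]] = i`. A minimal standalone sketch of the round-trip property (the helper is copied from the hunk above; the test values are illustrative):

    import torch

    def permute_inv(perm: torch.Tensor) -> torch.Tensor:
        inv_perm = torch.empty_like(perm)
        inv_perm[perm] = torch.arange(perm.numel(), device=perm.device, dtype=perm.dtype)
        return inv_perm

    perm = torch.tensor([2, 0, 3, 1])
    inv = permute_inv(perm)                # tensor([1, 3, 0, 2])
    x = torch.tensor([10.0, 20.0, 30.0, 40.0])
    assert torch.equal(x[perm][inv], x)    # gather by perm, then by inv, is a no-op
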

sglang/srt/multimodal/processors/base_processor.py
@@ -155,7 +155,6 @@ class BaseMultimodalProcessor(ABC):
     ):
         self.hf_config = hf_config
         self._processor = _processor
-        self.arch = hf_config.architectures[0]
         self.server_args = server_args
         self.transport_mode = transport_mode
 
@@ -179,6 +178,7 @@ class BaseMultimodalProcessor(ABC):
         "image_attention_mask": Modality.IMAGE,
         "image_emb_mask": Modality.IMAGE,
         "images_spatial_crop": Modality.IMAGE,
+        "images_crop": Modality.IMAGE,
         "tgt_size": Modality.IMAGE,
         "image_grid_hws": Modality.IMAGE,
         "aspect_ratio_ids": Modality.IMAGE,
@@ -191,6 +191,7 @@ class BaseMultimodalProcessor(ABC):
         "input_features": Modality.AUDIO,
         "input_features_mask": Modality.AUDIO,
         "audio_attention_mask": Modality.AUDIO,
+        "feature_attention_mask": Modality.AUDIO,
         # Video-related attributes
         "pixel_values_videos": Modality.VIDEO,
         "second_per_grid_ts": Modality.VIDEO,
@@ -222,6 +223,7 @@ class BaseMultimodalProcessor(ABC):
         if self._processor.__class__.__name__ in {
             "Gemma3nProcessor",
             "Qwen2AudioProcessor",
+            "Qwen3OmniMoeProcessor",
         }:
             # Note(Xinyuan): for gemma3n, ref: https://github.com/huggingface/transformers/blob/ccf2ca162e33f381e454cdb74bf4b41a51ab976d/src/transformers/models/gemma3n/processing_gemma3n.py#L107
             kwargs["audio"] = audios
@@ -312,7 +314,9 @@ class BaseMultimodalProcessor(ABC):
         try:
             if modality == Modality.IMAGE:
                 img, _ = load_image(data)
-                return img.convert("RGB") if discard_alpha_channel else img
+                if discard_alpha_channel and img.mode != "RGB":
+                    img = img.convert("RGB")
+                return img
             elif modality == Modality.VIDEO:
                 return load_video(data, frame_count_limit)
             elif modality == Modality.AUDIO:

sglang/srt/multimodal/processors/deepseek_ocr.py (new file)
@@ -0,0 +1,37 @@
+from typing import List, Union
+
+from sglang.srt.models.deepseek_ocr import DeepseekOCRForCausalLM
+from sglang.srt.multimodal.processors.base_processor import (
+    BaseMultimodalProcessor,
+    MultimodalSpecialTokens,
+)
+
+
+class DeepseekOCRProcessor(BaseMultimodalProcessor):
+    models = [DeepseekOCRForCausalLM]
+
+    def __init__(self, hf_config, server_args, _processor, *args, **kwargs):
+        _processor.image_size = 640
+        super().__init__(hf_config, server_args, _processor, *args, **kwargs)
+        self.mm_tokens = MultimodalSpecialTokens(
+            image_token="<image>", image_token_id=self._processor.image_token_id
+        ).build(_processor)
+
+    async def process_mm_data_async(
+        self, image_data: List[Union[str, bytes]], input_text, *args, **kwargs
+    ):
+        base_output = self.load_mm_data(
+            prompt=input_text,
+            multimodal_tokens=self.mm_tokens,
+            image_data=image_data,
+        )
+
+        mm_items, input_ids, _ = self.process_and_combine_mm_data(
+            base_output, self.mm_tokens
+        )
+
+        return {
+            "input_ids": input_ids.tolist(),
+            "mm_items": mm_items,
+            "im_token_id": self.mm_tokens.image_token_id,
+        }

sglang/srt/multimodal/processors/deepseek_vl_v2.py
@@ -18,9 +18,6 @@
 # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 from typing import List, Union
 
-import torch
-
-from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
 from sglang.srt.models.deepseek_vl2 import DeepseekVL2ForCausalLM
 from sglang.srt.multimodal.processors.base_processor import (
     BaseMultimodalProcessor,

sglang/srt/multimodal/processors/dots_vlm.py
@@ -1,5 +1,4 @@
 import asyncio
-import math
 import re
 from typing import Dict, List, Union
 

sglang/srt/multimodal/processors/glm4v.py
@@ -1,4 +1,3 @@
-import re
 from typing import List, Union
 
 from decord import VideoReader
@@ -9,10 +8,7 @@ from sglang.srt.models.glm4v_moe import Glm4vMoeForConditionalGeneration
 from sglang.srt.multimodal.processors.base_processor import (
     BaseMultimodalProcessor as SGLangBaseProcessor,
 )
-from sglang.srt.multimodal.processors.base_processor import (
-    BaseMultiModalProcessorOutput,
-    MultimodalSpecialTokens,
-)
+from sglang.srt.multimodal.processors.base_processor import MultimodalSpecialTokens
 
 
 class Glm4vImageProcessor(SGLangBaseProcessor):

sglang/srt/multimodal/processors/internvl.py
@@ -4,10 +4,8 @@ from functools import lru_cache
 
 import numpy as np
 import torch
-import torchvision.transforms as T
 from decord import VideoReader, cpu, gpu
 from PIL import Image
-from torchvision.transforms import InterpolationMode
 
 from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
 from sglang.srt.models.interns1 import InternS1ForConditionalGeneration

sglang/srt/multimodal/processors/janus_pro.py
@@ -1,6 +1,5 @@
 from typing import List, Union
 
-from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
 from sglang.srt.models.deepseek_janus_pro import MultiModalityCausalLM
 from sglang.srt.multimodal.processors.base_processor import (
     BaseMultimodalProcessor,

sglang/srt/multimodal/processors/mllama4.py
@@ -1,13 +1,5 @@
 from typing import List, Union
 
-import torch
-from transformers.image_utils import SizeDict
-from transformers.models.llama4.image_processing_llama4_fast import (
-    find_supported_resolutions,
-    get_best_fit,
-)
-
-from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
 from sglang.srt.models.mllama4 import Llama4ForConditionalGeneration
 from sglang.srt.multimodal.processors.base_processor import (
     BaseMultimodalProcessor,

sglang/srt/multimodal/processors/phi4mm.py
@@ -3,7 +3,6 @@ from typing import List, Union
 
 from transformers.processing_utils import ProcessorMixin
 
-from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
 from sglang.srt.models.phi4mm import Phi4MMForCausalLM
 from sglang.srt.multimodal.processors.base_processor import (
     BaseMultimodalProcessor,

sglang/srt/multimodal/processors/points_v15_chat.py (new file)
@@ -0,0 +1,52 @@
+# Copy from qwen_vl.py, adapted for points-v15-chat
+
+import asyncio
+from typing import List, Union
+
+from PIL import Image
+
+from sglang.srt.models.points_v15_chat import POINTSV15ChatModel
+from sglang.srt.multimodal.processors.qwen_vl import (
+    Qwen2_5VLImageProcessor,
+    resize_image_async,
+)
+
+
+class POINTSV15ChatProcessor(Qwen2_5VLImageProcessor):
+    models = [POINTSV15ChatModel]
+
+    def __init__(self, hf_config, server_args, _processor, *args, **kwargs):
+        # Compatible with POINTSV15Chat
+        hf_config.vision_start_token_id = None
+        hf_config.vision_end_token_id = None
+        hf_config.video_token_id = None
+
+        super().__init__(hf_config, server_args, _processor, *args, **kwargs)
+
+    async def process_mm_data_async(
+        self,
+        image_data: List[Union[str, bytes]],
+        input_text,
+        request_obj,
+        *args,
+        **kwargs,
+    ):
+        base_output = self.load_mm_data(
+            prompt=input_text,
+            image_data=image_data,
+            multimodal_tokens=self.mm_tokens,
+        )
+
+        if base_output.images and isinstance(base_output.images[0], Image.Image):
+            resize_tasks = [resize_image_async(image) for image in base_output.images]
+            base_output.images = await asyncio.gather(*resize_tasks)
+
+        mm_items, input_ids, _ = self.process_and_combine_mm_data(
+            base_output, self.mm_tokens
+        )
+
+        return {
+            "input_ids": input_ids.tolist(),
+            "mm_items": mm_items,
+            "im_token_id": self.mm_tokens.image_token_id,
+        }

sglang/srt/multimodal/processors/qwen_vl.py
@@ -9,9 +9,11 @@ import torchvision
 from PIL import Image
 from torchvision.transforms import InterpolationMode
 
+from sglang.srt.environ import envs
 from sglang.srt.layers.rotary_embedding import MRotaryEmbedding
 from sglang.srt.models.qwen2_5_vl import Qwen2_5_VLForConditionalGeneration
 from sglang.srt.models.qwen2_vl import Qwen2VLForConditionalGeneration
+from sglang.srt.models.qwen3_omni_moe import Qwen3OmniMoeForConditionalGeneration
 from sglang.srt.models.qwen3_vl import Qwen3VLForConditionalGeneration
 from sglang.srt.models.qwen3_vl_moe import Qwen3VLMoeForConditionalGeneration
 from sglang.srt.multimodal.processors.base_processor import (
@@ -22,8 +24,14 @@ from sglang.utils import logger
 
 IMAGE_FACTOR = 28
 MIN_PIXELS = 4 * 28 * 28
-MAX_PIXELS = 16384 * 28 * 28
+MAX_PIXELS = envs.SGLANG_IMAGE_MAX_PIXELS.get()
 MAX_RATIO = 200
+RESIZE_RESAMPLE = getattr(Image, envs.SGLANG_RESIZE_RESAMPLE.get(), None)
+if envs.SGLANG_RESIZE_RESAMPLE.is_set() and RESIZE_RESAMPLE is None:
+    logger.warning(
+        f"Invalid RESIZE_RESAMPLE value: '{envs.SGLANG_RESIZE_RESAMPLE.get()}'. "
+        f"Ignoring and using default."
+    )
 VIDEO_TOTAL_PIXELS = int(
     float(os.environ.get("VIDEO_MAX_PIXELS", 128000 * 28 * 28 * 0.9))
 )
@@ -85,7 +93,7 @@ def resize_image(
         min_pixels=min_pixels,
         max_pixels=max_pixels,
     )
-    image = image.resize((resized_width, resized_height))
+    image = image.resize((resized_width, resized_height), resample=RESIZE_RESAMPLE)
     return image
 
 
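
`RESIZE_RESAMPLE` is resolved by attribute name on `PIL.Image`, so `SGLANG_RESIZE_RESAMPLE` must hold a Pillow filter name such as `BICUBIC` or `LANCZOS`; anything else resolves to `None`, which `Image.resize` treats as its default filter. A small standalone sketch of that lookup (the env var value shown is illustrative):

    import os
    from PIL import Image

    os.environ["SGLANG_RESIZE_RESAMPLE"] = "LANCZOS"
    resample = getattr(Image, os.environ["SGLANG_RESIZE_RESAMPLE"], None)
    assert resample is Image.LANCZOS

    # An unknown name falls back to None, i.e. Pillow's default resampling.
    assert getattr(Image, "NOT_A_FILTER", None) is None
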
@@ -206,25 +214,41 @@ async def preprocess_video(
         interpolation=InterpolationMode.BICUBIC,
         antialias=True,
     ).float()
-    return video
-
-
-# Compatible with Qwen2VL and Qwen2_5VL
-class Qwen2_5VLImageProcessor(SGLangBaseProcessor):
+    video_metadata = {
+        "fps": video_fps,
+        "duration": total_frames / video_fps,
+        "total_num_frames": total_frames,
+        "frames_indices": idx,
+        "video_backend": "torchvision",
+    }
+    return video, video_metadata
+
+
+# Compatible with Qwen-VL & Qwen-Omni Series
+class QwenVLImageProcessor(SGLangBaseProcessor):
     models = [
         Qwen2VLForConditionalGeneration,
         Qwen2_5_VLForConditionalGeneration,
         Qwen3VLForConditionalGeneration,
        Qwen3VLMoeForConditionalGeneration,
+        Qwen3OmniMoeForConditionalGeneration,
     ]
 
     def __init__(self, hf_config, server_args, _processor, *args, **kwargs):
+        self.model_type = hf_config.model_type
+        if hf_config.model_type == "qwen3_omni_moe":
+            hf_config = hf_config.thinker_config
+
         super().__init__(hf_config, server_args, _processor, *args, **kwargs)
-        # The regex that matches expanded image tokens.
+
         self.IM_START_TOKEN_ID = hf_config.vision_start_token_id
         self.IM_END_TOKEN_ID = hf_config.vision_end_token_id
         self.vision_start_token_id = hf_config.vision_start_token_id
         self.vision_end_token_id = hf_config.vision_end_token_id
+
+        self.audio_start_token_id = getattr(hf_config, "audio_start_token_id", None)
+        self.audio_token_id = getattr(hf_config, "audio_token_id", None)
+
         self.NUM_TOKEN_PER_FRAME = 770
         self.IMAGE_FACTOR = 28
         self.MIN_PIXELS = 4 * 28 * 28
@@ -233,10 +257,12 @@ class Qwen2_5VLImageProcessor(SGLangBaseProcessor):
         self.mm_tokens = MultimodalSpecialTokens(
             image_token="<|vision_start|><|image_pad|><|vision_end|>",
             image_token_id=hf_config.image_token_id,
+            # The regex that matches expanded image tokens.
             image_token_regex=re.compile(
                 r"<\|vision_start\|>(?:<\|image_pad\|>)+<\|vision_end\|>"
             ),
             video_token_id=hf_config.video_token_id,
+            audio_token_id=self.audio_token_id,
         ).build(_processor)
 
     async def process_mm_data_async(
@@ -247,11 +273,11 @@ class Qwen2_5VLImageProcessor(SGLangBaseProcessor):
         *args,
         **kwargs,
     ):
-
         base_output = self.load_mm_data(
             prompt=input_text,
             image_data=image_data,
             video_data=request_obj.video_data,
+            audio_data=request_obj.audio_data,
             multimodal_tokens=self.mm_tokens,
         )
 
@@ -260,29 +286,61 @@ class Qwen2_5VLImageProcessor(SGLangBaseProcessor):
             resize_tasks = [resize_image_async(image) for image in base_output.images]
             base_output.images = await asyncio.gather(*resize_tasks)
 
+        video_metadata = None
         if base_output.videos:
-            base_output.videos = [
-                await preprocess_video(video) for video in base_output.videos
-            ]
+            video_results = await asyncio.gather(
+                *[preprocess_video(video) for video in base_output.videos]
+            )
+            base_output.videos, video_metadata = map(list, zip(*video_results))
+
+        # NOTE: for qwen3-vl, video_meta need to be passed in, since do_sample_frames is already done in preprocess_video
+        if self.hf_config.model_type in ("qwen3_vl", "qwen3_vl_moe"):
+            mm_items, input_ids, ret = self.process_and_combine_mm_data(
+                base_output,
+                self.mm_tokens,
+                video_metadata=video_metadata,
+                do_sample_frames=False,
+            )
+        else:
+            mm_items, input_ids, ret = self.process_and_combine_mm_data(
+                base_output, self.mm_tokens
+            )
+
+        audio_feature_lengths = None
 
-        mm_items, input_ids, ret = self.process_and_combine_mm_data(
-            base_output, self.mm_tokens
+        if self.model_type == "qwen3_omni_moe":
+            audio_item = next((mm for mm in mm_items if mm.is_audio()), None)
+            if audio_item:
+                audio_feature_lengths = torch.sum(
+                    audio_item.feature_attention_mask, dim=1
+                )
+
+        second_per_grid_ts = getattr(ret, "second_per_grid_ts", None) or getattr(
+            ret, "video_second_per_grid", None
         )
 
         input_ids = input_ids.flatten()
+
         mrope_positions, mrope_position_delta = MRotaryEmbedding.get_rope_index(
             spatial_merge_size=self.hf_config.vision_config.spatial_merge_size,
             image_token_id=self.mm_tokens.image_token_id,
             video_token_id=self.mm_tokens.video_token_id,
             vision_start_token_id=self.vision_start_token_id,
-            model_type=self.hf_config.model_type,
+            model_type=self.model_type,
             tokens_per_second=getattr(
                 self.hf_config.vision_config, "tokens_per_second", None
             ),
             input_ids=input_ids.unsqueeze(0),
             image_grid_thw=getattr(ret, "image_grid_thw", None),
             video_grid_thw=getattr(ret, "video_grid_thw", None),
-            second_per_grid_ts=getattr(ret, "second_per_grid_ts", None),
+            second_per_grid_ts=second_per_grid_ts,
+            use_audio_in_video=False,
+            audio_seqlens=audio_feature_lengths,
+            audio_token_id=getattr(self.hf_config, "audio_token_id", None),
+            audio_start_token_id=self.audio_start_token_id,
+            position_id_per_seconds=getattr(
+                self.hf_config, "position_id_per_seconds", None
+            ),
         )
         mrope_positions = mrope_positions.squeeze(1)
 
@@ -293,6 +351,7 @@ class Qwen2_5VLImageProcessor(SGLangBaseProcessor):
             "im_end_id": self.IM_END_TOKEN_ID,
             "im_token_id": self.mm_tokens.image_token_id,
             "video_token_id": self.mm_tokens.video_token_id,
+            "audio_token_id": self.mm_tokens.audio_token_id,
             "mrope_positions": mrope_positions,
             "mrope_position_delta": mrope_position_delta,
         }

sglang/srt/multimodal/processors/step3_vl.py
@@ -1,7 +1,7 @@
 import math
 import re
 from itertools import product
-from typing import List, Literal, Optional, TypedDict, Union
+from typing import List, Optional, Union
 
 import numpy as np
 import torch

sglang/srt/parser/conversation.py
@@ -838,6 +838,19 @@ register_conv_template(
     )
 )
 
+register_conv_template(
+    Conversation(
+        name="deepseek-ocr",
+        system_message="",
+        system_template="",
+        roles=("", ""),
+        sep="",
+        sep_style=SeparatorStyle.NO_COLON_SINGLE,
+        stop_str=["<|end▁of▁sentence|>"],
+        image_token="<image>",
+    )
+)
+
 register_conv_template(
     Conversation(
         name="deepseek-vl2",
@@ -960,6 +973,19 @@ register_conv_template(
     )
 )
 
+register_conv_template(
+    Conversation(
+        name="points-v15-chat",
+        system_message="",
+        system_template="",
+        roles=("<|im_start|>user", "<|im_start|>assistant"),
+        sep="<|im_end|>\n",
+        sep_style=SeparatorStyle.ADD_NEW_LINE_SINGLE,
+        stop_str=["<|im_end|>"],
+        image_token="<|vision_start|><|image_pad|><|vision_end|>",
+        video_token="<|vision_start|><|video_pad|><|vision_end|>",
+    )
+)
 
 MODEL_TYPE_TO_TEMPLATE = {
     "internvl_chat": "internvl-2-5",
@@ -968,9 +994,16 @@ MODEL_TYPE_TO_TEMPLATE = {
     "phi4mm": "phi-4-mm",
     "minicpmv": "minicpmv",
     "minicpmo": "minicpmo",
+    "deepseek-ocr": "deepseek-ocr",
 }
 
 
+@register_conv_template_matching_function
+def match_points_v15_chat(model_path: str):
+    if re.search(r"points", model_path, re.IGNORECASE):
+        return "points-v15-chat"
+
+
 def get_model_type(model_path: str) -> Optional[str]:
     config_path = os.path.join(model_path, "config.json")
     if not os.path.exists(config_path):
@@ -1038,3 +1071,11 @@ def match_phi_4_mm(model_path: str):
         return "phi-4-mm"
     model_type = get_model_type(model_path)
     return MODEL_TYPE_TO_TEMPLATE.get(model_type)
+
+
+@register_conv_template_matching_function
+def match_deepseek_ocr(model_path: str):
+    if "deepseek-ocr" in model_path.lower():
+        return "deepseek-ocr"
+    model_type = get_model_type(model_path)
+    return MODEL_TYPE_TO_TEMPLATE.get(model_type)
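
Both new matching functions key off the model path string: `match_points_v15_chat` uses a case-insensitive regex, while `match_deepseek_ocr` does a substring check before falling back to the model-type table. A standalone sketch of the path-matching behavior (the sample paths are illustrative):

    import re

    def match_points_v15_chat(model_path: str):
        # Mirrors the matcher registered in conversation.py above.
        if re.search(r"points", model_path, re.IGNORECASE):
            return "points-v15-chat"

    assert match_points_v15_chat("WePOINTS/POINTS-1-5-Qwen-2-5-7B-Chat") == "points-v15-chat"
    assert "deepseek-ocr" in "deepseek-ai/DeepSeek-OCR".lower()  # match_deepseek_ocr's substring test
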

sglang/srt/parser/reasoning_parser.py
@@ -1,4 +1,3 @@
-import re
 from typing import Dict, Optional, Tuple, Type
 
 from sglang.srt.parser.harmony_parser import HarmonyParser

sglang/srt/sampling/custom_logit_processor.py
@@ -1,18 +1,22 @@
 import json
 from abc import ABC, abstractmethod
 from functools import lru_cache
-from typing import Any, Dict, List, Optional
+from typing import TYPE_CHECKING, Any, Dict, List, Optional
 
 import dill
+import orjson
 import torch
 
+if TYPE_CHECKING:
+    from sglang.srt.managers.schedule_batch import Req
+
 
 @lru_cache(maxsize=None)
 def _cache_from_str(json_str: str):
     """Deserialize a json string to a Callable object.
     This function is cached to avoid redundant deserialization.
     """
-    data = json.loads(json_str)
+    data = orjson.loads(json_str)
     return dill.loads(bytes.fromhex(data["callable"]))
 
 
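
For context, `_cache_from_str` expects a JSON object whose `callable` field is a hex-encoded dill pickle; the change above only swaps the JSON decoder from `json` to the faster `orjson`. A standalone sketch of the encode/decode round-trip (the encoder shown here is illustrative, not the exact sglang helper):

    import json

    import dill
    import orjson

    def disallow_token_zero(logits, custom_param_list=None):
        logits[..., 0] = -float("inf")
        return logits

    # Encode: dill-pickle the callable, hex-encode it, wrap it in JSON.
    json_str = json.dumps({"callable": dill.dumps(disallow_token_zero).hex()})

    # Decode: what _cache_from_str does, now via orjson.
    fn = dill.loads(bytes.fromhex(orjson.loads(json_str)["callable"]))
    assert fn.__name__ == "disallow_token_zero"
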
@@ -51,3 +55,74 @@ class DisallowedTokensLogitsProcessor(CustomLogitProcessor):
         ), f"{custom_param_list=}"
         logits[..., disallowed_token_ids] = -float("inf")
         return logits
+
+
+class ThinkingBudgetLogitProcessor(CustomLogitProcessor):
+    """A logit processor that controls the length of thinking."""
+
+    THINKING_START_TOKEN_ID: int
+    THINKING_END_TOKEN_ID: int
+    NEW_LINE_TOKEN_ID: int
+
+    def __call__(self, logits, custom_param_list: list[dict[str, Any]]):
+        if custom_param_list is None or not custom_param_list:
+            return logits
+        for i, param_dict in enumerate(custom_param_list):
+            if param_dict is None:
+                continue
+
+            thinking_budget: int | None = param_dict.get("thinking_budget")
+
+            # Skip if thinking_budget is unset, or not an integer, or negative
+            if (
+                thinking_budget is None
+                or not isinstance(thinking_budget, int)
+                or thinking_budget < 0
+            ):
+                continue
+            req: Req = param_dict.get("__req__")
+            cur_ids: list[int] = [*req.origin_input_ids, *req.output_ids]
+
+            # Check if out of thinking stage
+            if (
+                self.THINKING_START_TOKEN_ID not in cur_ids
+                or self.THINKING_END_TOKEN_ID in cur_ids
+            ):
+                continue
+
+            # Find the index of the thinking start token
+            start_index = cur_ids.index(self.THINKING_START_TOKEN_ID)
+
+            # Count the number of tokens after the thinking start token
+            num_tokens_after_start = len(cur_ids) - start_index - 1
+
+            if num_tokens_after_start < thinking_budget:
+                continue
+
+            # Ensure new line token before thinking end token
+            if not req.output_ids or req.output_ids[-1] != self.NEW_LINE_TOKEN_ID:
+                logits[i, :] = -float("inf")
+                logits[i, self.NEW_LINE_TOKEN_ID] = 0.0
+                continue
+
+            # Assign highest probability to the thinking end token
+            logits[i, :] = -float("inf")
+            logits[i, self.THINKING_END_TOKEN_ID] = 0.0
+
+        return logits
+
+
+class Qwen3ThinkingBudgetLogitProcessor(ThinkingBudgetLogitProcessor):
+    """A logit processor that controls the length of thinking for Qwen3 models."""
+
+    THINKING_START_TOKEN_ID: int = 151667
+    THINKING_END_TOKEN_ID: int = 151668
+    NEW_LINE_TOKEN_ID: int = 198
+
+
+class DeepSeekR1ThinkingBudgetLogitProcessor(ThinkingBudgetLogitProcessor):
+    """A logit processor that controls the length of thinking for DeepSeek-R1 models."""
+
+    THINKING_START_TOKEN_ID: int = 128798
+    THINKING_END_TOKEN_ID: int = 128799
+    NEW_LINE_TOKEN_ID: int = 201
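
The thinking-budget processors act per batch row: once the token count after `THINKING_START_TOKEN_ID` reaches the budget, they first force a newline and then force `THINKING_END_TOKEN_ID`. A minimal sketch of that forcing step, assuming the classes above are importable; the `SimpleNamespace` request is a stand-in for the scheduler's `Req` object, which normally arrives under the `__req__` key:

    from types import SimpleNamespace

    import torch

    from sglang.srt.sampling.custom_logit_processor import (
        Qwen3ThinkingBudgetLogitProcessor,
    )

    proc = Qwen3ThinkingBudgetLogitProcessor()
    logits = torch.zeros(1, 152000)

    # The request has emitted <think> (151667) plus three thinking tokens,
    # the last of which is "\n" (198); the budget of 2 is already exceeded.
    req = SimpleNamespace(origin_input_ids=[151667], output_ids=[5, 6, 198])
    out = proc(logits, [{"thinking_budget": 2, "__req__": req}])

    # Everything except </think> (151668) is masked to -inf.
    assert out[0].argmax().item() == 151668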