sglang 0.5.3rc0__py3-none-any.whl → 0.5.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (482)
  1. sglang/bench_one_batch.py +54 -37
  2. sglang/bench_one_batch_server.py +340 -34
  3. sglang/bench_serving.py +340 -159
  4. sglang/check_env.py +1 -1
  5. sglang/compile_deep_gemm.py +6 -2
  6. sglang/global_config.py +1 -25
  7. sglang/lang/api.py +6 -0
  8. sglang/lang/backend/runtime_endpoint.py +1 -1
  9. sglang/lang/interpreter.py +1 -0
  10. sglang/lang/ir.py +13 -0
  11. sglang/launch_server.py +9 -2
  12. sglang/profiler.py +20 -3
  13. sglang/srt/_custom_ops.py +1 -1
  14. sglang/srt/batch_invariant_ops/__init__.py +27 -0
  15. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +547 -0
  16. sglang/srt/checkpoint_engine/checkpoint_engine_worker.py +142 -0
  17. sglang/srt/compilation/backend.py +437 -0
  18. sglang/srt/compilation/compilation_config.py +20 -0
  19. sglang/srt/compilation/compilation_counter.py +47 -0
  20. sglang/srt/compilation/compile.py +210 -0
  21. sglang/srt/compilation/compiler_interface.py +503 -0
  22. sglang/srt/compilation/cuda_piecewise_backend.py +228 -0
  23. sglang/srt/compilation/fix_functionalization.py +134 -0
  24. sglang/srt/compilation/fx_utils.py +83 -0
  25. sglang/srt/compilation/inductor_pass.py +140 -0
  26. sglang/srt/compilation/pass_manager.py +66 -0
  27. sglang/srt/compilation/piecewise_context_manager.py +40 -0
  28. sglang/srt/compilation/weak_ref_tensor_jit.py +16 -0
  29. sglang/srt/configs/__init__.py +8 -0
  30. sglang/srt/configs/deepseek_ocr.py +262 -0
  31. sglang/srt/configs/deepseekvl2.py +194 -96
  32. sglang/srt/configs/dots_ocr.py +64 -0
  33. sglang/srt/configs/dots_vlm.py +2 -7
  34. sglang/srt/configs/falcon_h1.py +309 -0
  35. sglang/srt/configs/load_config.py +33 -2
  36. sglang/srt/configs/mamba_utils.py +117 -0
  37. sglang/srt/configs/model_config.py +284 -118
  38. sglang/srt/configs/modelopt_config.py +30 -0
  39. sglang/srt/configs/nemotron_h.py +286 -0
  40. sglang/srt/configs/olmo3.py +105 -0
  41. sglang/srt/configs/points_v15_chat.py +29 -0
  42. sglang/srt/configs/qwen3_next.py +11 -47
  43. sglang/srt/configs/qwen3_omni.py +613 -0
  44. sglang/srt/configs/qwen3_vl.py +576 -0
  45. sglang/srt/connector/remote_instance.py +1 -1
  46. sglang/srt/constrained/base_grammar_backend.py +6 -1
  47. sglang/srt/constrained/llguidance_backend.py +5 -0
  48. sglang/srt/constrained/outlines_backend.py +1 -1
  49. sglang/srt/constrained/outlines_jump_forward.py +1 -1
  50. sglang/srt/constrained/reasoner_grammar_backend.py +9 -6
  51. sglang/srt/constrained/utils.py +12 -0
  52. sglang/srt/constrained/xgrammar_backend.py +26 -15
  53. sglang/srt/debug_utils/dumper.py +10 -3
  54. sglang/srt/disaggregation/ascend/conn.py +2 -2
  55. sglang/srt/disaggregation/ascend/transfer_engine.py +48 -10
  56. sglang/srt/disaggregation/base/conn.py +17 -4
  57. sglang/srt/disaggregation/common/conn.py +268 -98
  58. sglang/srt/disaggregation/decode.py +172 -39
  59. sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
  60. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +25 -16
  61. sglang/srt/disaggregation/fake/conn.py +11 -3
  62. sglang/srt/disaggregation/mooncake/conn.py +203 -555
  63. sglang/srt/disaggregation/nixl/conn.py +217 -63
  64. sglang/srt/disaggregation/prefill.py +113 -270
  65. sglang/srt/disaggregation/utils.py +36 -5
  66. sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
  67. sglang/srt/distributed/device_communicators/custom_all_reduce.py +6 -6
  68. sglang/srt/distributed/device_communicators/pymscclpp.py +2 -2
  69. sglang/srt/distributed/device_communicators/pynccl.py +24 -12
  70. sglang/srt/distributed/device_communicators/pynccl_allocator.py +2 -2
  71. sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
  72. sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
  73. sglang/srt/distributed/naive_distributed.py +5 -4
  74. sglang/srt/distributed/parallel_state.py +203 -97
  75. sglang/srt/elastic_ep/elastic_ep.py +74 -0
  76. sglang/srt/entrypoints/context.py +3 -2
  77. sglang/srt/entrypoints/engine.py +85 -65
  78. sglang/srt/entrypoints/grpc_server.py +632 -305
  79. sglang/srt/entrypoints/harmony_utils.py +2 -2
  80. sglang/srt/entrypoints/http_server.py +169 -17
  81. sglang/srt/entrypoints/http_server_engine.py +1 -7
  82. sglang/srt/entrypoints/openai/protocol.py +327 -34
  83. sglang/srt/entrypoints/openai/serving_base.py +74 -8
  84. sglang/srt/entrypoints/openai/serving_chat.py +202 -118
  85. sglang/srt/entrypoints/openai/serving_classify.py +204 -0
  86. sglang/srt/entrypoints/openai/serving_completions.py +20 -4
  87. sglang/srt/entrypoints/openai/serving_embedding.py +1 -0
  88. sglang/srt/entrypoints/openai/serving_responses.py +47 -2
  89. sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
  90. sglang/srt/environ.py +323 -0
  91. sglang/srt/eplb/eplb_algorithms/__init__.py +18 -1
  92. sglang/srt/eplb/eplb_algorithms/deepseek.py +0 -2
  93. sglang/srt/eplb/eplb_algorithms/elasticity_aware.py +87 -0
  94. sglang/srt/eplb/expert_distribution.py +3 -4
  95. sglang/srt/eplb/expert_location.py +30 -5
  96. sglang/srt/eplb/expert_location_dispatch.py +2 -2
  97. sglang/srt/eplb/expert_location_updater.py +2 -2
  98. sglang/srt/function_call/base_format_detector.py +17 -18
  99. sglang/srt/function_call/function_call_parser.py +21 -16
  100. sglang/srt/function_call/glm4_moe_detector.py +4 -8
  101. sglang/srt/function_call/gpt_oss_detector.py +24 -1
  102. sglang/srt/function_call/json_array_parser.py +61 -0
  103. sglang/srt/function_call/kimik2_detector.py +17 -4
  104. sglang/srt/function_call/utils.py +98 -7
  105. sglang/srt/grpc/compile_proto.py +245 -0
  106. sglang/srt/grpc/grpc_request_manager.py +915 -0
  107. sglang/srt/grpc/health_servicer.py +189 -0
  108. sglang/srt/grpc/scheduler_launcher.py +181 -0
  109. sglang/srt/grpc/sglang_scheduler_pb2.py +81 -68
  110. sglang/srt/grpc/sglang_scheduler_pb2.pyi +124 -61
  111. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +92 -1
  112. sglang/srt/layers/activation.py +11 -7
  113. sglang/srt/layers/attention/aiter_backend.py +17 -18
  114. sglang/srt/layers/attention/ascend_backend.py +125 -10
  115. sglang/srt/layers/attention/attention_registry.py +226 -0
  116. sglang/srt/layers/attention/base_attn_backend.py +32 -4
  117. sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
  118. sglang/srt/layers/attention/double_sparsity_backend.py +2 -2
  119. sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
  120. sglang/srt/layers/attention/fla/chunk.py +0 -1
  121. sglang/srt/layers/attention/fla/chunk_o.py +1 -1
  122. sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +2 -2
  123. sglang/srt/layers/attention/fla/fused_recurrent.py +4 -4
  124. sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +2 -2
  125. sglang/srt/layers/attention/fla/index.py +0 -2
  126. sglang/srt/layers/attention/fla/layernorm_gated.py +50 -32
  127. sglang/srt/layers/attention/fla/utils.py +0 -3
  128. sglang/srt/layers/attention/fla/wy_fast.py +0 -2
  129. sglang/srt/layers/attention/flashattention_backend.py +52 -15
  130. sglang/srt/layers/attention/flashinfer_backend.py +357 -212
  131. sglang/srt/layers/attention/flashinfer_mla_backend.py +31 -33
  132. sglang/srt/layers/attention/flashmla_backend.py +9 -7
  133. sglang/srt/layers/attention/hybrid_attn_backend.py +12 -4
  134. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +236 -133
  135. sglang/srt/layers/attention/intel_amx_backend.py +1 -1
  136. sglang/srt/layers/attention/mamba/causal_conv1d.py +2 -1
  137. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +24 -103
  138. sglang/srt/layers/attention/mamba/mamba.py +514 -1
  139. sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
  140. sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
  141. sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
  142. sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
  143. sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
  144. sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +214 -0
  145. sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +562 -0
  146. sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +646 -0
  147. sglang/srt/layers/attention/mamba/ops/ssd_combined.py +261 -0
  148. sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +264 -0
  149. sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
  150. sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
  151. sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
  152. sglang/srt/layers/attention/nsa/nsa_indexer.py +718 -0
  153. sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
  154. sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
  155. sglang/srt/layers/attention/nsa/transform_index.py +144 -0
  156. sglang/srt/layers/attention/nsa/triton_kernel.py +136 -0
  157. sglang/srt/layers/attention/nsa/utils.py +23 -0
  158. sglang/srt/layers/attention/nsa_backend.py +1201 -0
  159. sglang/srt/layers/attention/tbo_backend.py +6 -6
  160. sglang/srt/layers/attention/torch_flex_backend.py +325 -0
  161. sglang/srt/layers/attention/triton_backend.py +249 -42
  162. sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +2 -2
  163. sglang/srt/layers/attention/triton_ops/extend_attention.py +539 -44
  164. sglang/srt/layers/attention/trtllm_mha_backend.py +7 -9
  165. sglang/srt/layers/attention/trtllm_mla_backend.py +523 -48
  166. sglang/srt/layers/attention/utils.py +11 -7
  167. sglang/srt/layers/attention/vision.py +61 -3
  168. sglang/srt/layers/attention/wave_backend.py +4 -4
  169. sglang/srt/layers/attention/xpu_backend.py +1028 -0
  170. sglang/srt/layers/communicator.py +19 -7
  171. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/compile_utils.py +4 -8
  172. sglang/srt/layers/deep_gemm_wrapper/configurer.py +25 -0
  173. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/entrypoint.py +3 -3
  174. sglang/srt/layers/dp_attention.py +28 -1
  175. sglang/srt/layers/elementwise.py +3 -1
  176. sglang/srt/layers/layernorm.py +47 -15
  177. sglang/srt/layers/linear.py +30 -5
  178. sglang/srt/layers/logits_processor.py +161 -18
  179. sglang/srt/layers/modelopt_utils.py +11 -0
  180. sglang/srt/layers/moe/cutlass_moe.py +0 -2
  181. sglang/srt/layers/moe/cutlass_w4a8_moe.py +213 -21
  182. sglang/srt/layers/moe/ep_moe/kernels.py +36 -458
  183. sglang/srt/layers/moe/ep_moe/layer.py +243 -448
  184. sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +52 -25
  185. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  186. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_B200.json +146 -0
  187. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  188. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  189. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
  190. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +17 -5
  191. sglang/srt/layers/moe/fused_moe_triton/layer.py +86 -81
  192. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +18 -42
  193. sglang/srt/layers/moe/moe_runner/deep_gemm.py +304 -0
  194. sglang/srt/layers/moe/moe_runner/runner.py +3 -0
  195. sglang/srt/layers/moe/moe_runner/triton.py +3 -1
  196. sglang/srt/layers/moe/rocm_moe_utils.py +0 -1
  197. sglang/srt/layers/moe/router.py +51 -15
  198. sglang/srt/layers/moe/token_dispatcher/__init__.py +10 -0
  199. sglang/srt/layers/moe/token_dispatcher/base.py +1 -1
  200. sglang/srt/layers/moe/token_dispatcher/deepep.py +177 -106
  201. sglang/srt/layers/moe/token_dispatcher/mooncake.py +386 -0
  202. sglang/srt/layers/moe/token_dispatcher/standard.py +46 -0
  203. sglang/srt/layers/moe/topk.py +3 -2
  204. sglang/srt/layers/moe/utils.py +27 -1
  205. sglang/srt/layers/parameter.py +23 -6
  206. sglang/srt/layers/quantization/__init__.py +2 -53
  207. sglang/srt/layers/quantization/awq.py +183 -6
  208. sglang/srt/layers/quantization/awq_triton.py +29 -0
  209. sglang/srt/layers/quantization/base_config.py +20 -1
  210. sglang/srt/layers/quantization/compressed_tensors/__init__.py +7 -0
  211. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +21 -49
  212. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +421 -70
  213. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +5 -0
  214. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +4 -22
  215. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
  216. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +339 -0
  217. sglang/srt/layers/quantization/fp8.py +86 -20
  218. sglang/srt/layers/quantization/fp8_kernel.py +55 -10
  219. sglang/srt/layers/quantization/fp8_utils.py +43 -15
  220. sglang/srt/layers/quantization/fpgemm_fp8.py +2 -3
  221. sglang/srt/layers/quantization/gptq.py +0 -1
  222. sglang/srt/layers/quantization/int8_kernel.py +18 -2
  223. sglang/srt/layers/quantization/marlin_utils.py +12 -0
  224. sglang/srt/layers/quantization/modelopt_quant.py +141 -81
  225. sglang/srt/layers/quantization/mxfp4.py +17 -34
  226. sglang/srt/layers/quantization/petit.py +1 -1
  227. sglang/srt/layers/quantization/quark/quark.py +3 -1
  228. sglang/srt/layers/quantization/quark/quark_moe.py +18 -5
  229. sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +0 -7
  230. sglang/srt/layers/quantization/unquant.py +1 -4
  231. sglang/srt/layers/quantization/utils.py +0 -1
  232. sglang/srt/layers/quantization/w4afp8.py +51 -24
  233. sglang/srt/layers/quantization/w8a8_int8.py +45 -27
  234. sglang/srt/layers/radix_attention.py +59 -9
  235. sglang/srt/layers/rotary_embedding.py +750 -46
  236. sglang/srt/layers/sampler.py +84 -16
  237. sglang/srt/layers/sparse_pooler.py +98 -0
  238. sglang/srt/layers/utils.py +23 -1
  239. sglang/srt/layers/vocab_parallel_embedding.py +4 -1
  240. sglang/srt/lora/backend/base_backend.py +3 -3
  241. sglang/srt/lora/backend/chunked_backend.py +348 -0
  242. sglang/srt/lora/backend/triton_backend.py +9 -4
  243. sglang/srt/lora/eviction_policy.py +139 -0
  244. sglang/srt/lora/lora.py +7 -5
  245. sglang/srt/lora/lora_manager.py +33 -7
  246. sglang/srt/lora/lora_registry.py +1 -1
  247. sglang/srt/lora/mem_pool.py +41 -17
  248. sglang/srt/lora/triton_ops/__init__.py +4 -0
  249. sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
  250. sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +176 -0
  251. sglang/srt/lora/utils.py +7 -5
  252. sglang/srt/managers/cache_controller.py +83 -152
  253. sglang/srt/managers/data_parallel_controller.py +156 -87
  254. sglang/srt/managers/detokenizer_manager.py +51 -24
  255. sglang/srt/managers/io_struct.py +223 -129
  256. sglang/srt/managers/mm_utils.py +49 -10
  257. sglang/srt/managers/multi_tokenizer_mixin.py +83 -98
  258. sglang/srt/managers/multimodal_processor.py +1 -2
  259. sglang/srt/managers/overlap_utils.py +130 -0
  260. sglang/srt/managers/schedule_batch.py +340 -529
  261. sglang/srt/managers/schedule_policy.py +158 -18
  262. sglang/srt/managers/scheduler.py +665 -620
  263. sglang/srt/managers/scheduler_input_blocker.py +1 -1
  264. sglang/srt/managers/scheduler_metrics_mixin.py +150 -131
  265. sglang/srt/managers/scheduler_output_processor_mixin.py +337 -122
  266. sglang/srt/managers/scheduler_pp_mixin.py +341 -0
  267. sglang/srt/managers/scheduler_profiler_mixin.py +62 -15
  268. sglang/srt/managers/scheduler_runtime_checker_mixin.py +217 -0
  269. sglang/srt/managers/scheduler_update_weights_mixin.py +40 -14
  270. sglang/srt/managers/tokenizer_communicator_mixin.py +141 -19
  271. sglang/srt/managers/tokenizer_manager.py +462 -226
  272. sglang/srt/managers/tp_worker.py +217 -156
  273. sglang/srt/managers/utils.py +79 -47
  274. sglang/srt/mem_cache/allocator.py +21 -22
  275. sglang/srt/mem_cache/allocator_ascend.py +42 -28
  276. sglang/srt/mem_cache/base_prefix_cache.py +3 -3
  277. sglang/srt/mem_cache/chunk_cache.py +20 -2
  278. sglang/srt/mem_cache/common.py +480 -0
  279. sglang/srt/mem_cache/evict_policy.py +38 -0
  280. sglang/srt/mem_cache/hicache_storage.py +44 -2
  281. sglang/srt/mem_cache/hiradix_cache.py +134 -34
  282. sglang/srt/mem_cache/mamba_radix_cache.py +993 -0
  283. sglang/srt/mem_cache/memory_pool.py +602 -208
  284. sglang/srt/mem_cache/memory_pool_host.py +134 -183
  285. sglang/srt/mem_cache/multimodal_cache.py +0 -1
  286. sglang/srt/mem_cache/radix_cache.py +263 -78
  287. sglang/srt/mem_cache/radix_cache_cpp.py +29 -21
  288. sglang/srt/mem_cache/storage/__init__.py +10 -0
  289. sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +157 -0
  290. sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +97 -0
  291. sglang/srt/mem_cache/storage/backend_factory.py +223 -0
  292. sglang/srt/mem_cache/storage/eic/eic_storage.py +777 -0
  293. sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
  294. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +0 -1
  295. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +180 -59
  296. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +15 -9
  297. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +217 -26
  298. sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +38 -9
  299. sglang/srt/mem_cache/storage/nixl/nixl_utils.py +1 -1
  300. sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +17 -2
  301. sglang/srt/mem_cache/swa_radix_cache.py +115 -58
  302. sglang/srt/metrics/collector.py +113 -120
  303. sglang/srt/metrics/func_timer.py +3 -8
  304. sglang/srt/metrics/utils.py +8 -1
  305. sglang/srt/model_executor/cpu_graph_runner.py +2 -2
  306. sglang/srt/model_executor/cuda_graph_runner.py +81 -36
  307. sglang/srt/model_executor/forward_batch_info.py +40 -50
  308. sglang/srt/model_executor/model_runner.py +507 -319
  309. sglang/srt/model_executor/npu_graph_runner.py +11 -5
  310. sglang/srt/model_executor/piecewise_cuda_graph_runner.py +539 -0
  311. sglang/srt/model_loader/__init__.py +1 -1
  312. sglang/srt/model_loader/loader.py +438 -37
  313. sglang/srt/model_loader/utils.py +0 -1
  314. sglang/srt/model_loader/weight_utils.py +200 -27
  315. sglang/srt/models/apertus.py +2 -3
  316. sglang/srt/models/arcee.py +2 -2
  317. sglang/srt/models/bailing_moe.py +40 -56
  318. sglang/srt/models/bailing_moe_nextn.py +3 -4
  319. sglang/srt/models/bert.py +1 -1
  320. sglang/srt/models/deepseek_nextn.py +25 -4
  321. sglang/srt/models/deepseek_ocr.py +1516 -0
  322. sglang/srt/models/deepseek_v2.py +793 -235
  323. sglang/srt/models/dots_ocr.py +171 -0
  324. sglang/srt/models/dots_vlm.py +0 -1
  325. sglang/srt/models/dots_vlm_vit.py +1 -1
  326. sglang/srt/models/falcon_h1.py +570 -0
  327. sglang/srt/models/gemma3_causal.py +0 -2
  328. sglang/srt/models/gemma3_mm.py +17 -1
  329. sglang/srt/models/gemma3n_mm.py +2 -3
  330. sglang/srt/models/glm4_moe.py +17 -40
  331. sglang/srt/models/glm4_moe_nextn.py +4 -4
  332. sglang/srt/models/glm4v.py +3 -2
  333. sglang/srt/models/glm4v_moe.py +6 -6
  334. sglang/srt/models/gpt_oss.py +12 -35
  335. sglang/srt/models/grok.py +10 -23
  336. sglang/srt/models/hunyuan.py +2 -7
  337. sglang/srt/models/interns1.py +0 -1
  338. sglang/srt/models/kimi_vl.py +1 -7
  339. sglang/srt/models/kimi_vl_moonvit.py +4 -2
  340. sglang/srt/models/llama.py +6 -2
  341. sglang/srt/models/llama_eagle3.py +1 -1
  342. sglang/srt/models/longcat_flash.py +6 -23
  343. sglang/srt/models/longcat_flash_nextn.py +4 -15
  344. sglang/srt/models/mimo.py +2 -13
  345. sglang/srt/models/mimo_mtp.py +1 -2
  346. sglang/srt/models/minicpmo.py +7 -5
  347. sglang/srt/models/mixtral.py +1 -4
  348. sglang/srt/models/mllama.py +1 -1
  349. sglang/srt/models/mllama4.py +27 -6
  350. sglang/srt/models/nemotron_h.py +511 -0
  351. sglang/srt/models/olmo2.py +31 -4
  352. sglang/srt/models/opt.py +5 -5
  353. sglang/srt/models/phi.py +1 -1
  354. sglang/srt/models/phi4mm.py +1 -1
  355. sglang/srt/models/phimoe.py +0 -1
  356. sglang/srt/models/pixtral.py +0 -3
  357. sglang/srt/models/points_v15_chat.py +186 -0
  358. sglang/srt/models/qwen.py +0 -1
  359. sglang/srt/models/qwen2.py +0 -7
  360. sglang/srt/models/qwen2_5_vl.py +5 -5
  361. sglang/srt/models/qwen2_audio.py +2 -15
  362. sglang/srt/models/qwen2_moe.py +70 -4
  363. sglang/srt/models/qwen2_vl.py +6 -3
  364. sglang/srt/models/qwen3.py +18 -3
  365. sglang/srt/models/qwen3_moe.py +50 -38
  366. sglang/srt/models/qwen3_next.py +43 -21
  367. sglang/srt/models/qwen3_next_mtp.py +3 -4
  368. sglang/srt/models/qwen3_omni_moe.py +661 -0
  369. sglang/srt/models/qwen3_vl.py +791 -0
  370. sglang/srt/models/qwen3_vl_moe.py +343 -0
  371. sglang/srt/models/registry.py +15 -3
  372. sglang/srt/models/roberta.py +55 -3
  373. sglang/srt/models/sarashina2_vision.py +268 -0
  374. sglang/srt/models/solar.py +505 -0
  375. sglang/srt/models/starcoder2.py +357 -0
  376. sglang/srt/models/step3_vl.py +3 -5
  377. sglang/srt/models/torch_native_llama.py +9 -2
  378. sglang/srt/models/utils.py +61 -0
  379. sglang/srt/multimodal/processors/base_processor.py +21 -9
  380. sglang/srt/multimodal/processors/deepseek_ocr.py +37 -0
  381. sglang/srt/multimodal/processors/deepseek_vl_v2.py +0 -3
  382. sglang/srt/multimodal/processors/dots_vlm.py +2 -4
  383. sglang/srt/multimodal/processors/glm4v.py +1 -5
  384. sglang/srt/multimodal/processors/internvl.py +20 -10
  385. sglang/srt/multimodal/processors/janus_pro.py +0 -1
  386. sglang/srt/multimodal/processors/mllama4.py +0 -8
  387. sglang/srt/multimodal/processors/phi4mm.py +0 -1
  388. sglang/srt/multimodal/processors/points_v15_chat.py +52 -0
  389. sglang/srt/multimodal/processors/qwen_vl.py +83 -17
  390. sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
  391. sglang/srt/multimodal/processors/step3_vl.py +1 -1
  392. sglang/srt/parser/conversation.py +41 -0
  393. sglang/srt/parser/jinja_template_utils.py +6 -0
  394. sglang/srt/parser/reasoning_parser.py +0 -1
  395. sglang/srt/sampling/custom_logit_processor.py +77 -2
  396. sglang/srt/sampling/sampling_batch_info.py +36 -23
  397. sglang/srt/sampling/sampling_params.py +75 -0
  398. sglang/srt/server_args.py +1300 -338
  399. sglang/srt/server_args_config_parser.py +146 -0
  400. sglang/srt/single_batch_overlap.py +161 -0
  401. sglang/srt/speculative/base_spec_worker.py +34 -0
  402. sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
  403. sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
  404. sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
  405. sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
  406. sglang/srt/speculative/cpp_ngram/param.h +125 -0
  407. sglang/srt/speculative/cpp_ngram/queue.h +71 -0
  408. sglang/srt/speculative/draft_utils.py +226 -0
  409. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +26 -8
  410. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +26 -3
  411. sglang/srt/speculative/eagle_info.py +786 -0
  412. sglang/srt/speculative/eagle_info_v2.py +458 -0
  413. sglang/srt/speculative/eagle_utils.py +113 -1270
  414. sglang/srt/speculative/eagle_worker.py +120 -285
  415. sglang/srt/speculative/eagle_worker_v2.py +702 -0
  416. sglang/srt/speculative/ngram_info.py +433 -0
  417. sglang/srt/speculative/ngram_worker.py +246 -0
  418. sglang/srt/speculative/spec_info.py +49 -0
  419. sglang/srt/speculative/spec_utils.py +641 -0
  420. sglang/srt/speculative/standalone_worker.py +4 -14
  421. sglang/srt/tokenizer/tiktoken_tokenizer.py +2 -2
  422. sglang/srt/tracing/trace.py +32 -6
  423. sglang/srt/two_batch_overlap.py +35 -18
  424. sglang/srt/utils/__init__.py +2 -0
  425. sglang/srt/{bench_utils.py → utils/bench_utils.py} +4 -2
  426. sglang/srt/{utils.py → utils/common.py} +583 -113
  427. sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +86 -19
  428. sglang/srt/{host_shared_memory.py → utils/host_shared_memory.py} +0 -1
  429. sglang/srt/{offloader.py → utils/offloader.py} +4 -4
  430. sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
  431. sglang/srt/utils/profile_merger.py +199 -0
  432. sglang/srt/utils/rpd_utils.py +452 -0
  433. sglang/srt/utils/slow_rank_detector.py +71 -0
  434. sglang/srt/{torch_memory_saver_adapter.py → utils/torch_memory_saver_adapter.py} +5 -7
  435. sglang/srt/warmup.py +8 -4
  436. sglang/srt/weight_sync/utils.py +1 -1
  437. sglang/test/attention/test_flashattn_backend.py +1 -1
  438. sglang/test/attention/test_flashattn_mla_backend.py +0 -1
  439. sglang/test/attention/test_prefix_chunk_info.py +0 -2
  440. sglang/test/attention/test_trtllm_mla_backend.py +221 -53
  441. sglang/test/few_shot_gsm8k_engine.py +2 -4
  442. sglang/test/get_logits_ut.py +57 -0
  443. sglang/test/kit_matched_stop.py +157 -0
  444. sglang/test/longbench_v2/__init__.py +1 -0
  445. sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
  446. sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
  447. sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
  448. sglang/test/run_eval.py +120 -11
  449. sglang/test/runners.py +3 -1
  450. sglang/test/send_one.py +42 -7
  451. sglang/test/simple_eval_common.py +8 -2
  452. sglang/test/simple_eval_gpqa.py +0 -1
  453. sglang/test/simple_eval_humaneval.py +0 -3
  454. sglang/test/simple_eval_longbench_v2.py +344 -0
  455. sglang/test/simple_eval_mmmu_vlm.py +441 -0
  456. sglang/test/test_block_fp8.py +3 -4
  457. sglang/test/test_block_fp8_deep_gemm_blackwell.py +0 -1
  458. sglang/test/test_cutlass_moe.py +1 -2
  459. sglang/test/test_cutlass_w4a8_moe.py +10 -20
  460. sglang/test/test_deterministic.py +430 -0
  461. sglang/test/test_deterministic_utils.py +73 -0
  462. sglang/test/test_disaggregation_utils.py +93 -1
  463. sglang/test/test_marlin_moe.py +0 -1
  464. sglang/test/test_programs.py +1 -1
  465. sglang/test/test_utils.py +432 -16
  466. sglang/utils.py +10 -1
  467. sglang/version.py +1 -1
  468. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/METADATA +64 -43
  469. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/RECORD +476 -346
  470. sglang/srt/entrypoints/grpc_request_manager.py +0 -580
  471. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +0 -32
  472. sglang/srt/managers/tp_worker_overlap_thread.py +0 -319
  473. sglang/srt/mem_cache/lora_radix_cache.py +0 -421
  474. sglang/srt/speculative/build_eagle_tree.py +0 -427
  475. sglang/test/test_block_fp8_ep.py +0 -358
  476. /sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/__init__.py +0 -0
  477. /sglang/srt/{remote_instance_weight_loader_utils.py → model_loader/remote_instance_weight_loader_utils.py} +0 -0
  478. /sglang/srt/{aio_rwlock.py → utils/aio_rwlock.py} +0 -0
  479. /sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
  480. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/WHEEL +0 -0
  481. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/licenses/LICENSE +0 -0
  482. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,357 @@
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3
+
4
+ # Copyright 2024 BigCode and the HuggingFace Inc. team. All rights reserved.
5
+ #
6
+ # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
7
+ # and OPT implementations in this library. It has been modified from its
8
+ # original forms to accommodate minor architectural differences compared
9
+ # to GPT-NeoX and OPT used by the Meta AI team that trained the model.
10
+ #
11
+ # Licensed under the Apache License, Version 2.0 (the "License");
12
+ # you may not use this file except in compliance with the License.
13
+ # You may obtain a copy of the License at
14
+ #
15
+ # http://www.apache.org/licenses/LICENSE-2.0
16
+ #
17
+ # Unless required by applicable law or agreed to in writing, software
18
+ # distributed under the License is distributed on an "AS IS" BASIS,
19
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
20
+ # See the License for the specific language governing permissions and
21
+ # limitations under the License.
22
+ # Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/starcoder2.py
23
+ """ PyTorch Starcoder2 model."""
24
+ from collections.abc import Iterable
25
+ from typing import Optional, Tuple
26
+
27
+ import torch
28
+ from torch import nn
29
+ from transformers import Starcoder2Config
30
+
31
+ from sglang.srt.distributed import get_pp_group, get_tensor_model_parallel_world_size
32
+ from sglang.srt.layers.activation import get_act_fn
33
+ from sglang.srt.layers.linear import (
34
+ ColumnParallelLinear,
35
+ QKVParallelLinear,
36
+ RowParallelLinear,
37
+ )
38
+ from sglang.srt.layers.logits_processor import LogitsProcessor
39
+ from sglang.srt.layers.quantization.base_config import QuantizationConfig
40
+ from sglang.srt.layers.radix_attention import RadixAttention
41
+ from sglang.srt.layers.rotary_embedding import get_rope
42
+ from sglang.srt.layers.vocab_parallel_embedding import (
43
+ DEFAULT_VOCAB_PADDING_SIZE,
44
+ ParallelLMHead,
45
+ VocabParallelEmbedding,
46
+ )
47
+ from sglang.srt.model_executor.forward_batch_info import ForwardBatch
48
+ from sglang.srt.model_loader.weight_utils import default_weight_loader
49
+ from sglang.srt.utils import add_prefix, make_layers
50
+
51
+
52
+ class Starcoder2Attention(nn.Module):
53
+
54
+ def __init__(
55
+ self,
56
+ config: Starcoder2Config,
57
+ quant_config: Optional[QuantizationConfig] = None,
58
+ prefix: str = "",
59
+ layer_id: int = 0,
60
+ ):
61
+ super().__init__()
62
+ self.config = config
63
+
64
+ self.hidden_size = config.hidden_size
65
+ tp_size = get_tensor_model_parallel_world_size()
66
+ self.total_num_heads = config.num_attention_heads
67
+ assert self.total_num_heads % tp_size == 0
68
+ self.num_heads = self.total_num_heads // tp_size
69
+ self.total_num_kv_heads = config.num_key_value_heads
70
+ if self.total_num_kv_heads >= tp_size:
71
+ # Number of KV heads is greater than TP size, so we partition
72
+ # the KV heads across multiple tensor parallel GPUs.
73
+ assert self.total_num_kv_heads % tp_size == 0
74
+ else:
75
+ # Number of KV heads is less than TP size, so we replicate
76
+ # the KV heads across multiple tensor parallel GPUs.
77
+ assert tp_size % self.total_num_kv_heads == 0
78
+ self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
79
+ self.head_dim = self.hidden_size // self.total_num_heads
80
+ self.q_size = self.num_heads * self.head_dim
81
+ self.kv_size = self.num_kv_heads * self.head_dim
82
+ self.scaling = self.head_dim**-0.5
83
+ self.rope_theta = config.rope_theta
84
+ self.max_position_embeddings = config.max_position_embeddings
85
+ self.use_bias = config.use_bias
86
+
87
+ self.qkv_proj = QKVParallelLinear(
88
+ self.hidden_size,
89
+ self.head_dim,
90
+ self.total_num_heads,
91
+ self.total_num_kv_heads,
92
+ bias=self.use_bias,
93
+ quant_config=quant_config,
94
+ prefix=f"{prefix}.qkv_proj",
95
+ )
96
+ self.o_proj = RowParallelLinear(
97
+ self.total_num_heads * self.head_dim,
98
+ self.hidden_size,
99
+ bias=self.use_bias,
100
+ quant_config=quant_config,
101
+ prefix=f"{prefix}.o_proj",
102
+ )
103
+ self.rotary_emb = get_rope(
104
+ self.head_dim,
105
+ rotary_dim=self.head_dim,
106
+ max_position=self.max_position_embeddings,
107
+ base=int(self.rope_theta),
108
+ is_neox_style=True,
109
+ )
110
+ self.attn = RadixAttention(
111
+ self.num_heads,
112
+ self.head_dim,
113
+ self.scaling,
114
+ num_kv_heads=self.num_kv_heads,
115
+ layer_id=layer_id,
116
+ quant_config=quant_config,
117
+ prefix=f"{prefix}.attn",
118
+ )
119
+
120
+ def forward(
121
+ self,
122
+ positions: torch.Tensor,
123
+ hidden_states: torch.Tensor,
124
+ forward_batch: ForwardBatch,
125
+ ) -> torch.Tensor:
126
+ qkv, _ = self.qkv_proj(hidden_states)
127
+ q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
128
+ q, k = self.rotary_emb(positions, q, k)
129
+ attn_output = self.attn(q, k, v, forward_batch)
130
+ output, _ = self.o_proj(attn_output)
131
+ return output
132
+
133
+
134
+ class Starcoder2MLP(nn.Module):
135
+
136
+ def __init__(
137
+ self,
138
+ config: Starcoder2Config,
139
+ quant_config: Optional[QuantizationConfig] = None,
140
+ prefix: str = "",
141
+ ):
142
+ super().__init__()
143
+ self.c_fc = ColumnParallelLinear(
144
+ config.hidden_size,
145
+ config.intermediate_size,
146
+ bias=config.use_bias,
147
+ quant_config=quant_config,
148
+ prefix=f"{prefix}.c_fc",
149
+ )
150
+ self.c_proj = RowParallelLinear(
151
+ config.intermediate_size,
152
+ config.hidden_size,
153
+ bias=config.use_bias,
154
+ quant_config=quant_config,
155
+ prefix=f"{prefix}.c_proj",
156
+ )
157
+ self.act = get_act_fn(config.hidden_act)
158
+
159
+ def forward(
160
+ self,
161
+ hidden_states: torch.Tensor,
162
+ ) -> torch.Tensor:
163
+ hidden_states, _ = self.c_fc(hidden_states)
164
+ hidden_states = self.act(hidden_states)
165
+ hidden_states, _ = self.c_proj(hidden_states)
166
+ return hidden_states
167
+
168
+
169
+ class Starcoder2DecoderLayer(nn.Module):
170
+
171
+ def __init__(
172
+ self,
173
+ config: Starcoder2Config,
174
+ layer_id: int,
175
+ quant_config: Optional[QuantizationConfig] = None,
176
+ prefix: str = "",
177
+ ):
178
+ super().__init__()
179
+ self.hidden_size = config.hidden_size
180
+ self.self_attn = Starcoder2Attention(
181
+ config=config,
182
+ layer_id=layer_id,
183
+ quant_config=quant_config,
184
+ prefix=f"{prefix}.self_attn",
185
+ )
186
+ self.mlp = Starcoder2MLP(
187
+ config, quant_config=quant_config, prefix=f"{prefix}.mlp"
188
+ )
189
+ self.input_layernorm = nn.LayerNorm(config.hidden_size, eps=config.norm_epsilon)
190
+ self.post_attention_layernorm = nn.LayerNorm(
191
+ config.hidden_size, eps=config.norm_epsilon
192
+ )
193
+
194
+ def forward(
195
+ self,
196
+ positions: torch.Tensor,
197
+ hidden_states: torch.Tensor,
198
+ forward_batch: ForwardBatch,
199
+ ) -> torch.Tensor:
200
+ # Self Attention
201
+ residual = hidden_states
202
+ hidden_states = self.input_layernorm(hidden_states)
203
+ hidden_states = self.self_attn(
204
+ positions=positions,
205
+ hidden_states=hidden_states,
206
+ forward_batch=forward_batch,
207
+ )
208
+ hidden_states = residual + hidden_states
209
+
210
+ # Fully Connected
211
+ residual = hidden_states
212
+ hidden_states = self.post_attention_layernorm(hidden_states)
213
+ hidden_states = self.mlp(hidden_states)
214
+ hidden_states = residual + hidden_states
215
+
216
+ return hidden_states
217
+
218
+
219
+ class Starcoder2Model(nn.Module):
220
+
221
+ def __init__(
222
+ self,
223
+ config: Starcoder2Config,
224
+ quant_config: Optional[QuantizationConfig] = None,
225
+ prefix: str = "",
226
+ ):
227
+ super().__init__()
228
+
229
+ self.config = config
230
+ self.vocab_size = config.vocab_size
231
+
232
+ self.embed_tokens = VocabParallelEmbedding(
233
+ config.vocab_size,
234
+ config.hidden_size,
235
+ quant_config=quant_config,
236
+ prefix=f"{prefix}.embed_tokens",
237
+ )
238
+
239
+ pp_group = get_pp_group()
240
+ pp_size = pp_group.world_size
241
+ pp_rank = pp_group.rank
242
+ self.start_layer = pp_rank * config.num_hidden_layers // pp_size
243
+ self.end_layer = (pp_rank + 1) * config.num_hidden_layers // pp_size
244
+
245
+ self.layers = make_layers(
246
+ config.num_hidden_layers,
247
+ lambda idx, prefix: Starcoder2DecoderLayer(
248
+ config=config, quant_config=quant_config, layer_id=idx, prefix=prefix
249
+ ),
250
+ prefix=f"{prefix}.layers",
251
+ )
252
+ self.norm = nn.LayerNorm(config.hidden_size, eps=config.norm_epsilon)
253
+
254
+ def forward(
255
+ self,
256
+ input_ids: torch.Tensor,
257
+ positions: torch.Tensor,
258
+ forward_batch: ForwardBatch,
259
+ inputs_embeds: Optional[torch.Tensor] = None,
260
+ ) -> torch.Tensor:
261
+ if inputs_embeds is None:
262
+ hidden_states = self.embed_tokens(input_ids)
263
+ else:
264
+ hidden_states = inputs_embeds
265
+ for i in range(self.start_layer, self.end_layer):
266
+ layer = self.layers[i]
267
+ hidden_states = layer(
268
+ positions,
269
+ hidden_states,
270
+ forward_batch,
271
+ )
272
+ hidden_states = self.norm(hidden_states)
273
+ return hidden_states
274
+
275
+
276
+ class Starcoder2ForCausalLM(nn.Module):
277
+
278
+ def __init__(
279
+ self,
280
+ config: Starcoder2Config,
281
+ quant_config: Optional[QuantizationConfig] = None,
282
+ prefix: str = "",
283
+ ):
284
+ super().__init__()
285
+ self.config = config
286
+ self.model = Starcoder2Model(
287
+ config, quant_config, prefix=add_prefix("model", prefix)
288
+ )
289
+ self.vocab_size = config.vocab_size
290
+ self.unpadded_vocab_size = config.vocab_size
291
+ if config.tie_word_embeddings:
292
+ self.lm_head = self.model.embed_tokens
293
+ else:
294
+ self.unpadded_vocab_size = config.vocab_size
295
+ self.lm_head = ParallelLMHead(
296
+ self.unpadded_vocab_size,
297
+ config.hidden_size,
298
+ org_num_embeddings=config.vocab_size,
299
+ padding_size=DEFAULT_VOCAB_PADDING_SIZE,
300
+ quant_config=quant_config,
301
+ prefix=f"{prefix}.lm_head",
302
+ )
303
+ self.logits_processor = LogitsProcessor(config=config)
304
+
305
+ def forward(
306
+ self,
307
+ input_ids: torch.Tensor,
308
+ positions: torch.Tensor,
309
+ forward_batch: ForwardBatch,
310
+ inputs_embeds: Optional[torch.Tensor] = None,
311
+ ) -> torch.Tensor:
312
+ hidden_states = self.model(
313
+ input_ids=input_ids,
314
+ positions=positions,
315
+ forward_batch=forward_batch,
316
+ inputs_embeds=inputs_embeds,
317
+ )
318
+ return self.logits_processor(
319
+ input_ids, hidden_states, self.lm_head, forward_batch
320
+ )
321
+
322
+ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
323
+ stacked_params_mapping = [
324
+ # (param_name, shard_name, shard_id)
325
+ ("qkv_proj", "q_proj", "q"),
326
+ ("qkv_proj", "k_proj", "k"),
327
+ ("qkv_proj", "v_proj", "v"),
328
+ ]
329
+ params_dict = dict(self.named_parameters())
330
+
331
+ for name, loaded_weight in weights:
332
+ if "rotary_emb.inv_freqs" in name:
333
+ continue
334
+
335
+ is_stacked = False
336
+ for param_name, weight_name, shard_id in stacked_params_mapping:
337
+ if weight_name in name:
338
+ name = name.replace(weight_name, param_name)
339
+ param = params_dict[name]
340
+ weight_loader = getattr(
341
+ param, "weight_loader", default_weight_loader
342
+ )
343
+ weight_loader(param, loaded_weight, shard_id)
344
+ is_stacked = True
345
+ break
346
+ if is_stacked:
347
+ continue
348
+
349
+ param = params_dict.get(name)
350
+ if param is None:
351
+ continue
352
+
353
+ weight_loader = getattr(param, "weight_loader", default_weight_loader)
354
+ weight_loader(param, loaded_weight)
355
+
356
+
357
+ EntryClass = Starcoder2ForCausalLM
@@ -1,8 +1,7 @@
1
1
  import logging
2
2
  import math
3
- from collections.abc import Iterable
4
3
  from math import sqrt
5
- from typing import Any, Dict, Iterable, List, Literal, Optional, Tuple, TypedDict, Union
4
+ from typing import Any, Dict, Iterable, List, Optional, Tuple
6
5
 
7
6
  import torch
8
7
  from torch import nn
@@ -57,7 +56,6 @@ from sglang.srt.managers.schedule_batch import (
57
56
  Modality,
58
57
  MultimodalDataItem,
59
58
  MultimodalInputs,
60
- global_server_args_dict,
61
59
  )
62
60
  from sglang.srt.model_executor.forward_batch_info import ForwardBatch
63
61
  from sglang.srt.model_loader.weight_utils import default_weight_loader
@@ -300,7 +298,7 @@ class Step3TextDecoderLayer(nn.Module):
300
298
  # self.n_shared_experts = 1
301
299
  # self.num_fused_shared_experts = (
302
300
  # 0
303
- # if global_server_args_dict["disable_shared_experts_fusion"]
301
+ # if global_server_args.disable_shared_experts_fusion
304
302
  # else self.n_shared_experts
305
303
  # )
306
304
  self.num_fused_shared_experts = 0
@@ -774,7 +772,7 @@ class Step3VLForConditionalGeneration(nn.Module):
774
772
  # self.n_shared_experts = 1
775
773
  # self.num_fused_shared_experts = (
776
774
  # 0
777
- # if global_server_args_dict["disable_shared_experts_fusion"]
775
+ # if global_server_args.disable_shared_experts_fusion
778
776
  # else self.n_shared_experts
779
777
  # )
780
778
  self.num_fused_shared_experts = 0
@@ -66,8 +66,8 @@ from sglang.srt.model_executor.forward_batch_info import ForwardBatch
66
66
  from sglang.srt.model_loader.weight_utils import default_weight_loader
67
67
  from sglang.srt.utils import add_prefix
68
68
 
69
- tp_size = get_tensor_model_parallel_world_size()
70
- tp_rank = get_tensor_model_parallel_rank()
69
+ tp_size: Optional[int] = None
70
+ tp_rank: Optional[int] = None
71
71
 
72
72
 
73
73
  def gate_up_proj_weight_loader(
@@ -341,6 +341,13 @@ class LlamaModel(nn.Module):
341
341
  quant_config: Optional[QuantizationConfig] = None,
342
342
  ) -> None:
343
343
  super().__init__()
344
+
345
+ global tp_size, tp_rank
346
+ if tp_size is None:
347
+ tp_size = get_tensor_model_parallel_world_size()
348
+ if tp_rank is None:
349
+ tp_rank = get_tensor_model_parallel_rank()
350
+
344
351
  self.config = config
345
352
  self.padding_idx = config.pad_token_id
346
353
  self.vocab_size = config.vocab_size
@@ -0,0 +1,61 @@
1
+ # Copyright 2023-2025 SGLang Team
2
+ # Licensed under the Apache License, Version 2.0 (the "License");
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ #
6
+ # http://www.apache.org/licenses/LICENSE-2.0
7
+ #
8
+ # Unless required by applicable law or agreed to in writing, software
9
+ # distributed under the License is distributed on an "AS IS" BASIS,
10
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11
+ # See the License for the specific language governing permissions and
12
+ # limitations under the License.
13
+ # ==============================================================================
14
+
15
+ import torch
16
+
17
+ from sglang.srt.layers.radix_attention import RadixAttention
18
+ from sglang.srt.model_executor.forward_batch_info import ForwardBatch
19
+ from sglang.srt.utils import is_cuda
20
+
21
+ _is_cuda = is_cuda()
22
+
23
+
24
+ if _is_cuda:
25
+ from sgl_kernel import FusedSetKVBufferArg
26
+
27
+
28
+ def enable_fused_set_kv_buffer(forward_batch: ForwardBatch):
29
+ """Enable fused set_kv_buffer only on CUDA with bfloat16 KV cache."""
30
+ return (
31
+ _is_cuda
32
+ and hasattr(forward_batch.token_to_kv_pool, "dtype")
33
+ and forward_batch.token_to_kv_pool.dtype == torch.bfloat16
34
+ )
35
+
36
+
37
+ def create_fused_set_kv_buffer_arg(
38
+ value: torch.Tensor,
39
+ layer: RadixAttention,
40
+ forward_batch: ForwardBatch,
41
+ ):
42
+ layer_id = layer.layer_id
43
+ token_to_kv_pool = forward_batch.token_to_kv_pool
44
+
45
+ k_buffer = token_to_kv_pool.get_key_buffer(layer_id)
46
+ v_buffer = token_to_kv_pool.get_value_buffer(layer_id)
47
+
48
+ return FusedSetKVBufferArg(
49
+ value=value,
50
+ k_buffer=k_buffer.view(k_buffer.shape[0], -1),
51
+ v_buffer=v_buffer.view(v_buffer.shape[0], -1),
52
+ k_scale=layer.k_scale,
53
+ v_scale=layer.v_scale,
54
+ cache_loc=forward_batch.out_cache_loc,
55
+ )
56
+
57
+
58
+ def permute_inv(perm: torch.Tensor) -> torch.Tensor:
59
+ inv_perm = torch.empty_like(perm)
60
+ inv_perm[perm] = torch.arange(perm.numel(), device=perm.device, dtype=perm.dtype)
61
+ return inv_perm
@@ -155,7 +155,6 @@ class BaseMultimodalProcessor(ABC):
155
155
  ):
156
156
  self.hf_config = hf_config
157
157
  self._processor = _processor
158
- self.arch = hf_config.architectures[0]
159
158
  self.server_args = server_args
160
159
  self.transport_mode = transport_mode
161
160
 
@@ -179,6 +178,7 @@ class BaseMultimodalProcessor(ABC):
179
178
  "image_attention_mask": Modality.IMAGE,
180
179
  "image_emb_mask": Modality.IMAGE,
181
180
  "images_spatial_crop": Modality.IMAGE,
181
+ "images_crop": Modality.IMAGE,
182
182
  "tgt_size": Modality.IMAGE,
183
183
  "image_grid_hws": Modality.IMAGE,
184
184
  "aspect_ratio_ids": Modality.IMAGE,
@@ -191,6 +191,7 @@ class BaseMultimodalProcessor(ABC):
191
191
  "input_features": Modality.AUDIO,
192
192
  "input_features_mask": Modality.AUDIO,
193
193
  "audio_attention_mask": Modality.AUDIO,
194
+ "feature_attention_mask": Modality.AUDIO,
194
195
  # Video-related attributes
195
196
  "pixel_values_videos": Modality.VIDEO,
196
197
  "second_per_grid_ts": Modality.VIDEO,
@@ -222,6 +223,7 @@ class BaseMultimodalProcessor(ABC):
222
223
  if self._processor.__class__.__name__ in {
223
224
  "Gemma3nProcessor",
224
225
  "Qwen2AudioProcessor",
226
+ "Qwen3OmniMoeProcessor",
225
227
  }:
226
228
  # Note(Xinyuan): for gemma3n, ref: https://github.com/huggingface/transformers/blob/ccf2ca162e33f381e454cdb74bf4b41a51ab976d/src/transformers/models/gemma3n/processing_gemma3n.py#L107
227
229
  kwargs["audio"] = audios
@@ -234,19 +236,27 @@ class BaseMultimodalProcessor(ABC):
234
236
  and isinstance(processor.image_processor, BaseImageProcessorFast)
235
237
  and not self.server_args.disable_fast_image_processor
236
238
  ):
237
- kwargs["device"] = "cuda" if not _is_npu else "npu"
239
+ if not _is_npu:
240
+ kwargs["device"] = "cuda"
241
+ elif processor.__class__.__name__ not in {
242
+ "Qwen2_5_VLProcessor",
243
+ "Qwen3VLProcessor",
244
+ }:
245
+ # Note: for qwen-vl, processor has some reshape issue because of dims restriction on Ascend.
246
+ kwargs["device"] = "npu"
238
247
  result = processor.__call__(
239
248
  text=[input_text],
240
249
  padding=True,
241
250
  return_tensors="pt",
242
251
  **kwargs,
243
252
  )
244
- # move feature tensors to cpu
245
- for feature_name in self.FEATURE_NAMES:
246
- if feature_name in result and isinstance(
247
- result[feature_name], torch.Tensor
248
- ):
249
- result[feature_name] = result[feature_name].to("cpu")
253
+ if not self.server_args.keep_mm_feature_on_device:
254
+ # move feature tensors to cpu
255
+ for feature_name in self.FEATURE_NAMES:
256
+ if feature_name in result and isinstance(
257
+ result[feature_name], torch.Tensor
258
+ ):
259
+ result[feature_name] = result[feature_name].to("cpu")
250
260
 
251
261
  return result
252
262
 
@@ -304,7 +314,9 @@ class BaseMultimodalProcessor(ABC):
304
314
  try:
305
315
  if modality == Modality.IMAGE:
306
316
  img, _ = load_image(data)
307
- return img.convert("RGB") if discard_alpha_channel else img
317
+ if discard_alpha_channel and img.mode != "RGB":
318
+ img = img.convert("RGB")
319
+ return img
308
320
  elif modality == Modality.VIDEO:
309
321
  return load_video(data, frame_count_limit)
310
322
  elif modality == Modality.AUDIO:
@@ -0,0 +1,37 @@
1
+ from typing import List, Union
2
+
3
+ from sglang.srt.models.deepseek_ocr import DeepseekOCRForCausalLM
4
+ from sglang.srt.multimodal.processors.base_processor import (
5
+ BaseMultimodalProcessor,
6
+ MultimodalSpecialTokens,
7
+ )
8
+
9
+
10
+ class DeepseekOCRProcessor(BaseMultimodalProcessor):
11
+ models = [DeepseekOCRForCausalLM]
12
+
13
+ def __init__(self, hf_config, server_args, _processor, *args, **kwargs):
14
+ _processor.image_size = 640
15
+ super().__init__(hf_config, server_args, _processor, *args, **kwargs)
16
+ self.mm_tokens = MultimodalSpecialTokens(
17
+ image_token="<image>", image_token_id=self._processor.image_token_id
18
+ ).build(_processor)
19
+
20
+ async def process_mm_data_async(
21
+ self, image_data: List[Union[str, bytes]], input_text, *args, **kwargs
22
+ ):
23
+ base_output = self.load_mm_data(
24
+ prompt=input_text,
25
+ multimodal_tokens=self.mm_tokens,
26
+ image_data=image_data,
27
+ )
28
+
29
+ mm_items, input_ids, _ = self.process_and_combine_mm_data(
30
+ base_output, self.mm_tokens
31
+ )
32
+
33
+ return {
34
+ "input_ids": input_ids.tolist(),
35
+ "mm_items": mm_items,
36
+ "im_token_id": self.mm_tokens.image_token_id,
37
+ }
@@ -18,9 +18,6 @@
18
18
  # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
19
19
  from typing import List, Union
20
20
 
21
- import torch
22
-
23
- from sglang.srt.managers.schedule_batch import Modality, MultimodalDataItem
24
21
  from sglang.srt.models.deepseek_vl2 import DeepseekVL2ForCausalLM
25
22
  from sglang.srt.multimodal.processors.base_processor import (
26
23
  BaseMultimodalProcessor,
@@ -1,10 +1,10 @@
1
1
  import asyncio
2
- import math
3
2
  import re
4
3
  from typing import Dict, List, Union
5
4
 
6
5
  from PIL import Image
7
6
 
7
+ from sglang.srt.models.dots_ocr import DotsOCRForCausalLM
8
8
  from sglang.srt.models.dots_vlm import DotsVLMForCausalLM
9
9
  from sglang.srt.multimodal.processors.base_processor import (
10
10
  BaseMultimodalProcessor,
@@ -14,7 +14,7 @@ from sglang.srt.multimodal.processors.qwen_vl import resize_image_async
14
14
 
15
15
 
16
16
  class DotsVLMImageProcessor(BaseMultimodalProcessor):
17
- models = [DotsVLMForCausalLM]
17
+ models = [DotsVLMForCausalLM, DotsOCRForCausalLM]
18
18
 
19
19
  def __init__(self, hf_config, server_args, _processor, *args, **kwargs):
20
20
  super().__init__(hf_config, server_args, _processor, *args, **kwargs)
@@ -82,11 +82,9 @@ class DotsVLMImageProcessor(BaseMultimodalProcessor):
82
82
  for image in base_output.images
83
83
  ]
84
84
  base_output.images = await asyncio.gather(*resize_tasks)
85
-
86
85
  combined_mm_item, input_ids, _ = self.process_and_combine_mm_data(
87
86
  base_output, self.mm_tokens
88
87
  )
89
-
90
88
  if combined_mm_item is None:
91
89
  return None
92
90
 
@@ -1,4 +1,3 @@
1
- import re
2
1
  from typing import List, Union
3
2
 
4
3
  from decord import VideoReader
@@ -9,10 +8,7 @@ from sglang.srt.models.glm4v_moe import Glm4vMoeForConditionalGeneration
9
8
  from sglang.srt.multimodal.processors.base_processor import (
10
9
  BaseMultimodalProcessor as SGLangBaseProcessor,
11
10
  )
12
- from sglang.srt.multimodal.processors.base_processor import (
13
- BaseMultiModalProcessorOutput,
14
- MultimodalSpecialTokens,
15
- )
11
+ from sglang.srt.multimodal.processors.base_processor import MultimodalSpecialTokens
16
12
 
17
13
 
18
14
  class Glm4vImageProcessor(SGLangBaseProcessor):