sglang 0.5.2rc2__py3-none-any.whl → 0.5.3rc2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (377)
  1. sglang/bench_one_batch.py +7 -9
  2. sglang/bench_one_batch_server.py +330 -31
  3. sglang/bench_serving.py +267 -32
  4. sglang/global_config.py +2 -2
  5. sglang/lang/backend/runtime_endpoint.py +1 -1
  6. sglang/launch_server.py +14 -0
  7. sglang/profiler.py +2 -2
  8. sglang/srt/batch_invariant_ops/__init__.py +27 -0
  9. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +549 -0
  10. sglang/srt/configs/__init__.py +8 -0
  11. sglang/srt/configs/device_config.py +3 -1
  12. sglang/srt/configs/dots_ocr.py +64 -0
  13. sglang/srt/configs/dots_vlm.py +139 -0
  14. sglang/srt/configs/falcon_h1.py +360 -0
  15. sglang/srt/configs/load_config.py +9 -0
  16. sglang/srt/configs/model_config.py +181 -82
  17. sglang/srt/configs/qwen3_next.py +326 -0
  18. sglang/srt/configs/qwen3_vl.py +586 -0
  19. sglang/srt/connector/__init__.py +8 -1
  20. sglang/srt/connector/remote_instance.py +82 -0
  21. sglang/srt/constrained/base_grammar_backend.py +49 -12
  22. sglang/srt/constrained/llguidance_backend.py +0 -1
  23. sglang/srt/constrained/outlines_backend.py +0 -1
  24. sglang/srt/constrained/outlines_jump_forward.py +1 -1
  25. sglang/srt/constrained/xgrammar_backend.py +30 -9
  26. sglang/srt/custom_op.py +11 -1
  27. sglang/srt/debug_utils/dump_comparator.py +81 -44
  28. sglang/srt/debug_utils/dump_loader.py +97 -0
  29. sglang/srt/debug_utils/dumper.py +21 -6
  30. sglang/srt/debug_utils/text_comparator.py +73 -11
  31. sglang/srt/disaggregation/ascend/conn.py +2 -2
  32. sglang/srt/disaggregation/ascend/transfer_engine.py +47 -9
  33. sglang/srt/disaggregation/base/conn.py +1 -1
  34. sglang/srt/disaggregation/common/conn.py +279 -108
  35. sglang/srt/disaggregation/decode.py +71 -19
  36. sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
  37. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +29 -17
  38. sglang/srt/disaggregation/fake/conn.py +1 -1
  39. sglang/srt/disaggregation/mini_lb.py +6 -445
  40. sglang/srt/disaggregation/mooncake/conn.py +55 -537
  41. sglang/srt/disaggregation/nixl/conn.py +326 -53
  42. sglang/srt/disaggregation/prefill.py +36 -17
  43. sglang/srt/disaggregation/utils.py +40 -54
  44. sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
  45. sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
  46. sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
  47. sglang/srt/distributed/parallel_state.py +156 -80
  48. sglang/srt/entrypoints/engine.py +59 -18
  49. sglang/srt/entrypoints/grpc_request_manager.py +855 -0
  50. sglang/srt/entrypoints/grpc_server.py +810 -0
  51. sglang/srt/entrypoints/http_server.py +130 -59
  52. sglang/srt/entrypoints/openai/protocol.py +112 -4
  53. sglang/srt/entrypoints/openai/serving_base.py +65 -3
  54. sglang/srt/entrypoints/openai/serving_chat.py +204 -55
  55. sglang/srt/entrypoints/openai/serving_completions.py +14 -3
  56. sglang/srt/entrypoints/openai/serving_embedding.py +9 -3
  57. sglang/srt/entrypoints/openai/serving_rerank.py +3 -1
  58. sglang/srt/entrypoints/openai/serving_responses.py +48 -3
  59. sglang/srt/entrypoints/openai/serving_score.py +1 -0
  60. sglang/srt/environ.py +285 -0
  61. sglang/srt/eplb/eplb_manager.py +2 -2
  62. sglang/srt/eplb/expert_distribution.py +26 -13
  63. sglang/srt/eplb/expert_location.py +38 -8
  64. sglang/srt/eplb/expert_location_updater.py +1 -1
  65. sglang/srt/function_call/base_format_detector.py +3 -6
  66. sglang/srt/function_call/ebnf_composer.py +11 -9
  67. sglang/srt/function_call/function_call_parser.py +9 -2
  68. sglang/srt/function_call/glm4_moe_detector.py +4 -4
  69. sglang/srt/function_call/gpt_oss_detector.py +23 -0
  70. sglang/srt/function_call/json_array_parser.py +63 -0
  71. sglang/srt/function_call/kimik2_detector.py +17 -4
  72. sglang/srt/function_call/qwen3_coder_detector.py +1 -1
  73. sglang/srt/function_call/utils.py +96 -5
  74. sglang/srt/grpc/__init__.py +1 -0
  75. sglang/srt/grpc/compile_proto.py +245 -0
  76. sglang/srt/grpc/sglang_scheduler_pb2.py +111 -0
  77. sglang/srt/grpc/sglang_scheduler_pb2.pyi +434 -0
  78. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +239 -0
  79. sglang/srt/layers/activation.py +143 -9
  80. sglang/srt/layers/attention/aiter_backend.py +14 -15
  81. sglang/srt/layers/attention/ascend_backend.py +115 -9
  82. sglang/srt/layers/attention/attention_registry.py +206 -0
  83. sglang/srt/layers/attention/base_attn_backend.py +12 -3
  84. sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
  85. sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
  86. sglang/srt/layers/attention/fla/chunk.py +242 -0
  87. sglang/srt/layers/attention/fla/chunk_delta_h.py +314 -0
  88. sglang/srt/layers/attention/fla/chunk_o.py +178 -0
  89. sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +151 -0
  90. sglang/srt/layers/attention/fla/cumsum.py +300 -0
  91. sglang/srt/layers/attention/fla/fused_recurrent.py +640 -0
  92. sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +232 -0
  93. sglang/srt/layers/attention/fla/index.py +37 -0
  94. sglang/srt/layers/attention/fla/l2norm.py +150 -0
  95. sglang/srt/layers/attention/fla/layernorm_gated.py +326 -0
  96. sglang/srt/layers/attention/fla/op.py +66 -0
  97. sglang/srt/layers/attention/fla/solve_tril.py +465 -0
  98. sglang/srt/layers/attention/fla/utils.py +331 -0
  99. sglang/srt/layers/attention/fla/wy_fast.py +158 -0
  100. sglang/srt/layers/attention/flashattention_backend.py +41 -8
  101. sglang/srt/layers/attention/flashinfer_backend.py +118 -198
  102. sglang/srt/layers/attention/flashinfer_mla_backend.py +27 -27
  103. sglang/srt/layers/attention/flashmla_backend.py +7 -5
  104. sglang/srt/layers/attention/hybrid_attn_backend.py +68 -53
  105. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +602 -0
  106. sglang/srt/layers/attention/intel_amx_backend.py +3 -0
  107. sglang/srt/layers/attention/mamba/causal_conv1d.py +129 -0
  108. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +969 -0
  109. sglang/srt/layers/attention/mamba/mamba.py +629 -0
  110. sglang/srt/layers/attention/mamba/mamba_utils.py +81 -0
  111. sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
  112. sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
  113. sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
  114. sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +264 -0
  115. sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +622 -0
  116. sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +757 -0
  117. sglang/srt/layers/attention/mamba/ops/ssd_combined.py +262 -0
  118. sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +275 -0
  119. sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
  120. sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
  121. sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
  122. sglang/srt/layers/attention/nsa/nsa_indexer.py +761 -0
  123. sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
  124. sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
  125. sglang/srt/layers/attention/nsa/transform_index.py +144 -0
  126. sglang/srt/layers/attention/nsa/utils.py +24 -0
  127. sglang/srt/layers/attention/nsa_backend.py +887 -0
  128. sglang/srt/layers/attention/tbo_backend.py +6 -6
  129. sglang/srt/layers/attention/torch_flex_backend.py +325 -0
  130. sglang/srt/layers/attention/torch_native_backend.py +12 -6
  131. sglang/srt/layers/attention/triton_backend.py +57 -7
  132. sglang/srt/layers/attention/trtllm_mha_backend.py +5 -7
  133. sglang/srt/layers/attention/trtllm_mla_backend.py +276 -39
  134. sglang/srt/layers/attention/vision.py +58 -0
  135. sglang/srt/layers/attention/wave_backend.py +4 -4
  136. sglang/srt/layers/attention/wave_ops/decode_attention.py +2 -4
  137. sglang/srt/layers/attention/wave_ops/extend_attention.py +1 -3
  138. sglang/srt/layers/communicator.py +8 -0
  139. sglang/srt/layers/dp_attention.py +41 -2
  140. sglang/srt/layers/elementwise.py +3 -1
  141. sglang/srt/layers/layernorm.py +34 -15
  142. sglang/srt/layers/linear.py +55 -7
  143. sglang/srt/layers/logits_processor.py +44 -12
  144. sglang/srt/layers/moe/__init__.py +2 -1
  145. sglang/srt/layers/moe/cutlass_w4a8_moe.py +3 -3
  146. sglang/srt/layers/moe/ep_moe/kernels.py +2 -2
  147. sglang/srt/layers/moe/ep_moe/layer.py +256 -63
  148. sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +183 -0
  149. sglang/srt/layers/moe/fused_moe_native.py +5 -3
  150. sglang/srt/layers/moe/fused_moe_triton/configs/{triton_3_4_0/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json } +35 -35
  151. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=352,device_name=NVIDIA_RTX_5880_Ada_Generation,dtype=fp8_w8a8.json +146 -0
  152. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  153. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=512,device_name=NVIDIA_H20.json +146 -0
  154. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  155. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
  156. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
  157. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  158. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
  159. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
  160. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
  161. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  162. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
  163. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -2
  164. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +7 -3
  165. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +23 -20
  166. sglang/srt/layers/moe/fused_moe_triton/layer.py +71 -70
  167. sglang/srt/layers/moe/moe_runner/__init__.py +2 -1
  168. sglang/srt/layers/moe/moe_runner/base.py +274 -1
  169. sglang/srt/layers/moe/moe_runner/runner.py +80 -0
  170. sglang/srt/layers/moe/moe_runner/triton.py +448 -0
  171. sglang/srt/layers/moe/token_dispatcher/__init__.py +16 -4
  172. sglang/srt/layers/moe/token_dispatcher/{base_dispatcher.py → base.py} +67 -17
  173. sglang/srt/layers/moe/token_dispatcher/deepep.py +118 -56
  174. sglang/srt/layers/moe/token_dispatcher/standard.py +44 -2
  175. sglang/srt/layers/moe/topk.py +30 -9
  176. sglang/srt/layers/moe/utils.py +22 -6
  177. sglang/srt/layers/parameter.py +23 -6
  178. sglang/srt/layers/quantization/awq.py +19 -7
  179. sglang/srt/layers/quantization/base_config.py +11 -6
  180. sglang/srt/layers/quantization/blockwise_int8.py +38 -27
  181. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +1 -0
  182. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +50 -30
  183. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +2 -0
  184. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +13 -1
  185. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
  186. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +2 -10
  187. sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +27 -0
  188. sglang/srt/layers/quantization/fp8.py +78 -49
  189. sglang/srt/layers/quantization/fp8_utils.py +51 -32
  190. sglang/srt/layers/quantization/gptq.py +25 -17
  191. sglang/srt/layers/quantization/modelopt_quant.py +190 -55
  192. sglang/srt/layers/quantization/moe_wna16.py +21 -18
  193. sglang/srt/layers/quantization/mxfp4.py +74 -42
  194. sglang/srt/layers/quantization/quark/quark_moe.py +48 -30
  195. sglang/srt/layers/quantization/unquant.py +135 -47
  196. sglang/srt/layers/quantization/w4afp8.py +26 -17
  197. sglang/srt/layers/quantization/w8a8_fp8.py +35 -20
  198. sglang/srt/layers/quantization/w8a8_int8.py +91 -41
  199. sglang/srt/layers/rotary_embedding.py +78 -31
  200. sglang/srt/layers/sampler.py +213 -21
  201. sglang/srt/layers/utils.py +23 -0
  202. sglang/srt/lora/backend/base_backend.py +50 -8
  203. sglang/srt/lora/backend/chunked_backend.py +348 -0
  204. sglang/srt/lora/backend/triton_backend.py +99 -5
  205. sglang/srt/lora/layers.py +32 -0
  206. sglang/srt/lora/lora.py +8 -3
  207. sglang/srt/lora/lora_manager.py +52 -118
  208. sglang/srt/lora/mem_pool.py +25 -11
  209. sglang/srt/lora/triton_ops/__init__.py +4 -0
  210. sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
  211. sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +174 -0
  212. sglang/srt/lora/utils.py +22 -11
  213. sglang/srt/managers/async_dynamic_batch_tokenizer.py +170 -0
  214. sglang/srt/managers/cache_controller.py +199 -301
  215. sglang/srt/managers/data_parallel_controller.py +115 -80
  216. sglang/srt/managers/detokenizer_manager.py +19 -15
  217. sglang/srt/managers/disagg_service.py +46 -0
  218. sglang/srt/managers/io_struct.py +340 -109
  219. sglang/srt/managers/mm_utils.py +44 -6
  220. sglang/srt/managers/multi_tokenizer_mixin.py +357 -407
  221. sglang/srt/managers/multimodal_processor.py +1 -2
  222. sglang/srt/managers/overlap_utils.py +53 -0
  223. sglang/srt/managers/schedule_batch.py +240 -138
  224. sglang/srt/managers/schedule_policy.py +144 -17
  225. sglang/srt/managers/scheduler.py +502 -209
  226. sglang/srt/managers/scheduler_input_blocker.py +1 -1
  227. sglang/srt/managers/scheduler_metrics_mixin.py +99 -126
  228. sglang/srt/managers/scheduler_output_processor_mixin.py +75 -22
  229. sglang/srt/managers/scheduler_profiler_mixin.py +6 -6
  230. sglang/srt/managers/scheduler_update_weights_mixin.py +7 -0
  231. sglang/srt/managers/tokenizer_communicator_mixin.py +675 -0
  232. sglang/srt/managers/tokenizer_manager.py +320 -632
  233. sglang/srt/managers/tp_worker.py +81 -22
  234. sglang/srt/managers/tp_worker_overlap_thread.py +71 -56
  235. sglang/srt/managers/utils.py +1 -45
  236. sglang/srt/mem_cache/allocator.py +14 -20
  237. sglang/srt/mem_cache/allocator_ascend.py +41 -27
  238. sglang/srt/mem_cache/base_prefix_cache.py +1 -1
  239. sglang/srt/mem_cache/chunk_cache.py +8 -1
  240. sglang/srt/mem_cache/evict_policy.py +23 -0
  241. sglang/srt/mem_cache/hicache_storage.py +43 -24
  242. sglang/srt/mem_cache/hiradix_cache.py +222 -75
  243. sglang/srt/mem_cache/memory_pool.py +535 -58
  244. sglang/srt/mem_cache/memory_pool_host.py +239 -228
  245. sglang/srt/mem_cache/radix_cache.py +222 -73
  246. sglang/srt/mem_cache/radix_cache_cpp.py +11 -8
  247. sglang/srt/mem_cache/storage/__init__.py +10 -0
  248. sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +151 -0
  249. sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +109 -0
  250. sglang/srt/mem_cache/storage/backend_factory.py +223 -0
  251. sglang/srt/mem_cache/storage/eic/eic_storage.py +778 -0
  252. sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
  253. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +164 -0
  254. sglang/srt/mem_cache/storage/hf3fs/{client_hf3fs.py → hf3fs_usrbio_client.py} +5 -1
  255. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +259 -62
  256. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +284 -0
  257. sglang/srt/mem_cache/storage/lmcache/unit_test.py +121 -0
  258. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +166 -17
  259. sglang/srt/mem_cache/swa_radix_cache.py +25 -36
  260. sglang/srt/metrics/collector.py +511 -132
  261. sglang/srt/metrics/func_timer.py +2 -7
  262. sglang/srt/metrics/startup_func_log_and_timer.py +150 -0
  263. sglang/srt/metrics/utils.py +8 -1
  264. sglang/srt/model_executor/cpu_graph_runner.py +640 -0
  265. sglang/srt/model_executor/cuda_graph_runner.py +52 -37
  266. sglang/srt/model_executor/forward_batch_info.py +82 -40
  267. sglang/srt/model_executor/model_runner.py +432 -157
  268. sglang/srt/model_executor/npu_graph_runner.py +12 -5
  269. sglang/srt/model_loader/__init__.py +9 -3
  270. sglang/srt/model_loader/loader.py +133 -5
  271. sglang/srt/model_loader/remote_instance_weight_loader_utils.py +69 -0
  272. sglang/srt/model_loader/weight_utils.py +158 -3
  273. sglang/srt/models/apertus.py +686 -0
  274. sglang/srt/models/bailing_moe.py +820 -217
  275. sglang/srt/models/bailing_moe_nextn.py +168 -0
  276. sglang/srt/models/deepseek_nextn.py +6 -1
  277. sglang/srt/models/deepseek_v2.py +607 -130
  278. sglang/srt/models/dots_ocr.py +173 -0
  279. sglang/srt/models/dots_vlm.py +174 -0
  280. sglang/srt/models/dots_vlm_vit.py +337 -0
  281. sglang/srt/models/ernie4.py +1 -1
  282. sglang/srt/models/falcon_h1.py +576 -0
  283. sglang/srt/models/gemma3_causal.py +0 -2
  284. sglang/srt/models/gemma3_mm.py +1 -1
  285. sglang/srt/models/gemma3n_mm.py +2 -2
  286. sglang/srt/models/glm4_moe.py +4 -4
  287. sglang/srt/models/glm4_moe_nextn.py +2 -2
  288. sglang/srt/models/glm4v.py +5 -3
  289. sglang/srt/models/glm4v_moe.py +4 -1
  290. sglang/srt/models/gpt_oss.py +8 -31
  291. sglang/srt/models/kimi_vl_moonvit.py +2 -2
  292. sglang/srt/models/llama.py +4 -0
  293. sglang/srt/models/llama4.py +9 -0
  294. sglang/srt/models/llama_eagle3.py +13 -0
  295. sglang/srt/models/longcat_flash.py +3 -3
  296. sglang/srt/models/longcat_flash_nextn.py +1 -1
  297. sglang/srt/models/mllama4.py +40 -4
  298. sglang/srt/models/opt.py +637 -0
  299. sglang/srt/models/qwen2_5_vl.py +29 -5
  300. sglang/srt/models/qwen2_audio.py +1 -1
  301. sglang/srt/models/qwen2_moe.py +120 -13
  302. sglang/srt/models/qwen2_vl.py +1 -1
  303. sglang/srt/models/qwen3.py +18 -3
  304. sglang/srt/models/qwen3_moe.py +32 -4
  305. sglang/srt/models/qwen3_next.py +1069 -0
  306. sglang/srt/models/qwen3_next_mtp.py +112 -0
  307. sglang/srt/models/qwen3_vl.py +787 -0
  308. sglang/srt/models/qwen3_vl_moe.py +471 -0
  309. sglang/srt/models/registry.py +15 -3
  310. sglang/srt/models/sarashina2_vision.py +269 -0
  311. sglang/srt/models/solar.py +505 -0
  312. sglang/srt/models/starcoder2.py +357 -0
  313. sglang/srt/models/step3_vl.py +1 -1
  314. sglang/srt/models/torch_native_llama.py +9 -2
  315. sglang/srt/models/utils.py +51 -0
  316. sglang/srt/multimodal/processors/base_processor.py +15 -7
  317. sglang/srt/multimodal/processors/dots_vlm.py +98 -0
  318. sglang/srt/multimodal/processors/glm4v.py +9 -9
  319. sglang/srt/multimodal/processors/internvl.py +153 -129
  320. sglang/srt/multimodal/processors/qwen_vl.py +23 -6
  321. sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
  322. sglang/srt/offloader.py +27 -3
  323. sglang/srt/parser/jinja_template_utils.py +6 -0
  324. sglang/srt/sampling/sampling_batch_info.py +38 -17
  325. sglang/srt/sampling/sampling_params.py +7 -0
  326. sglang/srt/server_args.py +966 -267
  327. sglang/srt/server_args_config_parser.py +146 -0
  328. sglang/srt/single_batch_overlap.py +151 -0
  329. sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
  330. sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
  331. sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
  332. sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
  333. sglang/srt/speculative/cpp_ngram/param.h +125 -0
  334. sglang/srt/speculative/cpp_ngram/queue.h +71 -0
  335. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +7 -1
  336. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +13 -2
  337. sglang/srt/speculative/{eagle_utils.py → eagle_info.py} +207 -757
  338. sglang/srt/speculative/eagle_worker.py +99 -28
  339. sglang/srt/speculative/ngram_utils.py +428 -0
  340. sglang/srt/speculative/ngram_worker.py +245 -0
  341. sglang/srt/speculative/spec_info.py +52 -0
  342. sglang/srt/speculative/spec_utils.py +606 -0
  343. sglang/srt/speculative/standalone_worker.py +109 -0
  344. sglang/srt/torch_memory_saver_adapter.py +5 -7
  345. sglang/srt/tracing/trace.py +578 -0
  346. sglang/srt/two_batch_overlap.py +8 -5
  347. sglang/srt/utils/__init__.py +2 -0
  348. sglang/srt/{utils.py → utils/common.py} +433 -77
  349. sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +53 -5
  350. sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
  351. sglang/srt/utils/rpd_utils.py +452 -0
  352. sglang/srt/utils/slow_rank_detector.py +71 -0
  353. sglang/srt/warmup.py +8 -4
  354. sglang/srt/weight_sync/utils.py +2 -2
  355. sglang/test/attention/test_trtllm_mla_backend.py +169 -5
  356. sglang/test/get_logits_ut.py +57 -0
  357. sglang/test/run_eval.py +79 -11
  358. sglang/test/runners.py +5 -1
  359. sglang/test/simple_eval_common.py +5 -2
  360. sglang/test/simple_eval_mmmu_vlm.py +441 -0
  361. sglang/test/test_block_fp8.py +2 -2
  362. sglang/test/test_cutlass_moe.py +24 -6
  363. sglang/test/test_deterministic.py +297 -0
  364. sglang/test/test_disaggregation_utils.py +77 -0
  365. sglang/test/test_fp4_moe.py +370 -1
  366. sglang/test/test_programs.py +1 -1
  367. sglang/test/test_utils.py +383 -5
  368. sglang/utils.py +21 -1
  369. sglang/version.py +1 -1
  370. {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc2.dist-info}/METADATA +69 -124
  371. {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc2.dist-info}/RECORD +375 -245
  372. sglang/srt/disaggregation/launch_lb.py +0 -118
  373. sglang/srt/mem_cache/lora_radix_cache.py +0 -421
  374. /sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
  375. {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc2.dist-info}/WHEEL +0 -0
  376. {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc2.dist-info}/licenses/LICENSE +0 -0
  377. {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc2.dist-info}/top_level.txt +0 -0
@@ -22,16 +22,17 @@ from typing import List, Optional, Set, Union
 import torch
 from transformers import PretrainedConfig
 
-from sglang.srt.hf_transformers_utils import (
+from sglang.srt.environ import envs
+from sglang.srt.layers.quantization import QUANTIZATION_METHODS
+from sglang.srt.server_args import ServerArgs
+from sglang.srt.utils import is_hip, retry
+from sglang.srt.utils.hf_transformers_utils import (
     get_config,
     get_context_length,
     get_generation_config,
     get_hf_text_config,
     get_sparse_attention_config,
 )
-from sglang.srt.layers.quantization import QUANTIZATION_METHODS
-from sglang.srt.server_args import ServerArgs
-from sglang.srt.utils import get_bool_env_var, is_hip
 from sglang.utils import is_in_ci
 
 logger = logging.getLogger(__name__)
@@ -48,6 +49,30 @@ class ModelImpl(str, Enum):
     TRANSFORMERS = "transformers"
 
 
+def is_deepseek_nsa(config: PretrainedConfig) -> bool:
+    return (
+        config.architectures is not None
+        and config.architectures[0]
+        in ["DeepseekV3ForCausalLM", "DeepseekV32ForCausalLM"]
+        and getattr(config, "index_topk", None) is not None
+    )
+
+
+def get_nsa_index_head_dim(config: PretrainedConfig) -> int:
+    assert is_deepseek_nsa(config)
+    return config.index_head_dim
+
+
+def get_nsa_index_topk(config: PretrainedConfig) -> int:
+    assert is_deepseek_nsa(config)
+    return config.index_topk
+
+
+def get_nsa_index_n_heads(config: PretrainedConfig) -> int:
+    assert is_deepseek_nsa(config)
+    return config.index_n_heads
+
+
 class ModelConfig:
     def __init__(
         self,
@@ -69,14 +94,15 @@ class ModelConfig:
         self.model_path = model_path
         self.revision = revision
         self.quantization = quantization
+        self.is_draft_model = is_draft_model
         self.model_impl = model_impl
 
-        self.maybe_pull_model_tokenizer_from_remote()
+        # Get hf config
+        self._maybe_pull_model_tokenizer_from_remote()
         self.model_override_args = json.loads(model_override_args)
         kwargs = {}
         if override_config_file and override_config_file.strip():
             kwargs["_configuration_file"] = override_config_file.strip()
-
         self.hf_config = get_config(
             self.model_path,
             trust_remote_code=trust_remote_code,
@@ -84,7 +110,7 @@ class ModelConfig:
84
110
  model_override_args=self.model_override_args,
85
111
  **kwargs,
86
112
  )
87
-
113
+ self.hf_text_config = get_hf_text_config(self.hf_config)
88
114
  self.hf_generation_config = get_generation_config(
89
115
  self.model_path,
90
116
  trust_remote_code=trust_remote_code,
@@ -92,7 +118,25 @@ class ModelConfig:
92
118
  **kwargs,
93
119
  )
94
120
 
95
- self.hf_text_config = get_hf_text_config(self.hf_config)
121
+ # Set enable_multimodal
122
+ if enable_multimodal is None:
123
+ mm_disabled_models = [
124
+ "Gemma3ForConditionalGeneration",
125
+ "Llama4ForConditionalGeneration",
126
+ "Step3VLForConditionalGeneration",
127
+ ]
128
+ if self.hf_config.architectures[0] in mm_disabled_models:
129
+ enable_multimodal = False
130
+ logger.info(
131
+ f"Multimodal is disabled for {self.hf_config.model_type}. To enable it, set --enable-multimodal."
132
+ )
133
+ else:
134
+ enable_multimodal = True
135
+
136
+ # Config draft model
137
+ self._config_draft_model()
138
+
139
+ # Check model type
96
140
  self.attention_chunk_size = getattr(
97
141
  self.hf_text_config, "attention_chunk_size", None
98
142
  )
@@ -108,20 +152,70 @@ class ModelConfig:
108
152
  self.hf_config.architectures, self.hf_text_config.num_hidden_layers
109
153
  )
110
154
  )
155
+ self.is_generation = is_generation_model(
156
+ self.hf_config.architectures, is_embedding
157
+ )
158
+ self.is_multimodal = enable_multimodal and is_multimodal_model(
159
+ self.hf_config.architectures
160
+ )
161
+ self.is_multimodal_gen = enable_multimodal and is_multimodal_gen_model(
162
+ self.hf_config.architectures
163
+ )
164
+ self.is_image_gen = enable_multimodal and is_image_gen_model(
165
+ self.hf_config.architectures
166
+ )
167
+ self.is_audio_model = enable_multimodal and is_audio_model(
168
+ self.hf_config.architectures
169
+ )
170
+ self.is_multimodal_chunked_prefill_supported = (
171
+ enable_multimodal
172
+ and is_multimodal_chunked_prefill_supported(self.hf_config.architectures)
173
+ )
174
+ self.is_encoder_decoder = is_encoder_decoder_model(self.hf_config.architectures)
175
+ self.dtype = _get_and_verify_dtype(self.hf_text_config, dtype)
111
176
 
112
- if enable_multimodal is None:
113
- mm_disabled_models = [
114
- "Gemma3ForConditionalGeneration",
115
- "Llama4ForConditionalGeneration",
116
- "Step3VLForConditionalGeneration",
117
- ]
118
- if self.hf_config.architectures[0] in mm_disabled_models:
119
- enable_multimodal = False
120
- logger.info(
121
- f"Multimodal is disabled for {self.hf_config.model_type}. To enable it, set --enable-multimodal."
122
- )
123
- else:
124
- enable_multimodal = True
177
+ # Derive context length and model shapes
178
+ self._derive_context_length(context_length)
179
+ self._derive_model_shapes()
180
+
181
+ # Verify quantization
182
+ self._verify_quantization()
183
+
184
+ # Verify dual-chunk attention config
185
+ self._verify_dual_chunk_attention_config()
186
+
187
+ # Cache attributes
188
+ self.hf_eos_token_id = self._get_hf_eos_token_id()
189
+
190
+ # multimodal
191
+ self.image_token_id = getattr(
192
+ self.hf_config, "image_token_id", None
193
+ ) or getattr(self.hf_config, "image_token_index", None)
194
+
195
+ @staticmethod
196
+ def from_server_args(
197
+ server_args: ServerArgs,
198
+ model_path: str = None,
199
+ model_revision: str = None,
200
+ **kwargs,
201
+ ):
202
+ return ModelConfig(
203
+ model_path=model_path or server_args.model_path,
204
+ trust_remote_code=server_args.trust_remote_code,
205
+ revision=model_revision or server_args.revision,
206
+ context_length=server_args.context_length,
207
+ model_override_args=server_args.json_model_override_args,
208
+ is_embedding=server_args.is_embedding,
209
+ enable_multimodal=server_args.enable_multimodal,
210
+ dtype=server_args.dtype,
211
+ quantization=server_args.quantization,
212
+ hybrid_kvcache_ratio=server_args.hybrid_kvcache_ratio,
213
+ model_impl=server_args.model_impl,
214
+ **kwargs,
215
+ )
216
+
217
+ def _config_draft_model(self):
218
+ is_draft_model = self.is_draft_model
125
219
 
126
220
  if (
127
221
  is_draft_model
@@ -141,37 +235,25 @@ class ModelConfig:
141
235
 
142
236
  if is_draft_model and self.hf_config.architectures[0] == "MiMoForCausalLM":
143
237
  self.hf_config.architectures[0] = "MiMoMTP"
238
+ if is_draft_model and self.hf_config.architectures[0] in [
239
+ "BailingMoeV2ForCausalLM",
240
+ "BailingMoeForCausalLM",
241
+ ]:
242
+ self.hf_config.architectures[0] = "BailingMoeForCausalLMNextN"
144
243
  if (
145
244
  is_draft_model
146
245
  and self.hf_config.architectures[0] == "Ernie4_5_MoeForCausalLM"
147
246
  ):
148
247
  self.hf_config.architectures[0] = "Ernie4_5_MoeForCausalLMMTP"
149
248
 
150
- # Check model type
151
- self.is_generation = is_generation_model(
152
- self.hf_config.architectures, is_embedding
153
- )
154
- self.is_multimodal = enable_multimodal and is_multimodal_model(
155
- self.hf_config.architectures
156
- )
157
- self.is_multimodal_gen = enable_multimodal and is_multimodal_gen_model(
158
- self.hf_config.architectures
159
- )
160
- self.is_image_gen = enable_multimodal and is_image_gen_model(
161
- self.hf_config.architectures
162
- )
163
- self.is_audio_model = enable_multimodal and is_audio_model(
164
- self.hf_config.architectures
165
- )
166
- self.is_multimodal_chunked_prefill_supported = (
167
- enable_multimodal
168
- and is_multimodal_chunked_prefill_supported(self.hf_config.architectures)
169
- )
170
- self.is_encoder_decoder = is_encoder_decoder_model(self.hf_config.architectures)
171
- self.dtype = _get_and_verify_dtype(self.hf_text_config, dtype)
249
+ if is_draft_model and self.hf_config.architectures[0] == "Qwen3NextForCausalLM":
250
+ self.hf_config.architectures[0] = "Qwen3NextForCausalLMMTP"
251
+ self.hf_config.num_nextn_predict_layers = 1
172
252
 
173
- # Derive context length
253
+ def _derive_context_length(self, context_length: int):
254
+ is_draft_model = self.is_draft_model
174
255
  derived_context_len = get_context_length(self.hf_text_config)
256
+
175
257
  if context_length is not None:
176
258
  if context_length > derived_context_len:
177
259
  reason = "Target model's" if is_draft_model else "User-specified"
@@ -180,11 +262,16 @@ class ModelConfig:
180
262
  f"This may lead to incorrect model outputs or CUDA errors. Note that the derived context_length may differ from max_position_embeddings in the model's config."
181
263
  )
182
264
  if (
183
- get_bool_env_var("SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN")
265
+ envs.SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN.get()
184
266
  or is_in_ci() # FIXME: fix this special case
185
267
  ):
186
268
  logger.warning(msg)
187
269
  self.context_len = context_length
270
+ if is_draft_model:
271
+ self.hf_text_config.max_position_embeddings = context_length
272
+ logger.warning(
273
+ f"Overriding the draft model's max_position_embeddings to {context_length}."
274
+ )
188
275
  else:
189
276
  raise ValueError(
190
277
  f"{msg} To allow overriding this maximum, set the env var SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN=1"
@@ -194,6 +281,10 @@ class ModelConfig:
194
281
  else:
195
282
  self.context_len = derived_context_len
196
283
 
284
+ # Transfer context_len to HuggingFace config so models can access it
285
+ self.hf_config.context_len = self.context_len
286
+
287
+ def _derive_model_shapes(self):
197
288
  # Unify the config keys for hf_text_config
198
289
  self.head_dim = getattr(
199
290
  self.hf_text_config,
@@ -204,10 +295,12 @@ class ModelConfig:
204
295
  # FIXME: temporary special judge for MLA architecture
205
296
  if (
206
297
  "DeepseekV2ForCausalLM" in self.hf_config.architectures
298
+ or "DeepseekV32ForCausalLM" in self.hf_config.architectures
207
299
  or "DeepseekV3ForCausalLM" in self.hf_config.architectures
208
300
  or "DeepseekV3ForCausalLMNextN" in self.hf_config.architectures
209
301
  or "LongcatFlashForCausalLM" in self.hf_config.architectures
210
302
  or "LongcatFlashForCausalLMNextN" in self.hf_config.architectures
303
+ or "DotsVLMForCausalLM" in self.hf_config.architectures
211
304
  ):
212
305
  self.head_dim = 256
213
306
  self.attention_arch = AttentionArch.MLA
@@ -215,6 +308,11 @@ class ModelConfig:
215
308
  self.qk_nope_head_dim = self.hf_config.qk_nope_head_dim
216
309
  self.qk_rope_head_dim = self.hf_config.qk_rope_head_dim
217
310
  self.v_head_dim = self.hf_config.v_head_dim
311
+ self.index_head_dim = (
312
+ get_nsa_index_head_dim(self.hf_config)
313
+ if is_deepseek_nsa(self.hf_config)
314
+ else None
315
+ )
218
316
 
219
317
  # Handle rope scaling with yarn
220
318
  self.scaling = 1 / math.sqrt(self.qk_nope_head_dim + self.qk_rope_head_dim)
@@ -287,37 +385,6 @@ class ModelConfig:
287
385
  )
288
386
  self.vocab_size = self.hf_text_config.vocab_size
289
387
 
290
- # Verify quantization
291
- self._verify_quantization()
292
-
293
- # Verify dual-chunk attention config
294
- self._verify_dual_chunk_attention_config()
295
-
296
- # Cache attributes
297
- self.hf_eos_token_id = self.get_hf_eos_token_id()
298
-
299
- # multimodal
300
- self.image_token_id = getattr(
301
- self.hf_config, "image_token_id", None
302
- ) or getattr(self.hf_config, "image_token_index", None)
303
-
304
- @staticmethod
305
- def from_server_args(server_args: ServerArgs, model_path: str = None, **kwargs):
306
- return ModelConfig(
307
- model_path=model_path or server_args.model_path,
308
- trust_remote_code=server_args.trust_remote_code,
309
- revision=server_args.revision,
310
- context_length=server_args.context_length,
311
- model_override_args=server_args.json_model_override_args,
312
- is_embedding=server_args.is_embedding,
313
- enable_multimodal=server_args.enable_multimodal,
314
- dtype=server_args.dtype,
315
- quantization=server_args.quantization,
316
- hybrid_kvcache_ratio=server_args.hybrid_kvcache_ratio,
317
- model_impl=server_args.model_impl,
318
- **kwargs,
319
- )
320
-
321
388
  def get_total_num_attention_heads(self) -> int:
322
389
  return self.num_attention_heads
323
390
 
@@ -412,11 +479,38 @@ class ModelConfig:
412
479
  is_local = os.path.exists(self.model_path)
413
480
  modelopt_quant_config = {"quant_method": "modelopt"}
414
481
  if not is_local:
415
- from huggingface_hub import HfApi
482
+ import huggingface_hub
483
+
484
+ try:
485
+ from huggingface_hub import HfApi
486
+
487
+ hf_api = HfApi()
488
+
489
+ def check_hf_quant_config():
490
+ return hf_api.file_exists(
491
+ self.model_path, "hf_quant_config.json"
492
+ )
493
+
494
+ # Retry HF API call up to 3 times
495
+ file_exists = retry(
496
+ check_hf_quant_config,
497
+ max_retry=2,
498
+ initial_delay=1.0,
499
+ max_delay=5.0,
500
+ )
501
+
502
+ if file_exists:
503
+ quant_cfg = modelopt_quant_config
504
+
505
+ except huggingface_hub.errors.OfflineModeIsEnabled:
506
+ logger.warning(
507
+ "Offline mode is enabled, skipping hf_quant_config.json check"
508
+ )
509
+ except Exception as e:
510
+ logger.warning(
511
+ f"Failed to check hf_quant_config.json: {self.model_path} {e}"
512
+ )
416
513
 
417
- hf_api = HfApi()
418
- if hf_api.file_exists(self.model_path, "hf_quant_config.json"):
419
- quant_cfg = modelopt_quant_config
420
514
  elif os.path.exists(os.path.join(self.model_path, "hf_quant_config.json")):
421
515
  quant_config_file = os.path.join(
422
516
  self.model_path, "hf_quant_config.json"
@@ -543,7 +637,7 @@ class ModelConfig:
543
637
  "sparse_attention_enabled"
544
638
  ] = True
545
639
 
546
- def get_hf_eos_token_id(self) -> Optional[Set[int]]:
640
+ def _get_hf_eos_token_id(self) -> Optional[Set[int]]:
547
641
  eos_ids = getattr(self.hf_config, "eos_token_id", None)
548
642
  if eos_ids is not None:
549
643
  # it can be either int or list of int
@@ -563,7 +657,7 @@ class ModelConfig:
563
657
  eos_ids = eos_ids | generation_eos_ids
564
658
  return eos_ids
565
659
 
566
- def maybe_pull_model_tokenizer_from_remote(self) -> None:
660
+ def _maybe_pull_model_tokenizer_from_remote(self) -> None:
567
661
  """
568
662
  Pull the model config files to a temporary
569
663
  directory in case of remote.
@@ -706,12 +800,17 @@ multimodal_model_archs = [
706
800
  "Qwen2AudioForConditionalGeneration",
707
801
  "Qwen2VLForConditionalGeneration",
708
802
  "Qwen2_5_VLForConditionalGeneration",
803
+ "Qwen3VLForConditionalGeneration",
804
+ "Qwen3VLMoeForConditionalGeneration",
709
805
  "KimiVLForConditionalGeneration",
710
806
  "InternVLChatModel",
711
807
  "InternS1ForConditionalGeneration",
712
808
  "Phi4MMForCausalLM",
713
809
  "VILAForConditionalGeneration",
714
810
  "Step3VLForConditionalGeneration",
811
+ "DotsVLMForCausalLM",
812
+ "DotsOCRForCausalLM",
813
+ "Sarashina2VisionForCausalLM",
715
814
  ]
716
815
 
717
816