sglang 0.5.2rc2__py3-none-any.whl → 0.5.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (377) hide show
  1. sglang/bench_one_batch.py +7 -9
  2. sglang/bench_one_batch_server.py +330 -31
  3. sglang/bench_serving.py +267 -32
  4. sglang/global_config.py +2 -2
  5. sglang/lang/backend/runtime_endpoint.py +1 -1
  6. sglang/launch_server.py +14 -0
  7. sglang/profiler.py +2 -2
  8. sglang/srt/batch_invariant_ops/__init__.py +27 -0
  9. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +549 -0
  10. sglang/srt/configs/__init__.py +8 -0
  11. sglang/srt/configs/device_config.py +3 -1
  12. sglang/srt/configs/dots_ocr.py +64 -0
  13. sglang/srt/configs/dots_vlm.py +139 -0
  14. sglang/srt/configs/falcon_h1.py +360 -0
  15. sglang/srt/configs/load_config.py +9 -0
  16. sglang/srt/configs/model_config.py +181 -82
  17. sglang/srt/configs/qwen3_next.py +326 -0
  18. sglang/srt/configs/qwen3_vl.py +586 -0
  19. sglang/srt/connector/__init__.py +8 -1
  20. sglang/srt/connector/remote_instance.py +82 -0
  21. sglang/srt/constrained/base_grammar_backend.py +49 -12
  22. sglang/srt/constrained/llguidance_backend.py +0 -1
  23. sglang/srt/constrained/outlines_backend.py +0 -1
  24. sglang/srt/constrained/outlines_jump_forward.py +1 -1
  25. sglang/srt/constrained/xgrammar_backend.py +30 -9
  26. sglang/srt/custom_op.py +11 -1
  27. sglang/srt/debug_utils/dump_comparator.py +81 -44
  28. sglang/srt/debug_utils/dump_loader.py +97 -0
  29. sglang/srt/debug_utils/dumper.py +21 -6
  30. sglang/srt/debug_utils/text_comparator.py +73 -11
  31. sglang/srt/disaggregation/ascend/conn.py +2 -2
  32. sglang/srt/disaggregation/ascend/transfer_engine.py +47 -9
  33. sglang/srt/disaggregation/base/conn.py +1 -1
  34. sglang/srt/disaggregation/common/conn.py +279 -108
  35. sglang/srt/disaggregation/decode.py +71 -19
  36. sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
  37. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +29 -17
  38. sglang/srt/disaggregation/fake/conn.py +1 -1
  39. sglang/srt/disaggregation/mini_lb.py +6 -445
  40. sglang/srt/disaggregation/mooncake/conn.py +55 -537
  41. sglang/srt/disaggregation/nixl/conn.py +326 -53
  42. sglang/srt/disaggregation/prefill.py +36 -17
  43. sglang/srt/disaggregation/utils.py +40 -54
  44. sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
  45. sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
  46. sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
  47. sglang/srt/distributed/parallel_state.py +156 -80
  48. sglang/srt/entrypoints/engine.py +59 -18
  49. sglang/srt/entrypoints/grpc_request_manager.py +855 -0
  50. sglang/srt/entrypoints/grpc_server.py +810 -0
  51. sglang/srt/entrypoints/http_server.py +130 -59
  52. sglang/srt/entrypoints/openai/protocol.py +112 -4
  53. sglang/srt/entrypoints/openai/serving_base.py +65 -3
  54. sglang/srt/entrypoints/openai/serving_chat.py +204 -55
  55. sglang/srt/entrypoints/openai/serving_completions.py +14 -3
  56. sglang/srt/entrypoints/openai/serving_embedding.py +9 -3
  57. sglang/srt/entrypoints/openai/serving_rerank.py +3 -1
  58. sglang/srt/entrypoints/openai/serving_responses.py +48 -3
  59. sglang/srt/entrypoints/openai/serving_score.py +1 -0
  60. sglang/srt/environ.py +285 -0
  61. sglang/srt/eplb/eplb_manager.py +2 -2
  62. sglang/srt/eplb/expert_distribution.py +26 -13
  63. sglang/srt/eplb/expert_location.py +38 -8
  64. sglang/srt/eplb/expert_location_updater.py +1 -1
  65. sglang/srt/function_call/base_format_detector.py +3 -6
  66. sglang/srt/function_call/ebnf_composer.py +11 -9
  67. sglang/srt/function_call/function_call_parser.py +9 -2
  68. sglang/srt/function_call/glm4_moe_detector.py +4 -4
  69. sglang/srt/function_call/gpt_oss_detector.py +23 -0
  70. sglang/srt/function_call/json_array_parser.py +63 -0
  71. sglang/srt/function_call/kimik2_detector.py +17 -4
  72. sglang/srt/function_call/qwen3_coder_detector.py +1 -1
  73. sglang/srt/function_call/utils.py +96 -5
  74. sglang/srt/grpc/__init__.py +1 -0
  75. sglang/srt/grpc/compile_proto.py +245 -0
  76. sglang/srt/grpc/sglang_scheduler_pb2.py +111 -0
  77. sglang/srt/grpc/sglang_scheduler_pb2.pyi +434 -0
  78. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +239 -0
  79. sglang/srt/layers/activation.py +143 -9
  80. sglang/srt/layers/attention/aiter_backend.py +14 -15
  81. sglang/srt/layers/attention/ascend_backend.py +115 -9
  82. sglang/srt/layers/attention/attention_registry.py +206 -0
  83. sglang/srt/layers/attention/base_attn_backend.py +12 -3
  84. sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
  85. sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
  86. sglang/srt/layers/attention/fla/chunk.py +242 -0
  87. sglang/srt/layers/attention/fla/chunk_delta_h.py +314 -0
  88. sglang/srt/layers/attention/fla/chunk_o.py +178 -0
  89. sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +151 -0
  90. sglang/srt/layers/attention/fla/cumsum.py +300 -0
  91. sglang/srt/layers/attention/fla/fused_recurrent.py +640 -0
  92. sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +232 -0
  93. sglang/srt/layers/attention/fla/index.py +37 -0
  94. sglang/srt/layers/attention/fla/l2norm.py +150 -0
  95. sglang/srt/layers/attention/fla/layernorm_gated.py +326 -0
  96. sglang/srt/layers/attention/fla/op.py +66 -0
  97. sglang/srt/layers/attention/fla/solve_tril.py +465 -0
  98. sglang/srt/layers/attention/fla/utils.py +331 -0
  99. sglang/srt/layers/attention/fla/wy_fast.py +158 -0
  100. sglang/srt/layers/attention/flashattention_backend.py +41 -8
  101. sglang/srt/layers/attention/flashinfer_backend.py +118 -198
  102. sglang/srt/layers/attention/flashinfer_mla_backend.py +27 -27
  103. sglang/srt/layers/attention/flashmla_backend.py +7 -5
  104. sglang/srt/layers/attention/hybrid_attn_backend.py +68 -53
  105. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +602 -0
  106. sglang/srt/layers/attention/intel_amx_backend.py +3 -0
  107. sglang/srt/layers/attention/mamba/causal_conv1d.py +129 -0
  108. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +969 -0
  109. sglang/srt/layers/attention/mamba/mamba.py +629 -0
  110. sglang/srt/layers/attention/mamba/mamba_utils.py +81 -0
  111. sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
  112. sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
  113. sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
  114. sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +264 -0
  115. sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +622 -0
  116. sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +757 -0
  117. sglang/srt/layers/attention/mamba/ops/ssd_combined.py +262 -0
  118. sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +275 -0
  119. sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
  120. sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
  121. sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
  122. sglang/srt/layers/attention/nsa/nsa_indexer.py +761 -0
  123. sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
  124. sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
  125. sglang/srt/layers/attention/nsa/transform_index.py +144 -0
  126. sglang/srt/layers/attention/nsa/utils.py +24 -0
  127. sglang/srt/layers/attention/nsa_backend.py +887 -0
  128. sglang/srt/layers/attention/tbo_backend.py +6 -6
  129. sglang/srt/layers/attention/torch_flex_backend.py +325 -0
  130. sglang/srt/layers/attention/torch_native_backend.py +12 -6
  131. sglang/srt/layers/attention/triton_backend.py +57 -7
  132. sglang/srt/layers/attention/trtllm_mha_backend.py +5 -7
  133. sglang/srt/layers/attention/trtllm_mla_backend.py +276 -39
  134. sglang/srt/layers/attention/vision.py +58 -0
  135. sglang/srt/layers/attention/wave_backend.py +4 -4
  136. sglang/srt/layers/attention/wave_ops/decode_attention.py +2 -4
  137. sglang/srt/layers/attention/wave_ops/extend_attention.py +1 -3
  138. sglang/srt/layers/communicator.py +8 -0
  139. sglang/srt/layers/dp_attention.py +41 -2
  140. sglang/srt/layers/elementwise.py +3 -1
  141. sglang/srt/layers/layernorm.py +34 -15
  142. sglang/srt/layers/linear.py +55 -7
  143. sglang/srt/layers/logits_processor.py +44 -12
  144. sglang/srt/layers/moe/__init__.py +2 -1
  145. sglang/srt/layers/moe/cutlass_w4a8_moe.py +3 -3
  146. sglang/srt/layers/moe/ep_moe/kernels.py +2 -2
  147. sglang/srt/layers/moe/ep_moe/layer.py +256 -63
  148. sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +183 -0
  149. sglang/srt/layers/moe/fused_moe_native.py +5 -3
  150. sglang/srt/layers/moe/fused_moe_triton/configs/{triton_3_4_0/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json } +35 -35
  151. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=352,device_name=NVIDIA_RTX_5880_Ada_Generation,dtype=fp8_w8a8.json +146 -0
  152. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  153. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=512,device_name=NVIDIA_H20.json +146 -0
  154. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  155. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
  156. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
  157. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  158. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
  159. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
  160. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
  161. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  162. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
  163. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -2
  164. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +7 -3
  165. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +23 -20
  166. sglang/srt/layers/moe/fused_moe_triton/layer.py +71 -70
  167. sglang/srt/layers/moe/moe_runner/__init__.py +2 -1
  168. sglang/srt/layers/moe/moe_runner/base.py +274 -1
  169. sglang/srt/layers/moe/moe_runner/runner.py +80 -0
  170. sglang/srt/layers/moe/moe_runner/triton.py +448 -0
  171. sglang/srt/layers/moe/token_dispatcher/__init__.py +16 -4
  172. sglang/srt/layers/moe/token_dispatcher/{base_dispatcher.py → base.py} +67 -17
  173. sglang/srt/layers/moe/token_dispatcher/deepep.py +118 -56
  174. sglang/srt/layers/moe/token_dispatcher/standard.py +44 -2
  175. sglang/srt/layers/moe/topk.py +30 -9
  176. sglang/srt/layers/moe/utils.py +22 -6
  177. sglang/srt/layers/parameter.py +23 -6
  178. sglang/srt/layers/quantization/awq.py +19 -7
  179. sglang/srt/layers/quantization/base_config.py +11 -6
  180. sglang/srt/layers/quantization/blockwise_int8.py +38 -27
  181. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +1 -0
  182. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +50 -30
  183. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +2 -0
  184. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +13 -1
  185. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
  186. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +2 -10
  187. sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +27 -0
  188. sglang/srt/layers/quantization/fp8.py +78 -49
  189. sglang/srt/layers/quantization/fp8_utils.py +51 -32
  190. sglang/srt/layers/quantization/gptq.py +25 -17
  191. sglang/srt/layers/quantization/modelopt_quant.py +190 -55
  192. sglang/srt/layers/quantization/moe_wna16.py +21 -18
  193. sglang/srt/layers/quantization/mxfp4.py +74 -42
  194. sglang/srt/layers/quantization/quark/quark_moe.py +48 -30
  195. sglang/srt/layers/quantization/unquant.py +135 -47
  196. sglang/srt/layers/quantization/w4afp8.py +26 -17
  197. sglang/srt/layers/quantization/w8a8_fp8.py +35 -20
  198. sglang/srt/layers/quantization/w8a8_int8.py +91 -41
  199. sglang/srt/layers/rotary_embedding.py +78 -31
  200. sglang/srt/layers/sampler.py +213 -21
  201. sglang/srt/layers/utils.py +23 -0
  202. sglang/srt/lora/backend/base_backend.py +50 -8
  203. sglang/srt/lora/backend/chunked_backend.py +348 -0
  204. sglang/srt/lora/backend/triton_backend.py +99 -5
  205. sglang/srt/lora/layers.py +32 -0
  206. sglang/srt/lora/lora.py +8 -3
  207. sglang/srt/lora/lora_manager.py +52 -118
  208. sglang/srt/lora/mem_pool.py +25 -11
  209. sglang/srt/lora/triton_ops/__init__.py +4 -0
  210. sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
  211. sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +174 -0
  212. sglang/srt/lora/utils.py +22 -11
  213. sglang/srt/managers/async_dynamic_batch_tokenizer.py +170 -0
  214. sglang/srt/managers/cache_controller.py +199 -301
  215. sglang/srt/managers/data_parallel_controller.py +115 -80
  216. sglang/srt/managers/detokenizer_manager.py +19 -15
  217. sglang/srt/managers/disagg_service.py +46 -0
  218. sglang/srt/managers/io_struct.py +340 -109
  219. sglang/srt/managers/mm_utils.py +44 -6
  220. sglang/srt/managers/multi_tokenizer_mixin.py +357 -407
  221. sglang/srt/managers/multimodal_processor.py +1 -2
  222. sglang/srt/managers/overlap_utils.py +53 -0
  223. sglang/srt/managers/schedule_batch.py +240 -138
  224. sglang/srt/managers/schedule_policy.py +144 -17
  225. sglang/srt/managers/scheduler.py +502 -209
  226. sglang/srt/managers/scheduler_input_blocker.py +1 -1
  227. sglang/srt/managers/scheduler_metrics_mixin.py +99 -126
  228. sglang/srt/managers/scheduler_output_processor_mixin.py +75 -22
  229. sglang/srt/managers/scheduler_profiler_mixin.py +6 -6
  230. sglang/srt/managers/scheduler_update_weights_mixin.py +7 -0
  231. sglang/srt/managers/tokenizer_communicator_mixin.py +675 -0
  232. sglang/srt/managers/tokenizer_manager.py +320 -632
  233. sglang/srt/managers/tp_worker.py +81 -22
  234. sglang/srt/managers/tp_worker_overlap_thread.py +71 -56
  235. sglang/srt/managers/utils.py +1 -45
  236. sglang/srt/mem_cache/allocator.py +14 -20
  237. sglang/srt/mem_cache/allocator_ascend.py +41 -27
  238. sglang/srt/mem_cache/base_prefix_cache.py +1 -1
  239. sglang/srt/mem_cache/chunk_cache.py +8 -1
  240. sglang/srt/mem_cache/evict_policy.py +23 -0
  241. sglang/srt/mem_cache/hicache_storage.py +43 -24
  242. sglang/srt/mem_cache/hiradix_cache.py +222 -75
  243. sglang/srt/mem_cache/memory_pool.py +535 -58
  244. sglang/srt/mem_cache/memory_pool_host.py +239 -228
  245. sglang/srt/mem_cache/radix_cache.py +222 -73
  246. sglang/srt/mem_cache/radix_cache_cpp.py +11 -8
  247. sglang/srt/mem_cache/storage/__init__.py +10 -0
  248. sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +151 -0
  249. sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +109 -0
  250. sglang/srt/mem_cache/storage/backend_factory.py +223 -0
  251. sglang/srt/mem_cache/storage/eic/eic_storage.py +778 -0
  252. sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
  253. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +164 -0
  254. sglang/srt/mem_cache/storage/hf3fs/{client_hf3fs.py → hf3fs_usrbio_client.py} +5 -1
  255. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +259 -62
  256. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +284 -0
  257. sglang/srt/mem_cache/storage/lmcache/unit_test.py +121 -0
  258. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +166 -17
  259. sglang/srt/mem_cache/swa_radix_cache.py +25 -36
  260. sglang/srt/metrics/collector.py +511 -132
  261. sglang/srt/metrics/func_timer.py +2 -7
  262. sglang/srt/metrics/startup_func_log_and_timer.py +150 -0
  263. sglang/srt/metrics/utils.py +8 -1
  264. sglang/srt/model_executor/cpu_graph_runner.py +640 -0
  265. sglang/srt/model_executor/cuda_graph_runner.py +52 -37
  266. sglang/srt/model_executor/forward_batch_info.py +82 -40
  267. sglang/srt/model_executor/model_runner.py +432 -157
  268. sglang/srt/model_executor/npu_graph_runner.py +12 -5
  269. sglang/srt/model_loader/__init__.py +9 -3
  270. sglang/srt/model_loader/loader.py +133 -5
  271. sglang/srt/model_loader/remote_instance_weight_loader_utils.py +69 -0
  272. sglang/srt/model_loader/weight_utils.py +158 -3
  273. sglang/srt/models/apertus.py +686 -0
  274. sglang/srt/models/bailing_moe.py +820 -217
  275. sglang/srt/models/bailing_moe_nextn.py +168 -0
  276. sglang/srt/models/deepseek_nextn.py +6 -1
  277. sglang/srt/models/deepseek_v2.py +607 -130
  278. sglang/srt/models/dots_ocr.py +173 -0
  279. sglang/srt/models/dots_vlm.py +174 -0
  280. sglang/srt/models/dots_vlm_vit.py +337 -0
  281. sglang/srt/models/ernie4.py +1 -1
  282. sglang/srt/models/falcon_h1.py +576 -0
  283. sglang/srt/models/gemma3_causal.py +0 -2
  284. sglang/srt/models/gemma3_mm.py +1 -1
  285. sglang/srt/models/gemma3n_mm.py +2 -2
  286. sglang/srt/models/glm4_moe.py +4 -4
  287. sglang/srt/models/glm4_moe_nextn.py +2 -2
  288. sglang/srt/models/glm4v.py +5 -3
  289. sglang/srt/models/glm4v_moe.py +4 -1
  290. sglang/srt/models/gpt_oss.py +8 -31
  291. sglang/srt/models/kimi_vl_moonvit.py +2 -2
  292. sglang/srt/models/llama.py +4 -0
  293. sglang/srt/models/llama4.py +9 -0
  294. sglang/srt/models/llama_eagle3.py +13 -0
  295. sglang/srt/models/longcat_flash.py +3 -3
  296. sglang/srt/models/longcat_flash_nextn.py +1 -1
  297. sglang/srt/models/mllama4.py +40 -4
  298. sglang/srt/models/opt.py +637 -0
  299. sglang/srt/models/qwen2_5_vl.py +29 -5
  300. sglang/srt/models/qwen2_audio.py +1 -1
  301. sglang/srt/models/qwen2_moe.py +120 -13
  302. sglang/srt/models/qwen2_vl.py +1 -1
  303. sglang/srt/models/qwen3.py +18 -3
  304. sglang/srt/models/qwen3_moe.py +32 -4
  305. sglang/srt/models/qwen3_next.py +1069 -0
  306. sglang/srt/models/qwen3_next_mtp.py +112 -0
  307. sglang/srt/models/qwen3_vl.py +787 -0
  308. sglang/srt/models/qwen3_vl_moe.py +471 -0
  309. sglang/srt/models/registry.py +15 -3
  310. sglang/srt/models/sarashina2_vision.py +269 -0
  311. sglang/srt/models/solar.py +505 -0
  312. sglang/srt/models/starcoder2.py +357 -0
  313. sglang/srt/models/step3_vl.py +1 -1
  314. sglang/srt/models/torch_native_llama.py +9 -2
  315. sglang/srt/models/utils.py +51 -0
  316. sglang/srt/multimodal/processors/base_processor.py +15 -7
  317. sglang/srt/multimodal/processors/dots_vlm.py +98 -0
  318. sglang/srt/multimodal/processors/glm4v.py +9 -9
  319. sglang/srt/multimodal/processors/internvl.py +153 -129
  320. sglang/srt/multimodal/processors/qwen_vl.py +23 -6
  321. sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
  322. sglang/srt/offloader.py +27 -3
  323. sglang/srt/parser/jinja_template_utils.py +6 -0
  324. sglang/srt/sampling/sampling_batch_info.py +38 -17
  325. sglang/srt/sampling/sampling_params.py +7 -0
  326. sglang/srt/server_args.py +966 -267
  327. sglang/srt/server_args_config_parser.py +146 -0
  328. sglang/srt/single_batch_overlap.py +151 -0
  329. sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
  330. sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
  331. sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
  332. sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
  333. sglang/srt/speculative/cpp_ngram/param.h +125 -0
  334. sglang/srt/speculative/cpp_ngram/queue.h +71 -0
  335. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +7 -1
  336. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +13 -2
  337. sglang/srt/speculative/{eagle_utils.py → eagle_info.py} +207 -757
  338. sglang/srt/speculative/eagle_worker.py +99 -28
  339. sglang/srt/speculative/ngram_utils.py +428 -0
  340. sglang/srt/speculative/ngram_worker.py +245 -0
  341. sglang/srt/speculative/spec_info.py +52 -0
  342. sglang/srt/speculative/spec_utils.py +606 -0
  343. sglang/srt/speculative/standalone_worker.py +109 -0
  344. sglang/srt/torch_memory_saver_adapter.py +5 -7
  345. sglang/srt/tracing/trace.py +578 -0
  346. sglang/srt/two_batch_overlap.py +8 -5
  347. sglang/srt/utils/__init__.py +2 -0
  348. sglang/srt/{utils.py → utils/common.py} +433 -77
  349. sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +53 -5
  350. sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
  351. sglang/srt/utils/rpd_utils.py +452 -0
  352. sglang/srt/utils/slow_rank_detector.py +71 -0
  353. sglang/srt/warmup.py +8 -4
  354. sglang/srt/weight_sync/utils.py +2 -2
  355. sglang/test/attention/test_trtllm_mla_backend.py +169 -5
  356. sglang/test/get_logits_ut.py +57 -0
  357. sglang/test/run_eval.py +79 -11
  358. sglang/test/runners.py +5 -1
  359. sglang/test/simple_eval_common.py +5 -2
  360. sglang/test/simple_eval_mmmu_vlm.py +441 -0
  361. sglang/test/test_block_fp8.py +2 -2
  362. sglang/test/test_cutlass_moe.py +24 -6
  363. sglang/test/test_deterministic.py +297 -0
  364. sglang/test/test_disaggregation_utils.py +77 -0
  365. sglang/test/test_fp4_moe.py +370 -1
  366. sglang/test/test_programs.py +1 -1
  367. sglang/test/test_utils.py +383 -5
  368. sglang/utils.py +21 -1
  369. sglang/version.py +1 -1
  370. {sglang-0.5.2rc2.dist-info → sglang-0.5.3.dist-info}/METADATA +69 -124
  371. {sglang-0.5.2rc2.dist-info → sglang-0.5.3.dist-info}/RECORD +375 -245
  372. sglang/srt/disaggregation/launch_lb.py +0 -118
  373. sglang/srt/mem_cache/lora_radix_cache.py +0 -421
  374. /sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
  375. {sglang-0.5.2rc2.dist-info → sglang-0.5.3.dist-info}/WHEEL +0 -0
  376. {sglang-0.5.2rc2.dist-info → sglang-0.5.3.dist-info}/licenses/LICENSE +0 -0
  377. {sglang-0.5.2rc2.dist-info → sglang-0.5.3.dist-info}/top_level.txt +0 -0
@@ -23,6 +23,7 @@ from sglang.srt.managers.tp_worker import TpModelWorker
23
23
  from sglang.srt.model_executor.forward_batch_info import (
24
24
  CaptureHiddenMode,
25
25
  ForwardBatch,
26
+ ForwardBatchOutput,
26
27
  ForwardMode,
27
28
  )
28
29
  from sglang.srt.server_args import ServerArgs
@@ -33,20 +34,23 @@ from sglang.srt.speculative.eagle_draft_cuda_graph_runner import (
33
34
  from sglang.srt.speculative.eagle_draft_extend_cuda_graph_runner import (
34
35
  EAGLEDraftExtendCudaGraphRunner,
35
36
  )
36
- from sglang.srt.speculative.eagle_utils import (
37
+ from sglang.srt.speculative.eagle_info import (
37
38
  EagleDraftInput,
38
39
  EagleVerifyInput,
39
40
  EagleVerifyOutput,
41
+ )
42
+ from sglang.srt.speculative.spec_info import SpeculativeAlgorithm
43
+ from sglang.srt.speculative.spec_utils import (
40
44
  assign_draft_cache_locs,
41
45
  fast_topk,
42
46
  generate_token_bitmask,
43
47
  select_top_k_tokens,
44
48
  )
45
- from sglang.srt.speculative.spec_info import SpeculativeAlgorithm
46
49
  from sglang.srt.utils import (
47
50
  empty_context,
48
51
  get_available_gpu_memory,
49
52
  get_bool_env_var,
53
+ is_blackwell,
50
54
  is_cuda,
51
55
  next_power_of_2,
52
56
  )
@@ -190,7 +194,7 @@ class EAGLEWorker(TpModelWorker):
190
194
  # Initialize decode attention backend
191
195
  self.draft_attn_backend = self._create_decode_backend()
192
196
 
193
- # Initialize prefill attention backend
197
+ # Initialize draft extend attention backend (respects speculative_attention_mode setting)
194
198
  self.draft_extend_attn_backend = self._create_draft_extend_backend()
195
199
 
196
200
  self.draft_model_runner.draft_attn_backend = self.draft_attn_backend
@@ -213,6 +217,11 @@ class EAGLEWorker(TpModelWorker):
213
217
  "triton": self._create_triton_decode_backend,
214
218
  "aiter": self._create_aiter_decode_backend,
215
219
  "fa3": self._create_fa3_decode_backend,
220
+ "hybrid_linear_attn": (
221
+ self._create_fa3_decode_backend
222
+ if not is_blackwell()
223
+ else self._create_triton_decode_backend
224
+ ),
216
225
  "flashmla": self._create_flashmla_decode_backend,
217
226
  "trtllm_mha": self._create_trtllm_mha_decode_backend,
218
227
  "trtllm_mla": self._create_trtllm_mla_decode_backend,
@@ -230,14 +239,24 @@ class EAGLEWorker(TpModelWorker):
230
239
  "triton": self._create_triton_prefill_backend,
231
240
  "aiter": self._create_aiter_prefill_backend,
232
241
  "fa3": self._create_fa3_prefill_backend,
242
+ "hybrid_linear_attn": (
243
+ self._create_fa3_prefill_backend
244
+ if not is_blackwell()
245
+ else self._create_triton_prefill_backend
246
+ ),
247
+ "flashmla": self._create_flashmla_prefill_backend,
233
248
  "trtllm_mha": self._create_trtllm_mha_prefill_backend,
234
249
  "trtllm_mla": self._create_trtllm_mla_prefill_backend,
235
250
  }
236
-
251
+ backend_name = (
252
+ "decode_attention_backend"
253
+ if self.server_args.speculative_attention_mode == "decode"
254
+ else "prefill_attention_backend"
255
+ )
237
256
  return self._create_backend(
238
- "prefill_attention_backend",
257
+ backend_name,
239
258
  backend_map,
240
- "EAGLE is not supported in prefill attention backend {backend_type}",
259
+ "EAGLE is not supported in attention backend {backend_type}",
241
260
  )
242
261
 
243
262
  def _create_flashinfer_decode_backend(self):
@@ -365,6 +384,12 @@ class EAGLEWorker(TpModelWorker):
365
384
 
366
385
  return TRTLLMMLABackend(self.draft_model_runner, skip_prefill=False)
367
386
 
387
+ def _create_flashmla_prefill_backend(self):
388
+ logger.warning(
389
+ "flashmla prefill backend is not yet supported for draft extend."
390
+ )
391
+ return None
392
+
368
393
  def init_cuda_graphs(self):
369
394
  """Capture cuda graphs."""
370
395
  self.cuda_graph_runner = None
@@ -404,9 +429,7 @@ class EAGLEWorker(TpModelWorker):
404
429
  def draft_model_runner(self):
405
430
  return self.model_runner
406
431
 
407
- def forward_batch_speculative_generation(
408
- self, batch: ScheduleBatch
409
- ) -> Tuple[LogitsProcessorOutput, torch.Tensor, int, int, bool]:
432
+ def forward_batch_generation(self, batch: ScheduleBatch) -> ForwardBatchOutput:
410
433
  """Run speculative decoding forward.
411
434
 
412
435
  NOTE: Many states of batch is modified as you go through. It is not guaranteed that
@@ -419,14 +442,19 @@ class EAGLEWorker(TpModelWorker):
419
442
  the batch id (used for overlap schedule), and number of accepted tokens.
420
443
  """
421
444
  if batch.forward_mode.is_extend() or batch.is_extend_in_batch:
422
- logits_output, next_token_ids, bid, seq_lens_cpu = (
423
- self.forward_target_extend(batch)
445
+ logits_output, next_token_ids, seq_lens_cpu = self.forward_target_extend(
446
+ batch
424
447
  )
425
448
  with self.draft_tp_context(self.draft_model_runner.tp_group):
426
449
  self.forward_draft_extend(
427
450
  batch, logits_output.hidden_states, next_token_ids, seq_lens_cpu
428
451
  )
429
- return logits_output, next_token_ids, bid, 0, False
452
+ return ForwardBatchOutput(
453
+ logits_output=logits_output,
454
+ next_token_ids=next_token_ids,
455
+ num_accepted_tokens=0,
456
+ can_run_cuda_graph=False,
457
+ )
430
458
  else:
431
459
  with self.draft_tp_context(self.draft_model_runner.tp_group):
432
460
  spec_info = self.draft(batch)
@@ -444,12 +472,11 @@ class EAGLEWorker(TpModelWorker):
444
472
  # decode is not finished
445
473
  self.forward_draft_extend_after_decode(batch)
446
474
 
447
- return (
448
- logits_output,
449
- verify_output.verified_id,
450
- model_worker_batch.bid,
451
- sum(verify_output.accept_length_per_req_cpu),
452
- can_run_cuda_graph,
475
+ return ForwardBatchOutput(
476
+ logits_output=logits_output,
477
+ next_token_ids=verify_output.verified_id,
478
+ num_accepted_tokens=sum(verify_output.accept_length_per_req_cpu),
479
+ can_run_cuda_graph=can_run_cuda_graph,
453
480
  )
454
481
 
455
482
  def check_forward_draft_extend_after_decode(self, batch: ScheduleBatch):
@@ -481,19 +508,21 @@ class EAGLEWorker(TpModelWorker):
481
508
  Returns:
482
509
  logits_output: The output of logits. It will contain the full hidden states.
483
510
  next_token_ids: Next token ids generated.
484
- bid: The model batch ID. Used for overlap schedule.
485
511
  """
486
512
  # Forward with the target model and get hidden states.
487
513
  # We need the full hidden states to prefill the KV cache of the draft model.
488
514
  model_worker_batch = batch.get_model_worker_batch()
489
515
  model_worker_batch.capture_hidden_mode = CaptureHiddenMode.FULL
490
- logits_output, next_token_ids, _ = self.target_worker.forward_batch_generation(
516
+ forward_batch_output = self.target_worker.forward_batch_generation(
491
517
  model_worker_batch
492
518
  )
519
+ logits_output, next_token_ids = (
520
+ forward_batch_output.logits_output,
521
+ forward_batch_output.next_token_ids,
522
+ )
493
523
  return (
494
524
  logits_output,
495
525
  next_token_ids,
496
- model_worker_batch.bid,
497
526
  model_worker_batch.seq_lens_cpu,
498
527
  )
499
528
 
@@ -525,6 +554,8 @@ class EAGLEWorker(TpModelWorker):
525
554
  batch.seq_lens,
526
555
  self.speculative_num_steps,
527
556
  )
557
+ prefix_lens_cpu = batch.seq_lens_cpu
558
+ seq_lens_cpu = batch.seq_lens_cpu + self.speculative_num_steps
528
559
  extend_num_tokens = num_seqs * self.speculative_num_steps
529
560
  else:
530
561
  # In this case, the last partial page needs to be duplicated.
@@ -560,14 +591,23 @@ class EAGLEWorker(TpModelWorker):
560
591
  self.topk,
561
592
  self.page_size,
562
593
  )
563
-
564
- # TODO(lmzheng): remove this device sync
565
- extend_num_tokens = torch.sum(self.extend_lens).item()
594
+ prefix_lens_cpu = batch.seq_lens_cpu
595
+ last_page_lens = prefix_lens_cpu % self.page_size
596
+ num_new_pages_per_topk = (
597
+ last_page_lens + self.speculative_num_steps + self.page_size - 1
598
+ ) // self.page_size
599
+ seq_lens_cpu = (
600
+ prefix_lens_cpu // self.page_size * self.page_size
601
+ + num_new_pages_per_topk * (self.page_size * self.topk)
602
+ )
603
+ extend_num_tokens = torch.sum((seq_lens_cpu - prefix_lens_cpu)).item()
566
604
 
567
605
  out_cache_loc, token_to_kv_pool_state_backup = (
568
606
  batch.alloc_paged_token_slots_extend(
569
607
  prefix_lens,
608
+ prefix_lens_cpu,
570
609
  seq_lens,
610
+ seq_lens_cpu,
571
611
  last_loc,
572
612
  extend_num_tokens,
573
613
  backup_state=True,
@@ -729,6 +769,14 @@ class EAGLEWorker(TpModelWorker):
729
769
 
730
770
  # Set inputs
731
771
  forward_batch.input_ids = input_ids
772
+ # This is a temporary fix for the case that the user is using standalone
773
+ # speculative decoding and the draft model architecture is gpt-oss. gpt-oss
774
+ # rope kernel needs cache_loc to be contiguous.
775
+ if (
776
+ self.server_args.speculative_algorithm == "STANDALONE"
777
+ and self.model_config.hf_config.architectures[0] == "GptOssForCausalLM"
778
+ ):
779
+ out_cache_loc = out_cache_loc.contiguous()
732
780
  forward_batch.out_cache_loc = out_cache_loc[i]
733
781
  forward_batch.positions.add_(1)
734
782
  forward_batch.attn_backend = self.draft_attn_backend.attn_backends[i]
@@ -747,6 +795,10 @@ class EAGLEWorker(TpModelWorker):
747
795
 
748
796
  return score_list, token_list, parents_list
749
797
 
798
+ def clear_cache_pool(self):
799
+ self.model_runner.req_to_token_pool.clear()
800
+ self.model_runner.token_to_kv_pool_allocator.clear()
801
+
750
802
  def verify(self, batch: ScheduleBatch, spec_info: EagleVerifyInput):
751
803
  spec_info.prepare_for_verify(batch, self.page_size)
752
804
  batch.return_hidden_states = False
@@ -770,10 +822,12 @@ class EAGLEWorker(TpModelWorker):
770
822
  ).cpu()
771
823
 
772
824
  # Forward
773
- logits_output, _, can_run_cuda_graph = (
774
- self.target_worker.forward_batch_generation(
775
- model_worker_batch, skip_sample=True
776
- )
825
+ forward_batch_output = self.target_worker.forward_batch_generation(
826
+ model_worker_batch, is_verify=True
827
+ )
828
+ logits_output, can_run_cuda_graph = (
829
+ forward_batch_output.logits_output,
830
+ forward_batch_output.can_run_cuda_graph,
777
831
  )
778
832
 
779
833
  vocab_mask = None
@@ -813,6 +867,21 @@ class EAGLEWorker(TpModelWorker):
813
867
  ]
814
868
  logits_output.hidden_states = logits_output.hidden_states[res.accepted_indices]
815
869
 
870
+ # QQ: can be optimized
871
+ if self.target_worker.model_runner.is_hybrid_gdn:
872
+ # res.draft_input.accept_length is on GPU but may be empty for last verify?
873
+ accepted_length = (
874
+ torch.tensor(
875
+ res.accept_length_per_req_cpu,
876
+ device=logits_output.hidden_states.device,
877
+ dtype=torch.int32,
878
+ )
879
+ + 1
880
+ )
881
+ self.target_worker.model_runner.attn_backend.update_mamba_state_after_mtp_verify(
882
+ accepted_length, self.target_worker.model_runner.model
883
+ )
884
+
816
885
  if batch.return_logprob:
817
886
  self.add_logprob_values(batch, res, logits_output)
818
887
 
@@ -958,6 +1027,7 @@ class EAGLEWorker(TpModelWorker):
958
1027
  assert isinstance(batch.spec_info, EagleDraftInput)
959
1028
  # Backup fields that will be modified in-place
960
1029
  seq_lens_backup = batch.seq_lens.clone()
1030
+ seq_lens_cpu_backup = batch.seq_lens_cpu.clone()
961
1031
  req_pool_indices_backup = batch.req_pool_indices
962
1032
  accept_length_backup = batch.spec_info.accept_length
963
1033
  return_logprob_backup = batch.return_logprob
@@ -1036,6 +1106,7 @@ class EAGLEWorker(TpModelWorker):
1036
1106
  ForwardMode.DECODE if not input_is_idle else ForwardMode.IDLE
1037
1107
  )
1038
1108
  batch.seq_lens = seq_lens_backup
1109
+ batch.seq_lens_cpu = seq_lens_cpu_backup
1039
1110
  batch.req_pool_indices = req_pool_indices_backup
1040
1111
  batch.spec_info.accept_length = accept_length_backup
1041
1112
  batch.return_logprob = return_logprob_backup