sglang 0.5.2rc2__py3-none-any.whl → 0.5.3.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (396) hide show
  1. sglang/bench_one_batch.py +7 -11
  2. sglang/bench_one_batch_server.py +330 -31
  3. sglang/bench_serving.py +474 -142
  4. sglang/compile_deep_gemm.py +3 -0
  5. sglang/global_config.py +2 -2
  6. sglang/lang/backend/runtime_endpoint.py +1 -1
  7. sglang/profiler.py +2 -2
  8. sglang/srt/batch_invariant_ops/__init__.py +27 -0
  9. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +549 -0
  10. sglang/srt/configs/__init__.py +10 -0
  11. sglang/srt/configs/device_config.py +3 -1
  12. sglang/srt/configs/dots_ocr.py +64 -0
  13. sglang/srt/configs/dots_vlm.py +139 -0
  14. sglang/srt/configs/falcon_h1.py +314 -0
  15. sglang/srt/configs/load_config.py +9 -0
  16. sglang/srt/configs/mamba_utils.py +117 -0
  17. sglang/srt/configs/model_config.py +228 -92
  18. sglang/srt/configs/nemotron_h.py +286 -0
  19. sglang/srt/configs/qwen3_next.py +294 -0
  20. sglang/srt/configs/qwen3_vl.py +586 -0
  21. sglang/srt/connector/__init__.py +8 -1
  22. sglang/srt/connector/remote_instance.py +82 -0
  23. sglang/srt/constrained/base_grammar_backend.py +49 -12
  24. sglang/srt/constrained/llguidance_backend.py +0 -1
  25. sglang/srt/constrained/outlines_backend.py +0 -1
  26. sglang/srt/constrained/outlines_jump_forward.py +1 -1
  27. sglang/srt/constrained/xgrammar_backend.py +30 -9
  28. sglang/srt/custom_op.py +11 -1
  29. sglang/srt/debug_utils/dump_comparator.py +81 -44
  30. sglang/srt/debug_utils/dump_loader.py +97 -0
  31. sglang/srt/debug_utils/dumper.py +21 -6
  32. sglang/srt/debug_utils/text_comparator.py +73 -11
  33. sglang/srt/disaggregation/ascend/conn.py +2 -2
  34. sglang/srt/disaggregation/ascend/transfer_engine.py +47 -9
  35. sglang/srt/disaggregation/base/conn.py +1 -1
  36. sglang/srt/disaggregation/common/conn.py +279 -108
  37. sglang/srt/disaggregation/decode.py +78 -37
  38. sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
  39. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +29 -17
  40. sglang/srt/disaggregation/fake/conn.py +1 -1
  41. sglang/srt/disaggregation/mini_lb.py +6 -445
  42. sglang/srt/disaggregation/mooncake/conn.py +55 -537
  43. sglang/srt/disaggregation/nixl/conn.py +373 -68
  44. sglang/srt/disaggregation/prefill.py +53 -49
  45. sglang/srt/disaggregation/utils.py +40 -54
  46. sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
  47. sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
  48. sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
  49. sglang/srt/distributed/parallel_state.py +156 -80
  50. sglang/srt/entrypoints/engine.py +59 -18
  51. sglang/srt/entrypoints/grpc_request_manager.py +842 -0
  52. sglang/srt/entrypoints/grpc_server.py +950 -0
  53. sglang/srt/entrypoints/http_server.py +179 -60
  54. sglang/srt/entrypoints/openai/protocol.py +265 -29
  55. sglang/srt/entrypoints/openai/serving_base.py +65 -3
  56. sglang/srt/entrypoints/openai/serving_chat.py +213 -122
  57. sglang/srt/entrypoints/openai/serving_completions.py +14 -3
  58. sglang/srt/entrypoints/openai/serving_embedding.py +9 -3
  59. sglang/srt/entrypoints/openai/serving_rerank.py +3 -1
  60. sglang/srt/entrypoints/openai/serving_responses.py +48 -3
  61. sglang/srt/entrypoints/openai/serving_score.py +1 -0
  62. sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
  63. sglang/srt/environ.py +289 -0
  64. sglang/srt/eplb/eplb_manager.py +2 -2
  65. sglang/srt/eplb/expert_distribution.py +26 -13
  66. sglang/srt/eplb/expert_location.py +38 -8
  67. sglang/srt/eplb/expert_location_updater.py +1 -1
  68. sglang/srt/function_call/base_format_detector.py +3 -6
  69. sglang/srt/function_call/ebnf_composer.py +11 -9
  70. sglang/srt/function_call/function_call_parser.py +17 -8
  71. sglang/srt/function_call/glm4_moe_detector.py +4 -4
  72. sglang/srt/function_call/gpt_oss_detector.py +23 -0
  73. sglang/srt/function_call/json_array_parser.py +63 -0
  74. sglang/srt/function_call/kimik2_detector.py +17 -4
  75. sglang/srt/function_call/qwen3_coder_detector.py +1 -1
  76. sglang/srt/function_call/utils.py +96 -5
  77. sglang/srt/grpc/__init__.py +1 -0
  78. sglang/srt/grpc/compile_proto.py +245 -0
  79. sglang/srt/grpc/sglang_scheduler_pb2.py +119 -0
  80. sglang/srt/grpc/sglang_scheduler_pb2.pyi +492 -0
  81. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +327 -0
  82. sglang/srt/layers/activation.py +143 -9
  83. sglang/srt/layers/attention/aiter_backend.py +14 -15
  84. sglang/srt/layers/attention/ascend_backend.py +115 -9
  85. sglang/srt/layers/attention/attention_registry.py +215 -0
  86. sglang/srt/layers/attention/base_attn_backend.py +12 -3
  87. sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
  88. sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
  89. sglang/srt/layers/attention/fla/chunk.py +242 -0
  90. sglang/srt/layers/attention/fla/chunk_delta_h.py +314 -0
  91. sglang/srt/layers/attention/fla/chunk_o.py +178 -0
  92. sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +151 -0
  93. sglang/srt/layers/attention/fla/cumsum.py +300 -0
  94. sglang/srt/layers/attention/fla/fused_recurrent.py +640 -0
  95. sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +232 -0
  96. sglang/srt/layers/attention/fla/index.py +37 -0
  97. sglang/srt/layers/attention/fla/l2norm.py +150 -0
  98. sglang/srt/layers/attention/fla/layernorm_gated.py +343 -0
  99. sglang/srt/layers/attention/fla/op.py +66 -0
  100. sglang/srt/layers/attention/fla/solve_tril.py +465 -0
  101. sglang/srt/layers/attention/fla/utils.py +331 -0
  102. sglang/srt/layers/attention/fla/wy_fast.py +158 -0
  103. sglang/srt/layers/attention/flashattention_backend.py +40 -8
  104. sglang/srt/layers/attention/flashinfer_backend.py +341 -204
  105. sglang/srt/layers/attention/flashinfer_mla_backend.py +28 -28
  106. sglang/srt/layers/attention/flashmla_backend.py +7 -5
  107. sglang/srt/layers/attention/hybrid_attn_backend.py +68 -53
  108. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +708 -0
  109. sglang/srt/layers/attention/intel_amx_backend.py +3 -0
  110. sglang/srt/layers/attention/mamba/causal_conv1d.py +129 -0
  111. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +974 -0
  112. sglang/srt/layers/attention/mamba/mamba.py +577 -0
  113. sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
  114. sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
  115. sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
  116. sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
  117. sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
  118. sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +214 -0
  119. sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +562 -0
  120. sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +646 -0
  121. sglang/srt/layers/attention/mamba/ops/ssd_combined.py +262 -0
  122. sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +264 -0
  123. sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
  124. sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
  125. sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
  126. sglang/srt/layers/attention/nsa/nsa_indexer.py +761 -0
  127. sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
  128. sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
  129. sglang/srt/layers/attention/nsa/transform_index.py +144 -0
  130. sglang/srt/layers/attention/nsa/utils.py +24 -0
  131. sglang/srt/layers/attention/nsa_backend.py +887 -0
  132. sglang/srt/layers/attention/tbo_backend.py +6 -6
  133. sglang/srt/layers/attention/torch_flex_backend.py +325 -0
  134. sglang/srt/layers/attention/torch_native_backend.py +12 -6
  135. sglang/srt/layers/attention/triton_backend.py +57 -7
  136. sglang/srt/layers/attention/trtllm_mha_backend.py +5 -7
  137. sglang/srt/layers/attention/trtllm_mla_backend.py +276 -39
  138. sglang/srt/layers/attention/vision.py +58 -0
  139. sglang/srt/layers/attention/wave_backend.py +4 -4
  140. sglang/srt/layers/attention/wave_ops/decode_attention.py +2 -4
  141. sglang/srt/layers/attention/wave_ops/extend_attention.py +1 -3
  142. sglang/srt/layers/communicator.py +8 -0
  143. sglang/srt/layers/dp_attention.py +41 -2
  144. sglang/srt/layers/elementwise.py +3 -1
  145. sglang/srt/layers/layernorm.py +34 -15
  146. sglang/srt/layers/linear.py +55 -7
  147. sglang/srt/layers/logits_processor.py +180 -18
  148. sglang/srt/layers/modelopt_utils.py +11 -0
  149. sglang/srt/layers/moe/__init__.py +2 -1
  150. sglang/srt/layers/moe/cutlass_w4a8_moe.py +21 -24
  151. sglang/srt/layers/moe/ep_moe/kernels.py +33 -454
  152. sglang/srt/layers/moe/ep_moe/layer.py +248 -333
  153. sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +183 -0
  154. sglang/srt/layers/moe/fused_moe_native.py +5 -3
  155. sglang/srt/layers/moe/fused_moe_triton/configs/{triton_3_4_0/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json } +35 -35
  156. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=352,device_name=NVIDIA_RTX_5880_Ada_Generation,dtype=fp8_w8a8.json +146 -0
  157. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  158. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=512,device_name=NVIDIA_H20.json +146 -0
  159. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  160. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
  161. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
  162. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  163. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
  164. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
  165. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
  166. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  167. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
  168. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -2
  169. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +7 -3
  170. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +23 -20
  171. sglang/srt/layers/moe/fused_moe_triton/layer.py +68 -72
  172. sglang/srt/layers/moe/moe_runner/__init__.py +2 -1
  173. sglang/srt/layers/moe/moe_runner/base.py +274 -1
  174. sglang/srt/layers/moe/moe_runner/deep_gemm.py +304 -0
  175. sglang/srt/layers/moe/moe_runner/runner.py +83 -0
  176. sglang/srt/layers/moe/moe_runner/triton.py +448 -0
  177. sglang/srt/layers/moe/token_dispatcher/__init__.py +16 -4
  178. sglang/srt/layers/moe/token_dispatcher/{base_dispatcher.py → base.py} +67 -17
  179. sglang/srt/layers/moe/token_dispatcher/deepep.py +118 -56
  180. sglang/srt/layers/moe/token_dispatcher/standard.py +44 -2
  181. sglang/srt/layers/moe/topk.py +30 -9
  182. sglang/srt/layers/moe/utils.py +29 -7
  183. sglang/srt/layers/parameter.py +23 -6
  184. sglang/srt/layers/quantization/__init__.py +1 -1
  185. sglang/srt/layers/quantization/awq.py +19 -7
  186. sglang/srt/layers/quantization/base_config.py +11 -6
  187. sglang/srt/layers/quantization/blockwise_int8.py +38 -27
  188. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +1 -0
  189. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +50 -30
  190. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +2 -0
  191. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +13 -1
  192. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
  193. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +2 -10
  194. sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +27 -0
  195. sglang/srt/layers/quantization/fp8.py +155 -60
  196. sglang/srt/layers/quantization/fp8_utils.py +51 -32
  197. sglang/srt/layers/quantization/gptq.py +25 -17
  198. sglang/srt/layers/quantization/modelopt_quant.py +191 -56
  199. sglang/srt/layers/quantization/moe_wna16.py +21 -18
  200. sglang/srt/layers/quantization/mxfp4.py +74 -42
  201. sglang/srt/layers/quantization/quark/quark.py +3 -1
  202. sglang/srt/layers/quantization/quark/quark_moe.py +48 -30
  203. sglang/srt/layers/quantization/unquant.py +135 -47
  204. sglang/srt/layers/quantization/w4afp8.py +28 -33
  205. sglang/srt/layers/quantization/w8a8_fp8.py +35 -20
  206. sglang/srt/layers/quantization/w8a8_int8.py +91 -41
  207. sglang/srt/layers/rotary_embedding.py +78 -31
  208. sglang/srt/layers/sampler.py +213 -21
  209. sglang/srt/layers/utils.py +23 -0
  210. sglang/srt/lora/backend/base_backend.py +50 -8
  211. sglang/srt/lora/backend/chunked_backend.py +348 -0
  212. sglang/srt/lora/backend/triton_backend.py +99 -5
  213. sglang/srt/lora/layers.py +32 -0
  214. sglang/srt/lora/lora.py +8 -3
  215. sglang/srt/lora/lora_manager.py +44 -118
  216. sglang/srt/lora/mem_pool.py +25 -11
  217. sglang/srt/lora/triton_ops/__init__.py +4 -0
  218. sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
  219. sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +174 -0
  220. sglang/srt/lora/utils.py +22 -11
  221. sglang/srt/managers/async_dynamic_batch_tokenizer.py +170 -0
  222. sglang/srt/managers/cache_controller.py +199 -301
  223. sglang/srt/managers/data_parallel_controller.py +115 -80
  224. sglang/srt/managers/detokenizer_manager.py +19 -15
  225. sglang/srt/managers/disagg_service.py +46 -0
  226. sglang/srt/managers/io_struct.py +340 -109
  227. sglang/srt/managers/mm_utils.py +44 -6
  228. sglang/srt/managers/multi_tokenizer_mixin.py +357 -407
  229. sglang/srt/managers/multimodal_processor.py +1 -2
  230. sglang/srt/managers/overlap_utils.py +55 -0
  231. sglang/srt/managers/schedule_batch.py +343 -212
  232. sglang/srt/managers/schedule_policy.py +145 -18
  233. sglang/srt/managers/scheduler.py +653 -273
  234. sglang/srt/managers/scheduler_input_blocker.py +1 -1
  235. sglang/srt/managers/scheduler_metrics_mixin.py +99 -126
  236. sglang/srt/managers/scheduler_output_processor_mixin.py +255 -108
  237. sglang/srt/managers/scheduler_profiler_mixin.py +6 -6
  238. sglang/srt/managers/scheduler_update_weights_mixin.py +7 -0
  239. sglang/srt/managers/tokenizer_communicator_mixin.py +675 -0
  240. sglang/srt/managers/tokenizer_manager.py +579 -674
  241. sglang/srt/managers/tp_worker.py +96 -26
  242. sglang/srt/managers/utils.py +1 -45
  243. sglang/srt/mem_cache/allocator.py +21 -22
  244. sglang/srt/mem_cache/allocator_ascend.py +41 -27
  245. sglang/srt/mem_cache/base_prefix_cache.py +1 -1
  246. sglang/srt/mem_cache/chunk_cache.py +9 -2
  247. sglang/srt/mem_cache/evict_policy.py +23 -0
  248. sglang/srt/mem_cache/hicache_storage.py +43 -24
  249. sglang/srt/mem_cache/hiradix_cache.py +222 -75
  250. sglang/srt/mem_cache/memory_pool.py +651 -80
  251. sglang/srt/mem_cache/memory_pool_host.py +239 -228
  252. sglang/srt/mem_cache/radix_cache.py +227 -73
  253. sglang/srt/mem_cache/radix_cache_cpp.py +11 -8
  254. sglang/srt/mem_cache/storage/__init__.py +10 -0
  255. sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +151 -0
  256. sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +109 -0
  257. sglang/srt/mem_cache/storage/backend_factory.py +223 -0
  258. sglang/srt/mem_cache/storage/eic/eic_storage.py +778 -0
  259. sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
  260. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +164 -0
  261. sglang/srt/mem_cache/storage/hf3fs/{client_hf3fs.py → hf3fs_usrbio_client.py} +5 -1
  262. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +259 -62
  263. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +284 -0
  264. sglang/srt/mem_cache/storage/lmcache/unit_test.py +121 -0
  265. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +166 -17
  266. sglang/srt/mem_cache/swa_radix_cache.py +93 -48
  267. sglang/srt/metrics/collector.py +511 -132
  268. sglang/srt/metrics/func_timer.py +2 -7
  269. sglang/srt/metrics/startup_func_log_and_timer.py +150 -0
  270. sglang/srt/metrics/utils.py +8 -1
  271. sglang/srt/model_executor/cpu_graph_runner.py +640 -0
  272. sglang/srt/model_executor/cuda_graph_runner.py +52 -37
  273. sglang/srt/model_executor/forward_batch_info.py +74 -46
  274. sglang/srt/model_executor/model_runner.py +455 -176
  275. sglang/srt/model_executor/npu_graph_runner.py +12 -5
  276. sglang/srt/model_loader/__init__.py +10 -4
  277. sglang/srt/model_loader/loader.py +319 -10
  278. sglang/srt/model_loader/remote_instance_weight_loader_utils.py +69 -0
  279. sglang/srt/model_loader/weight_utils.py +161 -3
  280. sglang/srt/models/apertus.py +686 -0
  281. sglang/srt/models/bailing_moe.py +820 -217
  282. sglang/srt/models/bailing_moe_nextn.py +168 -0
  283. sglang/srt/models/deepseek_nextn.py +6 -1
  284. sglang/srt/models/deepseek_v2.py +607 -130
  285. sglang/srt/models/dots_ocr.py +173 -0
  286. sglang/srt/models/dots_vlm.py +174 -0
  287. sglang/srt/models/dots_vlm_vit.py +337 -0
  288. sglang/srt/models/ernie4.py +1 -1
  289. sglang/srt/models/falcon_h1.py +578 -0
  290. sglang/srt/models/gemma3_causal.py +0 -2
  291. sglang/srt/models/gemma3_mm.py +17 -1
  292. sglang/srt/models/gemma3n_mm.py +2 -2
  293. sglang/srt/models/glm4_moe.py +4 -4
  294. sglang/srt/models/glm4_moe_nextn.py +2 -2
  295. sglang/srt/models/glm4v.py +5 -3
  296. sglang/srt/models/glm4v_moe.py +4 -1
  297. sglang/srt/models/gpt_oss.py +8 -31
  298. sglang/srt/models/grok.py +5 -13
  299. sglang/srt/models/kimi_vl_moonvit.py +2 -2
  300. sglang/srt/models/llama.py +4 -0
  301. sglang/srt/models/llama4.py +9 -0
  302. sglang/srt/models/llama_eagle3.py +13 -0
  303. sglang/srt/models/longcat_flash.py +3 -3
  304. sglang/srt/models/longcat_flash_nextn.py +1 -1
  305. sglang/srt/models/mixtral.py +1 -3
  306. sglang/srt/models/mllama4.py +50 -4
  307. sglang/srt/models/nemotron_h.py +514 -0
  308. sglang/srt/models/opt.py +637 -0
  309. sglang/srt/models/qwen2_5_vl.py +29 -5
  310. sglang/srt/models/qwen2_audio.py +1 -1
  311. sglang/srt/models/qwen2_moe.py +120 -13
  312. sglang/srt/models/qwen2_vl.py +1 -1
  313. sglang/srt/models/qwen3.py +18 -3
  314. sglang/srt/models/qwen3_moe.py +32 -4
  315. sglang/srt/models/qwen3_next.py +1069 -0
  316. sglang/srt/models/qwen3_next_mtp.py +112 -0
  317. sglang/srt/models/qwen3_vl.py +787 -0
  318. sglang/srt/models/qwen3_vl_moe.py +471 -0
  319. sglang/srt/models/registry.py +15 -3
  320. sglang/srt/models/sarashina2_vision.py +269 -0
  321. sglang/srt/models/solar.py +505 -0
  322. sglang/srt/models/starcoder2.py +357 -0
  323. sglang/srt/models/step3_vl.py +1 -1
  324. sglang/srt/models/torch_native_llama.py +9 -2
  325. sglang/srt/models/utils.py +55 -0
  326. sglang/srt/multimodal/processors/base_processor.py +15 -7
  327. sglang/srt/multimodal/processors/dots_vlm.py +98 -0
  328. sglang/srt/multimodal/processors/glm4v.py +9 -9
  329. sglang/srt/multimodal/processors/internvl.py +153 -129
  330. sglang/srt/multimodal/processors/qwen_vl.py +23 -6
  331. sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
  332. sglang/srt/offloader.py +27 -3
  333. sglang/srt/parser/jinja_template_utils.py +6 -0
  334. sglang/srt/sampling/sampling_batch_info.py +49 -26
  335. sglang/srt/sampling/sampling_params.py +7 -0
  336. sglang/srt/server_args.py +1051 -285
  337. sglang/srt/server_args_config_parser.py +146 -0
  338. sglang/srt/single_batch_overlap.py +151 -0
  339. sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
  340. sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
  341. sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
  342. sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
  343. sglang/srt/speculative/cpp_ngram/param.h +125 -0
  344. sglang/srt/speculative/cpp_ngram/queue.h +71 -0
  345. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +7 -1
  346. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +13 -2
  347. sglang/srt/speculative/{eagle_utils.py → eagle_info.py} +207 -757
  348. sglang/srt/speculative/eagle_worker.py +98 -29
  349. sglang/srt/speculative/ngram_info.py +428 -0
  350. sglang/srt/speculative/ngram_worker.py +246 -0
  351. sglang/srt/speculative/spec_info.py +52 -0
  352. sglang/srt/speculative/spec_utils.py +605 -0
  353. sglang/srt/speculative/standalone_worker.py +109 -0
  354. sglang/srt/torch_memory_saver_adapter.py +5 -7
  355. sglang/srt/tracing/trace.py +578 -0
  356. sglang/srt/two_batch_overlap.py +9 -5
  357. sglang/srt/utils/__init__.py +2 -0
  358. sglang/srt/{utils.py → utils/common.py} +451 -77
  359. sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +55 -5
  360. sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
  361. sglang/srt/utils/rpd_utils.py +452 -0
  362. sglang/srt/utils/slow_rank_detector.py +71 -0
  363. sglang/srt/warmup.py +8 -4
  364. sglang/srt/weight_sync/utils.py +2 -2
  365. sglang/test/attention/test_trtllm_mla_backend.py +169 -5
  366. sglang/test/get_logits_ut.py +57 -0
  367. sglang/test/longbench_v2/__init__.py +1 -0
  368. sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
  369. sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
  370. sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
  371. sglang/test/run_eval.py +119 -11
  372. sglang/test/runners.py +5 -1
  373. sglang/test/simple_eval_common.py +5 -2
  374. sglang/test/simple_eval_longbench_v2.py +332 -0
  375. sglang/test/simple_eval_mmmu_vlm.py +441 -0
  376. sglang/test/test_block_fp8.py +2 -2
  377. sglang/test/test_cutlass_moe.py +24 -6
  378. sglang/test/test_cutlass_w4a8_moe.py +9 -19
  379. sglang/test/test_deterministic.py +313 -0
  380. sglang/test/test_deterministic_utils.py +81 -0
  381. sglang/test/test_disaggregation_utils.py +140 -0
  382. sglang/test/test_fp4_moe.py +370 -1
  383. sglang/test/test_programs.py +1 -1
  384. sglang/test/test_utils.py +407 -8
  385. sglang/utils.py +21 -1
  386. sglang/version.py +1 -1
  387. {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/METADATA +69 -124
  388. {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/RECORD +392 -251
  389. sglang/srt/disaggregation/launch_lb.py +0 -118
  390. sglang/srt/managers/tp_worker_overlap_thread.py +0 -296
  391. sglang/srt/mem_cache/lora_radix_cache.py +0 -421
  392. sglang/test/test_block_fp8_ep.py +0 -358
  393. /sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
  394. {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/WHEEL +0 -0
  395. {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/licenses/LICENSE +0 -0
  396. {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/top_level.txt +0 -0
@@ -1,421 +0,0 @@
1
- """Radix cache for LoRA. It's modified based on RadixCache with lora_id added to the key of nodes."""
2
-
3
- import heapq
4
- import time
5
- from collections import defaultdict
6
- from typing import TYPE_CHECKING, Any, List, Optional
7
-
8
- import torch
9
-
10
- from sglang.srt.mem_cache.allocator import BaseTokenToKVPoolAllocator
11
- from sglang.srt.mem_cache.base_prefix_cache import BasePrefixCache, MatchResult
12
- from sglang.srt.mem_cache.memory_pool import ReqToTokenPool
13
-
14
- if TYPE_CHECKING:
15
- from sglang.srt.managers.schedule_batch import Req
16
- else:
17
- Req = Any # Placeholder for Req type when not type checking
18
-
19
-
20
- class LoRAKey:
21
-
22
- def __init__(self, lora_id: str, token_ids: List[int]):
23
- self.lora_id = (
24
- lora_id # lora_id of adaptor, should be hash value of adaptor path
25
- )
26
- self.token_ids = token_ids # token_ids of the key
27
-
28
- def __len__(self):
29
- return len(self.token_ids)
30
-
31
-
32
- def get_child_key(key: LoRAKey):
33
- # Here the key of children dict is the hash of lora_id + str(token_ids[0])
34
- # So the child key can be matched only when lora_id and token_ids[0] are the same
35
- if key.lora_id is None:
36
- return hash(str(key.token_ids[0]))
37
- else:
38
- return hash(key.lora_id + str(key.token_ids[0]))
39
-
40
-
41
- class LoRATreeNode:
42
-
43
- counter = 0
44
-
45
- def __init__(self, id: Optional[int] = None):
46
- self.children = defaultdict(LoRATreeNode)
47
- self.parent: LoRATreeNode = None
48
- self.key: LoRAKey = None
49
- self.value: Optional[torch.Tensor] = None
50
- self.lock_ref = 0
51
- self.last_access_time = time.monotonic()
52
-
53
- self.id = LoRATreeNode.counter if id is None else id
54
- LoRATreeNode.counter += 1
55
-
56
- @property
57
- def evicted(self):
58
- return self.value is None
59
-
60
- def __lt__(self, other: "LoRATreeNode"):
61
- return self.last_access_time < other.last_access_time
62
-
63
-
64
- def _key_match(key0: LoRAKey, key1: LoRAKey):
65
- if key0.lora_id != key1.lora_id:
66
- raise ValueError(
67
- f"_key_match should be run on the same lora_id, but got key0.lora_id={key0.lora_id} != key1.lora_id={key1.lora_id}"
68
- )
69
- i = 0
70
- for k0, k1 in zip(key0.token_ids, key1.token_ids):
71
- if k0 != k1:
72
- break
73
- i += 1
74
- return i
75
-
76
-
77
- class LoRARadixCache(BasePrefixCache):
78
-
79
- def __init__(
80
- self,
81
- req_to_token_pool: ReqToTokenPool,
82
- token_to_kv_pool_allocator: BaseTokenToKVPoolAllocator,
83
- page_size: int,
84
- disable: bool = False,
85
- ):
86
- if page_size > 1:
87
- raise ValueError("LoRARadixCache currently only supports page_size = 1")
88
-
89
- if token_to_kv_pool_allocator is None:
90
- raise ValueError(
91
- "token_to_kv_pool_allocator is required to run LoraRadixCache"
92
- )
93
-
94
- self.req_to_token_pool = req_to_token_pool
95
- self.token_to_kv_pool_allocator = token_to_kv_pool_allocator
96
- self.page_size = page_size
97
- self.disable = disable
98
- self.device = self.token_to_kv_pool_allocator.device
99
-
100
- self.key_match_fn = _key_match
101
- self.get_child_key_fn = get_child_key
102
- self.reset()
103
-
104
- def reset(self):
105
- self.root_node = LoRATreeNode()
106
- self.root_node.key = LoRAKey(lora_id="", token_ids=[])
107
- self.root_node.value = None
108
- self.evictable_size_ = 0
109
- self.protected_size_ = 0
110
-
111
- def match_prefix(self, key: List[int], **kwargs) -> MatchResult:
112
- raise ValueError(
113
- "LoRARadixCache needs both token ids and lora id as inputs for matching. Please use match_prefix_with_lora_id instead."
114
- )
115
-
116
- def match_prefix_with_lora_id(self, key: LoRAKey, **kwargs) -> MatchResult:
117
- """Find the matching prefix from the lora radix tree.
118
- Args:
119
- key: A LoRAKey to find a matching prefix.
120
- Returns:
121
- A tuple of a tensor of matching prefix token IDs and
122
- the last node that contains the prefix values. Note that
123
- this API can modify the internal state of the Radix tree.
124
- The last node create a new child if the prefix is shorter
125
- than the last node's value.
126
- """
127
- if self.disable or len(key) == 0:
128
- return MatchResult(
129
- device_indices=torch.empty(
130
- (0,),
131
- dtype=torch.int64,
132
- device=self.device,
133
- ),
134
- last_device_node=self.root_node,
135
- last_host_node=self.root_node,
136
- )
137
-
138
- value, last_node = self._match_prefix_helper(self.root_node, key)
139
- if value:
140
- value = torch.cat(value)
141
- else:
142
- value = torch.empty((0,), dtype=torch.int64, device=self.device)
143
- return MatchResult(
144
- device_indices=value,
145
- last_device_node=last_node,
146
- last_host_node=last_node,
147
- )
148
-
149
- def insert(self, key: LoRAKey, value=None):
150
- if self.disable:
151
- return 0
152
-
153
- if value is None:
154
- value = [x for x in key.token_ids]
155
- return self._insert_helper(self.root_node, key, value)
156
-
157
- def cache_finished_req(self, req: Req):
158
- """Cache request when it finishes."""
159
- if self.disable:
160
- kv_indices = self.req_to_token_pool.req_to_token[
161
- req.req_pool_idx, : len(req.origin_input_ids) + len(req.output_ids) - 1
162
- ]
163
- self.token_to_kv_pool_allocator.free(kv_indices)
164
- self.req_to_token_pool.free(req.req_pool_idx)
165
- return
166
-
167
- token_ids = (req.origin_input_ids + req.output_ids)[:-1]
168
- kv_indices = self.req_to_token_pool.req_to_token[
169
- req.req_pool_idx, : len(token_ids)
170
- ]
171
-
172
- page_aligned_len = len(kv_indices)
173
- page_aligned_kv_indices = kv_indices.to(dtype=torch.int64, copy=True)
174
-
175
- # Radix Cache takes one ref in memory pool
176
- lora_key = LoRAKey(lora_id=req.lora_id, token_ids=token_ids[:page_aligned_len])
177
- new_prefix_len = self.insert(lora_key, page_aligned_kv_indices)
178
- self.token_to_kv_pool_allocator.free(
179
- kv_indices[len(req.prefix_indices) : new_prefix_len]
180
- )
181
-
182
- # Remove req slot release the cache lock
183
- self.req_to_token_pool.free(req.req_pool_idx)
184
- self.dec_lock_ref(req.last_node)
185
-
186
- def cache_unfinished_req(self, req: Req, chunked=False):
187
- """Cache request when it is unfinished."""
188
- if self.disable:
189
- return
190
-
191
- token_ids = req.fill_ids
192
- kv_indices = self.req_to_token_pool.req_to_token[
193
- req.req_pool_idx, : len(token_ids)
194
- ]
195
-
196
- page_aligned_len = len(kv_indices)
197
- page_aligned_kv_indices = kv_indices.to(dtype=torch.int64, copy=True)
198
- page_aligned_token_ids = token_ids[:page_aligned_len]
199
-
200
- # Radix Cache takes one ref in memory pool
201
- inserted_key = LoRAKey(lora_id=req.lora_id, token_ids=page_aligned_token_ids)
202
- new_prefix_len = self.insert(inserted_key, page_aligned_kv_indices)
203
- self.token_to_kv_pool_allocator.free(
204
- kv_indices[len(req.prefix_indices) : new_prefix_len]
205
- )
206
-
207
- # The prefix indices could be updated, reuse it
208
- new_indices, new_last_node, _, _ = self.match_prefix_with_lora_id(inserted_key)
209
- self.req_to_token_pool.write(
210
- (req.req_pool_idx, slice(len(req.prefix_indices), len(new_indices))),
211
- new_indices[len(req.prefix_indices) :],
212
- )
213
-
214
- self.dec_lock_ref(req.last_node)
215
- self.inc_lock_ref(new_last_node)
216
-
217
- # `req.prefix_indices` will be used in `PrefillAdder::add_chunked_req` later
218
- req.prefix_indices = new_indices
219
- req.last_node = new_last_node
220
-
221
- def pretty_print(self):
222
- self._print_helper(self.root_node, 0)
223
- print(f"#tokens: {self.total_size()}")
224
-
225
- def total_size(self):
226
- return self._total_size_helper()
227
-
228
def evict(self, num_tokens: int):
    """Free at least `num_tokens` cached tokens by evicting unlocked leaves.

    Leaves are popped in heap order (least-recently-used first, per the
    node ordering); locked leaves are skipped and the root is never evicted.
    """
    if self.disable:
        return

    leaves = self._collect_leaves()
    heapq.heapify(leaves)

    freed = 0
    while freed < num_tokens and leaves:
        victim = heapq.heappop(leaves)

        # Never evict the root; stop entirely once we reach it.
        if victim == self.root_node:
            break
        # Skip nodes still pinned by running requests.
        if victim.lock_ref > 0:
            continue

        self.token_to_kv_pool_allocator.free(victim.value)
        freed += len(victim.value)
        self._delete_leaf(victim)

        # Deleting this leaf may have turned its parent into a leaf.
        if len(victim.parent.children) == 0:
            heapq.heappush(leaves, victim.parent)
250
-
251
def inc_lock_ref(self, node: "LoRATreeNode"):
    """Pin `node` and every ancestor up to (not including) the root.

    Returns the change in evictable size (always <= 0): tokens that become
    locked for the first time move from evictable to protected accounting.
    """
    if self.disable:
        return 0

    delta = 0
    cur = node
    while cur != self.root_node:
        if cur.lock_ref == 0:
            # First lock on this span: it is no longer evictable.
            span = len(cur.value)
            self.evictable_size_ -= span
            self.protected_size_ += span
            delta -= span
        cur.lock_ref += 1
        cur = cur.parent
    return delta
264
-
265
def dec_lock_ref(self, node: "LoRATreeNode"):
    """Unpin `node` and every ancestor up to (not including) the root.

    Returns the change in evictable size (always >= 0): spans whose last
    lock is dropped move back from protected to evictable accounting.
    """
    if self.disable:
        return 0

    delta = 0
    cur = node
    while cur != self.root_node:
        if cur.lock_ref == 1:
            # Dropping the final reference makes this span evictable again.
            span = len(cur.value)
            self.evictable_size_ += span
            self.protected_size_ -= span
            delta += span
        cur.lock_ref -= 1
        cur = cur.parent
    return delta
278
-
279
def evictable_size(self):
    """Number of cached tokens that are unlocked and may be evicted."""
    return self.evictable_size_
281
-
282
def protected_size(self):
    """Number of cached tokens currently locked by in-flight requests."""
    # Protected size refers to the size of the cache that is locked.
    return self.protected_size_
285
-
286
def all_values_flatten(self):
    """Concatenate the `value` tensors of every node (preorder DFS) into one tensor.

    NOTE(review): raises on an empty tree (`torch.cat` of an empty list) —
    callers are presumably expected to check for cached content first.
    """
    values = []

    # Recursive preorder walk; recursion keeps the exact visit order of the
    # children dicts, which determines the concatenation order.
    def visit(node: "LoRATreeNode"):
        for child in node.children.values():
            values.append(child.value)
            visit(child)

    visit(self.root_node)
    return torch.cat(values)
296
-
297
- ##### Internal Helper Functions #####
298
-
299
def _match_prefix_helper(self, node: LoRATreeNode, key: LoRAKey):
    """Walk down from `node` matching `key`, collecting cached KV indices.

    If the match ends partway through a child, that child is split so the
    returned node exactly covers the matched prefix.

    Returns:
        (value, node): `value` is the list of per-node KV index tensors
        along the matched path; `node` is the deepest matched node.
    """
    node.last_access_time = time.monotonic()

    child_key = self.get_child_key_fn(key)

    value = []
    # Idiom fix: membership test directly on the dict instead of `.keys()`.
    while len(key) > 0 and child_key in node.children:
        child = node.children[child_key]
        child.last_access_time = time.monotonic()
        prefix_len = self.key_match_fn(child.key, key)
        if prefix_len < len(child.key):
            # Partial match: split the child so the matched part becomes its
            # own node, then stop — nothing deeper can match.
            new_node = self._split_node(child.key, child, prefix_len)
            value.append(new_node.value)
            node = new_node
            break
        else:
            # Full-node match: consume it and keep descending with the
            # remaining suffix of the key.
            value.append(child.value)
            node = child
            key = LoRAKey(lora_id=key.lora_id, token_ids=key.token_ids[prefix_len:])

            if len(key):
                child_key = self.get_child_key_fn(key)

    return value, node
323
-
324
def _split_node(self, key: LoRAKey, child: LoRATreeNode, split_len: int):
    """Split `child` at `split_len` tokens, inserting a new intermediate node.

    `key` is `child`'s full key before the split. After the call the chain is
    parent -> new_node (first `split_len` tokens) -> child (remaining suffix).
    Returns the new intermediate node.
    """
    # new_node -> child
    new_node = LoRATreeNode()
    key_split_1 = LoRAKey(lora_id=key.lora_id, token_ids=key.token_ids[:split_len])
    key_split_2 = LoRAKey(lora_id=key.lora_id, token_ids=key.token_ids[split_len:])
    # The new node adopts `child`, inheriting its lock count so existing
    # pins on the span remain accounted for.
    new_node.children = {self.get_child_key_fn(key_split_2): child}
    new_node.parent = child.parent
    new_node.lock_ref = child.lock_ref
    new_node.key = key_split_1
    new_node.value = child.value[:split_len]
    child.parent = new_node
    child.key = key_split_2
    child.value = child.value[split_len:]
    # Re-file the new node in the original parent under the slot that
    # previously pointed at `child` (same child key, since it shares the prefix).
    new_node.parent.children[self.get_child_key_fn(key)] = new_node

    return new_node
340
-
341
def _insert_helper(self, node: LoRATreeNode, key: LoRAKey, value):
    """Insert (key, value) under `node`, reusing any already-cached prefix.

    Descends while the next chunk of `key` already exists, splitting a node
    when a match ends mid-node, and finally attaches the unmatched suffix
    as a new leaf (which starts evictable).

    Returns:
        Total number of tokens of `key` that were already present in the
        tree (the matched prefix length).
    """
    node.last_access_time = time.monotonic()
    if len(key) == 0:
        return 0

    child_key = self.get_child_key_fn(key)

    total_prefix_length = 0
    # Idiom fix: membership test directly on the dict instead of `.keys()`.
    while len(key) > 0 and child_key in node.children:
        node = node.children[child_key]
        node.last_access_time = time.monotonic()
        prefix_len = self.key_match_fn(node.key, key)
        total_prefix_length += prefix_len
        key = LoRAKey(lora_id=key.lora_id, token_ids=key.token_ids[prefix_len:])
        value = value[prefix_len:]

        if prefix_len < len(node.key):
            # Partial match: split so the shared prefix becomes an ancestor
            # and continue the descent from it.
            new_node = self._split_node(node.key, node, prefix_len)
            node = new_node

        if len(key):
            child_key = self.get_child_key_fn(key)

    if len(key):
        # Attach the unmatched suffix as a fresh (unlocked) leaf.
        new_node = LoRATreeNode()
        new_node.parent = node
        new_node.key = key
        new_node.value = value
        node.children[child_key] = new_node
        self.evictable_size_ += len(value)
    return total_prefix_length
372
-
373
- def _print_helper(self, node: LoRATreeNode, indent: int):
374
- """Prints the radix tree in a human-readable format."""
375
- stack = [(node, indent)]
376
- while stack:
377
- current_node, current_indent = stack.pop()
378
- print(
379
- " " * current_indent,
380
- len(current_node.key),
381
- current_node.key.token_ids[:10],
382
- f"r={current_node.lock_ref}",
383
- )
384
- for key, child in current_node.children.items():
385
- stack.append((child, current_indent + 2))
386
-
387
- assert key == self.get_child_key_fn(
388
- child.key
389
- ), f"{key=}, {self.get_child_key_fn(child.key)=}"
390
-
391
- def _delete_leaf(self, node):
392
- for k, v in node.parent.children.items():
393
- if v == node:
394
- break
395
- del node.parent.children[k]
396
- self.evictable_size_ -= len(node.key)
397
-
398
- def _total_size_helper(self):
399
- total_size = 0
400
- stack = [self.root_node]
401
- while stack:
402
- current_node = stack.pop()
403
- total_size += len(current_node.value)
404
- for child in current_node.children.values():
405
- if child.evicted:
406
- continue
407
- stack.append(child)
408
- return total_size
409
-
410
- def _collect_leaves(self):
411
- ret_list = []
412
- stack = [self.root_node]
413
-
414
- while stack:
415
- cur_node = stack.pop()
416
- if len(cur_node.children) == 0:
417
- ret_list.append(cur_node)
418
- else:
419
- stack.extend(cur_node.children.values())
420
-
421
- return ret_list