sglang 0.5.2rc2__py3-none-any.whl → 0.5.3rc2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (377) hide show
  1. sglang/bench_one_batch.py +7 -9
  2. sglang/bench_one_batch_server.py +330 -31
  3. sglang/bench_serving.py +267 -32
  4. sglang/global_config.py +2 -2
  5. sglang/lang/backend/runtime_endpoint.py +1 -1
  6. sglang/launch_server.py +14 -0
  7. sglang/profiler.py +2 -2
  8. sglang/srt/batch_invariant_ops/__init__.py +27 -0
  9. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +549 -0
  10. sglang/srt/configs/__init__.py +8 -0
  11. sglang/srt/configs/device_config.py +3 -1
  12. sglang/srt/configs/dots_ocr.py +64 -0
  13. sglang/srt/configs/dots_vlm.py +139 -0
  14. sglang/srt/configs/falcon_h1.py +360 -0
  15. sglang/srt/configs/load_config.py +9 -0
  16. sglang/srt/configs/model_config.py +181 -82
  17. sglang/srt/configs/qwen3_next.py +326 -0
  18. sglang/srt/configs/qwen3_vl.py +586 -0
  19. sglang/srt/connector/__init__.py +8 -1
  20. sglang/srt/connector/remote_instance.py +82 -0
  21. sglang/srt/constrained/base_grammar_backend.py +49 -12
  22. sglang/srt/constrained/llguidance_backend.py +0 -1
  23. sglang/srt/constrained/outlines_backend.py +0 -1
  24. sglang/srt/constrained/outlines_jump_forward.py +1 -1
  25. sglang/srt/constrained/xgrammar_backend.py +30 -9
  26. sglang/srt/custom_op.py +11 -1
  27. sglang/srt/debug_utils/dump_comparator.py +81 -44
  28. sglang/srt/debug_utils/dump_loader.py +97 -0
  29. sglang/srt/debug_utils/dumper.py +21 -6
  30. sglang/srt/debug_utils/text_comparator.py +73 -11
  31. sglang/srt/disaggregation/ascend/conn.py +2 -2
  32. sglang/srt/disaggregation/ascend/transfer_engine.py +47 -9
  33. sglang/srt/disaggregation/base/conn.py +1 -1
  34. sglang/srt/disaggregation/common/conn.py +279 -108
  35. sglang/srt/disaggregation/decode.py +71 -19
  36. sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
  37. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +29 -17
  38. sglang/srt/disaggregation/fake/conn.py +1 -1
  39. sglang/srt/disaggregation/mini_lb.py +6 -445
  40. sglang/srt/disaggregation/mooncake/conn.py +55 -537
  41. sglang/srt/disaggregation/nixl/conn.py +326 -53
  42. sglang/srt/disaggregation/prefill.py +36 -17
  43. sglang/srt/disaggregation/utils.py +40 -54
  44. sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
  45. sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
  46. sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
  47. sglang/srt/distributed/parallel_state.py +156 -80
  48. sglang/srt/entrypoints/engine.py +59 -18
  49. sglang/srt/entrypoints/grpc_request_manager.py +855 -0
  50. sglang/srt/entrypoints/grpc_server.py +810 -0
  51. sglang/srt/entrypoints/http_server.py +130 -59
  52. sglang/srt/entrypoints/openai/protocol.py +112 -4
  53. sglang/srt/entrypoints/openai/serving_base.py +65 -3
  54. sglang/srt/entrypoints/openai/serving_chat.py +204 -55
  55. sglang/srt/entrypoints/openai/serving_completions.py +14 -3
  56. sglang/srt/entrypoints/openai/serving_embedding.py +9 -3
  57. sglang/srt/entrypoints/openai/serving_rerank.py +3 -1
  58. sglang/srt/entrypoints/openai/serving_responses.py +48 -3
  59. sglang/srt/entrypoints/openai/serving_score.py +1 -0
  60. sglang/srt/environ.py +285 -0
  61. sglang/srt/eplb/eplb_manager.py +2 -2
  62. sglang/srt/eplb/expert_distribution.py +26 -13
  63. sglang/srt/eplb/expert_location.py +38 -8
  64. sglang/srt/eplb/expert_location_updater.py +1 -1
  65. sglang/srt/function_call/base_format_detector.py +3 -6
  66. sglang/srt/function_call/ebnf_composer.py +11 -9
  67. sglang/srt/function_call/function_call_parser.py +9 -2
  68. sglang/srt/function_call/glm4_moe_detector.py +4 -4
  69. sglang/srt/function_call/gpt_oss_detector.py +23 -0
  70. sglang/srt/function_call/json_array_parser.py +63 -0
  71. sglang/srt/function_call/kimik2_detector.py +17 -4
  72. sglang/srt/function_call/qwen3_coder_detector.py +1 -1
  73. sglang/srt/function_call/utils.py +96 -5
  74. sglang/srt/grpc/__init__.py +1 -0
  75. sglang/srt/grpc/compile_proto.py +245 -0
  76. sglang/srt/grpc/sglang_scheduler_pb2.py +111 -0
  77. sglang/srt/grpc/sglang_scheduler_pb2.pyi +434 -0
  78. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +239 -0
  79. sglang/srt/layers/activation.py +143 -9
  80. sglang/srt/layers/attention/aiter_backend.py +14 -15
  81. sglang/srt/layers/attention/ascend_backend.py +115 -9
  82. sglang/srt/layers/attention/attention_registry.py +206 -0
  83. sglang/srt/layers/attention/base_attn_backend.py +12 -3
  84. sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
  85. sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
  86. sglang/srt/layers/attention/fla/chunk.py +242 -0
  87. sglang/srt/layers/attention/fla/chunk_delta_h.py +314 -0
  88. sglang/srt/layers/attention/fla/chunk_o.py +178 -0
  89. sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +151 -0
  90. sglang/srt/layers/attention/fla/cumsum.py +300 -0
  91. sglang/srt/layers/attention/fla/fused_recurrent.py +640 -0
  92. sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +232 -0
  93. sglang/srt/layers/attention/fla/index.py +37 -0
  94. sglang/srt/layers/attention/fla/l2norm.py +150 -0
  95. sglang/srt/layers/attention/fla/layernorm_gated.py +326 -0
  96. sglang/srt/layers/attention/fla/op.py +66 -0
  97. sglang/srt/layers/attention/fla/solve_tril.py +465 -0
  98. sglang/srt/layers/attention/fla/utils.py +331 -0
  99. sglang/srt/layers/attention/fla/wy_fast.py +158 -0
  100. sglang/srt/layers/attention/flashattention_backend.py +41 -8
  101. sglang/srt/layers/attention/flashinfer_backend.py +118 -198
  102. sglang/srt/layers/attention/flashinfer_mla_backend.py +27 -27
  103. sglang/srt/layers/attention/flashmla_backend.py +7 -5
  104. sglang/srt/layers/attention/hybrid_attn_backend.py +68 -53
  105. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +602 -0
  106. sglang/srt/layers/attention/intel_amx_backend.py +3 -0
  107. sglang/srt/layers/attention/mamba/causal_conv1d.py +129 -0
  108. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +969 -0
  109. sglang/srt/layers/attention/mamba/mamba.py +629 -0
  110. sglang/srt/layers/attention/mamba/mamba_utils.py +81 -0
  111. sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
  112. sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
  113. sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
  114. sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +264 -0
  115. sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +622 -0
  116. sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +757 -0
  117. sglang/srt/layers/attention/mamba/ops/ssd_combined.py +262 -0
  118. sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +275 -0
  119. sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
  120. sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
  121. sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
  122. sglang/srt/layers/attention/nsa/nsa_indexer.py +761 -0
  123. sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
  124. sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
  125. sglang/srt/layers/attention/nsa/transform_index.py +144 -0
  126. sglang/srt/layers/attention/nsa/utils.py +24 -0
  127. sglang/srt/layers/attention/nsa_backend.py +887 -0
  128. sglang/srt/layers/attention/tbo_backend.py +6 -6
  129. sglang/srt/layers/attention/torch_flex_backend.py +325 -0
  130. sglang/srt/layers/attention/torch_native_backend.py +12 -6
  131. sglang/srt/layers/attention/triton_backend.py +57 -7
  132. sglang/srt/layers/attention/trtllm_mha_backend.py +5 -7
  133. sglang/srt/layers/attention/trtllm_mla_backend.py +276 -39
  134. sglang/srt/layers/attention/vision.py +58 -0
  135. sglang/srt/layers/attention/wave_backend.py +4 -4
  136. sglang/srt/layers/attention/wave_ops/decode_attention.py +2 -4
  137. sglang/srt/layers/attention/wave_ops/extend_attention.py +1 -3
  138. sglang/srt/layers/communicator.py +8 -0
  139. sglang/srt/layers/dp_attention.py +41 -2
  140. sglang/srt/layers/elementwise.py +3 -1
  141. sglang/srt/layers/layernorm.py +34 -15
  142. sglang/srt/layers/linear.py +55 -7
  143. sglang/srt/layers/logits_processor.py +44 -12
  144. sglang/srt/layers/moe/__init__.py +2 -1
  145. sglang/srt/layers/moe/cutlass_w4a8_moe.py +3 -3
  146. sglang/srt/layers/moe/ep_moe/kernels.py +2 -2
  147. sglang/srt/layers/moe/ep_moe/layer.py +256 -63
  148. sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +183 -0
  149. sglang/srt/layers/moe/fused_moe_native.py +5 -3
  150. sglang/srt/layers/moe/fused_moe_triton/configs/{triton_3_4_0/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json } +35 -35
  151. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=352,device_name=NVIDIA_RTX_5880_Ada_Generation,dtype=fp8_w8a8.json +146 -0
  152. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  153. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=512,device_name=NVIDIA_H20.json +146 -0
  154. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  155. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
  156. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
  157. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  158. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
  159. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
  160. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
  161. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  162. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
  163. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -2
  164. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +7 -3
  165. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +23 -20
  166. sglang/srt/layers/moe/fused_moe_triton/layer.py +71 -70
  167. sglang/srt/layers/moe/moe_runner/__init__.py +2 -1
  168. sglang/srt/layers/moe/moe_runner/base.py +274 -1
  169. sglang/srt/layers/moe/moe_runner/runner.py +80 -0
  170. sglang/srt/layers/moe/moe_runner/triton.py +448 -0
  171. sglang/srt/layers/moe/token_dispatcher/__init__.py +16 -4
  172. sglang/srt/layers/moe/token_dispatcher/{base_dispatcher.py → base.py} +67 -17
  173. sglang/srt/layers/moe/token_dispatcher/deepep.py +118 -56
  174. sglang/srt/layers/moe/token_dispatcher/standard.py +44 -2
  175. sglang/srt/layers/moe/topk.py +30 -9
  176. sglang/srt/layers/moe/utils.py +22 -6
  177. sglang/srt/layers/parameter.py +23 -6
  178. sglang/srt/layers/quantization/awq.py +19 -7
  179. sglang/srt/layers/quantization/base_config.py +11 -6
  180. sglang/srt/layers/quantization/blockwise_int8.py +38 -27
  181. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +1 -0
  182. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +50 -30
  183. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +2 -0
  184. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +13 -1
  185. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
  186. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +2 -10
  187. sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +27 -0
  188. sglang/srt/layers/quantization/fp8.py +78 -49
  189. sglang/srt/layers/quantization/fp8_utils.py +51 -32
  190. sglang/srt/layers/quantization/gptq.py +25 -17
  191. sglang/srt/layers/quantization/modelopt_quant.py +190 -55
  192. sglang/srt/layers/quantization/moe_wna16.py +21 -18
  193. sglang/srt/layers/quantization/mxfp4.py +74 -42
  194. sglang/srt/layers/quantization/quark/quark_moe.py +48 -30
  195. sglang/srt/layers/quantization/unquant.py +135 -47
  196. sglang/srt/layers/quantization/w4afp8.py +26 -17
  197. sglang/srt/layers/quantization/w8a8_fp8.py +35 -20
  198. sglang/srt/layers/quantization/w8a8_int8.py +91 -41
  199. sglang/srt/layers/rotary_embedding.py +78 -31
  200. sglang/srt/layers/sampler.py +213 -21
  201. sglang/srt/layers/utils.py +23 -0
  202. sglang/srt/lora/backend/base_backend.py +50 -8
  203. sglang/srt/lora/backend/chunked_backend.py +348 -0
  204. sglang/srt/lora/backend/triton_backend.py +99 -5
  205. sglang/srt/lora/layers.py +32 -0
  206. sglang/srt/lora/lora.py +8 -3
  207. sglang/srt/lora/lora_manager.py +52 -118
  208. sglang/srt/lora/mem_pool.py +25 -11
  209. sglang/srt/lora/triton_ops/__init__.py +4 -0
  210. sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
  211. sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +174 -0
  212. sglang/srt/lora/utils.py +22 -11
  213. sglang/srt/managers/async_dynamic_batch_tokenizer.py +170 -0
  214. sglang/srt/managers/cache_controller.py +199 -301
  215. sglang/srt/managers/data_parallel_controller.py +115 -80
  216. sglang/srt/managers/detokenizer_manager.py +19 -15
  217. sglang/srt/managers/disagg_service.py +46 -0
  218. sglang/srt/managers/io_struct.py +340 -109
  219. sglang/srt/managers/mm_utils.py +44 -6
  220. sglang/srt/managers/multi_tokenizer_mixin.py +357 -407
  221. sglang/srt/managers/multimodal_processor.py +1 -2
  222. sglang/srt/managers/overlap_utils.py +53 -0
  223. sglang/srt/managers/schedule_batch.py +240 -138
  224. sglang/srt/managers/schedule_policy.py +144 -17
  225. sglang/srt/managers/scheduler.py +502 -209
  226. sglang/srt/managers/scheduler_input_blocker.py +1 -1
  227. sglang/srt/managers/scheduler_metrics_mixin.py +99 -126
  228. sglang/srt/managers/scheduler_output_processor_mixin.py +75 -22
  229. sglang/srt/managers/scheduler_profiler_mixin.py +6 -6
  230. sglang/srt/managers/scheduler_update_weights_mixin.py +7 -0
  231. sglang/srt/managers/tokenizer_communicator_mixin.py +675 -0
  232. sglang/srt/managers/tokenizer_manager.py +320 -632
  233. sglang/srt/managers/tp_worker.py +81 -22
  234. sglang/srt/managers/tp_worker_overlap_thread.py +71 -56
  235. sglang/srt/managers/utils.py +1 -45
  236. sglang/srt/mem_cache/allocator.py +14 -20
  237. sglang/srt/mem_cache/allocator_ascend.py +41 -27
  238. sglang/srt/mem_cache/base_prefix_cache.py +1 -1
  239. sglang/srt/mem_cache/chunk_cache.py +8 -1
  240. sglang/srt/mem_cache/evict_policy.py +23 -0
  241. sglang/srt/mem_cache/hicache_storage.py +43 -24
  242. sglang/srt/mem_cache/hiradix_cache.py +222 -75
  243. sglang/srt/mem_cache/memory_pool.py +535 -58
  244. sglang/srt/mem_cache/memory_pool_host.py +239 -228
  245. sglang/srt/mem_cache/radix_cache.py +222 -73
  246. sglang/srt/mem_cache/radix_cache_cpp.py +11 -8
  247. sglang/srt/mem_cache/storage/__init__.py +10 -0
  248. sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +151 -0
  249. sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +109 -0
  250. sglang/srt/mem_cache/storage/backend_factory.py +223 -0
  251. sglang/srt/mem_cache/storage/eic/eic_storage.py +778 -0
  252. sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
  253. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +164 -0
  254. sglang/srt/mem_cache/storage/hf3fs/{client_hf3fs.py → hf3fs_usrbio_client.py} +5 -1
  255. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +259 -62
  256. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +284 -0
  257. sglang/srt/mem_cache/storage/lmcache/unit_test.py +121 -0
  258. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +166 -17
  259. sglang/srt/mem_cache/swa_radix_cache.py +25 -36
  260. sglang/srt/metrics/collector.py +511 -132
  261. sglang/srt/metrics/func_timer.py +2 -7
  262. sglang/srt/metrics/startup_func_log_and_timer.py +150 -0
  263. sglang/srt/metrics/utils.py +8 -1
  264. sglang/srt/model_executor/cpu_graph_runner.py +640 -0
  265. sglang/srt/model_executor/cuda_graph_runner.py +52 -37
  266. sglang/srt/model_executor/forward_batch_info.py +82 -40
  267. sglang/srt/model_executor/model_runner.py +432 -157
  268. sglang/srt/model_executor/npu_graph_runner.py +12 -5
  269. sglang/srt/model_loader/__init__.py +9 -3
  270. sglang/srt/model_loader/loader.py +133 -5
  271. sglang/srt/model_loader/remote_instance_weight_loader_utils.py +69 -0
  272. sglang/srt/model_loader/weight_utils.py +158 -3
  273. sglang/srt/models/apertus.py +686 -0
  274. sglang/srt/models/bailing_moe.py +820 -217
  275. sglang/srt/models/bailing_moe_nextn.py +168 -0
  276. sglang/srt/models/deepseek_nextn.py +6 -1
  277. sglang/srt/models/deepseek_v2.py +607 -130
  278. sglang/srt/models/dots_ocr.py +173 -0
  279. sglang/srt/models/dots_vlm.py +174 -0
  280. sglang/srt/models/dots_vlm_vit.py +337 -0
  281. sglang/srt/models/ernie4.py +1 -1
  282. sglang/srt/models/falcon_h1.py +576 -0
  283. sglang/srt/models/gemma3_causal.py +0 -2
  284. sglang/srt/models/gemma3_mm.py +1 -1
  285. sglang/srt/models/gemma3n_mm.py +2 -2
  286. sglang/srt/models/glm4_moe.py +4 -4
  287. sglang/srt/models/glm4_moe_nextn.py +2 -2
  288. sglang/srt/models/glm4v.py +5 -3
  289. sglang/srt/models/glm4v_moe.py +4 -1
  290. sglang/srt/models/gpt_oss.py +8 -31
  291. sglang/srt/models/kimi_vl_moonvit.py +2 -2
  292. sglang/srt/models/llama.py +4 -0
  293. sglang/srt/models/llama4.py +9 -0
  294. sglang/srt/models/llama_eagle3.py +13 -0
  295. sglang/srt/models/longcat_flash.py +3 -3
  296. sglang/srt/models/longcat_flash_nextn.py +1 -1
  297. sglang/srt/models/mllama4.py +40 -4
  298. sglang/srt/models/opt.py +637 -0
  299. sglang/srt/models/qwen2_5_vl.py +29 -5
  300. sglang/srt/models/qwen2_audio.py +1 -1
  301. sglang/srt/models/qwen2_moe.py +120 -13
  302. sglang/srt/models/qwen2_vl.py +1 -1
  303. sglang/srt/models/qwen3.py +18 -3
  304. sglang/srt/models/qwen3_moe.py +32 -4
  305. sglang/srt/models/qwen3_next.py +1069 -0
  306. sglang/srt/models/qwen3_next_mtp.py +112 -0
  307. sglang/srt/models/qwen3_vl.py +787 -0
  308. sglang/srt/models/qwen3_vl_moe.py +471 -0
  309. sglang/srt/models/registry.py +15 -3
  310. sglang/srt/models/sarashina2_vision.py +269 -0
  311. sglang/srt/models/solar.py +505 -0
  312. sglang/srt/models/starcoder2.py +357 -0
  313. sglang/srt/models/step3_vl.py +1 -1
  314. sglang/srt/models/torch_native_llama.py +9 -2
  315. sglang/srt/models/utils.py +51 -0
  316. sglang/srt/multimodal/processors/base_processor.py +15 -7
  317. sglang/srt/multimodal/processors/dots_vlm.py +98 -0
  318. sglang/srt/multimodal/processors/glm4v.py +9 -9
  319. sglang/srt/multimodal/processors/internvl.py +153 -129
  320. sglang/srt/multimodal/processors/qwen_vl.py +23 -6
  321. sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
  322. sglang/srt/offloader.py +27 -3
  323. sglang/srt/parser/jinja_template_utils.py +6 -0
  324. sglang/srt/sampling/sampling_batch_info.py +38 -17
  325. sglang/srt/sampling/sampling_params.py +7 -0
  326. sglang/srt/server_args.py +966 -267
  327. sglang/srt/server_args_config_parser.py +146 -0
  328. sglang/srt/single_batch_overlap.py +151 -0
  329. sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
  330. sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
  331. sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
  332. sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
  333. sglang/srt/speculative/cpp_ngram/param.h +125 -0
  334. sglang/srt/speculative/cpp_ngram/queue.h +71 -0
  335. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +7 -1
  336. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +13 -2
  337. sglang/srt/speculative/{eagle_utils.py → eagle_info.py} +207 -757
  338. sglang/srt/speculative/eagle_worker.py +99 -28
  339. sglang/srt/speculative/ngram_utils.py +428 -0
  340. sglang/srt/speculative/ngram_worker.py +245 -0
  341. sglang/srt/speculative/spec_info.py +52 -0
  342. sglang/srt/speculative/spec_utils.py +606 -0
  343. sglang/srt/speculative/standalone_worker.py +109 -0
  344. sglang/srt/torch_memory_saver_adapter.py +5 -7
  345. sglang/srt/tracing/trace.py +578 -0
  346. sglang/srt/two_batch_overlap.py +8 -5
  347. sglang/srt/utils/__init__.py +2 -0
  348. sglang/srt/{utils.py → utils/common.py} +433 -77
  349. sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +53 -5
  350. sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
  351. sglang/srt/utils/rpd_utils.py +452 -0
  352. sglang/srt/utils/slow_rank_detector.py +71 -0
  353. sglang/srt/warmup.py +8 -4
  354. sglang/srt/weight_sync/utils.py +2 -2
  355. sglang/test/attention/test_trtllm_mla_backend.py +169 -5
  356. sglang/test/get_logits_ut.py +57 -0
  357. sglang/test/run_eval.py +79 -11
  358. sglang/test/runners.py +5 -1
  359. sglang/test/simple_eval_common.py +5 -2
  360. sglang/test/simple_eval_mmmu_vlm.py +441 -0
  361. sglang/test/test_block_fp8.py +2 -2
  362. sglang/test/test_cutlass_moe.py +24 -6
  363. sglang/test/test_deterministic.py +297 -0
  364. sglang/test/test_disaggregation_utils.py +77 -0
  365. sglang/test/test_fp4_moe.py +370 -1
  366. sglang/test/test_programs.py +1 -1
  367. sglang/test/test_utils.py +383 -5
  368. sglang/utils.py +21 -1
  369. sglang/version.py +1 -1
  370. {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc2.dist-info}/METADATA +69 -124
  371. {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc2.dist-info}/RECORD +375 -245
  372. sglang/srt/disaggregation/launch_lb.py +0 -118
  373. sglang/srt/mem_cache/lora_radix_cache.py +0 -421
  374. /sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
  375. {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc2.dist-info}/WHEEL +0 -0
  376. {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc2.dist-info}/licenses/LICENSE +0 -0
  377. {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc2.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,174 @@
1
+ import torch
2
+ import triton
3
+ import triton.language as tl
4
+
5
+ from sglang.srt.lora.utils import LoRABatchInfo
6
+ from sglang.srt.utils import cached_triton_kernel
7
+
8
+
9
@cached_triton_kernel(lambda _, kwargs: (kwargs["NUM_SLICES"], kwargs["BLOCK_M"]))
@triton.jit
def _chunked_lora_shrink_kernel(
    # Pointers to matrices
    x,
    weights,
    output,
    # Per-segment metadata: segment boundaries, adapter index per segment,
    # rank per adapter, row permutation, and the number of valid segments
    seg_indptr,
    weight_indices,
    lora_ranks,
    permutation,
    num_segs,
    # Meta parameters
    N: tl.constexpr,  # num_slices * r
    K: tl.constexpr,  # input_dim
    NUM_SLICES: tl.constexpr,
    BLOCK_M: tl.constexpr,
    BLOCK_N: tl.constexpr,
    BLOCK_K: tl.constexpr,
):
    """
    Computes a chunked SGMV for LoRA shrink operations.

    Each program handles one segment (grid axis 1) and one BLOCK_N-wide slab of
    output columns (grid axis 0). For the rows belonging to its segment, the
    kernel stores the product of the input `x` and the LoRA A weights of the
    adapter assigned to that segment into the first `rank * num_slices` columns
    of `output`. Rows are addressed through `permutation`, so the logical range
    [seg_start, seg_end) is mapped to physical rows of `x` / `output`.
    When rank is 0 the kernel is a no-op for that segment, since an empty
    column range is trivially correct.

    Args:
        x (torch.Tensor): The input activations tensor of shape `(s, K)`, where `s`
            is the sum of all sequence lengths in the batch.
        weights (torch.Tensor): The LoRA A weights for all available adapters,
            with shape `(num_lora, N, K)` where N = num_slices * r.
        output (torch.Tensor): The output tensor of shape `(s, N)`.
        seg_indptr: Segment boundaries; entries `pid_s` and `pid_s + 1` are read
            as the logical start and end row of segment `pid_s`.
        weight_indices: Adapter index for each segment (read at `pid_s`).
        lora_ranks: Rank of each adapter (read at the segment's adapter index).
        permutation: Maps a logical row index to the physical row index used to
            address `x` and `output`.
        num_segs: Number of valid segments; programs with `pid_s >= num_segs`
            exit immediately.
        N, K: Compile-time sizes used to derive the (assumed contiguous) strides.
        NUM_SLICES: Number of stacked slices in the weight's N dimension.
        BLOCK_M, BLOCK_N, BLOCK_K: Tile sizes. There is no loop over M, so a
            program only covers the first BLOCK_M rows of its segment.
    """
    # Strides are derived from N and K rather than passed in, i.e. the tensors
    # are treated as dense row-major: x is (s, K), weights is (num_lora, N, K),
    # output is (s, N).
    x_stride_1: tl.constexpr = 1
    x_stride_0: tl.constexpr = K

    w_stride_0: tl.constexpr = N * K
    w_stride_1: tl.constexpr = K
    w_stride_2: tl.constexpr = 1

    output_stride_0: tl.constexpr = N
    output_stride_1: tl.constexpr = 1

    # Grid axis 1 indexes segments. The guard lets the launch grid contain more
    # programs along this axis than there are valid segments.
    pid_s = tl.program_id(1)
    if pid_s >= num_segs:
        return

    # Grid axis 0 indexes BLOCK_N-wide slabs of output columns.
    pid_n = tl.program_id(0)

    # Look up which adapter this segment uses and that adapter's rank.
    w_index = tl.load(weight_indices + pid_s)
    rank = tl.load(lora_ranks + w_index)

    # If rank is 0, this kernel becomes a no-op as the output is always trivially correct.
    if rank == 0:
        return

    # Logical row range [seg_start, seg_end) of this segment.
    seg_start = tl.load(seg_indptr + pid_s)
    seg_end = tl.load(seg_indptr + pid_s + 1)

    # Adjust N dim according to the specific LoRA adapter: only the first
    # rank * NUM_SLICES columns are valid for this adapter.
    cur_n = tl.minimum(N, rank * NUM_SLICES)

    # Map logical sequence index to physical index.
    # Lanes at or past seg_end are masked off here (no `other` value is given),
    # and every later load/store that uses them is masked on the same condition.
    s_offset_logical = tl.arange(0, BLOCK_M) + seg_start
    s_offset_physical = tl.load(
        permutation + s_offset_logical, mask=s_offset_logical < seg_end
    )

    n_offset = tl.arange(0, BLOCK_N) + pid_n * BLOCK_N
    k_offset = tl.arange(0, BLOCK_K)
    # x tile pointers: (BLOCK_M, BLOCK_K), rows taken from the physical indices.
    x_ptrs = x + (
        s_offset_physical[:, None] * x_stride_0 + k_offset[None, :] * x_stride_1
    )
    # Weight tile pointers: (BLOCK_K, BLOCK_N), i.e. the adapter's (N, K) matrix
    # is read transposed so it can be the right-hand operand of tl.dot.
    w_ptrs = (weights + w_index * w_stride_0) + (
        k_offset[:, None] * w_stride_2 + n_offset[None, :] * w_stride_1
    )

    # Iterate over K in BLOCK_K steps, accumulating the output tile in float32.
    partial_sum = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
    for k in range(0, tl.cdiv(K, BLOCK_K)):
        # Out-of-range rows and the K tail are loaded as 0.0 so they do not
        # contribute to the dot product.
        x_tile = tl.load(
            x_ptrs,
            mask=(s_offset_logical[:, None] < seg_end)
            & (k_offset[None, :] < K - k * BLOCK_K),
            other=0.0,
        )
        # Columns at or beyond cur_n (outside this adapter's rank) load as 0.0.
        w_tile = tl.load(
            w_ptrs,
            mask=(k_offset[:, None] < K - k * BLOCK_K) & (n_offset[None, :] < cur_n),
            other=0.0,
        )
        partial_sum += tl.dot(x_tile, w_tile)

        # Advance both tiles along the K dimension.
        x_ptrs += BLOCK_K * x_stride_1
        w_ptrs += BLOCK_K * w_stride_2

    # Store result to output matrix, cast back to the element type of x.
    partial_sum = partial_sum.to(x.dtype.element_ty)
    output_ptr = output + (
        s_offset_physical[:, None] * output_stride_0
        + n_offset[None, :] * output_stride_1
    )
    # Only rows inside the segment and columns below cur_n are written; all
    # other elements of `output` are left untouched by this program.
    output_mask = (s_offset_logical[:, None] < seg_end) & (n_offset[None, :] < cur_n)
    tl.store(output_ptr, partial_sum, mask=output_mask)
118
+
119
+
120
def chunked_sgmv_lora_shrink_forward(
    x: torch.Tensor,
    weights: torch.Tensor,
    batch_info: LoRABatchInfo,
    num_slices: int,
) -> torch.Tensor:
    """Launch the chunked SGMV shrink kernel and return its output.

    Args:
        x: Input activations of shape `(s, input_dim)`; must be contiguous.
        weights: LoRA A weights of shape `(num_lora, num_slices * r, input_dim)`;
            must be contiguous. `input_dim` is much larger than `r`.
        batch_info: Segment metadata for the current batch (segment pointers,
            adapter indices, ranks, row permutation, max segment length).
        num_slices: Number of stacked slices (qkv=3, gate_up=2, others=1). When
            greater than 1, `weights.shape[-2]` is `num_slices * r`.

    Returns:
        A newly allocated tensor of shape `(s, num_slices * r)` with the same
        device and dtype as `x`. It is created with `torch.empty`, so elements
        the kernel does not write are left uninitialized.
    """
    assert x.is_contiguous()
    assert weights.is_contiguous()
    assert len(x.shape) == 2
    assert len(weights.shape) == 3

    num_tokens = x.shape[0]
    _, out_dim, in_dim = weights.shape
    assert x.shape[-1] == in_dim

    # Tile sizes: one M-tile spans the longest segment of the batch.
    # TODO (lifuhuang): experiment with split-k
    block_m = batch_info.max_len
    block_n = 16
    block_k = 256

    # Axis 0 walks output-column tiles; axis 1 walks segments. Under CUDA graph
    # the segment axis is sized by the batch size instead of the segment count.
    segment_count = batch_info.num_segments
    if batch_info.use_cuda_graph:
        segment_programs = batch_info.bs
    else:
        segment_programs = segment_count
    launch_grid = (triton.cdiv(out_dim, block_n), segment_programs)

    result = torch.empty((num_tokens, out_dim), device=x.device, dtype=x.dtype)
    _chunked_lora_shrink_kernel[launch_grid](
        x=x,
        weights=weights,
        output=result,
        seg_indptr=batch_info.seg_indptr,
        weight_indices=batch_info.weight_indices,
        lora_ranks=batch_info.lora_ranks,
        permutation=batch_info.permutation,
        num_segs=segment_count,
        # constants
        N=out_dim,
        K=in_dim,
        NUM_SLICES=num_slices,
        BLOCK_M=block_m,
        BLOCK_N=block_n,
        BLOCK_K=block_k,
    )
    return result
sglang/srt/lora/utils.py CHANGED
@@ -5,24 +5,27 @@ from typing import Iterable, Optional, Set, Tuple
5
5
 
6
6
  import torch
7
7
 
8
- from sglang.srt.hf_transformers_utils import AutoConfig
8
+ from sglang.srt.utils.hf_transformers_utils import AutoConfig
9
9
 
10
10
 
11
11
  @dataclass
12
12
  class LoRABatchInfo:
13
+ # The forward mode is using CUDA Graph.
14
+ use_cuda_graph: bool
15
+
13
16
  # Batch size
14
17
  bs: int
15
18
 
16
- # Lengths of each sequence in shape (bs,)
17
- seg_lens: torch.Tensor
18
-
19
- # Indice pointers of each sequence in shape (bs + 1, )
20
- seg_indptr: torch.Tensor
19
+ # Number of segments. For triton backend, it is equal to batch size.
20
+ num_segments: int
21
21
 
22
- # Maximum sequence length of current batch
22
+ # Maximum segment length of current batch
23
23
  max_len: int
24
24
 
25
- # The index of lora adapter used by each sequence, in shape (bs,)
25
+ # Index pointers of each segment, in shape (num_segments + 1,)
26
+ seg_indptr: torch.Tensor
27
+
28
+ # The index of lora adapter used by each segment, in shape (num_segments,)
26
29
  weight_indices: torch.Tensor
27
30
 
28
31
  # ranks of each lora adapter, in shape (lora_num,)
@@ -31,6 +34,12 @@ class LoRABatchInfo:
31
34
  # scaling of each lora adapter, in shape (lora_num,)
32
35
  scalings: torch.Tensor
33
36
 
37
+ # Length of each segment, in shape (num_segments,)
38
+ seg_lens: Optional[torch.Tensor]
39
+
40
+ # The logical (re)ordering of input rows (tokens), in shape (num_tokens,)
41
+ permutation: Optional[torch.Tensor]
42
+
34
43
 
35
44
  class LoRAType(Enum):
36
45
  LORA_A = 0
@@ -48,14 +57,14 @@ def get_layer_id(name: str) -> int:
48
57
 
49
58
 
50
59
  def get_hidden_dim(
51
- module_name: str, config: AutoConfig, base_model: torch.nn.Module
60
+ module_name: str, config: AutoConfig, base_model: torch.nn.Module, layer_idx: int
52
61
  ) -> Tuple[int]:
53
62
  """
54
63
  Given a module_name (might be a stacked name), return the hidden dims of modules' input and output.
55
64
  """
56
65
 
57
66
  if hasattr(base_model, "get_hidden_dim"):
58
- return base_model.get_hidden_dim(module_name)
67
+ return base_model.get_hidden_dim(module_name, layer_idx)
59
68
  else:
60
69
  """
61
70
  WARNING: get_hidden_dim() is not defined,
@@ -89,6 +98,7 @@ def get_normalized_target_modules(
89
98
  ) -> set[str]:
90
99
  """
91
100
  Mapping a list of target module name to names of the normalized LoRA weights.
101
+ Handles both base module names (e.g., "gate_proj") and prefixed module names (e.g., "feed_forward.gate_proj").
92
102
  """
93
103
  params_mapping = {
94
104
  "q_proj": "qkv_proj",
@@ -100,7 +110,8 @@ def get_normalized_target_modules(
100
110
 
101
111
  result = set()
102
112
  for name in target_modules:
103
- normalized_name = params_mapping.get(name, name)
113
+ base_name = name.split(".")[-1]
114
+ normalized_name = params_mapping.get(base_name, base_name)
104
115
  result.add(normalized_name)
105
116
  return result
106
117
 
@@ -0,0 +1,170 @@
1
+ """
2
+ Asynchronous dynamic batch tokenizer for SGLang.
3
+
4
+ This module provides an async tokenizer with dynamic batching capabilities
5
+ to reduce tokenization overhead when multiple requests arrive concurrently.
6
+ """
7
+
8
+ import asyncio
9
+ import logging
10
+ from concurrent.futures import ThreadPoolExecutor
11
+ from functools import partial
12
+ from typing import Any, Dict, List, Optional
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+
17
class AsyncDynamicbatchTokenizer:
    """Asynchronous tokenizer with dynamic batching for single string prompts.

    Dynamically batches pending encode requests from a queue to reduce overhead.
    Only handles single string prompts - regular batch processing of multiple
    strings per request should be handled at a higher level.
    A single-thread ThreadPoolExecutor is used so the event loop stays responsive.

    Note: Uses lazy initialization for asyncio components because this class
    is instantiated in TokenizerManager.__init__() before the event loop starts.

    Args:
        tokenizer: Callable invoked as ``tokenizer(prompt, **kwargs)`` for a
            single prompt and as ``tokenizer(prompts, **kwargs)`` for a list of
            prompts. For the list form the return value must expose
            ``.items()`` whose values are indexable per prompt.
        max_batch_size: Upper bound on the number of requests merged into one
            dynamic batch.
        batch_wait_timeout_s: Maximum time, in seconds, spent collecting
            additional requests once more than one request is pending.
    """

    def __init__(
        self,
        tokenizer,
        max_batch_size: int = 32,
        batch_wait_timeout_s: float = 0.002,
    ) -> None:
        self.tokenizer = tokenizer
        self.max_batch_size = max_batch_size
        self.batch_wait_timeout_s = batch_wait_timeout_s

        # Single queue for all encode requests - initialized lazily
        self._queue: Optional[asyncio.Queue] = None
        self._batcher_task: Optional[asyncio.Task] = None

        # Single-thread executor for blocking tokenizer calls
        self._executor = ThreadPoolExecutor(max_workers=1)
        self._initialized = False

    def _ensure_initialized(self) -> None:
        """Lazy initialization of event loop dependent components."""
        if not self._initialized:
            self._queue = asyncio.Queue()
            self._batcher_task = asyncio.create_task(self._dynamic_batch_loop())
            self._initialized = True

    async def __call__(self, prompt: str, **kwargs) -> Any:
        """Encode a single prompt (alias for :meth:`encode`)."""
        return await self.encode(prompt, **kwargs)

    async def encode(self, prompt: str, **kwargs) -> Any:
        """Encode a single prompt.

        The request is queued for the background batcher and the coroutine
        waits for its individual result.

        Args:
            prompt: The text to tokenize.
            **kwargs: Forwarded unchanged to the tokenizer call.

        Returns:
            Whatever the tokenizer returns for this prompt. When the request
            was merged into a batch, a dict holding this prompt's slice of
            every field in the batched output.

        Raises:
            Exception: Any exception raised while tokenizing this request.
        """
        self._ensure_initialized()
        result_future: asyncio.Future = asyncio.get_running_loop().create_future()
        await self._queue.put((prompt, kwargs, result_future))
        return await result_future

    async def _dynamic_batch_loop(self) -> None:
        """Dynamically batch incoming encode requests for efficiency.

        Runs until cancelled. Every future dequeued by an iteration is
        resolved, failed or cancelled before the iteration ends, so no caller
        is left awaiting a future that nothing will ever complete.
        """
        loop = asyncio.get_running_loop()
        while True:
            # Reset per iteration so the error handlers below only touch the
            # requests collected by the current iteration.
            prompts: List[str] = []
            kwargs_list: List[Dict] = []
            result_futures: List[asyncio.Future] = []
            try:
                # Get the first request
                prompt, kwargs, result_future = await self._queue.get()
                prompts.append(prompt)
                kwargs_list.append(kwargs)
                result_futures.append(result_future)

                # If nothing else is queued, process the single item
                # immediately; otherwise wait briefly for more requests.
                if not self._queue.empty():
                    start_time = loop.time()

                    # Collect more requests up to max_batch_size or batch_wait_timeout_s
                    while len(prompts) < self.max_batch_size:
                        elapsed = loop.time() - start_time
                        if elapsed >= self.batch_wait_timeout_s:
                            break

                        remaining_time = self.batch_wait_timeout_s - elapsed
                        try:
                            prompt, kwargs, result_future = await asyncio.wait_for(
                                self._queue.get(), remaining_time
                            )
                        except asyncio.TimeoutError:
                            break
                        prompts.append(prompt)
                        kwargs_list.append(kwargs)
                        result_futures.append(result_future)

                logger.debug(
                    f"AsyncDynamicbatchTokenizer: Processing dynamic batch of size {len(prompts)}"
                )

                await self._process_dynamic_batch(prompts, kwargs_list, result_futures)

            except asyncio.CancelledError:
                # The batcher is being shut down: release callers whose
                # requests were already dequeued instead of leaving them
                # waiting forever, then propagate the cancellation.
                for fut in result_futures:
                    if not fut.done():
                        fut.cancel()
                raise
            except Exception as e:
                logger.exception("Error in dynamic batch loop: %s", e)
                # Fail the collected requests so their callers see the error,
                # then continue the loop to handle other requests.
                for fut in result_futures:
                    if not fut.done():
                        fut.set_exception(e)

    async def _process_dynamic_batch(
        self,
        prompts: List[str],
        kwargs_list: List[Dict],
        result_futures: List[asyncio.Future],
    ) -> None:
        """Process a dynamic batch of encode requests for single string prompts.

        When all requests share identical kwargs and there is more than one
        prompt, a single batched tokenizer call is made and its output is split
        per request. Otherwise each request is tokenized individually, and a
        failure of one request is reported only on that request's future.

        Args:
            prompts: Prompts collected for this batch.
            kwargs_list: Tokenizer kwargs of each request, aligned with ``prompts``.
            result_futures: Futures to resolve, aligned with ``prompts``.
        """
        try:
            # Kept inside the try so that a failure while comparing kwargs is
            # reported on the futures rather than silently dropped.
            can_batch = len({str(sorted(kw.items())) for kw in kwargs_list}) == 1
            loop = asyncio.get_running_loop()

            # If every request uses identical kwargs we can run a single
            # batch tokenizer call for a big speed-up.
            if can_batch and len(prompts) > 1:
                encode_fn = partial(self.tokenizer, prompts, **kwargs_list[0])
                results = await loop.run_in_executor(self._executor, encode_fn)

                for i, fut in enumerate(result_futures):
                    if not fut.done():
                        fut.set_result({k: v[i] for k, v in results.items()})
                return

            # Process each request individually due to different kwargs
            if len(prompts) > 1 and not can_batch:
                logger.warning(
                    f"AsyncDynamicbatchTokenizer: Dynamic batching disabled for batch of {len(prompts)} "
                    f"requests due to differing kwargs. This reduces performance benefits. "
                    f"Consider using consistent tokenization parameters across requests."
                )

            tokenizer = self.tokenizer

            def encode_each():
                # Capture the outcome per request so that one bad request
                # (e.g. an invalid kwarg) cannot fail its batch neighbours.
                outcomes = []
                for p, kw in zip(prompts, kwargs_list):
                    try:
                        outcomes.append((True, tokenizer(p, **kw)))
                    except Exception as exc:
                        outcomes.append((False, exc))
                return outcomes

            outcomes = await loop.run_in_executor(self._executor, encode_each)

            for fut, (ok, payload) in zip(result_futures, outcomes):
                if not ok:
                    logger.error(f"Error in dynamic batch processing: {payload}")
                if fut.done():
                    continue
                if ok:
                    fut.set_result(payload)
                else:
                    fut.set_exception(payload)
        except Exception as e:
            logger.exception("Error in dynamic batch processing: %s", e)
            for fut in result_futures:
                if not fut.done():
                    fut.set_exception(e)

    def __del__(self):
        """Clean up background tasks."""
        task = getattr(self, "_batcher_task", None)
        if task is not None and not task.done():
            try:
                task.cancel()
            except RuntimeError:
                # cancel() schedules callbacks on the event loop and may raise
                # if that loop is already closed; nothing is left to cancel
                # then, and the executor below must still be shut down.
                pass
        executor = getattr(self, "_executor", None)
        if executor is not None:
            executor.shutdown(wait=False)