sglang 0.5.2rc2__py3-none-any.whl → 0.5.3.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (396)
  1. sglang/bench_one_batch.py +7 -11
  2. sglang/bench_one_batch_server.py +330 -31
  3. sglang/bench_serving.py +474 -142
  4. sglang/compile_deep_gemm.py +3 -0
  5. sglang/global_config.py +2 -2
  6. sglang/lang/backend/runtime_endpoint.py +1 -1
  7. sglang/profiler.py +2 -2
  8. sglang/srt/batch_invariant_ops/__init__.py +27 -0
  9. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +549 -0
  10. sglang/srt/configs/__init__.py +10 -0
  11. sglang/srt/configs/device_config.py +3 -1
  12. sglang/srt/configs/dots_ocr.py +64 -0
  13. sglang/srt/configs/dots_vlm.py +139 -0
  14. sglang/srt/configs/falcon_h1.py +314 -0
  15. sglang/srt/configs/load_config.py +9 -0
  16. sglang/srt/configs/mamba_utils.py +117 -0
  17. sglang/srt/configs/model_config.py +228 -92
  18. sglang/srt/configs/nemotron_h.py +286 -0
  19. sglang/srt/configs/qwen3_next.py +294 -0
  20. sglang/srt/configs/qwen3_vl.py +586 -0
  21. sglang/srt/connector/__init__.py +8 -1
  22. sglang/srt/connector/remote_instance.py +82 -0
  23. sglang/srt/constrained/base_grammar_backend.py +49 -12
  24. sglang/srt/constrained/llguidance_backend.py +0 -1
  25. sglang/srt/constrained/outlines_backend.py +0 -1
  26. sglang/srt/constrained/outlines_jump_forward.py +1 -1
  27. sglang/srt/constrained/xgrammar_backend.py +30 -9
  28. sglang/srt/custom_op.py +11 -1
  29. sglang/srt/debug_utils/dump_comparator.py +81 -44
  30. sglang/srt/debug_utils/dump_loader.py +97 -0
  31. sglang/srt/debug_utils/dumper.py +21 -6
  32. sglang/srt/debug_utils/text_comparator.py +73 -11
  33. sglang/srt/disaggregation/ascend/conn.py +2 -2
  34. sglang/srt/disaggregation/ascend/transfer_engine.py +47 -9
  35. sglang/srt/disaggregation/base/conn.py +1 -1
  36. sglang/srt/disaggregation/common/conn.py +279 -108
  37. sglang/srt/disaggregation/decode.py +78 -37
  38. sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
  39. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +29 -17
  40. sglang/srt/disaggregation/fake/conn.py +1 -1
  41. sglang/srt/disaggregation/mini_lb.py +6 -445
  42. sglang/srt/disaggregation/mooncake/conn.py +55 -537
  43. sglang/srt/disaggregation/nixl/conn.py +373 -68
  44. sglang/srt/disaggregation/prefill.py +53 -49
  45. sglang/srt/disaggregation/utils.py +40 -54
  46. sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
  47. sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
  48. sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
  49. sglang/srt/distributed/parallel_state.py +156 -80
  50. sglang/srt/entrypoints/engine.py +59 -18
  51. sglang/srt/entrypoints/grpc_request_manager.py +842 -0
  52. sglang/srt/entrypoints/grpc_server.py +950 -0
  53. sglang/srt/entrypoints/http_server.py +179 -60
  54. sglang/srt/entrypoints/openai/protocol.py +265 -29
  55. sglang/srt/entrypoints/openai/serving_base.py +65 -3
  56. sglang/srt/entrypoints/openai/serving_chat.py +213 -122
  57. sglang/srt/entrypoints/openai/serving_completions.py +14 -3
  58. sglang/srt/entrypoints/openai/serving_embedding.py +9 -3
  59. sglang/srt/entrypoints/openai/serving_rerank.py +3 -1
  60. sglang/srt/entrypoints/openai/serving_responses.py +48 -3
  61. sglang/srt/entrypoints/openai/serving_score.py +1 -0
  62. sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
  63. sglang/srt/environ.py +289 -0
  64. sglang/srt/eplb/eplb_manager.py +2 -2
  65. sglang/srt/eplb/expert_distribution.py +26 -13
  66. sglang/srt/eplb/expert_location.py +38 -8
  67. sglang/srt/eplb/expert_location_updater.py +1 -1
  68. sglang/srt/function_call/base_format_detector.py +3 -6
  69. sglang/srt/function_call/ebnf_composer.py +11 -9
  70. sglang/srt/function_call/function_call_parser.py +17 -8
  71. sglang/srt/function_call/glm4_moe_detector.py +4 -4
  72. sglang/srt/function_call/gpt_oss_detector.py +23 -0
  73. sglang/srt/function_call/json_array_parser.py +63 -0
  74. sglang/srt/function_call/kimik2_detector.py +17 -4
  75. sglang/srt/function_call/qwen3_coder_detector.py +1 -1
  76. sglang/srt/function_call/utils.py +96 -5
  77. sglang/srt/grpc/__init__.py +1 -0
  78. sglang/srt/grpc/compile_proto.py +245 -0
  79. sglang/srt/grpc/sglang_scheduler_pb2.py +119 -0
  80. sglang/srt/grpc/sglang_scheduler_pb2.pyi +492 -0
  81. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +327 -0
  82. sglang/srt/layers/activation.py +143 -9
  83. sglang/srt/layers/attention/aiter_backend.py +14 -15
  84. sglang/srt/layers/attention/ascend_backend.py +115 -9
  85. sglang/srt/layers/attention/attention_registry.py +215 -0
  86. sglang/srt/layers/attention/base_attn_backend.py +12 -3
  87. sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
  88. sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
  89. sglang/srt/layers/attention/fla/chunk.py +242 -0
  90. sglang/srt/layers/attention/fla/chunk_delta_h.py +314 -0
  91. sglang/srt/layers/attention/fla/chunk_o.py +178 -0
  92. sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +151 -0
  93. sglang/srt/layers/attention/fla/cumsum.py +300 -0
  94. sglang/srt/layers/attention/fla/fused_recurrent.py +640 -0
  95. sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +232 -0
  96. sglang/srt/layers/attention/fla/index.py +37 -0
  97. sglang/srt/layers/attention/fla/l2norm.py +150 -0
  98. sglang/srt/layers/attention/fla/layernorm_gated.py +343 -0
  99. sglang/srt/layers/attention/fla/op.py +66 -0
  100. sglang/srt/layers/attention/fla/solve_tril.py +465 -0
  101. sglang/srt/layers/attention/fla/utils.py +331 -0
  102. sglang/srt/layers/attention/fla/wy_fast.py +158 -0
  103. sglang/srt/layers/attention/flashattention_backend.py +40 -8
  104. sglang/srt/layers/attention/flashinfer_backend.py +341 -204
  105. sglang/srt/layers/attention/flashinfer_mla_backend.py +28 -28
  106. sglang/srt/layers/attention/flashmla_backend.py +7 -5
  107. sglang/srt/layers/attention/hybrid_attn_backend.py +68 -53
  108. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +708 -0
  109. sglang/srt/layers/attention/intel_amx_backend.py +3 -0
  110. sglang/srt/layers/attention/mamba/causal_conv1d.py +129 -0
  111. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +974 -0
  112. sglang/srt/layers/attention/mamba/mamba.py +577 -0
  113. sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
  114. sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
  115. sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
  116. sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
  117. sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
  118. sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +214 -0
  119. sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +562 -0
  120. sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +646 -0
  121. sglang/srt/layers/attention/mamba/ops/ssd_combined.py +262 -0
  122. sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +264 -0
  123. sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
  124. sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
  125. sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
  126. sglang/srt/layers/attention/nsa/nsa_indexer.py +761 -0
  127. sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
  128. sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
  129. sglang/srt/layers/attention/nsa/transform_index.py +144 -0
  130. sglang/srt/layers/attention/nsa/utils.py +24 -0
  131. sglang/srt/layers/attention/nsa_backend.py +887 -0
  132. sglang/srt/layers/attention/tbo_backend.py +6 -6
  133. sglang/srt/layers/attention/torch_flex_backend.py +325 -0
  134. sglang/srt/layers/attention/torch_native_backend.py +12 -6
  135. sglang/srt/layers/attention/triton_backend.py +57 -7
  136. sglang/srt/layers/attention/trtllm_mha_backend.py +5 -7
  137. sglang/srt/layers/attention/trtllm_mla_backend.py +276 -39
  138. sglang/srt/layers/attention/vision.py +58 -0
  139. sglang/srt/layers/attention/wave_backend.py +4 -4
  140. sglang/srt/layers/attention/wave_ops/decode_attention.py +2 -4
  141. sglang/srt/layers/attention/wave_ops/extend_attention.py +1 -3
  142. sglang/srt/layers/communicator.py +8 -0
  143. sglang/srt/layers/dp_attention.py +41 -2
  144. sglang/srt/layers/elementwise.py +3 -1
  145. sglang/srt/layers/layernorm.py +34 -15
  146. sglang/srt/layers/linear.py +55 -7
  147. sglang/srt/layers/logits_processor.py +180 -18
  148. sglang/srt/layers/modelopt_utils.py +11 -0
  149. sglang/srt/layers/moe/__init__.py +2 -1
  150. sglang/srt/layers/moe/cutlass_w4a8_moe.py +21 -24
  151. sglang/srt/layers/moe/ep_moe/kernels.py +33 -454
  152. sglang/srt/layers/moe/ep_moe/layer.py +248 -333
  153. sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +183 -0
  154. sglang/srt/layers/moe/fused_moe_native.py +5 -3
  155. sglang/srt/layers/moe/fused_moe_triton/configs/{triton_3_4_0/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json } +35 -35
  156. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=352,device_name=NVIDIA_RTX_5880_Ada_Generation,dtype=fp8_w8a8.json +146 -0
  157. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  158. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=512,device_name=NVIDIA_H20.json +146 -0
  159. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  160. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
  161. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
  162. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  163. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
  164. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
  165. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
  166. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  167. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
  168. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -2
  169. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +7 -3
  170. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +23 -20
  171. sglang/srt/layers/moe/fused_moe_triton/layer.py +68 -72
  172. sglang/srt/layers/moe/moe_runner/__init__.py +2 -1
  173. sglang/srt/layers/moe/moe_runner/base.py +274 -1
  174. sglang/srt/layers/moe/moe_runner/deep_gemm.py +304 -0
  175. sglang/srt/layers/moe/moe_runner/runner.py +83 -0
  176. sglang/srt/layers/moe/moe_runner/triton.py +448 -0
  177. sglang/srt/layers/moe/token_dispatcher/__init__.py +16 -4
  178. sglang/srt/layers/moe/token_dispatcher/{base_dispatcher.py → base.py} +67 -17
  179. sglang/srt/layers/moe/token_dispatcher/deepep.py +118 -56
  180. sglang/srt/layers/moe/token_dispatcher/standard.py +44 -2
  181. sglang/srt/layers/moe/topk.py +30 -9
  182. sglang/srt/layers/moe/utils.py +29 -7
  183. sglang/srt/layers/parameter.py +23 -6
  184. sglang/srt/layers/quantization/__init__.py +1 -1
  185. sglang/srt/layers/quantization/awq.py +19 -7
  186. sglang/srt/layers/quantization/base_config.py +11 -6
  187. sglang/srt/layers/quantization/blockwise_int8.py +38 -27
  188. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +1 -0
  189. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +50 -30
  190. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +2 -0
  191. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +13 -1
  192. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
  193. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +2 -10
  194. sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +27 -0
  195. sglang/srt/layers/quantization/fp8.py +155 -60
  196. sglang/srt/layers/quantization/fp8_utils.py +51 -32
  197. sglang/srt/layers/quantization/gptq.py +25 -17
  198. sglang/srt/layers/quantization/modelopt_quant.py +191 -56
  199. sglang/srt/layers/quantization/moe_wna16.py +21 -18
  200. sglang/srt/layers/quantization/mxfp4.py +74 -42
  201. sglang/srt/layers/quantization/quark/quark.py +3 -1
  202. sglang/srt/layers/quantization/quark/quark_moe.py +48 -30
  203. sglang/srt/layers/quantization/unquant.py +135 -47
  204. sglang/srt/layers/quantization/w4afp8.py +28 -33
  205. sglang/srt/layers/quantization/w8a8_fp8.py +35 -20
  206. sglang/srt/layers/quantization/w8a8_int8.py +91 -41
  207. sglang/srt/layers/rotary_embedding.py +78 -31
  208. sglang/srt/layers/sampler.py +213 -21
  209. sglang/srt/layers/utils.py +23 -0
  210. sglang/srt/lora/backend/base_backend.py +50 -8
  211. sglang/srt/lora/backend/chunked_backend.py +348 -0
  212. sglang/srt/lora/backend/triton_backend.py +99 -5
  213. sglang/srt/lora/layers.py +32 -0
  214. sglang/srt/lora/lora.py +8 -3
  215. sglang/srt/lora/lora_manager.py +44 -118
  216. sglang/srt/lora/mem_pool.py +25 -11
  217. sglang/srt/lora/triton_ops/__init__.py +4 -0
  218. sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
  219. sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +174 -0
  220. sglang/srt/lora/utils.py +22 -11
  221. sglang/srt/managers/async_dynamic_batch_tokenizer.py +170 -0
  222. sglang/srt/managers/cache_controller.py +199 -301
  223. sglang/srt/managers/data_parallel_controller.py +115 -80
  224. sglang/srt/managers/detokenizer_manager.py +19 -15
  225. sglang/srt/managers/disagg_service.py +46 -0
  226. sglang/srt/managers/io_struct.py +340 -109
  227. sglang/srt/managers/mm_utils.py +44 -6
  228. sglang/srt/managers/multi_tokenizer_mixin.py +357 -407
  229. sglang/srt/managers/multimodal_processor.py +1 -2
  230. sglang/srt/managers/overlap_utils.py +55 -0
  231. sglang/srt/managers/schedule_batch.py +343 -212
  232. sglang/srt/managers/schedule_policy.py +145 -18
  233. sglang/srt/managers/scheduler.py +653 -273
  234. sglang/srt/managers/scheduler_input_blocker.py +1 -1
  235. sglang/srt/managers/scheduler_metrics_mixin.py +99 -126
  236. sglang/srt/managers/scheduler_output_processor_mixin.py +255 -108
  237. sglang/srt/managers/scheduler_profiler_mixin.py +6 -6
  238. sglang/srt/managers/scheduler_update_weights_mixin.py +7 -0
  239. sglang/srt/managers/tokenizer_communicator_mixin.py +675 -0
  240. sglang/srt/managers/tokenizer_manager.py +579 -674
  241. sglang/srt/managers/tp_worker.py +96 -26
  242. sglang/srt/managers/utils.py +1 -45
  243. sglang/srt/mem_cache/allocator.py +21 -22
  244. sglang/srt/mem_cache/allocator_ascend.py +41 -27
  245. sglang/srt/mem_cache/base_prefix_cache.py +1 -1
  246. sglang/srt/mem_cache/chunk_cache.py +9 -2
  247. sglang/srt/mem_cache/evict_policy.py +23 -0
  248. sglang/srt/mem_cache/hicache_storage.py +43 -24
  249. sglang/srt/mem_cache/hiradix_cache.py +222 -75
  250. sglang/srt/mem_cache/memory_pool.py +651 -80
  251. sglang/srt/mem_cache/memory_pool_host.py +239 -228
  252. sglang/srt/mem_cache/radix_cache.py +227 -73
  253. sglang/srt/mem_cache/radix_cache_cpp.py +11 -8
  254. sglang/srt/mem_cache/storage/__init__.py +10 -0
  255. sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +151 -0
  256. sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +109 -0
  257. sglang/srt/mem_cache/storage/backend_factory.py +223 -0
  258. sglang/srt/mem_cache/storage/eic/eic_storage.py +778 -0
  259. sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
  260. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +164 -0
  261. sglang/srt/mem_cache/storage/hf3fs/{client_hf3fs.py → hf3fs_usrbio_client.py} +5 -1
  262. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +259 -62
  263. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +284 -0
  264. sglang/srt/mem_cache/storage/lmcache/unit_test.py +121 -0
  265. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +166 -17
  266. sglang/srt/mem_cache/swa_radix_cache.py +93 -48
  267. sglang/srt/metrics/collector.py +511 -132
  268. sglang/srt/metrics/func_timer.py +2 -7
  269. sglang/srt/metrics/startup_func_log_and_timer.py +150 -0
  270. sglang/srt/metrics/utils.py +8 -1
  271. sglang/srt/model_executor/cpu_graph_runner.py +640 -0
  272. sglang/srt/model_executor/cuda_graph_runner.py +52 -37
  273. sglang/srt/model_executor/forward_batch_info.py +74 -46
  274. sglang/srt/model_executor/model_runner.py +455 -176
  275. sglang/srt/model_executor/npu_graph_runner.py +12 -5
  276. sglang/srt/model_loader/__init__.py +10 -4
  277. sglang/srt/model_loader/loader.py +319 -10
  278. sglang/srt/model_loader/remote_instance_weight_loader_utils.py +69 -0
  279. sglang/srt/model_loader/weight_utils.py +161 -3
  280. sglang/srt/models/apertus.py +686 -0
  281. sglang/srt/models/bailing_moe.py +820 -217
  282. sglang/srt/models/bailing_moe_nextn.py +168 -0
  283. sglang/srt/models/deepseek_nextn.py +6 -1
  284. sglang/srt/models/deepseek_v2.py +607 -130
  285. sglang/srt/models/dots_ocr.py +173 -0
  286. sglang/srt/models/dots_vlm.py +174 -0
  287. sglang/srt/models/dots_vlm_vit.py +337 -0
  288. sglang/srt/models/ernie4.py +1 -1
  289. sglang/srt/models/falcon_h1.py +578 -0
  290. sglang/srt/models/gemma3_causal.py +0 -2
  291. sglang/srt/models/gemma3_mm.py +17 -1
  292. sglang/srt/models/gemma3n_mm.py +2 -2
  293. sglang/srt/models/glm4_moe.py +4 -4
  294. sglang/srt/models/glm4_moe_nextn.py +2 -2
  295. sglang/srt/models/glm4v.py +5 -3
  296. sglang/srt/models/glm4v_moe.py +4 -1
  297. sglang/srt/models/gpt_oss.py +8 -31
  298. sglang/srt/models/grok.py +5 -13
  299. sglang/srt/models/kimi_vl_moonvit.py +2 -2
  300. sglang/srt/models/llama.py +4 -0
  301. sglang/srt/models/llama4.py +9 -0
  302. sglang/srt/models/llama_eagle3.py +13 -0
  303. sglang/srt/models/longcat_flash.py +3 -3
  304. sglang/srt/models/longcat_flash_nextn.py +1 -1
  305. sglang/srt/models/mixtral.py +1 -3
  306. sglang/srt/models/mllama4.py +50 -4
  307. sglang/srt/models/nemotron_h.py +514 -0
  308. sglang/srt/models/opt.py +637 -0
  309. sglang/srt/models/qwen2_5_vl.py +29 -5
  310. sglang/srt/models/qwen2_audio.py +1 -1
  311. sglang/srt/models/qwen2_moe.py +120 -13
  312. sglang/srt/models/qwen2_vl.py +1 -1
  313. sglang/srt/models/qwen3.py +18 -3
  314. sglang/srt/models/qwen3_moe.py +32 -4
  315. sglang/srt/models/qwen3_next.py +1069 -0
  316. sglang/srt/models/qwen3_next_mtp.py +112 -0
  317. sglang/srt/models/qwen3_vl.py +787 -0
  318. sglang/srt/models/qwen3_vl_moe.py +471 -0
  319. sglang/srt/models/registry.py +15 -3
  320. sglang/srt/models/sarashina2_vision.py +269 -0
  321. sglang/srt/models/solar.py +505 -0
  322. sglang/srt/models/starcoder2.py +357 -0
  323. sglang/srt/models/step3_vl.py +1 -1
  324. sglang/srt/models/torch_native_llama.py +9 -2
  325. sglang/srt/models/utils.py +55 -0
  326. sglang/srt/multimodal/processors/base_processor.py +15 -7
  327. sglang/srt/multimodal/processors/dots_vlm.py +98 -0
  328. sglang/srt/multimodal/processors/glm4v.py +9 -9
  329. sglang/srt/multimodal/processors/internvl.py +153 -129
  330. sglang/srt/multimodal/processors/qwen_vl.py +23 -6
  331. sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
  332. sglang/srt/offloader.py +27 -3
  333. sglang/srt/parser/jinja_template_utils.py +6 -0
  334. sglang/srt/sampling/sampling_batch_info.py +49 -26
  335. sglang/srt/sampling/sampling_params.py +7 -0
  336. sglang/srt/server_args.py +1051 -285
  337. sglang/srt/server_args_config_parser.py +146 -0
  338. sglang/srt/single_batch_overlap.py +151 -0
  339. sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
  340. sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
  341. sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
  342. sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
  343. sglang/srt/speculative/cpp_ngram/param.h +125 -0
  344. sglang/srt/speculative/cpp_ngram/queue.h +71 -0
  345. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +7 -1
  346. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +13 -2
  347. sglang/srt/speculative/{eagle_utils.py → eagle_info.py} +207 -757
  348. sglang/srt/speculative/eagle_worker.py +98 -29
  349. sglang/srt/speculative/ngram_info.py +428 -0
  350. sglang/srt/speculative/ngram_worker.py +246 -0
  351. sglang/srt/speculative/spec_info.py +52 -0
  352. sglang/srt/speculative/spec_utils.py +605 -0
  353. sglang/srt/speculative/standalone_worker.py +109 -0
  354. sglang/srt/torch_memory_saver_adapter.py +5 -7
  355. sglang/srt/tracing/trace.py +578 -0
  356. sglang/srt/two_batch_overlap.py +9 -5
  357. sglang/srt/utils/__init__.py +2 -0
  358. sglang/srt/{utils.py → utils/common.py} +451 -77
  359. sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +55 -5
  360. sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
  361. sglang/srt/utils/rpd_utils.py +452 -0
  362. sglang/srt/utils/slow_rank_detector.py +71 -0
  363. sglang/srt/warmup.py +8 -4
  364. sglang/srt/weight_sync/utils.py +2 -2
  365. sglang/test/attention/test_trtllm_mla_backend.py +169 -5
  366. sglang/test/get_logits_ut.py +57 -0
  367. sglang/test/longbench_v2/__init__.py +1 -0
  368. sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
  369. sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
  370. sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
  371. sglang/test/run_eval.py +119 -11
  372. sglang/test/runners.py +5 -1
  373. sglang/test/simple_eval_common.py +5 -2
  374. sglang/test/simple_eval_longbench_v2.py +332 -0
  375. sglang/test/simple_eval_mmmu_vlm.py +441 -0
  376. sglang/test/test_block_fp8.py +2 -2
  377. sglang/test/test_cutlass_moe.py +24 -6
  378. sglang/test/test_cutlass_w4a8_moe.py +9 -19
  379. sglang/test/test_deterministic.py +313 -0
  380. sglang/test/test_deterministic_utils.py +81 -0
  381. sglang/test/test_disaggregation_utils.py +140 -0
  382. sglang/test/test_fp4_moe.py +370 -1
  383. sglang/test/test_programs.py +1 -1
  384. sglang/test/test_utils.py +407 -8
  385. sglang/utils.py +21 -1
  386. sglang/version.py +1 -1
  387. {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/METADATA +69 -124
  388. {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/RECORD +392 -251
  389. sglang/srt/disaggregation/launch_lb.py +0 -118
  390. sglang/srt/managers/tp_worker_overlap_thread.py +0 -296
  391. sglang/srt/mem_cache/lora_radix_cache.py +0 -421
  392. sglang/test/test_block_fp8_ep.py +0 -358
  393. sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
  394. {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/WHEEL +0 -0
  395. {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/licenses/LICENSE +0 -0
  396. {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/top_level.txt +0 -0
@@ -1,118 +0,0 @@
1
- import argparse
2
- import dataclasses
3
-
4
- from sglang.srt.disaggregation.mini_lb import PrefillConfig, run
5
-
6
-
7
@dataclasses.dataclass
class LBArgs:
    """Command-line configuration for the PD disaggregation load balancer.

    Use ``add_cli_args`` to register the options on an ``argparse`` parser
    and ``from_cli_args`` to turn the parsed namespace into an instance.
    """

    # Address and port the load balancer binds to.
    host: str = "0.0.0.0"
    port: int = 8000
    # Load-balancing policy name; the CLI accepts "random" or "po2".
    policy: str = "random"
    # List of ``(url, bootstrap_port)`` tuples, one per prefill server.
    # ``bootstrap_port`` is None when no bootstrap ports were given.
    prefill_infos: list = dataclasses.field(default_factory=list)
    # List of decode server URLs.
    decode_infos: list = dataclasses.field(default_factory=list)
    # Log interval in seconds.
    log_interval: int = 5
    # Timeout in seconds.
    timeout: int = 600

    @staticmethod
    def add_cli_args(parser: argparse.ArgumentParser):
        """Register every load-balancer option on ``parser``.

        Args:
            parser: The argument parser to extend in place.
        """
        parser.add_argument(
            "--host",
            type=str,
            default=LBArgs.host,
            help=f"Host to bind the server (default: {LBArgs.host})",
        )
        parser.add_argument(
            "--port",
            type=int,
            default=LBArgs.port,
            help=f"Port to bind the server (default: {LBArgs.port})",
        )
        parser.add_argument(
            "--policy",
            type=str,
            default=LBArgs.policy,
            choices=["random", "po2"],
            help=f"Policy to use for load balancing (default: {LBArgs.policy})",
        )
        parser.add_argument(
            "--prefill",
            type=str,
            default=[],
            nargs="+",
            help="URLs for prefill servers",
        )
        parser.add_argument(
            "--decode",
            type=str,
            default=[],
            nargs="+",
            help="URLs for decode servers",
        )
        parser.add_argument(
            "--prefill-bootstrap-ports",
            type=int,
            nargs="+",
            help="Bootstrap ports for prefill servers",
        )
        parser.add_argument(
            "--log-interval",
            type=int,
            default=LBArgs.log_interval,
            help=f"Log interval in seconds (default: {LBArgs.log_interval})",
        )
        parser.add_argument(
            "--timeout",
            type=int,
            default=LBArgs.timeout,
            help=f"Timeout in seconds (default: {LBArgs.timeout})",
        )

    @classmethod
    def from_cli_args(cls, args: argparse.Namespace) -> "LBArgs":
        """Build an ``LBArgs`` from a namespace produced by ``add_cli_args``.

        Bootstrap ports are paired with prefill URLs as follows: no ports
        pairs every URL with ``None``; a single port is shared by every URL;
        otherwise there must be exactly one port per URL.

        Args:
            args: Parsed command-line namespace.

        Returns:
            The populated ``LBArgs`` instance.

        Raises:
            ValueError: If more than one bootstrap port is given and the
                count differs from the number of prefill URLs.
        """
        # Copy the URL lists: argparse hands back the very same list object
        # that was registered as ``default=[]``, so storing it directly would
        # let later mutation of the dataclass leak into the parser defaults.
        prefill_urls = list(args.prefill)
        decode_urls = list(args.decode)

        bootstrap_ports = args.prefill_bootstrap_ports
        if bootstrap_ports is None:
            bootstrap_ports = [None] * len(prefill_urls)
        elif len(bootstrap_ports) == 1:
            # A single port is broadcast to every prefill server.
            bootstrap_ports = bootstrap_ports * len(prefill_urls)
        elif len(bootstrap_ports) != len(prefill_urls):
            raise ValueError(
                "Number of prefill URLs must match number of bootstrap ports"
            )

        return cls(
            host=args.host,
            port=args.port,
            policy=args.policy,
            prefill_infos=list(zip(prefill_urls, bootstrap_ports)),
            decode_infos=decode_urls,
            log_interval=args.log_interval,
            timeout=args.timeout,
        )
97
-
98
-
99
def main():
    """Parse the command line and start the load balancer.

    Only the prefill configs, decode URLs, host, port and timeout are
    forwarded to ``run``; ``policy`` and ``log_interval`` are parsed but
    not passed on here.
    """
    cli = argparse.ArgumentParser(
        description="PD Disaggregation Load Balancer Server"
    )
    LBArgs.add_cli_args(cli)
    lb_args = LBArgs.from_cli_args(cli.parse_args())

    # Wrap each (url, bootstrap_port) pair in a PrefillConfig.
    prefill_configs = []
    for url, port in lb_args.prefill_infos:
        prefill_configs.append(PrefillConfig(url, port))

    run(
        prefill_configs,
        lb_args.decode_infos,
        lb_args.host,
        lb_args.port,
        lb_args.timeout,
    )


if __name__ == "__main__":
    main()
@@ -1,296 +0,0 @@
1
- # Copyright 2023-2024 SGLang Team
2
- # Licensed under the Apache License, Version 2.0 (the "License");
3
- # you may not use this file except in compliance with the License.
4
- # You may obtain a copy of the License at
5
- #
6
- # http://www.apache.org/licenses/LICENSE-2.0
7
- #
8
- # Unless required by applicable law or agreed to in writing, software
9
- # distributed under the License is distributed on an "AS IS" BASIS,
10
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11
- # See the License for the specific language governing permissions and
12
- # limitations under the License.
13
- # ==============================================================================
14
- """A tensor parallel worker."""
15
-
16
- import dataclasses
17
- import logging
18
- import signal
19
- import threading
20
- from queue import Queue
21
- from typing import Optional, Tuple
22
-
23
- import psutil
24
- import torch
25
-
26
- from sglang.srt.managers.io_struct import (
27
- GetWeightsByNameReqInput,
28
- InitWeightsUpdateGroupReqInput,
29
- LoadLoRAAdapterReqInput,
30
- UnloadLoRAAdapterReqInput,
31
- UpdateWeightFromDiskReqInput,
32
- UpdateWeightsFromDistributedReqInput,
33
- UpdateWeightsFromTensorReqInput,
34
- )
35
- from sglang.srt.managers.schedule_batch import ModelWorkerBatch
36
- from sglang.srt.managers.tp_worker import TpModelWorker
37
- from sglang.srt.server_args import ServerArgs
38
- from sglang.srt.utils import DynamicGradMode, get_compiler_backend
39
- from sglang.utils import get_exception_traceback
40
-
41
- logger = logging.getLogger(__name__)
42
-
43
-
44
@torch.compile(dynamic=True, backend=get_compiler_backend())
def resolve_future_token_ids(input_ids, future_token_ids_map):
    """Replace negative placeholder ids in ``input_ids`` in place.

    Each entry ``x < 0`` is overwritten with ``future_token_ids_map[-x]``;
    non-negative entries are left unchanged. Nothing is returned.
    """
    is_placeholder = input_ids < 0
    # Clamp so that non-negative ids still produce a valid index (slot 0);
    # the value looked up for them is discarded by the ``where`` below.
    slots = torch.clamp(-input_ids, min=0)
    resolved = future_token_ids_map[slots]
    input_ids[:] = torch.where(is_placeholder, resolved, input_ids)
51
-
52
-
53
- class TpModelWorkerClient:
54
- """A tensor parallel model worker."""
55
-
56
- def __init__(
57
- self,
58
- server_args: ServerArgs,
59
- gpu_id: int,
60
- tp_rank: int,
61
- moe_ep_rank: int,
62
- pp_rank: int,
63
- dp_rank: Optional[int],
64
- nccl_port: int,
65
- ):
66
- # Load the model
67
- self.worker = TpModelWorker(
68
- server_args, gpu_id, tp_rank, moe_ep_rank, pp_rank, dp_rank, nccl_port
69
- )
70
- self.max_running_requests = self.worker.max_running_requests
71
- self.device = self.worker.device
72
- self.gpu_id = gpu_id
73
-
74
- # Init future mappings
75
- self.future_token_ids_ct = 0
76
- self.future_token_ids_limit = self.max_running_requests * 3
77
- self.future_token_ids_map = torch.empty(
78
- (self.max_running_requests * 5,), dtype=torch.int64, device=self.device
79
- )
80
-
81
- # Launch threads
82
- self.input_queue = Queue()
83
- self.output_queue = Queue()
84
- self.forward_stream = torch.get_device_module(self.device).Stream()
85
- self.forward_thread = threading.Thread(
86
- target=self.forward_thread_func,
87
- )
88
- self.forward_thread.start()
89
- self.parent_process = psutil.Process().parent()
90
- self.scheduler_stream = torch.get_device_module(self.device).current_stream()
91
- if self.device == "cpu":
92
- self.scheduler_stream.synchronize = lambda: None # No-op for CPU
93
-
94
- self.hicache_layer_transfer_counter = None
95
-
96
- def register_hicache_layer_transfer_counter(self, counter):
97
- self.hicache_layer_transfer_counter = counter
98
-
99
- def set_hicache_consumer(self, consumer_index):
100
- if self.hicache_layer_transfer_counter is not None:
101
- self.hicache_layer_transfer_counter.set_consumer(consumer_index)
102
-
103
- def get_worker_info(self):
104
- return self.worker.get_worker_info()
105
-
106
- def get_tokens_per_layer_info(self):
107
- return self.worker.get_tokens_per_layer_info()
108
-
109
- @property
110
- def sliding_window_size(self) -> Optional[int]:
111
- return self.worker.sliding_window_size
112
-
113
- @property
114
- def is_hybrid(self) -> bool:
115
- return self.worker.is_hybrid
116
-
117
- def get_pad_input_ids_func(self):
118
- return self.worker.get_pad_input_ids_func()
119
-
120
- def get_tp_group(self):
121
- return self.worker.get_tp_group()
122
-
123
- def get_attention_tp_group(self):
124
- return self.worker.get_attention_tp_group()
125
-
126
- def get_attention_tp_cpu_group(self):
127
- return self.worker.get_attention_tp_cpu_group()
128
-
129
- def get_memory_pool(self):
130
- return (
131
- self.worker.model_runner.req_to_token_pool,
132
- self.worker.model_runner.token_to_kv_pool_allocator,
133
- )
134
-
135
- def get_kv_cache(self):
136
- return self.worker.model_runner.token_to_kv_pool
137
-
138
- def forward_thread_func(self):
139
- try:
140
- with torch.get_device_module(self.device).stream(self.forward_stream):
141
- self.forward_thread_func_()
142
- except Exception:
143
- traceback = get_exception_traceback()
144
- logger.error(f"TpModelWorkerClient hit an exception: {traceback}")
145
- self.parent_process.send_signal(signal.SIGQUIT)
146
-
147
@DynamicGradMode()
def forward_thread_func_(self):
    """
    Consume batches from ``self.input_queue`` until a falsy batch arrives.

    For each item ``(model_worker_batch, future_token_ids_ct, sync_event)``
    the loop waits on ``sync_event``, resolves future token ids in the batch
    input, runs ``self.worker.forward_batch_generation``, writes the produced
    token ids into ``self.future_token_ids_map``, starts non-blocking copies
    of the outputs to the CPU, and pushes
    ``(copy_done, logits_output, next_token_ids, can_run_cuda_graph)`` onto
    ``self.output_queue``.
    """
    slot = 0
    retained_batches = [None, None]

    while True:
        queued = self.input_queue.get()
        model_worker_batch, future_token_ids_ct, sync_event = queued
        if not model_worker_batch:
            # A falsy first field is the shutdown sentinel.
            break

        sync_event.wait()

        # Hold on to the two most recent batches. Dropping the reference
        # would let pytorch release the batch's tensor members and cause
        # CUDA illegal memory access errors.
        retained_batches[slot % 2] = model_worker_batch
        slot += 1

        # Event recorded after the CPU copies below have been issued.
        copy_done = torch.get_device_module(self.device).Event()

        # Resolve future tokens in the input.
        resolve_future_token_ids(
            model_worker_batch.input_ids, self.future_token_ids_map
        )

        # Point the hicache consumer index at the batch about to run.
        self.set_hicache_consumer(model_worker_batch.hicache_consumer_index)

        # Run forward.
        forward_result = self.worker.forward_batch_generation(
            model_worker_batch, model_worker_batch.launch_done
        )
        logits_output, next_token_ids, can_run_cuda_graph = forward_result

        # Publish the new token ids into the future token ids map.
        bs = len(model_worker_batch.seq_lens)
        first_slot = future_token_ids_ct + 1
        self.future_token_ids_map[first_slot : first_slot + bs] = next_token_ids

        # Copy results to the CPU.
        if model_worker_batch.return_logprob:
            token_logprobs = logits_output.next_token_logprobs
            logits_output.next_token_logprobs = token_logprobs.to(
                "cpu", non_blocking=True
            )
            input_logprobs = logits_output.input_token_logprobs
            if input_logprobs is not None:
                logits_output.input_token_logprobs = input_logprobs.to(
                    "cpu", non_blocking=True
                )
        hidden = logits_output.hidden_states
        if hidden is not None:
            logits_output.hidden_states = hidden.to("cpu", non_blocking=True)
        next_token_ids = next_token_ids.to("cpu", non_blocking=True)
        copy_done.record()

        result = (copy_done, logits_output, next_token_ids, can_run_cuda_graph)
        self.output_queue.put(result)
206
-
207
def resolve_last_batch_result(self, launch_done: Optional[threading.Event] = None):
    """
    Resolve the last batch result and wait for the current batch to be
    launched. Used in overlap mode.

    Takes one entry from ``self.output_queue``, waits on ``launch_done`` when
    it is given, synchronizes on the entry's ``copy_done`` event, and then
    converts the outputs with ``tolist()``.

    Returns:
        ``(logits_output, next_token_ids, can_run_cuda_graph)`` where
        ``next_token_ids`` is the ``tolist()`` result and ``logits_output``
        is the same object taken from the queue, updated in place.
    """
    entry = self.output_queue.get()
    copy_done, logits_output, next_token_ids, can_run_cuda_graph = entry

    if launch_done is not None:
        launch_done.wait()
    copy_done.synchronize()

    token_logprobs = logits_output.next_token_logprobs
    if token_logprobs is not None:
        logits_output.next_token_logprobs = token_logprobs.tolist()
        input_logprobs = logits_output.input_token_logprobs
        if input_logprobs is not None:
            logits_output.input_token_logprobs = tuple(input_logprobs.tolist())

    token_id_list = next_token_ids.tolist()
    return logits_output, token_id_list, can_run_cuda_graph
230
-
231
def forward_batch_generation(
    self, model_worker_batch: ModelWorkerBatch
) -> Tuple[None, torch.Tensor, bool]:
    """
    Queue ``model_worker_batch`` for the forward thread and return placeholders.

    The batch's ``sampling_info`` is replaced by a copy (with a fresh
    ``sampling_info_done`` event and no ``penalizer_orchestrator``) because
    the scheduler updates the original in place for the next batch.

    Returns:
        ``(None, future_next_token_ids, False)`` where
        ``future_next_token_ids`` is an int64 tensor of descending negative
        values, one per sequence in the batch, starting at
        ``-(self.future_token_ids_ct + 1)``.
    """
    # Copy sampling_info after letting it refresh its penalties.
    shared_info = model_worker_batch.sampling_info
    shared_info.update_penalties()
    private_info = dataclasses.replace(
        shared_info,
        sampling_info_done=threading.Event(),
        penalizer_orchestrator=None,
    )
    model_worker_batch.sampling_info = private_info
    self.cur_sampling_info = private_info

    # A cuda stream sync here to avoid the cuda illegal memory access error.
    sync_event = torch.get_device_module(self.device).Event()
    sync_event.record(self.scheduler_stream)

    # Hand the batch over to the forward thread.
    ct = self.future_token_ids_ct
    self.input_queue.put((model_worker_batch, ct, sync_event))

    # Build the placeholder ids that stand in for the not-yet-computed tokens.
    bs = len(model_worker_batch.seq_lens)
    first_placeholder = -(ct + 1)
    future_next_token_ids = torch.arange(
        first_placeholder,
        first_placeholder - bs,
        -1,
        dtype=torch.int64,
        device=self.device,
    )
    self.future_token_ids_ct = (ct + bs) % self.future_token_ids_limit
    return None, future_next_token_ids, False
263
-
264
def update_weights_from_disk(self, recv_req: UpdateWeightFromDiskReqInput):
    """
    Delegate to ``self.worker.update_weights_from_disk``.

    Returns:
        The ``(success, message)`` pair unpacked from the worker's result.
    """
    outcome = self.worker.update_weights_from_disk(recv_req)
    success, message = outcome
    return (success, message)
267
-
268
def init_weights_update_group(self, recv_req: InitWeightsUpdateGroupReqInput):
    """
    Delegate to ``self.worker.init_weights_update_group``.

    Returns:
        The ``(success, message)`` pair unpacked from the worker's result.
    """
    outcome = self.worker.init_weights_update_group(recv_req)
    success, message = outcome
    return (success, message)
271
-
272
def update_weights_from_distributed(
    self, recv_req: UpdateWeightsFromDistributedReqInput
):
    """
    Delegate to ``self.worker.update_weights_from_distributed``.

    Returns:
        The ``(success, message)`` pair unpacked from the worker's result.
    """
    outcome = self.worker.update_weights_from_distributed(recv_req)
    success, message = outcome
    return (success, message)
277
-
278
def update_weights_from_tensor(self, recv_req: UpdateWeightsFromTensorReqInput):
    """
    Delegate to ``self.worker.update_weights_from_tensor``.

    Returns:
        The ``(success, message)`` pair unpacked from the worker's result.
    """
    outcome = self.worker.update_weights_from_tensor(recv_req)
    success, message = outcome
    return (success, message)
281
-
282
def get_weights_by_name(self, recv_req: GetWeightsByNameReqInput):
    """Return the result of ``self.worker.get_weights_by_name(recv_req)``."""
    wrapped = self.worker
    return wrapped.get_weights_by_name(recv_req)
284
-
285
def load_lora_adapter(self, recv_req: LoadLoRAAdapterReqInput):
    """Return the result of ``self.worker.load_lora_adapter(recv_req)``."""
    wrapped = self.worker
    return wrapped.load_lora_adapter(recv_req)
287
-
288
def unload_lora_adapter(self, recv_req: UnloadLoRAAdapterReqInput):
    """Return the result of ``self.worker.unload_lora_adapter(recv_req)``."""
    wrapped = self.worker
    return wrapped.unload_lora_adapter(recv_req)
290
-
291
def can_run_lora_batch(self, lora_ids: list[str]) -> bool:
    """Return the result of ``self.worker.can_run_lora_batch(lora_ids)``."""
    wrapped = self.worker
    return wrapped.can_run_lora_batch(lora_ids)
293
-
294
def __delete__(self):
    """
    Push shutdown sentinels onto ``self.input_queue`` and ``self.copy_queue``.

    NOTE(review): ``__delete__`` is the descriptor-protocol hook, not the
    object finalizer (``__del__``), so Python will not call this on garbage
    collection. The name is kept unchanged for interface compatibility;
    confirm whether ``__del__`` or an explicit shutdown method was intended.
    """
    # forward_thread_func_ unpacks three fields from every input_queue item
    # and stops on a falsy first field, so the sentinel must be a 3-tuple;
    # a 2-tuple would raise ValueError in the consumer instead of stopping it.
    self.input_queue.put((None, None, None))
    # NOTE(review): copy_queue is not referenced by the other methods visible
    # here; presumably it is created in __init__ - verify it still exists.
    self.copy_queue.put((None, None, None))