sglang 0.5.2rc2__py3-none-any.whl → 0.5.3.post1__py3-none-any.whl

This diff compares the contents of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the packages as published in their respective public registries.
Files changed (396)
  1. sglang/bench_one_batch.py +7 -11
  2. sglang/bench_one_batch_server.py +330 -31
  3. sglang/bench_serving.py +474 -142
  4. sglang/compile_deep_gemm.py +3 -0
  5. sglang/global_config.py +2 -2
  6. sglang/lang/backend/runtime_endpoint.py +1 -1
  7. sglang/profiler.py +2 -2
  8. sglang/srt/batch_invariant_ops/__init__.py +27 -0
  9. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +549 -0
  10. sglang/srt/configs/__init__.py +10 -0
  11. sglang/srt/configs/device_config.py +3 -1
  12. sglang/srt/configs/dots_ocr.py +64 -0
  13. sglang/srt/configs/dots_vlm.py +139 -0
  14. sglang/srt/configs/falcon_h1.py +314 -0
  15. sglang/srt/configs/load_config.py +9 -0
  16. sglang/srt/configs/mamba_utils.py +117 -0
  17. sglang/srt/configs/model_config.py +228 -92
  18. sglang/srt/configs/nemotron_h.py +286 -0
  19. sglang/srt/configs/qwen3_next.py +294 -0
  20. sglang/srt/configs/qwen3_vl.py +586 -0
  21. sglang/srt/connector/__init__.py +8 -1
  22. sglang/srt/connector/remote_instance.py +82 -0
  23. sglang/srt/constrained/base_grammar_backend.py +49 -12
  24. sglang/srt/constrained/llguidance_backend.py +0 -1
  25. sglang/srt/constrained/outlines_backend.py +0 -1
  26. sglang/srt/constrained/outlines_jump_forward.py +1 -1
  27. sglang/srt/constrained/xgrammar_backend.py +30 -9
  28. sglang/srt/custom_op.py +11 -1
  29. sglang/srt/debug_utils/dump_comparator.py +81 -44
  30. sglang/srt/debug_utils/dump_loader.py +97 -0
  31. sglang/srt/debug_utils/dumper.py +21 -6
  32. sglang/srt/debug_utils/text_comparator.py +73 -11
  33. sglang/srt/disaggregation/ascend/conn.py +2 -2
  34. sglang/srt/disaggregation/ascend/transfer_engine.py +47 -9
  35. sglang/srt/disaggregation/base/conn.py +1 -1
  36. sglang/srt/disaggregation/common/conn.py +279 -108
  37. sglang/srt/disaggregation/decode.py +78 -37
  38. sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
  39. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +29 -17
  40. sglang/srt/disaggregation/fake/conn.py +1 -1
  41. sglang/srt/disaggregation/mini_lb.py +6 -445
  42. sglang/srt/disaggregation/mooncake/conn.py +55 -537
  43. sglang/srt/disaggregation/nixl/conn.py +373 -68
  44. sglang/srt/disaggregation/prefill.py +53 -49
  45. sglang/srt/disaggregation/utils.py +40 -54
  46. sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
  47. sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
  48. sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
  49. sglang/srt/distributed/parallel_state.py +156 -80
  50. sglang/srt/entrypoints/engine.py +59 -18
  51. sglang/srt/entrypoints/grpc_request_manager.py +842 -0
  52. sglang/srt/entrypoints/grpc_server.py +950 -0
  53. sglang/srt/entrypoints/http_server.py +179 -60
  54. sglang/srt/entrypoints/openai/protocol.py +265 -29
  55. sglang/srt/entrypoints/openai/serving_base.py +65 -3
  56. sglang/srt/entrypoints/openai/serving_chat.py +213 -122
  57. sglang/srt/entrypoints/openai/serving_completions.py +14 -3
  58. sglang/srt/entrypoints/openai/serving_embedding.py +9 -3
  59. sglang/srt/entrypoints/openai/serving_rerank.py +3 -1
  60. sglang/srt/entrypoints/openai/serving_responses.py +48 -3
  61. sglang/srt/entrypoints/openai/serving_score.py +1 -0
  62. sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
  63. sglang/srt/environ.py +289 -0
  64. sglang/srt/eplb/eplb_manager.py +2 -2
  65. sglang/srt/eplb/expert_distribution.py +26 -13
  66. sglang/srt/eplb/expert_location.py +38 -8
  67. sglang/srt/eplb/expert_location_updater.py +1 -1
  68. sglang/srt/function_call/base_format_detector.py +3 -6
  69. sglang/srt/function_call/ebnf_composer.py +11 -9
  70. sglang/srt/function_call/function_call_parser.py +17 -8
  71. sglang/srt/function_call/glm4_moe_detector.py +4 -4
  72. sglang/srt/function_call/gpt_oss_detector.py +23 -0
  73. sglang/srt/function_call/json_array_parser.py +63 -0
  74. sglang/srt/function_call/kimik2_detector.py +17 -4
  75. sglang/srt/function_call/qwen3_coder_detector.py +1 -1
  76. sglang/srt/function_call/utils.py +96 -5
  77. sglang/srt/grpc/__init__.py +1 -0
  78. sglang/srt/grpc/compile_proto.py +245 -0
  79. sglang/srt/grpc/sglang_scheduler_pb2.py +119 -0
  80. sglang/srt/grpc/sglang_scheduler_pb2.pyi +492 -0
  81. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +327 -0
  82. sglang/srt/layers/activation.py +143 -9
  83. sglang/srt/layers/attention/aiter_backend.py +14 -15
  84. sglang/srt/layers/attention/ascend_backend.py +115 -9
  85. sglang/srt/layers/attention/attention_registry.py +215 -0
  86. sglang/srt/layers/attention/base_attn_backend.py +12 -3
  87. sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
  88. sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
  89. sglang/srt/layers/attention/fla/chunk.py +242 -0
  90. sglang/srt/layers/attention/fla/chunk_delta_h.py +314 -0
  91. sglang/srt/layers/attention/fla/chunk_o.py +178 -0
  92. sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +151 -0
  93. sglang/srt/layers/attention/fla/cumsum.py +300 -0
  94. sglang/srt/layers/attention/fla/fused_recurrent.py +640 -0
  95. sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +232 -0
  96. sglang/srt/layers/attention/fla/index.py +37 -0
  97. sglang/srt/layers/attention/fla/l2norm.py +150 -0
  98. sglang/srt/layers/attention/fla/layernorm_gated.py +343 -0
  99. sglang/srt/layers/attention/fla/op.py +66 -0
  100. sglang/srt/layers/attention/fla/solve_tril.py +465 -0
  101. sglang/srt/layers/attention/fla/utils.py +331 -0
  102. sglang/srt/layers/attention/fla/wy_fast.py +158 -0
  103. sglang/srt/layers/attention/flashattention_backend.py +40 -8
  104. sglang/srt/layers/attention/flashinfer_backend.py +341 -204
  105. sglang/srt/layers/attention/flashinfer_mla_backend.py +28 -28
  106. sglang/srt/layers/attention/flashmla_backend.py +7 -5
  107. sglang/srt/layers/attention/hybrid_attn_backend.py +68 -53
  108. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +708 -0
  109. sglang/srt/layers/attention/intel_amx_backend.py +3 -0
  110. sglang/srt/layers/attention/mamba/causal_conv1d.py +129 -0
  111. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +974 -0
  112. sglang/srt/layers/attention/mamba/mamba.py +577 -0
  113. sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
  114. sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
  115. sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
  116. sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
  117. sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
  118. sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +214 -0
  119. sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +562 -0
  120. sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +646 -0
  121. sglang/srt/layers/attention/mamba/ops/ssd_combined.py +262 -0
  122. sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +264 -0
  123. sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
  124. sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
  125. sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
  126. sglang/srt/layers/attention/nsa/nsa_indexer.py +761 -0
  127. sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
  128. sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
  129. sglang/srt/layers/attention/nsa/transform_index.py +144 -0
  130. sglang/srt/layers/attention/nsa/utils.py +24 -0
  131. sglang/srt/layers/attention/nsa_backend.py +887 -0
  132. sglang/srt/layers/attention/tbo_backend.py +6 -6
  133. sglang/srt/layers/attention/torch_flex_backend.py +325 -0
  134. sglang/srt/layers/attention/torch_native_backend.py +12 -6
  135. sglang/srt/layers/attention/triton_backend.py +57 -7
  136. sglang/srt/layers/attention/trtllm_mha_backend.py +5 -7
  137. sglang/srt/layers/attention/trtllm_mla_backend.py +276 -39
  138. sglang/srt/layers/attention/vision.py +58 -0
  139. sglang/srt/layers/attention/wave_backend.py +4 -4
  140. sglang/srt/layers/attention/wave_ops/decode_attention.py +2 -4
  141. sglang/srt/layers/attention/wave_ops/extend_attention.py +1 -3
  142. sglang/srt/layers/communicator.py +8 -0
  143. sglang/srt/layers/dp_attention.py +41 -2
  144. sglang/srt/layers/elementwise.py +3 -1
  145. sglang/srt/layers/layernorm.py +34 -15
  146. sglang/srt/layers/linear.py +55 -7
  147. sglang/srt/layers/logits_processor.py +180 -18
  148. sglang/srt/layers/modelopt_utils.py +11 -0
  149. sglang/srt/layers/moe/__init__.py +2 -1
  150. sglang/srt/layers/moe/cutlass_w4a8_moe.py +21 -24
  151. sglang/srt/layers/moe/ep_moe/kernels.py +33 -454
  152. sglang/srt/layers/moe/ep_moe/layer.py +248 -333
  153. sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +183 -0
  154. sglang/srt/layers/moe/fused_moe_native.py +5 -3
  155. sglang/srt/layers/moe/fused_moe_triton/configs/{triton_3_4_0/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json } +35 -35
  156. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=352,device_name=NVIDIA_RTX_5880_Ada_Generation,dtype=fp8_w8a8.json +146 -0
  157. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  158. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=512,device_name=NVIDIA_H20.json +146 -0
  159. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  160. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
  161. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
  162. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  163. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
  164. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
  165. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
  166. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  167. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
  168. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -2
  169. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +7 -3
  170. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +23 -20
  171. sglang/srt/layers/moe/fused_moe_triton/layer.py +68 -72
  172. sglang/srt/layers/moe/moe_runner/__init__.py +2 -1
  173. sglang/srt/layers/moe/moe_runner/base.py +274 -1
  174. sglang/srt/layers/moe/moe_runner/deep_gemm.py +304 -0
  175. sglang/srt/layers/moe/moe_runner/runner.py +83 -0
  176. sglang/srt/layers/moe/moe_runner/triton.py +448 -0
  177. sglang/srt/layers/moe/token_dispatcher/__init__.py +16 -4
  178. sglang/srt/layers/moe/token_dispatcher/{base_dispatcher.py → base.py} +67 -17
  179. sglang/srt/layers/moe/token_dispatcher/deepep.py +118 -56
  180. sglang/srt/layers/moe/token_dispatcher/standard.py +44 -2
  181. sglang/srt/layers/moe/topk.py +30 -9
  182. sglang/srt/layers/moe/utils.py +29 -7
  183. sglang/srt/layers/parameter.py +23 -6
  184. sglang/srt/layers/quantization/__init__.py +1 -1
  185. sglang/srt/layers/quantization/awq.py +19 -7
  186. sglang/srt/layers/quantization/base_config.py +11 -6
  187. sglang/srt/layers/quantization/blockwise_int8.py +38 -27
  188. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +1 -0
  189. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +50 -30
  190. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +2 -0
  191. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +13 -1
  192. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
  193. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +2 -10
  194. sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +27 -0
  195. sglang/srt/layers/quantization/fp8.py +155 -60
  196. sglang/srt/layers/quantization/fp8_utils.py +51 -32
  197. sglang/srt/layers/quantization/gptq.py +25 -17
  198. sglang/srt/layers/quantization/modelopt_quant.py +191 -56
  199. sglang/srt/layers/quantization/moe_wna16.py +21 -18
  200. sglang/srt/layers/quantization/mxfp4.py +74 -42
  201. sglang/srt/layers/quantization/quark/quark.py +3 -1
  202. sglang/srt/layers/quantization/quark/quark_moe.py +48 -30
  203. sglang/srt/layers/quantization/unquant.py +135 -47
  204. sglang/srt/layers/quantization/w4afp8.py +28 -33
  205. sglang/srt/layers/quantization/w8a8_fp8.py +35 -20
  206. sglang/srt/layers/quantization/w8a8_int8.py +91 -41
  207. sglang/srt/layers/rotary_embedding.py +78 -31
  208. sglang/srt/layers/sampler.py +213 -21
  209. sglang/srt/layers/utils.py +23 -0
  210. sglang/srt/lora/backend/base_backend.py +50 -8
  211. sglang/srt/lora/backend/chunked_backend.py +348 -0
  212. sglang/srt/lora/backend/triton_backend.py +99 -5
  213. sglang/srt/lora/layers.py +32 -0
  214. sglang/srt/lora/lora.py +8 -3
  215. sglang/srt/lora/lora_manager.py +44 -118
  216. sglang/srt/lora/mem_pool.py +25 -11
  217. sglang/srt/lora/triton_ops/__init__.py +4 -0
  218. sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
  219. sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +174 -0
  220. sglang/srt/lora/utils.py +22 -11
  221. sglang/srt/managers/async_dynamic_batch_tokenizer.py +170 -0
  222. sglang/srt/managers/cache_controller.py +199 -301
  223. sglang/srt/managers/data_parallel_controller.py +115 -80
  224. sglang/srt/managers/detokenizer_manager.py +19 -15
  225. sglang/srt/managers/disagg_service.py +46 -0
  226. sglang/srt/managers/io_struct.py +340 -109
  227. sglang/srt/managers/mm_utils.py +44 -6
  228. sglang/srt/managers/multi_tokenizer_mixin.py +357 -407
  229. sglang/srt/managers/multimodal_processor.py +1 -2
  230. sglang/srt/managers/overlap_utils.py +55 -0
  231. sglang/srt/managers/schedule_batch.py +343 -212
  232. sglang/srt/managers/schedule_policy.py +145 -18
  233. sglang/srt/managers/scheduler.py +653 -273
  234. sglang/srt/managers/scheduler_input_blocker.py +1 -1
  235. sglang/srt/managers/scheduler_metrics_mixin.py +99 -126
  236. sglang/srt/managers/scheduler_output_processor_mixin.py +255 -108
  237. sglang/srt/managers/scheduler_profiler_mixin.py +6 -6
  238. sglang/srt/managers/scheduler_update_weights_mixin.py +7 -0
  239. sglang/srt/managers/tokenizer_communicator_mixin.py +675 -0
  240. sglang/srt/managers/tokenizer_manager.py +579 -674
  241. sglang/srt/managers/tp_worker.py +96 -26
  242. sglang/srt/managers/utils.py +1 -45
  243. sglang/srt/mem_cache/allocator.py +21 -22
  244. sglang/srt/mem_cache/allocator_ascend.py +41 -27
  245. sglang/srt/mem_cache/base_prefix_cache.py +1 -1
  246. sglang/srt/mem_cache/chunk_cache.py +9 -2
  247. sglang/srt/mem_cache/evict_policy.py +23 -0
  248. sglang/srt/mem_cache/hicache_storage.py +43 -24
  249. sglang/srt/mem_cache/hiradix_cache.py +222 -75
  250. sglang/srt/mem_cache/memory_pool.py +651 -80
  251. sglang/srt/mem_cache/memory_pool_host.py +239 -228
  252. sglang/srt/mem_cache/radix_cache.py +227 -73
  253. sglang/srt/mem_cache/radix_cache_cpp.py +11 -8
  254. sglang/srt/mem_cache/storage/__init__.py +10 -0
  255. sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +151 -0
  256. sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +109 -0
  257. sglang/srt/mem_cache/storage/backend_factory.py +223 -0
  258. sglang/srt/mem_cache/storage/eic/eic_storage.py +778 -0
  259. sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
  260. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +164 -0
  261. sglang/srt/mem_cache/storage/hf3fs/{client_hf3fs.py → hf3fs_usrbio_client.py} +5 -1
  262. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +259 -62
  263. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +284 -0
  264. sglang/srt/mem_cache/storage/lmcache/unit_test.py +121 -0
  265. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +166 -17
  266. sglang/srt/mem_cache/swa_radix_cache.py +93 -48
  267. sglang/srt/metrics/collector.py +511 -132
  268. sglang/srt/metrics/func_timer.py +2 -7
  269. sglang/srt/metrics/startup_func_log_and_timer.py +150 -0
  270. sglang/srt/metrics/utils.py +8 -1
  271. sglang/srt/model_executor/cpu_graph_runner.py +640 -0
  272. sglang/srt/model_executor/cuda_graph_runner.py +52 -37
  273. sglang/srt/model_executor/forward_batch_info.py +74 -46
  274. sglang/srt/model_executor/model_runner.py +455 -176
  275. sglang/srt/model_executor/npu_graph_runner.py +12 -5
  276. sglang/srt/model_loader/__init__.py +10 -4
  277. sglang/srt/model_loader/loader.py +319 -10
  278. sglang/srt/model_loader/remote_instance_weight_loader_utils.py +69 -0
  279. sglang/srt/model_loader/weight_utils.py +161 -3
  280. sglang/srt/models/apertus.py +686 -0
  281. sglang/srt/models/bailing_moe.py +820 -217
  282. sglang/srt/models/bailing_moe_nextn.py +168 -0
  283. sglang/srt/models/deepseek_nextn.py +6 -1
  284. sglang/srt/models/deepseek_v2.py +607 -130
  285. sglang/srt/models/dots_ocr.py +173 -0
  286. sglang/srt/models/dots_vlm.py +174 -0
  287. sglang/srt/models/dots_vlm_vit.py +337 -0
  288. sglang/srt/models/ernie4.py +1 -1
  289. sglang/srt/models/falcon_h1.py +578 -0
  290. sglang/srt/models/gemma3_causal.py +0 -2
  291. sglang/srt/models/gemma3_mm.py +17 -1
  292. sglang/srt/models/gemma3n_mm.py +2 -2
  293. sglang/srt/models/glm4_moe.py +4 -4
  294. sglang/srt/models/glm4_moe_nextn.py +2 -2
  295. sglang/srt/models/glm4v.py +5 -3
  296. sglang/srt/models/glm4v_moe.py +4 -1
  297. sglang/srt/models/gpt_oss.py +8 -31
  298. sglang/srt/models/grok.py +5 -13
  299. sglang/srt/models/kimi_vl_moonvit.py +2 -2
  300. sglang/srt/models/llama.py +4 -0
  301. sglang/srt/models/llama4.py +9 -0
  302. sglang/srt/models/llama_eagle3.py +13 -0
  303. sglang/srt/models/longcat_flash.py +3 -3
  304. sglang/srt/models/longcat_flash_nextn.py +1 -1
  305. sglang/srt/models/mixtral.py +1 -3
  306. sglang/srt/models/mllama4.py +50 -4
  307. sglang/srt/models/nemotron_h.py +514 -0
  308. sglang/srt/models/opt.py +637 -0
  309. sglang/srt/models/qwen2_5_vl.py +29 -5
  310. sglang/srt/models/qwen2_audio.py +1 -1
  311. sglang/srt/models/qwen2_moe.py +120 -13
  312. sglang/srt/models/qwen2_vl.py +1 -1
  313. sglang/srt/models/qwen3.py +18 -3
  314. sglang/srt/models/qwen3_moe.py +32 -4
  315. sglang/srt/models/qwen3_next.py +1069 -0
  316. sglang/srt/models/qwen3_next_mtp.py +112 -0
  317. sglang/srt/models/qwen3_vl.py +787 -0
  318. sglang/srt/models/qwen3_vl_moe.py +471 -0
  319. sglang/srt/models/registry.py +15 -3
  320. sglang/srt/models/sarashina2_vision.py +269 -0
  321. sglang/srt/models/solar.py +505 -0
  322. sglang/srt/models/starcoder2.py +357 -0
  323. sglang/srt/models/step3_vl.py +1 -1
  324. sglang/srt/models/torch_native_llama.py +9 -2
  325. sglang/srt/models/utils.py +55 -0
  326. sglang/srt/multimodal/processors/base_processor.py +15 -7
  327. sglang/srt/multimodal/processors/dots_vlm.py +98 -0
  328. sglang/srt/multimodal/processors/glm4v.py +9 -9
  329. sglang/srt/multimodal/processors/internvl.py +153 -129
  330. sglang/srt/multimodal/processors/qwen_vl.py +23 -6
  331. sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
  332. sglang/srt/offloader.py +27 -3
  333. sglang/srt/parser/jinja_template_utils.py +6 -0
  334. sglang/srt/sampling/sampling_batch_info.py +49 -26
  335. sglang/srt/sampling/sampling_params.py +7 -0
  336. sglang/srt/server_args.py +1051 -285
  337. sglang/srt/server_args_config_parser.py +146 -0
  338. sglang/srt/single_batch_overlap.py +151 -0
  339. sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
  340. sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
  341. sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
  342. sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
  343. sglang/srt/speculative/cpp_ngram/param.h +125 -0
  344. sglang/srt/speculative/cpp_ngram/queue.h +71 -0
  345. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +7 -1
  346. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +13 -2
  347. sglang/srt/speculative/{eagle_utils.py → eagle_info.py} +207 -757
  348. sglang/srt/speculative/eagle_worker.py +98 -29
  349. sglang/srt/speculative/ngram_info.py +428 -0
  350. sglang/srt/speculative/ngram_worker.py +246 -0
  351. sglang/srt/speculative/spec_info.py +52 -0
  352. sglang/srt/speculative/spec_utils.py +605 -0
  353. sglang/srt/speculative/standalone_worker.py +109 -0
  354. sglang/srt/torch_memory_saver_adapter.py +5 -7
  355. sglang/srt/tracing/trace.py +578 -0
  356. sglang/srt/two_batch_overlap.py +9 -5
  357. sglang/srt/utils/__init__.py +2 -0
  358. sglang/srt/{utils.py → utils/common.py} +451 -77
  359. sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +55 -5
  360. sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
  361. sglang/srt/utils/rpd_utils.py +452 -0
  362. sglang/srt/utils/slow_rank_detector.py +71 -0
  363. sglang/srt/warmup.py +8 -4
  364. sglang/srt/weight_sync/utils.py +2 -2
  365. sglang/test/attention/test_trtllm_mla_backend.py +169 -5
  366. sglang/test/get_logits_ut.py +57 -0
  367. sglang/test/longbench_v2/__init__.py +1 -0
  368. sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
  369. sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
  370. sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
  371. sglang/test/run_eval.py +119 -11
  372. sglang/test/runners.py +5 -1
  373. sglang/test/simple_eval_common.py +5 -2
  374. sglang/test/simple_eval_longbench_v2.py +332 -0
  375. sglang/test/simple_eval_mmmu_vlm.py +441 -0
  376. sglang/test/test_block_fp8.py +2 -2
  377. sglang/test/test_cutlass_moe.py +24 -6
  378. sglang/test/test_cutlass_w4a8_moe.py +9 -19
  379. sglang/test/test_deterministic.py +313 -0
  380. sglang/test/test_deterministic_utils.py +81 -0
  381. sglang/test/test_disaggregation_utils.py +140 -0
  382. sglang/test/test_fp4_moe.py +370 -1
  383. sglang/test/test_programs.py +1 -1
  384. sglang/test/test_utils.py +407 -8
  385. sglang/utils.py +21 -1
  386. sglang/version.py +1 -1
  387. {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/METADATA +69 -124
  388. {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/RECORD +392 -251
  389. sglang/srt/disaggregation/launch_lb.py +0 -118
  390. sglang/srt/managers/tp_worker_overlap_thread.py +0 -296
  391. sglang/srt/mem_cache/lora_radix_cache.py +0 -421
  392. sglang/test/test_block_fp8_ep.py +0 -358
  393. /sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
  394. {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/WHEEL +0 -0
  395. {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/licenses/LICENSE +0 -0
  396. {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/top_level.txt +0 -0
sglang/srt/disaggregation/mini_lb.py
@@ -1,445 +1,6 @@
- """
- Minimal HTTP load balancer for prefill and decode servers for testing.
- """
-
- import asyncio
- import dataclasses
- import logging
- import random
- import urllib
- from http import HTTPStatus
- from itertools import chain
- from typing import List, Optional
-
- import aiohttp
- import orjson
- import uvicorn
- from fastapi import FastAPI, HTTPException
- from fastapi.responses import ORJSONResponse, Response, StreamingResponse
-
- from sglang.srt.disaggregation.utils import PDRegistryRequest
- from sglang.srt.utils import maybe_wrap_ipv6_address
-
- AIOHTTP_STREAM_READ_CHUNK_SIZE = (
-     1024 * 64
- ) # 64KB, to prevent aiohttp's "Chunk too big" error
-
-
- def setup_logger():
-     logger = logging.getLogger("pdlb")
-     logger.setLevel(logging.INFO)
-
-     formatter = logging.Formatter(
-         "[PDLB (Python)] %(asctime)s - %(levelname)s - %(message)s",
-         datefmt="%Y-%m-%d %H:%M:%S",
-     )
-
-     handler = logging.StreamHandler()
-     handler.setFormatter(formatter)
-     logger.addHandler(handler)
-
-     return logger
-
-
- logger = setup_logger()
-
-
- @dataclasses.dataclass
- class PrefillConfig:
-     url: str
-     bootstrap_port: Optional[int] = None
-
-
- class MiniLoadBalancer:
-     def __init__(
-         self,
-         prefill_configs: List[PrefillConfig],
-         decode_servers: List[str],
-         timeout: int,
-     ):
-         self.prefill_configs = prefill_configs
-         self.prefill_servers = [p.url for p in prefill_configs]
-         self.decode_servers = decode_servers
-         self.timeout = timeout
-
-     def add_prefill_server(self, new_prefill_config: PrefillConfig):
-         self.prefill_configs.append(new_prefill_config)
-         self.prefill_servers.append(new_prefill_config.url)
-
-     def add_decode_server(self, new_decode_server: str):
-         self.decode_servers.append(new_decode_server)
-
-     def select_pair(self):
-         # TODO: return some message instead of panic
-         assert len(self.prefill_configs) > 0, "No prefill servers available"
-         assert len(self.decode_servers) > 0, "No decode servers available"
-
-         prefill_config = random.choice(self.prefill_configs)
-         decode_server = random.choice(self.decode_servers)
-         return prefill_config.url, prefill_config.bootstrap_port, decode_server
-
-     async def generate(
-         self, modified_request, prefill_server, decode_server, endpoint
-     ) -> ORJSONResponse:
-         assert endpoint[0] != "/", f"Endpoint should not start with '/': {endpoint}"
-
-         async with aiohttp.ClientSession(
-             timeout=aiohttp.ClientTimeout(
-                 total=self.timeout
-             ) # Add timeout for request reliability
-         ) as session:
-             tasks = [
-                 session.post(f"{prefill_server}/{endpoint}", json=modified_request),
-                 session.post(f"{decode_server}/{endpoint}", json=modified_request),
-             ]
-
-             # Wait for both responses to complete. Prefill should end first.
-             prefill_response, decode_response = await asyncio.gather(*tasks)
-
-             if "return_logprob" in modified_request:
-
-                 prefill_json = await prefill_response.json()
-                 ret_json = await decode_response.json()
-
-                 # merge `meta_info.input_token_logprobs` from prefill to decode
-                 if "meta_info" in ret_json:
-                     if "input_token_logprobs" in ret_json["meta_info"]:
-                         ret_json["meta_info"]["input_token_logprobs"] = (
-                             prefill_json["meta_info"]["input_token_logprobs"]
-                             + ret_json["meta_info"]["input_token_logprobs"]
-                         )
-             else:
-                 ret_json = await decode_response.json()
-
-             return ORJSONResponse(
-                 content=ret_json,
-                 status_code=decode_response.status,
-             )
-
-     async def generate_stream(
-         self, modified_request, prefill_server, decode_server, endpoint="generate"
-     ):
-         assert endpoint[0] != "/", f"Endpoint should not start with '/': {endpoint}"
-
-         async def stream_results():
-             async with aiohttp.ClientSession(
-                 timeout=aiohttp.ClientTimeout(
-                     total=self.timeout
-                 ) # Add timeout for request reliability
-             ) as session:
-                 # Create the tasks for both prefill and decode requests
-                 tasks = [
-                     session.post(f"{prefill_server}/{endpoint}", json=modified_request),
-                     session.post(f"{decode_server}/{endpoint}", json=modified_request),
-                 ]
-                 # Wait for both responses to complete. Since this is streaming, they return immediately.
-                 prefill_response, decode_response = await asyncio.gather(*tasks)
-
-                 if modified_request.get("return_logprob", False):
-                     prefill_chunks = []
-                     async for chunk in prefill_response.content:
-                         prefill_chunks.append(chunk)
-
-                     first_prefill_chunk = (
-                         prefill_chunks[0].decode("utf-8")[5:].strip("\n")
-                     )
-                     first_prefill_chunk_json = orjson.loads(first_prefill_chunk)
-
-                     async for chunk in decode_response.content:
-                         # Note: This is inefficient
-                         # merge prefill input_token_logprobs, output_token_logprobs to decode
-                         decoded_chunk = chunk.decode("utf-8")
-                         if (
-                             decoded_chunk
-                             and decoded_chunk.startswith("data:")
-                             and "[DONE]" not in decoded_chunk
-                         ):
-                             ret_json = orjson.loads(decoded_chunk[5:].strip("\n"))
-                             ret_json["meta_info"]["input_token_logprobs"] = (
-                                 first_prefill_chunk_json["meta_info"][
-                                     "input_token_logprobs"
-                                 ]
-                                 + ret_json["meta_info"]["input_token_logprobs"]
-                             )
-
-                             yield b"data: " + orjson.dumps(ret_json) + b"\n\n"
-                         else:
-                             yield chunk
-                 else:
-                     async for chunk in decode_response.content.iter_chunked(
-                         AIOHTTP_STREAM_READ_CHUNK_SIZE
-                     ):
-                         yield chunk
-
-         return StreamingResponse(
-             stream_results(),
-             media_type="text/event-stream",
-         )
-
-
- app = FastAPI()
- load_balancer: Optional[MiniLoadBalancer] = None
-
-
- @app.get("/health")
- async def health_check():
-     return Response(status_code=200)
-
-
- @app.get("/health_generate")
- async def health_generate():
-     prefill_servers, decode_servers = (
-         load_balancer.prefill_servers,
-         load_balancer.decode_servers,
-     )
-     async with aiohttp.ClientSession() as session:
-         # Create the tasks
-         tasks = []
-         for server in chain(prefill_servers, decode_servers):
-             tasks.append(session.get(f"{server}/health_generate"))
-         for i, response in enumerate(asyncio.as_completed(tasks)):
-             await response
-     return Response(status_code=200)
-
-
- @app.post("/flush_cache")
- async def flush_cache():
-     prefill_servers, decode_servers = (
-         load_balancer.prefill_servers,
-         load_balancer.decode_servers,
-     )
-     async with aiohttp.ClientSession() as session:
-         # Create the tasks
-         tasks = []
-         for server in chain(prefill_servers, decode_servers):
-             tasks.append(session.post(f"{server}/flush_cache"))
-         for i, response in enumerate(asyncio.as_completed(tasks)):
-             await response
-     return Response(status_code=200)
-
-
- @app.get("/get_server_info")
- async def get_server_info():
-     prefill_servers, decode_servers = (
-         load_balancer.prefill_servers,
-         load_balancer.decode_servers,
-     )
-     prefill_infos = []
-     decode_infos = []
-     all_internal_states = []
-
-     async with aiohttp.ClientSession() as session:
-         for server in chain(prefill_servers):
-             server_info = await session.get(f"{server}/get_server_info")
-             prefill_infos.append(await server_info.json())
-         for server in chain(decode_servers):
-             server_info = await session.get(f"{server}/get_server_info")
-             info_json = await server_info.json()
-             decode_infos.append(info_json)
-             # Extract internal_states from decode servers
-             if "internal_states" in info_json:
-                 all_internal_states.extend(info_json["internal_states"])
-
-     # Return format expected by bench_one_batch_server.py
-     if all_internal_states:
-         return {
-             "internal_states": all_internal_states,
-             "prefill": prefill_infos,
-             "decode": decode_infos,
-         }
-     else:
-         # Fallback with dummy data if no internal states found
-         return {
-             "internal_states": [
-                 {
-                     "last_gen_throughput": 0.0,
-                     "avg_spec_accept_length": None,
-                 }
-             ],
-             "prefill": prefill_infos,
-             "decode": decode_infos,
-         }
-
-
- @app.get("/get_model_info")
- async def get_model_info():
-     global load_balancer
-
-     if not load_balancer or not load_balancer.prefill_servers:
-         raise HTTPException(
-             status_code=HTTPStatus.SERVICE_UNAVAILABLE,
-             detail="There is no server registered",
-         )
-
-     target_server_url = load_balancer.prefill_servers[0]
-     endpoint_url = f"{target_server_url}/get_model_info"
-
-     async with aiohttp.ClientSession() as session:
-         try:
-             async with session.get(endpoint_url) as response:
-                 if response.status != 200:
-                     error_text = await response.text()
-                     raise HTTPException(
-                         status_code=HTTPStatus.BAD_GATEWAY,
-                         detail=(
-                             f"Failed to get model info from {target_server_url}"
-                             f"Status: {response.status}, Response: {error_text}"
-                         ),
-                     )
-
-                 model_info_json = await response.json()
-                 return ORJSONResponse(content=model_info_json)
-
-         except aiohttp.ClientError as e:
-             raise HTTPException(
-                 status_code=HTTPStatus.SERVICE_UNAVAILABLE,
-                 detail=f"Failed to get model info from backend",
-             )
-
-
- @app.post("/generate")
- async def handle_generate_request(request_data: dict):
-     prefill_server, bootstrap_port, decode_server = load_balancer.select_pair()
-
-     # Parse and transform prefill_server for bootstrap data
-     parsed_url = urllib.parse.urlparse(prefill_server)
-     hostname = maybe_wrap_ipv6_address(parsed_url.hostname)
-     modified_request = request_data.copy()
-
-     batch_size = _get_request_batch_size(modified_request)
-     if batch_size is not None:
-         modified_request.update(
-             {
-                 "bootstrap_host": [hostname] * batch_size,
-                 "bootstrap_port": [bootstrap_port] * batch_size,
-                 "bootstrap_room": [
-                     _generate_bootstrap_room() for _ in range(batch_size)
-                 ],
-             }
-         )
-     else:
-         modified_request.update(
-             {
-                 "bootstrap_host": hostname,
-                 "bootstrap_port": bootstrap_port,
-                 "bootstrap_room": _generate_bootstrap_room(),
-             }
-         )
-
-     if request_data.get("stream", False):
-         return await load_balancer.generate_stream(
-             modified_request, prefill_server, decode_server, "generate"
-         )
-     else:
-         return await load_balancer.generate(
-             modified_request, prefill_server, decode_server, "generate"
-         )
-
-
- async def _forward_to_backend(request_data: dict, endpoint_name: str):
-     prefill_server, bootstrap_port, decode_server = load_balancer.select_pair()
-
-     # Parse and transform prefill_server for bootstrap data
-     parsed_url = urllib.parse.urlparse(prefill_server)
-     hostname = maybe_wrap_ipv6_address(parsed_url.hostname)
-     modified_request = request_data.copy()
-     modified_request.update(
-         {
-             "bootstrap_host": hostname,
-             "bootstrap_port": bootstrap_port,
-             "bootstrap_room": _generate_bootstrap_room(),
-         }
-     )
-
-     if request_data.get("stream", False):
-         return await load_balancer.generate_stream(
-             modified_request,
-             prefill_server,
-             decode_server,
-             endpoint=endpoint_name,
-         )
-     else:
-         return await load_balancer.generate(
-             modified_request,
-             prefill_server,
-             decode_server,
-             endpoint=endpoint_name,
-         )
-
-
- @app.post("/v1/chat/completions")
- async def handle_chat_completion_request(request_data: dict):
-     return await _forward_to_backend(request_data, "v1/chat/completions")
-
-
- @app.post("/v1/completions")
- async def handle_completion_request(request_data: dict):
-     return await _forward_to_backend(request_data, "v1/completions")
-
-
- def _generate_bootstrap_room():
-     return random.randint(0, 2**63 - 1)
-
-
- # We may utilize `GenerateReqInput`'s logic later
- def _get_request_batch_size(request):
-     if (text := request.get("text")) is not None:
-         return None if isinstance(text, str) else len(text)
-     if (input_ids := request.get("input_ids")) is not None:
-         return None if isinstance(input_ids[0], int) else len(input_ids)
-     return None
-
-
- @app.get("/v1/models")
- async def get_models():
-     prefill_server = load_balancer.prefill_servers[0] # Get the first prefill server
-     async with aiohttp.ClientSession() as session:
-         try:
-             response = await session.get(f"{prefill_server}/v1/models")
-             if response.status != 200:
-                 raise HTTPException(
-                     status_code=response.status,
-                     detail=f"Prefill server error: Status {response.status}",
-                 )
-             return ORJSONResponse(content=await response.json())
-         except Exception as e:
-             raise HTTPException(status_code=500, detail=str(e))
-
-
- @app.post("/register")
- async def register(obj: PDRegistryRequest):
-     if obj.mode == "prefill":
-         load_balancer.add_prefill_server(
-             PrefillConfig(obj.registry_url, obj.bootstrap_port)
-         )
-         logger.info(
-             f"Registered prefill server: {obj.registry_url} with bootstrap port: {obj.bootstrap_port}"
-         )
-     elif obj.mode == "decode":
-         load_balancer.add_decode_server(obj.registry_url)
-         logger.info(f"Registered decode server: {obj.registry_url}")
-     else:
-         raise HTTPException(
-             status_code=400,
-             detail="Invalid mode. Must be either PREFILL or DECODE.",
-         )
-
-     logger.info(
-         f"#Prefill servers: {len(load_balancer.prefill_configs)}, "
-         f"#Decode servers: {len(load_balancer.decode_servers)}"
-     )
-
-     return Response(status_code=200)
-
-
- def run(prefill_configs, decode_addrs, host, port, timeout):
-     global load_balancer
-     load_balancer = MiniLoadBalancer(prefill_configs, decode_addrs, timeout=timeout)
-     uvicorn.run(app, host=host, port=port)
-
-
- if __name__ == "__main__":
-     # FIXME: remove this, use the unified entry point: sglang.srt.disaggregation.launch_lb
-     from sglang.srt.disaggregation.launch_lb import main
-
-     main()
+ raise RuntimeError(
+     """The 'mini_lb' module has been relocated to the 'sglang_router' package.
+ We recommend installing 'sglang-router' with Rust support for optimal performance.
+ If you encounter issues building the router with Rust, set the environment variable
+ 'SGLANG_ROUTER_BUILD_NO_RUST=1' and add '--mini-lb' to the command line to use the Python version of 'mini_lb'."""
+ )