sglang 0.5.3rc2__py3-none-any.whl → 0.5.4.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (419) hide show
  1. sglang/bench_one_batch.py +47 -28
  2. sglang/bench_one_batch_server.py +41 -25
  3. sglang/bench_serving.py +378 -160
  4. sglang/check_env.py +1 -1
  5. sglang/compile_deep_gemm.py +6 -2
  6. sglang/global_config.py +1 -25
  7. sglang/lang/api.py +6 -0
  8. sglang/lang/interpreter.py +1 -0
  9. sglang/lang/ir.py +13 -0
  10. sglang/launch_server.py +10 -15
  11. sglang/profiler.py +18 -1
  12. sglang/srt/_custom_ops.py +1 -1
  13. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +105 -10
  14. sglang/srt/checkpoint_engine/checkpoint_engine_worker.py +142 -0
  15. sglang/srt/compilation/backend.py +437 -0
  16. sglang/srt/compilation/compilation_config.py +20 -0
  17. sglang/srt/compilation/compilation_counter.py +47 -0
  18. sglang/srt/compilation/compile.py +210 -0
  19. sglang/srt/compilation/compiler_interface.py +503 -0
  20. sglang/srt/compilation/cuda_piecewise_backend.py +228 -0
  21. sglang/srt/compilation/fix_functionalization.py +134 -0
  22. sglang/srt/compilation/fx_utils.py +83 -0
  23. sglang/srt/compilation/inductor_pass.py +140 -0
  24. sglang/srt/compilation/pass_manager.py +66 -0
  25. sglang/srt/compilation/piecewise_context_manager.py +40 -0
  26. sglang/srt/compilation/weak_ref_tensor_jit.py +16 -0
  27. sglang/srt/configs/__init__.py +4 -0
  28. sglang/srt/configs/deepseek_ocr.py +262 -0
  29. sglang/srt/configs/deepseekvl2.py +194 -96
  30. sglang/srt/configs/dots_vlm.py +2 -7
  31. sglang/srt/configs/falcon_h1.py +13 -64
  32. sglang/srt/configs/load_config.py +25 -2
  33. sglang/srt/configs/mamba_utils.py +117 -0
  34. sglang/srt/configs/model_config.py +136 -25
  35. sglang/srt/configs/modelopt_config.py +30 -0
  36. sglang/srt/configs/nemotron_h.py +286 -0
  37. sglang/srt/configs/olmo3.py +105 -0
  38. sglang/srt/configs/points_v15_chat.py +29 -0
  39. sglang/srt/configs/qwen3_next.py +11 -47
  40. sglang/srt/configs/qwen3_omni.py +613 -0
  41. sglang/srt/configs/qwen3_vl.py +0 -10
  42. sglang/srt/connector/remote_instance.py +1 -1
  43. sglang/srt/constrained/base_grammar_backend.py +5 -1
  44. sglang/srt/constrained/llguidance_backend.py +5 -0
  45. sglang/srt/constrained/outlines_backend.py +1 -1
  46. sglang/srt/constrained/reasoner_grammar_backend.py +9 -6
  47. sglang/srt/constrained/utils.py +12 -0
  48. sglang/srt/constrained/xgrammar_backend.py +20 -11
  49. sglang/srt/disaggregation/ascend/transfer_engine.py +1 -1
  50. sglang/srt/disaggregation/base/conn.py +17 -4
  51. sglang/srt/disaggregation/common/conn.py +4 -2
  52. sglang/srt/disaggregation/decode.py +123 -31
  53. sglang/srt/disaggregation/decode_kvcache_offload_manager.py +1 -1
  54. sglang/srt/disaggregation/fake/conn.py +11 -3
  55. sglang/srt/disaggregation/mooncake/conn.py +157 -19
  56. sglang/srt/disaggregation/nixl/conn.py +69 -24
  57. sglang/srt/disaggregation/prefill.py +96 -270
  58. sglang/srt/distributed/device_communicators/all_reduce_utils.py +4 -4
  59. sglang/srt/distributed/device_communicators/custom_all_reduce.py +6 -6
  60. sglang/srt/distributed/device_communicators/pymscclpp.py +2 -2
  61. sglang/srt/distributed/device_communicators/pynccl.py +24 -12
  62. sglang/srt/distributed/device_communicators/pynccl_allocator.py +2 -2
  63. sglang/srt/distributed/device_communicators/symm_mem.py +1 -1
  64. sglang/srt/distributed/naive_distributed.py +5 -4
  65. sglang/srt/distributed/parallel_state.py +63 -19
  66. sglang/srt/elastic_ep/elastic_ep.py +74 -0
  67. sglang/srt/entrypoints/context.py +3 -2
  68. sglang/srt/entrypoints/engine.py +83 -80
  69. sglang/srt/entrypoints/grpc_server.py +430 -234
  70. sglang/srt/entrypoints/harmony_utils.py +2 -2
  71. sglang/srt/entrypoints/http_server.py +195 -102
  72. sglang/srt/entrypoints/http_server_engine.py +1 -7
  73. sglang/srt/entrypoints/openai/protocol.py +225 -37
  74. sglang/srt/entrypoints/openai/serving_base.py +49 -2
  75. sglang/srt/entrypoints/openai/serving_chat.py +29 -74
  76. sglang/srt/entrypoints/openai/serving_classify.py +204 -0
  77. sglang/srt/entrypoints/openai/serving_completions.py +15 -1
  78. sglang/srt/entrypoints/openai/serving_responses.py +5 -2
  79. sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
  80. sglang/srt/environ.py +58 -6
  81. sglang/srt/eplb/eplb_algorithms/__init__.py +18 -1
  82. sglang/srt/eplb/eplb_algorithms/deepseek.py +0 -2
  83. sglang/srt/eplb/eplb_algorithms/elasticity_aware.py +87 -0
  84. sglang/srt/eplb/expert_distribution.py +33 -4
  85. sglang/srt/eplb/expert_location_dispatch.py +2 -2
  86. sglang/srt/eplb/expert_location_updater.py +2 -2
  87. sglang/srt/function_call/base_format_detector.py +17 -18
  88. sglang/srt/function_call/function_call_parser.py +20 -14
  89. sglang/srt/function_call/glm4_moe_detector.py +1 -5
  90. sglang/srt/function_call/gpt_oss_detector.py +1 -1
  91. sglang/srt/function_call/json_array_parser.py +0 -2
  92. sglang/srt/function_call/minimax_m2.py +367 -0
  93. sglang/srt/function_call/utils.py +2 -2
  94. sglang/srt/grpc/compile_proto.py +3 -3
  95. sglang/srt/{entrypoints → grpc}/grpc_request_manager.py +112 -52
  96. sglang/srt/grpc/health_servicer.py +189 -0
  97. sglang/srt/grpc/scheduler_launcher.py +181 -0
  98. sglang/srt/grpc/sglang_scheduler_pb2.py +78 -70
  99. sglang/srt/grpc/sglang_scheduler_pb2.pyi +66 -10
  100. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +89 -1
  101. sglang/srt/layers/activation.py +10 -1
  102. sglang/srt/layers/attention/aiter_backend.py +3 -3
  103. sglang/srt/layers/attention/ascend_backend.py +17 -1
  104. sglang/srt/layers/attention/attention_registry.py +43 -23
  105. sglang/srt/layers/attention/base_attn_backend.py +20 -1
  106. sglang/srt/layers/attention/double_sparsity_backend.py +2 -2
  107. sglang/srt/layers/attention/fla/chunk.py +0 -1
  108. sglang/srt/layers/attention/fla/chunk_o.py +1 -1
  109. sglang/srt/layers/attention/fla/index.py +0 -2
  110. sglang/srt/layers/attention/fla/layernorm_gated.py +50 -32
  111. sglang/srt/layers/attention/fla/utils.py +0 -3
  112. sglang/srt/layers/attention/fla/wy_fast.py +0 -2
  113. sglang/srt/layers/attention/flashattention_backend.py +24 -10
  114. sglang/srt/layers/attention/flashinfer_backend.py +258 -22
  115. sglang/srt/layers/attention/flashinfer_mla_backend.py +38 -28
  116. sglang/srt/layers/attention/flashmla_backend.py +2 -2
  117. sglang/srt/layers/attention/hybrid_attn_backend.py +1 -1
  118. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +165 -62
  119. sglang/srt/layers/attention/intel_amx_backend.py +1 -1
  120. sglang/srt/layers/attention/mamba/causal_conv1d.py +1 -1
  121. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +9 -5
  122. sglang/srt/layers/attention/mamba/mamba.py +189 -241
  123. sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
  124. sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
  125. sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +0 -50
  126. sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +0 -60
  127. sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +0 -111
  128. sglang/srt/layers/attention/mamba/ops/ssd_combined.py +0 -1
  129. sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +0 -11
  130. sglang/srt/layers/attention/npu_ops/mla_preprocess.py +1 -1
  131. sglang/srt/layers/attention/nsa/nsa_indexer.py +40 -83
  132. sglang/srt/layers/attention/nsa/triton_kernel.py +136 -0
  133. sglang/srt/layers/attention/nsa/utils.py +0 -1
  134. sglang/srt/layers/attention/nsa_backend.py +404 -90
  135. sglang/srt/layers/attention/triton_backend.py +208 -34
  136. sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +2 -2
  137. sglang/srt/layers/attention/triton_ops/extend_attention.py +539 -44
  138. sglang/srt/layers/attention/trtllm_mha_backend.py +2 -2
  139. sglang/srt/layers/attention/trtllm_mla_backend.py +362 -43
  140. sglang/srt/layers/attention/utils.py +89 -7
  141. sglang/srt/layers/attention/vision.py +3 -3
  142. sglang/srt/layers/attention/xpu_backend.py +1028 -0
  143. sglang/srt/layers/communicator.py +12 -7
  144. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/compile_utils.py +5 -9
  145. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/configurer.py +4 -3
  146. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/entrypoint.py +3 -3
  147. sglang/srt/layers/dp_attention.py +17 -0
  148. sglang/srt/layers/layernorm.py +64 -19
  149. sglang/srt/layers/linear.py +9 -1
  150. sglang/srt/layers/logits_processor.py +152 -17
  151. sglang/srt/layers/modelopt_utils.py +11 -0
  152. sglang/srt/layers/moe/cutlass_moe.py +0 -2
  153. sglang/srt/layers/moe/cutlass_w4a8_moe.py +351 -21
  154. sglang/srt/layers/moe/ep_moe/kernels.py +229 -457
  155. sglang/srt/layers/moe/ep_moe/layer.py +154 -625
  156. sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +1 -1
  157. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  158. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_B200.json +146 -0
  159. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +11 -3
  160. sglang/srt/layers/moe/fused_moe_triton/layer.py +79 -73
  161. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +25 -46
  162. sglang/srt/layers/moe/moe_runner/deep_gemm.py +569 -0
  163. sglang/srt/layers/moe/moe_runner/runner.py +6 -0
  164. sglang/srt/layers/moe/moe_runner/triton.py +3 -1
  165. sglang/srt/layers/moe/moe_runner/triton_kernels.py +194 -0
  166. sglang/srt/layers/moe/rocm_moe_utils.py +0 -1
  167. sglang/srt/layers/moe/router.py +51 -15
  168. sglang/srt/layers/moe/token_dispatcher/__init__.py +14 -4
  169. sglang/srt/layers/moe/token_dispatcher/base.py +12 -6
  170. sglang/srt/layers/moe/token_dispatcher/deepep.py +127 -110
  171. sglang/srt/layers/moe/token_dispatcher/mooncake.py +386 -0
  172. sglang/srt/layers/moe/token_dispatcher/standard.py +46 -0
  173. sglang/srt/layers/moe/topk.py +7 -6
  174. sglang/srt/layers/moe/utils.py +20 -5
  175. sglang/srt/layers/quantization/__init__.py +5 -58
  176. sglang/srt/layers/quantization/awq.py +183 -9
  177. sglang/srt/layers/quantization/awq_triton.py +29 -0
  178. sglang/srt/layers/quantization/base_config.py +27 -1
  179. sglang/srt/layers/quantization/compressed_tensors/__init__.py +7 -0
  180. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +20 -49
  181. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +421 -70
  182. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +3 -0
  183. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +4 -22
  184. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +339 -0
  185. sglang/srt/layers/quantization/fp8.py +152 -81
  186. sglang/srt/layers/quantization/fp8_kernel.py +55 -10
  187. sglang/srt/layers/quantization/fp8_utils.py +42 -14
  188. sglang/srt/layers/quantization/fpgemm_fp8.py +2 -3
  189. sglang/srt/layers/quantization/gguf.py +566 -0
  190. sglang/srt/layers/quantization/gptq.py +0 -1
  191. sglang/srt/layers/quantization/int8_kernel.py +18 -2
  192. sglang/srt/layers/quantization/marlin_utils.py +12 -0
  193. sglang/srt/layers/quantization/modelopt_quant.py +125 -100
  194. sglang/srt/layers/quantization/mxfp4.py +35 -68
  195. sglang/srt/layers/quantization/petit.py +1 -1
  196. sglang/srt/layers/quantization/quark/quark.py +3 -1
  197. sglang/srt/layers/quantization/quark/quark_moe.py +3 -3
  198. sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +0 -7
  199. sglang/srt/layers/quantization/unquant.py +23 -48
  200. sglang/srt/layers/quantization/utils.py +0 -1
  201. sglang/srt/layers/quantization/w4afp8.py +87 -20
  202. sglang/srt/layers/quantization/w8a8_int8.py +30 -24
  203. sglang/srt/layers/radix_attention.py +62 -9
  204. sglang/srt/layers/rotary_embedding.py +686 -17
  205. sglang/srt/layers/sampler.py +47 -16
  206. sglang/srt/layers/sparse_pooler.py +98 -0
  207. sglang/srt/layers/utils.py +0 -1
  208. sglang/srt/layers/vocab_parallel_embedding.py +4 -1
  209. sglang/srt/lora/backend/triton_backend.py +0 -1
  210. sglang/srt/lora/eviction_policy.py +139 -0
  211. sglang/srt/lora/lora_manager.py +24 -9
  212. sglang/srt/lora/lora_registry.py +1 -1
  213. sglang/srt/lora/mem_pool.py +40 -16
  214. sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +1 -1
  215. sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +4 -2
  216. sglang/srt/managers/cache_controller.py +48 -17
  217. sglang/srt/managers/data_parallel_controller.py +146 -42
  218. sglang/srt/managers/detokenizer_manager.py +40 -13
  219. sglang/srt/managers/io_struct.py +69 -16
  220. sglang/srt/managers/mm_utils.py +20 -18
  221. sglang/srt/managers/multi_tokenizer_mixin.py +83 -82
  222. sglang/srt/managers/overlap_utils.py +96 -19
  223. sglang/srt/managers/schedule_batch.py +241 -511
  224. sglang/srt/managers/schedule_policy.py +15 -2
  225. sglang/srt/managers/scheduler.py +420 -514
  226. sglang/srt/managers/scheduler_metrics_mixin.py +73 -18
  227. sglang/srt/managers/scheduler_output_processor_mixin.py +317 -111
  228. sglang/srt/managers/scheduler_pp_mixin.py +341 -0
  229. sglang/srt/managers/scheduler_profiler_mixin.py +60 -14
  230. sglang/srt/managers/scheduler_runtime_checker_mixin.py +217 -0
  231. sglang/srt/managers/scheduler_update_weights_mixin.py +33 -14
  232. sglang/srt/managers/tokenizer_communicator_mixin.py +71 -55
  233. sglang/srt/managers/tokenizer_manager.py +375 -95
  234. sglang/srt/managers/tp_worker.py +212 -161
  235. sglang/srt/managers/utils.py +78 -2
  236. sglang/srt/mem_cache/allocator.py +7 -2
  237. sglang/srt/mem_cache/allocator_ascend.py +2 -2
  238. sglang/srt/mem_cache/base_prefix_cache.py +2 -2
  239. sglang/srt/mem_cache/chunk_cache.py +13 -2
  240. sglang/srt/mem_cache/common.py +480 -0
  241. sglang/srt/mem_cache/evict_policy.py +16 -1
  242. sglang/srt/mem_cache/hicache_storage.py +11 -2
  243. sglang/srt/mem_cache/hiradix_cache.py +16 -3
  244. sglang/srt/mem_cache/mamba_radix_cache.py +993 -0
  245. sglang/srt/mem_cache/memory_pool.py +517 -219
  246. sglang/srt/mem_cache/memory_pool_host.py +0 -1
  247. sglang/srt/mem_cache/multimodal_cache.py +0 -1
  248. sglang/srt/mem_cache/radix_cache.py +53 -19
  249. sglang/srt/mem_cache/radix_cache_cpp.py +19 -14
  250. sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +8 -2
  251. sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +1 -13
  252. sglang/srt/mem_cache/storage/backend_factory.py +2 -2
  253. sglang/srt/mem_cache/storage/eic/eic_storage.py +5 -6
  254. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +0 -1
  255. sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +3 -2
  256. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +9 -3
  257. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +5 -3
  258. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +101 -17
  259. sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +38 -9
  260. sglang/srt/mem_cache/storage/nixl/nixl_utils.py +1 -1
  261. sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +17 -2
  262. sglang/srt/mem_cache/swa_radix_cache.py +92 -26
  263. sglang/srt/metrics/collector.py +31 -0
  264. sglang/srt/metrics/func_timer.py +1 -1
  265. sglang/srt/model_executor/cuda_graph_runner.py +43 -5
  266. sglang/srt/model_executor/forward_batch_info.py +71 -25
  267. sglang/srt/model_executor/model_runner.py +362 -270
  268. sglang/srt/model_executor/npu_graph_runner.py +2 -3
  269. sglang/srt/model_executor/piecewise_cuda_graph_runner.py +549 -0
  270. sglang/srt/model_loader/__init__.py +1 -1
  271. sglang/srt/model_loader/loader.py +424 -27
  272. sglang/srt/model_loader/utils.py +0 -1
  273. sglang/srt/model_loader/weight_utils.py +47 -28
  274. sglang/srt/models/apertus.py +2 -3
  275. sglang/srt/models/arcee.py +2 -2
  276. sglang/srt/models/bailing_moe.py +13 -52
  277. sglang/srt/models/bailing_moe_nextn.py +3 -4
  278. sglang/srt/models/bert.py +1 -1
  279. sglang/srt/models/deepseek_nextn.py +19 -3
  280. sglang/srt/models/deepseek_ocr.py +1516 -0
  281. sglang/srt/models/deepseek_v2.py +418 -140
  282. sglang/srt/models/dots_ocr.py +0 -2
  283. sglang/srt/models/dots_vlm.py +0 -1
  284. sglang/srt/models/dots_vlm_vit.py +1 -1
  285. sglang/srt/models/falcon_h1.py +13 -19
  286. sglang/srt/models/gemma3_mm.py +16 -0
  287. sglang/srt/models/gemma3n_mm.py +1 -2
  288. sglang/srt/models/glm4_moe.py +327 -382
  289. sglang/srt/models/glm4_moe_nextn.py +6 -16
  290. sglang/srt/models/glm4v.py +2 -1
  291. sglang/srt/models/glm4v_moe.py +32 -199
  292. sglang/srt/models/gpt_oss.py +5 -5
  293. sglang/srt/models/grok.py +10 -23
  294. sglang/srt/models/hunyuan.py +2 -7
  295. sglang/srt/models/interns1.py +0 -1
  296. sglang/srt/models/kimi_vl.py +1 -7
  297. sglang/srt/models/kimi_vl_moonvit.py +3 -1
  298. sglang/srt/models/llama.py +2 -2
  299. sglang/srt/models/llama_eagle3.py +1 -1
  300. sglang/srt/models/longcat_flash.py +5 -22
  301. sglang/srt/models/longcat_flash_nextn.py +3 -14
  302. sglang/srt/models/mimo.py +2 -13
  303. sglang/srt/models/mimo_mtp.py +1 -2
  304. sglang/srt/models/minicpmo.py +7 -5
  305. sglang/srt/models/minimax_m2.py +922 -0
  306. sglang/srt/models/mixtral.py +1 -4
  307. sglang/srt/models/mllama.py +1 -1
  308. sglang/srt/models/mllama4.py +13 -3
  309. sglang/srt/models/nemotron_h.py +511 -0
  310. sglang/srt/models/nvila.py +355 -0
  311. sglang/srt/models/nvila_lite.py +184 -0
  312. sglang/srt/models/olmo2.py +31 -4
  313. sglang/srt/models/opt.py +5 -5
  314. sglang/srt/models/phi.py +1 -1
  315. sglang/srt/models/phi4mm.py +1 -1
  316. sglang/srt/models/phimoe.py +0 -1
  317. sglang/srt/models/pixtral.py +0 -3
  318. sglang/srt/models/points_v15_chat.py +186 -0
  319. sglang/srt/models/qwen.py +0 -1
  320. sglang/srt/models/qwen2.py +22 -1
  321. sglang/srt/models/qwen2_5_vl.py +3 -3
  322. sglang/srt/models/qwen2_audio.py +2 -15
  323. sglang/srt/models/qwen2_moe.py +15 -12
  324. sglang/srt/models/qwen2_vl.py +5 -2
  325. sglang/srt/models/qwen3.py +34 -4
  326. sglang/srt/models/qwen3_moe.py +19 -37
  327. sglang/srt/models/qwen3_next.py +7 -12
  328. sglang/srt/models/qwen3_next_mtp.py +3 -4
  329. sglang/srt/models/qwen3_omni_moe.py +661 -0
  330. sglang/srt/models/qwen3_vl.py +37 -33
  331. sglang/srt/models/qwen3_vl_moe.py +57 -185
  332. sglang/srt/models/roberta.py +55 -3
  333. sglang/srt/models/sarashina2_vision.py +0 -1
  334. sglang/srt/models/step3_vl.py +3 -5
  335. sglang/srt/models/utils.py +11 -1
  336. sglang/srt/multimodal/processors/base_processor.py +7 -2
  337. sglang/srt/multimodal/processors/deepseek_ocr.py +37 -0
  338. sglang/srt/multimodal/processors/deepseek_vl_v2.py +0 -3
  339. sglang/srt/multimodal/processors/dots_vlm.py +0 -1
  340. sglang/srt/multimodal/processors/glm4v.py +2 -6
  341. sglang/srt/multimodal/processors/internvl.py +0 -2
  342. sglang/srt/multimodal/processors/janus_pro.py +0 -1
  343. sglang/srt/multimodal/processors/mllama4.py +0 -8
  344. sglang/srt/multimodal/processors/{vila.py → nvila.py} +32 -24
  345. sglang/srt/multimodal/processors/phi4mm.py +0 -1
  346. sglang/srt/multimodal/processors/points_v15_chat.py +52 -0
  347. sglang/srt/multimodal/processors/qwen_vl.py +75 -16
  348. sglang/srt/multimodal/processors/step3_vl.py +1 -1
  349. sglang/srt/parser/conversation.py +41 -0
  350. sglang/srt/parser/reasoning_parser.py +28 -2
  351. sglang/srt/sampling/custom_logit_processor.py +77 -2
  352. sglang/srt/sampling/sampling_batch_info.py +17 -22
  353. sglang/srt/sampling/sampling_params.py +70 -2
  354. sglang/srt/server_args.py +846 -163
  355. sglang/srt/server_args_config_parser.py +1 -1
  356. sglang/srt/single_batch_overlap.py +36 -31
  357. sglang/srt/speculative/base_spec_worker.py +34 -0
  358. sglang/srt/speculative/draft_utils.py +226 -0
  359. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +24 -7
  360. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +23 -2
  361. sglang/srt/speculative/eagle_info.py +57 -18
  362. sglang/srt/speculative/eagle_info_v2.py +458 -0
  363. sglang/srt/speculative/eagle_utils.py +138 -0
  364. sglang/srt/speculative/eagle_worker.py +83 -280
  365. sglang/srt/speculative/eagle_worker_v2.py +702 -0
  366. sglang/srt/speculative/{ngram_utils.py → ngram_info.py} +14 -9
  367. sglang/srt/speculative/ngram_worker.py +12 -11
  368. sglang/srt/speculative/spec_info.py +2 -0
  369. sglang/srt/speculative/spec_utils.py +38 -3
  370. sglang/srt/speculative/standalone_worker.py +4 -14
  371. sglang/srt/tokenizer/tiktoken_tokenizer.py +2 -2
  372. sglang/srt/two_batch_overlap.py +28 -14
  373. sglang/srt/utils/__init__.py +1 -1
  374. sglang/srt/{bench_utils.py → utils/bench_utils.py} +4 -2
  375. sglang/srt/utils/common.py +272 -82
  376. sglang/srt/utils/hf_transformers_utils.py +44 -17
  377. sglang/srt/{host_shared_memory.py → utils/host_shared_memory.py} +0 -1
  378. sglang/srt/{offloader.py → utils/offloader.py} +4 -4
  379. sglang/srt/utils/profile_merger.py +199 -0
  380. sglang/test/attention/test_flashattn_backend.py +1 -1
  381. sglang/test/attention/test_flashattn_mla_backend.py +0 -1
  382. sglang/test/attention/test_prefix_chunk_info.py +0 -2
  383. sglang/test/attention/test_trtllm_mla_backend.py +221 -53
  384. sglang/test/few_shot_gsm8k_engine.py +2 -4
  385. sglang/test/kit_matched_stop.py +157 -0
  386. sglang/test/longbench_v2/__init__.py +1 -0
  387. sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
  388. sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
  389. sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
  390. sglang/test/run_eval.py +41 -0
  391. sglang/test/runners.py +2 -0
  392. sglang/test/send_one.py +42 -7
  393. sglang/test/simple_eval_common.py +3 -0
  394. sglang/test/simple_eval_gpqa.py +0 -1
  395. sglang/test/simple_eval_humaneval.py +0 -3
  396. sglang/test/simple_eval_longbench_v2.py +344 -0
  397. sglang/test/test_block_fp8.py +1 -2
  398. sglang/test/test_block_fp8_deep_gemm_blackwell.py +0 -1
  399. sglang/test/test_cutlass_moe.py +1 -2
  400. sglang/test/test_cutlass_w4a8_moe.py +10 -20
  401. sglang/test/test_deterministic.py +463 -107
  402. sglang/test/test_deterministic_utils.py +74 -0
  403. sglang/test/test_disaggregation_utils.py +81 -0
  404. sglang/test/test_marlin_moe.py +0 -1
  405. sglang/test/test_utils.py +85 -20
  406. sglang/version.py +1 -1
  407. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/METADATA +48 -35
  408. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/RECORD +414 -350
  409. sglang/srt/layers/attention/mamba/mamba_utils.py +0 -81
  410. sglang/srt/managers/tp_worker_overlap_thread.py +0 -311
  411. sglang/srt/models/vila.py +0 -306
  412. sglang/srt/speculative/build_eagle_tree.py +0 -427
  413. sglang/test/test_block_fp8_ep.py +0 -358
  414. /sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/__init__.py +0 -0
  415. /sglang/srt/{aio_rwlock.py → utils/aio_rwlock.py} +0 -0
  416. /sglang/srt/{torch_memory_saver_adapter.py → utils/torch_memory_saver_adapter.py} +0 -0
  417. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/WHEEL +0 -0
  418. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/licenses/LICENSE +0 -0
  419. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/top_level.txt +0 -0
@@ -1,10 +1,9 @@
1
- import base64
2
- import os
3
1
  import pickle
4
2
  import time
5
3
  from pathlib import Path
6
4
  from typing import Any, List, Optional
7
5
 
6
+ import pybase64
8
7
  import torch
9
8
 
10
9
  from sglang.srt.utils import MultiprocessingSerializer
@@ -78,14 +77,16 @@ class NaiveDistributed:
78
77
  )
79
78
 
80
79
  _get_path(self._rank).write_text(
81
- base64.b64encode(pickle.dumps(obj)).decode("utf-8") + text_postfix
80
+ pybase64.b64encode(pickle.dumps(obj)).decode("utf-8") + text_postfix
82
81
  )
83
82
 
84
83
  def _read_one(interesting_rank: int):
85
84
  p = _get_path(interesting_rank)
86
85
  while True:
87
86
  if p.exists() and (text := p.read_text()).endswith(text_postfix):
88
- return pickle.loads(base64.b64decode(text[: -len(text_postfix)]))
87
+ return pickle.loads(
88
+ pybase64.b64decode(text[: -len(text_postfix)], validate=True)
89
+ )
89
90
  time.sleep(0.001)
90
91
 
91
92
  return [
@@ -39,24 +39,26 @@ import torch
39
39
  import torch.distributed
40
40
  from torch.distributed import Backend, ProcessGroup
41
41
 
42
+ from sglang.srt.environ import envs
42
43
  from sglang.srt.utils import (
43
44
  direct_register_custom_op,
44
45
  get_bool_env_var,
45
46
  get_int_env_var,
47
+ get_local_ip_auto,
46
48
  is_cpu,
47
49
  is_cuda_alike,
48
50
  is_hip,
49
51
  is_npu,
50
52
  is_shm_available,
53
+ is_xpu,
51
54
  supports_custom_op,
52
55
  )
53
56
 
54
57
  _is_npu = is_npu()
55
58
  _is_cpu = is_cpu()
59
+ _is_xpu = is_xpu()
56
60
  _supports_custom_op = supports_custom_op()
57
61
 
58
- IS_ONE_DEVICE_PER_PROCESS = get_bool_env_var("SGLANG_ONE_DEVICE_PER_PROCESS")
59
-
60
62
 
61
63
  TensorMetadata = namedtuple("TensorMetadata", ["device", "dtype", "size"])
62
64
 
@@ -239,6 +241,9 @@ class GroupCoordinator:
239
241
  use_npu_communicator: bool,
240
242
  use_message_queue_broadcaster: bool = False,
241
243
  group_name: Optional[str] = None,
244
+ pynccl_use_current_stream: bool = False,
245
+ torch_compile: Optional[bool] = None,
246
+ gloo_timeout: timedelta = timedelta(seconds=120 * 60),
242
247
  ):
243
248
  # Set group info
244
249
  group_name = group_name or "anonymous"
@@ -256,9 +261,14 @@ class GroupCoordinator:
256
261
  device_group = torch.distributed.new_group(
257
262
  ranks, backend=torch_distributed_backend
258
263
  )
259
- # a group with `gloo` backend, to allow direct coordination between
260
- # processes through the CPU.
261
- cpu_group = torch.distributed.new_group(ranks, backend="gloo")
264
+ # a cpu_group to allow direct coordination between processes through
265
+ # the CPU. The backend is chosen based on `torch_distributed_backend`
266
+ if "mooncake" in torch_distributed_backend:
267
+ cpu_group = torch.distributed.new_group(ranks, backend="mooncake-cpu")
268
+ else:
269
+ cpu_group = torch.distributed.new_group(
270
+ ranks, backend="gloo", timeout=gloo_timeout
271
+ )
262
272
  if self.rank in ranks:
263
273
  self.ranks = ranks
264
274
  self.world_size = len(ranks)
@@ -269,17 +279,20 @@ class GroupCoordinator:
269
279
  assert self.cpu_group is not None
270
280
  assert self.device_group is not None
271
281
 
272
- device_id = 0 if IS_ONE_DEVICE_PER_PROCESS else local_rank
273
282
  if is_cuda_alike():
283
+ device_id = (
284
+ 0 if envs.SGLANG_ONE_VISIBLE_DEVICE_PER_PROCESS.get() else local_rank
285
+ )
274
286
  self.device = torch.device(f"cuda:{device_id}")
275
287
  elif _is_npu:
276
- self.device = torch.device(f"npu:{device_id}")
288
+ self.device = torch.device(f"npu:{local_rank}")
277
289
  else:
278
290
  self.device = torch.device("cpu")
279
291
  self.device_module = torch.get_device_module(self.device)
280
292
 
281
293
  # Import communicators
282
294
  self.use_pynccl = use_pynccl
295
+ self.pynccl_use_current_stream = pynccl_use_current_stream
283
296
  self.use_pymscclpp = use_pymscclpp
284
297
  self.use_custom_allreduce = use_custom_allreduce
285
298
  self.use_torch_symm_mem = use_torch_symm_mem
@@ -313,6 +326,7 @@ class GroupCoordinator:
313
326
  self.pynccl_comm = PyNcclCommunicator(
314
327
  group=self.cpu_group,
315
328
  device=self.device,
329
+ use_current_stream=pynccl_use_current_stream,
316
330
  )
317
331
 
318
332
  self.pymscclpp_comm: Optional[PyMscclppCommunicator] = None
@@ -433,10 +447,13 @@ class GroupCoordinator:
433
447
 
434
448
  @contextmanager
435
449
  def graph_capture(
436
- self, graph_capture_context: Optional[GraphCaptureContext] = None
450
+ self,
451
+ graph_capture_context: Optional[GraphCaptureContext] = None,
452
+ stream: Optional[torch.cuda.Stream] = None,
437
453
  ):
438
454
  if graph_capture_context is None:
439
- stream = self.device_module.Stream()
455
+ if stream is None:
456
+ stream = self.device_module.Stream()
440
457
  graph_capture_context = GraphCaptureContext(stream)
441
458
  else:
442
459
  stream = graph_capture_context.stream
@@ -603,8 +620,11 @@ class GroupCoordinator:
603
620
 
604
621
  def _all_reduce_in_place(self, input_: torch.Tensor) -> None:
605
622
  pynccl_comm = self.pynccl_comm
623
+ symm_mem_comm = self.symm_mem_comm
606
624
  if pynccl_comm is not None and not pynccl_comm.disabled:
607
625
  pynccl_comm.all_reduce(input_)
626
+ elif symm_mem_comm is not None and not symm_mem_comm.disabled:
627
+ symm_mem_comm.all_reduce(input_)
608
628
  else:
609
629
  torch.distributed.all_reduce(input_, group=self.device_group)
610
630
 
@@ -669,7 +689,7 @@ class GroupCoordinator:
669
689
  )
670
690
 
671
691
  def all_gather_into_tensor(self, output: torch.Tensor, input: torch.Tensor):
672
- if _is_npu or not _supports_custom_op:
692
+ if _is_npu or _is_xpu or not _supports_custom_op:
673
693
  self._all_gather_into_tensor(output, input)
674
694
  else:
675
695
  torch.ops.sglang.reg_all_gather_into_tensor(
@@ -1259,7 +1279,9 @@ def init_model_parallel_group(
1259
1279
  use_message_queue_broadcaster: bool = False,
1260
1280
  group_name: Optional[str] = None,
1261
1281
  use_mscclpp_allreduce: Optional[bool] = None,
1282
+ pynccl_use_current_stream: bool = True,
1262
1283
  use_symm_mem_allreduce: Optional[bool] = None,
1284
+ torch_compile: Optional[bool] = None,
1263
1285
  ) -> GroupCoordinator:
1264
1286
  if use_custom_allreduce is None:
1265
1287
  use_custom_allreduce = _ENABLE_CUSTOM_ALL_REDUCE
@@ -1271,7 +1293,7 @@ def init_model_parallel_group(
1271
1293
  group_ranks=group_ranks,
1272
1294
  local_rank=local_rank,
1273
1295
  torch_distributed_backend=backend,
1274
- use_pynccl=not _is_npu,
1296
+ use_pynccl=not (_is_npu or _is_xpu),
1275
1297
  use_pymscclpp=use_mscclpp_allreduce,
1276
1298
  use_custom_allreduce=use_custom_allreduce,
1277
1299
  use_torch_symm_mem=use_symm_mem_allreduce,
@@ -1280,6 +1302,8 @@ def init_model_parallel_group(
1280
1302
  use_npu_communicator=True,
1281
1303
  use_message_queue_broadcaster=use_message_queue_broadcaster,
1282
1304
  group_name=group_name,
1305
+ pynccl_use_current_stream=pynccl_use_current_stream,
1306
+ torch_compile=torch_compile,
1283
1307
  )
1284
1308
 
1285
1309
 
@@ -1336,7 +1360,7 @@ get_pipeline_model_parallel_group = get_pp_group
1336
1360
 
1337
1361
 
1338
1362
  @contextmanager
1339
- def graph_capture():
1363
+ def graph_capture(stream: Optional[torch.cuda.Stream] = None):
1340
1364
  """
1341
1365
  `graph_capture` is a context manager which should surround the code that
1342
1366
  is capturing the CUDA graph. Its main purpose is to ensure that the
@@ -1350,9 +1374,9 @@ def graph_capture():
1350
1374
  in order to explicitly distinguish the kernels to capture
1351
1375
  from other kernels possibly launched on background in the default stream.
1352
1376
  """
1353
- with get_tp_group().graph_capture() as context, get_pp_group().graph_capture(
1354
- context
1355
- ):
1377
+ with get_tp_group().graph_capture(
1378
+ stream=stream
1379
+ ) as context, get_pp_group().graph_capture(context):
1356
1380
  yield context
1357
1381
 
1358
1382
 
@@ -1394,6 +1418,17 @@ def init_distributed_environment(
1394
1418
  distributed_init_method,
1395
1419
  backend,
1396
1420
  )
1421
+ if "mooncake" in backend:
1422
+ try:
1423
+ from mooncake import ep as mooncake_ep
1424
+ except ImportError as e:
1425
+ raise ImportError(
1426
+ "Please install mooncake by following the instructions at "
1427
+ "https://github.com/kvcache-ai/Mooncake/blob/main/doc/en/build.md " # noqa: E501
1428
+ "to run SGLang with Mooncake Backend."
1429
+ ) from e
1430
+ mooncake_ep.set_host_ip(get_local_ip_auto())
1431
+
1397
1432
  if not torch.distributed.is_initialized():
1398
1433
  assert distributed_init_method is not None, (
1399
1434
  "distributed_init_method must be provided when initializing "
@@ -1439,6 +1474,7 @@ def initialize_model_parallel(
1439
1474
  pipeline_model_parallel_size: int = 1,
1440
1475
  backend: Optional[str] = None,
1441
1476
  duplicate_tp_group: bool = False,
1477
+ torch_compile: Optional[bool] = None,
1442
1478
  ) -> None:
1443
1479
  """
1444
1480
  Initialize model parallel groups.
@@ -1494,6 +1530,8 @@ def initialize_model_parallel(
1494
1530
  "SGLANG_USE_MESSAGE_QUEUE_BROADCASTER", "true"
1495
1531
  ),
1496
1532
  group_name="tp",
1533
+ pynccl_use_current_stream=duplicate_tp_group,
1534
+ torch_compile=torch_compile,
1497
1535
  )
1498
1536
 
1499
1537
  if duplicate_tp_group:
@@ -1509,16 +1547,18 @@ def initialize_model_parallel(
1509
1547
  "SGLANG_USE_MESSAGE_QUEUE_BROADCASTER", "true"
1510
1548
  ),
1511
1549
  group_name="pdmux_prefill_tp",
1550
+ pynccl_use_current_stream=True,
1551
+ torch_compile=torch_compile,
1512
1552
  )
1513
- _TP.pynccl_comm.disabled = False
1514
- _PDMUX_PREFILL_TP_GROUP.pynccl_comm.disabled = False
1553
+ if _TP.pynccl_comm:
1554
+ _TP.pynccl_comm.disabled = False
1555
+ _PDMUX_PREFILL_TP_GROUP.pynccl_comm.disabled = False
1515
1556
 
1516
1557
  moe_ep_size = expert_model_parallel_size
1517
1558
  moe_tp_size = tensor_model_parallel_size // moe_ep_size
1518
1559
 
1519
1560
  global _MOE_EP
1520
1561
  assert _MOE_EP is None, "expert model parallel group is already initialized"
1521
-
1522
1562
  if moe_ep_size == tensor_model_parallel_size:
1523
1563
  _MOE_EP = _TP
1524
1564
  else:
@@ -1539,7 +1579,6 @@ def initialize_model_parallel(
1539
1579
 
1540
1580
  global _MOE_TP
1541
1581
  assert _MOE_TP is None, "expert model parallel group is already initialized"
1542
-
1543
1582
  if moe_tp_size == tensor_model_parallel_size:
1544
1583
  _MOE_TP = _TP
1545
1584
  else:
@@ -1704,6 +1743,11 @@ def destroy_model_parallel():
1704
1743
  _PP.destroy()
1705
1744
  _PP = None
1706
1745
 
1746
+ global _PDMUX_PREFILL_TP_GROUP
1747
+ if _PDMUX_PREFILL_TP_GROUP: # type: ignore[union-attr]
1748
+ _PDMUX_PREFILL_TP_GROUP.destroy()
1749
+ _PDMUX_PREFILL_TP_GROUP = None
1750
+
1707
1751
 
1708
1752
  def destroy_distributed_environment():
1709
1753
  global _WORLD
@@ -0,0 +1,74 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+ from typing import Optional
5
+
6
+ import torch
7
+
8
+ from sglang.srt.managers.schedule_batch import ServerArgs
9
+ from sglang.srt.utils import is_cpu, is_cuda
10
+
11
+
12
@dataclass
class ElasticEPState:
    """Container for the rank-activity tensors used by elastic EP.

    Every field may be ``None`` (the annotations are ``Optional``); all
    methods below tolerate unset fields instead of raising.
    """

    # Current activity mask (presumably one entry per EP rank -- confirm
    # against the code that builds this state).
    active_ranks: Optional[torch.Tensor]
    # Copy of ``active_ranks`` taken by ``snapshot_active_to_last``.
    last_active_ranks: Optional[torch.Tensor]
    # Detached CPU copy of ``active_ranks`` taken by ``sync_active_to_cpu``.
    active_ranks_cpu: Optional[torch.Tensor]

    def is_active_equal_last(self) -> bool:
        """Return whether the current mask matches the last snapshot.

        Two unset masks compare equal; an unset mask never equals a set one.
        Without this guard ``torch.equal`` raises ``TypeError`` on ``None``.
        """
        current, previous = self.active_ranks, self.last_active_ranks
        if current is None or previous is None:
            return current is None and previous is None
        return torch.equal(current, previous)

    def sync_active_to_cpu(self):
        """Refresh ``active_ranks_cpu`` from ``active_ranks`` (no-op if unset)."""
        if self.active_ranks is not None:
            # clone() so the CPU copy never aliases a mask that already
            # lives on the CPU.
            self.active_ranks_cpu = self.active_ranks.detach().cpu().clone()

    def snapshot_active_to_last(self):
        """Store a copy of ``active_ranks`` in ``last_active_ranks`` (no-op if unset)."""
        if self.active_ranks is not None:
            self.last_active_ranks = self.active_ranks.clone()
28
+
29
+
30
class ElasticEPStateManager:
    """Class-level holder for one shared ``ElasticEPState``."""

    _instance: Optional[ElasticEPState] = None

    @classmethod
    def instance(cls) -> ElasticEPState:
        """Return the stored state; this is ``None`` until ``init`` builds one."""
        return cls._instance

    @classmethod
    def init(cls, server_args: ServerArgs):
        """Build the shared state once, if an elastic EP backend is configured.

        An existing state is returned unchanged. When
        ``server_args.elastic_ep_backend`` is ``None`` nothing is built and
        the (still ``None``) stored value is returned.
        """
        should_build = (
            cls._instance is None and server_args.elastic_ep_backend is not None
        )
        if should_build:
            cls._instance = cls._build_state(ep_size=None, device=None)
        return cls._instance

    @staticmethod
    def _select_device() -> torch.device:
        """Pick the torch device for the masks: CUDA first, then CPU.

        Raises:
            NotImplementedError: if neither ``is_cuda()`` nor ``is_cpu()`` holds.
        """
        if is_cuda():
            return torch.device("cuda")
        if is_cpu():
            return torch.device("cpu")
        raise NotImplementedError("Only CUDA and CPU support elastic ep now.")

    @classmethod
    def _build_state(
        cls, *, ep_size: Optional[int] = None, device: Optional[torch.device] = None
    ) -> ElasticEPState:
        """Create a state whose three masks start as copies of the healthy mask."""
        healthy = cls.healthy_rank_state(ep_size=ep_size, device=device)
        previous = healthy.clone()
        host_copy = healthy.detach().cpu().clone()
        return ElasticEPState(
            active_ranks=healthy,
            last_active_ranks=previous,
            active_ranks_cpu=host_copy,
        )

    @classmethod
    def healthy_rank_state(
        cls, *, ep_size: Optional[int] = None, device: Optional[torch.device] = None
    ) -> torch.Tensor:
        """Return an all-ones ``int32`` tensor of length ``ep_size`` on ``device``.

        ``ep_size`` defaults to ``torch.distributed.get_world_size()`` and
        ``device`` to ``_select_device()`` when they are not supplied.
        """
        if ep_size is None:
            ep_size = torch.distributed.get_world_size()
        if device is None:
            device = cls._select_device()
        return torch.ones(ep_size, dtype=torch.int32, device=device)
@@ -1,10 +1,11 @@
1
1
  # SPDX-License-Identifier: Apache-2.0
2
2
  # Copied from vLLM
3
- import json
4
3
  import logging
5
4
  from abc import ABC, abstractmethod
6
5
  from typing import Union
7
6
 
7
+ import orjson
8
+
8
9
  logger = logging.getLogger(__name__)
9
10
 
10
11
  try:
@@ -148,7 +149,7 @@ class HarmonyContext(ConversationContext):
148
149
  if isinstance(tool_session, Tool):
149
150
  return await tool_session.get_result(self)
150
151
  tool_name = last_msg.recipient.split(".")[1]
151
- args = json.loads(last_msg.content[0].text)
152
+ args = orjson.loads(last_msg.content[0].text)
152
153
  result = await tool_session.call_tool(tool_name, args)
153
154
  result_str = result.content[0].text
154
155
  content = TextContent(text=result_str)