sglang 0.5.3rc2__py3-none-any.whl → 0.5.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (408)
  1. sglang/bench_one_batch.py +47 -28
  2. sglang/bench_one_batch_server.py +41 -25
  3. sglang/bench_serving.py +330 -156
  4. sglang/check_env.py +1 -1
  5. sglang/compile_deep_gemm.py +6 -2
  6. sglang/global_config.py +1 -25
  7. sglang/lang/api.py +6 -0
  8. sglang/lang/interpreter.py +1 -0
  9. sglang/lang/ir.py +13 -0
  10. sglang/launch_server.py +8 -15
  11. sglang/profiler.py +18 -1
  12. sglang/srt/_custom_ops.py +1 -1
  13. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +4 -6
  14. sglang/srt/checkpoint_engine/checkpoint_engine_worker.py +142 -0
  15. sglang/srt/compilation/backend.py +437 -0
  16. sglang/srt/compilation/compilation_config.py +20 -0
  17. sglang/srt/compilation/compilation_counter.py +47 -0
  18. sglang/srt/compilation/compile.py +210 -0
  19. sglang/srt/compilation/compiler_interface.py +503 -0
  20. sglang/srt/compilation/cuda_piecewise_backend.py +228 -0
  21. sglang/srt/compilation/fix_functionalization.py +134 -0
  22. sglang/srt/compilation/fx_utils.py +83 -0
  23. sglang/srt/compilation/inductor_pass.py +140 -0
  24. sglang/srt/compilation/pass_manager.py +66 -0
  25. sglang/srt/compilation/piecewise_context_manager.py +40 -0
  26. sglang/srt/compilation/weak_ref_tensor_jit.py +16 -0
  27. sglang/srt/configs/__init__.py +4 -0
  28. sglang/srt/configs/deepseek_ocr.py +262 -0
  29. sglang/srt/configs/deepseekvl2.py +194 -96
  30. sglang/srt/configs/dots_vlm.py +2 -7
  31. sglang/srt/configs/falcon_h1.py +13 -64
  32. sglang/srt/configs/load_config.py +25 -2
  33. sglang/srt/configs/mamba_utils.py +117 -0
  34. sglang/srt/configs/model_config.py +134 -23
  35. sglang/srt/configs/modelopt_config.py +30 -0
  36. sglang/srt/configs/nemotron_h.py +286 -0
  37. sglang/srt/configs/olmo3.py +105 -0
  38. sglang/srt/configs/points_v15_chat.py +29 -0
  39. sglang/srt/configs/qwen3_next.py +11 -47
  40. sglang/srt/configs/qwen3_omni.py +613 -0
  41. sglang/srt/configs/qwen3_vl.py +0 -10
  42. sglang/srt/connector/remote_instance.py +1 -1
  43. sglang/srt/constrained/base_grammar_backend.py +5 -1
  44. sglang/srt/constrained/llguidance_backend.py +5 -0
  45. sglang/srt/constrained/outlines_backend.py +1 -1
  46. sglang/srt/constrained/reasoner_grammar_backend.py +9 -6
  47. sglang/srt/constrained/utils.py +12 -0
  48. sglang/srt/constrained/xgrammar_backend.py +20 -11
  49. sglang/srt/disaggregation/ascend/transfer_engine.py +1 -1
  50. sglang/srt/disaggregation/base/conn.py +17 -4
  51. sglang/srt/disaggregation/common/conn.py +4 -2
  52. sglang/srt/disaggregation/decode.py +123 -31
  53. sglang/srt/disaggregation/decode_kvcache_offload_manager.py +1 -1
  54. sglang/srt/disaggregation/fake/conn.py +11 -3
  55. sglang/srt/disaggregation/mooncake/conn.py +157 -19
  56. sglang/srt/disaggregation/nixl/conn.py +69 -24
  57. sglang/srt/disaggregation/prefill.py +96 -270
  58. sglang/srt/distributed/device_communicators/all_reduce_utils.py +4 -4
  59. sglang/srt/distributed/device_communicators/custom_all_reduce.py +6 -6
  60. sglang/srt/distributed/device_communicators/pymscclpp.py +2 -2
  61. sglang/srt/distributed/device_communicators/pynccl.py +24 -12
  62. sglang/srt/distributed/device_communicators/pynccl_allocator.py +2 -2
  63. sglang/srt/distributed/device_communicators/symm_mem.py +1 -1
  64. sglang/srt/distributed/naive_distributed.py +5 -4
  65. sglang/srt/distributed/parallel_state.py +70 -19
  66. sglang/srt/elastic_ep/elastic_ep.py +74 -0
  67. sglang/srt/entrypoints/context.py +3 -2
  68. sglang/srt/entrypoints/engine.py +66 -66
  69. sglang/srt/entrypoints/grpc_server.py +431 -234
  70. sglang/srt/entrypoints/harmony_utils.py +2 -2
  71. sglang/srt/entrypoints/http_server.py +120 -8
  72. sglang/srt/entrypoints/http_server_engine.py +1 -7
  73. sglang/srt/entrypoints/openai/protocol.py +225 -37
  74. sglang/srt/entrypoints/openai/serving_base.py +49 -2
  75. sglang/srt/entrypoints/openai/serving_chat.py +29 -74
  76. sglang/srt/entrypoints/openai/serving_classify.py +204 -0
  77. sglang/srt/entrypoints/openai/serving_completions.py +15 -1
  78. sglang/srt/entrypoints/openai/serving_responses.py +5 -2
  79. sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
  80. sglang/srt/environ.py +42 -4
  81. sglang/srt/eplb/eplb_algorithms/__init__.py +18 -1
  82. sglang/srt/eplb/eplb_algorithms/deepseek.py +0 -2
  83. sglang/srt/eplb/eplb_algorithms/elasticity_aware.py +87 -0
  84. sglang/srt/eplb/expert_distribution.py +3 -4
  85. sglang/srt/eplb/expert_location_dispatch.py +2 -2
  86. sglang/srt/eplb/expert_location_updater.py +2 -2
  87. sglang/srt/function_call/base_format_detector.py +17 -18
  88. sglang/srt/function_call/function_call_parser.py +18 -14
  89. sglang/srt/function_call/glm4_moe_detector.py +1 -5
  90. sglang/srt/function_call/gpt_oss_detector.py +1 -1
  91. sglang/srt/function_call/json_array_parser.py +0 -2
  92. sglang/srt/function_call/utils.py +2 -2
  93. sglang/srt/grpc/compile_proto.py +3 -3
  94. sglang/srt/{entrypoints → grpc}/grpc_request_manager.py +112 -52
  95. sglang/srt/grpc/health_servicer.py +189 -0
  96. sglang/srt/grpc/scheduler_launcher.py +181 -0
  97. sglang/srt/grpc/sglang_scheduler_pb2.py +78 -70
  98. sglang/srt/grpc/sglang_scheduler_pb2.pyi +66 -10
  99. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +89 -1
  100. sglang/srt/layers/activation.py +4 -1
  101. sglang/srt/layers/attention/aiter_backend.py +3 -3
  102. sglang/srt/layers/attention/ascend_backend.py +17 -1
  103. sglang/srt/layers/attention/attention_registry.py +43 -23
  104. sglang/srt/layers/attention/base_attn_backend.py +20 -1
  105. sglang/srt/layers/attention/double_sparsity_backend.py +2 -2
  106. sglang/srt/layers/attention/fla/chunk.py +0 -1
  107. sglang/srt/layers/attention/fla/chunk_o.py +1 -1
  108. sglang/srt/layers/attention/fla/index.py +0 -2
  109. sglang/srt/layers/attention/fla/layernorm_gated.py +50 -32
  110. sglang/srt/layers/attention/fla/utils.py +0 -3
  111. sglang/srt/layers/attention/fla/wy_fast.py +0 -2
  112. sglang/srt/layers/attention/flashattention_backend.py +12 -8
  113. sglang/srt/layers/attention/flashinfer_backend.py +248 -21
  114. sglang/srt/layers/attention/flashinfer_mla_backend.py +20 -18
  115. sglang/srt/layers/attention/flashmla_backend.py +2 -2
  116. sglang/srt/layers/attention/hybrid_attn_backend.py +1 -1
  117. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +165 -62
  118. sglang/srt/layers/attention/intel_amx_backend.py +1 -1
  119. sglang/srt/layers/attention/mamba/causal_conv1d.py +1 -1
  120. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +9 -5
  121. sglang/srt/layers/attention/mamba/mamba.py +189 -241
  122. sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
  123. sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
  124. sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +0 -50
  125. sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +0 -60
  126. sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +0 -111
  127. sglang/srt/layers/attention/mamba/ops/ssd_combined.py +0 -1
  128. sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +0 -11
  129. sglang/srt/layers/attention/npu_ops/mla_preprocess.py +1 -1
  130. sglang/srt/layers/attention/nsa/nsa_indexer.py +40 -83
  131. sglang/srt/layers/attention/nsa/triton_kernel.py +136 -0
  132. sglang/srt/layers/attention/nsa/utils.py +0 -1
  133. sglang/srt/layers/attention/nsa_backend.py +404 -90
  134. sglang/srt/layers/attention/triton_backend.py +208 -34
  135. sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +2 -2
  136. sglang/srt/layers/attention/triton_ops/extend_attention.py +539 -44
  137. sglang/srt/layers/attention/trtllm_mha_backend.py +2 -2
  138. sglang/srt/layers/attention/trtllm_mla_backend.py +361 -30
  139. sglang/srt/layers/attention/utils.py +11 -7
  140. sglang/srt/layers/attention/vision.py +3 -3
  141. sglang/srt/layers/attention/xpu_backend.py +1028 -0
  142. sglang/srt/layers/communicator.py +11 -7
  143. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/compile_utils.py +4 -8
  144. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/configurer.py +4 -3
  145. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/entrypoint.py +3 -3
  146. sglang/srt/layers/dp_attention.py +17 -0
  147. sglang/srt/layers/layernorm.py +45 -15
  148. sglang/srt/layers/linear.py +9 -1
  149. sglang/srt/layers/logits_processor.py +147 -17
  150. sglang/srt/layers/modelopt_utils.py +11 -0
  151. sglang/srt/layers/moe/cutlass_moe.py +0 -2
  152. sglang/srt/layers/moe/cutlass_w4a8_moe.py +213 -21
  153. sglang/srt/layers/moe/ep_moe/kernels.py +35 -457
  154. sglang/srt/layers/moe/ep_moe/layer.py +119 -397
  155. sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +1 -1
  156. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  157. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_B200.json +146 -0
  158. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +11 -3
  159. sglang/srt/layers/moe/fused_moe_triton/layer.py +76 -70
  160. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +18 -42
  161. sglang/srt/layers/moe/moe_runner/deep_gemm.py +304 -0
  162. sglang/srt/layers/moe/moe_runner/runner.py +3 -0
  163. sglang/srt/layers/moe/moe_runner/triton.py +3 -1
  164. sglang/srt/layers/moe/rocm_moe_utils.py +0 -1
  165. sglang/srt/layers/moe/router.py +51 -15
  166. sglang/srt/layers/moe/token_dispatcher/__init__.py +10 -0
  167. sglang/srt/layers/moe/token_dispatcher/base.py +1 -1
  168. sglang/srt/layers/moe/token_dispatcher/deepep.py +110 -97
  169. sglang/srt/layers/moe/token_dispatcher/mooncake.py +386 -0
  170. sglang/srt/layers/moe/token_dispatcher/standard.py +46 -0
  171. sglang/srt/layers/moe/topk.py +3 -2
  172. sglang/srt/layers/moe/utils.py +17 -1
  173. sglang/srt/layers/quantization/__init__.py +2 -53
  174. sglang/srt/layers/quantization/awq.py +183 -6
  175. sglang/srt/layers/quantization/awq_triton.py +29 -0
  176. sglang/srt/layers/quantization/base_config.py +20 -1
  177. sglang/srt/layers/quantization/compressed_tensors/__init__.py +7 -0
  178. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +20 -49
  179. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +421 -70
  180. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +3 -0
  181. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +4 -22
  182. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +339 -0
  183. sglang/srt/layers/quantization/fp8.py +84 -18
  184. sglang/srt/layers/quantization/fp8_kernel.py +55 -10
  185. sglang/srt/layers/quantization/fp8_utils.py +42 -14
  186. sglang/srt/layers/quantization/fpgemm_fp8.py +2 -3
  187. sglang/srt/layers/quantization/gptq.py +0 -1
  188. sglang/srt/layers/quantization/int8_kernel.py +18 -2
  189. sglang/srt/layers/quantization/marlin_utils.py +12 -0
  190. sglang/srt/layers/quantization/modelopt_quant.py +125 -100
  191. sglang/srt/layers/quantization/mxfp4.py +5 -30
  192. sglang/srt/layers/quantization/petit.py +1 -1
  193. sglang/srt/layers/quantization/quark/quark.py +3 -1
  194. sglang/srt/layers/quantization/quark/quark_moe.py +3 -3
  195. sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +0 -7
  196. sglang/srt/layers/quantization/unquant.py +1 -4
  197. sglang/srt/layers/quantization/utils.py +0 -1
  198. sglang/srt/layers/quantization/w4afp8.py +51 -20
  199. sglang/srt/layers/quantization/w8a8_int8.py +30 -24
  200. sglang/srt/layers/radix_attention.py +59 -9
  201. sglang/srt/layers/rotary_embedding.py +673 -16
  202. sglang/srt/layers/sampler.py +36 -16
  203. sglang/srt/layers/sparse_pooler.py +98 -0
  204. sglang/srt/layers/utils.py +0 -1
  205. sglang/srt/layers/vocab_parallel_embedding.py +4 -1
  206. sglang/srt/lora/backend/triton_backend.py +0 -1
  207. sglang/srt/lora/eviction_policy.py +139 -0
  208. sglang/srt/lora/lora_manager.py +24 -9
  209. sglang/srt/lora/lora_registry.py +1 -1
  210. sglang/srt/lora/mem_pool.py +40 -16
  211. sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +1 -1
  212. sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +4 -2
  213. sglang/srt/managers/cache_controller.py +48 -17
  214. sglang/srt/managers/data_parallel_controller.py +146 -42
  215. sglang/srt/managers/detokenizer_manager.py +40 -13
  216. sglang/srt/managers/io_struct.py +66 -16
  217. sglang/srt/managers/mm_utils.py +20 -18
  218. sglang/srt/managers/multi_tokenizer_mixin.py +66 -81
  219. sglang/srt/managers/overlap_utils.py +96 -19
  220. sglang/srt/managers/schedule_batch.py +241 -511
  221. sglang/srt/managers/schedule_policy.py +15 -2
  222. sglang/srt/managers/scheduler.py +399 -499
  223. sglang/srt/managers/scheduler_metrics_mixin.py +55 -8
  224. sglang/srt/managers/scheduler_output_processor_mixin.py +317 -111
  225. sglang/srt/managers/scheduler_pp_mixin.py +341 -0
  226. sglang/srt/managers/scheduler_profiler_mixin.py +57 -10
  227. sglang/srt/managers/scheduler_runtime_checker_mixin.py +217 -0
  228. sglang/srt/managers/scheduler_update_weights_mixin.py +33 -14
  229. sglang/srt/managers/tokenizer_communicator_mixin.py +71 -55
  230. sglang/srt/managers/tokenizer_manager.py +378 -90
  231. sglang/srt/managers/tp_worker.py +212 -161
  232. sglang/srt/managers/utils.py +78 -2
  233. sglang/srt/mem_cache/allocator.py +7 -2
  234. sglang/srt/mem_cache/allocator_ascend.py +2 -2
  235. sglang/srt/mem_cache/base_prefix_cache.py +2 -2
  236. sglang/srt/mem_cache/chunk_cache.py +13 -2
  237. sglang/srt/mem_cache/common.py +480 -0
  238. sglang/srt/mem_cache/evict_policy.py +16 -1
  239. sglang/srt/mem_cache/hicache_storage.py +4 -1
  240. sglang/srt/mem_cache/hiradix_cache.py +16 -3
  241. sglang/srt/mem_cache/mamba_radix_cache.py +993 -0
  242. sglang/srt/mem_cache/memory_pool.py +435 -219
  243. sglang/srt/mem_cache/memory_pool_host.py +0 -1
  244. sglang/srt/mem_cache/multimodal_cache.py +0 -1
  245. sglang/srt/mem_cache/radix_cache.py +53 -19
  246. sglang/srt/mem_cache/radix_cache_cpp.py +19 -14
  247. sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +8 -2
  248. sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +1 -13
  249. sglang/srt/mem_cache/storage/backend_factory.py +2 -2
  250. sglang/srt/mem_cache/storage/eic/eic_storage.py +5 -6
  251. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +0 -1
  252. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +9 -3
  253. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +5 -3
  254. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +101 -17
  255. sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +38 -9
  256. sglang/srt/mem_cache/storage/nixl/nixl_utils.py +1 -1
  257. sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +17 -2
  258. sglang/srt/mem_cache/swa_radix_cache.py +92 -26
  259. sglang/srt/metrics/collector.py +31 -0
  260. sglang/srt/metrics/func_timer.py +1 -1
  261. sglang/srt/model_executor/cuda_graph_runner.py +43 -5
  262. sglang/srt/model_executor/forward_batch_info.py +28 -23
  263. sglang/srt/model_executor/model_runner.py +379 -139
  264. sglang/srt/model_executor/npu_graph_runner.py +2 -3
  265. sglang/srt/model_executor/piecewise_cuda_graph_runner.py +539 -0
  266. sglang/srt/model_loader/__init__.py +1 -1
  267. sglang/srt/model_loader/loader.py +424 -27
  268. sglang/srt/model_loader/utils.py +0 -1
  269. sglang/srt/model_loader/weight_utils.py +47 -28
  270. sglang/srt/models/apertus.py +2 -3
  271. sglang/srt/models/arcee.py +2 -2
  272. sglang/srt/models/bailing_moe.py +13 -52
  273. sglang/srt/models/bailing_moe_nextn.py +3 -4
  274. sglang/srt/models/bert.py +1 -1
  275. sglang/srt/models/deepseek_nextn.py +19 -3
  276. sglang/srt/models/deepseek_ocr.py +1516 -0
  277. sglang/srt/models/deepseek_v2.py +273 -98
  278. sglang/srt/models/dots_ocr.py +0 -2
  279. sglang/srt/models/dots_vlm.py +0 -1
  280. sglang/srt/models/dots_vlm_vit.py +1 -1
  281. sglang/srt/models/falcon_h1.py +13 -19
  282. sglang/srt/models/gemma3_mm.py +16 -0
  283. sglang/srt/models/gemma3n_mm.py +1 -2
  284. sglang/srt/models/glm4_moe.py +14 -37
  285. sglang/srt/models/glm4_moe_nextn.py +2 -2
  286. sglang/srt/models/glm4v.py +2 -1
  287. sglang/srt/models/glm4v_moe.py +5 -5
  288. sglang/srt/models/gpt_oss.py +5 -5
  289. sglang/srt/models/grok.py +10 -23
  290. sglang/srt/models/hunyuan.py +2 -7
  291. sglang/srt/models/interns1.py +0 -1
  292. sglang/srt/models/kimi_vl.py +1 -7
  293. sglang/srt/models/kimi_vl_moonvit.py +3 -1
  294. sglang/srt/models/llama.py +2 -2
  295. sglang/srt/models/llama_eagle3.py +1 -1
  296. sglang/srt/models/longcat_flash.py +5 -22
  297. sglang/srt/models/longcat_flash_nextn.py +3 -14
  298. sglang/srt/models/mimo.py +2 -13
  299. sglang/srt/models/mimo_mtp.py +1 -2
  300. sglang/srt/models/minicpmo.py +7 -5
  301. sglang/srt/models/mixtral.py +1 -4
  302. sglang/srt/models/mllama.py +1 -1
  303. sglang/srt/models/mllama4.py +13 -3
  304. sglang/srt/models/nemotron_h.py +511 -0
  305. sglang/srt/models/olmo2.py +31 -4
  306. sglang/srt/models/opt.py +5 -5
  307. sglang/srt/models/phi.py +1 -1
  308. sglang/srt/models/phi4mm.py +1 -1
  309. sglang/srt/models/phimoe.py +0 -1
  310. sglang/srt/models/pixtral.py +0 -3
  311. sglang/srt/models/points_v15_chat.py +186 -0
  312. sglang/srt/models/qwen.py +0 -1
  313. sglang/srt/models/qwen2_5_vl.py +3 -3
  314. sglang/srt/models/qwen2_audio.py +2 -15
  315. sglang/srt/models/qwen2_moe.py +15 -12
  316. sglang/srt/models/qwen2_vl.py +5 -2
  317. sglang/srt/models/qwen3_moe.py +19 -35
  318. sglang/srt/models/qwen3_next.py +7 -12
  319. sglang/srt/models/qwen3_next_mtp.py +3 -4
  320. sglang/srt/models/qwen3_omni_moe.py +661 -0
  321. sglang/srt/models/qwen3_vl.py +37 -33
  322. sglang/srt/models/qwen3_vl_moe.py +57 -185
  323. sglang/srt/models/roberta.py +55 -3
  324. sglang/srt/models/sarashina2_vision.py +0 -1
  325. sglang/srt/models/step3_vl.py +3 -5
  326. sglang/srt/models/utils.py +11 -1
  327. sglang/srt/multimodal/processors/base_processor.py +6 -2
  328. sglang/srt/multimodal/processors/deepseek_ocr.py +37 -0
  329. sglang/srt/multimodal/processors/deepseek_vl_v2.py +0 -3
  330. sglang/srt/multimodal/processors/dots_vlm.py +0 -1
  331. sglang/srt/multimodal/processors/glm4v.py +1 -5
  332. sglang/srt/multimodal/processors/internvl.py +0 -2
  333. sglang/srt/multimodal/processors/janus_pro.py +0 -1
  334. sglang/srt/multimodal/processors/mllama4.py +0 -8
  335. sglang/srt/multimodal/processors/phi4mm.py +0 -1
  336. sglang/srt/multimodal/processors/points_v15_chat.py +52 -0
  337. sglang/srt/multimodal/processors/qwen_vl.py +75 -16
  338. sglang/srt/multimodal/processors/step3_vl.py +1 -1
  339. sglang/srt/parser/conversation.py +41 -0
  340. sglang/srt/parser/reasoning_parser.py +0 -1
  341. sglang/srt/sampling/custom_logit_processor.py +77 -2
  342. sglang/srt/sampling/sampling_batch_info.py +17 -22
  343. sglang/srt/sampling/sampling_params.py +70 -2
  344. sglang/srt/server_args.py +577 -73
  345. sglang/srt/server_args_config_parser.py +1 -1
  346. sglang/srt/single_batch_overlap.py +38 -28
  347. sglang/srt/speculative/base_spec_worker.py +34 -0
  348. sglang/srt/speculative/draft_utils.py +226 -0
  349. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +24 -7
  350. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +23 -2
  351. sglang/srt/speculative/eagle_info.py +57 -18
  352. sglang/srt/speculative/eagle_info_v2.py +458 -0
  353. sglang/srt/speculative/eagle_utils.py +138 -0
  354. sglang/srt/speculative/eagle_worker.py +83 -280
  355. sglang/srt/speculative/eagle_worker_v2.py +702 -0
  356. sglang/srt/speculative/{ngram_utils.py → ngram_info.py} +14 -9
  357. sglang/srt/speculative/ngram_worker.py +12 -11
  358. sglang/srt/speculative/spec_info.py +2 -0
  359. sglang/srt/speculative/spec_utils.py +38 -3
  360. sglang/srt/speculative/standalone_worker.py +4 -14
  361. sglang/srt/tokenizer/tiktoken_tokenizer.py +2 -2
  362. sglang/srt/two_batch_overlap.py +28 -14
  363. sglang/srt/utils/__init__.py +1 -1
  364. sglang/srt/{bench_utils.py → utils/bench_utils.py} +4 -2
  365. sglang/srt/utils/common.py +192 -47
  366. sglang/srt/utils/hf_transformers_utils.py +40 -17
  367. sglang/srt/{host_shared_memory.py → utils/host_shared_memory.py} +0 -1
  368. sglang/srt/{offloader.py → utils/offloader.py} +4 -4
  369. sglang/srt/utils/profile_merger.py +199 -0
  370. sglang/test/attention/test_flashattn_backend.py +1 -1
  371. sglang/test/attention/test_flashattn_mla_backend.py +0 -1
  372. sglang/test/attention/test_prefix_chunk_info.py +0 -2
  373. sglang/test/attention/test_trtllm_mla_backend.py +221 -53
  374. sglang/test/few_shot_gsm8k_engine.py +2 -4
  375. sglang/test/kit_matched_stop.py +157 -0
  376. sglang/test/longbench_v2/__init__.py +1 -0
  377. sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
  378. sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
  379. sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
  380. sglang/test/run_eval.py +41 -0
  381. sglang/test/runners.py +2 -0
  382. sglang/test/send_one.py +42 -7
  383. sglang/test/simple_eval_common.py +3 -0
  384. sglang/test/simple_eval_gpqa.py +0 -1
  385. sglang/test/simple_eval_humaneval.py +0 -3
  386. sglang/test/simple_eval_longbench_v2.py +344 -0
  387. sglang/test/test_block_fp8.py +1 -2
  388. sglang/test/test_block_fp8_deep_gemm_blackwell.py +0 -1
  389. sglang/test/test_cutlass_moe.py +1 -2
  390. sglang/test/test_cutlass_w4a8_moe.py +10 -20
  391. sglang/test/test_deterministic.py +232 -99
  392. sglang/test/test_deterministic_utils.py +73 -0
  393. sglang/test/test_disaggregation_utils.py +81 -0
  394. sglang/test/test_marlin_moe.py +0 -1
  395. sglang/test/test_utils.py +85 -20
  396. sglang/version.py +1 -1
  397. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.dist-info}/METADATA +45 -33
  398. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.dist-info}/RECORD +404 -345
  399. sglang/srt/layers/attention/mamba/mamba_utils.py +0 -81
  400. sglang/srt/managers/tp_worker_overlap_thread.py +0 -311
  401. sglang/srt/speculative/build_eagle_tree.py +0 -427
  402. sglang/test/test_block_fp8_ep.py +0 -358
  403. /sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/__init__.py +0 -0
  404. /sglang/srt/{aio_rwlock.py → utils/aio_rwlock.py} +0 -0
  405. /sglang/srt/{torch_memory_saver_adapter.py → utils/torch_memory_saver_adapter.py} +0 -0
  406. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.dist-info}/WHEEL +0 -0
  407. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.dist-info}/licenses/LICENSE +0 -0
  408. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.dist-info}/top_level.txt +0 -0
@@ -1,10 +1,9 @@
1
- import base64
2
- import os
3
1
  import pickle
4
2
  import time
5
3
  from pathlib import Path
6
4
  from typing import Any, List, Optional
7
5
 
6
+ import pybase64
8
7
  import torch
9
8
 
10
9
  from sglang.srt.utils import MultiprocessingSerializer
@@ -78,14 +77,16 @@ class NaiveDistributed:
78
77
  )
79
78
 
80
79
  _get_path(self._rank).write_text(
81
- base64.b64encode(pickle.dumps(obj)).decode("utf-8") + text_postfix
80
+ pybase64.b64encode(pickle.dumps(obj)).decode("utf-8") + text_postfix
82
81
  )
83
82
 
84
83
  def _read_one(interesting_rank: int):
85
84
  p = _get_path(interesting_rank)
86
85
  while True:
87
86
  if p.exists() and (text := p.read_text()).endswith(text_postfix):
88
- return pickle.loads(base64.b64decode(text[: -len(text_postfix)]))
87
+ return pickle.loads(
88
+ pybase64.b64decode(text[: -len(text_postfix)], validate=True)
89
+ )
89
90
  time.sleep(0.001)
90
91
 
91
92
  return [
@@ -39,24 +39,26 @@ import torch
39
39
  import torch.distributed
40
40
  from torch.distributed import Backend, ProcessGroup
41
41
 
42
+ from sglang.srt.environ import envs
42
43
  from sglang.srt.utils import (
43
44
  direct_register_custom_op,
44
45
  get_bool_env_var,
45
46
  get_int_env_var,
47
+ get_local_ip_auto,
46
48
  is_cpu,
47
49
  is_cuda_alike,
48
50
  is_hip,
49
51
  is_npu,
50
52
  is_shm_available,
53
+ is_xpu,
51
54
  supports_custom_op,
52
55
  )
53
56
 
54
57
  _is_npu = is_npu()
55
58
  _is_cpu = is_cpu()
59
+ _is_xpu = is_xpu()
56
60
  _supports_custom_op = supports_custom_op()
57
61
 
58
- IS_ONE_DEVICE_PER_PROCESS = get_bool_env_var("SGLANG_ONE_DEVICE_PER_PROCESS")
59
-
60
62
 
61
63
  TensorMetadata = namedtuple("TensorMetadata", ["device", "dtype", "size"])
62
64
 
@@ -239,6 +241,9 @@ class GroupCoordinator:
239
241
  use_npu_communicator: bool,
240
242
  use_message_queue_broadcaster: bool = False,
241
243
  group_name: Optional[str] = None,
244
+ pynccl_use_current_stream: bool = False,
245
+ torch_compile: Optional[bool] = None,
246
+ gloo_timeout: timedelta = timedelta(seconds=120 * 60),
242
247
  ):
243
248
  # Set group info
244
249
  group_name = group_name or "anonymous"
@@ -256,9 +261,14 @@ class GroupCoordinator:
256
261
  device_group = torch.distributed.new_group(
257
262
  ranks, backend=torch_distributed_backend
258
263
  )
259
- # a group with `gloo` backend, to allow direct coordination between
260
- # processes through the CPU.
261
- cpu_group = torch.distributed.new_group(ranks, backend="gloo")
264
+ # a cpu_group to allow direct coordination between processes through
265
+ # the CPU. The backend is chosen based on `torch_distributed_backend`
266
+ if "mooncake" in torch_distributed_backend:
267
+ cpu_group = torch.distributed.new_group(ranks, backend="mooncake-cpu")
268
+ else:
269
+ cpu_group = torch.distributed.new_group(
270
+ ranks, backend="gloo", timeout=gloo_timeout
271
+ )
262
272
  if self.rank in ranks:
263
273
  self.ranks = ranks
264
274
  self.world_size = len(ranks)
@@ -269,17 +279,20 @@ class GroupCoordinator:
269
279
  assert self.cpu_group is not None
270
280
  assert self.device_group is not None
271
281
 
272
- device_id = 0 if IS_ONE_DEVICE_PER_PROCESS else local_rank
273
282
  if is_cuda_alike():
283
+ device_id = (
284
+ 0 if envs.SGLANG_ONE_VISIBLE_DEVICE_PER_PROCESS.get() else local_rank
285
+ )
274
286
  self.device = torch.device(f"cuda:{device_id}")
275
287
  elif _is_npu:
276
- self.device = torch.device(f"npu:{device_id}")
288
+ self.device = torch.device(f"npu:{local_rank}")
277
289
  else:
278
290
  self.device = torch.device("cpu")
279
291
  self.device_module = torch.get_device_module(self.device)
280
292
 
281
293
  # Import communicators
282
294
  self.use_pynccl = use_pynccl
295
+ self.pynccl_use_current_stream = pynccl_use_current_stream
283
296
  self.use_pymscclpp = use_pymscclpp
284
297
  self.use_custom_allreduce = use_custom_allreduce
285
298
  self.use_torch_symm_mem = use_torch_symm_mem
@@ -313,6 +326,7 @@ class GroupCoordinator:
313
326
  self.pynccl_comm = PyNcclCommunicator(
314
327
  group=self.cpu_group,
315
328
  device=self.device,
329
+ use_current_stream=pynccl_use_current_stream,
316
330
  )
317
331
 
318
332
  self.pymscclpp_comm: Optional[PyMscclppCommunicator] = None
@@ -326,10 +340,17 @@ class GroupCoordinator:
326
340
  self.qr_comm: Optional[QuickAllReduce] = None
327
341
  if use_custom_allreduce and self.world_size > 1:
328
342
  # Initialize a custom fast all-reduce implementation.
343
+ if torch_compile is not None and torch_compile:
344
+ # For piecewise CUDA graph, the requirement for custom allreduce is larger to
345
+ # avoid illegal cuda memory access.
346
+ ca_max_size = 256 * 1024 * 1024
347
+ else:
348
+ ca_max_size = 8 * 1024 * 1024
329
349
  try:
330
350
  self.ca_comm = CustomAllreduce(
331
351
  group=self.cpu_group,
332
352
  device=self.device,
353
+ max_size=ca_max_size,
333
354
  )
334
355
  except Exception as e:
335
356
  logger.warning(
@@ -433,10 +454,13 @@ class GroupCoordinator:
433
454
 
434
455
  @contextmanager
435
456
  def graph_capture(
436
- self, graph_capture_context: Optional[GraphCaptureContext] = None
457
+ self,
458
+ graph_capture_context: Optional[GraphCaptureContext] = None,
459
+ stream: Optional[torch.cuda.Stream] = None,
437
460
  ):
438
461
  if graph_capture_context is None:
439
- stream = self.device_module.Stream()
462
+ if stream is None:
463
+ stream = self.device_module.Stream()
440
464
  graph_capture_context = GraphCaptureContext(stream)
441
465
  else:
442
466
  stream = graph_capture_context.stream
@@ -603,8 +627,11 @@ class GroupCoordinator:
603
627
 
604
628
  def _all_reduce_in_place(self, input_: torch.Tensor) -> None:
605
629
  pynccl_comm = self.pynccl_comm
630
+ symm_mem_comm = self.symm_mem_comm
606
631
  if pynccl_comm is not None and not pynccl_comm.disabled:
607
632
  pynccl_comm.all_reduce(input_)
633
+ elif symm_mem_comm is not None and not symm_mem_comm.disabled:
634
+ symm_mem_comm.all_reduce(input_)
608
635
  else:
609
636
  torch.distributed.all_reduce(input_, group=self.device_group)
610
637
 
@@ -669,7 +696,7 @@ class GroupCoordinator:
669
696
  )
670
697
 
671
698
  def all_gather_into_tensor(self, output: torch.Tensor, input: torch.Tensor):
672
- if _is_npu or not _supports_custom_op:
699
+ if _is_npu or _is_xpu or not _supports_custom_op:
673
700
  self._all_gather_into_tensor(output, input)
674
701
  else:
675
702
  torch.ops.sglang.reg_all_gather_into_tensor(
@@ -1259,7 +1286,9 @@ def init_model_parallel_group(
1259
1286
  use_message_queue_broadcaster: bool = False,
1260
1287
  group_name: Optional[str] = None,
1261
1288
  use_mscclpp_allreduce: Optional[bool] = None,
1289
+ pynccl_use_current_stream: bool = True,
1262
1290
  use_symm_mem_allreduce: Optional[bool] = None,
1291
+ torch_compile: Optional[bool] = None,
1263
1292
  ) -> GroupCoordinator:
1264
1293
  if use_custom_allreduce is None:
1265
1294
  use_custom_allreduce = _ENABLE_CUSTOM_ALL_REDUCE
@@ -1271,7 +1300,7 @@ def init_model_parallel_group(
1271
1300
  group_ranks=group_ranks,
1272
1301
  local_rank=local_rank,
1273
1302
  torch_distributed_backend=backend,
1274
- use_pynccl=not _is_npu,
1303
+ use_pynccl=not (_is_npu or _is_xpu),
1275
1304
  use_pymscclpp=use_mscclpp_allreduce,
1276
1305
  use_custom_allreduce=use_custom_allreduce,
1277
1306
  use_torch_symm_mem=use_symm_mem_allreduce,
@@ -1280,6 +1309,8 @@ def init_model_parallel_group(
1280
1309
  use_npu_communicator=True,
1281
1310
  use_message_queue_broadcaster=use_message_queue_broadcaster,
1282
1311
  group_name=group_name,
1312
+ pynccl_use_current_stream=pynccl_use_current_stream,
1313
+ torch_compile=torch_compile,
1283
1314
  )
1284
1315
 
1285
1316
 
@@ -1336,7 +1367,7 @@ get_pipeline_model_parallel_group = get_pp_group
1336
1367
 
1337
1368
 
1338
1369
  @contextmanager
1339
- def graph_capture():
1370
+ def graph_capture(stream: Optional[torch.cuda.Stream] = None):
1340
1371
  """
1341
1372
  `graph_capture` is a context manager which should surround the code that
1342
1373
  is capturing the CUDA graph. Its main purpose is to ensure that the
@@ -1350,9 +1381,9 @@ def graph_capture():
1350
1381
  in order to explicitly distinguish the kernels to capture
1351
1382
  from other kernels possibly launched on background in the default stream.
1352
1383
  """
1353
- with get_tp_group().graph_capture() as context, get_pp_group().graph_capture(
1354
- context
1355
- ):
1384
+ with get_tp_group().graph_capture(
1385
+ stream=stream
1386
+ ) as context, get_pp_group().graph_capture(context):
1356
1387
  yield context
1357
1388
 
1358
1389
 
@@ -1394,6 +1425,17 @@ def init_distributed_environment(
1394
1425
  distributed_init_method,
1395
1426
  backend,
1396
1427
  )
1428
+ if "mooncake" in backend:
1429
+ try:
1430
+ from mooncake import ep as mooncake_ep
1431
+ except ImportError as e:
1432
+ raise ImportError(
1433
+ "Please install mooncake by following the instructions at "
1434
+ "https://github.com/kvcache-ai/Mooncake/blob/main/doc/en/build.md " # noqa: E501
1435
+ "to run SGLang with Mooncake Backend."
1436
+ ) from e
1437
+ mooncake_ep.set_host_ip(get_local_ip_auto())
1438
+
1397
1439
  if not torch.distributed.is_initialized():
1398
1440
  assert distributed_init_method is not None, (
1399
1441
  "distributed_init_method must be provided when initializing "
@@ -1439,6 +1481,7 @@ def initialize_model_parallel(
1439
1481
  pipeline_model_parallel_size: int = 1,
1440
1482
  backend: Optional[str] = None,
1441
1483
  duplicate_tp_group: bool = False,
1484
+ torch_compile: Optional[bool] = None,
1442
1485
  ) -> None:
1443
1486
  """
1444
1487
  Initialize model parallel groups.
@@ -1494,6 +1537,8 @@ def initialize_model_parallel(
1494
1537
  "SGLANG_USE_MESSAGE_QUEUE_BROADCASTER", "true"
1495
1538
  ),
1496
1539
  group_name="tp",
1540
+ pynccl_use_current_stream=duplicate_tp_group,
1541
+ torch_compile=torch_compile,
1497
1542
  )
1498
1543
 
1499
1544
  if duplicate_tp_group:
@@ -1509,16 +1554,18 @@ def initialize_model_parallel(
1509
1554
  "SGLANG_USE_MESSAGE_QUEUE_BROADCASTER", "true"
1510
1555
  ),
1511
1556
  group_name="pdmux_prefill_tp",
1557
+ pynccl_use_current_stream=True,
1558
+ torch_compile=torch_compile,
1512
1559
  )
1513
- _TP.pynccl_comm.disabled = False
1514
- _PDMUX_PREFILL_TP_GROUP.pynccl_comm.disabled = False
1560
+ if _TP.pynccl_comm:
1561
+ _TP.pynccl_comm.disabled = False
1562
+ _PDMUX_PREFILL_TP_GROUP.pynccl_comm.disabled = False
1515
1563
 
1516
1564
  moe_ep_size = expert_model_parallel_size
1517
1565
  moe_tp_size = tensor_model_parallel_size // moe_ep_size
1518
1566
 
1519
1567
  global _MOE_EP
1520
1568
  assert _MOE_EP is None, "expert model parallel group is already initialized"
1521
-
1522
1569
  if moe_ep_size == tensor_model_parallel_size:
1523
1570
  _MOE_EP = _TP
1524
1571
  else:
@@ -1539,7 +1586,6 @@ def initialize_model_parallel(
1539
1586
 
1540
1587
  global _MOE_TP
1541
1588
  assert _MOE_TP is None, "expert model parallel group is already initialized"
1542
-
1543
1589
  if moe_tp_size == tensor_model_parallel_size:
1544
1590
  _MOE_TP = _TP
1545
1591
  else:
@@ -1704,6 +1750,11 @@ def destroy_model_parallel():
1704
1750
  _PP.destroy()
1705
1751
  _PP = None
1706
1752
 
1753
+ global _PDMUX_PREFILL_TP_GROUP
1754
+ if _PDMUX_PREFILL_TP_GROUP: # type: ignore[union-attr]
1755
+ _PDMUX_PREFILL_TP_GROUP.destroy()
1756
+ _PDMUX_PREFILL_TP_GROUP = None
1757
+
1707
1758
 
1708
1759
  def destroy_distributed_environment():
1709
1760
  global _WORLD
@@ -0,0 +1,74 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+ from typing import Optional
5
+
6
+ import torch
7
+
8
+ from sglang.srt.managers.schedule_batch import ServerArgs
9
+ from sglang.srt.utils import is_cpu, is_cuda
10
+
11
+
12
@dataclass
class ElasticEPState:
    """Mutable record of the current and previous rank-activity tensors.

    All three fields are optional tensors; the methods below treat ``None``
    as "not populated" instead of failing on it.
    """

    # Current activity tensor (presumably one entry per EP rank -- confirm
    # against the code that constructs this state).
    active_ranks: Optional[torch.Tensor]
    # Copy of ``active_ranks`` taken by ``snapshot_active_to_last``.
    last_active_ranks: Optional[torch.Tensor]
    # CPU copy of ``active_ranks`` taken by ``sync_active_to_cpu``.
    active_ranks_cpu: Optional[torch.Tensor]

    def is_active_equal_last(self) -> bool:
        """Return True when ``active_ranks`` matches ``last_active_ranks``.

        Two populated tensors are compared with ``torch.equal``. If both
        fields are ``None`` nothing has changed, so the result is True; if
        exactly one is ``None`` the result is False.
        """
        current, previous = self.active_ranks, self.last_active_ranks
        # torch.equal() raises TypeError on None, so settle the unset cases
        # before delegating to it.
        if current is None or previous is None:
            return current is None and previous is None
        return torch.equal(current, previous)

    def sync_active_to_cpu(self):
        """Refresh ``active_ranks_cpu`` from ``active_ranks`` (no-op if unset)."""
        if self.active_ranks is not None:
            self.active_ranks_cpu = self.active_ranks.detach().cpu().clone()

    def snapshot_active_to_last(self):
        """Copy ``active_ranks`` into ``last_active_ranks`` (no-op if unset)."""
        if self.active_ranks is not None:
            self.last_active_ranks = self.active_ranks.clone()
28
+
29
+
30
class ElasticEPStateManager:
    """Class-level holder for a single shared ``ElasticEPState``."""

    _instance: Optional[ElasticEPState] = None

    @classmethod
    def instance(cls) -> ElasticEPState:
        """Return the stored state; this is ``None`` until ``init`` builds one."""
        return cls._instance

    @classmethod
    def init(cls, server_args: ServerArgs):
        """Build the shared state once and return it.

        A state is only built when none exists yet and
        ``server_args.elastic_ep_backend`` is set; otherwise whatever is
        currently stored (possibly ``None``) is returned unchanged.
        """
        if cls._instance is None and server_args.elastic_ep_backend is not None:
            cls._instance = cls._build_state(ep_size=None, device=None)
        return cls._instance

    @staticmethod
    def _select_device() -> torch.device:
        """Pick the torch device: CUDA first, then CPU, else unsupported."""
        if is_cuda():
            return torch.device("cuda")
        if is_cpu():
            return torch.device("cpu")
        raise NotImplementedError("Only CUDA and CPU support elastic ep now.")

    @classmethod
    def _build_state(
        cls, *, ep_size: Optional[int] = None, device: Optional[torch.device] = None
    ) -> ElasticEPState:
        """Create a state whose three tensors all start from the all-ones tensor."""
        healthy = cls.healthy_rank_state(ep_size=ep_size, device=device)
        # The last/CPU fields get their own copies so later in-place edits of
        # ``active_ranks`` do not leak into them.
        return ElasticEPState(
            active_ranks=healthy,
            last_active_ranks=healthy.clone(),
            active_ranks_cpu=healthy.detach().cpu().clone(),
        )

    @classmethod
    def healthy_rank_state(
        cls, *, ep_size: Optional[int] = None, device: Optional[torch.device] = None
    ) -> torch.Tensor:
        """Return an int32 tensor of ones with ``ep_size`` entries on ``device``.

        A missing ``ep_size`` falls back to ``torch.distributed.get_world_size()``
        and a missing ``device`` to ``_select_device()``.
        """
        if ep_size is None:
            ep_size = torch.distributed.get_world_size()
        if device is None:
            device = cls._select_device()
        return torch.ones(ep_size, dtype=torch.int32, device=device)
@@ -1,10 +1,11 @@
1
1
  # SPDX-License-Identifier: Apache-2.0
2
2
  # Copied from vLLM
3
- import json
4
3
  import logging
5
4
  from abc import ABC, abstractmethod
6
5
  from typing import Union
7
6
 
7
+ import orjson
8
+
8
9
  logger = logging.getLogger(__name__)
9
10
 
10
11
  try:
@@ -148,7 +149,7 @@ class HarmonyContext(ConversationContext):
148
149
  if isinstance(tool_session, Tool):
149
150
  return await tool_session.get_result(self)
150
151
  tool_name = last_msg.recipient.split(".")[1]
151
- args = json.loads(last_msg.content[0].text)
152
+ args = orjson.loads(last_msg.content[0].text)
152
153
  result = await tool_session.call_tool(tool_name, args)
153
154
  result_str = result.content[0].text
154
155
  content = TextContent(text=result_str)