sglang 0.5.3rc0__py3-none-any.whl → 0.5.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (482)
  1. sglang/bench_one_batch.py +54 -37
  2. sglang/bench_one_batch_server.py +340 -34
  3. sglang/bench_serving.py +340 -159
  4. sglang/check_env.py +1 -1
  5. sglang/compile_deep_gemm.py +6 -2
  6. sglang/global_config.py +1 -25
  7. sglang/lang/api.py +6 -0
  8. sglang/lang/backend/runtime_endpoint.py +1 -1
  9. sglang/lang/interpreter.py +1 -0
  10. sglang/lang/ir.py +13 -0
  11. sglang/launch_server.py +9 -2
  12. sglang/profiler.py +20 -3
  13. sglang/srt/_custom_ops.py +1 -1
  14. sglang/srt/batch_invariant_ops/__init__.py +27 -0
  15. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +547 -0
  16. sglang/srt/checkpoint_engine/checkpoint_engine_worker.py +142 -0
  17. sglang/srt/compilation/backend.py +437 -0
  18. sglang/srt/compilation/compilation_config.py +20 -0
  19. sglang/srt/compilation/compilation_counter.py +47 -0
  20. sglang/srt/compilation/compile.py +210 -0
  21. sglang/srt/compilation/compiler_interface.py +503 -0
  22. sglang/srt/compilation/cuda_piecewise_backend.py +228 -0
  23. sglang/srt/compilation/fix_functionalization.py +134 -0
  24. sglang/srt/compilation/fx_utils.py +83 -0
  25. sglang/srt/compilation/inductor_pass.py +140 -0
  26. sglang/srt/compilation/pass_manager.py +66 -0
  27. sglang/srt/compilation/piecewise_context_manager.py +40 -0
  28. sglang/srt/compilation/weak_ref_tensor_jit.py +16 -0
  29. sglang/srt/configs/__init__.py +8 -0
  30. sglang/srt/configs/deepseek_ocr.py +262 -0
  31. sglang/srt/configs/deepseekvl2.py +194 -96
  32. sglang/srt/configs/dots_ocr.py +64 -0
  33. sglang/srt/configs/dots_vlm.py +2 -7
  34. sglang/srt/configs/falcon_h1.py +309 -0
  35. sglang/srt/configs/load_config.py +33 -2
  36. sglang/srt/configs/mamba_utils.py +117 -0
  37. sglang/srt/configs/model_config.py +284 -118
  38. sglang/srt/configs/modelopt_config.py +30 -0
  39. sglang/srt/configs/nemotron_h.py +286 -0
  40. sglang/srt/configs/olmo3.py +105 -0
  41. sglang/srt/configs/points_v15_chat.py +29 -0
  42. sglang/srt/configs/qwen3_next.py +11 -47
  43. sglang/srt/configs/qwen3_omni.py +613 -0
  44. sglang/srt/configs/qwen3_vl.py +576 -0
  45. sglang/srt/connector/remote_instance.py +1 -1
  46. sglang/srt/constrained/base_grammar_backend.py +6 -1
  47. sglang/srt/constrained/llguidance_backend.py +5 -0
  48. sglang/srt/constrained/outlines_backend.py +1 -1
  49. sglang/srt/constrained/outlines_jump_forward.py +1 -1
  50. sglang/srt/constrained/reasoner_grammar_backend.py +9 -6
  51. sglang/srt/constrained/utils.py +12 -0
  52. sglang/srt/constrained/xgrammar_backend.py +26 -15
  53. sglang/srt/debug_utils/dumper.py +10 -3
  54. sglang/srt/disaggregation/ascend/conn.py +2 -2
  55. sglang/srt/disaggregation/ascend/transfer_engine.py +48 -10
  56. sglang/srt/disaggregation/base/conn.py +17 -4
  57. sglang/srt/disaggregation/common/conn.py +268 -98
  58. sglang/srt/disaggregation/decode.py +172 -39
  59. sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
  60. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +25 -16
  61. sglang/srt/disaggregation/fake/conn.py +11 -3
  62. sglang/srt/disaggregation/mooncake/conn.py +203 -555
  63. sglang/srt/disaggregation/nixl/conn.py +217 -63
  64. sglang/srt/disaggregation/prefill.py +113 -270
  65. sglang/srt/disaggregation/utils.py +36 -5
  66. sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
  67. sglang/srt/distributed/device_communicators/custom_all_reduce.py +6 -6
  68. sglang/srt/distributed/device_communicators/pymscclpp.py +2 -2
  69. sglang/srt/distributed/device_communicators/pynccl.py +24 -12
  70. sglang/srt/distributed/device_communicators/pynccl_allocator.py +2 -2
  71. sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
  72. sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
  73. sglang/srt/distributed/naive_distributed.py +5 -4
  74. sglang/srt/distributed/parallel_state.py +203 -97
  75. sglang/srt/elastic_ep/elastic_ep.py +74 -0
  76. sglang/srt/entrypoints/context.py +3 -2
  77. sglang/srt/entrypoints/engine.py +85 -65
  78. sglang/srt/entrypoints/grpc_server.py +632 -305
  79. sglang/srt/entrypoints/harmony_utils.py +2 -2
  80. sglang/srt/entrypoints/http_server.py +169 -17
  81. sglang/srt/entrypoints/http_server_engine.py +1 -7
  82. sglang/srt/entrypoints/openai/protocol.py +327 -34
  83. sglang/srt/entrypoints/openai/serving_base.py +74 -8
  84. sglang/srt/entrypoints/openai/serving_chat.py +202 -118
  85. sglang/srt/entrypoints/openai/serving_classify.py +204 -0
  86. sglang/srt/entrypoints/openai/serving_completions.py +20 -4
  87. sglang/srt/entrypoints/openai/serving_embedding.py +1 -0
  88. sglang/srt/entrypoints/openai/serving_responses.py +47 -2
  89. sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
  90. sglang/srt/environ.py +323 -0
  91. sglang/srt/eplb/eplb_algorithms/__init__.py +18 -1
  92. sglang/srt/eplb/eplb_algorithms/deepseek.py +0 -2
  93. sglang/srt/eplb/eplb_algorithms/elasticity_aware.py +87 -0
  94. sglang/srt/eplb/expert_distribution.py +3 -4
  95. sglang/srt/eplb/expert_location.py +30 -5
  96. sglang/srt/eplb/expert_location_dispatch.py +2 -2
  97. sglang/srt/eplb/expert_location_updater.py +2 -2
  98. sglang/srt/function_call/base_format_detector.py +17 -18
  99. sglang/srt/function_call/function_call_parser.py +21 -16
  100. sglang/srt/function_call/glm4_moe_detector.py +4 -8
  101. sglang/srt/function_call/gpt_oss_detector.py +24 -1
  102. sglang/srt/function_call/json_array_parser.py +61 -0
  103. sglang/srt/function_call/kimik2_detector.py +17 -4
  104. sglang/srt/function_call/utils.py +98 -7
  105. sglang/srt/grpc/compile_proto.py +245 -0
  106. sglang/srt/grpc/grpc_request_manager.py +915 -0
  107. sglang/srt/grpc/health_servicer.py +189 -0
  108. sglang/srt/grpc/scheduler_launcher.py +181 -0
  109. sglang/srt/grpc/sglang_scheduler_pb2.py +81 -68
  110. sglang/srt/grpc/sglang_scheduler_pb2.pyi +124 -61
  111. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +92 -1
  112. sglang/srt/layers/activation.py +11 -7
  113. sglang/srt/layers/attention/aiter_backend.py +17 -18
  114. sglang/srt/layers/attention/ascend_backend.py +125 -10
  115. sglang/srt/layers/attention/attention_registry.py +226 -0
  116. sglang/srt/layers/attention/base_attn_backend.py +32 -4
  117. sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
  118. sglang/srt/layers/attention/double_sparsity_backend.py +2 -2
  119. sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
  120. sglang/srt/layers/attention/fla/chunk.py +0 -1
  121. sglang/srt/layers/attention/fla/chunk_o.py +1 -1
  122. sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +2 -2
  123. sglang/srt/layers/attention/fla/fused_recurrent.py +4 -4
  124. sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +2 -2
  125. sglang/srt/layers/attention/fla/index.py +0 -2
  126. sglang/srt/layers/attention/fla/layernorm_gated.py +50 -32
  127. sglang/srt/layers/attention/fla/utils.py +0 -3
  128. sglang/srt/layers/attention/fla/wy_fast.py +0 -2
  129. sglang/srt/layers/attention/flashattention_backend.py +52 -15
  130. sglang/srt/layers/attention/flashinfer_backend.py +357 -212
  131. sglang/srt/layers/attention/flashinfer_mla_backend.py +31 -33
  132. sglang/srt/layers/attention/flashmla_backend.py +9 -7
  133. sglang/srt/layers/attention/hybrid_attn_backend.py +12 -4
  134. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +236 -133
  135. sglang/srt/layers/attention/intel_amx_backend.py +1 -1
  136. sglang/srt/layers/attention/mamba/causal_conv1d.py +2 -1
  137. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +24 -103
  138. sglang/srt/layers/attention/mamba/mamba.py +514 -1
  139. sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
  140. sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
  141. sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
  142. sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
  143. sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
  144. sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +214 -0
  145. sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +562 -0
  146. sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +646 -0
  147. sglang/srt/layers/attention/mamba/ops/ssd_combined.py +261 -0
  148. sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +264 -0
  149. sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
  150. sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
  151. sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
  152. sglang/srt/layers/attention/nsa/nsa_indexer.py +718 -0
  153. sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
  154. sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
  155. sglang/srt/layers/attention/nsa/transform_index.py +144 -0
  156. sglang/srt/layers/attention/nsa/triton_kernel.py +136 -0
  157. sglang/srt/layers/attention/nsa/utils.py +23 -0
  158. sglang/srt/layers/attention/nsa_backend.py +1201 -0
  159. sglang/srt/layers/attention/tbo_backend.py +6 -6
  160. sglang/srt/layers/attention/torch_flex_backend.py +325 -0
  161. sglang/srt/layers/attention/triton_backend.py +249 -42
  162. sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +2 -2
  163. sglang/srt/layers/attention/triton_ops/extend_attention.py +539 -44
  164. sglang/srt/layers/attention/trtllm_mha_backend.py +7 -9
  165. sglang/srt/layers/attention/trtllm_mla_backend.py +523 -48
  166. sglang/srt/layers/attention/utils.py +11 -7
  167. sglang/srt/layers/attention/vision.py +61 -3
  168. sglang/srt/layers/attention/wave_backend.py +4 -4
  169. sglang/srt/layers/attention/xpu_backend.py +1028 -0
  170. sglang/srt/layers/communicator.py +19 -7
  171. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/compile_utils.py +4 -8
  172. sglang/srt/layers/deep_gemm_wrapper/configurer.py +25 -0
  173. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/entrypoint.py +3 -3
  174. sglang/srt/layers/dp_attention.py +28 -1
  175. sglang/srt/layers/elementwise.py +3 -1
  176. sglang/srt/layers/layernorm.py +47 -15
  177. sglang/srt/layers/linear.py +30 -5
  178. sglang/srt/layers/logits_processor.py +161 -18
  179. sglang/srt/layers/modelopt_utils.py +11 -0
  180. sglang/srt/layers/moe/cutlass_moe.py +0 -2
  181. sglang/srt/layers/moe/cutlass_w4a8_moe.py +213 -21
  182. sglang/srt/layers/moe/ep_moe/kernels.py +36 -458
  183. sglang/srt/layers/moe/ep_moe/layer.py +243 -448
  184. sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +52 -25
  185. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  186. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_B200.json +146 -0
  187. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  188. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  189. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
  190. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +17 -5
  191. sglang/srt/layers/moe/fused_moe_triton/layer.py +86 -81
  192. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +18 -42
  193. sglang/srt/layers/moe/moe_runner/deep_gemm.py +304 -0
  194. sglang/srt/layers/moe/moe_runner/runner.py +3 -0
  195. sglang/srt/layers/moe/moe_runner/triton.py +3 -1
  196. sglang/srt/layers/moe/rocm_moe_utils.py +0 -1
  197. sglang/srt/layers/moe/router.py +51 -15
  198. sglang/srt/layers/moe/token_dispatcher/__init__.py +10 -0
  199. sglang/srt/layers/moe/token_dispatcher/base.py +1 -1
  200. sglang/srt/layers/moe/token_dispatcher/deepep.py +177 -106
  201. sglang/srt/layers/moe/token_dispatcher/mooncake.py +386 -0
  202. sglang/srt/layers/moe/token_dispatcher/standard.py +46 -0
  203. sglang/srt/layers/moe/topk.py +3 -2
  204. sglang/srt/layers/moe/utils.py +27 -1
  205. sglang/srt/layers/parameter.py +23 -6
  206. sglang/srt/layers/quantization/__init__.py +2 -53
  207. sglang/srt/layers/quantization/awq.py +183 -6
  208. sglang/srt/layers/quantization/awq_triton.py +29 -0
  209. sglang/srt/layers/quantization/base_config.py +20 -1
  210. sglang/srt/layers/quantization/compressed_tensors/__init__.py +7 -0
  211. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +21 -49
  212. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +421 -70
  213. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +5 -0
  214. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +4 -22
  215. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
  216. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +339 -0
  217. sglang/srt/layers/quantization/fp8.py +86 -20
  218. sglang/srt/layers/quantization/fp8_kernel.py +55 -10
  219. sglang/srt/layers/quantization/fp8_utils.py +43 -15
  220. sglang/srt/layers/quantization/fpgemm_fp8.py +2 -3
  221. sglang/srt/layers/quantization/gptq.py +0 -1
  222. sglang/srt/layers/quantization/int8_kernel.py +18 -2
  223. sglang/srt/layers/quantization/marlin_utils.py +12 -0
  224. sglang/srt/layers/quantization/modelopt_quant.py +141 -81
  225. sglang/srt/layers/quantization/mxfp4.py +17 -34
  226. sglang/srt/layers/quantization/petit.py +1 -1
  227. sglang/srt/layers/quantization/quark/quark.py +3 -1
  228. sglang/srt/layers/quantization/quark/quark_moe.py +18 -5
  229. sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +0 -7
  230. sglang/srt/layers/quantization/unquant.py +1 -4
  231. sglang/srt/layers/quantization/utils.py +0 -1
  232. sglang/srt/layers/quantization/w4afp8.py +51 -24
  233. sglang/srt/layers/quantization/w8a8_int8.py +45 -27
  234. sglang/srt/layers/radix_attention.py +59 -9
  235. sglang/srt/layers/rotary_embedding.py +750 -46
  236. sglang/srt/layers/sampler.py +84 -16
  237. sglang/srt/layers/sparse_pooler.py +98 -0
  238. sglang/srt/layers/utils.py +23 -1
  239. sglang/srt/layers/vocab_parallel_embedding.py +4 -1
  240. sglang/srt/lora/backend/base_backend.py +3 -3
  241. sglang/srt/lora/backend/chunked_backend.py +348 -0
  242. sglang/srt/lora/backend/triton_backend.py +9 -4
  243. sglang/srt/lora/eviction_policy.py +139 -0
  244. sglang/srt/lora/lora.py +7 -5
  245. sglang/srt/lora/lora_manager.py +33 -7
  246. sglang/srt/lora/lora_registry.py +1 -1
  247. sglang/srt/lora/mem_pool.py +41 -17
  248. sglang/srt/lora/triton_ops/__init__.py +4 -0
  249. sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
  250. sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +176 -0
  251. sglang/srt/lora/utils.py +7 -5
  252. sglang/srt/managers/cache_controller.py +83 -152
  253. sglang/srt/managers/data_parallel_controller.py +156 -87
  254. sglang/srt/managers/detokenizer_manager.py +51 -24
  255. sglang/srt/managers/io_struct.py +223 -129
  256. sglang/srt/managers/mm_utils.py +49 -10
  257. sglang/srt/managers/multi_tokenizer_mixin.py +83 -98
  258. sglang/srt/managers/multimodal_processor.py +1 -2
  259. sglang/srt/managers/overlap_utils.py +130 -0
  260. sglang/srt/managers/schedule_batch.py +340 -529
  261. sglang/srt/managers/schedule_policy.py +158 -18
  262. sglang/srt/managers/scheduler.py +665 -620
  263. sglang/srt/managers/scheduler_input_blocker.py +1 -1
  264. sglang/srt/managers/scheduler_metrics_mixin.py +150 -131
  265. sglang/srt/managers/scheduler_output_processor_mixin.py +337 -122
  266. sglang/srt/managers/scheduler_pp_mixin.py +341 -0
  267. sglang/srt/managers/scheduler_profiler_mixin.py +62 -15
  268. sglang/srt/managers/scheduler_runtime_checker_mixin.py +217 -0
  269. sglang/srt/managers/scheduler_update_weights_mixin.py +40 -14
  270. sglang/srt/managers/tokenizer_communicator_mixin.py +141 -19
  271. sglang/srt/managers/tokenizer_manager.py +462 -226
  272. sglang/srt/managers/tp_worker.py +217 -156
  273. sglang/srt/managers/utils.py +79 -47
  274. sglang/srt/mem_cache/allocator.py +21 -22
  275. sglang/srt/mem_cache/allocator_ascend.py +42 -28
  276. sglang/srt/mem_cache/base_prefix_cache.py +3 -3
  277. sglang/srt/mem_cache/chunk_cache.py +20 -2
  278. sglang/srt/mem_cache/common.py +480 -0
  279. sglang/srt/mem_cache/evict_policy.py +38 -0
  280. sglang/srt/mem_cache/hicache_storage.py +44 -2
  281. sglang/srt/mem_cache/hiradix_cache.py +134 -34
  282. sglang/srt/mem_cache/mamba_radix_cache.py +993 -0
  283. sglang/srt/mem_cache/memory_pool.py +602 -208
  284. sglang/srt/mem_cache/memory_pool_host.py +134 -183
  285. sglang/srt/mem_cache/multimodal_cache.py +0 -1
  286. sglang/srt/mem_cache/radix_cache.py +263 -78
  287. sglang/srt/mem_cache/radix_cache_cpp.py +29 -21
  288. sglang/srt/mem_cache/storage/__init__.py +10 -0
  289. sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +157 -0
  290. sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +97 -0
  291. sglang/srt/mem_cache/storage/backend_factory.py +223 -0
  292. sglang/srt/mem_cache/storage/eic/eic_storage.py +777 -0
  293. sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
  294. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +0 -1
  295. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +180 -59
  296. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +15 -9
  297. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +217 -26
  298. sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +38 -9
  299. sglang/srt/mem_cache/storage/nixl/nixl_utils.py +1 -1
  300. sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +17 -2
  301. sglang/srt/mem_cache/swa_radix_cache.py +115 -58
  302. sglang/srt/metrics/collector.py +113 -120
  303. sglang/srt/metrics/func_timer.py +3 -8
  304. sglang/srt/metrics/utils.py +8 -1
  305. sglang/srt/model_executor/cpu_graph_runner.py +2 -2
  306. sglang/srt/model_executor/cuda_graph_runner.py +81 -36
  307. sglang/srt/model_executor/forward_batch_info.py +40 -50
  308. sglang/srt/model_executor/model_runner.py +507 -319
  309. sglang/srt/model_executor/npu_graph_runner.py +11 -5
  310. sglang/srt/model_executor/piecewise_cuda_graph_runner.py +539 -0
  311. sglang/srt/model_loader/__init__.py +1 -1
  312. sglang/srt/model_loader/loader.py +438 -37
  313. sglang/srt/model_loader/utils.py +0 -1
  314. sglang/srt/model_loader/weight_utils.py +200 -27
  315. sglang/srt/models/apertus.py +2 -3
  316. sglang/srt/models/arcee.py +2 -2
  317. sglang/srt/models/bailing_moe.py +40 -56
  318. sglang/srt/models/bailing_moe_nextn.py +3 -4
  319. sglang/srt/models/bert.py +1 -1
  320. sglang/srt/models/deepseek_nextn.py +25 -4
  321. sglang/srt/models/deepseek_ocr.py +1516 -0
  322. sglang/srt/models/deepseek_v2.py +793 -235
  323. sglang/srt/models/dots_ocr.py +171 -0
  324. sglang/srt/models/dots_vlm.py +0 -1
  325. sglang/srt/models/dots_vlm_vit.py +1 -1
  326. sglang/srt/models/falcon_h1.py +570 -0
  327. sglang/srt/models/gemma3_causal.py +0 -2
  328. sglang/srt/models/gemma3_mm.py +17 -1
  329. sglang/srt/models/gemma3n_mm.py +2 -3
  330. sglang/srt/models/glm4_moe.py +17 -40
  331. sglang/srt/models/glm4_moe_nextn.py +4 -4
  332. sglang/srt/models/glm4v.py +3 -2
  333. sglang/srt/models/glm4v_moe.py +6 -6
  334. sglang/srt/models/gpt_oss.py +12 -35
  335. sglang/srt/models/grok.py +10 -23
  336. sglang/srt/models/hunyuan.py +2 -7
  337. sglang/srt/models/interns1.py +0 -1
  338. sglang/srt/models/kimi_vl.py +1 -7
  339. sglang/srt/models/kimi_vl_moonvit.py +4 -2
  340. sglang/srt/models/llama.py +6 -2
  341. sglang/srt/models/llama_eagle3.py +1 -1
  342. sglang/srt/models/longcat_flash.py +6 -23
  343. sglang/srt/models/longcat_flash_nextn.py +4 -15
  344. sglang/srt/models/mimo.py +2 -13
  345. sglang/srt/models/mimo_mtp.py +1 -2
  346. sglang/srt/models/minicpmo.py +7 -5
  347. sglang/srt/models/mixtral.py +1 -4
  348. sglang/srt/models/mllama.py +1 -1
  349. sglang/srt/models/mllama4.py +27 -6
  350. sglang/srt/models/nemotron_h.py +511 -0
  351. sglang/srt/models/olmo2.py +31 -4
  352. sglang/srt/models/opt.py +5 -5
  353. sglang/srt/models/phi.py +1 -1
  354. sglang/srt/models/phi4mm.py +1 -1
  355. sglang/srt/models/phimoe.py +0 -1
  356. sglang/srt/models/pixtral.py +0 -3
  357. sglang/srt/models/points_v15_chat.py +186 -0
  358. sglang/srt/models/qwen.py +0 -1
  359. sglang/srt/models/qwen2.py +0 -7
  360. sglang/srt/models/qwen2_5_vl.py +5 -5
  361. sglang/srt/models/qwen2_audio.py +2 -15
  362. sglang/srt/models/qwen2_moe.py +70 -4
  363. sglang/srt/models/qwen2_vl.py +6 -3
  364. sglang/srt/models/qwen3.py +18 -3
  365. sglang/srt/models/qwen3_moe.py +50 -38
  366. sglang/srt/models/qwen3_next.py +43 -21
  367. sglang/srt/models/qwen3_next_mtp.py +3 -4
  368. sglang/srt/models/qwen3_omni_moe.py +661 -0
  369. sglang/srt/models/qwen3_vl.py +791 -0
  370. sglang/srt/models/qwen3_vl_moe.py +343 -0
  371. sglang/srt/models/registry.py +15 -3
  372. sglang/srt/models/roberta.py +55 -3
  373. sglang/srt/models/sarashina2_vision.py +268 -0
  374. sglang/srt/models/solar.py +505 -0
  375. sglang/srt/models/starcoder2.py +357 -0
  376. sglang/srt/models/step3_vl.py +3 -5
  377. sglang/srt/models/torch_native_llama.py +9 -2
  378. sglang/srt/models/utils.py +61 -0
  379. sglang/srt/multimodal/processors/base_processor.py +21 -9
  380. sglang/srt/multimodal/processors/deepseek_ocr.py +37 -0
  381. sglang/srt/multimodal/processors/deepseek_vl_v2.py +0 -3
  382. sglang/srt/multimodal/processors/dots_vlm.py +2 -4
  383. sglang/srt/multimodal/processors/glm4v.py +1 -5
  384. sglang/srt/multimodal/processors/internvl.py +20 -10
  385. sglang/srt/multimodal/processors/janus_pro.py +0 -1
  386. sglang/srt/multimodal/processors/mllama4.py +0 -8
  387. sglang/srt/multimodal/processors/phi4mm.py +0 -1
  388. sglang/srt/multimodal/processors/points_v15_chat.py +52 -0
  389. sglang/srt/multimodal/processors/qwen_vl.py +83 -17
  390. sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
  391. sglang/srt/multimodal/processors/step3_vl.py +1 -1
  392. sglang/srt/parser/conversation.py +41 -0
  393. sglang/srt/parser/jinja_template_utils.py +6 -0
  394. sglang/srt/parser/reasoning_parser.py +0 -1
  395. sglang/srt/sampling/custom_logit_processor.py +77 -2
  396. sglang/srt/sampling/sampling_batch_info.py +36 -23
  397. sglang/srt/sampling/sampling_params.py +75 -0
  398. sglang/srt/server_args.py +1300 -338
  399. sglang/srt/server_args_config_parser.py +146 -0
  400. sglang/srt/single_batch_overlap.py +161 -0
  401. sglang/srt/speculative/base_spec_worker.py +34 -0
  402. sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
  403. sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
  404. sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
  405. sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
  406. sglang/srt/speculative/cpp_ngram/param.h +125 -0
  407. sglang/srt/speculative/cpp_ngram/queue.h +71 -0
  408. sglang/srt/speculative/draft_utils.py +226 -0
  409. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +26 -8
  410. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +26 -3
  411. sglang/srt/speculative/eagle_info.py +786 -0
  412. sglang/srt/speculative/eagle_info_v2.py +458 -0
  413. sglang/srt/speculative/eagle_utils.py +113 -1270
  414. sglang/srt/speculative/eagle_worker.py +120 -285
  415. sglang/srt/speculative/eagle_worker_v2.py +702 -0
  416. sglang/srt/speculative/ngram_info.py +433 -0
  417. sglang/srt/speculative/ngram_worker.py +246 -0
  418. sglang/srt/speculative/spec_info.py +49 -0
  419. sglang/srt/speculative/spec_utils.py +641 -0
  420. sglang/srt/speculative/standalone_worker.py +4 -14
  421. sglang/srt/tokenizer/tiktoken_tokenizer.py +2 -2
  422. sglang/srt/tracing/trace.py +32 -6
  423. sglang/srt/two_batch_overlap.py +35 -18
  424. sglang/srt/utils/__init__.py +2 -0
  425. sglang/srt/{bench_utils.py → utils/bench_utils.py} +4 -2
  426. sglang/srt/{utils.py → utils/common.py} +583 -113
  427. sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +86 -19
  428. sglang/srt/{host_shared_memory.py → utils/host_shared_memory.py} +0 -1
  429. sglang/srt/{offloader.py → utils/offloader.py} +4 -4
  430. sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
  431. sglang/srt/utils/profile_merger.py +199 -0
  432. sglang/srt/utils/rpd_utils.py +452 -0
  433. sglang/srt/utils/slow_rank_detector.py +71 -0
  434. sglang/srt/{torch_memory_saver_adapter.py → utils/torch_memory_saver_adapter.py} +5 -7
  435. sglang/srt/warmup.py +8 -4
  436. sglang/srt/weight_sync/utils.py +1 -1
  437. sglang/test/attention/test_flashattn_backend.py +1 -1
  438. sglang/test/attention/test_flashattn_mla_backend.py +0 -1
  439. sglang/test/attention/test_prefix_chunk_info.py +0 -2
  440. sglang/test/attention/test_trtllm_mla_backend.py +221 -53
  441. sglang/test/few_shot_gsm8k_engine.py +2 -4
  442. sglang/test/get_logits_ut.py +57 -0
  443. sglang/test/kit_matched_stop.py +157 -0
  444. sglang/test/longbench_v2/__init__.py +1 -0
  445. sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
  446. sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
  447. sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
  448. sglang/test/run_eval.py +120 -11
  449. sglang/test/runners.py +3 -1
  450. sglang/test/send_one.py +42 -7
  451. sglang/test/simple_eval_common.py +8 -2
  452. sglang/test/simple_eval_gpqa.py +0 -1
  453. sglang/test/simple_eval_humaneval.py +0 -3
  454. sglang/test/simple_eval_longbench_v2.py +344 -0
  455. sglang/test/simple_eval_mmmu_vlm.py +441 -0
  456. sglang/test/test_block_fp8.py +3 -4
  457. sglang/test/test_block_fp8_deep_gemm_blackwell.py +0 -1
  458. sglang/test/test_cutlass_moe.py +1 -2
  459. sglang/test/test_cutlass_w4a8_moe.py +10 -20
  460. sglang/test/test_deterministic.py +430 -0
  461. sglang/test/test_deterministic_utils.py +73 -0
  462. sglang/test/test_disaggregation_utils.py +93 -1
  463. sglang/test/test_marlin_moe.py +0 -1
  464. sglang/test/test_programs.py +1 -1
  465. sglang/test/test_utils.py +432 -16
  466. sglang/utils.py +10 -1
  467. sglang/version.py +1 -1
  468. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/METADATA +64 -43
  469. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/RECORD +476 -346
  470. sglang/srt/entrypoints/grpc_request_manager.py +0 -580
  471. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +0 -32
  472. sglang/srt/managers/tp_worker_overlap_thread.py +0 -319
  473. sglang/srt/mem_cache/lora_radix_cache.py +0 -421
  474. sglang/srt/speculative/build_eagle_tree.py +0 -427
  475. sglang/test/test_block_fp8_ep.py +0 -358
  476. /sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/__init__.py +0 -0
  477. /sglang/srt/{remote_instance_weight_loader_utils.py → model_loader/remote_instance_weight_loader_utils.py} +0 -0
  478. /sglang/srt/{aio_rwlock.py → utils/aio_rwlock.py} +0 -0
  479. /sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
  480. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/WHEEL +0 -0
  481. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/licenses/LICENSE +0 -0
  482. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/top_level.txt +0 -0
sglang/srt/managers/scheduler_pp_mixin.py
@@ -0,0 +1,341 @@
+from typing import List, Optional
+
+from sglang.srt.layers.logits_processor import LogitsProcessorOutput
+from sglang.srt.managers.schedule_batch import ScheduleBatch
+from sglang.srt.managers.utils import GenerationBatchResult
+from sglang.srt.model_executor.forward_batch_info import PPProxyTensors
+from sglang.srt.utils import DynamicGradMode, point_to_point_pyobj
+
+
+class SchedulerPPMixin:
+
+    @DynamicGradMode()
+    def event_loop_pp(self):
+        """A non-overlap scheduler loop for pipeline parallelism."""
+        mbs = [None] * self.pp_size
+        last_mbs = [None] * self.pp_size
+        self.running_mbs = [
+            ScheduleBatch(reqs=[], batch_is_full=False) for _ in range(self.pp_size)
+        ]
+        pp_outputs: Optional[PPProxyTensors] = None
+        while True:
+            server_is_idle = True
+            for mb_id in range(self.pp_size):
+                self.running_batch = self.running_mbs[mb_id]
+                self.last_batch = last_mbs[mb_id]
+
+                recv_reqs = self.recv_requests()
+                self.process_input_requests(recv_reqs)
+                mbs[mb_id] = self.get_next_batch_to_run()
+                self.running_mbs[mb_id] = self.running_batch
+
+                self.cur_batch = mbs[mb_id]
+                if self.cur_batch:
+                    server_is_idle = False
+                    result = self.run_batch(self.cur_batch)
+
+                # (last rank) send the outputs to the next step
+                if self.pp_group.is_last_rank:
+                    if self.cur_batch:
+                        next_token_ids = result.next_token_ids
+                        if self.cur_batch.return_logprob:
+                            pp_outputs = PPProxyTensors(
+                                {
+                                    "next_token_ids": next_token_ids,
+                                    "extend_input_len_per_req": result.extend_input_len_per_req,
+                                    "extend_logprob_start_len_per_req": result.extend_logprob_start_len_per_req,
+                                }
+                                | (
+                                    {
+                                        f"logits_output.{k}": v
+                                        for k, v in result.logits_output.__dict__.items()
+                                    }
+                                    if result.logits_output is not None
+                                    else {}
+                                )
+                            )
+                        else:
+                            pp_outputs = PPProxyTensors(
+                                {
+                                    "next_token_ids": next_token_ids,
+                                }
+                            )
+                        # send the output from the last round to let the next stage worker run post processing
+                        self.pp_group.send_tensor_dict(
+                            pp_outputs.tensors,
+                            all_gather_group=self.attn_tp_group,
+                        )
+
+                # receive outputs and post-process (filter finished reqs) the coming microbatch
+                next_mb_id = (mb_id + 1) % self.pp_size
+                next_pp_outputs = None
+                if mbs[next_mb_id] is not None:
+                    next_pp_outputs: Optional[PPProxyTensors] = PPProxyTensors(
+                        self.pp_group.recv_tensor_dict(
+                            all_gather_group=self.attn_tp_group
+                        )
+                    )
+                    mbs[next_mb_id].output_ids = next_pp_outputs["next_token_ids"]
+                    logits_output_args = {
+                        k[len("logits_output.") :]: v
+                        for k, v in next_pp_outputs.tensors.items()
+                        if k.startswith("logits_output.")
+                    }
+                    if len(logits_output_args) > 0:
+                        logits_output = LogitsProcessorOutput(**logits_output_args)
+                    else:
+                        logits_output = None
+
+                    output_result = GenerationBatchResult.from_pp_proxy(
+                        logits_output=logits_output,
+                        next_pp_outputs=next_pp_outputs,
+                        can_run_cuda_graph=result.can_run_cuda_graph,
+                    )
+                    self.process_batch_result(mbs[next_mb_id], output_result)
+                    last_mbs[next_mb_id] = mbs[next_mb_id]
+
+                # (not last rank)
+                if not self.pp_group.is_last_rank:
+                    # carry the outputs to the next stage
+                    # send the outputs from the last round to let the next stage worker run post processing
+                    if pp_outputs:
+                        self.pp_group.send_tensor_dict(
+                            pp_outputs.tensors,
+                            all_gather_group=self.attn_tp_group,
+                        )
+
+                    # send out reqs to the next stage
+                    dp_offset = self.attn_dp_rank * self.attn_tp_size
+                    if self.attn_tp_rank == 0:
+                        point_to_point_pyobj(
+                            recv_reqs,
+                            self.pp_rank * self.tp_size + dp_offset,
+                            self.world_group.device_group,
+                            self.pp_rank * self.tp_size + dp_offset,
+                            (self.pp_rank + 1) * self.tp_size + dp_offset,
+                        )
+
+                    # send out proxy tensors to the next stage
+                    if self.cur_batch:
+                        # FIXME(lsyin): remove this assert
+                        assert result.pp_hidden_states_proxy_tensors.tensors is not None
+                        self.pp_group.send_tensor_dict(
+                            result.pp_hidden_states_proxy_tensors.tensors,
+                            all_gather_group=self.attn_tp_group,
+                        )
+
+                pp_outputs = next_pp_outputs
+
+            # When the server is idle, self-check and re-init some states
+            if server_is_idle:
+                # When the server is idle, do self-check and re-init some states
+                self.self_check_during_idle()
+
+    @DynamicGradMode()
+    def event_loop_pp_disagg_prefill(self):
+        """
+        An event loop for the prefill server in pipeline parallelism.
+
+        Rules:
+        1. Each stage runs in the same order and is notified by the previous stage.
+        2. Each send/recv operation is blocking and matched by the neighboring stage.
+
+        Regular Schedule:
+        ====================================================================
+        Stage i                  | Stage i+1
+        send ith req             | recv ith req
+        send ith proxy           | recv ith proxy
+        send prev (i+1)th carry  | recv prev (i+1)th carry
+        ====================================================================
+
+        Prefill Server Schedule:
+        ====================================================================
+        Stage i                        | Stage i+1
+        send ith req                   | recv ith req
+        send ith bootstrap req         | recv ith bootstrap req
+        send ith transferred req       | recv ith transferred req
+        send ith proxy                 | recv ith proxy
+        send prev (i+1)th carry        | recv prev (i+1)th carry
+        send prev (i+1)th release req  | recv prev (i+1)th release req
+        ====================================================================
+
+        There are two additional elements compared to the regular schedule:
+
+        1. Bootstrap Requests:
+            a. Instead of polling the status on the current workers, we should wait for the previous stage to notify to avoid desynchronization.
+            b. The first stage polls the status and propagates the bootstrapped requests down to all other stages.
+            c. If the first stage polls successfully, by nature, other ranks are also successful because they performed a handshake together.
+
+        2. Transferred Requests + Release Requests:
+            a. The first stage polls the transfer finished requests, performs an intersection with the next stage's finished requests, and propagates down to the last stage.
+            b. The last stage receives the requests that have finished transfer on all stages (consensus), then sends them to the first stage to release the memory.
+            c. The first stage receives the release requests, releases the memory, and then propagates the release requests down to the last stage.
+        """
+        mbs = [None] * self.pp_size
+        last_mbs = [None] * self.pp_size
+        self.running_mbs = [
+            ScheduleBatch(reqs=[], batch_is_full=False) for _ in range(self.pp_size)
+        ]
+        pp_outputs: Optional[PPProxyTensors] = None
+
+        # Either success or failed
+        bootstrapped_rids: List[str] = []
+        transferred_rids: List[str] = []
+        release_rids: Optional[List[str]] = None
+
+        # transferred microbatch
+        tmbs = [None] * self.pp_size
+
+        ENABLE_RELEASE = True  # For debug
+
+        while True:
+            server_is_idle = True
+
+            for mb_id in range(self.pp_size):
+                self.running_batch = self.running_mbs[mb_id]
+                self.last_batch = last_mbs[mb_id]
+
+                recv_reqs = self.recv_requests()
+
+                self.process_input_requests(recv_reqs)
+
+                if self.pp_group.is_first_rank:
+                    # First rank, pop the bootstrap reqs from the bootstrap queue
+                    bootstrapped_reqs, failed_reqs = (
+                        self.disagg_prefill_bootstrap_queue.pop_bootstrapped(
+                            return_failed_reqs=True
+                        )
+                    )
+                    bootstrapped_rids = [req.rid for req in bootstrapped_reqs] + [
+                        req.rid for req in failed_reqs
+                    ]
+                    self.waiting_queue.extend(bootstrapped_reqs)
+                else:
+                    # Other ranks, receive the bootstrap reqs info from the previous rank and ensure the consensus
+                    bootstrapped_rids = self.recv_pyobj_from_prev_stage()
+                    bootstrapped_reqs = (
+                        self.disagg_prefill_bootstrap_queue.pop_bootstrapped(
+                            rids_to_check=bootstrapped_rids
+                        )
+                    )
+                    self.waiting_queue.extend(bootstrapped_reqs)
+
+                if self.pp_group.is_first_rank:
+                    transferred_rids = self.get_transferred_rids()
+                # if other ranks,
+                else:
+                    # 1. recv previous stage's transferred reqs info
+                    prev_transferred_rids = self.recv_pyobj_from_prev_stage()
+                    # 2. get the current stage's transferred reqs info
+                    curr_transferred_rids = self.get_transferred_rids()
+                    # 3. new consensus rids = intersection(previous consensus rids, transfer finished rids)
+                    transferred_rids = list(
+                        set(prev_transferred_rids) & set(curr_transferred_rids)
+                    )
+
+                tmbs[mb_id] = transferred_rids
+
+                self.process_prefill_chunk()
+                mbs[mb_id] = self.get_new_batch_prefill()
+                self.running_mbs[mb_id] = self.running_batch
+
+                self.cur_batch = mbs[mb_id]
+                if self.cur_batch:
+                    server_is_idle = False
+                    result = self.run_batch(self.cur_batch)
+
+                # send the outputs to the next step
+                if self.pp_group.is_last_rank:
+                    if self.cur_batch:
+                        next_token_ids = result.next_token_ids
+                        pp_outputs = PPProxyTensors(
+                            {
+                                "next_token_ids": next_token_ids,
+                            }
+                        )
+                        # send the output from the last round to let the next stage worker run post processing
+                        self.pp_group.send_tensor_dict(
+                            pp_outputs.tensors,
+                            all_gather_group=self.attn_tp_group,
+                        )
+
+                if ENABLE_RELEASE:
+                    if self.pp_group.is_last_rank:
+                        # At the last stage, all stages has reached the consensus to release memory for transferred_rids
+                        release_rids = transferred_rids
+                        # send to the first rank
+                        self.send_pyobj_to_next_stage(release_rids)
+
+                # receive outputs and post-process (filter finished reqs) the coming microbatch
+                next_mb_id = (mb_id + 1) % self.pp_size
+                next_pp_outputs = None
+                next_release_rids = None
+
+                if mbs[next_mb_id] is not None:
+                    next_pp_outputs: Optional[PPProxyTensors] = PPProxyTensors(
+                        self.pp_group.recv_tensor_dict(
+                            all_gather_group=self.attn_tp_group
+                        )
+                    )
+                    mbs[next_mb_id].output_ids = next_pp_outputs["next_token_ids"]
+                    output_result = GenerationBatchResult(
+                        logits_output=None,
+                        pp_hidden_states_proxy_tensors=None,
+                        next_token_ids=next_pp_outputs["next_token_ids"],
+                        extend_input_len_per_req=None,
+                        extend_logprob_start_len_per_req=None,
+                        can_run_cuda_graph=result.can_run_cuda_graph,
+                    )
+                    self.process_batch_result_disagg_prefill(
+                        mbs[next_mb_id], output_result
+                    )
+
+                    last_mbs[next_mb_id] = mbs[next_mb_id]
+
+                if ENABLE_RELEASE:
+                    if tmbs[next_mb_id] is not None:
+                        # recv consensus rids from the previous rank
+                        next_release_rids = self.recv_pyobj_from_prev_stage()
+                        self.process_disagg_prefill_inflight_queue(next_release_rids)
+
+                # carry the outputs to the next stage
+                if not self.pp_group.is_last_rank:
+                    if pp_outputs:
+                        # send the outputs from the last round to let the next stage worker run post processing
+                        self.pp_group.send_tensor_dict(
+                            pp_outputs.tensors,
+                            all_gather_group=self.attn_tp_group,
+                        )
+                    if ENABLE_RELEASE:
+                        if release_rids is not None:
+                            self.send_pyobj_to_next_stage(release_rids)
+
+                if not self.pp_group.is_last_rank:
+                    # send out reqs to the next stage
+                    self.send_pyobj_to_next_stage(recv_reqs)
+                    self.send_pyobj_to_next_stage(bootstrapped_rids)
+                    self.send_pyobj_to_next_stage(transferred_rids)
+
+                    # send out proxy tensors to the next stage
+                    if self.cur_batch:
+                        # FIXME(lsyin): remove this assert
+                        assert result.pp_hidden_states_proxy_tensors.tensors is not None
+                        self.pp_group.send_tensor_dict(
+                            result.pp_hidden_states_proxy_tensors.tensors,
+                            all_gather_group=self.attn_tp_group,
+                        )
+
+                pp_outputs = next_pp_outputs
+                release_rids = next_release_rids
+
+                self.running_batch.batch_is_full = False
+
+            if not ENABLE_RELEASE:
+                if len(self.disagg_prefill_inflight_queue) > 0:
+                    self.process_disagg_prefill_inflight_queue()
+
+            # When the server is idle, self-check and re-init some states
+            if server_is_idle and len(self.disagg_prefill_inflight_queue) == 0:
+                self.check_memory()
+                self.check_tree_cache()
+                self.new_token_ratio = self.init_new_token_ratio
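
The docstring above describes the release consensus only in prose. The following minimal, self-contained sketch (illustration only, not code from the package) shows the stated intersection rule: each stage folds its locally finished transfer ids into the running consensus, so only ids that finished on every stage survive. Here stage_finished is a hypothetical stand-in for the per-stage results of get_transferred_rids().

from functools import reduce

def consensus_transferred_rids(stage_finished: list[set[str]]) -> list[str]:
    """Fold the per-stage finished-transfer ids with set intersection.

    Mirrors the docstring rule: at stage i, the consensus is the intersection
    of the previous stage's consensus with the ids finished locally.
    """
    return sorted(reduce(lambda acc, cur: acc & cur, stage_finished))

# Example: rid "b" finished transferring on all three stages, "a" and "c" did not.
stages = [{"a", "b"}, {"b", "c"}, {"a", "b", "c"}]
assert consensus_transferred_rids(stages) == ["b"]
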
sglang/srt/managers/scheduler_profiler_mixin.py
@@ -9,6 +9,7 @@ import torch
 from sglang.srt.managers.io_struct import ProfileReq, ProfileReqOutput, ProfileReqType
 from sglang.srt.model_executor.forward_batch_info import ForwardMode
 from sglang.srt.utils import is_npu
+from sglang.srt.utils.profile_merger import ProfileMerger

 _is_npu = is_npu()
 if _is_npu:
@@ -25,7 +26,6 @@ logger = logging.getLogger(__name__)


 class SchedulerProfilerMixin:
-
     def init_profiler(self):
         self.torch_profiler = None
         self.torch_profiler_output_dir: Optional[str] = None
@@ -41,6 +41,7 @@ class SchedulerProfilerMixin:
         self.profile_steps: Optional[int] = None
         self.profile_in_progress: bool = False
         self.rpd_profiler = None
+        self.merge_profiles = False

     def init_profile(
         self,
@@ -52,6 +53,7 @@
         record_shapes: Optional[bool],
         profile_by_stage: bool,
         profile_id: str,
+        merge_profiles: bool = False,
     ) -> ProfileReqOutput:
         if self.profile_in_progress:
             return ProfileReqOutput(
@@ -60,6 +62,7 @@
             )

         self.profile_by_stage = profile_by_stage
+        self.merge_profiles = merge_profiles

         if output_dir is None:
             output_dir = os.getenv("SGLANG_TORCH_PROFILER_DIR", "/tmp")
@@ -97,7 +100,7 @@
     def start_profile(
         self, stage: Optional[ForwardMode] = None
     ) -> ProfileReqOutput | None:
-        stage_str = f" for {stage.__str__()}" if stage else ""
+        stage_str = f" for {stage.name}" if stage else ""
         logger.info(
             f"Profiling starts{stage_str}. Traces will be saved to: {self.torch_profiler_output_dir} (with profile id: {self.profile_id})",
         )
@@ -169,6 +172,38 @@

         return ProfileReqOutput(success=True, message="Succeeded")

+    def _merge_profile_traces(self) -> str:
+        if not self.merge_profiles:
+            return ""
+
+        if self.tp_rank != 0:
+            return ""
+        if getattr(self, "dp_size", 1) > 1 and getattr(self, "dp_rank", 0) != 0:
+            return ""
+        if getattr(self, "pp_size", 1) > 1 and getattr(self, "pp_rank", 0) != 0:
+            return ""
+        if getattr(self, "moe_ep_size", 1) > 1 and getattr(self, "moe_ep_rank", 0) != 0:
+            return ""
+
+        try:
+            logger.info("Starting profile merge...")
+            merger = ProfileMerger(self.torch_profiler_output_dir, self.profile_id)
+            merged_path = merger.merge_chrome_traces()
+
+            summary = merger.get_merge_summary()
+            merge_message = (
+                f" Merged trace: {merged_path} "
+                f"(Events: {summary.get('total_events', '?')}, "
+                f"Files: {summary.get('total_files', '?')})"
+            )
+
+            logger.info(f"Profile merge completed: {merged_path}")
+        except Exception as e:
+            logger.error(f"Failed to merge profiles: {e}", exc_info=True)
+            return f" Merge failed: {e!s}"
+        else:
+            return merge_message
+
     def stop_profile(
         self, stage: Optional[ForwardMode] = None
     ) -> ProfileReqOutput | None:
@@ -181,19 +216,26 @@
         if not Path(self.torch_profiler_output_dir).exists():
             Path(self.torch_profiler_output_dir).mkdir(parents=True, exist_ok=True)

-        stage_suffix = f"-{stage.__str__()}" if stage else ""
+        stage_suffix = f"-{stage.name}" if stage else ""
         logger.info("Stop profiling" + stage_suffix + "...")
         if self.torch_profiler is not None:
             self.torch_profiler.stop()
             if not _is_npu:
+                # Build filename with only non-zero ranks to maintain backward compatibility
+                filename_parts = [self.profile_id, f"TP-{self.tp_rank}"]
+
+                # Only add other ranks if parallelism is enabled (size > 1)
+                if getattr(self, "dp_size", 1) > 1:
+                    filename_parts.append(f"DP-{getattr(self, 'dp_rank', 0)}")
+                if getattr(self, "pp_size", 1) > 1:
+                    filename_parts.append(f"PP-{getattr(self, 'pp_rank', 0)}")
+                if getattr(self, "moe_ep_size", 1) > 1:
+                    filename_parts.append(f"EP-{getattr(self, 'moe_ep_rank', 0)}")
+
+                filename = "-".join(filename_parts) + stage_suffix + ".trace.json.gz"
+
                 self.torch_profiler.export_chrome_trace(
-                    os.path.join(
-                        self.torch_profiler_output_dir,
-                        self.profile_id
-                        + f"-TP-{self.tp_rank}"
-                        + stage_suffix
-                        + ".trace.json.gz",
-                    )
+                    os.path.join(self.torch_profiler_output_dir, filename)
                 )
             torch.distributed.barrier(self.tp_cpu_group)

@@ -204,7 +246,7 @@

             torch.distributed.barrier(self.tp_cpu_group)
             if self.tp_rank == 0:
-                from sglang.srt.utils import rpd_to_chrome_trace
+                from sglang.srt.utils.rpd_utils import rpd_to_chrome_trace

                 rpd_to_chrome_trace("trace.rpd", self.rpd_profile_path)
             self.rpd_profiler = None
@@ -224,15 +266,18 @@
         if "CUDA_PROFILER" in self.profiler_activities:
             torch.cuda.cudart().cudaProfilerStop()

+        merge_message = self._merge_profile_traces()
+
         logger.info(
-            "Profiling done. Traces are saved to: %s",
+            "Profiling done. Traces are saved to: %s%s",
             self.torch_profiler_output_dir,
+            merge_message,
         )
         self.torch_profiler = None
         self.profile_in_progress = False
         self.profiler_start_forward_ct = None

-        return ProfileReqOutput(success=True, message="Succeeded.")
+        return ProfileReqOutput(success=True, message=f"Succeeded.{merge_message}")

     def _profile_batch_predicate(self, batch):
         if self.profile_by_stage:
@@ -247,7 +292,7 @@
             if self.profiler_decode_ct == 0:
                 if self.profile_in_progress:
                     # force trace flush
-                    self.stop_profile(ForwardMode.EXTEND)
+                    self.stop_profile(stage=ForwardMode.EXTEND)
                 self.start_profile(batch.forward_mode)
             self.profiler_decode_ct += 1
             if self.profiler_decode_ct > self.profiler_target_decode_ct:
@@ -282,6 +327,7 @@
                     recv_req.record_shapes,
                     recv_req.profile_by_stage,
                     recv_req.profile_id,
+                    recv_req.merge_profiles,
                 )
             else:
                 self.init_profile(
@@ -293,7 +339,8 @@
                     recv_req.record_shapes,
                     recv_req.profile_by_stage,
                     recv_req.profile_id,
+                    recv_req.merge_profiles,
                 )
-                return self.start_profile(True)
+                return self.start_profile()
         else:
             return self.stop_profile()
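
For illustration only (not an sglang API): the rank-aware trace naming that the stop_profile hunk above introduces can be reproduced with a small standalone helper. The helper name trace_filename and its keyword arguments are hypothetical; it only mirrors the rule that TP is always encoded while DP/PP/EP suffixes appear only when that form of parallelism is enabled.

def trace_filename(
    profile_id: str,
    tp_rank: int,
    dp_size: int = 1,
    dp_rank: int = 0,
    pp_size: int = 1,
    pp_rank: int = 0,
    ep_size: int = 1,
    ep_rank: int = 0,
    stage_suffix: str = "",
) -> str:
    """Compose a per-rank Chrome trace filename following the scheme in the diff above."""
    parts = [profile_id, f"TP-{tp_rank}"]
    if dp_size > 1:
        parts.append(f"DP-{dp_rank}")
    if pp_size > 1:
        parts.append(f"PP-{pp_rank}")
    if ep_size > 1:
        parts.append(f"EP-{ep_rank}")
    return "-".join(parts) + stage_suffix + ".trace.json.gz"

# A TP-only run keeps the old naming; a TP+PP run adds a PP component.
assert trace_filename("prof123", tp_rank=0) == "prof123-TP-0.trace.json.gz"
assert trace_filename("prof123", tp_rank=1, pp_size=2, pp_rank=1) == "prof123-TP-1-PP-1.trace.json.gz"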