sglang 0.5.3rc2__py3-none-any.whl → 0.5.4.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (419)
  1. sglang/bench_one_batch.py +47 -28
  2. sglang/bench_one_batch_server.py +41 -25
  3. sglang/bench_serving.py +378 -160
  4. sglang/check_env.py +1 -1
  5. sglang/compile_deep_gemm.py +6 -2
  6. sglang/global_config.py +1 -25
  7. sglang/lang/api.py +6 -0
  8. sglang/lang/interpreter.py +1 -0
  9. sglang/lang/ir.py +13 -0
  10. sglang/launch_server.py +10 -15
  11. sglang/profiler.py +18 -1
  12. sglang/srt/_custom_ops.py +1 -1
  13. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +105 -10
  14. sglang/srt/checkpoint_engine/checkpoint_engine_worker.py +142 -0
  15. sglang/srt/compilation/backend.py +437 -0
  16. sglang/srt/compilation/compilation_config.py +20 -0
  17. sglang/srt/compilation/compilation_counter.py +47 -0
  18. sglang/srt/compilation/compile.py +210 -0
  19. sglang/srt/compilation/compiler_interface.py +503 -0
  20. sglang/srt/compilation/cuda_piecewise_backend.py +228 -0
  21. sglang/srt/compilation/fix_functionalization.py +134 -0
  22. sglang/srt/compilation/fx_utils.py +83 -0
  23. sglang/srt/compilation/inductor_pass.py +140 -0
  24. sglang/srt/compilation/pass_manager.py +66 -0
  25. sglang/srt/compilation/piecewise_context_manager.py +40 -0
  26. sglang/srt/compilation/weak_ref_tensor_jit.py +16 -0
  27. sglang/srt/configs/__init__.py +4 -0
  28. sglang/srt/configs/deepseek_ocr.py +262 -0
  29. sglang/srt/configs/deepseekvl2.py +194 -96
  30. sglang/srt/configs/dots_vlm.py +2 -7
  31. sglang/srt/configs/falcon_h1.py +13 -64
  32. sglang/srt/configs/load_config.py +25 -2
  33. sglang/srt/configs/mamba_utils.py +117 -0
  34. sglang/srt/configs/model_config.py +136 -25
  35. sglang/srt/configs/modelopt_config.py +30 -0
  36. sglang/srt/configs/nemotron_h.py +286 -0
  37. sglang/srt/configs/olmo3.py +105 -0
  38. sglang/srt/configs/points_v15_chat.py +29 -0
  39. sglang/srt/configs/qwen3_next.py +11 -47
  40. sglang/srt/configs/qwen3_omni.py +613 -0
  41. sglang/srt/configs/qwen3_vl.py +0 -10
  42. sglang/srt/connector/remote_instance.py +1 -1
  43. sglang/srt/constrained/base_grammar_backend.py +5 -1
  44. sglang/srt/constrained/llguidance_backend.py +5 -0
  45. sglang/srt/constrained/outlines_backend.py +1 -1
  46. sglang/srt/constrained/reasoner_grammar_backend.py +9 -6
  47. sglang/srt/constrained/utils.py +12 -0
  48. sglang/srt/constrained/xgrammar_backend.py +20 -11
  49. sglang/srt/disaggregation/ascend/transfer_engine.py +1 -1
  50. sglang/srt/disaggregation/base/conn.py +17 -4
  51. sglang/srt/disaggregation/common/conn.py +4 -2
  52. sglang/srt/disaggregation/decode.py +123 -31
  53. sglang/srt/disaggregation/decode_kvcache_offload_manager.py +1 -1
  54. sglang/srt/disaggregation/fake/conn.py +11 -3
  55. sglang/srt/disaggregation/mooncake/conn.py +157 -19
  56. sglang/srt/disaggregation/nixl/conn.py +69 -24
  57. sglang/srt/disaggregation/prefill.py +96 -270
  58. sglang/srt/distributed/device_communicators/all_reduce_utils.py +4 -4
  59. sglang/srt/distributed/device_communicators/custom_all_reduce.py +6 -6
  60. sglang/srt/distributed/device_communicators/pymscclpp.py +2 -2
  61. sglang/srt/distributed/device_communicators/pynccl.py +24 -12
  62. sglang/srt/distributed/device_communicators/pynccl_allocator.py +2 -2
  63. sglang/srt/distributed/device_communicators/symm_mem.py +1 -1
  64. sglang/srt/distributed/naive_distributed.py +5 -4
  65. sglang/srt/distributed/parallel_state.py +63 -19
  66. sglang/srt/elastic_ep/elastic_ep.py +74 -0
  67. sglang/srt/entrypoints/context.py +3 -2
  68. sglang/srt/entrypoints/engine.py +83 -80
  69. sglang/srt/entrypoints/grpc_server.py +430 -234
  70. sglang/srt/entrypoints/harmony_utils.py +2 -2
  71. sglang/srt/entrypoints/http_server.py +195 -102
  72. sglang/srt/entrypoints/http_server_engine.py +1 -7
  73. sglang/srt/entrypoints/openai/protocol.py +225 -37
  74. sglang/srt/entrypoints/openai/serving_base.py +49 -2
  75. sglang/srt/entrypoints/openai/serving_chat.py +29 -74
  76. sglang/srt/entrypoints/openai/serving_classify.py +204 -0
  77. sglang/srt/entrypoints/openai/serving_completions.py +15 -1
  78. sglang/srt/entrypoints/openai/serving_responses.py +5 -2
  79. sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
  80. sglang/srt/environ.py +58 -6
  81. sglang/srt/eplb/eplb_algorithms/__init__.py +18 -1
  82. sglang/srt/eplb/eplb_algorithms/deepseek.py +0 -2
  83. sglang/srt/eplb/eplb_algorithms/elasticity_aware.py +87 -0
  84. sglang/srt/eplb/expert_distribution.py +33 -4
  85. sglang/srt/eplb/expert_location_dispatch.py +2 -2
  86. sglang/srt/eplb/expert_location_updater.py +2 -2
  87. sglang/srt/function_call/base_format_detector.py +17 -18
  88. sglang/srt/function_call/function_call_parser.py +20 -14
  89. sglang/srt/function_call/glm4_moe_detector.py +1 -5
  90. sglang/srt/function_call/gpt_oss_detector.py +1 -1
  91. sglang/srt/function_call/json_array_parser.py +0 -2
  92. sglang/srt/function_call/minimax_m2.py +367 -0
  93. sglang/srt/function_call/utils.py +2 -2
  94. sglang/srt/grpc/compile_proto.py +3 -3
  95. sglang/srt/{entrypoints → grpc}/grpc_request_manager.py +112 -52
  96. sglang/srt/grpc/health_servicer.py +189 -0
  97. sglang/srt/grpc/scheduler_launcher.py +181 -0
  98. sglang/srt/grpc/sglang_scheduler_pb2.py +78 -70
  99. sglang/srt/grpc/sglang_scheduler_pb2.pyi +66 -10
  100. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +89 -1
  101. sglang/srt/layers/activation.py +10 -1
  102. sglang/srt/layers/attention/aiter_backend.py +3 -3
  103. sglang/srt/layers/attention/ascend_backend.py +17 -1
  104. sglang/srt/layers/attention/attention_registry.py +43 -23
  105. sglang/srt/layers/attention/base_attn_backend.py +20 -1
  106. sglang/srt/layers/attention/double_sparsity_backend.py +2 -2
  107. sglang/srt/layers/attention/fla/chunk.py +0 -1
  108. sglang/srt/layers/attention/fla/chunk_o.py +1 -1
  109. sglang/srt/layers/attention/fla/index.py +0 -2
  110. sglang/srt/layers/attention/fla/layernorm_gated.py +50 -32
  111. sglang/srt/layers/attention/fla/utils.py +0 -3
  112. sglang/srt/layers/attention/fla/wy_fast.py +0 -2
  113. sglang/srt/layers/attention/flashattention_backend.py +24 -10
  114. sglang/srt/layers/attention/flashinfer_backend.py +258 -22
  115. sglang/srt/layers/attention/flashinfer_mla_backend.py +38 -28
  116. sglang/srt/layers/attention/flashmla_backend.py +2 -2
  117. sglang/srt/layers/attention/hybrid_attn_backend.py +1 -1
  118. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +165 -62
  119. sglang/srt/layers/attention/intel_amx_backend.py +1 -1
  120. sglang/srt/layers/attention/mamba/causal_conv1d.py +1 -1
  121. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +9 -5
  122. sglang/srt/layers/attention/mamba/mamba.py +189 -241
  123. sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
  124. sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
  125. sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +0 -50
  126. sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +0 -60
  127. sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +0 -111
  128. sglang/srt/layers/attention/mamba/ops/ssd_combined.py +0 -1
  129. sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +0 -11
  130. sglang/srt/layers/attention/npu_ops/mla_preprocess.py +1 -1
  131. sglang/srt/layers/attention/nsa/nsa_indexer.py +40 -83
  132. sglang/srt/layers/attention/nsa/triton_kernel.py +136 -0
  133. sglang/srt/layers/attention/nsa/utils.py +0 -1
  134. sglang/srt/layers/attention/nsa_backend.py +404 -90
  135. sglang/srt/layers/attention/triton_backend.py +208 -34
  136. sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +2 -2
  137. sglang/srt/layers/attention/triton_ops/extend_attention.py +539 -44
  138. sglang/srt/layers/attention/trtllm_mha_backend.py +2 -2
  139. sglang/srt/layers/attention/trtllm_mla_backend.py +362 -43
  140. sglang/srt/layers/attention/utils.py +89 -7
  141. sglang/srt/layers/attention/vision.py +3 -3
  142. sglang/srt/layers/attention/xpu_backend.py +1028 -0
  143. sglang/srt/layers/communicator.py +12 -7
  144. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/compile_utils.py +5 -9
  145. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/configurer.py +4 -3
  146. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/entrypoint.py +3 -3
  147. sglang/srt/layers/dp_attention.py +17 -0
  148. sglang/srt/layers/layernorm.py +64 -19
  149. sglang/srt/layers/linear.py +9 -1
  150. sglang/srt/layers/logits_processor.py +152 -17
  151. sglang/srt/layers/modelopt_utils.py +11 -0
  152. sglang/srt/layers/moe/cutlass_moe.py +0 -2
  153. sglang/srt/layers/moe/cutlass_w4a8_moe.py +351 -21
  154. sglang/srt/layers/moe/ep_moe/kernels.py +229 -457
  155. sglang/srt/layers/moe/ep_moe/layer.py +154 -625
  156. sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +1 -1
  157. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  158. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_B200.json +146 -0
  159. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +11 -3
  160. sglang/srt/layers/moe/fused_moe_triton/layer.py +79 -73
  161. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +25 -46
  162. sglang/srt/layers/moe/moe_runner/deep_gemm.py +569 -0
  163. sglang/srt/layers/moe/moe_runner/runner.py +6 -0
  164. sglang/srt/layers/moe/moe_runner/triton.py +3 -1
  165. sglang/srt/layers/moe/moe_runner/triton_kernels.py +194 -0
  166. sglang/srt/layers/moe/rocm_moe_utils.py +0 -1
  167. sglang/srt/layers/moe/router.py +51 -15
  168. sglang/srt/layers/moe/token_dispatcher/__init__.py +14 -4
  169. sglang/srt/layers/moe/token_dispatcher/base.py +12 -6
  170. sglang/srt/layers/moe/token_dispatcher/deepep.py +127 -110
  171. sglang/srt/layers/moe/token_dispatcher/mooncake.py +386 -0
  172. sglang/srt/layers/moe/token_dispatcher/standard.py +46 -0
  173. sglang/srt/layers/moe/topk.py +7 -6
  174. sglang/srt/layers/moe/utils.py +20 -5
  175. sglang/srt/layers/quantization/__init__.py +5 -58
  176. sglang/srt/layers/quantization/awq.py +183 -9
  177. sglang/srt/layers/quantization/awq_triton.py +29 -0
  178. sglang/srt/layers/quantization/base_config.py +27 -1
  179. sglang/srt/layers/quantization/compressed_tensors/__init__.py +7 -0
  180. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +20 -49
  181. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +421 -70
  182. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +3 -0
  183. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +4 -22
  184. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +339 -0
  185. sglang/srt/layers/quantization/fp8.py +152 -81
  186. sglang/srt/layers/quantization/fp8_kernel.py +55 -10
  187. sglang/srt/layers/quantization/fp8_utils.py +42 -14
  188. sglang/srt/layers/quantization/fpgemm_fp8.py +2 -3
  189. sglang/srt/layers/quantization/gguf.py +566 -0
  190. sglang/srt/layers/quantization/gptq.py +0 -1
  191. sglang/srt/layers/quantization/int8_kernel.py +18 -2
  192. sglang/srt/layers/quantization/marlin_utils.py +12 -0
  193. sglang/srt/layers/quantization/modelopt_quant.py +125 -100
  194. sglang/srt/layers/quantization/mxfp4.py +35 -68
  195. sglang/srt/layers/quantization/petit.py +1 -1
  196. sglang/srt/layers/quantization/quark/quark.py +3 -1
  197. sglang/srt/layers/quantization/quark/quark_moe.py +3 -3
  198. sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +0 -7
  199. sglang/srt/layers/quantization/unquant.py +23 -48
  200. sglang/srt/layers/quantization/utils.py +0 -1
  201. sglang/srt/layers/quantization/w4afp8.py +87 -20
  202. sglang/srt/layers/quantization/w8a8_int8.py +30 -24
  203. sglang/srt/layers/radix_attention.py +62 -9
  204. sglang/srt/layers/rotary_embedding.py +686 -17
  205. sglang/srt/layers/sampler.py +47 -16
  206. sglang/srt/layers/sparse_pooler.py +98 -0
  207. sglang/srt/layers/utils.py +0 -1
  208. sglang/srt/layers/vocab_parallel_embedding.py +4 -1
  209. sglang/srt/lora/backend/triton_backend.py +0 -1
  210. sglang/srt/lora/eviction_policy.py +139 -0
  211. sglang/srt/lora/lora_manager.py +24 -9
  212. sglang/srt/lora/lora_registry.py +1 -1
  213. sglang/srt/lora/mem_pool.py +40 -16
  214. sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +1 -1
  215. sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +4 -2
  216. sglang/srt/managers/cache_controller.py +48 -17
  217. sglang/srt/managers/data_parallel_controller.py +146 -42
  218. sglang/srt/managers/detokenizer_manager.py +40 -13
  219. sglang/srt/managers/io_struct.py +69 -16
  220. sglang/srt/managers/mm_utils.py +20 -18
  221. sglang/srt/managers/multi_tokenizer_mixin.py +83 -82
  222. sglang/srt/managers/overlap_utils.py +96 -19
  223. sglang/srt/managers/schedule_batch.py +241 -511
  224. sglang/srt/managers/schedule_policy.py +15 -2
  225. sglang/srt/managers/scheduler.py +420 -514
  226. sglang/srt/managers/scheduler_metrics_mixin.py +73 -18
  227. sglang/srt/managers/scheduler_output_processor_mixin.py +317 -111
  228. sglang/srt/managers/scheduler_pp_mixin.py +341 -0
  229. sglang/srt/managers/scheduler_profiler_mixin.py +60 -14
  230. sglang/srt/managers/scheduler_runtime_checker_mixin.py +217 -0
  231. sglang/srt/managers/scheduler_update_weights_mixin.py +33 -14
  232. sglang/srt/managers/tokenizer_communicator_mixin.py +71 -55
  233. sglang/srt/managers/tokenizer_manager.py +375 -95
  234. sglang/srt/managers/tp_worker.py +212 -161
  235. sglang/srt/managers/utils.py +78 -2
  236. sglang/srt/mem_cache/allocator.py +7 -2
  237. sglang/srt/mem_cache/allocator_ascend.py +2 -2
  238. sglang/srt/mem_cache/base_prefix_cache.py +2 -2
  239. sglang/srt/mem_cache/chunk_cache.py +13 -2
  240. sglang/srt/mem_cache/common.py +480 -0
  241. sglang/srt/mem_cache/evict_policy.py +16 -1
  242. sglang/srt/mem_cache/hicache_storage.py +11 -2
  243. sglang/srt/mem_cache/hiradix_cache.py +16 -3
  244. sglang/srt/mem_cache/mamba_radix_cache.py +993 -0
  245. sglang/srt/mem_cache/memory_pool.py +517 -219
  246. sglang/srt/mem_cache/memory_pool_host.py +0 -1
  247. sglang/srt/mem_cache/multimodal_cache.py +0 -1
  248. sglang/srt/mem_cache/radix_cache.py +53 -19
  249. sglang/srt/mem_cache/radix_cache_cpp.py +19 -14
  250. sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +8 -2
  251. sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +1 -13
  252. sglang/srt/mem_cache/storage/backend_factory.py +2 -2
  253. sglang/srt/mem_cache/storage/eic/eic_storage.py +5 -6
  254. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +0 -1
  255. sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +3 -2
  256. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +9 -3
  257. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +5 -3
  258. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +101 -17
  259. sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +38 -9
  260. sglang/srt/mem_cache/storage/nixl/nixl_utils.py +1 -1
  261. sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +17 -2
  262. sglang/srt/mem_cache/swa_radix_cache.py +92 -26
  263. sglang/srt/metrics/collector.py +31 -0
  264. sglang/srt/metrics/func_timer.py +1 -1
  265. sglang/srt/model_executor/cuda_graph_runner.py +43 -5
  266. sglang/srt/model_executor/forward_batch_info.py +71 -25
  267. sglang/srt/model_executor/model_runner.py +362 -270
  268. sglang/srt/model_executor/npu_graph_runner.py +2 -3
  269. sglang/srt/model_executor/piecewise_cuda_graph_runner.py +549 -0
  270. sglang/srt/model_loader/__init__.py +1 -1
  271. sglang/srt/model_loader/loader.py +424 -27
  272. sglang/srt/model_loader/utils.py +0 -1
  273. sglang/srt/model_loader/weight_utils.py +47 -28
  274. sglang/srt/models/apertus.py +2 -3
  275. sglang/srt/models/arcee.py +2 -2
  276. sglang/srt/models/bailing_moe.py +13 -52
  277. sglang/srt/models/bailing_moe_nextn.py +3 -4
  278. sglang/srt/models/bert.py +1 -1
  279. sglang/srt/models/deepseek_nextn.py +19 -3
  280. sglang/srt/models/deepseek_ocr.py +1516 -0
  281. sglang/srt/models/deepseek_v2.py +418 -140
  282. sglang/srt/models/dots_ocr.py +0 -2
  283. sglang/srt/models/dots_vlm.py +0 -1
  284. sglang/srt/models/dots_vlm_vit.py +1 -1
  285. sglang/srt/models/falcon_h1.py +13 -19
  286. sglang/srt/models/gemma3_mm.py +16 -0
  287. sglang/srt/models/gemma3n_mm.py +1 -2
  288. sglang/srt/models/glm4_moe.py +327 -382
  289. sglang/srt/models/glm4_moe_nextn.py +6 -16
  290. sglang/srt/models/glm4v.py +2 -1
  291. sglang/srt/models/glm4v_moe.py +32 -199
  292. sglang/srt/models/gpt_oss.py +5 -5
  293. sglang/srt/models/grok.py +10 -23
  294. sglang/srt/models/hunyuan.py +2 -7
  295. sglang/srt/models/interns1.py +0 -1
  296. sglang/srt/models/kimi_vl.py +1 -7
  297. sglang/srt/models/kimi_vl_moonvit.py +3 -1
  298. sglang/srt/models/llama.py +2 -2
  299. sglang/srt/models/llama_eagle3.py +1 -1
  300. sglang/srt/models/longcat_flash.py +5 -22
  301. sglang/srt/models/longcat_flash_nextn.py +3 -14
  302. sglang/srt/models/mimo.py +2 -13
  303. sglang/srt/models/mimo_mtp.py +1 -2
  304. sglang/srt/models/minicpmo.py +7 -5
  305. sglang/srt/models/minimax_m2.py +922 -0
  306. sglang/srt/models/mixtral.py +1 -4
  307. sglang/srt/models/mllama.py +1 -1
  308. sglang/srt/models/mllama4.py +13 -3
  309. sglang/srt/models/nemotron_h.py +511 -0
  310. sglang/srt/models/nvila.py +355 -0
  311. sglang/srt/models/nvila_lite.py +184 -0
  312. sglang/srt/models/olmo2.py +31 -4
  313. sglang/srt/models/opt.py +5 -5
  314. sglang/srt/models/phi.py +1 -1
  315. sglang/srt/models/phi4mm.py +1 -1
  316. sglang/srt/models/phimoe.py +0 -1
  317. sglang/srt/models/pixtral.py +0 -3
  318. sglang/srt/models/points_v15_chat.py +186 -0
  319. sglang/srt/models/qwen.py +0 -1
  320. sglang/srt/models/qwen2.py +22 -1
  321. sglang/srt/models/qwen2_5_vl.py +3 -3
  322. sglang/srt/models/qwen2_audio.py +2 -15
  323. sglang/srt/models/qwen2_moe.py +15 -12
  324. sglang/srt/models/qwen2_vl.py +5 -2
  325. sglang/srt/models/qwen3.py +34 -4
  326. sglang/srt/models/qwen3_moe.py +19 -37
  327. sglang/srt/models/qwen3_next.py +7 -12
  328. sglang/srt/models/qwen3_next_mtp.py +3 -4
  329. sglang/srt/models/qwen3_omni_moe.py +661 -0
  330. sglang/srt/models/qwen3_vl.py +37 -33
  331. sglang/srt/models/qwen3_vl_moe.py +57 -185
  332. sglang/srt/models/roberta.py +55 -3
  333. sglang/srt/models/sarashina2_vision.py +0 -1
  334. sglang/srt/models/step3_vl.py +3 -5
  335. sglang/srt/models/utils.py +11 -1
  336. sglang/srt/multimodal/processors/base_processor.py +7 -2
  337. sglang/srt/multimodal/processors/deepseek_ocr.py +37 -0
  338. sglang/srt/multimodal/processors/deepseek_vl_v2.py +0 -3
  339. sglang/srt/multimodal/processors/dots_vlm.py +0 -1
  340. sglang/srt/multimodal/processors/glm4v.py +2 -6
  341. sglang/srt/multimodal/processors/internvl.py +0 -2
  342. sglang/srt/multimodal/processors/janus_pro.py +0 -1
  343. sglang/srt/multimodal/processors/mllama4.py +0 -8
  344. sglang/srt/multimodal/processors/{vila.py → nvila.py} +32 -24
  345. sglang/srt/multimodal/processors/phi4mm.py +0 -1
  346. sglang/srt/multimodal/processors/points_v15_chat.py +52 -0
  347. sglang/srt/multimodal/processors/qwen_vl.py +75 -16
  348. sglang/srt/multimodal/processors/step3_vl.py +1 -1
  349. sglang/srt/parser/conversation.py +41 -0
  350. sglang/srt/parser/reasoning_parser.py +28 -2
  351. sglang/srt/sampling/custom_logit_processor.py +77 -2
  352. sglang/srt/sampling/sampling_batch_info.py +17 -22
  353. sglang/srt/sampling/sampling_params.py +70 -2
  354. sglang/srt/server_args.py +846 -163
  355. sglang/srt/server_args_config_parser.py +1 -1
  356. sglang/srt/single_batch_overlap.py +36 -31
  357. sglang/srt/speculative/base_spec_worker.py +34 -0
  358. sglang/srt/speculative/draft_utils.py +226 -0
  359. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +24 -7
  360. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +23 -2
  361. sglang/srt/speculative/eagle_info.py +57 -18
  362. sglang/srt/speculative/eagle_info_v2.py +458 -0
  363. sglang/srt/speculative/eagle_utils.py +138 -0
  364. sglang/srt/speculative/eagle_worker.py +83 -280
  365. sglang/srt/speculative/eagle_worker_v2.py +702 -0
  366. sglang/srt/speculative/{ngram_utils.py → ngram_info.py} +14 -9
  367. sglang/srt/speculative/ngram_worker.py +12 -11
  368. sglang/srt/speculative/spec_info.py +2 -0
  369. sglang/srt/speculative/spec_utils.py +38 -3
  370. sglang/srt/speculative/standalone_worker.py +4 -14
  371. sglang/srt/tokenizer/tiktoken_tokenizer.py +2 -2
  372. sglang/srt/two_batch_overlap.py +28 -14
  373. sglang/srt/utils/__init__.py +1 -1
  374. sglang/srt/{bench_utils.py → utils/bench_utils.py} +4 -2
  375. sglang/srt/utils/common.py +272 -82
  376. sglang/srt/utils/hf_transformers_utils.py +44 -17
  377. sglang/srt/{host_shared_memory.py → utils/host_shared_memory.py} +0 -1
  378. sglang/srt/{offloader.py → utils/offloader.py} +4 -4
  379. sglang/srt/utils/profile_merger.py +199 -0
  380. sglang/test/attention/test_flashattn_backend.py +1 -1
  381. sglang/test/attention/test_flashattn_mla_backend.py +0 -1
  382. sglang/test/attention/test_prefix_chunk_info.py +0 -2
  383. sglang/test/attention/test_trtllm_mla_backend.py +221 -53
  384. sglang/test/few_shot_gsm8k_engine.py +2 -4
  385. sglang/test/kit_matched_stop.py +157 -0
  386. sglang/test/longbench_v2/__init__.py +1 -0
  387. sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
  388. sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
  389. sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
  390. sglang/test/run_eval.py +41 -0
  391. sglang/test/runners.py +2 -0
  392. sglang/test/send_one.py +42 -7
  393. sglang/test/simple_eval_common.py +3 -0
  394. sglang/test/simple_eval_gpqa.py +0 -1
  395. sglang/test/simple_eval_humaneval.py +0 -3
  396. sglang/test/simple_eval_longbench_v2.py +344 -0
  397. sglang/test/test_block_fp8.py +1 -2
  398. sglang/test/test_block_fp8_deep_gemm_blackwell.py +0 -1
  399. sglang/test/test_cutlass_moe.py +1 -2
  400. sglang/test/test_cutlass_w4a8_moe.py +10 -20
  401. sglang/test/test_deterministic.py +463 -107
  402. sglang/test/test_deterministic_utils.py +74 -0
  403. sglang/test/test_disaggregation_utils.py +81 -0
  404. sglang/test/test_marlin_moe.py +0 -1
  405. sglang/test/test_utils.py +85 -20
  406. sglang/version.py +1 -1
  407. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/METADATA +48 -35
  408. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/RECORD +414 -350
  409. sglang/srt/layers/attention/mamba/mamba_utils.py +0 -81
  410. sglang/srt/managers/tp_worker_overlap_thread.py +0 -311
  411. sglang/srt/models/vila.py +0 -306
  412. sglang/srt/speculative/build_eagle_tree.py +0 -427
  413. sglang/test/test_block_fp8_ep.py +0 -358
  414. /sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/__init__.py +0 -0
  415. /sglang/srt/{aio_rwlock.py → utils/aio_rwlock.py} +0 -0
  416. /sglang/srt/{torch_memory_saver_adapter.py → utils/torch_memory_saver_adapter.py} +0 -0
  417. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/WHEEL +0 -0
  418. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/licenses/LICENSE +0 -0
  419. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/top_level.txt +0 -0
@@ -30,8 +30,6 @@ import time
30
30
  from typing import AsyncIterator, Dict, Iterator, List, Optional, Tuple, Union
31
31
 
32
32
  import zmq
33
- import zmq.asyncio
34
- from PIL.Image import Image
35
33
 
36
34
  from sglang.srt.tracing.trace import process_tracing_init, trace_set_thread_info
37
35
 
@@ -61,6 +59,7 @@ from sglang.srt.managers.io_struct import (
61
59
  UnloadLoRAAdapterReqInput,
62
60
  UpdateWeightFromDiskReqInput,
63
61
  UpdateWeightsFromDistributedReqInput,
62
+ UpdateWeightsFromIPCReqInput,
64
63
  UpdateWeightsFromTensorReqInput,
65
64
  )
66
65
  from sglang.srt.managers.multi_tokenizer_mixin import MultiTokenizerRouter
@@ -68,7 +67,6 @@ from sglang.srt.managers.scheduler import run_scheduler_process
68
67
  from sglang.srt.managers.template_manager import TemplateManager
69
68
  from sglang.srt.managers.tokenizer_manager import TokenizerManager
70
69
  from sglang.srt.server_args import PortArgs, ServerArgs
71
- from sglang.srt.torch_memory_saver_adapter import TorchMemorySaverAdapter
72
70
  from sglang.srt.utils import (
73
71
  MultiprocessingSerializer,
74
72
  assert_pkg_version,
@@ -78,10 +76,12 @@ from sglang.srt.utils import (
78
76
  is_cuda,
79
77
  kill_process_tree,
80
78
  launch_dummy_health_check_server,
79
+ maybe_reindex_device_id,
81
80
  prepare_model_and_tokenizer,
82
81
  set_prometheus_multiproc_dir,
83
82
  set_ulimit,
84
83
  )
84
+ from sglang.srt.utils.torch_memory_saver_adapter import TorchMemorySaverAdapter
85
85
  from sglang.version import __version__
86
86
 
87
87
  logger = logging.getLogger(__name__)
@@ -101,7 +101,7 @@ class Engine(EngineBase):
101
101
 
102
102
  Note:
103
103
  1. The HTTP server, Engine, and TokenizerManager all run in the main process.
104
- 2. Inter-process communication (IPC) is handled via the ZMQ library, with each process using a different port.
104
+ 2. Inter-process communication is done through IPC (each process uses a different port) via the ZMQ library.
105
105
  """
106
106
 
107
107
  def __init__(self, **kwargs):
@@ -109,6 +109,8 @@ class Engine(EngineBase):
109
109
  The arguments of this function is the same as `sglang/srt/server_args.py::ServerArgs`.
110
110
  Please refer to `ServerArgs` for the documentation.
111
111
  """
112
+
113
+ # Parse server_args
112
114
  if "server_args" in kwargs:
113
115
  # Directly load server_args
114
116
  server_args = kwargs["server_args"]
@@ -118,35 +120,40 @@ class Engine(EngineBase):
118
120
  # Do not print logs by default
119
121
  kwargs["log_level"] = "error"
120
122
  server_args = ServerArgs(**kwargs)
123
+ self.server_args = server_args
124
+ logger.info(f"{server_args=}")
121
125
 
122
126
  # Shutdown the subprocesses automatically when the program exits
123
127
  atexit.register(self.shutdown)
124
128
 
125
- # Allocate ports for inter-process communications
126
- self.port_args = PortArgs.init_new(server_args)
127
- logger.info(f"{server_args=}")
128
-
129
129
  # Launch subprocesses
130
- tokenizer_manager, template_manager, scheduler_info = _launch_subprocesses(
131
- server_args=server_args,
132
- port_args=self.port_args,
130
+ tokenizer_manager, template_manager, scheduler_info, port_args = (
131
+ _launch_subprocesses(server_args=server_args)
133
132
  )
134
- self.server_args = server_args
135
133
  self.tokenizer_manager = tokenizer_manager
136
134
  self.template_manager = template_manager
137
135
  self.scheduler_info = scheduler_info
136
+ self.port_args = port_args
138
137
 
138
+ # Initialize ZMQ sockets
139
139
  context = zmq.Context(2)
140
140
  self.send_to_rpc = get_zmq_socket(
141
141
  context, zmq.DEALER, self.port_args.rpc_ipc_name, True
142
142
  )
143
143
 
144
+ # Enable tracing
144
145
  if server_args.enable_trace:
145
146
  process_tracing_init(server_args.oltp_traces_endpoint, "sglang")
146
147
  if server_args.disaggregation_mode == "null":
147
148
  thread_label = "Tokenizer"
148
149
  trace_set_thread_info(thread_label)
149
150
 
151
+ try:
152
+ self.loop = asyncio.get_running_loop()
153
+ except RuntimeError:
154
+ self.loop = asyncio.new_event_loop()
155
+ asyncio.set_event_loop(self.loop)
156
+
150
157
  def generate(
151
158
  self,
152
159
  # The input prompt. It can be a single prompt or a batch of prompts.
@@ -210,7 +217,6 @@ class Engine(EngineBase):
210
217
  bootstrap_room=bootstrap_room,
211
218
  data_parallel_rank=data_parallel_rank,
212
219
  )
213
- loop = asyncio.get_event_loop()
214
220
  generator = self.tokenizer_manager.generate_request(obj, None)
215
221
 
216
222
  if stream:
@@ -218,14 +224,14 @@ class Engine(EngineBase):
218
224
  def generator_wrapper():
219
225
  while True:
220
226
  try:
221
- chunk = loop.run_until_complete(generator.__anext__())
227
+ chunk = self.loop.run_until_complete(generator.__anext__())
222
228
  yield chunk
223
229
  except StopAsyncIteration:
224
230
  break
225
231
 
226
232
  return generator_wrapper()
227
233
  else:
228
- ret = loop.run_until_complete(generator.__anext__())
234
+ ret = self.loop.run_until_complete(generator.__anext__())
229
235
  return ret
230
236
 
231
237
  async def async_generate(
@@ -317,9 +323,8 @@ class Engine(EngineBase):
317
323
  audio_data=audio_data,
318
324
  video_data=video_data,
319
325
  )
320
- loop = asyncio.get_event_loop()
321
326
  generator = self.tokenizer_manager.generate_request(obj, None)
322
- ret = loop.run_until_complete(generator.__anext__())
327
+ ret = self.loop.run_until_complete(generator.__anext__())
323
328
  return ret
324
329
 
325
330
  async def async_encode(
@@ -353,9 +358,8 @@ class Engine(EngineBase):
353
358
  Please refer to `EmbeddingReqInput` for the documentation.
354
359
  """
355
360
  obj = EmbeddingReqInput(text=prompt, is_cross_encoder_request=True)
356
- loop = asyncio.get_event_loop()
357
361
  generator = self.tokenizer_manager.generate_request(obj, None)
358
- ret = loop.run_until_complete(generator.__anext__())
362
+ ret = self.loop.run_until_complete(generator.__anext__())
359
363
  return ret
360
364
 
361
365
  def shutdown(self):
@@ -370,38 +374,31 @@ class Engine(EngineBase):
370
374
  return False
371
375
 
372
376
  def flush_cache(self):
373
- loop = asyncio.get_event_loop()
374
- return loop.run_until_complete(self.tokenizer_manager.flush_cache())
377
+ return self.loop.run_until_complete(self.tokenizer_manager.flush_cache())
375
378
 
376
379
  def start_profile(self, **kwargs):
377
- loop = asyncio.get_event_loop()
378
- loop.run_until_complete(self.tokenizer_manager.start_profile(**kwargs))
380
+ self.loop.run_until_complete(self.tokenizer_manager.start_profile(**kwargs))
379
381
 
380
382
  def stop_profile(self):
381
- loop = asyncio.get_event_loop()
382
- loop.run_until_complete(self.tokenizer_manager.stop_profile())
383
+ self.loop.run_until_complete(self.tokenizer_manager.stop_profile())
383
384
 
384
385
  def start_expert_distribution_record(self):
385
- loop = asyncio.get_event_loop()
386
- loop.run_until_complete(
386
+ self.loop.run_until_complete(
387
387
  self.tokenizer_manager.start_expert_distribution_record()
388
388
  )
389
389
 
390
390
  def stop_expert_distribution_record(self):
391
- loop = asyncio.get_event_loop()
392
- loop.run_until_complete(
391
+ self.loop.run_until_complete(
393
392
  self.tokenizer_manager.stop_expert_distribution_record()
394
393
  )
395
394
 
396
395
  def dump_expert_distribution_record(self):
397
- loop = asyncio.get_event_loop()
398
- loop.run_until_complete(
396
+ self.loop.run_until_complete(
399
397
  self.tokenizer_manager.dump_expert_distribution_record()
400
398
  )
401
399
 
402
400
  def get_server_info(self):
403
- loop = asyncio.get_event_loop()
404
- internal_states = loop.run_until_complete(
401
+ internal_states = self.loop.run_until_complete(
405
402
  self.tokenizer_manager.get_internal_state()
406
403
  )
407
404
  return {
@@ -429,8 +426,7 @@ class Engine(EngineBase):
429
426
  group_name=group_name,
430
427
  backend=backend,
431
428
  )
432
- loop = asyncio.get_event_loop()
433
- return loop.run_until_complete(
429
+ return self.loop.run_until_complete(
434
430
  self.tokenizer_manager.init_weights_update_group(obj, None)
435
431
  )
436
432
 
@@ -442,8 +438,7 @@ class Engine(EngineBase):
442
438
  obj = DestroyWeightsUpdateGroupReqInput(
443
439
  group_name=group_name,
444
440
  )
445
- loop = asyncio.get_event_loop()
446
- return loop.run_until_complete(
441
+ return self.loop.run_until_complete(
447
442
  self.tokenizer_manager.destroy_weights_update_group(obj, None)
448
443
  )
449
444
 
@@ -463,8 +458,7 @@ class Engine(EngineBase):
463
458
  group_name=group_name,
464
459
  flush_cache=flush_cache,
465
460
  )
466
- loop = asyncio.get_event_loop()
467
- return loop.run_until_complete(
461
+ return self.loop.run_until_complete(
468
462
  self.tokenizer_manager.update_weights_from_distributed(obj, None)
469
463
  )
470
464
 
@@ -488,9 +482,7 @@ class Engine(EngineBase):
488
482
  load_format=load_format,
489
483
  flush_cache=flush_cache,
490
484
  )
491
- loop = asyncio.get_event_loop()
492
-
493
- return loop.run_until_complete(
485
+ return self.loop.run_until_complete(
494
486
  self.tokenizer_manager.update_weights_from_tensor(obj, None)
495
487
  )
496
488
 
@@ -510,16 +502,14 @@ class Engine(EngineBase):
510
502
  load_format=load_format,
511
503
  )
512
504
 
513
- loop = asyncio.get_event_loop()
514
- return loop.run_until_complete(
505
+ return self.loop.run_until_complete(
515
506
  self.tokenizer_manager.update_weights_from_disk(obj, None)
516
507
  )
517
508
 
518
509
  def get_weights_by_name(self, name: str, truncate_size: int = 100):
519
510
  """Get weights by parameter name."""
520
511
  obj = GetWeightsByNameReqInput(name=name, truncate_size=truncate_size)
521
- loop = asyncio.get_event_loop()
522
- return loop.run_until_complete(
512
+ return self.loop.run_until_complete(
523
513
  self.tokenizer_manager.get_weights_by_name(obj, None)
524
514
  )
525
515
 
@@ -532,8 +522,7 @@ class Engine(EngineBase):
532
522
  pinned=pinned,
533
523
  )
534
524
 
535
- loop = asyncio.get_event_loop()
536
- return loop.run_until_complete(
525
+ return self.loop.run_until_complete(
537
526
  self.tokenizer_manager.load_lora_adapter(obj, None)
538
527
  )
539
528
 
@@ -542,22 +531,19 @@ class Engine(EngineBase):
542
531
 
543
532
  obj = UnloadLoRAAdapterReqInput(lora_name=lora_name)
544
533
 
545
- loop = asyncio.get_event_loop()
546
- return loop.run_until_complete(
534
+ return self.loop.run_until_complete(
547
535
  self.tokenizer_manager.unload_lora_adapter(obj, None)
548
536
  )
549
537
 
550
538
  def release_memory_occupation(self, tags: Optional[List[str]] = None):
551
539
  obj = ReleaseMemoryOccupationReqInput(tags=tags)
552
- loop = asyncio.get_event_loop()
553
- return loop.run_until_complete(
540
+ return self.loop.run_until_complete(
554
541
  self.tokenizer_manager.release_memory_occupation(obj, None)
555
542
  )
556
543
 
557
544
  def resume_memory_occupation(self, tags: Optional[List[str]] = None):
558
545
  obj = ResumeMemoryOccupationReqInput(tags=tags)
559
- loop = asyncio.get_event_loop()
560
- return loop.run_until_complete(
546
+ return self.loop.run_until_complete(
561
547
  self.tokenizer_manager.resume_memory_occupation(obj, None)
562
548
  )
563
549
 
@@ -574,8 +560,7 @@ class Engine(EngineBase):
574
560
  collection.
575
561
  """
576
562
 
577
- loop = asyncio.get_event_loop()
578
- loop.run_until_complete(self.tokenizer_manager.freeze_gc())
563
+ self.loop.run_until_complete(self.tokenizer_manager.freeze_gc())
579
564
 
580
565
  """
581
566
  Execute an RPC call on all scheduler processes.
@@ -633,8 +618,7 @@ class Engine(EngineBase):
633
618
  ValueError: If query is not provided, or if items is not provided,
634
619
  or if token IDs are out of vocabulary, or if logprobs are not available for the specified tokens.
635
620
  """
636
- loop = asyncio.get_event_loop()
637
- return loop.run_until_complete(
621
+ return self.loop.run_until_complete(
638
622
  self.tokenizer_manager.score_request(
639
623
  query=query,
640
624
  items=items,
@@ -667,6 +651,21 @@ class Engine(EngineBase):
667
651
  request=None,
668
652
  )
669
653
 
654
+ def update_weights_from_ipc(
655
+ self,
656
+ zmq_handles: Dict[str, str],
657
+ flush_cache: bool = True,
658
+ ):
659
+ """Update weights from IPC for checkpoint-engine integration."""
660
+ obj = UpdateWeightsFromIPCReqInput(
661
+ zmq_handles=zmq_handles,
662
+ flush_cache=flush_cache,
663
+ )
664
+ loop = asyncio.get_event_loop()
665
+ return loop.run_until_complete(
666
+ self.tokenizer_manager.update_weights_from_ipc(obj, None)
667
+ )
668
+
670
669
 
671
670
  def _set_envs_and_config(server_args: ServerArgs):
672
671
  # Set global environments
@@ -674,15 +673,17 @@ def _set_envs_and_config(server_args: ServerArgs):
674
673
  os.environ["NCCL_CUMEM_ENABLE"] = str(int(server_args.enable_symm_mem))
675
674
  if not server_args.enable_symm_mem:
676
675
  os.environ["NCCL_NVLS_ENABLE"] = str(int(server_args.enable_nccl_nvls))
677
- os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "4"
676
+ os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "8"
678
677
  os.environ["CUDA_MODULE_LOADING"] = "AUTO"
679
- # flashinfer uses this environment variable for various kernels from MoE to quant kernels
678
+
680
679
  if os.environ.get("TRTLLM_ENABLE_PDL", "1") != "0":
680
+ # flashinfer uses this environment variable for various kernels from MoE to quant kernels
681
681
  os.environ["TRTLLM_ENABLE_PDL"] = "1"
682
682
 
683
683
  if os.environ.get("CUTE_DSL_LOG_LEVEL") is None:
684
684
  # Default to warning level, to avoid too many logs
685
685
  os.environ["CUTE_DSL_LOG_LEVEL"] = "30"
686
+
686
687
  if os.environ.get("CUTE_DSL_LOG_TO_CONSOLE") is None:
687
688
  # Need to set log to console, otherwise the log level won't take effect
688
689
  os.environ["CUTE_DSL_LOG_TO_CONSOLE"] = "1"
@@ -703,7 +704,7 @@ def _set_envs_and_config(server_args: ServerArgs):
703
704
  if server_args.attention_backend == "flashinfer":
704
705
  assert_pkg_version(
705
706
  "flashinfer_python",
706
- "0.4.0rc3",
707
+ "0.4.1",
707
708
  "Please uninstall the old version and "
708
709
  "reinstall the latest version by following the instructions "
709
710
  "at https://docs.flashinfer.ai/installation.html.",
@@ -711,7 +712,7 @@ def _set_envs_and_config(server_args: ServerArgs):
711
712
  if _is_cuda and not get_bool_env_var("SGLANG_SKIP_SGL_KERNEL_VERSION_CHECK"):
712
713
  assert_pkg_version(
713
714
  "sgl-kernel",
714
- "0.3.14",
715
+ "0.3.16.post4",
715
716
  "Please reinstall the latest version with `pip install sgl-kernel --force-reinstall`",
716
717
  )
717
718
 
@@ -801,22 +802,24 @@ def _launch_subprocesses(
801
802
  + (tp_rank % tp_size_per_node) * server_args.gpu_id_step
802
803
  )
803
804
  moe_ep_rank = tp_rank // (server_args.tp_size // server_args.ep_size)
804
- proc = mp.Process(
805
- target=run_scheduler_process,
806
- args=(
807
- server_args,
808
- port_args,
809
- gpu_id,
810
- tp_rank,
811
- moe_ep_rank,
812
- pp_rank,
813
- None,
814
- writer,
815
- ),
816
- )
817
805
 
818
- with memory_saver_adapter.configure_subprocess():
819
- proc.start()
806
+ with maybe_reindex_device_id(gpu_id) as gpu_id:
807
+ proc = mp.Process(
808
+ target=run_scheduler_process,
809
+ args=(
810
+ server_args,
811
+ port_args,
812
+ gpu_id,
813
+ tp_rank,
814
+ moe_ep_rank,
815
+ pp_rank,
816
+ None,
817
+ writer,
818
+ ),
819
+ )
820
+ with memory_saver_adapter.configure_subprocess():
821
+ proc.start()
822
+
820
823
  scheduler_procs.append(proc)
821
824
  scheduler_pipe_readers.append(reader)
822
825
  else:
@@ -840,7 +843,7 @@ def _launch_subprocesses(
840
843
 
841
844
  if os.getenv("SGLANG_BLOCK_NONZERO_RANK_CHILDREN") == "0":
842
845
  # When using `Engine` as a Python API, we don't want to block here.
843
- return None, None, None
846
+ return None, None, None, port_args
844
847
 
845
848
  launch_dummy_health_check_server(
846
849
  server_args.host, server_args.port, server_args.enable_metrics
@@ -851,7 +854,7 @@ def _launch_subprocesses(
851
854
  logger.error(
852
855
  f"Scheduler or DataParallelController {proc.pid} terminated with {proc.exitcode}"
853
856
  )
854
- return None, None, None
857
+ return None, None, None, port_args
855
858
 
856
859
  # Launch detokenizer process
857
860
  detoken_proc = mp.Process(
@@ -897,4 +900,4 @@ def _launch_subprocesses(
897
900
 
898
901
  tokenizer_manager.max_req_input_len = scheduler_info["max_req_input_len"]
899
902
 
900
- return tokenizer_manager, template_manager, scheduler_info
903
+ return tokenizer_manager, template_manager, scheduler_info, port_args