sglang 0.5.3rc2__py3-none-any.whl → 0.5.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (408)
  1. sglang/bench_one_batch.py +47 -28
  2. sglang/bench_one_batch_server.py +41 -25
  3. sglang/bench_serving.py +330 -156
  4. sglang/check_env.py +1 -1
  5. sglang/compile_deep_gemm.py +6 -2
  6. sglang/global_config.py +1 -25
  7. sglang/lang/api.py +6 -0
  8. sglang/lang/interpreter.py +1 -0
  9. sglang/lang/ir.py +13 -0
  10. sglang/launch_server.py +8 -15
  11. sglang/profiler.py +18 -1
  12. sglang/srt/_custom_ops.py +1 -1
  13. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +4 -6
  14. sglang/srt/checkpoint_engine/checkpoint_engine_worker.py +142 -0
  15. sglang/srt/compilation/backend.py +437 -0
  16. sglang/srt/compilation/compilation_config.py +20 -0
  17. sglang/srt/compilation/compilation_counter.py +47 -0
  18. sglang/srt/compilation/compile.py +210 -0
  19. sglang/srt/compilation/compiler_interface.py +503 -0
  20. sglang/srt/compilation/cuda_piecewise_backend.py +228 -0
  21. sglang/srt/compilation/fix_functionalization.py +134 -0
  22. sglang/srt/compilation/fx_utils.py +83 -0
  23. sglang/srt/compilation/inductor_pass.py +140 -0
  24. sglang/srt/compilation/pass_manager.py +66 -0
  25. sglang/srt/compilation/piecewise_context_manager.py +40 -0
  26. sglang/srt/compilation/weak_ref_tensor_jit.py +16 -0
  27. sglang/srt/configs/__init__.py +4 -0
  28. sglang/srt/configs/deepseek_ocr.py +262 -0
  29. sglang/srt/configs/deepseekvl2.py +194 -96
  30. sglang/srt/configs/dots_vlm.py +2 -7
  31. sglang/srt/configs/falcon_h1.py +13 -64
  32. sglang/srt/configs/load_config.py +25 -2
  33. sglang/srt/configs/mamba_utils.py +117 -0
  34. sglang/srt/configs/model_config.py +134 -23
  35. sglang/srt/configs/modelopt_config.py +30 -0
  36. sglang/srt/configs/nemotron_h.py +286 -0
  37. sglang/srt/configs/olmo3.py +105 -0
  38. sglang/srt/configs/points_v15_chat.py +29 -0
  39. sglang/srt/configs/qwen3_next.py +11 -47
  40. sglang/srt/configs/qwen3_omni.py +613 -0
  41. sglang/srt/configs/qwen3_vl.py +0 -10
  42. sglang/srt/connector/remote_instance.py +1 -1
  43. sglang/srt/constrained/base_grammar_backend.py +5 -1
  44. sglang/srt/constrained/llguidance_backend.py +5 -0
  45. sglang/srt/constrained/outlines_backend.py +1 -1
  46. sglang/srt/constrained/reasoner_grammar_backend.py +9 -6
  47. sglang/srt/constrained/utils.py +12 -0
  48. sglang/srt/constrained/xgrammar_backend.py +20 -11
  49. sglang/srt/disaggregation/ascend/transfer_engine.py +1 -1
  50. sglang/srt/disaggregation/base/conn.py +17 -4
  51. sglang/srt/disaggregation/common/conn.py +4 -2
  52. sglang/srt/disaggregation/decode.py +123 -31
  53. sglang/srt/disaggregation/decode_kvcache_offload_manager.py +1 -1
  54. sglang/srt/disaggregation/fake/conn.py +11 -3
  55. sglang/srt/disaggregation/mooncake/conn.py +157 -19
  56. sglang/srt/disaggregation/nixl/conn.py +69 -24
  57. sglang/srt/disaggregation/prefill.py +96 -270
  58. sglang/srt/distributed/device_communicators/all_reduce_utils.py +4 -4
  59. sglang/srt/distributed/device_communicators/custom_all_reduce.py +6 -6
  60. sglang/srt/distributed/device_communicators/pymscclpp.py +2 -2
  61. sglang/srt/distributed/device_communicators/pynccl.py +24 -12
  62. sglang/srt/distributed/device_communicators/pynccl_allocator.py +2 -2
  63. sglang/srt/distributed/device_communicators/symm_mem.py +1 -1
  64. sglang/srt/distributed/naive_distributed.py +5 -4
  65. sglang/srt/distributed/parallel_state.py +70 -19
  66. sglang/srt/elastic_ep/elastic_ep.py +74 -0
  67. sglang/srt/entrypoints/context.py +3 -2
  68. sglang/srt/entrypoints/engine.py +66 -66
  69. sglang/srt/entrypoints/grpc_server.py +431 -234
  70. sglang/srt/entrypoints/harmony_utils.py +2 -2
  71. sglang/srt/entrypoints/http_server.py +120 -8
  72. sglang/srt/entrypoints/http_server_engine.py +1 -7
  73. sglang/srt/entrypoints/openai/protocol.py +225 -37
  74. sglang/srt/entrypoints/openai/serving_base.py +49 -2
  75. sglang/srt/entrypoints/openai/serving_chat.py +29 -74
  76. sglang/srt/entrypoints/openai/serving_classify.py +204 -0
  77. sglang/srt/entrypoints/openai/serving_completions.py +15 -1
  78. sglang/srt/entrypoints/openai/serving_responses.py +5 -2
  79. sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
  80. sglang/srt/environ.py +42 -4
  81. sglang/srt/eplb/eplb_algorithms/__init__.py +18 -1
  82. sglang/srt/eplb/eplb_algorithms/deepseek.py +0 -2
  83. sglang/srt/eplb/eplb_algorithms/elasticity_aware.py +87 -0
  84. sglang/srt/eplb/expert_distribution.py +3 -4
  85. sglang/srt/eplb/expert_location_dispatch.py +2 -2
  86. sglang/srt/eplb/expert_location_updater.py +2 -2
  87. sglang/srt/function_call/base_format_detector.py +17 -18
  88. sglang/srt/function_call/function_call_parser.py +18 -14
  89. sglang/srt/function_call/glm4_moe_detector.py +1 -5
  90. sglang/srt/function_call/gpt_oss_detector.py +1 -1
  91. sglang/srt/function_call/json_array_parser.py +0 -2
  92. sglang/srt/function_call/utils.py +2 -2
  93. sglang/srt/grpc/compile_proto.py +3 -3
  94. sglang/srt/{entrypoints → grpc}/grpc_request_manager.py +112 -52
  95. sglang/srt/grpc/health_servicer.py +189 -0
  96. sglang/srt/grpc/scheduler_launcher.py +181 -0
  97. sglang/srt/grpc/sglang_scheduler_pb2.py +78 -70
  98. sglang/srt/grpc/sglang_scheduler_pb2.pyi +66 -10
  99. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +89 -1
  100. sglang/srt/layers/activation.py +4 -1
  101. sglang/srt/layers/attention/aiter_backend.py +3 -3
  102. sglang/srt/layers/attention/ascend_backend.py +17 -1
  103. sglang/srt/layers/attention/attention_registry.py +43 -23
  104. sglang/srt/layers/attention/base_attn_backend.py +20 -1
  105. sglang/srt/layers/attention/double_sparsity_backend.py +2 -2
  106. sglang/srt/layers/attention/fla/chunk.py +0 -1
  107. sglang/srt/layers/attention/fla/chunk_o.py +1 -1
  108. sglang/srt/layers/attention/fla/index.py +0 -2
  109. sglang/srt/layers/attention/fla/layernorm_gated.py +50 -32
  110. sglang/srt/layers/attention/fla/utils.py +0 -3
  111. sglang/srt/layers/attention/fla/wy_fast.py +0 -2
  112. sglang/srt/layers/attention/flashattention_backend.py +12 -8
  113. sglang/srt/layers/attention/flashinfer_backend.py +248 -21
  114. sglang/srt/layers/attention/flashinfer_mla_backend.py +20 -18
  115. sglang/srt/layers/attention/flashmla_backend.py +2 -2
  116. sglang/srt/layers/attention/hybrid_attn_backend.py +1 -1
  117. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +165 -62
  118. sglang/srt/layers/attention/intel_amx_backend.py +1 -1
  119. sglang/srt/layers/attention/mamba/causal_conv1d.py +1 -1
  120. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +9 -5
  121. sglang/srt/layers/attention/mamba/mamba.py +189 -241
  122. sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
  123. sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
  124. sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +0 -50
  125. sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +0 -60
  126. sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +0 -111
  127. sglang/srt/layers/attention/mamba/ops/ssd_combined.py +0 -1
  128. sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +0 -11
  129. sglang/srt/layers/attention/npu_ops/mla_preprocess.py +1 -1
  130. sglang/srt/layers/attention/nsa/nsa_indexer.py +40 -83
  131. sglang/srt/layers/attention/nsa/triton_kernel.py +136 -0
  132. sglang/srt/layers/attention/nsa/utils.py +0 -1
  133. sglang/srt/layers/attention/nsa_backend.py +404 -90
  134. sglang/srt/layers/attention/triton_backend.py +208 -34
  135. sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +2 -2
  136. sglang/srt/layers/attention/triton_ops/extend_attention.py +539 -44
  137. sglang/srt/layers/attention/trtllm_mha_backend.py +2 -2
  138. sglang/srt/layers/attention/trtllm_mla_backend.py +361 -30
  139. sglang/srt/layers/attention/utils.py +11 -7
  140. sglang/srt/layers/attention/vision.py +3 -3
  141. sglang/srt/layers/attention/xpu_backend.py +1028 -0
  142. sglang/srt/layers/communicator.py +11 -7
  143. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/compile_utils.py +4 -8
  144. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/configurer.py +4 -3
  145. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/entrypoint.py +3 -3
  146. sglang/srt/layers/dp_attention.py +17 -0
  147. sglang/srt/layers/layernorm.py +45 -15
  148. sglang/srt/layers/linear.py +9 -1
  149. sglang/srt/layers/logits_processor.py +147 -17
  150. sglang/srt/layers/modelopt_utils.py +11 -0
  151. sglang/srt/layers/moe/cutlass_moe.py +0 -2
  152. sglang/srt/layers/moe/cutlass_w4a8_moe.py +213 -21
  153. sglang/srt/layers/moe/ep_moe/kernels.py +35 -457
  154. sglang/srt/layers/moe/ep_moe/layer.py +119 -397
  155. sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +1 -1
  156. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  157. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_B200.json +146 -0
  158. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +11 -3
  159. sglang/srt/layers/moe/fused_moe_triton/layer.py +76 -70
  160. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +18 -42
  161. sglang/srt/layers/moe/moe_runner/deep_gemm.py +304 -0
  162. sglang/srt/layers/moe/moe_runner/runner.py +3 -0
  163. sglang/srt/layers/moe/moe_runner/triton.py +3 -1
  164. sglang/srt/layers/moe/rocm_moe_utils.py +0 -1
  165. sglang/srt/layers/moe/router.py +51 -15
  166. sglang/srt/layers/moe/token_dispatcher/__init__.py +10 -0
  167. sglang/srt/layers/moe/token_dispatcher/base.py +1 -1
  168. sglang/srt/layers/moe/token_dispatcher/deepep.py +110 -97
  169. sglang/srt/layers/moe/token_dispatcher/mooncake.py +386 -0
  170. sglang/srt/layers/moe/token_dispatcher/standard.py +46 -0
  171. sglang/srt/layers/moe/topk.py +3 -2
  172. sglang/srt/layers/moe/utils.py +17 -1
  173. sglang/srt/layers/quantization/__init__.py +2 -53
  174. sglang/srt/layers/quantization/awq.py +183 -6
  175. sglang/srt/layers/quantization/awq_triton.py +29 -0
  176. sglang/srt/layers/quantization/base_config.py +20 -1
  177. sglang/srt/layers/quantization/compressed_tensors/__init__.py +7 -0
  178. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +20 -49
  179. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +421 -70
  180. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +3 -0
  181. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +4 -22
  182. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +339 -0
  183. sglang/srt/layers/quantization/fp8.py +84 -18
  184. sglang/srt/layers/quantization/fp8_kernel.py +55 -10
  185. sglang/srt/layers/quantization/fp8_utils.py +42 -14
  186. sglang/srt/layers/quantization/fpgemm_fp8.py +2 -3
  187. sglang/srt/layers/quantization/gptq.py +0 -1
  188. sglang/srt/layers/quantization/int8_kernel.py +18 -2
  189. sglang/srt/layers/quantization/marlin_utils.py +12 -0
  190. sglang/srt/layers/quantization/modelopt_quant.py +125 -100
  191. sglang/srt/layers/quantization/mxfp4.py +5 -30
  192. sglang/srt/layers/quantization/petit.py +1 -1
  193. sglang/srt/layers/quantization/quark/quark.py +3 -1
  194. sglang/srt/layers/quantization/quark/quark_moe.py +3 -3
  195. sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +0 -7
  196. sglang/srt/layers/quantization/unquant.py +1 -4
  197. sglang/srt/layers/quantization/utils.py +0 -1
  198. sglang/srt/layers/quantization/w4afp8.py +51 -20
  199. sglang/srt/layers/quantization/w8a8_int8.py +30 -24
  200. sglang/srt/layers/radix_attention.py +59 -9
  201. sglang/srt/layers/rotary_embedding.py +673 -16
  202. sglang/srt/layers/sampler.py +36 -16
  203. sglang/srt/layers/sparse_pooler.py +98 -0
  204. sglang/srt/layers/utils.py +0 -1
  205. sglang/srt/layers/vocab_parallel_embedding.py +4 -1
  206. sglang/srt/lora/backend/triton_backend.py +0 -1
  207. sglang/srt/lora/eviction_policy.py +139 -0
  208. sglang/srt/lora/lora_manager.py +24 -9
  209. sglang/srt/lora/lora_registry.py +1 -1
  210. sglang/srt/lora/mem_pool.py +40 -16
  211. sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +1 -1
  212. sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +4 -2
  213. sglang/srt/managers/cache_controller.py +48 -17
  214. sglang/srt/managers/data_parallel_controller.py +146 -42
  215. sglang/srt/managers/detokenizer_manager.py +40 -13
  216. sglang/srt/managers/io_struct.py +66 -16
  217. sglang/srt/managers/mm_utils.py +20 -18
  218. sglang/srt/managers/multi_tokenizer_mixin.py +66 -81
  219. sglang/srt/managers/overlap_utils.py +96 -19
  220. sglang/srt/managers/schedule_batch.py +241 -511
  221. sglang/srt/managers/schedule_policy.py +15 -2
  222. sglang/srt/managers/scheduler.py +399 -499
  223. sglang/srt/managers/scheduler_metrics_mixin.py +55 -8
  224. sglang/srt/managers/scheduler_output_processor_mixin.py +317 -111
  225. sglang/srt/managers/scheduler_pp_mixin.py +341 -0
  226. sglang/srt/managers/scheduler_profiler_mixin.py +57 -10
  227. sglang/srt/managers/scheduler_runtime_checker_mixin.py +217 -0
  228. sglang/srt/managers/scheduler_update_weights_mixin.py +33 -14
  229. sglang/srt/managers/tokenizer_communicator_mixin.py +71 -55
  230. sglang/srt/managers/tokenizer_manager.py +378 -90
  231. sglang/srt/managers/tp_worker.py +212 -161
  232. sglang/srt/managers/utils.py +78 -2
  233. sglang/srt/mem_cache/allocator.py +7 -2
  234. sglang/srt/mem_cache/allocator_ascend.py +2 -2
  235. sglang/srt/mem_cache/base_prefix_cache.py +2 -2
  236. sglang/srt/mem_cache/chunk_cache.py +13 -2
  237. sglang/srt/mem_cache/common.py +480 -0
  238. sglang/srt/mem_cache/evict_policy.py +16 -1
  239. sglang/srt/mem_cache/hicache_storage.py +4 -1
  240. sglang/srt/mem_cache/hiradix_cache.py +16 -3
  241. sglang/srt/mem_cache/mamba_radix_cache.py +993 -0
  242. sglang/srt/mem_cache/memory_pool.py +435 -219
  243. sglang/srt/mem_cache/memory_pool_host.py +0 -1
  244. sglang/srt/mem_cache/multimodal_cache.py +0 -1
  245. sglang/srt/mem_cache/radix_cache.py +53 -19
  246. sglang/srt/mem_cache/radix_cache_cpp.py +19 -14
  247. sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +8 -2
  248. sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +1 -13
  249. sglang/srt/mem_cache/storage/backend_factory.py +2 -2
  250. sglang/srt/mem_cache/storage/eic/eic_storage.py +5 -6
  251. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +0 -1
  252. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +9 -3
  253. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +5 -3
  254. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +101 -17
  255. sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +38 -9
  256. sglang/srt/mem_cache/storage/nixl/nixl_utils.py +1 -1
  257. sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +17 -2
  258. sglang/srt/mem_cache/swa_radix_cache.py +92 -26
  259. sglang/srt/metrics/collector.py +31 -0
  260. sglang/srt/metrics/func_timer.py +1 -1
  261. sglang/srt/model_executor/cuda_graph_runner.py +43 -5
  262. sglang/srt/model_executor/forward_batch_info.py +28 -23
  263. sglang/srt/model_executor/model_runner.py +379 -139
  264. sglang/srt/model_executor/npu_graph_runner.py +2 -3
  265. sglang/srt/model_executor/piecewise_cuda_graph_runner.py +539 -0
  266. sglang/srt/model_loader/__init__.py +1 -1
  267. sglang/srt/model_loader/loader.py +424 -27
  268. sglang/srt/model_loader/utils.py +0 -1
  269. sglang/srt/model_loader/weight_utils.py +47 -28
  270. sglang/srt/models/apertus.py +2 -3
  271. sglang/srt/models/arcee.py +2 -2
  272. sglang/srt/models/bailing_moe.py +13 -52
  273. sglang/srt/models/bailing_moe_nextn.py +3 -4
  274. sglang/srt/models/bert.py +1 -1
  275. sglang/srt/models/deepseek_nextn.py +19 -3
  276. sglang/srt/models/deepseek_ocr.py +1516 -0
  277. sglang/srt/models/deepseek_v2.py +273 -98
  278. sglang/srt/models/dots_ocr.py +0 -2
  279. sglang/srt/models/dots_vlm.py +0 -1
  280. sglang/srt/models/dots_vlm_vit.py +1 -1
  281. sglang/srt/models/falcon_h1.py +13 -19
  282. sglang/srt/models/gemma3_mm.py +16 -0
  283. sglang/srt/models/gemma3n_mm.py +1 -2
  284. sglang/srt/models/glm4_moe.py +14 -37
  285. sglang/srt/models/glm4_moe_nextn.py +2 -2
  286. sglang/srt/models/glm4v.py +2 -1
  287. sglang/srt/models/glm4v_moe.py +5 -5
  288. sglang/srt/models/gpt_oss.py +5 -5
  289. sglang/srt/models/grok.py +10 -23
  290. sglang/srt/models/hunyuan.py +2 -7
  291. sglang/srt/models/interns1.py +0 -1
  292. sglang/srt/models/kimi_vl.py +1 -7
  293. sglang/srt/models/kimi_vl_moonvit.py +3 -1
  294. sglang/srt/models/llama.py +2 -2
  295. sglang/srt/models/llama_eagle3.py +1 -1
  296. sglang/srt/models/longcat_flash.py +5 -22
  297. sglang/srt/models/longcat_flash_nextn.py +3 -14
  298. sglang/srt/models/mimo.py +2 -13
  299. sglang/srt/models/mimo_mtp.py +1 -2
  300. sglang/srt/models/minicpmo.py +7 -5
  301. sglang/srt/models/mixtral.py +1 -4
  302. sglang/srt/models/mllama.py +1 -1
  303. sglang/srt/models/mllama4.py +13 -3
  304. sglang/srt/models/nemotron_h.py +511 -0
  305. sglang/srt/models/olmo2.py +31 -4
  306. sglang/srt/models/opt.py +5 -5
  307. sglang/srt/models/phi.py +1 -1
  308. sglang/srt/models/phi4mm.py +1 -1
  309. sglang/srt/models/phimoe.py +0 -1
  310. sglang/srt/models/pixtral.py +0 -3
  311. sglang/srt/models/points_v15_chat.py +186 -0
  312. sglang/srt/models/qwen.py +0 -1
  313. sglang/srt/models/qwen2_5_vl.py +3 -3
  314. sglang/srt/models/qwen2_audio.py +2 -15
  315. sglang/srt/models/qwen2_moe.py +15 -12
  316. sglang/srt/models/qwen2_vl.py +5 -2
  317. sglang/srt/models/qwen3_moe.py +19 -35
  318. sglang/srt/models/qwen3_next.py +7 -12
  319. sglang/srt/models/qwen3_next_mtp.py +3 -4
  320. sglang/srt/models/qwen3_omni_moe.py +661 -0
  321. sglang/srt/models/qwen3_vl.py +37 -33
  322. sglang/srt/models/qwen3_vl_moe.py +57 -185
  323. sglang/srt/models/roberta.py +55 -3
  324. sglang/srt/models/sarashina2_vision.py +0 -1
  325. sglang/srt/models/step3_vl.py +3 -5
  326. sglang/srt/models/utils.py +11 -1
  327. sglang/srt/multimodal/processors/base_processor.py +6 -2
  328. sglang/srt/multimodal/processors/deepseek_ocr.py +37 -0
  329. sglang/srt/multimodal/processors/deepseek_vl_v2.py +0 -3
  330. sglang/srt/multimodal/processors/dots_vlm.py +0 -1
  331. sglang/srt/multimodal/processors/glm4v.py +1 -5
  332. sglang/srt/multimodal/processors/internvl.py +0 -2
  333. sglang/srt/multimodal/processors/janus_pro.py +0 -1
  334. sglang/srt/multimodal/processors/mllama4.py +0 -8
  335. sglang/srt/multimodal/processors/phi4mm.py +0 -1
  336. sglang/srt/multimodal/processors/points_v15_chat.py +52 -0
  337. sglang/srt/multimodal/processors/qwen_vl.py +75 -16
  338. sglang/srt/multimodal/processors/step3_vl.py +1 -1
  339. sglang/srt/parser/conversation.py +41 -0
  340. sglang/srt/parser/reasoning_parser.py +0 -1
  341. sglang/srt/sampling/custom_logit_processor.py +77 -2
  342. sglang/srt/sampling/sampling_batch_info.py +17 -22
  343. sglang/srt/sampling/sampling_params.py +70 -2
  344. sglang/srt/server_args.py +577 -73
  345. sglang/srt/server_args_config_parser.py +1 -1
  346. sglang/srt/single_batch_overlap.py +38 -28
  347. sglang/srt/speculative/base_spec_worker.py +34 -0
  348. sglang/srt/speculative/draft_utils.py +226 -0
  349. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +24 -7
  350. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +23 -2
  351. sglang/srt/speculative/eagle_info.py +57 -18
  352. sglang/srt/speculative/eagle_info_v2.py +458 -0
  353. sglang/srt/speculative/eagle_utils.py +138 -0
  354. sglang/srt/speculative/eagle_worker.py +83 -280
  355. sglang/srt/speculative/eagle_worker_v2.py +702 -0
  356. sglang/srt/speculative/{ngram_utils.py → ngram_info.py} +14 -9
  357. sglang/srt/speculative/ngram_worker.py +12 -11
  358. sglang/srt/speculative/spec_info.py +2 -0
  359. sglang/srt/speculative/spec_utils.py +38 -3
  360. sglang/srt/speculative/standalone_worker.py +4 -14
  361. sglang/srt/tokenizer/tiktoken_tokenizer.py +2 -2
  362. sglang/srt/two_batch_overlap.py +28 -14
  363. sglang/srt/utils/__init__.py +1 -1
  364. sglang/srt/{bench_utils.py → utils/bench_utils.py} +4 -2
  365. sglang/srt/utils/common.py +192 -47
  366. sglang/srt/utils/hf_transformers_utils.py +40 -17
  367. sglang/srt/{host_shared_memory.py → utils/host_shared_memory.py} +0 -1
  368. sglang/srt/{offloader.py → utils/offloader.py} +4 -4
  369. sglang/srt/utils/profile_merger.py +199 -0
  370. sglang/test/attention/test_flashattn_backend.py +1 -1
  371. sglang/test/attention/test_flashattn_mla_backend.py +0 -1
  372. sglang/test/attention/test_prefix_chunk_info.py +0 -2
  373. sglang/test/attention/test_trtllm_mla_backend.py +221 -53
  374. sglang/test/few_shot_gsm8k_engine.py +2 -4
  375. sglang/test/kit_matched_stop.py +157 -0
  376. sglang/test/longbench_v2/__init__.py +1 -0
  377. sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
  378. sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
  379. sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
  380. sglang/test/run_eval.py +41 -0
  381. sglang/test/runners.py +2 -0
  382. sglang/test/send_one.py +42 -7
  383. sglang/test/simple_eval_common.py +3 -0
  384. sglang/test/simple_eval_gpqa.py +0 -1
  385. sglang/test/simple_eval_humaneval.py +0 -3
  386. sglang/test/simple_eval_longbench_v2.py +344 -0
  387. sglang/test/test_block_fp8.py +1 -2
  388. sglang/test/test_block_fp8_deep_gemm_blackwell.py +0 -1
  389. sglang/test/test_cutlass_moe.py +1 -2
  390. sglang/test/test_cutlass_w4a8_moe.py +10 -20
  391. sglang/test/test_deterministic.py +232 -99
  392. sglang/test/test_deterministic_utils.py +73 -0
  393. sglang/test/test_disaggregation_utils.py +81 -0
  394. sglang/test/test_marlin_moe.py +0 -1
  395. sglang/test/test_utils.py +85 -20
  396. sglang/version.py +1 -1
  397. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.dist-info}/METADATA +45 -33
  398. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.dist-info}/RECORD +404 -345
  399. sglang/srt/layers/attention/mamba/mamba_utils.py +0 -81
  400. sglang/srt/managers/tp_worker_overlap_thread.py +0 -311
  401. sglang/srt/speculative/build_eagle_tree.py +0 -427
  402. sglang/test/test_block_fp8_ep.py +0 -358
  403. /sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/__init__.py +0 -0
  404. /sglang/srt/{aio_rwlock.py → utils/aio_rwlock.py} +0 -0
  405. /sglang/srt/{torch_memory_saver_adapter.py → utils/torch_memory_saver_adapter.py} +0 -0
  406. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.dist-info}/WHEEL +0 -0
  407. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.dist-info}/licenses/LICENSE +0 -0
  408. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.dist-info}/top_level.txt +0 -0
@@ -263,8 +263,8 @@ class GrpcRequestManager:
263
263
  response = await task
264
264
 
265
265
  # Add index for client-side ordering
266
- if isinstance(response, dict) and "meta_info" in response:
267
- response_rid = response["meta_info"].get("id", "")
266
+ if isinstance(response, dict):
267
+ response_rid = response.get("request_id", "")
268
268
  if response_rid in rid_to_index:
269
269
  response["index"] = rid_to_index[response_rid]
270
270
 
@@ -318,13 +318,8 @@ class GrpcRequestManager:
318
318
  is_stream = getattr(obj, "stream", False)
319
319
 
320
320
  while True:
321
- # Client cancelled - notify scheduler and exit
322
- if grpc_context and grpc_context.cancelled():
323
- await self.abort_request(request_id)
324
- return
325
-
326
321
  try:
327
- response = await asyncio.wait_for(state.out_queue.get(), timeout=4)
322
+ response = await state.out_queue.get()
328
323
 
329
324
  if is_stream:
330
325
  yield response
@@ -337,13 +332,11 @@ class GrpcRequestManager:
337
332
  yield final_response
338
333
  break
339
334
 
340
- except asyncio.TimeoutError:
341
- # Timeout waiting for response - abort and cleanup
342
- logger.warning(
343
- f"Timeout waiting for response for request {request_id}"
344
- )
335
+ except asyncio.CancelledError:
336
+ # Task was cancelled by gRPC framework when client disconnected
337
+ logger.info(f"Request {request_id} cancelled by client")
345
338
  await self.abort_request(request_id)
346
- return
339
+ raise # Re-raise to let gRPC server handle cleanup
347
340
 
348
341
  finally:
349
342
  # Always clean up request state when exiting
@@ -397,9 +390,7 @@ class GrpcRequestManager:
397
390
  # Wait for result in background
398
391
  async def wait_for_result():
399
392
  try:
400
- # Wait for completion
401
393
  await state.event.wait()
402
- # Get result from queue
403
394
  result = await state.out_queue.get()
404
395
  future.set_result(result)
405
396
  except Exception as e:
@@ -413,43 +404,34 @@ class GrpcRequestManager:
413
404
  return future
414
405
 
415
406
  async def abort_request(self, request_id: str) -> bool:
416
- """Abort a running request."""
417
- if request_id not in self.rid_to_state:
418
- return False
407
+ """Abort a running request.
419
408
 
420
- # Send abort to scheduler
421
- abort_req = AbortReq(rid=request_id)
422
- try:
423
- await self._send_to_scheduler(abort_req)
424
- except Exception as e:
425
- logger.error(f"Failed to send abort request: {e}")
409
+ Sends abort request to scheduler and marks local state as finished
410
+ to stop processing any further outputs from the scheduler.
411
+ """
412
+ # Skip aborting health check requests (they clean themselves up)
413
+ if request_id.startswith("HEALTH_CHECK"):
426
414
  return False
427
415
 
428
- # Mark as finished
416
+ # Mark state as finished immediately to stop processing scheduler outputs
429
417
  state = self.rid_to_state.get(request_id)
430
418
  if state:
431
419
  state.finished = True
432
420
  state.stream_finished = True
433
- state.event.set()
421
+ logger.debug(f"Marked request {request_id} as aborted locally")
434
422
 
435
- # Send abort notification to output queue
436
- await state.out_queue.put({"error": "Request aborted", "abort": True})
423
+ # Send abort to scheduler - the scheduler will send AbortReq back
424
+ # which will be handled by _handle_abort_req
425
+ abort_req = AbortReq(rid=request_id)
426
+ try:
427
+ await self._send_to_scheduler(abort_req)
428
+ logger.debug(f"Sent abort to scheduler for request {request_id}")
429
+ except Exception as e:
430
+ logger.error(f"Failed to send abort request to scheduler: {e}")
431
+ return False
437
432
 
438
433
  return True
439
434
 
440
- async def pause_generation(self):
441
- """Pause generation processing."""
442
- async with self.is_pause_cond:
443
- self.is_pause = True
444
- logger.info("Generation paused")
445
-
446
- async def resume_generation(self):
447
- """Resume generation processing."""
448
- async with self.is_pause_cond:
449
- self.is_pause = False
450
- self.is_pause_cond.notify_all()
451
- logger.info("Generation resumed")
452
-
453
435
  async def handle_loop(self):
454
436
  """
455
437
  Main event loop - processes outputs from scheduler.
@@ -461,10 +443,11 @@ class GrpcRequestManager:
461
443
  recv_obj = await self.recv_from_scheduler.recv_pyobj()
462
444
  self.last_receive_tstamp = time.time()
463
445
 
464
- # Check for pause
465
- async with self.is_pause_cond:
466
- while self.is_pause:
467
- await self.is_pause_cond.wait()
446
+ # Check for pause (optimized: check flag before acquiring lock)
447
+ if self.is_pause:
448
+ async with self.is_pause_cond:
449
+ while self.is_pause:
450
+ await self.is_pause_cond.wait()
468
451
 
469
452
  # Handle different output types
470
453
  if isinstance(recv_obj, BatchTokenIDOutput):
@@ -473,6 +456,8 @@ class GrpcRequestManager:
473
456
  await self._handle_embedding_output(recv_obj)
474
457
  elif isinstance(recv_obj, HealthCheckOutput):
475
458
  await self._handle_health_check_output(recv_obj)
459
+ elif isinstance(recv_obj, AbortReq):
460
+ await self._handle_abort_req(recv_obj)
476
461
  else:
477
462
  logger.warning(f"Unknown output type: {type(recv_obj)}")
478
463
 
@@ -547,6 +532,11 @@ class GrpcRequestManager:
547
532
 
548
533
  async def _handle_batch_output(self, batch_out: BatchTokenIDOutput):
549
534
  """Handle batch generation output from scheduler."""
535
+ # Collect all queue.put() tasks for parallel execution
536
+ put_tasks = []
537
+ cleanup_tasks = []
538
+ now = time.time()
539
+
550
540
  # Process each request in the batch
551
541
  for i, rid in enumerate(batch_out.rids):
552
542
  if rid not in self.rid_to_state:
@@ -554,8 +544,12 @@ class GrpcRequestManager:
554
544
 
555
545
  state = self.rid_to_state[rid]
556
546
 
547
+ # Skip if already aborted/finished locally (client cancelled)
548
+ if state.finished:
549
+ logger.debug(f"Skipping output for aborted request {rid}")
550
+ continue
551
+
557
552
  # Update metrics
558
- now = time.time()
559
553
  if state.first_token_time == 0.0:
560
554
  state.first_token_time = now
561
555
  state.last_time = now
@@ -649,7 +643,8 @@ class GrpcRequestManager:
649
643
  if output_data["token_ids"]:
650
644
  state.output_ids.extend(output_data["token_ids"])
651
645
 
652
- await state.out_queue.put(output_data)
646
+ # Add queue.put() to parallel task list
647
+ put_tasks.append(state.out_queue.put(output_data))
653
648
 
654
649
  # Handle completion
655
650
  if output_data["finished"]:
@@ -659,12 +654,16 @@ class GrpcRequestManager:
659
654
  state.event.set()
660
655
 
661
656
  # Remove from tracking after a delay
662
- async def cleanup():
657
+ async def cleanup(request_id):
663
658
  await asyncio.sleep(5.0)
664
- if rid in self.rid_to_state:
665
- del self.rid_to_state[rid]
659
+ if request_id in self.rid_to_state:
660
+ del self.rid_to_state[request_id]
666
661
 
667
- asyncio.create_task(cleanup())
662
+ cleanup_tasks.append(asyncio.create_task(cleanup(rid)))
663
+
664
+ # Execute all queue.put() operations in parallel
665
+ if put_tasks:
666
+ await asyncio.gather(*put_tasks, return_exceptions=True)
668
667
 
669
668
  async def _handle_embedding_output(self, batch_out: BatchEmbeddingOutput):
670
669
  """Handle batch embedding output from scheduler."""
@@ -726,6 +725,67 @@ class GrpcRequestManager:
726
725
  state.finished_time = time.time()
727
726
  state.event.set()
728
727
 
728
+ async def _handle_abort_req(self, recv_obj: AbortReq):
729
+ """Handle abort request from scheduler.
730
+
731
+ The scheduler sends AbortReq back to notify us that a request was aborted,
732
+ either due to explicit abort_request() call or scheduler-initiated abort
733
+ (priority preemption, queue full, KV cache pressure, etc).
734
+ """
735
+ # Skip health check requests
736
+ if recv_obj.rid.startswith("HEALTH_CHECK"):
737
+ return
738
+
739
+ # Check if request still exists
740
+ if recv_obj.rid not in self.rid_to_state:
741
+ logger.debug(
742
+ f"Abort request for {recv_obj.rid} not in local state (may have already finished or not started yet)"
743
+ )
744
+ return
745
+
746
+ state = self.rid_to_state[recv_obj.rid]
747
+
748
+ # Mark as finished
749
+ state.finished = True
750
+ state.stream_finished = True
751
+
752
+ # Create abort response
753
+ if recv_obj.finished_reason:
754
+ # Scheduler provided a specific finish reason (e.g., priority preemption, queue full)
755
+ abort_response = {
756
+ "request_id": recv_obj.rid,
757
+ "error": recv_obj.finished_reason.get("message", "Request aborted"),
758
+ "finished": True,
759
+ "meta_info": {
760
+ "id": recv_obj.rid,
761
+ "finish_reason": recv_obj.finished_reason,
762
+ },
763
+ }
764
+ else:
765
+ # Generic abort (e.g., explicit abort_request call)
766
+ abort_response = {
767
+ "request_id": recv_obj.rid,
768
+ "error": "Request aborted",
769
+ "finished": True,
770
+ "meta_info": {
771
+ "id": recv_obj.rid,
772
+ "finish_reason": {
773
+ "type": "abort",
774
+ "message": "Abort before prefill",
775
+ },
776
+ "prompt_tokens": 0,
777
+ "completion_tokens": 0,
778
+ },
779
+ }
780
+
781
+ # Send abort notification to output queue
782
+ await state.out_queue.put(abort_response)
783
+
784
+ # Wake up any waiting coroutines
785
+ state.event.set()
786
+
787
+ logger.debug(f"Handled abort request for {recv_obj.rid}")
788
+
729
789
  async def _send_to_scheduler(self, obj):
730
790
  """Send an object to the scheduler via ZMQ."""
731
791
  try:
@@ -0,0 +1,189 @@
1
+ """
2
+ Standard gRPC health check service implementation for Kubernetes probes.
3
+
4
+ This module implements the grpc.health.v1.Health service protocol, enabling
5
+ native Kubernetes gRPC health probes for liveness and readiness checks.
6
+ """
7
+
8
+ import logging
9
+ import time
10
+ from typing import AsyncIterator
11
+
12
+ import grpc
13
+ from grpc_health.v1 import health_pb2, health_pb2_grpc
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+
18
class SGLangHealthServicer(health_pb2_grpc.HealthServicer):
    """
    Standard gRPC health check service implementation for Kubernetes probes.
    Implements grpc.health.v1.Health protocol.

    Supports two service levels:
    1. Overall server health (service="") - for liveness probes
    2. SGLang service health (service="sglang.grpc.scheduler.SglangScheduler") - for readiness probes

    Health status lifecycle:
    - NOT_SERVING: Initial state, model loading, or shutting down
    - SERVING: Model loaded and ready to serve requests

    Any other service name is reported as SERVICE_UNKNOWN. Check additionally
    fails the RPC with NOT_FOUND in that case; Watch does not.
    """

    # Service names we support
    OVERALL_SERVER = ""  # Empty string for overall server health
    SGLANG_SERVICE = "sglang.grpc.scheduler.SglangScheduler"

    # Seconds without any data received from the scheduler, while requests
    # are still pending, after which the SGLang service reports NOT_SERVING.
    # NOTE: This is more conservative than HEALTH_CHECK_TIMEOUT (20s) used for
    # the custom HealthCheck RPC. Override on a subclass or instance if a
    # workload needs a different responsiveness threshold.
    SCHEDULER_UNRESPONSIVE_TIMEOUT_S = 30

    def __init__(self, request_manager, scheduler_info: dict):
        """
        Initialize health servicer.

        Args:
            request_manager: GrpcRequestManager instance for checking server state
                (its gracefully_exit, last_receive_tstamp and rid_to_state are read)
            scheduler_info: Dict containing scheduler metadata
        """
        self.request_manager = request_manager
        self.scheduler_info = scheduler_info
        self._serving_status = {}

        # Initially set to NOT_SERVING until model is loaded
        self._set_all_statuses(health_pb2.HealthCheckResponse.NOT_SERVING)

        logger.info("Standard gRPC health service initialized")

    def _set_all_statuses(self, status) -> None:
        """Record the same serving status for every supported service name."""
        self._serving_status[self.OVERALL_SERVER] = status
        self._serving_status[self.SGLANG_SERVICE] = status

    def set_serving(self):
        """Mark services as SERVING - call this after model is loaded."""
        self._set_all_statuses(health_pb2.HealthCheckResponse.SERVING)
        logger.info("Health service status set to SERVING")

    def set_not_serving(self):
        """Mark services as NOT_SERVING - call this during shutdown."""
        self._set_all_statuses(health_pb2.HealthCheckResponse.NOT_SERVING)
        logger.info("Health service status set to NOT_SERVING")

    def _evaluate_status(self, service_name: str) -> int:
        """
        Compute the current serving status for a service name.

        Shared by Check and Watch so both report the same status. This helper
        never touches the RPC context; signalling an RPC-level error is left
        to the caller.

        Args:
            service_name: "" for overall server health, or a specific service

        Returns:
            A HealthCheckResponse.ServingStatus value; SERVICE_UNKNOWN when the
            name is neither OVERALL_SERVER nor SGLANG_SERVICE.
        """
        response_cls = health_pb2.HealthCheckResponse

        # Shutdown takes precedence over everything, including unknown names
        if self.request_manager.gracefully_exit:
            logger.debug("Health check: Server is shutting down")
            return response_cls.NOT_SERVING

        # Overall server health - report the status recorded by
        # set_serving() / set_not_serving()
        if service_name == self.OVERALL_SERVER:
            status = self._serving_status.get(
                self.OVERALL_SERVER, response_cls.NOT_SERVING
            )
            logger.debug(
                f"Overall health check: {health_pb2.HealthCheckResponse.ServingStatus.Name(status)}"
            )
            return status

        # Specific service health - check if ready to serve
        if service_name == self.SGLANG_SERVICE:
            # Check base status first
            base_status = self._serving_status.get(
                self.SGLANG_SERVICE, response_cls.NOT_SERVING
            )
            if base_status != response_cls.SERVING:
                logger.debug("Service health check: NOT_SERVING (base status)")
                return base_status

            # Check if scheduler is responsive (received data recently)
            time_since_last_receive = (
                time.time() - self.request_manager.last_receive_tstamp
            )

            # If no recent activity and we have active requests, might be stuck.
            # An idle server (no pending requests) is still considered healthy.
            if (
                time_since_last_receive > self.SCHEDULER_UNRESPONSIVE_TIMEOUT_S
                and len(self.request_manager.rid_to_state) > 0
            ):
                logger.warning(
                    f"Service health check: Scheduler not responsive "
                    f"({time_since_last_receive:.1f}s since last receive, "
                    f"{len(self.request_manager.rid_to_state)} pending requests)"
                )
                return response_cls.NOT_SERVING

            logger.debug("Service health check: SERVING")
            return response_cls.SERVING

        # Unknown service
        logger.debug(f"Health check for unknown service: '{service_name}'")
        return response_cls.SERVICE_UNKNOWN

    async def Check(
        self,
        request: health_pb2.HealthCheckRequest,
        context: grpc.aio.ServicerContext,
    ) -> health_pb2.HealthCheckResponse:
        """
        Standard health check for Kubernetes probes.

        Args:
            request: Contains service name ("" for overall, or specific service)
            context: gRPC context

        Returns:
            HealthCheckResponse with SERVING/NOT_SERVING/SERVICE_UNKNOWN status.
            For an unknown service the RPC status is also set to NOT_FOUND.
        """
        service_name = request.service
        logger.debug(f"Health check request for service: '{service_name}'")

        status = self._evaluate_status(service_name)
        if status == health_pb2.HealthCheckResponse.SERVICE_UNKNOWN:
            # Unary Check reports unregistered services as an RPC-level NOT_FOUND
            context.set_code(grpc.StatusCode.NOT_FOUND)
            context.set_details(f"Unknown service: {service_name}")
        return health_pb2.HealthCheckResponse(status=status)

    async def Watch(
        self,
        request: health_pb2.HealthCheckRequest,
        context: grpc.aio.ServicerContext,
    ) -> AsyncIterator[health_pb2.HealthCheckResponse]:
        """
        Streaming health check - sends updates when status changes.

        For now, just send current status once (Kubernetes doesn't use Watch).
        A full implementation would monitor status changes and stream updates.

        Unlike Check, an unknown service is reported only through a
        SERVICE_UNKNOWN message; the RPC status is not set to NOT_FOUND,
        as the health protocol specifies for Watch.

        Args:
            request: Contains service name
            context: gRPC context (not modified by this method)

        Yields:
            HealthCheckResponse carrying the current status
        """
        service_name = request.service
        logger.debug(f"Health watch request for service: '{service_name}'")

        # Send current status
        yield health_pb2.HealthCheckResponse(
            status=self._evaluate_status(service_name)
        )

        # Note: Full Watch implementation would monitor status changes
        # and stream updates. For K8s probes, Check is sufficient.
@@ -0,0 +1,181 @@
1
+ """
2
+ Scheduler process management for gRPC server.
3
+
4
+ This module handles launching and managing scheduler processes for the gRPC server,
5
+ including tensor parallelism, pipeline parallelism, and data parallelism configurations.
6
+ """
7
+
8
+ import logging
9
+ import multiprocessing as mp
10
+ import signal
11
+ from typing import Dict, List, Optional, Tuple
12
+
13
+ from sglang.srt.managers.data_parallel_controller import (
14
+ run_data_parallel_controller_process,
15
+ )
16
+ from sglang.srt.managers.scheduler import run_scheduler_process
17
+ from sglang.srt.server_args import PortArgs, ServerArgs
18
+ from sglang.srt.utils import configure_logger, prepare_model_and_tokenizer
19
+ from sglang.srt.utils.torch_memory_saver_adapter import TorchMemorySaverAdapter
20
+
21
+ logger = logging.getLogger(__name__)
22
+
23
+
24
def run_scheduler_with_signal_handling(*args, **kwargs):
    """
    Run run_scheduler_process with SIGINT ignored in the calling process.

    Meant to be used as the entry point of a scheduler subprocess, so that
    Ctrl+C is dealt with by the parent gRPC server only. The scheduler is
    expected to terminate when that parent exits (via
    kill_itself_when_parent_died, which is not visible from this function).

    Args:
        *args: Positional arguments forwarded to run_scheduler_process
        **kwargs: Keyword arguments forwarded to run_scheduler_process
    """
    # Install the ignore handler before any scheduler work starts
    signal.signal(signal.SIGINT, signal.SIG_IGN)

    # Hand control to the real scheduler entry point; its result is not returned
    run_scheduler_process(*args, **kwargs)
40
+
41
+
42
def launch_scheduler_process_only(
    server_args: ServerArgs,
    port_args: Optional[PortArgs] = None,
) -> Tuple[Dict, PortArgs, List[mp.Process]]:
    """
    Launch only the scheduler process(es) without tokenizer/detokenizer.

    This function handles all scheduler startup logic including:
    - Tensor parallelism (tp_size)
    - Pipeline parallelism (pp_size)
    - Data parallelism (dp_size)
    - Multi-node distributed setup

    Each child reports back over a one-way pipe. The parent closes its own
    copy of the write end right after starting the child, so that a child
    dying before it reports is seen as EOFError instead of blocking forever.

    Args:
        server_args: Server configuration
        port_args: Port configuration (created if None)

    Returns:
        Tuple of (scheduler_info, port_args, scheduler_processes):
        - scheduler_info: Dict with model metadata and configuration
          (the message received from the first scheduler)
        - port_args: Port configuration used for IPC
        - scheduler_processes: List of launched scheduler Process objects

    Raises:
        RuntimeError: If any scheduler process fails to initialize
    """
    # Configure global environment
    configure_logger(server_args)
    server_args.check_server_args()

    # Fix CUDA multiprocessing issues - must be called before any CUDA operations
    mp.set_start_method("spawn", force=True)

    # Allocate ports for inter-process communications
    if port_args is None:
        port_args = PortArgs.init_new(server_args)
        logger.info(f"{server_args=}")

    # Prepare model and tokenizer paths
    server_args.model_path, server_args.tokenizer_path = prepare_model_and_tokenizer(
        server_args.model_path, server_args.tokenizer_path
    )

    scheduler_procs = []

    if server_args.dp_size == 1:
        # Single data parallel group - launch TP/PP schedulers
        memory_saver_adapter = TorchMemorySaverAdapter.create(
            enable=server_args.enable_memory_saver
        )
        scheduler_pipe_readers = []

        # Calculate TP/PP distribution across nodes
        nnodes_per_tp_group = max(server_args.nnodes // server_args.pp_size, 1)
        tp_size_per_node = server_args.tp_size // nnodes_per_tp_group
        tp_rank_range = range(
            tp_size_per_node * (server_args.node_rank % nnodes_per_tp_group),
            tp_size_per_node * (server_args.node_rank % nnodes_per_tp_group + 1),
        )

        pp_size_per_node = max(server_args.pp_size // server_args.nnodes, 1)
        pp_rank_range = range(
            pp_size_per_node * (server_args.node_rank // nnodes_per_tp_group),
            pp_size_per_node * (server_args.node_rank // nnodes_per_tp_group + 1),
        )

        # Launch scheduler for each TP/PP rank combination
        for pp_rank in pp_rank_range:
            for tp_rank in tp_rank_range:
                reader, writer = mp.Pipe(duplex=False)

                # Calculate GPU ID for this rank
                gpu_id = (
                    server_args.base_gpu_id
                    + ((pp_rank % pp_size_per_node) * tp_size_per_node)
                    + (tp_rank % tp_size_per_node) * server_args.gpu_id_step
                )

                # Calculate MoE expert parallel rank
                moe_ep_rank = tp_rank // (server_args.tp_size // server_args.ep_size)

                # Create scheduler process
                proc = mp.Process(
                    target=run_scheduler_with_signal_handling,
                    args=(
                        server_args,
                        port_args,
                        gpu_id,
                        tp_rank,
                        moe_ep_rank,
                        pp_rank,
                        None,  # dp_rank
                        writer,
                    ),
                )

                with memory_saver_adapter.configure_subprocess():
                    proc.start()

                # The child owns its own copy of the write end now. Drop ours,
                # otherwise reader.recv() can never raise EOFError if the
                # child dies before reporting.
                writer.close()

                scheduler_procs.append(proc)
                scheduler_pipe_readers.append(reader)
    else:
        # Data parallelism - launch data parallel controller
        reader, writer = mp.Pipe(duplex=False)
        scheduler_pipe_readers = [reader]

        proc = mp.Process(
            target=run_data_parallel_controller_process,
            args=(server_args, port_args, writer),
        )
        proc.start()

        # Same reasoning as above: keep only the child's copy of the write end
        writer.close()

        scheduler_procs.append(proc)

    # TODO(CatherineSue): handle cases for multi-node

    # Wait for all scheduler processes to be ready
    scheduler_infos = []
    for i, reader in enumerate(scheduler_pipe_readers):
        try:
            data = reader.recv()
        except EOFError:
            # The write end was closed without a message: the child is gone
            logger.error(
                f"Rank {i} scheduler is dead. Please check if there are relevant logs."
            )
            scheduler_procs[i].join()
            logger.error(f"Exit code: {scheduler_procs[i].exitcode}")
            raise RuntimeError(f"Failed to initialize scheduler rank {i}")

        if data.get("status") != "ready":
            raise RuntimeError(
                f"Scheduler rank {i} initialization failed: {data.get('error', 'Unknown error')}"
            )
        scheduler_infos.append(data)

    logger.info(
        f"All {len(scheduler_procs)} scheduler process(es) initialized successfully"
    )

    # Return the first scheduler's info (they should all be the same)
    return scheduler_infos[0], port_args, scheduler_procs