sglang 0.5.3rc2__py3-none-any.whl → 0.5.4.post1__py3-none-any.whl

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in their respective public registries.
Files changed (419)
  1. sglang/bench_one_batch.py +47 -28
  2. sglang/bench_one_batch_server.py +41 -25
  3. sglang/bench_serving.py +378 -160
  4. sglang/check_env.py +1 -1
  5. sglang/compile_deep_gemm.py +6 -2
  6. sglang/global_config.py +1 -25
  7. sglang/lang/api.py +6 -0
  8. sglang/lang/interpreter.py +1 -0
  9. sglang/lang/ir.py +13 -0
  10. sglang/launch_server.py +10 -15
  11. sglang/profiler.py +18 -1
  12. sglang/srt/_custom_ops.py +1 -1
  13. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +105 -10
  14. sglang/srt/checkpoint_engine/checkpoint_engine_worker.py +142 -0
  15. sglang/srt/compilation/backend.py +437 -0
  16. sglang/srt/compilation/compilation_config.py +20 -0
  17. sglang/srt/compilation/compilation_counter.py +47 -0
  18. sglang/srt/compilation/compile.py +210 -0
  19. sglang/srt/compilation/compiler_interface.py +503 -0
  20. sglang/srt/compilation/cuda_piecewise_backend.py +228 -0
  21. sglang/srt/compilation/fix_functionalization.py +134 -0
  22. sglang/srt/compilation/fx_utils.py +83 -0
  23. sglang/srt/compilation/inductor_pass.py +140 -0
  24. sglang/srt/compilation/pass_manager.py +66 -0
  25. sglang/srt/compilation/piecewise_context_manager.py +40 -0
  26. sglang/srt/compilation/weak_ref_tensor_jit.py +16 -0
  27. sglang/srt/configs/__init__.py +4 -0
  28. sglang/srt/configs/deepseek_ocr.py +262 -0
  29. sglang/srt/configs/deepseekvl2.py +194 -96
  30. sglang/srt/configs/dots_vlm.py +2 -7
  31. sglang/srt/configs/falcon_h1.py +13 -64
  32. sglang/srt/configs/load_config.py +25 -2
  33. sglang/srt/configs/mamba_utils.py +117 -0
  34. sglang/srt/configs/model_config.py +136 -25
  35. sglang/srt/configs/modelopt_config.py +30 -0
  36. sglang/srt/configs/nemotron_h.py +286 -0
  37. sglang/srt/configs/olmo3.py +105 -0
  38. sglang/srt/configs/points_v15_chat.py +29 -0
  39. sglang/srt/configs/qwen3_next.py +11 -47
  40. sglang/srt/configs/qwen3_omni.py +613 -0
  41. sglang/srt/configs/qwen3_vl.py +0 -10
  42. sglang/srt/connector/remote_instance.py +1 -1
  43. sglang/srt/constrained/base_grammar_backend.py +5 -1
  44. sglang/srt/constrained/llguidance_backend.py +5 -0
  45. sglang/srt/constrained/outlines_backend.py +1 -1
  46. sglang/srt/constrained/reasoner_grammar_backend.py +9 -6
  47. sglang/srt/constrained/utils.py +12 -0
  48. sglang/srt/constrained/xgrammar_backend.py +20 -11
  49. sglang/srt/disaggregation/ascend/transfer_engine.py +1 -1
  50. sglang/srt/disaggregation/base/conn.py +17 -4
  51. sglang/srt/disaggregation/common/conn.py +4 -2
  52. sglang/srt/disaggregation/decode.py +123 -31
  53. sglang/srt/disaggregation/decode_kvcache_offload_manager.py +1 -1
  54. sglang/srt/disaggregation/fake/conn.py +11 -3
  55. sglang/srt/disaggregation/mooncake/conn.py +157 -19
  56. sglang/srt/disaggregation/nixl/conn.py +69 -24
  57. sglang/srt/disaggregation/prefill.py +96 -270
  58. sglang/srt/distributed/device_communicators/all_reduce_utils.py +4 -4
  59. sglang/srt/distributed/device_communicators/custom_all_reduce.py +6 -6
  60. sglang/srt/distributed/device_communicators/pymscclpp.py +2 -2
  61. sglang/srt/distributed/device_communicators/pynccl.py +24 -12
  62. sglang/srt/distributed/device_communicators/pynccl_allocator.py +2 -2
  63. sglang/srt/distributed/device_communicators/symm_mem.py +1 -1
  64. sglang/srt/distributed/naive_distributed.py +5 -4
  65. sglang/srt/distributed/parallel_state.py +63 -19
  66. sglang/srt/elastic_ep/elastic_ep.py +74 -0
  67. sglang/srt/entrypoints/context.py +3 -2
  68. sglang/srt/entrypoints/engine.py +83 -80
  69. sglang/srt/entrypoints/grpc_server.py +430 -234
  70. sglang/srt/entrypoints/harmony_utils.py +2 -2
  71. sglang/srt/entrypoints/http_server.py +195 -102
  72. sglang/srt/entrypoints/http_server_engine.py +1 -7
  73. sglang/srt/entrypoints/openai/protocol.py +225 -37
  74. sglang/srt/entrypoints/openai/serving_base.py +49 -2
  75. sglang/srt/entrypoints/openai/serving_chat.py +29 -74
  76. sglang/srt/entrypoints/openai/serving_classify.py +204 -0
  77. sglang/srt/entrypoints/openai/serving_completions.py +15 -1
  78. sglang/srt/entrypoints/openai/serving_responses.py +5 -2
  79. sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
  80. sglang/srt/environ.py +58 -6
  81. sglang/srt/eplb/eplb_algorithms/__init__.py +18 -1
  82. sglang/srt/eplb/eplb_algorithms/deepseek.py +0 -2
  83. sglang/srt/eplb/eplb_algorithms/elasticity_aware.py +87 -0
  84. sglang/srt/eplb/expert_distribution.py +33 -4
  85. sglang/srt/eplb/expert_location_dispatch.py +2 -2
  86. sglang/srt/eplb/expert_location_updater.py +2 -2
  87. sglang/srt/function_call/base_format_detector.py +17 -18
  88. sglang/srt/function_call/function_call_parser.py +20 -14
  89. sglang/srt/function_call/glm4_moe_detector.py +1 -5
  90. sglang/srt/function_call/gpt_oss_detector.py +1 -1
  91. sglang/srt/function_call/json_array_parser.py +0 -2
  92. sglang/srt/function_call/minimax_m2.py +367 -0
  93. sglang/srt/function_call/utils.py +2 -2
  94. sglang/srt/grpc/compile_proto.py +3 -3
  95. sglang/srt/{entrypoints → grpc}/grpc_request_manager.py +112 -52
  96. sglang/srt/grpc/health_servicer.py +189 -0
  97. sglang/srt/grpc/scheduler_launcher.py +181 -0
  98. sglang/srt/grpc/sglang_scheduler_pb2.py +78 -70
  99. sglang/srt/grpc/sglang_scheduler_pb2.pyi +66 -10
  100. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +89 -1
  101. sglang/srt/layers/activation.py +10 -1
  102. sglang/srt/layers/attention/aiter_backend.py +3 -3
  103. sglang/srt/layers/attention/ascend_backend.py +17 -1
  104. sglang/srt/layers/attention/attention_registry.py +43 -23
  105. sglang/srt/layers/attention/base_attn_backend.py +20 -1
  106. sglang/srt/layers/attention/double_sparsity_backend.py +2 -2
  107. sglang/srt/layers/attention/fla/chunk.py +0 -1
  108. sglang/srt/layers/attention/fla/chunk_o.py +1 -1
  109. sglang/srt/layers/attention/fla/index.py +0 -2
  110. sglang/srt/layers/attention/fla/layernorm_gated.py +50 -32
  111. sglang/srt/layers/attention/fla/utils.py +0 -3
  112. sglang/srt/layers/attention/fla/wy_fast.py +0 -2
  113. sglang/srt/layers/attention/flashattention_backend.py +24 -10
  114. sglang/srt/layers/attention/flashinfer_backend.py +258 -22
  115. sglang/srt/layers/attention/flashinfer_mla_backend.py +38 -28
  116. sglang/srt/layers/attention/flashmla_backend.py +2 -2
  117. sglang/srt/layers/attention/hybrid_attn_backend.py +1 -1
  118. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +165 -62
  119. sglang/srt/layers/attention/intel_amx_backend.py +1 -1
  120. sglang/srt/layers/attention/mamba/causal_conv1d.py +1 -1
  121. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +9 -5
  122. sglang/srt/layers/attention/mamba/mamba.py +189 -241
  123. sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
  124. sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
  125. sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +0 -50
  126. sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +0 -60
  127. sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +0 -111
  128. sglang/srt/layers/attention/mamba/ops/ssd_combined.py +0 -1
  129. sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +0 -11
  130. sglang/srt/layers/attention/npu_ops/mla_preprocess.py +1 -1
  131. sglang/srt/layers/attention/nsa/nsa_indexer.py +40 -83
  132. sglang/srt/layers/attention/nsa/triton_kernel.py +136 -0
  133. sglang/srt/layers/attention/nsa/utils.py +0 -1
  134. sglang/srt/layers/attention/nsa_backend.py +404 -90
  135. sglang/srt/layers/attention/triton_backend.py +208 -34
  136. sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +2 -2
  137. sglang/srt/layers/attention/triton_ops/extend_attention.py +539 -44
  138. sglang/srt/layers/attention/trtllm_mha_backend.py +2 -2
  139. sglang/srt/layers/attention/trtllm_mla_backend.py +362 -43
  140. sglang/srt/layers/attention/utils.py +89 -7
  141. sglang/srt/layers/attention/vision.py +3 -3
  142. sglang/srt/layers/attention/xpu_backend.py +1028 -0
  143. sglang/srt/layers/communicator.py +12 -7
  144. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/compile_utils.py +5 -9
  145. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/configurer.py +4 -3
  146. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/entrypoint.py +3 -3
  147. sglang/srt/layers/dp_attention.py +17 -0
  148. sglang/srt/layers/layernorm.py +64 -19
  149. sglang/srt/layers/linear.py +9 -1
  150. sglang/srt/layers/logits_processor.py +152 -17
  151. sglang/srt/layers/modelopt_utils.py +11 -0
  152. sglang/srt/layers/moe/cutlass_moe.py +0 -2
  153. sglang/srt/layers/moe/cutlass_w4a8_moe.py +351 -21
  154. sglang/srt/layers/moe/ep_moe/kernels.py +229 -457
  155. sglang/srt/layers/moe/ep_moe/layer.py +154 -625
  156. sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +1 -1
  157. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  158. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_B200.json +146 -0
  159. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +11 -3
  160. sglang/srt/layers/moe/fused_moe_triton/layer.py +79 -73
  161. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +25 -46
  162. sglang/srt/layers/moe/moe_runner/deep_gemm.py +569 -0
  163. sglang/srt/layers/moe/moe_runner/runner.py +6 -0
  164. sglang/srt/layers/moe/moe_runner/triton.py +3 -1
  165. sglang/srt/layers/moe/moe_runner/triton_kernels.py +194 -0
  166. sglang/srt/layers/moe/rocm_moe_utils.py +0 -1
  167. sglang/srt/layers/moe/router.py +51 -15
  168. sglang/srt/layers/moe/token_dispatcher/__init__.py +14 -4
  169. sglang/srt/layers/moe/token_dispatcher/base.py +12 -6
  170. sglang/srt/layers/moe/token_dispatcher/deepep.py +127 -110
  171. sglang/srt/layers/moe/token_dispatcher/mooncake.py +386 -0
  172. sglang/srt/layers/moe/token_dispatcher/standard.py +46 -0
  173. sglang/srt/layers/moe/topk.py +7 -6
  174. sglang/srt/layers/moe/utils.py +20 -5
  175. sglang/srt/layers/quantization/__init__.py +5 -58
  176. sglang/srt/layers/quantization/awq.py +183 -9
  177. sglang/srt/layers/quantization/awq_triton.py +29 -0
  178. sglang/srt/layers/quantization/base_config.py +27 -1
  179. sglang/srt/layers/quantization/compressed_tensors/__init__.py +7 -0
  180. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +20 -49
  181. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +421 -70
  182. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +3 -0
  183. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +4 -22
  184. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +339 -0
  185. sglang/srt/layers/quantization/fp8.py +152 -81
  186. sglang/srt/layers/quantization/fp8_kernel.py +55 -10
  187. sglang/srt/layers/quantization/fp8_utils.py +42 -14
  188. sglang/srt/layers/quantization/fpgemm_fp8.py +2 -3
  189. sglang/srt/layers/quantization/gguf.py +566 -0
  190. sglang/srt/layers/quantization/gptq.py +0 -1
  191. sglang/srt/layers/quantization/int8_kernel.py +18 -2
  192. sglang/srt/layers/quantization/marlin_utils.py +12 -0
  193. sglang/srt/layers/quantization/modelopt_quant.py +125 -100
  194. sglang/srt/layers/quantization/mxfp4.py +35 -68
  195. sglang/srt/layers/quantization/petit.py +1 -1
  196. sglang/srt/layers/quantization/quark/quark.py +3 -1
  197. sglang/srt/layers/quantization/quark/quark_moe.py +3 -3
  198. sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +0 -7
  199. sglang/srt/layers/quantization/unquant.py +23 -48
  200. sglang/srt/layers/quantization/utils.py +0 -1
  201. sglang/srt/layers/quantization/w4afp8.py +87 -20
  202. sglang/srt/layers/quantization/w8a8_int8.py +30 -24
  203. sglang/srt/layers/radix_attention.py +62 -9
  204. sglang/srt/layers/rotary_embedding.py +686 -17
  205. sglang/srt/layers/sampler.py +47 -16
  206. sglang/srt/layers/sparse_pooler.py +98 -0
  207. sglang/srt/layers/utils.py +0 -1
  208. sglang/srt/layers/vocab_parallel_embedding.py +4 -1
  209. sglang/srt/lora/backend/triton_backend.py +0 -1
  210. sglang/srt/lora/eviction_policy.py +139 -0
  211. sglang/srt/lora/lora_manager.py +24 -9
  212. sglang/srt/lora/lora_registry.py +1 -1
  213. sglang/srt/lora/mem_pool.py +40 -16
  214. sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +1 -1
  215. sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +4 -2
  216. sglang/srt/managers/cache_controller.py +48 -17
  217. sglang/srt/managers/data_parallel_controller.py +146 -42
  218. sglang/srt/managers/detokenizer_manager.py +40 -13
  219. sglang/srt/managers/io_struct.py +69 -16
  220. sglang/srt/managers/mm_utils.py +20 -18
  221. sglang/srt/managers/multi_tokenizer_mixin.py +83 -82
  222. sglang/srt/managers/overlap_utils.py +96 -19
  223. sglang/srt/managers/schedule_batch.py +241 -511
  224. sglang/srt/managers/schedule_policy.py +15 -2
  225. sglang/srt/managers/scheduler.py +420 -514
  226. sglang/srt/managers/scheduler_metrics_mixin.py +73 -18
  227. sglang/srt/managers/scheduler_output_processor_mixin.py +317 -111
  228. sglang/srt/managers/scheduler_pp_mixin.py +341 -0
  229. sglang/srt/managers/scheduler_profiler_mixin.py +60 -14
  230. sglang/srt/managers/scheduler_runtime_checker_mixin.py +217 -0
  231. sglang/srt/managers/scheduler_update_weights_mixin.py +33 -14
  232. sglang/srt/managers/tokenizer_communicator_mixin.py +71 -55
  233. sglang/srt/managers/tokenizer_manager.py +375 -95
  234. sglang/srt/managers/tp_worker.py +212 -161
  235. sglang/srt/managers/utils.py +78 -2
  236. sglang/srt/mem_cache/allocator.py +7 -2
  237. sglang/srt/mem_cache/allocator_ascend.py +2 -2
  238. sglang/srt/mem_cache/base_prefix_cache.py +2 -2
  239. sglang/srt/mem_cache/chunk_cache.py +13 -2
  240. sglang/srt/mem_cache/common.py +480 -0
  241. sglang/srt/mem_cache/evict_policy.py +16 -1
  242. sglang/srt/mem_cache/hicache_storage.py +11 -2
  243. sglang/srt/mem_cache/hiradix_cache.py +16 -3
  244. sglang/srt/mem_cache/mamba_radix_cache.py +993 -0
  245. sglang/srt/mem_cache/memory_pool.py +517 -219
  246. sglang/srt/mem_cache/memory_pool_host.py +0 -1
  247. sglang/srt/mem_cache/multimodal_cache.py +0 -1
  248. sglang/srt/mem_cache/radix_cache.py +53 -19
  249. sglang/srt/mem_cache/radix_cache_cpp.py +19 -14
  250. sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +8 -2
  251. sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +1 -13
  252. sglang/srt/mem_cache/storage/backend_factory.py +2 -2
  253. sglang/srt/mem_cache/storage/eic/eic_storage.py +5 -6
  254. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +0 -1
  255. sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +3 -2
  256. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +9 -3
  257. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +5 -3
  258. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +101 -17
  259. sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +38 -9
  260. sglang/srt/mem_cache/storage/nixl/nixl_utils.py +1 -1
  261. sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +17 -2
  262. sglang/srt/mem_cache/swa_radix_cache.py +92 -26
  263. sglang/srt/metrics/collector.py +31 -0
  264. sglang/srt/metrics/func_timer.py +1 -1
  265. sglang/srt/model_executor/cuda_graph_runner.py +43 -5
  266. sglang/srt/model_executor/forward_batch_info.py +71 -25
  267. sglang/srt/model_executor/model_runner.py +362 -270
  268. sglang/srt/model_executor/npu_graph_runner.py +2 -3
  269. sglang/srt/model_executor/piecewise_cuda_graph_runner.py +549 -0
  270. sglang/srt/model_loader/__init__.py +1 -1
  271. sglang/srt/model_loader/loader.py +424 -27
  272. sglang/srt/model_loader/utils.py +0 -1
  273. sglang/srt/model_loader/weight_utils.py +47 -28
  274. sglang/srt/models/apertus.py +2 -3
  275. sglang/srt/models/arcee.py +2 -2
  276. sglang/srt/models/bailing_moe.py +13 -52
  277. sglang/srt/models/bailing_moe_nextn.py +3 -4
  278. sglang/srt/models/bert.py +1 -1
  279. sglang/srt/models/deepseek_nextn.py +19 -3
  280. sglang/srt/models/deepseek_ocr.py +1516 -0
  281. sglang/srt/models/deepseek_v2.py +418 -140
  282. sglang/srt/models/dots_ocr.py +0 -2
  283. sglang/srt/models/dots_vlm.py +0 -1
  284. sglang/srt/models/dots_vlm_vit.py +1 -1
  285. sglang/srt/models/falcon_h1.py +13 -19
  286. sglang/srt/models/gemma3_mm.py +16 -0
  287. sglang/srt/models/gemma3n_mm.py +1 -2
  288. sglang/srt/models/glm4_moe.py +327 -382
  289. sglang/srt/models/glm4_moe_nextn.py +6 -16
  290. sglang/srt/models/glm4v.py +2 -1
  291. sglang/srt/models/glm4v_moe.py +32 -199
  292. sglang/srt/models/gpt_oss.py +5 -5
  293. sglang/srt/models/grok.py +10 -23
  294. sglang/srt/models/hunyuan.py +2 -7
  295. sglang/srt/models/interns1.py +0 -1
  296. sglang/srt/models/kimi_vl.py +1 -7
  297. sglang/srt/models/kimi_vl_moonvit.py +3 -1
  298. sglang/srt/models/llama.py +2 -2
  299. sglang/srt/models/llama_eagle3.py +1 -1
  300. sglang/srt/models/longcat_flash.py +5 -22
  301. sglang/srt/models/longcat_flash_nextn.py +3 -14
  302. sglang/srt/models/mimo.py +2 -13
  303. sglang/srt/models/mimo_mtp.py +1 -2
  304. sglang/srt/models/minicpmo.py +7 -5
  305. sglang/srt/models/minimax_m2.py +922 -0
  306. sglang/srt/models/mixtral.py +1 -4
  307. sglang/srt/models/mllama.py +1 -1
  308. sglang/srt/models/mllama4.py +13 -3
  309. sglang/srt/models/nemotron_h.py +511 -0
  310. sglang/srt/models/nvila.py +355 -0
  311. sglang/srt/models/nvila_lite.py +184 -0
  312. sglang/srt/models/olmo2.py +31 -4
  313. sglang/srt/models/opt.py +5 -5
  314. sglang/srt/models/phi.py +1 -1
  315. sglang/srt/models/phi4mm.py +1 -1
  316. sglang/srt/models/phimoe.py +0 -1
  317. sglang/srt/models/pixtral.py +0 -3
  318. sglang/srt/models/points_v15_chat.py +186 -0
  319. sglang/srt/models/qwen.py +0 -1
  320. sglang/srt/models/qwen2.py +22 -1
  321. sglang/srt/models/qwen2_5_vl.py +3 -3
  322. sglang/srt/models/qwen2_audio.py +2 -15
  323. sglang/srt/models/qwen2_moe.py +15 -12
  324. sglang/srt/models/qwen2_vl.py +5 -2
  325. sglang/srt/models/qwen3.py +34 -4
  326. sglang/srt/models/qwen3_moe.py +19 -37
  327. sglang/srt/models/qwen3_next.py +7 -12
  328. sglang/srt/models/qwen3_next_mtp.py +3 -4
  329. sglang/srt/models/qwen3_omni_moe.py +661 -0
  330. sglang/srt/models/qwen3_vl.py +37 -33
  331. sglang/srt/models/qwen3_vl_moe.py +57 -185
  332. sglang/srt/models/roberta.py +55 -3
  333. sglang/srt/models/sarashina2_vision.py +0 -1
  334. sglang/srt/models/step3_vl.py +3 -5
  335. sglang/srt/models/utils.py +11 -1
  336. sglang/srt/multimodal/processors/base_processor.py +7 -2
  337. sglang/srt/multimodal/processors/deepseek_ocr.py +37 -0
  338. sglang/srt/multimodal/processors/deepseek_vl_v2.py +0 -3
  339. sglang/srt/multimodal/processors/dots_vlm.py +0 -1
  340. sglang/srt/multimodal/processors/glm4v.py +2 -6
  341. sglang/srt/multimodal/processors/internvl.py +0 -2
  342. sglang/srt/multimodal/processors/janus_pro.py +0 -1
  343. sglang/srt/multimodal/processors/mllama4.py +0 -8
  344. sglang/srt/multimodal/processors/{vila.py → nvila.py} +32 -24
  345. sglang/srt/multimodal/processors/phi4mm.py +0 -1
  346. sglang/srt/multimodal/processors/points_v15_chat.py +52 -0
  347. sglang/srt/multimodal/processors/qwen_vl.py +75 -16
  348. sglang/srt/multimodal/processors/step3_vl.py +1 -1
  349. sglang/srt/parser/conversation.py +41 -0
  350. sglang/srt/parser/reasoning_parser.py +28 -2
  351. sglang/srt/sampling/custom_logit_processor.py +77 -2
  352. sglang/srt/sampling/sampling_batch_info.py +17 -22
  353. sglang/srt/sampling/sampling_params.py +70 -2
  354. sglang/srt/server_args.py +846 -163
  355. sglang/srt/server_args_config_parser.py +1 -1
  356. sglang/srt/single_batch_overlap.py +36 -31
  357. sglang/srt/speculative/base_spec_worker.py +34 -0
  358. sglang/srt/speculative/draft_utils.py +226 -0
  359. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +24 -7
  360. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +23 -2
  361. sglang/srt/speculative/eagle_info.py +57 -18
  362. sglang/srt/speculative/eagle_info_v2.py +458 -0
  363. sglang/srt/speculative/eagle_utils.py +138 -0
  364. sglang/srt/speculative/eagle_worker.py +83 -280
  365. sglang/srt/speculative/eagle_worker_v2.py +702 -0
  366. sglang/srt/speculative/{ngram_utils.py → ngram_info.py} +14 -9
  367. sglang/srt/speculative/ngram_worker.py +12 -11
  368. sglang/srt/speculative/spec_info.py +2 -0
  369. sglang/srt/speculative/spec_utils.py +38 -3
  370. sglang/srt/speculative/standalone_worker.py +4 -14
  371. sglang/srt/tokenizer/tiktoken_tokenizer.py +2 -2
  372. sglang/srt/two_batch_overlap.py +28 -14
  373. sglang/srt/utils/__init__.py +1 -1
  374. sglang/srt/{bench_utils.py → utils/bench_utils.py} +4 -2
  375. sglang/srt/utils/common.py +272 -82
  376. sglang/srt/utils/hf_transformers_utils.py +44 -17
  377. sglang/srt/{host_shared_memory.py → utils/host_shared_memory.py} +0 -1
  378. sglang/srt/{offloader.py → utils/offloader.py} +4 -4
  379. sglang/srt/utils/profile_merger.py +199 -0
  380. sglang/test/attention/test_flashattn_backend.py +1 -1
  381. sglang/test/attention/test_flashattn_mla_backend.py +0 -1
  382. sglang/test/attention/test_prefix_chunk_info.py +0 -2
  383. sglang/test/attention/test_trtllm_mla_backend.py +221 -53
  384. sglang/test/few_shot_gsm8k_engine.py +2 -4
  385. sglang/test/kit_matched_stop.py +157 -0
  386. sglang/test/longbench_v2/__init__.py +1 -0
  387. sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
  388. sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
  389. sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
  390. sglang/test/run_eval.py +41 -0
  391. sglang/test/runners.py +2 -0
  392. sglang/test/send_one.py +42 -7
  393. sglang/test/simple_eval_common.py +3 -0
  394. sglang/test/simple_eval_gpqa.py +0 -1
  395. sglang/test/simple_eval_humaneval.py +0 -3
  396. sglang/test/simple_eval_longbench_v2.py +344 -0
  397. sglang/test/test_block_fp8.py +1 -2
  398. sglang/test/test_block_fp8_deep_gemm_blackwell.py +0 -1
  399. sglang/test/test_cutlass_moe.py +1 -2
  400. sglang/test/test_cutlass_w4a8_moe.py +10 -20
  401. sglang/test/test_deterministic.py +463 -107
  402. sglang/test/test_deterministic_utils.py +74 -0
  403. sglang/test/test_disaggregation_utils.py +81 -0
  404. sglang/test/test_marlin_moe.py +0 -1
  405. sglang/test/test_utils.py +85 -20
  406. sglang/version.py +1 -1
  407. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/METADATA +48 -35
  408. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/RECORD +414 -350
  409. sglang/srt/layers/attention/mamba/mamba_utils.py +0 -81
  410. sglang/srt/managers/tp_worker_overlap_thread.py +0 -311
  411. sglang/srt/models/vila.py +0 -306
  412. sglang/srt/speculative/build_eagle_tree.py +0 -427
  413. sglang/test/test_block_fp8_ep.py +0 -358
  414. /sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/__init__.py +0 -0
  415. /sglang/srt/{aio_rwlock.py → utils/aio_rwlock.py} +0 -0
  416. /sglang/srt/{torch_memory_saver_adapter.py → utils/torch_memory_saver_adapter.py} +0 -0
  417. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/WHEEL +0 -0
  418. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/licenses/LICENSE +0 -0
  419. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/top_level.txt +0 -0
@@ -3,164 +3,44 @@ Standalone gRPC Server for SGLang - Fully separated from HTTP server.
3
3
  Uses GrpcRequestManager for orchestration without tokenization.
4
4
  """
5
5
 
6
- import argparse
7
6
  import asyncio
7
+ import dataclasses
8
8
  import logging
9
9
  import multiprocessing as mp
10
10
  import os
11
11
  import signal
12
+ import threading
12
13
  import time
13
14
  from concurrent import futures
14
- from typing import AsyncIterator, Dict, Optional, Tuple
15
+ from typing import AsyncIterator, Dict, Optional
15
16
 
16
17
  import grpc
18
+ from google.protobuf.json_format import MessageToDict
19
+ from google.protobuf.struct_pb2 import Struct
20
+ from google.protobuf.timestamp_pb2 import Timestamp
21
+ from grpc_health.v1 import health_pb2_grpc
17
22
  from grpc_reflection.v1alpha import reflection
18
23
 
24
+ import sglang
19
25
  from sglang.srt.disaggregation.utils import FAKE_BOOTSTRAP_HOST, DisaggregationMode
20
- from sglang.srt.entrypoints.grpc_request_manager import GrpcRequestManager
21
26
  from sglang.srt.grpc import sglang_scheduler_pb2, sglang_scheduler_pb2_grpc
22
- from sglang.srt.managers.data_parallel_controller import (
23
- run_data_parallel_controller_process,
24
- )
27
+ from sglang.srt.grpc.grpc_request_manager import GrpcRequestManager
28
+ from sglang.srt.grpc.health_servicer import SGLangHealthServicer
29
+ from sglang.srt.grpc.scheduler_launcher import launch_scheduler_process_only
25
30
  from sglang.srt.managers.disagg_service import start_disagg_service
26
31
  from sglang.srt.managers.io_struct import (
27
32
  TokenizedEmbeddingReqInput,
28
33
  TokenizedGenerateReqInput,
29
34
  )
30
- from sglang.srt.managers.scheduler import run_scheduler_process
31
35
  from sglang.srt.sampling.sampling_params import SamplingParams as SGLSamplingParams
32
- from sglang.srt.server_args import PortArgs, ServerArgs
33
- from sglang.srt.torch_memory_saver_adapter import TorchMemorySaverAdapter
34
- from sglang.srt.utils import configure_logger, prepare_model_and_tokenizer
36
+ from sglang.srt.server_args import ServerArgs
37
+ from sglang.srt.utils import kill_process_tree
35
38
  from sglang.utils import get_exception_traceback
36
39
 
37
40
  logger = logging.getLogger(__name__)
38
41
  HEALTH_CHECK_TIMEOUT = int(os.getenv("SGLANG_HEALTH_CHECK_TIMEOUT", 20))
39
42
 
40
43
 
41
- def _run_scheduler_with_signal_handling(*args, **kwargs):
42
- """
43
- Wrapper for run_scheduler_process that ignores SIGINT.
44
-
45
- The scheduler process should not handle Ctrl+C - it should only terminate
46
- when the parent gRPC server exits (via kill_itself_when_parent_died).
47
- """
48
- # Ignore SIGINT in this subprocess - let the parent handle it
49
- signal.signal(signal.SIGINT, signal.SIG_IGN)
50
-
51
- # Now run the actual scheduler process
52
- run_scheduler_process(*args, **kwargs)
53
-
54
-
55
- def _launch_scheduler_process_only(
56
- server_args: ServerArgs,
57
- port_args: Optional[PortArgs] = None,
58
- ) -> Tuple[Dict, PortArgs, list]:
59
- """
60
- Launch only the scheduler process(es) without tokenizer/detokenizer.
61
- Returns scheduler info, port args, and list of scheduler processes.
62
- """
63
- # Configure global environment
64
- configure_logger(server_args)
65
- server_args.check_server_args()
66
-
67
- # Allocate ports for inter-process communications
68
- if port_args is None:
69
- port_args = PortArgs.init_new(server_args)
70
- logger.info(f"{server_args=}")
71
-
72
- # Prepare model and tokenizer paths
73
- server_args.model_path, server_args.tokenizer_path = prepare_model_and_tokenizer(
74
- server_args.model_path, server_args.tokenizer_path
75
- )
76
-
77
- scheduler_procs = []
78
- if server_args.dp_size == 1:
79
- memory_saver_adapter = TorchMemorySaverAdapter.create(
80
- enable=server_args.enable_memory_saver
81
- )
82
- scheduler_pipe_readers = []
83
-
84
- nnodes_per_tp_group = max(server_args.nnodes // server_args.pp_size, 1)
85
- tp_size_per_node = server_args.tp_size // nnodes_per_tp_group
86
- tp_rank_range = range(
87
- tp_size_per_node * (server_args.node_rank % nnodes_per_tp_group),
88
- tp_size_per_node * (server_args.node_rank % nnodes_per_tp_group + 1),
89
- )
90
-
91
- pp_size_per_node = max(server_args.pp_size // server_args.nnodes, 1)
92
- pp_rank_range = range(
93
- pp_size_per_node * (server_args.node_rank // nnodes_per_tp_group),
94
- pp_size_per_node * (server_args.node_rank // nnodes_per_tp_group + 1),
95
- )
96
-
97
- for pp_rank in pp_rank_range:
98
- for tp_rank in tp_rank_range:
99
- reader, writer = mp.Pipe(duplex=False)
100
- gpu_id = (
101
- server_args.base_gpu_id
102
- + ((pp_rank % pp_size_per_node) * tp_size_per_node)
103
- + (tp_rank % tp_size_per_node) * server_args.gpu_id_step
104
- )
105
- moe_ep_rank = tp_rank // (server_args.tp_size // server_args.ep_size)
106
- proc = mp.Process(
107
- target=_run_scheduler_with_signal_handling,
108
- args=(
109
- server_args,
110
- port_args,
111
- gpu_id,
112
- tp_rank,
113
- moe_ep_rank,
114
- pp_rank,
115
- None,
116
- writer,
117
- ),
118
- )
119
-
120
- with memory_saver_adapter.configure_subprocess():
121
- proc.start()
122
- scheduler_procs.append(proc)
123
- scheduler_pipe_readers.append(reader)
124
- else:
125
- # Launch the data parallel controller
126
- reader, writer = mp.Pipe(duplex=False)
127
- scheduler_pipe_readers = [reader]
128
- proc = mp.Process(
129
- target=run_data_parallel_controller_process,
130
- args=(server_args, port_args, writer),
131
- )
132
- proc.start()
133
- scheduler_procs.append(proc)
134
-
135
- # TODO(CatherineSue): handle cases for multi-node
136
-
137
- # Wait for all scheduler processes to be ready
138
- scheduler_infos = []
139
- for i, reader in enumerate(scheduler_pipe_readers):
140
- try:
141
- data = reader.recv()
142
- except EOFError:
143
- logger.error(
144
- f"Rank {i} scheduler is dead. Please check if there are relevant logs."
145
- )
146
- scheduler_procs[i].join()
147
- logger.error(f"Exit code: {scheduler_procs[i].exitcode}")
148
- raise RuntimeError(f"Failed to initialize scheduler rank {i}")
149
-
150
- if data.get("status") != "ready":
151
- raise RuntimeError(
152
- f"Scheduler rank {i} initialization failed: {data.get('error', 'Unknown error')}"
153
- )
154
- scheduler_infos.append(data)
155
-
156
- logger.info(
157
- f"All {len(scheduler_procs)} scheduler process(es) initialized successfully"
158
- )
159
-
160
- # Return the first scheduler's info (they should all be the same)
161
- return scheduler_infos[0], port_args, scheduler_procs
162
-
163
-
164
44
  class SGLangSchedulerServicer(sglang_scheduler_pb2_grpc.SglangSchedulerServicer):
165
45
  """
166
46
  Standalone gRPC service implementation using GrpcRequestManager.
@@ -172,17 +52,21 @@ class SGLangSchedulerServicer(sglang_scheduler_pb2_grpc.SglangSchedulerServicer)
172
52
  request_manager: GrpcRequestManager,
173
53
  server_args: ServerArgs,
174
54
  model_info: Dict,
55
+ scheduler_info: Dict,
56
+ health_servicer: Optional[SGLangHealthServicer] = None,
175
57
  ):
176
58
  """Initialize the standalone gRPC service."""
177
59
  self.request_manager = request_manager
178
60
  self.server_args = server_args
179
61
  self.model_info = model_info
62
+ self.scheduler_info = scheduler_info
180
63
  self.start_time = time.time()
64
+ self.health_servicer = health_servicer
181
65
 
182
66
  # Start the request manager's event loop using auto_create_handle_loop
183
67
  self.request_manager.auto_create_handle_loop()
184
68
 
185
- logger.info("Standalone gRPC scheduler service initialized")
69
+ logger.info("gRPC scheduler servicer initialized")
186
70
 
187
71
  async def Generate(
188
72
  self,
@@ -190,7 +74,7 @@ class SGLangSchedulerServicer(sglang_scheduler_pb2_grpc.SglangSchedulerServicer)
190
74
  context: grpc.aio.ServicerContext,
191
75
  ) -> AsyncIterator[sglang_scheduler_pb2.GenerateResponse]:
192
76
  """Handle generation requests with streaming responses."""
193
- logger.info(f"Generation request: {request.request_id}")
77
+ logger.info(f"Receive generation request: {request.request_id}")
194
78
 
195
79
  try:
196
80
  # Convert gRPC request to internal format
@@ -242,7 +126,10 @@ class SGLangSchedulerServicer(sglang_scheduler_pb2_grpc.SglangSchedulerServicer)
242
126
  yield self._create_chunk_response(request.request_id, output)
243
127
 
244
128
  except Exception as e:
245
- logger.error(f"Generate failed: {e}\n{get_exception_traceback()}")
129
+ logger.error(
130
+ f"Generate failed for request {request.request_id}: {e}\n"
131
+ f"{get_exception_traceback()}"
132
+ )
246
133
  yield sglang_scheduler_pb2.GenerateResponse(
247
134
  request_id=request.request_id,
248
135
  error=sglang_scheduler_pb2.GenerateError(
@@ -255,10 +142,10 @@ class SGLangSchedulerServicer(sglang_scheduler_pb2_grpc.SglangSchedulerServicer)
255
142
  async def Embed(
256
143
  self,
257
144
  request: sglang_scheduler_pb2.EmbedRequest,
258
- context: grpc.aio.ServicerContext,
145
+ _context: grpc.aio.ServicerContext,
259
146
  ) -> sglang_scheduler_pb2.EmbedResponse:
260
147
  """Handle embedding requests."""
261
- logger.info(f"Embedding request: {request.request_id}")
148
+ logger.info(f"Receive embedding request: {request.request_id}")
262
149
 
263
150
  try:
264
151
  # Convert request
@@ -285,7 +172,10 @@ class SGLangSchedulerServicer(sglang_scheduler_pb2_grpc.SglangSchedulerServicer)
285
172
  )
286
173
 
287
174
  except Exception as e:
288
- logger.error(f"Embed failed: {e}\n{get_exception_traceback()}")
175
+ logger.error(
176
+ f"Embed failed for request {request.request_id}: {e}\n"
177
+ f"{get_exception_traceback()}"
178
+ )
289
179
  return sglang_scheduler_pb2.EmbedResponse(
290
180
  request_id=request.request_id,
291
181
  error=sglang_scheduler_pb2.EmbedError(
@@ -300,86 +190,95 @@ class SGLangSchedulerServicer(sglang_scheduler_pb2_grpc.SglangSchedulerServicer)
300
190
  request: sglang_scheduler_pb2.HealthCheckRequest,
301
191
  context: grpc.aio.ServicerContext,
302
192
  ) -> sglang_scheduler_pb2.HealthCheckResponse:
303
- """Health check by generating from client input."""
304
- try:
305
- # Check if request manager is shutting down
306
- if self.request_manager.gracefully_exit:
307
- return sglang_scheduler_pb2.HealthCheckResponse(
308
- healthy=False, message="Server shutting down"
309
- )
310
-
311
- # Extract tokenized input from request
312
- if not request.HasField("tokenized"):
313
- return sglang_scheduler_pb2.HealthCheckResponse(
314
- healthy=False, message="Tokenized input required for health check"
315
- )
316
-
317
- input_text = request.tokenized.original_text
318
- input_ids = list(request.tokenized.input_ids)
193
+ """
194
+ Check the health of the inference server by sending a special request to generate one token.
195
+ Similar to HTTP server's /health endpoint.
196
+ """
197
+ rid = f"HEALTH_CHECK_{time.time()}"
198
+ logger.info(f"Receive health check request: {rid}")
199
+
200
+ if self.request_manager.gracefully_exit:
201
+ logger.info(
202
+ "Health check request received during shutdown. Returning unhealthy."
203
+ )
204
+ return sglang_scheduler_pb2.HealthCheckResponse(
205
+ healthy=False, message="Server is shutting down"
206
+ )
319
207
 
320
- # Create health check request
321
- rid = f"HEALTH_CHECK_GRPC_{time.time()}"
208
+ # Create a special health check request
209
+ sampling_params = SGLSamplingParams(max_new_tokens=1, temperature=0.0)
210
+ sampling_params.normalize(tokenizer=None)
322
211
 
323
- health_request = TokenizedGenerateReqInput(
212
+ # Create health check request
213
+ is_generation = self.scheduler_info.get("is_generation", True)
214
+ if is_generation:
215
+ health_req = TokenizedGenerateReqInput(
324
216
  rid=rid,
325
- input_text=input_text,
326
- input_ids=input_ids,
327
- sampling_params=SGLSamplingParams(max_new_tokens=1, temperature=0.0),
328
- stream=False,
329
- mm_inputs=None,
217
+ input_text="",
218
+ input_ids=[0],
219
+ sampling_params=sampling_params,
330
220
  return_logprob=False,
331
221
  logprob_start_len=-1,
332
222
  top_logprobs_num=0,
223
+ stream=False,
224
+ mm_inputs=None,
333
225
  token_ids_logprob=None,
334
226
  )
335
-
227
+ # Set disaggregation params if needed
336
228
  if self.server_args.disaggregation_mode != DisaggregationMode.NULL:
337
- health_request.bootstrap_host = FAKE_BOOTSTRAP_HOST
338
- health_request.bootstrap_room = 0
339
-
340
- logger.info(f"Sending health check request to request manager...")
341
-
342
- # Submit and wait for response
343
- output_generator = self.request_manager.generate_request(
344
- health_request, request_id=rid
229
+ health_req.bootstrap_host = FAKE_BOOTSTRAP_HOST
230
+ health_req.bootstrap_room = 0
231
+ else:
232
+ health_req = TokenizedEmbeddingReqInput(
233
+ rid=rid,
234
+ input_text="",
235
+ input_ids=[0],
345
236
  )
346
237
 
238
+ # Submit health check request
239
+ async def run_health_check():
347
240
  try:
348
- # Get first response with timeout
349
- response = await asyncio.wait_for(
350
- output_generator.__anext__(), timeout=HEALTH_CHECK_TIMEOUT
351
- )
352
-
353
- # Clean up
354
- if rid in self.request_manager.rid_to_state:
355
- del self.request_manager.rid_to_state[rid]
356
-
241
+ async for _ in self.request_manager.generate_request(
242
+ obj=health_req,
243
+ request_id=rid,
244
+ ):
245
+ # Got at least one response, server is healthy
246
+ return True
247
+ except Exception as e:
248
+ logger.warning(f"Health check failed: {e}")
249
+ return False
250
+ return False
251
+
252
+ task = asyncio.create_task(run_health_check())
253
+
254
+ # Wait for response with timeout
255
+ tic = time.time()
256
+ while time.time() < tic + HEALTH_CHECK_TIMEOUT:
257
+ await asyncio.sleep(1)
258
+ # Check if we got a response from scheduler
259
+ if self.request_manager.last_receive_tstamp > tic:
260
+ task.cancel()
261
+ # Clean up health check state
262
+ self.request_manager._cleanup_request_state(rid)
357
263
  return sglang_scheduler_pb2.HealthCheckResponse(
358
264
  healthy=True, message="Health check passed"
359
265
  )
360
266
 
361
- except asyncio.TimeoutError:
362
- # Clean up on timeout
363
- if rid in self.request_manager.rid_to_state:
364
- del self.request_manager.rid_to_state[rid]
365
-
366
- return sglang_scheduler_pb2.HealthCheckResponse(
367
- healthy=False, message="Health check timeout"
368
- )
369
-
370
- except Exception as e:
371
- logger.error(f"Health check failed: {e}")
372
- return sglang_scheduler_pb2.HealthCheckResponse(
373
- healthy=False, message=f"Health check error: {str(e)}"
374
- )
267
+ # Timeout - server not responding
268
+ task.cancel()
269
+ self.request_manager._cleanup_request_state(rid)
270
+ logger.warning(f"Health check timeout after {HEALTH_CHECK_TIMEOUT}s")
271
+ return sglang_scheduler_pb2.HealthCheckResponse(
272
+ healthy=False, message=f"Health check timeout after {HEALTH_CHECK_TIMEOUT}s"
273
+ )
375
274
 
376
275
  async def Abort(
377
276
  self,
378
277
  request: sglang_scheduler_pb2.AbortRequest,
379
- context: grpc.aio.ServicerContext,
278
+ _context: grpc.aio.ServicerContext,
380
279
  ) -> sglang_scheduler_pb2.AbortResponse:
381
280
  """Abort an ongoing request."""
382
- logger.info(f"Aborting request: {request.request_id}")
281
+ logger.info(f"Receive abort request: {request.request_id}")
383
282
 
384
283
  try:
385
284
  success = await self.request_manager.abort_request(request.request_id)
@@ -389,12 +288,98 @@ class SGLangSchedulerServicer(sglang_scheduler_pb2_grpc.SglangSchedulerServicer)
389
288
  message=f"Request {request.request_id} {'aborted' if success else 'not found'}",
390
289
  )
391
290
  except Exception as e:
392
- logger.error(f"Abort failed: {e}")
291
+ logger.error(
292
+ f"Abort failed for request {request.request_id}: {e}\n"
293
+ f"{get_exception_traceback()}"
294
+ )
393
295
  return sglang_scheduler_pb2.AbortResponse(
394
296
  success=False,
395
297
  message=str(e),
396
298
  )
397
299
 
300
+ async def GetModelInfo(
301
+ self,
302
+ _request: sglang_scheduler_pb2.GetModelInfoRequest,
303
+ _context: grpc.aio.ServicerContext,
304
+ ) -> sglang_scheduler_pb2.GetModelInfoResponse:
305
+ """Get model information."""
306
+ logger.debug("Receive model info request")
307
+
308
+ is_generation = self.scheduler_info.get("is_generation")
309
+ if is_generation is None:
310
+ is_generation = not self.server_args.is_embedding
311
+
312
+ return sglang_scheduler_pb2.GetModelInfoResponse(
313
+ model_path=self.server_args.model_path,
314
+ tokenizer_path=self.server_args.tokenizer_path or "",
315
+ is_generation=is_generation,
316
+ preferred_sampling_params=(
317
+ self.server_args.preferred_sampling_params or ""
318
+ ),
319
+ weight_version=self.server_args.weight_version or "",
320
+ served_model_name=self.server_args.served_model_name,
321
+ max_context_length=self.model_info["max_context_length"],
322
+ vocab_size=self.model_info["vocab_size"],
323
+ supports_vision=self.model_info["supports_vision"],
324
+ model_type=self.model_info["model_type"],
325
+ eos_token_ids=self.model_info["eos_token_ids"],
326
+ pad_token_id=self.model_info["pad_token_id"],
327
+ bos_token_id=self.model_info["bos_token_id"],
328
+ max_req_input_len=self.model_info["max_req_input_len"],
329
+ )
330
+
331
+ async def GetServerInfo(
332
+ self,
333
+ _request: sglang_scheduler_pb2.GetServerInfoRequest,
334
+ _context: grpc.aio.ServicerContext,
335
+ ) -> sglang_scheduler_pb2.GetServerInfoResponse:
336
+ """Get server information."""
337
+ logger.debug("Receive server info request")
338
+
339
+ server_args_dict = dataclasses.asdict(self.server_args)
340
+ server_args_struct = Struct()
341
+
342
+ def make_serializable(obj):
343
+ if obj is None:
344
+ return None
345
+ elif isinstance(obj, (str, int, float, bool)):
346
+ return obj
347
+ elif isinstance(obj, (list, tuple, set)):
348
+ return [make_serializable(item) for item in obj]
349
+ elif isinstance(obj, dict):
350
+ return {k: make_serializable(v) for k, v in obj.items()}
351
+ else:
352
+ return str(obj)
353
+
354
+ serializable_args = make_serializable(server_args_dict)
355
+ server_args_struct.update(serializable_args)
356
+
357
+ # Convert scheduler_info to Struct
358
+ scheduler_info_struct = Struct()
359
+ scheduler_info_struct.update(self.scheduler_info)
360
+
361
+ # Get runtime state from request manager
362
+ manager_state = self.request_manager.get_server_info()
363
+
364
+ # Calculate uptime
365
+ uptime = time.time() - self.start_time
366
+
367
+ # Create timestamp
368
+ start_timestamp = Timestamp()
369
+ start_timestamp.FromSeconds(int(self.start_time))
370
+
371
+ return sglang_scheduler_pb2.GetServerInfoResponse(
372
+ server_args=server_args_struct,
373
+ scheduler_info=scheduler_info_struct,
374
+ active_requests=manager_state["active_requests"],
375
+ is_paused=manager_state["paused"],
376
+ last_receive_timestamp=manager_state["last_receive_time"],
377
+ uptime_seconds=uptime,
378
+ sglang_version=sglang.__version__,
379
+ server_type="grpc",
380
+ start_time=start_timestamp,
381
+ )
382
+
398
383
  # Helper methods for request/response conversion
399
384
 
400
385
  def _convert_generate_request(
@@ -411,15 +396,27 @@ class SGLangSchedulerServicer(sglang_scheduler_pb2_grpc.SglangSchedulerServicer)
411
396
 
412
397
  # Convert sampling params
413
398
  sampling_params = self._convert_sampling_params(grpc_req.sampling_params)
399
+ sampling_params.normalize(tokenizer=None)
414
400
 
415
401
  # Extract disaggregated params if present
416
402
  bootstrap_host = None
417
403
  bootstrap_port = None
418
404
  bootstrap_room = None
419
405
  if grpc_req.HasField("disaggregated_params"):
420
- bootstrap_host = grpc_req.disaggregated_params.bootstrap_host or None
421
- bootstrap_port = grpc_req.disaggregated_params.bootstrap_port or None
422
- bootstrap_room = grpc_req.disaggregated_params.bootstrap_room or None
406
+ # Don't use 'or None' as it treats 0 as falsy
407
+ bootstrap_host = (
408
+ grpc_req.disaggregated_params.bootstrap_host
409
+ if grpc_req.disaggregated_params.bootstrap_host
410
+ else None
411
+ )
412
+ bootstrap_port = (
413
+ grpc_req.disaggregated_params.bootstrap_port
414
+ if grpc_req.disaggregated_params.bootstrap_port
415
+ else None
416
+ )
417
+ bootstrap_room = (
418
+ grpc_req.disaggregated_params.bootstrap_room
419
+ ) # Can be 0, don't use 'or None'
423
420
 
424
421
  # Create request
425
422
  return TokenizedGenerateReqInput(
@@ -483,28 +480,52 @@ class SGLangSchedulerServicer(sglang_scheduler_pb2_grpc.SglangSchedulerServicer)
483
480
  elif grpc_params.HasField("structural_tag"):
484
481
  structural_tag = grpc_params.structural_tag
485
482
 
483
+ # Handle optional parameters conversion
484
+ custom_params = (
485
+ MessageToDict(grpc_params.custom_params)
486
+ if grpc_params.HasField("custom_params")
487
+ else None
488
+ )
489
+ max_new_tokens = (
490
+ grpc_params.max_new_tokens
491
+ if grpc_params.HasField("max_new_tokens")
492
+ else None
493
+ )
494
+ stream_interval = (
495
+ grpc_params.stream_interval
496
+ if grpc_params.HasField("stream_interval")
497
+ else None
498
+ )
499
+ logit_bias = dict(grpc_params.logit_bias) if grpc_params.logit_bias else None
500
+ stop = list(grpc_params.stop) if grpc_params.stop else None
501
+ stop_token_ids = (
502
+ list(grpc_params.stop_token_ids) if grpc_params.stop_token_ids else None
503
+ )
504
+
486
505
  return SGLSamplingParams(
487
- temperature=grpc_params.temperature or 1.0,
488
- top_p=grpc_params.top_p or 1.0,
489
- top_k=grpc_params.top_k or -1,
490
- min_p=grpc_params.min_p or 0.0,
491
- frequency_penalty=grpc_params.frequency_penalty or 0.0,
492
- presence_penalty=grpc_params.presence_penalty or 0.0,
493
- repetition_penalty=grpc_params.repetition_penalty or 1.0,
494
- max_new_tokens=grpc_params.max_new_tokens or 128,
495
- min_new_tokens=grpc_params.min_new_tokens or 0,
496
- stop=list(grpc_params.stop) if grpc_params.stop else [],
497
- stop_token_ids=(
498
- list(grpc_params.stop_token_ids) if grpc_params.stop_token_ids else []
499
- ),
506
+ temperature=grpc_params.temperature,
507
+ top_p=grpc_params.top_p,
508
+ top_k=grpc_params.top_k,
509
+ min_p=grpc_params.min_p,
510
+ frequency_penalty=grpc_params.frequency_penalty,
511
+ presence_penalty=grpc_params.presence_penalty,
512
+ repetition_penalty=grpc_params.repetition_penalty,
513
+ max_new_tokens=max_new_tokens,
514
+ min_new_tokens=grpc_params.min_new_tokens,
515
+ stop=stop,
516
+ stop_token_ids=stop_token_ids,
500
517
  skip_special_tokens=grpc_params.skip_special_tokens,
501
518
  spaces_between_special_tokens=grpc_params.spaces_between_special_tokens,
519
+ no_stop_trim=grpc_params.no_stop_trim,
502
520
  regex=regex,
503
521
  json_schema=json_schema,
504
522
  ebnf=ebnf_grammar,
505
523
  structural_tag=structural_tag,
506
- n=grpc_params.n or 1,
524
+ n=grpc_params.n,
507
525
  ignore_eos=grpc_params.ignore_eos,
526
+ stream_interval=stream_interval,
527
+ logit_bias=logit_bias,
528
+ custom_params=custom_params,
508
529
  )
509
530
 
510
531
  def _convert_output_logprobs_to_proto(
@@ -667,6 +688,10 @@ class SGLangSchedulerServicer(sglang_scheduler_pb2_grpc.SglangSchedulerServicer)
667
688
  """Shutdown the service."""
668
689
  logger.info("Shutting down gRPC service")
669
690
 
691
+ # Mark health service as NOT_SERVING before shutdown
692
+ if self.health_servicer:
693
+ self.health_servicer.set_not_serving()
694
+
670
695
  # Shutdown request manager (handles its own tasks)
671
696
  await self.request_manager.shutdown()
672
697
 
@@ -689,7 +714,7 @@ async def serve_grpc(
689
714
 
690
715
  # Launch only the scheduler process(es) (no tokenizer/detokenizer needed for gRPC)
691
716
  logger.info("Launching scheduler process(es)...")
692
- scheduler_info, port_args, scheduler_procs = _launch_scheduler_process_only(
717
+ scheduler_info, port_args, scheduler_procs = launch_scheduler_process_only(
693
718
  server_args=server_args,
694
719
  )
695
720
 
@@ -726,17 +751,27 @@ async def serve_grpc(
726
751
  ],
727
752
  )
728
753
 
729
- # Add service
754
+ # Create standard health service (for Kubernetes probes)
755
+ health_servicer = SGLangHealthServicer(
756
+ request_manager=request_manager,
757
+ scheduler_info=scheduler_info,
758
+ )
759
+ health_pb2_grpc.add_HealthServicer_to_server(health_servicer, server)
760
+
761
+ # Add SGLang service
730
762
  servicer = SGLangSchedulerServicer(
731
763
  request_manager=request_manager,
732
764
  server_args=server_args,
733
765
  model_info=model_info,
766
+ scheduler_info=scheduler_info,
767
+ health_servicer=health_servicer,
734
768
  )
735
769
  sglang_scheduler_pb2_grpc.add_SglangSchedulerServicer_to_server(servicer, server)
736
770
 
737
771
  # Enable reflection
738
772
  SERVICE_NAMES = (
739
773
  sglang_scheduler_pb2.DESCRIPTOR.services_by_name["SglangScheduler"].full_name,
774
+ "grpc.health.v1.Health",
740
775
  reflection.SERVICE_NAME,
741
776
  )
742
777
  reflection.enable_server_reflection(SERVICE_NAMES, server)
@@ -745,9 +780,15 @@ async def serve_grpc(
745
780
  listen_addr = f"{server_args.host}:{server_args.port}"
746
781
  server.add_insecure_port(listen_addr)
747
782
 
748
- logger.info(f"Starting standalone gRPC server on {listen_addr}")
749
-
750
783
  await server.start()
784
+ logger.info(f"gRPC server listening on {listen_addr}")
785
+
786
+ # Start warmup in a separate thread
787
+ warmup_thread = threading.Thread(
788
+ target=_wait_and_warmup_grpc,
789
+ args=(server_args, None, health_servicer),
790
+ )
791
+ warmup_thread.start()
751
792
 
752
793
  # Handle shutdown signals
753
794
  loop = asyncio.get_running_loop()
@@ -771,6 +812,11 @@ async def serve_grpc(
771
812
  # Stop the gRPC server
772
813
  await server.stop(5.0)
773
814
 
815
+ # Wait for warmup thread to finish
816
+ if warmup_thread.is_alive():
817
+ logger.info("Waiting for warmup thread to finish...")
818
+ warmup_thread.join(timeout=5.0)
819
+
774
820
  # Terminate scheduler processes before exiting to avoid atexit hang
775
821
  # The scheduler processes have SIGINT ignored, so they won't get KeyboardInterrupt
776
822
  for i, proc in enumerate(scheduler_procs):
@@ -788,23 +834,173 @@ async def serve_grpc(
788
834
  logger.info("All scheduler processes terminated")
789
835
 
790
836
 
791
- def main():
792
- """Main entry point for standalone gRPC server."""
793
- # Fix CUDA multiprocessing issues - must be called before any CUDA operations
794
- mp.set_start_method("spawn", force=True)
837
+ def _execute_grpc_server_warmup(
838
+ server_args: ServerArgs,
839
+ pipe_finish_writer: Optional[mp.connection.Connection],
840
+ ):
841
+ """Execute warmup for gRPC server by checking health and sending test request."""
842
+ try:
843
+ # Connect to the gRPC server
844
+ grpc_url = f"{server_args.host}:{server_args.port}"
845
+ channel = grpc.insecure_channel(
846
+ grpc_url,
847
+ options=[
848
+ ("grpc.max_send_message_length", 1024 * 1024 * 256),
849
+ ("grpc.max_receive_message_length", 1024 * 1024 * 256),
850
+ ],
851
+ )
852
+ stub = sglang_scheduler_pb2_grpc.SglangSchedulerStub(channel)
853
+
854
+ # Wait until the server is launched (poll GetModelInfo)
855
+ success = False
856
+ last_error = None
857
+ for _ in range(120):
858
+ time.sleep(1)
859
+ try:
860
+ request = sglang_scheduler_pb2.GetModelInfoRequest()
861
+ response = stub.GetModelInfo(request, timeout=5)
862
+ success = True
863
+ break
864
+ except Exception as e:
865
+ last_error = str(e)
866
+ pass
867
+
868
+ if not success:
869
+ error_msg = f"gRPC server warmup failed: Could not connect to server after 120 seconds. Last error: {last_error}"
870
+ logger.error(error_msg)
871
+ if pipe_finish_writer is not None:
872
+ pipe_finish_writer.send(error_msg)
873
+ channel.close()
874
+ kill_process_tree(os.getpid())
875
+ return False
876
+
877
+ # Get model info to determine if it's generation or embedding
878
+ is_generation = response.is_generation
879
+
880
+ # Send a warmup request
881
+ logger.info("Sending warmup request to gRPC server...")
882
+ max_new_tokens = 8 if is_generation else 1
883
+
884
+ if is_generation:
885
+ warmup_request_kwargs = {
886
+ "request_id": f"WARMUP_{time.time()}",
887
+ "tokenized": sglang_scheduler_pb2.TokenizedInput(
888
+ input_ids=[
889
+ 123,
890
+ 456,
891
+ 789,
892
+ 234,
893
+ 567,
894
+ 890,
895
+ 345,
896
+ ], # Random-looking but safe token IDs
897
+ original_text="warmup request",
898
+ ),
899
+ "sampling_params": sglang_scheduler_pb2.SamplingParams(
900
+ temperature=0.0,
901
+ max_new_tokens=max_new_tokens,
902
+ ),
903
+ "stream": False,
904
+ }
905
+
906
+ # Set disaggregation params if needed
907
+ if server_args.disaggregation_mode != DisaggregationMode.NULL:
908
+ warmup_request_kwargs["disaggregated_params"] = (
909
+ sglang_scheduler_pb2.DisaggregatedParams(
910
+ bootstrap_host=FAKE_BOOTSTRAP_HOST,
911
+ bootstrap_room=0,
912
+ )
913
+ )
795
914
 
796
- parser = argparse.ArgumentParser(description="SGLang Standalone gRPC Server")
797
- ServerArgs.add_cli_args(parser)
798
- args = parser.parse_args()
799
- server_args = ServerArgs.from_cli_args(args)
915
+ warmup_request = sglang_scheduler_pb2.GenerateRequest(
916
+ **warmup_request_kwargs
917
+ )
800
918
 
801
- # Run server
802
- asyncio.run(
803
- serve_grpc(
804
- server_args=server_args,
919
+ # Send the warmup request
920
+ try:
921
+ responses = list(stub.Generate(warmup_request, timeout=600))
922
+ # Check if we got a valid response
923
+ if responses and not responses[-1].HasField("error"):
924
+ logger.info("gRPC warmup request completed successfully")
925
+ success = True
926
+ else:
927
+ error_msg = (
928
+ responses[-1].error.message if responses else "No response"
929
+ )
930
+ logger.warning(f"gRPC warmup request returned error: {error_msg}")
931
+ success = False
932
+ except Exception as e:
933
+ error_msg = f"gRPC warmup request failed: {e}"
934
+ logger.error(error_msg)
935
+ if pipe_finish_writer is not None:
936
+ pipe_finish_writer.send(error_msg)
937
+ channel.close()
938
+ kill_process_tree(os.getpid())
939
+ return False
940
+ else:
941
+ # For embedding models
942
+ warmup_request = sglang_scheduler_pb2.EmbedRequest(
943
+ request_id=f"WARMUP_{time.time()}",
944
+ tokenized=sglang_scheduler_pb2.TokenizedInput(
945
+ input_ids=[10, 11, 12],
946
+ original_text="test embedding",
947
+ ),
948
+ )
949
+
950
+ try:
951
+ response = stub.Embed(warmup_request, timeout=600)
952
+ if not response.HasField("error"):
953
+ logger.info("gRPC warmup request completed successfully")
954
+ success = True
955
+ else:
956
+ logger.warning(
957
+ f"gRPC warmup request returned error: {response.error.message}"
958
+ )
959
+ success = False
960
+ except Exception as e:
961
+ error_msg = f"gRPC warmup request failed: {e}"
962
+ logger.error(error_msg)
963
+ if pipe_finish_writer is not None:
964
+ pipe_finish_writer.send(error_msg)
965
+ channel.close()
966
+ kill_process_tree(os.getpid())
967
+ return False
968
+
969
+ channel.close()
970
+ return success
971
+
972
+ except Exception as e:
973
+ error_msg = (
974
+ f"gRPC warmup failed with exception: {e}\n{get_exception_traceback()}"
805
975
  )
806
- )
976
+ logger.error(error_msg)
977
+ if pipe_finish_writer is not None:
978
+ pipe_finish_writer.send(error_msg)
979
+ try:
980
+ channel.close()
981
+ except Exception:
982
+ pass
983
+ kill_process_tree(os.getpid())
984
+ return False
985
+
986
+
987
+ def _wait_and_warmup_grpc(
988
+ server_args: ServerArgs,
989
+ pipe_finish_writer: Optional[mp.connection.Connection],
990
+ health_servicer: Optional[SGLangHealthServicer] = None,
991
+ ):
992
+ """Wait for gRPC server to be ready and execute warmup."""
993
+ if not server_args.skip_server_warmup:
994
+ if not _execute_grpc_server_warmup(server_args, pipe_finish_writer):
995
+ return
996
+ else:
997
+ logger.info("Skipping gRPC server warmup (skip_server_warmup=True)")
998
+
999
+ # Mark health service as SERVING after warmup completes
1000
+ if health_servicer:
1001
+ health_servicer.set_serving()
807
1002
 
1003
+ logger.info("The server is fired up and ready to roll!")
808
1004
 
809
- if __name__ == "__main__":
810
- main()
1005
+ if pipe_finish_writer is not None:
1006
+ pipe_finish_writer.send("ready")