sglang 0.5.2rc2__py3-none-any.whl → 0.5.3.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (396)
  1. sglang/bench_one_batch.py +7 -11
  2. sglang/bench_one_batch_server.py +330 -31
  3. sglang/bench_serving.py +474 -142
  4. sglang/compile_deep_gemm.py +3 -0
  5. sglang/global_config.py +2 -2
  6. sglang/lang/backend/runtime_endpoint.py +1 -1
  7. sglang/profiler.py +2 -2
  8. sglang/srt/batch_invariant_ops/__init__.py +27 -0
  9. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +549 -0
  10. sglang/srt/configs/__init__.py +10 -0
  11. sglang/srt/configs/device_config.py +3 -1
  12. sglang/srt/configs/dots_ocr.py +64 -0
  13. sglang/srt/configs/dots_vlm.py +139 -0
  14. sglang/srt/configs/falcon_h1.py +314 -0
  15. sglang/srt/configs/load_config.py +9 -0
  16. sglang/srt/configs/mamba_utils.py +117 -0
  17. sglang/srt/configs/model_config.py +228 -92
  18. sglang/srt/configs/nemotron_h.py +286 -0
  19. sglang/srt/configs/qwen3_next.py +294 -0
  20. sglang/srt/configs/qwen3_vl.py +586 -0
  21. sglang/srt/connector/__init__.py +8 -1
  22. sglang/srt/connector/remote_instance.py +82 -0
  23. sglang/srt/constrained/base_grammar_backend.py +49 -12
  24. sglang/srt/constrained/llguidance_backend.py +0 -1
  25. sglang/srt/constrained/outlines_backend.py +0 -1
  26. sglang/srt/constrained/outlines_jump_forward.py +1 -1
  27. sglang/srt/constrained/xgrammar_backend.py +30 -9
  28. sglang/srt/custom_op.py +11 -1
  29. sglang/srt/debug_utils/dump_comparator.py +81 -44
  30. sglang/srt/debug_utils/dump_loader.py +97 -0
  31. sglang/srt/debug_utils/dumper.py +21 -6
  32. sglang/srt/debug_utils/text_comparator.py +73 -11
  33. sglang/srt/disaggregation/ascend/conn.py +2 -2
  34. sglang/srt/disaggregation/ascend/transfer_engine.py +47 -9
  35. sglang/srt/disaggregation/base/conn.py +1 -1
  36. sglang/srt/disaggregation/common/conn.py +279 -108
  37. sglang/srt/disaggregation/decode.py +78 -37
  38. sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
  39. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +29 -17
  40. sglang/srt/disaggregation/fake/conn.py +1 -1
  41. sglang/srt/disaggregation/mini_lb.py +6 -445
  42. sglang/srt/disaggregation/mooncake/conn.py +55 -537
  43. sglang/srt/disaggregation/nixl/conn.py +373 -68
  44. sglang/srt/disaggregation/prefill.py +53 -49
  45. sglang/srt/disaggregation/utils.py +40 -54
  46. sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
  47. sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
  48. sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
  49. sglang/srt/distributed/parallel_state.py +156 -80
  50. sglang/srt/entrypoints/engine.py +59 -18
  51. sglang/srt/entrypoints/grpc_request_manager.py +842 -0
  52. sglang/srt/entrypoints/grpc_server.py +950 -0
  53. sglang/srt/entrypoints/http_server.py +179 -60
  54. sglang/srt/entrypoints/openai/protocol.py +265 -29
  55. sglang/srt/entrypoints/openai/serving_base.py +65 -3
  56. sglang/srt/entrypoints/openai/serving_chat.py +213 -122
  57. sglang/srt/entrypoints/openai/serving_completions.py +14 -3
  58. sglang/srt/entrypoints/openai/serving_embedding.py +9 -3
  59. sglang/srt/entrypoints/openai/serving_rerank.py +3 -1
  60. sglang/srt/entrypoints/openai/serving_responses.py +48 -3
  61. sglang/srt/entrypoints/openai/serving_score.py +1 -0
  62. sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
  63. sglang/srt/environ.py +289 -0
  64. sglang/srt/eplb/eplb_manager.py +2 -2
  65. sglang/srt/eplb/expert_distribution.py +26 -13
  66. sglang/srt/eplb/expert_location.py +38 -8
  67. sglang/srt/eplb/expert_location_updater.py +1 -1
  68. sglang/srt/function_call/base_format_detector.py +3 -6
  69. sglang/srt/function_call/ebnf_composer.py +11 -9
  70. sglang/srt/function_call/function_call_parser.py +17 -8
  71. sglang/srt/function_call/glm4_moe_detector.py +4 -4
  72. sglang/srt/function_call/gpt_oss_detector.py +23 -0
  73. sglang/srt/function_call/json_array_parser.py +63 -0
  74. sglang/srt/function_call/kimik2_detector.py +17 -4
  75. sglang/srt/function_call/qwen3_coder_detector.py +1 -1
  76. sglang/srt/function_call/utils.py +96 -5
  77. sglang/srt/grpc/__init__.py +1 -0
  78. sglang/srt/grpc/compile_proto.py +245 -0
  79. sglang/srt/grpc/sglang_scheduler_pb2.py +119 -0
  80. sglang/srt/grpc/sglang_scheduler_pb2.pyi +492 -0
  81. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +327 -0
  82. sglang/srt/layers/activation.py +143 -9
  83. sglang/srt/layers/attention/aiter_backend.py +14 -15
  84. sglang/srt/layers/attention/ascend_backend.py +115 -9
  85. sglang/srt/layers/attention/attention_registry.py +215 -0
  86. sglang/srt/layers/attention/base_attn_backend.py +12 -3
  87. sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
  88. sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
  89. sglang/srt/layers/attention/fla/chunk.py +242 -0
  90. sglang/srt/layers/attention/fla/chunk_delta_h.py +314 -0
  91. sglang/srt/layers/attention/fla/chunk_o.py +178 -0
  92. sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +151 -0
  93. sglang/srt/layers/attention/fla/cumsum.py +300 -0
  94. sglang/srt/layers/attention/fla/fused_recurrent.py +640 -0
  95. sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +232 -0
  96. sglang/srt/layers/attention/fla/index.py +37 -0
  97. sglang/srt/layers/attention/fla/l2norm.py +150 -0
  98. sglang/srt/layers/attention/fla/layernorm_gated.py +343 -0
  99. sglang/srt/layers/attention/fla/op.py +66 -0
  100. sglang/srt/layers/attention/fla/solve_tril.py +465 -0
  101. sglang/srt/layers/attention/fla/utils.py +331 -0
  102. sglang/srt/layers/attention/fla/wy_fast.py +158 -0
  103. sglang/srt/layers/attention/flashattention_backend.py +40 -8
  104. sglang/srt/layers/attention/flashinfer_backend.py +341 -204
  105. sglang/srt/layers/attention/flashinfer_mla_backend.py +28 -28
  106. sglang/srt/layers/attention/flashmla_backend.py +7 -5
  107. sglang/srt/layers/attention/hybrid_attn_backend.py +68 -53
  108. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +708 -0
  109. sglang/srt/layers/attention/intel_amx_backend.py +3 -0
  110. sglang/srt/layers/attention/mamba/causal_conv1d.py +129 -0
  111. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +974 -0
  112. sglang/srt/layers/attention/mamba/mamba.py +577 -0
  113. sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
  114. sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
  115. sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
  116. sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
  117. sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
  118. sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +214 -0
  119. sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +562 -0
  120. sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +646 -0
  121. sglang/srt/layers/attention/mamba/ops/ssd_combined.py +262 -0
  122. sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +264 -0
  123. sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
  124. sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
  125. sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
  126. sglang/srt/layers/attention/nsa/nsa_indexer.py +761 -0
  127. sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
  128. sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
  129. sglang/srt/layers/attention/nsa/transform_index.py +144 -0
  130. sglang/srt/layers/attention/nsa/utils.py +24 -0
  131. sglang/srt/layers/attention/nsa_backend.py +887 -0
  132. sglang/srt/layers/attention/tbo_backend.py +6 -6
  133. sglang/srt/layers/attention/torch_flex_backend.py +325 -0
  134. sglang/srt/layers/attention/torch_native_backend.py +12 -6
  135. sglang/srt/layers/attention/triton_backend.py +57 -7
  136. sglang/srt/layers/attention/trtllm_mha_backend.py +5 -7
  137. sglang/srt/layers/attention/trtllm_mla_backend.py +276 -39
  138. sglang/srt/layers/attention/vision.py +58 -0
  139. sglang/srt/layers/attention/wave_backend.py +4 -4
  140. sglang/srt/layers/attention/wave_ops/decode_attention.py +2 -4
  141. sglang/srt/layers/attention/wave_ops/extend_attention.py +1 -3
  142. sglang/srt/layers/communicator.py +8 -0
  143. sglang/srt/layers/dp_attention.py +41 -2
  144. sglang/srt/layers/elementwise.py +3 -1
  145. sglang/srt/layers/layernorm.py +34 -15
  146. sglang/srt/layers/linear.py +55 -7
  147. sglang/srt/layers/logits_processor.py +180 -18
  148. sglang/srt/layers/modelopt_utils.py +11 -0
  149. sglang/srt/layers/moe/__init__.py +2 -1
  150. sglang/srt/layers/moe/cutlass_w4a8_moe.py +21 -24
  151. sglang/srt/layers/moe/ep_moe/kernels.py +33 -454
  152. sglang/srt/layers/moe/ep_moe/layer.py +248 -333
  153. sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +183 -0
  154. sglang/srt/layers/moe/fused_moe_native.py +5 -3
  155. sglang/srt/layers/moe/fused_moe_triton/configs/{triton_3_4_0/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json } +35 -35
  156. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=352,device_name=NVIDIA_RTX_5880_Ada_Generation,dtype=fp8_w8a8.json +146 -0
  157. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  158. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=512,device_name=NVIDIA_H20.json +146 -0
  159. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  160. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
  161. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
  162. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  163. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
  164. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
  165. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
  166. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  167. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
  168. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -2
  169. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +7 -3
  170. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +23 -20
  171. sglang/srt/layers/moe/fused_moe_triton/layer.py +68 -72
  172. sglang/srt/layers/moe/moe_runner/__init__.py +2 -1
  173. sglang/srt/layers/moe/moe_runner/base.py +274 -1
  174. sglang/srt/layers/moe/moe_runner/deep_gemm.py +304 -0
  175. sglang/srt/layers/moe/moe_runner/runner.py +83 -0
  176. sglang/srt/layers/moe/moe_runner/triton.py +448 -0
  177. sglang/srt/layers/moe/token_dispatcher/__init__.py +16 -4
  178. sglang/srt/layers/moe/token_dispatcher/{base_dispatcher.py → base.py} +67 -17
  179. sglang/srt/layers/moe/token_dispatcher/deepep.py +118 -56
  180. sglang/srt/layers/moe/token_dispatcher/standard.py +44 -2
  181. sglang/srt/layers/moe/topk.py +30 -9
  182. sglang/srt/layers/moe/utils.py +29 -7
  183. sglang/srt/layers/parameter.py +23 -6
  184. sglang/srt/layers/quantization/__init__.py +1 -1
  185. sglang/srt/layers/quantization/awq.py +19 -7
  186. sglang/srt/layers/quantization/base_config.py +11 -6
  187. sglang/srt/layers/quantization/blockwise_int8.py +38 -27
  188. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +1 -0
  189. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +50 -30
  190. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +2 -0
  191. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +13 -1
  192. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
  193. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +2 -10
  194. sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +27 -0
  195. sglang/srt/layers/quantization/fp8.py +155 -60
  196. sglang/srt/layers/quantization/fp8_utils.py +51 -32
  197. sglang/srt/layers/quantization/gptq.py +25 -17
  198. sglang/srt/layers/quantization/modelopt_quant.py +191 -56
  199. sglang/srt/layers/quantization/moe_wna16.py +21 -18
  200. sglang/srt/layers/quantization/mxfp4.py +74 -42
  201. sglang/srt/layers/quantization/quark/quark.py +3 -1
  202. sglang/srt/layers/quantization/quark/quark_moe.py +48 -30
  203. sglang/srt/layers/quantization/unquant.py +135 -47
  204. sglang/srt/layers/quantization/w4afp8.py +28 -33
  205. sglang/srt/layers/quantization/w8a8_fp8.py +35 -20
  206. sglang/srt/layers/quantization/w8a8_int8.py +91 -41
  207. sglang/srt/layers/rotary_embedding.py +78 -31
  208. sglang/srt/layers/sampler.py +213 -21
  209. sglang/srt/layers/utils.py +23 -0
  210. sglang/srt/lora/backend/base_backend.py +50 -8
  211. sglang/srt/lora/backend/chunked_backend.py +348 -0
  212. sglang/srt/lora/backend/triton_backend.py +99 -5
  213. sglang/srt/lora/layers.py +32 -0
  214. sglang/srt/lora/lora.py +8 -3
  215. sglang/srt/lora/lora_manager.py +44 -118
  216. sglang/srt/lora/mem_pool.py +25 -11
  217. sglang/srt/lora/triton_ops/__init__.py +4 -0
  218. sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
  219. sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +174 -0
  220. sglang/srt/lora/utils.py +22 -11
  221. sglang/srt/managers/async_dynamic_batch_tokenizer.py +170 -0
  222. sglang/srt/managers/cache_controller.py +199 -301
  223. sglang/srt/managers/data_parallel_controller.py +115 -80
  224. sglang/srt/managers/detokenizer_manager.py +19 -15
  225. sglang/srt/managers/disagg_service.py +46 -0
  226. sglang/srt/managers/io_struct.py +340 -109
  227. sglang/srt/managers/mm_utils.py +44 -6
  228. sglang/srt/managers/multi_tokenizer_mixin.py +357 -407
  229. sglang/srt/managers/multimodal_processor.py +1 -2
  230. sglang/srt/managers/overlap_utils.py +55 -0
  231. sglang/srt/managers/schedule_batch.py +343 -212
  232. sglang/srt/managers/schedule_policy.py +145 -18
  233. sglang/srt/managers/scheduler.py +653 -273
  234. sglang/srt/managers/scheduler_input_blocker.py +1 -1
  235. sglang/srt/managers/scheduler_metrics_mixin.py +99 -126
  236. sglang/srt/managers/scheduler_output_processor_mixin.py +255 -108
  237. sglang/srt/managers/scheduler_profiler_mixin.py +6 -6
  238. sglang/srt/managers/scheduler_update_weights_mixin.py +7 -0
  239. sglang/srt/managers/tokenizer_communicator_mixin.py +675 -0
  240. sglang/srt/managers/tokenizer_manager.py +579 -674
  241. sglang/srt/managers/tp_worker.py +96 -26
  242. sglang/srt/managers/utils.py +1 -45
  243. sglang/srt/mem_cache/allocator.py +21 -22
  244. sglang/srt/mem_cache/allocator_ascend.py +41 -27
  245. sglang/srt/mem_cache/base_prefix_cache.py +1 -1
  246. sglang/srt/mem_cache/chunk_cache.py +9 -2
  247. sglang/srt/mem_cache/evict_policy.py +23 -0
  248. sglang/srt/mem_cache/hicache_storage.py +43 -24
  249. sglang/srt/mem_cache/hiradix_cache.py +222 -75
  250. sglang/srt/mem_cache/memory_pool.py +651 -80
  251. sglang/srt/mem_cache/memory_pool_host.py +239 -228
  252. sglang/srt/mem_cache/radix_cache.py +227 -73
  253. sglang/srt/mem_cache/radix_cache_cpp.py +11 -8
  254. sglang/srt/mem_cache/storage/__init__.py +10 -0
  255. sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +151 -0
  256. sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +109 -0
  257. sglang/srt/mem_cache/storage/backend_factory.py +223 -0
  258. sglang/srt/mem_cache/storage/eic/eic_storage.py +778 -0
  259. sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
  260. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +164 -0
  261. sglang/srt/mem_cache/storage/hf3fs/{client_hf3fs.py → hf3fs_usrbio_client.py} +5 -1
  262. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +259 -62
  263. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +284 -0
  264. sglang/srt/mem_cache/storage/lmcache/unit_test.py +121 -0
  265. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +166 -17
  266. sglang/srt/mem_cache/swa_radix_cache.py +93 -48
  267. sglang/srt/metrics/collector.py +511 -132
  268. sglang/srt/metrics/func_timer.py +2 -7
  269. sglang/srt/metrics/startup_func_log_and_timer.py +150 -0
  270. sglang/srt/metrics/utils.py +8 -1
  271. sglang/srt/model_executor/cpu_graph_runner.py +640 -0
  272. sglang/srt/model_executor/cuda_graph_runner.py +52 -37
  273. sglang/srt/model_executor/forward_batch_info.py +74 -46
  274. sglang/srt/model_executor/model_runner.py +455 -176
  275. sglang/srt/model_executor/npu_graph_runner.py +12 -5
  276. sglang/srt/model_loader/__init__.py +10 -4
  277. sglang/srt/model_loader/loader.py +319 -10
  278. sglang/srt/model_loader/remote_instance_weight_loader_utils.py +69 -0
  279. sglang/srt/model_loader/weight_utils.py +161 -3
  280. sglang/srt/models/apertus.py +686 -0
  281. sglang/srt/models/bailing_moe.py +820 -217
  282. sglang/srt/models/bailing_moe_nextn.py +168 -0
  283. sglang/srt/models/deepseek_nextn.py +6 -1
  284. sglang/srt/models/deepseek_v2.py +607 -130
  285. sglang/srt/models/dots_ocr.py +173 -0
  286. sglang/srt/models/dots_vlm.py +174 -0
  287. sglang/srt/models/dots_vlm_vit.py +337 -0
  288. sglang/srt/models/ernie4.py +1 -1
  289. sglang/srt/models/falcon_h1.py +578 -0
  290. sglang/srt/models/gemma3_causal.py +0 -2
  291. sglang/srt/models/gemma3_mm.py +17 -1
  292. sglang/srt/models/gemma3n_mm.py +2 -2
  293. sglang/srt/models/glm4_moe.py +4 -4
  294. sglang/srt/models/glm4_moe_nextn.py +2 -2
  295. sglang/srt/models/glm4v.py +5 -3
  296. sglang/srt/models/glm4v_moe.py +4 -1
  297. sglang/srt/models/gpt_oss.py +8 -31
  298. sglang/srt/models/grok.py +5 -13
  299. sglang/srt/models/kimi_vl_moonvit.py +2 -2
  300. sglang/srt/models/llama.py +4 -0
  301. sglang/srt/models/llama4.py +9 -0
  302. sglang/srt/models/llama_eagle3.py +13 -0
  303. sglang/srt/models/longcat_flash.py +3 -3
  304. sglang/srt/models/longcat_flash_nextn.py +1 -1
  305. sglang/srt/models/mixtral.py +1 -3
  306. sglang/srt/models/mllama4.py +50 -4
  307. sglang/srt/models/nemotron_h.py +514 -0
  308. sglang/srt/models/opt.py +637 -0
  309. sglang/srt/models/qwen2_5_vl.py +29 -5
  310. sglang/srt/models/qwen2_audio.py +1 -1
  311. sglang/srt/models/qwen2_moe.py +120 -13
  312. sglang/srt/models/qwen2_vl.py +1 -1
  313. sglang/srt/models/qwen3.py +18 -3
  314. sglang/srt/models/qwen3_moe.py +32 -4
  315. sglang/srt/models/qwen3_next.py +1069 -0
  316. sglang/srt/models/qwen3_next_mtp.py +112 -0
  317. sglang/srt/models/qwen3_vl.py +787 -0
  318. sglang/srt/models/qwen3_vl_moe.py +471 -0
  319. sglang/srt/models/registry.py +15 -3
  320. sglang/srt/models/sarashina2_vision.py +269 -0
  321. sglang/srt/models/solar.py +505 -0
  322. sglang/srt/models/starcoder2.py +357 -0
  323. sglang/srt/models/step3_vl.py +1 -1
  324. sglang/srt/models/torch_native_llama.py +9 -2
  325. sglang/srt/models/utils.py +55 -0
  326. sglang/srt/multimodal/processors/base_processor.py +15 -7
  327. sglang/srt/multimodal/processors/dots_vlm.py +98 -0
  328. sglang/srt/multimodal/processors/glm4v.py +9 -9
  329. sglang/srt/multimodal/processors/internvl.py +153 -129
  330. sglang/srt/multimodal/processors/qwen_vl.py +23 -6
  331. sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
  332. sglang/srt/offloader.py +27 -3
  333. sglang/srt/parser/jinja_template_utils.py +6 -0
  334. sglang/srt/sampling/sampling_batch_info.py +49 -26
  335. sglang/srt/sampling/sampling_params.py +7 -0
  336. sglang/srt/server_args.py +1051 -285
  337. sglang/srt/server_args_config_parser.py +146 -0
  338. sglang/srt/single_batch_overlap.py +151 -0
  339. sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
  340. sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
  341. sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
  342. sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
  343. sglang/srt/speculative/cpp_ngram/param.h +125 -0
  344. sglang/srt/speculative/cpp_ngram/queue.h +71 -0
  345. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +7 -1
  346. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +13 -2
  347. sglang/srt/speculative/{eagle_utils.py → eagle_info.py} +207 -757
  348. sglang/srt/speculative/eagle_worker.py +98 -29
  349. sglang/srt/speculative/ngram_info.py +428 -0
  350. sglang/srt/speculative/ngram_worker.py +246 -0
  351. sglang/srt/speculative/spec_info.py +52 -0
  352. sglang/srt/speculative/spec_utils.py +605 -0
  353. sglang/srt/speculative/standalone_worker.py +109 -0
  354. sglang/srt/torch_memory_saver_adapter.py +5 -7
  355. sglang/srt/tracing/trace.py +578 -0
  356. sglang/srt/two_batch_overlap.py +9 -5
  357. sglang/srt/utils/__init__.py +2 -0
  358. sglang/srt/{utils.py → utils/common.py} +451 -77
  359. sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +55 -5
  360. sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
  361. sglang/srt/utils/rpd_utils.py +452 -0
  362. sglang/srt/utils/slow_rank_detector.py +71 -0
  363. sglang/srt/warmup.py +8 -4
  364. sglang/srt/weight_sync/utils.py +2 -2
  365. sglang/test/attention/test_trtllm_mla_backend.py +169 -5
  366. sglang/test/get_logits_ut.py +57 -0
  367. sglang/test/longbench_v2/__init__.py +1 -0
  368. sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
  369. sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
  370. sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
  371. sglang/test/run_eval.py +119 -11
  372. sglang/test/runners.py +5 -1
  373. sglang/test/simple_eval_common.py +5 -2
  374. sglang/test/simple_eval_longbench_v2.py +332 -0
  375. sglang/test/simple_eval_mmmu_vlm.py +441 -0
  376. sglang/test/test_block_fp8.py +2 -2
  377. sglang/test/test_cutlass_moe.py +24 -6
  378. sglang/test/test_cutlass_w4a8_moe.py +9 -19
  379. sglang/test/test_deterministic.py +313 -0
  380. sglang/test/test_deterministic_utils.py +81 -0
  381. sglang/test/test_disaggregation_utils.py +140 -0
  382. sglang/test/test_fp4_moe.py +370 -1
  383. sglang/test/test_programs.py +1 -1
  384. sglang/test/test_utils.py +407 -8
  385. sglang/utils.py +21 -1
  386. sglang/version.py +1 -1
  387. {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/METADATA +69 -124
  388. {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/RECORD +392 -251
  389. sglang/srt/disaggregation/launch_lb.py +0 -118
  390. sglang/srt/managers/tp_worker_overlap_thread.py +0 -296
  391. sglang/srt/mem_cache/lora_radix_cache.py +0 -421
  392. sglang/test/test_block_fp8_ep.py +0 -358
  393. /sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
  394. {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/WHEEL +0 -0
  395. {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/licenses/LICENSE +0 -0
  396. {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/top_level.txt +0 -0
sglang/srt/entrypoints/grpc_server.py
@@ -0,0 +1,950 @@
+ """
+ Standalone gRPC Server for SGLang - Fully separated from HTTP server.
+ Uses GrpcRequestManager for orchestration without tokenization.
+ """
+
+ import argparse
+ import asyncio
+ import dataclasses
+ import logging
+ import multiprocessing as mp
+ import os
+ import signal
+ import time
+ from concurrent import futures
+ from typing import AsyncIterator, Dict, Optional, Tuple
+
+ import grpc
+ from google.protobuf.json_format import MessageToDict
+ from google.protobuf.struct_pb2 import Struct
+ from google.protobuf.timestamp_pb2 import Timestamp
+ from grpc_reflection.v1alpha import reflection
+
+ import sglang
+ from sglang.srt.disaggregation.utils import FAKE_BOOTSTRAP_HOST, DisaggregationMode
+ from sglang.srt.entrypoints.grpc_request_manager import GrpcRequestManager
+ from sglang.srt.grpc import sglang_scheduler_pb2, sglang_scheduler_pb2_grpc
+ from sglang.srt.managers.data_parallel_controller import (
+     run_data_parallel_controller_process,
+ )
+ from sglang.srt.managers.disagg_service import start_disagg_service
+ from sglang.srt.managers.io_struct import (
+     TokenizedEmbeddingReqInput,
+     TokenizedGenerateReqInput,
+ )
+ from sglang.srt.managers.scheduler import run_scheduler_process
+ from sglang.srt.sampling.sampling_params import SamplingParams as SGLSamplingParams
+ from sglang.srt.server_args import PortArgs, ServerArgs
+ from sglang.srt.torch_memory_saver_adapter import TorchMemorySaverAdapter
+ from sglang.srt.utils import configure_logger, prepare_model_and_tokenizer
+ from sglang.utils import get_exception_traceback
+
+ logger = logging.getLogger(__name__)
+ HEALTH_CHECK_TIMEOUT = int(os.getenv("SGLANG_HEALTH_CHECK_TIMEOUT", 20))
+
+
+ def _run_scheduler_with_signal_handling(*args, **kwargs):
+     """
+     Wrapper for run_scheduler_process that ignores SIGINT.
+
+     The scheduler process should not handle Ctrl+C - it should only terminate
+     when the parent gRPC server exits (via kill_itself_when_parent_died).
+     """
+     # Ignore SIGINT in this subprocess - let the parent handle it
+     signal.signal(signal.SIGINT, signal.SIG_IGN)
+
+     # Now run the actual scheduler process
+     run_scheduler_process(*args, **kwargs)
+
+
+ def _launch_scheduler_process_only(
+     server_args: ServerArgs,
+     port_args: Optional[PortArgs] = None,
+ ) -> Tuple[Dict, PortArgs, list]:
+     """
+     Launch only the scheduler process(es) without tokenizer/detokenizer.
+     Returns scheduler info, port args, and list of scheduler processes.
+     """
+     # Configure global environment
+     configure_logger(server_args)
+     server_args.check_server_args()
+
+     # Allocate ports for inter-process communications
+     if port_args is None:
+         port_args = PortArgs.init_new(server_args)
+     logger.info(f"{server_args=}")
+
+     # Prepare model and tokenizer paths
+     server_args.model_path, server_args.tokenizer_path = prepare_model_and_tokenizer(
+         server_args.model_path, server_args.tokenizer_path
+     )
+
+     scheduler_procs = []
+     if server_args.dp_size == 1:
+         memory_saver_adapter = TorchMemorySaverAdapter.create(
+             enable=server_args.enable_memory_saver
+         )
+         scheduler_pipe_readers = []
+
+         nnodes_per_tp_group = max(server_args.nnodes // server_args.pp_size, 1)
+         tp_size_per_node = server_args.tp_size // nnodes_per_tp_group
+         tp_rank_range = range(
+             tp_size_per_node * (server_args.node_rank % nnodes_per_tp_group),
+             tp_size_per_node * (server_args.node_rank % nnodes_per_tp_group + 1),
+         )
+
+         pp_size_per_node = max(server_args.pp_size // server_args.nnodes, 1)
+         pp_rank_range = range(
+             pp_size_per_node * (server_args.node_rank // nnodes_per_tp_group),
+             pp_size_per_node * (server_args.node_rank // nnodes_per_tp_group + 1),
+         )
+
+         for pp_rank in pp_rank_range:
+             for tp_rank in tp_rank_range:
+                 reader, writer = mp.Pipe(duplex=False)
+                 gpu_id = (
+                     server_args.base_gpu_id
+                     + ((pp_rank % pp_size_per_node) * tp_size_per_node)
+                     + (tp_rank % tp_size_per_node) * server_args.gpu_id_step
+                 )
+                 moe_ep_rank = tp_rank // (server_args.tp_size // server_args.ep_size)
+                 proc = mp.Process(
+                     target=_run_scheduler_with_signal_handling,
+                     args=(
+                         server_args,
+                         port_args,
+                         gpu_id,
+                         tp_rank,
+                         moe_ep_rank,
+                         pp_rank,
+                         None,
+                         writer,
+                     ),
+                 )
+
+                 with memory_saver_adapter.configure_subprocess():
+                     proc.start()
+                 scheduler_procs.append(proc)
+                 scheduler_pipe_readers.append(reader)
+     else:
+         # Launch the data parallel controller
+         reader, writer = mp.Pipe(duplex=False)
+         scheduler_pipe_readers = [reader]
+         proc = mp.Process(
+             target=run_data_parallel_controller_process,
+             args=(server_args, port_args, writer),
+         )
+         proc.start()
+         scheduler_procs.append(proc)
+
+     # TODO(CatherineSue): handle cases for multi-node
+
+     # Wait for all scheduler processes to be ready
+     scheduler_infos = []
+     for i, reader in enumerate(scheduler_pipe_readers):
+         try:
+             data = reader.recv()
+         except EOFError:
+             logger.error(
+                 f"Rank {i} scheduler is dead. Please check if there are relevant logs."
+             )
+             scheduler_procs[i].join()
+             logger.error(f"Exit code: {scheduler_procs[i].exitcode}")
+             raise RuntimeError(f"Failed to initialize scheduler rank {i}")
+
+         if data.get("status") != "ready":
+             raise RuntimeError(
+                 f"Scheduler rank {i} initialization failed: {data.get('error', 'Unknown error')}"
+             )
+         scheduler_infos.append(data)
+
+     logger.info(
+         f"All {len(scheduler_procs)} scheduler process(es) initialized successfully"
+     )
+
+     # Return the first scheduler's info (they should all be the same)
+     return scheduler_infos[0], port_args, scheduler_procs
+
+
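A worked example of the rank-range arithmetic above (illustrative numbers, not part of the diff): with nnodes=2, pp_size=2, tp_size=8, node_rank=1, base_gpu_id=0, and gpu_id_step=1, node 1 hosts pp_rank 1 with tp ranks 0-7 mapped onto local GPUs 0-7:

    nnodes_per_tp_group = max(2 // 2, 1)  # 1: each node holds a full TP group
    tp_size_per_node = 8 // 1             # 8 TP ranks per node
    tp_rank_range = range(8 * (1 % 1), 8 * (1 % 1 + 1))    # range(0, 8)
    pp_size_per_node = max(2 // 2, 1)     # 1 PP rank per node
    pp_rank_range = range(1 * (1 // 1), 1 * (1 // 1 + 1))  # range(1, 2)
    # gpu_id = 0 + (1 % 1) * 8 + (tp_rank % 8) * 1, i.e. local GPUs 0..7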
+ class SGLangSchedulerServicer(sglang_scheduler_pb2_grpc.SglangSchedulerServicer):
+     """
+     Standalone gRPC service implementation using GrpcRequestManager.
+     Fully separated from HTTP server with its own process and no shared globals.
+     """
+
+     def __init__(
+         self,
+         request_manager: GrpcRequestManager,
+         server_args: ServerArgs,
+         model_info: Dict,
+         scheduler_info: Dict,
+     ):
+         """Initialize the standalone gRPC service."""
+         self.request_manager = request_manager
+         self.server_args = server_args
+         self.model_info = model_info
+         self.scheduler_info = scheduler_info
+         self.start_time = time.time()
+
+         # Start the request manager's event loop using auto_create_handle_loop
+         self.request_manager.auto_create_handle_loop()
+
+         logger.info("gRPC scheduler servicer initialized")
+
+     async def Generate(
+         self,
+         request: sglang_scheduler_pb2.GenerateRequest,
+         context: grpc.aio.ServicerContext,
+     ) -> AsyncIterator[sglang_scheduler_pb2.GenerateResponse]:
+         """Handle generation requests with streaming responses."""
+         logger.info(f"Receive generation request: {request.request_id}")
+
+         try:
+             # Convert gRPC request to internal format
+             tokenized_req = self._convert_generate_request(request)
+
+             # Submit to request manager (automatically handles n>1)
+             response_generator = self.request_manager.generate_request(
+                 obj=tokenized_req,
+                 request_id=request.request_id,
+                 grpc_context=context,
+             )
+
+             async for output in response_generator:
+                 # Check if client cancelled before processing/yielding
+                 if context.cancelled():
+                     logger.info(f"Client cancelled request {request.request_id}")
+                     # Explicitly abort the request to notify scheduler
+                     await self.request_manager.abort_request(request.request_id)
+                     break
+
+                 # Handle batch responses (for n>1 non-streaming)
+                 if isinstance(output, list):
+                     for batch_output in output:
+                         if "error" in batch_output:
+                             yield sglang_scheduler_pb2.GenerateResponse(
+                                 request_id=request.request_id,
+                                 error=sglang_scheduler_pb2.GenerateError(
+                                     message=batch_output["error"],
+                                     http_status_code=(
+                                         "500" if "abort" not in batch_output else "499"
+                                     ),
+                                 ),
+                             )
+                         else:
+                             # All non-error batch outputs are final responses
+                             yield self._create_completion_response(
+                                 request.request_id, batch_output
+                             )
+                 else:
+                     # Handle single response (for streaming or n=1 non-streaming)
+                     if "error" in output:
+                         yield sglang_scheduler_pb2.GenerateResponse(
+                             request_id=request.request_id,
+                             error=sglang_scheduler_pb2.GenerateError(
+                                 message=output["error"],
+                                 http_status_code=(
+                                     "500" if "abort" not in output else "499"
+                                 ),
+                             ),
+                         )
+                     elif output.get("finished", False):
+                         yield self._create_completion_response(
+                             request.request_id, output
+                         )
+                     else:
+                         yield self._create_chunk_response(request.request_id, output)
+
+         except Exception as e:
+             logger.error(
+                 f"Generate failed for request {request.request_id}: {e}\n"
+                 f"{get_exception_traceback()}"
+             )
+             yield sglang_scheduler_pb2.GenerateResponse(
+                 request_id=request.request_id,
+                 error=sglang_scheduler_pb2.GenerateError(
+                     message=str(e),
+                     http_status_code="500",
+                     details=get_exception_traceback(),
+                 ),
+             )
+
+     async def Embed(
+         self,
+         request: sglang_scheduler_pb2.EmbedRequest,
+         _context: grpc.aio.ServicerContext,
+     ) -> sglang_scheduler_pb2.EmbedResponse:
+         """Handle embedding requests."""
+         logger.info(f"Receive embedding request: {request.request_id}")
+
+         try:
+             # Convert request
+             tokenized_req = self._convert_embed_request(request)
+
+             # Submit to request manager
+             future = await self.request_manager.embedding_request(
+                 obj=tokenized_req,
+                 request_id=request.request_id,
+             )
+
+             # Wait for result
+             result = await future
+
+             # Create response
+             return sglang_scheduler_pb2.EmbedResponse(
+                 request_id=request.request_id,
+                 complete=sglang_scheduler_pb2.EmbedComplete(
+                     embedding=result["embedding"],
+                     prompt_tokens=result.get("prompt_tokens", 0),
+                     cached_tokens=0,
+                     embedding_dim=len(result["embedding"]),
+                 ),
+             )
+
+         except Exception as e:
+             logger.error(
+                 f"Embed failed for request {request.request_id}: {e}\n"
+                 f"{get_exception_traceback()}"
+             )
+             return sglang_scheduler_pb2.EmbedResponse(
+                 request_id=request.request_id,
+                 error=sglang_scheduler_pb2.EmbedError(
+                     message=str(e),
+                     code="INTERNAL_ERROR",
+                     details=get_exception_traceback(),
+                 ),
+             )
+
+     async def HealthCheck(
+         self,
+         request: sglang_scheduler_pb2.HealthCheckRequest,
+         context: grpc.aio.ServicerContext,
+     ) -> sglang_scheduler_pb2.HealthCheckResponse:
+         """
+         Check the health of the inference server by sending a special request to generate one token.
+         Similar to HTTP server's /health endpoint.
+         """
+         logger.info("Receive health check request")
+
+         if self.request_manager.gracefully_exit:
+             logger.info(
+                 "Health check request received during shutdown. Returning unhealthy."
+             )
+             return sglang_scheduler_pb2.HealthCheckResponse(
+                 healthy=False, message="Server is shutting down"
+             )
+
+         # Create a special health check request
+         rid = f"HEALTH_CHECK_{time.time()}"
+         sampling_params = SGLSamplingParams(max_new_tokens=1, temperature=0.0)
+         sampling_params.normalize(tokenizer=None)
+
+         # Create health check request
+         is_generation = self.scheduler_info.get("is_generation", True)
+         if is_generation:
+             health_req = TokenizedGenerateReqInput(
+                 rid=rid,
+                 input_text="",
+                 input_ids=[0],
+                 sampling_params=sampling_params,
+                 return_logprob=False,
+                 logprob_start_len=-1,
+                 top_logprobs_num=0,
+                 stream=False,
+                 mm_inputs=None,
+                 token_ids_logprob=None,
+             )
+             # Set disaggregation params if needed
+             if self.server_args.disaggregation_mode != DisaggregationMode.NULL:
+                 health_req.bootstrap_host = FAKE_BOOTSTRAP_HOST
+                 health_req.bootstrap_room = 0
+         else:
+             health_req = TokenizedEmbeddingReqInput(
+                 rid=rid,
+                 input_text="",
+                 input_ids=[0],
+             )
+
+         # Submit health check request
+         async def run_health_check():
+             try:
+                 async for _ in self.request_manager.generate_request(
+                     obj=health_req,
+                     request_id=rid,
+                 ):
+                     # Got at least one response, server is healthy
+                     return True
+             except Exception as e:
+                 logger.warning(f"Health check failed: {e}")
+                 return False
+             return False
+
+         task = asyncio.create_task(run_health_check())
+
+         # Wait for response with timeout
+         tic = time.time()
+         while time.time() < tic + HEALTH_CHECK_TIMEOUT:
+             await asyncio.sleep(1)
+             # Check if we got a response from scheduler
+             if self.request_manager.last_receive_tstamp > tic:
+                 task.cancel()
+                 # Clean up health check state
+                 self.request_manager._cleanup_request_state(rid)
+                 return sglang_scheduler_pb2.HealthCheckResponse(
+                     healthy=True, message="Health check passed"
+                 )
+
+         # Timeout - server not responding
+         task.cancel()
+         self.request_manager._cleanup_request_state(rid)
+         logger.warning(f"Health check timeout after {HEALTH_CHECK_TIMEOUT}s")
+         return sglang_scheduler_pb2.HealthCheckResponse(
+             healthy=False, message=f"Health check timeout after {HEALTH_CHECK_TIMEOUT}s"
+         )
+
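A client-side probe of this RPC might look like the following sketch (the stub class name SglangSchedulerStub is assumed from the generated sglang_scheduler_pb2_grpc module; the address is illustrative):

    import grpc

    from sglang.srt.grpc import sglang_scheduler_pb2, sglang_scheduler_pb2_grpc

    async def probe(address: str = "localhost:30000") -> bool:
        # Open an insecure channel and call the HealthCheck RPC defined above
        async with grpc.aio.insecure_channel(address) as channel:
            stub = sglang_scheduler_pb2_grpc.SglangSchedulerStub(channel)
            resp = await stub.HealthCheck(sglang_scheduler_pb2.HealthCheckRequest())
            return resp.healthy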
+     async def Abort(
+         self,
+         request: sglang_scheduler_pb2.AbortRequest,
+         _context: grpc.aio.ServicerContext,
+     ) -> sglang_scheduler_pb2.AbortResponse:
+         """Abort an ongoing request."""
+         logger.info(f"Receive abort request: {request.request_id}")
+
+         try:
+             success = await self.request_manager.abort_request(request.request_id)
+
+             return sglang_scheduler_pb2.AbortResponse(
+                 success=success,
+                 message=f"Request {request.request_id} {'aborted' if success else 'not found'}",
+             )
+         except Exception as e:
+             logger.error(
+                 f"Abort failed for request {request.request_id}: {e}\n"
+                 f"{get_exception_traceback()}"
+             )
+             return sglang_scheduler_pb2.AbortResponse(
+                 success=False,
+                 message=str(e),
+             )
+
+     async def GetModelInfo(
+         self,
+         _request: sglang_scheduler_pb2.GetModelInfoRequest,
+         _context: grpc.aio.ServicerContext,
+     ) -> sglang_scheduler_pb2.GetModelInfoResponse:
+         """Get model information."""
+         logger.debug("Receive model info request")
+
+         is_generation = self.scheduler_info.get("is_generation")
+         if is_generation is None:
+             is_generation = not self.server_args.is_embedding
+
+         return sglang_scheduler_pb2.GetModelInfoResponse(
+             model_path=self.server_args.model_path,
+             tokenizer_path=self.server_args.tokenizer_path or "",
+             is_generation=is_generation,
+             preferred_sampling_params=(
+                 self.server_args.preferred_sampling_params or ""
+             ),
+             weight_version=self.server_args.weight_version or "",
+             served_model_name=self.server_args.served_model_name,
+             max_context_length=self.model_info["max_context_length"],
+             vocab_size=self.model_info["vocab_size"],
+             supports_vision=self.model_info["supports_vision"],
+             model_type=self.model_info["model_type"],
+             eos_token_ids=self.model_info["eos_token_ids"],
+             pad_token_id=self.model_info["pad_token_id"],
+             bos_token_id=self.model_info["bos_token_id"],
+             max_req_input_len=self.model_info["max_req_input_len"],
+         )
+
+     async def GetServerInfo(
+         self,
+         _request: sglang_scheduler_pb2.GetServerInfoRequest,
+         _context: grpc.aio.ServicerContext,
+     ) -> sglang_scheduler_pb2.GetServerInfoResponse:
+         """Get server information."""
+         logger.debug("Receive server info request")
+
+         server_args_dict = dataclasses.asdict(self.server_args)
+         server_args_struct = Struct()
+
+         def make_serializable(obj):
+             if obj is None:
+                 return None
+             elif isinstance(obj, (str, int, float, bool)):
+                 return obj
+             elif isinstance(obj, (list, tuple, set)):
+                 return [make_serializable(item) for item in obj]
+             elif isinstance(obj, dict):
+                 return {k: make_serializable(v) for k, v in obj.items()}
+             else:
+                 return str(obj)
+
+         serializable_args = make_serializable(server_args_dict)
+         server_args_struct.update(serializable_args)
+
+         # Convert scheduler_info to Struct
+         scheduler_info_struct = Struct()
+         scheduler_info_struct.update(self.scheduler_info)
+
+         # Get runtime state from request manager
+         manager_state = self.request_manager.get_server_info()
+
+         # Calculate uptime
+         uptime = time.time() - self.start_time
+
+         # Create timestamp
+         start_timestamp = Timestamp()
+         start_timestamp.FromSeconds(int(self.start_time))
+
+         return sglang_scheduler_pb2.GetServerInfoResponse(
+             server_args=server_args_struct,
+             scheduler_info=scheduler_info_struct,
+             active_requests=manager_state["active_requests"],
+             is_paused=manager_state["paused"],
+             last_receive_timestamp=manager_state["last_receive_time"],
+             uptime_seconds=uptime,
+             sglang_version=sglang.__version__,
+             server_type="grpc",
+             start_time=start_timestamp,
+         )
+
+     # Helper methods for request/response conversion
+
+     def _convert_generate_request(
+         self, grpc_req: sglang_scheduler_pb2.GenerateRequest
+     ) -> TokenizedGenerateReqInput:
+         """Convert gRPC GenerateRequest to internal format."""
+
+         # Extract tokenized input
+         if not grpc_req.HasField("tokenized"):
+             raise ValueError("Tokenized input must be provided")
+
+         input_text = grpc_req.tokenized.original_text
+         input_ids = list(grpc_req.tokenized.input_ids)
+
+         # Convert sampling params
+         sampling_params = self._convert_sampling_params(grpc_req.sampling_params)
+         sampling_params.normalize(tokenizer=None)
+
+         # Extract disaggregated params if present
+         bootstrap_host = None
+         bootstrap_port = None
+         bootstrap_room = None
+         if grpc_req.HasField("disaggregated_params"):
+             bootstrap_host = grpc_req.disaggregated_params.bootstrap_host or None
+             bootstrap_port = grpc_req.disaggregated_params.bootstrap_port or None
+             bootstrap_room = grpc_req.disaggregated_params.bootstrap_room or None
+
+         # Create request
+         return TokenizedGenerateReqInput(
+             rid=grpc_req.request_id,
+             input_text=input_text,
+             input_ids=input_ids,
+             mm_inputs=None,  # TODO: implement mm support
+             sampling_params=sampling_params,
+             return_logprob=grpc_req.return_logprob,
+             logprob_start_len=(
+                 grpc_req.logprob_start_len
+                 if grpc_req.logprob_start_len is not None
+                 else -1
+             ),
+             top_logprobs_num=grpc_req.top_logprobs_num or 0,
+             stream=grpc_req.stream or False,
+             lora_id=grpc_req.lora_id if grpc_req.lora_id else None,
+             token_ids_logprob=(
+                 list(grpc_req.token_ids_logprob) if grpc_req.token_ids_logprob else None
+             ),
+             bootstrap_host=bootstrap_host,
+             bootstrap_port=bootstrap_port,
+             bootstrap_room=bootstrap_room,
+         )
+
+     def _convert_embed_request(
+         self, grpc_req: sglang_scheduler_pb2.EmbedRequest
+     ) -> TokenizedEmbeddingReqInput:
+         """Convert gRPC EmbedRequest to internal format."""
+
+         # Extract tokenized input
+         if not grpc_req.HasField("tokenized"):
+             raise ValueError("Tokenized input must be provided")
+
+         input_text = grpc_req.tokenized.original_text
+         input_ids = list(grpc_req.tokenized.input_ids)
+
+         return TokenizedEmbeddingReqInput(
+             rid=grpc_req.request_id,
+             input_text=input_text,
+             input_ids=input_ids,
+         )
+
+     def _convert_sampling_params(
+         self, grpc_params: sglang_scheduler_pb2.SamplingParams
+     ) -> SGLSamplingParams:
+         """Convert gRPC SamplingParams to internal format."""
+
+         # Handle constraint types
+         regex = None
+         json_schema = None
+         ebnf_grammar = None
+         structural_tag = None
+
+         if grpc_params.HasField("regex"):
+             regex = grpc_params.regex
+         elif grpc_params.HasField("json_schema"):
+             json_schema = grpc_params.json_schema
+         elif grpc_params.HasField("ebnf_grammar"):
+             ebnf_grammar = grpc_params.ebnf_grammar
+         elif grpc_params.HasField("structural_tag"):
+             structural_tag = grpc_params.structural_tag
+
+         # Handle optional parameters conversion
+         custom_params = (
+             MessageToDict(grpc_params.custom_params)
+             if grpc_params.HasField("custom_params")
+             else None
+         )
+         max_new_tokens = (
+             grpc_params.max_new_tokens
+             if grpc_params.HasField("max_new_tokens")
+             else None
+         )
+         stream_interval = (
+             grpc_params.stream_interval
+             if grpc_params.HasField("stream_interval")
+             else None
+         )
+         logit_bias = dict(grpc_params.logit_bias) if grpc_params.logit_bias else None
+         stop = list(grpc_params.stop) if grpc_params.stop else None
+         stop_token_ids = (
+             list(grpc_params.stop_token_ids) if grpc_params.stop_token_ids else None
+         )
+
+         return SGLSamplingParams(
+             temperature=grpc_params.temperature,
+             top_p=grpc_params.top_p,
+             top_k=grpc_params.top_k,
+             min_p=grpc_params.min_p,
+             frequency_penalty=grpc_params.frequency_penalty,
+             presence_penalty=grpc_params.presence_penalty,
+             repetition_penalty=grpc_params.repetition_penalty,
+             max_new_tokens=max_new_tokens,
+             min_new_tokens=grpc_params.min_new_tokens,
+             stop=stop,
+             stop_token_ids=stop_token_ids,
+             skip_special_tokens=grpc_params.skip_special_tokens,
+             spaces_between_special_tokens=grpc_params.spaces_between_special_tokens,
+             no_stop_trim=grpc_params.no_stop_trim,
+             regex=regex,
+             json_schema=json_schema,
+             ebnf=ebnf_grammar,
+             structural_tag=structural_tag,
+             n=grpc_params.n,
+             ignore_eos=grpc_params.ignore_eos,
+             stream_interval=stream_interval,
+             logit_bias=logit_bias,
+             custom_params=custom_params,
+         )
+
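Since the conversion above checks the constraint fields with HasField and elif, at most one of regex, json_schema, ebnf_grammar, and structural_tag takes effect per request. A sketch of a client building such a message (field names from the conversion above; values illustrative):

    params = sglang_scheduler_pb2.SamplingParams(
        temperature=0.7,
        top_p=0.95,
        max_new_tokens=128,                # optional field; presence-checked via HasField
        json_schema='{"type": "object"}',  # set exactly one constraint
    )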
+     def _convert_output_logprobs_to_proto(
+         self, logprobs_data: Dict
+     ) -> Optional[sglang_scheduler_pb2.OutputLogProbs]:
+         """Convert output logprobs dict to proto (no None values, plain floats)."""
+         if not logprobs_data:
+             return None
+
+         token_logprobs_val = logprobs_data.get("token_logprobs_val", [])
+         token_logprobs_idx = logprobs_data.get("token_logprobs_idx", [])
+         top_logprobs_val = logprobs_data.get("top_logprobs_val", [])
+         top_logprobs_idx = logprobs_data.get("top_logprobs_idx", [])
+
+         # Build TopLogProbs entries
+         top_logprobs_proto = []
+         if top_logprobs_val and top_logprobs_idx:
+             for val_list, idx_list in zip(top_logprobs_val, top_logprobs_idx):
+                 top_logprobs_proto.append(
+                     sglang_scheduler_pb2.TopLogProbs(
+                         values=val_list,
+                         token_ids=idx_list,
+                     )
+                 )
+
+         return sglang_scheduler_pb2.OutputLogProbs(
+             token_logprobs=token_logprobs_val,  # Plain float array
+             token_ids=token_logprobs_idx,
+             top_logprobs=top_logprobs_proto,
+         )
+
+     def _convert_input_logprobs_to_proto(
+         self, logprobs_data: Dict
+     ) -> Optional[sglang_scheduler_pb2.InputLogProbs]:
+         """Convert input logprobs dict to proto (first token is None, wrapped in InputTokenLogProb)."""
+         if not logprobs_data:
+             return None
+
+         token_logprobs_val = logprobs_data.get("token_logprobs_val", [])
+         token_logprobs_idx = logprobs_data.get("token_logprobs_idx", [])
+         top_logprobs_val = logprobs_data.get("top_logprobs_val", [])
+         top_logprobs_idx = logprobs_data.get("top_logprobs_idx", [])
+
+         # Wrap values in InputTokenLogProb (None for first token, value for others)
+         token_logprobs_wrapped = [
+             (
+                 sglang_scheduler_pb2.InputTokenLogProb()
+                 if x is None
+                 else sglang_scheduler_pb2.InputTokenLogProb(value=x)
+             )
+             for x in token_logprobs_val
+         ]
+
+         # Build TopLogProbs entries
+         top_logprobs_proto = []
+         if top_logprobs_val and top_logprobs_idx:
+             for val_list, idx_list in zip(top_logprobs_val, top_logprobs_idx):
+                 top_logprobs_proto.append(
+                     sglang_scheduler_pb2.TopLogProbs(
+                         values=val_list,
+                         token_ids=idx_list,
+                     )
+                 )
+
+         return sglang_scheduler_pb2.InputLogProbs(
+             token_logprobs=token_logprobs_wrapped,
+             token_ids=token_logprobs_idx,
+             top_logprobs=top_logprobs_proto,
+         )
+
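To illustrate the wrapping above (input values assumed for illustration): the first prompt token has no logprob, so it is encoded as an empty InputTokenLogProb message rather than a float:

    vals = [None, -0.5, -1.2]  # assumed token_logprobs_val for a 3-token prompt
    wrapped = [
        sglang_scheduler_pb2.InputTokenLogProb()
        if x is None
        else sglang_scheduler_pb2.InputTokenLogProb(value=x)
        for x in vals
    ]
    # wrapped[0] carries no value; wrapped[1].value == -0.5, wrapped[2].value == -1.2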
+     def _create_chunk_response(
+         self, request_id: str, output: Dict
+     ) -> sglang_scheduler_pb2.GenerateResponse:
+         """Create a streaming chunk response."""
+         meta_info = output.get("meta_info", {})
+
+         # Convert output logprobs if present
+         output_logprobs_proto = self._convert_output_logprobs_to_proto(
+             output.get("output_logprobs")
+         )
+
+         # Convert input logprobs if present (only in first chunk)
+         input_logprobs_proto = self._convert_input_logprobs_to_proto(
+             output.get("input_logprobs")
+         )
+
+         return sglang_scheduler_pb2.GenerateResponse(
+             request_id=request_id,
+             chunk=sglang_scheduler_pb2.GenerateStreamChunk(
+                 token_ids=output.get("token_ids", []),
+                 prompt_tokens=meta_info.get("prompt_tokens", 0),
+                 completion_tokens=meta_info.get("completion_tokens", 0),
+                 cached_tokens=meta_info.get("cached_tokens", 0),
+                 output_logprobs=output_logprobs_proto,
+                 input_logprobs=input_logprobs_proto,
+                 index=output.get("index", 0),
+             ),
+         )
+
+     def _create_completion_response(
+         self, request_id: str, output: Dict
+     ) -> sglang_scheduler_pb2.GenerateResponse:
+         """Create a completion response."""
+
+         # Extract meta info and finish reason details
+         meta_info = output.get("meta_info", {})
+         finish_reason_data = meta_info.get("finish_reason")
+
+         # Determine finish reason, default is stop
+         finish_reason = "stop"
+         if finish_reason_data:
+             if isinstance(finish_reason_data, dict):
+                 finish_reason_type = finish_reason_data.get("type")
+             else:
+                 # Handle legacy string format
+                 finish_reason_type = finish_reason_data
+
+             if finish_reason_type == "length":
+                 finish_reason = "length"
+             elif finish_reason_type == "abort":
+                 finish_reason = "abort"
+
+         # Extract matched_stop information
+         matched_stop_kwargs = {}
+         if isinstance(finish_reason_data, dict) and "matched" in finish_reason_data:
+             matched = finish_reason_data["matched"]
+             if isinstance(matched, int):
+                 matched_stop_kwargs["matched_token_id"] = matched
+             elif isinstance(matched, str):
+                 matched_stop_kwargs["matched_stop_str"] = matched
+
+         # Convert output logprobs if present
+         output_logprobs_proto = self._convert_output_logprobs_to_proto(
+             output.get("output_logprobs")
+         )
+
+         # Convert input logprobs if present
+         input_logprobs_proto = self._convert_input_logprobs_to_proto(
+             output.get("input_logprobs")
+         )
+
+         return sglang_scheduler_pb2.GenerateResponse(
+             request_id=request_id,
+             complete=sglang_scheduler_pb2.GenerateComplete(
+                 output_ids=output.get("token_ids", []),
+                 finish_reason=finish_reason,
+                 prompt_tokens=meta_info.get("prompt_tokens", 0),
+                 completion_tokens=meta_info.get(
+                     "completion_tokens", len(output.get("token_ids", []))
+                 ),
+                 cached_tokens=meta_info.get("cached_tokens", 0),
+                 output_logprobs=output_logprobs_proto,
+                 input_logprobs=input_logprobs_proto,
+                 index=output.get("index", 0),
+                 **matched_stop_kwargs,
+             ),
+         )
+
+     async def shutdown(self):
+         """Shutdown the service."""
+         logger.info("Shutting down gRPC service")
+
+         # Shutdown request manager (handles its own tasks)
+         await self.request_manager.shutdown()
+
+
+ async def serve_grpc(
+     server_args: ServerArgs,
+     model_info: Optional[Dict] = None,
+ ):
+     """Start the standalone gRPC server with integrated scheduler."""
+
+     # Start bootstrap server BEFORE launching scheduler processes (only in PREFILL mode)
+     # This ensures the bootstrap server is ready when prefill schedulers try to register
+     bootstrap_server = None
+     if server_args.disaggregation_mode == "prefill":
+         bootstrap_server = start_disagg_service(server_args)
+         if bootstrap_server:
+             logger.info(
+                 f"Bootstrap server started for disaggregation mode on {server_args.host}:{server_args.disaggregation_bootstrap_port}"
+             )
+
+     # Launch only the scheduler process(es) (no tokenizer/detokenizer needed for gRPC)
+     logger.info("Launching scheduler process(es)...")
+     scheduler_info, port_args, scheduler_procs = _launch_scheduler_process_only(
+         server_args=server_args,
+     )
+
+     # Update model info from scheduler info
+     if model_info is None:
+         model_info = {
+             "model_name": server_args.model_path,
+             "max_context_length": scheduler_info.get(
+                 "max_total_num_tokens", server_args.context_length or 8192
+             ),
+             "vocab_size": scheduler_info.get("vocab_size", 128256),
+             "supports_vision": scheduler_info.get("supports_vision", False),
+             "model_type": scheduler_info.get("model_type", "transformer"),
+             "max_req_input_len": scheduler_info.get("max_req_input_len", 8192),
+             "eos_token_ids": scheduler_info.get("eos_token_ids", []),
+             "pad_token_id": scheduler_info.get("pad_token_id", 0),
+             "bos_token_id": scheduler_info.get("bos_token_id", 1),
+         }
+
+     # Create request manager with the correct port args
+     # Note: the bootstrap server (if any) was already started above and is passed through here
+     request_manager = GrpcRequestManager(
+         server_args=server_args,
+         port_args=port_args,
+         bootstrap_server=bootstrap_server,
+     )
+
+     # Create gRPC server
+     server = grpc.aio.server(
+         futures.ThreadPoolExecutor(max_workers=10),
+         options=[
+             ("grpc.max_send_message_length", 1024 * 1024 * 256),
+             ("grpc.max_receive_message_length", 1024 * 1024 * 256),
+         ],
+     )
+
+     # Add service
+     servicer = SGLangSchedulerServicer(
+         request_manager=request_manager,
+         server_args=server_args,
+         model_info=model_info,
+         scheduler_info=scheduler_info,
+     )
+     sglang_scheduler_pb2_grpc.add_SglangSchedulerServicer_to_server(servicer, server)
+
+     # Enable reflection
+     SERVICE_NAMES = (
+         sglang_scheduler_pb2.DESCRIPTOR.services_by_name["SglangScheduler"].full_name,
+         reflection.SERVICE_NAME,
+     )
+     reflection.enable_server_reflection(SERVICE_NAMES, server)
+
+     # Start server
+     listen_addr = f"{server_args.host}:{server_args.port}"
+     server.add_insecure_port(listen_addr)
+
+     await server.start()
+     logger.info(f"gRPC server listening on {listen_addr}")
+
+     # Handle shutdown signals
+     loop = asyncio.get_running_loop()
+     stop_event = asyncio.Event()
+
+     def signal_handler():
+         logger.info("Received shutdown signal")
+         stop_event.set()
+
+     for sig in (signal.SIGTERM, signal.SIGINT):
+         loop.add_signal_handler(sig, signal_handler)
+
+     try:
+         await stop_event.wait()
+     finally:
+         logger.info("Shutting down gRPC server")
+
+         # Shutdown request manager first - this closes ZMQ sockets and stops background tasks
+         await servicer.shutdown()
+
+         # Stop the gRPC server
+         await server.stop(5.0)
+
+         # Terminate scheduler processes before exiting to avoid atexit hang
+         # The scheduler processes have SIGINT ignored, so they won't get KeyboardInterrupt
+         for i, proc in enumerate(scheduler_procs):
+             if proc.is_alive():
+                 logger.info(f"Terminating scheduler process {i}...")
+                 proc.terminate()
+                 proc.join(timeout=2.0)
+                 if proc.is_alive():
+                     logger.warning(
+                         f"Scheduler process {i} did not terminate, killing..."
+                     )
+                     proc.kill()
+                     proc.join(timeout=1.0)
+
+         logger.info("All scheduler processes terminated")
+
+
+ def main():
+     """Main entry point for standalone gRPC server."""
+     # Fix CUDA multiprocessing issues - must be called before any CUDA operations
+     mp.set_start_method("spawn", force=True)
+
+     parser = argparse.ArgumentParser(description="SGLang Standalone gRPC Server")
+     ServerArgs.add_cli_args(parser)
+     args = parser.parse_args()
+     server_args = ServerArgs.from_cli_args(args)
+
+     # Run server
+     asyncio.run(
+         serve_grpc(
+             server_args=server_args,
+         )
+     )
+
+
+ if __name__ == "__main__":
+     main()
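Finally, an end-to-end usage sketch (assumptions: the file is runnable as a module at the path shown in the file list, the stub class is SglangSchedulerStub as generated for the SglangScheduler service registered above, and the address and flags are illustrative):

    # Launch (flags come from ServerArgs.add_cli_args; model path illustrative):
    #   python -m sglang.srt.entrypoints.grpc_server --model-path <model> --port 30000
    import asyncio

    import grpc

    from sglang.srt.grpc import sglang_scheduler_pb2, sglang_scheduler_pb2_grpc

    async def main() -> None:
        # Query the running server for its model metadata via the GetModelInfo RPC
        async with grpc.aio.insecure_channel("localhost:30000") as channel:
            stub = sglang_scheduler_pb2_grpc.SglangSchedulerStub(channel)
            info = await stub.GetModelInfo(sglang_scheduler_pb2.GetModelInfoRequest())
            print(info.model_path, info.max_context_length)

    asyncio.run(main())

Because the server enables reflection, generic tools that speak the gRPC reflection protocol (for example grpcurl) can also list and call these RPCs without the compiled stubs.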