sglang 0.5.2rc2__py3-none-any.whl → 0.5.3rc2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (377)
  1. sglang/bench_one_batch.py +7 -9
  2. sglang/bench_one_batch_server.py +330 -31
  3. sglang/bench_serving.py +267 -32
  4. sglang/global_config.py +2 -2
  5. sglang/lang/backend/runtime_endpoint.py +1 -1
  6. sglang/launch_server.py +14 -0
  7. sglang/profiler.py +2 -2
  8. sglang/srt/batch_invariant_ops/__init__.py +27 -0
  9. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +549 -0
  10. sglang/srt/configs/__init__.py +8 -0
  11. sglang/srt/configs/device_config.py +3 -1
  12. sglang/srt/configs/dots_ocr.py +64 -0
  13. sglang/srt/configs/dots_vlm.py +139 -0
  14. sglang/srt/configs/falcon_h1.py +360 -0
  15. sglang/srt/configs/load_config.py +9 -0
  16. sglang/srt/configs/model_config.py +181 -82
  17. sglang/srt/configs/qwen3_next.py +326 -0
  18. sglang/srt/configs/qwen3_vl.py +586 -0
  19. sglang/srt/connector/__init__.py +8 -1
  20. sglang/srt/connector/remote_instance.py +82 -0
  21. sglang/srt/constrained/base_grammar_backend.py +49 -12
  22. sglang/srt/constrained/llguidance_backend.py +0 -1
  23. sglang/srt/constrained/outlines_backend.py +0 -1
  24. sglang/srt/constrained/outlines_jump_forward.py +1 -1
  25. sglang/srt/constrained/xgrammar_backend.py +30 -9
  26. sglang/srt/custom_op.py +11 -1
  27. sglang/srt/debug_utils/dump_comparator.py +81 -44
  28. sglang/srt/debug_utils/dump_loader.py +97 -0
  29. sglang/srt/debug_utils/dumper.py +21 -6
  30. sglang/srt/debug_utils/text_comparator.py +73 -11
  31. sglang/srt/disaggregation/ascend/conn.py +2 -2
  32. sglang/srt/disaggregation/ascend/transfer_engine.py +47 -9
  33. sglang/srt/disaggregation/base/conn.py +1 -1
  34. sglang/srt/disaggregation/common/conn.py +279 -108
  35. sglang/srt/disaggregation/decode.py +71 -19
  36. sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
  37. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +29 -17
  38. sglang/srt/disaggregation/fake/conn.py +1 -1
  39. sglang/srt/disaggregation/mini_lb.py +6 -445
  40. sglang/srt/disaggregation/mooncake/conn.py +55 -537
  41. sglang/srt/disaggregation/nixl/conn.py +326 -53
  42. sglang/srt/disaggregation/prefill.py +36 -17
  43. sglang/srt/disaggregation/utils.py +40 -54
  44. sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
  45. sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
  46. sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
  47. sglang/srt/distributed/parallel_state.py +156 -80
  48. sglang/srt/entrypoints/engine.py +59 -18
  49. sglang/srt/entrypoints/grpc_request_manager.py +855 -0
  50. sglang/srt/entrypoints/grpc_server.py +810 -0
  51. sglang/srt/entrypoints/http_server.py +130 -59
  52. sglang/srt/entrypoints/openai/protocol.py +112 -4
  53. sglang/srt/entrypoints/openai/serving_base.py +65 -3
  54. sglang/srt/entrypoints/openai/serving_chat.py +204 -55
  55. sglang/srt/entrypoints/openai/serving_completions.py +14 -3
  56. sglang/srt/entrypoints/openai/serving_embedding.py +9 -3
  57. sglang/srt/entrypoints/openai/serving_rerank.py +3 -1
  58. sglang/srt/entrypoints/openai/serving_responses.py +48 -3
  59. sglang/srt/entrypoints/openai/serving_score.py +1 -0
  60. sglang/srt/environ.py +285 -0
  61. sglang/srt/eplb/eplb_manager.py +2 -2
  62. sglang/srt/eplb/expert_distribution.py +26 -13
  63. sglang/srt/eplb/expert_location.py +38 -8
  64. sglang/srt/eplb/expert_location_updater.py +1 -1
  65. sglang/srt/function_call/base_format_detector.py +3 -6
  66. sglang/srt/function_call/ebnf_composer.py +11 -9
  67. sglang/srt/function_call/function_call_parser.py +9 -2
  68. sglang/srt/function_call/glm4_moe_detector.py +4 -4
  69. sglang/srt/function_call/gpt_oss_detector.py +23 -0
  70. sglang/srt/function_call/json_array_parser.py +63 -0
  71. sglang/srt/function_call/kimik2_detector.py +17 -4
  72. sglang/srt/function_call/qwen3_coder_detector.py +1 -1
  73. sglang/srt/function_call/utils.py +96 -5
  74. sglang/srt/grpc/__init__.py +1 -0
  75. sglang/srt/grpc/compile_proto.py +245 -0
  76. sglang/srt/grpc/sglang_scheduler_pb2.py +111 -0
  77. sglang/srt/grpc/sglang_scheduler_pb2.pyi +434 -0
  78. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +239 -0
  79. sglang/srt/layers/activation.py +143 -9
  80. sglang/srt/layers/attention/aiter_backend.py +14 -15
  81. sglang/srt/layers/attention/ascend_backend.py +115 -9
  82. sglang/srt/layers/attention/attention_registry.py +206 -0
  83. sglang/srt/layers/attention/base_attn_backend.py +12 -3
  84. sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
  85. sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
  86. sglang/srt/layers/attention/fla/chunk.py +242 -0
  87. sglang/srt/layers/attention/fla/chunk_delta_h.py +314 -0
  88. sglang/srt/layers/attention/fla/chunk_o.py +178 -0
  89. sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +151 -0
  90. sglang/srt/layers/attention/fla/cumsum.py +300 -0
  91. sglang/srt/layers/attention/fla/fused_recurrent.py +640 -0
  92. sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +232 -0
  93. sglang/srt/layers/attention/fla/index.py +37 -0
  94. sglang/srt/layers/attention/fla/l2norm.py +150 -0
  95. sglang/srt/layers/attention/fla/layernorm_gated.py +326 -0
  96. sglang/srt/layers/attention/fla/op.py +66 -0
  97. sglang/srt/layers/attention/fla/solve_tril.py +465 -0
  98. sglang/srt/layers/attention/fla/utils.py +331 -0
  99. sglang/srt/layers/attention/fla/wy_fast.py +158 -0
  100. sglang/srt/layers/attention/flashattention_backend.py +41 -8
  101. sglang/srt/layers/attention/flashinfer_backend.py +118 -198
  102. sglang/srt/layers/attention/flashinfer_mla_backend.py +27 -27
  103. sglang/srt/layers/attention/flashmla_backend.py +7 -5
  104. sglang/srt/layers/attention/hybrid_attn_backend.py +68 -53
  105. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +602 -0
  106. sglang/srt/layers/attention/intel_amx_backend.py +3 -0
  107. sglang/srt/layers/attention/mamba/causal_conv1d.py +129 -0
  108. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +969 -0
  109. sglang/srt/layers/attention/mamba/mamba.py +629 -0
  110. sglang/srt/layers/attention/mamba/mamba_utils.py +81 -0
  111. sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
  112. sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
  113. sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
  114. sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +264 -0
  115. sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +622 -0
  116. sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +757 -0
  117. sglang/srt/layers/attention/mamba/ops/ssd_combined.py +262 -0
  118. sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +275 -0
  119. sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
  120. sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
  121. sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
  122. sglang/srt/layers/attention/nsa/nsa_indexer.py +761 -0
  123. sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
  124. sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
  125. sglang/srt/layers/attention/nsa/transform_index.py +144 -0
  126. sglang/srt/layers/attention/nsa/utils.py +24 -0
  127. sglang/srt/layers/attention/nsa_backend.py +887 -0
  128. sglang/srt/layers/attention/tbo_backend.py +6 -6
  129. sglang/srt/layers/attention/torch_flex_backend.py +325 -0
  130. sglang/srt/layers/attention/torch_native_backend.py +12 -6
  131. sglang/srt/layers/attention/triton_backend.py +57 -7
  132. sglang/srt/layers/attention/trtllm_mha_backend.py +5 -7
  133. sglang/srt/layers/attention/trtllm_mla_backend.py +276 -39
  134. sglang/srt/layers/attention/vision.py +58 -0
  135. sglang/srt/layers/attention/wave_backend.py +4 -4
  136. sglang/srt/layers/attention/wave_ops/decode_attention.py +2 -4
  137. sglang/srt/layers/attention/wave_ops/extend_attention.py +1 -3
  138. sglang/srt/layers/communicator.py +8 -0
  139. sglang/srt/layers/dp_attention.py +41 -2
  140. sglang/srt/layers/elementwise.py +3 -1
  141. sglang/srt/layers/layernorm.py +34 -15
  142. sglang/srt/layers/linear.py +55 -7
  143. sglang/srt/layers/logits_processor.py +44 -12
  144. sglang/srt/layers/moe/__init__.py +2 -1
  145. sglang/srt/layers/moe/cutlass_w4a8_moe.py +3 -3
  146. sglang/srt/layers/moe/ep_moe/kernels.py +2 -2
  147. sglang/srt/layers/moe/ep_moe/layer.py +256 -63
  148. sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +183 -0
  149. sglang/srt/layers/moe/fused_moe_native.py +5 -3
  150. sglang/srt/layers/moe/fused_moe_triton/configs/{triton_3_4_0/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json } +35 -35
  151. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=352,device_name=NVIDIA_RTX_5880_Ada_Generation,dtype=fp8_w8a8.json +146 -0
  152. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  153. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=512,device_name=NVIDIA_H20.json +146 -0
  154. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  155. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
  156. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
  157. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  158. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
  159. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
  160. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
  161. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  162. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
  163. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -2
  164. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +7 -3
  165. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +23 -20
  166. sglang/srt/layers/moe/fused_moe_triton/layer.py +71 -70
  167. sglang/srt/layers/moe/moe_runner/__init__.py +2 -1
  168. sglang/srt/layers/moe/moe_runner/base.py +274 -1
  169. sglang/srt/layers/moe/moe_runner/runner.py +80 -0
  170. sglang/srt/layers/moe/moe_runner/triton.py +448 -0
  171. sglang/srt/layers/moe/token_dispatcher/__init__.py +16 -4
  172. sglang/srt/layers/moe/token_dispatcher/{base_dispatcher.py → base.py} +67 -17
  173. sglang/srt/layers/moe/token_dispatcher/deepep.py +118 -56
  174. sglang/srt/layers/moe/token_dispatcher/standard.py +44 -2
  175. sglang/srt/layers/moe/topk.py +30 -9
  176. sglang/srt/layers/moe/utils.py +22 -6
  177. sglang/srt/layers/parameter.py +23 -6
  178. sglang/srt/layers/quantization/awq.py +19 -7
  179. sglang/srt/layers/quantization/base_config.py +11 -6
  180. sglang/srt/layers/quantization/blockwise_int8.py +38 -27
  181. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +1 -0
  182. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +50 -30
  183. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +2 -0
  184. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +13 -1
  185. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
  186. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +2 -10
  187. sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +27 -0
  188. sglang/srt/layers/quantization/fp8.py +78 -49
  189. sglang/srt/layers/quantization/fp8_utils.py +51 -32
  190. sglang/srt/layers/quantization/gptq.py +25 -17
  191. sglang/srt/layers/quantization/modelopt_quant.py +190 -55
  192. sglang/srt/layers/quantization/moe_wna16.py +21 -18
  193. sglang/srt/layers/quantization/mxfp4.py +74 -42
  194. sglang/srt/layers/quantization/quark/quark_moe.py +48 -30
  195. sglang/srt/layers/quantization/unquant.py +135 -47
  196. sglang/srt/layers/quantization/w4afp8.py +26 -17
  197. sglang/srt/layers/quantization/w8a8_fp8.py +35 -20
  198. sglang/srt/layers/quantization/w8a8_int8.py +91 -41
  199. sglang/srt/layers/rotary_embedding.py +78 -31
  200. sglang/srt/layers/sampler.py +213 -21
  201. sglang/srt/layers/utils.py +23 -0
  202. sglang/srt/lora/backend/base_backend.py +50 -8
  203. sglang/srt/lora/backend/chunked_backend.py +348 -0
  204. sglang/srt/lora/backend/triton_backend.py +99 -5
  205. sglang/srt/lora/layers.py +32 -0
  206. sglang/srt/lora/lora.py +8 -3
  207. sglang/srt/lora/lora_manager.py +52 -118
  208. sglang/srt/lora/mem_pool.py +25 -11
  209. sglang/srt/lora/triton_ops/__init__.py +4 -0
  210. sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
  211. sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +174 -0
  212. sglang/srt/lora/utils.py +22 -11
  213. sglang/srt/managers/async_dynamic_batch_tokenizer.py +170 -0
  214. sglang/srt/managers/cache_controller.py +199 -301
  215. sglang/srt/managers/data_parallel_controller.py +115 -80
  216. sglang/srt/managers/detokenizer_manager.py +19 -15
  217. sglang/srt/managers/disagg_service.py +46 -0
  218. sglang/srt/managers/io_struct.py +340 -109
  219. sglang/srt/managers/mm_utils.py +44 -6
  220. sglang/srt/managers/multi_tokenizer_mixin.py +357 -407
  221. sglang/srt/managers/multimodal_processor.py +1 -2
  222. sglang/srt/managers/overlap_utils.py +53 -0
  223. sglang/srt/managers/schedule_batch.py +240 -138
  224. sglang/srt/managers/schedule_policy.py +144 -17
  225. sglang/srt/managers/scheduler.py +502 -209
  226. sglang/srt/managers/scheduler_input_blocker.py +1 -1
  227. sglang/srt/managers/scheduler_metrics_mixin.py +99 -126
  228. sglang/srt/managers/scheduler_output_processor_mixin.py +75 -22
  229. sglang/srt/managers/scheduler_profiler_mixin.py +6 -6
  230. sglang/srt/managers/scheduler_update_weights_mixin.py +7 -0
  231. sglang/srt/managers/tokenizer_communicator_mixin.py +675 -0
  232. sglang/srt/managers/tokenizer_manager.py +320 -632
  233. sglang/srt/managers/tp_worker.py +81 -22
  234. sglang/srt/managers/tp_worker_overlap_thread.py +71 -56
  235. sglang/srt/managers/utils.py +1 -45
  236. sglang/srt/mem_cache/allocator.py +14 -20
  237. sglang/srt/mem_cache/allocator_ascend.py +41 -27
  238. sglang/srt/mem_cache/base_prefix_cache.py +1 -1
  239. sglang/srt/mem_cache/chunk_cache.py +8 -1
  240. sglang/srt/mem_cache/evict_policy.py +23 -0
  241. sglang/srt/mem_cache/hicache_storage.py +43 -24
  242. sglang/srt/mem_cache/hiradix_cache.py +222 -75
  243. sglang/srt/mem_cache/memory_pool.py +535 -58
  244. sglang/srt/mem_cache/memory_pool_host.py +239 -228
  245. sglang/srt/mem_cache/radix_cache.py +222 -73
  246. sglang/srt/mem_cache/radix_cache_cpp.py +11 -8
  247. sglang/srt/mem_cache/storage/__init__.py +10 -0
  248. sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +151 -0
  249. sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +109 -0
  250. sglang/srt/mem_cache/storage/backend_factory.py +223 -0
  251. sglang/srt/mem_cache/storage/eic/eic_storage.py +778 -0
  252. sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
  253. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +164 -0
  254. sglang/srt/mem_cache/storage/hf3fs/{client_hf3fs.py → hf3fs_usrbio_client.py} +5 -1
  255. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +259 -62
  256. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +284 -0
  257. sglang/srt/mem_cache/storage/lmcache/unit_test.py +121 -0
  258. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +166 -17
  259. sglang/srt/mem_cache/swa_radix_cache.py +25 -36
  260. sglang/srt/metrics/collector.py +511 -132
  261. sglang/srt/metrics/func_timer.py +2 -7
  262. sglang/srt/metrics/startup_func_log_and_timer.py +150 -0
  263. sglang/srt/metrics/utils.py +8 -1
  264. sglang/srt/model_executor/cpu_graph_runner.py +640 -0
  265. sglang/srt/model_executor/cuda_graph_runner.py +52 -37
  266. sglang/srt/model_executor/forward_batch_info.py +82 -40
  267. sglang/srt/model_executor/model_runner.py +432 -157
  268. sglang/srt/model_executor/npu_graph_runner.py +12 -5
  269. sglang/srt/model_loader/__init__.py +9 -3
  270. sglang/srt/model_loader/loader.py +133 -5
  271. sglang/srt/model_loader/remote_instance_weight_loader_utils.py +69 -0
  272. sglang/srt/model_loader/weight_utils.py +158 -3
  273. sglang/srt/models/apertus.py +686 -0
  274. sglang/srt/models/bailing_moe.py +820 -217
  275. sglang/srt/models/bailing_moe_nextn.py +168 -0
  276. sglang/srt/models/deepseek_nextn.py +6 -1
  277. sglang/srt/models/deepseek_v2.py +607 -130
  278. sglang/srt/models/dots_ocr.py +173 -0
  279. sglang/srt/models/dots_vlm.py +174 -0
  280. sglang/srt/models/dots_vlm_vit.py +337 -0
  281. sglang/srt/models/ernie4.py +1 -1
  282. sglang/srt/models/falcon_h1.py +576 -0
  283. sglang/srt/models/gemma3_causal.py +0 -2
  284. sglang/srt/models/gemma3_mm.py +1 -1
  285. sglang/srt/models/gemma3n_mm.py +2 -2
  286. sglang/srt/models/glm4_moe.py +4 -4
  287. sglang/srt/models/glm4_moe_nextn.py +2 -2
  288. sglang/srt/models/glm4v.py +5 -3
  289. sglang/srt/models/glm4v_moe.py +4 -1
  290. sglang/srt/models/gpt_oss.py +8 -31
  291. sglang/srt/models/kimi_vl_moonvit.py +2 -2
  292. sglang/srt/models/llama.py +4 -0
  293. sglang/srt/models/llama4.py +9 -0
  294. sglang/srt/models/llama_eagle3.py +13 -0
  295. sglang/srt/models/longcat_flash.py +3 -3
  296. sglang/srt/models/longcat_flash_nextn.py +1 -1
  297. sglang/srt/models/mllama4.py +40 -4
  298. sglang/srt/models/opt.py +637 -0
  299. sglang/srt/models/qwen2_5_vl.py +29 -5
  300. sglang/srt/models/qwen2_audio.py +1 -1
  301. sglang/srt/models/qwen2_moe.py +120 -13
  302. sglang/srt/models/qwen2_vl.py +1 -1
  303. sglang/srt/models/qwen3.py +18 -3
  304. sglang/srt/models/qwen3_moe.py +32 -4
  305. sglang/srt/models/qwen3_next.py +1069 -0
  306. sglang/srt/models/qwen3_next_mtp.py +112 -0
  307. sglang/srt/models/qwen3_vl.py +787 -0
  308. sglang/srt/models/qwen3_vl_moe.py +471 -0
  309. sglang/srt/models/registry.py +15 -3
  310. sglang/srt/models/sarashina2_vision.py +269 -0
  311. sglang/srt/models/solar.py +505 -0
  312. sglang/srt/models/starcoder2.py +357 -0
  313. sglang/srt/models/step3_vl.py +1 -1
  314. sglang/srt/models/torch_native_llama.py +9 -2
  315. sglang/srt/models/utils.py +51 -0
  316. sglang/srt/multimodal/processors/base_processor.py +15 -7
  317. sglang/srt/multimodal/processors/dots_vlm.py +98 -0
  318. sglang/srt/multimodal/processors/glm4v.py +9 -9
  319. sglang/srt/multimodal/processors/internvl.py +153 -129
  320. sglang/srt/multimodal/processors/qwen_vl.py +23 -6
  321. sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
  322. sglang/srt/offloader.py +27 -3
  323. sglang/srt/parser/jinja_template_utils.py +6 -0
  324. sglang/srt/sampling/sampling_batch_info.py +38 -17
  325. sglang/srt/sampling/sampling_params.py +7 -0
  326. sglang/srt/server_args.py +966 -267
  327. sglang/srt/server_args_config_parser.py +146 -0
  328. sglang/srt/single_batch_overlap.py +151 -0
  329. sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
  330. sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
  331. sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
  332. sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
  333. sglang/srt/speculative/cpp_ngram/param.h +125 -0
  334. sglang/srt/speculative/cpp_ngram/queue.h +71 -0
  335. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +7 -1
  336. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +13 -2
  337. sglang/srt/speculative/{eagle_utils.py → eagle_info.py} +207 -757
  338. sglang/srt/speculative/eagle_worker.py +99 -28
  339. sglang/srt/speculative/ngram_utils.py +428 -0
  340. sglang/srt/speculative/ngram_worker.py +245 -0
  341. sglang/srt/speculative/spec_info.py +52 -0
  342. sglang/srt/speculative/spec_utils.py +606 -0
  343. sglang/srt/speculative/standalone_worker.py +109 -0
  344. sglang/srt/torch_memory_saver_adapter.py +5 -7
  345. sglang/srt/tracing/trace.py +578 -0
  346. sglang/srt/two_batch_overlap.py +8 -5
  347. sglang/srt/utils/__init__.py +2 -0
  348. sglang/srt/{utils.py → utils/common.py} +433 -77
  349. sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +53 -5
  350. sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
  351. sglang/srt/utils/rpd_utils.py +452 -0
  352. sglang/srt/utils/slow_rank_detector.py +71 -0
  353. sglang/srt/warmup.py +8 -4
  354. sglang/srt/weight_sync/utils.py +2 -2
  355. sglang/test/attention/test_trtllm_mla_backend.py +169 -5
  356. sglang/test/get_logits_ut.py +57 -0
  357. sglang/test/run_eval.py +79 -11
  358. sglang/test/runners.py +5 -1
  359. sglang/test/simple_eval_common.py +5 -2
  360. sglang/test/simple_eval_mmmu_vlm.py +441 -0
  361. sglang/test/test_block_fp8.py +2 -2
  362. sglang/test/test_cutlass_moe.py +24 -6
  363. sglang/test/test_deterministic.py +297 -0
  364. sglang/test/test_disaggregation_utils.py +77 -0
  365. sglang/test/test_fp4_moe.py +370 -1
  366. sglang/test/test_programs.py +1 -1
  367. sglang/test/test_utils.py +383 -5
  368. sglang/utils.py +21 -1
  369. sglang/version.py +1 -1
  370. {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc2.dist-info}/METADATA +69 -124
  371. {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc2.dist-info}/RECORD +375 -245
  372. sglang/srt/disaggregation/launch_lb.py +0 -118
  373. sglang/srt/mem_cache/lora_radix_cache.py +0 -421
  374. /sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
  375. {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc2.dist-info}/WHEEL +0 -0
  376. {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc2.dist-info}/licenses/LICENSE +0 -0
  377. {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc2.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,810 @@
1
+ """
2
+ Standalone gRPC Server for SGLang - Fully separated from HTTP server.
3
+ Uses GrpcRequestManager for orchestration without tokenization.
4
+ """
5
+
6
+ import argparse
7
+ import asyncio
8
+ import logging
9
+ import multiprocessing as mp
10
+ import os
11
+ import signal
12
+ import time
13
+ from concurrent import futures
14
+ from typing import AsyncIterator, Dict, Optional, Tuple
15
+
16
+ import grpc
17
+ from grpc_reflection.v1alpha import reflection
18
+
19
+ from sglang.srt.disaggregation.utils import FAKE_BOOTSTRAP_HOST, DisaggregationMode
20
+ from sglang.srt.entrypoints.grpc_request_manager import GrpcRequestManager
21
+ from sglang.srt.grpc import sglang_scheduler_pb2, sglang_scheduler_pb2_grpc
22
+ from sglang.srt.managers.data_parallel_controller import (
23
+ run_data_parallel_controller_process,
24
+ )
25
+ from sglang.srt.managers.disagg_service import start_disagg_service
26
+ from sglang.srt.managers.io_struct import (
27
+ TokenizedEmbeddingReqInput,
28
+ TokenizedGenerateReqInput,
29
+ )
30
+ from sglang.srt.managers.scheduler import run_scheduler_process
31
+ from sglang.srt.sampling.sampling_params import SamplingParams as SGLSamplingParams
32
+ from sglang.srt.server_args import PortArgs, ServerArgs
33
+ from sglang.srt.torch_memory_saver_adapter import TorchMemorySaverAdapter
34
+ from sglang.srt.utils import configure_logger, prepare_model_and_tokenizer
35
+ from sglang.utils import get_exception_traceback
36
+
37
+ logger = logging.getLogger(__name__)
38
+ HEALTH_CHECK_TIMEOUT = int(os.getenv("SGLANG_HEALTH_CHECK_TIMEOUT", 20))
39
+
40
+
41
def _run_scheduler_with_signal_handling(*args, **kwargs):
    """Entry point for scheduler subprocesses that masks SIGINT.

    Ctrl+C is owned by the parent gRPC server process; a scheduler child
    should only go away when its parent does (via
    kill_itself_when_parent_died), so SIGINT is ignored here before the
    scheduler loop starts.
    """
    # Mask SIGINT first, then hand control to the real scheduler loop.
    signal.signal(signal.SIGINT, signal.SIG_IGN)
    run_scheduler_process(*args, **kwargs)
53
+
54
+
55
def _launch_scheduler_process_only(
    server_args: ServerArgs,
    port_args: Optional[PortArgs] = None,
) -> Tuple[Dict, PortArgs, list]:
    """
    Launch only the scheduler process(es) without tokenizer/detokenizer.
    Returns scheduler info, port args, and list of scheduler processes.

    With dp_size == 1 one scheduler process is spawned per (pp_rank, tp_rank)
    pair owned by this node; otherwise a single data-parallel controller
    process is launched and it spawns the schedulers itself. Each child
    reports readiness by sending a dict over a one-way pipe.

    Raises:
        RuntimeError: if any scheduler dies before reporting, or reports a
            status other than "ready".
    """
    # Configure global environment
    configure_logger(server_args)
    server_args.check_server_args()

    # Allocate ports for inter-process communications
    if port_args is None:
        port_args = PortArgs.init_new(server_args)
    logger.info(f"{server_args=}")

    # Prepare model and tokenizer paths (may download/resolve remote paths)
    server_args.model_path, server_args.tokenizer_path = prepare_model_and_tokenizer(
        server_args.model_path, server_args.tokenizer_path
    )

    scheduler_procs = []
    if server_args.dp_size == 1:
        memory_saver_adapter = TorchMemorySaverAdapter.create(
            enable=server_args.enable_memory_saver
        )
        scheduler_pipe_readers = []

        # How many nodes share one TP group, and how many TP ranks live on
        # this node; the rank ranges below select only this node's slice.
        nnodes_per_tp_group = max(server_args.nnodes // server_args.pp_size, 1)
        tp_size_per_node = server_args.tp_size // nnodes_per_tp_group
        tp_rank_range = range(
            tp_size_per_node * (server_args.node_rank % nnodes_per_tp_group),
            tp_size_per_node * (server_args.node_rank % nnodes_per_tp_group + 1),
        )

        pp_size_per_node = max(server_args.pp_size // server_args.nnodes, 1)
        pp_rank_range = range(
            pp_size_per_node * (server_args.node_rank // nnodes_per_tp_group),
            pp_size_per_node * (server_args.node_rank // nnodes_per_tp_group + 1),
        )

        for pp_rank in pp_rank_range:
            for tp_rank in tp_rank_range:
                # One-way pipe: child writes its ready/error status, parent reads.
                reader, writer = mp.Pipe(duplex=False)
                # Local GPU index: PP groups are laid out first, then TP ranks
                # within the node, spaced by gpu_id_step.
                gpu_id = (
                    server_args.base_gpu_id
                    + ((pp_rank % pp_size_per_node) * tp_size_per_node)
                    + (tp_rank % tp_size_per_node) * server_args.gpu_id_step
                )
                # Expert-parallel rank derived from TP rank (ep_size divides tp_size).
                moe_ep_rank = tp_rank // (server_args.tp_size // server_args.ep_size)
                proc = mp.Process(
                    target=_run_scheduler_with_signal_handling,
                    args=(
                        server_args,
                        port_args,
                        gpu_id,
                        tp_rank,
                        moe_ep_rank,
                        pp_rank,
                        None,
                        writer,
                    ),
                )

                with memory_saver_adapter.configure_subprocess():
                    proc.start()
                # NOTE(review): the parent keeps its copy of `writer` open, so
                # reader.recv() may block instead of raising EOFError if the
                # child dies silently — confirm whether closing `writer` here
                # is intended upstream.
                scheduler_procs.append(proc)
                scheduler_pipe_readers.append(reader)
    else:
        # Launch the data parallel controller
        reader, writer = mp.Pipe(duplex=False)
        scheduler_pipe_readers = [reader]
        proc = mp.Process(
            target=run_data_parallel_controller_process,
            args=(server_args, port_args, writer),
        )
        proc.start()
        scheduler_procs.append(proc)

    # TODO(CatherineSue): handle cases for multi-node

    # Wait for all scheduler processes to be ready
    scheduler_infos = []
    for i, reader in enumerate(scheduler_pipe_readers):
        try:
            data = reader.recv()
        except EOFError:
            # Child exited without sending a status: surface its exit code.
            logger.error(
                f"Rank {i} scheduler is dead. Please check if there are relevant logs."
            )
            scheduler_procs[i].join()
            logger.error(f"Exit code: {scheduler_procs[i].exitcode}")
            raise RuntimeError(f"Failed to initialize scheduler rank {i}")

        if data.get("status") != "ready":
            raise RuntimeError(
                f"Scheduler rank {i} initialization failed: {data.get('error', 'Unknown error')}"
            )
        scheduler_infos.append(data)

    logger.info(
        f"All {len(scheduler_procs)} scheduler process(es) initialized successfully"
    )

    # Return the first scheduler's info (they should all be the same)
    return scheduler_infos[0], port_args, scheduler_procs
162
+
163
+
164
+ class SGLangSchedulerServicer(sglang_scheduler_pb2_grpc.SglangSchedulerServicer):
165
+ """
166
+ Standalone gRPC service implementation using GrpcRequestManager.
167
+ Fully separated from HTTP server with its own process and no shared globals.
168
+ """
169
+
170
+ def __init__(
171
+ self,
172
+ request_manager: GrpcRequestManager,
173
+ server_args: ServerArgs,
174
+ model_info: Dict,
175
+ ):
176
+ """Initialize the standalone gRPC service."""
177
+ self.request_manager = request_manager
178
+ self.server_args = server_args
179
+ self.model_info = model_info
180
+ self.start_time = time.time()
181
+
182
+ # Start the request manager's event loop using auto_create_handle_loop
183
+ self.request_manager.auto_create_handle_loop()
184
+
185
+ logger.info("Standalone gRPC scheduler service initialized")
186
+
187
+ async def Generate(
188
+ self,
189
+ request: sglang_scheduler_pb2.GenerateRequest,
190
+ context: grpc.aio.ServicerContext,
191
+ ) -> AsyncIterator[sglang_scheduler_pb2.GenerateResponse]:
192
+ """Handle generation requests with streaming responses."""
193
+ logger.info(f"Generation request: {request.request_id}")
194
+
195
+ try:
196
+ # Convert gRPC request to internal format
197
+ tokenized_req = self._convert_generate_request(request)
198
+
199
+ # Submit to request manager (automatically handles n>1)
200
+ response_generator = self.request_manager.generate_request(
201
+ obj=tokenized_req,
202
+ request_id=request.request_id,
203
+ grpc_context=context,
204
+ )
205
+
206
+ async for output in response_generator:
207
+ # Handle batch responses (for n>1 non-streaming)
208
+ if isinstance(output, list):
209
+ for batch_output in output:
210
+ if "error" in batch_output:
211
+ yield sglang_scheduler_pb2.GenerateResponse(
212
+ request_id=request.request_id,
213
+ error=sglang_scheduler_pb2.GenerateError(
214
+ message=batch_output["error"],
215
+ http_status_code=(
216
+ "500" if "abort" not in batch_output else "499"
217
+ ),
218
+ ),
219
+ )
220
+ else:
221
+ # All non-error batch outputs are final responses
222
+ yield self._create_completion_response(
223
+ request.request_id, batch_output
224
+ )
225
+ else:
226
+ # Handle single response (for streaming or n=1 non-streaming)
227
+ if "error" in output:
228
+ yield sglang_scheduler_pb2.GenerateResponse(
229
+ request_id=request.request_id,
230
+ error=sglang_scheduler_pb2.GenerateError(
231
+ message=output["error"],
232
+ http_status_code=(
233
+ "500" if "abort" not in output else "499"
234
+ ),
235
+ ),
236
+ )
237
+ elif output.get("finished", False):
238
+ yield self._create_completion_response(
239
+ request.request_id, output
240
+ )
241
+ else:
242
+ yield self._create_chunk_response(request.request_id, output)
243
+
244
+ except Exception as e:
245
+ logger.error(f"Generate failed: {e}\n{get_exception_traceback()}")
246
+ yield sglang_scheduler_pb2.GenerateResponse(
247
+ request_id=request.request_id,
248
+ error=sglang_scheduler_pb2.GenerateError(
249
+ message=str(e),
250
+ http_status_code="500",
251
+ details=get_exception_traceback(),
252
+ ),
253
+ )
254
+
255
+ async def Embed(
256
+ self,
257
+ request: sglang_scheduler_pb2.EmbedRequest,
258
+ context: grpc.aio.ServicerContext,
259
+ ) -> sglang_scheduler_pb2.EmbedResponse:
260
+ """Handle embedding requests."""
261
+ logger.info(f"Embedding request: {request.request_id}")
262
+
263
+ try:
264
+ # Convert request
265
+ tokenized_req = self._convert_embed_request(request)
266
+
267
+ # Submit to request manager
268
+ future = await self.request_manager.embedding_request(
269
+ obj=tokenized_req,
270
+ request_id=request.request_id,
271
+ )
272
+
273
+ # Wait for result
274
+ result = await future
275
+
276
+ # Create response
277
+ return sglang_scheduler_pb2.EmbedResponse(
278
+ request_id=request.request_id,
279
+ complete=sglang_scheduler_pb2.EmbedComplete(
280
+ embedding=result["embedding"],
281
+ prompt_tokens=result.get("prompt_tokens", 0),
282
+ cached_tokens=0,
283
+ embedding_dim=len(result["embedding"]),
284
+ ),
285
+ )
286
+
287
+ except Exception as e:
288
+ logger.error(f"Embed failed: {e}\n{get_exception_traceback()}")
289
+ return sglang_scheduler_pb2.EmbedResponse(
290
+ request_id=request.request_id,
291
+ error=sglang_scheduler_pb2.EmbedError(
292
+ message=str(e),
293
+ code="INTERNAL_ERROR",
294
+ details=get_exception_traceback(),
295
+ ),
296
+ )
297
+
298
+ async def HealthCheck(
299
+ self,
300
+ request: sglang_scheduler_pb2.HealthCheckRequest,
301
+ context: grpc.aio.ServicerContext,
302
+ ) -> sglang_scheduler_pb2.HealthCheckResponse:
303
+ """Health check by generating from client input."""
304
+ try:
305
+ # Check if request manager is shutting down
306
+ if self.request_manager.gracefully_exit:
307
+ return sglang_scheduler_pb2.HealthCheckResponse(
308
+ healthy=False, message="Server shutting down"
309
+ )
310
+
311
+ # Extract tokenized input from request
312
+ if not request.HasField("tokenized"):
313
+ return sglang_scheduler_pb2.HealthCheckResponse(
314
+ healthy=False, message="Tokenized input required for health check"
315
+ )
316
+
317
+ input_text = request.tokenized.original_text
318
+ input_ids = list(request.tokenized.input_ids)
319
+
320
+ # Create health check request
321
+ rid = f"HEALTH_CHECK_GRPC_{time.time()}"
322
+
323
+ health_request = TokenizedGenerateReqInput(
324
+ rid=rid,
325
+ input_text=input_text,
326
+ input_ids=input_ids,
327
+ sampling_params=SGLSamplingParams(max_new_tokens=1, temperature=0.0),
328
+ stream=False,
329
+ mm_inputs=None,
330
+ return_logprob=False,
331
+ logprob_start_len=-1,
332
+ top_logprobs_num=0,
333
+ token_ids_logprob=None,
334
+ )
335
+
336
+ if self.server_args.disaggregation_mode != DisaggregationMode.NULL:
337
+ health_request.bootstrap_host = FAKE_BOOTSTRAP_HOST
338
+ health_request.bootstrap_room = 0
339
+
340
+ logger.info(f"Sending health check request to request manager...")
341
+
342
+ # Submit and wait for response
343
+ output_generator = self.request_manager.generate_request(
344
+ health_request, request_id=rid
345
+ )
346
+
347
+ try:
348
+ # Get first response with timeout
349
+ response = await asyncio.wait_for(
350
+ output_generator.__anext__(), timeout=HEALTH_CHECK_TIMEOUT
351
+ )
352
+
353
+ # Clean up
354
+ if rid in self.request_manager.rid_to_state:
355
+ del self.request_manager.rid_to_state[rid]
356
+
357
+ return sglang_scheduler_pb2.HealthCheckResponse(
358
+ healthy=True, message="Health check passed"
359
+ )
360
+
361
+ except asyncio.TimeoutError:
362
+ # Clean up on timeout
363
+ if rid in self.request_manager.rid_to_state:
364
+ del self.request_manager.rid_to_state[rid]
365
+
366
+ return sglang_scheduler_pb2.HealthCheckResponse(
367
+ healthy=False, message="Health check timeout"
368
+ )
369
+
370
+ except Exception as e:
371
+ logger.error(f"Health check failed: {e}")
372
+ return sglang_scheduler_pb2.HealthCheckResponse(
373
+ healthy=False, message=f"Health check error: {str(e)}"
374
+ )
375
+
376
+ async def Abort(
377
+ self,
378
+ request: sglang_scheduler_pb2.AbortRequest,
379
+ context: grpc.aio.ServicerContext,
380
+ ) -> sglang_scheduler_pb2.AbortResponse:
381
+ """Abort an ongoing request."""
382
+ logger.info(f"Aborting request: {request.request_id}")
383
+
384
+ try:
385
+ success = await self.request_manager.abort_request(request.request_id)
386
+
387
+ return sglang_scheduler_pb2.AbortResponse(
388
+ success=success,
389
+ message=f"Request {request.request_id} {'aborted' if success else 'not found'}",
390
+ )
391
+ except Exception as e:
392
+ logger.error(f"Abort failed: {e}")
393
+ return sglang_scheduler_pb2.AbortResponse(
394
+ success=False,
395
+ message=str(e),
396
+ )
397
+
398
+ # Helper methods for request/response conversion
399
+
400
+ def _convert_generate_request(
401
+ self, grpc_req: sglang_scheduler_pb2.GenerateRequest
402
+ ) -> TokenizedGenerateReqInput:
403
+ """Convert gRPC GenerateRequest to internal format."""
404
+
405
+ # Extract tokenized input
406
+ if not grpc_req.HasField("tokenized"):
407
+ raise ValueError("Tokenized input must be provided")
408
+
409
+ input_text = grpc_req.tokenized.original_text
410
+ input_ids = list(grpc_req.tokenized.input_ids)
411
+
412
+ # Convert sampling params
413
+ sampling_params = self._convert_sampling_params(grpc_req.sampling_params)
414
+
415
+ # Extract disaggregated params if present
416
+ bootstrap_host = None
417
+ bootstrap_port = None
418
+ bootstrap_room = None
419
+ if grpc_req.HasField("disaggregated_params"):
420
+ bootstrap_host = grpc_req.disaggregated_params.bootstrap_host or None
421
+ bootstrap_port = grpc_req.disaggregated_params.bootstrap_port or None
422
+ bootstrap_room = grpc_req.disaggregated_params.bootstrap_room or None
423
+
424
+ # Create request
425
+ return TokenizedGenerateReqInput(
426
+ rid=grpc_req.request_id,
427
+ input_text=input_text,
428
+ input_ids=input_ids,
429
+ mm_inputs=None, # TODO: implement mm support
430
+ sampling_params=sampling_params,
431
+ return_logprob=grpc_req.return_logprob,
432
+ logprob_start_len=(
433
+ grpc_req.logprob_start_len
434
+ if grpc_req.logprob_start_len is not None
435
+ else -1
436
+ ),
437
+ top_logprobs_num=grpc_req.top_logprobs_num or 0,
438
+ stream=grpc_req.stream or False,
439
+ lora_id=grpc_req.lora_id if grpc_req.lora_id else None,
440
+ token_ids_logprob=(
441
+ list(grpc_req.token_ids_logprob) if grpc_req.token_ids_logprob else None
442
+ ),
443
+ bootstrap_host=bootstrap_host,
444
+ bootstrap_port=bootstrap_port,
445
+ bootstrap_room=bootstrap_room,
446
+ )
447
+
448
+ def _convert_embed_request(
449
+ self, grpc_req: sglang_scheduler_pb2.EmbedRequest
450
+ ) -> TokenizedEmbeddingReqInput:
451
+ """Convert gRPC EmbedRequest to internal format."""
452
+
453
+ # Extract tokenized input
454
+ if not grpc_req.HasField("tokenized"):
455
+ raise ValueError("Tokenized input must be provided")
456
+
457
+ input_text = grpc_req.tokenized.original_text
458
+ input_ids = list(grpc_req.tokenized.input_ids)
459
+
460
+ return TokenizedEmbeddingReqInput(
461
+ rid=grpc_req.request_id,
462
+ input_text=input_text,
463
+ input_ids=input_ids,
464
+ )
465
+
466
    def _convert_sampling_params(
        self, grpc_params: sglang_scheduler_pb2.SamplingParams
    ) -> SGLSamplingParams:
        """Convert gRPC SamplingParams to internal format.

        NOTE(review): the `x or default` pattern below cannot distinguish an
        explicitly-set falsy value from an unset proto field — e.g.
        temperature=0.0 (greedy) is coerced to 1.0 and top_k=0 to -1. Confirm
        this coercion is intended before relying on zero-valued sampling
        parameters over gRPC.
        """

        # Handle constraint types — the elif chain honors at most one
        # constraint, in this priority order.
        regex = None
        json_schema = None
        ebnf_grammar = None
        structural_tag = None

        if grpc_params.HasField("regex"):
            regex = grpc_params.regex
        elif grpc_params.HasField("json_schema"):
            json_schema = grpc_params.json_schema
        elif grpc_params.HasField("ebnf_grammar"):
            ebnf_grammar = grpc_params.ebnf_grammar
        elif grpc_params.HasField("structural_tag"):
            structural_tag = grpc_params.structural_tag

        return SGLSamplingParams(
            temperature=grpc_params.temperature or 1.0,
            top_p=grpc_params.top_p or 1.0,
            top_k=grpc_params.top_k or -1,
            min_p=grpc_params.min_p or 0.0,
            frequency_penalty=grpc_params.frequency_penalty or 0.0,
            presence_penalty=grpc_params.presence_penalty or 0.0,
            repetition_penalty=grpc_params.repetition_penalty or 1.0,
            max_new_tokens=grpc_params.max_new_tokens or 128,
            min_new_tokens=grpc_params.min_new_tokens or 0,
            # Repeated proto fields are copied into plain lists.
            stop=list(grpc_params.stop) if grpc_params.stop else [],
            stop_token_ids=(
                list(grpc_params.stop_token_ids) if grpc_params.stop_token_ids else []
            ),
            skip_special_tokens=grpc_params.skip_special_tokens,
            spaces_between_special_tokens=grpc_params.spaces_between_special_tokens,
            regex=regex,
            json_schema=json_schema,
            ebnf=ebnf_grammar,
            structural_tag=structural_tag,
            n=grpc_params.n or 1,
            ignore_eos=grpc_params.ignore_eos,
        )
509
+
510
+ def _convert_output_logprobs_to_proto(
511
+ self, logprobs_data: Dict
512
+ ) -> Optional[sglang_scheduler_pb2.OutputLogProbs]:
513
+ """Convert output logprobs dict to proto (no None values, plain floats)."""
514
+ if not logprobs_data:
515
+ return None
516
+
517
+ token_logprobs_val = logprobs_data.get("token_logprobs_val", [])
518
+ token_logprobs_idx = logprobs_data.get("token_logprobs_idx", [])
519
+ top_logprobs_val = logprobs_data.get("top_logprobs_val", [])
520
+ top_logprobs_idx = logprobs_data.get("top_logprobs_idx", [])
521
+
522
+ # Build TopLogProbs entries
523
+ top_logprobs_proto = []
524
+ if top_logprobs_val and top_logprobs_idx:
525
+ for val_list, idx_list in zip(top_logprobs_val, top_logprobs_idx):
526
+ top_logprobs_proto.append(
527
+ sglang_scheduler_pb2.TopLogProbs(
528
+ values=val_list,
529
+ token_ids=idx_list,
530
+ )
531
+ )
532
+
533
+ return sglang_scheduler_pb2.OutputLogProbs(
534
+ token_logprobs=token_logprobs_val, # Plain float array
535
+ token_ids=token_logprobs_idx,
536
+ top_logprobs=top_logprobs_proto,
537
+ )
538
+
539
+ def _convert_input_logprobs_to_proto(
540
+ self, logprobs_data: Dict
541
+ ) -> Optional[sglang_scheduler_pb2.InputLogProbs]:
542
+ """Convert input logprobs dict to proto (first token is None, wrapped in InputTokenLogProb)."""
543
+ if not logprobs_data:
544
+ return None
545
+
546
+ token_logprobs_val = logprobs_data.get("token_logprobs_val", [])
547
+ token_logprobs_idx = logprobs_data.get("token_logprobs_idx", [])
548
+ top_logprobs_val = logprobs_data.get("top_logprobs_val", [])
549
+ top_logprobs_idx = logprobs_data.get("top_logprobs_idx", [])
550
+
551
+ # Wrap values in InputTokenLogProb (None for first token, value for others)
552
+ token_logprobs_wrapped = [
553
+ (
554
+ sglang_scheduler_pb2.InputTokenLogProb()
555
+ if x is None
556
+ else sglang_scheduler_pb2.InputTokenLogProb(value=x)
557
+ )
558
+ for x in token_logprobs_val
559
+ ]
560
+
561
+ # Build TopLogProbs entries
562
+ top_logprobs_proto = []
563
+ if top_logprobs_val and top_logprobs_idx:
564
+ for val_list, idx_list in zip(top_logprobs_val, top_logprobs_idx):
565
+ top_logprobs_proto.append(
566
+ sglang_scheduler_pb2.TopLogProbs(
567
+ values=val_list,
568
+ token_ids=idx_list,
569
+ )
570
+ )
571
+
572
+ return sglang_scheduler_pb2.InputLogProbs(
573
+ token_logprobs=token_logprobs_wrapped,
574
+ token_ids=token_logprobs_idx,
575
+ top_logprobs=top_logprobs_proto,
576
+ )
577
+
578
+ def _create_chunk_response(
579
+ self, request_id: str, output: Dict
580
+ ) -> sglang_scheduler_pb2.GenerateResponse:
581
+ """Create a streaming chunk response."""
582
+ meta_info = output.get("meta_info", {})
583
+
584
+ # Convert output logprobs if present
585
+ output_logprobs_proto = self._convert_output_logprobs_to_proto(
586
+ output.get("output_logprobs")
587
+ )
588
+
589
+ # Convert input logprobs if present (only in first chunk)
590
+ input_logprobs_proto = self._convert_input_logprobs_to_proto(
591
+ output.get("input_logprobs")
592
+ )
593
+
594
+ return sglang_scheduler_pb2.GenerateResponse(
595
+ request_id=request_id,
596
+ chunk=sglang_scheduler_pb2.GenerateStreamChunk(
597
+ token_ids=output.get("token_ids", []),
598
+ prompt_tokens=meta_info.get("prompt_tokens", 0),
599
+ completion_tokens=meta_info.get("completion_tokens", 0),
600
+ cached_tokens=meta_info.get("cached_tokens", 0),
601
+ output_logprobs=output_logprobs_proto,
602
+ input_logprobs=input_logprobs_proto,
603
+ index=output.get("index", 0),
604
+ ),
605
+ )
606
+
607
    def _create_completion_response(
        self, request_id: str, output: Dict
    ) -> sglang_scheduler_pb2.GenerateResponse:
        """Create a final (non-streaming) GenerateResponse for a finished request.

        Args:
            request_id: Request id echoed back to the client.
            output: Output dict from the request manager; reads "token_ids",
                "meta_info", "index" and the optional logprob payloads.
        """

        # Extract meta info and finish reason details
        meta_info = output.get("meta_info", {})
        finish_reason_data = meta_info.get("finish_reason")

        # Determine finish reason, default is stop
        finish_reason = "stop"
        if finish_reason_data:
            if isinstance(finish_reason_data, dict):
                finish_reason_type = finish_reason_data.get("type")
            else:
                # Handle legacy string format
                finish_reason_type = finish_reason_data

            # Anything other than "length"/"abort" falls through to "stop".
            if finish_reason_type == "length":
                finish_reason = "length"
            elif finish_reason_type == "abort":
                finish_reason = "abort"

        # Extract matched_stop information: an int means a stop token id was
        # hit, a str means a stop string matched. At most one proto field is
        # set, via kwargs expansion below.
        matched_stop_kwargs = {}
        if isinstance(finish_reason_data, dict) and "matched" in finish_reason_data:
            matched = finish_reason_data["matched"]
            if isinstance(matched, int):
                matched_stop_kwargs["matched_token_id"] = matched
            elif isinstance(matched, str):
                matched_stop_kwargs["matched_stop_str"] = matched

        # Convert output logprobs if present
        output_logprobs_proto = self._convert_output_logprobs_to_proto(
            output.get("output_logprobs")
        )

        # Convert input logprobs if present
        input_logprobs_proto = self._convert_input_logprobs_to_proto(
            output.get("input_logprobs")
        )

        return sglang_scheduler_pb2.GenerateResponse(
            request_id=request_id,
            complete=sglang_scheduler_pb2.GenerateComplete(
                output_ids=output.get("token_ids", []),
                finish_reason=finish_reason,
                prompt_tokens=meta_info.get("prompt_tokens", 0),
                # Fall back to counting emitted tokens when the scheduler did
                # not report completion_tokens.
                completion_tokens=meta_info.get(
                    "completion_tokens", len(output.get("token_ids", []))
                ),
                cached_tokens=meta_info.get("cached_tokens", 0),
                output_logprobs=output_logprobs_proto,
                input_logprobs=input_logprobs_proto,
                index=output.get("index", 0),  # sub-request index when n > 1
                **matched_stop_kwargs,
            ),
        )
665
+
666
+ async def shutdown(self):
667
+ """Shutdown the service."""
668
+ logger.info("Shutting down gRPC service")
669
+
670
+ # Shutdown request manager (handles its own tasks)
671
+ await self.request_manager.shutdown()
672
+
673
+
674
async def serve_grpc(
    server_args: ServerArgs,
    model_info: Optional[Dict] = None,
):
    """Start the standalone gRPC server with integrated scheduler.

    Launches the scheduler process(es), wires a GrpcRequestManager into the
    SGLangSchedulerServicer, then serves until SIGTERM/SIGINT, shutting
    everything down in dependency order.

    Args:
        server_args: Fully-parsed server configuration.
        model_info: Optional pre-built model metadata; when None it is derived
            from the scheduler's startup info below.
    """

    # Start bootstrap server BEFORE launching scheduler processes (only in PREFILL mode)
    # This ensures the bootstrap server is ready when prefill schedulers try to register
    bootstrap_server = None
    if server_args.disaggregation_mode == "prefill":
        bootstrap_server = start_disagg_service(server_args)
        if bootstrap_server:
            logger.info(
                f"Bootstrap server started for disaggregation mode on {server_args.host}:{server_args.disaggregation_bootstrap_port}"
            )

    # Launch only the scheduler process(es) (no tokenizer/detokenizer needed
    # for gRPC — clients send pre-tokenized input)
    logger.info("Launching scheduler process(es)...")
    scheduler_info, port_args, scheduler_procs = _launch_scheduler_process_only(
        server_args=server_args,
    )

    # Derive model info from scheduler info when the caller did not supply it
    if model_info is None:
        model_info = {
            "model_name": server_args.model_path,
            "max_context_length": scheduler_info.get(
                "max_total_num_tokens", server_args.context_length or 8192
            ),
            "vocab_size": scheduler_info.get("vocab_size", 128256),
            "supports_vision": scheduler_info.get("supports_vision", False),
            "model_type": scheduler_info.get("model_type", "transformer"),
            "max_req_input_len": scheduler_info.get("max_req_input_len", 8192),
            "eos_token_ids": scheduler_info.get("eos_token_ids", []),
            "pad_token_id": scheduler_info.get("pad_token_id", 0),
            "bos_token_id": scheduler_info.get("bos_token_id", 1),
        }

    # Create request manager with the correct port args; the bootstrap server
    # (if any) was already started above and is handed over here
    request_manager = GrpcRequestManager(
        server_args=server_args,
        port_args=port_args,
        bootstrap_server=bootstrap_server,
    )

    # Create gRPC server
    server = grpc.aio.server(
        futures.ThreadPoolExecutor(max_workers=10),
        options=[
            # 256 MB message caps — large token/logprob payloads
            ("grpc.max_send_message_length", 1024 * 1024 * 256),
            ("grpc.max_receive_message_length", 1024 * 1024 * 256),
        ],
    )

    # Add service
    servicer = SGLangSchedulerServicer(
        request_manager=request_manager,
        server_args=server_args,
        model_info=model_info,
    )
    sglang_scheduler_pb2_grpc.add_SglangSchedulerServicer_to_server(servicer, server)

    # Enable reflection so tools like grpcurl can discover the service
    SERVICE_NAMES = (
        sglang_scheduler_pb2.DESCRIPTOR.services_by_name["SglangScheduler"].full_name,
        reflection.SERVICE_NAME,
    )
    reflection.enable_server_reflection(SERVICE_NAMES, server)

    # Start server
    listen_addr = f"{server_args.host}:{server_args.port}"
    server.add_insecure_port(listen_addr)

    logger.info(f"Starting standalone gRPC server on {listen_addr}")

    await server.start()

    # Handle shutdown signals
    # NOTE(review): loop.add_signal_handler is POSIX-only and raises
    # NotImplementedError on Windows event loops — confirm deployment targets.
    loop = asyncio.get_running_loop()
    stop_event = asyncio.Event()

    def signal_handler():
        logger.info("Received shutdown signal")
        stop_event.set()

    for sig in (signal.SIGTERM, signal.SIGINT):
        loop.add_signal_handler(sig, signal_handler)

    try:
        await stop_event.wait()
    finally:
        logger.info("Shutting down gRPC server")

        # Shutdown request manager first - this closes ZMQ sockets and stops background tasks
        await servicer.shutdown()

        # Stop the gRPC server (5s grace period for in-flight RPCs)
        await server.stop(5.0)

        # Terminate scheduler processes before exiting to avoid atexit hang
        # The scheduler processes have SIGINT ignored, so they won't get KeyboardInterrupt
        for i, proc in enumerate(scheduler_procs):
            if proc.is_alive():
                logger.info(f"Terminating scheduler process {i}...")
                proc.terminate()
                proc.join(timeout=2.0)
                # Escalate to SIGKILL if the process survives terminate()
                if proc.is_alive():
                    logger.warning(
                        f"Scheduler process {i} did not terminate, killing..."
                    )
                    proc.kill()
                    proc.join(timeout=1.0)

        logger.info("All scheduler processes terminated")
789
+
790
+
791
def main():
    """Main entry point for standalone gRPC server."""
    # Spawn (not fork) is required so CUDA can initialize safely in child
    # processes; must run before any CUDA work happens.
    mp.set_start_method("spawn", force=True)

    # Parse CLI flags into a ServerArgs configuration object.
    parser = argparse.ArgumentParser(description="SGLang Standalone gRPC Server")
    ServerArgs.add_cli_args(parser)
    server_args = ServerArgs.from_cli_args(parser.parse_args())

    asyncio.run(serve_grpc(server_args=server_args))


if __name__ == "__main__":
    main()