sglang 0.5.2rc2__py3-none-any.whl → 0.5.3rc0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (238)
  1. sglang/bench_one_batch_server.py +10 -1
  2. sglang/bench_serving.py +257 -29
  3. sglang/srt/configs/__init__.py +4 -0
  4. sglang/srt/configs/device_config.py +3 -1
  5. sglang/srt/configs/dots_vlm.py +139 -0
  6. sglang/srt/configs/load_config.py +1 -0
  7. sglang/srt/configs/model_config.py +50 -6
  8. sglang/srt/configs/qwen3_next.py +326 -0
  9. sglang/srt/connector/__init__.py +8 -1
  10. sglang/srt/connector/remote_instance.py +82 -0
  11. sglang/srt/constrained/base_grammar_backend.py +48 -12
  12. sglang/srt/constrained/llguidance_backend.py +0 -1
  13. sglang/srt/constrained/outlines_backend.py +0 -1
  14. sglang/srt/constrained/xgrammar_backend.py +28 -9
  15. sglang/srt/custom_op.py +11 -1
  16. sglang/srt/debug_utils/dump_comparator.py +81 -44
  17. sglang/srt/debug_utils/dump_loader.py +97 -0
  18. sglang/srt/debug_utils/dumper.py +11 -3
  19. sglang/srt/debug_utils/text_comparator.py +73 -11
  20. sglang/srt/disaggregation/base/conn.py +1 -1
  21. sglang/srt/disaggregation/common/conn.py +15 -12
  22. sglang/srt/disaggregation/decode.py +21 -10
  23. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +4 -1
  24. sglang/srt/disaggregation/fake/conn.py +1 -1
  25. sglang/srt/disaggregation/mini_lb.py +6 -445
  26. sglang/srt/disaggregation/mooncake/conn.py +18 -10
  27. sglang/srt/disaggregation/nixl/conn.py +180 -16
  28. sglang/srt/disaggregation/prefill.py +5 -3
  29. sglang/srt/disaggregation/utils.py +5 -50
  30. sglang/srt/distributed/parallel_state.py +24 -3
  31. sglang/srt/entrypoints/engine.py +38 -17
  32. sglang/srt/entrypoints/grpc_request_manager.py +580 -0
  33. sglang/srt/entrypoints/grpc_server.py +680 -0
  34. sglang/srt/entrypoints/http_server.py +85 -54
  35. sglang/srt/entrypoints/openai/protocol.py +4 -1
  36. sglang/srt/entrypoints/openai/serving_base.py +46 -3
  37. sglang/srt/entrypoints/openai/serving_chat.py +36 -16
  38. sglang/srt/entrypoints/openai/serving_completions.py +12 -3
  39. sglang/srt/entrypoints/openai/serving_embedding.py +8 -3
  40. sglang/srt/entrypoints/openai/serving_rerank.py +3 -1
  41. sglang/srt/entrypoints/openai/serving_responses.py +6 -3
  42. sglang/srt/entrypoints/openai/serving_score.py +1 -0
  43. sglang/srt/eplb/eplb_manager.py +2 -2
  44. sglang/srt/eplb/expert_distribution.py +26 -13
  45. sglang/srt/eplb/expert_location.py +8 -3
  46. sglang/srt/eplb/expert_location_updater.py +1 -1
  47. sglang/srt/function_call/base_format_detector.py +3 -6
  48. sglang/srt/function_call/ebnf_composer.py +11 -9
  49. sglang/srt/function_call/function_call_parser.py +6 -0
  50. sglang/srt/function_call/glm4_moe_detector.py +1 -1
  51. sglang/srt/function_call/qwen3_coder_detector.py +1 -1
  52. sglang/srt/grpc/__init__.py +1 -0
  53. sglang/srt/grpc/sglang_scheduler_pb2.py +106 -0
  54. sglang/srt/grpc/sglang_scheduler_pb2.pyi +427 -0
  55. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +236 -0
  56. sglang/srt/hf_transformers_utils.py +4 -0
  57. sglang/srt/layers/activation.py +142 -9
  58. sglang/srt/layers/attention/ascend_backend.py +11 -4
  59. sglang/srt/layers/attention/fla/chunk.py +242 -0
  60. sglang/srt/layers/attention/fla/chunk_delta_h.py +314 -0
  61. sglang/srt/layers/attention/fla/chunk_o.py +178 -0
  62. sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +151 -0
  63. sglang/srt/layers/attention/fla/cumsum.py +300 -0
  64. sglang/srt/layers/attention/fla/fused_recurrent.py +640 -0
  65. sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +232 -0
  66. sglang/srt/layers/attention/fla/index.py +37 -0
  67. sglang/srt/layers/attention/fla/l2norm.py +150 -0
  68. sglang/srt/layers/attention/fla/layernorm_gated.py +326 -0
  69. sglang/srt/layers/attention/fla/op.py +66 -0
  70. sglang/srt/layers/attention/fla/solve_tril.py +465 -0
  71. sglang/srt/layers/attention/fla/utils.py +331 -0
  72. sglang/srt/layers/attention/fla/wy_fast.py +158 -0
  73. sglang/srt/layers/attention/flashinfer_backend.py +6 -4
  74. sglang/srt/layers/attention/flashinfer_mla_backend.py +16 -12
  75. sglang/srt/layers/attention/hybrid_attn_backend.py +57 -50
  76. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +602 -0
  77. sglang/srt/layers/attention/intel_amx_backend.py +3 -0
  78. sglang/srt/layers/attention/mamba/causal_conv1d.py +128 -0
  79. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +1052 -0
  80. sglang/srt/layers/attention/mamba/mamba.py +64 -0
  81. sglang/srt/layers/attention/torch_native_backend.py +12 -6
  82. sglang/srt/layers/attention/triton_backend.py +18 -1
  83. sglang/srt/layers/attention/trtllm_mla_backend.py +124 -31
  84. sglang/srt/layers/attention/wave_ops/decode_attention.py +2 -4
  85. sglang/srt/layers/attention/wave_ops/extend_attention.py +1 -3
  86. sglang/srt/layers/dp_attention.py +30 -1
  87. sglang/srt/layers/layernorm.py +32 -15
  88. sglang/srt/layers/linear.py +34 -3
  89. sglang/srt/layers/logits_processor.py +29 -10
  90. sglang/srt/layers/moe/__init__.py +2 -1
  91. sglang/srt/layers/moe/cutlass_w4a8_moe.py +3 -3
  92. sglang/srt/layers/moe/ep_moe/kernels.py +1 -1
  93. sglang/srt/layers/moe/ep_moe/layer.py +182 -62
  94. sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +156 -0
  95. sglang/srt/layers/moe/fused_moe_native.py +5 -3
  96. sglang/srt/layers/moe/fused_moe_triton/configs/{triton_3_4_0/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json} +35 -35
  97. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=352,device_name=NVIDIA_RTX_5880_Ada_Generation,dtype=fp8_w8a8.json +146 -0
  98. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=512,device_name=NVIDIA_H20.json +146 -0
  99. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  100. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
  101. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
  102. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
  103. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
  104. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  105. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
  106. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -2
  107. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +1 -1
  108. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +23 -20
  109. sglang/srt/layers/moe/fused_moe_triton/layer.py +61 -59
  110. sglang/srt/layers/moe/moe_runner/__init__.py +2 -1
  111. sglang/srt/layers/moe/moe_runner/base.py +274 -1
  112. sglang/srt/layers/moe/moe_runner/runner.py +80 -0
  113. sglang/srt/layers/moe/moe_runner/triton.py +448 -0
  114. sglang/srt/layers/moe/token_dispatcher/__init__.py +16 -4
  115. sglang/srt/layers/moe/token_dispatcher/{base_dispatcher.py → base.py} +67 -17
  116. sglang/srt/layers/moe/token_dispatcher/deepep.py +43 -39
  117. sglang/srt/layers/moe/token_dispatcher/standard.py +44 -2
  118. sglang/srt/layers/moe/topk.py +30 -9
  119. sglang/srt/layers/moe/utils.py +12 -6
  120. sglang/srt/layers/quantization/awq.py +19 -7
  121. sglang/srt/layers/quantization/base_config.py +11 -6
  122. sglang/srt/layers/quantization/blockwise_int8.py +38 -27
  123. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +50 -30
  124. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +13 -1
  125. sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +27 -0
  126. sglang/srt/layers/quantization/fp8.py +76 -47
  127. sglang/srt/layers/quantization/fp8_utils.py +50 -31
  128. sglang/srt/layers/quantization/gptq.py +25 -17
  129. sglang/srt/layers/quantization/modelopt_quant.py +147 -47
  130. sglang/srt/layers/quantization/moe_wna16.py +21 -18
  131. sglang/srt/layers/quantization/mxfp4.py +64 -40
  132. sglang/srt/layers/quantization/quark/quark_moe.py +32 -27
  133. sglang/srt/layers/quantization/unquant.py +135 -47
  134. sglang/srt/layers/quantization/w4afp8.py +30 -17
  135. sglang/srt/layers/quantization/w8a8_fp8.py +35 -20
  136. sglang/srt/layers/quantization/w8a8_int8.py +76 -38
  137. sglang/srt/layers/sampler.py +162 -18
  138. sglang/srt/lora/backend/base_backend.py +50 -8
  139. sglang/srt/lora/backend/triton_backend.py +90 -2
  140. sglang/srt/lora/layers.py +32 -0
  141. sglang/srt/lora/lora.py +4 -1
  142. sglang/srt/lora/lora_manager.py +35 -112
  143. sglang/srt/lora/mem_pool.py +24 -10
  144. sglang/srt/lora/utils.py +18 -9
  145. sglang/srt/managers/async_dynamic_batch_tokenizer.py +170 -0
  146. sglang/srt/managers/cache_controller.py +158 -160
  147. sglang/srt/managers/data_parallel_controller.py +105 -35
  148. sglang/srt/managers/detokenizer_manager.py +8 -4
  149. sglang/srt/managers/disagg_service.py +46 -0
  150. sglang/srt/managers/io_struct.py +199 -12
  151. sglang/srt/managers/mm_utils.py +1 -0
  152. sglang/srt/managers/multi_tokenizer_mixin.py +350 -400
  153. sglang/srt/managers/schedule_batch.py +77 -56
  154. sglang/srt/managers/schedule_policy.py +1 -1
  155. sglang/srt/managers/scheduler.py +187 -39
  156. sglang/srt/managers/scheduler_metrics_mixin.py +4 -3
  157. sglang/srt/managers/scheduler_output_processor_mixin.py +55 -11
  158. sglang/srt/managers/scheduler_profiler_mixin.py +1 -1
  159. sglang/srt/managers/tokenizer_communicator_mixin.py +569 -0
  160. sglang/srt/managers/tokenizer_manager.py +259 -519
  161. sglang/srt/managers/tp_worker.py +53 -4
  162. sglang/srt/managers/tp_worker_overlap_thread.py +42 -19
  163. sglang/srt/mem_cache/hicache_storage.py +3 -23
  164. sglang/srt/mem_cache/hiradix_cache.py +103 -43
  165. sglang/srt/mem_cache/memory_pool.py +347 -48
  166. sglang/srt/mem_cache/memory_pool_host.py +105 -46
  167. sglang/srt/mem_cache/radix_cache.py +0 -2
  168. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +164 -0
  169. sglang/srt/mem_cache/storage/hf3fs/{client_hf3fs.py → hf3fs_usrbio_client.py} +5 -1
  170. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +86 -4
  171. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +280 -0
  172. sglang/srt/mem_cache/storage/lmcache/unit_test.py +121 -0
  173. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +49 -7
  174. sglang/srt/mem_cache/swa_radix_cache.py +0 -2
  175. sglang/srt/metrics/collector.py +493 -76
  176. sglang/srt/metrics/startup_func_log_and_timer.py +150 -0
  177. sglang/srt/model_executor/cpu_graph_runner.py +640 -0
  178. sglang/srt/model_executor/cuda_graph_runner.py +13 -5
  179. sglang/srt/model_executor/forward_batch_info.py +59 -2
  180. sglang/srt/model_executor/model_runner.py +356 -29
  181. sglang/srt/model_loader/__init__.py +9 -3
  182. sglang/srt/model_loader/loader.py +128 -4
  183. sglang/srt/model_loader/weight_utils.py +2 -1
  184. sglang/srt/models/apertus.py +686 -0
  185. sglang/srt/models/bailing_moe.py +798 -218
  186. sglang/srt/models/bailing_moe_nextn.py +168 -0
  187. sglang/srt/models/deepseek_v2.py +109 -15
  188. sglang/srt/models/dots_vlm.py +174 -0
  189. sglang/srt/models/dots_vlm_vit.py +337 -0
  190. sglang/srt/models/ernie4.py +1 -1
  191. sglang/srt/models/gemma3n_mm.py +1 -1
  192. sglang/srt/models/glm4_moe.py +1 -1
  193. sglang/srt/models/glm4v.py +4 -2
  194. sglang/srt/models/glm4v_moe.py +3 -0
  195. sglang/srt/models/gpt_oss.py +1 -1
  196. sglang/srt/models/llama4.py +9 -0
  197. sglang/srt/models/llama_eagle3.py +13 -0
  198. sglang/srt/models/longcat_flash.py +2 -2
  199. sglang/srt/models/mllama4.py +25 -0
  200. sglang/srt/models/opt.py +637 -0
  201. sglang/srt/models/qwen2.py +7 -0
  202. sglang/srt/models/qwen2_5_vl.py +27 -3
  203. sglang/srt/models/qwen2_moe.py +56 -12
  204. sglang/srt/models/qwen3_moe.py +1 -1
  205. sglang/srt/models/qwen3_next.py +1042 -0
  206. sglang/srt/models/qwen3_next_mtp.py +112 -0
  207. sglang/srt/models/step3_vl.py +1 -1
  208. sglang/srt/multimodal/processors/dots_vlm.py +99 -0
  209. sglang/srt/multimodal/processors/glm4v.py +9 -9
  210. sglang/srt/multimodal/processors/internvl.py +141 -129
  211. sglang/srt/multimodal/processors/qwen_vl.py +15 -5
  212. sglang/srt/offloader.py +27 -3
  213. sglang/srt/remote_instance_weight_loader_utils.py +69 -0
  214. sglang/srt/sampling/sampling_batch_info.py +18 -15
  215. sglang/srt/server_args.py +276 -35
  216. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +5 -0
  217. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +10 -1
  218. sglang/srt/speculative/eagle_utils.py +0 -2
  219. sglang/srt/speculative/eagle_worker.py +43 -4
  220. sglang/srt/speculative/spec_info.py +5 -0
  221. sglang/srt/speculative/standalone_worker.py +109 -0
  222. sglang/srt/tracing/trace.py +552 -0
  223. sglang/srt/utils.py +34 -3
  224. sglang/srt/weight_sync/utils.py +1 -1
  225. sglang/test/attention/test_trtllm_mla_backend.py +169 -5
  226. sglang/test/runners.py +4 -0
  227. sglang/test/test_cutlass_moe.py +24 -6
  228. sglang/test/test_disaggregation_utils.py +66 -0
  229. sglang/test/test_fp4_moe.py +370 -1
  230. sglang/test/test_utils.py +28 -1
  231. sglang/utils.py +11 -0
  232. sglang/version.py +1 -1
  233. {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc0.dist-info}/METADATA +59 -123
  234. {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc0.dist-info}/RECORD +237 -178
  235. sglang/srt/disaggregation/launch_lb.py +0 -118
  236. {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc0.dist-info}/WHEEL +0 -0
  237. {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc0.dist-info}/licenses/LICENSE +0 -0
  238. {sglang-0.5.2rc2.dist-info → sglang-0.5.3rc0.dist-info}/top_level.txt +0 -0
sglang/srt/entrypoints/grpc_server.py
@@ -0,0 +1,680 @@
+"""
+Standalone gRPC Server for SGLang - Fully separated from HTTP server.
+Uses GrpcRequestManager for orchestration without tokenization.
+"""
+
+import argparse
+import asyncio
+import logging
+import multiprocessing as mp
+import os
+import signal
+import time
+from concurrent import futures
+from typing import AsyncIterator, Dict, Optional, Tuple
+
+import grpc
+from grpc_reflection.v1alpha import reflection
+
+from sglang.srt.entrypoints.grpc_request_manager import GrpcRequestManager
+from sglang.srt.grpc import sglang_scheduler_pb2, sglang_scheduler_pb2_grpc
+from sglang.srt.managers.data_parallel_controller import (
+    run_data_parallel_controller_process,
+)
+from sglang.srt.managers.io_struct import (
+    TokenizedEmbeddingReqInput,
+    TokenizedGenerateReqInput,
+)
+from sglang.srt.managers.scheduler import run_scheduler_process
+from sglang.srt.sampling.sampling_params import SamplingParams as SGLSamplingParams
+from sglang.srt.server_args import PortArgs, ServerArgs
+from sglang.srt.torch_memory_saver_adapter import TorchMemorySaverAdapter
+from sglang.srt.utils import configure_logger, prepare_model_and_tokenizer
+from sglang.utils import get_exception_traceback
+
+logger = logging.getLogger(__name__)
+HEALTH_CHECK_TIMEOUT = int(os.getenv("SGLANG_HEALTH_CHECK_TIMEOUT", 20))
+
+
+def _launch_scheduler_process_only(
+    server_args: ServerArgs,
+    port_args: Optional[PortArgs] = None,
+) -> Tuple[Dict, PortArgs, list]:
+    """
+    Launch only the scheduler process(es) without tokenizer/detokenizer.
+    Returns scheduler info, port args, and list of scheduler processes.
+    """
+    # Configure global environment
+    configure_logger(server_args)
+    server_args.check_server_args()
+
+    # Allocate ports for inter-process communications
+    if port_args is None:
+        port_args = PortArgs.init_new(server_args)
+    logger.info(f"{server_args=}")
+
+    # Prepare model and tokenizer paths
+    server_args.model_path, server_args.tokenizer_path = prepare_model_and_tokenizer(
+        server_args.model_path, server_args.tokenizer_path
+    )
+
+    scheduler_procs = []
+    if server_args.dp_size == 1:
+        memory_saver_adapter = TorchMemorySaverAdapter.create(
+            enable=server_args.enable_memory_saver
+        )
+        scheduler_pipe_readers = []
+
+        nnodes_per_tp_group = max(server_args.nnodes // server_args.pp_size, 1)
+        tp_size_per_node = server_args.tp_size // nnodes_per_tp_group
+        tp_rank_range = range(
+            tp_size_per_node * (server_args.node_rank % nnodes_per_tp_group),
+            tp_size_per_node * (server_args.node_rank % nnodes_per_tp_group + 1),
+        )
+
+        pp_size_per_node = max(server_args.pp_size // server_args.nnodes, 1)
+        pp_rank_range = range(
+            pp_size_per_node * (server_args.node_rank // nnodes_per_tp_group),
+            pp_size_per_node * (server_args.node_rank // nnodes_per_tp_group + 1),
+        )
+
+        for pp_rank in pp_rank_range:
+            for tp_rank in tp_rank_range:
+                reader, writer = mp.Pipe(duplex=False)
+                gpu_id = (
+                    server_args.base_gpu_id
+                    + ((pp_rank % pp_size_per_node) * tp_size_per_node)
+                    + (tp_rank % tp_size_per_node) * server_args.gpu_id_step
+                )
+                moe_ep_rank = tp_rank // (server_args.tp_size // server_args.ep_size)
+                proc = mp.Process(
+                    target=run_scheduler_process,
+                    args=(
+                        server_args,
+                        port_args,
+                        gpu_id,
+                        tp_rank,
+                        moe_ep_rank,
+                        pp_rank,
+                        None,
+                        writer,
+                        None,
+                    ),
+                )
+
+                with memory_saver_adapter.configure_subprocess():
+                    proc.start()
+                scheduler_procs.append(proc)
+                scheduler_pipe_readers.append(reader)
+    else:
+        # Launch the data parallel controller
+        reader, writer = mp.Pipe(duplex=False)
+        scheduler_pipe_readers = [reader]
+        proc = mp.Process(
+            target=run_data_parallel_controller_process,
+            args=(server_args, port_args, writer),
+        )
+        proc.start()
+        scheduler_procs.append(proc)
+
+    # TODO(CatherineSue): handle cases for multi-node
+
+    # Wait for all scheduler processes to be ready
+    scheduler_infos = []
+    for i, reader in enumerate(scheduler_pipe_readers):
+        try:
+            data = reader.recv()
+        except EOFError:
+            logger.error(
+                f"Rank {i} scheduler is dead. Please check if there are relevant logs."
+            )
+            scheduler_procs[i].join()
+            logger.error(f"Exit code: {scheduler_procs[i].exitcode}")
+            raise RuntimeError(f"Failed to initialize scheduler rank {i}")
+
+        if data.get("status") != "ready":
+            raise RuntimeError(
+                f"Scheduler rank {i} initialization failed: {data.get('error', 'Unknown error')}"
+            )
+        scheduler_infos.append(data)
+
+    logger.info(
+        f"All {len(scheduler_procs)} scheduler process(es) initialized successfully"
+    )
+
+    # Return the first scheduler's info (they should all be the same)
+    return scheduler_infos[0], port_args, scheduler_procs
+
+
+class SGLangSchedulerServicer(sglang_scheduler_pb2_grpc.SglangSchedulerServicer):
+    """
+    Standalone gRPC service implementation using GrpcRequestManager.
+    Fully separated from HTTP server with its own process and no shared globals.
+    """
+
+    def __init__(
+        self,
+        request_manager: GrpcRequestManager,
+        server_args: ServerArgs,
+        model_info: Dict,
+    ):
+        """Initialize the standalone gRPC service."""
+        self.request_manager = request_manager
+        self.server_args = server_args
+        self.model_info = model_info
+        self.start_time = time.time()
+
+        # Start the request manager's event loop using auto_create_handle_loop
+        self.request_manager.auto_create_handle_loop()
+
+        logger.info("Standalone gRPC scheduler service initialized")
+
+    async def Generate(
+        self,
+        request: sglang_scheduler_pb2.GenerateRequest,
+        context: grpc.aio.ServicerContext,
+    ) -> AsyncIterator[sglang_scheduler_pb2.GenerateResponse]:
+        """Handle generation requests with streaming responses."""
+        logger.info(f"Generation request: {request.request_id}")
+
+        try:
+            # Convert gRPC request to internal format
+            tokenized_req = self._convert_generate_request(request)
+
+            # Submit to request manager
+            output_queue = await self.request_manager.generate_request(
+                obj=tokenized_req,
+                request_id=request.request_id,
+                grpc_context=context,
+            )
+
+            # Stream outputs
+            while True:
+                try:
+                    # Get output with timeout
+                    output = await asyncio.wait_for(output_queue.get(), timeout=4)
+
+                    # Check for errors
+                    if "error" in output:
+                        yield sglang_scheduler_pb2.GenerateResponse(
+                            request_id=request.request_id,
+                            error=sglang_scheduler_pb2.GenerateError(
+                                message=output["error"],
+                                http_status_code=(
+                                    "500" if "abort" not in output else "499"
+                                ),
+                            ),
+                        )
+                        break
+
+                    # Check if finished
+                    if output.get("finished", False):
+                        # Send completion
+                        yield self._create_completion_response(
+                            request.request_id, output
+                        )
+                        break
+                    else:
+                        # Send chunk
+                        yield self._create_chunk_response(request.request_id, output)
+
+                except asyncio.TimeoutError:
+                    # Check if context is still active
+                    if context.cancelled():
+                        # Abort the request
+                        await self.request_manager.abort_request(request.request_id)
+                        break
+                    continue
+
+        except Exception as e:
+            logger.error(f"Generate failed: {e}\n{get_exception_traceback()}")
+            yield sglang_scheduler_pb2.GenerateResponse(
+                request_id=request.request_id,
+                error=sglang_scheduler_pb2.GenerateError(
+                    message=str(e),
+                    http_status_code="500",
+                    details=get_exception_traceback(),
+                ),
+            )
+
+    async def Embed(
+        self,
+        request: sglang_scheduler_pb2.EmbedRequest,
+        context: grpc.aio.ServicerContext,
+    ) -> sglang_scheduler_pb2.EmbedResponse:
+        """Handle embedding requests."""
+        logger.info(f"Embedding request: {request.request_id}")
+
+        try:
+            # Convert request
+            tokenized_req = self._convert_embed_request(request)
+
+            # Submit to request manager
+            future = await self.request_manager.embedding_request(
+                obj=tokenized_req,
+                request_id=request.request_id,
+            )
+
+            # Wait for result
+            result = await future
+
+            # Create response
+            return sglang_scheduler_pb2.EmbedResponse(
+                request_id=request.request_id,
+                complete=sglang_scheduler_pb2.EmbedComplete(
+                    embedding=result["embedding"],
+                    prompt_tokens=result.get("prompt_tokens", 0),
+                    cached_tokens=0,
+                    embedding_dim=len(result["embedding"]),
+                    generation_time=time.time() - self.start_time,
+                ),
+            )
+
+        except Exception as e:
+            logger.error(f"Embed failed: {e}\n{get_exception_traceback()}")
+            return sglang_scheduler_pb2.EmbedResponse(
+                request_id=request.request_id,
+                error=sglang_scheduler_pb2.EmbedError(
+                    message=str(e),
+                    code="INTERNAL_ERROR",
+                    details=get_exception_traceback(),
+                ),
+            )
+
+    async def HealthCheck(
+        self,
+        request: sglang_scheduler_pb2.HealthCheckRequest,
+        context: grpc.aio.ServicerContext,
+    ) -> sglang_scheduler_pb2.HealthCheckResponse:
+        """Health check by generating from client input."""
+        try:
+            # Check if request manager is shutting down
+            if self.request_manager.gracefully_exit:
+                return sglang_scheduler_pb2.HealthCheckResponse(
+                    healthy=False, message="Server shutting down"
+                )
+
+            # Extract tokenized input from request
+            if not request.HasField("tokenized"):
+                return sglang_scheduler_pb2.HealthCheckResponse(
+                    healthy=False, message="Tokenized input required for health check"
+                )
+
+            input_text = request.tokenized.original_text
+            input_ids = list(request.tokenized.input_ids)
+
+            # Create health check request
+            rid = f"HEALTH_CHECK_GRPC_{time.time()}"
+
+            health_request = TokenizedGenerateReqInput(
+                rid=rid,
+                input_text=input_text,
+                input_ids=input_ids,
+                sampling_params=SGLSamplingParams(max_new_tokens=1, temperature=0.0),
+                stream=False,
+                mm_inputs=None,
+                return_logprob=False,
+                logprob_start_len=-1,
+                top_logprobs_num=0,
+                token_ids_logprob=None,
+            )
+
+            logger.info(f"Sending health check request to request manager...")
+
+            # Submit and wait for response
+            output_queue = await self.request_manager.generate_request(
+                health_request, request_id=rid
+            )
+
+            try:
+                # Wait for response with configurable timeout
+                response = await asyncio.wait_for(
+                    output_queue.get(), timeout=HEALTH_CHECK_TIMEOUT
+                )
+
+                # Clean up
+                if rid in self.request_manager.rid_to_state:
+                    del self.request_manager.rid_to_state[rid]
+
+                return sglang_scheduler_pb2.HealthCheckResponse(
+                    healthy=True, message="Health check passed"
+                )
+
+            except asyncio.TimeoutError:
+                # Clean up on timeout
+                if rid in self.request_manager.rid_to_state:
+                    del self.request_manager.rid_to_state[rid]
+
+                return sglang_scheduler_pb2.HealthCheckResponse(
+                    healthy=False, message="Health check timeout"
+                )
+
+        except Exception as e:
+            logger.error(f"Health check failed: {e}")
+            return sglang_scheduler_pb2.HealthCheckResponse(
+                healthy=False, message=f"Health check error: {str(e)}"
+            )
+
+    async def Abort(
+        self,
+        request: sglang_scheduler_pb2.AbortRequest,
+        context: grpc.aio.ServicerContext,
+    ) -> sglang_scheduler_pb2.AbortResponse:
+        """Abort an ongoing request."""
+        logger.info(f"Aborting request: {request.request_id}")
+
+        try:
+            success = await self.request_manager.abort_request(request.request_id)
+
+            return sglang_scheduler_pb2.AbortResponse(
+                success=success,
+                message=f"Request {request.request_id} {'aborted' if success else 'not found'}",
+            )
+        except Exception as e:
+            logger.error(f"Abort failed: {e}")
+            return sglang_scheduler_pb2.AbortResponse(
+                success=False,
+                message=str(e),
+            )
+
+    # Helper methods for request/response conversion
+
+    def _convert_generate_request(
+        self, grpc_req: sglang_scheduler_pb2.GenerateRequest
+    ) -> TokenizedGenerateReqInput:
+        """Convert gRPC GenerateRequest to internal format."""
+
+        # Extract tokenized input
+        if not grpc_req.HasField("tokenized"):
+            raise ValueError("Tokenized input must be provided")
+
+        input_text = grpc_req.tokenized.original_text
+        input_ids = list(grpc_req.tokenized.input_ids)
+
+        # Convert sampling params
+        sampling_params = self._convert_sampling_params(grpc_req.sampling_params)
+
+        # Create request
+        return TokenizedGenerateReqInput(
+            rid=grpc_req.request_id,
+            input_text=input_text,
+            input_ids=input_ids,
+            mm_inputs=None,  # TODO: implement mm support
+            sampling_params=sampling_params,
+            return_logprob=grpc_req.return_logprob,
+            logprob_start_len=grpc_req.logprob_start_len or -1,
+            top_logprobs_num=grpc_req.top_logprobs_num or 0,
+            stream=True,  # Always stream for gRPC
+            lora_path=grpc_req.lora_id if grpc_req.lora_id else None,
+            token_ids_logprob=(
+                list(grpc_req.token_ids_logprob) if grpc_req.token_ids_logprob else None
+            ),
+        )
+
+    def _convert_embed_request(
+        self, grpc_req: sglang_scheduler_pb2.EmbedRequest
+    ) -> TokenizedEmbeddingReqInput:
+        """Convert gRPC EmbedRequest to internal format."""
+
+        # Extract tokenized input
+        if not grpc_req.HasField("tokenized"):
+            raise ValueError("Tokenized input must be provided")
+
+        input_text = grpc_req.tokenized.original_text
+        input_ids = list(grpc_req.tokenized.input_ids)
+
+        return TokenizedEmbeddingReqInput(
+            rid=grpc_req.request_id,
+            input_text=input_text,
+            input_ids=input_ids,
+        )
+
+    def _convert_sampling_params(
+        self, grpc_params: sglang_scheduler_pb2.SamplingParams
+    ) -> SGLSamplingParams:
+        """Convert gRPC SamplingParams to internal format."""
+
+        # Handle constraint types
+        regex = None
+        json_schema = None
+        ebnf_grammar = None
+
+        if grpc_params.HasField("regex"):
+            regex = grpc_params.regex
+        elif grpc_params.HasField("json_schema"):
+            json_schema = grpc_params.json_schema
+        elif grpc_params.HasField("ebnf_grammar"):
+            ebnf_grammar = grpc_params.ebnf_grammar
+
+        return SGLSamplingParams(
+            temperature=grpc_params.temperature or 1.0,
+            top_p=grpc_params.top_p or 1.0,
+            top_k=grpc_params.top_k or -1,
+            min_p=grpc_params.min_p or 0.0,
+            frequency_penalty=grpc_params.frequency_penalty or 0.0,
+            presence_penalty=grpc_params.presence_penalty or 0.0,
+            repetition_penalty=grpc_params.repetition_penalty or 1.0,
+            max_new_tokens=grpc_params.max_new_tokens or 128,
+            min_new_tokens=grpc_params.min_new_tokens or 0,
+            stop=list(grpc_params.stop) if grpc_params.stop else None,
+            stop_token_ids=(
+                list(grpc_params.stop_token_ids) if grpc_params.stop_token_ids else None
+            ),
+            skip_special_tokens=grpc_params.skip_special_tokens,
+            spaces_between_special_tokens=grpc_params.spaces_between_special_tokens,
+            regex=regex,
+            json_schema=json_schema,
+            ebnf=ebnf_grammar,
+            n=grpc_params.n or 1,
+            ignore_eos=grpc_params.ignore_eos,
+        )
+
+    def _create_chunk_response(
+        self, request_id: str, output: Dict
+    ) -> sglang_scheduler_pb2.GenerateResponse:
+        """Create a streaming chunk response."""
+        return sglang_scheduler_pb2.GenerateResponse(
+            request_id=request_id,
+            chunk=sglang_scheduler_pb2.GenerateStreamChunk(
+                token_id=output["token_ids"][-1] if output.get("token_ids") else 0,
+                text=output.get("text", ""),
+                prompt_tokens=0,
+                completion_tokens=len(output.get("token_ids", [])),
+                cached_tokens=0,
+                generation_time=time.time() - self.start_time,
+                queue_time=0.0,
+            ),
+        )
+
+    def _create_completion_response(
+        self, request_id: str, output: Dict
+    ) -> sglang_scheduler_pb2.GenerateResponse:
+        """Create a completion response."""
+
+        # Determine finish reason
+        finish_reason = sglang_scheduler_pb2.GenerateComplete.STOP
+        meta_info = output.get("meta_info", {})
+        if meta_info.get("finish_reason") == "length":
+            finish_reason = sglang_scheduler_pb2.GenerateComplete.LENGTH
+        elif meta_info.get("finish_reason") == "eos_token":
+            finish_reason = sglang_scheduler_pb2.GenerateComplete.EOS_TOKEN
+
+        return sglang_scheduler_pb2.GenerateResponse(
+            request_id=request_id,
+            complete=sglang_scheduler_pb2.GenerateComplete(
+                output_ids=output.get("token_ids", []),
+                output_text=output.get("text", ""),
+                finish_reason=finish_reason,
+            ),
+        )
+
+    async def shutdown(self):
+        """Shutdown the service."""
+        logger.info("Shutting down gRPC service")
+
+        # Shutdown request manager (handles its own tasks)
+        await self.request_manager.shutdown()
+
+
+async def serve_grpc(
+    server_args: ServerArgs,
+    model_info: Optional[Dict] = None,
+):
+    """Start the standalone gRPC server with integrated scheduler."""
+
+    # Launch only the scheduler process(es) (no tokenizer/detokenizer needed for gRPC)
+    logger.info("Launching scheduler process(es)...")
+    scheduler_info, port_args, scheduler_procs = _launch_scheduler_process_only(
+        server_args=server_args,
+    )
+
+    # Update model info from scheduler info
+    if model_info is None:
+        model_info = {
+            "model_name": server_args.model_path,
+            "max_context_length": scheduler_info.get(
+                "max_total_num_tokens", server_args.context_length or 8192
+            ),
+            "vocab_size": scheduler_info.get("vocab_size", 128256),
+            "supports_vision": scheduler_info.get("supports_vision", False),
+            "model_type": scheduler_info.get("model_type", "transformer"),
+            "max_req_input_len": scheduler_info.get("max_req_input_len", 8192),
+            "eos_token_ids": scheduler_info.get("eos_token_ids", []),
+            "pad_token_id": scheduler_info.get("pad_token_id", 0),
+            "bos_token_id": scheduler_info.get("bos_token_id", 1),
+        }
+
+    # Create request manager with the correct port args
+    request_manager = GrpcRequestManager(
+        server_args=server_args,
+        port_args=port_args,
+    )
+
+    # Create gRPC server
+    server = grpc.aio.server(
+        futures.ThreadPoolExecutor(max_workers=10),
+        options=[
+            ("grpc.max_send_message_length", 1024 * 1024 * 256),
+            ("grpc.max_receive_message_length", 1024 * 1024 * 256),
+        ],
+    )
+
+    # Add service
+    servicer = SGLangSchedulerServicer(
+        request_manager=request_manager,
+        server_args=server_args,
+        model_info=model_info,
+    )
+    sglang_scheduler_pb2_grpc.add_SglangSchedulerServicer_to_server(servicer, server)
+
+    # Enable reflection
+    SERVICE_NAMES = (
+        sglang_scheduler_pb2.DESCRIPTOR.services_by_name["SglangScheduler"].full_name,
+        reflection.SERVICE_NAME,
+    )
+    reflection.enable_server_reflection(SERVICE_NAMES, server)
+
+    # Start server
+    listen_addr = f"{server_args.host}:{server_args.port}"
+    server.add_insecure_port(listen_addr)
+
+    logger.info(f"Starting standalone gRPC server on {listen_addr}")
+
+    await server.start()
+
+    # Handle shutdown signals
+    loop = asyncio.get_running_loop()
+    stop_event = asyncio.Event()
+
+    def signal_handler():
+        logger.info("Received shutdown signal")
+        stop_event.set()
+
+    for sig in (signal.SIGTERM, signal.SIGINT):
+        loop.add_signal_handler(sig, signal_handler)
+
+    try:
+        await stop_event.wait()
+    finally:
+        logger.info("Shutting down gRPC server")
+        await servicer.shutdown()
+        await server.stop(5.0)
+
+        # Terminate scheduler processes
+        for i, proc in enumerate(scheduler_procs):
+            if proc and proc.is_alive():
+                logger.info(f"Terminating scheduler process {i}...")
+                proc.terminate()
+                proc.join(timeout=5.0)
+                if proc.is_alive():
+                    logger.warning(f"Force killing scheduler process {i}...")
+                    proc.kill()
+                    proc.join()
+
+
+def main():
+    """Main entry point for standalone gRPC server."""
+    # Fix CUDA multiprocessing issues - must be called before any CUDA operations
+    mp.set_start_method("spawn", force=True)
+
+    parser = argparse.ArgumentParser(description="SGLang Standalone gRPC Server")
+
+    # Server arguments
+    parser.add_argument("--host", type=str, default="0.0.0.0", help="Host to bind to")
+    parser.add_argument("--port", type=int, default=30000, help="gRPC server port")
+
+    # Model arguments
+    parser.add_argument("--model-path", type=str, required=True, help="Model path")
+    parser.add_argument("--tokenizer-path", type=str, help="Tokenizer path")
+    parser.add_argument("--context-length", type=int, help="Context length")
+    parser.add_argument("--tp-size", type=int, default=1, help="Tensor parallel size")
+    parser.add_argument("--dp-size", type=int, default=1, help="Data parallel size")
+
+    # Runtime arguments
+    parser.add_argument(
+        "--max-running-requests", type=int, default=2048, help="Max concurrent requests"
+    )
+    parser.add_argument(
+        "--max-total-tokens", type=int, default=1000000, help="Max total tokens"
+    )
+    parser.add_argument(
+        "--max-prefill-tokens", type=int, default=16384, help="Max prefill tokens"
+    )
+    parser.add_argument(
+        "--attention-backend", type=str, default="flashinfer", help="Attention backend"
+    )
+    parser.add_argument("--lora-paths", type=str, help="LoRA adapter paths")
+
+    # Logging
+    parser.add_argument("--log-level", type=str, default="INFO", help="Logging level")
+
+    args = parser.parse_args()
+
+    # Convert to ServerArgs with gRPC host and port
+    server_args = ServerArgs(
+        model_path=args.model_path,
+        tokenizer_path=args.tokenizer_path or args.model_path,
+        context_length=args.context_length,
+        tp_size=args.tp_size,
+        dp_size=args.dp_size,
+        max_running_requests=args.max_running_requests,
+        max_total_tokens=args.max_total_tokens,
+        max_prefill_tokens=args.max_prefill_tokens,
+        attention_backend=args.attention_backend,
+        lora_paths=args.lora_paths.split(",") if args.lora_paths else None,
+        log_level=args.log_level,
+        # Override with gRPC server host and port
+        host=args.host,
+        port=args.port,
+    )
+
+    # Run server
+    asyncio.run(
+        serve_grpc(
+            server_args=server_args,
+        )
+    )
+
+
+if __name__ == "__main__":
+    main()
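
For reference, below is a minimal asynchronous client sketch against the new standalone scheduler service added above. It assumes the server was started on localhost:30000 (e.g. via "python -m sglang.srt.entrypoints.grpc_server --model-path <model> --port 30000", per the main() entrypoint in this file). The RPC and field names (Generate, request_id, tokenized.original_text, tokenized.input_ids, sampling_params, and the chunk/complete/error response cases) follow the servicer implementation shown in the diff; the stub class name SglangSchedulerStub and the tokenized sub-message name TokenizedInput are assumptions based on standard protoc naming and are not confirmed by this diff. Note that the server performs no tokenization, so the client must send pre-tokenized input ids.

import asyncio

import grpc

from sglang.srt.grpc import sglang_scheduler_pb2, sglang_scheduler_pb2_grpc


async def run_client():
    async with grpc.aio.insecure_channel("localhost:30000") as channel:
        # Stub class name assumed from standard protoc naming for service "SglangScheduler"
        stub = sglang_scheduler_pb2_grpc.SglangSchedulerStub(channel)
        request = sglang_scheduler_pb2.GenerateRequest(
            request_id="demo-1",
            # The sub-message name TokenizedInput is an assumption; the fields
            # original_text / input_ids match the servicer code above.
            tokenized=sglang_scheduler_pb2.TokenizedInput(
                original_text="Hello",
                input_ids=[9906],  # illustrative pre-tokenized prompt ids
            ),
            sampling_params=sglang_scheduler_pb2.SamplingParams(
                temperature=0.7,
                max_new_tokens=32,
            ),
        )
        # Generate is server-streaming: chunks arrive first, then a final complete
        # (or error) message, mirroring _create_chunk_response / _create_completion_response.
        async for response in stub.Generate(request):
            if response.HasField("chunk"):
                print(response.chunk.text, end="", flush=True)
            elif response.HasField("complete"):
                print("\n[finish_reason]", response.complete.finish_reason)
                break
            elif response.HasField("error"):
                print("\n[error]", response.error.message)
                break


if __name__ == "__main__":
    asyncio.run(run_client())

Because the server enables gRPC reflection, the same service can also be explored interactively with generic tools such as grpcurl, without the generated Python stubs.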