sglang 0.5.2rc2__py3-none-any.whl → 0.5.3.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (396)
  1. sglang/bench_one_batch.py +7 -11
  2. sglang/bench_one_batch_server.py +330 -31
  3. sglang/bench_serving.py +474 -142
  4. sglang/compile_deep_gemm.py +3 -0
  5. sglang/global_config.py +2 -2
  6. sglang/lang/backend/runtime_endpoint.py +1 -1
  7. sglang/profiler.py +2 -2
  8. sglang/srt/batch_invariant_ops/__init__.py +27 -0
  9. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +549 -0
  10. sglang/srt/configs/__init__.py +10 -0
  11. sglang/srt/configs/device_config.py +3 -1
  12. sglang/srt/configs/dots_ocr.py +64 -0
  13. sglang/srt/configs/dots_vlm.py +139 -0
  14. sglang/srt/configs/falcon_h1.py +314 -0
  15. sglang/srt/configs/load_config.py +9 -0
  16. sglang/srt/configs/mamba_utils.py +117 -0
  17. sglang/srt/configs/model_config.py +228 -92
  18. sglang/srt/configs/nemotron_h.py +286 -0
  19. sglang/srt/configs/qwen3_next.py +294 -0
  20. sglang/srt/configs/qwen3_vl.py +586 -0
  21. sglang/srt/connector/__init__.py +8 -1
  22. sglang/srt/connector/remote_instance.py +82 -0
  23. sglang/srt/constrained/base_grammar_backend.py +49 -12
  24. sglang/srt/constrained/llguidance_backend.py +0 -1
  25. sglang/srt/constrained/outlines_backend.py +0 -1
  26. sglang/srt/constrained/outlines_jump_forward.py +1 -1
  27. sglang/srt/constrained/xgrammar_backend.py +30 -9
  28. sglang/srt/custom_op.py +11 -1
  29. sglang/srt/debug_utils/dump_comparator.py +81 -44
  30. sglang/srt/debug_utils/dump_loader.py +97 -0
  31. sglang/srt/debug_utils/dumper.py +21 -6
  32. sglang/srt/debug_utils/text_comparator.py +73 -11
  33. sglang/srt/disaggregation/ascend/conn.py +2 -2
  34. sglang/srt/disaggregation/ascend/transfer_engine.py +47 -9
  35. sglang/srt/disaggregation/base/conn.py +1 -1
  36. sglang/srt/disaggregation/common/conn.py +279 -108
  37. sglang/srt/disaggregation/decode.py +78 -37
  38. sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
  39. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +29 -17
  40. sglang/srt/disaggregation/fake/conn.py +1 -1
  41. sglang/srt/disaggregation/mini_lb.py +6 -445
  42. sglang/srt/disaggregation/mooncake/conn.py +55 -537
  43. sglang/srt/disaggregation/nixl/conn.py +373 -68
  44. sglang/srt/disaggregation/prefill.py +53 -49
  45. sglang/srt/disaggregation/utils.py +40 -54
  46. sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
  47. sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
  48. sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
  49. sglang/srt/distributed/parallel_state.py +156 -80
  50. sglang/srt/entrypoints/engine.py +59 -18
  51. sglang/srt/entrypoints/grpc_request_manager.py +842 -0
  52. sglang/srt/entrypoints/grpc_server.py +950 -0
  53. sglang/srt/entrypoints/http_server.py +179 -60
  54. sglang/srt/entrypoints/openai/protocol.py +265 -29
  55. sglang/srt/entrypoints/openai/serving_base.py +65 -3
  56. sglang/srt/entrypoints/openai/serving_chat.py +213 -122
  57. sglang/srt/entrypoints/openai/serving_completions.py +14 -3
  58. sglang/srt/entrypoints/openai/serving_embedding.py +9 -3
  59. sglang/srt/entrypoints/openai/serving_rerank.py +3 -1
  60. sglang/srt/entrypoints/openai/serving_responses.py +48 -3
  61. sglang/srt/entrypoints/openai/serving_score.py +1 -0
  62. sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
  63. sglang/srt/environ.py +289 -0
  64. sglang/srt/eplb/eplb_manager.py +2 -2
  65. sglang/srt/eplb/expert_distribution.py +26 -13
  66. sglang/srt/eplb/expert_location.py +38 -8
  67. sglang/srt/eplb/expert_location_updater.py +1 -1
  68. sglang/srt/function_call/base_format_detector.py +3 -6
  69. sglang/srt/function_call/ebnf_composer.py +11 -9
  70. sglang/srt/function_call/function_call_parser.py +17 -8
  71. sglang/srt/function_call/glm4_moe_detector.py +4 -4
  72. sglang/srt/function_call/gpt_oss_detector.py +23 -0
  73. sglang/srt/function_call/json_array_parser.py +63 -0
  74. sglang/srt/function_call/kimik2_detector.py +17 -4
  75. sglang/srt/function_call/qwen3_coder_detector.py +1 -1
  76. sglang/srt/function_call/utils.py +96 -5
  77. sglang/srt/grpc/__init__.py +1 -0
  78. sglang/srt/grpc/compile_proto.py +245 -0
  79. sglang/srt/grpc/sglang_scheduler_pb2.py +119 -0
  80. sglang/srt/grpc/sglang_scheduler_pb2.pyi +492 -0
  81. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +327 -0
  82. sglang/srt/layers/activation.py +143 -9
  83. sglang/srt/layers/attention/aiter_backend.py +14 -15
  84. sglang/srt/layers/attention/ascend_backend.py +115 -9
  85. sglang/srt/layers/attention/attention_registry.py +215 -0
  86. sglang/srt/layers/attention/base_attn_backend.py +12 -3
  87. sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
  88. sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
  89. sglang/srt/layers/attention/fla/chunk.py +242 -0
  90. sglang/srt/layers/attention/fla/chunk_delta_h.py +314 -0
  91. sglang/srt/layers/attention/fla/chunk_o.py +178 -0
  92. sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +151 -0
  93. sglang/srt/layers/attention/fla/cumsum.py +300 -0
  94. sglang/srt/layers/attention/fla/fused_recurrent.py +640 -0
  95. sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +232 -0
  96. sglang/srt/layers/attention/fla/index.py +37 -0
  97. sglang/srt/layers/attention/fla/l2norm.py +150 -0
  98. sglang/srt/layers/attention/fla/layernorm_gated.py +343 -0
  99. sglang/srt/layers/attention/fla/op.py +66 -0
  100. sglang/srt/layers/attention/fla/solve_tril.py +465 -0
  101. sglang/srt/layers/attention/fla/utils.py +331 -0
  102. sglang/srt/layers/attention/fla/wy_fast.py +158 -0
  103. sglang/srt/layers/attention/flashattention_backend.py +40 -8
  104. sglang/srt/layers/attention/flashinfer_backend.py +341 -204
  105. sglang/srt/layers/attention/flashinfer_mla_backend.py +28 -28
  106. sglang/srt/layers/attention/flashmla_backend.py +7 -5
  107. sglang/srt/layers/attention/hybrid_attn_backend.py +68 -53
  108. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +708 -0
  109. sglang/srt/layers/attention/intel_amx_backend.py +3 -0
  110. sglang/srt/layers/attention/mamba/causal_conv1d.py +129 -0
  111. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +974 -0
  112. sglang/srt/layers/attention/mamba/mamba.py +577 -0
  113. sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
  114. sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
  115. sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
  116. sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
  117. sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
  118. sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +214 -0
  119. sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +562 -0
  120. sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +646 -0
  121. sglang/srt/layers/attention/mamba/ops/ssd_combined.py +262 -0
  122. sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +264 -0
  123. sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
  124. sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
  125. sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
  126. sglang/srt/layers/attention/nsa/nsa_indexer.py +761 -0
  127. sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
  128. sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
  129. sglang/srt/layers/attention/nsa/transform_index.py +144 -0
  130. sglang/srt/layers/attention/nsa/utils.py +24 -0
  131. sglang/srt/layers/attention/nsa_backend.py +887 -0
  132. sglang/srt/layers/attention/tbo_backend.py +6 -6
  133. sglang/srt/layers/attention/torch_flex_backend.py +325 -0
  134. sglang/srt/layers/attention/torch_native_backend.py +12 -6
  135. sglang/srt/layers/attention/triton_backend.py +57 -7
  136. sglang/srt/layers/attention/trtllm_mha_backend.py +5 -7
  137. sglang/srt/layers/attention/trtllm_mla_backend.py +276 -39
  138. sglang/srt/layers/attention/vision.py +58 -0
  139. sglang/srt/layers/attention/wave_backend.py +4 -4
  140. sglang/srt/layers/attention/wave_ops/decode_attention.py +2 -4
  141. sglang/srt/layers/attention/wave_ops/extend_attention.py +1 -3
  142. sglang/srt/layers/communicator.py +8 -0
  143. sglang/srt/layers/dp_attention.py +41 -2
  144. sglang/srt/layers/elementwise.py +3 -1
  145. sglang/srt/layers/layernorm.py +34 -15
  146. sglang/srt/layers/linear.py +55 -7
  147. sglang/srt/layers/logits_processor.py +180 -18
  148. sglang/srt/layers/modelopt_utils.py +11 -0
  149. sglang/srt/layers/moe/__init__.py +2 -1
  150. sglang/srt/layers/moe/cutlass_w4a8_moe.py +21 -24
  151. sglang/srt/layers/moe/ep_moe/kernels.py +33 -454
  152. sglang/srt/layers/moe/ep_moe/layer.py +248 -333
  153. sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +183 -0
  154. sglang/srt/layers/moe/fused_moe_native.py +5 -3
  155. sglang/srt/layers/moe/fused_moe_triton/configs/{triton_3_4_0/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json } +35 -35
  156. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=352,device_name=NVIDIA_RTX_5880_Ada_Generation,dtype=fp8_w8a8.json +146 -0
  157. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  158. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=512,device_name=NVIDIA_H20.json +146 -0
  159. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  160. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
  161. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
  162. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  163. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
  164. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
  165. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
  166. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  167. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
  168. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -2
  169. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +7 -3
  170. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +23 -20
  171. sglang/srt/layers/moe/fused_moe_triton/layer.py +68 -72
  172. sglang/srt/layers/moe/moe_runner/__init__.py +2 -1
  173. sglang/srt/layers/moe/moe_runner/base.py +274 -1
  174. sglang/srt/layers/moe/moe_runner/deep_gemm.py +304 -0
  175. sglang/srt/layers/moe/moe_runner/runner.py +83 -0
  176. sglang/srt/layers/moe/moe_runner/triton.py +448 -0
  177. sglang/srt/layers/moe/token_dispatcher/__init__.py +16 -4
  178. sglang/srt/layers/moe/token_dispatcher/{base_dispatcher.py → base.py} +67 -17
  179. sglang/srt/layers/moe/token_dispatcher/deepep.py +118 -56
  180. sglang/srt/layers/moe/token_dispatcher/standard.py +44 -2
  181. sglang/srt/layers/moe/topk.py +30 -9
  182. sglang/srt/layers/moe/utils.py +29 -7
  183. sglang/srt/layers/parameter.py +23 -6
  184. sglang/srt/layers/quantization/__init__.py +1 -1
  185. sglang/srt/layers/quantization/awq.py +19 -7
  186. sglang/srt/layers/quantization/base_config.py +11 -6
  187. sglang/srt/layers/quantization/blockwise_int8.py +38 -27
  188. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +1 -0
  189. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +50 -30
  190. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +2 -0
  191. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +13 -1
  192. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
  193. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +2 -10
  194. sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +27 -0
  195. sglang/srt/layers/quantization/fp8.py +155 -60
  196. sglang/srt/layers/quantization/fp8_utils.py +51 -32
  197. sglang/srt/layers/quantization/gptq.py +25 -17
  198. sglang/srt/layers/quantization/modelopt_quant.py +191 -56
  199. sglang/srt/layers/quantization/moe_wna16.py +21 -18
  200. sglang/srt/layers/quantization/mxfp4.py +74 -42
  201. sglang/srt/layers/quantization/quark/quark.py +3 -1
  202. sglang/srt/layers/quantization/quark/quark_moe.py +48 -30
  203. sglang/srt/layers/quantization/unquant.py +135 -47
  204. sglang/srt/layers/quantization/w4afp8.py +28 -33
  205. sglang/srt/layers/quantization/w8a8_fp8.py +35 -20
  206. sglang/srt/layers/quantization/w8a8_int8.py +91 -41
  207. sglang/srt/layers/rotary_embedding.py +78 -31
  208. sglang/srt/layers/sampler.py +213 -21
  209. sglang/srt/layers/utils.py +23 -0
  210. sglang/srt/lora/backend/base_backend.py +50 -8
  211. sglang/srt/lora/backend/chunked_backend.py +348 -0
  212. sglang/srt/lora/backend/triton_backend.py +99 -5
  213. sglang/srt/lora/layers.py +32 -0
  214. sglang/srt/lora/lora.py +8 -3
  215. sglang/srt/lora/lora_manager.py +44 -118
  216. sglang/srt/lora/mem_pool.py +25 -11
  217. sglang/srt/lora/triton_ops/__init__.py +4 -0
  218. sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
  219. sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +174 -0
  220. sglang/srt/lora/utils.py +22 -11
  221. sglang/srt/managers/async_dynamic_batch_tokenizer.py +170 -0
  222. sglang/srt/managers/cache_controller.py +199 -301
  223. sglang/srt/managers/data_parallel_controller.py +115 -80
  224. sglang/srt/managers/detokenizer_manager.py +19 -15
  225. sglang/srt/managers/disagg_service.py +46 -0
  226. sglang/srt/managers/io_struct.py +340 -109
  227. sglang/srt/managers/mm_utils.py +44 -6
  228. sglang/srt/managers/multi_tokenizer_mixin.py +357 -407
  229. sglang/srt/managers/multimodal_processor.py +1 -2
  230. sglang/srt/managers/overlap_utils.py +55 -0
  231. sglang/srt/managers/schedule_batch.py +343 -212
  232. sglang/srt/managers/schedule_policy.py +145 -18
  233. sglang/srt/managers/scheduler.py +653 -273
  234. sglang/srt/managers/scheduler_input_blocker.py +1 -1
  235. sglang/srt/managers/scheduler_metrics_mixin.py +99 -126
  236. sglang/srt/managers/scheduler_output_processor_mixin.py +255 -108
  237. sglang/srt/managers/scheduler_profiler_mixin.py +6 -6
  238. sglang/srt/managers/scheduler_update_weights_mixin.py +7 -0
  239. sglang/srt/managers/tokenizer_communicator_mixin.py +675 -0
  240. sglang/srt/managers/tokenizer_manager.py +579 -674
  241. sglang/srt/managers/tp_worker.py +96 -26
  242. sglang/srt/managers/utils.py +1 -45
  243. sglang/srt/mem_cache/allocator.py +21 -22
  244. sglang/srt/mem_cache/allocator_ascend.py +41 -27
  245. sglang/srt/mem_cache/base_prefix_cache.py +1 -1
  246. sglang/srt/mem_cache/chunk_cache.py +9 -2
  247. sglang/srt/mem_cache/evict_policy.py +23 -0
  248. sglang/srt/mem_cache/hicache_storage.py +43 -24
  249. sglang/srt/mem_cache/hiradix_cache.py +222 -75
  250. sglang/srt/mem_cache/memory_pool.py +651 -80
  251. sglang/srt/mem_cache/memory_pool_host.py +239 -228
  252. sglang/srt/mem_cache/radix_cache.py +227 -73
  253. sglang/srt/mem_cache/radix_cache_cpp.py +11 -8
  254. sglang/srt/mem_cache/storage/__init__.py +10 -0
  255. sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +151 -0
  256. sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +109 -0
  257. sglang/srt/mem_cache/storage/backend_factory.py +223 -0
  258. sglang/srt/mem_cache/storage/eic/eic_storage.py +778 -0
  259. sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
  260. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +164 -0
  261. sglang/srt/mem_cache/storage/hf3fs/{client_hf3fs.py → hf3fs_usrbio_client.py} +5 -1
  262. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +259 -62
  263. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +284 -0
  264. sglang/srt/mem_cache/storage/lmcache/unit_test.py +121 -0
  265. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +166 -17
  266. sglang/srt/mem_cache/swa_radix_cache.py +93 -48
  267. sglang/srt/metrics/collector.py +511 -132
  268. sglang/srt/metrics/func_timer.py +2 -7
  269. sglang/srt/metrics/startup_func_log_and_timer.py +150 -0
  270. sglang/srt/metrics/utils.py +8 -1
  271. sglang/srt/model_executor/cpu_graph_runner.py +640 -0
  272. sglang/srt/model_executor/cuda_graph_runner.py +52 -37
  273. sglang/srt/model_executor/forward_batch_info.py +74 -46
  274. sglang/srt/model_executor/model_runner.py +455 -176
  275. sglang/srt/model_executor/npu_graph_runner.py +12 -5
  276. sglang/srt/model_loader/__init__.py +10 -4
  277. sglang/srt/model_loader/loader.py +319 -10
  278. sglang/srt/model_loader/remote_instance_weight_loader_utils.py +69 -0
  279. sglang/srt/model_loader/weight_utils.py +161 -3
  280. sglang/srt/models/apertus.py +686 -0
  281. sglang/srt/models/bailing_moe.py +820 -217
  282. sglang/srt/models/bailing_moe_nextn.py +168 -0
  283. sglang/srt/models/deepseek_nextn.py +6 -1
  284. sglang/srt/models/deepseek_v2.py +607 -130
  285. sglang/srt/models/dots_ocr.py +173 -0
  286. sglang/srt/models/dots_vlm.py +174 -0
  287. sglang/srt/models/dots_vlm_vit.py +337 -0
  288. sglang/srt/models/ernie4.py +1 -1
  289. sglang/srt/models/falcon_h1.py +578 -0
  290. sglang/srt/models/gemma3_causal.py +0 -2
  291. sglang/srt/models/gemma3_mm.py +17 -1
  292. sglang/srt/models/gemma3n_mm.py +2 -2
  293. sglang/srt/models/glm4_moe.py +4 -4
  294. sglang/srt/models/glm4_moe_nextn.py +2 -2
  295. sglang/srt/models/glm4v.py +5 -3
  296. sglang/srt/models/glm4v_moe.py +4 -1
  297. sglang/srt/models/gpt_oss.py +8 -31
  298. sglang/srt/models/grok.py +5 -13
  299. sglang/srt/models/kimi_vl_moonvit.py +2 -2
  300. sglang/srt/models/llama.py +4 -0
  301. sglang/srt/models/llama4.py +9 -0
  302. sglang/srt/models/llama_eagle3.py +13 -0
  303. sglang/srt/models/longcat_flash.py +3 -3
  304. sglang/srt/models/longcat_flash_nextn.py +1 -1
  305. sglang/srt/models/mixtral.py +1 -3
  306. sglang/srt/models/mllama4.py +50 -4
  307. sglang/srt/models/nemotron_h.py +514 -0
  308. sglang/srt/models/opt.py +637 -0
  309. sglang/srt/models/qwen2_5_vl.py +29 -5
  310. sglang/srt/models/qwen2_audio.py +1 -1
  311. sglang/srt/models/qwen2_moe.py +120 -13
  312. sglang/srt/models/qwen2_vl.py +1 -1
  313. sglang/srt/models/qwen3.py +18 -3
  314. sglang/srt/models/qwen3_moe.py +32 -4
  315. sglang/srt/models/qwen3_next.py +1069 -0
  316. sglang/srt/models/qwen3_next_mtp.py +112 -0
  317. sglang/srt/models/qwen3_vl.py +787 -0
  318. sglang/srt/models/qwen3_vl_moe.py +471 -0
  319. sglang/srt/models/registry.py +15 -3
  320. sglang/srt/models/sarashina2_vision.py +269 -0
  321. sglang/srt/models/solar.py +505 -0
  322. sglang/srt/models/starcoder2.py +357 -0
  323. sglang/srt/models/step3_vl.py +1 -1
  324. sglang/srt/models/torch_native_llama.py +9 -2
  325. sglang/srt/models/utils.py +55 -0
  326. sglang/srt/multimodal/processors/base_processor.py +15 -7
  327. sglang/srt/multimodal/processors/dots_vlm.py +98 -0
  328. sglang/srt/multimodal/processors/glm4v.py +9 -9
  329. sglang/srt/multimodal/processors/internvl.py +153 -129
  330. sglang/srt/multimodal/processors/qwen_vl.py +23 -6
  331. sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
  332. sglang/srt/offloader.py +27 -3
  333. sglang/srt/parser/jinja_template_utils.py +6 -0
  334. sglang/srt/sampling/sampling_batch_info.py +49 -26
  335. sglang/srt/sampling/sampling_params.py +7 -0
  336. sglang/srt/server_args.py +1051 -285
  337. sglang/srt/server_args_config_parser.py +146 -0
  338. sglang/srt/single_batch_overlap.py +151 -0
  339. sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
  340. sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
  341. sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
  342. sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
  343. sglang/srt/speculative/cpp_ngram/param.h +125 -0
  344. sglang/srt/speculative/cpp_ngram/queue.h +71 -0
  345. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +7 -1
  346. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +13 -2
  347. sglang/srt/speculative/{eagle_utils.py → eagle_info.py} +207 -757
  348. sglang/srt/speculative/eagle_worker.py +98 -29
  349. sglang/srt/speculative/ngram_info.py +428 -0
  350. sglang/srt/speculative/ngram_worker.py +246 -0
  351. sglang/srt/speculative/spec_info.py +52 -0
  352. sglang/srt/speculative/spec_utils.py +605 -0
  353. sglang/srt/speculative/standalone_worker.py +109 -0
  354. sglang/srt/torch_memory_saver_adapter.py +5 -7
  355. sglang/srt/tracing/trace.py +578 -0
  356. sglang/srt/two_batch_overlap.py +9 -5
  357. sglang/srt/utils/__init__.py +2 -0
  358. sglang/srt/{utils.py → utils/common.py} +451 -77
  359. sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +55 -5
  360. sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
  361. sglang/srt/utils/rpd_utils.py +452 -0
  362. sglang/srt/utils/slow_rank_detector.py +71 -0
  363. sglang/srt/warmup.py +8 -4
  364. sglang/srt/weight_sync/utils.py +2 -2
  365. sglang/test/attention/test_trtllm_mla_backend.py +169 -5
  366. sglang/test/get_logits_ut.py +57 -0
  367. sglang/test/longbench_v2/__init__.py +1 -0
  368. sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
  369. sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
  370. sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
  371. sglang/test/run_eval.py +119 -11
  372. sglang/test/runners.py +5 -1
  373. sglang/test/simple_eval_common.py +5 -2
  374. sglang/test/simple_eval_longbench_v2.py +332 -0
  375. sglang/test/simple_eval_mmmu_vlm.py +441 -0
  376. sglang/test/test_block_fp8.py +2 -2
  377. sglang/test/test_cutlass_moe.py +24 -6
  378. sglang/test/test_cutlass_w4a8_moe.py +9 -19
  379. sglang/test/test_deterministic.py +313 -0
  380. sglang/test/test_deterministic_utils.py +81 -0
  381. sglang/test/test_disaggregation_utils.py +140 -0
  382. sglang/test/test_fp4_moe.py +370 -1
  383. sglang/test/test_programs.py +1 -1
  384. sglang/test/test_utils.py +407 -8
  385. sglang/utils.py +21 -1
  386. sglang/version.py +1 -1
  387. {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/METADATA +69 -124
  388. {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/RECORD +392 -251
  389. sglang/srt/disaggregation/launch_lb.py +0 -118
  390. sglang/srt/managers/tp_worker_overlap_thread.py +0 -296
  391. sglang/srt/mem_cache/lora_radix_cache.py +0 -421
  392. sglang/test/test_block_fp8_ep.py +0 -358
  393. /sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
  394. {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/WHEEL +0 -0
  395. {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/licenses/LICENSE +0 -0
  396. {sglang-0.5.2rc2.dist-info → sglang-0.5.3.post1.dist-info}/top_level.txt +0 -0
sglang/srt/entrypoints/http_server.py (+179 -60)

@@ -27,9 +27,9 @@ import tempfile
 import threading
 import time
 from http import HTTPStatus
-from typing import Any, AsyncIterator, Callable, Dict, List, Optional
+from typing import Any, AsyncIterator, Callable, Dict, List, Optional, Union
 
-import setproctitle
+from sglang.srt.tracing.trace import process_tracing_init, trace_set_thread_info
 
 # Fix a bug of Python threading
 setattr(threading, "_register_atexit", lambda *args, **kwargs: None)
@@ -47,21 +47,19 @@ from fastapi.exceptions import RequestValidationError
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import ORJSONResponse, Response, StreamingResponse
 
-from sglang.srt.disaggregation.utils import (
-    FAKE_BOOTSTRAP_HOST,
-    DisaggregationMode,
-    register_disaggregation_server,
-)
+from sglang.srt.disaggregation.utils import FAKE_BOOTSTRAP_HOST, DisaggregationMode
 from sglang.srt.entrypoints.engine import _launch_subprocesses
 from sglang.srt.entrypoints.openai.protocol import (
     ChatCompletionRequest,
     CompletionRequest,
+    DetokenizeRequest,
     EmbeddingRequest,
     ErrorResponse,
     ModelCard,
     ModelList,
     ResponsesRequest,
     ScoringRequest,
+    TokenizeRequest,
     V1RerankReqInput,
 )
 from sglang.srt.entrypoints.openai.serving_chat import OpenAIServingChat
@@ -69,14 +67,20 @@ from sglang.srt.entrypoints.openai.serving_completions import OpenAIServingCompl
 from sglang.srt.entrypoints.openai.serving_embedding import OpenAIServingEmbedding
 from sglang.srt.entrypoints.openai.serving_rerank import OpenAIServingRerank
 from sglang.srt.entrypoints.openai.serving_score import OpenAIServingScore
+from sglang.srt.entrypoints.openai.serving_tokenize import (
+    OpenAIServingDetokenize,
+    OpenAIServingTokenize,
+)
 from sglang.srt.function_call.function_call_parser import FunctionCallParser
 from sglang.srt.managers.io_struct import (
     AbortReq,
     CloseSessionReqInput,
     ConfigureLoggingReq,
+    DestroyWeightsUpdateGroupReqInput,
     EmbeddingReqInput,
     GenerateReqInput,
     GetWeightsByNameReqInput,
+    InitWeightsSendGroupForRemoteInstanceReqInput,
     InitWeightsUpdateGroupReqInput,
     LoadLoRAAdapterReqInput,
     OpenSessionReqInput,
@@ -84,6 +88,7 @@ from sglang.srt.managers.io_struct import (
     ProfileReqInput,
     ReleaseMemoryOccupationReqInput,
     ResumeMemoryOccupationReqInput,
+    SendWeightsToRemoteInstanceReqInput,
     SeparateReasoningReqInput,
     SetInternalStateReq,
     SlowDownReqInput,
@@ -95,9 +100,10 @@ from sglang.srt.managers.io_struct import (
     VertexGenerateReqInput,
 )
 from sglang.srt.managers.multi_tokenizer_mixin import (
-    MultiTokenizerManager,
-    deserialize_data,
+    MultiTokenizerRouter,
+    TokenizerWorker,
     get_main_process_id,
+    monkey_patch_uvicorn_multiprocessing,
     read_from_shared_memory,
     write_data_for_multi_tokenizer,
 )
@@ -127,7 +133,7 @@ HEALTH_CHECK_TIMEOUT = int(os.getenv("SGLANG_HEALTH_CHECK_TIMEOUT", 20))
 # Store global states
 @dataclasses.dataclass
 class _GlobalState:
-    tokenizer_manager: TokenizerManager
+    tokenizer_manager: Union[TokenizerManager, MultiTokenizerRouter, TokenizerWorker]
     template_manager: TemplateManager
     scheduler_info: Dict
 
@@ -140,21 +146,6 @@ def set_global_state(global_state: _GlobalState):
     _global_state = global_state
 
 
-# Function to set up all middlewares for multi-tokenizer compatibility
-def setup_middlewares(api_key: Optional[str], enable_metrics: bool):
-    """Setup all middlewares for both single and multi-process modes"""
-    worker_pid = os.getpid()
-
-    if api_key:
-        add_api_key_middleware(app, api_key)
-        logger.info(f"Worker {worker_pid} added API key middleware")
-
-    if enable_metrics:
-        add_prometheus_middleware(app)
-        enable_func_timer()
-        logger.info(f"Worker {worker_pid} added prometheus middleware")
-
-
 async def init_multi_tokenizer() -> ServerArgs:
     """Read args information from shm and init tokenizer manager for current process"""
     pid = os.getpid()
@@ -162,18 +153,22 @@ async def init_multi_tokenizer() -> ServerArgs:
     logger.info(f"current worker_id: {pid}, main processID: {main_pid}")
 
     # Read configuration from shared memory
-    port_args_data = read_from_shared_memory(f"port_args_{main_pid}")
-    server_args_data = read_from_shared_memory(f"server_args_{main_pid}")
-    scheduler_info_data = read_from_shared_memory(f"scheduler_info_{main_pid}")
-    port_args, server_args = deserialize_data(port_args_data, server_args_data)
-    scheduler_info = scheduler_info_data
+    port_args, server_args, scheduler_info = read_from_shared_memory(
+        f"multi_tokenizer_args_{main_pid}"
+    )
+    server_args: ServerArgs
+
+    # API key authentication is not supported in multi-tokenizer mode
+    assert (
+        server_args.api_key is None
+    ), "API key is not supported in multi-tokenizer mode"
 
     port_args.tokenizer_ipc_name = (
         f"ipc://{tempfile.NamedTemporaryFile(delete=False).name}"
     )
 
     # Launch multi-tokenizer manager process
-    tokenizer_manager = MultiTokenizerManager(server_args, port_args)
+    tokenizer_manager = TokenizerWorker(server_args, port_args)
     template_manager = TemplateManager()
     template_manager.initialize_templates(
         tokenizer_manager=tokenizer_manager,
@@ -192,18 +187,29 @@
             scheduler_info=scheduler_info,
         )
     )
+
+    if server_args.enable_trace:
+        process_tracing_init(server_args.oltp_traces_endpoint, "sglang")
+        if server_args.disaggregation_mode == "null":
+            thread_label = f"MultiTokenizer-{tokenizer_manager.worker_id}"
+            trace_set_thread_info(thread_label)
+
     return server_args
 
 
 @asynccontextmanager
 async def lifespan(fast_api_app: FastAPI):
-    server_args = getattr(fast_api_app, "server_args", None)
-    if server_args is None:
+    if not getattr(fast_api_app, "is_single_tokenizer_mode", False):
         # Initialize multi-tokenizer support for worker processes
-        fast_api_app.server_args = await init_multi_tokenizer()
-        setup_middlewares(
-            fast_api_app.server_args.api_key, fast_api_app.server_args.enable_metrics
-        )
+        fast_api_app.server_args: ServerArgs = await init_multi_tokenizer()
+
+        # only metrics middleware is supported in multi-tokenizer mode
+        worker_pid = os.getpid()
+        if fast_api_app.server_args.enable_metrics:
+            add_prometheus_middleware(app)
+            enable_func_timer()
+
+            logger.info(f"Worker {worker_pid} added prometheus middleware")
         fast_api_app.warmup_thread = threading.Thread(
             target=_wait_and_warmup,
             args=(
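Note on the tracing block added above: it only runs when server_args.enable_trace is set, and the OTLP endpoint is read from server_args.oltp_traces_endpoint. A minimal sketch of the same initialization driven directly, assuming ServerArgs accepts these fields as constructor arguments (this hunk only shows them being read as attributes):

    # Hypothetical illustration; only enable_trace / oltp_traces_endpoint appear in the hunk above.
    from sglang.srt.server_args import ServerArgs
    from sglang.srt.tracing.trace import process_tracing_init, trace_set_thread_info

    server_args = ServerArgs(
        model_path="meta-llama/Llama-3.1-8B-Instruct",  # any served model
        enable_trace=True,
        oltp_traces_endpoint="http://localhost:4317",   # default OTLP gRPC port
    )
    if server_args.enable_trace:
        process_tracing_init(server_args.oltp_traces_endpoint, "sglang")
        trace_set_thread_info("Tokenizer")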
@@ -229,6 +235,12 @@ async def lifespan(fast_api_app: FastAPI):
     fast_api_app.state.openai_serving_rerank = OpenAIServingRerank(
         _global_state.tokenizer_manager
     )
+    fast_api_app.state.openai_serving_tokenize = OpenAIServingTokenize(
+        _global_state.tokenizer_manager
+    )
+    fast_api_app.state.openai_serving_detokenize = OpenAIServingDetokenize(
+        _global_state.tokenizer_manager
+    )
 
     server_args: ServerArgs = fast_api_app.server_args
 
@@ -299,7 +311,23 @@ app.add_middleware(
 
 @app.exception_handler(HTTPException)
 async def validation_exception_handler(request: Request, exc: HTTPException):
-    """Enrich HTTP exception with status code and other details"""
+    """Enrich HTTP exception with status code and other details.
+
+    For /v1/responses, emit OpenAI-style nested error envelope:
+    {"error": {"message": "...", "type": "...", "param": null, "code": <status>}}
+    """
+    # adjust fmt for responses api
+    if request.url.path.startswith("/v1/responses"):
+        nested_error = {
+            "message": exc.detail,
+            "type": HTTPStatus(exc.status_code).phrase,
+            "param": None,
+            "code": exc.status_code,
+        }
+        return ORJSONResponse(
+            content={"error": nested_error}, status_code=exc.status_code
+        )
+
     error = ErrorResponse(
         object="error",
         message=exc.detail,
@@ -312,7 +340,10 @@ async def validation_exception_handler(request: Request, exc: HTTPException):
 # Custom exception handlers to change validation error status codes
 @app.exception_handler(RequestValidationError)
 async def validation_exception_handler(request: Request, exc: RequestValidationError):
-    """Override FastAPI's default 422 validation error with 400"""
+    """Override FastAPI's default 422 validation error with 400.
+
+    For /v1/responses, emit OpenAI-style nested error envelope; for other endpoints keep legacy format.
+    """
     exc_str = str(exc)
     errors_str = str(exc.errors())
 
@@ -321,6 +352,16 @@
     else:
         message = exc_str
 
+    if request.url.path.startswith("/v1/responses"):
+        # adapt specially, for v1/responses API only (notice the error key is different)
+        nested_error = {
+            "message": message,
+            "type": HTTPStatus.BAD_REQUEST.phrase,
+            "param": None,
+            "code": HTTPStatus.BAD_REQUEST.value,
+        }
+        return ORJSONResponse(status_code=400, content={"error": nested_error})
+
     err = ErrorResponse(
         message=message,
         type=HTTPStatus.BAD_REQUEST.phrase,
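With the two handlers above, errors raised under /v1/responses come back in an OpenAI-style nested envelope, while other endpoints keep the flat ErrorResponse format; validation failures also map to 400 instead of FastAPI's default 422. A rough client-side sketch, assuming a server on localhost:30000 (the port used by the curl examples elsewhere in this file) and an intentionally malformed body:

    import requests

    r = requests.post("http://localhost:30000/v1/responses", json={"input": 123})
    print(r.status_code)  # 400 rather than 422
    print(r.json())
    # Expected shape (message depends on the actual validation failure):
    # {"error": {"message": "...", "type": "Bad Request", "param": None, "code": 400}}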
@@ -465,7 +506,7 @@ async def get_load():
 
 
 # example usage:
-# curl -s -X POST http://localhost:30000/set_internal_state -H "Content-Type: application/json" -d '{"server_args": {"max_micro_batch_size": 8}}'
+# curl -s -X POST http://localhost:30000/set_internal_state -H "Content-Type: application/json" -d '{"server_args": {"pp_max_micro_batch_size": 8}}'
 @app.api_route("/set_internal_state", methods=["POST", "PUT"])
 async def set_internal_state(obj: SetInternalStateReq, request: Request):
     res = await _global_state.tokenizer_manager.set_internal_state(obj)
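The corrected comment reflects the rename of the internal server arg to pp_max_micro_batch_size. The same request issued from Python, assuming a local server on the default port 30000:

    import requests

    resp = requests.post(
        "http://localhost:30000/set_internal_state",
        json={"server_args": {"pp_max_micro_batch_size": 8}},
    )
    print(resp.json())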
@@ -681,6 +722,38 @@ async def update_weights_from_disk(obj: UpdateWeightFromDiskReqInput, request: R
     )
 
 
+@app.post("/init_weights_send_group_for_remote_instance")
+async def init_weights_send_group_for_remote_instance(
+    obj: InitWeightsSendGroupForRemoteInstanceReqInput, request: Request
+):
+    success, message = (
+        await _global_state.tokenizer_manager.init_weights_send_group_for_remote_instance(
+            obj, request
+        )
+    )
+    content = {"success": success, "message": message}
+    if success:
+        return ORJSONResponse(content, status_code=200)
+    else:
+        return ORJSONResponse(content, status_code=HTTPStatus.BAD_REQUEST)
+
+
+@app.post("/send_weights_to_remote_instance")
+async def send_weights_to_remote_instance(
+    obj: SendWeightsToRemoteInstanceReqInput, request: Request
+):
+    success, message = (
+        await _global_state.tokenizer_manager.send_weights_to_remote_instance(
+            obj, request
+        )
+    )
+    content = {"success": success, "message": message}
+    if success:
+        return ORJSONResponse(content, status_code=200)
+    else:
+        return ORJSONResponse(content, status_code=HTTPStatus.BAD_REQUEST)
+
+
 @app.post("/init_weights_update_group")
 async def init_weights_update_group(
     obj: InitWeightsUpdateGroupReqInput, request: Request
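Both remote-instance endpoints added above share one response contract: a JSON body {"success": bool, "message": str}, returned with HTTP 200 on success and 400 otherwise. A hedged client sketch; the request payload fields are defined by InitWeightsSendGroupForRemoteInstanceReqInput and are not visible in this diff, so the body below is a placeholder:

    import requests

    payload = {}  # fill in the fields of InitWeightsSendGroupForRemoteInstanceReqInput (not shown here)
    r = requests.post(
        "http://localhost:30000/init_weights_send_group_for_remote_instance",
        json=payload,
    )
    result = r.json()
    if not result["success"]:
        raise RuntimeError(f"init failed: {result['message']}")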
@@ -696,6 +769,20 @@ async def init_weights_update_group(
         return ORJSONResponse(content, status_code=HTTPStatus.BAD_REQUEST)
 
 
+@app.post("/destroy_weights_update_group")
+async def destroy_weights_update_group(
+    obj: DestroyWeightsUpdateGroupReqInput, request: Request
+):
+    """Destroy the parameter update group."""
+    success, message = (
+        await _global_state.tokenizer_manager.destroy_weights_update_group(obj, request)
+    )
+    content = {"success": success, "message": message}
+    return ORJSONResponse(
+        content, status_code=200 if success else HTTPStatus.BAD_REQUEST
+    )
+
+
 @app.post("/update_weights_from_tensor")
 async def update_weights_from_tensor(
     obj: UpdateWeightsFromTensorReqInput, request: Request
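/destroy_weights_update_group mirrors /init_weights_update_group: it also answers with {"success", "message"} and 200 or 400. A minimal sketch, with the body again depending on DestroyWeightsUpdateGroupReqInput (not shown in this hunk):

    import requests

    r = requests.post("http://localhost:30000/destroy_weights_update_group", json={})
    print(r.status_code, r.json())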
@@ -995,6 +1082,42 @@ async def openai_v1_embeddings(request: EmbeddingRequest, raw_request: Request):
     )
 
 
+@app.post(
+    "/v1/tokenize",
+    response_class=ORJSONResponse,
+    dependencies=[Depends(validate_json_request)],
+)
+@app.post(
+    "/tokenize",
+    response_class=ORJSONResponse,
+    dependencies=[Depends(validate_json_request)],
+    include_in_schema=False,
+)
+async def openai_v1_tokenize(request: TokenizeRequest, raw_request: Request):
+    """OpenAI-compatible tokenization endpoint."""
+    return await raw_request.app.state.openai_serving_tokenize.handle_request(
+        request, raw_request
+    )
+
+
+@app.post(
+    "/v1/detokenize",
+    response_class=ORJSONResponse,
+    dependencies=[Depends(validate_json_request)],
+)
+@app.post(
+    "/detokenize",
+    response_class=ORJSONResponse,
+    dependencies=[Depends(validate_json_request)],
+    include_in_schema=False,
+)
+async def openai_v1_detokenize(request: DetokenizeRequest, raw_request: Request):
+    """OpenAI-compatible detokenization endpoint."""
+    return await raw_request.app.state.openai_serving_detokenize.handle_request(
+        request, raw_request
+    )
+
+
 @app.get("/v1/models", response_class=ORJSONResponse)
 async def available_models():
     """Show available models. OpenAI-compatible endpoint."""
@@ -1168,7 +1291,6 @@ def launch_server(
     2. Inter-process communication is done through IPC (each process uses a different port) via the ZMQ library.
     """
     if server_args.tokenizer_worker_num > 1:
-        setproctitle.setproctitle(f"sglang::http_server/multi_tokenizer_router")
         port_args = PortArgs.init_new(server_args)
         port_args.tokenizer_worker_ipc_name = (
             f"ipc://{tempfile.NamedTemporaryFile(delete=False).name}"
@@ -1177,11 +1299,16 @@
             server_args=server_args, port_args=port_args
         )
     else:
-        setproctitle.setproctitle(f"sglang::http_server/tokenizer_manager")
         tokenizer_manager, template_manager, scheduler_info = _launch_subprocesses(
             server_args=server_args,
         )
 
+    if server_args.enable_trace:
+        process_tracing_init(server_args.oltp_traces_endpoint, "sglang")
+        if server_args.disaggregation_mode == "null":
+            thread_label = "Tokenizer"
+            trace_set_thread_info(thread_label)
+
     set_global_state(
         _GlobalState(
             tokenizer_manager=tokenizer_manager,
@@ -1191,12 +1318,10 @@
     )
 
     if server_args.tokenizer_worker_num > 1:
-        port_args_shm, server_args_shm, scheduler_info_shm = (
-            write_data_for_multi_tokenizer(
-                port_args,
-                server_args,
-                scheduler_info,
-            )
+        multi_tokenizer_args_shm = write_data_for_multi_tokenizer(
+            port_args,
+            server_args,
+            scheduler_info,
         )
     else:
         # Add api key authorization
@@ -1233,6 +1358,9 @@
                 "level": "INFO",
                 "propagate": False,
             }
+
+            monkey_patch_uvicorn_multiprocessing()
+
             uvicorn.run(
                 "sglang.srt.entrypoints.http_server:app",
                 host=server_args.host,
@@ -1243,6 +1371,7 @@
                 workers=server_args.tokenizer_worker_num,
             )
         else:
+            app.is_single_tokenizer_mode = True
             uvicorn.run(
                 app,
                 host=server_args.host,
@@ -1253,10 +1382,8 @@
             )
     finally:
         if server_args.tokenizer_worker_num > 1:
-            port_args_shm.unlink()
-            server_args_shm.unlink()
-            scheduler_info_shm.unlink()
-            _global_state.tokenizer_manager.clear_tokenizer_mapping()
+            multi_tokenizer_args_shm.unlink()
+            _global_state.tokenizer_manager.socket_mapping.clear_all_sockets()
         else:
             warmup_thread.join()
 
@@ -1405,13 +1532,5 @@
     if server_args.debug_tensor_dump_input_file:
         kill_process_tree(os.getpid())
 
-    if server_args.pdlb_url is not None:
-        register_disaggregation_server(
-            server_args.disaggregation_mode,
-            server_args.port,
-            server_args.disaggregation_bootstrap_port,
-            server_args.pdlb_url,
-        )
-
     if launch_callback is not None:
         launch_callback()