sglang 0.5.3rc2__py3-none-any.whl → 0.5.4.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (419):
  1. sglang/bench_one_batch.py +47 -28
  2. sglang/bench_one_batch_server.py +41 -25
  3. sglang/bench_serving.py +378 -160
  4. sglang/check_env.py +1 -1
  5. sglang/compile_deep_gemm.py +6 -2
  6. sglang/global_config.py +1 -25
  7. sglang/lang/api.py +6 -0
  8. sglang/lang/interpreter.py +1 -0
  9. sglang/lang/ir.py +13 -0
  10. sglang/launch_server.py +10 -15
  11. sglang/profiler.py +18 -1
  12. sglang/srt/_custom_ops.py +1 -1
  13. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +105 -10
  14. sglang/srt/checkpoint_engine/checkpoint_engine_worker.py +142 -0
  15. sglang/srt/compilation/backend.py +437 -0
  16. sglang/srt/compilation/compilation_config.py +20 -0
  17. sglang/srt/compilation/compilation_counter.py +47 -0
  18. sglang/srt/compilation/compile.py +210 -0
  19. sglang/srt/compilation/compiler_interface.py +503 -0
  20. sglang/srt/compilation/cuda_piecewise_backend.py +228 -0
  21. sglang/srt/compilation/fix_functionalization.py +134 -0
  22. sglang/srt/compilation/fx_utils.py +83 -0
  23. sglang/srt/compilation/inductor_pass.py +140 -0
  24. sglang/srt/compilation/pass_manager.py +66 -0
  25. sglang/srt/compilation/piecewise_context_manager.py +40 -0
  26. sglang/srt/compilation/weak_ref_tensor_jit.py +16 -0
  27. sglang/srt/configs/__init__.py +4 -0
  28. sglang/srt/configs/deepseek_ocr.py +262 -0
  29. sglang/srt/configs/deepseekvl2.py +194 -96
  30. sglang/srt/configs/dots_vlm.py +2 -7
  31. sglang/srt/configs/falcon_h1.py +13 -64
  32. sglang/srt/configs/load_config.py +25 -2
  33. sglang/srt/configs/mamba_utils.py +117 -0
  34. sglang/srt/configs/model_config.py +136 -25
  35. sglang/srt/configs/modelopt_config.py +30 -0
  36. sglang/srt/configs/nemotron_h.py +286 -0
  37. sglang/srt/configs/olmo3.py +105 -0
  38. sglang/srt/configs/points_v15_chat.py +29 -0
  39. sglang/srt/configs/qwen3_next.py +11 -47
  40. sglang/srt/configs/qwen3_omni.py +613 -0
  41. sglang/srt/configs/qwen3_vl.py +0 -10
  42. sglang/srt/connector/remote_instance.py +1 -1
  43. sglang/srt/constrained/base_grammar_backend.py +5 -1
  44. sglang/srt/constrained/llguidance_backend.py +5 -0
  45. sglang/srt/constrained/outlines_backend.py +1 -1
  46. sglang/srt/constrained/reasoner_grammar_backend.py +9 -6
  47. sglang/srt/constrained/utils.py +12 -0
  48. sglang/srt/constrained/xgrammar_backend.py +20 -11
  49. sglang/srt/disaggregation/ascend/transfer_engine.py +1 -1
  50. sglang/srt/disaggregation/base/conn.py +17 -4
  51. sglang/srt/disaggregation/common/conn.py +4 -2
  52. sglang/srt/disaggregation/decode.py +123 -31
  53. sglang/srt/disaggregation/decode_kvcache_offload_manager.py +1 -1
  54. sglang/srt/disaggregation/fake/conn.py +11 -3
  55. sglang/srt/disaggregation/mooncake/conn.py +157 -19
  56. sglang/srt/disaggregation/nixl/conn.py +69 -24
  57. sglang/srt/disaggregation/prefill.py +96 -270
  58. sglang/srt/distributed/device_communicators/all_reduce_utils.py +4 -4
  59. sglang/srt/distributed/device_communicators/custom_all_reduce.py +6 -6
  60. sglang/srt/distributed/device_communicators/pymscclpp.py +2 -2
  61. sglang/srt/distributed/device_communicators/pynccl.py +24 -12
  62. sglang/srt/distributed/device_communicators/pynccl_allocator.py +2 -2
  63. sglang/srt/distributed/device_communicators/symm_mem.py +1 -1
  64. sglang/srt/distributed/naive_distributed.py +5 -4
  65. sglang/srt/distributed/parallel_state.py +63 -19
  66. sglang/srt/elastic_ep/elastic_ep.py +74 -0
  67. sglang/srt/entrypoints/context.py +3 -2
  68. sglang/srt/entrypoints/engine.py +83 -80
  69. sglang/srt/entrypoints/grpc_server.py +430 -234
  70. sglang/srt/entrypoints/harmony_utils.py +2 -2
  71. sglang/srt/entrypoints/http_server.py +195 -102
  72. sglang/srt/entrypoints/http_server_engine.py +1 -7
  73. sglang/srt/entrypoints/openai/protocol.py +225 -37
  74. sglang/srt/entrypoints/openai/serving_base.py +49 -2
  75. sglang/srt/entrypoints/openai/serving_chat.py +29 -74
  76. sglang/srt/entrypoints/openai/serving_classify.py +204 -0
  77. sglang/srt/entrypoints/openai/serving_completions.py +15 -1
  78. sglang/srt/entrypoints/openai/serving_responses.py +5 -2
  79. sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
  80. sglang/srt/environ.py +58 -6
  81. sglang/srt/eplb/eplb_algorithms/__init__.py +18 -1
  82. sglang/srt/eplb/eplb_algorithms/deepseek.py +0 -2
  83. sglang/srt/eplb/eplb_algorithms/elasticity_aware.py +87 -0
  84. sglang/srt/eplb/expert_distribution.py +33 -4
  85. sglang/srt/eplb/expert_location_dispatch.py +2 -2
  86. sglang/srt/eplb/expert_location_updater.py +2 -2
  87. sglang/srt/function_call/base_format_detector.py +17 -18
  88. sglang/srt/function_call/function_call_parser.py +20 -14
  89. sglang/srt/function_call/glm4_moe_detector.py +1 -5
  90. sglang/srt/function_call/gpt_oss_detector.py +1 -1
  91. sglang/srt/function_call/json_array_parser.py +0 -2
  92. sglang/srt/function_call/minimax_m2.py +367 -0
  93. sglang/srt/function_call/utils.py +2 -2
  94. sglang/srt/grpc/compile_proto.py +3 -3
  95. sglang/srt/{entrypoints → grpc}/grpc_request_manager.py +112 -52
  96. sglang/srt/grpc/health_servicer.py +189 -0
  97. sglang/srt/grpc/scheduler_launcher.py +181 -0
  98. sglang/srt/grpc/sglang_scheduler_pb2.py +78 -70
  99. sglang/srt/grpc/sglang_scheduler_pb2.pyi +66 -10
  100. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +89 -1
  101. sglang/srt/layers/activation.py +10 -1
  102. sglang/srt/layers/attention/aiter_backend.py +3 -3
  103. sglang/srt/layers/attention/ascend_backend.py +17 -1
  104. sglang/srt/layers/attention/attention_registry.py +43 -23
  105. sglang/srt/layers/attention/base_attn_backend.py +20 -1
  106. sglang/srt/layers/attention/double_sparsity_backend.py +2 -2
  107. sglang/srt/layers/attention/fla/chunk.py +0 -1
  108. sglang/srt/layers/attention/fla/chunk_o.py +1 -1
  109. sglang/srt/layers/attention/fla/index.py +0 -2
  110. sglang/srt/layers/attention/fla/layernorm_gated.py +50 -32
  111. sglang/srt/layers/attention/fla/utils.py +0 -3
  112. sglang/srt/layers/attention/fla/wy_fast.py +0 -2
  113. sglang/srt/layers/attention/flashattention_backend.py +24 -10
  114. sglang/srt/layers/attention/flashinfer_backend.py +258 -22
  115. sglang/srt/layers/attention/flashinfer_mla_backend.py +38 -28
  116. sglang/srt/layers/attention/flashmla_backend.py +2 -2
  117. sglang/srt/layers/attention/hybrid_attn_backend.py +1 -1
  118. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +165 -62
  119. sglang/srt/layers/attention/intel_amx_backend.py +1 -1
  120. sglang/srt/layers/attention/mamba/causal_conv1d.py +1 -1
  121. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +9 -5
  122. sglang/srt/layers/attention/mamba/mamba.py +189 -241
  123. sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
  124. sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
  125. sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +0 -50
  126. sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +0 -60
  127. sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +0 -111
  128. sglang/srt/layers/attention/mamba/ops/ssd_combined.py +0 -1
  129. sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +0 -11
  130. sglang/srt/layers/attention/npu_ops/mla_preprocess.py +1 -1
  131. sglang/srt/layers/attention/nsa/nsa_indexer.py +40 -83
  132. sglang/srt/layers/attention/nsa/triton_kernel.py +136 -0
  133. sglang/srt/layers/attention/nsa/utils.py +0 -1
  134. sglang/srt/layers/attention/nsa_backend.py +404 -90
  135. sglang/srt/layers/attention/triton_backend.py +208 -34
  136. sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +2 -2
  137. sglang/srt/layers/attention/triton_ops/extend_attention.py +539 -44
  138. sglang/srt/layers/attention/trtllm_mha_backend.py +2 -2
  139. sglang/srt/layers/attention/trtllm_mla_backend.py +362 -43
  140. sglang/srt/layers/attention/utils.py +89 -7
  141. sglang/srt/layers/attention/vision.py +3 -3
  142. sglang/srt/layers/attention/xpu_backend.py +1028 -0
  143. sglang/srt/layers/communicator.py +12 -7
  144. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/compile_utils.py +5 -9
  145. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/configurer.py +4 -3
  146. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/entrypoint.py +3 -3
  147. sglang/srt/layers/dp_attention.py +17 -0
  148. sglang/srt/layers/layernorm.py +64 -19
  149. sglang/srt/layers/linear.py +9 -1
  150. sglang/srt/layers/logits_processor.py +152 -17
  151. sglang/srt/layers/modelopt_utils.py +11 -0
  152. sglang/srt/layers/moe/cutlass_moe.py +0 -2
  153. sglang/srt/layers/moe/cutlass_w4a8_moe.py +351 -21
  154. sglang/srt/layers/moe/ep_moe/kernels.py +229 -457
  155. sglang/srt/layers/moe/ep_moe/layer.py +154 -625
  156. sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +1 -1
  157. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  158. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_B200.json +146 -0
  159. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +11 -3
  160. sglang/srt/layers/moe/fused_moe_triton/layer.py +79 -73
  161. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +25 -46
  162. sglang/srt/layers/moe/moe_runner/deep_gemm.py +569 -0
  163. sglang/srt/layers/moe/moe_runner/runner.py +6 -0
  164. sglang/srt/layers/moe/moe_runner/triton.py +3 -1
  165. sglang/srt/layers/moe/moe_runner/triton_kernels.py +194 -0
  166. sglang/srt/layers/moe/rocm_moe_utils.py +0 -1
  167. sglang/srt/layers/moe/router.py +51 -15
  168. sglang/srt/layers/moe/token_dispatcher/__init__.py +14 -4
  169. sglang/srt/layers/moe/token_dispatcher/base.py +12 -6
  170. sglang/srt/layers/moe/token_dispatcher/deepep.py +127 -110
  171. sglang/srt/layers/moe/token_dispatcher/mooncake.py +386 -0
  172. sglang/srt/layers/moe/token_dispatcher/standard.py +46 -0
  173. sglang/srt/layers/moe/topk.py +7 -6
  174. sglang/srt/layers/moe/utils.py +20 -5
  175. sglang/srt/layers/quantization/__init__.py +5 -58
  176. sglang/srt/layers/quantization/awq.py +183 -9
  177. sglang/srt/layers/quantization/awq_triton.py +29 -0
  178. sglang/srt/layers/quantization/base_config.py +27 -1
  179. sglang/srt/layers/quantization/compressed_tensors/__init__.py +7 -0
  180. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +20 -49
  181. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +421 -70
  182. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +3 -0
  183. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +4 -22
  184. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +339 -0
  185. sglang/srt/layers/quantization/fp8.py +152 -81
  186. sglang/srt/layers/quantization/fp8_kernel.py +55 -10
  187. sglang/srt/layers/quantization/fp8_utils.py +42 -14
  188. sglang/srt/layers/quantization/fpgemm_fp8.py +2 -3
  189. sglang/srt/layers/quantization/gguf.py +566 -0
  190. sglang/srt/layers/quantization/gptq.py +0 -1
  191. sglang/srt/layers/quantization/int8_kernel.py +18 -2
  192. sglang/srt/layers/quantization/marlin_utils.py +12 -0
  193. sglang/srt/layers/quantization/modelopt_quant.py +125 -100
  194. sglang/srt/layers/quantization/mxfp4.py +35 -68
  195. sglang/srt/layers/quantization/petit.py +1 -1
  196. sglang/srt/layers/quantization/quark/quark.py +3 -1
  197. sglang/srt/layers/quantization/quark/quark_moe.py +3 -3
  198. sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +0 -7
  199. sglang/srt/layers/quantization/unquant.py +23 -48
  200. sglang/srt/layers/quantization/utils.py +0 -1
  201. sglang/srt/layers/quantization/w4afp8.py +87 -20
  202. sglang/srt/layers/quantization/w8a8_int8.py +30 -24
  203. sglang/srt/layers/radix_attention.py +62 -9
  204. sglang/srt/layers/rotary_embedding.py +686 -17
  205. sglang/srt/layers/sampler.py +47 -16
  206. sglang/srt/layers/sparse_pooler.py +98 -0
  207. sglang/srt/layers/utils.py +0 -1
  208. sglang/srt/layers/vocab_parallel_embedding.py +4 -1
  209. sglang/srt/lora/backend/triton_backend.py +0 -1
  210. sglang/srt/lora/eviction_policy.py +139 -0
  211. sglang/srt/lora/lora_manager.py +24 -9
  212. sglang/srt/lora/lora_registry.py +1 -1
  213. sglang/srt/lora/mem_pool.py +40 -16
  214. sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +1 -1
  215. sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +4 -2
  216. sglang/srt/managers/cache_controller.py +48 -17
  217. sglang/srt/managers/data_parallel_controller.py +146 -42
  218. sglang/srt/managers/detokenizer_manager.py +40 -13
  219. sglang/srt/managers/io_struct.py +69 -16
  220. sglang/srt/managers/mm_utils.py +20 -18
  221. sglang/srt/managers/multi_tokenizer_mixin.py +83 -82
  222. sglang/srt/managers/overlap_utils.py +96 -19
  223. sglang/srt/managers/schedule_batch.py +241 -511
  224. sglang/srt/managers/schedule_policy.py +15 -2
  225. sglang/srt/managers/scheduler.py +420 -514
  226. sglang/srt/managers/scheduler_metrics_mixin.py +73 -18
  227. sglang/srt/managers/scheduler_output_processor_mixin.py +317 -111
  228. sglang/srt/managers/scheduler_pp_mixin.py +341 -0
  229. sglang/srt/managers/scheduler_profiler_mixin.py +60 -14
  230. sglang/srt/managers/scheduler_runtime_checker_mixin.py +217 -0
  231. sglang/srt/managers/scheduler_update_weights_mixin.py +33 -14
  232. sglang/srt/managers/tokenizer_communicator_mixin.py +71 -55
  233. sglang/srt/managers/tokenizer_manager.py +375 -95
  234. sglang/srt/managers/tp_worker.py +212 -161
  235. sglang/srt/managers/utils.py +78 -2
  236. sglang/srt/mem_cache/allocator.py +7 -2
  237. sglang/srt/mem_cache/allocator_ascend.py +2 -2
  238. sglang/srt/mem_cache/base_prefix_cache.py +2 -2
  239. sglang/srt/mem_cache/chunk_cache.py +13 -2
  240. sglang/srt/mem_cache/common.py +480 -0
  241. sglang/srt/mem_cache/evict_policy.py +16 -1
  242. sglang/srt/mem_cache/hicache_storage.py +11 -2
  243. sglang/srt/mem_cache/hiradix_cache.py +16 -3
  244. sglang/srt/mem_cache/mamba_radix_cache.py +993 -0
  245. sglang/srt/mem_cache/memory_pool.py +517 -219
  246. sglang/srt/mem_cache/memory_pool_host.py +0 -1
  247. sglang/srt/mem_cache/multimodal_cache.py +0 -1
  248. sglang/srt/mem_cache/radix_cache.py +53 -19
  249. sglang/srt/mem_cache/radix_cache_cpp.py +19 -14
  250. sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +8 -2
  251. sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +1 -13
  252. sglang/srt/mem_cache/storage/backend_factory.py +2 -2
  253. sglang/srt/mem_cache/storage/eic/eic_storage.py +5 -6
  254. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +0 -1
  255. sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +3 -2
  256. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +9 -3
  257. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +5 -3
  258. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +101 -17
  259. sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +38 -9
  260. sglang/srt/mem_cache/storage/nixl/nixl_utils.py +1 -1
  261. sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +17 -2
  262. sglang/srt/mem_cache/swa_radix_cache.py +92 -26
  263. sglang/srt/metrics/collector.py +31 -0
  264. sglang/srt/metrics/func_timer.py +1 -1
  265. sglang/srt/model_executor/cuda_graph_runner.py +43 -5
  266. sglang/srt/model_executor/forward_batch_info.py +71 -25
  267. sglang/srt/model_executor/model_runner.py +362 -270
  268. sglang/srt/model_executor/npu_graph_runner.py +2 -3
  269. sglang/srt/model_executor/piecewise_cuda_graph_runner.py +549 -0
  270. sglang/srt/model_loader/__init__.py +1 -1
  271. sglang/srt/model_loader/loader.py +424 -27
  272. sglang/srt/model_loader/utils.py +0 -1
  273. sglang/srt/model_loader/weight_utils.py +47 -28
  274. sglang/srt/models/apertus.py +2 -3
  275. sglang/srt/models/arcee.py +2 -2
  276. sglang/srt/models/bailing_moe.py +13 -52
  277. sglang/srt/models/bailing_moe_nextn.py +3 -4
  278. sglang/srt/models/bert.py +1 -1
  279. sglang/srt/models/deepseek_nextn.py +19 -3
  280. sglang/srt/models/deepseek_ocr.py +1516 -0
  281. sglang/srt/models/deepseek_v2.py +418 -140
  282. sglang/srt/models/dots_ocr.py +0 -2
  283. sglang/srt/models/dots_vlm.py +0 -1
  284. sglang/srt/models/dots_vlm_vit.py +1 -1
  285. sglang/srt/models/falcon_h1.py +13 -19
  286. sglang/srt/models/gemma3_mm.py +16 -0
  287. sglang/srt/models/gemma3n_mm.py +1 -2
  288. sglang/srt/models/glm4_moe.py +327 -382
  289. sglang/srt/models/glm4_moe_nextn.py +6 -16
  290. sglang/srt/models/glm4v.py +2 -1
  291. sglang/srt/models/glm4v_moe.py +32 -199
  292. sglang/srt/models/gpt_oss.py +5 -5
  293. sglang/srt/models/grok.py +10 -23
  294. sglang/srt/models/hunyuan.py +2 -7
  295. sglang/srt/models/interns1.py +0 -1
  296. sglang/srt/models/kimi_vl.py +1 -7
  297. sglang/srt/models/kimi_vl_moonvit.py +3 -1
  298. sglang/srt/models/llama.py +2 -2
  299. sglang/srt/models/llama_eagle3.py +1 -1
  300. sglang/srt/models/longcat_flash.py +5 -22
  301. sglang/srt/models/longcat_flash_nextn.py +3 -14
  302. sglang/srt/models/mimo.py +2 -13
  303. sglang/srt/models/mimo_mtp.py +1 -2
  304. sglang/srt/models/minicpmo.py +7 -5
  305. sglang/srt/models/minimax_m2.py +922 -0
  306. sglang/srt/models/mixtral.py +1 -4
  307. sglang/srt/models/mllama.py +1 -1
  308. sglang/srt/models/mllama4.py +13 -3
  309. sglang/srt/models/nemotron_h.py +511 -0
  310. sglang/srt/models/nvila.py +355 -0
  311. sglang/srt/models/nvila_lite.py +184 -0
  312. sglang/srt/models/olmo2.py +31 -4
  313. sglang/srt/models/opt.py +5 -5
  314. sglang/srt/models/phi.py +1 -1
  315. sglang/srt/models/phi4mm.py +1 -1
  316. sglang/srt/models/phimoe.py +0 -1
  317. sglang/srt/models/pixtral.py +0 -3
  318. sglang/srt/models/points_v15_chat.py +186 -0
  319. sglang/srt/models/qwen.py +0 -1
  320. sglang/srt/models/qwen2.py +22 -1
  321. sglang/srt/models/qwen2_5_vl.py +3 -3
  322. sglang/srt/models/qwen2_audio.py +2 -15
  323. sglang/srt/models/qwen2_moe.py +15 -12
  324. sglang/srt/models/qwen2_vl.py +5 -2
  325. sglang/srt/models/qwen3.py +34 -4
  326. sglang/srt/models/qwen3_moe.py +19 -37
  327. sglang/srt/models/qwen3_next.py +7 -12
  328. sglang/srt/models/qwen3_next_mtp.py +3 -4
  329. sglang/srt/models/qwen3_omni_moe.py +661 -0
  330. sglang/srt/models/qwen3_vl.py +37 -33
  331. sglang/srt/models/qwen3_vl_moe.py +57 -185
  332. sglang/srt/models/roberta.py +55 -3
  333. sglang/srt/models/sarashina2_vision.py +0 -1
  334. sglang/srt/models/step3_vl.py +3 -5
  335. sglang/srt/models/utils.py +11 -1
  336. sglang/srt/multimodal/processors/base_processor.py +7 -2
  337. sglang/srt/multimodal/processors/deepseek_ocr.py +37 -0
  338. sglang/srt/multimodal/processors/deepseek_vl_v2.py +0 -3
  339. sglang/srt/multimodal/processors/dots_vlm.py +0 -1
  340. sglang/srt/multimodal/processors/glm4v.py +2 -6
  341. sglang/srt/multimodal/processors/internvl.py +0 -2
  342. sglang/srt/multimodal/processors/janus_pro.py +0 -1
  343. sglang/srt/multimodal/processors/mllama4.py +0 -8
  344. sglang/srt/multimodal/processors/{vila.py → nvila.py} +32 -24
  345. sglang/srt/multimodal/processors/phi4mm.py +0 -1
  346. sglang/srt/multimodal/processors/points_v15_chat.py +52 -0
  347. sglang/srt/multimodal/processors/qwen_vl.py +75 -16
  348. sglang/srt/multimodal/processors/step3_vl.py +1 -1
  349. sglang/srt/parser/conversation.py +41 -0
  350. sglang/srt/parser/reasoning_parser.py +28 -2
  351. sglang/srt/sampling/custom_logit_processor.py +77 -2
  352. sglang/srt/sampling/sampling_batch_info.py +17 -22
  353. sglang/srt/sampling/sampling_params.py +70 -2
  354. sglang/srt/server_args.py +846 -163
  355. sglang/srt/server_args_config_parser.py +1 -1
  356. sglang/srt/single_batch_overlap.py +36 -31
  357. sglang/srt/speculative/base_spec_worker.py +34 -0
  358. sglang/srt/speculative/draft_utils.py +226 -0
  359. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +24 -7
  360. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +23 -2
  361. sglang/srt/speculative/eagle_info.py +57 -18
  362. sglang/srt/speculative/eagle_info_v2.py +458 -0
  363. sglang/srt/speculative/eagle_utils.py +138 -0
  364. sglang/srt/speculative/eagle_worker.py +83 -280
  365. sglang/srt/speculative/eagle_worker_v2.py +702 -0
  366. sglang/srt/speculative/{ngram_utils.py → ngram_info.py} +14 -9
  367. sglang/srt/speculative/ngram_worker.py +12 -11
  368. sglang/srt/speculative/spec_info.py +2 -0
  369. sglang/srt/speculative/spec_utils.py +38 -3
  370. sglang/srt/speculative/standalone_worker.py +4 -14
  371. sglang/srt/tokenizer/tiktoken_tokenizer.py +2 -2
  372. sglang/srt/two_batch_overlap.py +28 -14
  373. sglang/srt/utils/__init__.py +1 -1
  374. sglang/srt/{bench_utils.py → utils/bench_utils.py} +4 -2
  375. sglang/srt/utils/common.py +272 -82
  376. sglang/srt/utils/hf_transformers_utils.py +44 -17
  377. sglang/srt/{host_shared_memory.py → utils/host_shared_memory.py} +0 -1
  378. sglang/srt/{offloader.py → utils/offloader.py} +4 -4
  379. sglang/srt/utils/profile_merger.py +199 -0
  380. sglang/test/attention/test_flashattn_backend.py +1 -1
  381. sglang/test/attention/test_flashattn_mla_backend.py +0 -1
  382. sglang/test/attention/test_prefix_chunk_info.py +0 -2
  383. sglang/test/attention/test_trtllm_mla_backend.py +221 -53
  384. sglang/test/few_shot_gsm8k_engine.py +2 -4
  385. sglang/test/kit_matched_stop.py +157 -0
  386. sglang/test/longbench_v2/__init__.py +1 -0
  387. sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
  388. sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
  389. sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
  390. sglang/test/run_eval.py +41 -0
  391. sglang/test/runners.py +2 -0
  392. sglang/test/send_one.py +42 -7
  393. sglang/test/simple_eval_common.py +3 -0
  394. sglang/test/simple_eval_gpqa.py +0 -1
  395. sglang/test/simple_eval_humaneval.py +0 -3
  396. sglang/test/simple_eval_longbench_v2.py +344 -0
  397. sglang/test/test_block_fp8.py +1 -2
  398. sglang/test/test_block_fp8_deep_gemm_blackwell.py +0 -1
  399. sglang/test/test_cutlass_moe.py +1 -2
  400. sglang/test/test_cutlass_w4a8_moe.py +10 -20
  401. sglang/test/test_deterministic.py +463 -107
  402. sglang/test/test_deterministic_utils.py +74 -0
  403. sglang/test/test_disaggregation_utils.py +81 -0
  404. sglang/test/test_marlin_moe.py +0 -1
  405. sglang/test/test_utils.py +85 -20
  406. sglang/version.py +1 -1
  407. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/METADATA +48 -35
  408. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/RECORD +414 -350
  409. sglang/srt/layers/attention/mamba/mamba_utils.py +0 -81
  410. sglang/srt/managers/tp_worker_overlap_thread.py +0 -311
  411. sglang/srt/models/vila.py +0 -306
  412. sglang/srt/speculative/build_eagle_tree.py +0 -427
  413. sglang/test/test_block_fp8_ep.py +0 -358
  414. /sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/__init__.py +0 -0
  415. /sglang/srt/{aio_rwlock.py → utils/aio_rwlock.py} +0 -0
  416. /sglang/srt/{torch_memory_saver_adapter.py → utils/torch_memory_saver_adapter.py} +0 -0
  417. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/WHEEL +0 -0
  418. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/licenses/LICENSE +0 -0
  419. {sglang-0.5.3rc2.dist-info → sglang-0.5.4.post1.dist-info}/top_level.txt +0 -0
@@ -3,10 +3,10 @@
3
3
  # Adapted from vLLM: https://github.com/vllm-project/vllm/blob/1b9902806915040ac9b3029f2ab7522ec505afc3/vllm/entrypoints/harmony_utils.py
4
4
  # Slight differences in processing chat messages
5
5
  import datetime
6
- import json
7
6
  from collections.abc import Iterable
8
7
  from typing import Literal, Optional, Union
9
8
 
9
+ import orjson
10
10
  from openai.types.responses import (
11
11
  ResponseOutputItem,
12
12
  ResponseOutputMessage,
@@ -228,7 +228,7 @@ def parse_output_message(message: Message):
228
228
  if len(message.content) != 1:
229
229
  raise ValueError("Invalid number of contents in browser message")
230
230
  content = message.content[0]
231
- browser_call = json.loads(content.text)
231
+ browser_call = orjson.loads(content.text)
232
232
  # TODO: translate to url properly!
233
233
  if recipient == "browser.search":
234
234
  action = ActionSearch(
@@ -19,9 +19,8 @@ This file implements HTTP APIs for the inference engine via fastapi.
19
19
 
20
20
  import asyncio
21
21
  import dataclasses
22
- import json
23
22
  import logging
24
- import multiprocessing as multiprocessing
23
+ import multiprocessing
25
24
  import os
26
25
  import tempfile
27
26
  import threading
@@ -51,20 +50,28 @@ from sglang.srt.disaggregation.utils import FAKE_BOOTSTRAP_HOST, DisaggregationM
51
50
  from sglang.srt.entrypoints.engine import _launch_subprocesses
52
51
  from sglang.srt.entrypoints.openai.protocol import (
53
52
  ChatCompletionRequest,
53
+ ClassifyRequest,
54
54
  CompletionRequest,
55
+ DetokenizeRequest,
55
56
  EmbeddingRequest,
56
57
  ErrorResponse,
57
58
  ModelCard,
58
59
  ModelList,
59
60
  ResponsesRequest,
60
61
  ScoringRequest,
62
+ TokenizeRequest,
61
63
  V1RerankReqInput,
62
64
  )
63
65
  from sglang.srt.entrypoints.openai.serving_chat import OpenAIServingChat
66
+ from sglang.srt.entrypoints.openai.serving_classify import OpenAIServingClassify
64
67
  from sglang.srt.entrypoints.openai.serving_completions import OpenAIServingCompletion
65
68
  from sglang.srt.entrypoints.openai.serving_embedding import OpenAIServingEmbedding
66
69
  from sglang.srt.entrypoints.openai.serving_rerank import OpenAIServingRerank
67
70
  from sglang.srt.entrypoints.openai.serving_score import OpenAIServingScore
71
+ from sglang.srt.entrypoints.openai.serving_tokenize import (
72
+ OpenAIServingDetokenize,
73
+ OpenAIServingTokenize,
74
+ )
68
75
  from sglang.srt.function_call.function_call_parser import FunctionCallParser
69
76
  from sglang.srt.managers.io_struct import (
70
77
  AbortReq,
@@ -89,6 +96,7 @@ from sglang.srt.managers.io_struct import (
89
96
  UnloadLoRAAdapterReqInput,
90
97
  UpdateWeightFromDiskReqInput,
91
98
  UpdateWeightsFromDistributedReqInput,
99
+ UpdateWeightsFromIPCReqInput,
92
100
  UpdateWeightsFromTensorReqInput,
93
101
  UpdateWeightVersionReqInput,
94
102
  VertexGenerateReqInput,
@@ -122,6 +130,7 @@ logger = logging.getLogger(__name__)
122
130
  asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
123
131
 
124
132
  HEALTH_CHECK_TIMEOUT = int(os.getenv("SGLANG_HEALTH_CHECK_TIMEOUT", 20))
133
+ WAIT_WEIGHTS_READY_TIMEOUT = int(os.getenv("SGLANG_WAIT_WEIGHTS_READY_TIMEOUT", 120))
125
134
 
126
135
 
127
136
  # Store global states
@@ -142,24 +151,28 @@ def set_global_state(global_state: _GlobalState):
142
151
 
143
152
  async def init_multi_tokenizer() -> ServerArgs:
144
153
  """Read args information from shm and init tokenizer manager for current process"""
145
- pid = os.getpid()
146
- main_pid = get_main_process_id()
147
- logger.info(f"current worker_id: {pid}, main processID: {main_pid}")
148
154
 
149
155
  # Read configuration from shared memory
156
+ main_pid = get_main_process_id()
150
157
  port_args, server_args, scheduler_info = read_from_shared_memory(
151
158
  f"multi_tokenizer_args_{main_pid}"
152
159
  )
153
160
  server_args: ServerArgs
161
+ port_args: PortArgs
154
162
 
155
163
  # API key authentication is not supported in multi-tokenizer mode
156
164
  assert (
157
165
  server_args.api_key is None
158
166
  ), "API key is not supported in multi-tokenizer mode"
159
167
 
168
+ # Create a new ipc name for the current process
160
169
  port_args.tokenizer_ipc_name = (
161
170
  f"ipc://{tempfile.NamedTemporaryFile(delete=False).name}"
162
171
  )
172
+ logger.info(
173
+ f"Start multi-tokenizer worker process {os.getpid()}, "
174
+ f"ipc_name={port_args.tokenizer_ipc_name}"
175
+ )
163
176
 
164
177
  # Launch multi-tokenizer manager process
165
178
  tokenizer_manager = TokenizerWorker(server_args, port_args)
@@ -170,10 +183,9 @@ async def init_multi_tokenizer() -> ServerArgs:
170
183
  chat_template=server_args.chat_template,
171
184
  completion_template=server_args.completion_template,
172
185
  )
173
- # Register this tokenizer with the main tokenizer manager
174
- await tokenizer_manager.register_to_main_tokenizer_manager()
175
186
 
176
187
  tokenizer_manager.max_req_input_len = scheduler_info["max_req_input_len"]
188
+
177
189
  set_global_state(
178
190
  _GlobalState(
179
191
  tokenizer_manager=tokenizer_manager,
@@ -182,36 +194,35 @@ async def init_multi_tokenizer() -> ServerArgs:
182
194
  )
183
195
  )
184
196
 
185
- if server_args.enable_trace:
186
- process_tracing_init(server_args.oltp_traces_endpoint, "sglang")
187
- if server_args.disaggregation_mode == "null":
188
- thread_label = f"MultiTokenizer-{tokenizer_manager.worker_id}"
189
- trace_set_thread_info(thread_label)
190
-
191
197
  return server_args
192
198
 
193
199
 
194
200
  @asynccontextmanager
195
201
  async def lifespan(fast_api_app: FastAPI):
196
- if not getattr(fast_api_app, "is_single_tokenizer_mode", False):
202
+ if getattr(fast_api_app, "is_single_tokenizer_mode", False):
203
+ server_args = fast_api_app.server_args
204
+ warmup_thread_args = fast_api_app.warmup_thread_args
205
+ thread_label = "Tokenizer"
206
+ else:
197
207
  # Initialize multi-tokenizer support for worker processes
198
- fast_api_app.server_args: ServerArgs = await init_multi_tokenizer()
199
-
200
- # only metrics middleware is supported in multi-tokenizer mode
201
- worker_pid = os.getpid()
202
- if fast_api_app.server_args.enable_metrics:
203
- add_prometheus_middleware(app)
204
- enable_func_timer()
205
-
206
- logger.info(f"Worker {worker_pid} added prometheus middleware")
207
- fast_api_app.warmup_thread = threading.Thread(
208
- target=_wait_and_warmup,
209
- args=(
210
- fast_api_app.server_args,
211
- None, # pipe_finish_writer not needed in worker
212
- None, # launch_callback not needed in worker
213
- ),
208
+ server_args = await init_multi_tokenizer()
209
+ warmup_thread_args = (
210
+ server_args,
211
+ None,
212
+ None,
214
213
  )
214
+ thread_label = f"MultiTokenizer-{_global_state.tokenizer_manager.worker_id}"
215
+
216
+ # Add prometheus middleware
217
+ if server_args.enable_metrics:
218
+ add_prometheus_middleware(app)
219
+ enable_func_timer()
220
+
221
+ # Init tracing
222
+ if server_args.enable_trace:
223
+ process_tracing_init(server_args.oltp_traces_endpoint, "sglang")
224
+ if server_args.disaggregation_mode == "null":
225
+ trace_set_thread_info(thread_label)
215
226
 
216
227
  # Initialize OpenAI serving handlers
217
228
  fast_api_app.state.openai_serving_completion = OpenAIServingCompletion(
@@ -223,15 +234,23 @@ async def lifespan(fast_api_app: FastAPI):
223
234
  fast_api_app.state.openai_serving_embedding = OpenAIServingEmbedding(
224
235
  _global_state.tokenizer_manager, _global_state.template_manager
225
236
  )
237
+ fast_api_app.state.openai_serving_classify = OpenAIServingClassify(
238
+ _global_state.tokenizer_manager, _global_state.template_manager
239
+ )
226
240
  fast_api_app.state.openai_serving_score = OpenAIServingScore(
227
241
  _global_state.tokenizer_manager
228
242
  )
229
243
  fast_api_app.state.openai_serving_rerank = OpenAIServingRerank(
230
244
  _global_state.tokenizer_manager
231
245
  )
246
+ fast_api_app.state.openai_serving_tokenize = OpenAIServingTokenize(
247
+ _global_state.tokenizer_manager
248
+ )
249
+ fast_api_app.state.openai_serving_detokenize = OpenAIServingDetokenize(
250
+ _global_state.tokenizer_manager
251
+ )
232
252
 
233
- server_args: ServerArgs = fast_api_app.server_args
234
-
253
+ # Launch tool server
235
254
  tool_server = None
236
255
  if server_args.tool_server == "demo":
237
256
  from sglang.srt.entrypoints.openai.tool_server import DemoToolServer
@@ -255,12 +274,11 @@ async def lifespan(fast_api_app: FastAPI):
255
274
  enable_force_include_usage=True,
256
275
  tool_server=tool_server,
257
276
  )
258
- except Exception as e:
259
- import traceback
260
-
261
- traceback.print_exc()
262
- logger.warning(f"Can not initialize OpenAIServingResponses, error: {e}")
277
+ except Exception:
278
+ traceback = get_exception_traceback()
279
+ logger.warning(f"Can not initialize OpenAIServingResponses, error: {traceback}")
263
280
 
281
+ # Execute custom warmups
264
282
  if server_args.warmups is not None:
265
283
  await execute_warmups(
266
284
  server_args.disaggregation_mode,
@@ -269,18 +287,18 @@ async def lifespan(fast_api_app: FastAPI):
269
287
  )
270
288
  logger.info("Warmup ended")
271
289
 
272
- warmup_thread = getattr(fast_api_app, "warmup_thread", None)
273
- if warmup_thread is not None:
274
- warmup_thread.start()
290
+ # Execute the general warmup
291
+ warmup_thread = threading.Thread(
292
+ target=_wait_and_warmup,
293
+ args=warmup_thread_args,
294
+ )
295
+ warmup_thread.start()
275
296
 
297
+ # Start the HTTP server
276
298
  try:
277
299
  yield
278
300
  finally:
279
- if server_args.tokenizer_worker_num > 1:
280
- pid = os.getpid()
281
- logger.info(f"uvicorn worker {pid} ending...")
282
- warmup_thread.join()
283
- logger.info(f"uvicorn worker {pid} ended.")
301
+ warmup_thread.join()
284
302
 
285
303
 
286
304
  # Fast API
@@ -480,6 +498,11 @@ async def get_server_info():
480
498
  internal_states: List[Dict[Any, Any]] = (
481
499
  await _global_state.tokenizer_manager.get_internal_state()
482
500
  )
501
+
502
+ # This field is not serializable.
503
+ if hasattr(_global_state.tokenizer_manager.server_args, "model_config"):
504
+ del _global_state.tokenizer_manager.server_args.model_config
505
+
483
506
  return {
484
507
  **dataclasses.asdict(_global_state.tokenizer_manager.server_args),
485
508
  **_global_state.scheduler_info,
@@ -494,7 +517,7 @@ async def get_load():
494
517
 
495
518
 
496
519
  # example usage:
497
- # curl -s -X POST http://localhost:30000/set_internal_state -H "Content-Type: application/json" -d '{"server_args": {"max_micro_batch_size": 8}}'
520
+ # curl -s -X POST http://localhost:30000/set_internal_state -H "Content-Type: application/json" -d '{"server_args": {"pp_max_micro_batch_size": 8}}'
498
521
  @app.api_route("/set_internal_state", methods=["POST", "PUT"])
499
522
  async def set_internal_state(obj: SetInternalStateReq, request: Request):
500
523
  res = await _global_state.tokenizer_manager.set_internal_state(obj)
@@ -543,7 +566,7 @@ async def generate_request(obj: GenerateReqInput, request: Request):
543
566
  async def generate_from_file_request(file: UploadFile, request: Request):
544
567
  """Handle a generate request, this is purely to work with input_embeds."""
545
568
  content = await file.read()
546
- input_embeds = json.loads(content.decode("utf-8"))
569
+ input_embeds = orjson.loads(content.decode("utf-8"))
547
570
 
548
571
  obj = GenerateReqInput(
549
572
  input_embeds=input_embeds,
@@ -622,6 +645,7 @@ async def start_profile_async(obj: Optional[ProfileReqInput] = None):
622
645
  with_stack=obj.with_stack,
623
646
  record_shapes=obj.record_shapes,
624
647
  profile_by_stage=obj.profile_by_stage,
648
+ merge_profiles=obj.merge_profiles,
625
649
  )
626
650
  return Response(
627
651
  content="Start profiling.\n",
@@ -820,6 +844,27 @@ async def update_weights_from_distributed(
820
844
  return ORJSONResponse(content, status_code=HTTPStatus.BAD_REQUEST)
821
845
 
822
846
 
847
+ @app.post("/update_weights_from_ipc")
848
+ async def update_weights_from_ipc(obj: UpdateWeightsFromIPCReqInput, request: Request):
849
+ """Update the weights from IPC (Inter-Process Communication) for checkpoint-engine integration."""
850
+ success, message = await _global_state.tokenizer_manager.update_weights_from_ipc(
851
+ obj, request
852
+ )
853
+
854
+ # Update weight version if provided and weights update was successful
855
+ if success and obj.weight_version is not None:
856
+ _update_weight_version_if_provided(obj.weight_version)
857
+ message += f" Weight version updated to {obj.weight_version}."
858
+
859
+ content = {"success": success, "message": message}
860
+ if success:
861
+ if _global_state.tokenizer_manager.initial_weights_loaded is False:
862
+ _global_state.tokenizer_manager.initial_weights_loaded = True
863
+ return ORJSONResponse(content)
864
+ else:
865
+ return ORJSONResponse(content, status_code=HTTPStatus.BAD_REQUEST)
866
+
867
+
823
868
  @app.post("/update_weight_version")
824
869
  async def update_weight_version(obj: UpdateWeightVersionReqInput, request: Request):
825
870
  """Update the weight version. This operation requires no active requests."""
@@ -1070,6 +1115,54 @@ async def openai_v1_embeddings(request: EmbeddingRequest, raw_request: Request):
1070
1115
  )
1071
1116
 
1072
1117
 
1118
+ @app.post(
1119
+ "/v1/classify",
1120
+ response_class=ORJSONResponse,
1121
+ dependencies=[Depends(validate_json_request)],
1122
+ )
1123
+ async def openai_v1_classify(request: ClassifyRequest, raw_request: Request):
1124
+ """OpenAI-compatible classification endpoint."""
1125
+ return await raw_request.app.state.openai_serving_classify.handle_request(
1126
+ request, raw_request
1127
+ )
1128
+
1129
+
1130
+ @app.post(
1131
+ "/v1/tokenize",
1132
+ response_class=ORJSONResponse,
1133
+ dependencies=[Depends(validate_json_request)],
1134
+ )
1135
+ @app.post(
1136
+ "/tokenize",
1137
+ response_class=ORJSONResponse,
1138
+ dependencies=[Depends(validate_json_request)],
1139
+ include_in_schema=False,
1140
+ )
1141
+ async def openai_v1_tokenize(request: TokenizeRequest, raw_request: Request):
1142
+ """OpenAI-compatible tokenization endpoint."""
1143
+ return await raw_request.app.state.openai_serving_tokenize.handle_request(
1144
+ request, raw_request
1145
+ )
1146
+
1147
+
1148
+ @app.post(
1149
+ "/v1/detokenize",
1150
+ response_class=ORJSONResponse,
1151
+ dependencies=[Depends(validate_json_request)],
1152
+ )
1153
+ @app.post(
1154
+ "/detokenize",
1155
+ response_class=ORJSONResponse,
1156
+ dependencies=[Depends(validate_json_request)],
1157
+ include_in_schema=False,
1158
+ )
1159
+ async def openai_v1_detokenize(request: DetokenizeRequest, raw_request: Request):
1160
+ """OpenAI-compatible detokenization endpoint."""
1161
+ return await raw_request.app.state.openai_serving_detokenize.handle_request(
1162
+ request, raw_request
1163
+ )
1164
+
1165
+
1073
1166
  @app.get("/v1/models", response_class=ORJSONResponse)
1074
1167
  async def available_models():
1075
1168
  """Show available models. OpenAI-compatible endpoint."""
@@ -1239,27 +1332,12 @@ def launch_server(
1239
1332
  3. DetokenizerManager (subprocess): Detokenizes the output tokens and sends the result back to the Tokenizer Manager.
1240
1333
 
1241
1334
  Note:
1242
- 1. The HTTP server, Engine, and TokenizerManager both run in the main process.
1335
+ 1. The HTTP server, Engine, and TokenizerManager all run in the main process.
1243
1336
  2. Inter-process communication is done through IPC (each process uses a different port) via the ZMQ library.
1244
1337
  """
1245
- if server_args.tokenizer_worker_num > 1:
1246
- port_args = PortArgs.init_new(server_args)
1247
- port_args.tokenizer_worker_ipc_name = (
1248
- f"ipc://{tempfile.NamedTemporaryFile(delete=False).name}"
1249
- )
1250
- tokenizer_manager, template_manager, scheduler_info = _launch_subprocesses(
1251
- server_args=server_args, port_args=port_args
1252
- )
1253
- else:
1254
- tokenizer_manager, template_manager, scheduler_info = _launch_subprocesses(
1255
- server_args=server_args,
1256
- )
1257
-
1258
- if server_args.enable_trace:
1259
- process_tracing_init(server_args.oltp_traces_endpoint, "sglang")
1260
- if server_args.disaggregation_mode == "null":
1261
- thread_label = "Tokenizer"
1262
- trace_set_thread_info(thread_label)
1338
+ tokenizer_manager, template_manager, scheduler_info, port_args = (
1339
+ _launch_subprocesses(server_args=server_args)
1340
+ )
1263
1341
 
1264
1342
  set_global_state(
1265
1343
  _GlobalState(
@@ -1269,40 +1347,45 @@ def launch_server(
1269
1347
  )
1270
1348
  )
1271
1349
 
1272
- if server_args.tokenizer_worker_num > 1:
1273
- multi_tokenizer_args_shm = write_data_for_multi_tokenizer(
1274
- port_args,
1350
+ # Pass additional arguments to the lifespan function.
1351
+ # They will be used for additional initialization setups.
1352
+ if server_args.tokenizer_worker_num == 1:
1353
+ # If it is single tokenizer mode, we can pass the arguments by attributes of the app object.
1354
+ app.is_single_tokenizer_mode = True
1355
+ app.server_args = server_args
1356
+ app.warmup_thread_args = (
1275
1357
  server_args,
1276
- scheduler_info,
1358
+ pipe_finish_writer,
1359
+ launch_callback,
1277
1360
  )
1278
- else:
1361
+
1279
1362
  # Add api key authorization
1363
+ # This is only supported in single tokenizer mode.
1280
1364
  if server_args.api_key:
1281
1365
  add_api_key_middleware(app, server_args.api_key)
1282
-
1283
- # Add prometheus middleware
1284
- if server_args.enable_metrics:
1285
- add_prometheus_middleware(app)
1286
- enable_func_timer()
1287
-
1288
- # Send a warmup request - we will create the thread launch it
1289
- # in the lifespan after all other warmups have fired.
1290
- warmup_thread = threading.Thread(
1291
- target=_wait_and_warmup,
1292
- args=(
1293
- server_args,
1294
- pipe_finish_writer,
1295
- launch_callback,
1296
- ),
1366
+ else:
1367
+ # If it is multi-tokenizer mode, we need to write the arguments to shared memory
1368
+ # for other worker processes to read.
1369
+ app.is_single_tokenizer_mode = False
1370
+ multi_tokenizer_args_shm = write_data_for_multi_tokenizer(
1371
+ port_args, server_args, scheduler_info
1297
1372
  )
1298
- app.warmup_thread = warmup_thread
1299
1373
 
1300
1374
  try:
1301
1375
  # Update logging configs
1302
1376
  set_uvicorn_logging_configs()
1303
- app.server_args = server_args
1377
+
1304
1378
  # Listen for HTTP requests
1305
- if server_args.tokenizer_worker_num > 1:
1379
+ if server_args.tokenizer_worker_num == 1:
1380
+ uvicorn.run(
1381
+ app,
1382
+ host=server_args.host,
1383
+ port=server_args.port,
1384
+ log_level=server_args.log_level_http or server_args.log_level,
1385
+ timeout_keep_alive=5,
1386
+ loop="uvloop",
1387
+ )
1388
+ else:
1306
1389
  from uvicorn.config import LOGGING_CONFIG
1307
1390
 
1308
1391
  LOGGING_CONFIG["loggers"]["sglang.srt.entrypoints.http_server"] = {
@@ -1310,7 +1393,6 @@ def launch_server(
1310
1393
  "level": "INFO",
1311
1394
  "propagate": False,
1312
1395
  }
1313
-
1314
1396
  monkey_patch_uvicorn_multiprocessing()
1315
1397
 
1316
1398
  uvicorn.run(
@@ -1322,22 +1404,10 @@ def launch_server(
1322
1404
  loop="uvloop",
1323
1405
  workers=server_args.tokenizer_worker_num,
1324
1406
  )
1325
- else:
1326
- app.is_single_tokenizer_mode = True
1327
- uvicorn.run(
1328
- app,
1329
- host=server_args.host,
1330
- port=server_args.port,
1331
- log_level=server_args.log_level_http or server_args.log_level,
1332
- timeout_keep_alive=5,
1333
- loop="uvloop",
1334
- )
1335
1407
  finally:
1336
1408
  if server_args.tokenizer_worker_num > 1:
1337
1409
  multi_tokenizer_args_shm.unlink()
1338
1410
  _global_state.tokenizer_manager.socket_mapping.clear_all_sockets()
1339
- else:
1340
- warmup_thread.join()
1341
1411
 
1342
1412
 
1343
1413
  def _execute_server_warmup(
@@ -1464,6 +1534,8 @@ def _wait_and_warmup(
1464
1534
  pipe_finish_writer: Optional[multiprocessing.connection.Connection],
1465
1535
  launch_callback: Optional[Callable[[], None]] = None,
1466
1536
  ):
1537
+ if server_args.checkpoint_engine_wait_weights_before_ready:
1538
+ _wait_weights_ready()
1467
1539
  if not server_args.skip_server_warmup:
1468
1540
  if not _execute_server_warmup(
1469
1541
  server_args,
@@ -1486,3 +1558,24 @@ def _wait_and_warmup(
1486
1558
 
1487
1559
  if launch_callback is not None:
1488
1560
  launch_callback()
1561
+
1562
+
1563
+ def _wait_weights_ready():
1564
+ """Wait for weights to be ready within the specified timeout."""
1565
+ timeout = WAIT_WEIGHTS_READY_TIMEOUT
1566
+ start_time = time.time()
1567
+
1568
+ for _ in range(timeout):
1569
+ if _global_state.tokenizer_manager.initial_weights_loaded:
1570
+ logger.info(
1571
+ f"Weights are ready after {time.time() - start_time:.2f} seconds"
1572
+ )
1573
+ return
1574
+ time.sleep(1)
1575
+
1576
+ # Timeout reached without weights being ready
1577
+ logger.error(
1578
+ f"Weights are not ready after waiting {timeout} seconds. "
1579
+ f"Consider increasing SGLANG_WAIT_WEIGHTS_READY_TIMEOUT environment variable. "
1580
+ f"Current status: initial_weights_loaded={_global_state.tokenizer_manager.initial_weights_loaded}"
1581
+ )
@@ -1,15 +1,9 @@
1
- import copy
2
- import dataclasses
3
1
  import multiprocessing
4
- import pickle
5
- import threading
6
2
  import time
7
- from typing import Any, Dict, List, Optional, Tuple, Union
3
+ from typing import List, Optional, Tuple
8
4
 
9
- import pybase64
10
5
  import requests
11
6
  import torch
12
- import torch.distributed as dist
13
7
 
14
8
  from sglang.srt.entrypoints.EngineBase import EngineBase
15
9
  from sglang.srt.entrypoints.http_server import launch_server