sglang 0.5.3rc0__py3-none-any.whl → 0.5.4__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (482)
  1. sglang/bench_one_batch.py +54 -37
  2. sglang/bench_one_batch_server.py +340 -34
  3. sglang/bench_serving.py +340 -159
  4. sglang/check_env.py +1 -1
  5. sglang/compile_deep_gemm.py +6 -2
  6. sglang/global_config.py +1 -25
  7. sglang/lang/api.py +6 -0
  8. sglang/lang/backend/runtime_endpoint.py +1 -1
  9. sglang/lang/interpreter.py +1 -0
  10. sglang/lang/ir.py +13 -0
  11. sglang/launch_server.py +9 -2
  12. sglang/profiler.py +20 -3
  13. sglang/srt/_custom_ops.py +1 -1
  14. sglang/srt/batch_invariant_ops/__init__.py +27 -0
  15. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +547 -0
  16. sglang/srt/checkpoint_engine/checkpoint_engine_worker.py +142 -0
  17. sglang/srt/compilation/backend.py +437 -0
  18. sglang/srt/compilation/compilation_config.py +20 -0
  19. sglang/srt/compilation/compilation_counter.py +47 -0
  20. sglang/srt/compilation/compile.py +210 -0
  21. sglang/srt/compilation/compiler_interface.py +503 -0
  22. sglang/srt/compilation/cuda_piecewise_backend.py +228 -0
  23. sglang/srt/compilation/fix_functionalization.py +134 -0
  24. sglang/srt/compilation/fx_utils.py +83 -0
  25. sglang/srt/compilation/inductor_pass.py +140 -0
  26. sglang/srt/compilation/pass_manager.py +66 -0
  27. sglang/srt/compilation/piecewise_context_manager.py +40 -0
  28. sglang/srt/compilation/weak_ref_tensor_jit.py +16 -0
  29. sglang/srt/configs/__init__.py +8 -0
  30. sglang/srt/configs/deepseek_ocr.py +262 -0
  31. sglang/srt/configs/deepseekvl2.py +194 -96
  32. sglang/srt/configs/dots_ocr.py +64 -0
  33. sglang/srt/configs/dots_vlm.py +2 -7
  34. sglang/srt/configs/falcon_h1.py +309 -0
  35. sglang/srt/configs/load_config.py +33 -2
  36. sglang/srt/configs/mamba_utils.py +117 -0
  37. sglang/srt/configs/model_config.py +284 -118
  38. sglang/srt/configs/modelopt_config.py +30 -0
  39. sglang/srt/configs/nemotron_h.py +286 -0
  40. sglang/srt/configs/olmo3.py +105 -0
  41. sglang/srt/configs/points_v15_chat.py +29 -0
  42. sglang/srt/configs/qwen3_next.py +11 -47
  43. sglang/srt/configs/qwen3_omni.py +613 -0
  44. sglang/srt/configs/qwen3_vl.py +576 -0
  45. sglang/srt/connector/remote_instance.py +1 -1
  46. sglang/srt/constrained/base_grammar_backend.py +6 -1
  47. sglang/srt/constrained/llguidance_backend.py +5 -0
  48. sglang/srt/constrained/outlines_backend.py +1 -1
  49. sglang/srt/constrained/outlines_jump_forward.py +1 -1
  50. sglang/srt/constrained/reasoner_grammar_backend.py +9 -6
  51. sglang/srt/constrained/utils.py +12 -0
  52. sglang/srt/constrained/xgrammar_backend.py +26 -15
  53. sglang/srt/debug_utils/dumper.py +10 -3
  54. sglang/srt/disaggregation/ascend/conn.py +2 -2
  55. sglang/srt/disaggregation/ascend/transfer_engine.py +48 -10
  56. sglang/srt/disaggregation/base/conn.py +17 -4
  57. sglang/srt/disaggregation/common/conn.py +268 -98
  58. sglang/srt/disaggregation/decode.py +172 -39
  59. sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
  60. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +25 -16
  61. sglang/srt/disaggregation/fake/conn.py +11 -3
  62. sglang/srt/disaggregation/mooncake/conn.py +203 -555
  63. sglang/srt/disaggregation/nixl/conn.py +217 -63
  64. sglang/srt/disaggregation/prefill.py +113 -270
  65. sglang/srt/disaggregation/utils.py +36 -5
  66. sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
  67. sglang/srt/distributed/device_communicators/custom_all_reduce.py +6 -6
  68. sglang/srt/distributed/device_communicators/pymscclpp.py +2 -2
  69. sglang/srt/distributed/device_communicators/pynccl.py +24 -12
  70. sglang/srt/distributed/device_communicators/pynccl_allocator.py +2 -2
  71. sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
  72. sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
  73. sglang/srt/distributed/naive_distributed.py +5 -4
  74. sglang/srt/distributed/parallel_state.py +203 -97
  75. sglang/srt/elastic_ep/elastic_ep.py +74 -0
  76. sglang/srt/entrypoints/context.py +3 -2
  77. sglang/srt/entrypoints/engine.py +85 -65
  78. sglang/srt/entrypoints/grpc_server.py +632 -305
  79. sglang/srt/entrypoints/harmony_utils.py +2 -2
  80. sglang/srt/entrypoints/http_server.py +169 -17
  81. sglang/srt/entrypoints/http_server_engine.py +1 -7
  82. sglang/srt/entrypoints/openai/protocol.py +327 -34
  83. sglang/srt/entrypoints/openai/serving_base.py +74 -8
  84. sglang/srt/entrypoints/openai/serving_chat.py +202 -118
  85. sglang/srt/entrypoints/openai/serving_classify.py +204 -0
  86. sglang/srt/entrypoints/openai/serving_completions.py +20 -4
  87. sglang/srt/entrypoints/openai/serving_embedding.py +1 -0
  88. sglang/srt/entrypoints/openai/serving_responses.py +47 -2
  89. sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
  90. sglang/srt/environ.py +323 -0
  91. sglang/srt/eplb/eplb_algorithms/__init__.py +18 -1
  92. sglang/srt/eplb/eplb_algorithms/deepseek.py +0 -2
  93. sglang/srt/eplb/eplb_algorithms/elasticity_aware.py +87 -0
  94. sglang/srt/eplb/expert_distribution.py +3 -4
  95. sglang/srt/eplb/expert_location.py +30 -5
  96. sglang/srt/eplb/expert_location_dispatch.py +2 -2
  97. sglang/srt/eplb/expert_location_updater.py +2 -2
  98. sglang/srt/function_call/base_format_detector.py +17 -18
  99. sglang/srt/function_call/function_call_parser.py +21 -16
  100. sglang/srt/function_call/glm4_moe_detector.py +4 -8
  101. sglang/srt/function_call/gpt_oss_detector.py +24 -1
  102. sglang/srt/function_call/json_array_parser.py +61 -0
  103. sglang/srt/function_call/kimik2_detector.py +17 -4
  104. sglang/srt/function_call/utils.py +98 -7
  105. sglang/srt/grpc/compile_proto.py +245 -0
  106. sglang/srt/grpc/grpc_request_manager.py +915 -0
  107. sglang/srt/grpc/health_servicer.py +189 -0
  108. sglang/srt/grpc/scheduler_launcher.py +181 -0
  109. sglang/srt/grpc/sglang_scheduler_pb2.py +81 -68
  110. sglang/srt/grpc/sglang_scheduler_pb2.pyi +124 -61
  111. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +92 -1
  112. sglang/srt/layers/activation.py +11 -7
  113. sglang/srt/layers/attention/aiter_backend.py +17 -18
  114. sglang/srt/layers/attention/ascend_backend.py +125 -10
  115. sglang/srt/layers/attention/attention_registry.py +226 -0
  116. sglang/srt/layers/attention/base_attn_backend.py +32 -4
  117. sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
  118. sglang/srt/layers/attention/double_sparsity_backend.py +2 -2
  119. sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
  120. sglang/srt/layers/attention/fla/chunk.py +0 -1
  121. sglang/srt/layers/attention/fla/chunk_o.py +1 -1
  122. sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +2 -2
  123. sglang/srt/layers/attention/fla/fused_recurrent.py +4 -4
  124. sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +2 -2
  125. sglang/srt/layers/attention/fla/index.py +0 -2
  126. sglang/srt/layers/attention/fla/layernorm_gated.py +50 -32
  127. sglang/srt/layers/attention/fla/utils.py +0 -3
  128. sglang/srt/layers/attention/fla/wy_fast.py +0 -2
  129. sglang/srt/layers/attention/flashattention_backend.py +52 -15
  130. sglang/srt/layers/attention/flashinfer_backend.py +357 -212
  131. sglang/srt/layers/attention/flashinfer_mla_backend.py +31 -33
  132. sglang/srt/layers/attention/flashmla_backend.py +9 -7
  133. sglang/srt/layers/attention/hybrid_attn_backend.py +12 -4
  134. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +236 -133
  135. sglang/srt/layers/attention/intel_amx_backend.py +1 -1
  136. sglang/srt/layers/attention/mamba/causal_conv1d.py +2 -1
  137. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +24 -103
  138. sglang/srt/layers/attention/mamba/mamba.py +514 -1
  139. sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
  140. sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
  141. sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
  142. sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
  143. sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
  144. sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +214 -0
  145. sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +562 -0
  146. sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +646 -0
  147. sglang/srt/layers/attention/mamba/ops/ssd_combined.py +261 -0
  148. sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +264 -0
  149. sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
  150. sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
  151. sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
  152. sglang/srt/layers/attention/nsa/nsa_indexer.py +718 -0
  153. sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
  154. sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
  155. sglang/srt/layers/attention/nsa/transform_index.py +144 -0
  156. sglang/srt/layers/attention/nsa/triton_kernel.py +136 -0
  157. sglang/srt/layers/attention/nsa/utils.py +23 -0
  158. sglang/srt/layers/attention/nsa_backend.py +1201 -0
  159. sglang/srt/layers/attention/tbo_backend.py +6 -6
  160. sglang/srt/layers/attention/torch_flex_backend.py +325 -0
  161. sglang/srt/layers/attention/triton_backend.py +249 -42
  162. sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +2 -2
  163. sglang/srt/layers/attention/triton_ops/extend_attention.py +539 -44
  164. sglang/srt/layers/attention/trtllm_mha_backend.py +7 -9
  165. sglang/srt/layers/attention/trtllm_mla_backend.py +523 -48
  166. sglang/srt/layers/attention/utils.py +11 -7
  167. sglang/srt/layers/attention/vision.py +61 -3
  168. sglang/srt/layers/attention/wave_backend.py +4 -4
  169. sglang/srt/layers/attention/xpu_backend.py +1028 -0
  170. sglang/srt/layers/communicator.py +19 -7
  171. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/compile_utils.py +4 -8
  172. sglang/srt/layers/deep_gemm_wrapper/configurer.py +25 -0
  173. sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/entrypoint.py +3 -3
  174. sglang/srt/layers/dp_attention.py +28 -1
  175. sglang/srt/layers/elementwise.py +3 -1
  176. sglang/srt/layers/layernorm.py +47 -15
  177. sglang/srt/layers/linear.py +30 -5
  178. sglang/srt/layers/logits_processor.py +161 -18
  179. sglang/srt/layers/modelopt_utils.py +11 -0
  180. sglang/srt/layers/moe/cutlass_moe.py +0 -2
  181. sglang/srt/layers/moe/cutlass_w4a8_moe.py +213 -21
  182. sglang/srt/layers/moe/ep_moe/kernels.py +36 -458
  183. sglang/srt/layers/moe/ep_moe/layer.py +243 -448
  184. sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +52 -25
  185. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=192,device_name=NVIDIA_H200,dtype=fp8_w8a8.json +146 -0
  186. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_B200.json +146 -0
  187. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  188. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  189. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
  190. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +17 -5
  191. sglang/srt/layers/moe/fused_moe_triton/layer.py +86 -81
  192. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +18 -42
  193. sglang/srt/layers/moe/moe_runner/deep_gemm.py +304 -0
  194. sglang/srt/layers/moe/moe_runner/runner.py +3 -0
  195. sglang/srt/layers/moe/moe_runner/triton.py +3 -1
  196. sglang/srt/layers/moe/rocm_moe_utils.py +0 -1
  197. sglang/srt/layers/moe/router.py +51 -15
  198. sglang/srt/layers/moe/token_dispatcher/__init__.py +10 -0
  199. sglang/srt/layers/moe/token_dispatcher/base.py +1 -1
  200. sglang/srt/layers/moe/token_dispatcher/deepep.py +177 -106
  201. sglang/srt/layers/moe/token_dispatcher/mooncake.py +386 -0
  202. sglang/srt/layers/moe/token_dispatcher/standard.py +46 -0
  203. sglang/srt/layers/moe/topk.py +3 -2
  204. sglang/srt/layers/moe/utils.py +27 -1
  205. sglang/srt/layers/parameter.py +23 -6
  206. sglang/srt/layers/quantization/__init__.py +2 -53
  207. sglang/srt/layers/quantization/awq.py +183 -6
  208. sglang/srt/layers/quantization/awq_triton.py +29 -0
  209. sglang/srt/layers/quantization/base_config.py +20 -1
  210. sglang/srt/layers/quantization/compressed_tensors/__init__.py +7 -0
  211. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +21 -49
  212. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +421 -70
  213. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +5 -0
  214. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a16_fp8.py +4 -22
  215. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
  216. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +339 -0
  217. sglang/srt/layers/quantization/fp8.py +86 -20
  218. sglang/srt/layers/quantization/fp8_kernel.py +55 -10
  219. sglang/srt/layers/quantization/fp8_utils.py +43 -15
  220. sglang/srt/layers/quantization/fpgemm_fp8.py +2 -3
  221. sglang/srt/layers/quantization/gptq.py +0 -1
  222. sglang/srt/layers/quantization/int8_kernel.py +18 -2
  223. sglang/srt/layers/quantization/marlin_utils.py +12 -0
  224. sglang/srt/layers/quantization/modelopt_quant.py +141 -81
  225. sglang/srt/layers/quantization/mxfp4.py +17 -34
  226. sglang/srt/layers/quantization/petit.py +1 -1
  227. sglang/srt/layers/quantization/quark/quark.py +3 -1
  228. sglang/srt/layers/quantization/quark/quark_moe.py +18 -5
  229. sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +0 -7
  230. sglang/srt/layers/quantization/unquant.py +1 -4
  231. sglang/srt/layers/quantization/utils.py +0 -1
  232. sglang/srt/layers/quantization/w4afp8.py +51 -24
  233. sglang/srt/layers/quantization/w8a8_int8.py +45 -27
  234. sglang/srt/layers/radix_attention.py +59 -9
  235. sglang/srt/layers/rotary_embedding.py +750 -46
  236. sglang/srt/layers/sampler.py +84 -16
  237. sglang/srt/layers/sparse_pooler.py +98 -0
  238. sglang/srt/layers/utils.py +23 -1
  239. sglang/srt/layers/vocab_parallel_embedding.py +4 -1
  240. sglang/srt/lora/backend/base_backend.py +3 -3
  241. sglang/srt/lora/backend/chunked_backend.py +348 -0
  242. sglang/srt/lora/backend/triton_backend.py +9 -4
  243. sglang/srt/lora/eviction_policy.py +139 -0
  244. sglang/srt/lora/lora.py +7 -5
  245. sglang/srt/lora/lora_manager.py +33 -7
  246. sglang/srt/lora/lora_registry.py +1 -1
  247. sglang/srt/lora/mem_pool.py +41 -17
  248. sglang/srt/lora/triton_ops/__init__.py +4 -0
  249. sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
  250. sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +176 -0
  251. sglang/srt/lora/utils.py +7 -5
  252. sglang/srt/managers/cache_controller.py +83 -152
  253. sglang/srt/managers/data_parallel_controller.py +156 -87
  254. sglang/srt/managers/detokenizer_manager.py +51 -24
  255. sglang/srt/managers/io_struct.py +223 -129
  256. sglang/srt/managers/mm_utils.py +49 -10
  257. sglang/srt/managers/multi_tokenizer_mixin.py +83 -98
  258. sglang/srt/managers/multimodal_processor.py +1 -2
  259. sglang/srt/managers/overlap_utils.py +130 -0
  260. sglang/srt/managers/schedule_batch.py +340 -529
  261. sglang/srt/managers/schedule_policy.py +158 -18
  262. sglang/srt/managers/scheduler.py +665 -620
  263. sglang/srt/managers/scheduler_input_blocker.py +1 -1
  264. sglang/srt/managers/scheduler_metrics_mixin.py +150 -131
  265. sglang/srt/managers/scheduler_output_processor_mixin.py +337 -122
  266. sglang/srt/managers/scheduler_pp_mixin.py +341 -0
  267. sglang/srt/managers/scheduler_profiler_mixin.py +62 -15
  268. sglang/srt/managers/scheduler_runtime_checker_mixin.py +217 -0
  269. sglang/srt/managers/scheduler_update_weights_mixin.py +40 -14
  270. sglang/srt/managers/tokenizer_communicator_mixin.py +141 -19
  271. sglang/srt/managers/tokenizer_manager.py +462 -226
  272. sglang/srt/managers/tp_worker.py +217 -156
  273. sglang/srt/managers/utils.py +79 -47
  274. sglang/srt/mem_cache/allocator.py +21 -22
  275. sglang/srt/mem_cache/allocator_ascend.py +42 -28
  276. sglang/srt/mem_cache/base_prefix_cache.py +3 -3
  277. sglang/srt/mem_cache/chunk_cache.py +20 -2
  278. sglang/srt/mem_cache/common.py +480 -0
  279. sglang/srt/mem_cache/evict_policy.py +38 -0
  280. sglang/srt/mem_cache/hicache_storage.py +44 -2
  281. sglang/srt/mem_cache/hiradix_cache.py +134 -34
  282. sglang/srt/mem_cache/mamba_radix_cache.py +993 -0
  283. sglang/srt/mem_cache/memory_pool.py +602 -208
  284. sglang/srt/mem_cache/memory_pool_host.py +134 -183
  285. sglang/srt/mem_cache/multimodal_cache.py +0 -1
  286. sglang/srt/mem_cache/radix_cache.py +263 -78
  287. sglang/srt/mem_cache/radix_cache_cpp.py +29 -21
  288. sglang/srt/mem_cache/storage/__init__.py +10 -0
  289. sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +157 -0
  290. sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +97 -0
  291. sglang/srt/mem_cache/storage/backend_factory.py +223 -0
  292. sglang/srt/mem_cache/storage/eic/eic_storage.py +777 -0
  293. sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
  294. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +0 -1
  295. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +180 -59
  296. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +15 -9
  297. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +217 -26
  298. sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +38 -9
  299. sglang/srt/mem_cache/storage/nixl/nixl_utils.py +1 -1
  300. sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +17 -2
  301. sglang/srt/mem_cache/swa_radix_cache.py +115 -58
  302. sglang/srt/metrics/collector.py +113 -120
  303. sglang/srt/metrics/func_timer.py +3 -8
  304. sglang/srt/metrics/utils.py +8 -1
  305. sglang/srt/model_executor/cpu_graph_runner.py +2 -2
  306. sglang/srt/model_executor/cuda_graph_runner.py +81 -36
  307. sglang/srt/model_executor/forward_batch_info.py +40 -50
  308. sglang/srt/model_executor/model_runner.py +507 -319
  309. sglang/srt/model_executor/npu_graph_runner.py +11 -5
  310. sglang/srt/model_executor/piecewise_cuda_graph_runner.py +539 -0
  311. sglang/srt/model_loader/__init__.py +1 -1
  312. sglang/srt/model_loader/loader.py +438 -37
  313. sglang/srt/model_loader/utils.py +0 -1
  314. sglang/srt/model_loader/weight_utils.py +200 -27
  315. sglang/srt/models/apertus.py +2 -3
  316. sglang/srt/models/arcee.py +2 -2
  317. sglang/srt/models/bailing_moe.py +40 -56
  318. sglang/srt/models/bailing_moe_nextn.py +3 -4
  319. sglang/srt/models/bert.py +1 -1
  320. sglang/srt/models/deepseek_nextn.py +25 -4
  321. sglang/srt/models/deepseek_ocr.py +1516 -0
  322. sglang/srt/models/deepseek_v2.py +793 -235
  323. sglang/srt/models/dots_ocr.py +171 -0
  324. sglang/srt/models/dots_vlm.py +0 -1
  325. sglang/srt/models/dots_vlm_vit.py +1 -1
  326. sglang/srt/models/falcon_h1.py +570 -0
  327. sglang/srt/models/gemma3_causal.py +0 -2
  328. sglang/srt/models/gemma3_mm.py +17 -1
  329. sglang/srt/models/gemma3n_mm.py +2 -3
  330. sglang/srt/models/glm4_moe.py +17 -40
  331. sglang/srt/models/glm4_moe_nextn.py +4 -4
  332. sglang/srt/models/glm4v.py +3 -2
  333. sglang/srt/models/glm4v_moe.py +6 -6
  334. sglang/srt/models/gpt_oss.py +12 -35
  335. sglang/srt/models/grok.py +10 -23
  336. sglang/srt/models/hunyuan.py +2 -7
  337. sglang/srt/models/interns1.py +0 -1
  338. sglang/srt/models/kimi_vl.py +1 -7
  339. sglang/srt/models/kimi_vl_moonvit.py +4 -2
  340. sglang/srt/models/llama.py +6 -2
  341. sglang/srt/models/llama_eagle3.py +1 -1
  342. sglang/srt/models/longcat_flash.py +6 -23
  343. sglang/srt/models/longcat_flash_nextn.py +4 -15
  344. sglang/srt/models/mimo.py +2 -13
  345. sglang/srt/models/mimo_mtp.py +1 -2
  346. sglang/srt/models/minicpmo.py +7 -5
  347. sglang/srt/models/mixtral.py +1 -4
  348. sglang/srt/models/mllama.py +1 -1
  349. sglang/srt/models/mllama4.py +27 -6
  350. sglang/srt/models/nemotron_h.py +511 -0
  351. sglang/srt/models/olmo2.py +31 -4
  352. sglang/srt/models/opt.py +5 -5
  353. sglang/srt/models/phi.py +1 -1
  354. sglang/srt/models/phi4mm.py +1 -1
  355. sglang/srt/models/phimoe.py +0 -1
  356. sglang/srt/models/pixtral.py +0 -3
  357. sglang/srt/models/points_v15_chat.py +186 -0
  358. sglang/srt/models/qwen.py +0 -1
  359. sglang/srt/models/qwen2.py +0 -7
  360. sglang/srt/models/qwen2_5_vl.py +5 -5
  361. sglang/srt/models/qwen2_audio.py +2 -15
  362. sglang/srt/models/qwen2_moe.py +70 -4
  363. sglang/srt/models/qwen2_vl.py +6 -3
  364. sglang/srt/models/qwen3.py +18 -3
  365. sglang/srt/models/qwen3_moe.py +50 -38
  366. sglang/srt/models/qwen3_next.py +43 -21
  367. sglang/srt/models/qwen3_next_mtp.py +3 -4
  368. sglang/srt/models/qwen3_omni_moe.py +661 -0
  369. sglang/srt/models/qwen3_vl.py +791 -0
  370. sglang/srt/models/qwen3_vl_moe.py +343 -0
  371. sglang/srt/models/registry.py +15 -3
  372. sglang/srt/models/roberta.py +55 -3
  373. sglang/srt/models/sarashina2_vision.py +268 -0
  374. sglang/srt/models/solar.py +505 -0
  375. sglang/srt/models/starcoder2.py +357 -0
  376. sglang/srt/models/step3_vl.py +3 -5
  377. sglang/srt/models/torch_native_llama.py +9 -2
  378. sglang/srt/models/utils.py +61 -0
  379. sglang/srt/multimodal/processors/base_processor.py +21 -9
  380. sglang/srt/multimodal/processors/deepseek_ocr.py +37 -0
  381. sglang/srt/multimodal/processors/deepseek_vl_v2.py +0 -3
  382. sglang/srt/multimodal/processors/dots_vlm.py +2 -4
  383. sglang/srt/multimodal/processors/glm4v.py +1 -5
  384. sglang/srt/multimodal/processors/internvl.py +20 -10
  385. sglang/srt/multimodal/processors/janus_pro.py +0 -1
  386. sglang/srt/multimodal/processors/mllama4.py +0 -8
  387. sglang/srt/multimodal/processors/phi4mm.py +0 -1
  388. sglang/srt/multimodal/processors/points_v15_chat.py +52 -0
  389. sglang/srt/multimodal/processors/qwen_vl.py +83 -17
  390. sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
  391. sglang/srt/multimodal/processors/step3_vl.py +1 -1
  392. sglang/srt/parser/conversation.py +41 -0
  393. sglang/srt/parser/jinja_template_utils.py +6 -0
  394. sglang/srt/parser/reasoning_parser.py +0 -1
  395. sglang/srt/sampling/custom_logit_processor.py +77 -2
  396. sglang/srt/sampling/sampling_batch_info.py +36 -23
  397. sglang/srt/sampling/sampling_params.py +75 -0
  398. sglang/srt/server_args.py +1300 -338
  399. sglang/srt/server_args_config_parser.py +146 -0
  400. sglang/srt/single_batch_overlap.py +161 -0
  401. sglang/srt/speculative/base_spec_worker.py +34 -0
  402. sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
  403. sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
  404. sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
  405. sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
  406. sglang/srt/speculative/cpp_ngram/param.h +125 -0
  407. sglang/srt/speculative/cpp_ngram/queue.h +71 -0
  408. sglang/srt/speculative/draft_utils.py +226 -0
  409. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +26 -8
  410. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +26 -3
  411. sglang/srt/speculative/eagle_info.py +786 -0
  412. sglang/srt/speculative/eagle_info_v2.py +458 -0
  413. sglang/srt/speculative/eagle_utils.py +113 -1270
  414. sglang/srt/speculative/eagle_worker.py +120 -285
  415. sglang/srt/speculative/eagle_worker_v2.py +702 -0
  416. sglang/srt/speculative/ngram_info.py +433 -0
  417. sglang/srt/speculative/ngram_worker.py +246 -0
  418. sglang/srt/speculative/spec_info.py +49 -0
  419. sglang/srt/speculative/spec_utils.py +641 -0
  420. sglang/srt/speculative/standalone_worker.py +4 -14
  421. sglang/srt/tokenizer/tiktoken_tokenizer.py +2 -2
  422. sglang/srt/tracing/trace.py +32 -6
  423. sglang/srt/two_batch_overlap.py +35 -18
  424. sglang/srt/utils/__init__.py +2 -0
  425. sglang/srt/{bench_utils.py → utils/bench_utils.py} +4 -2
  426. sglang/srt/{utils.py → utils/common.py} +583 -113
  427. sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +86 -19
  428. sglang/srt/{host_shared_memory.py → utils/host_shared_memory.py} +0 -1
  429. sglang/srt/{offloader.py → utils/offloader.py} +4 -4
  430. sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
  431. sglang/srt/utils/profile_merger.py +199 -0
  432. sglang/srt/utils/rpd_utils.py +452 -0
  433. sglang/srt/utils/slow_rank_detector.py +71 -0
  434. sglang/srt/{torch_memory_saver_adapter.py → utils/torch_memory_saver_adapter.py} +5 -7
  435. sglang/srt/warmup.py +8 -4
  436. sglang/srt/weight_sync/utils.py +1 -1
  437. sglang/test/attention/test_flashattn_backend.py +1 -1
  438. sglang/test/attention/test_flashattn_mla_backend.py +0 -1
  439. sglang/test/attention/test_prefix_chunk_info.py +0 -2
  440. sglang/test/attention/test_trtllm_mla_backend.py +221 -53
  441. sglang/test/few_shot_gsm8k_engine.py +2 -4
  442. sglang/test/get_logits_ut.py +57 -0
  443. sglang/test/kit_matched_stop.py +157 -0
  444. sglang/test/longbench_v2/__init__.py +1 -0
  445. sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
  446. sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
  447. sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
  448. sglang/test/run_eval.py +120 -11
  449. sglang/test/runners.py +3 -1
  450. sglang/test/send_one.py +42 -7
  451. sglang/test/simple_eval_common.py +8 -2
  452. sglang/test/simple_eval_gpqa.py +0 -1
  453. sglang/test/simple_eval_humaneval.py +0 -3
  454. sglang/test/simple_eval_longbench_v2.py +344 -0
  455. sglang/test/simple_eval_mmmu_vlm.py +441 -0
  456. sglang/test/test_block_fp8.py +3 -4
  457. sglang/test/test_block_fp8_deep_gemm_blackwell.py +0 -1
  458. sglang/test/test_cutlass_moe.py +1 -2
  459. sglang/test/test_cutlass_w4a8_moe.py +10 -20
  460. sglang/test/test_deterministic.py +430 -0
  461. sglang/test/test_deterministic_utils.py +73 -0
  462. sglang/test/test_disaggregation_utils.py +93 -1
  463. sglang/test/test_marlin_moe.py +0 -1
  464. sglang/test/test_programs.py +1 -1
  465. sglang/test/test_utils.py +432 -16
  466. sglang/utils.py +10 -1
  467. sglang/version.py +1 -1
  468. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/METADATA +64 -43
  469. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/RECORD +476 -346
  470. sglang/srt/entrypoints/grpc_request_manager.py +0 -580
  471. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +0 -32
  472. sglang/srt/managers/tp_worker_overlap_thread.py +0 -319
  473. sglang/srt/mem_cache/lora_radix_cache.py +0 -421
  474. sglang/srt/speculative/build_eagle_tree.py +0 -427
  475. sglang/test/test_block_fp8_ep.py +0 -358
  476. /sglang/srt/layers/{quantization/deep_gemm_wrapper → deep_gemm_wrapper}/__init__.py +0 -0
  477. /sglang/srt/{remote_instance_weight_loader_utils.py → model_loader/remote_instance_weight_loader_utils.py} +0 -0
  478. /sglang/srt/{aio_rwlock.py → utils/aio_rwlock.py} +0 -0
  479. /sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
  480. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/WHEEL +0 -0
  481. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/licenses/LICENSE +0 -0
  482. {sglang-0.5.3rc0.dist-info → sglang-0.5.4.dist-info}/top_level.txt +0 -0
@@ -3,10 +3,10 @@
3
3
  # Adapted from vLLM: https://github.com/vllm-project/vllm/blob/1b9902806915040ac9b3029f2ab7522ec505afc3/vllm/entrypoints/harmony_utils.py
4
4
  # Slight differences in processing chat messages
5
5
  import datetime
6
- import json
7
6
  from collections.abc import Iterable
8
7
  from typing import Literal, Optional, Union
9
8
 
9
+ import orjson
10
10
  from openai.types.responses import (
11
11
  ResponseOutputItem,
12
12
  ResponseOutputMessage,
@@ -228,7 +228,7 @@ def parse_output_message(message: Message):
228
228
  if len(message.content) != 1:
229
229
  raise ValueError("Invalid number of contents in browser message")
230
230
  content = message.content[0]
231
- browser_call = json.loads(content.text)
231
+ browser_call = orjson.loads(content.text)
232
232
  # TODO: translate to url properly!
233
233
  if recipient == "browser.search":
234
234
  action = ActionSearch(
@@ -19,7 +19,6 @@ This file implements HTTP APIs for the inference engine via fastapi.
19
19
 
20
20
  import asyncio
21
21
  import dataclasses
22
- import json
23
22
  import logging
24
23
  import multiprocessing as multiprocessing
25
24
  import os
@@ -29,8 +28,6 @@ import time
29
28
  from http import HTTPStatus
30
29
  from typing import Any, AsyncIterator, Callable, Dict, List, Optional, Union
31
30
 
32
- import setproctitle
33
-
34
31
  from sglang.srt.tracing.trace import process_tracing_init, trace_set_thread_info
35
32
 
36
33
  # Fix a bug of Python threading
@@ -53,25 +50,34 @@ from sglang.srt.disaggregation.utils import FAKE_BOOTSTRAP_HOST, DisaggregationM
53
50
  from sglang.srt.entrypoints.engine import _launch_subprocesses
54
51
  from sglang.srt.entrypoints.openai.protocol import (
55
52
  ChatCompletionRequest,
53
+ ClassifyRequest,
56
54
  CompletionRequest,
55
+ DetokenizeRequest,
57
56
  EmbeddingRequest,
58
57
  ErrorResponse,
59
58
  ModelCard,
60
59
  ModelList,
61
60
  ResponsesRequest,
62
61
  ScoringRequest,
62
+ TokenizeRequest,
63
63
  V1RerankReqInput,
64
64
  )
65
65
  from sglang.srt.entrypoints.openai.serving_chat import OpenAIServingChat
66
+ from sglang.srt.entrypoints.openai.serving_classify import OpenAIServingClassify
66
67
  from sglang.srt.entrypoints.openai.serving_completions import OpenAIServingCompletion
67
68
  from sglang.srt.entrypoints.openai.serving_embedding import OpenAIServingEmbedding
68
69
  from sglang.srt.entrypoints.openai.serving_rerank import OpenAIServingRerank
69
70
  from sglang.srt.entrypoints.openai.serving_score import OpenAIServingScore
71
+ from sglang.srt.entrypoints.openai.serving_tokenize import (
72
+ OpenAIServingDetokenize,
73
+ OpenAIServingTokenize,
74
+ )
70
75
  from sglang.srt.function_call.function_call_parser import FunctionCallParser
71
76
  from sglang.srt.managers.io_struct import (
72
77
  AbortReq,
73
78
  CloseSessionReqInput,
74
79
  ConfigureLoggingReq,
80
+ DestroyWeightsUpdateGroupReqInput,
75
81
  EmbeddingReqInput,
76
82
  GenerateReqInput,
77
83
  GetWeightsByNameReqInput,
@@ -90,13 +96,14 @@ from sglang.srt.managers.io_struct import (
90
96
  UnloadLoRAAdapterReqInput,
91
97
  UpdateWeightFromDiskReqInput,
92
98
  UpdateWeightsFromDistributedReqInput,
99
+ UpdateWeightsFromIPCReqInput,
93
100
  UpdateWeightsFromTensorReqInput,
94
101
  UpdateWeightVersionReqInput,
95
102
  VertexGenerateReqInput,
96
103
  )
97
104
  from sglang.srt.managers.multi_tokenizer_mixin import (
98
- MultiTokenizerManager,
99
105
  MultiTokenizerRouter,
106
+ TokenizerWorker,
100
107
  get_main_process_id,
101
108
  monkey_patch_uvicorn_multiprocessing,
102
109
  read_from_shared_memory,
@@ -123,14 +130,13 @@ logger = logging.getLogger(__name__)
123
130
  asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
124
131
 
125
132
  HEALTH_CHECK_TIMEOUT = int(os.getenv("SGLANG_HEALTH_CHECK_TIMEOUT", 20))
133
+ WAIT_WEIGHTS_READY_TIMEOUT = int(os.getenv("SGLANG_WAIT_WEIGHTS_READY_TIMEOUT", 120))
126
134
 
127
135
 
128
136
  # Store global states
129
137
  @dataclasses.dataclass
130
138
  class _GlobalState:
131
- tokenizer_manager: Union[
132
- TokenizerManager, MultiTokenizerRouter, MultiTokenizerManager
133
- ]
139
+ tokenizer_manager: Union[TokenizerManager, MultiTokenizerRouter, TokenizerWorker]
134
140
  template_manager: TemplateManager
135
141
  scheduler_info: Dict
136
142
 
@@ -145,15 +151,14 @@ def set_global_state(global_state: _GlobalState):
145
151
 
146
152
  async def init_multi_tokenizer() -> ServerArgs:
147
153
  """Read args information from shm and init tokenizer manager for current process"""
148
- pid = os.getpid()
149
- main_pid = get_main_process_id()
150
- logger.info(f"current worker_id: {pid}, main processID: {main_pid}")
151
154
 
152
155
  # Read configuration from shared memory
156
+ main_pid = get_main_process_id()
153
157
  port_args, server_args, scheduler_info = read_from_shared_memory(
154
158
  f"multi_tokenizer_args_{main_pid}"
155
159
  )
156
160
  server_args: ServerArgs
161
+ port_args: PortArgs
157
162
 
158
163
  # API key authentication is not supported in multi-tokenizer mode
159
164
  assert (
@@ -163,9 +168,13 @@ async def init_multi_tokenizer() -> ServerArgs:
163
168
  port_args.tokenizer_ipc_name = (
164
169
  f"ipc://{tempfile.NamedTemporaryFile(delete=False).name}"
165
170
  )
171
+ logger.info(
172
+ f"Start multi-tokenizer worker process {os.getpid()}, "
173
+ f"ipc_name={port_args.tokenizer_ipc_name}"
174
+ )
166
175
 
167
176
  # Launch multi-tokenizer manager process
168
- tokenizer_manager = MultiTokenizerManager(server_args, port_args)
177
+ tokenizer_manager = TokenizerWorker(server_args, port_args)
169
178
  template_manager = TemplateManager()
170
179
  template_manager.initialize_templates(
171
180
  tokenizer_manager=tokenizer_manager,
@@ -173,8 +182,6 @@ async def init_multi_tokenizer() -> ServerArgs:
173
182
  chat_template=server_args.chat_template,
174
183
  completion_template=server_args.completion_template,
175
184
  )
176
- # Register this tokenizer with the main tokenizer manager
177
- await tokenizer_manager.register_to_main_tokenizer_manager()
178
185
 
179
186
  tokenizer_manager.max_req_input_len = scheduler_info["max_req_input_len"]
180
187
  set_global_state(
@@ -226,12 +233,21 @@ async def lifespan(fast_api_app: FastAPI):
226
233
  fast_api_app.state.openai_serving_embedding = OpenAIServingEmbedding(
227
234
  _global_state.tokenizer_manager, _global_state.template_manager
228
235
  )
236
+ fast_api_app.state.openai_serving_classify = OpenAIServingClassify(
237
+ _global_state.tokenizer_manager, _global_state.template_manager
238
+ )
229
239
  fast_api_app.state.openai_serving_score = OpenAIServingScore(
230
240
  _global_state.tokenizer_manager
231
241
  )
232
242
  fast_api_app.state.openai_serving_rerank = OpenAIServingRerank(
233
243
  _global_state.tokenizer_manager
234
244
  )
245
+ fast_api_app.state.openai_serving_tokenize = OpenAIServingTokenize(
246
+ _global_state.tokenizer_manager
247
+ )
248
+ fast_api_app.state.openai_serving_detokenize = OpenAIServingDetokenize(
249
+ _global_state.tokenizer_manager
250
+ )
235
251
 
236
252
  server_args: ServerArgs = fast_api_app.server_args
237
253
 
@@ -302,7 +318,23 @@ app.add_middleware(
302
318
 
303
319
  @app.exception_handler(HTTPException)
304
320
  async def validation_exception_handler(request: Request, exc: HTTPException):
305
- """Enrich HTTP exception with status code and other details"""
321
+ """Enrich HTTP exception with status code and other details.
322
+
323
+ For /v1/responses, emit OpenAI-style nested error envelope:
324
+ {"error": {"message": "...", "type": "...", "param": null, "code": <status>}}
325
+ """
326
+ # adjust fmt for responses api
327
+ if request.url.path.startswith("/v1/responses"):
328
+ nested_error = {
329
+ "message": exc.detail,
330
+ "type": HTTPStatus(exc.status_code).phrase,
331
+ "param": None,
332
+ "code": exc.status_code,
333
+ }
334
+ return ORJSONResponse(
335
+ content={"error": nested_error}, status_code=exc.status_code
336
+ )
337
+
306
338
  error = ErrorResponse(
307
339
  object="error",
308
340
  message=exc.detail,
@@ -315,7 +347,10 @@ async def validation_exception_handler(request: Request, exc: HTTPException):
315
347
  # Custom exception handlers to change validation error status codes
316
348
  @app.exception_handler(RequestValidationError)
317
349
  async def validation_exception_handler(request: Request, exc: RequestValidationError):
318
- """Override FastAPI's default 422 validation error with 400"""
350
+ """Override FastAPI's default 422 validation error with 400.
351
+
352
+ For /v1/responses, emit OpenAI-style nested error envelope; for other endpoints keep legacy format.
353
+ """
319
354
  exc_str = str(exc)
320
355
  errors_str = str(exc.errors())
321
356
 
@@ -324,6 +359,16 @@ async def validation_exception_handler(request: Request, exc: RequestValidationE
324
359
  else:
325
360
  message = exc_str
326
361
 
362
+ if request.url.path.startswith("/v1/responses"):
363
+ # adapt specially, for v1/responses API only (notice the error key is different)
364
+ nested_error = {
365
+ "message": message,
366
+ "type": HTTPStatus.BAD_REQUEST.phrase,
367
+ "param": None,
368
+ "code": HTTPStatus.BAD_REQUEST.value,
369
+ }
370
+ return ORJSONResponse(status_code=400, content={"error": nested_error})
371
+
327
372
  err = ErrorResponse(
328
373
  message=message,
329
374
  type=HTTPStatus.BAD_REQUEST.phrase,
@@ -468,7 +513,7 @@ async def get_load():
468
513
 
469
514
 
470
515
  # example usage:
471
- # curl -s -X POST http://localhost:30000/set_internal_state -H "Content-Type: application/json" -d '{"server_args": {"max_micro_batch_size": 8}}'
516
+ # curl -s -X POST http://localhost:30000/set_internal_state -H "Content-Type: application/json" -d '{"server_args": {"pp_max_micro_batch_size": 8}}'
472
517
  @app.api_route("/set_internal_state", methods=["POST", "PUT"])
473
518
  async def set_internal_state(obj: SetInternalStateReq, request: Request):
474
519
  res = await _global_state.tokenizer_manager.set_internal_state(obj)
@@ -517,7 +562,7 @@ async def generate_request(obj: GenerateReqInput, request: Request):
517
562
  async def generate_from_file_request(file: UploadFile, request: Request):
518
563
  """Handle a generate request, this is purely to work with input_embeds."""
519
564
  content = await file.read()
520
- input_embeds = json.loads(content.decode("utf-8"))
565
+ input_embeds = orjson.loads(content.decode("utf-8"))
521
566
 
522
567
  obj = GenerateReqInput(
523
568
  input_embeds=input_embeds,
@@ -596,6 +641,7 @@ async def start_profile_async(obj: Optional[ProfileReqInput] = None):
596
641
  with_stack=obj.with_stack,
597
642
  record_shapes=obj.record_shapes,
598
643
  profile_by_stage=obj.profile_by_stage,
644
+ merge_profiles=obj.merge_profiles,
599
645
  )
600
646
  return Response(
601
647
  content="Start profiling.\n",
@@ -731,6 +777,20 @@ async def init_weights_update_group(
731
777
  return ORJSONResponse(content, status_code=HTTPStatus.BAD_REQUEST)
732
778
 
733
779
 
780
+ @app.post("/destroy_weights_update_group")
781
+ async def destroy_weights_update_group(
782
+ obj: DestroyWeightsUpdateGroupReqInput, request: Request
783
+ ):
784
+ """Destroy the parameter update group."""
785
+ success, message = (
786
+ await _global_state.tokenizer_manager.destroy_weights_update_group(obj, request)
787
+ )
788
+ content = {"success": success, "message": message}
789
+ return ORJSONResponse(
790
+ content, status_code=200 if success else HTTPStatus.BAD_REQUEST
791
+ )
792
+
793
+
734
794
  @app.post("/update_weights_from_tensor")
735
795
  async def update_weights_from_tensor(
736
796
  obj: UpdateWeightsFromTensorReqInput, request: Request
@@ -780,6 +840,27 @@ async def update_weights_from_distributed(
780
840
  return ORJSONResponse(content, status_code=HTTPStatus.BAD_REQUEST)
781
841
 
782
842
 
843
+ @app.post("/update_weights_from_ipc")
844
+ async def update_weights_from_ipc(obj: UpdateWeightsFromIPCReqInput, request: Request):
845
+ """Update the weights from IPC (Inter-Process Communication) for checkpoint-engine integration."""
846
+ success, message = await _global_state.tokenizer_manager.update_weights_from_ipc(
847
+ obj, request
848
+ )
849
+
850
+ # Update weight version if provided and weights update was successful
851
+ if success and obj.weight_version is not None:
852
+ _update_weight_version_if_provided(obj.weight_version)
853
+ message += f" Weight version updated to {obj.weight_version}."
854
+
855
+ content = {"success": success, "message": message}
856
+ if success:
857
+ if _global_state.tokenizer_manager.initial_weights_loaded is False:
858
+ _global_state.tokenizer_manager.initial_weights_loaded = True
859
+ return ORJSONResponse(content)
860
+ else:
861
+ return ORJSONResponse(content, status_code=HTTPStatus.BAD_REQUEST)
862
+
863
+
783
864
  @app.post("/update_weight_version")
784
865
  async def update_weight_version(obj: UpdateWeightVersionReqInput, request: Request):
785
866
  """Update the weight version. This operation requires no active requests."""
@@ -1030,6 +1111,54 @@ async def openai_v1_embeddings(request: EmbeddingRequest, raw_request: Request):
1030
1111
  )
1031
1112
 
1032
1113
 
1114
+ @app.post(
1115
+ "/v1/classify",
1116
+ response_class=ORJSONResponse,
1117
+ dependencies=[Depends(validate_json_request)],
1118
+ )
1119
+ async def openai_v1_classify(request: ClassifyRequest, raw_request: Request):
1120
+ """OpenAI-compatible classification endpoint."""
1121
+ return await raw_request.app.state.openai_serving_classify.handle_request(
1122
+ request, raw_request
1123
+ )
1124
+
1125
+
1126
+ @app.post(
1127
+ "/v1/tokenize",
1128
+ response_class=ORJSONResponse,
1129
+ dependencies=[Depends(validate_json_request)],
1130
+ )
1131
+ @app.post(
1132
+ "/tokenize",
1133
+ response_class=ORJSONResponse,
1134
+ dependencies=[Depends(validate_json_request)],
1135
+ include_in_schema=False,
1136
+ )
1137
+ async def openai_v1_tokenize(request: TokenizeRequest, raw_request: Request):
1138
+ """OpenAI-compatible tokenization endpoint."""
1139
+ return await raw_request.app.state.openai_serving_tokenize.handle_request(
1140
+ request, raw_request
1141
+ )
1142
+
1143
+
1144
+ @app.post(
1145
+ "/v1/detokenize",
1146
+ response_class=ORJSONResponse,
1147
+ dependencies=[Depends(validate_json_request)],
1148
+ )
1149
+ @app.post(
1150
+ "/detokenize",
1151
+ response_class=ORJSONResponse,
1152
+ dependencies=[Depends(validate_json_request)],
1153
+ include_in_schema=False,
1154
+ )
1155
+ async def openai_v1_detokenize(request: DetokenizeRequest, raw_request: Request):
1156
+ """OpenAI-compatible detokenization endpoint."""
1157
+ return await raw_request.app.state.openai_serving_detokenize.handle_request(
1158
+ request, raw_request
1159
+ )
1160
+
1161
+
1033
1162
  @app.get("/v1/models", response_class=ORJSONResponse)
1034
1163
  async def available_models():
1035
1164
  """Show available models. OpenAI-compatible endpoint."""
@@ -1424,6 +1553,8 @@ def _wait_and_warmup(
1424
1553
  pipe_finish_writer: Optional[multiprocessing.connection.Connection],
1425
1554
  launch_callback: Optional[Callable[[], None]] = None,
1426
1555
  ):
1556
+ if server_args.checkpoint_engine_wait_weights_before_ready:
1557
+ _wait_weights_ready()
1427
1558
  if not server_args.skip_server_warmup:
1428
1559
  if not _execute_server_warmup(
1429
1560
  server_args,
@@ -1446,3 +1577,24 @@ def _wait_and_warmup(
1446
1577
 
1447
1578
  if launch_callback is not None:
1448
1579
  launch_callback()
1580
+
1581
+
1582
+ def _wait_weights_ready():
1583
+ """Wait for weights to be ready within the specified timeout."""
1584
+ timeout = WAIT_WEIGHTS_READY_TIMEOUT
1585
+ start_time = time.time()
1586
+
1587
+ for _ in range(timeout):
1588
+ if _global_state.tokenizer_manager.initial_weights_loaded:
1589
+ logger.info(
1590
+ f"Weights are ready after {time.time() - start_time:.2f} seconds"
1591
+ )
1592
+ return
1593
+ time.sleep(1)
1594
+
1595
+ # Timeout reached without weights being ready
1596
+ logger.error(
1597
+ f"Weights are not ready after waiting {timeout} seconds. "
1598
+ f"Consider increasing SGLANG_WAIT_WEIGHTS_READY_TIMEOUT environment variable. "
1599
+ f"Current status: initial_weights_loaded={_global_state.tokenizer_manager.initial_weights_loaded}"
1600
+ )
@@ -1,15 +1,9 @@
1
- import copy
2
- import dataclasses
3
1
  import multiprocessing
4
- import pickle
5
- import threading
6
2
  import time
7
- from typing import Any, Dict, List, Optional, Tuple, Union
3
+ from typing import List, Optional, Tuple
8
4
 
9
- import pybase64
10
5
  import requests
11
6
  import torch
12
- import torch.distributed as dist
13
7
 
14
8
  from sglang.srt.entrypoints.EngineBase import EngineBase
15
9
  from sglang.srt.entrypoints.http_server import launch_server