sglang 0.5.1.post2__py3-none-any.whl → 0.5.2__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as published to a public registry. It is provided for informational purposes only and reflects the packages exactly as they appear in that registry.
Files changed (256)
  1. sglang/bench_one_batch.py +3 -0
  2. sglang/bench_one_batch_server.py +89 -54
  3. sglang/bench_serving.py +437 -40
  4. sglang/lang/interpreter.py +1 -1
  5. sglang/profiler.py +0 -1
  6. sglang/srt/configs/__init__.py +4 -0
  7. sglang/srt/configs/internvl.py +6 -0
  8. sglang/srt/configs/longcat_flash.py +104 -0
  9. sglang/srt/configs/model_config.py +37 -7
  10. sglang/srt/configs/qwen3_next.py +326 -0
  11. sglang/srt/connector/__init__.py +1 -1
  12. sglang/srt/connector/base_connector.py +1 -2
  13. sglang/srt/connector/redis.py +2 -2
  14. sglang/srt/connector/serde/__init__.py +1 -1
  15. sglang/srt/connector/serde/safe_serde.py +4 -3
  16. sglang/srt/custom_op.py +11 -1
  17. sglang/srt/debug_utils/dump_comparator.py +81 -44
  18. sglang/srt/debug_utils/dump_loader.py +97 -0
  19. sglang/srt/debug_utils/dumper.py +11 -3
  20. sglang/srt/debug_utils/text_comparator.py +73 -11
  21. sglang/srt/disaggregation/ascend/conn.py +75 -0
  22. sglang/srt/disaggregation/base/conn.py +1 -1
  23. sglang/srt/disaggregation/common/conn.py +15 -12
  24. sglang/srt/disaggregation/decode.py +6 -4
  25. sglang/srt/disaggregation/fake/conn.py +1 -1
  26. sglang/srt/disaggregation/mini_lb.py +6 -420
  27. sglang/srt/disaggregation/mooncake/conn.py +18 -10
  28. sglang/srt/disaggregation/nixl/conn.py +180 -16
  29. sglang/srt/disaggregation/prefill.py +6 -4
  30. sglang/srt/disaggregation/utils.py +5 -50
  31. sglang/srt/distributed/parallel_state.py +94 -58
  32. sglang/srt/entrypoints/engine.py +34 -14
  33. sglang/srt/entrypoints/http_server.py +172 -47
  34. sglang/srt/entrypoints/openai/protocol.py +90 -27
  35. sglang/srt/entrypoints/openai/serving_base.py +6 -2
  36. sglang/srt/entrypoints/openai/serving_chat.py +82 -26
  37. sglang/srt/entrypoints/openai/serving_completions.py +25 -4
  38. sglang/srt/entrypoints/openai/serving_embedding.py +8 -4
  39. sglang/srt/entrypoints/openai/serving_responses.py +7 -4
  40. sglang/srt/eplb/eplb_manager.py +28 -4
  41. sglang/srt/eplb/expert_distribution.py +55 -15
  42. sglang/srt/eplb/expert_location.py +8 -3
  43. sglang/srt/eplb/expert_location_updater.py +1 -1
  44. sglang/srt/function_call/deepseekv31_detector.py +222 -0
  45. sglang/srt/function_call/ebnf_composer.py +11 -9
  46. sglang/srt/function_call/function_call_parser.py +2 -0
  47. sglang/srt/function_call/glm4_moe_detector.py +1 -1
  48. sglang/srt/function_call/gpt_oss_detector.py +144 -256
  49. sglang/srt/function_call/qwen3_coder_detector.py +1 -1
  50. sglang/srt/hf_transformers_utils.py +28 -7
  51. sglang/srt/layers/activation.py +44 -9
  52. sglang/srt/layers/attention/aiter_backend.py +93 -68
  53. sglang/srt/layers/attention/ascend_backend.py +381 -136
  54. sglang/srt/layers/attention/fla/chunk.py +242 -0
  55. sglang/srt/layers/attention/fla/chunk_delta_h.py +314 -0
  56. sglang/srt/layers/attention/fla/chunk_o.py +178 -0
  57. sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +151 -0
  58. sglang/srt/layers/attention/fla/cumsum.py +300 -0
  59. sglang/srt/layers/attention/fla/fused_recurrent.py +640 -0
  60. sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +232 -0
  61. sglang/srt/layers/attention/fla/index.py +37 -0
  62. sglang/srt/layers/attention/fla/l2norm.py +150 -0
  63. sglang/srt/layers/attention/fla/layernorm_gated.py +326 -0
  64. sglang/srt/layers/attention/fla/op.py +66 -0
  65. sglang/srt/layers/attention/fla/solve_tril.py +465 -0
  66. sglang/srt/layers/attention/fla/utils.py +331 -0
  67. sglang/srt/layers/attention/fla/wy_fast.py +158 -0
  68. sglang/srt/layers/attention/flashattention_backend.py +241 -7
  69. sglang/srt/layers/attention/flashinfer_backend.py +11 -6
  70. sglang/srt/layers/attention/flashinfer_mla_backend.py +21 -14
  71. sglang/srt/layers/attention/hybrid_attn_backend.py +47 -8
  72. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +584 -0
  73. sglang/srt/layers/attention/intel_amx_backend.py +3 -0
  74. sglang/srt/layers/attention/mamba/causal_conv1d.py +128 -0
  75. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +1052 -0
  76. sglang/srt/layers/attention/mamba/mamba.py +64 -0
  77. sglang/srt/layers/attention/torch_native_backend.py +12 -6
  78. sglang/srt/layers/attention/trtllm_mla_backend.py +126 -36
  79. sglang/srt/layers/attention/wave_ops/decode_attention.py +2 -4
  80. sglang/srt/layers/attention/wave_ops/extend_attention.py +1 -3
  81. sglang/srt/layers/communicator.py +45 -8
  82. sglang/srt/layers/layernorm.py +54 -12
  83. sglang/srt/layers/logits_processor.py +10 -3
  84. sglang/srt/layers/moe/__init__.py +2 -1
  85. sglang/srt/layers/moe/cutlass_moe.py +0 -8
  86. sglang/srt/layers/moe/cutlass_w4a8_moe.py +4 -12
  87. sglang/srt/layers/moe/ep_moe/kernels.py +74 -0
  88. sglang/srt/layers/moe/ep_moe/layer.py +111 -56
  89. sglang/srt/layers/moe/fused_moe_native.py +5 -3
  90. sglang/srt/layers/moe/fused_moe_triton/__init__.py +5 -3
  91. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  92. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=129,N=352,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +146 -0
  93. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/{E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json } +29 -29
  94. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=257,N=64,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  95. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  96. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
  97. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
  98. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  99. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +9 -1049
  100. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +212 -0
  101. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +799 -0
  102. sglang/srt/layers/moe/fused_moe_triton/layer.py +56 -45
  103. sglang/srt/layers/moe/fused_moe_triton/moe_align_block_size.py +87 -0
  104. sglang/srt/layers/moe/moe_runner/__init__.py +2 -1
  105. sglang/srt/layers/moe/moe_runner/base.py +274 -1
  106. sglang/srt/layers/moe/moe_runner/runner.py +80 -0
  107. sglang/srt/layers/moe/moe_runner/triton.py +448 -0
  108. sglang/srt/layers/moe/token_dispatcher/__init__.py +16 -4
  109. sglang/srt/layers/moe/token_dispatcher/{base_dispatcher.py → base.py} +67 -17
  110. sglang/srt/layers/moe/token_dispatcher/deepep.py +41 -38
  111. sglang/srt/layers/moe/token_dispatcher/standard.py +44 -2
  112. sglang/srt/layers/moe/topk.py +43 -12
  113. sglang/srt/layers/moe/utils.py +6 -5
  114. sglang/srt/layers/quantization/awq.py +19 -7
  115. sglang/srt/layers/quantization/base_config.py +11 -6
  116. sglang/srt/layers/quantization/blockwise_int8.py +38 -27
  117. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +50 -30
  118. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +13 -1
  119. sglang/srt/layers/quantization/deep_gemm_wrapper/compile_utils.py +141 -235
  120. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +5 -10
  121. sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +31 -22
  122. sglang/srt/layers/quantization/fp8.py +78 -48
  123. sglang/srt/layers/quantization/fp8_kernel.py +2 -2
  124. sglang/srt/layers/quantization/fp8_utils.py +45 -31
  125. sglang/srt/layers/quantization/gptq.py +25 -17
  126. sglang/srt/layers/quantization/modelopt_quant.py +107 -40
  127. sglang/srt/layers/quantization/moe_wna16.py +21 -18
  128. sglang/srt/layers/quantization/mxfp4.py +93 -68
  129. sglang/srt/layers/quantization/mxfp4_tensor.py +3 -1
  130. sglang/srt/layers/quantization/quark/quark_moe.py +32 -27
  131. sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +49 -30
  132. sglang/srt/layers/quantization/quark/utils.py +97 -0
  133. sglang/srt/layers/quantization/rocm_mxfp4_utils.py +13 -0
  134. sglang/srt/layers/quantization/unquant.py +135 -47
  135. sglang/srt/layers/quantization/utils.py +13 -0
  136. sglang/srt/layers/quantization/w4afp8.py +60 -42
  137. sglang/srt/layers/quantization/w8a8_fp8.py +35 -20
  138. sglang/srt/layers/quantization/w8a8_int8.py +83 -41
  139. sglang/srt/layers/rocm_linear_utils.py +44 -0
  140. sglang/srt/layers/rotary_embedding.py +28 -19
  141. sglang/srt/layers/sampler.py +29 -5
  142. sglang/srt/layers/utils.py +0 -14
  143. sglang/srt/lora/backend/base_backend.py +50 -8
  144. sglang/srt/lora/backend/triton_backend.py +90 -2
  145. sglang/srt/lora/layers.py +32 -0
  146. sglang/srt/lora/lora.py +4 -1
  147. sglang/srt/lora/lora_manager.py +35 -112
  148. sglang/srt/lora/mem_pool.py +24 -10
  149. sglang/srt/lora/utils.py +18 -9
  150. sglang/srt/managers/cache_controller.py +396 -365
  151. sglang/srt/managers/data_parallel_controller.py +30 -15
  152. sglang/srt/managers/detokenizer_manager.py +18 -2
  153. sglang/srt/managers/disagg_service.py +46 -0
  154. sglang/srt/managers/io_struct.py +190 -11
  155. sglang/srt/managers/mm_utils.py +6 -1
  156. sglang/srt/managers/multi_tokenizer_mixin.py +579 -0
  157. sglang/srt/managers/schedule_batch.py +27 -44
  158. sglang/srt/managers/schedule_policy.py +4 -3
  159. sglang/srt/managers/scheduler.py +148 -122
  160. sglang/srt/managers/scheduler_metrics_mixin.py +114 -8
  161. sglang/srt/managers/scheduler_output_processor_mixin.py +29 -19
  162. sglang/srt/managers/scheduler_profiler_mixin.py +1 -1
  163. sglang/srt/managers/scheduler_update_weights_mixin.py +8 -1
  164. sglang/srt/managers/template_manager.py +3 -3
  165. sglang/srt/managers/tokenizer_communicator_mixin.py +491 -0
  166. sglang/srt/managers/tokenizer_manager.py +77 -480
  167. sglang/srt/managers/tp_worker.py +16 -4
  168. sglang/srt/managers/tp_worker_overlap_thread.py +8 -10
  169. sglang/srt/mem_cache/allocator.py +1 -1
  170. sglang/srt/mem_cache/chunk_cache.py +1 -1
  171. sglang/srt/mem_cache/hicache_storage.py +53 -40
  172. sglang/srt/mem_cache/hiradix_cache.py +196 -104
  173. sglang/srt/mem_cache/lora_radix_cache.py +1 -1
  174. sglang/srt/mem_cache/memory_pool.py +395 -53
  175. sglang/srt/mem_cache/memory_pool_host.py +27 -19
  176. sglang/srt/mem_cache/radix_cache.py +6 -6
  177. sglang/srt/mem_cache/radix_cache_cpp.py +1 -1
  178. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +164 -0
  179. sglang/srt/mem_cache/storage/hf3fs/{client_hf3fs.py → hf3fs_usrbio_client.py} +5 -1
  180. sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +61 -34
  181. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +152 -23
  182. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +280 -0
  183. sglang/srt/mem_cache/storage/lmcache/unit_test.py +121 -0
  184. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +154 -95
  185. sglang/srt/mem_cache/storage/mooncake_store/test_mooncake_store.py +161 -0
  186. sglang/srt/mem_cache/swa_radix_cache.py +1 -3
  187. sglang/srt/metrics/collector.py +484 -63
  188. sglang/srt/metrics/startup_func_log_and_timer.py +150 -0
  189. sglang/srt/metrics/utils.py +48 -0
  190. sglang/srt/model_executor/cpu_graph_runner.py +640 -0
  191. sglang/srt/model_executor/cuda_graph_runner.py +13 -5
  192. sglang/srt/model_executor/forward_batch_info.py +72 -18
  193. sglang/srt/model_executor/model_runner.py +190 -32
  194. sglang/srt/model_loader/__init__.py +9 -3
  195. sglang/srt/model_loader/loader.py +33 -28
  196. sglang/srt/model_loader/utils.py +12 -0
  197. sglang/srt/model_loader/weight_utils.py +2 -1
  198. sglang/srt/models/deepseek_v2.py +323 -53
  199. sglang/srt/models/gemma3n_mm.py +1 -1
  200. sglang/srt/models/glm4_moe.py +10 -1
  201. sglang/srt/models/glm4v.py +4 -2
  202. sglang/srt/models/gpt_oss.py +7 -19
  203. sglang/srt/models/internvl.py +28 -0
  204. sglang/srt/models/llama4.py +9 -0
  205. sglang/srt/models/llama_eagle3.py +17 -0
  206. sglang/srt/models/longcat_flash.py +1026 -0
  207. sglang/srt/models/longcat_flash_nextn.py +699 -0
  208. sglang/srt/models/minicpmv.py +165 -3
  209. sglang/srt/models/mllama4.py +25 -0
  210. sglang/srt/models/opt.py +637 -0
  211. sglang/srt/models/qwen2.py +33 -3
  212. sglang/srt/models/qwen2_5_vl.py +91 -42
  213. sglang/srt/models/qwen2_moe.py +79 -14
  214. sglang/srt/models/qwen3.py +8 -2
  215. sglang/srt/models/qwen3_moe.py +39 -8
  216. sglang/srt/models/qwen3_next.py +1039 -0
  217. sglang/srt/models/qwen3_next_mtp.py +109 -0
  218. sglang/srt/models/torch_native_llama.py +1 -1
  219. sglang/srt/models/transformers.py +1 -1
  220. sglang/srt/multimodal/processors/base_processor.py +4 -2
  221. sglang/srt/multimodal/processors/glm4v.py +9 -9
  222. sglang/srt/multimodal/processors/internvl.py +141 -129
  223. sglang/srt/{conversation.py → parser/conversation.py} +38 -5
  224. sglang/srt/parser/harmony_parser.py +588 -0
  225. sglang/srt/parser/reasoning_parser.py +309 -0
  226. sglang/srt/sampling/penaltylib/orchestrator.py +14 -2
  227. sglang/srt/sampling/sampling_batch_info.py +18 -15
  228. sglang/srt/server_args.py +307 -80
  229. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +5 -0
  230. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +10 -1
  231. sglang/srt/speculative/eagle_worker.py +216 -120
  232. sglang/srt/speculative/spec_info.py +5 -0
  233. sglang/srt/speculative/standalone_worker.py +109 -0
  234. sglang/srt/tokenizer/tiktoken_tokenizer.py +6 -1
  235. sglang/srt/utils.py +96 -7
  236. sglang/srt/weight_sync/utils.py +1 -1
  237. sglang/test/attention/test_trtllm_mla_backend.py +181 -8
  238. sglang/test/few_shot_gsm8k.py +1 -0
  239. sglang/test/runners.py +4 -0
  240. sglang/test/test_cutlass_moe.py +24 -6
  241. sglang/test/test_cutlass_w4a8_moe.py +24 -9
  242. sglang/test/test_disaggregation_utils.py +66 -0
  243. sglang/test/test_utils.py +25 -1
  244. sglang/utils.py +5 -0
  245. sglang/version.py +1 -1
  246. {sglang-0.5.1.post2.dist-info → sglang-0.5.2.dist-info}/METADATA +13 -10
  247. {sglang-0.5.1.post2.dist-info → sglang-0.5.2.dist-info}/RECORD +253 -201
  248. sglang/srt/disaggregation/launch_lb.py +0 -131
  249. sglang/srt/mem_cache/storage/mooncake_store/unit_test.py +0 -40
  250. sglang/srt/reasoning_parser.py +0 -553
  251. /sglang/srt/{model_parallel.py → layers/model_parallel.py} +0 -0
  252. /sglang/srt/{code_completion_parser.py → parser/code_completion_parser.py} +0 -0
  253. /sglang/srt/{jinja_template_utils.py → parser/jinja_template_utils.py} +0 -0
  254. {sglang-0.5.1.post2.dist-info → sglang-0.5.2.dist-info}/WHEEL +0 -0
  255. {sglang-0.5.1.post2.dist-info → sglang-0.5.2.dist-info}/licenses/LICENSE +0 -0
  256. {sglang-0.5.1.post2.dist-info → sglang-0.5.2.dist-info}/top_level.txt +0 -0
sglang/srt/entrypoints/engine.py

@@ -60,6 +60,7 @@ from sglang.srt.managers.io_struct import (
     UpdateWeightsFromDistributedReqInput,
     UpdateWeightsFromTensorReqInput,
 )
+from sglang.srt.managers.multi_tokenizer_mixin import MultiTokenizerRouter
 from sglang.srt.managers.scheduler import run_scheduler_process
 from sglang.srt.managers.template_manager import TemplateManager
 from sglang.srt.managers.tokenizer_manager import TokenizerManager
@@ -654,7 +655,8 @@ def _set_envs_and_config(server_args: ServerArgs):
     os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "4"
     os.environ["CUDA_MODULE_LOADING"] = "AUTO"
     # flashinfer uses this environment variable for various kernels from MoE to quant kernels
-    os.environ["TRTLLM_ENABLE_PDL"] = "1"
+    if os.environ.get("TRTLLM_ENABLE_PDL", "1") != "0":
+        os.environ["TRTLLM_ENABLE_PDL"] = "1"

     # Can also be passed as argument
     os.environ["SGLANG_RUN_ID"] = (
@@ -672,7 +674,7 @@ def _set_envs_and_config(server_args: ServerArgs):
     if server_args.attention_backend == "flashinfer":
         assert_pkg_version(
             "flashinfer_python",
-            "0.2.14.post1",
+            "0.3.1",
             "Please uninstall the old version and "
             "reinstall the latest version by following the instructions "
             "at https://docs.flashinfer.ai/installation.html.",
@@ -680,7 +682,7 @@ def _set_envs_and_config(server_args: ServerArgs):
     if _is_cuda and not get_bool_env_var("SGLANG_SKIP_SGL_KERNEL_VERSION_CHECK"):
         assert_pkg_version(
             "sgl-kernel",
-            "0.3.5",
+            "0.3.9.post2",
             "Please reinstall the latest version with `pip install sgl-kernel --force-reinstall`",
         )

@@ -702,6 +704,24 @@ def _set_envs_and_config(server_args: ServerArgs):
     mp.set_start_method("spawn", force=True)


+def _init_tokenizer_manager(
+    server_args: ServerArgs, port_args: PortArgs
+) -> TokenizerManager:
+    # Launch tokenizer process
+    tokenizer_manager = TokenizerManager(server_args, port_args)
+
+    # Initialize templates
+    template_manager = TemplateManager()
+    template_manager.initialize_templates(
+        tokenizer_manager=tokenizer_manager,
+        model_path=server_args.model_path,
+        chat_template=server_args.chat_template,
+        completion_template=server_args.completion_template,
+    )
+
+    return tokenizer_manager, template_manager
+
+
 def _launch_subprocesses(
     server_args: ServerArgs, port_args: Optional[PortArgs] = None
 ) -> Tuple[TokenizerManager, TemplateManager, Dict]:
@@ -815,17 +835,15 @@ def _launch_subprocesses(
     )
     detoken_proc.start()

-    # Launch tokenizer process
-    tokenizer_manager = TokenizerManager(server_args, port_args)
-
-    # Initialize templates
-    template_manager = TemplateManager()
-    template_manager.initialize_templates(
-        tokenizer_manager=tokenizer_manager,
-        model_path=server_args.model_path,
-        chat_template=server_args.chat_template,
-        completion_template=server_args.completion_template,
-    )
+    # Init tokenizer manager first, as the bootstrap server is initialized here
+    if server_args.tokenizer_worker_num > 1:
+        # Launch multi-tokenizer router
+        tokenizer_manager = MultiTokenizerRouter(server_args, port_args)
+        template_manager = None
+    else:
+        tokenizer_manager, template_manager = _init_tokenizer_manager(
+            server_args, port_args
+        )

     # Wait for the model to finish loading
     scheduler_infos = []
@@ -848,5 +866,7 @@ def _launch_subprocesses(

     # Assume all schedulers have the same scheduler_info
     scheduler_info = scheduler_infos[0]
+
     tokenizer_manager.max_req_input_len = scheduler_info["max_req_input_len"]
+
     return tokenizer_manager, template_manager, scheduler_info
sglang/srt/entrypoints/http_server.py

@@ -23,11 +23,14 @@ import json
 import logging
 import multiprocessing as multiprocessing
 import os
+import tempfile
 import threading
 import time
 from http import HTTPStatus
 from typing import Any, AsyncIterator, Callable, Dict, List, Optional

+import setproctitle
+
 # Fix a bug of Python threading
 setattr(threading, "_register_atexit", lambda *args, **kwargs: None)

@@ -44,11 +47,7 @@ from fastapi.exceptions import RequestValidationError
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import ORJSONResponse, Response, StreamingResponse

-from sglang.srt.disaggregation.utils import (
-    FAKE_BOOTSTRAP_HOST,
-    DisaggregationMode,
-    register_disaggregation_server,
-)
+from sglang.srt.disaggregation.utils import FAKE_BOOTSTRAP_HOST, DisaggregationMode
 from sglang.srt.entrypoints.engine import _launch_subprocesses
 from sglang.srt.entrypoints.openai.protocol import (
     ChatCompletionRequest,
@@ -91,11 +90,18 @@ from sglang.srt.managers.io_struct import (
     UpdateWeightVersionReqInput,
     VertexGenerateReqInput,
 )
+from sglang.srt.managers.multi_tokenizer_mixin import (
+    MultiTokenizerManager,
+    get_main_process_id,
+    monkey_patch_uvicorn_multiprocessing,
+    read_from_shared_memory,
+    write_data_for_multi_tokenizer,
+)
 from sglang.srt.managers.template_manager import TemplateManager
 from sglang.srt.managers.tokenizer_manager import ServerStatus, TokenizerManager
 from sglang.srt.metrics.func_timer import enable_func_timer
-from sglang.srt.reasoning_parser import ReasoningParser
-from sglang.srt.server_args import ServerArgs
+from sglang.srt.parser.reasoning_parser import ReasoningParser
+from sglang.srt.server_args import PortArgs, ServerArgs
 from sglang.srt.utils import (
     add_api_key_middleware,
     add_prometheus_middleware,
@@ -130,8 +136,72 @@ def set_global_state(global_state: _GlobalState):
     _global_state = global_state


+async def init_multi_tokenizer() -> ServerArgs:
+    """Read args information from shm and init tokenizer manager for current process"""
+    pid = os.getpid()
+    main_pid = get_main_process_id()
+    logger.info(f"current worker_id: {pid}, main processID: {main_pid}")
+
+    # Read configuration from shared memory
+    port_args, server_args, scheduler_info = read_from_shared_memory(
+        f"multi_tokenizer_args_{main_pid}"
+    )
+    server_args: ServerArgs
+
+    # API key authentication is not supported in multi-tokenizer mode
+    assert (
+        server_args.api_key is None
+    ), "API key is not supported in multi-tokenizer mode"
+
+    port_args.tokenizer_ipc_name = (
+        f"ipc://{tempfile.NamedTemporaryFile(delete=False).name}"
+    )
+
+    # Launch multi-tokenizer manager process
+    tokenizer_manager = MultiTokenizerManager(server_args, port_args)
+    template_manager = TemplateManager()
+    template_manager.initialize_templates(
+        tokenizer_manager=tokenizer_manager,
+        model_path=server_args.model_path,
+        chat_template=server_args.chat_template,
+        completion_template=server_args.completion_template,
+    )
+    # Register this tokenizer with the main tokenizer manager
+    await tokenizer_manager.register_to_main_tokenizer_manager()
+
+    tokenizer_manager.max_req_input_len = scheduler_info["max_req_input_len"]
+    set_global_state(
+        _GlobalState(
+            tokenizer_manager=tokenizer_manager,
+            template_manager=template_manager,
+            scheduler_info=scheduler_info,
+        )
+    )
+    return server_args
+
+
 @asynccontextmanager
 async def lifespan(fast_api_app: FastAPI):
+    if not getattr(fast_api_app, "is_single_tokenizer_mode", False):
+        # Initialize multi-tokenizer support for worker processes
+        fast_api_app.server_args: ServerArgs = await init_multi_tokenizer()
+
+        # only metrics middleware is supported in multi-tokenizer mode
+        worker_pid = os.getpid()
+        if fast_api_app.server_args.enable_metrics:
+            add_prometheus_middleware(app)
+            enable_func_timer()
+
+        logger.info(f"Worker {worker_pid} added prometheus middleware")
+        fast_api_app.warmup_thread = threading.Thread(
+            target=_wait_and_warmup,
+            args=(
+                fast_api_app.server_args,
+                None,  # pipe_finish_writer not needed in worker
+                None,  # launch_callback not needed in worker
+            ),
+        )
+
     # Initialize OpenAI serving handlers
     fast_api_app.state.openai_serving_completion = OpenAIServingCompletion(
         _global_state.tokenizer_manager, _global_state.template_manager
@@ -191,7 +261,15 @@ async def lifespan(fast_api_app: FastAPI):
     warmup_thread = getattr(fast_api_app, "warmup_thread", None)
     if warmup_thread is not None:
         warmup_thread.start()
-    yield
+
+    try:
+        yield
+    finally:
+        if server_args.tokenizer_worker_num > 1:
+            pid = os.getpid()
+            logger.info(f"uvicorn worker {pid} ending...")
+            warmup_thread.join()
+            logger.info(f"uvicorn worker {pid} ended.")


 # Fast API
@@ -480,6 +558,16 @@ async def flush_cache():
     )


+@app.api_route("/clear_hicache_storage_backend", methods=["GET", "POST"])
+async def clear_hicache_storage_backend():
+    """Clear the hierarchical cache storage backend."""
+    ret = await _global_state.tokenizer_manager.clear_hicache_storage()
+    return Response(
+        content="Hierarchical cache storage backend cleared.\n",
+        status_code=200 if ret.success else HTTPStatus.BAD_REQUEST,
+    )
+
+
 @app.api_route("/start_profile", methods=["GET", "POST"])
 async def start_profile_async(obj: Optional[ProfileReqInput] = None):
     """Start profiling."""
@@ -1068,9 +1156,21 @@ def launch_server(
     1. The HTTP server, Engine, and TokenizerManager both run in the main process.
     2. Inter-process communication is done through IPC (each process uses a different port) via the ZMQ library.
     """
-    tokenizer_manager, template_manager, scheduler_info = _launch_subprocesses(
-        server_args=server_args
-    )
+    if server_args.tokenizer_worker_num > 1:
+        setproctitle.setproctitle(f"sglang::http_server/multi_tokenizer_router")
+        port_args = PortArgs.init_new(server_args)
+        port_args.tokenizer_worker_ipc_name = (
+            f"ipc://{tempfile.NamedTemporaryFile(delete=False).name}"
+        )
+        tokenizer_manager, template_manager, scheduler_info = _launch_subprocesses(
+            server_args=server_args, port_args=port_args
+        )
+    else:
+        setproctitle.setproctitle(f"sglang::http_server/tokenizer_manager")
+        tokenizer_manager, template_manager, scheduler_info = _launch_subprocesses(
+            server_args=server_args,
+        )
+
     set_global_state(
         _GlobalState(
             tokenizer_manager=tokenizer_manager,
@@ -1079,42 +1179,75 @@ def launch_server(
         )
     )

-    # Add api key authorization
-    if server_args.api_key:
-        add_api_key_middleware(app, server_args.api_key)
-
-    # Add prometheus middleware
-    if server_args.enable_metrics:
-        add_prometheus_middleware(app)
-        enable_func_timer()
-
-    # Send a warmup request - we will create the thread launch it
-    # in the lifespan after all other warmups have fired.
-    warmup_thread = threading.Thread(
-        target=_wait_and_warmup,
-        args=(
+    if server_args.tokenizer_worker_num > 1:
+        multi_tokenizer_args_shm = write_data_for_multi_tokenizer(
+            port_args,
             server_args,
-            pipe_finish_writer,
-            launch_callback,
-        ),
-    )
-    app.warmup_thread = warmup_thread
+            scheduler_info,
+        )
+    else:
+        # Add api key authorization
+        if server_args.api_key:
+            add_api_key_middleware(app, server_args.api_key)
+
+        # Add prometheus middleware
+        if server_args.enable_metrics:
+            add_prometheus_middleware(app)
+            enable_func_timer()
+
+        # Send a warmup request - we will create the thread launch it
+        # in the lifespan after all other warmups have fired.
+        warmup_thread = threading.Thread(
+            target=_wait_and_warmup,
+            args=(
+                server_args,
+                pipe_finish_writer,
+                launch_callback,
+            ),
+        )
+        app.warmup_thread = warmup_thread

     try:
         # Update logging configs
         set_uvicorn_logging_configs()
         app.server_args = server_args
         # Listen for HTTP requests
-        uvicorn.run(
-            app,
-            host=server_args.host,
-            port=server_args.port,
-            log_level=server_args.log_level_http or server_args.log_level,
-            timeout_keep_alive=5,
-            loop="uvloop",
-        )
+        if server_args.tokenizer_worker_num > 1:
+            from uvicorn.config import LOGGING_CONFIG
+
+            LOGGING_CONFIG["loggers"]["sglang.srt.entrypoints.http_server"] = {
+                "handlers": ["default"],
+                "level": "INFO",
+                "propagate": False,
+            }
+
+            monkey_patch_uvicorn_multiprocessing()
+
+            uvicorn.run(
+                "sglang.srt.entrypoints.http_server:app",
+                host=server_args.host,
+                port=server_args.port,
+                log_level=server_args.log_level_http or server_args.log_level,
+                timeout_keep_alive=5,
+                loop="uvloop",
+                workers=server_args.tokenizer_worker_num,
+            )
+        else:
+            app.is_single_tokenizer_mode = True
+            uvicorn.run(
+                app,
+                host=server_args.host,
+                port=server_args.port,
+                log_level=server_args.log_level_http or server_args.log_level,
+                timeout_keep_alive=5,
+                loop="uvloop",
+            )
     finally:
-        warmup_thread.join()
+        if server_args.tokenizer_worker_num > 1:
+            multi_tokenizer_args_shm.unlink()
+            _global_state.tokenizer_manager.socket_mapping.clear_all_sockets()
+        else:
+            warmup_thread.join()


 def _execute_server_warmup(
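Note: when server_args.tokenizer_worker_num > 1, uvicorn now runs the app from its import string with workers=tokenizer_worker_num, and each worker process builds its own MultiTokenizerManager inside lifespan. A hedged sketch of starting the server in this mode programmatically; the exact ServerArgs fields may differ from what is shown here, and the model path is a placeholder:

    from sglang.srt.entrypoints.http_server import launch_server
    from sglang.srt.server_args import ServerArgs

    if __name__ == "__main__":
        server_args = ServerArgs(
            model_path="meta-llama/Llama-3.1-8B-Instruct",  # placeholder model
            tokenizer_worker_num=4,  # >1 selects the multi-tokenizer code path above
        )
        launch_server(server_args)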
@@ -1261,13 +1394,5 @@ def _wait_and_warmup(
     if server_args.debug_tensor_dump_input_file:
         kill_process_tree(os.getpid())

-    if server_args.pdlb_url is not None:
-        register_disaggregation_server(
-            server_args.disaggregation_mode,
-            server_args.port,
-            server_args.disaggregation_bootstrap_port,
-            server_args.pdlb_url,
-        )
-
     if launch_callback is not None:
         launch_callback()
sglang/srt/entrypoints/openai/protocol.py

@@ -35,6 +35,8 @@ from pydantic import (
 )
 from typing_extensions import Literal

+DEFAULT_MODEL_NAME = "default"
+

 class ModelCard(BaseModel):
     """Model cards."""
@@ -108,6 +110,23 @@ class JsonSchemaResponseFormat(BaseModel):
     strict: Optional[bool] = False


+class ResponseFormat(BaseModel):
+    type: Literal["text", "json_object", "json_schema"]
+    json_schema: Optional[JsonSchemaResponseFormat] = None
+
+
+class StructuresResponseFormat(BaseModel):
+    begin: str
+    schema_: Optional[Dict[str, object]] = Field(alias="schema", default=None)
+    end: str
+
+
+class StructuralTagResponseFormat(BaseModel):
+    type: Literal["structural_tag"]
+    structures: List[StructuresResponseFormat]
+    triggers: List[str]
+
+
 class FileRequest(BaseModel):
     # https://platform.openai.com/docs/api-reference/files/create
     file: bytes  # The File object (not file name) to be uploaded
@@ -166,7 +185,7 @@ class BatchResponse(BaseModel):
 class CompletionRequest(BaseModel):
     # Ordered by official OpenAI API documentation
     # https://platform.openai.com/docs/api-reference/completions/create
-    model: str
+    model: str = DEFAULT_MODEL_NAME
     prompt: Union[List[int], List[List[int]], str, List[str]]
     best_of: Optional[int] = None
     echo: bool = False
@@ -200,6 +219,7 @@ class CompletionRequest(BaseModel):
     skip_special_tokens: bool = True
     lora_path: Optional[Union[List[Optional[str]], Optional[str]]] = None
     session_params: Optional[Dict] = None
+    response_format: Optional[Union[ResponseFormat, StructuralTagResponseFormat]] = None

     # For PD disaggregation
     bootstrap_host: Optional[Union[List[str], str]] = None
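Note: CompletionRequest now carries the same response_format union as chat requests, and model falls back to DEFAULT_MODEL_NAME when omitted. A hedged request sketch against a running server (URL and schema are illustrative only):

    import requests

    payload = {
        # "model" may be omitted; it defaults to DEFAULT_MODEL_NAME ("default")
        "prompt": "Describe the user Alice, aged 30, as JSON: ",
        "max_tokens": 64,
        "response_format": {
            "type": "json_schema",
            "json_schema": {
                "name": "User",
                "schema": {
                    "type": "object",
                    "properties": {"name": {"type": "string"}, "age": {"type": "integer"}},
                    "required": ["name", "age"],
                },
            },
        },
    }
    resp = requests.post("http://127.0.0.1:30000/v1/completions", json=payload)
    print(resp.json()["choices"][0]["text"])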
@@ -327,7 +347,7 @@ class ToolCall(BaseModel):


 class ChatCompletionMessageGenericParam(BaseModel):
-    role: Literal["system", "assistant", "tool"]
+    role: Literal["system", "assistant", "tool", "function"]
     content: Union[str, List[ChatCompletionMessageContentTextPart], None] = Field(
         default=None
     )
@@ -341,9 +361,9 @@ class ChatCompletionMessageGenericParam(BaseModel):
     def _normalize_role(cls, v):
         if isinstance(v, str):
             v_lower = v.lower()
-            if v_lower not in {"system", "assistant", "tool"}:
+            if v_lower not in {"system", "assistant", "tool", "function"}:
                 raise ValueError(
-                    "'role' must be one of 'system', 'assistant', or 'tool' (case-insensitive)."
+                    "'role' must be one of 'system', 'assistant', 'tool', or 'function' (case-insensitive)."
                 )
             return v_lower
         raise ValueError("'role' must be a string")
@@ -359,23 +379,6 @@ ChatCompletionMessageParam = Union[
 ]


-class ResponseFormat(BaseModel):
-    type: Literal["text", "json_object", "json_schema"]
-    json_schema: Optional[JsonSchemaResponseFormat] = None
-
-
-class StructuresResponseFormat(BaseModel):
-    begin: str
-    schema_: Optional[Dict[str, object]] = Field(alias="schema", default=None)
-    end: str
-
-
-class StructuralTagResponseFormat(BaseModel):
-    type: Literal["structural_tag"]
-    structures: List[StructuresResponseFormat]
-    triggers: List[str]
-
-
 class Function(BaseModel):
     """Function descriptions."""

@@ -409,7 +412,7 @@ class ChatCompletionRequest(BaseModel):
     # Ordered by official OpenAI API documentation
     # https://platform.openai.com/docs/api-reference/chat/create
     messages: List[ChatCompletionMessageParam]
-    model: str
+    model: str = DEFAULT_MODEL_NAME
     frequency_penalty: float = 0.0
     logit_bias: Optional[Dict[str, float]] = None
     logprobs: bool = False
457
460
  values["tool_choice"] = "auto"
458
461
  return values
459
462
 
463
+ @model_validator(mode="before")
464
+ @classmethod
465
+ def normalize_reasoning_inputs(cls, values: Dict):
466
+ r = values.get("reasoning")
467
+ if r is None:
468
+ return values
469
+
470
+ if isinstance(r, dict):
471
+ effort = r.get("effort") or r.get("reasoning_effort")
472
+ if effort in {"low", "medium", "high"}:
473
+ values["reasoning_effort"] = effort
474
+
475
+ enabled = (
476
+ r.get("enabled")
477
+ if r.get("enabled") is not None
478
+ else r.get("enable", False)
479
+ )
480
+ if isinstance(enabled, str):
481
+ enabled = enabled.strip().lower() in {"1", "true", "yes", "y", "on"}
482
+ if enabled:
483
+ ctk = values.get("chat_template_kwargs")
484
+ if not isinstance(ctk, dict):
485
+ ctk = {}
486
+ ctk.setdefault("thinking", True)
487
+ values["chat_template_kwargs"] = ctk
488
+
489
+ return values
490
+
491
+ @model_validator(mode="before")
492
+ @classmethod
493
+ def set_json_schema(cls, values):
494
+ response_format = values.get("response_format")
495
+ if not response_format:
496
+ return values
497
+
498
+ if response_format.get("type") != "json_schema":
499
+ return values
500
+
501
+ schema = response_format.pop("schema", None)
502
+ json_schema = response_format.get("json_schema")
503
+
504
+ if json_schema:
505
+ return values
506
+
507
+ if schema:
508
+ name_ = schema.get("title", "Schema")
509
+ strict_ = False
510
+ if "properties" in schema and "strict" in schema["properties"]:
511
+ item = schema["properties"].pop("strict", None)
512
+ if item and item.get("default", False):
513
+ strict_ = True
514
+
515
+ response_format["json_schema"] = {
516
+ "name": name_,
517
+ "schema": schema,
518
+ "strict": strict_,
519
+ }
520
+
521
+ return values
522
+
460
523
  # Extra parameters for SRT backend only and will be ignored by OpenAI models.
461
524
  top_k: int = -1
462
525
  min_p: float = 0.0
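Note: the new normalize_reasoning_inputs validator lets clients send an OpenAI-style reasoning object; effort (or reasoning_effort) is copied to the top-level reasoning_effort field, and a truthy enabled/enable flag turns on chat_template_kwargs["thinking"]. A condensed standalone illustration of that mapping (not the sglang class itself):

    def normalize_reasoning(values: dict) -> dict:
        # Condensed restatement of the validator above, for illustration only.
        r = values.get("reasoning")
        if isinstance(r, dict):
            effort = r.get("effort") or r.get("reasoning_effort")
            if effort in {"low", "medium", "high"}:
                values["reasoning_effort"] = effort
            enabled = r.get("enabled") if r.get("enabled") is not None else r.get("enable", False)
            if isinstance(enabled, str):
                enabled = enabled.strip().lower() in {"1", "true", "yes", "y", "on"}
            if enabled:
                ctk = values.get("chat_template_kwargs")
                ctk = ctk if isinstance(ctk, dict) else {}
                ctk.setdefault("thinking", True)
                values["chat_template_kwargs"] = ctk
        return values

    print(normalize_reasoning({"reasoning": {"effort": "high", "enabled": "yes"}}))
    # {'reasoning': {'effort': 'high', 'enabled': 'yes'},
    #  'reasoning_effort': 'high', 'chat_template_kwargs': {'thinking': True}}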
@@ -479,9 +542,9 @@ class ChatCompletionRequest(BaseModel):
     rid: Optional[Union[List[str], str]] = None

     # For PD disaggregation
-    bootstrap_host: Optional[str] = None
-    bootstrap_port: Optional[int] = None
-    bootstrap_room: Optional[int] = None
+    bootstrap_host: Optional[Union[List[str], str]] = None
+    bootstrap_port: Optional[Union[List[Optional[int]], int]] = None
+    bootstrap_room: Optional[Union[List[int], int]] = None


 class ChatMessage(BaseModel):
@@ -571,7 +634,7 @@ class EmbeddingRequest(BaseModel):
     # Ordered by official OpenAI API documentation
     # https://platform.openai.com/docs/api-reference/embeddings/create
     input: EmbeddingInput
-    model: str
+    model: str = DEFAULT_MODEL_NAME
     encoding_format: str = "float"
     dimensions: Optional[int] = None
     user: Optional[str] = None
@@ -605,7 +668,7 @@ class ScoringRequest(BaseModel):
     )
     apply_softmax: bool = False
     item_first: bool = False
-    model: str
+    model: str = DEFAULT_MODEL_NAME


 class ScoringResponse(BaseModel):
sglang/srt/entrypoints/openai/serving_base.py

@@ -1,15 +1,19 @@
+from __future__ import annotations
+
 import json
 import logging
 import uuid
 from abc import ABC, abstractmethod
-from typing import Any, Optional, Union
+from typing import TYPE_CHECKING, Any, Optional, Union

 from fastapi import HTTPException, Request
 from fastapi.responses import ORJSONResponse, StreamingResponse

 from sglang.srt.entrypoints.openai.protocol import ErrorResponse, OpenAIServingRequest
 from sglang.srt.managers.io_struct import GenerateReqInput
-from sglang.srt.managers.tokenizer_manager import TokenizerManager
+
+if TYPE_CHECKING:
+    from sglang.srt.managers.tokenizer_manager import TokenizerManager

 logger = logging.getLogger(__name__)
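Note: moving the TokenizerManager import behind TYPE_CHECKING (together with from __future__ import annotations) keeps the annotation available to static type checkers without importing the tokenizer manager at module load time, which is the usual way to break a run-time circular import; that is presumably the motivation here.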