sglang 0.5.1.post3__py3-none-any.whl → 0.5.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (245)
  1. sglang/bench_one_batch.py +3 -0
  2. sglang/bench_one_batch_server.py +10 -1
  3. sglang/bench_serving.py +251 -26
  4. sglang/lang/interpreter.py +1 -1
  5. sglang/srt/configs/__init__.py +4 -0
  6. sglang/srt/configs/internvl.py +6 -0
  7. sglang/srt/configs/longcat_flash.py +104 -0
  8. sglang/srt/configs/model_config.py +37 -7
  9. sglang/srt/configs/qwen3_next.py +326 -0
  10. sglang/srt/connector/__init__.py +1 -1
  11. sglang/srt/connector/base_connector.py +1 -2
  12. sglang/srt/connector/redis.py +2 -2
  13. sglang/srt/connector/serde/__init__.py +1 -1
  14. sglang/srt/connector/serde/safe_serde.py +4 -3
  15. sglang/srt/custom_op.py +11 -1
  16. sglang/srt/debug_utils/dump_comparator.py +81 -44
  17. sglang/srt/debug_utils/dump_loader.py +97 -0
  18. sglang/srt/debug_utils/dumper.py +11 -3
  19. sglang/srt/debug_utils/text_comparator.py +73 -11
  20. sglang/srt/disaggregation/ascend/conn.py +75 -0
  21. sglang/srt/disaggregation/base/conn.py +1 -1
  22. sglang/srt/disaggregation/common/conn.py +15 -12
  23. sglang/srt/disaggregation/decode.py +6 -4
  24. sglang/srt/disaggregation/fake/conn.py +1 -1
  25. sglang/srt/disaggregation/mini_lb.py +6 -420
  26. sglang/srt/disaggregation/mooncake/conn.py +18 -10
  27. sglang/srt/disaggregation/nixl/conn.py +180 -16
  28. sglang/srt/disaggregation/prefill.py +6 -4
  29. sglang/srt/disaggregation/utils.py +5 -50
  30. sglang/srt/distributed/parallel_state.py +94 -58
  31. sglang/srt/entrypoints/engine.py +34 -14
  32. sglang/srt/entrypoints/http_server.py +172 -47
  33. sglang/srt/entrypoints/openai/protocol.py +63 -3
  34. sglang/srt/entrypoints/openai/serving_base.py +6 -2
  35. sglang/srt/entrypoints/openai/serving_chat.py +34 -19
  36. sglang/srt/entrypoints/openai/serving_completions.py +10 -4
  37. sglang/srt/entrypoints/openai/serving_embedding.py +8 -4
  38. sglang/srt/entrypoints/openai/serving_responses.py +7 -4
  39. sglang/srt/eplb/eplb_manager.py +28 -4
  40. sglang/srt/eplb/expert_distribution.py +55 -15
  41. sglang/srt/eplb/expert_location.py +8 -3
  42. sglang/srt/eplb/expert_location_updater.py +1 -1
  43. sglang/srt/function_call/ebnf_composer.py +11 -9
  44. sglang/srt/function_call/glm4_moe_detector.py +1 -1
  45. sglang/srt/function_call/gpt_oss_detector.py +1 -1
  46. sglang/srt/function_call/qwen3_coder_detector.py +1 -1
  47. sglang/srt/hf_transformers_utils.py +12 -0
  48. sglang/srt/layers/activation.py +44 -9
  49. sglang/srt/layers/attention/aiter_backend.py +93 -68
  50. sglang/srt/layers/attention/ascend_backend.py +250 -112
  51. sglang/srt/layers/attention/fla/chunk.py +242 -0
  52. sglang/srt/layers/attention/fla/chunk_delta_h.py +314 -0
  53. sglang/srt/layers/attention/fla/chunk_o.py +178 -0
  54. sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +151 -0
  55. sglang/srt/layers/attention/fla/cumsum.py +300 -0
  56. sglang/srt/layers/attention/fla/fused_recurrent.py +640 -0
  57. sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +232 -0
  58. sglang/srt/layers/attention/fla/index.py +37 -0
  59. sglang/srt/layers/attention/fla/l2norm.py +150 -0
  60. sglang/srt/layers/attention/fla/layernorm_gated.py +326 -0
  61. sglang/srt/layers/attention/fla/op.py +66 -0
  62. sglang/srt/layers/attention/fla/solve_tril.py +465 -0
  63. sglang/srt/layers/attention/fla/utils.py +331 -0
  64. sglang/srt/layers/attention/fla/wy_fast.py +158 -0
  65. sglang/srt/layers/attention/flashinfer_backend.py +6 -4
  66. sglang/srt/layers/attention/flashinfer_mla_backend.py +16 -12
  67. sglang/srt/layers/attention/hybrid_attn_backend.py +47 -8
  68. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +584 -0
  69. sglang/srt/layers/attention/intel_amx_backend.py +3 -0
  70. sglang/srt/layers/attention/mamba/causal_conv1d.py +128 -0
  71. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +1052 -0
  72. sglang/srt/layers/attention/mamba/mamba.py +64 -0
  73. sglang/srt/layers/attention/torch_native_backend.py +12 -6
  74. sglang/srt/layers/attention/trtllm_mla_backend.py +126 -36
  75. sglang/srt/layers/attention/wave_ops/decode_attention.py +2 -4
  76. sglang/srt/layers/attention/wave_ops/extend_attention.py +1 -3
  77. sglang/srt/layers/communicator.py +45 -7
  78. sglang/srt/layers/layernorm.py +54 -12
  79. sglang/srt/layers/logits_processor.py +10 -3
  80. sglang/srt/layers/moe/__init__.py +2 -1
  81. sglang/srt/layers/moe/cutlass_w4a8_moe.py +4 -12
  82. sglang/srt/layers/moe/ep_moe/kernels.py +74 -0
  83. sglang/srt/layers/moe/ep_moe/layer.py +110 -49
  84. sglang/srt/layers/moe/fused_moe_native.py +5 -3
  85. sglang/srt/layers/moe/fused_moe_triton/__init__.py +5 -3
  86. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  87. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=129,N=352,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +146 -0
  88. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/{E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +29 -29
  89. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  90. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
  91. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
  92. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
  93. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +9 -1049
  94. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +212 -0
  95. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +799 -0
  96. sglang/srt/layers/moe/fused_moe_triton/layer.py +56 -45
  97. sglang/srt/layers/moe/fused_moe_triton/moe_align_block_size.py +87 -0
  98. sglang/srt/layers/moe/moe_runner/__init__.py +2 -1
  99. sglang/srt/layers/moe/moe_runner/base.py +274 -1
  100. sglang/srt/layers/moe/moe_runner/runner.py +80 -0
  101. sglang/srt/layers/moe/moe_runner/triton.py +448 -0
  102. sglang/srt/layers/moe/token_dispatcher/__init__.py +16 -4
  103. sglang/srt/layers/moe/token_dispatcher/{base_dispatcher.py → base.py} +67 -17
  104. sglang/srt/layers/moe/token_dispatcher/deepep.py +41 -38
  105. sglang/srt/layers/moe/token_dispatcher/standard.py +44 -2
  106. sglang/srt/layers/moe/topk.py +43 -12
  107. sglang/srt/layers/moe/utils.py +6 -5
  108. sglang/srt/layers/quantization/awq.py +19 -7
  109. sglang/srt/layers/quantization/base_config.py +11 -6
  110. sglang/srt/layers/quantization/blockwise_int8.py +38 -27
  111. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +50 -30
  112. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +13 -1
  113. sglang/srt/layers/quantization/deep_gemm_wrapper/compile_utils.py +9 -1
  114. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +0 -3
  115. sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +27 -0
  116. sglang/srt/layers/quantization/fp8.py +76 -47
  117. sglang/srt/layers/quantization/fp8_utils.py +43 -29
  118. sglang/srt/layers/quantization/gptq.py +25 -17
  119. sglang/srt/layers/quantization/modelopt_quant.py +107 -40
  120. sglang/srt/layers/quantization/moe_wna16.py +21 -18
  121. sglang/srt/layers/quantization/mxfp4.py +77 -45
  122. sglang/srt/layers/quantization/quark/quark_moe.py +32 -27
  123. sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +49 -30
  124. sglang/srt/layers/quantization/quark/utils.py +97 -0
  125. sglang/srt/layers/quantization/rocm_mxfp4_utils.py +13 -0
  126. sglang/srt/layers/quantization/unquant.py +135 -47
  127. sglang/srt/layers/quantization/utils.py +13 -0
  128. sglang/srt/layers/quantization/w4afp8.py +60 -42
  129. sglang/srt/layers/quantization/w8a8_fp8.py +35 -20
  130. sglang/srt/layers/quantization/w8a8_int8.py +83 -41
  131. sglang/srt/layers/rocm_linear_utils.py +44 -0
  132. sglang/srt/layers/rotary_embedding.py +28 -19
  133. sglang/srt/layers/sampler.py +29 -5
  134. sglang/srt/lora/backend/base_backend.py +50 -8
  135. sglang/srt/lora/backend/triton_backend.py +90 -2
  136. sglang/srt/lora/layers.py +32 -0
  137. sglang/srt/lora/lora.py +4 -1
  138. sglang/srt/lora/lora_manager.py +35 -112
  139. sglang/srt/lora/mem_pool.py +24 -10
  140. sglang/srt/lora/utils.py +18 -9
  141. sglang/srt/managers/cache_controller.py +242 -278
  142. sglang/srt/managers/data_parallel_controller.py +30 -15
  143. sglang/srt/managers/detokenizer_manager.py +13 -2
  144. sglang/srt/managers/disagg_service.py +46 -0
  145. sglang/srt/managers/io_struct.py +160 -11
  146. sglang/srt/managers/mm_utils.py +6 -1
  147. sglang/srt/managers/multi_tokenizer_mixin.py +579 -0
  148. sglang/srt/managers/schedule_batch.py +27 -44
  149. sglang/srt/managers/schedule_policy.py +4 -3
  150. sglang/srt/managers/scheduler.py +90 -115
  151. sglang/srt/managers/scheduler_metrics_mixin.py +114 -8
  152. sglang/srt/managers/scheduler_output_processor_mixin.py +29 -19
  153. sglang/srt/managers/scheduler_profiler_mixin.py +1 -1
  154. sglang/srt/managers/scheduler_update_weights_mixin.py +8 -1
  155. sglang/srt/managers/template_manager.py +3 -3
  156. sglang/srt/managers/tokenizer_communicator_mixin.py +491 -0
  157. sglang/srt/managers/tokenizer_manager.py +41 -477
  158. sglang/srt/managers/tp_worker.py +16 -4
  159. sglang/srt/managers/tp_worker_overlap_thread.py +8 -10
  160. sglang/srt/mem_cache/allocator.py +1 -1
  161. sglang/srt/mem_cache/chunk_cache.py +1 -1
  162. sglang/srt/mem_cache/hicache_storage.py +24 -22
  163. sglang/srt/mem_cache/hiradix_cache.py +184 -101
  164. sglang/srt/mem_cache/lora_radix_cache.py +1 -1
  165. sglang/srt/mem_cache/memory_pool.py +324 -41
  166. sglang/srt/mem_cache/memory_pool_host.py +25 -18
  167. sglang/srt/mem_cache/radix_cache.py +5 -6
  168. sglang/srt/mem_cache/radix_cache_cpp.py +1 -1
  169. sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +164 -0
  170. sglang/srt/mem_cache/storage/hf3fs/{client_hf3fs.py → hf3fs_usrbio_client.py} +5 -1
  171. sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +61 -34
  172. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +149 -12
  173. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +280 -0
  174. sglang/srt/mem_cache/storage/lmcache/unit_test.py +121 -0
  175. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +74 -19
  176. sglang/srt/mem_cache/storage/mooncake_store/test_mooncake_store.py +161 -0
  177. sglang/srt/mem_cache/swa_radix_cache.py +1 -3
  178. sglang/srt/metrics/collector.py +484 -63
  179. sglang/srt/metrics/startup_func_log_and_timer.py +150 -0
  180. sglang/srt/metrics/utils.py +48 -0
  181. sglang/srt/model_executor/cpu_graph_runner.py +640 -0
  182. sglang/srt/model_executor/cuda_graph_runner.py +13 -5
  183. sglang/srt/model_executor/forward_batch_info.py +72 -18
  184. sglang/srt/model_executor/model_runner.py +189 -31
  185. sglang/srt/model_loader/__init__.py +9 -3
  186. sglang/srt/model_loader/loader.py +33 -28
  187. sglang/srt/model_loader/utils.py +12 -0
  188. sglang/srt/model_loader/weight_utils.py +2 -1
  189. sglang/srt/models/deepseek_v2.py +311 -50
  190. sglang/srt/models/gemma3n_mm.py +1 -1
  191. sglang/srt/models/glm4_moe.py +10 -1
  192. sglang/srt/models/glm4v.py +4 -2
  193. sglang/srt/models/gpt_oss.py +5 -18
  194. sglang/srt/models/internvl.py +28 -0
  195. sglang/srt/models/llama4.py +9 -0
  196. sglang/srt/models/llama_eagle3.py +17 -0
  197. sglang/srt/models/longcat_flash.py +1026 -0
  198. sglang/srt/models/longcat_flash_nextn.py +699 -0
  199. sglang/srt/models/minicpmv.py +165 -3
  200. sglang/srt/models/mllama4.py +25 -0
  201. sglang/srt/models/opt.py +637 -0
  202. sglang/srt/models/qwen2.py +33 -3
  203. sglang/srt/models/qwen2_5_vl.py +90 -42
  204. sglang/srt/models/qwen2_moe.py +79 -14
  205. sglang/srt/models/qwen3.py +8 -2
  206. sglang/srt/models/qwen3_moe.py +39 -8
  207. sglang/srt/models/qwen3_next.py +1039 -0
  208. sglang/srt/models/qwen3_next_mtp.py +109 -0
  209. sglang/srt/models/torch_native_llama.py +1 -1
  210. sglang/srt/models/transformers.py +1 -1
  211. sglang/srt/multimodal/processors/base_processor.py +4 -2
  212. sglang/srt/multimodal/processors/glm4v.py +9 -9
  213. sglang/srt/multimodal/processors/internvl.py +141 -129
  214. sglang/srt/{reasoning_parser.py → parser/reasoning_parser.py} +1 -1
  215. sglang/srt/sampling/penaltylib/orchestrator.py +14 -2
  216. sglang/srt/sampling/sampling_batch_info.py +18 -15
  217. sglang/srt/server_args.py +297 -79
  218. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +5 -0
  219. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +10 -1
  220. sglang/srt/speculative/eagle_worker.py +216 -120
  221. sglang/srt/speculative/spec_info.py +5 -0
  222. sglang/srt/speculative/standalone_worker.py +109 -0
  223. sglang/srt/utils.py +37 -2
  224. sglang/srt/weight_sync/utils.py +1 -1
  225. sglang/test/attention/test_trtllm_mla_backend.py +181 -8
  226. sglang/test/few_shot_gsm8k.py +1 -0
  227. sglang/test/runners.py +4 -0
  228. sglang/test/test_cutlass_moe.py +24 -6
  229. sglang/test/test_cutlass_w4a8_moe.py +24 -9
  230. sglang/test/test_disaggregation_utils.py +66 -0
  231. sglang/test/test_utils.py +25 -1
  232. sglang/utils.py +5 -0
  233. sglang/version.py +1 -1
  234. {sglang-0.5.1.post3.dist-info → sglang-0.5.2.dist-info}/METADATA +11 -9
  235. {sglang-0.5.1.post3.dist-info → sglang-0.5.2.dist-info}/RECORD +243 -194
  236. sglang/srt/disaggregation/launch_lb.py +0 -131
  237. sglang/srt/mem_cache/storage/mooncake_store/unit_test.py +0 -40
  238. /sglang/srt/{model_parallel.py → layers/model_parallel.py} +0 -0
  239. /sglang/srt/{code_completion_parser.py → parser/code_completion_parser.py} +0 -0
  240. /sglang/srt/{conversation.py → parser/conversation.py} +0 -0
  241. /sglang/srt/{harmony_parser.py → parser/harmony_parser.py} +0 -0
  242. /sglang/srt/{jinja_template_utils.py → parser/jinja_template_utils.py} +0 -0
  243. {sglang-0.5.1.post3.dist-info → sglang-0.5.2.dist-info}/WHEEL +0 -0
  244. {sglang-0.5.1.post3.dist-info → sglang-0.5.2.dist-info}/licenses/LICENSE +0 -0
  245. {sglang-0.5.1.post3.dist-info → sglang-0.5.2.dist-info}/top_level.txt +0 -0

sglang/srt/entrypoints/engine.py

@@ -60,6 +60,7 @@ from sglang.srt.managers.io_struct import (
     UpdateWeightsFromDistributedReqInput,
     UpdateWeightsFromTensorReqInput,
 )
+from sglang.srt.managers.multi_tokenizer_mixin import MultiTokenizerRouter
 from sglang.srt.managers.scheduler import run_scheduler_process
 from sglang.srt.managers.template_manager import TemplateManager
 from sglang.srt.managers.tokenizer_manager import TokenizerManager
@@ -654,7 +655,8 @@ def _set_envs_and_config(server_args: ServerArgs):
     os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "4"
     os.environ["CUDA_MODULE_LOADING"] = "AUTO"
     # flashinfer uses this environment variable for various kernels from MoE to quant kernels
-    os.environ["TRTLLM_ENABLE_PDL"] = "1"
+    if os.environ.get("TRTLLM_ENABLE_PDL", "1") != "0":
+        os.environ["TRTLLM_ENABLE_PDL"] = "1"
 
     # Can also be passed as argument
     os.environ["SGLANG_RUN_ID"] = (
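
The hunk above turns the PDL setting from a hard override into a default: TRTLLM_ENABLE_PDL is only forced to "1" when the user has not explicitly set it to "0". A minimal standalone sketch of the resulting behavior (plain Python, not sglang code):

    import os

    # Simulate a user opting out before the server starts.
    os.environ["TRTLLM_ENABLE_PDL"] = "0"

    # New logic from the hunk: only apply the default when not explicitly disabled.
    if os.environ.get("TRTLLM_ENABLE_PDL", "1") != "0":
        os.environ["TRTLLM_ENABLE_PDL"] = "1"

    assert os.environ["TRTLLM_ENABLE_PDL"] == "0"  # the opt-out now survives
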
@@ -672,7 +674,7 @@ def _set_envs_and_config(server_args: ServerArgs):
     if server_args.attention_backend == "flashinfer":
         assert_pkg_version(
             "flashinfer_python",
-            "0.2.14.post1",
+            "0.3.1",
             "Please uninstall the old version and "
             "reinstall the latest version by following the instructions "
             "at https://docs.flashinfer.ai/installation.html.",
@@ -680,7 +682,7 @@ def _set_envs_and_config(server_args: ServerArgs):
     if _is_cuda and not get_bool_env_var("SGLANG_SKIP_SGL_KERNEL_VERSION_CHECK"):
         assert_pkg_version(
             "sgl-kernel",
-            "0.3.7",
+            "0.3.9.post2",
             "Please reinstall the latest version with `pip install sgl-kernel --force-reinstall`",
         )
 
@@ -702,6 +704,24 @@ def _set_envs_and_config(server_args: ServerArgs):
     mp.set_start_method("spawn", force=True)
 
 
+def _init_tokenizer_manager(
+    server_args: ServerArgs, port_args: PortArgs
+) -> TokenizerManager:
+    # Launch tokenizer process
+    tokenizer_manager = TokenizerManager(server_args, port_args)
+
+    # Initialize templates
+    template_manager = TemplateManager()
+    template_manager.initialize_templates(
+        tokenizer_manager=tokenizer_manager,
+        model_path=server_args.model_path,
+        chat_template=server_args.chat_template,
+        completion_template=server_args.completion_template,
+    )
+
+    return tokenizer_manager, template_manager
+
+
 def _launch_subprocesses(
     server_args: ServerArgs, port_args: Optional[PortArgs] = None
 ) -> Tuple[TokenizerManager, TemplateManager, Dict]:
@@ -815,17 +835,15 @@ def _launch_subprocesses(
     )
     detoken_proc.start()
 
-    # Launch tokenizer process
-    tokenizer_manager = TokenizerManager(server_args, port_args)
-
-    # Initialize templates
-    template_manager = TemplateManager()
-    template_manager.initialize_templates(
-        tokenizer_manager=tokenizer_manager,
-        model_path=server_args.model_path,
-        chat_template=server_args.chat_template,
-        completion_template=server_args.completion_template,
-    )
+    # Init tokenizer manager first, as the bootstrap server is initialized here
+    if server_args.tokenizer_worker_num > 1:
+        # Launch multi-tokenizer router
+        tokenizer_manager = MultiTokenizerRouter(server_args, port_args)
+        template_manager = None
+    else:
+        tokenizer_manager, template_manager = _init_tokenizer_manager(
+            server_args, port_args
+        )
 
     # Wait for the model to finish loading
     scheduler_infos = []
@@ -848,5 +866,7 @@
 
     # Assume all schedulers have the same scheduler_info
     scheduler_info = scheduler_infos[0]
+
     tokenizer_manager.max_req_input_len = scheduler_info["max_req_input_len"]
+
     return tokenizer_manager, template_manager, scheduler_info
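
Alongside these changes, the release raises the minimum flashinfer_python version to 0.3.1 and the minimum sgl-kernel version to 0.3.9.post2 (the two assert_pkg_version hunks above). A quick local check of what is installed, using only the standard library (package names and versions taken from the diff):

    from importlib.metadata import PackageNotFoundError, version

    for pkg, required in [("flashinfer_python", "0.3.1"), ("sgl-kernel", "0.3.9.post2")]:
        try:
            print(f"{pkg}: installed {version(pkg)}, required >= {required}")
        except PackageNotFoundError:
            print(f"{pkg}: not installed")
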

sglang/srt/entrypoints/http_server.py

@@ -23,11 +23,14 @@ import json
 import logging
 import multiprocessing as multiprocessing
 import os
+import tempfile
 import threading
 import time
 from http import HTTPStatus
 from typing import Any, AsyncIterator, Callable, Dict, List, Optional
 
+import setproctitle
+
 # Fix a bug of Python threading
 setattr(threading, "_register_atexit", lambda *args, **kwargs: None)
 
@@ -44,11 +47,7 @@ from fastapi.exceptions import RequestValidationError
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import ORJSONResponse, Response, StreamingResponse
 
-from sglang.srt.disaggregation.utils import (
-    FAKE_BOOTSTRAP_HOST,
-    DisaggregationMode,
-    register_disaggregation_server,
-)
+from sglang.srt.disaggregation.utils import FAKE_BOOTSTRAP_HOST, DisaggregationMode
 from sglang.srt.entrypoints.engine import _launch_subprocesses
 from sglang.srt.entrypoints.openai.protocol import (
     ChatCompletionRequest,
@@ -91,11 +90,18 @@ from sglang.srt.managers.io_struct import (
     UpdateWeightVersionReqInput,
     VertexGenerateReqInput,
 )
+from sglang.srt.managers.multi_tokenizer_mixin import (
+    MultiTokenizerManager,
+    get_main_process_id,
+    monkey_patch_uvicorn_multiprocessing,
+    read_from_shared_memory,
+    write_data_for_multi_tokenizer,
+)
 from sglang.srt.managers.template_manager import TemplateManager
 from sglang.srt.managers.tokenizer_manager import ServerStatus, TokenizerManager
 from sglang.srt.metrics.func_timer import enable_func_timer
-from sglang.srt.reasoning_parser import ReasoningParser
-from sglang.srt.server_args import ServerArgs
+from sglang.srt.parser.reasoning_parser import ReasoningParser
+from sglang.srt.server_args import PortArgs, ServerArgs
 from sglang.srt.utils import (
     add_api_key_middleware,
     add_prometheus_middleware,
@@ -130,8 +136,72 @@ def set_global_state(global_state: _GlobalState):
     _global_state = global_state
 
 
+async def init_multi_tokenizer() -> ServerArgs:
+    """Read args information from shm and init tokenizer manager for current process"""
+    pid = os.getpid()
+    main_pid = get_main_process_id()
+    logger.info(f"current worker_id: {pid}, main processID: {main_pid}")
+
+    # Read configuration from shared memory
+    port_args, server_args, scheduler_info = read_from_shared_memory(
+        f"multi_tokenizer_args_{main_pid}"
+    )
+    server_args: ServerArgs
+
+    # API key authentication is not supported in multi-tokenizer mode
+    assert (
+        server_args.api_key is None
+    ), "API key is not supported in multi-tokenizer mode"
+
+    port_args.tokenizer_ipc_name = (
+        f"ipc://{tempfile.NamedTemporaryFile(delete=False).name}"
+    )
+
+    # Launch multi-tokenizer manager process
+    tokenizer_manager = MultiTokenizerManager(server_args, port_args)
+    template_manager = TemplateManager()
+    template_manager.initialize_templates(
+        tokenizer_manager=tokenizer_manager,
+        model_path=server_args.model_path,
+        chat_template=server_args.chat_template,
+        completion_template=server_args.completion_template,
+    )
+    # Register this tokenizer with the main tokenizer manager
+    await tokenizer_manager.register_to_main_tokenizer_manager()
+
+    tokenizer_manager.max_req_input_len = scheduler_info["max_req_input_len"]
+    set_global_state(
+        _GlobalState(
+            tokenizer_manager=tokenizer_manager,
+            template_manager=template_manager,
+            scheduler_info=scheduler_info,
+        )
+    )
+    return server_args
+
+
 @asynccontextmanager
 async def lifespan(fast_api_app: FastAPI):
+    if not getattr(fast_api_app, "is_single_tokenizer_mode", False):
+        # Initialize multi-tokenizer support for worker processes
+        fast_api_app.server_args: ServerArgs = await init_multi_tokenizer()
+
+        # only metrics middleware is supported in multi-tokenizer mode
+        worker_pid = os.getpid()
+        if fast_api_app.server_args.enable_metrics:
+            add_prometheus_middleware(app)
+            enable_func_timer()
+
+        logger.info(f"Worker {worker_pid} added prometheus middleware")
+        fast_api_app.warmup_thread = threading.Thread(
+            target=_wait_and_warmup,
+            args=(
+                fast_api_app.server_args,
+                None,  # pipe_finish_writer not needed in worker
+                None,  # launch_callback not needed in worker
+            ),
+        )
+
     # Initialize OpenAI serving handlers
     fast_api_app.state.openai_serving_completion = OpenAIServingCompletion(
         _global_state.tokenizer_manager, _global_state.template_manager
@@ -191,7 +261,15 @@ async def lifespan(fast_api_app: FastAPI):
     warmup_thread = getattr(fast_api_app, "warmup_thread", None)
     if warmup_thread is not None:
         warmup_thread.start()
-    yield
+
+    try:
+        yield
+    finally:
+        if server_args.tokenizer_worker_num > 1:
+            pid = os.getpid()
+            logger.info(f"uvicorn worker {pid} ending...")
+            warmup_thread.join()
+            logger.info(f"uvicorn worker {pid} ended.")
 
 
 # Fast API
@@ -480,6 +558,16 @@ async def flush_cache():
     )
 
 
+@app.api_route("/clear_hicache_storage_backend", methods=["GET", "POST"])
+async def clear_hicache_storage_backend():
+    """Clear the hierarchical cache storage backend."""
+    ret = await _global_state.tokenizer_manager.clear_hicache_storage()
+    return Response(
+        content="Hierarchical cache storage backend cleared.\n",
+        status_code=200 if ret.success else HTTPStatus.BAD_REQUEST,
+    )
+
+
 @app.api_route("/start_profile", methods=["GET", "POST"])
 async def start_profile_async(obj: Optional[ProfileReqInput] = None):
     """Start profiling."""
@@ -1068,9 +1156,21 @@ def launch_server(
     1. The HTTP server, Engine, and TokenizerManager both run in the main process.
     2. Inter-process communication is done through IPC (each process uses a different port) via the ZMQ library.
     """
-    tokenizer_manager, template_manager, scheduler_info = _launch_subprocesses(
-        server_args=server_args
-    )
+    if server_args.tokenizer_worker_num > 1:
+        setproctitle.setproctitle(f"sglang::http_server/multi_tokenizer_router")
+        port_args = PortArgs.init_new(server_args)
+        port_args.tokenizer_worker_ipc_name = (
+            f"ipc://{tempfile.NamedTemporaryFile(delete=False).name}"
+        )
+        tokenizer_manager, template_manager, scheduler_info = _launch_subprocesses(
+            server_args=server_args, port_args=port_args
+        )
+    else:
+        setproctitle.setproctitle(f"sglang::http_server/tokenizer_manager")
+        tokenizer_manager, template_manager, scheduler_info = _launch_subprocesses(
+            server_args=server_args,
+        )
+
     set_global_state(
         _GlobalState(
             tokenizer_manager=tokenizer_manager,
@@ -1079,42 +1179,75 @@ def launch_server(
         )
     )
 
-    # Add api key authorization
-    if server_args.api_key:
-        add_api_key_middleware(app, server_args.api_key)
-
-    # Add prometheus middleware
-    if server_args.enable_metrics:
-        add_prometheus_middleware(app)
-        enable_func_timer()
-
-    # Send a warmup request - we will create the thread launch it
-    # in the lifespan after all other warmups have fired.
-    warmup_thread = threading.Thread(
-        target=_wait_and_warmup,
-        args=(
+    if server_args.tokenizer_worker_num > 1:
+        multi_tokenizer_args_shm = write_data_for_multi_tokenizer(
+            port_args,
             server_args,
-            pipe_finish_writer,
-            launch_callback,
-        ),
-    )
-    app.warmup_thread = warmup_thread
+            scheduler_info,
+        )
+    else:
+        # Add api key authorization
+        if server_args.api_key:
+            add_api_key_middleware(app, server_args.api_key)
+
+        # Add prometheus middleware
+        if server_args.enable_metrics:
+            add_prometheus_middleware(app)
+            enable_func_timer()
+
+        # Send a warmup request - we will create the thread launch it
+        # in the lifespan after all other warmups have fired.
+        warmup_thread = threading.Thread(
+            target=_wait_and_warmup,
+            args=(
+                server_args,
+                pipe_finish_writer,
+                launch_callback,
+            ),
+        )
+        app.warmup_thread = warmup_thread
 
     try:
         # Update logging configs
         set_uvicorn_logging_configs()
         app.server_args = server_args
         # Listen for HTTP requests
-        uvicorn.run(
-            app,
-            host=server_args.host,
-            port=server_args.port,
-            log_level=server_args.log_level_http or server_args.log_level,
-            timeout_keep_alive=5,
-            loop="uvloop",
-        )
+        if server_args.tokenizer_worker_num > 1:
+            from uvicorn.config import LOGGING_CONFIG
+
+            LOGGING_CONFIG["loggers"]["sglang.srt.entrypoints.http_server"] = {
+                "handlers": ["default"],
+                "level": "INFO",
+                "propagate": False,
+            }
+
+            monkey_patch_uvicorn_multiprocessing()
+
+            uvicorn.run(
+                "sglang.srt.entrypoints.http_server:app",
+                host=server_args.host,
+                port=server_args.port,
+                log_level=server_args.log_level_http or server_args.log_level,
+                timeout_keep_alive=5,
+                loop="uvloop",
+                workers=server_args.tokenizer_worker_num,
+            )
+        else:
+            app.is_single_tokenizer_mode = True
+            uvicorn.run(
+                app,
+                host=server_args.host,
+                port=server_args.port,
+                log_level=server_args.log_level_http or server_args.log_level,
+                timeout_keep_alive=5,
+                loop="uvloop",
+            )
     finally:
-        warmup_thread.join()
+        if server_args.tokenizer_worker_num > 1:
+            multi_tokenizer_args_shm.unlink()
+            _global_state.tokenizer_manager.socket_mapping.clear_all_sockets()
+        else:
+            warmup_thread.join()
 
 
 def _execute_server_warmup(
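
When tokenizer_worker_num > 1, launch_server hands uvicorn an import string ("sglang.srt.entrypoints.http_server:app") instead of the app object, because uvicorn can only fork multiple worker processes from an import string; each worker process then runs the lifespan hook above and attaches itself via init_multi_tokenizer(). A reduced sketch of that call (host and port values are illustrative, not taken from the diff):

    import uvicorn

    uvicorn.run(
        "sglang.srt.entrypoints.http_server:app",  # import string, required when workers > 1
        host="0.0.0.0",            # illustrative
        port=30000,                # illustrative
        timeout_keep_alive=5,
        loop="uvloop",
        workers=4,                 # e.g. tokenizer_worker_num = 4
    )
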
@@ -1261,13 +1394,5 @@ def _wait_and_warmup(
     if server_args.debug_tensor_dump_input_file:
         kill_process_tree(os.getpid())
 
-    if server_args.pdlb_url is not None:
-        register_disaggregation_server(
-            server_args.disaggregation_mode,
-            server_args.port,
-            server_args.disaggregation_bootstrap_port,
-            server_args.pdlb_url,
-        )
-
     if launch_callback is not None:
         launch_callback()
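
The /clear_hicache_storage_backend route added above accepts GET or POST and returns HTTP 200 on success. A minimal client-side call (the host and port are assumptions; the route, methods, and response text come from the diff):

    import requests

    resp = requests.post("http://127.0.0.1:30000/clear_hicache_storage_backend")
    print(resp.status_code, resp.text.strip())
    # 200 Hierarchical cache storage backend cleared.
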

sglang/srt/entrypoints/openai/protocol.py

@@ -460,6 +460,66 @@ class ChatCompletionRequest(BaseModel):
         values["tool_choice"] = "auto"
         return values
 
+    @model_validator(mode="before")
+    @classmethod
+    def normalize_reasoning_inputs(cls, values: Dict):
+        r = values.get("reasoning")
+        if r is None:
+            return values
+
+        if isinstance(r, dict):
+            effort = r.get("effort") or r.get("reasoning_effort")
+            if effort in {"low", "medium", "high"}:
+                values["reasoning_effort"] = effort
+
+            enabled = (
+                r.get("enabled")
+                if r.get("enabled") is not None
+                else r.get("enable", False)
+            )
+            if isinstance(enabled, str):
+                enabled = enabled.strip().lower() in {"1", "true", "yes", "y", "on"}
+            if enabled:
+                ctk = values.get("chat_template_kwargs")
+                if not isinstance(ctk, dict):
+                    ctk = {}
+                ctk.setdefault("thinking", True)
+                values["chat_template_kwargs"] = ctk
+
+        return values
+
+    @model_validator(mode="before")
+    @classmethod
+    def set_json_schema(cls, values):
+        response_format = values.get("response_format")
+        if not response_format:
+            return values
+
+        if response_format.get("type") != "json_schema":
+            return values
+
+        schema = response_format.pop("schema", None)
+        json_schema = response_format.get("json_schema")
+
+        if json_schema:
+            return values
+
+        if schema:
+            name_ = schema.get("title", "Schema")
+            strict_ = False
+            if "properties" in schema and "strict" in schema["properties"]:
+                item = schema["properties"].pop("strict", None)
+                if item and item.get("default", False):
+                    strict_ = True
+
+            response_format["json_schema"] = {
+                "name": name_,
+                "schema": schema,
+                "strict": strict_,
+            }
+
+        return values
+
     # Extra parameters for SRT backend only and will be ignored by OpenAI models.
     top_k: int = -1
     min_p: float = 0.0
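
The two new before-validators normalize non-standard request fields before pydantic validation runs. A sketch of the mapping on plain dicts (purely illustrative; the field names come from the hunk above):

    # What a client may send:
    body = {
        "reasoning": {"effort": "high", "enabled": True},
        "response_format": {"type": "json_schema", "schema": {"title": "Answer", "type": "object"}},
    }

    # Effect of normalize_reasoning_inputs:
    #   reasoning.effort           -> reasoning_effort = "high"
    #   reasoning.enabled (truthy) -> chat_template_kwargs = {"thinking": True}
    # Effect of set_json_schema:
    #   the bare "schema" is wrapped as
    #   response_format["json_schema"] = {"name": "Answer", "schema": {...}, "strict": False}
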
@@ -482,9 +542,9 @@
     rid: Optional[Union[List[str], str]] = None
 
     # For PD disaggregation
-    bootstrap_host: Optional[str] = None
-    bootstrap_port: Optional[int] = None
-    bootstrap_room: Optional[int] = None
+    bootstrap_host: Optional[Union[List[str], str]] = None
+    bootstrap_port: Optional[Union[List[Optional[int]], int]] = None
+    bootstrap_room: Optional[Union[List[int], int]] = None
 
 
 class ChatMessage(BaseModel):

sglang/srt/entrypoints/openai/serving_base.py

@@ -1,15 +1,19 @@
+from __future__ import annotations
+
 import json
 import logging
 import uuid
 from abc import ABC, abstractmethod
-from typing import Any, Optional, Union
+from typing import TYPE_CHECKING, Any, Optional, Union
 
 from fastapi import HTTPException, Request
 from fastapi.responses import ORJSONResponse, StreamingResponse
 
 from sglang.srt.entrypoints.openai.protocol import ErrorResponse, OpenAIServingRequest
 from sglang.srt.managers.io_struct import GenerateReqInput
-from sglang.srt.managers.tokenizer_manager import TokenizerManager
+
+if TYPE_CHECKING:
+    from sglang.srt.managers.tokenizer_manager import TokenizerManager
 
 logger = logging.getLogger(__name__)
 

sglang/srt/entrypoints/openai/serving_chat.py

@@ -1,14 +1,15 @@
+from __future__ import annotations
+
 import copy
 import json
 import logging
 import time
 import uuid
-from typing import Any, AsyncGenerator, Dict, List, Optional, Union
+from typing import TYPE_CHECKING, Any, AsyncGenerator, Dict, List, Optional, Union
 
 from fastapi import Request
 from fastapi.responses import ORJSONResponse, StreamingResponse
 
-from sglang.srt.conversation import generate_chat_conv
 from sglang.srt.entrypoints.openai.protocol import (
     ChatCompletionRequest,
     ChatCompletionResponse,
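
serving_base.py, serving_chat.py, and serving_completions.py all switch to the deferred-import idiom in this release: from __future__ import annotations plus a TYPE_CHECKING block, so TokenizerManager and TemplateManager are imported only by type checkers, not at runtime. A generic sketch of the pattern (the class below is hypothetical, not sglang code):

    from __future__ import annotations

    from typing import TYPE_CHECKING

    if TYPE_CHECKING:
        # Evaluated only by static type checkers, never at runtime.
        from sglang.srt.managers.tokenizer_manager import TokenizerManager

    class ServingExample:  # hypothetical class
        def __init__(self, tokenizer_manager: TokenizerManager) -> None:
            # With postponed evaluation (PEP 563) the annotation stays a string,
            # so no runtime import (and no import cycle) is required.
            self.tokenizer_manager = tokenizer_manager
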
@@ -33,13 +34,16 @@ from sglang.srt.entrypoints.openai.utils import (
     to_openai_style_logprobs,
 )
 from sglang.srt.function_call.function_call_parser import FunctionCallParser
-from sglang.srt.jinja_template_utils import process_content_for_template_format
 from sglang.srt.managers.io_struct import GenerateReqInput
-from sglang.srt.managers.template_manager import TemplateManager
-from sglang.srt.managers.tokenizer_manager import TokenizerManager
-from sglang.srt.reasoning_parser import ReasoningParser
+from sglang.srt.parser.conversation import generate_chat_conv
+from sglang.srt.parser.jinja_template_utils import process_content_for_template_format
+from sglang.srt.parser.reasoning_parser import ReasoningParser
 from sglang.utils import convert_json_schema_to_str
 
+if TYPE_CHECKING:
+    from sglang.srt.managers.template_manager import TemplateManager
+    from sglang.srt.managers.tokenizer_manager import TokenizerManager
+
 logger = logging.getLogger(__name__)
 
 
@@ -53,6 +57,7 @@ class OpenAIServingChat(OpenAIServingBase):
     ):
         super().__init__(tokenizer_manager)
         self.template_manager = template_manager
+        self.tool_call_parser = self.tokenizer_manager.server_args.tool_call_parser
 
     def _request_id_prefix(self) -> str:
         return "chatcmpl-"
@@ -172,10 +177,11 @@ class OpenAIServingChat(OpenAIServingBase):
                 ]
             else:
                 tools = [item.function.model_dump() for item in request.tools]
-
-            tool_call_parser = self.tokenizer_manager.server_args.tool_call_parser
-            parser = FunctionCallParser(request.tools, tool_call_parser)
-            tool_call_constraint = parser.get_structure_constraint(request.tool_choice)
+            if self.tool_call_parser:
+                parser = FunctionCallParser(request.tools, self.tool_call_parser)
+                tool_call_constraint = parser.get_structure_constraint(
+                    request.tool_choice
+                )
 
         # Use chat template
         if self.template_manager.chat_template_name is None:
@@ -537,7 +543,11 @@ class OpenAIServingChat(OpenAIServingBase):
                 yield f"data: {chunk.model_dump_json()}\n\n"
 
             # Handle tool calls
-            if request.tool_choice != "none" and request.tools:
+            if (
+                request.tool_choice != "none"
+                and request.tools
+                and self.tool_call_parser
+            ):
                 async for chunk in self._process_tool_call_stream(
                     index,
                     delta,
@@ -727,10 +737,13 @@ class OpenAIServingChat(OpenAIServingBase):
 
         # Handle tool calls
        tool_calls = None
-        if request.tool_choice != "none" and request.tools:
-            tool_call_parser = self.tokenizer_manager.server_args.tool_call_parser
+        if (
+            request.tool_choice != "none"
+            and request.tools
+            and self.tool_call_parser
+        ):
             tool_calls, text, finish_reason = self._process_tool_calls(
-                text, request.tools, tool_call_parser, finish_reason
+                text, request.tools, finish_reason
             )
 
         choice_data = ChatCompletionResponseChoice(
@@ -824,11 +837,10 @@ class OpenAIServingChat(OpenAIServingBase):
         self,
         text: str,
         tools: List[Any],
-        tool_call_parser: Optional[str],
         finish_reason: Dict[str, Any],
     ) -> tuple[Optional[List[ToolCall]], str, Dict[str, Any]]:
         """Process tool calls in the response"""
-        parser = FunctionCallParser(tools, tool_call_parser)
+        parser = FunctionCallParser(tools, self.tool_call_parser)
         if parser.has_tool_call(text):
             if finish_reason["type"] == "stop":
                 finish_reason["type"] = "tool_calls"
@@ -838,7 +850,10 @@ class OpenAIServingChat(OpenAIServingBase):
             tool_calls = []
             for call_info in call_info_list:
                 # For Kimi-K2, align tool_call_id with the model format: functions.{name}:{index}
-                if tool_call_parser == "kimi_k2" and call_info.name is not None:
+                if (
+                    self.tool_call_parser == "kimi_k2"
+                    and call_info.name is not None
+                ):
                     tool_id = f"functions.{call_info.name}:{call_info.tool_index}"
                 else:
                     tool_id = f"call_{uuid.uuid4().hex[:24]}"
@@ -933,7 +948,7 @@ class OpenAIServingChat(OpenAIServingBase):
         if index not in parser_dict:
             parser_dict[index] = FunctionCallParser(
                 tools=request.tools,
-                tool_call_parser=self.tokenizer_manager.server_args.tool_call_parser,
+                tool_call_parser=self.tool_call_parser,
             )
         parser = parser_dict[index]
 
@@ -962,7 +977,7 @@ class OpenAIServingChat(OpenAIServingBase):
         # Tool call ID should be generated only once per tool call
         if call_item.name:
             # First chunk: include ID and function name
-            if self.tokenizer_manager.server_args.tool_call_parser == "kimi_k2":
+            if self.tool_call_parser == "kimi_k2":
                 # Align with Kimi-K2 format: functions.{name}:{index}
                 tool_call_id = f"functions.{call_item.name}:{call_item.tool_index}"
             else:

sglang/srt/entrypoints/openai/serving_completions.py

@@ -1,11 +1,12 @@
+from __future__ import annotations
+
 import logging
 import time
-from typing import Any, AsyncGenerator, Dict, List, Optional, Union
+from typing import TYPE_CHECKING, Any, AsyncGenerator, Dict, List, Optional, Union
 
 from fastapi import Request
 from fastapi.responses import ORJSONResponse, StreamingResponse
 
-from sglang.srt.code_completion_parser import generate_completion_prompt_from_request
 from sglang.srt.entrypoints.openai.protocol import (
     CompletionRequest,
     CompletionResponse,
@@ -21,10 +22,15 @@ from sglang.srt.entrypoints.openai.utils import (
     to_openai_style_logprobs,
 )
 from sglang.srt.managers.io_struct import GenerateReqInput
-from sglang.srt.managers.template_manager import TemplateManager
-from sglang.srt.managers.tokenizer_manager import TokenizerManager
+from sglang.srt.parser.code_completion_parser import (
+    generate_completion_prompt_from_request,
+)
 from sglang.utils import convert_json_schema_to_str
 
+if TYPE_CHECKING:
+    from sglang.srt.managers.template_manager import TemplateManager
+    from sglang.srt.managers.tokenizer_manager import TokenizerManager
+
 logger = logging.getLogger(__name__)
 