sglang 0.4.3.post1__py3-none-any.whl → 0.4.3.post3__py3-none-any.whl

This diff compares the content of two publicly available package versions as released to their public registry. It is provided for informational purposes only and reflects the changes between the versions exactly as published.
Files changed (219)
  1. sglang/api.py +1 -1
  2. sglang/bench_offline_throughput.py +19 -0
  3. sglang/bench_one_batch.py +2 -2
  4. sglang/bench_serving.py +123 -79
  5. sglang/global_config.py +8 -3
  6. sglang/lang/backend/runtime_endpoint.py +1 -1
  7. sglang/lang/ir.py +1 -1
  8. sglang/srt/_custom_ops.py +83 -91
  9. sglang/srt/configs/load_config.py +4 -1
  10. sglang/srt/configs/model_config.py +48 -2
  11. sglang/srt/configs/qwen2_5_vl_config.py +5 -2
  12. sglang/srt/constrained/base_grammar_backend.py +117 -15
  13. sglang/srt/constrained/llguidance_backend.py +151 -0
  14. sglang/srt/constrained/outlines_backend.py +24 -33
  15. sglang/srt/constrained/xgrammar_backend.py +69 -38
  16. sglang/srt/distributed/device_communicators/custom_all_reduce.py +225 -80
  17. sglang/srt/distributed/parallel_state.py +48 -3
  18. sglang/srt/entrypoints/engine.py +67 -9
  19. sglang/srt/entrypoints/http_server.py +190 -41
  20. sglang/srt/entrypoints/verl_engine.py +147 -0
  21. sglang/srt/function_call_parser.py +0 -1
  22. sglang/srt/layers/activation.py +11 -0
  23. sglang/srt/layers/attention/{__init__.py → base_attn_backend.py} +14 -6
  24. sglang/srt/layers/attention/double_sparsity_backend.py +1 -1
  25. sglang/srt/layers/attention/flashinfer_backend.py +208 -295
  26. sglang/srt/layers/attention/flashinfer_mla_backend.py +582 -0
  27. sglang/srt/layers/attention/torch_native_backend.py +1 -1
  28. sglang/srt/layers/attention/triton_backend.py +9 -6
  29. sglang/srt/layers/attention/triton_ops/decode_attention.py +3 -0
  30. sglang/srt/layers/attention/triton_ops/extend_attention.py +20 -4
  31. sglang/srt/layers/attention/triton_ops/rocm_mla_decode_rope.py +439 -0
  32. sglang/srt/layers/attention/utils.py +39 -0
  33. sglang/srt/layers/attention/vision.py +60 -63
  34. sglang/srt/layers/dp_attention.py +142 -1
  35. sglang/srt/layers/layernorm.py +1 -1
  36. sglang/srt/layers/linear.py +3 -1
  37. sglang/srt/layers/logits_processor.py +281 -45
  38. sglang/srt/layers/moe/ep_moe/kernels.py +126 -8
  39. sglang/srt/layers/moe/ep_moe/layer.py +140 -28
  40. sglang/srt/layers/moe/fused_moe_native.py +2 -0
  41. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  42. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  43. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +50 -50
  44. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json +18 -18
  45. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X.json +18 -18
  46. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Radeon_Graphics.json +18 -18
  47. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json +18 -18
  48. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X.json +18 -18
  49. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Radeon_Graphics.json +18 -18
  50. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +18 -18
  51. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X.json +18 -18
  52. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Radeon_Graphics.json +18 -18
  53. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +16 -16
  54. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +16 -16
  55. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +16 -16
  56. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json +18 -18
  57. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X.json +18 -18
  58. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Radeon_Graphics.json +18 -18
  59. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +15 -15
  60. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +15 -15
  61. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +15 -15
  62. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +88 -20
  63. sglang/srt/layers/moe/fused_moe_triton/layer.py +34 -13
  64. sglang/srt/layers/moe/topk.py +13 -4
  65. sglang/srt/layers/quantization/__init__.py +111 -7
  66. sglang/srt/layers/quantization/blockwise_int8.py +409 -0
  67. sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  68. sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
  69. sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  70. sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
  71. sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  72. sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
  73. sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  74. sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
  75. sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  76. sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  77. sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  78. sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  79. sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
  80. sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  81. sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
  82. sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  83. sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  84. sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  85. sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  86. sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
  87. sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  88. sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
  89. sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  90. sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
  91. sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  92. sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
  93. sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  94. sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
  95. sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  96. sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  97. sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  98. sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  99. sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
  100. sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  101. sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
  102. sglang/srt/layers/quantization/fp8.py +69 -28
  103. sglang/srt/layers/quantization/fp8_utils.py +17 -1
  104. sglang/srt/layers/quantization/gptq.py +416 -0
  105. sglang/srt/layers/quantization/int8_kernel.py +327 -0
  106. sglang/srt/layers/quantization/int8_utils.py +73 -0
  107. sglang/srt/layers/quantization/modelopt_quant.py +18 -1
  108. sglang/srt/layers/radix_attention.py +1 -0
  109. sglang/srt/layers/rotary_embedding.py +0 -1
  110. sglang/srt/layers/sampler.py +76 -31
  111. sglang/srt/layers/vocab_parallel_embedding.py +14 -13
  112. sglang/srt/lora/lora.py +17 -1
  113. sglang/srt/lora/lora_config.py +5 -0
  114. sglang/srt/lora/lora_manager.py +1 -3
  115. sglang/srt/managers/cache_controller.py +193 -62
  116. sglang/srt/managers/configure_logging.py +2 -1
  117. sglang/srt/managers/data_parallel_controller.py +6 -2
  118. sglang/srt/managers/detokenizer_manager.py +124 -102
  119. sglang/srt/managers/image_processor.py +2 -1
  120. sglang/srt/managers/io_struct.py +143 -6
  121. sglang/srt/managers/schedule_batch.py +238 -197
  122. sglang/srt/managers/schedule_policy.py +29 -29
  123. sglang/srt/managers/scheduler.py +681 -259
  124. sglang/srt/managers/session_controller.py +6 -2
  125. sglang/srt/managers/tokenizer_manager.py +224 -68
  126. sglang/srt/managers/tp_worker.py +15 -4
  127. sglang/srt/managers/tp_worker_overlap_thread.py +3 -4
  128. sglang/srt/mem_cache/chunk_cache.py +18 -11
  129. sglang/srt/mem_cache/hiradix_cache.py +394 -0
  130. sglang/srt/mem_cache/memory_pool.py +44 -18
  131. sglang/srt/mem_cache/radix_cache.py +58 -47
  132. sglang/srt/metrics/collector.py +94 -36
  133. sglang/srt/model_executor/cuda_graph_runner.py +55 -24
  134. sglang/srt/model_executor/forward_batch_info.py +49 -16
  135. sglang/srt/model_executor/model_runner.py +209 -28
  136. sglang/srt/model_loader/loader.py +3 -3
  137. sglang/srt/model_loader/weight_utils.py +36 -14
  138. sglang/srt/models/baichuan.py +31 -6
  139. sglang/srt/models/chatglm.py +39 -7
  140. sglang/srt/models/commandr.py +29 -5
  141. sglang/srt/models/dbrx.py +31 -5
  142. sglang/srt/models/deepseek.py +43 -6
  143. sglang/srt/models/deepseek_nextn.py +32 -19
  144. sglang/srt/models/deepseek_v2.py +265 -29
  145. sglang/srt/models/exaone.py +19 -9
  146. sglang/srt/models/gemma.py +22 -8
  147. sglang/srt/models/gemma2.py +25 -12
  148. sglang/srt/models/gemma2_reward.py +5 -1
  149. sglang/srt/models/gpt2.py +28 -13
  150. sglang/srt/models/gpt_bigcode.py +27 -5
  151. sglang/srt/models/granite.py +21 -9
  152. sglang/srt/models/grok.py +21 -4
  153. sglang/srt/models/internlm2.py +36 -6
  154. sglang/srt/models/internlm2_reward.py +5 -1
  155. sglang/srt/models/llama.py +26 -9
  156. sglang/srt/models/llama_classification.py +5 -1
  157. sglang/srt/models/llama_eagle.py +17 -4
  158. sglang/srt/models/llama_embedding.py +5 -1
  159. sglang/srt/models/llama_reward.py +7 -2
  160. sglang/srt/models/llava.py +19 -3
  161. sglang/srt/models/llavavid.py +10 -1
  162. sglang/srt/models/minicpm.py +26 -2
  163. sglang/srt/models/minicpm3.py +39 -3
  164. sglang/srt/models/minicpmv.py +45 -14
  165. sglang/srt/models/mixtral.py +20 -9
  166. sglang/srt/models/mixtral_quant.py +50 -8
  167. sglang/srt/models/mllama.py +57 -11
  168. sglang/srt/models/olmo.py +34 -6
  169. sglang/srt/models/olmo2.py +34 -13
  170. sglang/srt/models/olmoe.py +26 -4
  171. sglang/srt/models/phi3_small.py +29 -10
  172. sglang/srt/models/qwen.py +26 -3
  173. sglang/srt/models/qwen2.py +26 -4
  174. sglang/srt/models/qwen2_5_vl.py +46 -8
  175. sglang/srt/models/qwen2_eagle.py +17 -5
  176. sglang/srt/models/qwen2_moe.py +44 -6
  177. sglang/srt/models/qwen2_rm.py +78 -0
  178. sglang/srt/models/qwen2_vl.py +39 -8
  179. sglang/srt/models/stablelm.py +32 -5
  180. sglang/srt/models/torch_native_llama.py +5 -2
  181. sglang/srt/models/xverse.py +21 -9
  182. sglang/srt/models/xverse_moe.py +45 -7
  183. sglang/srt/models/yivl.py +2 -1
  184. sglang/srt/openai_api/adapter.py +109 -24
  185. sglang/srt/openai_api/protocol.py +17 -1
  186. sglang/srt/reasoning_parser.py +154 -0
  187. sglang/srt/sampling/penaltylib/__init__.py +4 -6
  188. sglang/srt/sampling/penaltylib/frequency_penalty.py +66 -0
  189. sglang/srt/sampling/penaltylib/{penalizers/min_new_tokens.py → min_new_tokens.py} +15 -23
  190. sglang/srt/sampling/penaltylib/orchestrator.py +39 -188
  191. sglang/srt/sampling/penaltylib/presence_penalty.py +66 -0
  192. sglang/srt/sampling/sampling_batch_info.py +79 -157
  193. sglang/srt/sampling/sampling_params.py +16 -13
  194. sglang/srt/server_args.py +136 -52
  195. sglang/srt/speculative/build_eagle_tree.py +2 -8
  196. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +0 -1
  197. sglang/srt/speculative/eagle_utils.py +92 -58
  198. sglang/srt/speculative/eagle_worker.py +186 -94
  199. sglang/srt/speculative/spec_info.py +1 -13
  200. sglang/srt/utils.py +43 -17
  201. sglang/srt/warmup.py +47 -0
  202. sglang/test/few_shot_gsm8k.py +4 -1
  203. sglang/test/runners.py +389 -126
  204. sglang/test/send_one.py +88 -0
  205. sglang/test/test_block_fp8_ep.py +361 -0
  206. sglang/test/test_programs.py +1 -1
  207. sglang/test/test_utils.py +138 -84
  208. sglang/utils.py +50 -60
  209. sglang/version.py +1 -1
  210. {sglang-0.4.3.post1.dist-info → sglang-0.4.3.post3.dist-info}/METADATA +21 -15
  211. {sglang-0.4.3.post1.dist-info → sglang-0.4.3.post3.dist-info}/RECORD +214 -166
  212. {sglang-0.4.3.post1.dist-info → sglang-0.4.3.post3.dist-info}/WHEEL +1 -1
  213. sglang/bench_latency.py +0 -1
  214. sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +0 -75
  215. sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +0 -74
  216. sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +0 -85
  217. sglang/test/srt/sampling/penaltylib/utils.py +0 -344
  218. {sglang-0.4.3.post1.dist-info → sglang-0.4.3.post3.dist-info}/LICENSE +0 -0
  219. {sglang-0.4.3.post1.dist-info → sglang-0.4.3.post3.dist-info}/top_level.txt +0 -0
sglang/srt/entrypoints/engine.py
@@ -44,6 +44,7 @@ from sglang.srt.managers.io_struct import (
     InitWeightsUpdateGroupReqInput,
     ReleaseMemoryOccupationReqInput,
     ResumeMemoryOccupationReqInput,
+    UpdateWeightFromDiskReqInput,
     UpdateWeightsFromDistributedReqInput,
     UpdateWeightsFromTensorReqInput,
 )
@@ -98,7 +99,7 @@ class Engine:
             kwargs["log_level"] = "error"
         server_args = ServerArgs(**kwargs)
 
-        # Shutdown the subprocesses automatically when the program exists
+        # Shutdown the subprocesses automatically when the program exits
         atexit.register(self.shutdown)
 
         # Launch subprocesses
@@ -121,8 +122,10 @@ class Engine:
         return_logprob: Optional[Union[List[bool], bool]] = False,
         logprob_start_len: Optional[Union[List[int], int]] = None,
         top_logprobs_num: Optional[Union[List[int], int]] = None,
+        token_ids_logprob: Optional[Union[List[List[int]], List[int]]] = None,
         lora_path: Optional[List[Optional[str]]] = None,
         custom_logit_processor: Optional[Union[List[str], str]] = None,
+        return_hidden_states: bool = False,
         stream: bool = False,
     ) -> Union[Dict, Iterator[Dict]]:
         """
@@ -141,9 +144,11 @@ class Engine:
             return_logprob=return_logprob,
             logprob_start_len=logprob_start_len,
             top_logprobs_num=top_logprobs_num,
+            token_ids_logprob=token_ids_logprob,
             lora_path=lora_path,
             modalities=modalities_list,
             custom_logit_processor=custom_logit_processor,
+            return_hidden_states=return_hidden_states,
             stream=stream,
         )
         loop = asyncio.get_event_loop()
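
For illustration (not part of the diff), a minimal sketch of exercising the two new generate() arguments through the offline engine entry point; the model path and token ids are placeholders:

    import sglang as sgl

    llm = sgl.Engine(model_path="meta-llama/Llama-3.1-8B-Instruct")  # placeholder model
    out = llm.generate(
        prompt="The capital of France is",
        sampling_params={"temperature": 0.0, "max_new_tokens": 8},
        return_logprob=True,
        token_ids_logprob=[[1234, 5678]],  # placeholder token ids to score
        return_hidden_states=True,
    )
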
@@ -177,6 +182,7 @@
         return_logprob: Optional[Union[List[bool], bool]] = False,
         logprob_start_len: Optional[Union[List[int], int]] = None,
         top_logprobs_num: Optional[Union[List[int], int]] = None,
+        token_ids_logprob: Optional[Union[List[List[int]], List[int]]] = None,
         lora_path: Optional[List[Optional[str]]] = None,
         custom_logit_processor: Optional[Union[List[str], str]] = None,
         stream: bool = False,
@@ -193,6 +199,7 @@
             return_logprob=return_logprob,
             logprob_start_len=logprob_start_len,
             top_logprobs_num=top_logprobs_num,
+            token_ids_logprob=token_ids_logprob,
             lora_path=lora_path,
             stream=stream,
             custom_logit_processor=custom_logit_processor,
@@ -224,15 +231,22 @@
         kill_process_tree(os.getpid(), include_parent=False)
 
     def start_profile(self):
-        self.tokenizer_manager.start_profile()
+        loop = asyncio.get_event_loop()
+        loop.run_until_complete(self.tokenizer_manager.start_profile())
 
     def stop_profile(self):
         self.tokenizer_manager.stop_profile()
 
     def get_server_info(self):
+        loop = asyncio.get_event_loop()
+        internal_states = loop.run_until_complete(
+            self.tokenizer_manager.get_internal_state()
+        )
+
         return {
-            **dataclasses.asdict(self.tokenizer_manager.server_args),  # server args
+            **dataclasses.asdict(self.tokenizer_manager.server_args),
             **self.scheduler_info,
+            **internal_states,
             "version": __version__,
         }
 
@@ -271,16 +285,45 @@
             self.tokenizer_manager.update_weights_from_distributed(obj, None)
         )
 
-    def update_weights_from_tensor(self, named_tensors: List[Tuple[str, torch.Tensor]]):
-        """Update weights from distributed source."""
+    def update_weights_from_tensor(
+        self,
+        named_tensors: List[Tuple[str, torch.Tensor]],
+        load_format: Optional[str] = None,
+        flush_cache: bool = True,
+    ):
+        """Update weights from distributed source. If there are going to be more updates, set `flush_cache` to be true
+        to avoid duplicated operations such as clearing cache."""
         obj = UpdateWeightsFromTensorReqInput(
-            serialized_named_tensors=MultiprocessingSerializer.serialize(named_tensors)
+            serialized_named_tensors=MultiprocessingSerializer.serialize(named_tensors),
+            load_format=load_format,
+            flush_cache=flush_cache,
         )
         loop = asyncio.get_event_loop()
         return loop.run_until_complete(
             self.tokenizer_manager.update_weights_from_tensor(obj, None)
         )
 
+    def update_weights_from_disk(
+        self,
+        model_path: str,
+        load_format: Optional[str] = None,
+    ):
+        """Update the weights from disk inplace without re-launching the engine.
+
+        This method allows updating the model weights from disk without restarting
+        the engine. It can be used to load a different model or update weights with
+        new training.
+        """
+        obj = UpdateWeightFromDiskReqInput(
+            model_path=model_path,
+            load_format=load_format,
+        )
+
+        loop = asyncio.get_event_loop()
+        return loop.run_until_complete(
+            self.tokenizer_manager.update_weights_from_disk(obj, None)
+        )
+
     def get_weights_by_name(self, name: str, truncate_size: int = 100):
         """Get weights by parameter name."""
         obj = GetWeightsByNameReqInput(name=name, truncate_size=truncate_size)
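
A hedged sketch of the new weight-update methods; the checkpoint path and tensor batches are placeholders, and flushing only on the final tensor update is one plausible use of the flush_cache flag rather than a documented recipe:

    # Reload weights from a newer checkpoint without restarting the engine.
    llm.update_weights_from_disk(model_path="/checkpoints/step_1000")  # placeholder path

    # Stream several tensor batches, deferring the cache flush to the last call.
    # first_batch / last_batch are hypothetical List[Tuple[str, torch.Tensor]] values.
    llm.update_weights_from_tensor(named_tensors=first_batch, flush_cache=False)
    llm.update_weights_from_tensor(named_tensors=last_batch, flush_cache=True)
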
@@ -313,6 +356,7 @@ def _set_envs_and_config(server_args: ServerArgs):
     os.environ["NCCL_NVLS_ENABLE"] = str(int(server_args.enable_nccl_nvls))
     os.environ["TORCH_NCCL_AVOID_RECORD_STREAMS"] = "1"
     os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "4"
+    os.environ["CUDA_MODULE_LOADING"] = "AUTO"
 
     # Set prometheus env vars
     if server_args.enable_metrics:
@@ -330,18 +374,29 @@ def _set_envs_and_config(server_args: ServerArgs):
     if server_args.attention_backend == "flashinfer":
         assert_pkg_version(
             "flashinfer_python",
-            "0.2.1.post1",
+            "0.2.2.post1",
             "Please uninstall the old version and "
             "reinstall the latest version by following the instructions "
             "at https://docs.flashinfer.ai/installation.html.",
         )
 
+    def sigchld_handler(signum, frame):
+        pid, exitcode = os.waitpid(0, os.WNOHANG)
+        if exitcode != 0:
+            logger.warning(
+                "Child process unexpectedly failed with an exit code %d. pid=%d",
+                exitcode,
+                pid,
+            )
+
+    signal.signal(signal.SIGCHLD, sigchld_handler)
+
     # Register the signal handler.
     # The child processes will send SIGQUIT to this process when any error happens
     # This process then clean up the whole process tree
     def sigquit_handler(signum, frame):
         logger.error(
-            "Received sigquit from a child proces. It usually means the child failed."
+            "Received sigquit from a child process. It usually means the child failed."
         )
         kill_process_tree(os.getpid())
 
@@ -384,7 +439,10 @@ def _launch_subprocesses(server_args: ServerArgs) -> Tuple[TokenizerManager, Dic
     )
     for tp_rank in tp_rank_range:
         reader, writer = mp.Pipe(duplex=False)
-        gpu_id = server_args.base_gpu_id + tp_rank % tp_size_per_node
+        gpu_id = (
+            server_args.base_gpu_id
+            + (tp_rank % tp_size_per_node) * server_args.gpu_id_step
+        )
         proc = mp.Process(
             target=run_scheduler_process,
             args=(server_args, port_args, gpu_id, tp_rank, None, writer),
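
The gpu_id computation now multiplies by the new gpu_id_step server argument, letting scheduler ranks skip GPU indices; the mapping is plain arithmetic:

    base_gpu_id, gpu_id_step, tp_size_per_node = 0, 2, 4  # illustrative values
    for tp_rank in range(4):
        gpu_id = base_gpu_id + (tp_rank % tp_size_per_node) * gpu_id_step
        print(tp_rank, gpu_id)  # ranks 0..3 land on GPUs 0, 2, 4, 6
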
sglang/srt/entrypoints/http_server.py
@@ -25,11 +25,14 @@ import os
 import threading
 import time
 from http import HTTPStatus
-from typing import AsyncIterator, Dict, Optional
+from typing import AsyncIterator, Callable, Dict, Optional
 
 # Fix a bug of Python threading
 setattr(threading, "_register_atexit", lambda *args, **kwargs: None)
 
+from contextlib import asynccontextmanager
+
+import numpy as np
 import orjson
 import requests
 import uvicorn
@@ -44,15 +47,19 @@ from sglang.srt.managers.io_struct import (
     CloseSessionReqInput,
     ConfigureLoggingReq,
     EmbeddingReqInput,
-    FunctionCallReqInput,
     GenerateReqInput,
     GetWeightsByNameReqInput,
     InitWeightsUpdateGroupReqInput,
     OpenSessionReqInput,
+    ParseFunctionCallReq,
+    ProfileReqInput,
     ReleaseMemoryOccupationReqInput,
     ResumeMemoryOccupationReqInput,
+    SeparateReasoningReqInput,
+    SetInternalStateReq,
     UpdateWeightFromDiskReqInput,
     UpdateWeightsFromDistributedReqInput,
+    VertexGenerateReqInput,
 )
 from sglang.srt.managers.tokenizer_manager import TokenizerManager
 from sglang.srt.metrics.func_timer import enable_func_timer
@@ -69,6 +76,7 @@ from sglang.srt.openai_api.adapter import (
     v1_retrieve_file_content,
 )
 from sglang.srt.openai_api.protocol import ModelCard, ModelList
+from sglang.srt.reasoning_parser import ReasoningParser
 from sglang.srt.server_args import ServerArgs
 from sglang.srt.utils import (
     add_api_key_middleware,
@@ -77,22 +85,13 @@ from sglang.srt.utils import (
     kill_process_tree,
     set_uvicorn_logging_configs,
 )
+from sglang.srt.warmup import execute_warmups
 from sglang.utils import get_exception_traceback
 from sglang.version import __version__
 
 logger = logging.getLogger(__name__)
 asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())
 
-# Fast API
-app = FastAPI()
-app.add_middleware(
-    CORSMiddleware,
-    allow_origins=["*"],
-    allow_credentials=True,
-    allow_methods=["*"],
-    allow_headers=["*"],
-)
-
 
 # Store global states
 @dataclasses.dataclass
@@ -109,6 +108,34 @@ def set_global_state(global_state: _GlobalState):
     _global_state = global_state
 
 
+@asynccontextmanager
+async def lifespan(fast_api_app: FastAPI):
+    server_args: ServerArgs = fast_api_app.server_args
+    if server_args.warmups is not None:
+        await execute_warmups(
+            server_args.warmups.split(","), _global_state.tokenizer_manager
+        )
+        logger.info("Warmup ended")
+
+    warmup_thread = getattr(fast_api_app, "warmup_thread", None)
+    if warmup_thread is not None:
+        warmup_thread.start()
+    yield
+
+
+# Fast API
+app = FastAPI(lifespan=lifespan)
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+
+HEALTH_CHECK_TIMEOUT = int(os.getenv("SGLANG_HEALTH_CHECK_TIMEOUT", 20))
+
+
 ##### Native API endpoints #####
 
 
@@ -122,24 +149,48 @@ async def health() -> Response:
 async def health_generate(request: Request) -> Response:
     """Check the health of the inference server by generating one token."""
 
-    sampling_params = {"max_new_tokens": 1, "temperature": 0.7}
+    sampling_params = {"max_new_tokens": 1, "temperature": 0.0}
+    rid = f"HEALTH_CHECK_{time.time()}"
 
-    if _global_state.tokenizer_manager.is_generation:
+    if _global_state.tokenizer_manager.is_image_gen:
+        raise NotImplementedError()
+    elif _global_state.tokenizer_manager.is_generation:
         gri = GenerateReqInput(
-            input_ids=[0], sampling_params=sampling_params, log_metrics=False
+            rid=rid,
+            input_ids=[0],
+            sampling_params=sampling_params,
+            log_metrics=False,
         )
     else:
         gri = EmbeddingReqInput(
-            input_ids=[0], sampling_params=sampling_params, log_metrics=False
+            rid=rid, input_ids=[0], sampling_params=sampling_params, log_metrics=False
         )
 
-    try:
+    async def gen():
         async for _ in _global_state.tokenizer_manager.generate_request(gri, request):
             break
-        return Response(status_code=200)
-    except Exception as e:
-        logger.exception(e)
-        return Response(status_code=503)
+
+    tic = time.time()
+    task = asyncio.create_task(gen())
+    while time.time() < tic + HEALTH_CHECK_TIMEOUT:
+        await asyncio.sleep(1)
+        if _global_state.tokenizer_manager.last_receive_tstamp > tic:
+            task.cancel()
+            _global_state.tokenizer_manager.rid_to_state.pop(rid, None)
+            return Response(status_code=200)
+
+    task.cancel()
+    tic_time = time.strftime("%H:%M:%S", time.localtime(tic))
+    last_receive_time = time.strftime(
+        "%H:%M:%S", time.localtime(_global_state.tokenizer_manager.last_receive_tstamp)
+    )
+    logger.error(
+        f"Health check failed. Server couldn't get a response from detokenizer for last "
+        f"{HEALTH_CHECK_TIMEOUT} seconds. tic start time: {tic_time}. "
+        f"last_heartbeat time: {last_receive_time}"
+    )
+    _global_state.tokenizer_manager.rid_to_state.pop(rid, None)
+    return Response(status_code=503)
 
 
 @app.get("/get_model_info")
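
The reworked /health_generate above fails fast instead of hanging: it polls last_receive_tstamp and gives up after SGLANG_HEALTH_CHECK_TIMEOUT seconds (default 20, read from the server's environment at launch). A sketch of probing it; host and port are placeholders:

    import requests

    # The timeout is read server-side at launch: export SGLANG_HEALTH_CHECK_TIMEOUT=30
    res = requests.get("http://127.0.0.1:30000/health_generate")
    print(res.status_code)  # 200 if the detokenizer answered in time, else 503
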
@@ -155,13 +206,21 @@ async def get_model_info():
 
 @app.get("/get_server_info")
 async def get_server_info():
+    internal_states = await _global_state.tokenizer_manager.get_internal_state()
     return {
         **dataclasses.asdict(_global_state.tokenizer_manager.server_args),
         **_global_state.scheduler_info,
+        **internal_states,
         "version": __version__,
     }
 
 
+@app.api_route("/set_internal_state", methods=["POST", "PUT"])
+async def set_internal_state(obj: SetInternalStateReq, request: Request):
+    res = await _global_state.tokenizer_manager.set_internal_state(obj)
+    return res
+
+
 # fastapi implicitly converts json in the request to obj (dataclass)
 @app.api_route("/generate", methods=["POST", "PUT"])
 async def generate_request(obj: GenerateReqInput, request: Request):
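
/get_server_info above now also merges the tokenizer manager's internal state into its reply, so one probe returns server args, scheduler info, and internal state together. A sketch, with placeholder host and port:

    import requests

    info = requests.get("http://127.0.0.1:30000/get_server_info").json()
    print(info["version"])  # merged dict: server args, scheduler info, internal state
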
@@ -178,6 +237,7 @@ async def generate_request(obj: GenerateReqInput, request: Request):
                 ) + b"\n\n"
             except ValueError as e:
                 out = {"error": {"message": str(e)}}
+                logger.error(f"Error: {e}")
                 yield b"data: " + orjson.dumps(
                     out, option=orjson.OPT_NON_STR_KEYS
                 ) + b"\n\n"
@@ -235,9 +295,14 @@ async def flush_cache():
 
 
 @app.api_route("/start_profile", methods=["GET", "POST"])
-async def start_profile_async():
+async def start_profile_async(obj: Optional[ProfileReqInput] = None):
     """Start profiling."""
-    _global_state.tokenizer_manager.start_profile()
+    if obj is None:
+        obj = ProfileReqInput()
+
+    await _global_state.tokenizer_manager.start_profile(
+        obj.output_dir, obj.num_steps, obj.activities
+    )
     return Response(
         content="Start profiling.\n",
         status_code=200,
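
/start_profile above now accepts an optional JSON body whose fields (output_dir, num_steps, activities) are the ones the handler forwards; a sketch with illustrative values:

    import requests

    requests.post(
        "http://127.0.0.1:30000/start_profile",
        json={"output_dir": "/tmp/sglang_trace", "num_steps": 10, "activities": ["CPU", "GPU"]},
    )
    # ... exercise the server to capture a trace ...
    requests.post("http://127.0.0.1:30000/stop_profile")
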
@@ -256,11 +321,15 @@ async def stop_profile_async():
 
 @app.post("/update_weights_from_disk")
 async def update_weights_from_disk(obj: UpdateWeightFromDiskReqInput, request: Request):
-    """Update the weights from disk in-place without re-launching the server."""
-    success, message = await _global_state.tokenizer_manager.update_weights_from_disk(
-        obj, request
+    """Update the weights from disk inplace without re-launching the server."""
+    success, message, num_paused_requests = (
+        await _global_state.tokenizer_manager.update_weights_from_disk(obj, request)
     )
-    content = {"success": success, "message": message}
+    content = {
+        "success": success,
+        "message": message,
+        "num_paused_requests": num_paused_requests,
+    }
     if success:
         return ORJSONResponse(
             content,
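
A sketch of calling the updated endpoint; the checkpoint path is a placeholder, and the response now also reports how many in-flight requests were paused during the reload:

    import requests

    res = requests.post(
        "http://127.0.0.1:30000/update_weights_from_disk",
        json={"model_path": "/checkpoints/step_1000"},  # placeholder checkpoint
    )
    print(res.json())  # {"success": ..., "message": ..., "num_paused_requests": ...}
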
@@ -322,7 +391,7 @@ async def get_weights_by_name(obj: GetWeightsByNameReqInput, request: Request):
 async def release_memory_occupation(
     obj: ReleaseMemoryOccupationReqInput, request: Request
 ):
-    """Release GPU occupation temporarily"""
+    """Release GPU memory occupation temporarily."""
     try:
         await _global_state.tokenizer_manager.release_memory_occupation(obj, request)
     except Exception as e:
@@ -333,7 +402,7 @@ async def release_memory_occupation(
 async def resume_memory_occupation(
     obj: ResumeMemoryOccupationReqInput, request: Request
 ):
-    """Resume GPU occupation"""
+    """Resume GPU memory occupation."""
     try:
         await _global_state.tokenizer_manager.resume_memory_occupation(obj, request)
     except Exception as e:
@@ -356,7 +425,7 @@ async def open_session(obj: OpenSessionReqInput, request: Request):
 
 @app.api_route("/close_session", methods=["GET", "POST"])
 async def close_session(obj: CloseSessionReqInput, request: Request):
-    """Close the session"""
+    """Close the session."""
     try:
         await _global_state.tokenizer_manager.close_session(obj, request)
         return Response(status_code=200)
@@ -366,13 +435,13 @@ async def close_session(obj: CloseSessionReqInput, request: Request):
 
 @app.api_route("/configure_logging", methods=["GET", "POST"])
 async def configure_logging(obj: ConfigureLoggingReq, request: Request):
-    """Close the session"""
+    """Configure the request logging options."""
     _global_state.tokenizer_manager.configure_logging(obj)
     return Response(status_code=200)
 
 
-@app.post("/function_call")
-async def function_call_request(obj: FunctionCallReqInput, request: Request):
+@app.post("/parse_function_call")
+async def parse_function_call_request(obj: ParseFunctionCallReq, request: Request):
     """
     A native API endpoint to parse function calls from a text.
     """
@@ -393,6 +462,26 @@ async def function_call_request(obj: FunctionCallReqInput, request: Request):
     return ORJSONResponse(content=response_data, status_code=200)
 
 
+@app.post("/separate_reasoning")
+async def separate_reasoning_request(obj: SeparateReasoningReqInput, request: Request):
+    """
+    A native API endpoint to separate reasoning from a text.
+    """
+    # 1) Initialize the parser based on the request body
+    parser = ReasoningParser(model_type=obj.reasoning_parser)
+
+    # 2) Call the non-stream parsing method (non-stream)
+    reasoning_text, normal_text = parser.parse_non_stream(obj.text)
+
+    # 3) Organize the response content
+    response_data = {
+        "reasoning_text": reasoning_text,
+        "text": normal_text,
+    }
+
+    return ORJSONResponse(content=response_data, status_code=200)
+
+
 ##### OpenAI-compatible API endpoints #####
 
 
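A sketch of the new /separate_reasoning endpoint added above; the request fields (text, reasoning_parser) are the ones the handler reads, and deepseek-r1 is assumed here as the parser name targeted by the new reasoning_parser.py module:

    import requests

    res = requests.post(
        "http://127.0.0.1:30000/separate_reasoning",
        json={
            "text": "<think>2 + 2 = 4.</think>The answer is 4.",
            "reasoning_parser": "deepseek-r1",  # assumed parser name
        },
    )
    print(res.json())  # {"reasoning_text": ..., "text": ...}
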
@@ -425,7 +514,7 @@ def available_models():
 
 @app.post("/v1/files")
 async def openai_v1_files(file: UploadFile = File(...), purpose: str = Form("batch")):
     return await v1_files_create(
-        file, purpose, _global_state.tokenizer_manager.server_args.file_storage_pth
+        file, purpose, _global_state.tokenizer_manager.server_args.file_storage_path
     )
 
 
@@ -463,6 +552,44 @@ async def retrieve_file_content(file_id: str):
     return await v1_retrieve_file_content(file_id)
 
 
+## SageMaker API
+@app.get("/ping")
+async def sagemaker_health() -> Response:
+    """Check the health of the http server."""
+    return Response(status_code=200)
+
+
+@app.post("/invocations")
+async def sagemaker_chat_completions(raw_request: Request):
+    return await v1_chat_completions(_global_state.tokenizer_manager, raw_request)
+
+
+## Vertex AI API
+@app.post(os.environ.get("AIP_PREDICT_ROUTE", "/vertex_generate"))
+async def vertex_generate(vertex_req: VertexGenerateReqInput, raw_request: Request):
+    if not vertex_req.instances:
+        return []
+    inputs = {}
+    for input_key in ("text", "input_ids", "input_embeds"):
+        if vertex_req.instances[0].get(input_key):
+            inputs[input_key] = [
+                instance.get(input_key) for instance in vertex_req.instances
+            ]
+            break
+    image_data = [
+        instance.get("image_data")
+        for instance in vertex_req.instances
+        if instance.get("image_data") is not None
+    ] or None
+    req = GenerateReqInput(
+        **inputs,
+        image_data=image_data,
+        **(vertex_req.parameters or {}),
+    )
+    ret = await generate_request(req, raw_request)
+    return ORJSONResponse({"predictions": ret})
+
+
 def _create_error_response(e):
     return ORJSONResponse(
         {"error": {"message": str(e)}}, status_code=HTTPStatus.BAD_REQUEST
@@ -472,6 +599,7 @@ def _create_error_response(e):
 def launch_server(
     server_args: ServerArgs,
     pipe_finish_writer: Optional[multiprocessing.connection.Connection] = None,
+    launch_callback: Optional[Callable[[], None]] = None,
 ):
     """
     Launch SRT (SGLang Runtime) Server.
@@ -505,21 +633,23 @@
     add_prometheus_middleware(app)
     enable_func_timer()
 
-    # Send a warmup request
-    t = threading.Thread(
+    # Send a warmup request - we will create the thread launch it
+    # in the lifespan after all other warmups have fired.
+    warmup_thread = threading.Thread(
         target=_wait_and_warmup,
         args=(
             server_args,
             pipe_finish_writer,
             _global_state.tokenizer_manager.image_token_id,
+            launch_callback,
         ),
     )
-    t.start()
+    app.warmup_thread = warmup_thread
 
     try:
         # Update logging configs
         set_uvicorn_logging_configs()
-
+        app.server_args = server_args
         # Listen for HTTP requests
         uvicorn.run(
             app,
@@ -530,10 +660,15 @@
             loop="uvloop",
         )
     finally:
-        t.join()
+        warmup_thread.join()
 
 
-def _wait_and_warmup(server_args, pipe_finish_writer, image_token_text):
+def _wait_and_warmup(
+    server_args: ServerArgs,
+    pipe_finish_writer: Optional[multiprocessing.connection.Connection],
+    image_token_text: str,
+    launch_callback: Optional[Callable[[], None]] = None,
+):
     headers = {}
     url = server_args.url()
     if server_args.api_key:
@@ -575,8 +710,16 @@ def _wait_and_warmup(server_args, pipe_finish_writer, image_token_text):
     else:
         json_data["text"] = "The capital city of France is"
 
+    # Debug dumping
+    if server_args.debug_tensor_dump_input_file:
+        json_data.pop("text", None)
+        json_data["input_ids"] = np.load(
+            server_args.debug_tensor_dump_input_file
+        ).tolist()
+        json_data["sampling_params"]["max_new_tokens"] = 0
+
     try:
-        for _ in range(server_args.dp_size):
+        for i in range(server_args.dp_size):
             res = requests.post(
                 url + request_name,
                 json=json_data,
@@ -601,3 +744,9 @@ def _wait_and_warmup(server_args, pipe_finish_writer, image_token_text):
 
     if server_args.delete_ckpt_after_loading:
         delete_directory(server_args.model_path)
+
+    if server_args.debug_tensor_dump_input_file:
+        kill_process_tree(os.getpid())
+
+    if launch_callback is not None:
+        launch_callback()