sglang 0.4.3.post1__py3-none-any.whl → 0.4.3.post3__py3-none-any.whl

This diff compares the contents of two publicly released versions of this package, as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (219)
  1. sglang/api.py +1 -1
  2. sglang/bench_offline_throughput.py +19 -0
  3. sglang/bench_one_batch.py +2 -2
  4. sglang/bench_serving.py +123 -79
  5. sglang/global_config.py +8 -3
  6. sglang/lang/backend/runtime_endpoint.py +1 -1
  7. sglang/lang/ir.py +1 -1
  8. sglang/srt/_custom_ops.py +83 -91
  9. sglang/srt/configs/load_config.py +4 -1
  10. sglang/srt/configs/model_config.py +48 -2
  11. sglang/srt/configs/qwen2_5_vl_config.py +5 -2
  12. sglang/srt/constrained/base_grammar_backend.py +117 -15
  13. sglang/srt/constrained/llguidance_backend.py +151 -0
  14. sglang/srt/constrained/outlines_backend.py +24 -33
  15. sglang/srt/constrained/xgrammar_backend.py +69 -38
  16. sglang/srt/distributed/device_communicators/custom_all_reduce.py +225 -80
  17. sglang/srt/distributed/parallel_state.py +48 -3
  18. sglang/srt/entrypoints/engine.py +67 -9
  19. sglang/srt/entrypoints/http_server.py +190 -41
  20. sglang/srt/entrypoints/verl_engine.py +147 -0
  21. sglang/srt/function_call_parser.py +0 -1
  22. sglang/srt/layers/activation.py +11 -0
  23. sglang/srt/layers/attention/{__init__.py → base_attn_backend.py} +14 -6
  24. sglang/srt/layers/attention/double_sparsity_backend.py +1 -1
  25. sglang/srt/layers/attention/flashinfer_backend.py +208 -295
  26. sglang/srt/layers/attention/flashinfer_mla_backend.py +582 -0
  27. sglang/srt/layers/attention/torch_native_backend.py +1 -1
  28. sglang/srt/layers/attention/triton_backend.py +9 -6
  29. sglang/srt/layers/attention/triton_ops/decode_attention.py +3 -0
  30. sglang/srt/layers/attention/triton_ops/extend_attention.py +20 -4
  31. sglang/srt/layers/attention/triton_ops/rocm_mla_decode_rope.py +439 -0
  32. sglang/srt/layers/attention/utils.py +39 -0
  33. sglang/srt/layers/attention/vision.py +60 -63
  34. sglang/srt/layers/dp_attention.py +142 -1
  35. sglang/srt/layers/layernorm.py +1 -1
  36. sglang/srt/layers/linear.py +3 -1
  37. sglang/srt/layers/logits_processor.py +281 -45
  38. sglang/srt/layers/moe/ep_moe/kernels.py +126 -8
  39. sglang/srt/layers/moe/ep_moe/layer.py +140 -28
  40. sglang/srt/layers/moe/fused_moe_native.py +2 -0
  41. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  42. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  43. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +50 -50
  44. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json +18 -18
  45. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X.json +18 -18
  46. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Radeon_Graphics.json +18 -18
  47. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json +18 -18
  48. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X.json +18 -18
  49. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Radeon_Graphics.json +18 -18
  50. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +18 -18
  51. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X.json +18 -18
  52. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Radeon_Graphics.json +18 -18
  53. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +16 -16
  54. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +16 -16
  55. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +16 -16
  56. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json +18 -18
  57. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X.json +18 -18
  58. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Radeon_Graphics.json +18 -18
  59. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +15 -15
  60. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +15 -15
  61. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +15 -15
  62. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +88 -20
  63. sglang/srt/layers/moe/fused_moe_triton/layer.py +34 -13
  64. sglang/srt/layers/moe/topk.py +13 -4
  65. sglang/srt/layers/quantization/__init__.py +111 -7
  66. sglang/srt/layers/quantization/blockwise_int8.py +409 -0
  67. sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  68. sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
  69. sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  70. sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
  71. sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  72. sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
  73. sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  74. sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
  75. sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  76. sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  77. sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  78. sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  79. sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
  80. sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  81. sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
  82. sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  83. sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  84. sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  85. sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  86. sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
  87. sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  88. sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
  89. sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  90. sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
  91. sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  92. sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
  93. sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  94. sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
  95. sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  96. sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  97. sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  98. sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  99. sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
  100. sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  101. sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
  102. sglang/srt/layers/quantization/fp8.py +69 -28
  103. sglang/srt/layers/quantization/fp8_utils.py +17 -1
  104. sglang/srt/layers/quantization/gptq.py +416 -0
  105. sglang/srt/layers/quantization/int8_kernel.py +327 -0
  106. sglang/srt/layers/quantization/int8_utils.py +73 -0
  107. sglang/srt/layers/quantization/modelopt_quant.py +18 -1
  108. sglang/srt/layers/radix_attention.py +1 -0
  109. sglang/srt/layers/rotary_embedding.py +0 -1
  110. sglang/srt/layers/sampler.py +76 -31
  111. sglang/srt/layers/vocab_parallel_embedding.py +14 -13
  112. sglang/srt/lora/lora.py +17 -1
  113. sglang/srt/lora/lora_config.py +5 -0
  114. sglang/srt/lora/lora_manager.py +1 -3
  115. sglang/srt/managers/cache_controller.py +193 -62
  116. sglang/srt/managers/configure_logging.py +2 -1
  117. sglang/srt/managers/data_parallel_controller.py +6 -2
  118. sglang/srt/managers/detokenizer_manager.py +124 -102
  119. sglang/srt/managers/image_processor.py +2 -1
  120. sglang/srt/managers/io_struct.py +143 -6
  121. sglang/srt/managers/schedule_batch.py +238 -197
  122. sglang/srt/managers/schedule_policy.py +29 -29
  123. sglang/srt/managers/scheduler.py +681 -259
  124. sglang/srt/managers/session_controller.py +6 -2
  125. sglang/srt/managers/tokenizer_manager.py +224 -68
  126. sglang/srt/managers/tp_worker.py +15 -4
  127. sglang/srt/managers/tp_worker_overlap_thread.py +3 -4
  128. sglang/srt/mem_cache/chunk_cache.py +18 -11
  129. sglang/srt/mem_cache/hiradix_cache.py +394 -0
  130. sglang/srt/mem_cache/memory_pool.py +44 -18
  131. sglang/srt/mem_cache/radix_cache.py +58 -47
  132. sglang/srt/metrics/collector.py +94 -36
  133. sglang/srt/model_executor/cuda_graph_runner.py +55 -24
  134. sglang/srt/model_executor/forward_batch_info.py +49 -16
  135. sglang/srt/model_executor/model_runner.py +209 -28
  136. sglang/srt/model_loader/loader.py +3 -3
  137. sglang/srt/model_loader/weight_utils.py +36 -14
  138. sglang/srt/models/baichuan.py +31 -6
  139. sglang/srt/models/chatglm.py +39 -7
  140. sglang/srt/models/commandr.py +29 -5
  141. sglang/srt/models/dbrx.py +31 -5
  142. sglang/srt/models/deepseek.py +43 -6
  143. sglang/srt/models/deepseek_nextn.py +32 -19
  144. sglang/srt/models/deepseek_v2.py +265 -29
  145. sglang/srt/models/exaone.py +19 -9
  146. sglang/srt/models/gemma.py +22 -8
  147. sglang/srt/models/gemma2.py +25 -12
  148. sglang/srt/models/gemma2_reward.py +5 -1
  149. sglang/srt/models/gpt2.py +28 -13
  150. sglang/srt/models/gpt_bigcode.py +27 -5
  151. sglang/srt/models/granite.py +21 -9
  152. sglang/srt/models/grok.py +21 -4
  153. sglang/srt/models/internlm2.py +36 -6
  154. sglang/srt/models/internlm2_reward.py +5 -1
  155. sglang/srt/models/llama.py +26 -9
  156. sglang/srt/models/llama_classification.py +5 -1
  157. sglang/srt/models/llama_eagle.py +17 -4
  158. sglang/srt/models/llama_embedding.py +5 -1
  159. sglang/srt/models/llama_reward.py +7 -2
  160. sglang/srt/models/llava.py +19 -3
  161. sglang/srt/models/llavavid.py +10 -1
  162. sglang/srt/models/minicpm.py +26 -2
  163. sglang/srt/models/minicpm3.py +39 -3
  164. sglang/srt/models/minicpmv.py +45 -14
  165. sglang/srt/models/mixtral.py +20 -9
  166. sglang/srt/models/mixtral_quant.py +50 -8
  167. sglang/srt/models/mllama.py +57 -11
  168. sglang/srt/models/olmo.py +34 -6
  169. sglang/srt/models/olmo2.py +34 -13
  170. sglang/srt/models/olmoe.py +26 -4
  171. sglang/srt/models/phi3_small.py +29 -10
  172. sglang/srt/models/qwen.py +26 -3
  173. sglang/srt/models/qwen2.py +26 -4
  174. sglang/srt/models/qwen2_5_vl.py +46 -8
  175. sglang/srt/models/qwen2_eagle.py +17 -5
  176. sglang/srt/models/qwen2_moe.py +44 -6
  177. sglang/srt/models/qwen2_rm.py +78 -0
  178. sglang/srt/models/qwen2_vl.py +39 -8
  179. sglang/srt/models/stablelm.py +32 -5
  180. sglang/srt/models/torch_native_llama.py +5 -2
  181. sglang/srt/models/xverse.py +21 -9
  182. sglang/srt/models/xverse_moe.py +45 -7
  183. sglang/srt/models/yivl.py +2 -1
  184. sglang/srt/openai_api/adapter.py +109 -24
  185. sglang/srt/openai_api/protocol.py +17 -1
  186. sglang/srt/reasoning_parser.py +154 -0
  187. sglang/srt/sampling/penaltylib/__init__.py +4 -6
  188. sglang/srt/sampling/penaltylib/frequency_penalty.py +66 -0
  189. sglang/srt/sampling/penaltylib/{penalizers/min_new_tokens.py → min_new_tokens.py} +15 -23
  190. sglang/srt/sampling/penaltylib/orchestrator.py +39 -188
  191. sglang/srt/sampling/penaltylib/presence_penalty.py +66 -0
  192. sglang/srt/sampling/sampling_batch_info.py +79 -157
  193. sglang/srt/sampling/sampling_params.py +16 -13
  194. sglang/srt/server_args.py +136 -52
  195. sglang/srt/speculative/build_eagle_tree.py +2 -8
  196. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +0 -1
  197. sglang/srt/speculative/eagle_utils.py +92 -58
  198. sglang/srt/speculative/eagle_worker.py +186 -94
  199. sglang/srt/speculative/spec_info.py +1 -13
  200. sglang/srt/utils.py +43 -17
  201. sglang/srt/warmup.py +47 -0
  202. sglang/test/few_shot_gsm8k.py +4 -1
  203. sglang/test/runners.py +389 -126
  204. sglang/test/send_one.py +88 -0
  205. sglang/test/test_block_fp8_ep.py +361 -0
  206. sglang/test/test_programs.py +1 -1
  207. sglang/test/test_utils.py +138 -84
  208. sglang/utils.py +50 -60
  209. sglang/version.py +1 -1
  210. {sglang-0.4.3.post1.dist-info → sglang-0.4.3.post3.dist-info}/METADATA +21 -15
  211. {sglang-0.4.3.post1.dist-info → sglang-0.4.3.post3.dist-info}/RECORD +214 -166
  212. {sglang-0.4.3.post1.dist-info → sglang-0.4.3.post3.dist-info}/WHEEL +1 -1
  213. sglang/bench_latency.py +0 -1
  214. sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +0 -75
  215. sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +0 -74
  216. sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +0 -85
  217. sglang/test/srt/sampling/penaltylib/utils.py +0 -344
  218. {sglang-0.4.3.post1.dist-info → sglang-0.4.3.post3.dist-info}/LICENSE +0 -0
  219. {sglang-0.4.3.post1.dist-info → sglang-0.4.3.post3.dist-info}/top_level.txt +0 -0
sglang/srt/server_args.py CHANGED
@@ -23,6 +23,7 @@ from typing import List, Optional
 import torch
 
 from sglang.srt.hf_transformers_utils import check_gguf_file
+from sglang.srt.reasoning_parser import ReasoningParser
 from sglang.srt.utils import (
     get_amdgpu_memory_capacity,
     get_hpu_memory_capacity,
@@ -43,19 +44,19 @@ class ServerArgs:
     model_path: str
     tokenizer_path: Optional[str] = None
     tokenizer_mode: str = "auto"
+    skip_tokenizer_init: bool = False
     load_format: str = "auto"
-    trust_remote_code: bool = True
+    trust_remote_code: bool = False
     dtype: str = "auto"
     kv_cache_dtype: str = "auto"
-    quantization_param_path: nullable_str = None
     quantization: Optional[str] = None
+    quantization_param_path: nullable_str = None
     context_length: Optional[int] = None
     device: str = "cuda"
     served_model_name: Optional[str] = None
     chat_template: Optional[str] = None
     is_embedding: bool = False
     revision: Optional[str] = None
-    skip_tokenizer_init: bool = False
 
     # Port for the HTTP server
     host: str = "127.0.0.1"
@@ -67,7 +68,7 @@ class ServerArgs:
     max_total_tokens: Optional[int] = None
     chunked_prefill_size: Optional[int] = None
     max_prefill_tokens: int = 16384
-    schedule_policy: str = "lpm"
+    schedule_policy: str = "fcfs"
     schedule_conservativeness: float = 1.0
     cpu_offload_gb: int = 0
     prefill_only_one_req: bool = False
@@ -79,21 +80,25 @@
     random_seed: Optional[int] = None
     constrained_json_whitespace_pattern: Optional[str] = None
     watchdog_timeout: float = 300
+    dist_timeout: Optional[int] = None  # timeout for torch.distributed
     download_dir: Optional[str] = None
     base_gpu_id: int = 0
+    gpu_id_step: int = 1
 
     # Logging
     log_level: str = "info"
     log_level_http: Optional[str] = None
     log_requests: bool = False
+    log_requests_level: int = 0
     show_time_cost: bool = False
     enable_metrics: bool = False
     decode_log_interval: int = 40
 
     # API related
     api_key: Optional[str] = None
-    file_storage_pth: str = "sglang_storage"
+    file_storage_path: str = "sglang_storage"
     enable_cache_report: bool = False
+    reasoning_parser: Optional[str] = None
 
     # Data parallelism
     dp_size: int = 1
@@ -121,11 +126,14 @@
     grammar_backend: Optional[str] = "outlines"
 
     # Speculative decoding
-    speculative_draft_model_path: Optional[str] = None
     speculative_algorithm: Optional[str] = None
+    speculative_draft_model_path: Optional[str] = None
     speculative_num_steps: int = 5
-    speculative_num_draft_tokens: int = 64
-    speculative_eagle_topk: int = 8
+    speculative_eagle_topk: int = 4
+    speculative_num_draft_tokens: int = 8
+    speculative_accept_threshold_single: float = 1.0
+    speculative_accept_threshold_acc: float = 1.0
+    speculative_token_map: Optional[str] = None
 
     # Double Sparsity
     enable_double_sparsity: bool = False
@@ -137,7 +145,6 @@
 
     # Optimization/debug options
     disable_radix_cache: bool = False
-    disable_jump_forward: bool = False
     disable_cuda_graph: bool = False
     disable_cuda_graph_padding: bool = False
     enable_nccl_nvls: bool = False
@@ -161,14 +168,17 @@
     delete_ckpt_after_loading: bool = False
     enable_memory_saver: bool = False
     allow_auto_truncate: bool = False
-    return_hidden_states: bool = False
-
-    # Custom logit processor
     enable_custom_logit_processor: bool = False
     tool_call_parser: str = None
     enable_hierarchical_cache: bool = False
-
     enable_flashinfer_mla: bool = False
+    flashinfer_mla_disable_ragged: bool = False
+    warmups: Optional[str] = None
+
+    # Debug tensor dumps
+    debug_tensor_dump_output_folder: Optional[str] = None
+    debug_tensor_dump_input_file: Optional[str] = None
+    debug_tensor_dump_inject: bool = False
 
     def __post_init__(self):
         # Set missing default values
@@ -262,18 +272,24 @@
             )
 
         # Speculative Decoding
-        if (
-            self.speculative_algorithm == "EAGLE"
-            or self.speculative_algorithm == "NEXTN"
-        ):
+        if self.speculative_algorithm == "NEXTN":
+            # NEXTN shares the same implementation of EAGLE
+            self.speculative_algorithm = "EAGLE"
+
+        if self.speculative_algorithm == "EAGLE":
+            self.disable_overlap_schedule = True
             self.prefill_only_one_req = True
             self.disable_cuda_graph_padding = True
-            self.disable_radix_cache = True
-            self.disable_overlap_schedule = True
-            self.chunked_prefill_size = -1
+            if self.max_running_requests is None:
+                self.max_running_requests = 32
             logger.info(
-                f"The radix cache, chunked prefill, and overlap scheduler are disabled because of using {self.speculative_algorithm} speculative decoding."
+                "Overlap scheduler are disabled because of using "
+                "eagle speculative decoding."
+                "Max running request set to 32 because of using eagle speculative decoding."
             )
+            # The token generated from the verify step is counted.
+            # If sepculative_num_steps >= speculative_num_draft_tokens, the additional tokens will definitely be discarded.
+            assert self.speculative_num_steps < self.speculative_num_draft_tokens
 
         # GGUF
         if (
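Note on the hunk above: the new assert encodes a simple budget rule. The token emitted by the verify step is counted against speculative_num_draft_tokens, so any configuration with speculative_num_steps >= speculative_num_draft_tokens would draft tokens that are always discarded. A standalone sketch of the same check (not sglang code):

def check_spec_args(num_steps: int, num_draft_tokens: int) -> None:
    # The verify step's own token counts against the draft-token budget,
    # so num_steps must stay strictly below num_draft_tokens.
    if num_steps >= num_draft_tokens:
        raise ValueError(
            "speculative_num_steps must be < speculative_num_draft_tokens"
        )

check_spec_args(5, 8)    # the new defaults (5 steps, 8 draft tokens) pass
# check_spec_args(5, 4)  # would raise, mirroring the assert in __post_init__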
@@ -377,15 +393,6 @@
             choices=["auto", "fp8_e5m2", "fp8_e4m3"],
             help='Data type for kv cache storage. "auto" will use model data type. "fp8_e5m2" and "fp8_e4m3" is supported for CUDA 11.8+.',
         )
-        parser.add_argument(
-            "--quantization-param-path",
-            type=nullable_str,
-            default=None,
-            help="Path to the JSON file containing the KV cache "
-            "scaling factors. This should generally be supplied, when "
-            "KV cache dtype is FP8. Otherwise, KV cache scaling factors "
-            "default to 1.0, which may cause accuracy issues. ",
-        )
         parser.add_argument(
             "--quantization",
             type=str,
@@ -404,6 +411,15 @@
             ],
             help="The quantization method.",
         )
+        parser.add_argument(
+            "--quantization-param-path",
+            type=nullable_str,
+            default=None,
+            help="Path to the JSON file containing the KV cache "
+            "scaling factors. This should generally be supplied, when "
+            "KV cache dtype is FP8. Otherwise, KV cache scaling factors "
+            "default to 1.0, which may cause accuracy issues. ",
+        )
         parser.add_argument(
             "--context-length",
             type=int,
@@ -537,11 +553,17 @@
             default=ServerArgs.watchdog_timeout,
             help="Set watchdog timeout in seconds. If a forward batch takes longer than this, the server will crash to prevent hanging.",
         )
+        parser.add_argument(
+            "--dist-timeout",
+            type=int,
+            default=ServerArgs.dist_timeout,
+            help="Set timeout for torch.distributed initialization.",
+        )
         parser.add_argument(
             "--download-dir",
             type=str,
             default=ServerArgs.download_dir,
-            help="Model download directory.",
+            help="Model download directory for huggingface.",
         )
         parser.add_argument(
             "--base-gpu-id",
@@ -549,6 +571,12 @@
             default=ServerArgs.base_gpu_id,
             help="The base GPU ID to start allocating GPUs from. Useful when running multiple instances on the same machine.",
         )
+        parser.add_argument(
+            "--gpu-id-step",
+            type=int,
+            default=ServerArgs.gpu_id_step,
+            help="The delta between consecutive GPU IDs that are used. For example, setting it to 2 will use GPU 0,2,4,...",
+        )
 
         # Logging
         parser.add_argument(
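Note on --gpu-id-step above: base_gpu_id and the step presumably combine as an arithmetic progression over device indices. A hypothetical illustration (the real mapping lives in the scheduler and process-launch code, not shown in this hunk):

base_gpu_id, gpu_id_step, num_workers = 0, 2, 4
gpu_ids = [base_gpu_id + i * gpu_id_step for i in range(num_workers)]
print(gpu_ids)  # [0, 2, 4, 6], matching the "GPU 0,2,4,..." example in the help text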
@@ -566,7 +594,14 @@
         parser.add_argument(
             "--log-requests",
             action="store_true",
-            help="Log the inputs and outputs of all requests.",
+            help="Log metadata, inputs, outputs of all requests. The verbosity is decided by --log-requests-level",
+        )
+        parser.add_argument(
+            "--log-requests-level",
+            type=int,
+            default=0,
+            help="0: Log metadata. 1. Log metadata and partial input/output. 2. Log every input/output.",
+            choices=[0, 1, 2],
         )
         parser.add_argument(
             "--show-time-cost",
@@ -593,9 +628,9 @@
             help="Set API key of the server. It is also used in the OpenAI API compatible server.",
         )
         parser.add_argument(
-            "--file-storage-pth",
+            "--file-storage-path",
             type=str,
-            default=ServerArgs.file_storage_pth,
+            default=ServerArgs.file_storage_path,
             help="The path of the file storage in backend.",
         )
         parser.add_argument(
@@ -603,6 +638,13 @@
             action="store_true",
             help="Return number of cached tokens in usage.prompt_tokens_details for each openai request.",
         )
+        parser.add_argument(
+            "--reasoning-parser",
+            type=str,
+            choices=list(ReasoningParser.DetectorMap.keys()),
+            default=ServerArgs.reasoning_parser,
+            help=f"Specify the parser for reasoning models, supported parsers are: {list(ReasoningParser.DetectorMap.keys())}.",
+        )
 
         # Data parallelism
         parser.add_argument(
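Note on --reasoning-parser above: the valid choices come from ReasoningParser.DetectorMap in the newly added sglang/srt/reasoning_parser.py, whose keys are not visible in this diff. Conceptually, a reasoning parser separates a model's reasoning trace from its final answer; a hedged, self-contained sketch of that idea (the tag names are assumptions, not sglang's implementation):

def split_reasoning(text: str, open_tag: str = "<think>", close_tag: str = "</think>"):
    # Split "<think>...</think>answer" style output into (reasoning, answer).
    if open_tag in text and close_tag in text:
        start = text.index(open_tag) + len(open_tag)
        end = text.index(close_tag)
        return text[start:end].strip(), text[end + len(close_tag):].strip()
    return "", text.strip()

reasoning, answer = split_reasoning("<think>2 + 2 = 4</think>The answer is 4.")
print(reasoning, "|", answer)  # 2 + 2 = 4 | The answer is 4.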
@@ -694,7 +736,7 @@
         parser.add_argument(
             "--grammar-backend",
             type=str,
-            choices=["xgrammar", "outlines"],
+            choices=["xgrammar", "outlines", "llguidance"],
            default=ServerArgs.grammar_backend,
             help="Choose the backend for grammar-guided decoding.",
         )
@@ -703,6 +745,11 @@
             action="store_true",
             help="Enable FlashInfer MLA optimization",
         )
+        parser.add_argument(
+            "--flashinfer-mla-disable-ragged",
+            action="store_true",
+            help="Not using ragged prefill wrapper when running flashinfer mla",
+        )
 
         # Speculative decoding
         parser.add_argument(
@@ -722,18 +769,36 @@
             help="The number of steps sampled from draft model in Speculative Decoding.",
             default=ServerArgs.speculative_num_steps,
         )
+        parser.add_argument(
+            "--speculative-eagle-topk",
+            type=int,
+            help="The number of tokens sampled from the draft model in eagle2 each step.",
+            choices=[1, 2, 4, 8],
+            default=ServerArgs.speculative_eagle_topk,
+        )
         parser.add_argument(
             "--speculative-num-draft-tokens",
             type=int,
-            help="The number of token sampled from draft model in Speculative Decoding.",
+            help="The number of tokens sampled from the draft model in Speculative Decoding.",
             default=ServerArgs.speculative_num_draft_tokens,
         )
         parser.add_argument(
-            "--speculative-eagle-topk",
-            type=int,
-            help="The number of token sampled from draft model in eagle2 each step.",
-            choices=[1, 2, 4, 8],
-            default=ServerArgs.speculative_eagle_topk,
+            "--speculative-accept-threshold-single",
+            type=float,
+            help="Accept a draft token if its probability in the target model is greater than this threshold.",
+            default=ServerArgs.speculative_accept_threshold_single,
+        )
+        parser.add_argument(
+            "--speculative-accept-threshold-acc",
+            type=float,
+            help="The accept probability of a draft token is raised from its target probability p to min(1, p / threshold_acc).",
+            default=ServerArgs.speculative_accept_threshold_acc,
+        )
+        parser.add_argument(
+            "--speculative-token-map",
+            type=str,
+            help="The path of the draft model's small vocab table.",
+            default=ServerArgs.speculative_token_map,
         )
 
         # Double Sparsity
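Note on the two acceptance thresholds above: the following is one plausible reading of their help strings; how the two checks compose is an assumption on our part, not taken from sglang internals. The 1.0 defaults make both knobs a no-op:

def accept_prob(p_target: float, threshold_single: float = 1.0, threshold_acc: float = 1.0) -> float:
    # A draft token whose target-model probability exceeds threshold_single
    # is accepted outright; otherwise its accept probability is raised from
    # p to min(1, p / threshold_acc), per the help strings above.
    if p_target > threshold_single:
        return 1.0
    return min(1.0, p_target / threshold_acc)

print(accept_prob(0.6))                     # 0.6: defaults leave p unchanged
print(accept_prob(0.6, threshold_acc=0.8))  # 0.75: more lenient acceptance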
@@ -779,11 +844,6 @@
             action="store_true",
             help="Disable RadixAttention for prefix caching.",
         )
-        parser.add_argument(
-            "--disable-jump-forward",
-            action="store_true",
-            help="Disable jump-forward for grammar-guided decoding.",
-        )
         parser.add_argument(
             "--disable-cuda-graph",
             action="store_true",
@@ -913,12 +973,6 @@
             action="store_true",
             help="Enable users to pass custom logit processors to the server (disabled by default for security)",
         )
-        parser.add_argument(
-            "--return-hidden-states",
-            action="store_true",
-            help="Return hidden states in the response.",
-        )
-        # Function Calling
         parser.add_argument(
             "--tool-call-parser",
             type=str,
@@ -932,6 +986,35 @@
             help="Enable hierarchical cache",
         )
 
+        # Server warmups
+        parser.add_argument(
+            "--warmups",
+            type=str,
+            required=False,
+            help="Specify custom warmup functions (csv) to run before server starts eg. --warmups=warmup_name1,warmup_name2 "
+            "will run the functions `warmup_name1` and `warmup_name2` specified in warmup.py before the server starts listening for requests",
+        )
+
+        # Debug tensor dumps
+        parser.add_argument(
+            "--debug-tensor-dump-output-folder",
+            type=str,
+            default=ServerArgs.debug_tensor_dump_output_folder,
+            help="The output folder for dumping tensors.",
+        )
+        parser.add_argument(
+            "--debug-tensor-dump-input-file",
+            type=str,
+            default=ServerArgs.debug_tensor_dump_input_file,
+            help="The input filename for dumping tensors",
+        )
+        parser.add_argument(
+            "--debug-tensor-dump-inject",
+            type=str,
+            default=ServerArgs.debug_tensor_dump_inject,
+            help="Inject the outputs from jax as the input of every layer.",
+        )
+
     @classmethod
     def from_cli_args(cls, args: argparse.Namespace):
         args.tp_size = args.tensor_parallel_size
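Note: all of the new flags flow through add_cli_args/from_cli_args, so they can be exercised end to end from Python. A minimal sketch (hypothetical model path and warmup name; from_cli_args triggers __post_init__, which assumes a visible GPU):

import argparse

from sglang.srt.server_args import ServerArgs

parser = argparse.ArgumentParser()
ServerArgs.add_cli_args(parser)
args = parser.parse_args(
    [
        "--model-path", "my-org/my-model",  # hypothetical
        "--warmups", "warmup_name1",  # must name a function defined in warmup.py
        "--debug-tensor-dump-output-folder", "/tmp/dumps",
    ]
)
server_args = ServerArgs.from_cli_args(args)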
@@ -960,6 +1043,7 @@
             and (self.lora_paths is None or self.disable_radix_cache)
         ), "compatibility of lora and cuda graph and radix attention is in progress"
         assert self.base_gpu_id >= 0, "base_gpu_id must be non-negative"
+        assert self.gpu_id_step >= 1, "gpu_id_step must be positive"
 
         if isinstance(self.lora_paths, list):
             lora_paths = self.lora_paths
sglang/srt/speculative/build_eagle_tree.py CHANGED
@@ -3,14 +3,8 @@
 from typing import List
 
 import torch
-
-from sglang.srt.utils import is_cuda_available
-
-if is_cuda_available():
-    from sgl_kernel import build_tree_kernel as sgl_build_tree_kernel
-    from sgl_kernel import (
-        build_tree_kernel_efficient as sgl_build_tree_kernel_efficient,
-    )
+from sgl_kernel import build_tree_kernel as sgl_build_tree_kernel
+from sgl_kernel import build_tree_kernel_efficient as sgl_build_tree_kernel_efficient
 
 
 def build_tree_kernel_efficient_preprocess(
sglang/srt/speculative/eagle_draft_cuda_graph_runner.py CHANGED
@@ -21,7 +21,6 @@ from sglang.srt.model_executor.forward_batch_info import (
 from sglang.srt.speculative.eagle_utils import EagleDraftInput
 
 if TYPE_CHECKING:
-    from sglang.srt.model_executor.model_runner import ModelRunner
     from sglang.srt.speculative.eagle_worker import EAGLEWorker
 
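Note on the hunk above: the TYPE_CHECKING guard it trims is the standard pattern for type-only imports: the import is seen by static type checkers but skipped at runtime, avoiding import cycles. A minimal generic sketch:

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Evaluated only by type checkers; never imported at runtime.
    from sglang.srt.speculative.eagle_worker import EAGLEWorker

def run_draft(worker: "EAGLEWorker") -> None:
    # The quoted annotation defers resolution, so runtime never needs the import.
    ...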