sglang 0.4.3.post2__py3-none-any.whl → 0.4.3.post4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (205)
  1. sglang/api.py +1 -1
  2. sglang/bench_offline_throughput.py +19 -0
  3. sglang/bench_one_batch.py +2 -2
  4. sglang/bench_serving.py +123 -79
  5. sglang/global_config.py +8 -3
  6. sglang/lang/backend/runtime_endpoint.py +1 -1
  7. sglang/lang/ir.py +1 -1
  8. sglang/srt/_custom_ops.py +83 -91
  9. sglang/srt/configs/load_config.py +4 -1
  10. sglang/srt/configs/model_config.py +48 -2
  11. sglang/srt/configs/qwen2_5_vl_config.py +5 -2
  12. sglang/srt/constrained/base_grammar_backend.py +117 -15
  13. sglang/srt/constrained/llguidance_backend.py +151 -0
  14. sglang/srt/constrained/outlines_backend.py +24 -33
  15. sglang/srt/constrained/xgrammar_backend.py +69 -38
  16. sglang/srt/distributed/device_communicators/custom_all_reduce.py +225 -80
  17. sglang/srt/distributed/parallel_state.py +48 -3
  18. sglang/srt/entrypoints/engine.py +67 -9
  19. sglang/srt/entrypoints/http_server.py +190 -41
  20. sglang/srt/entrypoints/verl_engine.py +147 -0
  21. sglang/srt/function_call_parser.py +0 -1
  22. sglang/srt/layers/activation.py +11 -0
  23. sglang/srt/layers/attention/{__init__.py → base_attn_backend.py} +14 -6
  24. sglang/srt/layers/attention/double_sparsity_backend.py +1 -1
  25. sglang/srt/layers/attention/flashinfer_backend.py +302 -414
  26. sglang/srt/layers/attention/flashinfer_mla_backend.py +582 -0
  27. sglang/srt/layers/attention/torch_native_backend.py +1 -1
  28. sglang/srt/layers/attention/triton_backend.py +13 -8
  29. sglang/srt/layers/attention/triton_ops/decode_attention.py +3 -0
  30. sglang/srt/layers/attention/triton_ops/extend_attention.py +20 -4
  31. sglang/srt/layers/attention/triton_ops/rocm_mla_decode_rope.py +439 -0
  32. sglang/srt/layers/attention/utils.py +39 -0
  33. sglang/srt/layers/attention/vision.py +60 -63
  34. sglang/srt/layers/dp_attention.py +142 -1
  35. sglang/srt/layers/layernorm.py +1 -1
  36. sglang/srt/layers/linear.py +3 -1
  37. sglang/srt/layers/logits_processor.py +281 -45
  38. sglang/srt/layers/moe/ep_moe/kernels.py +126 -8
  39. sglang/srt/layers/moe/ep_moe/layer.py +140 -28
  40. sglang/srt/layers/moe/fused_moe_native.py +2 -0
  41. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  42. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +50 -50
  43. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json +18 -18
  44. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X.json +18 -18
  45. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Radeon_Graphics.json +18 -18
  46. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json +18 -18
  47. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X.json +18 -18
  48. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Radeon_Graphics.json +18 -18
  49. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +18 -18
  50. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X.json +18 -18
  51. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Radeon_Graphics.json +18 -18
  52. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +16 -16
  53. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +16 -16
  54. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +16 -16
  55. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json +18 -18
  56. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X.json +18 -18
  57. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Radeon_Graphics.json +18 -18
  58. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +15 -15
  59. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +15 -15
  60. sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +15 -15
  61. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +88 -20
  62. sglang/srt/layers/moe/fused_moe_triton/layer.py +34 -13
  63. sglang/srt/layers/moe/topk.py +13 -4
  64. sglang/srt/layers/quantization/__init__.py +111 -7
  65. sglang/srt/layers/quantization/blockwise_int8.py +409 -0
  66. sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  67. sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  68. sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  69. sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  70. sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  71. sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  72. sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  73. sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  74. sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  75. sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  76. sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  77. sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  78. sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  79. sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  80. sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  81. sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  82. sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  83. sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  84. sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  85. sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  86. sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  87. sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
  88. sglang/srt/layers/quantization/fp8.py +69 -28
  89. sglang/srt/layers/quantization/fp8_utils.py +17 -1
  90. sglang/srt/layers/quantization/gptq.py +416 -0
  91. sglang/srt/layers/quantization/int8_kernel.py +327 -0
  92. sglang/srt/layers/quantization/int8_utils.py +73 -0
  93. sglang/srt/layers/quantization/modelopt_quant.py +18 -1
  94. sglang/srt/layers/radix_attention.py +1 -0
  95. sglang/srt/layers/rotary_embedding.py +0 -1
  96. sglang/srt/layers/sampler.py +76 -31
  97. sglang/srt/layers/vocab_parallel_embedding.py +14 -13
  98. sglang/srt/lora/lora.py +17 -1
  99. sglang/srt/lora/lora_config.py +5 -0
  100. sglang/srt/lora/lora_manager.py +1 -3
  101. sglang/srt/managers/cache_controller.py +193 -62
  102. sglang/srt/managers/configure_logging.py +2 -1
  103. sglang/srt/managers/data_parallel_controller.py +6 -2
  104. sglang/srt/managers/detokenizer_manager.py +124 -102
  105. sglang/srt/managers/image_processor.py +2 -1
  106. sglang/srt/managers/io_struct.py +144 -6
  107. sglang/srt/managers/schedule_batch.py +237 -197
  108. sglang/srt/managers/schedule_policy.py +29 -29
  109. sglang/srt/managers/scheduler.py +773 -334
  110. sglang/srt/managers/session_controller.py +6 -2
  111. sglang/srt/managers/tokenizer_manager.py +225 -68
  112. sglang/srt/managers/tp_worker.py +15 -4
  113. sglang/srt/managers/tp_worker_overlap_thread.py +3 -4
  114. sglang/srt/mem_cache/chunk_cache.py +18 -11
  115. sglang/srt/mem_cache/hiradix_cache.py +394 -0
  116. sglang/srt/mem_cache/memory_pool.py +68 -37
  117. sglang/srt/mem_cache/radix_cache.py +58 -47
  118. sglang/srt/metrics/collector.py +102 -36
  119. sglang/srt/model_executor/cuda_graph_runner.py +56 -31
  120. sglang/srt/model_executor/forward_batch_info.py +49 -16
  121. sglang/srt/model_executor/model_runner.py +280 -81
  122. sglang/srt/model_loader/loader.py +3 -3
  123. sglang/srt/model_loader/weight_utils.py +36 -14
  124. sglang/srt/models/baichuan.py +31 -6
  125. sglang/srt/models/chatglm.py +39 -7
  126. sglang/srt/models/commandr.py +29 -5
  127. sglang/srt/models/dbrx.py +31 -5
  128. sglang/srt/models/deepseek.py +43 -6
  129. sglang/srt/models/deepseek_nextn.py +32 -19
  130. sglang/srt/models/deepseek_v2.py +265 -32
  131. sglang/srt/models/exaone.py +19 -9
  132. sglang/srt/models/gemma.py +22 -8
  133. sglang/srt/models/gemma2.py +25 -12
  134. sglang/srt/models/gemma2_reward.py +5 -1
  135. sglang/srt/models/gpt2.py +28 -13
  136. sglang/srt/models/gpt_bigcode.py +27 -5
  137. sglang/srt/models/granite.py +21 -9
  138. sglang/srt/models/grok.py +21 -4
  139. sglang/srt/models/internlm2.py +36 -6
  140. sglang/srt/models/internlm2_reward.py +5 -1
  141. sglang/srt/models/llama.py +26 -9
  142. sglang/srt/models/llama_classification.py +5 -1
  143. sglang/srt/models/llama_eagle.py +17 -4
  144. sglang/srt/models/llama_embedding.py +5 -1
  145. sglang/srt/models/llama_reward.py +7 -2
  146. sglang/srt/models/llava.py +19 -3
  147. sglang/srt/models/llavavid.py +10 -1
  148. sglang/srt/models/minicpm.py +26 -2
  149. sglang/srt/models/minicpm3.py +39 -3
  150. sglang/srt/models/minicpmv.py +45 -14
  151. sglang/srt/models/mixtral.py +20 -9
  152. sglang/srt/models/mixtral_quant.py +50 -8
  153. sglang/srt/models/mllama.py +57 -11
  154. sglang/srt/models/olmo.py +34 -6
  155. sglang/srt/models/olmo2.py +34 -13
  156. sglang/srt/models/olmoe.py +26 -4
  157. sglang/srt/models/phi3_small.py +29 -10
  158. sglang/srt/models/qwen.py +26 -3
  159. sglang/srt/models/qwen2.py +26 -4
  160. sglang/srt/models/qwen2_5_vl.py +46 -8
  161. sglang/srt/models/qwen2_eagle.py +17 -5
  162. sglang/srt/models/qwen2_moe.py +44 -6
  163. sglang/srt/models/qwen2_rm.py +78 -0
  164. sglang/srt/models/qwen2_vl.py +39 -8
  165. sglang/srt/models/stablelm.py +32 -5
  166. sglang/srt/models/torch_native_llama.py +5 -2
  167. sglang/srt/models/xverse.py +21 -9
  168. sglang/srt/models/xverse_moe.py +45 -7
  169. sglang/srt/models/yivl.py +2 -1
  170. sglang/srt/openai_api/adapter.py +109 -24
  171. sglang/srt/openai_api/protocol.py +17 -1
  172. sglang/srt/reasoning_parser.py +154 -0
  173. sglang/srt/sampling/penaltylib/__init__.py +4 -6
  174. sglang/srt/sampling/penaltylib/frequency_penalty.py +66 -0
  175. sglang/srt/sampling/penaltylib/{penalizers/min_new_tokens.py → min_new_tokens.py} +15 -23
  176. sglang/srt/sampling/penaltylib/orchestrator.py +39 -188
  177. sglang/srt/sampling/penaltylib/presence_penalty.py +66 -0
  178. sglang/srt/sampling/sampling_batch_info.py +79 -157
  179. sglang/srt/sampling/sampling_params.py +16 -13
  180. sglang/srt/server_args.py +135 -60
  181. sglang/srt/speculative/build_eagle_tree.py +8 -9
  182. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +1 -12
  183. sglang/srt/speculative/eagle_utils.py +92 -57
  184. sglang/srt/speculative/eagle_worker.py +238 -111
  185. sglang/srt/speculative/spec_info.py +1 -13
  186. sglang/srt/utils.py +43 -17
  187. sglang/srt/warmup.py +47 -0
  188. sglang/test/few_shot_gsm8k.py +4 -1
  189. sglang/test/runners.py +389 -126
  190. sglang/test/send_one.py +88 -0
  191. sglang/test/test_block_fp8_ep.py +361 -0
  192. sglang/test/test_programs.py +1 -1
  193. sglang/test/test_utils.py +138 -84
  194. sglang/utils.py +50 -60
  195. sglang/version.py +1 -1
  196. {sglang-0.4.3.post2.dist-info → sglang-0.4.3.post4.dist-info}/METADATA +22 -15
  197. {sglang-0.4.3.post2.dist-info → sglang-0.4.3.post4.dist-info}/RECORD +200 -166
  198. {sglang-0.4.3.post2.dist-info → sglang-0.4.3.post4.dist-info}/WHEEL +1 -1
  199. sglang/bench_latency.py +0 -1
  200. sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +0 -75
  201. sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +0 -74
  202. sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +0 -85
  203. sglang/test/srt/sampling/penaltylib/utils.py +0 -344
  204. {sglang-0.4.3.post2.dist-info → sglang-0.4.3.post4.dist-info}/LICENSE +0 -0
  205. {sglang-0.4.3.post2.dist-info → sglang-0.4.3.post4.dist-info}/top_level.txt +0 -0
sglang/srt/server_args.py CHANGED
@@ -23,6 +23,7 @@ from typing import List, Optional
 import torch
 
 from sglang.srt.hf_transformers_utils import check_gguf_file
+from sglang.srt.reasoning_parser import ReasoningParser
 from sglang.srt.utils import (
     get_amdgpu_memory_capacity,
     get_hpu_memory_capacity,
@@ -43,19 +44,19 @@ class ServerArgs:
     model_path: str
     tokenizer_path: Optional[str] = None
     tokenizer_mode: str = "auto"
+    skip_tokenizer_init: bool = False
     load_format: str = "auto"
-    trust_remote_code: bool = True
+    trust_remote_code: bool = False
     dtype: str = "auto"
     kv_cache_dtype: str = "auto"
-    quantization_param_path: nullable_str = None
     quantization: Optional[str] = None
+    quantization_param_path: nullable_str = None
     context_length: Optional[int] = None
     device: str = "cuda"
     served_model_name: Optional[str] = None
     chat_template: Optional[str] = None
     is_embedding: bool = False
     revision: Optional[str] = None
-    skip_tokenizer_init: bool = False
 
     # Port for the HTTP server
     host: str = "127.0.0.1"
@@ -67,10 +68,9 @@ class ServerArgs:
     max_total_tokens: Optional[int] = None
     chunked_prefill_size: Optional[int] = None
     max_prefill_tokens: int = 16384
-    schedule_policy: str = "lpm"
+    schedule_policy: str = "fcfs"
     schedule_conservativeness: float = 1.0
     cpu_offload_gb: int = 0
-    prefill_only_one_req: bool = False
 
     # Other runtime options
     tp_size: int = 1
@@ -79,21 +79,25 @@ class ServerArgs:
     random_seed: Optional[int] = None
     constrained_json_whitespace_pattern: Optional[str] = None
     watchdog_timeout: float = 300
+    dist_timeout: Optional[int] = None  # timeout for torch.distributed
     download_dir: Optional[str] = None
     base_gpu_id: int = 0
+    gpu_id_step: int = 1
 
     # Logging
     log_level: str = "info"
     log_level_http: Optional[str] = None
     log_requests: bool = False
+    log_requests_level: int = 0
     show_time_cost: bool = False
     enable_metrics: bool = False
     decode_log_interval: int = 40
 
     # API related
     api_key: Optional[str] = None
-    file_storage_pth: str = "sglang_storage"
+    file_storage_path: str = "sglang_storage"
     enable_cache_report: bool = False
+    reasoning_parser: Optional[str] = None
 
     # Data parallelism
     dp_size: int = 1
@@ -121,11 +125,14 @@ class ServerArgs:
     grammar_backend: Optional[str] = "outlines"
 
     # Speculative decoding
-    speculative_draft_model_path: Optional[str] = None
     speculative_algorithm: Optional[str] = None
+    speculative_draft_model_path: Optional[str] = None
     speculative_num_steps: int = 5
-    speculative_num_draft_tokens: int = 64
-    speculative_eagle_topk: int = 8
+    speculative_eagle_topk: int = 4
+    speculative_num_draft_tokens: int = 8
+    speculative_accept_threshold_single: float = 1.0
+    speculative_accept_threshold_acc: float = 1.0
+    speculative_token_map: Optional[str] = None
 
     # Double Sparsity
     enable_double_sparsity: bool = False
@@ -137,7 +144,6 @@ class ServerArgs:
 
     # Optimization/debug options
     disable_radix_cache: bool = False
-    disable_jump_forward: bool = False
     disable_cuda_graph: bool = False
     disable_cuda_graph_padding: bool = False
     enable_nccl_nvls: bool = False
@@ -161,14 +167,17 @@ class ServerArgs:
     delete_ckpt_after_loading: bool = False
     enable_memory_saver: bool = False
     allow_auto_truncate: bool = False
-    return_hidden_states: bool = False
-
-    # Custom logit processor
     enable_custom_logit_processor: bool = False
     tool_call_parser: str = None
     enable_hierarchical_cache: bool = False
-
     enable_flashinfer_mla: bool = False
+    flashinfer_mla_disable_ragged: bool = False
+    warmups: Optional[str] = None
+
+    # Debug tensor dumps
+    debug_tensor_dump_output_folder: Optional[str] = None
+    debug_tensor_dump_input_file: Optional[str] = None
+    debug_tensor_dump_inject: bool = False
 
     def __post_init__(self):
         # Set missing default values
@@ -262,18 +271,22 @@ class ServerArgs:
             )
 
         # Speculative Decoding
-        if (
-            self.speculative_algorithm == "EAGLE"
-            or self.speculative_algorithm == "NEXTN"
-        ):
-            self.prefill_only_one_req = True
-            self.disable_cuda_graph_padding = True
-            self.disable_radix_cache = True
+        if self.speculative_algorithm == "NEXTN":
+            # NEXTN shares the same implementation of EAGLE
+            self.speculative_algorithm = "EAGLE"
+
+        if self.speculative_algorithm == "EAGLE":
+            if self.max_running_requests is None:
+                self.max_running_requests = 32
             self.disable_overlap_schedule = True
-            self.chunked_prefill_size = -1
+            self.disable_cuda_graph_padding = True
             logger.info(
-                f"The radix cache, chunked prefill, and overlap scheduler are disabled because of using {self.speculative_algorithm} speculative decoding."
+                "Overlap scheduler are disabled because of using "
+                "eagle speculative decoding."
             )
+            # The token generated from the verify step is counted.
+            # If sepculative_num_steps >= speculative_num_draft_tokens, the additional tokens will definitely be discarded.
+            # assert self.speculative_num_steps < self.speculative_num_draft_tokens
 
         # GGUF
         if (
@@ -377,15 +390,6 @@ class ServerArgs:
             choices=["auto", "fp8_e5m2", "fp8_e4m3"],
             help='Data type for kv cache storage. "auto" will use model data type. "fp8_e5m2" and "fp8_e4m3" is supported for CUDA 11.8+.',
         )
-        parser.add_argument(
-            "--quantization-param-path",
-            type=nullable_str,
-            default=None,
-            help="Path to the JSON file containing the KV cache "
-            "scaling factors. This should generally be supplied, when "
-            "KV cache dtype is FP8. Otherwise, KV cache scaling factors "
-            "default to 1.0, which may cause accuracy issues. ",
-        )
         parser.add_argument(
             "--quantization",
             type=str,
@@ -404,6 +408,15 @@ class ServerArgs:
             ],
             help="The quantization method.",
         )
+        parser.add_argument(
+            "--quantization-param-path",
+            type=nullable_str,
+            default=None,
+            help="Path to the JSON file containing the KV cache "
+            "scaling factors. This should generally be supplied, when "
+            "KV cache dtype is FP8. Otherwise, KV cache scaling factors "
+            "default to 1.0, which may cause accuracy issues. ",
+        )
         parser.add_argument(
             "--context-length",
             type=int,
@@ -493,12 +506,6 @@ class ServerArgs:
             default=ServerArgs.cpu_offload_gb,
             help="How many GBs of RAM to reserve for CPU offloading",
         )
-        parser.add_argument(
-            "--prefill-only-one-req",
-            type=bool,
-            help="If true, we only prefill one request at one prefill batch",
-            default=ServerArgs.prefill_only_one_req,
-        )
 
         # Other runtime options
         parser.add_argument(
@@ -537,11 +544,17 @@ class ServerArgs:
             default=ServerArgs.watchdog_timeout,
             help="Set watchdog timeout in seconds. If a forward batch takes longer than this, the server will crash to prevent hanging.",
         )
+        parser.add_argument(
+            "--dist-timeout",
+            type=int,
+            default=ServerArgs.dist_timeout,
+            help="Set timeout for torch.distributed initialization.",
+        )
         parser.add_argument(
             "--download-dir",
             type=str,
             default=ServerArgs.download_dir,
-            help="Model download directory.",
+            help="Model download directory for huggingface.",
         )
         parser.add_argument(
             "--base-gpu-id",
@@ -549,6 +562,12 @@ class ServerArgs:
             default=ServerArgs.base_gpu_id,
             help="The base GPU ID to start allocating GPUs from. Useful when running multiple instances on the same machine.",
         )
+        parser.add_argument(
+            "--gpu-id-step",
+            type=int,
+            default=ServerArgs.gpu_id_step,
+            help="The delta between consecutive GPU IDs that are used. For example, setting it to 2 will use GPU 0,2,4,...",
+        )
 
         # Logging
         parser.add_argument(
@@ -566,7 +585,14 @@ class ServerArgs:
         parser.add_argument(
             "--log-requests",
             action="store_true",
-            help="Log the inputs and outputs of all requests.",
+            help="Log metadata, inputs, outputs of all requests. The verbosity is decided by --log-requests-level",
+        )
+        parser.add_argument(
+            "--log-requests-level",
+            type=int,
+            default=0,
+            help="0: Log metadata. 1. Log metadata and partial input/output. 2. Log every input/output.",
+            choices=[0, 1, 2],
         )
         parser.add_argument(
             "--show-time-cost",
@@ -593,9 +619,9 @@ class ServerArgs:
             help="Set API key of the server. It is also used in the OpenAI API compatible server.",
         )
         parser.add_argument(
-            "--file-storage-pth",
+            "--file-storage-path",
             type=str,
-            default=ServerArgs.file_storage_pth,
+            default=ServerArgs.file_storage_path,
             help="The path of the file storage in backend.",
         )
         parser.add_argument(
@@ -603,6 +629,13 @@ class ServerArgs:
             action="store_true",
             help="Return number of cached tokens in usage.prompt_tokens_details for each openai request.",
         )
+        parser.add_argument(
+            "--reasoning-parser",
+            type=str,
+            choices=list(ReasoningParser.DetectorMap.keys()),
+            default=ServerArgs.reasoning_parser,
+            help=f"Specify the parser for reasoning models, supported parsers are: {list(ReasoningParser.DetectorMap.keys())}.",
+        )
 
         # Data parallelism
         parser.add_argument(
@@ -694,7 +727,7 @@ class ServerArgs:
         parser.add_argument(
             "--grammar-backend",
             type=str,
-            choices=["xgrammar", "outlines"],
+            choices=["xgrammar", "outlines", "llguidance"],
             default=ServerArgs.grammar_backend,
             help="Choose the backend for grammar-guided decoding.",
         )
@@ -703,6 +736,11 @@ class ServerArgs:
             action="store_true",
             help="Enable FlashInfer MLA optimization",
         )
+        parser.add_argument(
+            "--flashinfer-mla-disable-ragged",
+            action="store_true",
+            help="Not using ragged prefill wrapper when running flashinfer mla",
+        )
 
         # Speculative decoding
         parser.add_argument(
@@ -722,18 +760,36 @@ class ServerArgs:
             help="The number of steps sampled from draft model in Speculative Decoding.",
             default=ServerArgs.speculative_num_steps,
         )
+        parser.add_argument(
+            "--speculative-eagle-topk",
+            type=int,
+            help="The number of tokens sampled from the draft model in eagle2 each step.",
+            choices=[1, 2, 4, 8],
+            default=ServerArgs.speculative_eagle_topk,
+        )
         parser.add_argument(
             "--speculative-num-draft-tokens",
             type=int,
-            help="The number of token sampled from draft model in Speculative Decoding.",
+            help="The number of tokens sampled from the draft model in Speculative Decoding.",
             default=ServerArgs.speculative_num_draft_tokens,
         )
         parser.add_argument(
-            "--speculative-eagle-topk",
-            type=int,
-            help="The number of token sampled from draft model in eagle2 each step.",
-            choices=[1, 2, 4, 8],
-            default=ServerArgs.speculative_eagle_topk,
+            "--speculative-accept-threshold-single",
+            type=float,
+            help="Accept a draft token if its probability in the target model is greater than this threshold.",
+            default=ServerArgs.speculative_accept_threshold_single,
+        )
+        parser.add_argument(
+            "--speculative-accept-threshold-acc",
+            type=float,
+            help="The accept probability of a draft token is raised from its target probability p to min(1, p / threshold_acc).",
+            default=ServerArgs.speculative_accept_threshold_acc,
+        )
+        parser.add_argument(
+            "--speculative-token-map",
+            type=str,
+            help="The path of the draft model's small vocab table.",
+            default=ServerArgs.speculative_token_map,
         )
 
         # Double Sparsity
@@ -779,11 +835,6 @@ class ServerArgs:
             action="store_true",
             help="Disable RadixAttention for prefix caching.",
         )
-        parser.add_argument(
-            "--disable-jump-forward",
-            action="store_true",
-            help="Disable jump-forward for grammar-guided decoding.",
-        )
         parser.add_argument(
             "--disable-cuda-graph",
             action="store_true",
@@ -913,12 +964,6 @@ class ServerArgs:
             action="store_true",
             help="Enable users to pass custom logit processors to the server (disabled by default for security)",
         )
-        parser.add_argument(
-            "--return-hidden-states",
-            action="store_true",
-            help="Return hidden states in the response.",
-        )
-        # Function Calling
         parser.add_argument(
             "--tool-call-parser",
             type=str,
@@ -932,6 +977,35 @@ class ServerArgs:
             help="Enable hierarchical cache",
         )
 
+        # Server warmups
+        parser.add_argument(
+            "--warmups",
+            type=str,
+            required=False,
+            help="Specify custom warmup functions (csv) to run before server starts eg. --warmups=warmup_name1,warmup_name2 "
+            "will run the functions `warmup_name1` and `warmup_name2` specified in warmup.py before the server starts listening for requests",
+        )
+
+        # Debug tensor dumps
+        parser.add_argument(
+            "--debug-tensor-dump-output-folder",
+            type=str,
+            default=ServerArgs.debug_tensor_dump_output_folder,
+            help="The output folder for dumping tensors.",
+        )
+        parser.add_argument(
+            "--debug-tensor-dump-input-file",
+            type=str,
+            default=ServerArgs.debug_tensor_dump_input_file,
+            help="The input filename for dumping tensors",
+        )
+        parser.add_argument(
+            "--debug-tensor-dump-inject",
+            type=str,
+            default=ServerArgs.debug_tensor_dump_inject,
+            help="Inject the outputs from jax as the input of every layer.",
+        )
+
     @classmethod
     def from_cli_args(cls, args: argparse.Namespace):
         args.tp_size = args.tensor_parallel_size
@@ -960,6 +1034,7 @@ class ServerArgs:
             and (self.lora_paths is None or self.disable_radix_cache)
         ), "compatibility of lora and cuda graph and radix attention is in progress"
         assert self.base_gpu_id >= 0, "base_gpu_id must be non-negative"
+        assert self.gpu_id_step >= 1, "gpu_id_step must be positive"
 
         if isinstance(self.lora_paths, list):
             lora_paths = self.lora_paths
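
Note on the two new acceptance thresholds: their help strings above fully specify the rule, and a minimal Python sketch of it follows, for illustration only (the function name accept_draft_token and the variable p_target are invented here; the real check runs inside SGLang's EAGLE verification step, not in user code):

    import random

    def accept_draft_token(p_target: float,
                           threshold_single: float = 1.0,
                           threshold_acc: float = 1.0) -> bool:
        # --speculative-accept-threshold-single: accept outright when the
        # target model gives the draft token more probability than this bar.
        if p_target > threshold_single:
            return True
        # --speculative-accept-threshold-acc: otherwise accept stochastically,
        # raising the accept probability from p to min(1, p / threshold_acc).
        return random.random() < min(1.0, p_target / threshold_acc)

With both defaults at 1.0, the deterministic branch never fires and the stochastic branch accepts with probability p itself, so these flags change behavior only when tuned away from their defaults.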
sglang/srt/speculative/build_eagle_tree.py CHANGED
@@ -3,14 +3,8 @@
 from typing import List
 
 import torch
-
-from sglang.srt.utils import is_cuda_available
-
-if is_cuda_available():
-    from sgl_kernel import build_tree_kernel as sgl_build_tree_kernel
-    from sgl_kernel import (
-        build_tree_kernel_efficient as sgl_build_tree_kernel_efficient,
-    )
+from sgl_kernel import build_tree_kernel as sgl_build_tree_kernel
+from sgl_kernel import build_tree_kernel_efficient as sgl_build_tree_kernel_efficient
 
 
 def build_tree_kernel_efficient_preprocess(
@@ -32,7 +26,12 @@ def build_tree_kernel_efficient_preprocess(
 
     draft_tokens = torch.gather(ss_token_list, index=top_scores_index, dim=1)
     draft_tokens = torch.cat((verified_id.unsqueeze(1), draft_tokens), dim=1).flatten()
-    parent_list = torch.cat(parents_list[:-1], dim=1)
+
+    if len(parents_list) > 1:
+        parent_list = torch.cat(parents_list[:-1], dim=1)
+    else:
+        batch_size = parents_list[0].shape[0]
+        parent_list = torch.empty(batch_size, 0, device=parents_list[0].device)
 
     return parent_list, top_scores_index, draft_tokens
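The new branch guards the single-entry case: when parents_list holds exactly one tensor, parents_list[:-1] is an empty list and torch.cat raises a RuntimeError. A standalone repro of the failure the fallback avoids (illustrative shapes only):

    import torch

    parents_list = [torch.zeros(2, 4, dtype=torch.long)]  # one entry, e.g. a single draft step

    if len(parents_list) > 1:
        parent_list = torch.cat(parents_list[:-1], dim=1)  # old path; torch.cat([]) would raise
    else:
        # New fallback: an empty (batch_size, 0) tensor on the same device.
        parent_list = torch.empty(parents_list[0].shape[0], 0, device=parents_list[0].device)

    print(parent_list.shape)  # torch.Size([2, 0])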
sglang/srt/speculative/eagle_draft_cuda_graph_runner.py CHANGED
@@ -1,7 +1,6 @@
 from __future__ import annotations
 
 import bisect
-import time
 from typing import TYPE_CHECKING, Callable
 
 import torch
@@ -21,7 +20,6 @@ from sglang.srt.model_executor.forward_batch_info import (
 from sglang.srt.speculative.eagle_utils import EagleDraftInput
 
 if TYPE_CHECKING:
-    from sglang.srt.model_executor.model_runner import ModelRunner
     from sglang.srt.speculative.eagle_worker import EAGLEWorker
 
 
@@ -163,20 +161,11 @@ class EAGLEDraftCudaGraphRunner:
 
         run_once()
 
-        torch.cuda.synchronize()
-        self.model_runner.tp_group.barrier()
-
-        torch.cuda.synchronize()
-        self.model_runner.tp_group.barrier()
-
         with torch.cuda.graph(
             graph, pool=get_global_graph_memory_pool(), stream=stream
         ):
             out = run_once()
 
-        torch.cuda.synchronize()
-        self.model_runner.tp_group.barrier()
-
         set_global_graph_memory_pool(graph.pool())
         return graph, out
 
@@ -205,7 +194,7 @@ class EAGLEDraftCudaGraphRunner:
 
         # Attention backend
         self.model_runner.draft_attn_backend.init_forward_metadata_replay_cuda_graph(
-            forward_batch
+            forward_batch, forward_batch.batch_size
        )
 
         # Replay
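
The deleted synchronize/barrier pairs bracketed an otherwise standard CUDA graph capture sequence (warm up eagerly, capture into a shared memory pool, replay later), and the replay-side change simply passes forward_batch.batch_size explicitly to the attention backend's metadata initializer. For reference, the generic PyTorch pattern the runner follows, as a plain sketch with a toy run_once rather than SGLang's wrapper objects (requires a CUDA device):

    import torch

    static_input = torch.zeros(8, device="cuda")

    def run_once():
        # Stands in for one draft forward pass over pre-allocated static buffers.
        return static_input * 2

    run_once()                      # eager warm-up before capture
    torch.cuda.synchronize()

    graph = torch.cuda.CUDAGraph()
    with torch.cuda.graph(graph):   # pool=... lets several graphs share memory
        static_output = run_once()

    static_input.fill_(3.0)
    graph.replay()                  # re-launches the captured kernels
    print(static_output)            # reflects the new input: a tensor of 6.0s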