sglang 0.5.0rc1__py3-none-any.whl → 0.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (203)
  1. sglang/bench_one_batch.py +0 -7
  2. sglang/bench_one_batch_server.py +7 -2
  3. sglang/bench_serving.py +3 -3
  4. sglang/eval/llama3_eval.py +0 -1
  5. sglang/srt/configs/model_config.py +25 -9
  6. sglang/srt/configs/update_config.py +40 -5
  7. sglang/srt/constrained/xgrammar_backend.py +23 -11
  8. sglang/srt/conversation.py +2 -15
  9. sglang/srt/disaggregation/ascend/conn.py +1 -3
  10. sglang/srt/disaggregation/base/conn.py +1 -0
  11. sglang/srt/disaggregation/decode.py +1 -2
  12. sglang/srt/disaggregation/launch_lb.py +7 -1
  13. sglang/srt/disaggregation/mini_lb.py +11 -5
  14. sglang/srt/disaggregation/mooncake/conn.py +141 -47
  15. sglang/srt/disaggregation/prefill.py +261 -5
  16. sglang/srt/disaggregation/utils.py +2 -1
  17. sglang/srt/distributed/device_communicators/custom_all_reduce.py +1 -1
  18. sglang/srt/distributed/device_communicators/pynccl.py +68 -18
  19. sglang/srt/distributed/device_communicators/pynccl_wrapper.py +52 -0
  20. sglang/srt/distributed/naive_distributed.py +112 -0
  21. sglang/srt/distributed/parallel_state.py +90 -4
  22. sglang/srt/entrypoints/context.py +20 -1
  23. sglang/srt/entrypoints/engine.py +29 -4
  24. sglang/srt/entrypoints/http_server.py +76 -0
  25. sglang/srt/entrypoints/openai/protocol.py +4 -2
  26. sglang/srt/entrypoints/openai/serving_chat.py +23 -6
  27. sglang/srt/entrypoints/openai/serving_completions.py +10 -1
  28. sglang/srt/entrypoints/openai/serving_responses.py +2 -2
  29. sglang/srt/eplb/expert_distribution.py +2 -3
  30. sglang/srt/function_call/deepseekv3_detector.py +1 -1
  31. sglang/srt/hf_transformers_utils.py +24 -0
  32. sglang/srt/host_shared_memory.py +83 -0
  33. sglang/srt/layers/attention/ascend_backend.py +132 -22
  34. sglang/srt/layers/attention/flashattention_backend.py +24 -17
  35. sglang/srt/layers/attention/flashinfer_backend.py +14 -3
  36. sglang/srt/layers/attention/flashinfer_mla_backend.py +227 -76
  37. sglang/srt/layers/attention/triton_backend.py +109 -73
  38. sglang/srt/layers/attention/triton_ops/decode_attention.py +33 -2
  39. sglang/srt/layers/attention/triton_ops/extend_attention.py +32 -2
  40. sglang/srt/layers/attention/trtllm_mha_backend.py +398 -36
  41. sglang/srt/layers/attention/trtllm_mla_backend.py +49 -19
  42. sglang/srt/layers/attention/utils.py +94 -15
  43. sglang/srt/layers/attention/vision.py +40 -13
  44. sglang/srt/layers/attention/vision_utils.py +65 -0
  45. sglang/srt/layers/communicator.py +58 -10
  46. sglang/srt/layers/dp_attention.py +137 -27
  47. sglang/srt/layers/elementwise.py +94 -0
  48. sglang/srt/layers/flashinfer_comm_fusion.py +29 -1
  49. sglang/srt/layers/layernorm.py +8 -1
  50. sglang/srt/layers/linear.py +24 -0
  51. sglang/srt/layers/logits_processor.py +16 -18
  52. sglang/srt/layers/moe/__init__.py +31 -0
  53. sglang/srt/layers/moe/ep_moe/layer.py +37 -33
  54. sglang/srt/layers/moe/fused_moe_native.py +14 -25
  55. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=129,N=352,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  56. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_2_0/E=161,N=192,device_name=NVIDIA_A800-SXM4-80GB,dtype=int8_w8a8.json +146 -0
  57. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_0/E=16,N=1024,device_name=NVIDIA_B200.json +146 -0
  58. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  59. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20.json +146 -0
  60. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=640,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  61. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  62. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  63. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  64. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  65. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=257,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  66. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  67. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=129,N=352,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Max-Q_Workstation_Edition,dtype=fp8_w8a8.json +146 -0
  68. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=129,N=704,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +146 -0
  69. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=161,N=384,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Max-Q_Workstation_Edition,dtype=fp8_w8a8.json +146 -0
  70. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +69 -76
  71. sglang/srt/layers/moe/fused_moe_triton/layer.py +66 -123
  72. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +20 -18
  73. sglang/srt/layers/moe/moe_runner/__init__.py +3 -0
  74. sglang/srt/layers/moe/moe_runner/base.py +13 -0
  75. sglang/srt/layers/moe/rocm_moe_utils.py +141 -0
  76. sglang/srt/layers/moe/router.py +15 -9
  77. sglang/srt/layers/moe/token_dispatcher/__init__.py +6 -0
  78. sglang/srt/layers/moe/token_dispatcher/base_dispatcher.py +55 -14
  79. sglang/srt/layers/moe/token_dispatcher/deepep.py +11 -21
  80. sglang/srt/layers/moe/token_dispatcher/standard.py +1 -1
  81. sglang/srt/layers/moe/topk.py +167 -83
  82. sglang/srt/layers/moe/utils.py +159 -18
  83. sglang/srt/layers/multimodal.py +156 -40
  84. sglang/srt/layers/quantization/__init__.py +18 -46
  85. sglang/srt/layers/quantization/awq.py +22 -23
  86. sglang/srt/layers/quantization/base_config.py +2 -6
  87. sglang/srt/layers/quantization/blockwise_int8.py +4 -12
  88. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +72 -29
  89. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +2 -1
  90. sglang/srt/layers/quantization/fp8.py +127 -119
  91. sglang/srt/layers/quantization/fp8_kernel.py +195 -24
  92. sglang/srt/layers/quantization/fp8_utils.py +34 -9
  93. sglang/srt/layers/quantization/fpgemm_fp8.py +203 -0
  94. sglang/srt/layers/quantization/gptq.py +17 -21
  95. sglang/srt/layers/quantization/marlin_utils.py +26 -8
  96. sglang/srt/layers/quantization/marlin_utils_fp8.py +352 -0
  97. sglang/srt/layers/quantization/modelopt_quant.py +217 -98
  98. sglang/srt/layers/quantization/moe_wna16.py +10 -15
  99. sglang/srt/layers/quantization/mxfp4.py +222 -39
  100. sglang/srt/layers/quantization/quark/quark.py +390 -0
  101. sglang/srt/layers/quantization/quark/quark_moe.py +197 -0
  102. sglang/srt/layers/quantization/unquant.py +34 -70
  103. sglang/srt/layers/quantization/utils.py +77 -2
  104. sglang/srt/layers/quantization/w4afp8.py +7 -8
  105. sglang/srt/layers/quantization/w8a8_fp8.py +5 -13
  106. sglang/srt/layers/quantization/w8a8_int8.py +5 -13
  107. sglang/srt/layers/radix_attention.py +6 -0
  108. sglang/srt/layers/rotary_embedding.py +1 -0
  109. sglang/srt/layers/sampler.py +5 -2
  110. sglang/srt/lora/layers.py +6 -2
  111. sglang/srt/lora/lora_manager.py +21 -22
  112. sglang/srt/lora/lora_registry.py +3 -3
  113. sglang/srt/lora/mem_pool.py +26 -24
  114. sglang/srt/lora/utils.py +10 -12
  115. sglang/srt/managers/cache_controller.py +80 -19
  116. sglang/srt/managers/detokenizer_manager.py +10 -2
  117. sglang/srt/managers/io_struct.py +23 -0
  118. sglang/srt/managers/mm_utils.py +1 -1
  119. sglang/srt/managers/schedule_batch.py +22 -48
  120. sglang/srt/managers/scheduler.py +28 -20
  121. sglang/srt/managers/session_controller.py +1 -1
  122. sglang/srt/managers/template_manager.py +7 -5
  123. sglang/srt/managers/tokenizer_manager.py +88 -39
  124. sglang/srt/managers/tp_worker.py +1 -0
  125. sglang/srt/managers/utils.py +59 -1
  126. sglang/srt/mem_cache/allocator.py +10 -157
  127. sglang/srt/mem_cache/allocator_ascend.py +147 -0
  128. sglang/srt/mem_cache/chunk_cache.py +1 -1
  129. sglang/srt/mem_cache/hicache_storage.py +14 -4
  130. sglang/srt/mem_cache/memory_pool.py +3 -3
  131. sglang/srt/mem_cache/memory_pool_host.py +35 -2
  132. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +56 -12
  133. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +8 -4
  134. sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +153 -59
  135. sglang/srt/mem_cache/storage/nixl/nixl_utils.py +19 -53
  136. sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +46 -7
  137. sglang/srt/model_executor/cuda_graph_runner.py +33 -33
  138. sglang/srt/model_executor/forward_batch_info.py +11 -10
  139. sglang/srt/model_executor/model_runner.py +93 -78
  140. sglang/srt/model_executor/npu_graph_runner.py +94 -0
  141. sglang/srt/model_loader/loader.py +24 -6
  142. sglang/srt/models/dbrx.py +12 -6
  143. sglang/srt/models/deepseek.py +2 -1
  144. sglang/srt/models/deepseek_nextn.py +5 -2
  145. sglang/srt/models/deepseek_v2.py +226 -223
  146. sglang/srt/models/ernie4.py +2 -2
  147. sglang/srt/models/glm4_moe.py +27 -65
  148. sglang/srt/models/glm4_moe_nextn.py +2 -1
  149. sglang/srt/models/glm4v.py +52 -1
  150. sglang/srt/models/glm4v_moe.py +8 -11
  151. sglang/srt/models/gpt_oss.py +41 -76
  152. sglang/srt/models/granitemoe.py +0 -1
  153. sglang/srt/models/grok.py +376 -48
  154. sglang/srt/models/interns1.py +12 -47
  155. sglang/srt/models/internvl.py +6 -51
  156. sglang/srt/models/llama.py +10 -2
  157. sglang/srt/models/llama4.py +18 -7
  158. sglang/srt/models/minicpm3.py +0 -1
  159. sglang/srt/models/mixtral.py +0 -2
  160. sglang/srt/models/nemotron_nas.py +435 -0
  161. sglang/srt/models/olmoe.py +0 -1
  162. sglang/srt/models/phi4mm.py +3 -21
  163. sglang/srt/models/qwen2.py +2 -2
  164. sglang/srt/models/qwen2_5_vl.py +2 -0
  165. sglang/srt/models/qwen2_moe.py +23 -23
  166. sglang/srt/models/qwen3.py +2 -2
  167. sglang/srt/models/qwen3_classification.py +84 -0
  168. sglang/srt/models/qwen3_moe.py +27 -43
  169. sglang/srt/models/step3_vl.py +8 -3
  170. sglang/srt/models/xverse_moe.py +11 -5
  171. sglang/srt/multimodal/processors/base_processor.py +3 -3
  172. sglang/srt/multimodal/processors/internvl.py +7 -2
  173. sglang/srt/multimodal/processors/llava.py +11 -7
  174. sglang/srt/offloader.py +433 -0
  175. sglang/srt/operations.py +22 -2
  176. sglang/srt/reasoning_parser.py +4 -3
  177. sglang/srt/sampling/sampling_batch_info.py +7 -4
  178. sglang/srt/server_args.py +264 -105
  179. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +8 -21
  180. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +7 -21
  181. sglang/srt/speculative/eagle_utils.py +36 -13
  182. sglang/srt/speculative/eagle_worker.py +56 -3
  183. sglang/srt/tokenizer/tiktoken_tokenizer.py +161 -0
  184. sglang/srt/two_batch_overlap.py +20 -19
  185. sglang/srt/utils.py +68 -70
  186. sglang/test/runners.py +8 -5
  187. sglang/test/test_block_fp8.py +5 -6
  188. sglang/test/test_block_fp8_ep.py +13 -19
  189. sglang/test/test_cutlass_moe.py +4 -6
  190. sglang/test/test_cutlass_w4a8_moe.py +4 -3
  191. sglang/test/test_fp4_moe.py +4 -3
  192. sglang/test/test_marlin_moe.py +1 -1
  193. sglang/test/test_marlin_utils.py +1 -1
  194. sglang/test/test_utils.py +7 -0
  195. sglang/utils.py +0 -1
  196. sglang/version.py +1 -1
  197. {sglang-0.5.0rc1.dist-info → sglang-0.5.1.dist-info}/METADATA +11 -11
  198. {sglang-0.5.0rc1.dist-info → sglang-0.5.1.dist-info}/RECORD +201 -171
  199. sglang/srt/layers/quantization/fp4.py +0 -557
  200. sglang/srt/layers/quantization/scalar_type.py +0 -352
  201. {sglang-0.5.0rc1.dist-info → sglang-0.5.1.dist-info}/WHEEL +0 -0
  202. {sglang-0.5.0rc1.dist-info → sglang-0.5.1.dist-info}/licenses/LICENSE +0 -0
  203. {sglang-0.5.0rc1.dist-info → sglang-0.5.1.dist-info}/top_level.txt +0 -0
sglang/srt/server_args.py CHANGED
@@ -23,8 +23,9 @@ import sys
 import tempfile
 from typing import List, Literal, Optional, Union

+from sglang.srt.function_call.function_call_parser import FunctionCallParser
 from sglang.srt.hf_transformers_utils import check_gguf_file, get_config
-from sglang.srt.layers.utils import is_sm100_supported
+from sglang.srt.layers.utils import is_sm90_supported, is_sm100_supported
 from sglang.srt.lora.lora_registry import LoRARef
 from sglang.srt.reasoning_parser import ReasoningParser
 from sglang.srt.utils import (
@@ -33,10 +34,12 @@ from sglang.srt.utils import (
     configure_ipv6,
     get_device,
     get_device_memory_capacity,
+    is_cuda,
     is_flashinfer_available,
     is_hip,
     is_port_available,
     is_remote_url,
+    is_triton_kernels_available,
     is_valid_ipv6_address,
     nullable_str,
 )
@@ -82,7 +85,6 @@ class ServerArgs:
     max_prefill_tokens: int = 16384
     schedule_policy: str = "fcfs"
     schedule_conservativeness: float = 1.0
-    cpu_offload_gb: int = 0
     page_size: Optional[int] = None
     hybrid_kvcache_ratio: Optional[float] = None
     swa_full_tokens_ratio: float = 0.8
@@ -120,10 +122,12 @@ class ServerArgs:
     decode_log_interval: int = 40
     enable_request_time_stats_logging: bool = False
     kv_events_config: Optional[str] = None
+    gc_warning_threshold_secs: float = 0.0

     # API related
     api_key: Optional[str] = None
     served_model_name: Optional[str] = None
+    weight_version: str = "default"
     chat_template: Optional[str] = None
     completion_template: Optional[str] = None
     file_storage_path: str = "sglang_storage"
@@ -149,7 +153,9 @@ class ServerArgs:
     enable_lora: Optional[bool] = None
     max_lora_rank: Optional[int] = None
     lora_target_modules: Optional[Union[set[str], List[str]]] = None
-    lora_paths: Optional[Union[dict[str, str], dict[str, LoRARef], List[str]]] = None
+    lora_paths: Optional[
+        Union[dict[str, str], List[dict[str, str]], List[str], List[LoRARef]]
+    ] = None
     max_loaded_loras: Optional[int] = None
     max_loras_per_batch: int = 8
     lora_backend: str = "triton"
@@ -174,9 +180,16 @@ class ServerArgs:

     # Expert parallelism
     ep_size: int = 1
-    moe_a2a_backend: Optional[Literal["deepep"]] = None
-    enable_flashinfer_cutlass_moe: bool = False
-    enable_flashinfer_trtllm_moe: bool = False
+    moe_a2a_backend: Literal["none", "deepep"] = "none"
+    moe_runner_backend: Literal[
+        "auto",
+        "triton",
+        "triton_kernel",
+        "flashinfer_trtllm",
+        "flashinfer_cutlass",
+        "flashinfer_mxfp4",
+    ] = "auto"
+    flashinfer_mxfp4_moe_precision: Literal["default", "bf16"] = "default"
     enable_flashinfer_allreduce_fusion: bool = False
     deepep_mode: Literal["auto", "normal", "low_latency"] = "auto"
     ep_num_redundant_experts: int = 0
@@ -212,6 +225,13 @@ class ServerArgs:
     ds_heavy_channel_type: str = "qk"
     ds_sparse_decode_threshold: int = 4096

+    # Offloading
+    cpu_offload_gb: int = 0
+    offload_group_size: int = -1
+    offload_num_in_group: int = 1
+    offload_prefetch_step: int = 1
+    offload_mode: str = "cpu"
+
     # Optimization/debug options
     disable_radix_cache: bool = False
     cuda_graph_max_bs: Optional[int] = None
@@ -222,6 +242,7 @@ class ServerArgs:
     enable_cudagraph_gc: bool = False
     enable_nccl_nvls: bool = False
     enable_symm_mem: bool = False
+    disable_flashinfer_cutlass_moe_fp4_allgather: bool = False
     enable_tokenizer_batch_encode: bool = False
     disable_outlines_disk_cache: bool = False
     disable_custom_all_reduce: bool = False
@@ -249,8 +270,6 @@ class ServerArgs:
     disable_chunked_prefix_cache: bool = False
     disable_fast_image_processor: bool = False
     enable_return_hidden_states: bool = False
-    enable_triton_kernel_moe: bool = False
-    enable_flashinfer_mxfp4_moe: bool = False
     scheduler_recv_interval: int = 1

     # Debug tensor dumps
@@ -281,12 +300,13 @@ class ServerArgs:
     # Deprecated arguments
     enable_ep_moe: bool = False
     enable_deepep_moe: bool = False
+    enable_flashinfer_cutlass_moe: bool = False
+    enable_flashinfer_trtllm_moe: bool = False
+    enable_triton_kernel_moe: bool = False
+    enable_flashinfer_mxfp4_moe: bool = False

     def __post_init__(self):
         # Check deprecated arguments
-        def print_deprecated_warning(message: str):
-            logger.warning(f"\033[33m{message}\033[0m")
-
         if self.enable_ep_moe:
             self.ep_size = self.tp_size
             print_deprecated_warning(
@@ -297,6 +317,26 @@ class ServerArgs:
             print_deprecated_warning(
                 "NOTE: --enable-deepep-moe is deprecated. Please set `--moe-a2a-backend` to 'deepep' instead."
             )
+        if self.enable_triton_kernel_moe:
+            self.moe_runner_backend = "triton_kernel"
+            print_deprecated_warning(
+                "NOTE: --enable-triton-kernel-moe is deprecated. Please set `--moe-runner-backend` to 'triton_kernel' instead."
+            )
+        if self.enable_flashinfer_cutlass_moe:
+            self.moe_runner_backend = "flashinfer_cutlass"
+            print_deprecated_warning(
+                "NOTE: --enable-flashinfer-cutlass-moe is deprecated. Please set `--moe-runner-backend` to 'flashinfer_cutlass' instead."
+            )
+        if self.enable_flashinfer_trtllm_moe:
+            self.moe_runner_backend = "flashinfer_trtllm"
+            print_deprecated_warning(
+                "NOTE: --enable-flashinfer-trtllm-moe is deprecated. Please set `--moe-runner-backend` to 'flashinfer_trtllm' instead."
+            )
+        if self.enable_flashinfer_mxfp4_moe:
+            self.moe_runner_backend = "flashinfer_mxfp4"
+            print_deprecated_warning(
+                "NOTE: --enable-flashinfer-mxfp4-moe is deprecated. Please set `--moe-runner-backend` to 'flashinfer_mxfp4' instead."
+            )

         # Set missing default values
         if self.tokenizer_path is None:
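Note: the __post_init__ block above turns each removed boolean MoE flag into a thin alias that rewrites the new moe_runner_backend field. A minimal sketch of the implied migration, kept separate from the package code (the dictionary below is illustrative only; the flag and backend names are taken verbatim from the hunk above):

    # Old boolean flag (still accepted, now warns)  ->  new --moe-runner-backend value
    DEPRECATED_MOE_FLAG_TO_RUNNER_BACKEND = {
        "enable_triton_kernel_moe": "triton_kernel",
        "enable_flashinfer_cutlass_moe": "flashinfer_cutlass",
        "enable_flashinfer_trtllm_moe": "flashinfer_trtllm",
        "enable_flashinfer_mxfp4_moe": "flashinfer_mxfp4",
    }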
@@ -447,11 +487,6 @@ class ServerArgs:
             )
             self.page_size = 64

-            if self.speculative_algorithm is not None:
-                raise ValueError(
-                    "trtllm_mla backend does not support speculative decoding yet."
-                )
-
             if self.kv_cache_dtype not in ["fp8_e4m3", "auto"]:
                 raise ValueError(
                     "TensorRT-LLM MLA backend only supports kv-cache-dtype of fp8_e4m3 or auto."
@@ -473,11 +508,6 @@ class ServerArgs:
             )
             self.page_size = 64

-            if self.speculative_algorithm is not None:
-                raise ValueError(
-                    "trtllm_mha backend does not support speculative decoding yet."
-                )
-
         if self.attention_backend == "dual_chunk_flash_attn":
             logger.warning(
                 "Mixed chunk, radix cache, and cuda graphs are disabled because of using dual chunk flash attention backend"
@@ -516,17 +546,16 @@ class ServerArgs:
         ), "Please enable dp attention when setting enable_dp_lm_head. "

         # MoE kernel
-        if self.enable_flashinfer_cutlass_moe:
+        if self.moe_runner_backend == "flashinfer_cutlass":
             assert (
                 self.quantization == "modelopt_fp4"
             ), "modelopt_fp4 quantization is required for Flashinfer MOE"
-            os.environ["TRTLLM_ENABLE_PDL"] = "1"
             assert self.ep_size in [
                 1,
                 self.tp_size,
             ], "The expert parallel size must be 1 or the same as the tensor parallel size"

-        if self.enable_flashinfer_trtllm_moe:
+        if self.moe_runner_backend == "flashinfer_trtllm":
             if not self.disable_shared_experts_fusion:
                 self.disable_shared_experts_fusion = True
                 logger.warning(
@@ -555,7 +584,7 @@ class ServerArgs:
             self.ep_dispatch_algorithm = "static"

         if self.enable_eplb:
-            assert self.ep_size > 1 or self.moe_a2a_backend is not None
+            assert self.ep_size > 1

         if self.enable_expert_distribution_metrics and (
             self.expert_distribution_recorder_mode is None
@@ -575,6 +604,7 @@ class ServerArgs:
                 "Pipeline parallelism is incompatible with overlap schedule."
            )

+        # Hicache
         if self.hicache_storage_backend == "mooncake":
             # to use mooncake storage backend, the following conditions must be met:
             self.hicache_io_backend = "kernel"
@@ -609,6 +639,10 @@ class ServerArgs:
                 logger.warning(
                     "DeepSeek MTP does not require setting speculative_draft_model_path."
                 )
+            if self.page_size != 1 and self.attention_backend == "flashinfer":
+                raise ValueError(
+                    "Speculative decoding with page_size != 1 is not supported. Please set page_size to 1."
+                )

             # Auto choose parameters
             if self.speculative_num_steps is None:
@@ -622,6 +656,16 @@ class ServerArgs:
                     self.speculative_num_draft_tokens,
                 ) = auto_choose_speculative_params(self)

+            if (
+                self.attention_backend == "trtllm_mha"
+                or self.decode_attention_backend == "trtllm_mha"
+                or self.prefill_attention_backend == "trtllm_mha"
+            ):
+                if self.speculative_eagle_topk > 1:
+                    raise ValueError(
+                        "trtllm_mha backend only supports topk = 1 for speculative decoding."
+                    )
+
             if (
                 self.speculative_eagle_topk == 1
                 and self.speculative_num_draft_tokens != self.speculative_num_steps + 1
@@ -679,6 +723,12 @@ class ServerArgs:
             "1" if self.disable_outlines_disk_cache else "0"
         )

+        if self.enable_hierarchical_cache and self.disable_radix_cache:
+            raise ValueError(
+                "The arguments enable-hierarchical-cache and disable-radix-cache are mutually exclusive "
+                "and cannot be used at the same time. Please use only one of them."
+            )
+
     @staticmethod
     def add_cli_args(parser: argparse.ArgumentParser):
         # Model and tokenizer
@@ -932,12 +982,6 @@ class ServerArgs:
             default=ServerArgs.schedule_conservativeness,
             help="How conservative the schedule policy is. A larger value means more conservative scheduling. Use a larger value if you see requests being retracted frequently.",
         )
-        parser.add_argument(
-            "--cpu-offload-gb",
-            type=int,
-            default=ServerArgs.cpu_offload_gb,
-            help="How many GBs of RAM to reserve for CPU offloading.",
-        )
         parser.add_argument(
             "--page-size",
             type=int,
@@ -1130,6 +1174,12 @@ class ServerArgs:
             default=ServerArgs.collect_tokens_histogram,
             help="Collect prompt/generation tokens histogram.",
         )
+        parser.add_argument(
+            "--gc-warning-threshold-secs",
+            type=float,
+            default=ServerArgs.gc_warning_threshold_secs,
+            help="The threshold for long GC warning. If a GC takes longer than this, a warning will be logged. Set to 0 to disable.",
+        )
         parser.add_argument(
             "--decode-log-interval",
             type=int,
@@ -1162,6 +1212,12 @@ class ServerArgs:
             default=ServerArgs.served_model_name,
             help="Override the model name returned by the v1/models endpoint in OpenAI API server.",
         )
+        parser.add_argument(
+            "--weight-version",
+            type=str,
+            default=ServerArgs.weight_version,
+            help="Version identifier for the model weights. Defaults to 'default' if not specified.",
+        )
         parser.add_argument(
             "--chat-template",
             type=str,
@@ -1192,23 +1248,13 @@ class ServerArgs:
             default=ServerArgs.reasoning_parser,
             help=f"Specify the parser for reasoning models, supported parsers are: {list(ReasoningParser.DetectorMap.keys())}.",
         )
+        tool_call_parser_choices = list(FunctionCallParser.ToolCallParserEnum.keys())
         parser.add_argument(
             "--tool-call-parser",
             type=str,
-            choices=[  # TODO: use FunctionCallParser.DetectorMap.keys()
-                "qwen25",
-                "mistral",
-                "llama3",
-                "deepseekv3",
-                "pythonic",
-                "kimi_k2",
-                "qwen3_coder",
-                "glm45",
-                "step3",
-                "gpt-oss",
-            ],
+            choices=tool_call_parser_choices,
             default=ServerArgs.tool_call_parser,
-            help="Specify the parser for handling tool-call interactions. Options include: 'qwen25', 'mistral', 'llama3', 'deepseekv3', 'pythonic', 'kimi_k2', 'qwen3_coder', 'glm45', and 'step3'.",
+            help=f"Specify the parser for handling tool-call interactions. Options include: {tool_call_parser_choices}.",
         )
         parser.add_argument(
             "--tool-server",
@@ -1293,7 +1339,7 @@ class ServerArgs:
             nargs="*",
             default=None,
             action=LoRAPathAction,
-            help="The list of LoRA adapters. You can provide a list of either path in str or renamed path in the format {name}={path}.",
+            help='The list of LoRA adapters to load. Each adapter must be specified in one of the following formats: <PATH> | <NAME>=<PATH> | JSON with schema {"lora_name":str,"lora_path":str,"pinned":bool}',
         )
         parser.add_argument(
             "--max-loras-per-batch",
@@ -1316,19 +1362,23 @@ class ServerArgs:

         # Kernel backend
         ATTN_BACKENDS = [
-            "aiter",
+            # Common
+            "triton",
+            "torch_native",
+            # NVIDIA specific
             "cutlass_mla",
             "fa3",
             "flashinfer",
             "flashmla",
-            "intel_amx",
-            "torch_native",
-            "ascend",
-            "triton",
             "trtllm_mla",
             "trtllm_mha",
             "dual_chunk_flash_attn",
+            # AMD specific
+            "aiter",
             "wave",
+            # Other platforms
+            "intel_amx",
+            "ascend",
         ]
         parser.add_argument(
             "--attention-backend",
@@ -1434,19 +1484,30 @@ class ServerArgs:
         parser.add_argument(
             "--moe-a2a-backend",
             type=str,
-            choices=["deepep"],
+            choices=["none", "deepep"],
             default=ServerArgs.moe_a2a_backend,
             help="Choose the backend for MoE A2A.",
         )
         parser.add_argument(
-            "--enable-flashinfer-cutlass-moe",
-            action="store_true",
-            help="Enable FlashInfer CUTLASS MoE backend for modelopt_fp4 quant on Blackwell. Supports MoE-EP",
+            "--moe-runner-backend",
+            type=str,
+            choices=[
+                "auto",
+                "triton",
+                "triton_kernel",
+                "flashinfer_trtllm",
+                "flashinfer_cutlass",
+                "flashinfer_mxfp4",
+            ],
+            default=ServerArgs.moe_runner_backend,
+            help="Choose the runner backend for MoE.",
         )
         parser.add_argument(
-            "--enable-flashinfer-trtllm-moe",
-            action="store_true",
-            help="Enable FlashInfer TRTLLM MoE backend on Blackwell. Supports BlockScale FP8 MoE-EP",
+            "--flashinfer-mxfp4-moe-precision",
+            type=str,
+            choices=["mxfp4", "bf16"],
+            default=ServerArgs.flashinfer_mxfp4_moe_precision,
+            help="Choose the computation precision of flashinfer mxfp4 moe",
         )
         parser.add_argument(
             "--enable-flashinfer-allreduce-fusion",
@@ -1622,6 +1683,38 @@ class ServerArgs:
             help="The type of heavy channels in double sparsity attention",
         )

+        # Offloading
+        parser.add_argument(
+            "--cpu-offload-gb",
+            type=int,
+            default=ServerArgs.cpu_offload_gb,
+            help="How many GBs of RAM to reserve for CPU offloading.",
+        )
+        parser.add_argument(
+            "--offload-group-size",
+            type=int,
+            default=ServerArgs.offload_group_size,
+            help="Number of layers per group in offloading.",
+        )
+        parser.add_argument(
+            "--offload-num-in-group",
+            type=int,
+            default=ServerArgs.offload_num_in_group,
+            help="Number of layers to be offloaded within a group.",
+        )
+        parser.add_argument(
+            "--offload-prefetch-step",
+            type=int,
+            default=ServerArgs.offload_prefetch_step,
+            help="Steps to prefetch in offloading.",
+        )
+        parser.add_argument(
+            "--offload-mode",
+            type=str,
+            default=ServerArgs.offload_mode,
+            help="Mode of offloading.",
+        )
+
         # Optimization/debug options
         parser.add_argument(
             "--disable-radix-cache",
@@ -1670,6 +1763,11 @@ class ServerArgs:
             action="store_true",
             help="Enable NCCL symmetric memory for fast collectives.",
         )
+        parser.add_argument(
+            "--disable-flashinfer-cutlass-moe-fp4-allgather",
+            action="store_true",
+            help="Disables quantize before all-gather for flashinfer cutlass moe.",
+        )
         parser.add_argument(
             "--enable-tokenizer-batch-encode",
             action="store_true",
@@ -1813,16 +1911,6 @@ class ServerArgs:
             action="store_true",
             help="Enable returning hidden states with responses.",
         )
-        parser.add_argument(
-            "--enable-triton-kernel-moe",
-            action="store_true",
-            help="Use triton moe grouped gemm kernel.",
-        )
-        parser.add_argument(
-            "--enable-flashinfer-mxfp4-moe",
-            action="store_true",
-            help="Enable FlashInfer MXFP4 MoE backend for modelopt_fp4 quant on Blackwell.",
-        )
         parser.add_argument(
             "--scheduler-recv-interval",
             type=int,
@@ -1923,24 +2011,25 @@ class ServerArgs:
             default=None,
             help="The custom dataloader which used to update the model. Should be set with a valid import path, such as my_package.weight_load_func",
         )
+        parser.add_argument(
+            "--weight-loader-disable-mmap",
+            action="store_true",
+            help="Disable mmap while loading weight using safetensors.",
+        )
+
+        # For PD-Multiplexing
         parser.add_argument(
             "--enable-pdmux",
             action="store_true",
             help="Enable PD-Multiplexing, PD running on greenctx stream.",
         )

-        # For PD-Multiplexing
         parser.add_argument(
             "--sm-group-num",
             type=int,
             default=ServerArgs.sm_group_num,
             help="Number of sm partition groups.",
         )
-        parser.add_argument(
-            "--weight-loader-disable-mmap",
-            action="store_true",
-            help="Disable mmap while loading weight using safetensors.",
-        )

         # Deprecated arguments
         parser.add_argument(
@@ -1953,6 +2042,26 @@ class ServerArgs:
             action="store_true",
             help="(Deprecated) Enabling DeepEP MoE implementation for EP MoE.",
         )
+        parser.add_argument(
+            "--enable-flashinfer-cutlass-moe",
+            action="store_true",
+            help="(Deprecated) Enable FlashInfer CUTLASS MoE backend for modelopt_fp4 quant on Blackwell. Supports MoE-EP",
+        )
+        parser.add_argument(
+            "--enable-flashinfer-trtllm-moe",
+            action="store_true",
+            help="(Deprecated) Enable FlashInfer TRTLLM MoE backend on Blackwell. Supports BlockScale FP8 MoE-EP",
+        )
+        parser.add_argument(
+            "--enable-triton-kernel-moe",
+            action="store_true",
+            help="(Deprecated) Use triton moe grouped gemm kernel.",
+        )
+        parser.add_argument(
+            "--enable-flashinfer-mxfp4-moe",
+            action="store_true",
+            help="(Deprecated) Enable FlashInfer MXFP4 MoE backend for modelopt_fp4 quant on Blackwell.",
+        )

     @classmethod
     def from_cli_args(cls, args: argparse.Namespace):
@@ -2037,28 +2146,42 @@ class ServerArgs:
            )

        if self.enable_lora:
-            # Normalize lora_paths to a dictionary if it is a list.
-            # TODO (lifuhuang): support specifying pinned adapters in server_args.
            if isinstance(self.lora_paths, list):
                lora_paths = self.lora_paths
-                self.lora_paths = {}
+                self.lora_paths = []
                for lora_path in lora_paths:
-                    if "=" in lora_path:
-                        name, path = lora_path.split("=", 1)
-                        self.lora_paths[name] = LoRARef(
-                            lora_name=name, lora_path=path, pinned=False
+                    if isinstance(lora_path, str):
+                        if "=" in lora_path:
+                            name, path = lora_path.split("=", 1)
+                            lora_ref = LoRARef(
+                                lora_name=name, lora_path=path, pinned=False
+                            )
+                        else:
+                            lora_ref = LoRARef(
+                                lora_name=lora_path, lora_path=lora_path, pinned=False
+                            )
+                    elif isinstance(lora_path, dict):
+                        assert (
+                            "lora_name" in lora_path and "lora_path" in lora_path
+                        ), f"When providing LoRA paths as a list of dict, each dict should contain 'lora_name' and 'lora_path' keys. Got: {lora_path}"
+                        lora_ref = LoRARef(
+                            lora_name=lora_path["lora_name"],
+                            lora_path=lora_path["lora_path"],
+                            pinned=lora_path.get("pinned", False),
                        )
                    else:
-                        self.lora_paths[lora_path] = LoRARef(
-                            lora_name=lora_path, lora_path=lora_path, pinned=False
+                        raise ValueError(
+                            f"Invalid type for item in --lora-paths list: {type(lora_path)}. "
+                            "Expected a string or a dictionary."
                        )
+                    self.lora_paths.append(lora_ref)
            elif isinstance(self.lora_paths, dict):
-                self.lora_paths = {
-                    k: LoRARef(lora_name=k, lora_path=v, pinned=False)
+                self.lora_paths = [
+                    LoRARef(lora_name=k, lora_path=v, pinned=False)
                    for k, v in self.lora_paths.items()
-                }
+                ]
            elif self.lora_paths is None:
-                self.lora_paths = {}
+                self.lora_paths = []
            else:
                raise ValueError(
                    f"Invalid type for --lora-paths: {type(self.lora_paths)}. "
@@ -2085,9 +2208,7 @@ class ServerArgs:
                "max_loaded_loras should be greater than or equal to max_loras_per_batch. "
                f"max_loaded_loras={self.max_loaded_loras}, max_loras_per_batch={self.max_loras_per_batch}"
            )
-            assert (
-                not self.lora_paths or len(self.lora_paths) <= self.max_loaded_loras
-            ), (
+            assert len(self.lora_paths) <= self.max_loaded_loras, (
                "The number of LoRA paths should not exceed max_loaded_loras. "
                f"max_loaded_loras={self.max_loaded_loras}, lora_paths={len(self.lora_paths)}"
            )
@@ -2105,11 +2226,26 @@ class ServerArgs:
         model_arch = hf_config.architectures[0]
         if model_arch in ["GptOssForCausalLM"]:
             if self.attention_backend is None:
-                self.attention_backend = "triton"
+                if is_cuda() and is_sm100_supported():
+                    self.attention_backend = "trtllm_mha"
+                elif is_cuda() and is_sm90_supported():
+                    self.attention_backend = "fa3"
+                else:
+                    self.attention_backend = "triton"
             supported_backends = ["triton", "trtllm_mha", "fa3"]
+            logger.info(
+                f"Use {self.attention_backend} as attention backend for GptOssForCausalLM"
+            )
             assert (
                 self.attention_backend in supported_backends
             ), f"GptOssForCausalLM requires one of {supported_backends} attention backend, but got '{self.attention_backend}'"
+
+            if is_sm100_supported():
+                if not self.enable_dp_attention:
+                    self.enable_flashinfer_allreduce_fusion = True
+                    logger.info(
+                        "Enable FlashInfer AllReduce Fusion on sm100 for GptOssForCausalLM"
+                    )
             quantization_config = getattr(hf_config, "quantization_config", None)
             is_mxfp4_quant_format = (
                 quantization_config is not None
@@ -2117,18 +2253,21 @@ class ServerArgs:
             )

             if is_sm100_supported() and is_mxfp4_quant_format:
-                self.enable_flashinfer_mxfp4_moe = True
-                self.enable_triton_kernel_moe = False
+                self.moe_runner_backend = "flashinfer_mxfp4"
                 logger.warning(
                     "Detected SM100 and MXFP4 quantization format for GPT-OSS model, enabling FlashInfer MXFP4 MOE kernel."
                 )
             else:
-                if self.enable_triton_kernel_moe:
+                if self.moe_runner_backend == "triton_kernel":
                     assert (
                         self.ep_size == 1
                     ), "Triton kernel MoE is only supported when ep_size == 1"
-                if not self.enable_triton_kernel_moe and self.ep_size == 1:
-                    self.enable_triton_kernel_moe = True
+                if (
+                    self.moe_runner_backend == "auto"
+                    and self.ep_size == 1
+                    and is_triton_kernels_available()
+                ):
+                    self.moe_runner_backend = "triton_kernel"
                     logger.warning(
                         "Detected GPT-OSS model, enabling triton_kernels MOE kernel."
                     )
@@ -2137,7 +2276,10 @@ class ServerArgs:
                     # use bf16 for mxfp4 triton kernels
                     self.dtype = "bfloat16"
         elif "Llama4" in model_arch:
-            assert self.attention_backend == "fa3", "fa3 is required for Llama4 model"
+            assert self.attention_backend in {
+                "fa3",
+                "aiter",
+            }, "fa3 or aiter is required for Llama4 model"
         elif model_arch in [
             "Gemma2ForCausalLM",
             "Gemma3ForCausalLM",
@@ -2291,13 +2433,22 @@ class PortArgs:

 class LoRAPathAction(argparse.Action):
     def __call__(self, parser, namespace, values, option_string=None):
-        setattr(namespace, self.dest, {})
-        for lora_path in values:
-            if "=" in lora_path:
-                name, path = lora_path.split("=", 1)
-                getattr(namespace, self.dest)[name] = path
-            else:
-                getattr(namespace, self.dest)[lora_path] = lora_path
+        lora_paths = []
+        if values:
+            assert isinstance(values, list), "Expected a list of LoRA paths."
+            for lora_path in values:
+                lora_path = lora_path.strip()
+                if lora_path.startswith("{") and lora_path.endswith("}"):
+                    obj = json.loads(lora_path)
+                    assert "lora_path" in obj and "lora_name" in obj, (
+                        f"{repr(lora_path)} looks like a JSON str, "
+                        "but it does not contain 'lora_name' and 'lora_path' keys."
+                    )
+                    lora_paths.append(obj)
+                else:
+                    lora_paths.append(lora_path)
+
+        setattr(namespace, self.dest, lora_paths)


 class DeprecatedAction(argparse.Action):
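Note: combining the new --lora-paths help text with LoRAPathAction above and the __post_init__ normalization earlier in this diff, an adapter can now be given as a plain path, as <NAME>=<PATH>, or as a JSON object that may carry a pinned flag. The sketch below condenses those two code paths into one loop purely for illustration; the adapter names and paths are made up, and only the accepted formats and the LoRARef fields come from the diff itself:

    import json

    from sglang.srt.lora.lora_registry import LoRARef

    raw_items = [
        "/data/adapters/sql-lora",          # <PATH>
        "sql=/data/adapters/sql-lora",      # <NAME>=<PATH>
        '{"lora_name": "sql", "lora_path": "/data/adapters/sql-lora", "pinned": true}',
    ]

    refs = []
    for item in raw_items:
        item = item.strip()
        if item.startswith("{") and item.endswith("}"):
            # JSON form: may carry an explicit pinned flag
            obj = json.loads(item)
            refs.append(
                LoRARef(
                    lora_name=obj["lora_name"],
                    lora_path=obj["lora_path"],
                    pinned=obj.get("pinned", False),
                )
            )
        elif "=" in item:
            # Renamed path form
            name, path = item.split("=", 1)
            refs.append(LoRARef(lora_name=name, lora_path=path, pinned=False))
        else:
            # Plain path form: name defaults to the path
            refs.append(LoRARef(lora_name=item, lora_path=item, pinned=False))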
@@ -2310,6 +2461,10 @@ class DeprecatedAction(argparse.Action):
         raise ValueError(self.help)


+def print_deprecated_warning(message: str):
+    logger.warning(f"\033[33m{message}\033[0m")
+
+
 def auto_choose_speculative_params(self: ServerArgs):
     """
     Automatically choose the parameters for speculative decoding.
@@ -2322,8 +2477,12 @@ def auto_choose_speculative_params(self: ServerArgs):
     if arch in ["LlamaForCausalLM"]:
         # The default value for llama
         return (5, 4, 8)
-    elif arch in ["DeepseekV3ForCausalLM", "DeepseekV2ForCausalLM"]:
-        # The default value for deepseek
+    elif arch in [
+        "DeepseekV3ForCausalLM",
+        "DeepseekV2ForCausalLM",
+        "GptOssForCausalLM",
+    ]:
+        # The default value for deepseek and gpt-oss
         return (3, 1, 4)
     elif arch in ["Grok1ForCausalLM", "Grok1VForCausalLM"]:
         return (5, 4, 8)
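Note: the tuple returned by auto_choose_speculative_params feeds the unpacking shown in the "@@ -622,6 +656,16 @@" hunk earlier in this diff. Only the last element, speculative_num_draft_tokens, is visible in that unpacking, so the middle field name below is an inference rather than something this diff confirms. Under that assumption, the new GPT-OSS entry reads as:

    # Hedged reading of the (3, 1, 4) default now shared by DeepSeek and GPT-OSS:
    speculative_num_steps = 3          # draft steps per verification round
    speculative_eagle_topk = 1         # inferred field name; topk == 1 also satisfies the new trtllm_mha check added above
    speculative_num_draft_tokens = 4   # visible in the unpacking hunk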