sglang 0.5.0rc2__py3-none-any.whl → 0.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (180)
  1. sglang/bench_one_batch.py +0 -6
  2. sglang/bench_one_batch_server.py +7 -2
  3. sglang/bench_serving.py +3 -3
  4. sglang/eval/llama3_eval.py +0 -1
  5. sglang/srt/configs/model_config.py +24 -9
  6. sglang/srt/configs/update_config.py +40 -5
  7. sglang/srt/constrained/xgrammar_backend.py +23 -11
  8. sglang/srt/conversation.py +2 -15
  9. sglang/srt/disaggregation/ascend/conn.py +1 -3
  10. sglang/srt/disaggregation/base/conn.py +1 -0
  11. sglang/srt/disaggregation/decode.py +1 -1
  12. sglang/srt/disaggregation/launch_lb.py +7 -1
  13. sglang/srt/disaggregation/mini_lb.py +11 -5
  14. sglang/srt/disaggregation/mooncake/conn.py +141 -47
  15. sglang/srt/disaggregation/prefill.py +261 -5
  16. sglang/srt/disaggregation/utils.py +2 -1
  17. sglang/srt/distributed/device_communicators/custom_all_reduce.py +1 -1
  18. sglang/srt/distributed/device_communicators/pynccl.py +68 -18
  19. sglang/srt/distributed/device_communicators/pynccl_wrapper.py +52 -0
  20. sglang/srt/distributed/naive_distributed.py +112 -0
  21. sglang/srt/distributed/parallel_state.py +90 -4
  22. sglang/srt/entrypoints/context.py +20 -1
  23. sglang/srt/entrypoints/engine.py +27 -2
  24. sglang/srt/entrypoints/http_server.py +12 -0
  25. sglang/srt/entrypoints/openai/protocol.py +2 -2
  26. sglang/srt/entrypoints/openai/serving_chat.py +22 -6
  27. sglang/srt/entrypoints/openai/serving_completions.py +9 -1
  28. sglang/srt/entrypoints/openai/serving_responses.py +2 -2
  29. sglang/srt/eplb/expert_distribution.py +2 -3
  30. sglang/srt/function_call/deepseekv3_detector.py +1 -1
  31. sglang/srt/hf_transformers_utils.py +24 -0
  32. sglang/srt/host_shared_memory.py +83 -0
  33. sglang/srt/layers/attention/ascend_backend.py +132 -22
  34. sglang/srt/layers/attention/flashattention_backend.py +24 -17
  35. sglang/srt/layers/attention/flashinfer_backend.py +11 -3
  36. sglang/srt/layers/attention/flashinfer_mla_backend.py +226 -76
  37. sglang/srt/layers/attention/triton_backend.py +85 -46
  38. sglang/srt/layers/attention/triton_ops/decode_attention.py +33 -2
  39. sglang/srt/layers/attention/triton_ops/extend_attention.py +32 -2
  40. sglang/srt/layers/attention/trtllm_mha_backend.py +390 -30
  41. sglang/srt/layers/attention/trtllm_mla_backend.py +39 -16
  42. sglang/srt/layers/attention/utils.py +94 -15
  43. sglang/srt/layers/attention/vision.py +40 -13
  44. sglang/srt/layers/attention/vision_utils.py +65 -0
  45. sglang/srt/layers/communicator.py +51 -3
  46. sglang/srt/layers/dp_attention.py +23 -4
  47. sglang/srt/layers/elementwise.py +94 -0
  48. sglang/srt/layers/flashinfer_comm_fusion.py +29 -1
  49. sglang/srt/layers/layernorm.py +8 -1
  50. sglang/srt/layers/linear.py +24 -0
  51. sglang/srt/layers/logits_processor.py +5 -1
  52. sglang/srt/layers/moe/__init__.py +31 -0
  53. sglang/srt/layers/moe/ep_moe/layer.py +37 -33
  54. sglang/srt/layers/moe/fused_moe_native.py +14 -25
  55. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=384,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  56. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=129,N=352,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Max-Q_Workstation_Edition,dtype=fp8_w8a8.json +146 -0
  57. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=129,N=704,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +146 -0
  58. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=161,N=384,device_name=NVIDIA_RTX_PRO_6000_Blackwell_Max-Q_Workstation_Edition,dtype=fp8_w8a8.json +146 -0
  59. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +69 -76
  60. sglang/srt/layers/moe/fused_moe_triton/layer.py +66 -123
  61. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +20 -18
  62. sglang/srt/layers/moe/moe_runner/__init__.py +3 -0
  63. sglang/srt/layers/moe/moe_runner/base.py +13 -0
  64. sglang/srt/layers/moe/rocm_moe_utils.py +141 -0
  65. sglang/srt/layers/moe/router.py +15 -9
  66. sglang/srt/layers/moe/token_dispatcher/__init__.py +6 -0
  67. sglang/srt/layers/moe/token_dispatcher/base_dispatcher.py +55 -14
  68. sglang/srt/layers/moe/token_dispatcher/deepep.py +11 -21
  69. sglang/srt/layers/moe/token_dispatcher/standard.py +1 -1
  70. sglang/srt/layers/moe/topk.py +167 -83
  71. sglang/srt/layers/moe/utils.py +159 -18
  72. sglang/srt/layers/quantization/__init__.py +13 -14
  73. sglang/srt/layers/quantization/awq.py +7 -7
  74. sglang/srt/layers/quantization/base_config.py +2 -6
  75. sglang/srt/layers/quantization/blockwise_int8.py +4 -12
  76. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +72 -28
  77. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +2 -1
  78. sglang/srt/layers/quantization/fp8.py +127 -119
  79. sglang/srt/layers/quantization/fp8_kernel.py +195 -24
  80. sglang/srt/layers/quantization/fp8_utils.py +34 -9
  81. sglang/srt/layers/quantization/fpgemm_fp8.py +203 -0
  82. sglang/srt/layers/quantization/gptq.py +5 -4
  83. sglang/srt/layers/quantization/marlin_utils.py +11 -3
  84. sglang/srt/layers/quantization/marlin_utils_fp8.py +352 -0
  85. sglang/srt/layers/quantization/modelopt_quant.py +165 -68
  86. sglang/srt/layers/quantization/moe_wna16.py +10 -15
  87. sglang/srt/layers/quantization/mxfp4.py +206 -37
  88. sglang/srt/layers/quantization/quark/quark.py +390 -0
  89. sglang/srt/layers/quantization/quark/quark_moe.py +197 -0
  90. sglang/srt/layers/quantization/unquant.py +34 -70
  91. sglang/srt/layers/quantization/utils.py +25 -0
  92. sglang/srt/layers/quantization/w4afp8.py +7 -8
  93. sglang/srt/layers/quantization/w8a8_fp8.py +5 -13
  94. sglang/srt/layers/quantization/w8a8_int8.py +5 -13
  95. sglang/srt/layers/radix_attention.py +6 -0
  96. sglang/srt/layers/rotary_embedding.py +1 -0
  97. sglang/srt/lora/lora_manager.py +21 -22
  98. sglang/srt/lora/lora_registry.py +3 -3
  99. sglang/srt/lora/mem_pool.py +26 -24
  100. sglang/srt/lora/utils.py +10 -12
  101. sglang/srt/managers/cache_controller.py +76 -18
  102. sglang/srt/managers/detokenizer_manager.py +10 -2
  103. sglang/srt/managers/io_struct.py +9 -0
  104. sglang/srt/managers/mm_utils.py +1 -1
  105. sglang/srt/managers/schedule_batch.py +4 -9
  106. sglang/srt/managers/scheduler.py +25 -16
  107. sglang/srt/managers/session_controller.py +1 -1
  108. sglang/srt/managers/template_manager.py +7 -5
  109. sglang/srt/managers/tokenizer_manager.py +60 -21
  110. sglang/srt/managers/tp_worker.py +1 -0
  111. sglang/srt/managers/utils.py +59 -1
  112. sglang/srt/mem_cache/allocator.py +7 -5
  113. sglang/srt/mem_cache/allocator_ascend.py +0 -11
  114. sglang/srt/mem_cache/hicache_storage.py +14 -4
  115. sglang/srt/mem_cache/memory_pool.py +3 -3
  116. sglang/srt/mem_cache/memory_pool_host.py +35 -2
  117. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +56 -12
  118. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +8 -4
  119. sglang/srt/mem_cache/storage/nixl/hicache_nixl.py +153 -59
  120. sglang/srt/mem_cache/storage/nixl/nixl_utils.py +19 -53
  121. sglang/srt/mem_cache/storage/nixl/test_hicache_nixl_storage.py +46 -7
  122. sglang/srt/model_executor/cuda_graph_runner.py +25 -12
  123. sglang/srt/model_executor/forward_batch_info.py +4 -1
  124. sglang/srt/model_executor/model_runner.py +43 -32
  125. sglang/srt/model_executor/npu_graph_runner.py +94 -0
  126. sglang/srt/model_loader/loader.py +24 -6
  127. sglang/srt/models/dbrx.py +12 -6
  128. sglang/srt/models/deepseek.py +2 -1
  129. sglang/srt/models/deepseek_nextn.py +3 -1
  130. sglang/srt/models/deepseek_v2.py +224 -223
  131. sglang/srt/models/ernie4.py +2 -2
  132. sglang/srt/models/glm4_moe.py +25 -63
  133. sglang/srt/models/glm4v.py +52 -1
  134. sglang/srt/models/glm4v_moe.py +8 -11
  135. sglang/srt/models/gpt_oss.py +34 -74
  136. sglang/srt/models/granitemoe.py +0 -1
  137. sglang/srt/models/grok.py +376 -48
  138. sglang/srt/models/interns1.py +12 -47
  139. sglang/srt/models/internvl.py +6 -51
  140. sglang/srt/models/llama4.py +0 -2
  141. sglang/srt/models/minicpm3.py +0 -1
  142. sglang/srt/models/mixtral.py +0 -2
  143. sglang/srt/models/nemotron_nas.py +435 -0
  144. sglang/srt/models/olmoe.py +0 -1
  145. sglang/srt/models/phi4mm.py +3 -21
  146. sglang/srt/models/qwen2_5_vl.py +2 -0
  147. sglang/srt/models/qwen2_moe.py +3 -18
  148. sglang/srt/models/qwen3.py +2 -2
  149. sglang/srt/models/qwen3_classification.py +7 -1
  150. sglang/srt/models/qwen3_moe.py +9 -38
  151. sglang/srt/models/step3_vl.py +2 -1
  152. sglang/srt/models/xverse_moe.py +11 -5
  153. sglang/srt/multimodal/processors/base_processor.py +3 -3
  154. sglang/srt/multimodal/processors/internvl.py +7 -2
  155. sglang/srt/multimodal/processors/llava.py +11 -7
  156. sglang/srt/offloader.py +433 -0
  157. sglang/srt/operations.py +6 -1
  158. sglang/srt/reasoning_parser.py +4 -3
  159. sglang/srt/server_args.py +237 -104
  160. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +1 -0
  161. sglang/srt/speculative/eagle_utils.py +36 -13
  162. sglang/srt/speculative/eagle_worker.py +56 -3
  163. sglang/srt/tokenizer/tiktoken_tokenizer.py +161 -0
  164. sglang/srt/two_batch_overlap.py +16 -11
  165. sglang/srt/utils.py +68 -70
  166. sglang/test/runners.py +8 -5
  167. sglang/test/test_block_fp8.py +5 -6
  168. sglang/test/test_block_fp8_ep.py +13 -19
  169. sglang/test/test_cutlass_moe.py +4 -6
  170. sglang/test/test_cutlass_w4a8_moe.py +4 -3
  171. sglang/test/test_fp4_moe.py +4 -3
  172. sglang/test/test_utils.py +7 -0
  173. sglang/utils.py +0 -1
  174. sglang/version.py +1 -1
  175. {sglang-0.5.0rc2.dist-info → sglang-0.5.1.dist-info}/METADATA +7 -7
  176. {sglang-0.5.0rc2.dist-info → sglang-0.5.1.dist-info}/RECORD +179 -161
  177. sglang/srt/layers/quantization/fp4.py +0 -557
  178. {sglang-0.5.0rc2.dist-info → sglang-0.5.1.dist-info}/WHEEL +0 -0
  179. {sglang-0.5.0rc2.dist-info → sglang-0.5.1.dist-info}/licenses/LICENSE +0 -0
  180. {sglang-0.5.0rc2.dist-info → sglang-0.5.1.dist-info}/top_level.txt +0 -0
sglang/srt/server_args.py CHANGED
@@ -23,6 +23,7 @@ import sys
 import tempfile
 from typing import List, Literal, Optional, Union
 
+from sglang.srt.function_call.function_call_parser import FunctionCallParser
 from sglang.srt.hf_transformers_utils import check_gguf_file, get_config
 from sglang.srt.layers.utils import is_sm90_supported, is_sm100_supported
 from sglang.srt.lora.lora_registry import LoRARef
@@ -33,10 +34,12 @@ from sglang.srt.utils import (
     configure_ipv6,
     get_device,
     get_device_memory_capacity,
+    is_cuda,
     is_flashinfer_available,
     is_hip,
     is_port_available,
     is_remote_url,
+    is_triton_kernels_available,
     is_valid_ipv6_address,
     nullable_str,
 )
@@ -82,7 +85,6 @@ class ServerArgs:
     max_prefill_tokens: int = 16384
     schedule_policy: str = "fcfs"
     schedule_conservativeness: float = 1.0
-    cpu_offload_gb: int = 0
     page_size: Optional[int] = None
     hybrid_kvcache_ratio: Optional[float] = None
     swa_full_tokens_ratio: float = 0.8
@@ -120,6 +122,7 @@ class ServerArgs:
     decode_log_interval: int = 40
     enable_request_time_stats_logging: bool = False
     kv_events_config: Optional[str] = None
+    gc_warning_threshold_secs: float = 0.0
 
     # API related
     api_key: Optional[str] = None
@@ -150,7 +153,9 @@ class ServerArgs:
     enable_lora: Optional[bool] = None
     max_lora_rank: Optional[int] = None
     lora_target_modules: Optional[Union[set[str], List[str]]] = None
-    lora_paths: Optional[Union[dict[str, str], dict[str, LoRARef], List[str]]] = None
+    lora_paths: Optional[
+        Union[dict[str, str], List[dict[str, str]], List[str], List[LoRARef]]
+    ] = None
     max_loaded_loras: Optional[int] = None
     max_loras_per_batch: int = 8
     lora_backend: str = "triton"
@@ -175,9 +180,16 @@ class ServerArgs:
 
     # Expert parallelism
     ep_size: int = 1
-    moe_a2a_backend: Optional[Literal["deepep"]] = None
-    enable_flashinfer_cutlass_moe: bool = False
-    enable_flashinfer_trtllm_moe: bool = False
+    moe_a2a_backend: Literal["none", "deepep"] = "none"
+    moe_runner_backend: Literal[
+        "auto",
+        "triton",
+        "triton_kernel",
+        "flashinfer_trtllm",
+        "flashinfer_cutlass",
+        "flashinfer_mxfp4",
+    ] = "auto"
+    flashinfer_mxfp4_moe_precision: Literal["default", "bf16"] = "default"
     enable_flashinfer_allreduce_fusion: bool = False
     deepep_mode: Literal["auto", "normal", "low_latency"] = "auto"
     ep_num_redundant_experts: int = 0
@@ -213,6 +225,13 @@ class ServerArgs:
     ds_heavy_channel_type: str = "qk"
     ds_sparse_decode_threshold: int = 4096
 
+    # Offloading
+    cpu_offload_gb: int = 0
+    offload_group_size: int = -1
+    offload_num_in_group: int = 1
+    offload_prefetch_step: int = 1
+    offload_mode: str = "cpu"
+
     # Optimization/debug options
     disable_radix_cache: bool = False
     cuda_graph_max_bs: Optional[int] = None
@@ -223,6 +242,7 @@ class ServerArgs:
     enable_cudagraph_gc: bool = False
     enable_nccl_nvls: bool = False
     enable_symm_mem: bool = False
+    disable_flashinfer_cutlass_moe_fp4_allgather: bool = False
     enable_tokenizer_batch_encode: bool = False
     disable_outlines_disk_cache: bool = False
     disable_custom_all_reduce: bool = False
@@ -250,8 +270,6 @@ class ServerArgs:
     disable_chunked_prefix_cache: bool = False
     disable_fast_image_processor: bool = False
     enable_return_hidden_states: bool = False
-    enable_triton_kernel_moe: bool = False
-    enable_flashinfer_mxfp4_moe: bool = False
     scheduler_recv_interval: int = 1
 
     # Debug tensor dumps
@@ -282,12 +300,13 @@ class ServerArgs:
     # Deprecated arguments
     enable_ep_moe: bool = False
     enable_deepep_moe: bool = False
+    enable_flashinfer_cutlass_moe: bool = False
+    enable_flashinfer_trtllm_moe: bool = False
+    enable_triton_kernel_moe: bool = False
+    enable_flashinfer_mxfp4_moe: bool = False
 
     def __post_init__(self):
         # Check deprecated arguments
-        def print_deprecated_warning(message: str):
-            logger.warning(f"\033[33m{message}\033[0m")
-
         if self.enable_ep_moe:
             self.ep_size = self.tp_size
             print_deprecated_warning(
@@ -298,6 +317,26 @@ class ServerArgs:
             print_deprecated_warning(
                 "NOTE: --enable-deepep-moe is deprecated. Please set `--moe-a2a-backend` to 'deepep' instead."
             )
+        if self.enable_triton_kernel_moe:
+            self.moe_runner_backend = "triton_kernel"
+            print_deprecated_warning(
+                "NOTE: --enable-triton-kernel-moe is deprecated. Please set `--moe-runner-backend` to 'triton_kernel' instead."
+            )
+        if self.enable_flashinfer_cutlass_moe:
+            self.moe_runner_backend = "flashinfer_cutlass"
+            print_deprecated_warning(
+                "NOTE: --enable-flashinfer-cutlass-moe is deprecated. Please set `--moe-runner-backend` to 'flashinfer_cutlass' instead."
+            )
+        if self.enable_flashinfer_trtllm_moe:
+            self.moe_runner_backend = "flashinfer_trtllm"
+            print_deprecated_warning(
+                "NOTE: --enable-flashinfer-trtllm-moe is deprecated. Please set `--moe-runner-backend` to 'flashinfer_trtllm' instead."
+            )
+        if self.enable_flashinfer_mxfp4_moe:
+            self.moe_runner_backend = "flashinfer_mxfp4"
+            print_deprecated_warning(
+                "NOTE: --enable-flashinfer-mxfp4-moe is deprecated. Please set `--moe-runner-backend` to 'flashinfer_mxfp4' instead."
+            )
 
         # Set missing default values
         if self.tokenizer_path is None:
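Note: the four deprecated booleans above now fold into the single moe_runner_backend setting. A minimal reference sketch of the mapping follows (the dictionary itself is illustrative and not part of the diff):

    # Illustrative mapping from the deprecated flags to the new --moe-runner-backend values.
    DEPRECATED_MOE_FLAG_TO_BACKEND = {
        "--enable-triton-kernel-moe": "triton_kernel",
        "--enable-flashinfer-cutlass-moe": "flashinfer_cutlass",
        "--enable-flashinfer-trtllm-moe": "flashinfer_trtllm",
        "--enable-flashinfer-mxfp4-moe": "flashinfer_mxfp4",
    }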
@@ -448,11 +487,6 @@ class ServerArgs:
             )
             self.page_size = 64
 
-            if self.speculative_algorithm is not None:
-                raise ValueError(
-                    "trtllm_mla backend does not support speculative decoding yet."
-                )
-
             if self.kv_cache_dtype not in ["fp8_e4m3", "auto"]:
                 raise ValueError(
                     "TensorRT-LLM MLA backend only supports kv-cache-dtype of fp8_e4m3 or auto."
@@ -474,11 +508,6 @@ class ServerArgs:
             )
             self.page_size = 64
 
-            if self.speculative_algorithm is not None:
-                raise ValueError(
-                    "trtllm_mha backend does not support speculative decoding yet."
-                )
-
         if self.attention_backend == "dual_chunk_flash_attn":
             logger.warning(
                 "Mixed chunk, radix cache, and cuda graphs are disabled because of using dual chunk flash attention backend"
@@ -517,17 +546,16 @@ class ServerArgs:
         ), "Please enable dp attention when setting enable_dp_lm_head. "
 
         # MoE kernel
-        if self.enable_flashinfer_cutlass_moe:
+        if self.moe_runner_backend == "flashinfer_cutlass":
             assert (
                 self.quantization == "modelopt_fp4"
             ), "modelopt_fp4 quantization is required for Flashinfer MOE"
-            os.environ["TRTLLM_ENABLE_PDL"] = "1"
             assert self.ep_size in [
                 1,
                 self.tp_size,
             ], "The expert parallel size must be 1 or the same as the tensor parallel size"
 
-        if self.enable_flashinfer_trtllm_moe:
+        if self.moe_runner_backend == "flashinfer_trtllm":
             if not self.disable_shared_experts_fusion:
                 self.disable_shared_experts_fusion = True
                 logger.warning(
@@ -556,7 +584,7 @@ class ServerArgs:
             self.ep_dispatch_algorithm = "static"
 
         if self.enable_eplb:
-            assert self.ep_size > 1 or self.moe_a2a_backend is not None
+            assert self.ep_size > 1
 
         if self.enable_expert_distribution_metrics and (
            self.expert_distribution_recorder_mode is None
@@ -611,6 +639,10 @@ class ServerArgs:
                     logger.warning(
                         "DeepSeek MTP does not require setting speculative_draft_model_path."
                     )
+            if self.page_size != 1 and self.attention_backend == "flashinfer":
+                raise ValueError(
+                    "Speculative decoding with page_size != 1 is not supported. Please set page_size to 1."
+                )
 
             # Auto choose parameters
             if self.speculative_num_steps is None:
@@ -624,6 +656,16 @@ class ServerArgs:
                 self.speculative_num_draft_tokens,
             ) = auto_choose_speculative_params(self)
 
+            if (
+                self.attention_backend == "trtllm_mha"
+                or self.decode_attention_backend == "trtllm_mha"
+                or self.prefill_attention_backend == "trtllm_mha"
+            ):
+                if self.speculative_eagle_topk > 1:
+                    raise ValueError(
+                        "trtllm_mha backend only supports topk = 1 for speculative decoding."
+                    )
+
             if (
                 self.speculative_eagle_topk == 1
                 and self.speculative_num_draft_tokens != self.speculative_num_steps + 1
@@ -681,6 +723,12 @@ class ServerArgs:
             "1" if self.disable_outlines_disk_cache else "0"
         )
 
+        if self.enable_hierarchical_cache and self.disable_radix_cache:
+            raise ValueError(
+                "The arguments enable-hierarchical-cache and disable-radix-cache are mutually exclusive "
+                "and cannot be used at the same time. Please use only one of them."
+            )
+
     @staticmethod
     def add_cli_args(parser: argparse.ArgumentParser):
         # Model and tokenizer
@@ -934,12 +982,6 @@ class ServerArgs:
             default=ServerArgs.schedule_conservativeness,
             help="How conservative the schedule policy is. A larger value means more conservative scheduling. Use a larger value if you see requests being retracted frequently.",
         )
-        parser.add_argument(
-            "--cpu-offload-gb",
-            type=int,
-            default=ServerArgs.cpu_offload_gb,
-            help="How many GBs of RAM to reserve for CPU offloading.",
-        )
         parser.add_argument(
             "--page-size",
             type=int,
@@ -1132,6 +1174,12 @@ class ServerArgs:
             default=ServerArgs.collect_tokens_histogram,
             help="Collect prompt/generation tokens histogram.",
         )
+        parser.add_argument(
+            "--gc-warning-threshold-secs",
+            type=float,
+            default=ServerArgs.gc_warning_threshold_secs,
+            help="The threshold for long GC warning. If a GC takes longer than this, a warning will be logged. Set to 0 to disable.",
+        )
         parser.add_argument(
             "--decode-log-interval",
             type=int,
@@ -1200,23 +1248,13 @@ class ServerArgs:
             default=ServerArgs.reasoning_parser,
             help=f"Specify the parser for reasoning models, supported parsers are: {list(ReasoningParser.DetectorMap.keys())}.",
         )
+        tool_call_parser_choices = list(FunctionCallParser.ToolCallParserEnum.keys())
         parser.add_argument(
             "--tool-call-parser",
             type=str,
-            choices=[  # TODO: use FunctionCallParser.DetectorMap.keys()
-                "qwen25",
-                "mistral",
-                "llama3",
-                "deepseekv3",
-                "pythonic",
-                "kimi_k2",
-                "qwen3_coder",
-                "glm45",
-                "step3",
-                "gpt-oss",
-            ],
+            choices=tool_call_parser_choices,
             default=ServerArgs.tool_call_parser,
-            help="Specify the parser for handling tool-call interactions. Options include: 'qwen25', 'mistral', 'llama3', 'deepseekv3', 'pythonic', 'kimi_k2', 'qwen3_coder', 'glm45', and 'step3'.",
+            help=f"Specify the parser for handling tool-call interactions. Options include: {tool_call_parser_choices}.",
         )
         parser.add_argument(
             "--tool-server",
@@ -1301,7 +1339,7 @@ class ServerArgs:
             nargs="*",
             default=None,
             action=LoRAPathAction,
-            help="The list of LoRA adapters. You can provide a list of either path in str or renamed path in the format {name}={path}.",
+            help='The list of LoRA adapters to load. Each adapter must be specified in one of the following formats: <PATH> | <NAME>=<PATH> | JSON with schema {"lora_name":str,"lora_path":str,"pinned":bool}',
         )
         parser.add_argument(
             "--max-loras-per-batch",
@@ -1446,19 +1484,30 @@ class ServerArgs:
         parser.add_argument(
             "--moe-a2a-backend",
             type=str,
-            choices=["deepep"],
+            choices=["none", "deepep"],
             default=ServerArgs.moe_a2a_backend,
             help="Choose the backend for MoE A2A.",
         )
         parser.add_argument(
-            "--enable-flashinfer-cutlass-moe",
-            action="store_true",
-            help="Enable FlashInfer CUTLASS MoE backend for modelopt_fp4 quant on Blackwell. Supports MoE-EP",
+            "--moe-runner-backend",
+            type=str,
+            choices=[
+                "auto",
+                "triton",
+                "triton_kernel",
+                "flashinfer_trtllm",
+                "flashinfer_cutlass",
+                "flashinfer_mxfp4",
+            ],
+            default=ServerArgs.moe_runner_backend,
+            help="Choose the runner backend for MoE.",
         )
         parser.add_argument(
-            "--enable-flashinfer-trtllm-moe",
-            action="store_true",
-            help="Enable FlashInfer TRTLLM MoE backend on Blackwell. Supports BlockScale FP8 MoE-EP",
+            "--flashinfer-mxfp4-moe-precision",
+            type=str,
+            choices=["mxfp4", "bf16"],
+            default=ServerArgs.flashinfer_mxfp4_moe_precision,
+            help="Choose the computation precision of flashinfer mxfp4 moe",
         )
         parser.add_argument(
             "--enable-flashinfer-allreduce-fusion",
@@ -1634,6 +1683,38 @@ class ServerArgs:
             help="The type of heavy channels in double sparsity attention",
         )
 
+        # Offloading
+        parser.add_argument(
+            "--cpu-offload-gb",
+            type=int,
+            default=ServerArgs.cpu_offload_gb,
+            help="How many GBs of RAM to reserve for CPU offloading.",
+        )
+        parser.add_argument(
+            "--offload-group-size",
+            type=int,
+            default=ServerArgs.offload_group_size,
+            help="Number of layers per group in offloading.",
+        )
+        parser.add_argument(
+            "--offload-num-in-group",
+            type=int,
+            default=ServerArgs.offload_num_in_group,
+            help="Number of layers to be offloaded within a group.",
+        )
+        parser.add_argument(
+            "--offload-prefetch-step",
+            type=int,
+            default=ServerArgs.offload_prefetch_step,
+            help="Steps to prefetch in offloading.",
+        )
+        parser.add_argument(
+            "--offload-mode",
+            type=str,
+            default=ServerArgs.offload_mode,
+            help="Mode of offloading.",
+        )
+
         # Optimization/debug options
         parser.add_argument(
             "--disable-radix-cache",
@@ -1682,6 +1763,11 @@ class ServerArgs:
             action="store_true",
             help="Enable NCCL symmetric memory for fast collectives.",
         )
+        parser.add_argument(
+            "--disable-flashinfer-cutlass-moe-fp4-allgather",
+            action="store_true",
+            help="Disables quantize before all-gather for flashinfer cutlass moe.",
+        )
         parser.add_argument(
             "--enable-tokenizer-batch-encode",
             action="store_true",
@@ -1825,16 +1911,6 @@ class ServerArgs:
             action="store_true",
             help="Enable returning hidden states with responses.",
         )
-        parser.add_argument(
-            "--enable-triton-kernel-moe",
-            action="store_true",
-            help="Use triton moe grouped gemm kernel.",
-        )
-        parser.add_argument(
-            "--enable-flashinfer-mxfp4-moe",
-            action="store_true",
-            help="Enable FlashInfer MXFP4 MoE backend for modelopt_fp4 quant on Blackwell.",
-        )
         parser.add_argument(
             "--scheduler-recv-interval",
             type=int,
@@ -1935,24 +2011,25 @@ class ServerArgs:
             default=None,
             help="The custom dataloader which used to update the model. Should be set with a valid import path, such as my_package.weight_load_func",
         )
+        parser.add_argument(
+            "--weight-loader-disable-mmap",
+            action="store_true",
+            help="Disable mmap while loading weight using safetensors.",
+        )
+
+        # For PD-Multiplexing
         parser.add_argument(
             "--enable-pdmux",
             action="store_true",
             help="Enable PD-Multiplexing, PD running on greenctx stream.",
         )
 
-        # For PD-Multiplexing
         parser.add_argument(
             "--sm-group-num",
             type=int,
             default=ServerArgs.sm_group_num,
             help="Number of sm partition groups.",
         )
-        parser.add_argument(
-            "--weight-loader-disable-mmap",
-            action="store_true",
-            help="Disable mmap while loading weight using safetensors.",
-        )
 
         # Deprecated arguments
         parser.add_argument(
@@ -1965,6 +2042,26 @@ class ServerArgs:
             action="store_true",
             help="(Deprecated) Enabling DeepEP MoE implementation for EP MoE.",
         )
+        parser.add_argument(
+            "--enable-flashinfer-cutlass-moe",
+            action="store_true",
+            help="(Deprecated) Enable FlashInfer CUTLASS MoE backend for modelopt_fp4 quant on Blackwell. Supports MoE-EP",
+        )
+        parser.add_argument(
+            "--enable-flashinfer-trtllm-moe",
+            action="store_true",
+            help="(Deprecated) Enable FlashInfer TRTLLM MoE backend on Blackwell. Supports BlockScale FP8 MoE-EP",
+        )
+        parser.add_argument(
+            "--enable-triton-kernel-moe",
+            action="store_true",
+            help="(Deprecated) Use triton moe grouped gemm kernel.",
+        )
+        parser.add_argument(
+            "--enable-flashinfer-mxfp4-moe",
+            action="store_true",
+            help="(Deprecated) Enable FlashInfer MXFP4 MoE backend for modelopt_fp4 quant on Blackwell.",
+        )
 
     @classmethod
     def from_cli_args(cls, args: argparse.Namespace):
@@ -2049,28 +2146,42 @@ class ServerArgs:
             )
 
         if self.enable_lora:
-            # Normalize lora_paths to a dictionary if it is a list.
-            # TODO (lifuhuang): support specifying pinned adapters in server_args.
             if isinstance(self.lora_paths, list):
                 lora_paths = self.lora_paths
-                self.lora_paths = {}
+                self.lora_paths = []
                 for lora_path in lora_paths:
-                    if "=" in lora_path:
-                        name, path = lora_path.split("=", 1)
-                        self.lora_paths[name] = LoRARef(
-                            lora_name=name, lora_path=path, pinned=False
+                    if isinstance(lora_path, str):
+                        if "=" in lora_path:
+                            name, path = lora_path.split("=", 1)
+                            lora_ref = LoRARef(
+                                lora_name=name, lora_path=path, pinned=False
+                            )
+                        else:
+                            lora_ref = LoRARef(
+                                lora_name=lora_path, lora_path=lora_path, pinned=False
+                            )
+                    elif isinstance(lora_path, dict):
+                        assert (
+                            "lora_name" in lora_path and "lora_path" in lora_path
+                        ), f"When providing LoRA paths as a list of dict, each dict should contain 'lora_name' and 'lora_path' keys. Got: {lora_path}"
+                        lora_ref = LoRARef(
+                            lora_name=lora_path["lora_name"],
+                            lora_path=lora_path["lora_path"],
+                            pinned=lora_path.get("pinned", False),
                         )
                     else:
-                        self.lora_paths[lora_path] = LoRARef(
-                            lora_name=lora_path, lora_path=lora_path, pinned=False
+                        raise ValueError(
+                            f"Invalid type for item in --lora-paths list: {type(lora_path)}. "
+                            "Expected a string or a dictionary."
                         )
+                    self.lora_paths.append(lora_ref)
             elif isinstance(self.lora_paths, dict):
-                self.lora_paths = {
-                    k: LoRARef(lora_name=k, lora_path=v, pinned=False)
+                self.lora_paths = [
+                    LoRARef(lora_name=k, lora_path=v, pinned=False)
                     for k, v in self.lora_paths.items()
-                }
+                ]
             elif self.lora_paths is None:
-                self.lora_paths = {}
+                self.lora_paths = []
             else:
                 raise ValueError(
                     f"Invalid type for --lora-paths: {type(self.lora_paths)}. "
@@ -2097,9 +2208,7 @@ class ServerArgs:
                 "max_loaded_loras should be greater than or equal to max_loras_per_batch. "
                 f"max_loaded_loras={self.max_loaded_loras}, max_loras_per_batch={self.max_loras_per_batch}"
             )
-            assert (
-                not self.lora_paths or len(self.lora_paths) <= self.max_loaded_loras
-            ), (
+            assert len(self.lora_paths) <= self.max_loaded_loras, (
                 "The number of LoRA paths should not exceed max_loaded_loras. "
                 f"max_loaded_loras={self.max_loaded_loras}, lora_paths={len(self.lora_paths)}"
             )
@@ -2117,9 +2226,9 @@ class ServerArgs:
         model_arch = hf_config.architectures[0]
         if model_arch in ["GptOssForCausalLM"]:
             if self.attention_backend is None:
-                if is_sm100_supported():
+                if is_cuda() and is_sm100_supported():
                     self.attention_backend = "trtllm_mha"
-                elif is_sm90_supported():
+                elif is_cuda() and is_sm90_supported():
                     self.attention_backend = "fa3"
                 else:
                     self.attention_backend = "triton"
@@ -2132,10 +2241,11 @@ class ServerArgs:
             ), f"GptOssForCausalLM requires one of {supported_backends} attention backend, but got '{self.attention_backend}'"
 
             if is_sm100_supported():
-                self.enable_flashinfer_allreduce_fusion = True
-                logger.info(
-                    "Enable FlashInfer AllReduce Fusion on sm100 for GptOssForCausalLM"
-                )
+                if not self.enable_dp_attention:
+                    self.enable_flashinfer_allreduce_fusion = True
+                    logger.info(
+                        "Enable FlashInfer AllReduce Fusion on sm100 for GptOssForCausalLM"
+                    )
             quantization_config = getattr(hf_config, "quantization_config", None)
             is_mxfp4_quant_format = (
                 quantization_config is not None
@@ -2143,18 +2253,21 @@ class ServerArgs:
             )
 
             if is_sm100_supported() and is_mxfp4_quant_format:
-                self.enable_flashinfer_mxfp4_moe = True
-                self.enable_triton_kernel_moe = False
+                self.moe_runner_backend = "flashinfer_mxfp4"
                 logger.warning(
                     "Detected SM100 and MXFP4 quantization format for GPT-OSS model, enabling FlashInfer MXFP4 MOE kernel."
                 )
             else:
-                if self.enable_triton_kernel_moe:
+                if self.moe_runner_backend == "triton_kernel":
                     assert (
                         self.ep_size == 1
                     ), "Triton kernel MoE is only supported when ep_size == 1"
-                if not self.enable_triton_kernel_moe and self.ep_size == 1:
-                    self.enable_triton_kernel_moe = True
+                if (
+                    self.moe_runner_backend == "auto"
+                    and self.ep_size == 1
+                    and is_triton_kernels_available()
+                ):
+                    self.moe_runner_backend = "triton_kernel"
                     logger.warning(
                         "Detected GPT-OSS model, enabling triton_kernels MOE kernel."
                     )
@@ -2163,7 +2276,10 @@ class ServerArgs:
                     # use bf16 for mxfp4 triton kernels
                     self.dtype = "bfloat16"
         elif "Llama4" in model_arch:
-            assert self.attention_backend == "fa3", "fa3 is required for Llama4 model"
+            assert self.attention_backend in {
+                "fa3",
+                "aiter",
+            }, "fa3 or aiter is required for Llama4 model"
         elif model_arch in [
             "Gemma2ForCausalLM",
             "Gemma3ForCausalLM",
@@ -2317,13 +2433,22 @@ class PortArgs:
 
 class LoRAPathAction(argparse.Action):
     def __call__(self, parser, namespace, values, option_string=None):
-        setattr(namespace, self.dest, {})
-        for lora_path in values:
-            if "=" in lora_path:
-                name, path = lora_path.split("=", 1)
-                getattr(namespace, self.dest)[name] = path
-            else:
-                getattr(namespace, self.dest)[lora_path] = lora_path
+        lora_paths = []
+        if values:
+            assert isinstance(values, list), "Expected a list of LoRA paths."
+            for lora_path in values:
+                lora_path = lora_path.strip()
+                if lora_path.startswith("{") and lora_path.endswith("}"):
+                    obj = json.loads(lora_path)
+                    assert "lora_path" in obj and "lora_name" in obj, (
+                        f"{repr(lora_path)} looks like a JSON str, "
+                        "but it does not contain 'lora_name' and 'lora_path' keys."
+                    )
+                    lora_paths.append(obj)
+                else:
+                    lora_paths.append(lora_path)
+
+        setattr(namespace, self.dest, lora_paths)
 
 
 class DeprecatedAction(argparse.Action):
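A hedged sketch of the item formats the reworked --lora-paths flag accepts after this change; the adapter names and paths below are invented, only the formats come from the diff:

    # <PATH>, <NAME>=<PATH>, or an inline JSON object with lora_name/lora_path/pinned.
    example_lora_paths = [
        "/data/adapters/sql_lora",
        "chat=/data/adapters/chat_lora",
        '{"lora_name": "code", "lora_path": "/data/adapters/code_lora", "pinned": true}',
    ]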
@@ -2336,6 +2461,10 @@ class DeprecatedAction(argparse.Action):
         raise ValueError(self.help)
 
 
+def print_deprecated_warning(message: str):
+    logger.warning(f"\033[33m{message}\033[0m")
+
+
 def auto_choose_speculative_params(self: ServerArgs):
     """
     Automatically choose the parameters for speculative decoding.
@@ -2348,8 +2477,12 @@ def auto_choose_speculative_params(self: ServerArgs):
     if arch in ["LlamaForCausalLM"]:
         # The default value for llama
         return (5, 4, 8)
-    elif arch in ["DeepseekV3ForCausalLM", "DeepseekV2ForCausalLM"]:
-        # The default value for deepseek
+    elif arch in [
+        "DeepseekV3ForCausalLM",
+        "DeepseekV2ForCausalLM",
+        "GptOssForCausalLM",
+    ]:
+        # The default value for deepseek and gpt-oss
         return (3, 1, 4)
     elif arch in ["Grok1ForCausalLM", "Grok1VForCausalLM"]:
         return (5, 4, 8)
sglang/srt/speculative/eagle_draft_cuda_graph_runner.py CHANGED
@@ -41,6 +41,7 @@ class EAGLEDraftCudaGraphRunner:
         # Parse args
         self.eagle_worker = eagle_worker
         self.model_runner = model_runner = eagle_worker.model_runner
+        self.model_runner: EAGLEWorker
         self.graphs = {}
         self.output_buffers = {}
         self.enable_torch_compile = model_runner.server_args.enable_torch_compile