sglang 0.4.10.post2__py3-none-any.whl → 0.5.0rc1__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
Files changed (175)
  1. sglang/__init__.py +8 -3
  2. sglang/bench_one_batch.py +119 -17
  3. sglang/lang/chat_template.py +18 -0
  4. sglang/srt/bench_utils.py +137 -0
  5. sglang/srt/configs/model_config.py +42 -7
  6. sglang/srt/conversation.py +9 -5
  7. sglang/srt/disaggregation/base/conn.py +5 -2
  8. sglang/srt/disaggregation/decode.py +14 -4
  9. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +3 -0
  10. sglang/srt/disaggregation/mooncake/conn.py +286 -160
  11. sglang/srt/disaggregation/mooncake/transfer_engine.py +29 -0
  12. sglang/srt/disaggregation/prefill.py +2 -0
  13. sglang/srt/distributed/parallel_state.py +15 -11
  14. sglang/srt/entrypoints/context.py +227 -0
  15. sglang/srt/entrypoints/engine.py +15 -9
  16. sglang/srt/entrypoints/harmony_utils.py +372 -0
  17. sglang/srt/entrypoints/http_server.py +74 -4
  18. sglang/srt/entrypoints/openai/protocol.py +218 -1
  19. sglang/srt/entrypoints/openai/serving_chat.py +41 -11
  20. sglang/srt/entrypoints/openai/serving_responses.py +1273 -0
  21. sglang/srt/entrypoints/openai/tool_server.py +175 -0
  22. sglang/srt/entrypoints/tool.py +87 -0
  23. sglang/srt/eplb/expert_location.py +5 -1
  24. sglang/srt/function_call/ebnf_composer.py +1 -0
  25. sglang/srt/function_call/function_call_parser.py +2 -0
  26. sglang/srt/function_call/glm4_moe_detector.py +1 -1
  27. sglang/srt/function_call/gpt_oss_detector.py +331 -0
  28. sglang/srt/function_call/kimik2_detector.py +3 -3
  29. sglang/srt/function_call/qwen3_coder_detector.py +219 -9
  30. sglang/srt/hf_transformers_utils.py +30 -3
  31. sglang/srt/jinja_template_utils.py +14 -1
  32. sglang/srt/layers/attention/aiter_backend.py +375 -115
  33. sglang/srt/layers/attention/ascend_backend.py +3 -0
  34. sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1700 -0
  35. sglang/srt/layers/attention/flashattention_backend.py +18 -0
  36. sglang/srt/layers/attention/flashinfer_backend.py +52 -13
  37. sglang/srt/layers/attention/hybrid_attn_backend.py +1 -1
  38. sglang/srt/layers/attention/triton_backend.py +85 -14
  39. sglang/srt/layers/attention/triton_ops/decode_attention.py +17 -0
  40. sglang/srt/layers/attention/triton_ops/extend_attention.py +143 -98
  41. sglang/srt/layers/attention/trtllm_mha_backend.py +332 -0
  42. sglang/srt/layers/attention/trtllm_mla_backend.py +119 -22
  43. sglang/srt/layers/attention/vision.py +22 -6
  44. sglang/srt/layers/attention/wave_backend.py +627 -0
  45. sglang/srt/layers/attention/wave_ops/decode_attention.py +186 -0
  46. sglang/srt/layers/attention/wave_ops/extend_attention.py +149 -0
  47. sglang/srt/layers/attention/wave_ops/prefill_attention.py +79 -0
  48. sglang/srt/layers/communicator.py +29 -14
  49. sglang/srt/layers/dp_attention.py +12 -0
  50. sglang/srt/layers/flashinfer_comm_fusion.py +4 -4
  51. sglang/srt/layers/linear.py +3 -7
  52. sglang/srt/layers/moe/cutlass_moe.py +12 -3
  53. sglang/srt/layers/moe/cutlass_w4a8_moe.py +4 -5
  54. sglang/srt/layers/moe/ep_moe/kernels.py +43 -0
  55. sglang/srt/layers/moe/ep_moe/layer.py +135 -73
  56. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  57. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=384,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  58. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +101 -12
  59. sglang/srt/layers/moe/fused_moe_triton/layer.py +412 -33
  60. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +188 -3
  61. sglang/srt/layers/moe/token_dispatcher/deepep.py +61 -24
  62. sglang/srt/layers/moe/topk.py +16 -4
  63. sglang/srt/layers/moe/utils.py +16 -0
  64. sglang/srt/layers/quantization/__init__.py +27 -3
  65. sglang/srt/layers/quantization/fp4.py +557 -0
  66. sglang/srt/layers/quantization/fp8.py +3 -6
  67. sglang/srt/layers/quantization/fp8_kernel.py +277 -0
  68. sglang/srt/layers/quantization/fp8_utils.py +51 -10
  69. sglang/srt/layers/quantization/modelopt_quant.py +258 -68
  70. sglang/srt/layers/quantization/mxfp4.py +654 -0
  71. sglang/srt/layers/quantization/mxfp4_tensor.py +133 -0
  72. sglang/srt/layers/quantization/quark/schemes/__init__.py +6 -0
  73. sglang/srt/layers/quantization/quark/schemes/quark_scheme.py +55 -0
  74. sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +118 -0
  75. sglang/srt/layers/quantization/quark/utils.py +107 -0
  76. sglang/srt/layers/quantization/unquant.py +60 -6
  77. sglang/srt/layers/quantization/w4afp8.py +21 -12
  78. sglang/srt/layers/quantization/w8a8_int8.py +48 -34
  79. sglang/srt/layers/rotary_embedding.py +506 -3
  80. sglang/srt/layers/utils.py +9 -0
  81. sglang/srt/layers/vocab_parallel_embedding.py +8 -3
  82. sglang/srt/lora/backend/base_backend.py +3 -23
  83. sglang/srt/lora/layers.py +60 -114
  84. sglang/srt/lora/lora.py +17 -62
  85. sglang/srt/lora/lora_manager.py +82 -62
  86. sglang/srt/lora/lora_registry.py +23 -11
  87. sglang/srt/lora/mem_pool.py +63 -68
  88. sglang/srt/lora/triton_ops/qkv_lora_b.py +1 -1
  89. sglang/srt/lora/utils.py +25 -58
  90. sglang/srt/managers/cache_controller.py +75 -58
  91. sglang/srt/managers/detokenizer_manager.py +1 -1
  92. sglang/srt/managers/io_struct.py +20 -8
  93. sglang/srt/managers/mm_utils.py +6 -13
  94. sglang/srt/managers/multimodal_processor.py +1 -1
  95. sglang/srt/managers/schedule_batch.py +61 -25
  96. sglang/srt/managers/schedule_policy.py +6 -6
  97. sglang/srt/managers/scheduler.py +41 -19
  98. sglang/srt/managers/scheduler_output_processor_mixin.py +1 -2
  99. sglang/srt/managers/scheduler_profiler_mixin.py +28 -8
  100. sglang/srt/managers/scheduler_recv_skipper.py +37 -0
  101. sglang/srt/managers/scheduler_update_weights_mixin.py +6 -0
  102. sglang/srt/managers/template_manager.py +35 -1
  103. sglang/srt/managers/tokenizer_manager.py +47 -30
  104. sglang/srt/managers/tp_worker.py +3 -0
  105. sglang/srt/managers/tp_worker_overlap_thread.py +3 -0
  106. sglang/srt/mem_cache/allocator.py +61 -87
  107. sglang/srt/mem_cache/hicache_storage.py +1 -1
  108. sglang/srt/mem_cache/hiradix_cache.py +80 -22
  109. sglang/srt/mem_cache/lora_radix_cache.py +421 -0
  110. sglang/srt/mem_cache/memory_pool_host.py +34 -36
  111. sglang/srt/mem_cache/multimodal_cache.py +33 -13
  112. sglang/srt/mem_cache/radix_cache.py +2 -5
  113. sglang/srt/mem_cache/storage/hf3fs/client_hf3fs.py +2 -2
  114. sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +443 -0
  115. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +139 -67
  116. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +6 -9
  117. sglang/srt/model_executor/cuda_graph_runner.py +29 -9
  118. sglang/srt/model_executor/forward_batch_info.py +61 -19
  119. sglang/srt/model_executor/model_runner.py +148 -37
  120. sglang/srt/model_loader/loader.py +18 -6
  121. sglang/srt/model_loader/weight_utils.py +10 -0
  122. sglang/srt/models/bailing_moe.py +425 -0
  123. sglang/srt/models/deepseek_v2.py +137 -59
  124. sglang/srt/models/ernie4.py +426 -0
  125. sglang/srt/models/ernie4_eagle.py +203 -0
  126. sglang/srt/models/gemma2.py +0 -34
  127. sglang/srt/models/gemma3n_mm.py +38 -0
  128. sglang/srt/models/glm4.py +6 -0
  129. sglang/srt/models/glm4_moe.py +28 -16
  130. sglang/srt/models/glm4v.py +589 -0
  131. sglang/srt/models/glm4v_moe.py +400 -0
  132. sglang/srt/models/gpt_oss.py +1251 -0
  133. sglang/srt/models/granite.py +0 -25
  134. sglang/srt/models/llama.py +0 -25
  135. sglang/srt/models/llama4.py +1 -1
  136. sglang/srt/models/qwen2.py +6 -0
  137. sglang/srt/models/qwen2_5_vl.py +7 -3
  138. sglang/srt/models/qwen2_audio.py +10 -9
  139. sglang/srt/models/qwen2_moe.py +6 -0
  140. sglang/srt/models/qwen3.py +0 -24
  141. sglang/srt/models/qwen3_moe.py +32 -6
  142. sglang/srt/models/registry.py +1 -1
  143. sglang/srt/models/step3_vl.py +9 -0
  144. sglang/srt/models/torch_native_llama.py +0 -24
  145. sglang/srt/models/transformers.py +2 -5
  146. sglang/srt/multimodal/processors/base_processor.py +23 -13
  147. sglang/srt/multimodal/processors/glm4v.py +132 -0
  148. sglang/srt/multimodal/processors/qwen_audio.py +4 -2
  149. sglang/srt/multimodal/processors/step3_vl.py +3 -1
  150. sglang/srt/reasoning_parser.py +332 -37
  151. sglang/srt/server_args.py +186 -75
  152. sglang/srt/speculative/eagle_worker.py +16 -0
  153. sglang/srt/two_batch_overlap.py +169 -9
  154. sglang/srt/utils.py +41 -5
  155. sglang/srt/weight_sync/tensor_bucket.py +106 -0
  156. sglang/test/attention/test_trtllm_mla_backend.py +186 -36
  157. sglang/test/doc_patch.py +59 -0
  158. sglang/test/few_shot_gsm8k.py +1 -1
  159. sglang/test/few_shot_gsm8k_engine.py +1 -1
  160. sglang/test/run_eval.py +4 -1
  161. sglang/test/runners.py +2 -2
  162. sglang/test/simple_eval_common.py +6 -0
  163. sglang/test/simple_eval_gpqa.py +2 -0
  164. sglang/test/test_fp4_moe.py +118 -36
  165. sglang/test/test_utils.py +1 -1
  166. sglang/utils.py +1 -1
  167. sglang/version.py +1 -1
  168. {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc1.dist-info}/METADATA +36 -38
  169. {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc1.dist-info}/RECORD +174 -141
  170. sglang/srt/lora/backend/flashinfer_backend.py +0 -131
  171. /sglang/{api.py → lang/api.py} +0 -0
  172. /sglang/{lang/backend → srt/layers/quantization/quark}/__init__.py +0 -0
  173. {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc1.dist-info}/WHEEL +0 -0
  174. {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc1.dist-info}/licenses/LICENSE +0 -0
  175. {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc1.dist-info}/top_level.txt +0 -0
sglang/srt/server_args.py CHANGED
@@ -108,7 +108,7 @@ class ServerArgs:
     log_level: str = "info"
     log_level_http: Optional[str] = None
     log_requests: bool = False
-    log_requests_level: int = 0
+    log_requests_level: int = 2
     crash_dump_folder: Optional[str] = None
     show_time_cost: bool = False
     enable_metrics: bool = False
@@ -130,6 +130,7 @@ class ServerArgs:
     enable_cache_report: bool = False
     reasoning_parser: Optional[str] = None
     tool_call_parser: Optional[str] = None
+    tool_server: Optional[str] = None

     # Data parallelism
     dp_size: int = 1
@@ -201,6 +202,7 @@ class ServerArgs:
     hicache_io_backend: str = "kernel"
     hicache_mem_layout: str = "layer_first"
     hicache_storage_backend: Optional[str] = None
+    hicache_storage_prefetch_policy: str = "best_effort"

     # Double Sparsity
     enable_double_sparsity: bool = False
@@ -229,6 +231,7 @@ class ServerArgs:
     enable_dp_attention: bool = False
     enable_dp_lm_head: bool = False
     enable_two_batch_overlap: bool = False
+    tbo_token_distribution_threshold: float = 0.48
     enable_torch_compile: bool = False
     torch_compile_max_bs: int = 32
     torchao_config: str = ""
@@ -247,6 +250,8 @@ class ServerArgs:
     disable_fast_image_processor: bool = False
     enable_return_hidden_states: bool = False
     enable_triton_kernel_moe: bool = False
+    enable_flashinfer_mxfp4_moe: bool = False
+    scheduler_recv_interval: int = 1

     # Debug tensor dumps
     debug_tensor_dump_output_folder: Optional[str] = None
@@ -278,7 +283,6 @@ class ServerArgs:
     enable_deepep_moe: bool = False

     def __post_init__(self):
-
         # Check deprecated arguments
         def print_deprecated_warning(message: str):
             logger.warning(f"\033[33m{message}\033[0m")
@@ -384,6 +388,9 @@ class ServerArgs:
             self.attention_backend = "torch_native"
             self.sampling_backend = "pytorch"

+        # Model-specific adjustments
+        self.model_specific_adjustments()
+
         # Set kernel backends
         if self.device == "cpu":
             if self.attention_backend is None:
@@ -425,7 +432,10 @@ class ServerArgs:
             )
             self.page_size = 128

-        if self.attention_backend == "trtllm_mla":
+        if (
+            self.attention_backend == "trtllm_mla"
+            or self.decode_attention_backend == "trtllm_mla"
+        ):
             if not is_sm100_supported():
                 raise ValueError(
                     "TRTLLM MLA backend is only supported on Blackwell GPUs (SM100). Please use a different backend."
@@ -436,11 +446,46 @@ class ServerArgs:
                     f"TensorRT-LLM MLA only supports page_size of 32 or 64, changing page_size from {self.page_size} to 64."
                 )
                 self.page_size = 64
+
             if self.speculative_algorithm is not None:
                 raise ValueError(
                     "trtllm_mla backend does not support speculative decoding yet."
                 )

+            if self.kv_cache_dtype not in ["fp8_e4m3", "auto"]:
+                raise ValueError(
+                    "TensorRT-LLM MLA backend only supports kv-cache-dtype of fp8_e4m3 or auto."
+                )
+
+        if (
+            self.attention_backend == "trtllm_mha"
+            or self.decode_attention_backend == "trtllm_mha"
+            or self.prefill_attention_backend == "trtllm_mha"
+        ):
+            if not is_sm100_supported():
+                raise ValueError(
+                    "TRTLLM MHA backend is only supported on Blackwell GPUs (SM100). Please use a different backend."
+                )
+
+            if self.page_size not in [16, 32, 64]:
+                logger.warning(
+                    f"TensorRT-LLM MHA only supports page_size of 16, 32 or 64, changing page_size from {self.page_size} to 64."
+                )
+                self.page_size = 64
+
+            if self.speculative_algorithm is not None:
+                raise ValueError(
+                    "trtllm_mha backend does not support speculative decoding yet."
+                )
+
+        if self.attention_backend == "dual_chunk_flash_attn":
+            logger.warning(
+                "Mixed chunk, radix cache, and cuda graphs are disabled because of using dual chunk flash attention backend"
+            )
+            self.enable_mixed_chunk = False
+            self.disable_cuda_graph = True
+            self.disable_radix_cache = True
+
         # Set page size
         if self.page_size is None:
             self.page_size = 1
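The snippet below is illustrative and not part of the diff: a minimal sketch of how the new trtllm_mla checks surface when arguments are parsed, assuming a Blackwell (SM100) machine and a locally available MLA-style checkpoint at the placeholder path (ServerArgs.__post_init__ now also reads the HF config, so a real model directory is needed in practice).

# Illustrative only. Placeholder model path; flag names are the ones defined later in this file.
from sglang.srt.server_args import prepare_server_args

args = prepare_server_args(
    [
        "--model-path", "/models/some-mla-checkpoint",  # placeholder
        "--attention-backend", "trtllm_mla",
        "--kv-cache-dtype", "fp8_e4m3",  # only "fp8_e4m3" or "auto" pass the new check
        "--page-size", "16",             # coerced to 64 with a warning (32 and 64 are kept)
    ]
)
# On non-SM100 GPUs, or with --speculative-algorithm set, __post_init__ raises ValueError.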
@@ -481,6 +526,13 @@ class ServerArgs:
             self.tp_size,
         ], "The expert parallel size must be 1 or the same as the tensor parallel size"

+        if self.enable_flashinfer_trtllm_moe:
+            if not self.disable_shared_experts_fusion:
+                self.disable_shared_experts_fusion = True
+                logger.warning(
+                    "FlashInfer TRTLLM MoE is enabled. --disable-shared-experts-fusion is automatically set."
+                )
+
         # DeepEP MoE
         if self.moe_a2a_backend == "deepep":
             if self.deepep_mode == "normal":
@@ -493,7 +545,7 @@ class ServerArgs:

         if self.enable_eplb and (self.expert_distribution_recorder_mode is None):
             self.expert_distribution_recorder_mode = "stat"
-            logger.info(
+            logger.warning(
                 "EPLB is enabled. The expert_distribution_recorder_mode is automatically set."
             )

@@ -501,9 +553,6 @@ class ServerArgs:
             self.ep_dispatch_algorithm is None
         ):
             self.ep_dispatch_algorithm = "static"
-            logger.info(
-                "EPLB is enabled or init_expert_location is provided. ep_dispatch_algorithm is configured."
-            )

         if self.enable_eplb:
             assert self.ep_size > 1 or self.moe_a2a_backend is not None
@@ -526,6 +575,11 @@ class ServerArgs:
                 "Pipeline parallelism is incompatible with overlap schedule."
             )

+        if self.hicache_storage_backend == "mooncake":
+            # to use mooncake storage backend, the following conditions must be met:
+            self.hicache_io_backend = "kernel"
+            self.hicache_mem_layout = "page_first"
+
         # Speculative Decoding
         if self.speculative_algorithm == "NEXTN":
             # NEXTN shares the same implementation of EAGLE
@@ -806,6 +860,7 @@ class ServerArgs:
                 "moe_wna16",
                 "qoq",
                 "w4afp8",
+                "mxfp4",
             ],
             help="The quantization method.",
         )
@@ -868,7 +923,7 @@ class ServerArgs:
             "--schedule-policy",
             type=str,
             default=ServerArgs.schedule_policy,
-            choices=["lpm", "random", "fcfs", "dfs-weight"],
+            choices=["lpm", "random", "fcfs", "dfs-weight", "lof"],
             help="The scheduling policy of the requests.",
         )
         parser.add_argument(
@@ -1021,7 +1076,7 @@ class ServerArgs:
         parser.add_argument(
             "--log-requests-level",
             type=int,
-            default=0,
+            default=ServerArgs.log_requests_level,
             help="0: Log metadata (no sampling parameters). 1: Log metadata and sampling parameters. 2: Log metadata, sampling parameters and partial input/output. 3: Log every input/output.",
             choices=[0, 1, 2, 3],
         )
@@ -1140,7 +1195,7 @@ class ServerArgs:
         parser.add_argument(
             "--tool-call-parser",
             type=str,
-            choices=[
+            choices=[  # TODO: use FunctionCallParser.DetectorMap.keys()
                 "qwen25",
                 "mistral",
                 "llama3",
@@ -1150,10 +1205,17 @@ class ServerArgs:
                 "qwen3_coder",
                 "glm45",
                 "step3",
+                "gpt-oss",
             ],
             default=ServerArgs.tool_call_parser,
             help="Specify the parser for handling tool-call interactions. Options include: 'qwen25', 'mistral', 'llama3', 'deepseekv3', 'pythonic', 'kimi_k2', 'qwen3_coder', 'glm45', and 'step3'.",
         )
+        parser.add_argument(
+            "--tool-server",
+            type=str,
+            default=None,
+            help="Either 'demo' or a comma-separated list of tool server urls to use for the model. If not specified, no tool server will be used.",
+        )

         # Data parallelism
         parser.add_argument(
@@ -1253,53 +1315,42 @@ class ServerArgs:
         )

         # Kernel backend
+        ATTN_BACKENDS = [
+            "aiter",
+            "cutlass_mla",
+            "fa3",
+            "flashinfer",
+            "flashmla",
+            "intel_amx",
+            "torch_native",
+            "ascend",
+            "triton",
+            "trtllm_mla",
+            "trtllm_mha",
+            "dual_chunk_flash_attn",
+            "wave",
+        ]
         parser.add_argument(
             "--attention-backend",
             type=str,
-            choices=[
-                "aiter",
-                "cutlass_mla",
-                "fa3",
-                "flashinfer",
-                "flashmla",
-                "intel_amx",
-                "torch_native",
-                "ascend",
-                "triton",
-                "trtllm_mla",
-            ],
+            choices=ATTN_BACKENDS,
             default=ServerArgs.attention_backend,
             help="Choose the kernels for attention layers.",
         )
-        parser.add_argument(
-            "--decode-attention-backend",
-            type=str,
-            choices=[
-                "flashinfer",
-                "triton",
-                "torch_native",
-                "fa3",
-                "flashmla",
-                "cutlass_mla",
-            ],
-            default=ServerArgs.decode_attention_backend,
-            help="Choose the kernels for decode attention layers (have priority over --attention-backend).",
-        )
-
         parser.add_argument(
             "--prefill-attention-backend",
             type=str,
-            choices=[
-                "flashinfer",
-                "triton",
-                "torch_native",
-                "fa3",
-                "flashmla",
-                "cutlass_mla",
-            ],
+            choices=ATTN_BACKENDS,
             default=ServerArgs.prefill_attention_backend,
             help="Choose the kernels for prefill attention layers (have priority over --attention-backend).",
         )
+        parser.add_argument(
+            "--decode-attention-backend",
+            type=str,
+            choices=ATTN_BACKENDS,
+            default=ServerArgs.decode_attention_backend,
+            help="Choose the kernels for decode attention layers (have priority over --attention-backend).",
+        )
         parser.add_argument(
             "--sampling-backend",
             type=str,
@@ -1400,7 +1451,7 @@ class ServerArgs:
         parser.add_argument(
             "--enable-flashinfer-allreduce-fusion",
             action="store_true",
-            help="Enable FlashInfer allreduce fusion for Add_RMSNorm.",
+            help="Enable FlashInfer allreduce fusion with Residual RMSNorm.",
        )
         parser.add_argument(
             "--deepep-mode",
@@ -1519,7 +1570,6 @@ class ServerArgs:
             default=ServerArgs.hicache_mem_layout,
             help="The layout of host memory pool for hierarchical cache.",
         )
-
         parser.add_argument(
             "--hicache-storage-backend",
             type=str,
@@ -1527,6 +1577,13 @@ class ServerArgs:
             default=ServerArgs.hicache_storage_backend,
             help="The storage backend for hierarchical KV cache.",
         )
+        parser.add_argument(
+            "--hicache-storage-prefetch-policy",
+            type=str,
+            choices=["best_effort", "wait_complete", "timeout"],
+            default=ServerArgs.hicache_storage_prefetch_policy,
+            help="Control when prefetching from the storage backend should stop.",
+        )

         # Double Sparsity
         parser.add_argument(
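Illustrative only, not part of the diff: a hedged sketch of combining the new hierarchical-cache storage options, read together with the mooncake override in __post_init__ above. It assumes hierarchical cache is turned on via the pre-existing --enable-hierarchical-cache flag and uses a placeholder model path.

# Illustrative only; flag names for the storage backend and prefetch policy come from this diff.
from sglang.srt.server_args import prepare_server_args

args = prepare_server_args(
    [
        "--model-path", "/models/any-model",  # placeholder
        "--enable-hierarchical-cache",
        "--hicache-storage-backend", "mooncake",
        "--hicache-storage-prefetch-policy", "wait_complete",
    ]
)
# Per the __post_init__ change above, choosing mooncake forces the kernel IO
# backend and the page_first host memory layout.
assert args.hicache_io_backend == "kernel"
assert args.hicache_mem_layout == "page_first"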
@@ -1658,6 +1715,12 @@ class ServerArgs:
             action="store_true",
             help="Enabling two micro batches to overlap.",
         )
+        parser.add_argument(
+            "--tbo-token-distribution-threshold",
+            type=float,
+            default=ServerArgs.tbo_token_distribution_threshold,
+            help="The threshold of token distribution between two batches in micro-batch-overlap, determines whether to two-batch-overlap or two-chunk-overlap. Set to 0 denote disable two-chunk-overlap.",
+        )
         parser.add_argument(
             "--enable-torch-compile",
             action="store_true",
@@ -1755,6 +1818,17 @@ class ServerArgs:
             action="store_true",
             help="Use triton moe grouped gemm kernel.",
         )
+        parser.add_argument(
+            "--enable-flashinfer-mxfp4-moe",
+            action="store_true",
+            help="Enable FlashInfer MXFP4 MoE backend for modelopt_fp4 quant on Blackwell.",
+        )
+        parser.add_argument(
+            "--scheduler-recv-interval",
+            type=int,
+            default=ServerArgs.scheduler_recv_interval,
+            help="The interval to poll requests in scheduler. Can be set to >1 to reduce the overhead of this.",
+        )

         # Debug tensor dumps
         parser.add_argument(
@@ -1931,17 +2005,6 @@ class ServerArgs:
             None,
         }, "moe_dense_tp_size only support 1 and None currently"

-        # Check model architecture
-        model_arch = self.get_hf_config().architectures[0]
-        if "Llama4" in model_arch:
-            assert self.attention_backend == "fa3", "fa3 is required for Llama4 model"
-
-        if "Gemma2ForCausalLM" in model_arch:
-            # FIXME: https://github.com/sgl-project/sglang/pull/7367 is not compatible with gemma2 model.
-            # It failed at this test: https://github.com/sgl-project/sglang/actions/runs/16255155597/job/45890331952#step:4:736
-            logger.warning("Disable hybrid SWA memory for Gemma2ForCausalLM.")
-            self.disable_hybrid_swa_memory = True
-
         # Check LoRA
         self.check_lora_server_args()

@@ -1952,22 +2015,20 @@ class ServerArgs:
         ), "enable_mixed_chunk is required for speculative decoding"

         # Check chunked prefill
-        assert (
-            self.chunked_prefill_size % self.page_size == 0
-        ), "chunked_prefill_size must be divisible by page_size"
+        # Skip validation if chunked prefill is disabled (i.e., size <= 0).
+        if self.chunked_prefill_size > 0:
+            assert (
+                self.chunked_prefill_size % self.page_size == 0
+            ), "chunked_prefill_size must be divisible by page_size"

     def check_lora_server_args(self):
-        assert (
-            self.max_loras_per_batch > 0
-            # FIXME
-            and (self.lora_paths is None or self.disable_radix_cache)
-        ), "compatibility of lora and radix attention is in progress"
+        assert self.max_loras_per_batch > 0, "max_loras_per_batch must be positive"

         # Enable LoRA if any LoRA paths are provided for backward compatibility.
         if self.lora_paths:
             if self.enable_lora is None:
                 self.enable_lora = True
-                logger.info(
+                logger.warning(
                     "--enable-lora is set to True because --lora-paths is provided."
                 )
             elif self.enable_lora is False:
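The relaxed chunked-prefill check above can be read in isolation as the following standalone sketch (an illustration, not code from the package):

# Standalone illustration of the relaxed validation above.
def check_chunked_prefill(chunked_prefill_size: int, page_size: int) -> None:
    # A non-positive size means chunked prefill is disabled, so skip the check.
    if chunked_prefill_size > 0:
        assert (
            chunked_prefill_size % page_size == 0
        ), "chunked_prefill_size must be divisible by page_size"

check_chunked_prefill(8192, 64)  # passes: 8192 % 64 == 0
check_chunked_prefill(-1, 64)    # passes: chunked prefill disabled, validation skipped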
@@ -1977,21 +2038,23 @@ class ServerArgs:

         if self.enable_lora:
             # Normalize lora_paths to a dictionary if it is a list.
+            # TODO (lifuhuang): support specifying pinned adapters in server_args.
             if isinstance(self.lora_paths, list):
                 lora_paths = self.lora_paths
                 self.lora_paths = {}
                 for lora_path in lora_paths:
                     if "=" in lora_path:
                         name, path = lora_path.split("=", 1)
-                        self.lora_paths[name] = LoRARef(lora_name=name, lora_path=path)
+                        self.lora_paths[name] = LoRARef(
+                            lora_name=name, lora_path=path, pinned=False
+                        )
                     else:
                         self.lora_paths[lora_path] = LoRARef(
-                            lora_name=lora_path,
-                            lora_path=lora_path,
+                            lora_name=lora_path, lora_path=lora_path, pinned=False
                         )
             elif isinstance(self.lora_paths, dict):
                 self.lora_paths = {
-                    k: LoRARef(lora_name=k, lora_path=v)
+                    k: LoRARef(lora_name=k, lora_path=v, pinned=False)
                     for k, v in self.lora_paths.items()
                 }
             elif self.lora_paths is None:
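To see what the pinned=False change does to user input, here is a standalone sketch of the normalization outside of ServerArgs; the LoRARef import path is assumed from the changed lora_registry.py, and the adapter names and paths are made up.

# Illustrative only; mirrors the normalization above.
from sglang.srt.lora.lora_registry import LoRARef  # import path assumed

raw_lora_paths = ["sql-adapter=/adapters/sql", "/adapters/chat"]  # as passed via --lora-paths

lora_paths = {}
for lora_path in raw_lora_paths:
    if "=" in lora_path:
        # "name=path" entries keep the user-provided name.
        name, path = lora_path.split("=", 1)
        lora_paths[name] = LoRARef(lora_name=name, lora_path=path, pinned=False)
    else:
        # Bare paths double as both the adapter name and the path.
        lora_paths[lora_path] = LoRARef(
            lora_name=lora_path, lora_path=lora_path, pinned=False
        )
# Every adapter is registered unpinned for now; see the TODO above about
# exposing pinned adapters through server_args.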
@@ -2037,6 +2100,58 @@ class ServerArgs:
                 f"decode_tp={decode_tp}, prefill_tp={prefill_tp}"
             )

+    def model_specific_adjustments(self):
+        hf_config = self.get_hf_config()
+        model_arch = hf_config.architectures[0]
+        if model_arch in ["GptOssForCausalLM"]:
+            if self.attention_backend is None:
+                self.attention_backend = "triton"
+            supported_backends = ["triton", "trtllm_mha", "fa3"]
+            assert (
+                self.attention_backend in supported_backends
+            ), f"GptOssForCausalLM requires one of {supported_backends} attention backend, but got '{self.attention_backend}'"
+            quantization_config = getattr(hf_config, "quantization_config", None)
+            is_mxfp4_quant_format = (
+                quantization_config is not None
+                and quantization_config.get("quant_method") == "mxfp4"
+            )
+
+            if is_sm100_supported() and is_mxfp4_quant_format:
+                self.enable_flashinfer_mxfp4_moe = True
+                self.enable_triton_kernel_moe = False
+                logger.warning(
+                    "Detected SM100 and MXFP4 quantization format for GPT-OSS model, enabling FlashInfer MXFP4 MOE kernel."
+                )
+            else:
+                if self.enable_triton_kernel_moe:
+                    assert (
+                        self.ep_size == 1
+                    ), "Triton kernel MoE is only supported when ep_size == 1"
+                if not self.enable_triton_kernel_moe and self.ep_size == 1:
+                    self.enable_triton_kernel_moe = True
+                    logger.warning(
+                        "Detected GPT-OSS model, enabling triton_kernels MOE kernel."
+                    )
+            self.disable_hybrid_swa_memory = True
+            if is_mxfp4_quant_format:
+                # use bf16 for mxfp4 triton kernels
+                self.dtype = "bfloat16"
+        elif "Llama4" in model_arch:
+            assert self.attention_backend == "fa3", "fa3 is required for Llama4 model"
+        elif model_arch in [
+            "Gemma2ForCausalLM",
+            "Gemma3ForCausalLM",
+            "Gemma3ForConditionalGeneration",
+            "Gemma3nForCausalLM",
+            "Gemma3nForConditionalGeneration",
+        ]:
+            # FIXME: https://github.com/sgl-project/sglang/pull/7367 is not compatible with gemma2 model.
+            # It failed at this test: https://github.com/sgl-project/sglang/actions/runs/16255155597/job/45890331952#step:4:736
+            logger.warning(
+                f"Disable hybrid SWA memory for {model_arch} as it is not yet supported."
+            )
+            self.disable_hybrid_swa_memory = True
+
     def adjust_mem_fraction_for_vlm(self, model_config):
         vision_config = getattr(model_config.hf_config, "vision_config", None)
         if vision_config is None:
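As a rough usage sketch (illustrative, not part of the diff; the checkpoint name is only an example, and any GptOssForCausalLM config takes the same path), parsing arguments for a GPT-OSS model now picks these defaults without extra flags, and the new --tool-call-parser choice and --tool-server option pair naturally with it:

# Illustrative only; behavior follows the GptOssForCausalLM branch of model_specific_adjustments().
from sglang.srt.server_args import prepare_server_args

args = prepare_server_args(
    [
        "--model-path", "openai/gpt-oss-20b",  # example GPT-OSS checkpoint
        "--tool-call-parser", "gpt-oss",
        "--tool-server", "demo",
    ]
)
print(args.attention_backend)          # "triton" when not set explicitly
print(args.disable_hybrid_swa_memory)  # True for GPT-OSS
# On SM100 with MXFP4 weights, enable_flashinfer_mxfp4_moe is switched on;
# otherwise enable_triton_kernel_moe is enabled and, for MXFP4 checkpoints,
# dtype falls back to bfloat16.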
@@ -2074,10 +2189,6 @@ class ServerArgs:
             self.mem_fraction_static = (
                 original_server_arg_mem_fraction * final_overall_factor
             )
-            logger.warning(
-                f"Multimodal model: Dynamically adjusted --mem-fraction-static "
-                f"from: {original_server_arg_mem_fraction:.3f} to: {self.mem_fraction_static:.3f}."
-            )


 def prepare_server_args(argv: List[str]) -> ServerArgs:
sglang/srt/speculative/eagle_worker.py CHANGED
@@ -226,6 +226,22 @@ class EAGLEWorker(TpModelWorker):
                 self.draft_model_runner,
                 skip_prefill=False,
             )
+        elif self.server_args.attention_backend == "aiter":
+            from sglang.srt.layers.attention.aiter_backend import (
+                AiterAttnBackend,
+                AiterMultiStepDraftBackend,
+            )
+
+            self.draft_attn_backend = AiterMultiStepDraftBackend(
+                self.draft_model_runner,
+                self.topk,
+                self.speculative_num_steps,
+            )
+            self.draft_extend_attn_backend = AiterAttnBackend(
+                self.draft_model_runner,
+                skip_prefill=False,
+            )
+            self.has_prefill_wrapper_verify = False
         elif self.server_args.attention_backend == "fa3":
             from sglang.srt.layers.attention.flashattention_backend import (
                 FlashAttentionBackend,