sglang 0.4.10.post1__py3-none-any.whl → 0.5.0rc0__py3-none-any.whl

This diff compares the contents of two publicly released package versions as they appear in their public registry. It is provided for informational purposes only.
Files changed (143)
  1. sglang/bench_one_batch.py +113 -17
  2. sglang/compile_deep_gemm.py +8 -1
  3. sglang/global_config.py +5 -1
  4. sglang/srt/configs/model_config.py +35 -0
  5. sglang/srt/conversation.py +9 -117
  6. sglang/srt/disaggregation/base/conn.py +5 -2
  7. sglang/srt/disaggregation/decode.py +6 -1
  8. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +4 -0
  9. sglang/srt/disaggregation/mooncake/conn.py +243 -135
  10. sglang/srt/disaggregation/prefill.py +3 -0
  11. sglang/srt/distributed/device_communicators/pynccl.py +7 -0
  12. sglang/srt/distributed/device_communicators/pynccl_allocator.py +133 -0
  13. sglang/srt/distributed/device_communicators/pynccl_wrapper.py +42 -3
  14. sglang/srt/distributed/parallel_state.py +22 -9
  15. sglang/srt/entrypoints/context.py +244 -0
  16. sglang/srt/entrypoints/engine.py +8 -5
  17. sglang/srt/entrypoints/harmony_utils.py +370 -0
  18. sglang/srt/entrypoints/http_server.py +106 -15
  19. sglang/srt/entrypoints/openai/protocol.py +227 -1
  20. sglang/srt/entrypoints/openai/serving_chat.py +278 -42
  21. sglang/srt/entrypoints/openai/serving_responses.py +1273 -0
  22. sglang/srt/entrypoints/openai/tool_server.py +174 -0
  23. sglang/srt/entrypoints/tool.py +87 -0
  24. sglang/srt/eplb/expert_distribution.py +4 -2
  25. sglang/srt/eplb/expert_location.py +5 -1
  26. sglang/srt/function_call/harmony_tool_parser.py +130 -0
  27. sglang/srt/hf_transformers_utils.py +55 -13
  28. sglang/srt/jinja_template_utils.py +8 -1
  29. sglang/srt/layers/attention/aiter_backend.py +5 -8
  30. sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
  31. sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1700 -0
  32. sglang/srt/layers/attention/flashattention_backend.py +7 -11
  33. sglang/srt/layers/attention/triton_backend.py +85 -14
  34. sglang/srt/layers/attention/triton_ops/decode_attention.py +17 -0
  35. sglang/srt/layers/attention/triton_ops/extend_attention.py +143 -98
  36. sglang/srt/layers/attention/trtllm_mha_backend.py +332 -0
  37. sglang/srt/layers/attention/trtllm_mla_backend.py +6 -6
  38. sglang/srt/layers/attention/vision.py +40 -15
  39. sglang/srt/layers/communicator.py +35 -8
  40. sglang/srt/layers/dp_attention.py +12 -0
  41. sglang/srt/layers/linear.py +9 -8
  42. sglang/srt/layers/logits_processor.py +9 -1
  43. sglang/srt/layers/moe/cutlass_moe.py +20 -6
  44. sglang/srt/layers/moe/ep_moe/layer.py +87 -107
  45. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=352,device_name=NVIDIA_RTX_6000_Ada_Generation,dtype=fp8_w8a8.json +146 -0
  46. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +101 -12
  47. sglang/srt/layers/moe/fused_moe_triton/layer.py +442 -58
  48. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +169 -15
  49. sglang/srt/layers/moe/token_dispatcher/__init__.py +23 -0
  50. sglang/srt/layers/moe/token_dispatcher/base_dispatcher.py +12 -1
  51. sglang/srt/layers/moe/{ep_moe/token_dispatcher.py → token_dispatcher/deepep.py} +8 -15
  52. sglang/srt/layers/moe/topk.py +12 -3
  53. sglang/srt/layers/moe/utils.py +59 -0
  54. sglang/srt/layers/quantization/__init__.py +22 -0
  55. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +3 -2
  56. sglang/srt/layers/quantization/deep_gemm_wrapper/compile_utils.py +1 -1
  57. sglang/srt/layers/quantization/fp4.py +557 -0
  58. sglang/srt/layers/quantization/fp8.py +8 -7
  59. sglang/srt/layers/quantization/fp8_kernel.py +0 -4
  60. sglang/srt/layers/quantization/fp8_utils.py +29 -0
  61. sglang/srt/layers/quantization/modelopt_quant.py +259 -64
  62. sglang/srt/layers/quantization/mxfp4.py +651 -0
  63. sglang/srt/layers/quantization/mxfp4_tensor.py +133 -0
  64. sglang/srt/layers/quantization/quark/__init__.py +0 -0
  65. sglang/srt/layers/quantization/quark/schemes/__init__.py +6 -0
  66. sglang/srt/layers/quantization/quark/schemes/quark_scheme.py +55 -0
  67. sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +118 -0
  68. sglang/srt/layers/quantization/quark/utils.py +107 -0
  69. sglang/srt/layers/quantization/unquant.py +60 -6
  70. sglang/srt/layers/quantization/w4afp8.py +1 -1
  71. sglang/srt/layers/rotary_embedding.py +225 -1
  72. sglang/srt/layers/utils.py +9 -0
  73. sglang/srt/layers/vocab_parallel_embedding.py +15 -4
  74. sglang/srt/lora/lora_manager.py +70 -14
  75. sglang/srt/lora/lora_registry.py +10 -2
  76. sglang/srt/lora/mem_pool.py +43 -5
  77. sglang/srt/managers/cache_controller.py +61 -32
  78. sglang/srt/managers/data_parallel_controller.py +52 -2
  79. sglang/srt/managers/detokenizer_manager.py +1 -1
  80. sglang/srt/managers/io_struct.py +21 -4
  81. sglang/srt/managers/mm_utils.py +5 -11
  82. sglang/srt/managers/schedule_batch.py +30 -8
  83. sglang/srt/managers/schedule_policy.py +3 -1
  84. sglang/srt/managers/scheduler.py +170 -18
  85. sglang/srt/managers/scheduler_output_processor_mixin.py +1 -2
  86. sglang/srt/managers/scheduler_recv_skipper.py +37 -0
  87. sglang/srt/managers/scheduler_update_weights_mixin.py +6 -0
  88. sglang/srt/managers/template_manager.py +59 -22
  89. sglang/srt/managers/tokenizer_manager.py +137 -67
  90. sglang/srt/managers/tp_worker.py +3 -0
  91. sglang/srt/managers/tp_worker_overlap_thread.py +3 -0
  92. sglang/srt/managers/utils.py +45 -1
  93. sglang/srt/mem_cache/cpp_radix_tree/radix_tree.py +182 -0
  94. sglang/srt/mem_cache/hicache_storage.py +13 -21
  95. sglang/srt/mem_cache/hiradix_cache.py +53 -5
  96. sglang/srt/mem_cache/memory_pool_host.py +1 -1
  97. sglang/srt/mem_cache/multimodal_cache.py +33 -13
  98. sglang/srt/mem_cache/radix_cache_cpp.py +229 -0
  99. sglang/srt/mem_cache/storage/hf3fs/client_hf3fs.py +2 -2
  100. sglang/srt/mem_cache/storage/hf3fs/hf3fs_utils.cpp +35 -0
  101. sglang/srt/model_executor/cuda_graph_runner.py +24 -9
  102. sglang/srt/model_executor/forward_batch_info.py +48 -17
  103. sglang/srt/model_executor/model_runner.py +24 -2
  104. sglang/srt/model_loader/weight_utils.py +10 -0
  105. sglang/srt/models/bailing_moe.py +425 -0
  106. sglang/srt/models/deepseek_v2.py +95 -50
  107. sglang/srt/models/ernie4.py +426 -0
  108. sglang/srt/models/ernie4_eagle.py +203 -0
  109. sglang/srt/models/gemma3n_mm.py +39 -0
  110. sglang/srt/models/glm4_moe.py +102 -27
  111. sglang/srt/models/gpt_oss.py +1134 -0
  112. sglang/srt/models/grok.py +3 -3
  113. sglang/srt/models/llama4.py +13 -2
  114. sglang/srt/models/mixtral.py +3 -3
  115. sglang/srt/models/mllama4.py +428 -19
  116. sglang/srt/models/qwen2.py +6 -0
  117. sglang/srt/models/qwen2_moe.py +7 -4
  118. sglang/srt/models/qwen3_moe.py +39 -14
  119. sglang/srt/models/step3_vl.py +10 -1
  120. sglang/srt/models/transformers.py +2 -5
  121. sglang/srt/multimodal/processors/base_processor.py +4 -3
  122. sglang/srt/multimodal/processors/gemma3n.py +0 -7
  123. sglang/srt/multimodal/processors/step3_vl.py +3 -1
  124. sglang/srt/operations_strategy.py +1 -1
  125. sglang/srt/reasoning_parser.py +18 -39
  126. sglang/srt/server_args.py +218 -23
  127. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +18 -0
  128. sglang/srt/two_batch_overlap.py +163 -9
  129. sglang/srt/utils.py +41 -26
  130. sglang/srt/weight_sync/utils.py +1 -1
  131. sglang/test/runners.py +4 -4
  132. sglang/test/test_utils.py +4 -4
  133. sglang/version.py +1 -1
  134. {sglang-0.4.10.post1.dist-info → sglang-0.5.0rc0.dist-info}/METADATA +18 -15
  135. {sglang-0.4.10.post1.dist-info → sglang-0.5.0rc0.dist-info}/RECORD +143 -116
  136. /sglang/srt/mem_cache/{mooncake_store → storage/mooncake_store}/mooncake_store.py +0 -0
  137. /sglang/srt/mem_cache/{mooncake_store → storage/mooncake_store}/unit_test.py +0 -0
  138. /sglang/srt/mem_cache/{nixl → storage/nixl}/hicache_nixl.py +0 -0
  139. /sglang/srt/mem_cache/{nixl → storage/nixl}/nixl_utils.py +0 -0
  140. /sglang/srt/mem_cache/{nixl → storage/nixl}/test_hicache_nixl_storage.py +0 -0
  141. {sglang-0.4.10.post1.dist-info → sglang-0.5.0rc0.dist-info}/WHEEL +0 -0
  142. {sglang-0.4.10.post1.dist-info → sglang-0.5.0rc0.dist-info}/licenses/LICENSE +0 -0
  143. {sglang-0.4.10.post1.dist-info → sglang-0.5.0rc0.dist-info}/top_level.txt +0 -0
sglang/srt/server_args.py CHANGED
@@ -37,6 +37,7 @@ from sglang.srt.utils import (
     is_hip,
     is_port_available,
     is_remote_url,
+    is_triton_kernels_available,
     is_valid_ipv6_address,
     nullable_str,
 )
@@ -149,6 +150,7 @@ class ServerArgs:
     max_lora_rank: Optional[int] = None
     lora_target_modules: Optional[Union[set[str], List[str]]] = None
     lora_paths: Optional[Union[dict[str, str], dict[str, LoRARef], List[str]]] = None
+    max_loaded_loras: Optional[int] = None
     max_loras_per_batch: int = 8
     lora_backend: str = "triton"

@@ -172,12 +174,11 @@ class ServerArgs:

     # Expert parallelism
     ep_size: int = 1
-    enable_ep_moe: bool = False
-    enable_deepep_moe: bool = False
+    moe_a2a_backend: Optional[Literal["deepep"]] = None
     enable_flashinfer_cutlass_moe: bool = False
     enable_flashinfer_trtllm_moe: bool = False
     enable_flashinfer_allreduce_fusion: bool = False
-    deepep_mode: Optional[Literal["auto", "normal", "low_latency"]] = "auto"
+    deepep_mode: Literal["auto", "normal", "low_latency"] = "auto"
     ep_num_redundant_experts: int = 0
     ep_dispatch_algorithm: Optional[Literal["static", "dynamic", "fake"]] = None
     init_expert_location: str = "trivial"
@@ -201,6 +202,7 @@ class ServerArgs:
     hicache_io_backend: str = "kernel"
     hicache_mem_layout: str = "layer_first"
     hicache_storage_backend: Optional[str] = None
+    hicache_storage_prefetch_policy: str = "best_effort"

     # Double Sparsity
     enable_double_sparsity: bool = False
@@ -219,6 +221,7 @@ class ServerArgs:
     enable_profile_cuda_graph: bool = False
     enable_cudagraph_gc: bool = False
     enable_nccl_nvls: bool = False
+    enable_symm_mem: bool = False
     enable_tokenizer_batch_encode: bool = False
     disable_outlines_disk_cache: bool = False
     disable_custom_all_reduce: bool = False
@@ -228,6 +231,7 @@ class ServerArgs:
     enable_dp_attention: bool = False
     enable_dp_lm_head: bool = False
     enable_two_batch_overlap: bool = False
+    tbo_token_distribution_threshold: float = 0.48
     enable_torch_compile: bool = False
     torch_compile_max_bs: int = 32
     torchao_config: str = ""
@@ -246,6 +250,8 @@ class ServerArgs:
     disable_fast_image_processor: bool = False
     enable_return_hidden_states: bool = False
     enable_triton_kernel_moe: bool = False
+    enable_flashinfer_mxfp4_moe: bool = False
+    scheduler_recv_interval: int = 1

     # Debug tensor dumps
     debug_tensor_dump_output_folder: Optional[str] = None
@@ -272,7 +278,30 @@ class ServerArgs:
     enable_pdmux: bool = False
     sm_group_num: int = 3

+    # For tool server
+    tool_server: Optional[str] = None
+
+    # Deprecated arguments
+    enable_ep_moe: bool = False
+    enable_deepep_moe: bool = False
+
     def __post_init__(self):
+
+        # Check deprecated arguments
+        def print_deprecated_warning(message: str):
+            logger.warning(f"\033[33m{message}\033[0m")
+
+        if self.enable_ep_moe:
+            self.ep_size = self.tp_size
+            print_deprecated_warning(
+                "NOTE: --enable-ep-moe is deprecated. Please set `--ep-size` to the same value as `--tp-size` instead."
+            )
+        if self.enable_deepep_moe:
+            self.moe_a2a_backend = "deepep"
+            print_deprecated_warning(
+                "NOTE: --enable-deepep-moe is deprecated. Please set `--moe-a2a-backend` to 'deepep' instead."
+            )
+
         # Set missing default values
         if self.tokenizer_path is None:
            self.tokenizer_path = self.model_path
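Note: the deprecation shim above only remaps the old booleans onto the new fields. A minimal standalone sketch of the same mapping (the `Args` class here is an illustrative stand-in for the relevant `ServerArgs` fields, not sglang code):

```python
from dataclasses import dataclass
from typing import Literal, Optional


@dataclass
class Args:
    # Illustrative stand-in for the relevant ServerArgs fields.
    tp_size: int = 1
    ep_size: int = 1
    moe_a2a_backend: Optional[Literal["deepep"]] = None
    # Deprecated flags, kept for backward compatibility.
    enable_ep_moe: bool = False
    enable_deepep_moe: bool = False

    def __post_init__(self):
        # --enable-ep-moe used to imply ep_size == tp_size.
        if self.enable_ep_moe:
            self.ep_size = self.tp_size
        # --enable-deepep-moe used to select the DeepEP all-to-all path.
        if self.enable_deepep_moe:
            self.moe_a2a_backend = "deepep"


args = Args(tp_size=8, enable_ep_moe=True, enable_deepep_moe=True)
assert args.ep_size == 8 and args.moe_a2a_backend == "deepep"
```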
@@ -420,6 +449,81 @@ class ServerArgs:
                     "trtllm_mla backend does not support speculative decoding yet."
                 )

+        if (
+            self.attention_backend == "trtllm_mha"
+            or self.decode_attention_backend == "trtllm_mha"
+            or self.prefill_attention_backend == "trtllm_mha"
+        ):
+            if not is_sm100_supported():
+                raise ValueError(
+                    "TRTLLM MHA backend is only supported on Blackwell GPUs (SM100). Please use a different backend."
+                )
+
+            if self.page_size not in [16, 32, 64]:
+                logger.warning(
+                    f"TensorRT-LLM MHA only supports page_size of 16, 32 or 64, changing page_size from {self.page_size} to 64."
+                )
+                self.page_size = 64
+
+            if self.speculative_algorithm is not None:
+                raise ValueError(
+                    "trtllm_mha backend does not support speculative decoding yet."
+                )
+
+        model_arch = self.get_hf_config().architectures[0]
+        if model_arch in ["GptOssForCausalLM"]:
+            if self.attention_backend is None:
+                # default is triton, but we could have trtllm_mha as an option
+                self.attention_backend = "triton"
+            assert (
+                self.attention_backend == "trtllm_mha"
+                or self.attention_backend == "triton"
+            )
+            quantization_config = getattr(
+                self.get_hf_config(), "quantization_config", None
+            )
+            is_mxfp4_quant_format = (
+                quantization_config is not None
+                and quantization_config.get("quant_method") == "mxfp4"
+            )
+
+            if is_sm100_supported() and is_mxfp4_quant_format:
+                self.enable_flashinfer_mxfp4_moe = True
+                self.enable_triton_kernel_moe = False
+                logger.info(
+                    "Detected SM100 and MXFP4 quantization format for GPT-OSS model, enabling FlashInfer MXFP4 MOE kernel."
+                )
+            else:
+                if self.enable_triton_kernel_moe:
+                    assert (
+                        self.ep_size == 1
+                    ), "Triton kernel MoE is only supported when ep_size == 1"
+                if not self.enable_triton_kernel_moe and self.ep_size == 1:
+                    self.enable_triton_kernel_moe = True
+                    logger.info(
+                        "Detected GPT-OSS model, enabling triton_kernels MOE kernel."
+                    )
+
+            self.disable_hybrid_swa_memory = True
+
+            if is_mxfp4_quant_format:
+                # use bf16 for mxfp4 triton kernels
+                self.dtype = "bfloat16"
+
+        if self.attention_backend == "dual_chunk_flash_attn":
+            logger.warning(
+                "Mixed chunk is disabled because of using dual chunk flash attention backend"
+            )
+            logger.warning(
+                "Radix cache is disabled because of using dual chunk flash attention backend"
+            )
+            logger.warning(
+                "Cuda graph is disabled because of using dual chunk flash attention backend"
+            )
+            self.enable_mixed_chunk = False
+            self.disable_cuda_graph = True
+            self.disable_radix_cache = True
+
         # Set page size
         if self.page_size is None:
             self.page_size = 1
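For reference, a minimal sketch of the MXFP4 detection used above, applied to a plain dict standing in for the HF `quantization_config` (the helper name and the config shape are assumptions for illustration):

```python
from typing import Any, Optional


def is_mxfp4_quant(quantization_config: Optional[dict[str, Any]]) -> bool:
    # Mirrors the check in __post_init__: the checkpoint advertises
    # quant_method == "mxfp4" when it ships MXFP4-quantized weights.
    return (
        quantization_config is not None
        and quantization_config.get("quant_method") == "mxfp4"
    )


assert is_mxfp4_quant({"quant_method": "mxfp4"}) is True
assert is_mxfp4_quant({"quant_method": "fp8"}) is False
assert is_mxfp4_quant(None) is False
```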
@@ -455,14 +559,20 @@ class ServerArgs:
                 self.quantization == "modelopt_fp4"
             ), "modelopt_fp4 quantization is required for Flashinfer MOE"
             os.environ["TRTLLM_ENABLE_PDL"] = "1"
-            if self.enable_ep_moe:
-                self.ep_size = self.tp_size
+            assert self.ep_size in [
+                1,
+                self.tp_size,
+            ], "The expert parallel size must be 1 or the same as the tensor parallel size"
+
+        if self.enable_flashinfer_trtllm_moe:
+            if not self.disable_shared_experts_fusion:
+                self.disable_shared_experts_fusion = True
                 logger.warning(
-                    f"Flashinfer cutlass MoE and EP MoE are enabled. The expert parallel size is adjusted to be the same as the tensor parallel size[{self.tp_size}]."
+                    "FlashInfer TRTLLM MoE is enabled. --disable-shared-experts-fusion is automatically set."
                 )

         # DeepEP MoE
-        if self.enable_deepep_moe:
+        if self.moe_a2a_backend == "deepep":
             if self.deepep_mode == "normal":
                 logger.warning("Cuda graph is disabled because deepep_mode=`normal`")
                 self.disable_cuda_graph = True
@@ -486,7 +596,7 @@ class ServerArgs:
             )

         if self.enable_eplb:
-            assert self.enable_ep_moe or self.enable_deepep_moe
+            assert self.ep_size > 1 or self.moe_a2a_backend is not None

         if self.enable_expert_distribution_metrics and (
             self.expert_distribution_recorder_mode is None
@@ -786,6 +896,7 @@ class ServerArgs:
                 "moe_wna16",
                 "qoq",
                 "w4afp8",
+                "mxfp4",
             ],
             help="The quantization method.",
         )
@@ -848,7 +959,7 @@ class ServerArgs:
             "--schedule-policy",
             type=str,
             default=ServerArgs.schedule_policy,
-            choices=["lpm", "random", "fcfs", "dfs-weight"],
+            choices=["lpm", "random", "fcfs", "dfs-weight", "lof"],
             help="The scheduling policy of the requests.",
         )
         parser.add_argument(
@@ -1151,6 +1262,7 @@ class ServerArgs:
             choices=[
                 "round_robin",
                 "shortest_queue",
+                "minimum_tokens",
             ],
         )

@@ -1218,6 +1330,12 @@ class ServerArgs:
             default=8,
             help="Maximum number of adapters for a running batch, include base-only request.",
         )
+        parser.add_argument(
+            "--max-loaded-loras",
+            type=int,
+            default=ServerArgs.max_loaded_loras,
+            help="If specified, it limits the maximum number of LoRA adapters loaded in CPU memory at a time. The value must be greater than or equal to `--max-loras-per-batch`.",
+        )
         parser.add_argument(
             "--lora-backend",
             type=str,
@@ -1240,6 +1358,8 @@ class ServerArgs:
                 "ascend",
                 "triton",
                 "trtllm_mla",
+                "trtllm_mha",
+                "dual_chunk_flash_attn",
             ],
             default=ServerArgs.attention_backend,
             help="Choose the kernels for attention layers.",
@@ -1354,30 +1474,27 @@ class ServerArgs:
             help="The expert parallelism size.",
         )
         parser.add_argument(
-            "--enable-ep-moe",
-            action="store_true",
-            help="Enabling expert parallelism for moe. The ep size is equal to the tp size.",
+            "--moe-a2a-backend",
+            type=str,
+            choices=["deepep"],
+            default=ServerArgs.moe_a2a_backend,
+            help="Choose the backend for MoE A2A.",
         )
         parser.add_argument(
             "--enable-flashinfer-cutlass-moe",
             action="store_true",
-            help="Enable FlashInfer CUTLASS MoE backend for modelopt_fp4 quant on Blackwell. Supports MoE-EP with --enable-ep-moe",
+            help="Enable FlashInfer CUTLASS MoE backend for modelopt_fp4 quant on Blackwell. Supports MoE-EP",
         )
         parser.add_argument(
             "--enable-flashinfer-trtllm-moe",
             action="store_true",
-            help="Enable FlashInfer TRTLLM MoE backend on Blackwell. Supports BlockScale FP8 MoE-EP with --enable-ep-moe",
+            help="Enable FlashInfer TRTLLM MoE backend on Blackwell. Supports BlockScale FP8 MoE-EP",
         )
         parser.add_argument(
             "--enable-flashinfer-allreduce-fusion",
             action="store_true",
             help="Enable FlashInfer allreduce fusion for Add_RMSNorm.",
         )
-        parser.add_argument(
-            "--enable-deepep-moe",
-            action="store_true",
-            help="Enabling DeepEP MoE implementation for EP MoE.",
-        )
         parser.add_argument(
             "--deepep-mode",
             type=str,
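To illustrate the flag migration above, a small argparse sketch (a simplified stand-in for `ServerArgs.add_cli_args`, not the real parser):

```python
import argparse

parser = argparse.ArgumentParser()
# New-style selection of the MoE all-to-all backend.
parser.add_argument("--moe-a2a-backend", type=str, choices=["deepep"], default=None)
# Deprecated flag, kept so existing launch scripts still parse.
parser.add_argument("--enable-deepep-moe", action="store_true")

old = parser.parse_args(["--enable-deepep-moe"])
new = parser.parse_args(["--moe-a2a-backend", "deepep"])

# __post_init__ translates the deprecated flag into the new field,
# so both invocations end up with moe_a2a_backend == "deepep".
moe_a2a_backend = "deepep" if old.enable_deepep_moe else old.moe_a2a_backend
assert moe_a2a_backend == new.moe_a2a_backend == "deepep"
```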
@@ -1503,6 +1620,13 @@ class ServerArgs:
             default=ServerArgs.hicache_storage_backend,
             help="The storage backend for hierarchical KV cache.",
         )
+        parser.add_argument(
+            "--hicache-storage-prefetch-policy",
+            type=str,
+            choices=["best_effort", "wait_complete", "timeout"],
+            default=ServerArgs.hicache_storage_prefetch_policy,
+            help="Control when prefetching from the storage backend should stop.",
+        )

         # Double Sparsity
         parser.add_argument(
@@ -1584,6 +1708,11 @@ class ServerArgs:
             action="store_true",
             help="Enable NCCL NVLS for prefill heavy requests when available.",
         )
+        parser.add_argument(
+            "--enable-symm-mem",
+            action="store_true",
+            help="Enable NCCL symmetric memory for fast collectives.",
+        )
         parser.add_argument(
             "--enable-tokenizer-batch-encode",
             action="store_true",
@@ -1629,6 +1758,12 @@ class ServerArgs:
             action="store_true",
             help="Enabling two micro batches to overlap.",
         )
+        parser.add_argument(
+            "--tbo-token-distribution-threshold",
+            type=float,
+            default=ServerArgs.tbo_token_distribution_threshold,
+            help="The threshold of token distribution between two batches in micro-batch-overlap, determines whether to two-batch-overlap or two-chunk-overlap. Set to 0 denote disable two-chunk-overlap.",
+        )
         parser.add_argument(
             "--enable-torch-compile",
             action="store_true",
@@ -1726,6 +1861,17 @@ class ServerArgs:
             action="store_true",
             help="Use triton moe grouped gemm kernel.",
         )
+        parser.add_argument(
+            "--enable-flashinfer-mxfp4-moe",
+            action="store_true",
+            help="Enable FlashInfer MXFP4 MoE backend for modelopt_fp4 quant on Blackwell.",
+        )
+        parser.add_argument(
+            "--scheduler-recv-interval",
+            type=int,
+            default=ServerArgs.scheduler_recv_interval,
+            help="The interval to poll requests in scheduler. Can be set to >1 to reduce the overhead of this.",
+        )

         # Debug tensor dumps
         parser.add_argument(
@@ -1839,6 +1985,26 @@ class ServerArgs:
             help="Disable mmap while loading weight using safetensors.",
         )

+        # For tool server
+        parser.add_argument(
+            "--tool-server",
+            type=str,
+            default=None,
+            help="Either 'demo' or a comma-separated list of tool server urls to use for the model. If not specified, no tool server will be used.",
+        )
+
+        # Deprecated arguments
+        parser.add_argument(
+            "--enable-ep-moe",
+            action="store_true",
+            help="(Deprecated) Enabling expert parallelism for moe. The ep size is equal to the tp size.",
+        )
+        parser.add_argument(
+            "--enable-deepep-moe",
+            action="store_true",
+            help="(Deprecated) Enabling DeepEP MoE implementation for EP MoE.",
+        )
+
     @classmethod
     def from_cli_args(cls, args: argparse.Namespace):
         args.tp_size = args.tensor_parallel_size
@@ -1895,6 +2061,20 @@ class ServerArgs:
         if "Llama4" in model_arch:
             assert self.attention_backend == "fa3", "fa3 is required for Llama4 model"

+        if model_arch in [
+            "Gemma2ForCausalLM",
+            "Gemma3ForCausalLM",
+            "Gemma3ForConditionalGeneration",
+            "Gemma3nForCausalLM",
+            "Gemma3nForConditionalGeneration",
+        ]:
+            # FIXME: https://github.com/sgl-project/sglang/pull/7367 is not compatible with gemma2 model.
+            # It failed at this test: https://github.com/sgl-project/sglang/actions/runs/16255155597/job/45890331952#step:4:736
+            logger.warning(
+                f"Disable hybrid SWA memory for {model_arch} as it is not yet supported."
+            )
+            self.disable_hybrid_swa_memory = True
+
         # Check LoRA
         self.check_lora_server_args()

@@ -1930,21 +2110,23 @@ class ServerArgs:

         if self.enable_lora:
             # Normalize lora_paths to a dictionary if it is a list.
+            # TODO (lifuhuang): support specifying pinned adapters in server_args.
             if isinstance(self.lora_paths, list):
                 lora_paths = self.lora_paths
                 self.lora_paths = {}
                 for lora_path in lora_paths:
                     if "=" in lora_path:
                         name, path = lora_path.split("=", 1)
-                        self.lora_paths[name] = LoRARef(lora_name=name, lora_path=path)
+                        self.lora_paths[name] = LoRARef(
+                            lora_name=name, lora_path=path, pinned=False
+                        )
                     else:
                         self.lora_paths[lora_path] = LoRARef(
-                            lora_name=lora_path,
-                            lora_path=lora_path,
+                            lora_name=lora_path, lora_path=lora_path, pinned=False
                         )
             elif isinstance(self.lora_paths, dict):
                 self.lora_paths = {
-                    k: LoRARef(lora_name=k, lora_path=v)
+                    k: LoRARef(lora_name=k, lora_path=v, pinned=False)
                     for k, v in self.lora_paths.items()
                 }
             elif self.lora_paths is None:
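The normalization above accepts either `name=path` strings or bare paths; a self-contained sketch of the same parsing, with a stand-in `LoRARef` and a hypothetical helper name:

```python
from dataclasses import dataclass


@dataclass
class LoRARef:  # stand-in for sglang.srt.lora.lora_registry.LoRARef
    lora_name: str
    lora_path: str
    pinned: bool = False


def normalize_lora_paths(lora_paths: list[str]) -> dict[str, LoRARef]:
    refs = {}
    for item in lora_paths:
        if "=" in item:
            # "name=path" form: the part before '=' becomes the adapter name.
            name, path = item.split("=", 1)
            refs[name] = LoRARef(lora_name=name, lora_path=path, pinned=False)
        else:
            # Bare path form: the path doubles as the adapter name.
            refs[item] = LoRARef(lora_name=item, lora_path=item, pinned=False)
    return refs


refs = normalize_lora_paths(["chat=/adapters/chat", "/adapters/code"])
assert refs["chat"].lora_path == "/adapters/chat"
assert refs["/adapters/code"].lora_name == "/adapters/code"
```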
@@ -1969,6 +2151,19 @@ class ServerArgs:
                 self.max_lora_rank and self.lora_target_modules
             ), "When no initial --lora-paths is provided, you need to specify both --max-lora-rank and --lora-target-modules for LoRA initialization."

+        # Validate max_loaded_loras
+        if self.max_loaded_loras is not None:
+            assert self.max_loaded_loras >= self.max_loras_per_batch, (
+                "max_loaded_loras should be greater than or equal to max_loras_per_batch. "
+                f"max_loaded_loras={self.max_loaded_loras}, max_loras_per_batch={self.max_loras_per_batch}"
+            )
+            assert (
+                not self.lora_paths or len(self.lora_paths) <= self.max_loaded_loras
+            ), (
+                "The number of LoRA paths should not exceed max_loaded_loras. "
+                f"max_loaded_loras={self.max_loaded_loras}, lora_paths={len(self.lora_paths)}"
+            )
+
     def validate_disagg_tp_size(self, prefill_tp: int, decode_tp: int):
         larger_tp = max(decode_tp, prefill_tp)
         smaller_tp = min(decode_tp, prefill_tp)
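The new validation enforces two simple inequalities; a sketch of the same checks in isolation (the helper name and the sample values are illustrative):

```python
def check_max_loaded_loras(
    max_loaded_loras: int, max_loras_per_batch: int, num_lora_paths: int
) -> None:
    # The CPU-resident adapter pool must cover at least one full batch of adapters.
    assert max_loaded_loras >= max_loras_per_batch
    # The adapters preloaded at startup must all fit in the CPU pool.
    assert num_lora_paths <= max_loaded_loras


check_max_loaded_loras(max_loaded_loras=16, max_loras_per_batch=8, num_lora_paths=4)
```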
sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py CHANGED
@@ -142,6 +142,22 @@ class EAGLEDraftExtendCudaGraphRunner:
         self.global_num_tokens_for_logprob_gpu = None
         self.gathered_buffer = None

+        if hasattr(
+            self.model_runner.model_config.hf_config, "draft_vocab_size"
+        ):  # llama_eagle
+            vocab_size = self.model_runner.model_config.hf_config.draft_vocab_size
+        elif hasattr(
+            self.model_runner.model_config.hf_config, "hot_vocab_size"
+        ):  # llama_eagle3
+            vocab_size = self.model_runner.model_config.hf_config.hot_vocab_size
+        else:
+            vocab_size = self.model_runner.model_config.vocab_size
+
+        self.next_token_logits_buffer = torch.zeros(
+            (self.max_bs, vocab_size),
+            dtype=torch.float,
+        )
+
         # Capture
         try:
             with model_capture_mode():
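The vocab selection above falls back from `draft_vocab_size` (EAGLE draft heads) to `hot_vocab_size` (EAGLE3) to the full vocabulary; a minimal sketch of that fallback chain using a generic config object (the helper itself is illustrative, attribute names as in the diff):

```python
from types import SimpleNamespace


def pick_draft_vocab_size(hf_config, full_vocab_size: int) -> int:
    # EAGLE draft heads expose draft_vocab_size; EAGLE3 uses hot_vocab_size;
    # otherwise fall back to the target model's full vocabulary.
    if hasattr(hf_config, "draft_vocab_size"):
        return hf_config.draft_vocab_size
    if hasattr(hf_config, "hot_vocab_size"):
        return hf_config.hot_vocab_size
    return full_vocab_size


assert pick_draft_vocab_size(SimpleNamespace(draft_vocab_size=32000), 128256) == 32000
assert pick_draft_vocab_size(SimpleNamespace(hot_vocab_size=32000), 128256) == 32000
assert pick_draft_vocab_size(SimpleNamespace(), 128256) == 128256
```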
@@ -189,6 +205,7 @@ class EAGLEDraftExtendCudaGraphRunner:
         out_cache_loc = self.out_cache_loc[:num_tokens]
         positions = self.positions[:num_tokens]
         hidden_states = self.hidden_states[:num_tokens]
+        next_token_logits_buffer = self.next_token_logits_buffer[:bs]

         if self.require_mlp_tp_gather:
             self.global_num_tokens_gpu.copy_(
@@ -238,6 +255,7 @@ class EAGLEDraftExtendCudaGraphRunner:
             input_ids=input_ids,
             req_pool_indices=req_pool_indices,
             seq_lens=seq_lens,
+            next_token_logits_buffer=next_token_logits_buffer,
             req_to_token_pool=self.model_runner.req_to_token_pool,
             token_to_kv_pool=self.model_runner.token_to_kv_pool,
             out_cache_loc=out_cache_loc,