sglang 0.4.10.post1__py3-none-any.whl → 0.5.0rc0__py3-none-any.whl
This diff shows the content of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in that registry.
- sglang/bench_one_batch.py +113 -17
- sglang/compile_deep_gemm.py +8 -1
- sglang/global_config.py +5 -1
- sglang/srt/configs/model_config.py +35 -0
- sglang/srt/conversation.py +9 -117
- sglang/srt/disaggregation/base/conn.py +5 -2
- sglang/srt/disaggregation/decode.py +6 -1
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +4 -0
- sglang/srt/disaggregation/mooncake/conn.py +243 -135
- sglang/srt/disaggregation/prefill.py +3 -0
- sglang/srt/distributed/device_communicators/pynccl.py +7 -0
- sglang/srt/distributed/device_communicators/pynccl_allocator.py +133 -0
- sglang/srt/distributed/device_communicators/pynccl_wrapper.py +42 -3
- sglang/srt/distributed/parallel_state.py +22 -9
- sglang/srt/entrypoints/context.py +244 -0
- sglang/srt/entrypoints/engine.py +8 -5
- sglang/srt/entrypoints/harmony_utils.py +370 -0
- sglang/srt/entrypoints/http_server.py +106 -15
- sglang/srt/entrypoints/openai/protocol.py +227 -1
- sglang/srt/entrypoints/openai/serving_chat.py +278 -42
- sglang/srt/entrypoints/openai/serving_responses.py +1273 -0
- sglang/srt/entrypoints/openai/tool_server.py +174 -0
- sglang/srt/entrypoints/tool.py +87 -0
- sglang/srt/eplb/expert_distribution.py +4 -2
- sglang/srt/eplb/expert_location.py +5 -1
- sglang/srt/function_call/harmony_tool_parser.py +130 -0
- sglang/srt/hf_transformers_utils.py +55 -13
- sglang/srt/jinja_template_utils.py +8 -1
- sglang/srt/layers/attention/aiter_backend.py +5 -8
- sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
- sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1700 -0
- sglang/srt/layers/attention/flashattention_backend.py +7 -11
- sglang/srt/layers/attention/triton_backend.py +85 -14
- sglang/srt/layers/attention/triton_ops/decode_attention.py +17 -0
- sglang/srt/layers/attention/triton_ops/extend_attention.py +143 -98
- sglang/srt/layers/attention/trtllm_mha_backend.py +332 -0
- sglang/srt/layers/attention/trtllm_mla_backend.py +6 -6
- sglang/srt/layers/attention/vision.py +40 -15
- sglang/srt/layers/communicator.py +35 -8
- sglang/srt/layers/dp_attention.py +12 -0
- sglang/srt/layers/linear.py +9 -8
- sglang/srt/layers/logits_processor.py +9 -1
- sglang/srt/layers/moe/cutlass_moe.py +20 -6
- sglang/srt/layers/moe/ep_moe/layer.py +87 -107
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=352,device_name=NVIDIA_RTX_6000_Ada_Generation,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +101 -12
- sglang/srt/layers/moe/fused_moe_triton/layer.py +442 -58
- sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +169 -15
- sglang/srt/layers/moe/token_dispatcher/__init__.py +23 -0
- sglang/srt/layers/moe/token_dispatcher/base_dispatcher.py +12 -1
- sglang/srt/layers/moe/{ep_moe/token_dispatcher.py → token_dispatcher/deepep.py} +8 -15
- sglang/srt/layers/moe/topk.py +12 -3
- sglang/srt/layers/moe/utils.py +59 -0
- sglang/srt/layers/quantization/__init__.py +22 -0
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +3 -2
- sglang/srt/layers/quantization/deep_gemm_wrapper/compile_utils.py +1 -1
- sglang/srt/layers/quantization/fp4.py +557 -0
- sglang/srt/layers/quantization/fp8.py +8 -7
- sglang/srt/layers/quantization/fp8_kernel.py +0 -4
- sglang/srt/layers/quantization/fp8_utils.py +29 -0
- sglang/srt/layers/quantization/modelopt_quant.py +259 -64
- sglang/srt/layers/quantization/mxfp4.py +651 -0
- sglang/srt/layers/quantization/mxfp4_tensor.py +133 -0
- sglang/srt/layers/quantization/quark/__init__.py +0 -0
- sglang/srt/layers/quantization/quark/schemes/__init__.py +6 -0
- sglang/srt/layers/quantization/quark/schemes/quark_scheme.py +55 -0
- sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +118 -0
- sglang/srt/layers/quantization/quark/utils.py +107 -0
- sglang/srt/layers/quantization/unquant.py +60 -6
- sglang/srt/layers/quantization/w4afp8.py +1 -1
- sglang/srt/layers/rotary_embedding.py +225 -1
- sglang/srt/layers/utils.py +9 -0
- sglang/srt/layers/vocab_parallel_embedding.py +15 -4
- sglang/srt/lora/lora_manager.py +70 -14
- sglang/srt/lora/lora_registry.py +10 -2
- sglang/srt/lora/mem_pool.py +43 -5
- sglang/srt/managers/cache_controller.py +61 -32
- sglang/srt/managers/data_parallel_controller.py +52 -2
- sglang/srt/managers/detokenizer_manager.py +1 -1
- sglang/srt/managers/io_struct.py +21 -4
- sglang/srt/managers/mm_utils.py +5 -11
- sglang/srt/managers/schedule_batch.py +30 -8
- sglang/srt/managers/schedule_policy.py +3 -1
- sglang/srt/managers/scheduler.py +170 -18
- sglang/srt/managers/scheduler_output_processor_mixin.py +1 -2
- sglang/srt/managers/scheduler_recv_skipper.py +37 -0
- sglang/srt/managers/scheduler_update_weights_mixin.py +6 -0
- sglang/srt/managers/template_manager.py +59 -22
- sglang/srt/managers/tokenizer_manager.py +137 -67
- sglang/srt/managers/tp_worker.py +3 -0
- sglang/srt/managers/tp_worker_overlap_thread.py +3 -0
- sglang/srt/managers/utils.py +45 -1
- sglang/srt/mem_cache/cpp_radix_tree/radix_tree.py +182 -0
- sglang/srt/mem_cache/hicache_storage.py +13 -21
- sglang/srt/mem_cache/hiradix_cache.py +53 -5
- sglang/srt/mem_cache/memory_pool_host.py +1 -1
- sglang/srt/mem_cache/multimodal_cache.py +33 -13
- sglang/srt/mem_cache/radix_cache_cpp.py +229 -0
- sglang/srt/mem_cache/storage/hf3fs/client_hf3fs.py +2 -2
- sglang/srt/mem_cache/storage/hf3fs/hf3fs_utils.cpp +35 -0
- sglang/srt/model_executor/cuda_graph_runner.py +24 -9
- sglang/srt/model_executor/forward_batch_info.py +48 -17
- sglang/srt/model_executor/model_runner.py +24 -2
- sglang/srt/model_loader/weight_utils.py +10 -0
- sglang/srt/models/bailing_moe.py +425 -0
- sglang/srt/models/deepseek_v2.py +95 -50
- sglang/srt/models/ernie4.py +426 -0
- sglang/srt/models/ernie4_eagle.py +203 -0
- sglang/srt/models/gemma3n_mm.py +39 -0
- sglang/srt/models/glm4_moe.py +102 -27
- sglang/srt/models/gpt_oss.py +1134 -0
- sglang/srt/models/grok.py +3 -3
- sglang/srt/models/llama4.py +13 -2
- sglang/srt/models/mixtral.py +3 -3
- sglang/srt/models/mllama4.py +428 -19
- sglang/srt/models/qwen2.py +6 -0
- sglang/srt/models/qwen2_moe.py +7 -4
- sglang/srt/models/qwen3_moe.py +39 -14
- sglang/srt/models/step3_vl.py +10 -1
- sglang/srt/models/transformers.py +2 -5
- sglang/srt/multimodal/processors/base_processor.py +4 -3
- sglang/srt/multimodal/processors/gemma3n.py +0 -7
- sglang/srt/multimodal/processors/step3_vl.py +3 -1
- sglang/srt/operations_strategy.py +1 -1
- sglang/srt/reasoning_parser.py +18 -39
- sglang/srt/server_args.py +218 -23
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +18 -0
- sglang/srt/two_batch_overlap.py +163 -9
- sglang/srt/utils.py +41 -26
- sglang/srt/weight_sync/utils.py +1 -1
- sglang/test/runners.py +4 -4
- sglang/test/test_utils.py +4 -4
- sglang/version.py +1 -1
- {sglang-0.4.10.post1.dist-info → sglang-0.5.0rc0.dist-info}/METADATA +18 -15
- {sglang-0.4.10.post1.dist-info → sglang-0.5.0rc0.dist-info}/RECORD +143 -116
- /sglang/srt/mem_cache/{mooncake_store → storage/mooncake_store}/mooncake_store.py +0 -0
- /sglang/srt/mem_cache/{mooncake_store → storage/mooncake_store}/unit_test.py +0 -0
- /sglang/srt/mem_cache/{nixl → storage/nixl}/hicache_nixl.py +0 -0
- /sglang/srt/mem_cache/{nixl → storage/nixl}/nixl_utils.py +0 -0
- /sglang/srt/mem_cache/{nixl → storage/nixl}/test_hicache_nixl_storage.py +0 -0
- {sglang-0.4.10.post1.dist-info → sglang-0.5.0rc0.dist-info}/WHEEL +0 -0
- {sglang-0.4.10.post1.dist-info → sglang-0.5.0rc0.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.10.post1.dist-info → sglang-0.5.0rc0.dist-info}/top_level.txt +0 -0
sglang/srt/server_args.py
CHANGED
@@ -37,6 +37,7 @@ from sglang.srt.utils import (
     is_hip,
     is_port_available,
     is_remote_url,
+    is_triton_kernels_available,
     is_valid_ipv6_address,
     nullable_str,
 )
@@ -149,6 +150,7 @@ class ServerArgs:
     max_lora_rank: Optional[int] = None
     lora_target_modules: Optional[Union[set[str], List[str]]] = None
     lora_paths: Optional[Union[dict[str, str], dict[str, LoRARef], List[str]]] = None
+    max_loaded_loras: Optional[int] = None
     max_loras_per_batch: int = 8
     lora_backend: str = "triton"
 
@@ -172,12 +174,11 @@ class ServerArgs:
 
     # Expert parallelism
     ep_size: int = 1
-
-    enable_deepep_moe: bool = False
+    moe_a2a_backend: Optional[Literal["deepep"]] = None
     enable_flashinfer_cutlass_moe: bool = False
     enable_flashinfer_trtllm_moe: bool = False
     enable_flashinfer_allreduce_fusion: bool = False
-    deepep_mode:
+    deepep_mode: Literal["auto", "normal", "low_latency"] = "auto"
     ep_num_redundant_experts: int = 0
     ep_dispatch_algorithm: Optional[Literal["static", "dynamic", "fake"]] = None
     init_expert_location: str = "trivial"
@@ -201,6 +202,7 @@ class ServerArgs:
     hicache_io_backend: str = "kernel"
     hicache_mem_layout: str = "layer_first"
     hicache_storage_backend: Optional[str] = None
+    hicache_storage_prefetch_policy: str = "best_effort"
 
     # Double Sparsity
     enable_double_sparsity: bool = False
@@ -219,6 +221,7 @@ class ServerArgs:
     enable_profile_cuda_graph: bool = False
     enable_cudagraph_gc: bool = False
     enable_nccl_nvls: bool = False
+    enable_symm_mem: bool = False
     enable_tokenizer_batch_encode: bool = False
     disable_outlines_disk_cache: bool = False
     disable_custom_all_reduce: bool = False
@@ -228,6 +231,7 @@ class ServerArgs:
     enable_dp_attention: bool = False
     enable_dp_lm_head: bool = False
     enable_two_batch_overlap: bool = False
+    tbo_token_distribution_threshold: float = 0.48
     enable_torch_compile: bool = False
     torch_compile_max_bs: int = 32
     torchao_config: str = ""
@@ -246,6 +250,8 @@ class ServerArgs:
     disable_fast_image_processor: bool = False
     enable_return_hidden_states: bool = False
     enable_triton_kernel_moe: bool = False
+    enable_flashinfer_mxfp4_moe: bool = False
+    scheduler_recv_interval: int = 1
 
     # Debug tensor dumps
     debug_tensor_dump_output_folder: Optional[str] = None
@@ -272,7 +278,30 @@ class ServerArgs:
     enable_pdmux: bool = False
     sm_group_num: int = 3
 
+    # For tool server
+    tool_server: Optional[str] = None
+
+    # Deprecated arguments
+    enable_ep_moe: bool = False
+    enable_deepep_moe: bool = False
+
     def __post_init__(self):
+
+        # Check deprecated arguments
+        def print_deprecated_warning(message: str):
+            logger.warning(f"\033[33m{message}\033[0m")
+
+        if self.enable_ep_moe:
+            self.ep_size = self.tp_size
+            print_deprecated_warning(
+                "NOTE: --enable-ep-moe is deprecated. Please set `--ep-size` to the same value as `--tp-size` instead."
+            )
+        if self.enable_deepep_moe:
+            self.moe_a2a_backend = "deepep"
+            print_deprecated_warning(
+                "NOTE: --enable-deepep-moe is deprecated. Please set `--moe-a2a-backend` to 'deepep' instead."
+            )
+
         # Set missing default values
         if self.tokenizer_path is None:
             self.tokenizer_path = self.model_path
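The deprecation shim in `__post_init__` above maps the old boolean flags onto the new settings before any other validation runs. A minimal standalone sketch of that mapping (illustrative only, not the sglang API; `args` is any object carrying these attributes):

import logging

logger = logging.getLogger(__name__)

def apply_deprecated_moe_flags(args) -> None:
    """Translate deprecated --enable-ep-moe / --enable-deepep-moe to the new knobs."""
    if args.enable_ep_moe:
        args.ep_size = args.tp_size  # the old flag implied ep_size == tp_size
        logger.warning("--enable-ep-moe is deprecated; set --ep-size to the --tp-size value.")
    if args.enable_deepep_moe:
        args.moe_a2a_backend = "deepep"  # DeepEP is now one --moe-a2a-backend choice
        logger.warning("--enable-deepep-moe is deprecated; use --moe-a2a-backend deepep.")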
@@ -420,6 +449,81 @@ class ServerArgs:
                 "trtllm_mla backend does not support speculative decoding yet."
             )
 
+        if (
+            self.attention_backend == "trtllm_mha"
+            or self.decode_attention_backend == "trtllm_mha"
+            or self.prefill_attention_backend == "trtllm_mha"
+        ):
+            if not is_sm100_supported():
+                raise ValueError(
+                    "TRTLLM MHA backend is only supported on Blackwell GPUs (SM100). Please use a different backend."
+                )
+
+            if self.page_size not in [16, 32, 64]:
+                logger.warning(
+                    f"TensorRT-LLM MHA only supports page_size of 16, 32 or 64, changing page_size from {self.page_size} to 64."
+                )
+                self.page_size = 64
+
+            if self.speculative_algorithm is not None:
+                raise ValueError(
+                    "trtllm_mha backend does not support speculative decoding yet."
+                )
+
+        model_arch = self.get_hf_config().architectures[0]
+        if model_arch in ["GptOssForCausalLM"]:
+            if self.attention_backend is None:
+                # default is triton, but we could have trtllm_mha as an option
+                self.attention_backend = "triton"
+            assert (
+                self.attention_backend == "trtllm_mha"
+                or self.attention_backend == "triton"
+            )
+            quantization_config = getattr(
+                self.get_hf_config(), "quantization_config", None
+            )
+            is_mxfp4_quant_format = (
+                quantization_config is not None
+                and quantization_config.get("quant_method") == "mxfp4"
+            )
+
+            if is_sm100_supported() and is_mxfp4_quant_format:
+                self.enable_flashinfer_mxfp4_moe = True
+                self.enable_triton_kernel_moe = False
+                logger.info(
+                    "Detected SM100 and MXFP4 quantization format for GPT-OSS model, enabling FlashInfer MXFP4 MOE kernel."
+                )
+            else:
+                if self.enable_triton_kernel_moe:
+                    assert (
+                        self.ep_size == 1
+                    ), "Triton kernel MoE is only supported when ep_size == 1"
+                if not self.enable_triton_kernel_moe and self.ep_size == 1:
+                    self.enable_triton_kernel_moe = True
+                    logger.info(
+                        "Detected GPT-OSS model, enabling triton_kernels MOE kernel."
+                    )
+
+            self.disable_hybrid_swa_memory = True
+
+            if is_mxfp4_quant_format:
+                # use bf16 for mxfp4 triton kernels
+                self.dtype = "bfloat16"
+
+        if self.attention_backend == "dual_chunk_flash_attn":
+            logger.warning(
+                "Mixed chunk is disabled because of using dual chunk flash attention backend"
+            )
+            logger.warning(
+                "Radix cache is disabled because of using dual chunk flash attention backend"
+            )
+            logger.warning(
+                "Cuda graph is disabled because of using dual chunk flash attention backend"
+            )
+            self.enable_mixed_chunk = False
+            self.disable_cuda_graph = True
+            self.disable_radix_cache = True
+
         # Set page size
         if self.page_size is None:
             self.page_size = 1
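The trtllm_mha checks above coerce the page size to a supported value rather than failing. A standalone sketch of that clamp (illustrative only; the real logic lives inside ServerArgs.__post_init__ as shown in the hunk):

def clamp_trtllm_mha_page_size(page_size: int) -> int:
    # TensorRT-LLM MHA supports only page sizes 16, 32, and 64; fall back to 64.
    if page_size not in (16, 32, 64):
        return 64
    return page_size

assert clamp_trtllm_mha_page_size(1) == 64   # unsupported -> coerced
assert clamp_trtllm_mha_page_size(32) == 32  # supported -> unchanged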
@@ -455,14 +559,20 @@ class ServerArgs:
                 self.quantization == "modelopt_fp4"
             ), "modelopt_fp4 quantization is required for Flashinfer MOE"
             os.environ["TRTLLM_ENABLE_PDL"] = "1"
-
-
+            assert self.ep_size in [
+                1,
+                self.tp_size,
+            ], "The expert parallel size must be 1 or the same as the tensor parallel size"
+
+        if self.enable_flashinfer_trtllm_moe:
+            if not self.disable_shared_experts_fusion:
+                self.disable_shared_experts_fusion = True
                 logger.warning(
-
+                    "FlashInfer TRTLLM MoE is enabled. --disable-shared-experts-fusion is automatically set."
                 )
 
         # DeepEP MoE
-        if self.
+        if self.moe_a2a_backend == "deepep":
             if self.deepep_mode == "normal":
                 logger.warning("Cuda graph is disabled because deepep_mode=`normal`")
                 self.disable_cuda_graph = True
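The new assertion ties expert parallelism to tensor parallelism on the FlashInfer CUTLASS MoE path: EP is either off or spans the whole TP group. A small restatement of the invariant (toy helper, not the sglang API):

def check_ep_size(ep_size: int, tp_size: int) -> None:
    # Mirrors the assertion above: EP disabled (1) or EP == TP.
    assert ep_size in (1, tp_size), (
        "The expert parallel size must be 1 or the same as the tensor parallel size"
    )

check_ep_size(1, 8)  # ok: EP disabled
check_ep_size(8, 8)  # ok: EP spans the TP group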
@@ -486,7 +596,7 @@ class ServerArgs:
             )
 
         if self.enable_eplb:
-            assert self.
+            assert self.ep_size > 1 or self.moe_a2a_backend is not None
 
         if self.enable_expert_distribution_metrics and (
             self.expert_distribution_recorder_mode is None
@@ -786,6 +896,7 @@ class ServerArgs:
                 "moe_wna16",
                 "qoq",
                 "w4afp8",
+                "mxfp4",
             ],
             help="The quantization method.",
         )
@@ -848,7 +959,7 @@ class ServerArgs:
             "--schedule-policy",
             type=str,
             default=ServerArgs.schedule_policy,
-            choices=["lpm", "random", "fcfs", "dfs-weight"],
+            choices=["lpm", "random", "fcfs", "dfs-weight", "lof"],
             help="The scheduling policy of the requests.",
         )
         parser.add_argument(
@@ -1151,6 +1262,7 @@ class ServerArgs:
             choices=[
                 "round_robin",
                 "shortest_queue",
+                "minimum_tokens",
             ],
         )
 
@@ -1218,6 +1330,12 @@ class ServerArgs:
             default=8,
             help="Maximum number of adapters for a running batch, include base-only request.",
         )
+        parser.add_argument(
+            "--max-loaded-loras",
+            type=int,
+            default=ServerArgs.max_loaded_loras,
+            help="If specified, it limits the maximum number of LoRA adapters loaded in CPU memory at a time. The value must be greater than or equal to `--max-loras-per-batch`.",
+        )
         parser.add_argument(
             "--lora-backend",
             type=str,
@@ -1240,6 +1358,8 @@ class ServerArgs:
                 "ascend",
                 "triton",
                 "trtllm_mla",
+                "trtllm_mha",
+                "dual_chunk_flash_attn",
             ],
             default=ServerArgs.attention_backend,
             help="Choose the kernels for attention layers.",
@@ -1354,30 +1474,27 @@ class ServerArgs:
             help="The expert parallelism size.",
         )
         parser.add_argument(
-            "--
-
-
+            "--moe-a2a-backend",
+            type=str,
+            choices=["deepep"],
+            default=ServerArgs.moe_a2a_backend,
+            help="Choose the backend for MoE A2A.",
         )
         parser.add_argument(
             "--enable-flashinfer-cutlass-moe",
             action="store_true",
-            help="Enable FlashInfer CUTLASS MoE backend for modelopt_fp4 quant on Blackwell. Supports MoE-EP
+            help="Enable FlashInfer CUTLASS MoE backend for modelopt_fp4 quant on Blackwell. Supports MoE-EP",
         )
         parser.add_argument(
             "--enable-flashinfer-trtllm-moe",
             action="store_true",
-            help="Enable FlashInfer TRTLLM MoE backend on Blackwell. Supports BlockScale FP8 MoE-EP
+            help="Enable FlashInfer TRTLLM MoE backend on Blackwell. Supports BlockScale FP8 MoE-EP",
         )
         parser.add_argument(
             "--enable-flashinfer-allreduce-fusion",
             action="store_true",
             help="Enable FlashInfer allreduce fusion for Add_RMSNorm.",
         )
-        parser.add_argument(
-            "--enable-deepep-moe",
-            action="store_true",
-            help="Enabling DeepEP MoE implementation for EP MoE.",
-        )
         parser.add_argument(
             "--deepep-mode",
             type=str,
@@ -1503,6 +1620,13 @@ class ServerArgs:
             default=ServerArgs.hicache_storage_backend,
             help="The storage backend for hierarchical KV cache.",
         )
+        parser.add_argument(
+            "--hicache-storage-prefetch-policy",
+            type=str,
+            choices=["best_effort", "wait_complete", "timeout"],
+            default=ServerArgs.hicache_storage_prefetch_policy,
+            help="Control when prefetching from the storage backend should stop.",
+        )
 
         # Double Sparsity
         parser.add_argument(
@@ -1584,6 +1708,11 @@ class ServerArgs:
             action="store_true",
             help="Enable NCCL NVLS for prefill heavy requests when available.",
         )
+        parser.add_argument(
+            "--enable-symm-mem",
+            action="store_true",
+            help="Enable NCCL symmetric memory for fast collectives.",
+        )
         parser.add_argument(
             "--enable-tokenizer-batch-encode",
             action="store_true",
|
1629
1758
|
action="store_true",
|
1630
1759
|
help="Enabling two micro batches to overlap.",
|
1631
1760
|
)
|
1761
|
+
parser.add_argument(
|
1762
|
+
"--tbo-token-distribution-threshold",
|
1763
|
+
type=float,
|
1764
|
+
default=ServerArgs.tbo_token_distribution_threshold,
|
1765
|
+
help="The threshold of token distribution between two batches in micro-batch-overlap, determines whether to two-batch-overlap or two-chunk-overlap. Set to 0 denote disable two-chunk-overlap.",
|
1766
|
+
)
|
1632
1767
|
parser.add_argument(
|
1633
1768
|
"--enable-torch-compile",
|
1634
1769
|
action="store_true",
|
@@ -1726,6 +1861,17 @@ class ServerArgs:
             action="store_true",
             help="Use triton moe grouped gemm kernel.",
         )
+        parser.add_argument(
+            "--enable-flashinfer-mxfp4-moe",
+            action="store_true",
+            help="Enable FlashInfer MXFP4 MoE backend for modelopt_fp4 quant on Blackwell.",
+        )
+        parser.add_argument(
+            "--scheduler-recv-interval",
+            type=int,
+            default=ServerArgs.scheduler_recv_interval,
+            help="The interval to poll requests in scheduler. Can be set to >1 to reduce the overhead of this.",
+        )
 
         # Debug tensor dumps
         parser.add_argument(
@@ -1839,6 +1985,26 @@ class ServerArgs:
             help="Disable mmap while loading weight using safetensors.",
         )
 
+        # For tool server
+        parser.add_argument(
+            "--tool-server",
+            type=str,
+            default=None,
+            help="Either 'demo' or a comma-separated list of tool server urls to use for the model. If not specified, no tool server will be used.",
+        )
+
+        # Deprecated arguments
+        parser.add_argument(
+            "--enable-ep-moe",
+            action="store_true",
+            help="(Deprecated) Enabling expert parallelism for moe. The ep size is equal to the tp size.",
+        )
+        parser.add_argument(
+            "--enable-deepep-moe",
+            action="store_true",
+            help="(Deprecated) Enabling DeepEP MoE implementation for EP MoE.",
+        )
+
     @classmethod
     def from_cli_args(cls, args: argparse.Namespace):
         args.tp_size = args.tensor_parallel_size
@@ -1895,6 +2061,20 @@ class ServerArgs:
         if "Llama4" in model_arch:
             assert self.attention_backend == "fa3", "fa3 is required for Llama4 model"
 
+        if model_arch in [
+            "Gemma2ForCausalLM",
+            "Gemma3ForCausalLM",
+            "Gemma3ForConditionalGeneration",
+            "Gemma3nForCausalLM",
+            "Gemma3nForConditionalGeneration",
+        ]:
+            # FIXME: https://github.com/sgl-project/sglang/pull/7367 is not compatible with gemma2 model.
+            # It failed at this test: https://github.com/sgl-project/sglang/actions/runs/16255155597/job/45890331952#step:4:736
+            logger.warning(
+                f"Disable hybrid SWA memory for {model_arch} as it is not yet supported."
+            )
+            self.disable_hybrid_swa_memory = True
+
         # Check LoRA
         self.check_lora_server_args()
 
@@ -1930,21 +2110,23 @@ class ServerArgs:
 
         if self.enable_lora:
             # Normalize lora_paths to a dictionary if it is a list.
+            # TODO (lifuhuang): support specifying pinned adapters in server_args.
             if isinstance(self.lora_paths, list):
                 lora_paths = self.lora_paths
                 self.lora_paths = {}
                 for lora_path in lora_paths:
                     if "=" in lora_path:
                         name, path = lora_path.split("=", 1)
-                        self.lora_paths[name] = LoRARef(
+                        self.lora_paths[name] = LoRARef(
+                            lora_name=name, lora_path=path, pinned=False
+                        )
                     else:
                         self.lora_paths[lora_path] = LoRARef(
-                            lora_name=lora_path,
-                            lora_path=lora_path,
+                            lora_name=lora_path, lora_path=lora_path, pinned=False
                         )
             elif isinstance(self.lora_paths, dict):
                 self.lora_paths = {
-                    k: LoRARef(lora_name=k, lora_path=v)
+                    k: LoRARef(lora_name=k, lora_path=v, pinned=False)
                     for k, v in self.lora_paths.items()
                 }
             elif self.lora_paths is None:
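The normalization above accepts "name=path" entries, bare paths, or a dict, and now always constructs LoRARef with pinned=False. A toy version of the list case (a plain dict stands in for the LoRARef class, which carries more fields in sglang):

def normalize_lora_paths(lora_paths: list) -> dict:
    out = {}
    for item in lora_paths:
        name, _, path = item.partition("=")  # "name=path" or a bare path
        out[name] = {"lora_name": name, "lora_path": path or name, "pinned": False}
    return out

print(normalize_lora_paths(["sql=/adapters/sql", "/adapters/chat"]))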
@@ -1969,6 +2151,19 @@ class ServerArgs:
             self.max_lora_rank and self.lora_target_modules
         ), "When no initial --lora-paths is provided, you need to specify both --max-lora-rank and --lora-target-modules for LoRA initialization."
 
+        # Validate max_loaded_loras
+        if self.max_loaded_loras is not None:
+            assert self.max_loaded_loras >= self.max_loras_per_batch, (
+                "max_loaded_loras should be greater than or equal to max_loras_per_batch. "
+                f"max_loaded_loras={self.max_loaded_loras}, max_loras_per_batch={self.max_loras_per_batch}"
+            )
+            assert (
+                not self.lora_paths or len(self.lora_paths) <= self.max_loaded_loras
+            ), (
+                "The number of LoRA paths should not exceed max_loaded_loras. "
+                f"max_loaded_loras={self.max_loaded_loras}, lora_paths={len(self.lora_paths)}"
+            )
+
     def validate_disagg_tp_size(self, prefill_tp: int, decode_tp: int):
         larger_tp = max(decode_tp, prefill_tp)
         smaller_tp = min(decode_tp, prefill_tp)
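The two assertions above encode a capacity ordering for LoRA serving: a batch draws adapters from the CPU-resident pool, so max_loras_per_batch cannot exceed max_loaded_loras, and any statically configured adapters must fit in that pool. A compact restatement with hypothetical values:

def check_lora_limits(max_loaded_loras, max_loras_per_batch, lora_paths) -> None:
    # max_loaded_loras bounds CPU-resident adapters; a batch cannot need more.
    if max_loaded_loras is not None:
        assert max_loaded_loras >= max_loras_per_batch
        assert not lora_paths or len(lora_paths) <= max_loaded_loras

check_lora_limits(16, 8, {"a": "/adapters/a"})  # ok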
sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py
CHANGED
@@ -142,6 +142,22 @@ class EAGLEDraftExtendCudaGraphRunner:
         self.global_num_tokens_for_logprob_gpu = None
         self.gathered_buffer = None
 
+        if hasattr(
+            self.model_runner.model_config.hf_config, "draft_vocab_size"
+        ):  # llama_eagle
+            vocab_size = self.model_runner.model_config.hf_config.draft_vocab_size
+        elif hasattr(
+            self.model_runner.model_config.hf_config, "hot_vocab_size"
+        ):  # llama_eagle3
+            vocab_size = self.model_runner.model_config.hf_config.hot_vocab_size
+        else:
+            vocab_size = self.model_runner.model_config.vocab_size
+
+        self.next_token_logits_buffer = torch.zeros(
+            (self.max_bs, vocab_size),
+            dtype=torch.float,
+        )
+
         # Capture
         try:
             with model_capture_mode():
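The buffer is sized with a vocab-size fallback chain, since EAGLE draft heads may use a reduced vocabulary. A self-contained sketch of the same chain (SimpleNamespace stands in for the HF config object):

from types import SimpleNamespace

def pick_draft_vocab_size(hf_config, model_vocab_size: int) -> int:
    if hasattr(hf_config, "draft_vocab_size"):  # llama_eagle
        return hf_config.draft_vocab_size
    if hasattr(hf_config, "hot_vocab_size"):    # llama_eagle3
        return hf_config.hot_vocab_size
    return model_vocab_size

assert pick_draft_vocab_size(SimpleNamespace(draft_vocab_size=32000), 128256) == 32000
assert pick_draft_vocab_size(SimpleNamespace(), 128256) == 128256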
@@ -189,6 +205,7 @@ class EAGLEDraftExtendCudaGraphRunner:
         out_cache_loc = self.out_cache_loc[:num_tokens]
         positions = self.positions[:num_tokens]
         hidden_states = self.hidden_states[:num_tokens]
+        next_token_logits_buffer = self.next_token_logits_buffer[:bs]
 
         if self.require_mlp_tp_gather:
             self.global_num_tokens_gpu.copy_(
@@ -238,6 +255,7 @@ class EAGLEDraftExtendCudaGraphRunner:
             input_ids=input_ids,
             req_pool_indices=req_pool_indices,
             seq_lens=seq_lens,
+            next_token_logits_buffer=next_token_logits_buffer,
             req_to_token_pool=self.model_runner.req_to_token_pool,
             token_to_kv_pool=self.model_runner.token_to_kv_pool,
             out_cache_loc=out_cache_loc,