sglang 0.4.10.post2__py3-none-any.whl → 0.5.0rc1__py3-none-any.whl
This diff compares two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- sglang/__init__.py +8 -3
- sglang/bench_one_batch.py +119 -17
- sglang/lang/chat_template.py +18 -0
- sglang/srt/bench_utils.py +137 -0
- sglang/srt/configs/model_config.py +42 -7
- sglang/srt/conversation.py +9 -5
- sglang/srt/disaggregation/base/conn.py +5 -2
- sglang/srt/disaggregation/decode.py +14 -4
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +3 -0
- sglang/srt/disaggregation/mooncake/conn.py +286 -160
- sglang/srt/disaggregation/mooncake/transfer_engine.py +29 -0
- sglang/srt/disaggregation/prefill.py +2 -0
- sglang/srt/distributed/parallel_state.py +15 -11
- sglang/srt/entrypoints/context.py +227 -0
- sglang/srt/entrypoints/engine.py +15 -9
- sglang/srt/entrypoints/harmony_utils.py +372 -0
- sglang/srt/entrypoints/http_server.py +74 -4
- sglang/srt/entrypoints/openai/protocol.py +218 -1
- sglang/srt/entrypoints/openai/serving_chat.py +41 -11
- sglang/srt/entrypoints/openai/serving_responses.py +1273 -0
- sglang/srt/entrypoints/openai/tool_server.py +175 -0
- sglang/srt/entrypoints/tool.py +87 -0
- sglang/srt/eplb/expert_location.py +5 -1
- sglang/srt/function_call/ebnf_composer.py +1 -0
- sglang/srt/function_call/function_call_parser.py +2 -0
- sglang/srt/function_call/glm4_moe_detector.py +1 -1
- sglang/srt/function_call/gpt_oss_detector.py +331 -0
- sglang/srt/function_call/kimik2_detector.py +3 -3
- sglang/srt/function_call/qwen3_coder_detector.py +219 -9
- sglang/srt/hf_transformers_utils.py +30 -3
- sglang/srt/jinja_template_utils.py +14 -1
- sglang/srt/layers/attention/aiter_backend.py +375 -115
- sglang/srt/layers/attention/ascend_backend.py +3 -0
- sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1700 -0
- sglang/srt/layers/attention/flashattention_backend.py +18 -0
- sglang/srt/layers/attention/flashinfer_backend.py +52 -13
- sglang/srt/layers/attention/hybrid_attn_backend.py +1 -1
- sglang/srt/layers/attention/triton_backend.py +85 -14
- sglang/srt/layers/attention/triton_ops/decode_attention.py +17 -0
- sglang/srt/layers/attention/triton_ops/extend_attention.py +143 -98
- sglang/srt/layers/attention/trtllm_mha_backend.py +332 -0
- sglang/srt/layers/attention/trtllm_mla_backend.py +119 -22
- sglang/srt/layers/attention/vision.py +22 -6
- sglang/srt/layers/attention/wave_backend.py +627 -0
- sglang/srt/layers/attention/wave_ops/decode_attention.py +186 -0
- sglang/srt/layers/attention/wave_ops/extend_attention.py +149 -0
- sglang/srt/layers/attention/wave_ops/prefill_attention.py +79 -0
- sglang/srt/layers/communicator.py +29 -14
- sglang/srt/layers/dp_attention.py +12 -0
- sglang/srt/layers/flashinfer_comm_fusion.py +4 -4
- sglang/srt/layers/linear.py +3 -7
- sglang/srt/layers/moe/cutlass_moe.py +12 -3
- sglang/srt/layers/moe/cutlass_w4a8_moe.py +4 -5
- sglang/srt/layers/moe/ep_moe/kernels.py +43 -0
- sglang/srt/layers/moe/ep_moe/layer.py +135 -73
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=384,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +101 -12
- sglang/srt/layers/moe/fused_moe_triton/layer.py +412 -33
- sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +188 -3
- sglang/srt/layers/moe/token_dispatcher/deepep.py +61 -24
- sglang/srt/layers/moe/topk.py +16 -4
- sglang/srt/layers/moe/utils.py +16 -0
- sglang/srt/layers/quantization/__init__.py +27 -3
- sglang/srt/layers/quantization/fp4.py +557 -0
- sglang/srt/layers/quantization/fp8.py +3 -6
- sglang/srt/layers/quantization/fp8_kernel.py +277 -0
- sglang/srt/layers/quantization/fp8_utils.py +51 -10
- sglang/srt/layers/quantization/modelopt_quant.py +258 -68
- sglang/srt/layers/quantization/mxfp4.py +654 -0
- sglang/srt/layers/quantization/mxfp4_tensor.py +133 -0
- sglang/srt/layers/quantization/quark/schemes/__init__.py +6 -0
- sglang/srt/layers/quantization/quark/schemes/quark_scheme.py +55 -0
- sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +118 -0
- sglang/srt/layers/quantization/quark/utils.py +107 -0
- sglang/srt/layers/quantization/unquant.py +60 -6
- sglang/srt/layers/quantization/w4afp8.py +21 -12
- sglang/srt/layers/quantization/w8a8_int8.py +48 -34
- sglang/srt/layers/rotary_embedding.py +506 -3
- sglang/srt/layers/utils.py +9 -0
- sglang/srt/layers/vocab_parallel_embedding.py +8 -3
- sglang/srt/lora/backend/base_backend.py +3 -23
- sglang/srt/lora/layers.py +60 -114
- sglang/srt/lora/lora.py +17 -62
- sglang/srt/lora/lora_manager.py +82 -62
- sglang/srt/lora/lora_registry.py +23 -11
- sglang/srt/lora/mem_pool.py +63 -68
- sglang/srt/lora/triton_ops/qkv_lora_b.py +1 -1
- sglang/srt/lora/utils.py +25 -58
- sglang/srt/managers/cache_controller.py +75 -58
- sglang/srt/managers/detokenizer_manager.py +1 -1
- sglang/srt/managers/io_struct.py +20 -8
- sglang/srt/managers/mm_utils.py +6 -13
- sglang/srt/managers/multimodal_processor.py +1 -1
- sglang/srt/managers/schedule_batch.py +61 -25
- sglang/srt/managers/schedule_policy.py +6 -6
- sglang/srt/managers/scheduler.py +41 -19
- sglang/srt/managers/scheduler_output_processor_mixin.py +1 -2
- sglang/srt/managers/scheduler_profiler_mixin.py +28 -8
- sglang/srt/managers/scheduler_recv_skipper.py +37 -0
- sglang/srt/managers/scheduler_update_weights_mixin.py +6 -0
- sglang/srt/managers/template_manager.py +35 -1
- sglang/srt/managers/tokenizer_manager.py +47 -30
- sglang/srt/managers/tp_worker.py +3 -0
- sglang/srt/managers/tp_worker_overlap_thread.py +3 -0
- sglang/srt/mem_cache/allocator.py +61 -87
- sglang/srt/mem_cache/hicache_storage.py +1 -1
- sglang/srt/mem_cache/hiradix_cache.py +80 -22
- sglang/srt/mem_cache/lora_radix_cache.py +421 -0
- sglang/srt/mem_cache/memory_pool_host.py +34 -36
- sglang/srt/mem_cache/multimodal_cache.py +33 -13
- sglang/srt/mem_cache/radix_cache.py +2 -5
- sglang/srt/mem_cache/storage/hf3fs/client_hf3fs.py +2 -2
- sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +443 -0
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +139 -67
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +6 -9
- sglang/srt/model_executor/cuda_graph_runner.py +29 -9
- sglang/srt/model_executor/forward_batch_info.py +61 -19
- sglang/srt/model_executor/model_runner.py +148 -37
- sglang/srt/model_loader/loader.py +18 -6
- sglang/srt/model_loader/weight_utils.py +10 -0
- sglang/srt/models/bailing_moe.py +425 -0
- sglang/srt/models/deepseek_v2.py +137 -59
- sglang/srt/models/ernie4.py +426 -0
- sglang/srt/models/ernie4_eagle.py +203 -0
- sglang/srt/models/gemma2.py +0 -34
- sglang/srt/models/gemma3n_mm.py +38 -0
- sglang/srt/models/glm4.py +6 -0
- sglang/srt/models/glm4_moe.py +28 -16
- sglang/srt/models/glm4v.py +589 -0
- sglang/srt/models/glm4v_moe.py +400 -0
- sglang/srt/models/gpt_oss.py +1251 -0
- sglang/srt/models/granite.py +0 -25
- sglang/srt/models/llama.py +0 -25
- sglang/srt/models/llama4.py +1 -1
- sglang/srt/models/qwen2.py +6 -0
- sglang/srt/models/qwen2_5_vl.py +7 -3
- sglang/srt/models/qwen2_audio.py +10 -9
- sglang/srt/models/qwen2_moe.py +6 -0
- sglang/srt/models/qwen3.py +0 -24
- sglang/srt/models/qwen3_moe.py +32 -6
- sglang/srt/models/registry.py +1 -1
- sglang/srt/models/step3_vl.py +9 -0
- sglang/srt/models/torch_native_llama.py +0 -24
- sglang/srt/models/transformers.py +2 -5
- sglang/srt/multimodal/processors/base_processor.py +23 -13
- sglang/srt/multimodal/processors/glm4v.py +132 -0
- sglang/srt/multimodal/processors/qwen_audio.py +4 -2
- sglang/srt/multimodal/processors/step3_vl.py +3 -1
- sglang/srt/reasoning_parser.py +332 -37
- sglang/srt/server_args.py +186 -75
- sglang/srt/speculative/eagle_worker.py +16 -0
- sglang/srt/two_batch_overlap.py +169 -9
- sglang/srt/utils.py +41 -5
- sglang/srt/weight_sync/tensor_bucket.py +106 -0
- sglang/test/attention/test_trtllm_mla_backend.py +186 -36
- sglang/test/doc_patch.py +59 -0
- sglang/test/few_shot_gsm8k.py +1 -1
- sglang/test/few_shot_gsm8k_engine.py +1 -1
- sglang/test/run_eval.py +4 -1
- sglang/test/runners.py +2 -2
- sglang/test/simple_eval_common.py +6 -0
- sglang/test/simple_eval_gpqa.py +2 -0
- sglang/test/test_fp4_moe.py +118 -36
- sglang/test/test_utils.py +1 -1
- sglang/utils.py +1 -1
- sglang/version.py +1 -1
- {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc1.dist-info}/METADATA +36 -38
- {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc1.dist-info}/RECORD +174 -141
- sglang/srt/lora/backend/flashinfer_backend.py +0 -131
- /sglang/{api.py → lang/api.py} +0 -0
- /sglang/{lang/backend → srt/layers/quantization/quark}/__init__.py +0 -0
- {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc1.dist-info}/WHEEL +0 -0
- {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc1.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.10.post2.dist-info → sglang-0.5.0rc1.dist-info}/top_level.txt +0 -0
sglang/srt/server_args.py
CHANGED
@@ -108,7 +108,7 @@ class ServerArgs:
     log_level: str = "info"
     log_level_http: Optional[str] = None
     log_requests: bool = False
-    log_requests_level: int =
+    log_requests_level: int = 2
     crash_dump_folder: Optional[str] = None
     show_time_cost: bool = False
     enable_metrics: bool = False
@@ -130,6 +130,7 @@ class ServerArgs:
     enable_cache_report: bool = False
     reasoning_parser: Optional[str] = None
     tool_call_parser: Optional[str] = None
+    tool_server: Optional[str] = None

     # Data parallelism
     dp_size: int = 1
@@ -201,6 +202,7 @@ class ServerArgs:
     hicache_io_backend: str = "kernel"
     hicache_mem_layout: str = "layer_first"
     hicache_storage_backend: Optional[str] = None
+    hicache_storage_prefetch_policy: str = "best_effort"

     # Double Sparsity
     enable_double_sparsity: bool = False
@@ -229,6 +231,7 @@ class ServerArgs:
     enable_dp_attention: bool = False
     enable_dp_lm_head: bool = False
     enable_two_batch_overlap: bool = False
+    tbo_token_distribution_threshold: float = 0.48
     enable_torch_compile: bool = False
     torch_compile_max_bs: int = 32
     torchao_config: str = ""
@@ -247,6 +250,8 @@ class ServerArgs:
     disable_fast_image_processor: bool = False
     enable_return_hidden_states: bool = False
     enable_triton_kernel_moe: bool = False
+    enable_flashinfer_mxfp4_moe: bool = False
+    scheduler_recv_interval: int = 1

     # Debug tensor dumps
     debug_tensor_dump_output_folder: Optional[str] = None
@@ -278,7 +283,6 @@ class ServerArgs:
     enable_deepep_moe: bool = False

     def __post_init__(self):
-
         # Check deprecated arguments
         def print_deprecated_warning(message: str):
             logger.warning(f"\033[33m{message}\033[0m")
@@ -384,6 +388,9 @@ class ServerArgs:
             self.attention_backend = "torch_native"
             self.sampling_backend = "pytorch"

+        # Model-specific adjustments
+        self.model_specific_adjustments()
+
         # Set kernel backends
         if self.device == "cpu":
             if self.attention_backend is None:
@@ -425,7 +432,10 @@ class ServerArgs:
                )
            self.page_size = 128

-        if
+        if (
+            self.attention_backend == "trtllm_mla"
+            or self.decode_attention_backend == "trtllm_mla"
+        ):
             if not is_sm100_supported():
                 raise ValueError(
                     "TRTLLM MLA backend is only supported on Blackwell GPUs (SM100). Please use a different backend."
@@ -436,11 +446,46 @@ class ServerArgs:
                     f"TensorRT-LLM MLA only supports page_size of 32 or 64, changing page_size from {self.page_size} to 64."
                 )
                 self.page_size = 64
+
             if self.speculative_algorithm is not None:
                 raise ValueError(
                     "trtllm_mla backend does not support speculative decoding yet."
                 )

+            if self.kv_cache_dtype not in ["fp8_e4m3", "auto"]:
+                raise ValueError(
+                    "TensorRT-LLM MLA backend only supports kv-cache-dtype of fp8_e4m3 or auto."
+                )
+
+        if (
+            self.attention_backend == "trtllm_mha"
+            or self.decode_attention_backend == "trtllm_mha"
+            or self.prefill_attention_backend == "trtllm_mha"
+        ):
+            if not is_sm100_supported():
+                raise ValueError(
+                    "TRTLLM MHA backend is only supported on Blackwell GPUs (SM100). Please use a different backend."
+                )
+
+            if self.page_size not in [16, 32, 64]:
+                logger.warning(
+                    f"TensorRT-LLM MHA only supports page_size of 16, 32 or 64, changing page_size from {self.page_size} to 64."
+                )
+                self.page_size = 64
+
+            if self.speculative_algorithm is not None:
+                raise ValueError(
+                    "trtllm_mha backend does not support speculative decoding yet."
+                )
+
+        if self.attention_backend == "dual_chunk_flash_attn":
+            logger.warning(
+                "Mixed chunk, radix cache, and cuda graphs are disabled because of using dual chunk flash attention backend"
+            )
+            self.enable_mixed_chunk = False
+            self.disable_cuda_graph = True
+            self.disable_radix_cache = True
+
         # Set page size
         if self.page_size is None:
             self.page_size = 1
@@ -481,6 +526,13 @@ class ServerArgs:
             self.tp_size,
         ], "The expert parallel size must be 1 or the same as the tensor parallel size"

+        if self.enable_flashinfer_trtllm_moe:
+            if not self.disable_shared_experts_fusion:
+                self.disable_shared_experts_fusion = True
+                logger.warning(
+                    "FlashInfer TRTLLM MoE is enabled. --disable-shared-experts-fusion is automatically set."
+                )
+
         # DeepEP MoE
         if self.moe_a2a_backend == "deepep":
             if self.deepep_mode == "normal":
@@ -493,7 +545,7 @@ class ServerArgs:

         if self.enable_eplb and (self.expert_distribution_recorder_mode is None):
             self.expert_distribution_recorder_mode = "stat"
-            logger.
+            logger.warning(
                 "EPLB is enabled. The expert_distribution_recorder_mode is automatically set."
             )

@@ -501,9 +553,6 @@ class ServerArgs:
             self.ep_dispatch_algorithm is None
         ):
             self.ep_dispatch_algorithm = "static"
-            logger.info(
-                "EPLB is enabled or init_expert_location is provided. ep_dispatch_algorithm is configured."
-            )

         if self.enable_eplb:
             assert self.ep_size > 1 or self.moe_a2a_backend is not None
@@ -526,6 +575,11 @@ class ServerArgs:
                 "Pipeline parallelism is incompatible with overlap schedule."
             )

+        if self.hicache_storage_backend == "mooncake":
+            # to use mooncake storage backend, the following conditions must be met:
+            self.hicache_io_backend = "kernel"
+            self.hicache_mem_layout = "page_first"
+
         # Speculative Decoding
         if self.speculative_algorithm == "NEXTN":
             # NEXTN shares the same implementation of EAGLE
@@ -806,6 +860,7 @@ class ServerArgs:
                 "moe_wna16",
                 "qoq",
                 "w4afp8",
+                "mxfp4",
             ],
             help="The quantization method.",
         )
@@ -868,7 +923,7 @@ class ServerArgs:
             "--schedule-policy",
             type=str,
             default=ServerArgs.schedule_policy,
-            choices=["lpm", "random", "fcfs", "dfs-weight"],
+            choices=["lpm", "random", "fcfs", "dfs-weight", "lof"],
             help="The scheduling policy of the requests.",
         )
         parser.add_argument(
@@ -1021,7 +1076,7 @@ class ServerArgs:
         parser.add_argument(
             "--log-requests-level",
             type=int,
-            default=
+            default=ServerArgs.log_requests_level,
             help="0: Log metadata (no sampling parameters). 1: Log metadata and sampling parameters. 2: Log metadata, sampling parameters and partial input/output. 3: Log every input/output.",
             choices=[0, 1, 2, 3],
         )
@@ -1140,7 +1195,7 @@ class ServerArgs:
         parser.add_argument(
             "--tool-call-parser",
             type=str,
-            choices=[
+            choices=[  # TODO: use FunctionCallParser.DetectorMap.keys()
                 "qwen25",
                 "mistral",
                 "llama3",
@@ -1150,10 +1205,17 @@ class ServerArgs:
                 "qwen3_coder",
                 "glm45",
                 "step3",
+                "gpt-oss",
             ],
             default=ServerArgs.tool_call_parser,
             help="Specify the parser for handling tool-call interactions. Options include: 'qwen25', 'mistral', 'llama3', 'deepseekv3', 'pythonic', 'kimi_k2', 'qwen3_coder', 'glm45', and 'step3'.",
         )
+        parser.add_argument(
+            "--tool-server",
+            type=str,
+            default=None,
+            help="Either 'demo' or a comma-separated list of tool server urls to use for the model. If not specified, no tool server will be used.",
+        )

         # Data parallelism
         parser.add_argument(
@@ -1253,53 +1315,42 @@ class ServerArgs:
         )

         # Kernel backend
+        ATTN_BACKENDS = [
+            "aiter",
+            "cutlass_mla",
+            "fa3",
+            "flashinfer",
+            "flashmla",
+            "intel_amx",
+            "torch_native",
+            "ascend",
+            "triton",
+            "trtllm_mla",
+            "trtllm_mha",
+            "dual_chunk_flash_attn",
+            "wave",
+        ]
         parser.add_argument(
             "--attention-backend",
             type=str,
-            choices=
-                "aiter",
-                "cutlass_mla",
-                "fa3",
-                "flashinfer",
-                "flashmla",
-                "intel_amx",
-                "torch_native",
-                "ascend",
-                "triton",
-                "trtllm_mla",
-            ],
+            choices=ATTN_BACKENDS,
             default=ServerArgs.attention_backend,
             help="Choose the kernels for attention layers.",
         )
-        parser.add_argument(
-            "--decode-attention-backend",
-            type=str,
-            choices=[
-                "flashinfer",
-                "triton",
-                "torch_native",
-                "fa3",
-                "flashmla",
-                "cutlass_mla",
-            ],
-            default=ServerArgs.decode_attention_backend,
-            help="Choose the kernels for decode attention layers (have priority over --attention-backend).",
-        )
-
         parser.add_argument(
             "--prefill-attention-backend",
             type=str,
-            choices=
-                "flashinfer",
-                "triton",
-                "torch_native",
-                "fa3",
-                "flashmla",
-                "cutlass_mla",
-            ],
+            choices=ATTN_BACKENDS,
             default=ServerArgs.prefill_attention_backend,
             help="Choose the kernels for prefill attention layers (have priority over --attention-backend).",
         )
+        parser.add_argument(
+            "--decode-attention-backend",
+            type=str,
+            choices=ATTN_BACKENDS,
+            default=ServerArgs.decode_attention_backend,
+            help="Choose the kernels for decode attention layers (have priority over --attention-backend).",
+        )
         parser.add_argument(
             "--sampling-backend",
             type=str,
@@ -1400,7 +1451,7 @@ class ServerArgs:
         parser.add_argument(
             "--enable-flashinfer-allreduce-fusion",
             action="store_true",
-            help="Enable FlashInfer allreduce fusion
+            help="Enable FlashInfer allreduce fusion with Residual RMSNorm.",
         )
         parser.add_argument(
             "--deepep-mode",
@@ -1519,7 +1570,6 @@ class ServerArgs:
             default=ServerArgs.hicache_mem_layout,
             help="The layout of host memory pool for hierarchical cache.",
         )
-
         parser.add_argument(
             "--hicache-storage-backend",
             type=str,
@@ -1527,6 +1577,13 @@ class ServerArgs:
             default=ServerArgs.hicache_storage_backend,
             help="The storage backend for hierarchical KV cache.",
         )
+        parser.add_argument(
+            "--hicache-storage-prefetch-policy",
+            type=str,
+            choices=["best_effort", "wait_complete", "timeout"],
+            default=ServerArgs.hicache_storage_prefetch_policy,
+            help="Control when prefetching from the storage backend should stop.",
+        )

         # Double Sparsity
         parser.add_argument(
@@ -1658,6 +1715,12 @@ class ServerArgs:
             action="store_true",
             help="Enabling two micro batches to overlap.",
         )
+        parser.add_argument(
+            "--tbo-token-distribution-threshold",
+            type=float,
+            default=ServerArgs.tbo_token_distribution_threshold,
+            help="The threshold of token distribution between two batches in micro-batch-overlap, determines whether to two-batch-overlap or two-chunk-overlap. Set to 0 denote disable two-chunk-overlap.",
+        )
         parser.add_argument(
             "--enable-torch-compile",
             action="store_true",
@@ -1755,6 +1818,17 @@ class ServerArgs:
             action="store_true",
             help="Use triton moe grouped gemm kernel.",
         )
+        parser.add_argument(
+            "--enable-flashinfer-mxfp4-moe",
+            action="store_true",
+            help="Enable FlashInfer MXFP4 MoE backend for modelopt_fp4 quant on Blackwell.",
+        )
+        parser.add_argument(
+            "--scheduler-recv-interval",
+            type=int,
+            default=ServerArgs.scheduler_recv_interval,
+            help="The interval to poll requests in scheduler. Can be set to >1 to reduce the overhead of this.",
+        )

         # Debug tensor dumps
         parser.add_argument(
@@ -1931,17 +2005,6 @@ class ServerArgs:
             None,
         }, "moe_dense_tp_size only support 1 and None currently"

-        # Check model architecture
-        model_arch = self.get_hf_config().architectures[0]
-        if "Llama4" in model_arch:
-            assert self.attention_backend == "fa3", "fa3 is required for Llama4 model"
-
-        if "Gemma2ForCausalLM" in model_arch:
-            # FIXME: https://github.com/sgl-project/sglang/pull/7367 is not compatible with gemma2 model.
-            # It failed at this test: https://github.com/sgl-project/sglang/actions/runs/16255155597/job/45890331952#step:4:736
-            logger.warning("Disable hybrid SWA memory for Gemma2ForCausalLM.")
-            self.disable_hybrid_swa_memory = True
-
         # Check LoRA
         self.check_lora_server_args()

@@ -1952,22 +2015,20 @@ class ServerArgs:
         ), "enable_mixed_chunk is required for speculative decoding"

         # Check chunked prefill
-
-
-
+        # Skip validation if chunked prefill is disabled (i.e., size <= 0).
+        if self.chunked_prefill_size > 0:
+            assert (
+                self.chunked_prefill_size % self.page_size == 0
+            ), "chunked_prefill_size must be divisible by page_size"

     def check_lora_server_args(self):
-        assert
-            self.max_loras_per_batch > 0
-            # FIXME
-            and (self.lora_paths is None or self.disable_radix_cache)
-        ), "compatibility of lora and radix attention is in progress"
+        assert self.max_loras_per_batch > 0, "max_loras_per_batch must be positive"

         # Enable LoRA if any LoRA paths are provided for backward compatibility.
         if self.lora_paths:
             if self.enable_lora is None:
                 self.enable_lora = True
-                logger.
+                logger.warning(
                     "--enable-lora is set to True because --lora-paths is provided."
                 )
         elif self.enable_lora is False:
@@ -1977,21 +2038,23 @@ class ServerArgs:

         if self.enable_lora:
             # Normalize lora_paths to a dictionary if it is a list.
+            # TODO (lifuhuang): support specifying pinned adapters in server_args.
             if isinstance(self.lora_paths, list):
                 lora_paths = self.lora_paths
                 self.lora_paths = {}
                 for lora_path in lora_paths:
                     if "=" in lora_path:
                         name, path = lora_path.split("=", 1)
-                        self.lora_paths[name] = LoRARef(
+                        self.lora_paths[name] = LoRARef(
+                            lora_name=name, lora_path=path, pinned=False
+                        )
                     else:
                         self.lora_paths[lora_path] = LoRARef(
-                            lora_name=lora_path,
-                            lora_path=lora_path,
+                            lora_name=lora_path, lora_path=lora_path, pinned=False
                         )
             elif isinstance(self.lora_paths, dict):
                 self.lora_paths = {
-                    k: LoRARef(lora_name=k, lora_path=v)
+                    k: LoRARef(lora_name=k, lora_path=v, pinned=False)
                     for k, v in self.lora_paths.items()
                 }
             elif self.lora_paths is None:
@@ -2037,6 +2100,58 @@ class ServerArgs:
                 f"decode_tp={decode_tp}, prefill_tp={prefill_tp}"
             )

+    def model_specific_adjustments(self):
+        hf_config = self.get_hf_config()
+        model_arch = hf_config.architectures[0]
+        if model_arch in ["GptOssForCausalLM"]:
+            if self.attention_backend is None:
+                self.attention_backend = "triton"
+            supported_backends = ["triton", "trtllm_mha", "fa3"]
+            assert (
+                self.attention_backend in supported_backends
+            ), f"GptOssForCausalLM requires one of {supported_backends} attention backend, but got '{self.attention_backend}'"
+            quantization_config = getattr(hf_config, "quantization_config", None)
+            is_mxfp4_quant_format = (
+                quantization_config is not None
+                and quantization_config.get("quant_method") == "mxfp4"
+            )
+
+            if is_sm100_supported() and is_mxfp4_quant_format:
+                self.enable_flashinfer_mxfp4_moe = True
+                self.enable_triton_kernel_moe = False
+                logger.warning(
+                    "Detected SM100 and MXFP4 quantization format for GPT-OSS model, enabling FlashInfer MXFP4 MOE kernel."
+                )
+            else:
+                if self.enable_triton_kernel_moe:
+                    assert (
+                        self.ep_size == 1
+                    ), "Triton kernel MoE is only supported when ep_size == 1"
+                if not self.enable_triton_kernel_moe and self.ep_size == 1:
+                    self.enable_triton_kernel_moe = True
+                    logger.warning(
+                        "Detected GPT-OSS model, enabling triton_kernels MOE kernel."
+                    )
+            self.disable_hybrid_swa_memory = True
+            if is_mxfp4_quant_format:
+                # use bf16 for mxfp4 triton kernels
+                self.dtype = "bfloat16"
+        elif "Llama4" in model_arch:
+            assert self.attention_backend == "fa3", "fa3 is required for Llama4 model"
+        elif model_arch in [
+            "Gemma2ForCausalLM",
+            "Gemma3ForCausalLM",
+            "Gemma3ForConditionalGeneration",
+            "Gemma3nForCausalLM",
+            "Gemma3nForConditionalGeneration",
+        ]:
+            # FIXME: https://github.com/sgl-project/sglang/pull/7367 is not compatible with gemma2 model.
+            # It failed at this test: https://github.com/sgl-project/sglang/actions/runs/16255155597/job/45890331952#step:4:736
+            logger.warning(
+                f"Disable hybrid SWA memory for {model_arch} as it is not yet supported."
+            )
+            self.disable_hybrid_swa_memory = True
+
     def adjust_mem_fraction_for_vlm(self, model_config):
         vision_config = getattr(model_config.hf_config, "vision_config", None)
         if vision_config is None:
@@ -2074,10 +2189,6 @@ class ServerArgs:
             self.mem_fraction_static = (
                 original_server_arg_mem_fraction * final_overall_factor
             )
-            logger.warning(
-                f"Multimodal model: Dynamically adjusted --mem-fraction-static "
-                f"from: {original_server_arg_mem_fraction:.3f} to: {self.mem_fraction_static:.3f}."
-            )


 def prepare_server_args(argv: List[str]) -> ServerArgs:
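The new server arguments above can be exercised end to end through prepare_server_args, which appears in this diff. The snippet below is a minimal illustrative sketch, not part of the package: it assumes sglang 0.5.0rc1 is installed, that the model path placeholder is replaced with a real model, and it only combines the flags introduced in this diff with the standard --model-path flag.

from sglang.srt.server_args import prepare_server_args

# Sketch only: exercise the CLI flags added in 0.5.0rc1.
# "/path/to/model" is a placeholder and must point at a real model for
# ServerArgs validation (which loads the HF config) to pass.
server_args = prepare_server_args(
    [
        "--model-path", "/path/to/model",
        "--tool-call-parser", "gpt-oss",                       # new parser choice
        "--tool-server", "demo",                               # new flag: 'demo' or comma-separated URLs
        "--hicache-storage-prefetch-policy", "wait_complete",  # new flag: best_effort | wait_complete | timeout
        "--scheduler-recv-interval", "4",                      # new flag: >1 reduces request-polling overhead
        "--tbo-token-distribution-threshold", "0.48",          # new flag: default value from this diff
    ]
)
print(server_args.tool_server, server_args.scheduler_recv_interval)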
sglang/srt/speculative/eagle_worker.py
CHANGED
@@ -226,6 +226,22 @@ class EAGLEWorker(TpModelWorker):
                 self.draft_model_runner,
                 skip_prefill=False,
             )
+        elif self.server_args.attention_backend == "aiter":
+            from sglang.srt.layers.attention.aiter_backend import (
+                AiterAttnBackend,
+                AiterMultiStepDraftBackend,
+            )
+
+            self.draft_attn_backend = AiterMultiStepDraftBackend(
+                self.draft_model_runner,
+                self.topk,
+                self.speculative_num_steps,
+            )
+            self.draft_extend_attn_backend = AiterAttnBackend(
+                self.draft_model_runner,
+                skip_prefill=False,
+            )
+            self.has_prefill_wrapper_verify = False
         elif self.server_args.attention_backend == "fa3":
             from sglang.srt.layers.attention.flashattention_backend import (
                 FlashAttentionBackend,
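The hunk above wires aiter draft backends into EAGLE speculative decoding. Below is a minimal launch sketch, not from the package: the speculative-decoding flag names are the standard sglang CLI options and are assumed unchanged in 0.5.0rc1, and both model paths are placeholders.

from sglang.srt.server_args import prepare_server_args

# Sketch only: select the aiter attention backend together with EAGLE so that
# EAGLEWorker takes the AiterMultiStepDraftBackend path added above.
server_args = prepare_server_args(
    [
        "--model-path", "/path/to/base-model",                      # placeholder
        "--speculative-algorithm", "EAGLE",
        "--speculative-draft-model-path", "/path/to/draft-model",   # placeholder
        "--attention-backend", "aiter",
    ]
)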