sglang 0.5.1.post2__py3-none-any.whl → 0.5.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +3 -0
- sglang/bench_one_batch_server.py +89 -54
- sglang/bench_serving.py +437 -40
- sglang/lang/interpreter.py +1 -1
- sglang/profiler.py +0 -1
- sglang/srt/configs/__init__.py +4 -0
- sglang/srt/configs/internvl.py +6 -0
- sglang/srt/configs/longcat_flash.py +104 -0
- sglang/srt/configs/model_config.py +37 -7
- sglang/srt/configs/qwen3_next.py +326 -0
- sglang/srt/connector/__init__.py +1 -1
- sglang/srt/connector/base_connector.py +1 -2
- sglang/srt/connector/redis.py +2 -2
- sglang/srt/connector/serde/__init__.py +1 -1
- sglang/srt/connector/serde/safe_serde.py +4 -3
- sglang/srt/custom_op.py +11 -1
- sglang/srt/debug_utils/dump_comparator.py +81 -44
- sglang/srt/debug_utils/dump_loader.py +97 -0
- sglang/srt/debug_utils/dumper.py +11 -3
- sglang/srt/debug_utils/text_comparator.py +73 -11
- sglang/srt/disaggregation/ascend/conn.py +75 -0
- sglang/srt/disaggregation/base/conn.py +1 -1
- sglang/srt/disaggregation/common/conn.py +15 -12
- sglang/srt/disaggregation/decode.py +6 -4
- sglang/srt/disaggregation/fake/conn.py +1 -1
- sglang/srt/disaggregation/mini_lb.py +6 -420
- sglang/srt/disaggregation/mooncake/conn.py +18 -10
- sglang/srt/disaggregation/nixl/conn.py +180 -16
- sglang/srt/disaggregation/prefill.py +6 -4
- sglang/srt/disaggregation/utils.py +5 -50
- sglang/srt/distributed/parallel_state.py +94 -58
- sglang/srt/entrypoints/engine.py +34 -14
- sglang/srt/entrypoints/http_server.py +172 -47
- sglang/srt/entrypoints/openai/protocol.py +90 -27
- sglang/srt/entrypoints/openai/serving_base.py +6 -2
- sglang/srt/entrypoints/openai/serving_chat.py +82 -26
- sglang/srt/entrypoints/openai/serving_completions.py +25 -4
- sglang/srt/entrypoints/openai/serving_embedding.py +8 -4
- sglang/srt/entrypoints/openai/serving_responses.py +7 -4
- sglang/srt/eplb/eplb_manager.py +28 -4
- sglang/srt/eplb/expert_distribution.py +55 -15
- sglang/srt/eplb/expert_location.py +8 -3
- sglang/srt/eplb/expert_location_updater.py +1 -1
- sglang/srt/function_call/deepseekv31_detector.py +222 -0
- sglang/srt/function_call/ebnf_composer.py +11 -9
- sglang/srt/function_call/function_call_parser.py +2 -0
- sglang/srt/function_call/glm4_moe_detector.py +1 -1
- sglang/srt/function_call/gpt_oss_detector.py +144 -256
- sglang/srt/function_call/qwen3_coder_detector.py +1 -1
- sglang/srt/hf_transformers_utils.py +28 -7
- sglang/srt/layers/activation.py +44 -9
- sglang/srt/layers/attention/aiter_backend.py +93 -68
- sglang/srt/layers/attention/ascend_backend.py +381 -136
- sglang/srt/layers/attention/fla/chunk.py +242 -0
- sglang/srt/layers/attention/fla/chunk_delta_h.py +314 -0
- sglang/srt/layers/attention/fla/chunk_o.py +178 -0
- sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +151 -0
- sglang/srt/layers/attention/fla/cumsum.py +300 -0
- sglang/srt/layers/attention/fla/fused_recurrent.py +640 -0
- sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +232 -0
- sglang/srt/layers/attention/fla/index.py +37 -0
- sglang/srt/layers/attention/fla/l2norm.py +150 -0
- sglang/srt/layers/attention/fla/layernorm_gated.py +326 -0
- sglang/srt/layers/attention/fla/op.py +66 -0
- sglang/srt/layers/attention/fla/solve_tril.py +465 -0
- sglang/srt/layers/attention/fla/utils.py +331 -0
- sglang/srt/layers/attention/fla/wy_fast.py +158 -0
- sglang/srt/layers/attention/flashattention_backend.py +241 -7
- sglang/srt/layers/attention/flashinfer_backend.py +11 -6
- sglang/srt/layers/attention/flashinfer_mla_backend.py +21 -14
- sglang/srt/layers/attention/hybrid_attn_backend.py +47 -8
- sglang/srt/layers/attention/hybrid_linear_attn_backend.py +584 -0
- sglang/srt/layers/attention/intel_amx_backend.py +3 -0
- sglang/srt/layers/attention/mamba/causal_conv1d.py +128 -0
- sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +1052 -0
- sglang/srt/layers/attention/mamba/mamba.py +64 -0
- sglang/srt/layers/attention/torch_native_backend.py +12 -6
- sglang/srt/layers/attention/trtllm_mla_backend.py +126 -36
- sglang/srt/layers/attention/wave_ops/decode_attention.py +2 -4
- sglang/srt/layers/attention/wave_ops/extend_attention.py +1 -3
- sglang/srt/layers/communicator.py +45 -8
- sglang/srt/layers/layernorm.py +54 -12
- sglang/srt/layers/logits_processor.py +10 -3
- sglang/srt/layers/moe/__init__.py +2 -1
- sglang/srt/layers/moe/cutlass_moe.py +0 -8
- sglang/srt/layers/moe/cutlass_w4a8_moe.py +4 -12
- sglang/srt/layers/moe/ep_moe/kernels.py +74 -0
- sglang/srt/layers/moe/ep_moe/layer.py +111 -56
- sglang/srt/layers/moe/fused_moe_native.py +5 -3
- sglang/srt/layers/moe/fused_moe_triton/__init__.py +5 -3
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=129,N=352,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/{E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json } +29 -29
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=257,N=64,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +9 -1049
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +212 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +799 -0
- sglang/srt/layers/moe/fused_moe_triton/layer.py +56 -45
- sglang/srt/layers/moe/fused_moe_triton/moe_align_block_size.py +87 -0
- sglang/srt/layers/moe/moe_runner/__init__.py +2 -1
- sglang/srt/layers/moe/moe_runner/base.py +274 -1
- sglang/srt/layers/moe/moe_runner/runner.py +80 -0
- sglang/srt/layers/moe/moe_runner/triton.py +448 -0
- sglang/srt/layers/moe/token_dispatcher/__init__.py +16 -4
- sglang/srt/layers/moe/token_dispatcher/{base_dispatcher.py → base.py} +67 -17
- sglang/srt/layers/moe/token_dispatcher/deepep.py +41 -38
- sglang/srt/layers/moe/token_dispatcher/standard.py +44 -2
- sglang/srt/layers/moe/topk.py +43 -12
- sglang/srt/layers/moe/utils.py +6 -5
- sglang/srt/layers/quantization/awq.py +19 -7
- sglang/srt/layers/quantization/base_config.py +11 -6
- sglang/srt/layers/quantization/blockwise_int8.py +38 -27
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +50 -30
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +13 -1
- sglang/srt/layers/quantization/deep_gemm_wrapper/compile_utils.py +141 -235
- sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +5 -10
- sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +31 -22
- sglang/srt/layers/quantization/fp8.py +78 -48
- sglang/srt/layers/quantization/fp8_kernel.py +2 -2
- sglang/srt/layers/quantization/fp8_utils.py +45 -31
- sglang/srt/layers/quantization/gptq.py +25 -17
- sglang/srt/layers/quantization/modelopt_quant.py +107 -40
- sglang/srt/layers/quantization/moe_wna16.py +21 -18
- sglang/srt/layers/quantization/mxfp4.py +93 -68
- sglang/srt/layers/quantization/mxfp4_tensor.py +3 -1
- sglang/srt/layers/quantization/quark/quark_moe.py +32 -27
- sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +49 -30
- sglang/srt/layers/quantization/quark/utils.py +97 -0
- sglang/srt/layers/quantization/rocm_mxfp4_utils.py +13 -0
- sglang/srt/layers/quantization/unquant.py +135 -47
- sglang/srt/layers/quantization/utils.py +13 -0
- sglang/srt/layers/quantization/w4afp8.py +60 -42
- sglang/srt/layers/quantization/w8a8_fp8.py +35 -20
- sglang/srt/layers/quantization/w8a8_int8.py +83 -41
- sglang/srt/layers/rocm_linear_utils.py +44 -0
- sglang/srt/layers/rotary_embedding.py +28 -19
- sglang/srt/layers/sampler.py +29 -5
- sglang/srt/layers/utils.py +0 -14
- sglang/srt/lora/backend/base_backend.py +50 -8
- sglang/srt/lora/backend/triton_backend.py +90 -2
- sglang/srt/lora/layers.py +32 -0
- sglang/srt/lora/lora.py +4 -1
- sglang/srt/lora/lora_manager.py +35 -112
- sglang/srt/lora/mem_pool.py +24 -10
- sglang/srt/lora/utils.py +18 -9
- sglang/srt/managers/cache_controller.py +396 -365
- sglang/srt/managers/data_parallel_controller.py +30 -15
- sglang/srt/managers/detokenizer_manager.py +18 -2
- sglang/srt/managers/disagg_service.py +46 -0
- sglang/srt/managers/io_struct.py +190 -11
- sglang/srt/managers/mm_utils.py +6 -1
- sglang/srt/managers/multi_tokenizer_mixin.py +579 -0
- sglang/srt/managers/schedule_batch.py +27 -44
- sglang/srt/managers/schedule_policy.py +4 -3
- sglang/srt/managers/scheduler.py +148 -122
- sglang/srt/managers/scheduler_metrics_mixin.py +114 -8
- sglang/srt/managers/scheduler_output_processor_mixin.py +29 -19
- sglang/srt/managers/scheduler_profiler_mixin.py +1 -1
- sglang/srt/managers/scheduler_update_weights_mixin.py +8 -1
- sglang/srt/managers/template_manager.py +3 -3
- sglang/srt/managers/tokenizer_communicator_mixin.py +491 -0
- sglang/srt/managers/tokenizer_manager.py +77 -480
- sglang/srt/managers/tp_worker.py +16 -4
- sglang/srt/managers/tp_worker_overlap_thread.py +8 -10
- sglang/srt/mem_cache/allocator.py +1 -1
- sglang/srt/mem_cache/chunk_cache.py +1 -1
- sglang/srt/mem_cache/hicache_storage.py +53 -40
- sglang/srt/mem_cache/hiradix_cache.py +196 -104
- sglang/srt/mem_cache/lora_radix_cache.py +1 -1
- sglang/srt/mem_cache/memory_pool.py +395 -53
- sglang/srt/mem_cache/memory_pool_host.py +27 -19
- sglang/srt/mem_cache/radix_cache.py +6 -6
- sglang/srt/mem_cache/radix_cache_cpp.py +1 -1
- sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +164 -0
- sglang/srt/mem_cache/storage/hf3fs/{client_hf3fs.py → hf3fs_usrbio_client.py} +5 -1
- sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +61 -34
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +152 -23
- sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +280 -0
- sglang/srt/mem_cache/storage/lmcache/unit_test.py +121 -0
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +154 -95
- sglang/srt/mem_cache/storage/mooncake_store/test_mooncake_store.py +161 -0
- sglang/srt/mem_cache/swa_radix_cache.py +1 -3
- sglang/srt/metrics/collector.py +484 -63
- sglang/srt/metrics/startup_func_log_and_timer.py +150 -0
- sglang/srt/metrics/utils.py +48 -0
- sglang/srt/model_executor/cpu_graph_runner.py +640 -0
- sglang/srt/model_executor/cuda_graph_runner.py +13 -5
- sglang/srt/model_executor/forward_batch_info.py +72 -18
- sglang/srt/model_executor/model_runner.py +190 -32
- sglang/srt/model_loader/__init__.py +9 -3
- sglang/srt/model_loader/loader.py +33 -28
- sglang/srt/model_loader/utils.py +12 -0
- sglang/srt/model_loader/weight_utils.py +2 -1
- sglang/srt/models/deepseek_v2.py +323 -53
- sglang/srt/models/gemma3n_mm.py +1 -1
- sglang/srt/models/glm4_moe.py +10 -1
- sglang/srt/models/glm4v.py +4 -2
- sglang/srt/models/gpt_oss.py +7 -19
- sglang/srt/models/internvl.py +28 -0
- sglang/srt/models/llama4.py +9 -0
- sglang/srt/models/llama_eagle3.py +17 -0
- sglang/srt/models/longcat_flash.py +1026 -0
- sglang/srt/models/longcat_flash_nextn.py +699 -0
- sglang/srt/models/minicpmv.py +165 -3
- sglang/srt/models/mllama4.py +25 -0
- sglang/srt/models/opt.py +637 -0
- sglang/srt/models/qwen2.py +33 -3
- sglang/srt/models/qwen2_5_vl.py +91 -42
- sglang/srt/models/qwen2_moe.py +79 -14
- sglang/srt/models/qwen3.py +8 -2
- sglang/srt/models/qwen3_moe.py +39 -8
- sglang/srt/models/qwen3_next.py +1039 -0
- sglang/srt/models/qwen3_next_mtp.py +109 -0
- sglang/srt/models/torch_native_llama.py +1 -1
- sglang/srt/models/transformers.py +1 -1
- sglang/srt/multimodal/processors/base_processor.py +4 -2
- sglang/srt/multimodal/processors/glm4v.py +9 -9
- sglang/srt/multimodal/processors/internvl.py +141 -129
- sglang/srt/{conversation.py → parser/conversation.py} +38 -5
- sglang/srt/parser/harmony_parser.py +588 -0
- sglang/srt/parser/reasoning_parser.py +309 -0
- sglang/srt/sampling/penaltylib/orchestrator.py +14 -2
- sglang/srt/sampling/sampling_batch_info.py +18 -15
- sglang/srt/server_args.py +307 -80
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +5 -0
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +10 -1
- sglang/srt/speculative/eagle_worker.py +216 -120
- sglang/srt/speculative/spec_info.py +5 -0
- sglang/srt/speculative/standalone_worker.py +109 -0
- sglang/srt/tokenizer/tiktoken_tokenizer.py +6 -1
- sglang/srt/utils.py +96 -7
- sglang/srt/weight_sync/utils.py +1 -1
- sglang/test/attention/test_trtllm_mla_backend.py +181 -8
- sglang/test/few_shot_gsm8k.py +1 -0
- sglang/test/runners.py +4 -0
- sglang/test/test_cutlass_moe.py +24 -6
- sglang/test/test_cutlass_w4a8_moe.py +24 -9
- sglang/test/test_disaggregation_utils.py +66 -0
- sglang/test/test_utils.py +25 -1
- sglang/utils.py +5 -0
- sglang/version.py +1 -1
- {sglang-0.5.1.post2.dist-info → sglang-0.5.2.dist-info}/METADATA +13 -10
- {sglang-0.5.1.post2.dist-info → sglang-0.5.2.dist-info}/RECORD +253 -201
- sglang/srt/disaggregation/launch_lb.py +0 -131
- sglang/srt/mem_cache/storage/mooncake_store/unit_test.py +0 -40
- sglang/srt/reasoning_parser.py +0 -553
- /sglang/srt/{model_parallel.py → layers/model_parallel.py} +0 -0
- /sglang/srt/{code_completion_parser.py → parser/code_completion_parser.py} +0 -0
- /sglang/srt/{jinja_template_utils.py → parser/jinja_template_utils.py} +0 -0
- {sglang-0.5.1.post2.dist-info → sglang-0.5.2.dist-info}/WHEEL +0 -0
- {sglang-0.5.1.post2.dist-info → sglang-0.5.2.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.1.post2.dist-info → sglang-0.5.2.dist-info}/top_level.txt +0 -0
sglang/srt/server_args.py
CHANGED
@@ -25,9 +25,8 @@ from typing import List, Literal, Optional, Union
 
 from sglang.srt.function_call.function_call_parser import FunctionCallParser
 from sglang.srt.hf_transformers_utils import check_gguf_file, get_config
-from sglang.srt.layers.utils import is_sm90_supported, is_sm100_supported
 from sglang.srt.lora.lora_registry import LoRARef
-from sglang.srt.reasoning_parser import ReasoningParser
+from sglang.srt.parser.reasoning_parser import ReasoningParser
 from sglang.srt.utils import (
     LORA_TARGET_ALL_MODULES,
     SUPPORTED_LORA_TARGET_MODULES,
@@ -39,20 +38,105 @@ from sglang.srt.utils import (
     is_hip,
     is_port_available,
     is_remote_url,
+    is_sm90_supported,
+    is_sm100_supported,
     is_triton_kernels_available,
     is_valid_ipv6_address,
     nullable_str,
 )
+from sglang.utils import is_in_ci
 
 logger = logging.getLogger(__name__)
 
 
+# Define constants
+LOAD_FORMAT_CHOICES = [
+    "auto",
+    "pt",
+    "safetensors",
+    "npcache",
+    "dummy",
+    "sharded_state",
+    "gguf",
+    "bitsandbytes",
+    "layered",
+    "remote",
+]
+
+QUANTIZATION_CHOICES = [
+    "awq",
+    "fp8",
+    "gptq",
+    "marlin",
+    "gptq_marlin",
+    "awq_marlin",
+    "bitsandbytes",
+    "gguf",
+    "modelopt",
+    "modelopt_fp4",
+    "petit_nvfp4",
+    "w8a8_int8",
+    "w8a8_fp8",
+    "moe_wna16",
+    "qoq",
+    "w4afp8",
+    "mxfp4",
+]
+
+ATTENTION_BACKEND_CHOICES = [
+    # Common
+    "triton",
+    "torch_native",
+    # NVIDIA specific
+    "cutlass_mla",
+    "fa3",
+    "flashinfer",
+    "flashmla",
+    "trtllm_mla",
+    "trtllm_mha",
+    "dual_chunk_flash_attn",
+    "hybrid_linear_attn",
+    # AMD specific
+    "aiter",
+    "wave",
+    # Other platforms
+    "intel_amx",
+    "ascend",
+]
+
+DISAGG_TRANSFER_BACKEND_CHOICES = ["mooncake", "nixl", "ascend", "fake"]
+
+GRAMMAR_BACKEND_CHOICES = ["xgrammar", "outlines", "llguidance", "none"]
+
+
+# Allow external code to add more choices
+def add_load_format_choices(choices):
+    LOAD_FORMAT_CHOICES.extend(choices)
+
+
+def add_quantization_method_choices(choices):
+    QUANTIZATION_CHOICES.extend(choices)
+
+
+def add_attention_backend_choices(choices):
+    ATTENTION_BACKEND_CHOICES.extend(choices)
+
+
+def add_disagg_transfer_backend_choices(choices):
+    DISAGG_TRANSFER_BACKEND_CHOICES.extend(choices)
+
+
+def add_grammar_backend_choices(choices):
+    GRAMMAR_BACKEND_CHOICES.extend(choices)
+
+
 @dataclasses.dataclass
 class ServerArgs:
     # Model and tokenizer
     model_path: str
     tokenizer_path: Optional[str] = None
     tokenizer_mode: str = "auto"
+    tokenizer_worker_num: int = 1
     skip_tokenizer_init: bool = False
     load_format: str = "auto"
     model_loader_extra_config: str = "{}"
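The new module-level choice lists come with registration helpers (add_load_format_choices, add_quantization_method_choices, add_attention_backend_choices, add_disagg_transfer_backend_choices, add_grammar_backend_choices), so external integrations can extend the accepted CLI values without patching sglang. A minimal sketch of how such a hook could be used follows; "my_attn_backend" is a hypothetical plugin name, not something defined in this release, and the registration has to run before the parser is built because the list is captured as the argparse choices= value at that point.

# Hypothetical out-of-tree usage of the registration hooks added in this hunk.
from sglang.srt.server_args import (
    ATTENTION_BACKEND_CHOICES,
    add_attention_backend_choices,
)

# Register before ServerArgs.add_cli_args() runs, since the list object is
# handed to argparse as `choices=` for --attention-backend at parser-build time.
add_attention_backend_choices(["my_attn_backend"])
assert "my_attn_backend" in ATTENTION_BACKEND_CHOICES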
@@ -119,6 +203,8 @@ class ServerArgs:
     bucket_inter_token_latency: Optional[List[float]] = None
     bucket_e2e_request_latency: Optional[List[float]] = None
     collect_tokens_histogram: bool = False
+    prompt_tokens_buckets: Optional[List[str]] = None
+    generation_tokens_buckets: Optional[List[str]] = None
     decode_log_interval: int = 40
     enable_request_time_stats_logging: bool = False
     kv_events_config: Optional[str] = None
@@ -139,6 +225,8 @@ class ServerArgs:
     # Data parallelism
     dp_size: int = 1
     load_balance_method: str = "round_robin"
+    # FIXME: remove this after dp rank scheduling is fully supported with PD-Disaggregation
+    prefill_round_robin_balance: bool = False
 
     # Multi-node distributed serving
     dist_init_addr: Optional[str] = None
@@ -171,12 +259,14 @@ class ServerArgs:
     # Speculative decoding
     speculative_algorithm: Optional[str] = None
     speculative_draft_model_path: Optional[str] = None
+    speculative_draft_model_revision: Optional[str] = None
     speculative_num_steps: Optional[int] = None
     speculative_eagle_topk: Optional[int] = None
     speculative_num_draft_tokens: Optional[int] = None
     speculative_accept_threshold_single: float = 1.0
     speculative_accept_threshold_acc: float = 1.0
     speculative_token_map: Optional[str] = None
+    speculative_attention_mode: str = "prefill"
 
     # Expert parallelism
     ep_size: int = 1
@@ -199,6 +289,7 @@ class ServerArgs:
     eplb_algorithm: str = "auto"
     eplb_rebalance_num_iterations: int = 1000
     eplb_rebalance_layers_per_chunk: Optional[int] = None
+    eplb_min_rebalancing_utilization_threshold: float = 1.0
     expert_distribution_recorder_mode: Optional[
         Literal["stat", "stat_approx", "per_pass", "per_token"]
     ] = None
@@ -211,11 +302,14 @@ class ServerArgs:
     enable_hierarchical_cache: bool = False
     hicache_ratio: float = 2.0
     hicache_size: int = 0
-    hicache_write_policy: str = "
+    hicache_write_policy: str = "write_through"
     hicache_io_backend: str = "kernel"
     hicache_mem_layout: str = "layer_first"
     hicache_storage_backend: Optional[str] = None
     hicache_storage_prefetch_policy: str = "best_effort"
+    hicache_storage_backend_extra_config: Optional[str] = None
+    # LMCache
+    enable_lmcache: bool = False
 
     # Double Sparsity
     enable_double_sparsity: bool = False
@@ -271,6 +365,7 @@ class ServerArgs:
     disable_fast_image_processor: bool = False
     enable_return_hidden_states: bool = False
     scheduler_recv_interval: int = 1
+    numa_node: Optional[List[int]] = None
 
     # Debug tensor dumps
     debug_tensor_dump_output_folder: Optional[str] = None
@@ -287,7 +382,6 @@ class ServerArgs:
     disaggregation_prefill_pp: Optional[int] = 1
     disaggregation_ib_device: Optional[str] = None
     num_reserved_decode_tokens: int = 512  # used for decode kv cache offload in PD
-    pdlb_url: Optional[str] = None
 
     # For model weight update
     custom_weight_loader: Optional[List[str]] = None
@@ -297,6 +391,10 @@ class ServerArgs:
    enable_pdmux: bool = False
    sm_group_num: int = 3
 
+    # Mamba cache
+    max_mamba_cache_size: Optional[int] = None
+    mamba_ssm_dtype: str = "float32"
+
     # Deprecated arguments
     enable_ep_moe: bool = False
     enable_deepep_moe: bool = False
@@ -384,9 +482,14 @@ class ServerArgs:
             # B200, MI300. (chunked_prefill_size 16k, cuda_graph_max_bs 512)
             reserved_mem = 32 * 1024
 
+        # draft model and larger cuda graph buffers
         if self.speculative_algorithm is not None:
-
-
+            if self.speculative_algorithm == "STANDALONE":
+                # Standalone speculative decoding needs more memory than other speculative
+                # decoding algorithms since the draft model is typically larger.
+                reserved_mem += 6 * 1024
+            else:
+                reserved_mem += 2 * 1024
         if self.enable_dp_attention:
             reserved_mem += 4 * 1024
 
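To make the reservation bookkeeping concrete, here is a small worked example (a sketch; the units appear to be MiB, which the 32 * 1024 base for B200/MI300-class GPUs suggests): a STANDALONE draft model plus DP attention grows the reservation from 32 GiB to 42 GiB.

# Worked example of the reserved-memory adjustment in the hunk above.
reserved_mem = 32 * 1024              # base reservation (B200 / MI300 class)
speculative_algorithm = "STANDALONE"
enable_dp_attention = True

if speculative_algorithm is not None:
    if speculative_algorithm == "STANDALONE":
        reserved_mem += 6 * 1024      # larger standalone draft model
    else:
        reserved_mem += 2 * 1024      # EAGLE-style draft head
if enable_dp_attention:
    reserved_mem += 4 * 1024

print(reserved_mem)                   # 43008, i.e. 42 * 1024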
@@ -528,12 +631,12 @@ class ServerArgs:
         if self.grammar_backend is None:
             self.grammar_backend = "xgrammar"
 
+        if self.dp_size == 1:
+            self.enable_dp_attention = False
+
         # Data parallelism attention
         if self.enable_dp_attention:
             self.schedule_conservativeness = self.schedule_conservativeness * 0.3
-            assert (
-                self.dp_size > 1
-            ), "Please set a dp-size > 1. You can use 1 < dp-size <= tp-size "
             assert self.tp_size % self.dp_size == 0
             self.chunked_prefill_size = self.chunked_prefill_size // self.dp_size
             logger.warning(
@@ -556,11 +659,13 @@ class ServerArgs:
         ], "The expert parallel size must be 1 or the same as the tensor parallel size"
 
         if self.moe_runner_backend == "flashinfer_trtllm":
-
-            self.
-
-
-
+            assert (
+                self.quantization == "modelopt_fp4" or self.quantization == "fp8"
+            ), "modelopt_fp4 quantization is required for Flashinfer TRTLLM MoE"
+            self.disable_shared_experts_fusion = True
+            logger.warning(
+                "FlashInfer TRTLLM MoE is enabled. --disable-shared-experts-fusion is automatically set."
+            )
 
         # DeepEP MoE
         if self.moe_a2a_backend == "deepep":
@@ -615,7 +720,12 @@ class ServerArgs:
             # NEXTN shares the same implementation of EAGLE
             self.speculative_algorithm = "EAGLE"
 
-        if self.speculative_algorithm in ("EAGLE", "EAGLE3"):
+        if self.speculative_algorithm in ("EAGLE", "EAGLE3", "STANDALONE"):
+            if self.speculative_algorithm == "STANDALONE":
+                # TODO: support dp attention for standalone speculative decoding
+                assert (
+                    self.enable_dp_attention is False
+                ), "Currently standalone speculative decoding does not support dp attention."
             if self.max_running_requests is None:
                 self.max_running_requests = 48
             self.disable_overlap_schedule = True
@@ -671,6 +781,15 @@ class ServerArgs:
             )
             self.speculative_num_draft_tokens = self.speculative_num_steps + 1
 
+        if (
+            self.speculative_eagle_topk > 1
+            and self.page_size > 1
+            and self.attention_backend != "flashinfer"
+        ):
+            raise ValueError(
+                "speculative_eagle_topk > 1 with page_size > 1 is unstable and produces incorrect results for paged attention backends. This combination is only supported for the 'flashinfer' backend."
+            )
+
         # The token generated from the verify step is counted.
         # If sepculative_num_steps >= speculative_num_draft_tokens, the additional tokens will definitely be discarded.
         # assert self.speculative_num_steps < self.speculative_num_draft_tokens
@@ -698,6 +817,13 @@ class ServerArgs:
 
             self.disable_radix_cache = True
             logger.warning("KV cache is forced as chunk cache for decode server")
+
+            if self.dp_size > 1 and not is_in_ci():
+                assert self.prefill_round_robin_balance, (
+                    "Prefill round robin balance is required when dp size > 1. "
+                    "Please make sure that the prefill instance is launched with `--load-balance-method round_robin`"
+                    " and `--prefill-round-robin-balance` is set for decode server."
+                )
         elif self.disaggregation_mode == "prefill":
             if self.disaggregation_decode_tp is None:
                 self.disaggregation_decode_tp = self.tp_size
@@ -714,6 +840,8 @@ class ServerArgs:
         os.environ["SGLANG_ENABLE_TORCH_COMPILE"] = (
             "1" if self.enable_torch_compile else "0"
         )
+        os.environ["SGLANG_MAMBA_SSM_DTYPE"] = self.mamba_ssm_dtype
+
         # Set env var before grammar backends init
         os.environ["SGLANG_DISABLE_OUTLINES_DISK_CACHE"] = (
             "1" if self.disable_outlines_disk_cache else "0"
@@ -750,6 +878,12 @@ class ServerArgs:
             "tokenizer if available, and 'slow' will "
             "always use the slow tokenizer.",
         )
+        parser.add_argument(
+            "--tokenizer-worker-num",
+            type=int,
+            default=ServerArgs.tokenizer_worker_num,
+            help="The worker num of the tokenizer manager.",
+        )
         parser.add_argument(
             "--skip-tokenizer-init",
             action="store_true",
@@ -759,18 +893,7 @@ class ServerArgs:
             "--load-format",
             type=str,
             default=ServerArgs.load_format,
-            choices=[
-                "auto",
-                "pt",
-                "safetensors",
-                "npcache",
-                "dummy",
-                "sharded_state",
-                "gguf",
-                "bitsandbytes",
-                "layered",
-                "remote",
-            ],
+            choices=LOAD_FORMAT_CHOICES,
             help="The format of the model weights to load. "
             '"auto" will try to load the weights in the safetensors format '
             "and fall back to the pytorch bin format if safetensors format "
@@ -889,25 +1012,7 @@ class ServerArgs:
             "--quantization",
             type=str,
             default=ServerArgs.quantization,
-            choices=[
-                "awq",
-                "fp8",
-                "gptq",
-                "marlin",
-                "gptq_marlin",
-                "awq_marlin",
-                "bitsandbytes",
-                "gguf",
-                "modelopt",
-                "modelopt_fp4",
-                "petit_nvfp4",
-                "w8a8_int8",
-                "w8a8_fp8",
-                "moe_wna16",
-                "qoq",
-                "w4afp8",
-                "mxfp4",
-            ],
+            choices=QUANTIZATION_CHOICES,
             help="The quantization method.",
         )
         parser.add_argument(
@@ -1170,6 +1275,26 @@ class ServerArgs:
             default=ServerArgs.collect_tokens_histogram,
             help="Collect prompt/generation tokens histogram.",
         )
+        bucket_rule = (
+            "Supports 3 rule types: 'default' uses predefined buckets; 'tse <middle> <base> <count>' "
+            "generates two sides exponential distributed buckets (e.g., 'tse 1000 2 8' generates buckets "
+            "[984.0, 992.0, 996.0, 998.0, 1000.0, 1002.0, 1004.0, 1008.0, 1016.0]).); 'customer <value1> "
+            "<value2> ...' uses custom bucket values (e.g., 'customer 10 50 100 500')."
+        )
+        parser.add_argument(
+            "--prompt-tokens-buckets",
+            type=str,
+            nargs="+",
+            default=ServerArgs.prompt_tokens_buckets,
+            help=f"The buckets rule of prompt tokens. {bucket_rule}",
+        )
+        parser.add_argument(
+            "--generation-tokens-buckets",
+            type=str,
+            nargs="+",
+            default=ServerArgs.generation_tokens_buckets,
+            help=f"The buckets rule for generation tokens histogram. {bucket_rule}",
+        )
         parser.add_argument(
             "--gc-warning-threshold-secs",
             type=float,
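The 'tse' (two sides exponential) rule is easiest to see through the example quoted in the help text. The snippet below is a sketch inferred from that example, not the library's implementation: for 'tse <middle> <base> <count>' it places count/2 buckets on each side of middle at exponentially growing offsets, plus the middle value itself.

# Sketch of the 'tse' bucket rule, reverse-engineered from the documented
# example 'tse 1000 2 8'; the actual generator in sglang may differ.
def tse_buckets(middle: float, base: float, count: int) -> list:
    offsets = [base**i for i in range(1, count // 2 + 1)]
    return sorted(
        [middle - o for o in offsets] + [middle] + [middle + o for o in offsets]
    )

print(tse_buckets(1000.0, 2.0, 8))
# [984.0, 992.0, 996.0, 998.0, 1000.0, 1002.0, 1004.0, 1008.0, 1016.0]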
@@ -1278,6 +1403,12 @@ class ServerArgs:
                 "minimum_tokens",
             ],
         )
+        parser.add_argument(
+            "--prefill-round-robin-balance",
+            default=ServerArgs.prefill_round_robin_balance,
+            action="store_true",
+            help="Prefill is round robin balanced. This is used to promise decode server can get the correct dp rank.",
+        )
 
         # Multi-node distributed serving
         parser.add_argument(
@@ -1357,43 +1488,24 @@ class ServerArgs:
         )
 
         # Kernel backend
-        ATTN_BACKENDS = [
-            # Common
-            "triton",
-            "torch_native",
-            # NVIDIA specific
-            "cutlass_mla",
-            "fa3",
-            "flashinfer",
-            "flashmla",
-            "trtllm_mla",
-            "trtllm_mha",
-            "dual_chunk_flash_attn",
-            # AMD specific
-            "aiter",
-            "wave",
-            # Other platforms
-            "intel_amx",
-            "ascend",
-        ]
         parser.add_argument(
             "--attention-backend",
             type=str,
-            choices=
+            choices=ATTENTION_BACKEND_CHOICES,
             default=ServerArgs.attention_backend,
             help="Choose the kernels for attention layers.",
         )
         parser.add_argument(
             "--prefill-attention-backend",
             type=str,
-            choices=
+            choices=ATTENTION_BACKEND_CHOICES,
             default=ServerArgs.prefill_attention_backend,
             help="Choose the kernels for prefill attention layers (have priority over --attention-backend).",
         )
         parser.add_argument(
             "--decode-attention-backend",
             type=str,
-            choices=
+            choices=ATTENTION_BACKEND_CHOICES,
             default=ServerArgs.decode_attention_backend,
             help="Choose the kernels for decode attention layers (have priority over --attention-backend).",
         )
@@ -1407,7 +1519,7 @@ class ServerArgs:
         parser.add_argument(
             "--grammar-backend",
             type=str,
-            choices=
+            choices=GRAMMAR_BACKEND_CHOICES,
             default=ServerArgs.grammar_backend,
             help="Choose the backend for grammar-guided decoding.",
         )
@@ -1423,14 +1535,23 @@ class ServerArgs:
         parser.add_argument(
             "--speculative-algorithm",
             type=str,
-            choices=["EAGLE", "EAGLE3", "NEXTN"],
+            choices=["EAGLE", "EAGLE3", "NEXTN", "STANDALONE"],
             help="Speculative algorithm.",
         )
         parser.add_argument(
             "--speculative-draft-model-path",
+            "--speculative-draft-model",
             type=str,
             help="The path of the draft model weights. This can be a local folder or a Hugging Face repo ID.",
         )
+        parser.add_argument(
+            "--speculative-draft-model-revision",
+            type=str,
+            default=None,
+            help="The specific draft model version to use. It can be a branch "
+            "name, a tag name, or a commit id. If unspecified, will use "
+            "the default version.",
+        )
         parser.add_argument(
             "--speculative-num-steps",
             type=int,
@@ -1467,6 +1588,13 @@ class ServerArgs:
             help="The path of the draft model's small vocab table.",
             default=ServerArgs.speculative_token_map,
         )
+        parser.add_argument(
+            "--speculative-attention-mode",
+            type=str,
+            choices=["prefill", "decode"],
+            help="Attention backend for speculative decoding operations (both target verify and draft extend). Can be one of 'prefill' (default) or 'decode'.",
+            default=ServerArgs.speculative_attention_mode,
+        )
 
         # Expert parallelism
         parser.add_argument(
@@ -1501,7 +1629,7 @@ class ServerArgs:
         parser.add_argument(
             "--flashinfer-mxfp4-moe-precision",
             type=str,
-            choices=["
+            choices=["default", "bf16"],
             default=ServerArgs.flashinfer_mxfp4_moe_precision,
             help="Choose the computation precision of flashinfer mxfp4 moe",
         )
@@ -1558,6 +1686,12 @@ class ServerArgs:
             default=ServerArgs.eplb_rebalance_layers_per_chunk,
             help="Number of layers to rebalance per forward pass.",
         )
+        parser.add_argument(
+            "--eplb-min-rebalancing-utilization-threshold",
+            type=float,
+            default=ServerArgs.eplb_min_rebalancing_utilization_threshold,
+            help="Minimum threshold for GPU average utilization to trigger EPLB rebalancing. Must be in the range [0.0, 1.0].",
+        )
         parser.add_argument(
             "--expert-distribution-recorder-mode",
             type=str,
@@ -1588,6 +1722,21 @@ class ServerArgs:
             help="TP size for MoE dense MLP layers. This flag is useful when, with large TP size, there are errors caused by weights in MLP layers having dimension smaller than the min dimension GEMM supports.",
         )
 
+        # Mamba Cache
+        parser.add_argument(
+            "--max-mamba-cache-size",
+            type=int,
+            default=ServerArgs.max_mamba_cache_size,
+            help="The maximum size of the mamba cache.",
+        )
+        parser.add_argument(
+            "--mamba-ssm-dtype",
+            type=str,
+            default=ServerArgs.mamba_ssm_dtype,
+            choices=["float32", "bfloat16"],
+            help="The data type of the SSM states in mamba cache.",
+        )
+
         # Hierarchical cache
         parser.add_argument(
             "--enable-hierarchical-cache",
@@ -1641,6 +1790,18 @@ class ServerArgs:
             default=ServerArgs.hicache_storage_prefetch_policy,
             help="Control when prefetching from the storage backend should stop.",
         )
+        parser.add_argument(
+            "--hicache-storage-backend-extra-config",
+            type=str,
+            default=ServerArgs.hicache_storage_backend_extra_config,
+            help="A dictionary in JSON string format containing extra configuration for the storage backend.",
+        )
+        # LMCache
+        parser.add_argument(
+            "--enable-lmcache",
+            action="store_true",
+            help="Using LMCache as an alternative hierarchical cache solution",
+        )
 
         # Double Sparsity
         parser.add_argument(
@@ -1913,6 +2074,12 @@ class ServerArgs:
             default=ServerArgs.scheduler_recv_interval,
             help="The interval to poll requests in scheduler. Can be set to >1 to reduce the overhead of this.",
         )
+        parser.add_argument(
+            "--numa-node",
+            type=int,
+            nargs="+",
+            help="Sets the numa node for the subprocesses. i-th element corresponds to i-th subprocess.",
+        )
 
         # Debug tensor dumps
         parser.add_argument(
@@ -1951,7 +2118,7 @@ class ServerArgs:
             "--disaggregation-transfer-backend",
             type=str,
             default=ServerArgs.disaggregation_transfer_backend,
-            choices=
+            choices=DISAGG_TRANSFER_BACKEND_CHOICES,
             help="The backend for disaggregation transfer. Default is mooncake.",
         )
         parser.add_argument(
@@ -1992,12 +2159,6 @@ class ServerArgs:
             default=ServerArgs.num_reserved_decode_tokens,
             help="Number of decode tokens that will have memory reserved when adding new request to the running batch.",
        )
-        parser.add_argument(
-            "--pdlb-url",
-            type=str,
-            default=None,
-            help="The URL of the PD disaggregation load balancer. If set, the prefill/decode server will register with the load balancer.",
-        )
 
         # Custom weight loader
         parser.add_argument(
@@ -2126,6 +2287,15 @@ class ServerArgs:
             self.chunked_prefill_size % self.page_size == 0
         ), "chunked_prefill_size must be divisible by page_size"
 
+        # Check multi tokenizer
+        assert self.tokenizer_worker_num > 0, "Tokenizer worker num must >= 1"
+        self.validate_buckets_rule(
+            "--prompt-tokens-buckets", self.prompt_tokens_buckets
+        )
+        self.validate_buckets_rule(
+            "--generation-tokens-buckets", self.generation_tokens_buckets
+        )
+
     def check_lora_server_args(self):
         assert self.max_loras_per_batch > 0, "max_loras_per_batch must be positive"
 
@@ -2217,6 +2387,54 @@ class ServerArgs:
             f"decode_tp={decode_tp}, prefill_tp={prefill_tp}"
         )
 
+    def validate_buckets_rule(self, arg_name: str, buckets_rule: List[str]):
+        if not buckets_rule:
+            return
+
+        assert len(buckets_rule) > 0, f"{arg_name} cannot be empty list"
+        rule = buckets_rule[0]
+        assert rule in [
+            "tse",
+            "default",
+            "customer",
+        ], f"Unsupported {arg_name} rule type: '{rule}'. Must be one of: 'tse', 'default', 'customer'"
+
+        if rule == "tse":
+            assert (
+                len(buckets_rule) == 4
+            ), f"{arg_name} TSE rule requires exactly 4 parameters: ['tse', middle, base, count], got {len(buckets_rule)}"
+            try:
+                middle = float(buckets_rule[1])
+                base = float(buckets_rule[2])
+                count = int(buckets_rule[3])
+            except (ValueError, IndexError):
+                assert (
+                    False
+                ), f"{arg_name} TSE rule parameters must be: ['tse', <float:middle>, <float:base>, <int:count>]"
+            assert base > 1, f"{arg_name} TSE base must be larger than 1, got: {base}"
+            assert count > 0, f"{arg_name} TSE count must be positive, got: {count}"
+            assert middle > 0, f"{arg_name} TSE middle must be positive, got: {middle}"
+
+        elif rule == "default":
+            assert (
+                len(buckets_rule) == 1
+            ), f"{arg_name} default rule should only have one parameter: ['default'], got {len(buckets_rule)}"
+
+        elif rule == "customer":
+            assert (
+                len(buckets_rule) >= 2
+            ), f"{arg_name} customer rule requires at least one bucket value: ['customer', value1, ...]"
+            try:
+                bucket_values = [float(x) for x in buckets_rule[1:]]
+            except ValueError:
+                assert False, f"{arg_name} customer rule bucket values must be numeric"
+            assert len(set(bucket_values)) == len(
+                bucket_values
+            ), f"{arg_name} customer rule bucket values should not contain duplicates"
+            assert all(
+                val >= 0 for val in bucket_values
+            ), f"{arg_name} customer rule bucket values should be non-negative"
+
     def model_specific_adjustments(self):
         hf_config = self.get_hf_config()
         model_arch = hf_config.architectures[0]
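For reference, these are the value shapes that satisfy validate_buckets_rule as they arrive from argparse (nargs="+" yields a list of strings); the list below is illustrative, not taken from the diff.

# Bucket-rule values as argparse delivers them (nargs="+" -> list of strings).
# These shapes pass the checks in validate_buckets_rule above.
valid_rules = [
    ["default"],                             # predefined buckets
    ["tse", "1000", "2", "8"],               # two sides exponential around 1000
    ["customer", "10", "50", "100", "500"],  # explicit, unique, non-negative values
]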
@@ -2271,11 +2489,13 @@ class ServerArgs:
         if is_mxfp4_quant_format:
             # use bf16 for mxfp4 triton kernels
             self.dtype = "bfloat16"
+
         elif "Llama4" in model_arch:
             assert self.attention_backend in {
                 "fa3",
                 "aiter",
-
+                "triton",
+            }, "fa3, aiter, or triton is required for Llama4 model"
         elif model_arch in [
             "Gemma2ForCausalLM",
             "Gemma3ForCausalLM",
@@ -2368,6 +2588,9 @@ class PortArgs:
     # The ipc filename for Scheduler to send metrics
     metrics_ipc_name: str
 
+    # The ipc filename for Tokenizer and worker tokenizer
+    tokenizer_worker_ipc_name: Optional[str]
+
     @staticmethod
     def init_new(server_args, dp_rank: Optional[int] = None) -> "PortArgs":
         if server_args.nccl_port is None:
@@ -2391,6 +2614,7 @@ class PortArgs:
                 nccl_port=nccl_port,
                 rpc_ipc_name=f"ipc://{tempfile.NamedTemporaryFile(delete=False).name}",
                 metrics_ipc_name=f"ipc://{tempfile.NamedTemporaryFile(delete=False).name}",
+                tokenizer_worker_ipc_name=None,
             )
         else:
             # DP attention. Use TCP + port to handle both single-node and multi-node.
@@ -2424,6 +2648,7 @@ class PortArgs:
                 nccl_port=nccl_port,
                 rpc_ipc_name=f"tcp://{dist_init_host}:{rpc_port}",
                 metrics_ipc_name=f"tcp://{dist_init_host}:{metrics_ipc_name}",
+                tokenizer_worker_ipc_name=None,
             )
 
 
@@ -2469,7 +2694,9 @@ def auto_choose_speculative_params(self: ServerArgs):
     """
     hf_config = self.get_hf_config()
     arch = hf_config.architectures[0]
-
+    if self.speculative_algorithm == "STANDALONE":
+        # The default value for standalone speculative decoding
+        return (3, 1, 4)
     if arch in ["LlamaForCausalLM"]:
         # The default value for llama
         return (5, 4, 8)