sglang 0.5.1.post3__py3-none-any.whl → 0.5.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +3 -0
- sglang/bench_one_batch_server.py +10 -1
- sglang/bench_serving.py +251 -26
- sglang/lang/interpreter.py +1 -1
- sglang/srt/configs/__init__.py +4 -0
- sglang/srt/configs/internvl.py +6 -0
- sglang/srt/configs/longcat_flash.py +104 -0
- sglang/srt/configs/model_config.py +37 -7
- sglang/srt/configs/qwen3_next.py +326 -0
- sglang/srt/connector/__init__.py +1 -1
- sglang/srt/connector/base_connector.py +1 -2
- sglang/srt/connector/redis.py +2 -2
- sglang/srt/connector/serde/__init__.py +1 -1
- sglang/srt/connector/serde/safe_serde.py +4 -3
- sglang/srt/custom_op.py +11 -1
- sglang/srt/debug_utils/dump_comparator.py +81 -44
- sglang/srt/debug_utils/dump_loader.py +97 -0
- sglang/srt/debug_utils/dumper.py +11 -3
- sglang/srt/debug_utils/text_comparator.py +73 -11
- sglang/srt/disaggregation/ascend/conn.py +75 -0
- sglang/srt/disaggregation/base/conn.py +1 -1
- sglang/srt/disaggregation/common/conn.py +15 -12
- sglang/srt/disaggregation/decode.py +6 -4
- sglang/srt/disaggregation/fake/conn.py +1 -1
- sglang/srt/disaggregation/mini_lb.py +6 -420
- sglang/srt/disaggregation/mooncake/conn.py +18 -10
- sglang/srt/disaggregation/nixl/conn.py +180 -16
- sglang/srt/disaggregation/prefill.py +6 -4
- sglang/srt/disaggregation/utils.py +5 -50
- sglang/srt/distributed/parallel_state.py +94 -58
- sglang/srt/entrypoints/engine.py +34 -14
- sglang/srt/entrypoints/http_server.py +172 -47
- sglang/srt/entrypoints/openai/protocol.py +63 -3
- sglang/srt/entrypoints/openai/serving_base.py +6 -2
- sglang/srt/entrypoints/openai/serving_chat.py +34 -19
- sglang/srt/entrypoints/openai/serving_completions.py +10 -4
- sglang/srt/entrypoints/openai/serving_embedding.py +8 -4
- sglang/srt/entrypoints/openai/serving_responses.py +7 -4
- sglang/srt/eplb/eplb_manager.py +28 -4
- sglang/srt/eplb/expert_distribution.py +55 -15
- sglang/srt/eplb/expert_location.py +8 -3
- sglang/srt/eplb/expert_location_updater.py +1 -1
- sglang/srt/function_call/ebnf_composer.py +11 -9
- sglang/srt/function_call/glm4_moe_detector.py +1 -1
- sglang/srt/function_call/gpt_oss_detector.py +1 -1
- sglang/srt/function_call/qwen3_coder_detector.py +1 -1
- sglang/srt/hf_transformers_utils.py +12 -0
- sglang/srt/layers/activation.py +44 -9
- sglang/srt/layers/attention/aiter_backend.py +93 -68
- sglang/srt/layers/attention/ascend_backend.py +250 -112
- sglang/srt/layers/attention/fla/chunk.py +242 -0
- sglang/srt/layers/attention/fla/chunk_delta_h.py +314 -0
- sglang/srt/layers/attention/fla/chunk_o.py +178 -0
- sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +151 -0
- sglang/srt/layers/attention/fla/cumsum.py +300 -0
- sglang/srt/layers/attention/fla/fused_recurrent.py +640 -0
- sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +232 -0
- sglang/srt/layers/attention/fla/index.py +37 -0
- sglang/srt/layers/attention/fla/l2norm.py +150 -0
- sglang/srt/layers/attention/fla/layernorm_gated.py +326 -0
- sglang/srt/layers/attention/fla/op.py +66 -0
- sglang/srt/layers/attention/fla/solve_tril.py +465 -0
- sglang/srt/layers/attention/fla/utils.py +331 -0
- sglang/srt/layers/attention/fla/wy_fast.py +158 -0
- sglang/srt/layers/attention/flashinfer_backend.py +6 -4
- sglang/srt/layers/attention/flashinfer_mla_backend.py +16 -12
- sglang/srt/layers/attention/hybrid_attn_backend.py +47 -8
- sglang/srt/layers/attention/hybrid_linear_attn_backend.py +584 -0
- sglang/srt/layers/attention/intel_amx_backend.py +3 -0
- sglang/srt/layers/attention/mamba/causal_conv1d.py +128 -0
- sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +1052 -0
- sglang/srt/layers/attention/mamba/mamba.py +64 -0
- sglang/srt/layers/attention/torch_native_backend.py +12 -6
- sglang/srt/layers/attention/trtllm_mla_backend.py +126 -36
- sglang/srt/layers/attention/wave_ops/decode_attention.py +2 -4
- sglang/srt/layers/attention/wave_ops/extend_attention.py +1 -3
- sglang/srt/layers/communicator.py +45 -7
- sglang/srt/layers/layernorm.py +54 -12
- sglang/srt/layers/logits_processor.py +10 -3
- sglang/srt/layers/moe/__init__.py +2 -1
- sglang/srt/layers/moe/cutlass_w4a8_moe.py +4 -12
- sglang/srt/layers/moe/ep_moe/kernels.py +74 -0
- sglang/srt/layers/moe/ep_moe/layer.py +110 -49
- sglang/srt/layers/moe/fused_moe_native.py +5 -3
- sglang/srt/layers/moe/fused_moe_triton/__init__.py +5 -3
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=129,N=352,device_name=NVIDIA_B200,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/{E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +29 -29
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +9 -1049
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +212 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +799 -0
- sglang/srt/layers/moe/fused_moe_triton/layer.py +56 -45
- sglang/srt/layers/moe/fused_moe_triton/moe_align_block_size.py +87 -0
- sglang/srt/layers/moe/moe_runner/__init__.py +2 -1
- sglang/srt/layers/moe/moe_runner/base.py +274 -1
- sglang/srt/layers/moe/moe_runner/runner.py +80 -0
- sglang/srt/layers/moe/moe_runner/triton.py +448 -0
- sglang/srt/layers/moe/token_dispatcher/__init__.py +16 -4
- sglang/srt/layers/moe/token_dispatcher/{base_dispatcher.py → base.py} +67 -17
- sglang/srt/layers/moe/token_dispatcher/deepep.py +41 -38
- sglang/srt/layers/moe/token_dispatcher/standard.py +44 -2
- sglang/srt/layers/moe/topk.py +43 -12
- sglang/srt/layers/moe/utils.py +6 -5
- sglang/srt/layers/quantization/awq.py +19 -7
- sglang/srt/layers/quantization/base_config.py +11 -6
- sglang/srt/layers/quantization/blockwise_int8.py +38 -27
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +50 -30
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +13 -1
- sglang/srt/layers/quantization/deep_gemm_wrapper/compile_utils.py +9 -1
- sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +0 -3
- sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +27 -0
- sglang/srt/layers/quantization/fp8.py +76 -47
- sglang/srt/layers/quantization/fp8_utils.py +43 -29
- sglang/srt/layers/quantization/gptq.py +25 -17
- sglang/srt/layers/quantization/modelopt_quant.py +107 -40
- sglang/srt/layers/quantization/moe_wna16.py +21 -18
- sglang/srt/layers/quantization/mxfp4.py +77 -45
- sglang/srt/layers/quantization/quark/quark_moe.py +32 -27
- sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +49 -30
- sglang/srt/layers/quantization/quark/utils.py +97 -0
- sglang/srt/layers/quantization/rocm_mxfp4_utils.py +13 -0
- sglang/srt/layers/quantization/unquant.py +135 -47
- sglang/srt/layers/quantization/utils.py +13 -0
- sglang/srt/layers/quantization/w4afp8.py +60 -42
- sglang/srt/layers/quantization/w8a8_fp8.py +35 -20
- sglang/srt/layers/quantization/w8a8_int8.py +83 -41
- sglang/srt/layers/rocm_linear_utils.py +44 -0
- sglang/srt/layers/rotary_embedding.py +28 -19
- sglang/srt/layers/sampler.py +29 -5
- sglang/srt/lora/backend/base_backend.py +50 -8
- sglang/srt/lora/backend/triton_backend.py +90 -2
- sglang/srt/lora/layers.py +32 -0
- sglang/srt/lora/lora.py +4 -1
- sglang/srt/lora/lora_manager.py +35 -112
- sglang/srt/lora/mem_pool.py +24 -10
- sglang/srt/lora/utils.py +18 -9
- sglang/srt/managers/cache_controller.py +242 -278
- sglang/srt/managers/data_parallel_controller.py +30 -15
- sglang/srt/managers/detokenizer_manager.py +13 -2
- sglang/srt/managers/disagg_service.py +46 -0
- sglang/srt/managers/io_struct.py +160 -11
- sglang/srt/managers/mm_utils.py +6 -1
- sglang/srt/managers/multi_tokenizer_mixin.py +579 -0
- sglang/srt/managers/schedule_batch.py +27 -44
- sglang/srt/managers/schedule_policy.py +4 -3
- sglang/srt/managers/scheduler.py +90 -115
- sglang/srt/managers/scheduler_metrics_mixin.py +114 -8
- sglang/srt/managers/scheduler_output_processor_mixin.py +29 -19
- sglang/srt/managers/scheduler_profiler_mixin.py +1 -1
- sglang/srt/managers/scheduler_update_weights_mixin.py +8 -1
- sglang/srt/managers/template_manager.py +3 -3
- sglang/srt/managers/tokenizer_communicator_mixin.py +491 -0
- sglang/srt/managers/tokenizer_manager.py +41 -477
- sglang/srt/managers/tp_worker.py +16 -4
- sglang/srt/managers/tp_worker_overlap_thread.py +8 -10
- sglang/srt/mem_cache/allocator.py +1 -1
- sglang/srt/mem_cache/chunk_cache.py +1 -1
- sglang/srt/mem_cache/hicache_storage.py +24 -22
- sglang/srt/mem_cache/hiradix_cache.py +184 -101
- sglang/srt/mem_cache/lora_radix_cache.py +1 -1
- sglang/srt/mem_cache/memory_pool.py +324 -41
- sglang/srt/mem_cache/memory_pool_host.py +25 -18
- sglang/srt/mem_cache/radix_cache.py +5 -6
- sglang/srt/mem_cache/radix_cache_cpp.py +1 -1
- sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +164 -0
- sglang/srt/mem_cache/storage/hf3fs/{client_hf3fs.py → hf3fs_usrbio_client.py} +5 -1
- sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +61 -34
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +149 -12
- sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +280 -0
- sglang/srt/mem_cache/storage/lmcache/unit_test.py +121 -0
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +74 -19
- sglang/srt/mem_cache/storage/mooncake_store/test_mooncake_store.py +161 -0
- sglang/srt/mem_cache/swa_radix_cache.py +1 -3
- sglang/srt/metrics/collector.py +484 -63
- sglang/srt/metrics/startup_func_log_and_timer.py +150 -0
- sglang/srt/metrics/utils.py +48 -0
- sglang/srt/model_executor/cpu_graph_runner.py +640 -0
- sglang/srt/model_executor/cuda_graph_runner.py +13 -5
- sglang/srt/model_executor/forward_batch_info.py +72 -18
- sglang/srt/model_executor/model_runner.py +189 -31
- sglang/srt/model_loader/__init__.py +9 -3
- sglang/srt/model_loader/loader.py +33 -28
- sglang/srt/model_loader/utils.py +12 -0
- sglang/srt/model_loader/weight_utils.py +2 -1
- sglang/srt/models/deepseek_v2.py +311 -50
- sglang/srt/models/gemma3n_mm.py +1 -1
- sglang/srt/models/glm4_moe.py +10 -1
- sglang/srt/models/glm4v.py +4 -2
- sglang/srt/models/gpt_oss.py +5 -18
- sglang/srt/models/internvl.py +28 -0
- sglang/srt/models/llama4.py +9 -0
- sglang/srt/models/llama_eagle3.py +17 -0
- sglang/srt/models/longcat_flash.py +1026 -0
- sglang/srt/models/longcat_flash_nextn.py +699 -0
- sglang/srt/models/minicpmv.py +165 -3
- sglang/srt/models/mllama4.py +25 -0
- sglang/srt/models/opt.py +637 -0
- sglang/srt/models/qwen2.py +33 -3
- sglang/srt/models/qwen2_5_vl.py +90 -42
- sglang/srt/models/qwen2_moe.py +79 -14
- sglang/srt/models/qwen3.py +8 -2
- sglang/srt/models/qwen3_moe.py +39 -8
- sglang/srt/models/qwen3_next.py +1039 -0
- sglang/srt/models/qwen3_next_mtp.py +109 -0
- sglang/srt/models/torch_native_llama.py +1 -1
- sglang/srt/models/transformers.py +1 -1
- sglang/srt/multimodal/processors/base_processor.py +4 -2
- sglang/srt/multimodal/processors/glm4v.py +9 -9
- sglang/srt/multimodal/processors/internvl.py +141 -129
- sglang/srt/{reasoning_parser.py → parser/reasoning_parser.py} +1 -1
- sglang/srt/sampling/penaltylib/orchestrator.py +14 -2
- sglang/srt/sampling/sampling_batch_info.py +18 -15
- sglang/srt/server_args.py +297 -79
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +5 -0
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +10 -1
- sglang/srt/speculative/eagle_worker.py +216 -120
- sglang/srt/speculative/spec_info.py +5 -0
- sglang/srt/speculative/standalone_worker.py +109 -0
- sglang/srt/utils.py +37 -2
- sglang/srt/weight_sync/utils.py +1 -1
- sglang/test/attention/test_trtllm_mla_backend.py +181 -8
- sglang/test/few_shot_gsm8k.py +1 -0
- sglang/test/runners.py +4 -0
- sglang/test/test_cutlass_moe.py +24 -6
- sglang/test/test_cutlass_w4a8_moe.py +24 -9
- sglang/test/test_disaggregation_utils.py +66 -0
- sglang/test/test_utils.py +25 -1
- sglang/utils.py +5 -0
- sglang/version.py +1 -1
- {sglang-0.5.1.post3.dist-info → sglang-0.5.2.dist-info}/METADATA +11 -9
- {sglang-0.5.1.post3.dist-info → sglang-0.5.2.dist-info}/RECORD +243 -194
- sglang/srt/disaggregation/launch_lb.py +0 -131
- sglang/srt/mem_cache/storage/mooncake_store/unit_test.py +0 -40
- /sglang/srt/{model_parallel.py → layers/model_parallel.py} +0 -0
- /sglang/srt/{code_completion_parser.py → parser/code_completion_parser.py} +0 -0
- /sglang/srt/{conversation.py → parser/conversation.py} +0 -0
- /sglang/srt/{harmony_parser.py → parser/harmony_parser.py} +0 -0
- /sglang/srt/{jinja_template_utils.py → parser/jinja_template_utils.py} +0 -0
- {sglang-0.5.1.post3.dist-info → sglang-0.5.2.dist-info}/WHEEL +0 -0
- {sglang-0.5.1.post3.dist-info → sglang-0.5.2.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.1.post3.dist-info → sglang-0.5.2.dist-info}/top_level.txt +0 -0
sglang/srt/server_args.py
CHANGED
@@ -26,7 +26,7 @@ from typing import List, Literal, Optional, Union
 from sglang.srt.function_call.function_call_parser import FunctionCallParser
 from sglang.srt.hf_transformers_utils import check_gguf_file, get_config
 from sglang.srt.lora.lora_registry import LoRARef
-from sglang.srt.reasoning_parser import ReasoningParser
+from sglang.srt.parser.reasoning_parser import ReasoningParser
 from sglang.srt.utils import (
     LORA_TARGET_ALL_MODULES,
     SUPPORTED_LORA_TARGET_MODULES,
@@ -44,16 +44,99 @@ from sglang.srt.utils import (
     is_valid_ipv6_address,
     nullable_str,
 )
+from sglang.utils import is_in_ci

 logger = logging.getLogger(__name__)


+# Define constants
+LOAD_FORMAT_CHOICES = [
+    "auto",
+    "pt",
+    "safetensors",
+    "npcache",
+    "dummy",
+    "sharded_state",
+    "gguf",
+    "bitsandbytes",
+    "layered",
+    "remote",
+]
+
+QUANTIZATION_CHOICES = [
+    "awq",
+    "fp8",
+    "gptq",
+    "marlin",
+    "gptq_marlin",
+    "awq_marlin",
+    "bitsandbytes",
+    "gguf",
+    "modelopt",
+    "modelopt_fp4",
+    "petit_nvfp4",
+    "w8a8_int8",
+    "w8a8_fp8",
+    "moe_wna16",
+    "qoq",
+    "w4afp8",
+    "mxfp4",
+]
+
+ATTENTION_BACKEND_CHOICES = [
+    # Common
+    "triton",
+    "torch_native",
+    # NVIDIA specific
+    "cutlass_mla",
+    "fa3",
+    "flashinfer",
+    "flashmla",
+    "trtllm_mla",
+    "trtllm_mha",
+    "dual_chunk_flash_attn",
+    "hybrid_linear_attn",
+    # AMD specific
+    "aiter",
+    "wave",
+    # Other platforms
+    "intel_amx",
+    "ascend",
+]
+
+DISAGG_TRANSFER_BACKEND_CHOICES = ["mooncake", "nixl", "ascend", "fake"]
+
+GRAMMAR_BACKEND_CHOICES = ["xgrammar", "outlines", "llguidance", "none"]
+
+
+# Allow external code to add more choices
+def add_load_format_choices(choices):
+    LOAD_FORMAT_CHOICES.extend(choices)
+
+
+def add_quantization_method_choices(choices):
+    QUANTIZATION_CHOICES.extend(choices)
+
+
+def add_attention_backend_choices(choices):
+    ATTENTION_BACKEND_CHOICES.extend(choices)
+
+
+def add_disagg_transfer_backend_choices(choices):
+    DISAGG_TRANSFER_BACKEND_CHOICES.extend(choices)
+
+
+def add_grammar_backend_choices(choices):
+    GRAMMAR_BACKEND_CHOICES.extend(choices)
+
+
 @dataclasses.dataclass
 class ServerArgs:
     # Model and tokenizer
     model_path: str
     tokenizer_path: Optional[str] = None
     tokenizer_mode: str = "auto"
+    tokenizer_worker_num: int = 1
     skip_tokenizer_init: bool = False
     load_format: str = "auto"
     model_loader_extra_config: str = "{}"
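The choice lists above now live at module scope with small extension hooks, so out-of-tree code can register additional values before the CLI parser is built. A minimal sketch (the plugin names here are hypothetical):

```python
from sglang.srt import server_args

# Hypothetical plugin registration: extend the module-level choice lists
# before ServerArgs.add_cli_args() builds the argparse parser, so the new
# names pass the choices= validation.
server_args.add_attention_backend_choices(["my_plugin_backend"])
server_args.add_quantization_method_choices(["my_plugin_quant"])
```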
@@ -120,6 +203,8 @@ class ServerArgs:
     bucket_inter_token_latency: Optional[List[float]] = None
     bucket_e2e_request_latency: Optional[List[float]] = None
     collect_tokens_histogram: bool = False
+    prompt_tokens_buckets: Optional[List[str]] = None
+    generation_tokens_buckets: Optional[List[str]] = None
     decode_log_interval: int = 40
     enable_request_time_stats_logging: bool = False
     kv_events_config: Optional[str] = None
@@ -140,6 +225,8 @@ class ServerArgs:
     # Data parallelism
     dp_size: int = 1
     load_balance_method: str = "round_robin"
+    # FIXME: remove this after dp rank scheduling is fully supported with PD-Disaggregation
+    prefill_round_robin_balance: bool = False

     # Multi-node distributed serving
     dist_init_addr: Optional[str] = None
@@ -172,12 +259,14 @@ class ServerArgs:
     # Speculative decoding
     speculative_algorithm: Optional[str] = None
     speculative_draft_model_path: Optional[str] = None
+    speculative_draft_model_revision: Optional[str] = None
     speculative_num_steps: Optional[int] = None
     speculative_eagle_topk: Optional[int] = None
     speculative_num_draft_tokens: Optional[int] = None
     speculative_accept_threshold_single: float = 1.0
     speculative_accept_threshold_acc: float = 1.0
     speculative_token_map: Optional[str] = None
+    speculative_attention_mode: str = "prefill"

     # Expert parallelism
     ep_size: int = 1
@@ -200,6 +289,7 @@ class ServerArgs:
     eplb_algorithm: str = "auto"
     eplb_rebalance_num_iterations: int = 1000
     eplb_rebalance_layers_per_chunk: Optional[int] = None
+    eplb_min_rebalancing_utilization_threshold: float = 1.0
     expert_distribution_recorder_mode: Optional[
         Literal["stat", "stat_approx", "per_pass", "per_token"]
     ] = None
@@ -212,12 +302,14 @@ class ServerArgs:
     enable_hierarchical_cache: bool = False
     hicache_ratio: float = 2.0
     hicache_size: int = 0
-    hicache_write_policy: str = "
+    hicache_write_policy: str = "write_through"
     hicache_io_backend: str = "kernel"
     hicache_mem_layout: str = "layer_first"
     hicache_storage_backend: Optional[str] = None
     hicache_storage_prefetch_policy: str = "best_effort"
     hicache_storage_backend_extra_config: Optional[str] = None
+    # LMCache
+    enable_lmcache: bool = False

     # Double Sparsity
     enable_double_sparsity: bool = False
@@ -273,6 +365,7 @@ class ServerArgs:
     disable_fast_image_processor: bool = False
     enable_return_hidden_states: bool = False
     scheduler_recv_interval: int = 1
+    numa_node: Optional[List[int]] = None

     # Debug tensor dumps
     debug_tensor_dump_output_folder: Optional[str] = None
@@ -289,7 +382,6 @@ class ServerArgs:
     disaggregation_prefill_pp: Optional[int] = 1
     disaggregation_ib_device: Optional[str] = None
     num_reserved_decode_tokens: int = 512  # used for decode kv cache offload in PD
-    pdlb_url: Optional[str] = None

     # For model weight update
     custom_weight_loader: Optional[List[str]] = None
@@ -299,6 +391,10 @@ class ServerArgs:
     enable_pdmux: bool = False
     sm_group_num: int = 3

+    # Mamba cache
+    max_mamba_cache_size: Optional[int] = None
+    mamba_ssm_dtype: str = "float32"
+
     # Deprecated arguments
     enable_ep_moe: bool = False
     enable_deepep_moe: bool = False
@@ -386,9 +482,14 @@ class ServerArgs:
         # B200, MI300. (chunked_prefill_size 16k, cuda_graph_max_bs 512)
         reserved_mem = 32 * 1024

+        # draft model and larger cuda graph buffers
         if self.speculative_algorithm is not None:
-
-
+            if self.speculative_algorithm == "STANDALONE":
+                # Standalone speculative decoding needs more memory than other speculative
+                # decoding algorithms since the draft model is typically larger.
+                reserved_mem += 6 * 1024
+            else:
+                reserved_mem += 2 * 1024
         if self.enable_dp_attention:
             reserved_mem += 4 * 1024

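The reservation arithmetic above is purely additive; a small sketch that mirrors it (assuming, as the `32 * 1024` baseline suggests, that the unit is MB):

```python
def estimated_reserved_mem_mb(speculative_algorithm, enable_dp_attention):
    # Mirrors the hunk above: 32 GB baseline, plus a draft-model surcharge
    # (6 GB for STANDALONE, 2 GB otherwise), plus 4 GB for dp attention.
    reserved = 32 * 1024
    if speculative_algorithm is not None:
        reserved += 6 * 1024 if speculative_algorithm == "STANDALONE" else 2 * 1024
    if enable_dp_attention:
        reserved += 4 * 1024
    return reserved

assert estimated_reserved_mem_mb("STANDALONE", True) == 42 * 1024
assert estimated_reserved_mem_mb(None, False) == 32 * 1024
```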
@@ -530,12 +631,12 @@ class ServerArgs:
         if self.grammar_backend is None:
             self.grammar_backend = "xgrammar"

+        if self.dp_size == 1:
+            self.enable_dp_attention = False
+
         # Data parallelism attention
         if self.enable_dp_attention:
             self.schedule_conservativeness = self.schedule_conservativeness * 0.3
-            assert (
-                self.dp_size > 1
-            ), "Please set a dp-size > 1. You can use 1 < dp-size <= tp-size "
             assert self.tp_size % self.dp_size == 0
             self.chunked_prefill_size = self.chunked_prefill_size // self.dp_size
             logger.warning(
@@ -558,11 +659,13 @@ class ServerArgs:
         ], "The expert parallel size must be 1 or the same as the tensor parallel size"

         if self.moe_runner_backend == "flashinfer_trtllm":
-
-            self.
-
-
-
+            assert (
+                self.quantization == "modelopt_fp4" or self.quantization == "fp8"
+            ), "modelopt_fp4 quantization is required for Flashinfer TRTLLM MoE"
+            self.disable_shared_experts_fusion = True
+            logger.warning(
+                "FlashInfer TRTLLM MoE is enabled. --disable-shared-experts-fusion is automatically set."
+            )

         # DeepEP MoE
         if self.moe_a2a_backend == "deepep":
@@ -617,7 +720,12 @@ class ServerArgs:
             # NEXTN shares the same implementation of EAGLE
             self.speculative_algorithm = "EAGLE"

-        if self.speculative_algorithm in ("EAGLE", "EAGLE3"):
+        if self.speculative_algorithm in ("EAGLE", "EAGLE3", "STANDALONE"):
+            if self.speculative_algorithm == "STANDALONE":
+                # TODO: support dp attention for standalone speculative decoding
+                assert (
+                    self.enable_dp_attention is False
+                ), "Currently standalone speculative decoding does not support dp attention."
             if self.max_running_requests is None:
                 self.max_running_requests = 48
             self.disable_overlap_schedule = True
@@ -673,6 +781,15 @@ class ServerArgs:
             )
             self.speculative_num_draft_tokens = self.speculative_num_steps + 1

+            if (
+                self.speculative_eagle_topk > 1
+                and self.page_size > 1
+                and self.attention_backend != "flashinfer"
+            ):
+                raise ValueError(
+                    "speculative_eagle_topk > 1 with page_size > 1 is unstable and produces incorrect results for paged attention backends. This combination is only supported for the 'flashinfer' backend."
+                )
+
         # The token generated from the verify step is counted.
         # If sepculative_num_steps >= speculative_num_draft_tokens, the additional tokens will definitely be discarded.
         # assert self.speculative_num_steps < self.speculative_num_draft_tokens
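The new guard rejects one specific combination; restated as a predicate with the cases spelled out:

```python
def eagle_topk_page_size_ok(topk: int, page_size: int, backend: str) -> bool:
    # The ValueError above fires iff topk > 1 and page_size > 1 on any
    # backend other than "flashinfer".
    return not (topk > 1 and page_size > 1 and backend != "flashinfer")

assert eagle_topk_page_size_ok(1, 16, "fa3")         # topk == 1: always fine
assert eagle_topk_page_size_ok(4, 1, "fa3")          # page_size == 1: fine
assert eagle_topk_page_size_ok(4, 16, "flashinfer")  # only flashinfer allows both
assert not eagle_topk_page_size_ok(4, 16, "fa3")     # would raise ValueError
```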
@@ -700,6 +817,13 @@ class ServerArgs:

             self.disable_radix_cache = True
             logger.warning("KV cache is forced as chunk cache for decode server")
+
+            if self.dp_size > 1 and not is_in_ci():
+                assert self.prefill_round_robin_balance, (
+                    "Prefill round robin balance is required when dp size > 1. "
+                    "Please make sure that the prefill instance is launched with `--load-balance-method round_robin`"
+                    " and `--prefill-round-robin-balance` is set for decode server."
+                )
         elif self.disaggregation_mode == "prefill":
             if self.disaggregation_decode_tp is None:
                 self.disaggregation_decode_tp = self.tp_size
@@ -716,6 +840,8 @@ class ServerArgs:
         os.environ["SGLANG_ENABLE_TORCH_COMPILE"] = (
             "1" if self.enable_torch_compile else "0"
         )
+        os.environ["SGLANG_MAMBA_SSM_DTYPE"] = self.mamba_ssm_dtype
+
         # Set env var before grammar backends init
         os.environ["SGLANG_DISABLE_OUTLINES_DISK_CACHE"] = (
             "1" if self.disable_outlines_disk_cache else "0"
@@ -752,6 +878,12 @@ class ServerArgs:
             "tokenizer if available, and 'slow' will "
             "always use the slow tokenizer.",
         )
+        parser.add_argument(
+            "--tokenizer-worker-num",
+            type=int,
+            default=ServerArgs.tokenizer_worker_num,
+            help="The worker num of the tokenizer manager.",
+        )
         parser.add_argument(
             "--skip-tokenizer-init",
             action="store_true",
@@ -761,18 +893,7 @@ class ServerArgs:
             "--load-format",
             type=str,
             default=ServerArgs.load_format,
-            choices=[
-                "auto",
-                "pt",
-                "safetensors",
-                "npcache",
-                "dummy",
-                "sharded_state",
-                "gguf",
-                "bitsandbytes",
-                "layered",
-                "remote",
-            ],
+            choices=LOAD_FORMAT_CHOICES,
             help="The format of the model weights to load. "
             '"auto" will try to load the weights in the safetensors format '
             "and fall back to the pytorch bin format if safetensors format "
@@ -891,25 +1012,7 @@ class ServerArgs:
             "--quantization",
             type=str,
             default=ServerArgs.quantization,
-            choices=[
-                "awq",
-                "fp8",
-                "gptq",
-                "marlin",
-                "gptq_marlin",
-                "awq_marlin",
-                "bitsandbytes",
-                "gguf",
-                "modelopt",
-                "modelopt_fp4",
-                "petit_nvfp4",
-                "w8a8_int8",
-                "w8a8_fp8",
-                "moe_wna16",
-                "qoq",
-                "w4afp8",
-                "mxfp4",
-            ],
+            choices=QUANTIZATION_CHOICES,
             help="The quantization method.",
         )
         parser.add_argument(
@@ -1172,6 +1275,26 @@ class ServerArgs:
             default=ServerArgs.collect_tokens_histogram,
             help="Collect prompt/generation tokens histogram.",
         )
+        bucket_rule = (
+            "Supports 3 rule types: 'default' uses predefined buckets; 'tse <middle> <base> <count>' "
+            "generates two sides exponential distributed buckets (e.g., 'tse 1000 2 8' generates buckets "
+            "[984.0, 992.0, 996.0, 998.0, 1000.0, 1002.0, 1004.0, 1008.0, 1016.0]).); 'customer <value1> "
+            "<value2> ...' uses custom bucket values (e.g., 'customer 10 50 100 500')."
+        )
+        parser.add_argument(
+            "--prompt-tokens-buckets",
+            type=str,
+            nargs="+",
+            default=ServerArgs.prompt_tokens_buckets,
+            help=f"The buckets rule of prompt tokens. {bucket_rule}",
+        )
+        parser.add_argument(
+            "--generation-tokens-buckets",
+            type=str,
+            nargs="+",
+            default=ServerArgs.generation_tokens_buckets,
+            help=f"The buckets rule for generation tokens histogram. {bucket_rule}",
+        )
         parser.add_argument(
             "--gc-warning-threshold-secs",
             type=float,
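The 'tse' example in the help text ('tse 1000 2 8' yielding [984.0, ..., 1016.0]) is reproducible as the middle value plus count offsets of base**i, half on each side. A sketch inferred from that example (the actual expansion lives in the metrics code, not in this file):

```python
def tse_buckets(middle: float, base: float, count: int) -> list:
    # Two-sides-exponential buckets, inferred from the help string above:
    # count // 2 offsets of base**i on each side of the middle value.
    offsets = [base**i for i in range(1, count // 2 + 1)]
    return sorted([middle - o for o in offsets] + [middle] + [middle + o for o in offsets])

assert tse_buckets(1000, 2, 8) == [984, 992, 996, 998, 1000, 1002, 1004, 1008, 1016]
```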
@@ -1280,6 +1403,12 @@ class ServerArgs:
                 "minimum_tokens",
             ],
         )
+        parser.add_argument(
+            "--prefill-round-robin-balance",
+            default=ServerArgs.prefill_round_robin_balance,
+            action="store_true",
+            help="Prefill is round robin balanced. This is used to promise decode server can get the correct dp rank.",
+        )

         # Multi-node distributed serving
         parser.add_argument(
@@ -1359,43 +1488,24 @@ class ServerArgs:
         )

         # Kernel backend
-        ATTN_BACKENDS = [
-            # Common
-            "triton",
-            "torch_native",
-            # NVIDIA specific
-            "cutlass_mla",
-            "fa3",
-            "flashinfer",
-            "flashmla",
-            "trtllm_mla",
-            "trtllm_mha",
-            "dual_chunk_flash_attn",
-            # AMD specific
-            "aiter",
-            "wave",
-            # Other platforms
-            "intel_amx",
-            "ascend",
-        ]
         parser.add_argument(
             "--attention-backend",
             type=str,
-            choices=ATTN_BACKENDS,
+            choices=ATTENTION_BACKEND_CHOICES,
             default=ServerArgs.attention_backend,
             help="Choose the kernels for attention layers.",
         )
         parser.add_argument(
             "--prefill-attention-backend",
             type=str,
-            choices=ATTN_BACKENDS,
+            choices=ATTENTION_BACKEND_CHOICES,
             default=ServerArgs.prefill_attention_backend,
             help="Choose the kernels for prefill attention layers (have priority over --attention-backend).",
         )
         parser.add_argument(
             "--decode-attention-backend",
             type=str,
-            choices=ATTN_BACKENDS,
+            choices=ATTENTION_BACKEND_CHOICES,
             default=ServerArgs.decode_attention_backend,
             help="Choose the kernels for decode attention layers (have priority over --attention-backend).",
         )
@@ -1409,7 +1519,7 @@ class ServerArgs:
         parser.add_argument(
             "--grammar-backend",
             type=str,
-            choices=["xgrammar", "outlines", "llguidance", "none"],
+            choices=GRAMMAR_BACKEND_CHOICES,
             default=ServerArgs.grammar_backend,
             help="Choose the backend for grammar-guided decoding.",
         )
@@ -1425,14 +1535,23 @@ class ServerArgs:
         parser.add_argument(
             "--speculative-algorithm",
             type=str,
-            choices=["EAGLE", "EAGLE3", "NEXTN"],
+            choices=["EAGLE", "EAGLE3", "NEXTN", "STANDALONE"],
             help="Speculative algorithm.",
         )
         parser.add_argument(
             "--speculative-draft-model-path",
+            "--speculative-draft-model",
             type=str,
             help="The path of the draft model weights. This can be a local folder or a Hugging Face repo ID.",
         )
+        parser.add_argument(
+            "--speculative-draft-model-revision",
+            type=str,
+            default=None,
+            help="The specific draft model version to use. It can be a branch "
+            "name, a tag name, or a commit id. If unspecified, will use "
+            "the default version.",
+        )
         parser.add_argument(
             "--speculative-num-steps",
             type=int,
@@ -1469,6 +1588,13 @@ class ServerArgs:
             help="The path of the draft model's small vocab table.",
             default=ServerArgs.speculative_token_map,
         )
+        parser.add_argument(
+            "--speculative-attention-mode",
+            type=str,
+            choices=["prefill", "decode"],
+            help="Attention backend for speculative decoding operations (both target verify and draft extend). Can be one of 'prefill' (default) or 'decode'.",
+            default=ServerArgs.speculative_attention_mode,
+        )

         # Expert parallelism
         parser.add_argument(
@@ -1503,7 +1629,7 @@ class ServerArgs:
         parser.add_argument(
             "--flashinfer-mxfp4-moe-precision",
             type=str,
-            choices=["
+            choices=["default", "bf16"],
             default=ServerArgs.flashinfer_mxfp4_moe_precision,
             help="Choose the computation precision of flashinfer mxfp4 moe",
         )
@@ -1560,6 +1686,12 @@ class ServerArgs:
             default=ServerArgs.eplb_rebalance_layers_per_chunk,
             help="Number of layers to rebalance per forward pass.",
         )
+        parser.add_argument(
+            "--eplb-min-rebalancing-utilization-threshold",
+            type=float,
+            default=ServerArgs.eplb_min_rebalancing_utilization_threshold,
+            help="Minimum threshold for GPU average utilization to trigger EPLB rebalancing. Must be in the range [0.0, 1.0].",
+        )
         parser.add_argument(
             "--expert-distribution-recorder-mode",
             type=str,
@@ -1590,6 +1722,21 @@ class ServerArgs:
             help="TP size for MoE dense MLP layers. This flag is useful when, with large TP size, there are errors caused by weights in MLP layers having dimension smaller than the min dimension GEMM supports.",
         )

+        # Mamba Cache
+        parser.add_argument(
+            "--max-mamba-cache-size",
+            type=int,
+            default=ServerArgs.max_mamba_cache_size,
+            help="The maximum size of the mamba cache.",
+        )
+        parser.add_argument(
+            "--mamba-ssm-dtype",
+            type=str,
+            default=ServerArgs.mamba_ssm_dtype,
+            choices=["float32", "bfloat16"],
+            help="The data type of the SSM states in mamba cache.",
+        )
+
         # Hierarchical cache
         parser.add_argument(
             "--enable-hierarchical-cache",
@@ -1649,6 +1796,12 @@ class ServerArgs:
             default=ServerArgs.hicache_storage_backend_extra_config,
             help="A dictionary in JSON string format containing extra configuration for the storage backend.",
         )
+        # LMCache
+        parser.add_argument(
+            "--enable-lmcache",
+            action="store_true",
+            help="Using LMCache as an alternative hierarchical cache solution",
+        )

         # Double Sparsity
         parser.add_argument(
@@ -1921,6 +2074,12 @@ class ServerArgs:
             default=ServerArgs.scheduler_recv_interval,
             help="The interval to poll requests in scheduler. Can be set to >1 to reduce the overhead of this.",
         )
+        parser.add_argument(
+            "--numa-node",
+            type=int,
+            nargs="+",
+            help="Sets the numa node for the subprocesses. i-th element corresponds to i-th subprocess.",
+        )

         # Debug tensor dumps
         parser.add_argument(
@@ -1959,7 +2118,7 @@ class ServerArgs:
             "--disaggregation-transfer-backend",
             type=str,
             default=ServerArgs.disaggregation_transfer_backend,
-            choices=["mooncake", "nixl", "ascend", "fake"],
+            choices=DISAGG_TRANSFER_BACKEND_CHOICES,
             help="The backend for disaggregation transfer. Default is mooncake.",
         )
         parser.add_argument(
@@ -2000,12 +2159,6 @@ class ServerArgs:
             default=ServerArgs.num_reserved_decode_tokens,
             help="Number of decode tokens that will have memory reserved when adding new request to the running batch.",
         )
-        parser.add_argument(
-            "--pdlb-url",
-            type=str,
-            default=None,
-            help="The URL of the PD disaggregation load balancer. If set, the prefill/decode server will register with the load balancer.",
-        )

         # Custom weight loader
         parser.add_argument(
@@ -2134,6 +2287,15 @@ class ServerArgs:
             self.chunked_prefill_size % self.page_size == 0
         ), "chunked_prefill_size must be divisible by page_size"

+        # Check multi tokenizer
+        assert self.tokenizer_worker_num > 0, "Tokenizer worker num must >= 1"
+        self.validate_buckets_rule(
+            "--prompt-tokens-buckets", self.prompt_tokens_buckets
+        )
+        self.validate_buckets_rule(
+            "--generation-tokens-buckets", self.generation_tokens_buckets
+        )
+
     def check_lora_server_args(self):
         assert self.max_loras_per_batch > 0, "max_loras_per_batch must be positive"
@@ -2225,6 +2387,54 @@ class ServerArgs:
             f"decode_tp={decode_tp}, prefill_tp={prefill_tp}"
         )

+    def validate_buckets_rule(self, arg_name: str, buckets_rule: List[str]):
+        if not buckets_rule:
+            return
+
+        assert len(buckets_rule) > 0, f"{arg_name} cannot be empty list"
+        rule = buckets_rule[0]
+        assert rule in [
+            "tse",
+            "default",
+            "customer",
+        ], f"Unsupported {arg_name} rule type: '{rule}'. Must be one of: 'tse', 'default', 'customer'"
+
+        if rule == "tse":
+            assert (
+                len(buckets_rule) == 4
+            ), f"{arg_name} TSE rule requires exactly 4 parameters: ['tse', middle, base, count], got {len(buckets_rule)}"
+            try:
+                middle = float(buckets_rule[1])
+                base = float(buckets_rule[2])
+                count = int(buckets_rule[3])
+            except (ValueError, IndexError):
+                assert (
+                    False
+                ), f"{arg_name} TSE rule parameters must be: ['tse', <float:middle>, <float:base>, <int:count>]"
+            assert base > 1, f"{arg_name} TSE base must be larger than 1, got: {base}"
+            assert count > 0, f"{arg_name} TSE count must be positive, got: {count}"
+            assert middle > 0, f"{arg_name} TSE middle must be positive, got: {middle}"
+
+        elif rule == "default":
+            assert (
+                len(buckets_rule) == 1
+            ), f"{arg_name} default rule should only have one parameter: ['default'], got {len(buckets_rule)}"
+
+        elif rule == "customer":
+            assert (
+                len(buckets_rule) >= 2
+            ), f"{arg_name} customer rule requires at least one bucket value: ['customer', value1, ...]"
+            try:
+                bucket_values = [float(x) for x in buckets_rule[1:]]
+            except ValueError:
+                assert False, f"{arg_name} customer rule bucket values must be numeric"
+            assert len(set(bucket_values)) == len(
+                bucket_values
+            ), f"{arg_name} customer rule bucket values should not contain duplicates"
+            assert all(
+                val >= 0 for val in bucket_values
+            ), f"{arg_name} customer rule bucket values should be non-negative"
+
     def model_specific_adjustments(self):
         hf_config = self.get_hf_config()
         model_arch = hf_config.architectures[0]
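Some inputs the new validator accepts and rejects, for reference (illustrative only; the method inspects its arguments and does not touch `self`):

```python
accepted = [
    ["default"],                      # predefined buckets, no parameters
    ["tse", "1000", "2", "8"],        # middle=1000, base=2, count=8
    ["customer", "10", "50", "100"],  # unique, non-negative custom values
]
rejected = [
    ["tse", "1000", "1", "8"],  # base must be > 1
    ["default", "extra"],       # 'default' takes no parameters
    ["customer", "10", "10"],   # duplicate values
    ["customer", "-5"],         # negative values
]
```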
@@ -2284,7 +2494,8 @@ class ServerArgs:
             assert self.attention_backend in {
                 "fa3",
                 "aiter",
-
+                "triton",
+            }, "fa3, aiter, or triton is required for Llama4 model"
         elif model_arch in [
             "Gemma2ForCausalLM",
             "Gemma3ForCausalLM",
@@ -2377,6 +2588,9 @@ class PortArgs:
     # The ipc filename for Scheduler to send metrics
     metrics_ipc_name: str

+    # The ipc filename for Tokenizer and worker tokenizer
+    tokenizer_worker_ipc_name: Optional[str]
+
     @staticmethod
     def init_new(server_args, dp_rank: Optional[int] = None) -> "PortArgs":
         if server_args.nccl_port is None:
@@ -2400,6 +2614,7 @@ class PortArgs:
                 nccl_port=nccl_port,
                 rpc_ipc_name=f"ipc://{tempfile.NamedTemporaryFile(delete=False).name}",
                 metrics_ipc_name=f"ipc://{tempfile.NamedTemporaryFile(delete=False).name}",
+                tokenizer_worker_ipc_name=None,
             )
         else:
             # DP attention. Use TCP + port to handle both single-node and multi-node.
@@ -2433,6 +2648,7 @@ class PortArgs:
                 nccl_port=nccl_port,
                 rpc_ipc_name=f"tcp://{dist_init_host}:{rpc_port}",
                 metrics_ipc_name=f"tcp://{dist_init_host}:{metrics_ipc_name}",
+                tokenizer_worker_ipc_name=None,
             )


@@ -2478,7 +2694,9 @@ def auto_choose_speculative_params(self: ServerArgs):
     """
     hf_config = self.get_hf_config()
     arch = hf_config.architectures[0]
-
+    if self.speculative_algorithm == "STANDALONE":
+        # The default value for standalone speculative decoding
+        return (3, 1, 4)
     if arch in ["LlamaForCausalLM"]:
         # The default value for llama
         return (5, 4, 8)
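Judging by the ServerArgs fields these defaults feed (an inference from the field names, not stated in this hunk), the returned triple maps as follows:

```python
# Presumed field mapping for the STANDALONE default above (an assumption
# based on the ServerArgs field names, not spelled out in this diff):
num_steps, eagle_topk, num_draft_tokens = (3, 1, 4)
```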