sglang 0.5.2rc1__py3-none-any.whl → 0.5.3rc0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch_server.py +10 -1
- sglang/bench_serving.py +257 -29
- sglang/lang/interpreter.py +1 -1
- sglang/srt/configs/__init__.py +4 -0
- sglang/srt/configs/device_config.py +3 -1
- sglang/srt/configs/dots_vlm.py +139 -0
- sglang/srt/configs/internvl.py +6 -0
- sglang/srt/configs/load_config.py +1 -0
- sglang/srt/configs/model_config.py +50 -6
- sglang/srt/configs/qwen3_next.py +326 -0
- sglang/srt/connector/__init__.py +8 -1
- sglang/srt/connector/remote_instance.py +82 -0
- sglang/srt/constrained/base_grammar_backend.py +48 -12
- sglang/srt/constrained/llguidance_backend.py +0 -1
- sglang/srt/constrained/outlines_backend.py +0 -1
- sglang/srt/constrained/xgrammar_backend.py +28 -9
- sglang/srt/custom_op.py +11 -1
- sglang/srt/debug_utils/dump_comparator.py +81 -44
- sglang/srt/debug_utils/dump_loader.py +97 -0
- sglang/srt/debug_utils/dumper.py +11 -3
- sglang/srt/debug_utils/text_comparator.py +73 -11
- sglang/srt/disaggregation/base/conn.py +1 -1
- sglang/srt/disaggregation/common/conn.py +15 -12
- sglang/srt/disaggregation/decode.py +21 -10
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +4 -1
- sglang/srt/disaggregation/fake/conn.py +1 -1
- sglang/srt/disaggregation/mini_lb.py +6 -445
- sglang/srt/disaggregation/mooncake/conn.py +18 -10
- sglang/srt/disaggregation/nixl/conn.py +180 -16
- sglang/srt/disaggregation/prefill.py +5 -3
- sglang/srt/disaggregation/utils.py +5 -50
- sglang/srt/distributed/parallel_state.py +67 -43
- sglang/srt/entrypoints/engine.py +38 -17
- sglang/srt/entrypoints/grpc_request_manager.py +580 -0
- sglang/srt/entrypoints/grpc_server.py +680 -0
- sglang/srt/entrypoints/http_server.py +88 -53
- sglang/srt/entrypoints/openai/protocol.py +7 -4
- sglang/srt/entrypoints/openai/serving_base.py +46 -3
- sglang/srt/entrypoints/openai/serving_chat.py +39 -19
- sglang/srt/entrypoints/openai/serving_completions.py +15 -4
- sglang/srt/entrypoints/openai/serving_embedding.py +9 -4
- sglang/srt/entrypoints/openai/serving_rerank.py +3 -1
- sglang/srt/entrypoints/openai/serving_responses.py +7 -4
- sglang/srt/entrypoints/openai/serving_score.py +1 -0
- sglang/srt/eplb/eplb_manager.py +2 -2
- sglang/srt/eplb/expert_distribution.py +26 -13
- sglang/srt/eplb/expert_location.py +8 -3
- sglang/srt/eplb/expert_location_updater.py +1 -1
- sglang/srt/function_call/base_format_detector.py +3 -6
- sglang/srt/function_call/ebnf_composer.py +11 -9
- sglang/srt/function_call/function_call_parser.py +6 -0
- sglang/srt/function_call/glm4_moe_detector.py +1 -1
- sglang/srt/function_call/gpt_oss_detector.py +1 -1
- sglang/srt/function_call/qwen3_coder_detector.py +1 -1
- sglang/srt/grpc/__init__.py +1 -0
- sglang/srt/grpc/sglang_scheduler_pb2.py +106 -0
- sglang/srt/grpc/sglang_scheduler_pb2.pyi +427 -0
- sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +236 -0
- sglang/srt/hf_transformers_utils.py +4 -0
- sglang/srt/layers/activation.py +142 -9
- sglang/srt/layers/attention/aiter_backend.py +93 -68
- sglang/srt/layers/attention/ascend_backend.py +11 -4
- sglang/srt/layers/attention/fla/chunk.py +242 -0
- sglang/srt/layers/attention/fla/chunk_delta_h.py +314 -0
- sglang/srt/layers/attention/fla/chunk_o.py +178 -0
- sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +151 -0
- sglang/srt/layers/attention/fla/cumsum.py +300 -0
- sglang/srt/layers/attention/fla/fused_recurrent.py +640 -0
- sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +232 -0
- sglang/srt/layers/attention/fla/index.py +37 -0
- sglang/srt/layers/attention/fla/l2norm.py +150 -0
- sglang/srt/layers/attention/fla/layernorm_gated.py +326 -0
- sglang/srt/layers/attention/fla/op.py +66 -0
- sglang/srt/layers/attention/fla/solve_tril.py +465 -0
- sglang/srt/layers/attention/fla/utils.py +331 -0
- sglang/srt/layers/attention/fla/wy_fast.py +158 -0
- sglang/srt/layers/attention/flashinfer_backend.py +6 -4
- sglang/srt/layers/attention/flashinfer_mla_backend.py +16 -12
- sglang/srt/layers/attention/hybrid_attn_backend.py +57 -50
- sglang/srt/layers/attention/hybrid_linear_attn_backend.py +602 -0
- sglang/srt/layers/attention/intel_amx_backend.py +3 -0
- sglang/srt/layers/attention/mamba/causal_conv1d.py +128 -0
- sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +1052 -0
- sglang/srt/layers/attention/mamba/mamba.py +64 -0
- sglang/srt/layers/attention/torch_native_backend.py +12 -6
- sglang/srt/layers/attention/triton_backend.py +18 -1
- sglang/srt/layers/attention/trtllm_mla_backend.py +124 -31
- sglang/srt/layers/attention/wave_ops/decode_attention.py +2 -4
- sglang/srt/layers/attention/wave_ops/extend_attention.py +1 -3
- sglang/srt/layers/communicator.py +45 -7
- sglang/srt/layers/dp_attention.py +30 -1
- sglang/srt/layers/layernorm.py +32 -15
- sglang/srt/layers/linear.py +34 -3
- sglang/srt/layers/logits_processor.py +29 -10
- sglang/srt/layers/moe/__init__.py +2 -1
- sglang/srt/layers/moe/cutlass_w4a8_moe.py +3 -3
- sglang/srt/layers/moe/ep_moe/kernels.py +1 -1
- sglang/srt/layers/moe/ep_moe/layer.py +182 -62
- sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +156 -0
- sglang/srt/layers/moe/fused_moe_native.py +5 -3
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=128,N=352,device_name=NVIDIA_RTX_5880_Ada_Generation,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=512,device_name=NVIDIA_H20.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/{E=128,N=768,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json → E=257,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json} +29 -29
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H20-3e.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H20-3e.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=64,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +5 -2
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +1 -1
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +23 -20
- sglang/srt/layers/moe/fused_moe_triton/layer.py +61 -59
- sglang/srt/layers/moe/moe_runner/__init__.py +2 -1
- sglang/srt/layers/moe/moe_runner/base.py +274 -1
- sglang/srt/layers/moe/moe_runner/runner.py +80 -0
- sglang/srt/layers/moe/moe_runner/triton.py +448 -0
- sglang/srt/layers/moe/token_dispatcher/__init__.py +16 -4
- sglang/srt/layers/moe/token_dispatcher/{base_dispatcher.py → base.py} +67 -17
- sglang/srt/layers/moe/token_dispatcher/deepep.py +43 -39
- sglang/srt/layers/moe/token_dispatcher/standard.py +44 -2
- sglang/srt/layers/moe/topk.py +30 -9
- sglang/srt/layers/moe/utils.py +12 -7
- sglang/srt/layers/quantization/awq.py +19 -7
- sglang/srt/layers/quantization/base_config.py +11 -6
- sglang/srt/layers/quantization/blockwise_int8.py +38 -27
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +50 -30
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +13 -1
- sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +27 -0
- sglang/srt/layers/quantization/fp8.py +76 -47
- sglang/srt/layers/quantization/fp8_utils.py +50 -31
- sglang/srt/layers/quantization/gptq.py +25 -17
- sglang/srt/layers/quantization/modelopt_quant.py +182 -49
- sglang/srt/layers/quantization/moe_wna16.py +21 -18
- sglang/srt/layers/quantization/mxfp4.py +68 -41
- sglang/srt/layers/quantization/quark/quark_moe.py +32 -27
- sglang/srt/layers/quantization/quark/schemes/quark_w4a4_mxfp4.py +49 -30
- sglang/srt/layers/quantization/quark/utils.py +97 -0
- sglang/srt/layers/quantization/rocm_mxfp4_utils.py +13 -0
- sglang/srt/layers/quantization/unquant.py +135 -47
- sglang/srt/layers/quantization/w4afp8.py +30 -17
- sglang/srt/layers/quantization/w8a8_fp8.py +35 -20
- sglang/srt/layers/quantization/w8a8_int8.py +76 -38
- sglang/srt/layers/rocm_linear_utils.py +44 -0
- sglang/srt/layers/rotary_embedding.py +0 -18
- sglang/srt/layers/sampler.py +162 -18
- sglang/srt/lora/backend/base_backend.py +50 -8
- sglang/srt/lora/backend/triton_backend.py +90 -2
- sglang/srt/lora/layers.py +32 -0
- sglang/srt/lora/lora.py +4 -1
- sglang/srt/lora/lora_manager.py +35 -112
- sglang/srt/lora/mem_pool.py +24 -10
- sglang/srt/lora/utils.py +18 -9
- sglang/srt/managers/async_dynamic_batch_tokenizer.py +170 -0
- sglang/srt/managers/cache_controller.py +200 -199
- sglang/srt/managers/data_parallel_controller.py +105 -35
- sglang/srt/managers/detokenizer_manager.py +8 -4
- sglang/srt/managers/disagg_service.py +46 -0
- sglang/srt/managers/io_struct.py +199 -12
- sglang/srt/managers/mm_utils.py +1 -0
- sglang/srt/managers/multi_tokenizer_mixin.py +351 -397
- sglang/srt/managers/schedule_batch.py +77 -56
- sglang/srt/managers/schedule_policy.py +4 -3
- sglang/srt/managers/scheduler.py +191 -139
- sglang/srt/managers/scheduler_metrics_mixin.py +116 -9
- sglang/srt/managers/scheduler_output_processor_mixin.py +55 -11
- sglang/srt/managers/scheduler_profiler_mixin.py +1 -1
- sglang/srt/managers/template_manager.py +3 -3
- sglang/srt/managers/tokenizer_communicator_mixin.py +569 -0
- sglang/srt/managers/tokenizer_manager.py +260 -519
- sglang/srt/managers/tp_worker.py +53 -4
- sglang/srt/managers/tp_worker_overlap_thread.py +42 -19
- sglang/srt/mem_cache/allocator.py +1 -1
- sglang/srt/mem_cache/hicache_storage.py +18 -33
- sglang/srt/mem_cache/hiradix_cache.py +108 -48
- sglang/srt/mem_cache/memory_pool.py +347 -48
- sglang/srt/mem_cache/memory_pool_host.py +121 -57
- sglang/srt/mem_cache/radix_cache.py +0 -2
- sglang/srt/mem_cache/storage/hf3fs/hf3fs_client.py +164 -0
- sglang/srt/mem_cache/storage/hf3fs/{client_hf3fs.py → hf3fs_usrbio_client.py} +5 -1
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +95 -5
- sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +280 -0
- sglang/srt/mem_cache/storage/lmcache/unit_test.py +121 -0
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +81 -20
- sglang/srt/mem_cache/storage/mooncake_store/test_mooncake_store.py +161 -0
- sglang/srt/mem_cache/swa_radix_cache.py +0 -2
- sglang/srt/metrics/collector.py +502 -77
- sglang/srt/metrics/startup_func_log_and_timer.py +150 -0
- sglang/srt/metrics/utils.py +48 -0
- sglang/srt/model_executor/cpu_graph_runner.py +640 -0
- sglang/srt/model_executor/cuda_graph_runner.py +13 -5
- sglang/srt/model_executor/forward_batch_info.py +75 -19
- sglang/srt/model_executor/model_runner.py +357 -30
- sglang/srt/model_loader/__init__.py +9 -3
- sglang/srt/model_loader/loader.py +128 -4
- sglang/srt/model_loader/weight_utils.py +2 -1
- sglang/srt/models/apertus.py +686 -0
- sglang/srt/models/bailing_moe.py +798 -218
- sglang/srt/models/bailing_moe_nextn.py +168 -0
- sglang/srt/models/deepseek_v2.py +346 -48
- sglang/srt/models/dots_vlm.py +174 -0
- sglang/srt/models/dots_vlm_vit.py +337 -0
- sglang/srt/models/ernie4.py +1 -1
- sglang/srt/models/gemma3n_mm.py +1 -1
- sglang/srt/models/glm4_moe.py +11 -2
- sglang/srt/models/glm4v.py +4 -2
- sglang/srt/models/glm4v_moe.py +3 -0
- sglang/srt/models/gpt_oss.py +1 -1
- sglang/srt/models/internvl.py +28 -0
- sglang/srt/models/llama4.py +9 -0
- sglang/srt/models/llama_eagle3.py +13 -0
- sglang/srt/models/longcat_flash.py +2 -2
- sglang/srt/models/minicpmv.py +165 -3
- sglang/srt/models/mllama4.py +25 -0
- sglang/srt/models/opt.py +637 -0
- sglang/srt/models/qwen2.py +7 -0
- sglang/srt/models/qwen2_5_vl.py +27 -3
- sglang/srt/models/qwen2_moe.py +60 -13
- sglang/srt/models/qwen3.py +8 -2
- sglang/srt/models/qwen3_moe.py +40 -9
- sglang/srt/models/qwen3_next.py +1042 -0
- sglang/srt/models/qwen3_next_mtp.py +112 -0
- sglang/srt/models/step3_vl.py +1 -1
- sglang/srt/models/torch_native_llama.py +1 -1
- sglang/srt/multimodal/processors/dots_vlm.py +99 -0
- sglang/srt/multimodal/processors/glm4v.py +9 -9
- sglang/srt/multimodal/processors/internvl.py +141 -129
- sglang/srt/multimodal/processors/qwen_vl.py +15 -5
- sglang/srt/offloader.py +27 -3
- sglang/srt/{reasoning_parser.py → parser/reasoning_parser.py} +1 -1
- sglang/srt/remote_instance_weight_loader_utils.py +69 -0
- sglang/srt/sampling/sampling_batch_info.py +18 -15
- sglang/srt/server_args.py +355 -37
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +5 -0
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +10 -1
- sglang/srt/speculative/eagle_utils.py +0 -2
- sglang/srt/speculative/eagle_worker.py +197 -112
- sglang/srt/speculative/spec_info.py +5 -0
- sglang/srt/speculative/standalone_worker.py +109 -0
- sglang/srt/tracing/trace.py +552 -0
- sglang/srt/utils.py +46 -3
- sglang/srt/weight_sync/utils.py +1 -1
- sglang/test/attention/test_trtllm_mla_backend.py +169 -5
- sglang/test/few_shot_gsm8k.py +1 -0
- sglang/test/runners.py +4 -0
- sglang/test/test_cutlass_moe.py +24 -6
- sglang/test/test_disaggregation_utils.py +66 -0
- sglang/test/test_fp4_moe.py +370 -1
- sglang/test/test_utils.py +28 -1
- sglang/utils.py +12 -0
- sglang/version.py +1 -1
- {sglang-0.5.2rc1.dist-info → sglang-0.5.3rc0.dist-info}/METADATA +59 -123
- {sglang-0.5.2rc1.dist-info → sglang-0.5.3rc0.dist-info}/RECORD +263 -200
- sglang/srt/disaggregation/launch_lb.py +0 -118
- sglang/srt/mem_cache/storage/mooncake_store/unit_test.py +0 -40
- /sglang/srt/{model_parallel.py → layers/model_parallel.py} +0 -0
- /sglang/srt/{code_completion_parser.py → parser/code_completion_parser.py} +0 -0
- /sglang/srt/{conversation.py → parser/conversation.py} +0 -0
- /sglang/srt/{harmony_parser.py → parser/harmony_parser.py} +0 -0
- /sglang/srt/{jinja_template_utils.py → parser/jinja_template_utils.py} +0 -0
- {sglang-0.5.2rc1.dist-info → sglang-0.5.3rc0.dist-info}/WHEEL +0 -0
- {sglang-0.5.2rc1.dist-info → sglang-0.5.3rc0.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.2rc1.dist-info → sglang-0.5.3rc0.dist-info}/top_level.txt +0 -0
sglang/srt/server_args.py
CHANGED
@@ -19,14 +19,16 @@ import json
 import logging
 import os
 import random
+import socket
 import sys
 import tempfile
 from typing import List, Literal, Optional, Union

+from sglang.srt.connector import ConnectorType
 from sglang.srt.function_call.function_call_parser import FunctionCallParser
 from sglang.srt.hf_transformers_utils import check_gguf_file, get_config
 from sglang.srt.lora.lora_registry import LoRARef
-from sglang.srt.reasoning_parser import ReasoningParser
+from sglang.srt.parser.reasoning_parser import ReasoningParser
 from sglang.srt.utils import (
     LORA_TARGET_ALL_MODULES,
     SUPPORTED_LORA_TARGET_MODULES,
@@ -36,14 +38,18 @@ from sglang.srt.utils import (
     is_cuda,
     is_flashinfer_available,
     is_hip,
+    is_npu,
     is_port_available,
     is_remote_url,
     is_sm90_supported,
     is_sm100_supported,
     is_triton_kernels_available,
     is_valid_ipv6_address,
+    json_list_type,
     nullable_str,
+    parse_connector_type,
 )
+from sglang.utils import is_in_ci

 logger = logging.getLogger(__name__)

@@ -60,6 +66,7 @@ LOAD_FORMAT_CHOICES = [
     "bitsandbytes",
     "layered",
     "remote",
+    "remote_instance",
 ]

 QUANTIZATION_CHOICES = [
@@ -94,6 +101,7 @@ ATTENTION_BACKEND_CHOICES = [
     "trtllm_mla",
     "trtllm_mha",
     "dual_chunk_flash_attn",
+    "hybrid_linear_attn",
     # AMD specific
     "aiter",
     "wave",
@@ -104,6 +112,8 @@ ATTENTION_BACKEND_CHOICES = [

 DISAGG_TRANSFER_BACKEND_CHOICES = ["mooncake", "nixl", "ascend", "fake"]

+GRAMMAR_BACKEND_CHOICES = ["xgrammar", "outlines", "llguidance", "none"]
+

 # Allow external code to add more choices
 def add_load_format_choices(choices):
@@ -122,6 +132,10 @@ def add_disagg_transfer_backend_choices(choices):
     DISAGG_TRANSFER_BACKEND_CHOICES.extend(choices)


+def add_grammar_backend_choices(choices):
+    GRAMMAR_BACKEND_CHOICES.extend(choices)
+
+
 @dataclasses.dataclass
 class ServerArgs:
     # Model and tokenizer
@@ -191,14 +205,20 @@ class ServerArgs:
     show_time_cost: bool = False
     enable_metrics: bool = False
     enable_metrics_for_all_schedulers: bool = False
+    tokenizer_metrics_custom_labels_header: str = "x-customer-labels"
+    tokenizer_metrics_allowed_customer_labels: Optional[List[str]] = None
     bucket_time_to_first_token: Optional[List[float]] = None
     bucket_inter_token_latency: Optional[List[float]] = None
     bucket_e2e_request_latency: Optional[List[float]] = None
     collect_tokens_histogram: bool = False
+    prompt_tokens_buckets: Optional[List[str]] = None
+    generation_tokens_buckets: Optional[List[str]] = None
     decode_log_interval: int = 40
     enable_request_time_stats_logging: bool = False
     kv_events_config: Optional[str] = None
     gc_warning_threshold_secs: float = 0.0
+    enable_trace: bool = False
+    oltp_traces_endpoint: str = "localhost:4317"

     # API related
     api_key: Optional[str] = None
@@ -215,6 +235,9 @@ class ServerArgs:
     # Data parallelism
     dp_size: int = 1
     load_balance_method: str = "round_robin"
+    load_watch_interval: float = 0.1
+    # FIXME: remove this after dp rank scheduling is fully supported with PD-Disaggregation
+    prefill_round_robin_balance: bool = False

     # Multi-node distributed serving
     dist_init_addr: Optional[str] = None
@@ -247,12 +270,14 @@ class ServerArgs:
     # Speculative decoding
     speculative_algorithm: Optional[str] = None
     speculative_draft_model_path: Optional[str] = None
+    speculative_draft_model_revision: Optional[str] = None
     speculative_num_steps: Optional[int] = None
     speculative_eagle_topk: Optional[int] = None
     speculative_num_draft_tokens: Optional[int] = None
     speculative_accept_threshold_single: float = 1.0
     speculative_accept_threshold_acc: float = 1.0
     speculative_token_map: Optional[str] = None
+    speculative_attention_mode: str = "prefill"

     # Expert parallelism
     ep_size: int = 1
@@ -294,6 +319,8 @@ class ServerArgs:
     hicache_storage_backend: Optional[str] = None
     hicache_storage_prefetch_policy: str = "best_effort"
     hicache_storage_backend_extra_config: Optional[str] = None
+    # LMCache
+    enable_lmcache: bool = False

     # Double Sparsity
     enable_double_sparsity: bool = False
@@ -338,6 +365,7 @@ class ServerArgs:
     enable_p2p_check: bool = False
     triton_attention_reduce_in_fp32: bool = False
     triton_attention_num_kv_splits: int = 8
+    triton_attention_split_tile_size: Optional[int] = None
     num_continuous_decode_steps: int = 1
     delete_ckpt_after_loading: bool = False
     enable_memory_saver: bool = False
@@ -349,6 +377,12 @@ class ServerArgs:
     disable_fast_image_processor: bool = False
     enable_return_hidden_states: bool = False
     scheduler_recv_interval: int = 1
+    numa_node: Optional[List[int]] = None
+
+    # Dynamic batch tokenizer
+    enable_dynamic_batch_tokenizer: bool = False
+    dynamic_batch_tokenizer_batch_size: int = 32
+    dynamic_batch_tokenizer_batch_timeout: float = 0.002

     # Debug tensor dumps
     debug_tensor_dump_output_folder: Optional[str] = None
@@ -357,7 +391,7 @@ class ServerArgs:
     debug_tensor_dump_prefill_only: bool = False

     # PD disaggregation: can be "null" (not disaggregated), "prefill" (prefill-only), or "decode" (decode-only)
-    disaggregation_mode: str = "null"
+    disaggregation_mode: Literal["null", "prefill", "decode"] = "null"
     disaggregation_transfer_backend: str = "mooncake"
     disaggregation_bootstrap_port: int = 8998
     disaggregation_decode_tp: Optional[int] = None
@@ -365,20 +399,32 @@ class ServerArgs:
     disaggregation_prefill_pp: Optional[int] = 1
     disaggregation_ib_device: Optional[str] = None
     num_reserved_decode_tokens: int = 512  # used for decode kv cache offload in PD
-
+
+    # FIXME: hack to reduce ITL when decode bs is small
+    disaggregation_decode_polling_interval: int = 1

     # For model weight update
     custom_weight_loader: Optional[List[str]] = None
     weight_loader_disable_mmap: bool = False

+    # Remote instance weight loading
+    remote_instance_weight_loader_seed_instance_ip: Optional[str] = None
+    remote_instance_weight_loader_seed_instance_service_port: Optional[int] = None
+    remote_instance_weight_loader_send_weights_group_ports: Optional[List[int]] = None
+
     # For PD-Multiplexing
     enable_pdmux: bool = False
     sm_group_num: int = 3

+    # Mamba cache
+    max_mamba_cache_size: Optional[int] = None
+    mamba_ssm_dtype: str = "float32"
+
     # Deprecated arguments
     enable_ep_moe: bool = False
     enable_deepep_moe: bool = False
     enable_flashinfer_cutlass_moe: bool = False
+    enable_flashinfer_cutedsl_moe: bool = False
     enable_flashinfer_trtllm_moe: bool = False
     enable_triton_kernel_moe: bool = False
     enable_flashinfer_mxfp4_moe: bool = False
@@ -400,6 +446,11 @@ class ServerArgs:
             print_deprecated_warning(
                 "NOTE: --enable-triton-kernel-moe is deprecated. Please set `--moe-runner-backend` to 'triton_kernel' instead."
             )
+        if self.enable_flashinfer_cutedsl_moe:
+            self.moe_runner_backend = "flashinfer_cutedsl"
+            print_deprecated_warning(
+                "NOTE: --enable-flashinfer-cutedsl-moe is deprecated. Please set `--moe-runner-backend` to 'flashinfer_cutedsl' instead."
+            )
         if self.enable_flashinfer_cutlass_moe:
             self.moe_runner_backend = "flashinfer_cutlass"
             print_deprecated_warning(
@@ -419,6 +470,7 @@ class ServerArgs:
         # Set missing default values
         if self.tokenizer_path is None:
             self.tokenizer_path = self.model_path
+
         if self.served_model_name is None:
             self.served_model_name = self.model_path
         if self.device is None:
@@ -462,9 +514,14 @@ class ServerArgs:
             # B200, MI300. (chunked_prefill_size 16k, cuda_graph_max_bs 512)
             reserved_mem = 32 * 1024

+            # draft model and larger cuda graph buffers
             if self.speculative_algorithm is not None:
-                # draft model and larger cuda graph buffers
-                reserved_mem += 2 * 1024
+                if self.speculative_algorithm == "STANDALONE":
+                    # Standalone speculative decoding needs more memory than other speculative
+                    # decoding algorithms since the draft model is typically larger.
+                    reserved_mem += 6 * 1024
+                else:
+                    reserved_mem += 2 * 1024
             if self.enable_dp_attention:
                 reserved_mem += 4 * 1024

@@ -507,7 +564,8 @@ class ServerArgs:
             self.sampling_backend = "pytorch"

         # Model-specific adjustments
-        self.model_specific_adjustments()
+        if parse_connector_type(self.model_path) != ConnectorType.INSTANCE:
+            self.model_specific_adjustments()

         # Set kernel backends
         if self.device == "cpu":
@@ -526,7 +584,7 @@ class ServerArgs:
             )
             self.disable_cuda_graph = True

-        if self.attention_backend == "ascend":
+        if is_npu() and self.attention_backend in ["ascend", "hybrid_linear_attn"]:
             logger.warning(
                 "At this moment Ascend attention backend only supports a page_size of 128, change page_size to 128."
             )
@@ -606,12 +664,13 @@ class ServerArgs:
         if self.grammar_backend is None:
             self.grammar_backend = "xgrammar"

+        if self.dp_size == 1:
+            self.enable_dp_attention = False
+            self.enable_dp_lm_head = False
+
         # Data parallelism attention
         if self.enable_dp_attention:
             self.schedule_conservativeness = self.schedule_conservativeness * 0.3
-            assert (
-                self.dp_size > 1
-            ), "Please set a dp-size > 1. You can use 1 < dp-size <= tp-size "
             assert self.tp_size % self.dp_size == 0
             self.chunked_prefill_size = self.chunked_prefill_size // self.dp_size
             logger.warning(
@@ -634,11 +693,13 @@ class ServerArgs:
         ], "The expert parallel size must be 1 or the same as the tensor parallel size"

         if self.moe_runner_backend == "flashinfer_trtllm":
-
-            self.
-
-
-
+            assert (
+                self.quantization == "modelopt_fp4" or self.quantization == "fp8"
+            ), "modelopt_fp4 quantization is required for Flashinfer TRTLLM MoE"
+            self.disable_shared_experts_fusion = True
+            logger.warning(
+                "FlashInfer TRTLLM MoE is enabled. --disable-shared-experts-fusion is automatically set."
+            )

         # DeepEP MoE
         if self.moe_a2a_backend == "deepep":
@@ -688,12 +749,24 @@ class ServerArgs:
             self.hicache_io_backend = "kernel"
             self.hicache_mem_layout = "page_first"

+        if self.hicache_mem_layout == "page_first_direct":
+            if self.hicache_io_backend != "direct":
+                self.hicache_io_backend = "direct"
+                logger.warning(
+                    "Page first direct layout only support direct io backend"
+                )
+
         # Speculative Decoding
         if self.speculative_algorithm == "NEXTN":
             # NEXTN shares the same implementation of EAGLE
             self.speculative_algorithm = "EAGLE"

-        if self.speculative_algorithm in ("EAGLE", "EAGLE3"):
+        if self.speculative_algorithm in ("EAGLE", "EAGLE3", "STANDALONE"):
+            if self.speculative_algorithm == "STANDALONE":
+                # TODO: support dp attention for standalone speculative decoding
+                assert (
+                    self.enable_dp_attention is False
+                ), "Currently standalone speculative decoding does not support dp attention."
             if self.max_running_requests is None:
                 self.max_running_requests = 48
             self.disable_overlap_schedule = True
@@ -709,7 +782,12 @@ class ServerArgs:
             )

         model_arch = self.get_hf_config().architectures[0]
-        if model_arch in [
+        if model_arch in [
+            "DeepseekV3ForCausalLM",
+            "Glm4MoeForCausalLM",
+            "BailingMoeForCausalLM",
+            "BailingMoeV2ForCausalLM",
+        ]:
             # Auto set draft_model_path DeepSeek-V3/R1
             if self.speculative_draft_model_path is None:
                 self.speculative_draft_model_path = self.model_path
@@ -768,12 +846,19 @@ class ServerArgs:
         ) and check_gguf_file(self.model_path):
             self.quantization = self.load_format = "gguf"

-        # Model loading
         if is_remote_url(self.model_path):
             self.load_format = "remote"
         if self.custom_weight_loader is None:
             self.custom_weight_loader = []

+        if self.load_format == "remote_instance":
+            if (
+                self.remote_instance_weight_loader_seed_instance_ip is None
+                or self.remote_instance_weight_loader_seed_instance_service_port is None
+                or self.remote_instance_weight_loader_send_weights_group_ports is None
+            ):
+                self.load_format = "auto"
+
         # PD disaggregation
         if self.disaggregation_mode == "decode":
             assert (
@@ -785,6 +870,13 @@ class ServerArgs:

             self.disable_radix_cache = True
             logger.warning("KV cache is forced as chunk cache for decode server")
+
+            if self.dp_size > 1 and not is_in_ci():
+                assert self.prefill_round_robin_balance, (
+                    "Prefill round robin balance is required when dp size > 1. "
+                    "Please make sure that the prefill instance is launched with `--load-balance-method round_robin`"
+                    " and `--prefill-round-robin-balance` is set for decode server."
+                )
         elif self.disaggregation_mode == "prefill":
             if self.disaggregation_decode_tp is None:
                 self.disaggregation_decode_tp = self.tp_size
@@ -797,10 +889,19 @@ class ServerArgs:
             self.disable_cuda_graph = True
             logger.warning("Cuda graph is disabled for prefill server")

+        # Validation: prevent both tokenizer batching features from being enabled
+        if self.enable_tokenizer_batch_encode and self.enable_dynamic_batch_tokenizer:
+            raise ValueError(
+                "Cannot enable both --enable-tokenizer-batch-encode and --enable-dynamic-batch-tokenizer. "
+                "Please choose one tokenizer batching approach."
+            )
+
         # Propagate env vars
         os.environ["SGLANG_ENABLE_TORCH_COMPILE"] = (
             "1" if self.enable_torch_compile else "0"
         )
+        os.environ["SGLANG_MAMBA_SSM_DTYPE"] = self.mamba_ssm_dtype
+
         # Set env var before grammar backends init
         os.environ["SGLANG_DISABLE_OUTLINES_DISK_CACHE"] = (
             "1" if self.disable_outlines_disk_cache else "0"
@@ -812,6 +913,14 @@ class ServerArgs:
                 "and cannot be used at the same time. Please use only one of them."
             )

+        if (
+            not self.tokenizer_metrics_custom_labels_header
+            and self.tokenizer_metrics_allowed_customer_labels
+        ):
+            raise ValueError(
+                "Please set --tokenizer-metrics-custom-labels-header when setting --tokenizer-metrics-allowed-customer-labels."
+            )
+
     @staticmethod
     def add_cli_args(parser: argparse.ArgumentParser):
         # Model and tokenizer
@@ -823,16 +932,28 @@ class ServerArgs:
             required=True,
         )
         parser.add_argument(
-            "--tokenizer-path",
+            "--remote-instance-weight-loader-seed-instance-ip",
             type=str,
-            default=ServerArgs.tokenizer_path,
-            help="The path of the tokenizer.",
+            default=ServerArgs.remote_instance_weight_loader_seed_instance_ip,
+            help="The ip of the seed instance for loading weights from remote instance.",
         )
         parser.add_argument(
-            "--tokenizer-worker-num",
+            "--remote-instance-weight-loader-seed-instance-service-port",
             type=int,
-            default=ServerArgs.tokenizer_worker_num,
-            help="The worker num of the tokenizer manager.",
+            default=ServerArgs.remote_instance_weight_loader_seed_instance_service_port,
+            help="The service port of the seed instance for loading weights from remote instance.",
+        )
+        parser.add_argument(
+            "--remote-instance-weight-loader-send-weights-group-ports",
+            type=json_list_type,
+            default=ServerArgs.remote_instance_weight_loader_send_weights_group_ports,
+            help="The communication group ports for loading weights from remote instance.",
+        )
+        parser.add_argument(
+            "--tokenizer-path",
+            type=str,
+            default=ServerArgs.tokenizer_path,
+            help="The path of the tokenizer.",
         )
         parser.add_argument(
             "--tokenizer-mode",
@@ -843,6 +964,12 @@ class ServerArgs:
             "tokenizer if available, and 'slow' will "
             "always use the slow tokenizer.",
         )
+        parser.add_argument(
+            "--tokenizer-worker-num",
+            type=int,
+            default=ServerArgs.tokenizer_worker_num,
+            help="The worker num of the tokenizer manager.",
+        )
         parser.add_argument(
             "--skip-tokenizer-init",
             action="store_true",
@@ -1033,7 +1160,7 @@ class ServerArgs:
             "--schedule-policy",
             type=str,
             default=ServerArgs.schedule_policy,
-            choices=["lpm", "random", "fcfs", "dfs-weight", "lof"],
+            choices=["lpm", "random", "fcfs", "dfs-weight", "lof", "priority"],
             help="The scheduling policy of the requests.",
         )
         parser.add_argument(
@@ -1207,6 +1334,21 @@ class ServerArgs:
             "to record request metrics separately. This is especially useful when dp_attention is enabled, as "
             "otherwise all metrics appear to come from TP 0.",
         )
+        parser.add_argument(
+            "--tokenizer-metrics-custom-labels-header",
+            type=str,
+            default=ServerArgs.tokenizer_metrics_custom_labels_header,
+            help="Specify the HTTP header for passing customer labels for tokenizer metrics.",
+        )
+        parser.add_argument(
+            "--tokenizer-metrics-allowed-customer-labels",
+            type=str,
+            nargs="+",
+            default=ServerArgs.tokenizer_metrics_allowed_customer_labels,
+            help="The customer labels allowed for tokenizer metrics. The labels are specified via a dict in "
+            "'--tokenizer-metrics-custom-labels-header' field in HTTP requests, e.g., {'label1': 'value1', 'label2': "
+            "'value2'} is allowed if '--tokenizer-metrics-allowed-labels label1 label2' is set.",
+        )
         parser.add_argument(
             "--bucket-time-to-first-token",
             type=float,
@@ -1234,6 +1376,26 @@ class ServerArgs:
             default=ServerArgs.collect_tokens_histogram,
             help="Collect prompt/generation tokens histogram.",
         )
+        bucket_rule = (
+            "Supports 3 rule types: 'default' uses predefined buckets; 'tse <middle> <base> <count>' "
+            "generates two sides exponential distributed buckets (e.g., 'tse 1000 2 8' generates buckets "
+            "[984.0, 992.0, 996.0, 998.0, 1000.0, 1002.0, 1004.0, 1008.0, 1016.0]).); 'customer <value1> "
+            "<value2> ...' uses custom bucket values (e.g., 'customer 10 50 100 500')."
+        )
+        parser.add_argument(
+            "--prompt-tokens-buckets",
+            type=str,
+            nargs="+",
+            default=ServerArgs.prompt_tokens_buckets,
+            help=f"The buckets rule of prompt tokens. {bucket_rule}",
+        )
+        parser.add_argument(
+            "--generation-tokens-buckets",
+            type=str,
+            nargs="+",
+            default=ServerArgs.generation_tokens_buckets,
+            help=f"The buckets rule for generation tokens histogram. {bucket_rule}",
+        )
         parser.add_argument(
             "--gc-warning-threshold-secs",
             type=float,
@@ -1258,6 +1420,17 @@ class ServerArgs:
             default=None,
             help="Config in json format for NVIDIA dynamo KV event publishing. Publishing will be enabled if this flag is used.",
         )
+        parser.add_argument(
+            "--enable-trace",
+            action="store_true",
+            help="Enable opentelemetry trace",
+        )
+        parser.add_argument(
+            "--oltp-traces-endpoint",
+            type=str,
+            default="localhost:4317",
+            help="Config opentelemetry collector endpoint if --enable-trace is set. format: <ip>:<port>",
+        )

         # API related
         parser.add_argument(
@@ -1342,6 +1515,18 @@ class ServerArgs:
                 "minimum_tokens",
             ],
         )
+        parser.add_argument(
+            "--load-watch-interval",
+            type=float,
+            default=ServerArgs.load_watch_interval,
+            help="The interval of load watching in seconds.",
+        )
+        parser.add_argument(
+            "--prefill-round-robin-balance",
+            default=ServerArgs.prefill_round_robin_balance,
+            action="store_true",
+            help="Prefill is round robin balanced. This is used to promise decode server can get the correct dp rank.",
+        )

         # Multi-node distributed serving
         parser.add_argument(
@@ -1452,7 +1637,7 @@ class ServerArgs:
         parser.add_argument(
             "--grammar-backend",
             type=str,
-            choices=["xgrammar", "outlines", "llguidance", "none"],
+            choices=GRAMMAR_BACKEND_CHOICES,
             default=ServerArgs.grammar_backend,
             help="Choose the backend for grammar-guided decoding.",
         )
@@ -1468,14 +1653,23 @@ class ServerArgs:
         parser.add_argument(
             "--speculative-algorithm",
             type=str,
-            choices=["EAGLE", "EAGLE3", "NEXTN"],
+            choices=["EAGLE", "EAGLE3", "NEXTN", "STANDALONE"],
             help="Speculative algorithm.",
         )
         parser.add_argument(
             "--speculative-draft-model-path",
+            "--speculative-draft-model",
             type=str,
             help="The path of the draft model weights. This can be a local folder or a Hugging Face repo ID.",
         )
+        parser.add_argument(
+            "--speculative-draft-model-revision",
+            type=str,
+            default=None,
+            help="The specific draft model version to use. It can be a branch "
+            "name, a tag name, or a commit id. If unspecified, will use "
+            "the default version.",
+        )
         parser.add_argument(
             "--speculative-num-steps",
             type=int,
@@ -1512,6 +1706,13 @@ class ServerArgs:
             help="The path of the draft model's small vocab table.",
             default=ServerArgs.speculative_token_map,
         )
+        parser.add_argument(
+            "--speculative-attention-mode",
+            type=str,
+            choices=["prefill", "decode"],
+            help="Attention backend for speculative decoding operations (both target verify and draft extend). Can be one of 'prefill' (default) or 'decode'.",
+            default=ServerArgs.speculative_attention_mode,
+        )

         # Expert parallelism
         parser.add_argument(
@@ -1539,6 +1740,7 @@ class ServerArgs:
                 "flashinfer_trtllm",
                 "flashinfer_cutlass",
                 "flashinfer_mxfp4",
+                "flashinfer_cutedsl",
             ],
             default=ServerArgs.moe_runner_backend,
             help="Choose the runner backend for MoE.",
@@ -1546,7 +1748,7 @@ class ServerArgs:
         parser.add_argument(
             "--flashinfer-mxfp4-moe-precision",
             type=str,
-            choices=["
+            choices=["default", "bf16"],
             default=ServerArgs.flashinfer_mxfp4_moe_precision,
             help="Choose the computation precision of flashinfer mxfp4 moe",
         )
@@ -1639,6 +1841,21 @@ class ServerArgs:
             help="TP size for MoE dense MLP layers. This flag is useful when, with large TP size, there are errors caused by weights in MLP layers having dimension smaller than the min dimension GEMM supports.",
         )

+        # Mamba Cache
+        parser.add_argument(
+            "--max-mamba-cache-size",
+            type=int,
+            default=ServerArgs.max_mamba_cache_size,
+            help="The maximum size of the mamba cache.",
+        )
+        parser.add_argument(
+            "--mamba-ssm-dtype",
+            type=str,
+            default=ServerArgs.mamba_ssm_dtype,
+            choices=["float32", "bfloat16"],
+            help="The data type of the SSM states in mamba cache.",
+        )
+
         # Hierarchical cache
         parser.add_argument(
             "--enable-hierarchical-cache",
@@ -1674,7 +1891,7 @@ class ServerArgs:
         parser.add_argument(
             "--hicache-mem-layout",
             type=str,
-            choices=["layer_first", "page_first"],
+            choices=["layer_first", "page_first", "page_first_direct"],
             default=ServerArgs.hicache_mem_layout,
             help="The layout of host memory pool for hierarchical cache.",
         )
@@ -1698,6 +1915,12 @@ class ServerArgs:
             default=ServerArgs.hicache_storage_backend_extra_config,
             help="A dictionary in JSON string format containing extra configuration for the storage backend.",
         )
+        # LMCache
+        parser.add_argument(
+            "--enable-lmcache",
+            action="store_true",
+            help="Using LMCache as an alternative hierarchical cache solution",
+        )

         # Double Sparsity
         parser.add_argument(
@@ -1911,6 +2134,12 @@ class ServerArgs:
             default=ServerArgs.triton_attention_num_kv_splits,
             help="The number of KV splits in flash decoding Triton kernel. Larger value is better in longer context scenarios. The default value is 8.",
         )
+        parser.add_argument(
+            "--triton-attention-split-tile-size",
+            type=int,
+            default=ServerArgs.triton_attention_split_tile_size,
+            help="The size of split KV tile in flash decoding Triton kernel. Used for deterministic inference.",
+        )
         parser.add_argument(
             "--num-continuous-decode-steps",
             type=int,
@@ -1970,6 +2199,12 @@ class ServerArgs:
             default=ServerArgs.scheduler_recv_interval,
             help="The interval to poll requests in scheduler. Can be set to >1 to reduce the overhead of this.",
         )
+        parser.add_argument(
+            "--numa-node",
+            type=int,
+            nargs="+",
+            help="Sets the numa node for the subprocesses. i-th element corresponds to i-th subprocess.",
+        )

         # Debug tensor dumps
         parser.add_argument(
@@ -1995,12 +2230,29 @@ class ServerArgs:
             action="store_true",
             help="Only dump the tensors for prefill requests (i.e. batch size > 1).",
         )
+        parser.add_argument(
+            "--enable-dynamic-batch-tokenizer",
+            action="store_true",
+            help="Enable async dynamic batch tokenizer for improved performance when multiple requests arrive concurrently.",
+        )
+        parser.add_argument(
+            "--dynamic-batch-tokenizer-batch-size",
+            type=int,
+            default=ServerArgs.dynamic_batch_tokenizer_batch_size,
+            help="[Only used if --enable-dynamic-batch-tokenizer is set] Maximum batch size for dynamic batch tokenizer.",
+        )
+        parser.add_argument(
+            "--dynamic-batch-tokenizer-batch-timeout",
+            type=float,
+            default=ServerArgs.dynamic_batch_tokenizer_batch_timeout,
+            help="[Only used if --enable-dynamic-batch-tokenizer is set] Timeout in seconds for batching tokenization requests.",
+        )

         # PD disaggregation
         parser.add_argument(
             "--disaggregation-mode",
             type=str,
-            default="null",
+            default=ServerArgs.disaggregation_mode,
             choices=["null", "prefill", "decode"],
             help='Only used for PD disaggregation. "prefill" for prefill-only server, and "decode" for decode-only server. If not specified, it is not PD disaggregated',
         )
@@ -2050,10 +2302,10 @@ class ServerArgs:
             help="Number of decode tokens that will have memory reserved when adding new request to the running batch.",
         )
         parser.add_argument(
-            "--
-            type=
-            default=
-            help="The
+            "--disaggregation-decode-polling-interval",
+            type=int,
+            default=ServerArgs.disaggregation_decode_polling_interval,
+            help="The interval to poll requests in decode server. Can be set to >1 to reduce the overhead of this.",
         )

         # Custom weight loader
@@ -2100,6 +2352,11 @@ class ServerArgs:
             action="store_true",
             help="(Deprecated) Enable FlashInfer CUTLASS MoE backend for modelopt_fp4 quant on Blackwell. Supports MoE-EP",
         )
+        parser.add_argument(
+            "--enable-flashinfer-cutedsl-moe",
+            action="store_true",
+            help="(Deprecated) Enable FlashInfer CuteDSL MoE backend for modelopt_fp4 quant on Blackwell. Supports MoE-EP",
+        )
         parser.add_argument(
             "--enable-flashinfer-trtllm-moe",
             action="store_true",
@@ -2122,6 +2379,7 @@ class ServerArgs:
         args.pp_size = args.pipeline_parallel_size
         args.dp_size = args.data_parallel_size
         args.ep_size = args.expert_parallel_size
+
        attrs = [attr.name for attr in dataclasses.fields(cls)]
        return cls(**{attr: getattr(args, attr) for attr in attrs})
@@ -2178,13 +2436,20 @@ class ServerArgs:

         # Check chunked prefill
         # Skip validation if chunked prefill is disabled (i.e., size <= 0).
-        if self.chunked_prefill_size > 0:
+        # Skip validation if disaggregation mode is decode.
+        if self.chunked_prefill_size > 0 and self.disaggregation_mode != "decode":
             assert (
                 self.chunked_prefill_size % self.page_size == 0
             ), "chunked_prefill_size must be divisible by page_size"

         # Check multi tokenizer
         assert self.tokenizer_worker_num > 0, "Tokenizer worker num must >= 1"
+        self.validate_buckets_rule(
+            "--prompt-tokens-buckets", self.prompt_tokens_buckets
+        )
+        self.validate_buckets_rule(
+            "--generation-tokens-buckets", self.generation_tokens_buckets
+        )

     def check_lora_server_args(self):
         assert self.max_loras_per_batch > 0, "max_loras_per_batch must be positive"
@@ -2277,6 +2542,54 @@ class ServerArgs:
             f"decode_tp={decode_tp}, prefill_tp={prefill_tp}"
         )

+    def validate_buckets_rule(self, arg_name: str, buckets_rule: List[str]):
+        if not buckets_rule:
+            return
+
+        assert len(buckets_rule) > 0, f"{arg_name} cannot be empty list"
+        rule = buckets_rule[0]
+        assert rule in [
+            "tse",
+            "default",
+            "customer",
+        ], f"Unsupported {arg_name} rule type: '{rule}'. Must be one of: 'tse', 'default', 'customer'"
+
+        if rule == "tse":
+            assert (
+                len(buckets_rule) == 4
+            ), f"{arg_name} TSE rule requires exactly 4 parameters: ['tse', middle, base, count], got {len(buckets_rule)}"
+            try:
+                middle = float(buckets_rule[1])
+                base = float(buckets_rule[2])
+                count = int(buckets_rule[3])
+            except (ValueError, IndexError):
+                assert (
+                    False
+                ), f"{arg_name} TSE rule parameters must be: ['tse', <float:middle>, <float:base>, <int:count>]"
+            assert base > 1, f"{arg_name} TSE base must be larger than 1, got: {base}"
+            assert count > 0, f"{arg_name} TSE count must be positive, got: {count}"
+            assert middle > 0, f"{arg_name} TSE middle must be positive, got: {middle}"
+
+        elif rule == "default":
+            assert (
+                len(buckets_rule) == 1
+            ), f"{arg_name} default rule should only have one parameter: ['default'], got {len(buckets_rule)}"
+
+        elif rule == "customer":
+            assert (
+                len(buckets_rule) >= 2
+            ), f"{arg_name} customer rule requires at least one bucket value: ['customer', value1, ...]"
+            try:
+                bucket_values = [float(x) for x in buckets_rule[1:]]
+            except ValueError:
+                assert False, f"{arg_name} customer rule bucket values must be numeric"
+            assert len(set(bucket_values)) == len(
+                bucket_values
+            ), f"{arg_name} customer rule bucket values should not contain duplicates"
+            assert all(
+                val >= 0 for val in bucket_values
+            ), f"{arg_name} customer rule bucket values should be non-negative"
+
     def model_specific_adjustments(self):
         hf_config = self.get_hf_config()
         model_arch = hf_config.architectures[0]
@@ -2336,7 +2649,8 @@ class ServerArgs:
             assert self.attention_backend in {
                 "fa3",
                 "aiter",
-            }, "fa3 or aiter is required for Llama4 model"
+                "triton",
+            }, "fa3, aiter, or triton is required for Llama4 model"
         elif model_arch in [
             "Gemma2ForCausalLM",
             "Gemma3ForCausalLM",
@@ -2535,7 +2849,9 @@ def auto_choose_speculative_params(self: ServerArgs):
     """
     hf_config = self.get_hf_config()
     arch = hf_config.architectures[0]
-
+    if self.speculative_algorithm == "STANDALONE":
+        # The default value for standalone speculative decoding
+        return (3, 1, 4)
     if arch in ["LlamaForCausalLM"]:
         # The default value for llama
         return (5, 4, 8)
@@ -2543,6 +2859,8 @@ def auto_choose_speculative_params(self: ServerArgs):
         "DeepseekV3ForCausalLM",
         "DeepseekV2ForCausalLM",
         "GptOssForCausalLM",
+        "BailingMoeForCausalLM",
+        "BailingMoeV2ForCausalLM",
     ]:
         # The default value for deepseek and gpt-oss
         return (3, 1, 4)