sglang 0.5.3rc0__py3-none-any.whl → 0.5.3rc2__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as published to one of the supported registries. It is provided for informational purposes only and reflects the package versions exactly as they appear in their public registry.
- sglang/bench_one_batch.py +7 -9
- sglang/bench_one_batch_server.py +321 -31
- sglang/bench_serving.py +10 -3
- sglang/global_config.py +2 -2
- sglang/lang/backend/runtime_endpoint.py +1 -1
- sglang/launch_server.py +14 -0
- sglang/profiler.py +2 -2
- sglang/srt/batch_invariant_ops/__init__.py +27 -0
- sglang/srt/batch_invariant_ops/batch_invariant_ops.py +549 -0
- sglang/srt/configs/__init__.py +4 -0
- sglang/srt/configs/dots_ocr.py +64 -0
- sglang/srt/configs/falcon_h1.py +360 -0
- sglang/srt/configs/load_config.py +8 -0
- sglang/srt/configs/model_config.py +160 -105
- sglang/srt/configs/qwen3_vl.py +586 -0
- sglang/srt/constrained/base_grammar_backend.py +1 -0
- sglang/srt/constrained/outlines_jump_forward.py +1 -1
- sglang/srt/constrained/xgrammar_backend.py +6 -4
- sglang/srt/debug_utils/dumper.py +10 -3
- sglang/srt/disaggregation/ascend/conn.py +2 -2
- sglang/srt/disaggregation/ascend/transfer_engine.py +47 -9
- sglang/srt/disaggregation/common/conn.py +266 -98
- sglang/srt/disaggregation/decode.py +50 -9
- sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +25 -16
- sglang/srt/disaggregation/mooncake/conn.py +51 -541
- sglang/srt/disaggregation/nixl/conn.py +148 -39
- sglang/srt/disaggregation/prefill.py +31 -14
- sglang/srt/disaggregation/utils.py +36 -5
- sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
- sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
- sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
- sglang/srt/distributed/parallel_state.py +135 -80
- sglang/srt/entrypoints/engine.py +23 -3
- sglang/srt/entrypoints/grpc_request_manager.py +330 -55
- sglang/srt/entrypoints/grpc_server.py +232 -102
- sglang/srt/entrypoints/http_server.py +49 -9
- sglang/srt/entrypoints/openai/protocol.py +110 -5
- sglang/srt/entrypoints/openai/serving_base.py +25 -6
- sglang/srt/entrypoints/openai/serving_chat.py +178 -49
- sglang/srt/entrypoints/openai/serving_completions.py +5 -3
- sglang/srt/entrypoints/openai/serving_embedding.py +1 -0
- sglang/srt/entrypoints/openai/serving_responses.py +42 -0
- sglang/srt/environ.py +285 -0
- sglang/srt/eplb/expert_location.py +30 -5
- sglang/srt/function_call/function_call_parser.py +3 -2
- sglang/srt/function_call/glm4_moe_detector.py +3 -3
- sglang/srt/function_call/gpt_oss_detector.py +23 -0
- sglang/srt/function_call/json_array_parser.py +63 -0
- sglang/srt/function_call/kimik2_detector.py +17 -4
- sglang/srt/function_call/utils.py +96 -5
- sglang/srt/grpc/compile_proto.py +245 -0
- sglang/srt/grpc/sglang_scheduler_pb2.py +73 -68
- sglang/srt/grpc/sglang_scheduler_pb2.pyi +60 -53
- sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +3 -0
- sglang/srt/layers/activation.py +7 -6
- sglang/srt/layers/attention/aiter_backend.py +14 -15
- sglang/srt/layers/attention/ascend_backend.py +108 -9
- sglang/srt/layers/attention/attention_registry.py +206 -0
- sglang/srt/layers/attention/base_attn_backend.py +12 -3
- sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
- sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
- sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +2 -2
- sglang/srt/layers/attention/fla/fused_recurrent.py +4 -4
- sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +2 -2
- sglang/srt/layers/attention/flashattention_backend.py +41 -8
- sglang/srt/layers/attention/flashinfer_backend.py +112 -194
- sglang/srt/layers/attention/flashinfer_mla_backend.py +11 -15
- sglang/srt/layers/attention/flashmla_backend.py +7 -5
- sglang/srt/layers/attention/hybrid_attn_backend.py +11 -3
- sglang/srt/layers/attention/hybrid_linear_attn_backend.py +72 -72
- sglang/srt/layers/attention/mamba/causal_conv1d.py +1 -0
- sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +15 -98
- sglang/srt/layers/attention/mamba/mamba.py +566 -1
- sglang/srt/layers/attention/mamba/mamba_utils.py +81 -0
- sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
- sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
- sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
- sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +264 -0
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +622 -0
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +757 -0
- sglang/srt/layers/attention/mamba/ops/ssd_combined.py +262 -0
- sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +275 -0
- sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
- sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
- sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
- sglang/srt/layers/attention/nsa/nsa_indexer.py +761 -0
- sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
- sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
- sglang/srt/layers/attention/nsa/transform_index.py +144 -0
- sglang/srt/layers/attention/nsa/utils.py +24 -0
- sglang/srt/layers/attention/nsa_backend.py +887 -0
- sglang/srt/layers/attention/tbo_backend.py +6 -6
- sglang/srt/layers/attention/torch_flex_backend.py +325 -0
- sglang/srt/layers/attention/triton_backend.py +42 -9
- sglang/srt/layers/attention/trtllm_mha_backend.py +5 -7
- sglang/srt/layers/attention/trtllm_mla_backend.py +178 -34
- sglang/srt/layers/attention/vision.py +58 -0
- sglang/srt/layers/attention/wave_backend.py +4 -4
- sglang/srt/layers/communicator.py +8 -0
- sglang/srt/layers/dp_attention.py +11 -1
- sglang/srt/layers/elementwise.py +3 -1
- sglang/srt/layers/layernorm.py +2 -0
- sglang/srt/layers/linear.py +21 -4
- sglang/srt/layers/logits_processor.py +15 -2
- sglang/srt/layers/moe/ep_moe/kernels.py +1 -1
- sglang/srt/layers/moe/ep_moe/layer.py +147 -74
- sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +52 -25
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +6 -2
- sglang/srt/layers/moe/fused_moe_triton/layer.py +11 -12
- sglang/srt/layers/moe/token_dispatcher/deepep.py +77 -19
- sglang/srt/layers/moe/utils.py +10 -0
- sglang/srt/layers/parameter.py +23 -6
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +1 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +2 -0
- sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
- sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +2 -10
- sglang/srt/layers/quantization/fp8.py +2 -2
- sglang/srt/layers/quantization/fp8_utils.py +1 -1
- sglang/srt/layers/quantization/modelopt_quant.py +44 -9
- sglang/srt/layers/quantization/mxfp4.py +12 -4
- sglang/srt/layers/quantization/quark/quark_moe.py +16 -3
- sglang/srt/layers/quantization/w4afp8.py +0 -4
- sglang/srt/layers/quantization/w8a8_int8.py +15 -3
- sglang/srt/layers/rotary_embedding.py +78 -31
- sglang/srt/layers/sampler.py +52 -4
- sglang/srt/layers/utils.py +23 -0
- sglang/srt/lora/backend/base_backend.py +3 -3
- sglang/srt/lora/backend/chunked_backend.py +348 -0
- sglang/srt/lora/backend/triton_backend.py +10 -4
- sglang/srt/lora/lora.py +7 -5
- sglang/srt/lora/lora_manager.py +17 -6
- sglang/srt/lora/mem_pool.py +1 -1
- sglang/srt/lora/triton_ops/__init__.py +4 -0
- sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
- sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +174 -0
- sglang/srt/lora/utils.py +7 -5
- sglang/srt/managers/cache_controller.py +42 -142
- sglang/srt/managers/data_parallel_controller.py +11 -46
- sglang/srt/managers/detokenizer_manager.py +11 -11
- sglang/srt/managers/io_struct.py +162 -118
- sglang/srt/managers/mm_utils.py +43 -6
- sglang/srt/managers/multi_tokenizer_mixin.py +17 -17
- sglang/srt/managers/multimodal_processor.py +1 -2
- sglang/srt/managers/overlap_utils.py +53 -0
- sglang/srt/managers/schedule_batch.py +167 -86
- sglang/srt/managers/schedule_policy.py +143 -16
- sglang/srt/managers/scheduler.py +359 -214
- sglang/srt/managers/scheduler_input_blocker.py +1 -1
- sglang/srt/managers/scheduler_metrics_mixin.py +98 -126
- sglang/srt/managers/scheduler_output_processor_mixin.py +21 -12
- sglang/srt/managers/scheduler_profiler_mixin.py +5 -5
- sglang/srt/managers/scheduler_update_weights_mixin.py +7 -0
- sglang/srt/managers/tokenizer_communicator_mixin.py +111 -5
- sglang/srt/managers/tokenizer_manager.py +84 -136
- sglang/srt/managers/tp_worker.py +39 -29
- sglang/srt/managers/tp_worker_overlap_thread.py +33 -41
- sglang/srt/managers/utils.py +1 -45
- sglang/srt/mem_cache/allocator.py +14 -20
- sglang/srt/mem_cache/allocator_ascend.py +41 -27
- sglang/srt/mem_cache/base_prefix_cache.py +1 -1
- sglang/srt/mem_cache/chunk_cache.py +8 -1
- sglang/srt/mem_cache/evict_policy.py +23 -0
- sglang/srt/mem_cache/hicache_storage.py +40 -1
- sglang/srt/mem_cache/hiradix_cache.py +119 -32
- sglang/srt/mem_cache/memory_pool.py +188 -10
- sglang/srt/mem_cache/memory_pool_host.py +134 -182
- sglang/srt/mem_cache/radix_cache.py +222 -71
- sglang/srt/mem_cache/radix_cache_cpp.py +11 -8
- sglang/srt/mem_cache/storage/__init__.py +10 -0
- sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +151 -0
- sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +109 -0
- sglang/srt/mem_cache/storage/backend_factory.py +223 -0
- sglang/srt/mem_cache/storage/eic/eic_storage.py +778 -0
- sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +173 -58
- sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +10 -6
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +117 -10
- sglang/srt/mem_cache/swa_radix_cache.py +25 -34
- sglang/srt/metrics/collector.py +82 -120
- sglang/srt/metrics/func_timer.py +2 -7
- sglang/srt/metrics/utils.py +8 -1
- sglang/srt/model_executor/cpu_graph_runner.py +2 -2
- sglang/srt/model_executor/cuda_graph_runner.py +39 -32
- sglang/srt/model_executor/forward_batch_info.py +23 -38
- sglang/srt/model_executor/model_runner.py +131 -183
- sglang/srt/model_executor/npu_graph_runner.py +12 -5
- sglang/srt/model_loader/loader.py +14 -10
- sglang/srt/model_loader/weight_utils.py +156 -2
- sglang/srt/models/bailing_moe.py +27 -4
- sglang/srt/models/deepseek_nextn.py +6 -1
- sglang/srt/models/deepseek_v2.py +536 -153
- sglang/srt/models/dots_ocr.py +173 -0
- sglang/srt/models/falcon_h1.py +576 -0
- sglang/srt/models/gemma3_causal.py +0 -2
- sglang/srt/models/gemma3_mm.py +1 -1
- sglang/srt/models/gemma3n_mm.py +1 -1
- sglang/srt/models/glm4_moe.py +3 -3
- sglang/srt/models/glm4_moe_nextn.py +2 -2
- sglang/srt/models/glm4v.py +1 -1
- sglang/srt/models/glm4v_moe.py +1 -1
- sglang/srt/models/gpt_oss.py +7 -30
- sglang/srt/models/kimi_vl_moonvit.py +2 -2
- sglang/srt/models/llama.py +4 -0
- sglang/srt/models/longcat_flash.py +1 -1
- sglang/srt/models/longcat_flash_nextn.py +1 -1
- sglang/srt/models/mllama4.py +15 -4
- sglang/srt/models/qwen2.py +0 -7
- sglang/srt/models/qwen2_5_vl.py +2 -2
- sglang/srt/models/qwen2_audio.py +1 -1
- sglang/srt/models/qwen2_moe.py +64 -1
- sglang/srt/models/qwen2_vl.py +1 -1
- sglang/srt/models/qwen3.py +18 -3
- sglang/srt/models/qwen3_moe.py +31 -3
- sglang/srt/models/qwen3_next.py +36 -9
- sglang/srt/models/qwen3_vl.py +787 -0
- sglang/srt/models/qwen3_vl_moe.py +471 -0
- sglang/srt/models/registry.py +15 -3
- sglang/srt/models/sarashina2_vision.py +269 -0
- sglang/srt/models/solar.py +505 -0
- sglang/srt/models/starcoder2.py +357 -0
- sglang/srt/models/torch_native_llama.py +9 -2
- sglang/srt/models/utils.py +51 -0
- sglang/srt/multimodal/processors/base_processor.py +15 -7
- sglang/srt/multimodal/processors/dots_vlm.py +2 -3
- sglang/srt/multimodal/processors/internvl.py +20 -8
- sglang/srt/multimodal/processors/qwen_vl.py +8 -1
- sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
- sglang/srt/parser/jinja_template_utils.py +6 -0
- sglang/srt/sampling/sampling_batch_info.py +20 -2
- sglang/srt/sampling/sampling_params.py +7 -0
- sglang/srt/server_args.py +753 -295
- sglang/srt/server_args_config_parser.py +146 -0
- sglang/srt/single_batch_overlap.py +151 -0
- sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
- sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
- sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
- sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
- sglang/srt/speculative/cpp_ngram/param.h +125 -0
- sglang/srt/speculative/cpp_ngram/queue.h +71 -0
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +2 -1
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +3 -1
- sglang/srt/speculative/{eagle_utils.py → eagle_info.py} +207 -755
- sglang/srt/speculative/eagle_worker.py +57 -25
- sglang/srt/speculative/ngram_utils.py +428 -0
- sglang/srt/speculative/ngram_worker.py +245 -0
- sglang/srt/speculative/spec_info.py +47 -0
- sglang/srt/speculative/spec_utils.py +606 -0
- sglang/srt/torch_memory_saver_adapter.py +5 -7
- sglang/srt/tracing/trace.py +32 -6
- sglang/srt/two_batch_overlap.py +8 -5
- sglang/srt/utils/__init__.py +2 -0
- sglang/srt/{utils.py → utils/common.py} +399 -74
- sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +49 -5
- sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
- sglang/srt/utils/rpd_utils.py +452 -0
- sglang/srt/utils/slow_rank_detector.py +71 -0
- sglang/srt/warmup.py +8 -4
- sglang/srt/weight_sync/utils.py +1 -1
- sglang/test/get_logits_ut.py +57 -0
- sglang/test/run_eval.py +79 -11
- sglang/test/runners.py +1 -1
- sglang/test/simple_eval_common.py +5 -2
- sglang/test/simple_eval_mmmu_vlm.py +441 -0
- sglang/test/test_block_fp8.py +2 -2
- sglang/test/test_deterministic.py +297 -0
- sglang/test/test_disaggregation_utils.py +12 -1
- sglang/test/test_programs.py +1 -1
- sglang/test/test_utils.py +355 -4
- sglang/utils.py +10 -1
- sglang/version.py +1 -1
- {sglang-0.5.3rc0.dist-info → sglang-0.5.3rc2.dist-info}/METADATA +34 -25
- {sglang-0.5.3rc0.dist-info → sglang-0.5.3rc2.dist-info}/RECORD +281 -210
- sglang/srt/mem_cache/lora_radix_cache.py +0 -421
- /sglang/srt/{remote_instance_weight_loader_utils.py → model_loader/remote_instance_weight_loader_utils.py} +0 -0
- /sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
- {sglang-0.5.3rc0.dist-info → sglang-0.5.3rc2.dist-info}/WHEEL +0 -0
- {sglang-0.5.3rc0.dist-info → sglang-0.5.3rc2.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.3rc0.dist-info → sglang-0.5.3rc2.dist-info}/top_level.txt +0 -0
sglang/srt/server_args.py
CHANGED
@@ -19,14 +19,11 @@ import json
 import logging
 import os
 import random
-import socket
-import sys
 import tempfile
 from typing import List, Literal, Optional, Union
 
 from sglang.srt.connector import ConnectorType
 from sglang.srt.function_call.function_call_parser import FunctionCallParser
-from sglang.srt.hf_transformers_utils import check_gguf_file, get_config
 from sglang.srt.lora.lora_registry import LoRARef
 from sglang.srt.parser.reasoning_parser import ReasoningParser
 from sglang.srt.utils import (
@@ -49,11 +46,11 @@ from sglang.srt.utils import (
     nullable_str,
     parse_connector_type,
 )
+from sglang.srt.utils.hf_transformers_utils import check_gguf_file, get_config
 from sglang.utils import is_in_ci
 
 logger = logging.getLogger(__name__)
 
-
 # Define constants
 LOAD_FORMAT_CHOICES = [
     "auto",
@@ -93,15 +90,17 @@ ATTENTION_BACKEND_CHOICES = [
     # Common
     "triton",
     "torch_native",
+    "flex_attention",
+    "nsa",
     # NVIDIA specific
     "cutlass_mla",
     "fa3",
+    "fa4",
     "flashinfer",
     "flashmla",
     "trtllm_mla",
     "trtllm_mha",
     "dual_chunk_flash_attn",
-    "hybrid_linear_attn",
     # AMD specific
     "aiter",
     "wave",
@@ -110,10 +109,18 @@ ATTENTION_BACKEND_CHOICES = [
     "ascend",
 ]
 
+LORA_BACKEND_CHOICES = ["triton", "csgmv"]
+
 DISAGG_TRANSFER_BACKEND_CHOICES = ["mooncake", "nixl", "ascend", "fake"]
 
 GRAMMAR_BACKEND_CHOICES = ["xgrammar", "outlines", "llguidance", "none"]
 
+DETERMINISTIC_ATTENTION_BACKEND_CHOICES = ["flashinfer", "fa3", "triton"]
+
+NSA_CHOICES = ["flashmla_prefill", "flashmla_decode", "fa3", "tilelang", "aiter"]
+
+RADIX_EVICTION_POLICY_CHOICES = ["lru", "lfu"]
+
 
 # Allow external code to add more choices
 def add_load_format_choices(choices):
@@ -136,6 +143,14 @@ def add_grammar_backend_choices(choices):
     GRAMMAR_BACKEND_CHOICES.extend(choices)
 
 
+def add_deterministic_attention_backend_choices(choices):
+    DETERMINISTIC_ATTENTION_BACKEND_CHOICES.extend(choices)
+
+
+def add_radix_eviction_policy_choices(choices):
+    RADIX_EVICTION_POLICY_CHOICES.extend(choices)
+
+
 @dataclasses.dataclass
 class ServerArgs:
     # Model and tokenizer
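The two new registration hooks above follow the file's existing "allow external code to add more choices" pattern. Below is a minimal sketch of how a plugin might use them before the CLI parser is built; the custom choice values are hypothetical and do not ship with sglang.

```python
# Illustrative sketch only: "lru_k" and "my_backend" are hypothetical values
# supplied by external code, not options that ship with sglang.
from sglang.srt.server_args import (
    add_deterministic_attention_backend_choices,
    add_radix_eviction_policy_choices,
)

# Extend the module-level choice lists before the argument parser is built,
# so the corresponding CLI arguments accept the extra values.
add_radix_eviction_policy_choices(["lru_k"])
add_deterministic_attention_backend_choices(["my_backend"])
```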
@@ -165,20 +180,25 @@ class ServerArgs:
     quantization: Optional[str] = None
     quantization_param_path: Optional[str] = None
     kv_cache_dtype: str = "auto"
+    enable_fp32_lm_head: bool = False
 
     # Memory and scheduling
     mem_fraction_static: Optional[float] = None
     max_running_requests: Optional[int] = None
-    max_queued_requests: Optional[int] =
+    max_queued_requests: Optional[int] = None
     max_total_tokens: Optional[int] = None
     chunked_prefill_size: Optional[int] = None
     max_prefill_tokens: int = 16384
     schedule_policy: str = "fcfs"
+    enable_priority_scheduling: bool = False
+    schedule_low_priority_values_first: bool = False
+    priority_scheduling_preemption_threshold: int = 10
     schedule_conservativeness: float = 1.0
     page_size: Optional[int] = None
     hybrid_kvcache_ratio: Optional[float] = None
     swa_full_tokens_ratio: float = 0.8
     disable_hybrid_swa_memory: bool = False
+    radix_eviction_policy: str = "lru"
 
     # Runtime options
     device: Optional[str] = None
@@ -205,8 +225,8 @@ class ServerArgs:
     show_time_cost: bool = False
     enable_metrics: bool = False
     enable_metrics_for_all_schedulers: bool = False
-    tokenizer_metrics_custom_labels_header: str = "x-
-
+    tokenizer_metrics_custom_labels_header: str = "x-custom-labels"
+    tokenizer_metrics_allowed_custom_labels: Optional[List[str]] = None
     bucket_time_to_first_token: Optional[List[float]] = None
     bucket_inter_token_latency: Optional[List[float]] = None
     bucket_e2e_request_latency: Optional[List[float]] = None
@@ -258,6 +278,7 @@ class ServerArgs:
     max_loaded_loras: Optional[int] = None
     max_loras_per_batch: int = 8
     lora_backend: str = "triton"
+    max_lora_chunk_size: Optional[int] = 16
 
     # Kernel backend
     attention_backend: Optional[str] = None
@@ -266,6 +287,8 @@ class ServerArgs:
     sampling_backend: Optional[str] = None
     grammar_backend: Optional[str] = None
     mm_attention_backend: Optional[str] = None
+    nsa_prefill: str = "flashmla_prefill"
+    nsa_decode: str = "fa3"
 
     # Speculative decoding
     speculative_algorithm: Optional[str] = None
@@ -278,6 +301,14 @@ class ServerArgs:
     speculative_accept_threshold_acc: float = 1.0
     speculative_token_map: Optional[str] = None
     speculative_attention_mode: str = "prefill"
+    # For ngram only
+    speculative_ngram_min_match_window_size: int = 1
+    speculative_ngram_max_match_window_size: int = 12
+    speculative_ngram_min_bfs_breadth: int = 1
+    speculative_ngram_max_bfs_breadth: int = 10
+    speculative_ngram_match_type: Literal["BFS", "PROB"] = "BFS"
+    speculative_ngram_branch_length: int = 18
+    speculative_ngram_capacity: int = 10 * 1000 * 1000
 
     # Expert parallelism
     ep_size: int = 1
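The block of `speculative_ngram_*` fields above is the configuration surface for the new ngram speculative decoding mode (see `sglang/srt/speculative/ngram_worker.py` in the file list). A hedged sketch of such a configuration built directly in Python follows; the model path is a placeholder and a CUDA device is assumed, since the post-init checks added later in this diff reject non-CUDA devices for NGRAM.

```python
# Illustrative sketch, not a recommended launch path: ServerArgs is normally
# built from CLI flags. The model path below is only a placeholder.
from sglang.srt.server_args import ServerArgs

args = ServerArgs(
    model_path="Qwen/Qwen2.5-7B-Instruct",        # placeholder model
    speculative_algorithm="NGRAM",                # enable ngram speculation
    speculative_ngram_max_match_window_size=12,   # longest suffix window to match
    speculative_ngram_max_bfs_breadth=10,         # widest BFS fan-out
    speculative_ngram_branch_length=18,
)
# Per the _handle_speculative_decoding hunk further down, __post_init__ then
# derives speculative_eagle_topk from the max BFS breadth and defaults
# speculative_num_draft_tokens to the max match window size.
```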
@@ -309,6 +340,10 @@ class ServerArgs:
     deepep_config: Optional[str] = None
     moe_dense_tp_size: Optional[int] = None
 
+    # Mamba cache
+    max_mamba_cache_size: Optional[int] = None
+    mamba_ssm_dtype: str = "float32"
+
     # Hierarchical cache
     enable_hierarchical_cache: bool = False
     hicache_ratio: float = 2.0
@@ -352,11 +387,13 @@ class ServerArgs:
     disable_outlines_disk_cache: bool = False
     disable_custom_all_reduce: bool = False
     enable_mscclpp: bool = False
+    enable_torch_symm_mem: bool = False
     disable_overlap_schedule: bool = False
     enable_mixed_chunk: bool = False
     enable_dp_attention: bool = False
     enable_dp_lm_head: bool = False
     enable_two_batch_overlap: bool = False
+    enable_single_batch_overlap: bool = False
     tbo_token_distribution_threshold: float = 0.48
     enable_torch_compile: bool = False
     torch_compile_max_bs: int = 32
@@ -369,15 +406,18 @@ class ServerArgs:
     num_continuous_decode_steps: int = 1
     delete_ckpt_after_loading: bool = False
     enable_memory_saver: bool = False
+    enable_weights_cpu_backup: bool = False
     allow_auto_truncate: bool = False
     enable_custom_logit_processor: bool = False
     flashinfer_mla_disable_ragged: bool = False
     disable_shared_experts_fusion: bool = False
     disable_chunked_prefix_cache: bool = False
     disable_fast_image_processor: bool = False
+    keep_mm_feature_on_device: bool = False
     enable_return_hidden_states: bool = False
     scheduler_recv_interval: int = 1
     numa_node: Optional[List[int]] = None
+    enable_deterministic_inference: bool = False
 
     # Dynamic batch tokenizer
     enable_dynamic_batch_tokenizer: bool = False
@@ -398,16 +438,14 @@ class ServerArgs:
     disaggregation_decode_dp: Optional[int] = None
     disaggregation_prefill_pp: Optional[int] = 1
     disaggregation_ib_device: Optional[str] = None
+    disaggregation_decode_enable_offload_kvcache: bool = False
     num_reserved_decode_tokens: int = 512  # used for decode kv cache offload in PD
-
     # FIXME: hack to reduce ITL when decode bs is small
     disaggregation_decode_polling_interval: int = 1
 
-    # For model weight update
+    # For model weight update and weight loading
     custom_weight_loader: Optional[List[str]] = None
     weight_loader_disable_mmap: bool = False
-
-    # Remote instance weight loading
     remote_instance_weight_loader_seed_instance_ip: Optional[str] = None
     remote_instance_weight_loader_seed_instance_service_port: Optional[int] = None
     remote_instance_weight_loader_send_weights_group_ports: Optional[List[int]] = None
@@ -416,61 +454,84 @@ class ServerArgs:
     enable_pdmux: bool = False
     sm_group_num: int = 3
 
-
-
-
+    def __post_init__(self):
+        """
+        Orchestrates the handling of various server arguments, ensuring proper configuration and validation.
+        """
+        # Handle deprecated arguments.
+        self._handle_deprecated_args()
 
-
-
-    enable_deepep_moe: bool = False
-    enable_flashinfer_cutlass_moe: bool = False
-    enable_flashinfer_cutedsl_moe: bool = False
-    enable_flashinfer_trtllm_moe: bool = False
-    enable_triton_kernel_moe: bool = False
-    enable_flashinfer_mxfp4_moe: bool = False
+        # Set missing default values.
+        self._handle_missing_default_values()
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        # Get GPU memory capacity, which is a common dependency for several configuration steps.
+        gpu_mem = get_device_memory_capacity(self.device)
+
+        # Handle memory-related, chunked prefill, and CUDA graph batch size configurations.
+        self._handle_gpu_memory_settings(gpu_mem)
+
+        # Handle device-specific backends.
+        self._handle_hpu_backends()
+        self._handle_cpu_backends()
+
+        # Apply model-specific adjustments.
+        self._handle_model_specific_adjustments()
+
+        # Set kernel backends.
+        self._handle_sampling_backend()
+        self._handle_attention_backend_compatibility()
+        self._handle_page_size()
+        self._handle_amd_specifics()
+        self._handle_grammar_backend()
+
+        # Handle data parallelism.
+        self._handle_data_parallelism()
+
+        # Handle MoE configurations.
+        self._handle_moe_kernel_config()
+        self._handle_deepep_moe()
+        self._handle_eplb_and_dispatch()
+        self._handle_expert_distribution_metrics()
+
+        # Handle pipeline parallelism.
+        self._handle_pipeline_parallelism()
+
+        # Handle Hicache settings.
+        self._handle_hicache()
+
+        # Handle speculative decoding logic.
+        self._handle_speculative_decoding()
+
+        # Handle model loading format.
+        self._handle_load_format()
 
-        #
+        # Handle PD disaggregation.
+        self._handle_disaggregation()
+
+        # Validate tokenizer settings.
+        self._handle_tokenizer_batching()
+
+        # Propagate environment variables.
+        self._handle_environment_variables()
+
+        # Validate cache settings.
+        self._handle_cache_compatibility()
+
+        # Validate metrics labels.
+        self._handle_metrics_labels()
+
+        # Handle deterministic inference.
+        self._handle_deterministic_inference()
+
+        # Handle any other necessary validations.
+        self._handle_other_validations()
+
+    def _handle_deprecated_args(self):
+        pass
+
+    def _handle_missing_default_values(self):
         if self.tokenizer_path is None:
             self.tokenizer_path = self.model_path
-
         if self.served_model_name is None:
             self.served_model_name = self.model_path
         if self.device is None:
@@ -478,56 +539,140 @@ class ServerArgs:
         if self.random_seed is None:
             self.random_seed = random.randint(0, 1 << 30)
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        if self.
-
-
-
-
+    def _handle_gpu_memory_settings(self, gpu_mem):
+        """
+        Configure GPU memory-dependent settings including
+        chunked_prefill_size, cuda_graph_max_bs, and mem_fraction_static.
+
+        Here are our heuristics:
+        - Set chunked_prefill_size and cuda_graph_max_bs based on the GPU memory capacity.
+          This is because GPUs with more memory are generally more powerful, we need to use a larger
+          chunked_prefill_size and a larger cuda_graph_max_bs to fully utilize the GPU.
+        - Then set mem_fraction_static based on chunked_prefill_size and cuda_graph_max_bs.
+
+        GPU memory capacity = model weights + KV cache pool + activations + cuda graph buffers
+
+        The argument mem_fraction_static is defined as (model weights + KV cache pool) / GPU memory capacity,
+        or equivalently, mem_fraction_static = (GPU memory capacity - activations - cuda graph buffers) / GPU memory capacity.
+
+        In order to compute mem_fraction_static, we need to estimate the size of activations and cuda graph buffers.
+        The activation memory is proportional to the chunked_prefill_size.
+        The cuda graph memory is proportional to the cuda_graph_max_bs.
+        We use reserved_mem = chunked_prefill_size * 1.5 + cuda_graph_max_bs * 2 to estimate the size of activations and cuda graph buffers in GB.
+        and set mem_fraction_static = (GPU memory capacity - reserved_mem) / GPU memory capacity.
+
+        The coefficient 1.5 is a heuristic value, in the future, we can do better estimation by looking at the model types, hidden sizes or even do a dummy run.
+        """
+        if gpu_mem is not None:
+            if gpu_mem < 20 * 1024:
+                # T4, 4080
+                # (chunked_prefill_size 2k, cuda_graph_max_bs 8)
+                if self.chunked_prefill_size is None:
+                    self.chunked_prefill_size = 2048
+                if self.cuda_graph_max_bs is None:
+                    self.cuda_graph_max_bs = 8
+            elif gpu_mem < 35 * 1024:
+                # A10, 4090, 5090
+                # (chunked_prefill_size 2k, cuda_graph_max_bs 16 if tp < 4 else 80)
+                if self.chunked_prefill_size is None:
+                    self.chunked_prefill_size = 2048
+                if self.cuda_graph_max_bs is None:
+                    # Based on detailed statistics, when serving TP1/TP2 models on lower-end GPUs with HBM < 35GB, you can either disable cuda graph or set `cuda_graph_max_bs` to a very small value to reduce the memory overhead of creating cuda graphs, with almost no impact on performance.
+                    # However, when serving models with TP4 or TP8, we need to enable cuda graph to maintain high performance. In this case, we can set `cuda_graph_max_bs` to 80 (half of the default value 160) to reduce the memory overhead of creating cuda graphs. Looking at the logs
+                    # from TP4 serving of qwen2-72b, a value of 80 is sufficient and can reduce the memory overhead of creating cuda graphs on lower-end GPUs compared to the original 160, avoiding OOM issues.
+                    if self.tp_size < 4:
+                        self.cuda_graph_max_bs = 16
+                    else:
+                        self.cuda_graph_max_bs = 80
+            elif gpu_mem < 60 * 1024:
+                # A100 (40GB), L40,
+                # (chunked_prefill_size 4k, cuda_graph_max_bs 32 if tp < 4 else 160)
+                if self.chunked_prefill_size is None:
+                    self.chunked_prefill_size = 4096
+                if self.cuda_graph_max_bs is None:
+                    if self.tp_size < 4:
+                        self.cuda_graph_max_bs = 32
                     else:
-
-
-
+                        self.cuda_graph_max_bs = 160
+            elif gpu_mem < 90 * 1024:
+                # H100, A100
+                # (chunked_prefill_size 8k, cuda_graph_max_bs 256 if tp < 4 else 512)
+                if self.chunked_prefill_size is None:
+                    self.chunked_prefill_size = 8192
+                if self.cuda_graph_max_bs is None:
+                    if self.tp_size < 4:
+                        self.cuda_graph_max_bs = 256
+                    else:
+                        self.cuda_graph_max_bs = 512
+            elif gpu_mem < 160 * 1024:
+                # H20, H200
+                # (chunked_prefill_size 8k, cuda_graph_max_bs 256 if tp < 4 else 512)
+                if self.chunked_prefill_size is None:
+                    self.chunked_prefill_size = 8192
+                if self.cuda_graph_max_bs is None:
+                    if self.tp_size < 4:
+                        self.cuda_graph_max_bs = 256
+                    else:
+                        self.cuda_graph_max_bs = 512
+            else:
+                # B200, MI300
+                # (chunked_prefill_size 16k, cuda_graph_max_bs 512)
+                if self.chunked_prefill_size is None:
+                    self.chunked_prefill_size = 16384
+                if self.cuda_graph_max_bs is None:
+                    self.cuda_graph_max_bs = 512
+        else:
+            # Fallback defaults when gpu_mem is None
+            if self.chunked_prefill_size is None:
+                self.chunked_prefill_size = 4096
+            if self.cuda_graph_max_bs is None:
+                self.cuda_graph_max_bs = 160
+
+        # Set cuda graph batch sizes
+        if self.cuda_graph_bs is None:
+            self.cuda_graph_bs = self._generate_cuda_graph_batch_sizes()
+        else:
+            self.cuda_graph_max_bs = max(self.cuda_graph_bs)
 
-
+        if self.mem_fraction_static is None:
+            # Constant meta data (e.g., from attention backend)
+            reserved_mem = 512
+            # For activation during large prefill
+            if self.chunked_prefill_size > 0:
+                reserved_mem += max(self.chunked_prefill_size, 2048) * 1.5
             else:
-                self.
+                reserved_mem += max(self.max_prefill_tokens, 2048) * 1.5
+            # For cuda graphs
+            reserved_mem += self.cuda_graph_max_bs * 2
+            # Some adjustments for large parallel size
+            reserved_mem += self.tp_size * self.pp_size / 8 * 1024
+
+            if self.enable_dp_attention:
+                # DP attention needs more padding for some operations
+                reserved_mem += self.cuda_graph_max_bs * self.dp_size * 3
+
+                # DP attention uses much more memory for large cuda graph max bs,
+                # likely due to some inefficiencies in torch allocator or our implementation.
+                # So we need to reserve more memory.
+                if self.cuda_graph_max_bs > 300:
+                    reserved_mem += self.cuda_graph_max_bs * self.dp_size * 1.5
+
+            if gpu_mem is not None and gpu_mem > 60 * 1024:
+                reserved_mem = max(reserved_mem, 10 * 1024)
+
+            if self.speculative_algorithm is not None:
+                if self.speculative_algorithm == "STANDALONE":
+                    # standalonedraft model and cuda graphs
+                    reserved_mem += 6 * 1024
+                elif self.speculative_algorithm != "NGRAM":
+                    # eagle draft models and cuda graphs
+                    reserved_mem += 2 * 1024
+
+            self.mem_fraction_static = (
+                round((gpu_mem - reserved_mem) / gpu_mem, 3)
+                if gpu_mem is not None
+                else 0.88
+            )
 
         # Lazy init to avoid circular import
         # Multimodal models need more memory for the image processor
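To make the reserved-memory heuristic described in the docstring above concrete, here is a small standalone arithmetic sketch. It assumes a GPU reporting 80 * 1024 on the same scale as `gpu_mem`, tp_size = pp_size = 1, no DP attention, no speculative decoding, and the defaults the heuristic picks for that tier (chunked_prefill_size = 8192, cuda_graph_max_bs = 256).

```python
# Standalone arithmetic sketch of the mem_fraction_static formula added above.
# All quantities are on the same scale as gpu_mem in the code; the GPU size is assumed.
gpu_mem = 80 * 1024
chunked_prefill_size = 8192
cuda_graph_max_bs = 256
tp_size = pp_size = 1

reserved_mem = 512                                      # constant metadata
reserved_mem += max(chunked_prefill_size, 2048) * 1.5   # activations -> 12288
reserved_mem += cuda_graph_max_bs * 2                   # cuda graph buffers -> 512
reserved_mem += tp_size * pp_size / 8 * 1024            # parallelism margin -> 128
reserved_mem = max(reserved_mem, 10 * 1024)             # floor applied for GPUs above 60 * 1024

mem_fraction_static = round((gpu_mem - reserved_mem) / gpu_mem, 3)
print(mem_fraction_static)  # 0.836 under these assumptions
```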
@@ -537,54 +682,192 @@ class ServerArgs:
         if model_config.is_multimodal:
             self.adjust_mem_fraction_for_vlm(model_config)
 
-
-
-
-
-
-
-
-
-
-
-
+    def _generate_cuda_graph_batch_sizes(self):
+        """
+        Generate the list of batch sizes for CUDA graph capture based on cuda_graph_max_bs.
+        This integrates the logic from cuda_graph_runner.py.
+        """
+        # Handle disable_cuda_graph_padding as the first condition for both spec and non-spec
+        if self.disable_cuda_graph_padding:
+            capture_bs = list(range(1, self.cuda_graph_max_bs + 1))
+        elif self.speculative_algorithm is None:
+            # Normal case: [1, 2, 4, 8, 12] + list(range(16, 257, 8)) + list(range(272, 512, 16)) + list(range(512, cuda_graph_max_bs + 1))
+            capture_bs = (
+                [1, 2, 4, 8, 12]
+                + list(range(16, 257, 8))
+                + list(range(272, 512, 16))
+                + list(range(512, self.cuda_graph_max_bs + 1, 32))
+            )
+        else:
+            # Spec decoding case: list(range(1, 9, 1)) + list(range(10, 33, 2)) + list(range(40, 64, 4)) + list(range(72, 257, 8))
+            capture_bs = (
+                list(range(1, 9, 1))
+                + list(range(10, 33, 2))
+                + list(range(40, 64, 4))
+                + list(range(72, 257, 8))
+                + list(range(272, self.cuda_graph_max_bs + 1, 16))
+            )
 
-
-        if self.cuda_graph_max_bs is None:
-            # Based on detailed statistics, when serving TP1/TP2 models on lower-end GPUs with HBM<25G, you can either disable cuda graph or set `cuda_graph_max_bs` to a very small value to reduce the memory overhead of creating cuda graphs, with almost no impact on performance. However, when serving models with TP4 or TP8, we need to enable cuda graph to maintain high performance. In this case, we can set `cuda_graph_max_bs` to 80 (half of the default value 160) to reduce the memory overhead of creating cuda graphs. Looking at the logs from TP4 serving of qwen2-72b, a value of 80 is sufficient and can reduce the memory overhead of creating cuda graphs on lower-end GPUs compared to the original 160, avoiding OOM issues.
-            if gpu_mem is not None and gpu_mem < 35 * 1024:
-                if self.tp_size < 4:
-                    self.cuda_graph_max_bs = 8
-                else:
-                    self.cuda_graph_max_bs = 80
+        capture_bs = [bs for bs in capture_bs if bs <= self.cuda_graph_max_bs]
 
-
+        return capture_bs
+
+    def _handle_hpu_backends(self):
         if self.device == "hpu":
             self.attention_backend = "torch_native"
             self.sampling_backend = "pytorch"
 
-
-        if parse_connector_type(self.model_path) != ConnectorType.INSTANCE:
-            self.model_specific_adjustments()
-
-        # Set kernel backends
+    def _handle_cpu_backends(self):
         if self.device == "cpu":
             if self.attention_backend is None:
                 self.attention_backend = "intel_amx"
             self.sampling_backend = "pytorch"
 
+    def _handle_model_specific_adjustments(self):
+        from sglang.srt.configs.model_config import is_deepseek_nsa
+
+        if parse_connector_type(self.model_path) == ConnectorType.INSTANCE:
+            return
+
+        hf_config = self.get_hf_config()
+        model_arch = hf_config.architectures[0]
+        if model_arch in ["GptOssForCausalLM"]:
+            if self.attention_backend is None:
+                if is_cuda() and is_sm100_supported():
+                    self.attention_backend = "trtllm_mha"
+                elif is_cuda() and is_sm90_supported():
+                    self.attention_backend = "fa3"
+                else:
+                    self.attention_backend = "triton"
+            supported_backends = ["triton", "trtllm_mha", "fa3"]
+            logger.info(
+                f"Use {self.attention_backend} as attention backend for GptOssForCausalLM"
+            )
+            assert (
+                self.attention_backend in supported_backends
+            ), f"GptOssForCausalLM requires one of {supported_backends} attention backend, but got '{self.attention_backend}'"
+
+            if is_sm100_supported():
+                if not self.enable_dp_attention:
+                    self.enable_flashinfer_allreduce_fusion = True
+                    logger.info(
+                        "Enable FlashInfer AllReduce Fusion on sm100 for GptOssForCausalLM"
+                    )
+            quantization_config = getattr(hf_config, "quantization_config", None)
+            is_mxfp4_quant_format = (
+                quantization_config is not None
+                and quantization_config.get("quant_method") == "mxfp4"
+            )
+
+            if is_sm100_supported() and is_mxfp4_quant_format:
+                self.moe_runner_backend = "flashinfer_mxfp4"
+                logger.warning(
+                    "Detected SM100 and MXFP4 quantization format for GPT-OSS model, enabling FlashInfer MXFP4 MOE kernel."
+                )
+            else:
+                if self.moe_runner_backend == "triton_kernel":
+                    assert (
+                        self.ep_size == 1
+                    ), "Triton kernel MoE is only supported when ep_size == 1"
+                if (
+                    self.moe_runner_backend == "auto"
+                    and self.ep_size == 1
+                    and is_triton_kernels_available()
+                ):
+                    self.moe_runner_backend = "triton_kernel"
+                    logger.warning(
+                        "Detected GPT-OSS model, enabling triton_kernels MOE kernel."
+                    )
+            self.disable_hybrid_swa_memory = True
+            if is_mxfp4_quant_format:
+                # use bf16 for mxfp4 triton kernels
+                self.dtype = "bfloat16"
+
+        elif "Llama4" in model_arch and self.device != "cpu":
+            assert self.attention_backend in {
+                "fa3",
+                "aiter",
+                "triton",
+            }, "fa3, aiter, or triton is required for Llama4 model"
+        elif model_arch in [
+            "Gemma2ForCausalLM",
+            "Gemma3ForCausalLM",
+            "Gemma3ForConditionalGeneration",
+            "Gemma3nForCausalLM",
+            "Gemma3nForConditionalGeneration",
+        ]:
+            # FIXME: https://github.com/sgl-project/sglang/pull/7367 is not compatible with gemma2 model.
+            # It failed at this test: https://github.com/sgl-project/sglang/actions/runs/16255155597/job/45890331952#step:4:736
+            logger.warning(
+                f"Disable hybrid SWA memory for {model_arch} as it is not yet supported."
+            )
+            self.disable_hybrid_swa_memory = True
+
+        if is_deepseek_nsa(hf_config):
+            if (
+                self.attention_backend is None
+                and self.prefill_attention_backend is None
+                and self.decode_attention_backend is None
+            ):
+                self.attention_backend = "nsa"
+                logger.warning("Set nsa attention backend for DeepSeek NSA.")
+
+            if not is_npu():
+                self.enable_dp_attention = True
+                self.dp_size = self.tp_size
+                logger.warning("DP attention is enabled for DeepSeek NSA.")
+
+            self.page_size = 64
+            logger.warning("Setting page size to 64 for DeepSeek NSA.")
+
+            self.mem_fraction_static = 0.8
+            logger.warning("Setting mem fraction static to 0.8 for DeepSeek NSA.")
+
+            # For Hopper, we support both bf16 and fp8 kv cache; for Blackwell, we support fp8 only currently
+            import torch
+
+            major, _ = torch.cuda.get_device_capability()
+            if major >= 10:
+                self.kv_cache_dtype = "fp8_e4m3"
+                logger.warning("Setting KV cache dtype to fp8.")
+
+            if self.kv_cache_dtype == "fp8_e4m3":
+                self.nsa_prefill = "flashmla_decode"
+                self.nsa_decode = "flashmla_decode"
+                logger.warning(
+                    "Setting NSA backend to flashmla_decode for FP8 KV Cache."
+                )
+
+            # Logging env vars for NSA
+            from sglang.srt.layers.attention.nsa.utils import (
+                print_nsa_bool_env_vars,
+            )
+
+            print_nsa_bool_env_vars()
+
+    def _handle_sampling_backend(self):
         if self.sampling_backend is None:
             self.sampling_backend = (
                 "flashinfer" if is_flashinfer_available() else "pytorch"
             )
 
+    def _handle_attention_backend_compatibility(self):
         if self.attention_backend == "torch_native":
             logger.warning(
                 "Cuda graph is disabled because of using torch native attention backend"
             )
             self.disable_cuda_graph = True
 
-        if
+        if self.attention_backend == "flex_attention":
+            logger.warning(
+                "Cuda graph is disabled because of using torch Flex Attention backend"
+            )
+            self.disable_cuda_graph = True
+            assert (
+                self.speculative_algorithm is None
+            ), "Speculative decoding is currently not supported with Flex Attention backend"
+
+        if is_npu() and self.attention_backend in ["ascend"]:
             logger.warning(
                 "At this moment Ascend attention backend only supports a page_size of 128, change page_size to 128."
             )
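The capture schedule produced by the new `_generate_cuda_graph_batch_sizes` can be previewed without starting a server. The snippet below re-implements only the non-speculative branch for cuda_graph_max_bs = 256 (the value the heuristic picks for H100-class GPUs with tp_size < 4); it mirrors the ranges in the added code and is illustrative only.

```python
# Illustrative re-implementation of the non-speculative capture list above.
cuda_graph_max_bs = 256

capture_bs = (
    [1, 2, 4, 8, 12]
    + list(range(16, 257, 8))
    + list(range(272, 512, 16))
    + list(range(512, cuda_graph_max_bs + 1, 32))
)
capture_bs = [bs for bs in capture_bs if bs <= cuda_graph_max_bs]

print(capture_bs[:8])   # [1, 2, 4, 8, 12, 16, 24, 32]
print(capture_bs[-3:])  # [240, 248, 256]
print(len(capture_bs))  # 36 batch sizes are captured
```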
@@ -646,29 +929,28 @@ class ServerArgs:
|
|
646
929
|
|
647
930
|
if self.attention_backend == "dual_chunk_flash_attn":
|
648
931
|
logger.warning(
|
649
|
-
"Mixed chunk
|
932
|
+
"Mixed chunk and radix cache are disabled when using dual-chunk flash attention backend"
|
650
933
|
)
|
651
934
|
self.enable_mixed_chunk = False
|
652
|
-
self.disable_cuda_graph = True
|
653
935
|
self.disable_radix_cache = True
|
654
936
|
|
655
|
-
|
937
|
+
def _handle_page_size(self):
|
656
938
|
if self.page_size is None:
|
657
939
|
self.page_size = 1
|
658
940
|
|
659
|
-
|
941
|
+
def _handle_amd_specifics(self):
|
660
942
|
if is_hip():
|
661
943
|
self.triton_attention_num_kv_splits = 16
|
662
944
|
|
663
|
-
|
945
|
+
def _handle_grammar_backend(self):
|
664
946
|
if self.grammar_backend is None:
|
665
947
|
self.grammar_backend = "xgrammar"
|
666
948
|
|
949
|
+
def _handle_data_parallelism(self):
|
667
950
|
if self.dp_size == 1:
|
668
951
|
self.enable_dp_attention = False
|
669
952
|
self.enable_dp_lm_head = False
|
670
953
|
|
671
|
-
# Data parallelism attention
|
672
954
|
if self.enable_dp_attention:
|
673
955
|
self.schedule_conservativeness = self.schedule_conservativeness * 0.3
|
674
956
|
assert self.tp_size % self.dp_size == 0
|
@@ -682,7 +964,7 @@ class ServerArgs:
|
|
682
964
|
self.enable_dp_attention
|
683
965
|
), "Please enable dp attention when setting enable_dp_lm_head. "
|
684
966
|
|
685
|
-
|
967
|
+
def _handle_moe_kernel_config(self):
|
686
968
|
if self.moe_runner_backend == "flashinfer_cutlass":
|
687
969
|
assert (
|
688
970
|
self.quantization == "modelopt_fp4"
|
@@ -695,13 +977,13 @@ class ServerArgs:
|
|
695
977
|
if self.moe_runner_backend == "flashinfer_trtllm":
|
696
978
|
assert (
|
697
979
|
self.quantization == "modelopt_fp4" or self.quantization == "fp8"
|
698
|
-
), "modelopt_fp4 quantization is required for Flashinfer TRTLLM MoE"
|
980
|
+
), "modelopt_fp4 or fp8 quantization is required for Flashinfer TRTLLM MoE"
|
699
981
|
self.disable_shared_experts_fusion = True
|
700
982
|
logger.warning(
|
701
983
|
"FlashInfer TRTLLM MoE is enabled. --disable-shared-experts-fusion is automatically set."
|
702
984
|
)
|
703
985
|
|
704
|
-
|
986
|
+
def _handle_deepep_moe(self):
|
705
987
|
if self.moe_a2a_backend == "deepep":
|
706
988
|
if self.deepep_mode == "normal":
|
707
989
|
logger.warning("Cuda graph is disabled because deepep_mode=`normal`")
|
@@ -711,6 +993,7 @@ class ServerArgs:
|
|
711
993
|
f"DeepEP MoE is enabled. The expert parallel size is adjusted to be the same as the tensor parallel size[{self.tp_size}]."
|
712
994
|
)
|
713
995
|
|
996
|
+
def _handle_eplb_and_dispatch(self):
|
714
997
|
if self.enable_eplb and (self.expert_distribution_recorder_mode is None):
|
715
998
|
self.expert_distribution_recorder_mode = "stat"
|
716
999
|
logger.warning(
|
@@ -725,6 +1008,7 @@ class ServerArgs:
|
|
725
1008
|
if self.enable_eplb:
|
726
1009
|
assert self.ep_size > 1
|
727
1010
|
|
1011
|
+
def _handle_expert_distribution_metrics(self):
|
728
1012
|
if self.enable_expert_distribution_metrics and (
|
729
1013
|
self.expert_distribution_recorder_mode is None
|
730
1014
|
):
|
@@ -736,18 +1020,24 @@ class ServerArgs:
|
|
736
1020
|
elif self.expert_distribution_recorder_mode is not None:
|
737
1021
|
self.expert_distribution_recorder_buffer_size = 1000
|
738
1022
|
|
739
|
-
|
1023
|
+
def _handle_pipeline_parallelism(self):
|
740
1024
|
if self.pp_size > 1:
|
741
1025
|
self.disable_overlap_schedule = True
|
742
1026
|
logger.warning(
|
743
1027
|
"Pipeline parallelism is incompatible with overlap schedule."
|
744
1028
|
)
|
745
1029
|
|
746
|
-
|
1030
|
+
def _handle_hicache(self):
|
747
1031
|
if self.hicache_storage_backend == "mooncake":
|
748
|
-
|
749
|
-
|
750
|
-
|
1032
|
+
if self.hicache_mem_layout == "layer_first":
|
1033
|
+
if self.hicache_io_backend == "direct":
|
1034
|
+
self.hicache_mem_layout = "page_first_direct"
|
1035
|
+
elif self.hicache_io_backend == "kernel":
|
1036
|
+
self.hicache_mem_layout = "page_first"
|
1037
|
+
logger.warning(
|
1038
|
+
f"Mooncake storage backend does not support layer_first layout, "
|
1039
|
+
f"switching to {self.hicache_mem_layout} layout for {self.hicache_io_backend} io backend"
|
1040
|
+
)
|
751
1041
|
|
752
1042
|
if self.hicache_mem_layout == "page_first_direct":
|
753
1043
|
if self.hicache_io_backend != "direct":
|
@@ -756,17 +1046,16 @@ class ServerArgs:
|
|
756
1046
|
"Page first direct layout only support direct io backend"
|
757
1047
|
)
|
758
1048
|
|
759
|
-
|
1049
|
+
def _handle_speculative_decoding(self):
|
760
1050
|
if self.speculative_algorithm == "NEXTN":
|
761
|
-
# NEXTN shares the same implementation of EAGLE
|
762
1051
|
self.speculative_algorithm = "EAGLE"
|
763
1052
|
|
764
1053
|
if self.speculative_algorithm in ("EAGLE", "EAGLE3", "STANDALONE"):
|
765
|
-
if self.speculative_algorithm == "STANDALONE":
|
1054
|
+
if self.speculative_algorithm == "STANDALONE" and self.enable_dp_attention:
|
766
1055
|
# TODO: support dp attention for standalone speculative decoding
|
767
|
-
|
768
|
-
|
769
|
-
)
|
1056
|
+
raise ValueError(
|
1057
|
+
"Currently standalone speculative decoding does not support dp attention."
|
1058
|
+
)
|
770
1059
|
if self.max_running_requests is None:
|
771
1060
|
self.max_running_requests = 48
|
772
1061
|
self.disable_overlap_schedule = True
|
@@ -783,12 +1072,12 @@ class ServerArgs:
|
|
783
1072
|
|
784
1073
|
model_arch = self.get_hf_config().architectures[0]
|
785
1074
|
if model_arch in [
|
1075
|
+
"DeepseekV32ForCausalLM",
|
786
1076
|
"DeepseekV3ForCausalLM",
|
787
1077
|
"Glm4MoeForCausalLM",
|
788
1078
|
"BailingMoeForCausalLM",
|
789
1079
|
"BailingMoeV2ForCausalLM",
|
790
1080
|
]:
|
791
|
-
# Auto set draft_model_path DeepSeek-V3/R1
|
792
1081
|
if self.speculative_draft_model_path is None:
|
793
1082
|
self.speculative_draft_model_path = self.model_path
|
794
1083
|
else:
|
@@ -796,7 +1085,6 @@ class ServerArgs:
|
|
796
1085
|
"DeepSeek MTP does not require setting speculative_draft_model_path."
|
797
1086
|
)
|
798
1087
|
|
799
|
-
# Auto choose parameters
|
800
1088
|
if self.speculative_num_steps is None:
|
801
1089
|
assert (
|
802
1090
|
self.speculative_eagle_topk is None
|
@@ -836,11 +1124,43 @@ class ServerArgs:
|
|
836
1124
|
"speculative_eagle_topk > 1 with page_size > 1 is unstable and produces incorrect results for paged attention backends. This combination is only supported for the 'flashinfer' backend."
|
837
1125
|
)
|
838
1126
|
|
839
|
-
|
840
|
-
|
841
|
-
|
1127
|
+
if self.speculative_algorithm == "NGRAM":
|
1128
|
+
if not self.device.startswith("cuda"):
|
1129
|
+
raise ValueError(
|
1130
|
+
"Ngram speculative decoding only supports CUDA device."
|
1131
|
+
)
|
1132
|
+
if self.max_running_requests is None:
|
1133
|
+
self.max_running_requests = 48
|
1134
|
+
self.disable_overlap_schedule = True
|
1135
|
+
self.enable_mixed_chunk = False
|
1136
|
+
self.speculative_eagle_topk = self.speculative_ngram_max_bfs_breadth
|
1137
|
+
if self.speculative_num_draft_tokens is None:
|
1138
|
+
self.speculative_num_draft_tokens = (
|
1139
|
+
self.speculative_ngram_max_match_window_size
|
1140
|
+
)
|
1141
|
+
logger.warning(
|
1142
|
+
"The overlap scheduler and mixed chunked prefill are disabled because of "
|
1143
|
+
"using ngram speculative decoding."
|
1144
|
+
)
|
842
1145
|
|
843
|
-
|
1146
|
+
if (
|
1147
|
+
self.speculative_eagle_topk > 1
|
1148
|
+
and self.page_size > 1
|
1149
|
+
and self.attention_backend != "flashinfer"
|
1150
|
+
):
|
1151
|
+
raise ValueError(
|
1152
|
+
f"speculative_eagle_topk({self.speculative_eagle_topk}) > 1 "
|
1153
|
+
f"with page_size({self.page_size}) > 1 is unstable "
|
1154
|
+
"and produces incorrect results for paged attention backends. "
|
1155
|
+
"This combination is only supported for the 'flashinfer' backend."
|
1156
|
+
)
|
1157
|
+
if self.enable_dp_attention:
|
1158
|
+
# TODO: support dp attention for ngram speculative decoding
|
1159
|
+
raise ValueError(
|
1160
|
+
"Currently ngram speculative decoding does not support dp attention."
|
1161
|
+
)
|
1162
|
+
|
1163
|
+
def _handle_load_format(self):
|
844
1164
|
if (
|
845
1165
|
self.load_format == "auto" or self.load_format == "gguf"
|
846
1166
|
         ) and check_gguf_file(self.model_path):
@@ -848,6 +1168,7 @@ class ServerArgs:
 
         if is_remote_url(self.model_path):
             self.load_format = "remote"
+
         if self.custom_weight_loader is None:
             self.custom_weight_loader = []
 
@@ -859,7 +1180,7 @@ class ServerArgs:
         ):
             self.load_format = "auto"
 
-
+    def _handle_disaggregation(self):
         if self.disaggregation_mode == "decode":
             assert (
                 self.disaggregation_decode_tp is None
@@ -885,42 +1206,84 @@ class ServerArgs:
 
             self.disaggregation_prefill_pp = self.pp_size
             self.validate_disagg_tp_size(self.tp_size, self.disaggregation_decode_tp)
-
             self.disable_cuda_graph = True
             logger.warning("Cuda graph is disabled for prefill server")
 
-
+    def _handle_tokenizer_batching(self):
         if self.enable_tokenizer_batch_encode and self.enable_dynamic_batch_tokenizer:
             raise ValueError(
                 "Cannot enable both --enable-tokenizer-batch-encode and --enable-dynamic-batch-tokenizer. "
                 "Please choose one tokenizer batching approach."
             )
 
-
+    def _handle_environment_variables(self):
         os.environ["SGLANG_ENABLE_TORCH_COMPILE"] = (
             "1" if self.enable_torch_compile else "0"
         )
         os.environ["SGLANG_MAMBA_SSM_DTYPE"] = self.mamba_ssm_dtype
-
-        # Set env var before grammar backends init
         os.environ["SGLANG_DISABLE_OUTLINES_DISK_CACHE"] = (
             "1" if self.disable_outlines_disk_cache else "0"
         )
+        os.environ["SGLANG_ENABLE_DETERMINISTIC_INFERENCE"] = (
+            "1" if self.enable_deterministic_inference else "0"
+        )
 
+    def _handle_cache_compatibility(self):
         if self.enable_hierarchical_cache and self.disable_radix_cache:
             raise ValueError(
                 "The arguments enable-hierarchical-cache and disable-radix-cache are mutually exclusive "
                 "and cannot be used at the same time. Please use only one of them."
             )
 
+        if (
+            self.disaggregation_decode_enable_offload_kvcache
+            and self.disaggregation_mode != "decode"
+        ):
+            raise ValueError(
+                "The argument disaggregation-decode-enable-offload-kvcache is only supported for decode side."
+            )
+
+    def _handle_metrics_labels(self):
         if (
             not self.tokenizer_metrics_custom_labels_header
-            and self.
+            and self.tokenizer_metrics_allowed_custom_labels
         ):
             raise ValueError(
-                "Please set --tokenizer-metrics-custom-labels-header when setting --tokenizer-metrics-allowed-
+                "Please set --tokenizer-metrics-custom-labels-header when setting --tokenizer-metrics-allowed-custom-labels."
             )
 
+    def _handle_deterministic_inference(self):
+        if self.enable_deterministic_inference:
+            # Check sampling backend
+            self.sampling_backend = "pytorch"
+            logger.warning(
+                "Sampling backend is set to pytorch for deterministic inference."
+            )
+
+            # Check attention backend
+            if self.attention_backend not in DETERMINISTIC_ATTENTION_BACKEND_CHOICES:
+                raise ValueError(
+                    f"Currently only {DETERMINISTIC_ATTENTION_BACKEND_CHOICES} attention backends are supported for deterministic inference."
+                )
+
+            # Currently, only FA3 supports radix cache. Support for other backends is in progress
+            if self.attention_backend != "fa3":
+                self.disable_radix_cache = True
+                logger.warning(
+                    f"Currently radix cache is not compatible with {self.attention_backend} attention backend for deterministic inference. It will be supported in the future."
+                )
+
+            # Check TP size
+            if self.tp_size > 1:
+                os.environ["NCCL_ALGO"] = "allreduce:tree"
+                self.disable_custom_all_reduce = True
+                logger.warning(
+                    "NCCL_ALGO is set to 'allreduce:tree' and custom all reduce is disabled for deterministic inference when TP size > 1."
+                )
+
+    def _handle_other_validations(self):
+        pass
+
     @staticmethod
     def add_cli_args(parser: argparse.ArgumentParser):
         # Model and tokenizer
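The `__post_init__` validation above is now split into focused `_handle_*` helpers, and the new `_handle_deterministic_inference` forces the PyTorch sampling backend, restricts the attention backend to `DETERMINISTIC_ATTENTION_BACKEND_CHOICES`, drops the radix cache for non-FA3 backends, and pins NCCL to a tree all-reduce when TP > 1. A minimal standalone sketch of that adjustment logic, for illustration only (the backend list and function name below are assumptions, not the sglang API):

```python
import os

# Assumed stand-in for DETERMINISTIC_ATTENTION_BACKEND_CHOICES in server_args.py.
DETERMINISTIC_BACKENDS = ["fa3", "flashinfer", "triton"]


def deterministic_adjustments(attention_backend: str, tp_size: int) -> dict:
    """Mirror the adjustments made by _handle_deterministic_inference (illustrative)."""
    if attention_backend not in DETERMINISTIC_BACKENDS:
        raise ValueError(
            f"{attention_backend} is not supported for deterministic inference"
        )
    adjustments = {
        "sampling_backend": "pytorch",
        # Per the diff above, only FA3 currently keeps the radix cache in deterministic mode.
        "disable_radix_cache": attention_backend != "fa3",
    }
    if tp_size > 1:
        # A fixed reduction order across ranks is needed for bitwise-stable results.
        os.environ["NCCL_ALGO"] = "allreduce:tree"
        adjustments["disable_custom_all_reduce"] = True
    return adjustments


print(deterministic_adjustments("fa3", tp_size=2))
```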
@@ -931,24 +1294,6 @@ class ServerArgs:
             help="The path of the model weights. This can be a local folder or a Hugging Face repo ID.",
             required=True,
         )
-        parser.add_argument(
-            "--remote-instance-weight-loader-seed-instance-ip",
-            type=str,
-            default=ServerArgs.remote_instance_weight_loader_seed_instance_ip,
-            help="The ip of the seed instance for loading weights from remote instance.",
-        )
-        parser.add_argument(
-            "--remote-instance-weight-loader-seed-instance-service-port",
-            type=int,
-            default=ServerArgs.remote_instance_weight_loader_seed_instance_service_port,
-            help="The service port of the seed instance for loading weights from remote instance.",
-        )
-        parser.add_argument(
-            "--remote-instance-weight-loader-send-weights-group-ports",
-            type=json_list_type,
-            default=ServerArgs.remote_instance_weight_loader_send_weights_group_ports,
-            help="The communication group ports for loading weights from remote instance.",
-        )
         parser.add_argument(
             "--tokenizer-path",
             type=str,
@@ -1117,6 +1462,11 @@ class ServerArgs:
             choices=["auto", "fp8_e5m2", "fp8_e4m3"],
             help='Data type for kv cache storage. "auto" will use model data type. "fp8_e5m2" and "fp8_e4m3" is supported for CUDA 11.8+.',
         )
+        parser.add_argument(
+            "--enable-fp32-lm-head",
+            action="store_true",
+            help="If set, the LM head outputs (logits) are in FP32.",
+        )
 
         # Memory and scheduling
         parser.add_argument(
@@ -1163,6 +1513,24 @@ class ServerArgs:
             choices=["lpm", "random", "fcfs", "dfs-weight", "lof", "priority"],
             help="The scheduling policy of the requests.",
         )
+        parser.add_argument(
+            "--enable-priority-scheduling",
+            action="store_true",
+            default=ServerArgs.enable_priority_scheduling,
+            help="Enable priority scheduling. Requests with higher priority integer values will be scheduled first by default.",
+        )
+        parser.add_argument(
+            "--schedule-low-priority-values-first",
+            action="store_true",
+            default=ServerArgs.schedule_low_priority_values_first,
+            help="If specified with --enable-priority-scheduling, the scheduler will schedule requests with lower priority integer values first.",
+        )
+        parser.add_argument(
+            "--priority-scheduling-preemption-threshold",
+            type=int,
+            default=ServerArgs.priority_scheduling_preemption_threshold,
+            help="Minimum difference in priorities for an incoming request to have to preempt running request(s).",
+        )
         parser.add_argument(
             "--schedule-conservativeness",
             type=float,
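The three new flags above add priority-aware scheduling: higher integer priorities win by default, `--schedule-low-priority-values-first` inverts the comparison, and the preemption threshold sets how much higher an incoming request's priority must be before running requests are preempted. A small illustrative sketch of a sort key and preemption test under those rules (not the scheduler's actual code):

```python
from dataclasses import dataclass


@dataclass
class Req:
    rid: str
    priority: int
    arrival: float


def sort_key(req: Req, low_values_first: bool):
    # Higher priority values first by default; arrival time breaks ties (FCFS-like).
    effective = req.priority if low_values_first else -req.priority
    return (effective, req.arrival)


def should_preempt(incoming: Req, running: Req, threshold: int, low_values_first: bool) -> bool:
    gap = (
        running.priority - incoming.priority
        if low_values_first
        else incoming.priority - running.priority
    )
    return gap >= threshold


queue = [Req("a", 1, 0.0), Req("b", 5, 1.0), Req("c", 5, 0.5)]
print([r.rid for r in sorted(queue, key=lambda r: sort_key(r, low_values_first=False))])
# -> ['c', 'b', 'a']: priority 5 first, earlier arrival breaks the tie
print(should_preempt(Req("d", 9, 2.0), Req("a", 1, 0.0), threshold=5, low_values_first=False))
# -> True
```

Note that a later hunk in this diff also restricts `--enable-priority-scheduling` to the 'fcfs' and 'lof' schedule policies.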
@@ -1338,16 +1706,16 @@ class ServerArgs:
             "--tokenizer-metrics-custom-labels-header",
             type=str,
             default=ServerArgs.tokenizer_metrics_custom_labels_header,
-            help="Specify the HTTP header for passing
+            help="Specify the HTTP header for passing custom labels for tokenizer metrics.",
         )
         parser.add_argument(
-            "--tokenizer-metrics-allowed-
+            "--tokenizer-metrics-allowed-custom-labels",
             type=str,
             nargs="+",
-            default=ServerArgs.
-            help="The
+            default=ServerArgs.tokenizer_metrics_allowed_custom_labels,
+            help="The custom labels allowed for tokenizer metrics. The labels are specified via a dict in "
             "'--tokenizer-metrics-custom-labels-header' field in HTTP requests, e.g., {'label1': 'value1', 'label2': "
-            "'value2'} is allowed if '--tokenizer-metrics-allowed-labels label1 label2' is set.",
+            "'value2'} is allowed if '--tokenizer-metrics-allowed-custom-labels label1 label2' is set.",
         )
         parser.add_argument(
             "--bucket-time-to-first-token",
@@ -1379,8 +1747,8 @@ class ServerArgs:
         bucket_rule = (
             "Supports 3 rule types: 'default' uses predefined buckets; 'tse <middle> <base> <count>' "
             "generates two sides exponential distributed buckets (e.g., 'tse 1000 2 8' generates buckets "
-            "[984.0, 992.0, 996.0, 998.0, 1000.0, 1002.0, 1004.0, 1008.0, 1016.0]).); '
-            "<value2> ...' uses custom bucket values (e.g., '
+            "[984.0, 992.0, 996.0, 998.0, 1000.0, 1002.0, 1004.0, 1008.0, 1016.0]).); 'custom <value1> "
+            "<value2> ...' uses custom bucket values (e.g., 'custom 10 50 100 500')."
         )
         parser.add_argument(
             "--prompt-tokens-buckets",
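The corrected help text documents a 'custom' bucket rule alongside 'default' and 'tse'. The bracketed example implies that 'tse <middle> <base> <count>' places count/2 exponentially spaced values on each side of the middle; a short sketch that reproduces the example output, inferred from the help string rather than taken from the actual implementation:

```python
def tse_buckets(middle: float, base: float, count: int) -> list:
    # count side values in total, half below and half above the middle value.
    offsets = [base**i for i in range(1, count // 2 + 1)]
    return sorted([middle - o for o in offsets] + [middle] + [middle + o for o in offsets])


print(tse_buckets(1000.0, 2.0, 8))
# [984.0, 992.0, 996.0, 998.0, 1000.0, 1002.0, 1004.0, 1008.0, 1016.0]
```

A custom rule is passed verbatim instead, e.g. `--prompt-tokens-buckets custom 10 50 100 500`.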
@@ -1601,9 +1969,17 @@ class ServerArgs:
         parser.add_argument(
             "--lora-backend",
             type=str,
-
+            choices=LORA_BACKEND_CHOICES,
+            default=ServerArgs.lora_backend,
             help="Choose the kernel backend for multi-LoRA serving.",
         )
+        parser.add_argument(
+            "--max-lora-chunk-size",
+            type=int,
+            default=ServerArgs.max_lora_chunk_size,
+            choices=[16, 32, 64, 128],
+            help="Maximum chunk size for the ChunkedSGMV LoRA backend. Only used when --lora-backend is 'csgmv'. Choosing a larger value might improve performance.",
+        )
 
         # Kernel backend
         parser.add_argument(
@@ -1644,16 +2020,28 @@ class ServerArgs:
         parser.add_argument(
             "--mm-attention-backend",
             type=str,
-            choices=["sdpa", "fa3", "triton_attn"],
+            choices=["sdpa", "fa3", "triton_attn", "ascend_attn"],
             default=ServerArgs.mm_attention_backend,
             help="Set multimodal attention backend.",
         )
+        parser.add_argument(
+            "--nsa-prefill",
+            default=ServerArgs.nsa_prefill,
+            type=str,
+            choices=NSA_CHOICES,
+        )
+        parser.add_argument(
+            "--nsa-decode",
+            default=ServerArgs.nsa_decode,
+            type=str,
+            choices=NSA_CHOICES,
+        )
 
         # Speculative decoding
         parser.add_argument(
             "--speculative-algorithm",
             type=str,
-            choices=["EAGLE", "EAGLE3", "NEXTN", "STANDALONE"],
+            choices=["EAGLE", "EAGLE3", "NEXTN", "STANDALONE", "NGRAM"],
             help="Speculative algorithm.",
         )
         parser.add_argument(
@@ -1713,6 +2101,50 @@ class ServerArgs:
             help="Attention backend for speculative decoding operations (both target verify and draft extend). Can be one of 'prefill' (default) or 'decode'.",
             default=ServerArgs.speculative_attention_mode,
         )
+        # Ngram speculative decoding
+        parser.add_argument(
+            "--speculative-ngram-min-match-window-size",
+            type=int,
+            default=ServerArgs.speculative_ngram_min_match_window_size,
+            help="The minimum window size for pattern matching in ngram speculative decoding.",
+        )
+        parser.add_argument(
+            "--speculative-ngram-max-match-window-size",
+            type=int,
+            default=ServerArgs.speculative_ngram_max_match_window_size,
+            help="The maximum window size for pattern matching in ngram speculative decoding.",
+        )
+        parser.add_argument(
+            "--speculative-ngram-min-bfs-breadth",
+            type=int,
+            default=ServerArgs.speculative_ngram_min_bfs_breadth,
+            help="The minimum breadth for BFS (Breadth-First Search) in ngram speculative decoding.",
+        )
+        parser.add_argument(
+            "--speculative-ngram-max-bfs-breadth",
+            type=int,
+            default=ServerArgs.speculative_ngram_max_bfs_breadth,
+            help="The maximum breadth for BFS (Breadth-First Search) in ngram speculative decoding.",
+        )
+        parser.add_argument(
+            "--speculative-ngram-match-type",
+            type=str,
+            choices=["BFS", "PROB"],
+            default=ServerArgs.speculative_ngram_match_type,
+            help="The match type for cache tree.",
+        )
+        parser.add_argument(
+            "--speculative-ngram-branch-length",
+            type=int,
+            default=ServerArgs.speculative_ngram_branch_length,
+            help="The branch length for ngram speculative decoding.",
+        )
+        parser.add_argument(
+            "--speculative-ngram-capacity",
+            type=int,
+            default=ServerArgs.speculative_ngram_capacity,
+            help="The cache capacity for ngram speculative decoding.",
+        )
 
         # Expert parallelism
         parser.add_argument(
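The new NGRAM speculative algorithm drafts tokens by matching a suffix of the generated sequence against earlier context, which is what the min/max match window, BFS breadth, branch length, and cache capacity flags above tune. A conceptual sketch of window-based matching, for illustration only (sglang's implementation uses a dedicated cache tree rather than this linear scan):

```python
def ngram_draft(tokens, min_window=1, max_window=4, branch_length=4):
    """Try the largest suffix window first, then fall back to smaller windows."""
    for w in range(min(max_window, len(tokens) - 1), min_window - 1, -1):
        suffix = tuple(tokens[-w:])
        for start in range(len(tokens) - w - 1, -1, -1):
            if tuple(tokens[start:start + w]) == suffix:
                # Propose the tokens that followed the earlier occurrence.
                return tokens[start + w:start + w + branch_length]
    return []


print(ngram_draft([1, 2, 3, 4, 2, 3], min_window=1, max_window=2, branch_length=2))
# suffix (2, 3) matches at position 1 -> drafts [4, 2]
```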
@@ -1881,6 +2313,13 @@ class ServerArgs:
             default=ServerArgs.hicache_write_policy,
             help="The write policy of hierarchical cache.",
         )
+        parser.add_argument(
+            "--radix-eviction-policy",
+            type=str,
+            choices=RADIX_EVICTION_POLICY_CHOICES,
+            default=ServerArgs.radix_eviction_policy,
+            help="The eviction policy of radix trees. 'lru' stands for Least Recently Used, 'lfu' stands for Least Frequently Used.",
+        )
         parser.add_argument(
             "--hicache-io-backend",
             type=str,
@@ -1898,9 +2337,12 @@ class ServerArgs:
         parser.add_argument(
             "--hicache-storage-backend",
             type=str,
-            choices=["file", "mooncake", "hf3fs", "nixl"],
+            choices=["file", "mooncake", "hf3fs", "nixl", "aibrix", "dynamic", "eic"],
             default=ServerArgs.hicache_storage_backend,
-            help="The storage backend for hierarchical KV cache."
+            help="The storage backend for hierarchical KV cache. "
+            "Built-in backends: file, mooncake, hf3fs, nixl, aibrix. "
+            "For dynamic backend, use --hicache-storage-backend-extra-config to specify: "
+            "backend_name (custom name), module_path (Python module path), class_name (backend class name).",
         )
         parser.add_argument(
             "--hicache-storage-prefetch-policy",
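Besides the new built-in backends, the 'dynamic' choice lets users plug in their own storage class, resolved from `--hicache-storage-backend-extra-config` via `backend_name`, `module_path`, and `class_name`. A sketch of what such an extra-config payload might look like; the JSON encoding and the concrete values below are assumptions based on the help text, not a documented format:

```python
import json

# Hypothetical extra config describing a user-provided storage backend.
extra_config = {
    "backend_name": "my_kv_store",        # custom name used to register the backend
    "module_path": "my_pkg.kv_backend",   # importable Python module
    "class_name": "MyKVStorageBackend",   # class implementing the storage interface
}
print(json.dumps(extra_config))
```

The resulting string would presumably be passed together with `--hicache-storage-backend dynamic`.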
@@ -2064,6 +2506,11 @@ class ServerArgs:
             action="store_true",
             help="Enable using mscclpp for small messages for all-reduce kernel and fall back to NCCL.",
         )
+        parser.add_argument(
+            "--enable-torch-symm-mem",
+            action="store_true",
+            help="Enable using torch symm mem for all-reduce kernel and fall back to NCCL. Only supports CUDA device SM90 and above. SM90 supports world size 4, 6, 8. SM10 supports world size 6, 8.",
+        )
         parser.add_argument(
             "--disable-overlap-schedule",
             action="store_true",
@@ -2089,6 +2536,11 @@ class ServerArgs:
             action="store_true",
             help="Enabling two micro batches to overlap.",
         )
+        parser.add_argument(
+            "--enable-single-batch-overlap",
+            action="store_true",
+            help="Let computation and communication overlap within one micro batch.",
+        )
         parser.add_argument(
             "--tbo-token-distribution-threshold",
             type=float,
@@ -2158,6 +2610,11 @@ class ServerArgs:
             action="store_true",
             help="Allow saving memory using release_memory_occupation and resume_memory_occupation",
         )
+        parser.add_argument(
+            "--enable-weights-cpu-backup",
+            action="store_true",
+            help="Save model weights to CPU memory during release_weights_occupation and resume_weights_occupation",
+        )
         parser.add_argument(
             "--allow-auto-truncate",
             action="store_true",
@@ -2188,6 +2645,11 @@ class ServerArgs:
             action="store_true",
             help="Adopt base image processor instead of fast image processor.",
         )
+        parser.add_argument(
+            "--keep-mm-feature-on-device",
+            action="store_true",
+            help="Keep multimodal feature tensors on device after processing to save D2H copy.",
+        )
         parser.add_argument(
             "--enable-return-hidden-states",
             action="store_true",
@@ -2295,6 +2757,11 @@ class ServerArgs:
             "or multiple comma-separated devices (e.g., --disaggregation-ib-device mlx5_0,mlx5_1). "
             "Default is None, which triggers automatic device detection when mooncake backend is enabled.",
         )
+        parser.add_argument(
+            "--disaggregation-decode-enable-offload-kvcache",
+            action="store_true",
+            help="Enable async KV cache offloading on decode server (PD mode).",
+        )
         parser.add_argument(
             "--num-reserved-decode-tokens",
             type=int,
@@ -2321,6 +2788,24 @@ class ServerArgs:
             action="store_true",
             help="Disable mmap while loading weight using safetensors.",
         )
+        parser.add_argument(
+            "--remote-instance-weight-loader-seed-instance-ip",
+            type=str,
+            default=ServerArgs.remote_instance_weight_loader_seed_instance_ip,
+            help="The ip of the seed instance for loading weights from remote instance.",
+        )
+        parser.add_argument(
+            "--remote-instance-weight-loader-seed-instance-service-port",
+            type=int,
+            default=ServerArgs.remote_instance_weight_loader_seed_instance_service_port,
+            help="The service port of the seed instance for loading weights from remote instance.",
+        )
+        parser.add_argument(
+            "--remote-instance-weight-loader-send-weights-group-ports",
+            type=json_list_type,
+            default=ServerArgs.remote_instance_weight_loader_send_weights_group_ports,
+            help="The communication group ports for loading weights from remote instance.",
+        )
 
         # For PD-Multiplexing
         parser.add_argument(
@@ -2336,41 +2821,55 @@ class ServerArgs:
             help="Number of sm partition groups.",
         )
 
+        # For deterministic inference
+        parser.add_argument(
+            "--enable-deterministic-inference",
+            action="store_true",
+            help="Enable deterministic inference mode with batch invariant ops.",
+        )
+
         # Deprecated arguments
         parser.add_argument(
             "--enable-ep-moe",
-            action=
-            help="
+            action=DeprecatedAction,
+            help="NOTE: --enable-ep-moe is deprecated. Please set `--ep-size` to the same value as `--tp-size` instead.",
         )
         parser.add_argument(
             "--enable-deepep-moe",
-            action=
-            help="
+            action=DeprecatedAction,
+            help="NOTE: --enable-deepep-moe is deprecated. Please set `--moe-a2a-backend` to 'deepep' instead.",
         )
         parser.add_argument(
             "--enable-flashinfer-cutlass-moe",
-            action=
-            help="
+            action=DeprecatedAction,
+            help="NOTE: --enable-flashinfer-cutlass-moe is deprecated. Please set `--moe-runner-backend` to 'flashinfer_cutlass' instead.",
         )
         parser.add_argument(
             "--enable-flashinfer-cutedsl-moe",
-            action=
-            help="
+            action=DeprecatedAction,
+            help="NOTE: --enable-flashinfer-cutedsl-moe is deprecated. Please set `--moe-runner-backend` to 'flashinfer_cutedsl' instead.",
         )
         parser.add_argument(
             "--enable-flashinfer-trtllm-moe",
-            action=
-            help="
+            action=DeprecatedAction,
+            help="NOTE: --enable-flashinfer-trtllm-moe is deprecated. Please set `--moe-runner-backend` to 'flashinfer_trtllm' instead.",
        )
         parser.add_argument(
             "--enable-triton-kernel-moe",
-            action=
-            help="
+            action=DeprecatedAction,
+            help="NOTE: --enable-triton-kernel-moe is deprecated. Please set `--moe-runner-backend` to 'triton_kernel' instead.",
         )
         parser.add_argument(
             "--enable-flashinfer-mxfp4-moe",
-            action=
-            help="
+            action=DeprecatedAction,
+            help="NOTE: --enable-flashinfer-mxfp4-moe is deprecated. Please set `--moe-runner-backend` to 'flashinfer_mxfp4' instead.",
+        )
+
+        # Configuration file support
+        parser.add_argument(
+            "--config",
+            type=str,
+            help="Read CLI options from a config file. Must be a YAML file with configuration options.",
         )
 
     @classmethod
@@ -2451,6 +2950,13 @@ class ServerArgs:
             "--generation-tokens-buckets", self.generation_tokens_buckets
         )
 
+        # Check scheduling policy
+        if self.enable_priority_scheduling:
+            assert self.schedule_policy in [
+                "fcfs",
+                "lof",
+            ], f"To use priority scheduling, schedule_policy must be 'fcfs' or 'lof'. '{self.schedule_policy}' is not supported."
+
     def check_lora_server_args(self):
         assert self.max_loras_per_batch > 0, "max_loras_per_batch must be positive"
 
@@ -2534,6 +3040,12 @@ class ServerArgs:
                 f"max_loaded_loras={self.max_loaded_loras}, lora_paths={len(self.lora_paths)}"
             )
 
+        if self.max_lora_chunk_size is not None:
+            assert (
+                16 <= self.max_lora_chunk_size <= 128
+                and (self.max_lora_chunk_size & (self.max_lora_chunk_size - 1)) == 0
+            ), "--max-lora-chunk-size must be a power of 2 between 16 and 128."
+
     def validate_disagg_tp_size(self, prefill_tp: int, decode_tp: int):
         larger_tp = max(decode_tp, prefill_tp)
         smaller_tp = min(decode_tp, prefill_tp)
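The new LoRA check accepts only power-of-two chunk sizes between 16 and 128, using the standard `n & (n - 1) == 0` bit trick: clearing the lowest set bit leaves zero only when exactly one bit is set. A tiny standalone version of the same predicate:

```python
def is_valid_lora_chunk_size(n: int) -> bool:
    # A power of two has exactly one bit set, so n & (n - 1) clears it to zero.
    return 16 <= n <= 128 and (n & (n - 1)) == 0


print([n for n in (8, 16, 24, 32, 64, 100, 128, 256) if is_valid_lora_chunk_size(n)])
# [16, 32, 64, 128]
```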
@@ -2551,8 +3063,8 @@ class ServerArgs:
         assert rule in [
             "tse",
             "default",
-            "
-        ], f"Unsupported {arg_name} rule type: '{rule}'. Must be one of: 'tse', 'default', '
+            "custom",
+        ], f"Unsupported {arg_name} rule type: '{rule}'. Must be one of: 'tse', 'default', 'custom'"
 
         if rule == "tse":
             assert (
@@ -2575,95 +3087,20 @@ class ServerArgs:
                 len(buckets_rule) == 1
             ), f"{arg_name} default rule should only have one parameter: ['default'], got {len(buckets_rule)}"
 
-        elif rule == "
+        elif rule == "custom":
             assert (
                 len(buckets_rule) >= 2
-            ), f"{arg_name}
+            ), f"{arg_name} custom rule requires at least one bucket value: ['custom', value1, ...]"
             try:
                 bucket_values = [float(x) for x in buckets_rule[1:]]
             except ValueError:
-                assert False, f"{arg_name}
+                assert False, f"{arg_name} custom rule bucket values must be numeric"
             assert len(set(bucket_values)) == len(
                 bucket_values
-            ), f"{arg_name}
+            ), f"{arg_name} custom rule bucket values should not contain duplicates"
             assert all(
                 val >= 0 for val in bucket_values
-            ), f"{arg_name}
-
-    def model_specific_adjustments(self):
-        hf_config = self.get_hf_config()
-        model_arch = hf_config.architectures[0]
-        if model_arch in ["GptOssForCausalLM"]:
-            if self.attention_backend is None:
-                if is_cuda() and is_sm100_supported():
-                    self.attention_backend = "trtllm_mha"
-                elif is_cuda() and is_sm90_supported():
-                    self.attention_backend = "fa3"
-                else:
-                    self.attention_backend = "triton"
-            supported_backends = ["triton", "trtllm_mha", "fa3"]
-            logger.info(
-                f"Use {self.attention_backend} as attention backend for GptOssForCausalLM"
-            )
-            assert (
-                self.attention_backend in supported_backends
-            ), f"GptOssForCausalLM requires one of {supported_backends} attention backend, but got '{self.attention_backend}'"
-
-            if is_sm100_supported():
-                if not self.enable_dp_attention:
-                    self.enable_flashinfer_allreduce_fusion = True
-                    logger.info(
-                        "Enable FlashInfer AllReduce Fusion on sm100 for GptOssForCausalLM"
-                    )
-            quantization_config = getattr(hf_config, "quantization_config", None)
-            is_mxfp4_quant_format = (
-                quantization_config is not None
-                and quantization_config.get("quant_method") == "mxfp4"
-            )
-
-            if is_sm100_supported() and is_mxfp4_quant_format:
-                self.moe_runner_backend = "flashinfer_mxfp4"
-                logger.warning(
-                    "Detected SM100 and MXFP4 quantization format for GPT-OSS model, enabling FlashInfer MXFP4 MOE kernel."
-                )
-            else:
-                if self.moe_runner_backend == "triton_kernel":
-                    assert (
-                        self.ep_size == 1
-                    ), "Triton kernel MoE is only supported when ep_size == 1"
-                if (
-                    self.moe_runner_backend == "auto"
-                    and self.ep_size == 1
-                    and is_triton_kernels_available()
-                ):
-                    self.moe_runner_backend = "triton_kernel"
-                    logger.warning(
-                        "Detected GPT-OSS model, enabling triton_kernels MOE kernel."
-                    )
-                self.disable_hybrid_swa_memory = True
-                if is_mxfp4_quant_format:
-                    # use bf16 for mxfp4 triton kernels
-                    self.dtype = "bfloat16"
-
-        elif "Llama4" in model_arch:
-            assert self.attention_backend in {
-                "fa3",
-                "aiter",
-                "triton",
-            }, "fa3, aiter, or triton is required for Llama4 model"
-        elif model_arch in [
-            "Gemma2ForCausalLM",
-            "Gemma3ForCausalLM",
-            "Gemma3ForConditionalGeneration",
-            "Gemma3nForCausalLM",
-            "Gemma3nForConditionalGeneration",
-        ]:
-            # FIXME: https://github.com/sgl-project/sglang/pull/7367 is not compatible with gemma2 model.
-            # It failed at this test: https://github.com/sgl-project/sglang/actions/runs/16255155597/job/45890331952#step:4:736
-            logger.warning(
-                f"Disable hybrid SWA memory for {model_arch} as it is not yet supported."
-            )
-            self.disable_hybrid_swa_memory = True
+            ), f"{arg_name} custom rule bucket values should be non-negative"
 
     def adjust_mem_fraction_for_vlm(self, model_config):
         vision_config = getattr(model_config.hf_config, "vision_config", None)
@@ -2715,6 +3152,26 @@ def prepare_server_args(argv: List[str]) -> ServerArgs:
     Returns:
         The server arguments.
     """
+    # Import here to avoid circular imports
+    from sglang.srt.server_args_config_parser import ConfigArgumentMerger
+
+    # Check for config file and merge arguments if present
+    if "--config" in argv:
+        # Extract boolean actions from the parser to handle them correctly
+        parser = argparse.ArgumentParser()
+        ServerArgs.add_cli_args(parser)
+
+        # Get boolean action destinations
+        boolean_actions = []
+        for action in parser._actions:
+            if hasattr(action, "dest") and hasattr(action, "action"):
+                if action.action in ["store_true", "store_false"]:
+                    boolean_actions.append(action.dest)
+
+        # Merge config file arguments with CLI arguments
+        config_merger = ConfigArgumentMerger(boolean_actions=boolean_actions)
+        argv = config_merger.merge_config_with_args(argv)
+
     parser = argparse.ArgumentParser()
     ServerArgs.add_cli_args(parser)
     raw_args = parser.parse_args(argv)
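`prepare_server_args` now pre-processes `--config`: it collects every store_true/store_false destination from the parser and hands the argument list to `ConfigArgumentMerger`, so YAML-provided options are expanded into regular CLI flags before parsing. A conceptual sketch of what such a merger could do (an illustration of the idea, not the actual `sglang.srt.server_args_config_parser` implementation; key naming and precedence below are assumptions):

```python
import yaml  # PyYAML


def merge_config_with_args(argv, boolean_actions):
    """Expand a YAML config referenced by --config into plain CLI flags."""
    i = argv.index("--config")
    with open(argv[i + 1]) as f:
        cfg = yaml.safe_load(f) or {}
    expanded = []
    for key, value in cfg.items():
        dest = key.replace("-", "_")
        flag = "--" + key.replace("_", "-")
        if dest in boolean_actions:
            if value:  # booleans become bare flags
                expanded.append(flag)
        else:
            expanded.extend([flag, str(value)])
    # Config-derived flags come first so explicit CLI arguments can still override them.
    return expanded + argv[:i] + argv[i + 2:]
```

Under this sketch, a config file containing `tp-size: 2` would expand to `--tp-size 2` before the normal `argparse` pass.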
@@ -2856,6 +3313,7 @@ def auto_choose_speculative_params(self: ServerArgs):
         # The default value for llama
         return (5, 4, 8)
     elif arch in [
+        "DeepseekV32ForCausalLM",
         "DeepseekV3ForCausalLM",
         "DeepseekV2ForCausalLM",
         "GptOssForCausalLM",