sglang 0.5.3rc0__py3-none-any.whl → 0.5.3rc2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (282)
  1. sglang/bench_one_batch.py +7 -9
  2. sglang/bench_one_batch_server.py +321 -31
  3. sglang/bench_serving.py +10 -3
  4. sglang/global_config.py +2 -2
  5. sglang/lang/backend/runtime_endpoint.py +1 -1
  6. sglang/launch_server.py +14 -0
  7. sglang/profiler.py +2 -2
  8. sglang/srt/batch_invariant_ops/__init__.py +27 -0
  9. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +549 -0
  10. sglang/srt/configs/__init__.py +4 -0
  11. sglang/srt/configs/dots_ocr.py +64 -0
  12. sglang/srt/configs/falcon_h1.py +360 -0
  13. sglang/srt/configs/load_config.py +8 -0
  14. sglang/srt/configs/model_config.py +160 -105
  15. sglang/srt/configs/qwen3_vl.py +586 -0
  16. sglang/srt/constrained/base_grammar_backend.py +1 -0
  17. sglang/srt/constrained/outlines_jump_forward.py +1 -1
  18. sglang/srt/constrained/xgrammar_backend.py +6 -4
  19. sglang/srt/debug_utils/dumper.py +10 -3
  20. sglang/srt/disaggregation/ascend/conn.py +2 -2
  21. sglang/srt/disaggregation/ascend/transfer_engine.py +47 -9
  22. sglang/srt/disaggregation/common/conn.py +266 -98
  23. sglang/srt/disaggregation/decode.py +50 -9
  24. sglang/srt/disaggregation/decode_kvcache_offload_manager.py +185 -0
  25. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +25 -16
  26. sglang/srt/disaggregation/mooncake/conn.py +51 -541
  27. sglang/srt/disaggregation/nixl/conn.py +148 -39
  28. sglang/srt/disaggregation/prefill.py +31 -14
  29. sglang/srt/disaggregation/utils.py +36 -5
  30. sglang/srt/distributed/device_communicators/all_reduce_utils.py +16 -0
  31. sglang/srt/distributed/device_communicators/shm_broadcast.py +4 -2
  32. sglang/srt/distributed/device_communicators/symm_mem.py +164 -0
  33. sglang/srt/distributed/parallel_state.py +135 -80
  34. sglang/srt/entrypoints/engine.py +23 -3
  35. sglang/srt/entrypoints/grpc_request_manager.py +330 -55
  36. sglang/srt/entrypoints/grpc_server.py +232 -102
  37. sglang/srt/entrypoints/http_server.py +49 -9
  38. sglang/srt/entrypoints/openai/protocol.py +110 -5
  39. sglang/srt/entrypoints/openai/serving_base.py +25 -6
  40. sglang/srt/entrypoints/openai/serving_chat.py +178 -49
  41. sglang/srt/entrypoints/openai/serving_completions.py +5 -3
  42. sglang/srt/entrypoints/openai/serving_embedding.py +1 -0
  43. sglang/srt/entrypoints/openai/serving_responses.py +42 -0
  44. sglang/srt/environ.py +285 -0
  45. sglang/srt/eplb/expert_location.py +30 -5
  46. sglang/srt/function_call/function_call_parser.py +3 -2
  47. sglang/srt/function_call/glm4_moe_detector.py +3 -3
  48. sglang/srt/function_call/gpt_oss_detector.py +23 -0
  49. sglang/srt/function_call/json_array_parser.py +63 -0
  50. sglang/srt/function_call/kimik2_detector.py +17 -4
  51. sglang/srt/function_call/utils.py +96 -5
  52. sglang/srt/grpc/compile_proto.py +245 -0
  53. sglang/srt/grpc/sglang_scheduler_pb2.py +73 -68
  54. sglang/srt/grpc/sglang_scheduler_pb2.pyi +60 -53
  55. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +3 -0
  56. sglang/srt/layers/activation.py +7 -6
  57. sglang/srt/layers/attention/aiter_backend.py +14 -15
  58. sglang/srt/layers/attention/ascend_backend.py +108 -9
  59. sglang/srt/layers/attention/attention_registry.py +206 -0
  60. sglang/srt/layers/attention/base_attn_backend.py +12 -3
  61. sglang/srt/layers/attention/cutlass_mla_backend.py +3 -3
  62. sglang/srt/layers/attention/dual_chunk_flashattention_backend.py +1 -1
  63. sglang/srt/layers/attention/fla/chunk_scaled_dot_kkt.py +2 -2
  64. sglang/srt/layers/attention/fla/fused_recurrent.py +4 -4
  65. sglang/srt/layers/attention/fla/fused_sigmoid_gating_recurrent.py +2 -2
  66. sglang/srt/layers/attention/flashattention_backend.py +41 -8
  67. sglang/srt/layers/attention/flashinfer_backend.py +112 -194
  68. sglang/srt/layers/attention/flashinfer_mla_backend.py +11 -15
  69. sglang/srt/layers/attention/flashmla_backend.py +7 -5
  70. sglang/srt/layers/attention/hybrid_attn_backend.py +11 -3
  71. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +72 -72
  72. sglang/srt/layers/attention/mamba/causal_conv1d.py +1 -0
  73. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +15 -98
  74. sglang/srt/layers/attention/mamba/mamba.py +566 -1
  75. sglang/srt/layers/attention/mamba/mamba_utils.py +81 -0
  76. sglang/srt/layers/attention/mamba/ops/__init__.py +2 -0
  77. sglang/srt/layers/attention/mamba/ops/layernorm_gated.py +172 -0
  78. sglang/srt/layers/attention/mamba/ops/mamba_ssm.py +442 -0
  79. sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +264 -0
  80. sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +622 -0
  81. sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +757 -0
  82. sglang/srt/layers/attention/mamba/ops/ssd_combined.py +262 -0
  83. sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +275 -0
  84. sglang/srt/layers/attention/npu_ops/mla_preprocess.py +393 -0
  85. sglang/srt/layers/attention/nsa/dequant_k_cache.py +163 -0
  86. sglang/srt/layers/attention/nsa/index_buf_accessor.py +354 -0
  87. sglang/srt/layers/attention/nsa/nsa_indexer.py +761 -0
  88. sglang/srt/layers/attention/nsa/quant_k_cache.py +255 -0
  89. sglang/srt/layers/attention/nsa/tilelang_kernel.py +785 -0
  90. sglang/srt/layers/attention/nsa/transform_index.py +144 -0
  91. sglang/srt/layers/attention/nsa/utils.py +24 -0
  92. sglang/srt/layers/attention/nsa_backend.py +887 -0
  93. sglang/srt/layers/attention/tbo_backend.py +6 -6
  94. sglang/srt/layers/attention/torch_flex_backend.py +325 -0
  95. sglang/srt/layers/attention/triton_backend.py +42 -9
  96. sglang/srt/layers/attention/trtllm_mha_backend.py +5 -7
  97. sglang/srt/layers/attention/trtllm_mla_backend.py +178 -34
  98. sglang/srt/layers/attention/vision.py +58 -0
  99. sglang/srt/layers/attention/wave_backend.py +4 -4
  100. sglang/srt/layers/communicator.py +8 -0
  101. sglang/srt/layers/dp_attention.py +11 -1
  102. sglang/srt/layers/elementwise.py +3 -1
  103. sglang/srt/layers/layernorm.py +2 -0
  104. sglang/srt/layers/linear.py +21 -4
  105. sglang/srt/layers/logits_processor.py +15 -2
  106. sglang/srt/layers/moe/ep_moe/kernels.py +1 -1
  107. sglang/srt/layers/moe/ep_moe/layer.py +147 -74
  108. sglang/srt/layers/moe/flashinfer_cutedsl_moe.py +52 -25
  109. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  110. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=128,device_name=NVIDIA_H800,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  111. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=512,N=256,device_name=NVIDIA_B200.json +146 -0
  112. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +6 -2
  113. sglang/srt/layers/moe/fused_moe_triton/layer.py +11 -12
  114. sglang/srt/layers/moe/token_dispatcher/deepep.py +77 -19
  115. sglang/srt/layers/moe/utils.py +10 -0
  116. sglang/srt/layers/parameter.py +23 -6
  117. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +1 -0
  118. sglang/srt/layers/quantization/compressed_tensors/schemes/__init__.py +2 -0
  119. sglang/srt/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +173 -0
  120. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +2 -10
  121. sglang/srt/layers/quantization/fp8.py +2 -2
  122. sglang/srt/layers/quantization/fp8_utils.py +1 -1
  123. sglang/srt/layers/quantization/modelopt_quant.py +44 -9
  124. sglang/srt/layers/quantization/mxfp4.py +12 -4
  125. sglang/srt/layers/quantization/quark/quark_moe.py +16 -3
  126. sglang/srt/layers/quantization/w4afp8.py +0 -4
  127. sglang/srt/layers/quantization/w8a8_int8.py +15 -3
  128. sglang/srt/layers/rotary_embedding.py +78 -31
  129. sglang/srt/layers/sampler.py +52 -4
  130. sglang/srt/layers/utils.py +23 -0
  131. sglang/srt/lora/backend/base_backend.py +3 -3
  132. sglang/srt/lora/backend/chunked_backend.py +348 -0
  133. sglang/srt/lora/backend/triton_backend.py +10 -4
  134. sglang/srt/lora/lora.py +7 -5
  135. sglang/srt/lora/lora_manager.py +17 -6
  136. sglang/srt/lora/mem_pool.py +1 -1
  137. sglang/srt/lora/triton_ops/__init__.py +4 -0
  138. sglang/srt/lora/triton_ops/chunked_sgmv_expand.py +214 -0
  139. sglang/srt/lora/triton_ops/chunked_sgmv_shrink.py +174 -0
  140. sglang/srt/lora/utils.py +7 -5
  141. sglang/srt/managers/cache_controller.py +42 -142
  142. sglang/srt/managers/data_parallel_controller.py +11 -46
  143. sglang/srt/managers/detokenizer_manager.py +11 -11
  144. sglang/srt/managers/io_struct.py +162 -118
  145. sglang/srt/managers/mm_utils.py +43 -6
  146. sglang/srt/managers/multi_tokenizer_mixin.py +17 -17
  147. sglang/srt/managers/multimodal_processor.py +1 -2
  148. sglang/srt/managers/overlap_utils.py +53 -0
  149. sglang/srt/managers/schedule_batch.py +167 -86
  150. sglang/srt/managers/schedule_policy.py +143 -16
  151. sglang/srt/managers/scheduler.py +359 -214
  152. sglang/srt/managers/scheduler_input_blocker.py +1 -1
  153. sglang/srt/managers/scheduler_metrics_mixin.py +98 -126
  154. sglang/srt/managers/scheduler_output_processor_mixin.py +21 -12
  155. sglang/srt/managers/scheduler_profiler_mixin.py +5 -5
  156. sglang/srt/managers/scheduler_update_weights_mixin.py +7 -0
  157. sglang/srt/managers/tokenizer_communicator_mixin.py +111 -5
  158. sglang/srt/managers/tokenizer_manager.py +84 -136
  159. sglang/srt/managers/tp_worker.py +39 -29
  160. sglang/srt/managers/tp_worker_overlap_thread.py +33 -41
  161. sglang/srt/managers/utils.py +1 -45
  162. sglang/srt/mem_cache/allocator.py +14 -20
  163. sglang/srt/mem_cache/allocator_ascend.py +41 -27
  164. sglang/srt/mem_cache/base_prefix_cache.py +1 -1
  165. sglang/srt/mem_cache/chunk_cache.py +8 -1
  166. sglang/srt/mem_cache/evict_policy.py +23 -0
  167. sglang/srt/mem_cache/hicache_storage.py +40 -1
  168. sglang/srt/mem_cache/hiradix_cache.py +119 -32
  169. sglang/srt/mem_cache/memory_pool.py +188 -10
  170. sglang/srt/mem_cache/memory_pool_host.py +134 -182
  171. sglang/srt/mem_cache/radix_cache.py +222 -71
  172. sglang/srt/mem_cache/radix_cache_cpp.py +11 -8
  173. sglang/srt/mem_cache/storage/__init__.py +10 -0
  174. sglang/srt/mem_cache/storage/aibrix_kvcache/aibrix_kvcache_storage.py +151 -0
  175. sglang/srt/mem_cache/storage/aibrix_kvcache/unit_test.py +109 -0
  176. sglang/srt/mem_cache/storage/backend_factory.py +223 -0
  177. sglang/srt/mem_cache/storage/eic/eic_storage.py +778 -0
  178. sglang/srt/mem_cache/storage/eic/test_unit.py +115 -0
  179. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +173 -58
  180. sglang/srt/mem_cache/storage/lmcache/lmc_radix_cache.py +10 -6
  181. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +117 -10
  182. sglang/srt/mem_cache/swa_radix_cache.py +25 -34
  183. sglang/srt/metrics/collector.py +82 -120
  184. sglang/srt/metrics/func_timer.py +2 -7
  185. sglang/srt/metrics/utils.py +8 -1
  186. sglang/srt/model_executor/cpu_graph_runner.py +2 -2
  187. sglang/srt/model_executor/cuda_graph_runner.py +39 -32
  188. sglang/srt/model_executor/forward_batch_info.py +23 -38
  189. sglang/srt/model_executor/model_runner.py +131 -183
  190. sglang/srt/model_executor/npu_graph_runner.py +12 -5
  191. sglang/srt/model_loader/loader.py +14 -10
  192. sglang/srt/model_loader/weight_utils.py +156 -2
  193. sglang/srt/models/bailing_moe.py +27 -4
  194. sglang/srt/models/deepseek_nextn.py +6 -1
  195. sglang/srt/models/deepseek_v2.py +536 -153
  196. sglang/srt/models/dots_ocr.py +173 -0
  197. sglang/srt/models/falcon_h1.py +576 -0
  198. sglang/srt/models/gemma3_causal.py +0 -2
  199. sglang/srt/models/gemma3_mm.py +1 -1
  200. sglang/srt/models/gemma3n_mm.py +1 -1
  201. sglang/srt/models/glm4_moe.py +3 -3
  202. sglang/srt/models/glm4_moe_nextn.py +2 -2
  203. sglang/srt/models/glm4v.py +1 -1
  204. sglang/srt/models/glm4v_moe.py +1 -1
  205. sglang/srt/models/gpt_oss.py +7 -30
  206. sglang/srt/models/kimi_vl_moonvit.py +2 -2
  207. sglang/srt/models/llama.py +4 -0
  208. sglang/srt/models/longcat_flash.py +1 -1
  209. sglang/srt/models/longcat_flash_nextn.py +1 -1
  210. sglang/srt/models/mllama4.py +15 -4
  211. sglang/srt/models/qwen2.py +0 -7
  212. sglang/srt/models/qwen2_5_vl.py +2 -2
  213. sglang/srt/models/qwen2_audio.py +1 -1
  214. sglang/srt/models/qwen2_moe.py +64 -1
  215. sglang/srt/models/qwen2_vl.py +1 -1
  216. sglang/srt/models/qwen3.py +18 -3
  217. sglang/srt/models/qwen3_moe.py +31 -3
  218. sglang/srt/models/qwen3_next.py +36 -9
  219. sglang/srt/models/qwen3_vl.py +787 -0
  220. sglang/srt/models/qwen3_vl_moe.py +471 -0
  221. sglang/srt/models/registry.py +15 -3
  222. sglang/srt/models/sarashina2_vision.py +269 -0
  223. sglang/srt/models/solar.py +505 -0
  224. sglang/srt/models/starcoder2.py +357 -0
  225. sglang/srt/models/torch_native_llama.py +9 -2
  226. sglang/srt/models/utils.py +51 -0
  227. sglang/srt/multimodal/processors/base_processor.py +15 -7
  228. sglang/srt/multimodal/processors/dots_vlm.py +2 -3
  229. sglang/srt/multimodal/processors/internvl.py +20 -8
  230. sglang/srt/multimodal/processors/qwen_vl.py +8 -1
  231. sglang/srt/multimodal/processors/sarashina2_vision.py +81 -0
  232. sglang/srt/parser/jinja_template_utils.py +6 -0
  233. sglang/srt/sampling/sampling_batch_info.py +20 -2
  234. sglang/srt/sampling/sampling_params.py +7 -0
  235. sglang/srt/server_args.py +753 -295
  236. sglang/srt/server_args_config_parser.py +146 -0
  237. sglang/srt/single_batch_overlap.py +151 -0
  238. sglang/srt/speculative/cpp_ngram/ngram.cpp +374 -0
  239. sglang/srt/speculative/cpp_ngram/ngram.h +110 -0
  240. sglang/srt/speculative/cpp_ngram/ngram_cache.py +138 -0
  241. sglang/srt/speculative/cpp_ngram/ngram_cache_binding.cpp +43 -0
  242. sglang/srt/speculative/cpp_ngram/param.h +125 -0
  243. sglang/srt/speculative/cpp_ngram/queue.h +71 -0
  244. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +2 -1
  245. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +3 -1
  246. sglang/srt/speculative/{eagle_utils.py → eagle_info.py} +207 -755
  247. sglang/srt/speculative/eagle_worker.py +57 -25
  248. sglang/srt/speculative/ngram_utils.py +428 -0
  249. sglang/srt/speculative/ngram_worker.py +245 -0
  250. sglang/srt/speculative/spec_info.py +47 -0
  251. sglang/srt/speculative/spec_utils.py +606 -0
  252. sglang/srt/torch_memory_saver_adapter.py +5 -7
  253. sglang/srt/tracing/trace.py +32 -6
  254. sglang/srt/two_batch_overlap.py +8 -5
  255. sglang/srt/utils/__init__.py +2 -0
  256. sglang/srt/{utils.py → utils/common.py} +399 -74
  257. sglang/srt/{hf_transformers_utils.py → utils/hf_transformers_utils.py} +49 -5
  258. sglang/srt/{patch_torch.py → utils/patch_torch.py} +8 -0
  259. sglang/srt/utils/rpd_utils.py +452 -0
  260. sglang/srt/utils/slow_rank_detector.py +71 -0
  261. sglang/srt/warmup.py +8 -4
  262. sglang/srt/weight_sync/utils.py +1 -1
  263. sglang/test/get_logits_ut.py +57 -0
  264. sglang/test/run_eval.py +79 -11
  265. sglang/test/runners.py +1 -1
  266. sglang/test/simple_eval_common.py +5 -2
  267. sglang/test/simple_eval_mmmu_vlm.py +441 -0
  268. sglang/test/test_block_fp8.py +2 -2
  269. sglang/test/test_deterministic.py +297 -0
  270. sglang/test/test_disaggregation_utils.py +12 -1
  271. sglang/test/test_programs.py +1 -1
  272. sglang/test/test_utils.py +355 -4
  273. sglang/utils.py +10 -1
  274. sglang/version.py +1 -1
  275. {sglang-0.5.3rc0.dist-info → sglang-0.5.3rc2.dist-info}/METADATA +34 -25
  276. {sglang-0.5.3rc0.dist-info → sglang-0.5.3rc2.dist-info}/RECORD +281 -210
  277. sglang/srt/mem_cache/lora_radix_cache.py +0 -421
  278. /sglang/srt/{remote_instance_weight_loader_utils.py → model_loader/remote_instance_weight_loader_utils.py} +0 -0
  279. /sglang/srt/{poll_based_barrier.py → utils/poll_based_barrier.py} +0 -0
  280. {sglang-0.5.3rc0.dist-info → sglang-0.5.3rc2.dist-info}/WHEEL +0 -0
  281. {sglang-0.5.3rc0.dist-info → sglang-0.5.3rc2.dist-info}/licenses/LICENSE +0 -0
  282. {sglang-0.5.3rc0.dist-info → sglang-0.5.3rc2.dist-info}/top_level.txt +0 -0
sglang/srt/server_args.py CHANGED
@@ -19,14 +19,11 @@ import json
  import logging
  import os
  import random
- import socket
- import sys
  import tempfile
  from typing import List, Literal, Optional, Union

  from sglang.srt.connector import ConnectorType
  from sglang.srt.function_call.function_call_parser import FunctionCallParser
- from sglang.srt.hf_transformers_utils import check_gguf_file, get_config
  from sglang.srt.lora.lora_registry import LoRARef
  from sglang.srt.parser.reasoning_parser import ReasoningParser
  from sglang.srt.utils import (
@@ -49,11 +46,11 @@ from sglang.srt.utils import (
  nullable_str,
  parse_connector_type,
  )
+ from sglang.srt.utils.hf_transformers_utils import check_gguf_file, get_config
  from sglang.utils import is_in_ci

  logger = logging.getLogger(__name__)

-
  # Define constants
  LOAD_FORMAT_CHOICES = [
  "auto",
@@ -93,15 +90,17 @@ ATTENTION_BACKEND_CHOICES = [
  # Common
  "triton",
  "torch_native",
+ "flex_attention",
+ "nsa",
  # NVIDIA specific
  "cutlass_mla",
  "fa3",
+ "fa4",
  "flashinfer",
  "flashmla",
  "trtllm_mla",
  "trtllm_mha",
  "dual_chunk_flash_attn",
- "hybrid_linear_attn",
  # AMD specific
  "aiter",
  "wave",
@@ -110,10 +109,18 @@ ATTENTION_BACKEND_CHOICES = [
  "ascend",
  ]

+ LORA_BACKEND_CHOICES = ["triton", "csgmv"]
+
  DISAGG_TRANSFER_BACKEND_CHOICES = ["mooncake", "nixl", "ascend", "fake"]

  GRAMMAR_BACKEND_CHOICES = ["xgrammar", "outlines", "llguidance", "none"]

+ DETERMINISTIC_ATTENTION_BACKEND_CHOICES = ["flashinfer", "fa3", "triton"]
+
+ NSA_CHOICES = ["flashmla_prefill", "flashmla_decode", "fa3", "tilelang", "aiter"]
+
+ RADIX_EVICTION_POLICY_CHOICES = ["lru", "lfu"]
+

  # Allow external code to add more choices
  def add_load_format_choices(choices):
@@ -136,6 +143,14 @@ def add_grammar_backend_choices(choices):
  GRAMMAR_BACKEND_CHOICES.extend(choices)


+ def add_deterministic_attention_backend_choices(choices):
+ DETERMINISTIC_ATTENTION_BACKEND_CHOICES.extend(choices)
+
+
+ def add_radix_eviction_policy_choices(choices):
+ RADIX_EVICTION_POLICY_CHOICES.extend(choices)
+
+
  @dataclasses.dataclass
  class ServerArgs:
  # Model and tokenizer
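The two new registration helpers follow the existing "allow external code to add more choices" hooks directly above them. A minimal sketch of how an out-of-tree integration might register extra values before argument validation runs; the "my_policy" and "my_backend" names are purely illustrative:

    from sglang.srt import server_args

    # Extend the allowed choice lists before ServerArgs is parsed/validated.
    server_args.add_radix_eviction_policy_choices(["my_policy"])
    server_args.add_deterministic_attention_backend_choices(["my_backend"])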
@@ -165,20 +180,25 @@ class ServerArgs:
  quantization: Optional[str] = None
  quantization_param_path: Optional[str] = None
  kv_cache_dtype: str = "auto"
+ enable_fp32_lm_head: bool = False

  # Memory and scheduling
  mem_fraction_static: Optional[float] = None
  max_running_requests: Optional[int] = None
- max_queued_requests: Optional[int] = sys.maxsize
+ max_queued_requests: Optional[int] = None
  max_total_tokens: Optional[int] = None
  chunked_prefill_size: Optional[int] = None
  max_prefill_tokens: int = 16384
  schedule_policy: str = "fcfs"
+ enable_priority_scheduling: bool = False
+ schedule_low_priority_values_first: bool = False
+ priority_scheduling_preemption_threshold: int = 10
  schedule_conservativeness: float = 1.0
  page_size: Optional[int] = None
  hybrid_kvcache_ratio: Optional[float] = None
  swa_full_tokens_ratio: float = 0.8
  disable_hybrid_swa_memory: bool = False
+ radix_eviction_policy: str = "lru"

  # Runtime options
  device: Optional[str] = None
@@ -205,8 +225,8 @@ class ServerArgs:
  show_time_cost: bool = False
  enable_metrics: bool = False
  enable_metrics_for_all_schedulers: bool = False
- tokenizer_metrics_custom_labels_header: str = "x-customer-labels"
- tokenizer_metrics_allowed_customer_labels: Optional[List[str]] = None
+ tokenizer_metrics_custom_labels_header: str = "x-custom-labels"
+ tokenizer_metrics_allowed_custom_labels: Optional[List[str]] = None
  bucket_time_to_first_token: Optional[List[float]] = None
  bucket_inter_token_latency: Optional[List[float]] = None
  bucket_e2e_request_latency: Optional[List[float]] = None
@@ -258,6 +278,7 @@ class ServerArgs:
  max_loaded_loras: Optional[int] = None
  max_loras_per_batch: int = 8
  lora_backend: str = "triton"
+ max_lora_chunk_size: Optional[int] = 16

  # Kernel backend
  attention_backend: Optional[str] = None
@@ -266,6 +287,8 @@ class ServerArgs:
  sampling_backend: Optional[str] = None
  grammar_backend: Optional[str] = None
  mm_attention_backend: Optional[str] = None
+ nsa_prefill: str = "flashmla_prefill"
+ nsa_decode: str = "fa3"

  # Speculative decoding
  speculative_algorithm: Optional[str] = None
@@ -278,6 +301,14 @@ class ServerArgs:
  speculative_accept_threshold_acc: float = 1.0
  speculative_token_map: Optional[str] = None
  speculative_attention_mode: str = "prefill"
+ # For ngram only
+ speculative_ngram_min_match_window_size: int = 1
+ speculative_ngram_max_match_window_size: int = 12
+ speculative_ngram_min_bfs_breadth: int = 1
+ speculative_ngram_max_bfs_breadth: int = 10
+ speculative_ngram_match_type: Literal["BFS", "PROB"] = "BFS"
+ speculative_ngram_branch_length: int = 18
+ speculative_ngram_capacity: int = 10 * 1000 * 1000

  # Expert parallelism
  ep_size: int = 1
@@ -309,6 +340,10 @@ class ServerArgs:
  deepep_config: Optional[str] = None
  moe_dense_tp_size: Optional[int] = None

+ # Mamba cache
+ max_mamba_cache_size: Optional[int] = None
+ mamba_ssm_dtype: str = "float32"
+
  # Hierarchical cache
  enable_hierarchical_cache: bool = False
  hicache_ratio: float = 2.0
@@ -352,11 +387,13 @@ class ServerArgs:
  disable_outlines_disk_cache: bool = False
  disable_custom_all_reduce: bool = False
  enable_mscclpp: bool = False
+ enable_torch_symm_mem: bool = False
  disable_overlap_schedule: bool = False
  enable_mixed_chunk: bool = False
  enable_dp_attention: bool = False
  enable_dp_lm_head: bool = False
  enable_two_batch_overlap: bool = False
+ enable_single_batch_overlap: bool = False
  tbo_token_distribution_threshold: float = 0.48
  enable_torch_compile: bool = False
  torch_compile_max_bs: int = 32
@@ -369,15 +406,18 @@ class ServerArgs:
  num_continuous_decode_steps: int = 1
  delete_ckpt_after_loading: bool = False
  enable_memory_saver: bool = False
+ enable_weights_cpu_backup: bool = False
  allow_auto_truncate: bool = False
  enable_custom_logit_processor: bool = False
  flashinfer_mla_disable_ragged: bool = False
  disable_shared_experts_fusion: bool = False
  disable_chunked_prefix_cache: bool = False
  disable_fast_image_processor: bool = False
+ keep_mm_feature_on_device: bool = False
  enable_return_hidden_states: bool = False
  scheduler_recv_interval: int = 1
  numa_node: Optional[List[int]] = None
+ enable_deterministic_inference: bool = False

  # Dynamic batch tokenizer
  enable_dynamic_batch_tokenizer: bool = False
@@ -398,16 +438,14 @@ class ServerArgs:
  disaggregation_decode_dp: Optional[int] = None
  disaggregation_prefill_pp: Optional[int] = 1
  disaggregation_ib_device: Optional[str] = None
+ disaggregation_decode_enable_offload_kvcache: bool = False
  num_reserved_decode_tokens: int = 512 # used for decode kv cache offload in PD
-
  # FIXME: hack to reduce ITL when decode bs is small
  disaggregation_decode_polling_interval: int = 1

- # For model weight update
+ # For model weight update and weight loading
  custom_weight_loader: Optional[List[str]] = None
  weight_loader_disable_mmap: bool = False
-
- # Remote instance weight loading
  remote_instance_weight_loader_seed_instance_ip: Optional[str] = None
  remote_instance_weight_loader_seed_instance_service_port: Optional[int] = None
  remote_instance_weight_loader_send_weights_group_ports: Optional[List[int]] = None
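The hunks above mostly add new ServerArgs fields; for example, the "For ngram only" block earlier introduces seven speculative_ngram_* tunables. A small, purely illustrative sketch that lists those new fields and their defaults programmatically (it only inspects the dataclass and does not construct a ServerArgs instance):

    import dataclasses

    from sglang.srt.server_args import ServerArgs

    # Collect the ngram-related knobs added in 0.5.3rc2, with their default values.
    ngram_defaults = {
        f.name: f.default
        for f in dataclasses.fields(ServerArgs)
        if f.name.startswith("speculative_ngram_")
    }
    print(ngram_defaults)  # e.g. {'speculative_ngram_min_match_window_size': 1, ...}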
@@ -416,61 +454,84 @@ class ServerArgs:
  enable_pdmux: bool = False
  sm_group_num: int = 3

- # Mamba cache
- max_mamba_cache_size: Optional[int] = None
- mamba_ssm_dtype: str = "float32"
+ def __post_init__(self):
+ """
+ Orchestrates the handling of various server arguments, ensuring proper configuration and validation.
+ """
+ # Handle deprecated arguments.
+ self._handle_deprecated_args()

- # Deprecated arguments
- enable_ep_moe: bool = False
- enable_deepep_moe: bool = False
- enable_flashinfer_cutlass_moe: bool = False
- enable_flashinfer_cutedsl_moe: bool = False
- enable_flashinfer_trtllm_moe: bool = False
- enable_triton_kernel_moe: bool = False
- enable_flashinfer_mxfp4_moe: bool = False
+ # Set missing default values.
+ self._handle_missing_default_values()

- def __post_init__(self):
- # Check deprecated arguments
- if self.enable_ep_moe:
- self.ep_size = self.tp_size
- print_deprecated_warning(
- "NOTE: --enable-ep-moe is deprecated. Please set `--ep-size` to the same value as `--tp-size` instead."
- )
- if self.enable_deepep_moe:
- self.moe_a2a_backend = "deepep"
- print_deprecated_warning(
- "NOTE: --enable-deepep-moe is deprecated. Please set `--moe-a2a-backend` to 'deepep' instead."
- )
- if self.enable_triton_kernel_moe:
- self.moe_runner_backend = "triton_kernel"
- print_deprecated_warning(
- "NOTE: --enable-triton-kernel-moe is deprecated. Please set `--moe-runner-backend` to 'triton_kernel' instead."
- )
- if self.enable_flashinfer_cutedsl_moe:
- self.moe_runner_backend = "flashinfer_cutedsl"
- print_deprecated_warning(
- "NOTE: --enable-flashinfer-cutedsl-moe is deprecated. Please set `--moe-runner-backend` to 'flashinfer_cutedsl' instead."
- )
- if self.enable_flashinfer_cutlass_moe:
- self.moe_runner_backend = "flashinfer_cutlass"
- print_deprecated_warning(
- "NOTE: --enable-flashinfer-cutlass-moe is deprecated. Please set `--moe-runner-backend` to 'flashinfer_cutlass' instead."
- )
- if self.enable_flashinfer_trtllm_moe:
- self.moe_runner_backend = "flashinfer_trtllm"
- print_deprecated_warning(
- "NOTE: --enable-flashinfer-trtllm-moe is deprecated. Please set `--moe-runner-backend` to 'flashinfer_trtllm' instead."
- )
- if self.enable_flashinfer_mxfp4_moe:
- self.moe_runner_backend = "flashinfer_mxfp4"
- print_deprecated_warning(
- "NOTE: --enable-flashinfer-mxfp4-moe is deprecated. Please set `--moe-runner-backend` to 'flashinfer_mxfp4' instead."
- )
+ # Get GPU memory capacity, which is a common dependency for several configuration steps.
+ gpu_mem = get_device_memory_capacity(self.device)
+
+ # Handle memory-related, chunked prefill, and CUDA graph batch size configurations.
+ self._handle_gpu_memory_settings(gpu_mem)
+
+ # Handle device-specific backends.
+ self._handle_hpu_backends()
+ self._handle_cpu_backends()
+
+ # Apply model-specific adjustments.
+ self._handle_model_specific_adjustments()
+
+ # Set kernel backends.
+ self._handle_sampling_backend()
+ self._handle_attention_backend_compatibility()
+ self._handle_page_size()
+ self._handle_amd_specifics()
+ self._handle_grammar_backend()
+
+ # Handle data parallelism.
+ self._handle_data_parallelism()
+
+ # Handle MoE configurations.
+ self._handle_moe_kernel_config()
+ self._handle_deepep_moe()
+ self._handle_eplb_and_dispatch()
+ self._handle_expert_distribution_metrics()
+
+ # Handle pipeline parallelism.
+ self._handle_pipeline_parallelism()
+
+ # Handle Hicache settings.
+ self._handle_hicache()
+
+ # Handle speculative decoding logic.
+ self._handle_speculative_decoding()
+
+ # Handle model loading format.
+ self._handle_load_format()

- # Set missing default values
+ # Handle PD disaggregation.
+ self._handle_disaggregation()
+
+ # Validate tokenizer settings.
+ self._handle_tokenizer_batching()
+
+ # Propagate environment variables.
+ self._handle_environment_variables()
+
+ # Validate cache settings.
+ self._handle_cache_compatibility()
+
+ # Validate metrics labels.
+ self._handle_metrics_labels()
+
+ # Handle deterministic inference.
+ self._handle_deterministic_inference()
+
+ # Handle any other necessary validations.
+ self._handle_other_validations()
+
+ def _handle_deprecated_args(self):
+ pass
+
+ def _handle_missing_default_values(self):
  if self.tokenizer_path is None:
  self.tokenizer_path = self.model_path
-
  if self.served_model_name is None:
  self.served_model_name = self.model_path
  if self.device is None:
@@ -478,56 +539,140 @@ class ServerArgs:
  if self.random_seed is None:
  self.random_seed = random.randint(0, 1 << 30)

- gpu_mem = get_device_memory_capacity(self.device)
-
- # Set mem fraction static
- if self.mem_fraction_static is None:
- if gpu_mem is not None:
- # GPU memory capacity = model weights + KV cache pool + activations + cuda graph buffers
- # mem_fraction_static = (model weights + KV cache pool) / GPU memory capacity.
-
- # We want mem_fraction_static to be as large as possible but still has enough room
- # for activations and cuda graph buffers. We use the following heuristic to
- # compute the needed size for activations and cuda graph buffers:
- # - The size of the activation depends on the chunked_prefill_size and model size.
- # - The size of cuda graph buffers depends on the cuda graph capture range and model size.
- # For GPUs with more memory, we use a larger chunked_prefill_size and
- # capture more cuda graphs, so they need to reserve more memory.
- parallel_size = self.tp_size * self.pp_size
-
- if gpu_mem < 20 * 1024:
- # T4, 4080. (chunked_prefill_size 2k, cuda_graph_max_bs 8)
- reserved_mem = (2.8 + parallel_size / 10) * 1024
- elif gpu_mem < 35 * 1024:
- # A10, L40, 4090, 5090. (chunked_prefill_size 2k, cuda_graph_max_bs 8)
- reserved_mem = (2.8 + parallel_size / 10) * 1024
- elif gpu_mem < 90 * 1024:
- # H100, A100. (chunked_prefill_size 8k, cuda_graph_max_bs 160)
- reserved_mem = (9.5 + parallel_size / 2) * 1024
- elif gpu_mem < 100 * 1024:
- # H20. (chunked_prefill_size 8k, cuda_graph_max_bs 256)
- reserved_mem = (12 + parallel_size / 2) * 1024
- elif gpu_mem < 160 * 1024:
- # H200. (chunked_prefill_size 8k, cuda_graph_max_bs 256)
- reserved_mem = (12 + parallel_size / 2) * 1024
- else:
- # B200, MI300. (chunked_prefill_size 16k, cuda_graph_max_bs 512)
- reserved_mem = 32 * 1024
-
- # draft model and larger cuda graph buffers
- if self.speculative_algorithm is not None:
- if self.speculative_algorithm == "STANDALONE":
- # Standalone speculative decoding needs more memory than other speculative
- # decoding algorithms since the draft model is typically larger.
- reserved_mem += 6 * 1024
+ def _handle_gpu_memory_settings(self, gpu_mem):
+ """
+ Configure GPU memory-dependent settings including
+ chunked_prefill_size, cuda_graph_max_bs, and mem_fraction_static.
+
+ Here are our heuristics:
+ - Set chunked_prefill_size and cuda_graph_max_bs based on the GPU memory capacity.
+ This is because GPUs with more memory are generally more powerful, we need to use a larger
+ chunked_prefill_size and a larger cuda_graph_max_bs to fully utilize the GPU.
+ - Then set mem_fraction_static based on chunked_prefill_size and cuda_graph_max_bs.
+
+ GPU memory capacity = model weights + KV cache pool + activations + cuda graph buffers
+
+ The argument mem_fraction_static is defined as (model weights + KV cache pool) / GPU memory capacity,
+ or equivalently, mem_fraction_static = (GPU memory capacity - activations - cuda graph buffers) / GPU memory capacity.
+
+ In order to compute mem_fraction_static, we need to estimate the size of activations and cuda graph buffers.
+ The activation memory is proportional to the chunked_prefill_size.
+ The cuda graph memory is proportional to the cuda_graph_max_bs.
+ We use reserved_mem = chunked_prefill_size * 1.5 + cuda_graph_max_bs * 2 to estimate the size of activations and cuda graph buffers in GB.
+ and set mem_fraction_static = (GPU memory capacity - reserved_mem) / GPU memory capacity.
+
+ The coefficient 1.5 is a heuristic value, in the future, we can do better estimation by looking at the model types, hidden sizes or even do a dummy run.
+ """
+ if gpu_mem is not None:
+ if gpu_mem < 20 * 1024:
+ # T4, 4080
+ # (chunked_prefill_size 2k, cuda_graph_max_bs 8)
+ if self.chunked_prefill_size is None:
+ self.chunked_prefill_size = 2048
+ if self.cuda_graph_max_bs is None:
+ self.cuda_graph_max_bs = 8
+ elif gpu_mem < 35 * 1024:
+ # A10, 4090, 5090
+ # (chunked_prefill_size 2k, cuda_graph_max_bs 16 if tp < 4 else 80)
+ if self.chunked_prefill_size is None:
+ self.chunked_prefill_size = 2048
+ if self.cuda_graph_max_bs is None:
+ # Based on detailed statistics, when serving TP1/TP2 models on lower-end GPUs with HBM < 35GB, you can either disable cuda graph or set `cuda_graph_max_bs` to a very small value to reduce the memory overhead of creating cuda graphs, with almost no impact on performance.
+ # However, when serving models with TP4 or TP8, we need to enable cuda graph to maintain high performance. In this case, we can set `cuda_graph_max_bs` to 80 (half of the default value 160) to reduce the memory overhead of creating cuda graphs. Looking at the logs
+ # from TP4 serving of qwen2-72b, a value of 80 is sufficient and can reduce the memory overhead of creating cuda graphs on lower-end GPUs compared to the original 160, avoiding OOM issues.
+ if self.tp_size < 4:
+ self.cuda_graph_max_bs = 16
+ else:
+ self.cuda_graph_max_bs = 80
+ elif gpu_mem < 60 * 1024:
+ # A100 (40GB), L40,
+ # (chunked_prefill_size 4k, cuda_graph_max_bs 32 if tp < 4 else 160)
+ if self.chunked_prefill_size is None:
+ self.chunked_prefill_size = 4096
+ if self.cuda_graph_max_bs is None:
+ if self.tp_size < 4:
+ self.cuda_graph_max_bs = 32
  else:
- reserved_mem += 2 * 1024
- if self.enable_dp_attention:
- reserved_mem += 4 * 1024
+ self.cuda_graph_max_bs = 160
+ elif gpu_mem < 90 * 1024:
+ # H100, A100
+ # (chunked_prefill_size 8k, cuda_graph_max_bs 256 if tp < 4 else 512)
+ if self.chunked_prefill_size is None:
+ self.chunked_prefill_size = 8192
+ if self.cuda_graph_max_bs is None:
+ if self.tp_size < 4:
+ self.cuda_graph_max_bs = 256
+ else:
+ self.cuda_graph_max_bs = 512
+ elif gpu_mem < 160 * 1024:
+ # H20, H200
+ # (chunked_prefill_size 8k, cuda_graph_max_bs 256 if tp < 4 else 512)
+ if self.chunked_prefill_size is None:
+ self.chunked_prefill_size = 8192
+ if self.cuda_graph_max_bs is None:
+ if self.tp_size < 4:
+ self.cuda_graph_max_bs = 256
+ else:
+ self.cuda_graph_max_bs = 512
+ else:
+ # B200, MI300
+ # (chunked_prefill_size 16k, cuda_graph_max_bs 512)
+ if self.chunked_prefill_size is None:
+ self.chunked_prefill_size = 16384
+ if self.cuda_graph_max_bs is None:
+ self.cuda_graph_max_bs = 512
+ else:
+ # Fallback defaults when gpu_mem is None
+ if self.chunked_prefill_size is None:
+ self.chunked_prefill_size = 4096
+ if self.cuda_graph_max_bs is None:
+ self.cuda_graph_max_bs = 160
+
+ # Set cuda graph batch sizes
+ if self.cuda_graph_bs is None:
+ self.cuda_graph_bs = self._generate_cuda_graph_batch_sizes()
+ else:
+ self.cuda_graph_max_bs = max(self.cuda_graph_bs)

- self.mem_fraction_static = round((gpu_mem - reserved_mem) / gpu_mem, 3)
+ if self.mem_fraction_static is None:
+ # Constant meta data (e.g., from attention backend)
+ reserved_mem = 512
+ # For activation during large prefill
+ if self.chunked_prefill_size > 0:
+ reserved_mem += max(self.chunked_prefill_size, 2048) * 1.5
  else:
- self.mem_fraction_static = 0.88
+ reserved_mem += max(self.max_prefill_tokens, 2048) * 1.5
+ # For cuda graphs
+ reserved_mem += self.cuda_graph_max_bs * 2
+ # Some adjustments for large parallel size
+ reserved_mem += self.tp_size * self.pp_size / 8 * 1024
+
+ if self.enable_dp_attention:
+ # DP attention needs more padding for some operations
+ reserved_mem += self.cuda_graph_max_bs * self.dp_size * 3
+
+ # DP attention uses much more memory for large cuda graph max bs,
+ # likely due to some inefficiencies in torch allocator or our implementation.
+ # So we need to reserve more memory.
+ if self.cuda_graph_max_bs > 300:
+ reserved_mem += self.cuda_graph_max_bs * self.dp_size * 1.5
+
+ if gpu_mem is not None and gpu_mem > 60 * 1024:
+ reserved_mem = max(reserved_mem, 10 * 1024)
+
+ if self.speculative_algorithm is not None:
+ if self.speculative_algorithm == "STANDALONE":
+ # standalonedraft model and cuda graphs
+ reserved_mem += 6 * 1024
+ elif self.speculative_algorithm != "NGRAM":
+ # eagle draft models and cuda graphs
+ reserved_mem += 2 * 1024
+
+ self.mem_fraction_static = (
+ round((gpu_mem - reserved_mem) / gpu_mem, 3)
+ if gpu_mem is not None
+ else 0.88
+ )

  # Lazy init to avoid circular import
  # Multimodal models need more memory for the image processor
@@ -537,54 +682,192 @@ class ServerArgs:
  if model_config.is_multimodal:
  self.adjust_mem_fraction_for_vlm(model_config)

- # Set chunked prefill size, which depends on the gpu memory capacity
- if self.chunked_prefill_size is None:
- if gpu_mem is not None:
- if gpu_mem < 35 * 1024: # A10, L40, 4090
- self.chunked_prefill_size = 2048
- elif gpu_mem < 160 * 1024: # H100, H200, A100, H20
- self.chunked_prefill_size = 8192
- else: # B200, MI300
- self.chunked_prefill_size = 16384
- else:
- self.chunked_prefill_size = 4096
+ def _generate_cuda_graph_batch_sizes(self):
+ """
+ Generate the list of batch sizes for CUDA graph capture based on cuda_graph_max_bs.
+ This integrates the logic from cuda_graph_runner.py.
+ """
+ # Handle disable_cuda_graph_padding as the first condition for both spec and non-spec
+ if self.disable_cuda_graph_padding:
+ capture_bs = list(range(1, self.cuda_graph_max_bs + 1))
+ elif self.speculative_algorithm is None:
+ # Normal case: [1, 2, 4, 8, 12] + list(range(16, 257, 8)) + list(range(272, 512, 16)) + list(range(512, cuda_graph_max_bs + 1))
+ capture_bs = (
+ [1, 2, 4, 8, 12]
+ + list(range(16, 257, 8))
+ + list(range(272, 512, 16))
+ + list(range(512, self.cuda_graph_max_bs + 1, 32))
+ )
+ else:
+ # Spec decoding case: list(range(1, 9, 1)) + list(range(10, 33, 2)) + list(range(40, 64, 4)) + list(range(72, 257, 8))
+ capture_bs = (
+ list(range(1, 9, 1))
+ + list(range(10, 33, 2))
+ + list(range(40, 64, 4))
+ + list(range(72, 257, 8))
+ + list(range(272, self.cuda_graph_max_bs + 1, 16))
+ )

- # Set cuda graph max batch size
- if self.cuda_graph_max_bs is None:
- # Based on detailed statistics, when serving TP1/TP2 models on lower-end GPUs with HBM<25G, you can either disable cuda graph or set `cuda_graph_max_bs` to a very small value to reduce the memory overhead of creating cuda graphs, with almost no impact on performance. However, when serving models with TP4 or TP8, we need to enable cuda graph to maintain high performance. In this case, we can set `cuda_graph_max_bs` to 80 (half of the default value 160) to reduce the memory overhead of creating cuda graphs. Looking at the logs from TP4 serving of qwen2-72b, a value of 80 is sufficient and can reduce the memory overhead of creating cuda graphs on lower-end GPUs compared to the original 160, avoiding OOM issues.
- if gpu_mem is not None and gpu_mem < 35 * 1024:
- if self.tp_size < 4:
- self.cuda_graph_max_bs = 8
- else:
- self.cuda_graph_max_bs = 80
+ capture_bs = [bs for bs in capture_bs if bs <= self.cuda_graph_max_bs]

- # Set kernel backends for hpu device
+ return capture_bs
+
+ def _handle_hpu_backends(self):
  if self.device == "hpu":
  self.attention_backend = "torch_native"
  self.sampling_backend = "pytorch"

- # Model-specific adjustments
- if parse_connector_type(self.model_path) != ConnectorType.INSTANCE:
- self.model_specific_adjustments()
-
- # Set kernel backends
+ def _handle_cpu_backends(self):
  if self.device == "cpu":
  if self.attention_backend is None:
  self.attention_backend = "intel_amx"
  self.sampling_backend = "pytorch"

+ def _handle_model_specific_adjustments(self):
+ from sglang.srt.configs.model_config import is_deepseek_nsa
+
+ if parse_connector_type(self.model_path) == ConnectorType.INSTANCE:
+ return
+
+ hf_config = self.get_hf_config()
+ model_arch = hf_config.architectures[0]
+ if model_arch in ["GptOssForCausalLM"]:
+ if self.attention_backend is None:
+ if is_cuda() and is_sm100_supported():
+ self.attention_backend = "trtllm_mha"
+ elif is_cuda() and is_sm90_supported():
+ self.attention_backend = "fa3"
+ else:
+ self.attention_backend = "triton"
+ supported_backends = ["triton", "trtllm_mha", "fa3"]
+ logger.info(
+ f"Use {self.attention_backend} as attention backend for GptOssForCausalLM"
+ )
+ assert (
+ self.attention_backend in supported_backends
+ ), f"GptOssForCausalLM requires one of {supported_backends} attention backend, but got '{self.attention_backend}'"
+
+ if is_sm100_supported():
+ if not self.enable_dp_attention:
+ self.enable_flashinfer_allreduce_fusion = True
+ logger.info(
+ "Enable FlashInfer AllReduce Fusion on sm100 for GptOssForCausalLM"
+ )
+ quantization_config = getattr(hf_config, "quantization_config", None)
+ is_mxfp4_quant_format = (
+ quantization_config is not None
+ and quantization_config.get("quant_method") == "mxfp4"
+ )
+
+ if is_sm100_supported() and is_mxfp4_quant_format:
+ self.moe_runner_backend = "flashinfer_mxfp4"
+ logger.warning(
+ "Detected SM100 and MXFP4 quantization format for GPT-OSS model, enabling FlashInfer MXFP4 MOE kernel."
+ )
+ else:
+ if self.moe_runner_backend == "triton_kernel":
+ assert (
+ self.ep_size == 1
+ ), "Triton kernel MoE is only supported when ep_size == 1"
+ if (
+ self.moe_runner_backend == "auto"
+ and self.ep_size == 1
+ and is_triton_kernels_available()
+ ):
+ self.moe_runner_backend = "triton_kernel"
+ logger.warning(
+ "Detected GPT-OSS model, enabling triton_kernels MOE kernel."
+ )
+ self.disable_hybrid_swa_memory = True
+ if is_mxfp4_quant_format:
+ # use bf16 for mxfp4 triton kernels
+ self.dtype = "bfloat16"
+
+ elif "Llama4" in model_arch and self.device != "cpu":
+ assert self.attention_backend in {
+ "fa3",
+ "aiter",
+ "triton",
+ }, "fa3, aiter, or triton is required for Llama4 model"
+ elif model_arch in [
+ "Gemma2ForCausalLM",
+ "Gemma3ForCausalLM",
+ "Gemma3ForConditionalGeneration",
+ "Gemma3nForCausalLM",
+ "Gemma3nForConditionalGeneration",
+ ]:
+ # FIXME: https://github.com/sgl-project/sglang/pull/7367 is not compatible with gemma2 model.
+ # It failed at this test: https://github.com/sgl-project/sglang/actions/runs/16255155597/job/45890331952#step:4:736
+ logger.warning(
+ f"Disable hybrid SWA memory for {model_arch} as it is not yet supported."
+ )
+ self.disable_hybrid_swa_memory = True
+
+ if is_deepseek_nsa(hf_config):
+ if (
+ self.attention_backend is None
+ and self.prefill_attention_backend is None
+ and self.decode_attention_backend is None
+ ):
+ self.attention_backend = "nsa"
+ logger.warning("Set nsa attention backend for DeepSeek NSA.")
+
+ if not is_npu():
+ self.enable_dp_attention = True
+ self.dp_size = self.tp_size
+ logger.warning("DP attention is enabled for DeepSeek NSA.")
+
+ self.page_size = 64
+ logger.warning("Setting page size to 64 for DeepSeek NSA.")
+
+ self.mem_fraction_static = 0.8
+ logger.warning("Setting mem fraction static to 0.8 for DeepSeek NSA.")
+
+ # For Hopper, we support both bf16 and fp8 kv cache; for Blackwell, we support fp8 only currently
+ import torch
+
+ major, _ = torch.cuda.get_device_capability()
+ if major >= 10:
+ self.kv_cache_dtype = "fp8_e4m3"
+ logger.warning("Setting KV cache dtype to fp8.")
+
+ if self.kv_cache_dtype == "fp8_e4m3":
+ self.nsa_prefill = "flashmla_decode"
+ self.nsa_decode = "flashmla_decode"
+ logger.warning(
+ "Setting NSA backend to flashmla_decode for FP8 KV Cache."
+ )
+
+ # Logging env vars for NSA
+ from sglang.srt.layers.attention.nsa.utils import (
+ print_nsa_bool_env_vars,
+ )
+
+ print_nsa_bool_env_vars()
+
+ def _handle_sampling_backend(self):
  if self.sampling_backend is None:
  self.sampling_backend = (
  "flashinfer" if is_flashinfer_available() else "pytorch"
  )

+ def _handle_attention_backend_compatibility(self):
  if self.attention_backend == "torch_native":
  logger.warning(
  "Cuda graph is disabled because of using torch native attention backend"
  )
  self.disable_cuda_graph = True

- if is_npu() and self.attention_backend in ["ascend", "hybrid_linear_attn"]:
+ if self.attention_backend == "flex_attention":
+ logger.warning(
+ "Cuda graph is disabled because of using torch Flex Attention backend"
+ )
+ self.disable_cuda_graph = True
+ assert (
+ self.speculative_algorithm is None
+ ), "Speculative decoding is currently not supported with Flex Attention backend"
+
+ if is_npu() and self.attention_backend in ["ascend"]:
  logger.warning(
  "At this moment Ascend attention backend only supports a page_size of 128, change page_size to 128."
  )
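Two of the heuristics moved into helper methods in the hunks above lend themselves to a quick worked example: the _generate_cuda_graph_batch_sizes capture list and the reserved-memory estimate behind mem_fraction_static in _handle_gpu_memory_settings. The sketch below recomputes both outside the class for a hypothetical single-GPU, 80 GB H100-class setup (chunked_prefill_size 8192, cuda_graph_max_bs 256, tp = pp = dp = 1, no DP attention, no speculative decoding). Note that the arithmetic works in the same units as gpu_mem, which thresholds such as gpu_mem < 90 * 1024 treat as MiB even though the docstring says GB; all numbers here are illustrative only:

    # Worked sketch of the two new heuristics (illustrative values, not from the diff).
    gpu_mem = 80 * 1024            # ~80 GB card, in the MiB-scale units the code compares against
    chunked_prefill_size = 8192    # default picked for this memory range
    cuda_graph_max_bs = 256        # default picked for this memory range at tp < 4
    tp_size = pp_size = 1

    # 1) CUDA graph capture sizes (non-speculative branch, padding enabled).
    capture_bs = (
        [1, 2, 4, 8, 12]
        + list(range(16, 257, 8))
        + list(range(272, 512, 16))
        + list(range(512, cuda_graph_max_bs + 1, 32))
    )
    capture_bs = [bs for bs in capture_bs if bs <= cuda_graph_max_bs]
    # -> [1, 2, 4, 8, 12, 16, 24, ..., 248, 256]

    # 2) Reserved memory and mem_fraction_static.
    reserved_mem = 512                                      # constant metadata
    reserved_mem += max(chunked_prefill_size, 2048) * 1.5   # activations: 12288
    reserved_mem += cuda_graph_max_bs * 2                   # cuda graph buffers: 512
    reserved_mem += tp_size * pp_size / 8 * 1024            # parallel-size adjustment: 128
    reserved_mem = max(reserved_mem, 10 * 1024)             # floor applied when gpu_mem > 60 GiB
    mem_fraction_static = round((gpu_mem - reserved_mem) / gpu_mem, 3)
    print(len(capture_bs), mem_fraction_static)             # 36 0.836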
@@ -646,29 +929,28 @@ class ServerArgs:
646
929
 
647
930
  if self.attention_backend == "dual_chunk_flash_attn":
648
931
  logger.warning(
649
- "Mixed chunk, radix cache, and cuda graphs are disabled because of using dual chunk flash attention backend"
932
+ "Mixed chunk and radix cache are disabled when using dual-chunk flash attention backend"
650
933
  )
651
934
  self.enable_mixed_chunk = False
652
- self.disable_cuda_graph = True
653
935
  self.disable_radix_cache = True
654
936
 
655
- # Set page size
937
+ def _handle_page_size(self):
656
938
  if self.page_size is None:
657
939
  self.page_size = 1
658
940
 
659
- # AMD-specific Triton attention KV splits default number
941
+ def _handle_amd_specifics(self):
660
942
  if is_hip():
661
943
  self.triton_attention_num_kv_splits = 16
662
944
 
663
- # Choose grammar backend
945
+ def _handle_grammar_backend(self):
664
946
  if self.grammar_backend is None:
665
947
  self.grammar_backend = "xgrammar"
666
948
 
949
+ def _handle_data_parallelism(self):
667
950
  if self.dp_size == 1:
668
951
  self.enable_dp_attention = False
669
952
  self.enable_dp_lm_head = False
670
953
 
671
- # Data parallelism attention
672
954
  if self.enable_dp_attention:
673
955
  self.schedule_conservativeness = self.schedule_conservativeness * 0.3
674
956
  assert self.tp_size % self.dp_size == 0
@@ -682,7 +964,7 @@ class ServerArgs:
682
964
  self.enable_dp_attention
683
965
  ), "Please enable dp attention when setting enable_dp_lm_head. "
684
966
 
685
- # MoE kernel
967
+ def _handle_moe_kernel_config(self):
686
968
  if self.moe_runner_backend == "flashinfer_cutlass":
687
969
  assert (
688
970
  self.quantization == "modelopt_fp4"
@@ -695,13 +977,13 @@ class ServerArgs:
695
977
  if self.moe_runner_backend == "flashinfer_trtllm":
696
978
  assert (
697
979
  self.quantization == "modelopt_fp4" or self.quantization == "fp8"
698
- ), "modelopt_fp4 quantization is required for Flashinfer TRTLLM MoE"
980
+ ), "modelopt_fp4 or fp8 quantization is required for Flashinfer TRTLLM MoE"
699
981
  self.disable_shared_experts_fusion = True
700
982
  logger.warning(
701
983
  "FlashInfer TRTLLM MoE is enabled. --disable-shared-experts-fusion is automatically set."
702
984
  )
703
985
 
704
- # DeepEP MoE
986
+ def _handle_deepep_moe(self):
705
987
  if self.moe_a2a_backend == "deepep":
706
988
  if self.deepep_mode == "normal":
707
989
  logger.warning("Cuda graph is disabled because deepep_mode=`normal`")
@@ -711,6 +993,7 @@ class ServerArgs:
711
993
  f"DeepEP MoE is enabled. The expert parallel size is adjusted to be the same as the tensor parallel size[{self.tp_size}]."
712
994
  )
713
995
 
996
+ def _handle_eplb_and_dispatch(self):
714
997
  if self.enable_eplb and (self.expert_distribution_recorder_mode is None):
715
998
  self.expert_distribution_recorder_mode = "stat"
716
999
  logger.warning(
@@ -725,6 +1008,7 @@ class ServerArgs:
725
1008
  if self.enable_eplb:
726
1009
  assert self.ep_size > 1
727
1010
 
1011
+ def _handle_expert_distribution_metrics(self):
728
1012
  if self.enable_expert_distribution_metrics and (
729
1013
  self.expert_distribution_recorder_mode is None
730
1014
  ):
@@ -736,18 +1020,24 @@ class ServerArgs:
736
1020
  elif self.expert_distribution_recorder_mode is not None:
737
1021
  self.expert_distribution_recorder_buffer_size = 1000
738
1022
 
739
- # Pipeline parallelism
1023
+ def _handle_pipeline_parallelism(self):
740
1024
  if self.pp_size > 1:
741
1025
  self.disable_overlap_schedule = True
742
1026
  logger.warning(
743
1027
  "Pipeline parallelism is incompatible with overlap schedule."
744
1028
  )
745
1029
 
746
- # Hicache
1030
+ def _handle_hicache(self):
747
1031
  if self.hicache_storage_backend == "mooncake":
748
- # to use mooncake storage backend, the following conditions must be met:
749
- self.hicache_io_backend = "kernel"
750
- self.hicache_mem_layout = "page_first"
1032
+ if self.hicache_mem_layout == "layer_first":
1033
+ if self.hicache_io_backend == "direct":
1034
+ self.hicache_mem_layout = "page_first_direct"
1035
+ elif self.hicache_io_backend == "kernel":
1036
+ self.hicache_mem_layout = "page_first"
1037
+ logger.warning(
1038
+ f"Mooncake storage backend does not support layer_first layout, "
1039
+ f"switching to {self.hicache_mem_layout} layout for {self.hicache_io_backend} io backend"
1040
+ )
751
1041
 
752
1042
  if self.hicache_mem_layout == "page_first_direct":
753
1043
  if self.hicache_io_backend != "direct":
@@ -756,17 +1046,16 @@ class ServerArgs:
756
1046
  "Page first direct layout only support direct io backend"
757
1047
  )
758
1048
 
759
- # Speculative Decoding
1049
+ def _handle_speculative_decoding(self):
760
1050
  if self.speculative_algorithm == "NEXTN":
761
- # NEXTN shares the same implementation of EAGLE
762
1051
  self.speculative_algorithm = "EAGLE"
763
1052
 
764
1053
  if self.speculative_algorithm in ("EAGLE", "EAGLE3", "STANDALONE"):
765
- if self.speculative_algorithm == "STANDALONE":
1054
+ if self.speculative_algorithm == "STANDALONE" and self.enable_dp_attention:
766
1055
  # TODO: support dp attention for standalone speculative decoding
767
- assert (
768
- self.enable_dp_attention is False
769
- ), "Currently standalone speculative decoding does not support dp attention."
1056
+ raise ValueError(
1057
+ "Currently standalone speculative decoding does not support dp attention."
1058
+ )
770
1059
  if self.max_running_requests is None:
771
1060
  self.max_running_requests = 48
772
1061
  self.disable_overlap_schedule = True
@@ -783,12 +1072,12 @@ class ServerArgs:
783
1072
 
784
1073
  model_arch = self.get_hf_config().architectures[0]
785
1074
  if model_arch in [
1075
+ "DeepseekV32ForCausalLM",
786
1076
  "DeepseekV3ForCausalLM",
787
1077
  "Glm4MoeForCausalLM",
788
1078
  "BailingMoeForCausalLM",
789
1079
  "BailingMoeV2ForCausalLM",
790
1080
  ]:
791
- # Auto set draft_model_path DeepSeek-V3/R1
792
1081
  if self.speculative_draft_model_path is None:
793
1082
  self.speculative_draft_model_path = self.model_path
794
1083
  else:
@@ -796,7 +1085,6 @@ class ServerArgs:
796
1085
  "DeepSeek MTP does not require setting speculative_draft_model_path."
797
1086
  )
798
1087
 
799
- # Auto choose parameters
800
1088
  if self.speculative_num_steps is None:
801
1089
  assert (
802
1090
  self.speculative_eagle_topk is None
@@ -836,11 +1124,43 @@ class ServerArgs:
836
1124
  "speculative_eagle_topk > 1 with page_size > 1 is unstable and produces incorrect results for paged attention backends. This combination is only supported for the 'flashinfer' backend."
837
1125
  )
838
1126
 
839
- # The token generated from the verify step is counted.
840
- # If sepculative_num_steps >= speculative_num_draft_tokens, the additional tokens will definitely be discarded.
841
- # assert self.speculative_num_steps < self.speculative_num_draft_tokens
1127
+ if self.speculative_algorithm == "NGRAM":
1128
+ if not self.device.startswith("cuda"):
1129
+ raise ValueError(
1130
+ "Ngram speculative decoding only supports CUDA device."
1131
+ )
1132
+ if self.max_running_requests is None:
1133
+ self.max_running_requests = 48
1134
+ self.disable_overlap_schedule = True
1135
+ self.enable_mixed_chunk = False
1136
+ self.speculative_eagle_topk = self.speculative_ngram_max_bfs_breadth
1137
+ if self.speculative_num_draft_tokens is None:
1138
+ self.speculative_num_draft_tokens = (
1139
+ self.speculative_ngram_max_match_window_size
1140
+ )
1141
+ logger.warning(
1142
+ "The overlap scheduler and mixed chunked prefill are disabled because of "
1143
+ "using ngram speculative decoding."
1144
+ )
842
1145
 
843
- # GGUF
1146
+ if (
1147
+ self.speculative_eagle_topk > 1
1148
+ and self.page_size > 1
1149
+ and self.attention_backend != "flashinfer"
1150
+ ):
1151
+ raise ValueError(
1152
+ f"speculative_eagle_topk({self.speculative_eagle_topk}) > 1 "
1153
+ f"with page_size({self.page_size}) > 1 is unstable "
1154
+ "and produces incorrect results for paged attention backends. "
1155
+ "This combination is only supported for the 'flashinfer' backend."
1156
+ )
1157
+ if self.enable_dp_attention:
1158
+ # TODO: support dp attention for ngram speculative decoding
1159
+ raise ValueError(
1160
+ "Currently ngram speculative decoding does not support dp attention."
1161
+ )
1162
+
1163
+ def _handle_load_format(self):
844
1164
  if (
845
1165
  self.load_format == "auto" or self.load_format == "gguf"
846
1166
  ) and check_gguf_file(self.model_path):
@@ -848,6 +1168,7 @@ class ServerArgs:
848
1168
 
849
1169
  if is_remote_url(self.model_path):
850
1170
  self.load_format = "remote"
1171
+
851
1172
  if self.custom_weight_loader is None:
852
1173
  self.custom_weight_loader = []
853
1174
 
@@ -859,7 +1180,7 @@ class ServerArgs:
859
1180
  ):
860
1181
  self.load_format = "auto"
861
1182
 
862
- # PD disaggregation
1183
+ def _handle_disaggregation(self):
863
1184
  if self.disaggregation_mode == "decode":
864
1185
  assert (
865
1186
  self.disaggregation_decode_tp is None
@@ -885,42 +1206,84 @@ class ServerArgs:
885
1206
 
886
1207
  self.disaggregation_prefill_pp = self.pp_size
887
1208
  self.validate_disagg_tp_size(self.tp_size, self.disaggregation_decode_tp)
888
-
889
1209
  self.disable_cuda_graph = True
890
1210
  logger.warning("Cuda graph is disabled for prefill server")
891
1211
 
892
- # Validation: prevent both tokenizer batching features from being enabled
1212
+ def _handle_tokenizer_batching(self):
893
1213
  if self.enable_tokenizer_batch_encode and self.enable_dynamic_batch_tokenizer:
894
1214
  raise ValueError(
895
1215
  "Cannot enable both --enable-tokenizer-batch-encode and --enable-dynamic-batch-tokenizer. "
896
1216
  "Please choose one tokenizer batching approach."
897
1217
  )
898
1218
 
899
- # Propagate env vars
1219
+ def _handle_environment_variables(self):
900
1220
  os.environ["SGLANG_ENABLE_TORCH_COMPILE"] = (
901
1221
  "1" if self.enable_torch_compile else "0"
902
1222
  )
903
1223
  os.environ["SGLANG_MAMBA_SSM_DTYPE"] = self.mamba_ssm_dtype
904
-
905
- # Set env var before grammar backends init
906
1224
  os.environ["SGLANG_DISABLE_OUTLINES_DISK_CACHE"] = (
907
1225
  "1" if self.disable_outlines_disk_cache else "0"
908
1226
  )
1227
+ os.environ["SGLANG_ENABLE_DETERMINISTIC_INFERENCE"] = (
1228
+ "1" if self.enable_deterministic_inference else "0"
1229
+ )
909
1230
 
1231
+ def _handle_cache_compatibility(self):
910
1232
  if self.enable_hierarchical_cache and self.disable_radix_cache:
911
1233
  raise ValueError(
912
1234
  "The arguments enable-hierarchical-cache and disable-radix-cache are mutually exclusive "
913
1235
  "and cannot be used at the same time. Please use only one of them."
914
1236
  )
915
1237
 
1238
+ if (
1239
+ self.disaggregation_decode_enable_offload_kvcache
1240
+ and self.disaggregation_mode != "decode"
1241
+ ):
1242
+ raise ValueError(
1243
+ "The argument disaggregation-decode-enable-offload-kvcache is only supported for decode side."
1244
+ )
1245
+
1246
+ def _handle_metrics_labels(self):
916
1247
  if (
917
1248
  not self.tokenizer_metrics_custom_labels_header
918
- and self.tokenizer_metrics_allowed_customer_labels
1249
+ and self.tokenizer_metrics_allowed_custom_labels
919
1250
  ):
920
1251
  raise ValueError(
921
- "Please set --tokenizer-metrics-custom-labels-header when setting --tokenizer-metrics-allowed-customer-labels."
1252
+ "Please set --tokenizer-metrics-custom-labels-header when setting --tokenizer-metrics-allowed-custom-labels."
922
1253
  )
923
1254
 
1255
+ def _handle_deterministic_inference(self):
1256
+ if self.enable_deterministic_inference:
1257
+ # Check sampling backend
1258
+ self.sampling_backend = "pytorch"
1259
+ logger.warning(
1260
+ "Sampling backend is set to pytorch for deterministic inference."
1261
+ )
1262
+
1263
+ # Check attention backend
1264
+ if self.attention_backend not in DETERMINISTIC_ATTENTION_BACKEND_CHOICES:
1265
+ raise ValueError(
1266
+ f"Currently only {DETERMINISTIC_ATTENTION_BACKEND_CHOICES} attention backends are supported for deterministic inference."
1267
+ )
1268
+
1269
+ # Currently, only FA3 supports radix cache. Support for other backends is in progress
1270
+ if self.attention_backend != "fa3":
1271
+ self.disable_radix_cache = True
1272
+ logger.warning(
1273
+ f"Currently radix cache is not compatible with {self.attention_backend} attention backend for deterministic inference. It will be supported in the future."
1274
+ )
1275
+
1276
+ # Check TP size
1277
+ if self.tp_size > 1:
1278
+ os.environ["NCCL_ALGO"] = "allreduce:tree"
1279
+ self.disable_custom_all_reduce = True
1280
+ logger.warning(
1281
+ "NCCL_ALGO is set to 'allreduce:tree' and custom all reduce is disabled for deterministic inference when TP size > 1."
1282
+ )
1283
+
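
The adjustments above pin the sampling backend, restrict the attention backend, and force a tree all-reduce so results do not depend on reduction order. The real guarantee is invariance to batch composition, which is hard to show in two lines; the snippet below is only a run-to-run repeatability smoke test against a server assumed to be launched with --enable-deterministic-inference on the default port.

    import requests

    payload = {
        "text": "The capital of France is",
        "sampling_params": {"temperature": 0, "max_new_tokens": 32},
    }
    outputs = [
        requests.post("http://localhost:30000/generate", json=payload).json()["text"]
        for _ in range(2)
    ]
    assert outputs[0] == outputs[1], "expected identical outputs across runs"
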
1284
+ def _handle_other_validations(self):
1285
+ pass
1286
+
924
1287
  @staticmethod
925
1288
  def add_cli_args(parser: argparse.ArgumentParser):
926
1289
  # Model and tokenizer
@@ -931,24 +1294,6 @@ class ServerArgs:
931
1294
  help="The path of the model weights. This can be a local folder or a Hugging Face repo ID.",
932
1295
  required=True,
933
1296
  )
934
- parser.add_argument(
935
- "--remote-instance-weight-loader-seed-instance-ip",
936
- type=str,
937
- default=ServerArgs.remote_instance_weight_loader_seed_instance_ip,
938
- help="The ip of the seed instance for loading weights from remote instance.",
939
- )
940
- parser.add_argument(
941
- "--remote-instance-weight-loader-seed-instance-service-port",
942
- type=int,
943
- default=ServerArgs.remote_instance_weight_loader_seed_instance_service_port,
944
- help="The service port of the seed instance for loading weights from remote instance.",
945
- )
946
- parser.add_argument(
947
- "--remote-instance-weight-loader-send-weights-group-ports",
948
- type=json_list_type,
949
- default=ServerArgs.remote_instance_weight_loader_send_weights_group_ports,
950
- help="The communication group ports for loading weights from remote instance.",
951
- )
952
1297
  parser.add_argument(
953
1298
  "--tokenizer-path",
954
1299
  type=str,
@@ -1117,6 +1462,11 @@ class ServerArgs:
1117
1462
  choices=["auto", "fp8_e5m2", "fp8_e4m3"],
1118
1463
  help='Data type for kv cache storage. "auto" will use model data type. "fp8_e5m2" and "fp8_e4m3" are supported for CUDA 11.8+.',
1119
1464
  )
1465
+ parser.add_argument(
1466
+ "--enable-fp32-lm-head",
1467
+ action="store_true",
1468
+ help="If set, the LM head outputs (logits) are in FP32.",
1469
+ )
1120
1470
 
1121
1471
  # Memory and scheduling
1122
1472
  parser.add_argument(
@@ -1163,6 +1513,24 @@ class ServerArgs:
1163
1513
  choices=["lpm", "random", "fcfs", "dfs-weight", "lof", "priority"],
1164
1514
  help="The scheduling policy of the requests.",
1165
1515
  )
1516
+ parser.add_argument(
1517
+ "--enable-priority-scheduling",
1518
+ action="store_true",
1519
+ default=ServerArgs.enable_priority_scheduling,
1520
+ help="Enable priority scheduling. Requests with higher priority integer values will be scheduled first by default.",
1521
+ )
1522
+ parser.add_argument(
1523
+ "--schedule-low-priority-values-first",
1524
+ action="store_true",
1525
+ default=ServerArgs.schedule_low_priority_values_first,
1526
+ help="If specified with --enable-priority-scheduling, the scheduler will schedule requests with lower priority integer values first.",
1527
+ )
1528
+ parser.add_argument(
1529
+ "--priority-scheduling-preemption-threshold",
1530
+ type=int,
1531
+ default=ServerArgs.priority_scheduling_preemption_threshold,
1532
+ help="Minimum difference in priorities for an incoming request to have to preempt running request(s).",
1533
+ )
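
Read together, the three priority flags above amount to: higher priority values win by default, --schedule-low-priority-values-first flips that ordering, and the preemption threshold is the minimum advantage an incoming request needs before it may preempt a running one. The snippet below is only a paraphrase of the help texts, not the scheduler's actual preemption code.

    # Paraphrase of the documented semantics; names are illustrative.
    def may_preempt(incoming_priority, running_priority, threshold, low_values_first):
        if low_values_first:
            advantage = running_priority - incoming_priority
        else:
            advantage = incoming_priority - running_priority
        return advantage >= threshold

    # With the default ordering and a threshold of 5:
    # may_preempt(10, 4, 5, low_values_first=False) -> True
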
1166
1534
  parser.add_argument(
1167
1535
  "--schedule-conservativeness",
1168
1536
  type=float,
@@ -1338,16 +1706,16 @@ class ServerArgs:
1338
1706
  "--tokenizer-metrics-custom-labels-header",
1339
1707
  type=str,
1340
1708
  default=ServerArgs.tokenizer_metrics_custom_labels_header,
1341
- help="Specify the HTTP header for passing customer labels for tokenizer metrics.",
1709
+ help="Specify the HTTP header for passing custom labels for tokenizer metrics.",
1342
1710
  )
1343
1711
  parser.add_argument(
1344
- "--tokenizer-metrics-allowed-customer-labels",
1712
+ "--tokenizer-metrics-allowed-custom-labels",
1345
1713
  type=str,
1346
1714
  nargs="+",
1347
- default=ServerArgs.tokenizer_metrics_allowed_customer_labels,
1348
- help="The customer labels allowed for tokenizer metrics. The labels are specified via a dict in "
1715
+ default=ServerArgs.tokenizer_metrics_allowed_custom_labels,
1716
+ help="The custom labels allowed for tokenizer metrics. The labels are specified via a dict in "
1349
1717
  "'--tokenizer-metrics-custom-labels-header' field in HTTP requests, e.g., {'label1': 'value1', 'label2': "
1350
- "'value2'} is allowed if '--tokenizer-metrics-allowed-labels label1 label2' is set.",
1718
+ "'value2'} is allowed if '--tokenizer-metrics-allowed-custom-labels label1 label2' is set.",
1351
1719
  )
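
A hypothetical request showing how the two label options fit together: the header name is whatever was passed to --tokenizer-metrics-custom-labels-header, only labels listed in --tokenizer-metrics-allowed-custom-labels are expected to be accepted, and encoding the dict as JSON is our assumption rather than a documented format.

    import json
    import requests

    # Assumes a server started with:
    #   --tokenizer-metrics-custom-labels-header x-custom-labels
    #   --tokenizer-metrics-allowed-custom-labels label1 label2
    resp = requests.post(
        "http://localhost:30000/generate",
        headers={"x-custom-labels": json.dumps({"label1": "value1", "label2": "value2"})},
        json={"text": "Hello", "sampling_params": {"max_new_tokens": 8}},
    )
    print(resp.status_code)
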
1352
1720
  parser.add_argument(
1353
1721
  "--bucket-time-to-first-token",
@@ -1379,8 +1747,8 @@ class ServerArgs:
1379
1747
  bucket_rule = (
1380
1748
  "Supports 3 rule types: 'default' uses predefined buckets; 'tse <middle> <base> <count>' "
1381
1749
  "generates two sides exponential distributed buckets (e.g., 'tse 1000 2 8' generates buckets "
1382
- "[984.0, 992.0, 996.0, 998.0, 1000.0, 1002.0, 1004.0, 1008.0, 1016.0]).); 'customer <value1> "
1383
- "<value2> ...' uses custom bucket values (e.g., 'customer 10 50 100 500')."
1750
+ "[984.0, 992.0, 996.0, 998.0, 1000.0, 1002.0, 1004.0, 1008.0, 1016.0]).); 'custom <value1> "
1751
+ "<value2> ...' uses custom bucket values (e.g., 'custom 10 50 100 500')."
1384
1752
  )
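
The 'tse' rule is easiest to see with the numbers from the help text: 'tse 1000 2 8' places the middle value at 1000 and four buckets on each side at exponentially growing offsets. The helper below reproduces the documented example; the formula is reverse-engineered from those values and is not taken from sglang's metrics code.

    # Two-sided exponential buckets: middle +/- base**1 .. base**(count // 2).
    def tse_buckets(middle, base, count):
        offsets = [base**i for i in range(1, count // 2 + 1)]
        return sorted([middle - o for o in offsets] + [middle] + [middle + o for o in offsets])

    assert tse_buckets(1000.0, 2, 8) == [
        984.0, 992.0, 996.0, 998.0, 1000.0, 1002.0, 1004.0, 1008.0, 1016.0
    ]
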
1385
1753
  parser.add_argument(
1386
1754
  "--prompt-tokens-buckets",
@@ -1601,9 +1969,17 @@ class ServerArgs:
1601
1969
  parser.add_argument(
1602
1970
  "--lora-backend",
1603
1971
  type=str,
1604
- default="triton",
1972
+ choices=LORA_BACKEND_CHOICES,
1973
+ default=ServerArgs.lora_backend,
1605
1974
  help="Choose the kernel backend for multi-LoRA serving.",
1606
1975
  )
1976
+ parser.add_argument(
1977
+ "--max-lora-chunk-size",
1978
+ type=int,
1979
+ default=ServerArgs.max_lora_chunk_size,
1980
+ choices=[16, 32, 64, 128],
1981
+ help="Maximum chunk size for the ChunkedSGMV LoRA backend. Only used when --lora-backend is 'csgmv'. Choosing a larger value might improve performance.",
1982
+ )
1607
1983
 
1608
1984
  # Kernel backend
1609
1985
  parser.add_argument(
@@ -1644,16 +2020,28 @@ class ServerArgs:
1644
2020
  parser.add_argument(
1645
2021
  "--mm-attention-backend",
1646
2022
  type=str,
1647
- choices=["sdpa", "fa3", "triton_attn"],
2023
+ choices=["sdpa", "fa3", "triton_attn", "ascend_attn"],
1648
2024
  default=ServerArgs.mm_attention_backend,
1649
2025
  help="Set multimodal attention backend.",
1650
2026
  )
2027
+ parser.add_argument(
2028
+ "--nsa-prefill",
2029
+ default=ServerArgs.nsa_prefill,
2030
+ type=str,
2031
+ choices=NSA_CHOICES,
2032
+ )
2033
+ parser.add_argument(
2034
+ "--nsa-decode",
2035
+ default=ServerArgs.nsa_decode,
2036
+ type=str,
2037
+ choices=NSA_CHOICES,
2038
+ )
1651
2039
 
1652
2040
  # Speculative decoding
1653
2041
  parser.add_argument(
1654
2042
  "--speculative-algorithm",
1655
2043
  type=str,
1656
- choices=["EAGLE", "EAGLE3", "NEXTN", "STANDALONE"],
2044
+ choices=["EAGLE", "EAGLE3", "NEXTN", "STANDALONE", "NGRAM"],
1657
2045
  help="Speculative algorithm.",
1658
2046
  )
1659
2047
  parser.add_argument(
@@ -1713,6 +2101,50 @@ class ServerArgs:
1713
2101
  help="Attention backend for speculative decoding operations (both target verify and draft extend). Can be one of 'prefill' (default) or 'decode'.",
1714
2102
  default=ServerArgs.speculative_attention_mode,
1715
2103
  )
2104
+ # Ngram speculative decoding
2105
+ parser.add_argument(
2106
+ "--speculative-ngram-min-match-window-size",
2107
+ type=int,
2108
+ default=ServerArgs.speculative_ngram_min_match_window_size,
2109
+ help="The minimum window size for pattern matching in ngram speculative decoding.",
2110
+ )
2111
+ parser.add_argument(
2112
+ "--speculative-ngram-max-match-window-size",
2113
+ type=int,
2114
+ default=ServerArgs.speculative_ngram_max_match_window_size,
2115
+ help="The maximum window size for pattern matching in ngram speculative decoding.",
2116
+ )
2117
+ parser.add_argument(
2118
+ "--speculative-ngram-min-bfs-breadth",
2119
+ type=int,
2120
+ default=ServerArgs.speculative_ngram_min_bfs_breadth,
2121
+ help="The minimum breadth for BFS (Breadth-First Search) in ngram speculative decoding.",
2122
+ )
2123
+ parser.add_argument(
2124
+ "--speculative-ngram-max-bfs-breadth",
2125
+ type=int,
2126
+ default=ServerArgs.speculative_ngram_max_bfs_breadth,
2127
+ help="The maximum breadth for BFS (Breadth-First Search) in ngram speculative decoding.",
2128
+ )
2129
+ parser.add_argument(
2130
+ "--speculative-ngram-match-type",
2131
+ type=str,
2132
+ choices=["BFS", "PROB"],
2133
+ default=ServerArgs.speculative_ngram_match_type,
2134
+ help="The match type for cache tree.",
2135
+ )
2136
+ parser.add_argument(
2137
+ "--speculative-ngram-branch-length",
2138
+ type=int,
2139
+ default=ServerArgs.speculative_ngram_branch_length,
2140
+ help="The branch length for ngram speculative decoding.",
2141
+ )
2142
+ parser.add_argument(
2143
+ "--speculative-ngram-capacity",
2144
+ type=int,
2145
+ default=ServerArgs.speculative_ngram_capacity,
2146
+ help="The cache capacity for ngram speculative decoding.",
2147
+ )
1716
2148
 
1717
2149
  # Expert parallelism
1718
2150
  parser.add_argument(
@@ -1881,6 +2313,13 @@ class ServerArgs:
1881
2313
  default=ServerArgs.hicache_write_policy,
1882
2314
  help="The write policy of hierarchical cache.",
1883
2315
  )
2316
+ parser.add_argument(
2317
+ "--radix-eviction-policy",
2318
+ type=str,
2319
+ choices=RADIX_EVICTION_POLICY_CHOICES,
2320
+ default=ServerArgs.radix_eviction_policy,
2321
+ help="The eviction policy of radix trees. 'lru' stands for Least Recently Used, 'lfu' stands for Least Frequently Used.",
2322
+ )
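
For readers who have not met the two policies named in the help text, the difference is only which entry gets evicted first. The generic comparison below is not the radix-tree implementation; it just shows the two victim-selection rules.

    # Each cached entry tracks when it was last used and how many times it was hit.
    entries = {
        "a": {"last_used": 100, "hits": 7},
        "b": {"last_used": 250, "hits": 1},
        "c": {"last_used": 180, "hits": 3},
    }
    lru_victim = min(entries, key=lambda k: entries[k]["last_used"])  # "a": least recently used
    lfu_victim = min(entries, key=lambda k: entries[k]["hits"])       # "b": least frequently used
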
1884
2323
  parser.add_argument(
1885
2324
  "--hicache-io-backend",
1886
2325
  type=str,
@@ -1898,9 +2337,12 @@ class ServerArgs:
1898
2337
  parser.add_argument(
1899
2338
  "--hicache-storage-backend",
1900
2339
  type=str,
1901
- choices=["file", "mooncake", "hf3fs", "nixl"],
2340
+ choices=["file", "mooncake", "hf3fs", "nixl", "aibrix", "dynamic", "eic"],
1902
2341
  default=ServerArgs.hicache_storage_backend,
1903
- help="The storage backend for hierarchical KV cache.",
2342
+ help="The storage backend for hierarchical KV cache. "
2343
+ "Built-in backends: file, mooncake, hf3fs, nixl, aibrix. "
2344
+ "For dynamic backend, use --hicache-storage-backend-extra-config to specify: "
2345
+ "backend_name (custom name), module_path (Python module path), class_name (backend class name).",
1904
2346
  )
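
For the 'dynamic' choice, the help text above lists the three keys the extra config must carry so the backend class can be imported at runtime. The example below is hypothetical: the module path and class name are placeholders for your own implementation, and serializing the config as JSON is an assumption rather than a documented requirement.

    import json

    extra_config = {
        "backend_name": "my_store",                    # custom name for the backend
        "module_path": "my_company.kv_store_backend",  # importable Python module
        "class_name": "MyKVStoreBackend",              # backend class inside that module
    }
    # e.g. passed on the command line via --hicache-storage-backend-extra-config
    print(json.dumps(extra_config))
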
1905
2347
  parser.add_argument(
1906
2348
  "--hicache-storage-prefetch-policy",
@@ -2064,6 +2506,11 @@ class ServerArgs:
2064
2506
  action="store_true",
2065
2507
  help="Enable using mscclpp for small messages for all-reduce kernel and fall back to NCCL.",
2066
2508
  )
2509
+ parser.add_argument(
2510
+ "--enable-torch-symm-mem",
2511
+ action="store_true",
2512
+ help="Enable using torch symm mem for all-reduce kernel and fall back to NCCL. Only supports CUDA device SM90 and above. SM90 supports world size 4, 6, 8. SM10 supports world size 6, 8.",
2513
+ )
2067
2514
  parser.add_argument(
2068
2515
  "--disable-overlap-schedule",
2069
2516
  action="store_true",
@@ -2089,6 +2536,11 @@ class ServerArgs:
2089
2536
  action="store_true",
2090
2537
  help="Enabling two micro batches to overlap.",
2091
2538
  )
2539
+ parser.add_argument(
2540
+ "--enable-single-batch-overlap",
2541
+ action="store_true",
2542
+ help="Let computation and communication overlap within one micro batch.",
2543
+ )
2092
2544
  parser.add_argument(
2093
2545
  "--tbo-token-distribution-threshold",
2094
2546
  type=float,
@@ -2158,6 +2610,11 @@ class ServerArgs:
2158
2610
  action="store_true",
2159
2611
  help="Allow saving memory using release_memory_occupation and resume_memory_occupation",
2160
2612
  )
2613
+ parser.add_argument(
2614
+ "--enable-weights-cpu-backup",
2615
+ action="store_true",
2616
+ help="Save model weights to CPU memory during release_weights_occupation and resume_weights_occupation",
2617
+ )
2161
2618
  parser.add_argument(
2162
2619
  "--allow-auto-truncate",
2163
2620
  action="store_true",
@@ -2188,6 +2645,11 @@ class ServerArgs:
2188
2645
  action="store_true",
2189
2646
  help="Adopt base image processor instead of fast image processor.",
2190
2647
  )
2648
+ parser.add_argument(
2649
+ "--keep-mm-feature-on-device",
2650
+ action="store_true",
2651
+ help="Keep multimodal feature tensors on device after processing to save D2H copy.",
2652
+ )
2191
2653
  parser.add_argument(
2192
2654
  "--enable-return-hidden-states",
2193
2655
  action="store_true",
@@ -2295,6 +2757,11 @@ class ServerArgs:
2295
2757
  "or multiple comma-separated devices (e.g., --disaggregation-ib-device mlx5_0,mlx5_1). "
2296
2758
  "Default is None, which triggers automatic device detection when mooncake backend is enabled.",
2297
2759
  )
2760
+ parser.add_argument(
2761
+ "--disaggregation-decode-enable-offload-kvcache",
2762
+ action="store_true",
2763
+ help="Enable async KV cache offloading on decode server (PD mode).",
2764
+ )
2298
2765
  parser.add_argument(
2299
2766
  "--num-reserved-decode-tokens",
2300
2767
  type=int,
@@ -2321,6 +2788,24 @@ class ServerArgs:
2321
2788
  action="store_true",
2322
2789
  help="Disable mmap while loading weight using safetensors.",
2323
2790
  )
2791
+ parser.add_argument(
2792
+ "--remote-instance-weight-loader-seed-instance-ip",
2793
+ type=str,
2794
+ default=ServerArgs.remote_instance_weight_loader_seed_instance_ip,
2795
+ help="The ip of the seed instance for loading weights from remote instance.",
2796
+ )
2797
+ parser.add_argument(
2798
+ "--remote-instance-weight-loader-seed-instance-service-port",
2799
+ type=int,
2800
+ default=ServerArgs.remote_instance_weight_loader_seed_instance_service_port,
2801
+ help="The service port of the seed instance for loading weights from remote instance.",
2802
+ )
2803
+ parser.add_argument(
2804
+ "--remote-instance-weight-loader-send-weights-group-ports",
2805
+ type=json_list_type,
2806
+ default=ServerArgs.remote_instance_weight_loader_send_weights_group_ports,
2807
+ help="The communication group ports for loading weights from remote instance.",
2808
+ )
2324
2809
 
2325
2810
  # For PD-Multiplexing
2326
2811
  parser.add_argument(
@@ -2336,41 +2821,55 @@ class ServerArgs:
2336
2821
  help="Number of sm partition groups.",
2337
2822
  )
2338
2823
 
2824
+ # For deterministic inference
2825
+ parser.add_argument(
2826
+ "--enable-deterministic-inference",
2827
+ action="store_true",
2828
+ help="Enable deterministic inference mode with batch invariant ops.",
2829
+ )
2830
+
2339
2831
  # Deprecated arguments
2340
2832
  parser.add_argument(
2341
2833
  "--enable-ep-moe",
2342
- action="store_true",
2343
- help="(Deprecated) Enabling expert parallelism for moe. The ep size is equal to the tp size.",
2834
+ action=DeprecatedAction,
2835
+ help="NOTE: --enable-ep-moe is deprecated. Please set `--ep-size` to the same value as `--tp-size` instead.",
2344
2836
  )
2345
2837
  parser.add_argument(
2346
2838
  "--enable-deepep-moe",
2347
- action="store_true",
2348
- help="(Deprecated) Enabling DeepEP MoE implementation for EP MoE.",
2839
+ action=DeprecatedAction,
2840
+ help="NOTE: --enable-deepep-moe is deprecated. Please set `--moe-a2a-backend` to 'deepep' instead.",
2349
2841
  )
2350
2842
  parser.add_argument(
2351
2843
  "--enable-flashinfer-cutlass-moe",
2352
- action="store_true",
2353
- help="(Deprecated) Enable FlashInfer CUTLASS MoE backend for modelopt_fp4 quant on Blackwell. Supports MoE-EP",
2844
+ action=DeprecatedAction,
2845
+ help="NOTE: --enable-flashinfer-cutlass-moe is deprecated. Please set `--moe-runner-backend` to 'flashinfer_cutlass' instead.",
2354
2846
  )
2355
2847
  parser.add_argument(
2356
2848
  "--enable-flashinfer-cutedsl-moe",
2357
- action="store_true",
2358
- help="(Deprecated) Enable FlashInfer CuteDSL MoE backend for modelopt_fp4 quant on Blackwell. Supports MoE-EP",
2849
+ action=DeprecatedAction,
2850
+ help="NOTE: --enable-flashinfer-cutedsl-moe is deprecated. Please set `--moe-runner-backend` to 'flashinfer_cutedsl' instead.",
2359
2851
  )
2360
2852
  parser.add_argument(
2361
2853
  "--enable-flashinfer-trtllm-moe",
2362
- action="store_true",
2363
- help="(Deprecated) Enable FlashInfer TRTLLM MoE backend on Blackwell. Supports BlockScale FP8 MoE-EP",
2854
+ action=DeprecatedAction,
2855
+ help="NOTE: --enable-flashinfer-trtllm-moe is deprecated. Please set `--moe-runner-backend` to 'flashinfer_trtllm' instead.",
2364
2856
  )
2365
2857
  parser.add_argument(
2366
2858
  "--enable-triton-kernel-moe",
2367
- action="store_true",
2368
- help="(Deprecated) Use triton moe grouped gemm kernel.",
2859
+ action=DeprecatedAction,
2860
+ help="NOTE: --enable-triton-kernel-moe is deprecated. Please set `--moe-runner-backend` to 'triton_kernel' instead.",
2369
2861
  )
2370
2862
  parser.add_argument(
2371
2863
  "--enable-flashinfer-mxfp4-moe",
2372
- action="store_true",
2373
- help="(Deprecated) Enable FlashInfer MXFP4 MoE backend for modelopt_fp4 quant on Blackwell.",
2864
+ action=DeprecatedAction,
2865
+ help="NOTE: --enable-flashinfer-mxfp4-moe is deprecated. Please set `--moe-runner-backend` to 'flashinfer_mxfp4' instead.",
2866
+ )
2867
+
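
The deprecated flags above now use a DeprecatedAction instead of action="store_true", so passing them surfaces the migration hint rather than silently toggling a field. A minimal argparse action with that assumed behavior is sketched below; sglang's real DeprecatedAction may warn or remap instead of erroring.

    import argparse

    class DeprecatedAction(argparse.Action):
        # A zero-argument flag that fails fast with its help text.
        def __init__(self, option_strings, dest, nargs=0, **kwargs):
            super().__init__(option_strings, dest, nargs=nargs, **kwargs)

        def __call__(self, parser, namespace, values, option_string=None):
            parser.error(f"{option_string} is no longer supported. {self.help}")
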
2868
+ # Configuration file support
2869
+ parser.add_argument(
2870
+ "--config",
2871
+ type=str,
2872
+ help="Read CLI options from a config file. Must be a YAML file with configuration options.",
2374
2873
  )
2375
2874
 
2376
2875
  @classmethod
@@ -2451,6 +2950,13 @@ class ServerArgs:
2451
2950
  "--generation-tokens-buckets", self.generation_tokens_buckets
2452
2951
  )
2453
2952
 
2953
+ # Check scheduling policy
2954
+ if self.enable_priority_scheduling:
2955
+ assert self.schedule_policy in [
2956
+ "fcfs",
2957
+ "lof",
2958
+ ], f"To use priority scheduling, schedule_policy must be 'fcfs' or 'lof'. '{self.schedule_policy}' is not supported."
2959
+
2454
2960
  def check_lora_server_args(self):
2455
2961
  assert self.max_loras_per_batch > 0, "max_loras_per_batch must be positive"
2456
2962
 
@@ -2534,6 +3040,12 @@ class ServerArgs:
2534
3040
  f"max_loaded_loras={self.max_loaded_loras}, lora_paths={len(self.lora_paths)}"
2535
3041
  )
2536
3042
 
3043
+ if self.max_lora_chunk_size is not None:
3044
+ assert (
3045
+ 16 <= self.max_lora_chunk_size <= 128
3046
+ and (self.max_lora_chunk_size & (self.max_lora_chunk_size - 1)) == 0
3047
+ ), "--max-lora-chunk-size must be a power of 2 between 16 and 128."
3048
+
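
The bit trick in the assertion above is the usual power-of-two test: for a positive integer n, n & (n - 1) clears the lowest set bit, so the result is zero exactly when n has a single set bit. A quick self-check of the accepted values:

    def is_valid_chunk_size(n):
        # Same condition as the assertion: within [16, 128] and a power of two.
        return 16 <= n <= 128 and (n & (n - 1)) == 0

    assert [n for n in range(1, 129) if is_valid_chunk_size(n)] == [16, 32, 64, 128]
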
2537
3049
  def validate_disagg_tp_size(self, prefill_tp: int, decode_tp: int):
2538
3050
  larger_tp = max(decode_tp, prefill_tp)
2539
3051
  smaller_tp = min(decode_tp, prefill_tp)
@@ -2551,8 +3063,8 @@ class ServerArgs:
2551
3063
  assert rule in [
2552
3064
  "tse",
2553
3065
  "default",
2554
- "customer",
2555
- ], f"Unsupported {arg_name} rule type: '{rule}'. Must be one of: 'tse', 'default', 'customer'"
3066
+ "custom",
3067
+ ], f"Unsupported {arg_name} rule type: '{rule}'. Must be one of: 'tse', 'default', 'custom'"
2556
3068
 
2557
3069
  if rule == "tse":
2558
3070
  assert (
@@ -2575,95 +3087,20 @@ class ServerArgs:
2575
3087
  len(buckets_rule) == 1
2576
3088
  ), f"{arg_name} default rule should only have one parameter: ['default'], got {len(buckets_rule)}"
2577
3089
 
2578
- elif rule == "customer":
3090
+ elif rule == "custom":
2579
3091
  assert (
2580
3092
  len(buckets_rule) >= 2
2581
- ), f"{arg_name} customer rule requires at least one bucket value: ['customer', value1, ...]"
3093
+ ), f"{arg_name} custom rule requires at least one bucket value: ['custom', value1, ...]"
2582
3094
  try:
2583
3095
  bucket_values = [float(x) for x in buckets_rule[1:]]
2584
3096
  except ValueError:
2585
- assert False, f"{arg_name} customer rule bucket values must be numeric"
3097
+ assert False, f"{arg_name} custom rule bucket values must be numeric"
2586
3098
  assert len(set(bucket_values)) == len(
2587
3099
  bucket_values
2588
- ), f"{arg_name} customer rule bucket values should not contain duplicates"
3100
+ ), f"{arg_name} custom rule bucket values should not contain duplicates"
2589
3101
  assert all(
2590
3102
  val >= 0 for val in bucket_values
2591
- ), f"{arg_name} customer rule bucket values should be non-negative"
2592
-
2593
- def model_specific_adjustments(self):
2594
- hf_config = self.get_hf_config()
2595
- model_arch = hf_config.architectures[0]
2596
- if model_arch in ["GptOssForCausalLM"]:
2597
- if self.attention_backend is None:
2598
- if is_cuda() and is_sm100_supported():
2599
- self.attention_backend = "trtllm_mha"
2600
- elif is_cuda() and is_sm90_supported():
2601
- self.attention_backend = "fa3"
2602
- else:
2603
- self.attention_backend = "triton"
2604
- supported_backends = ["triton", "trtllm_mha", "fa3"]
2605
- logger.info(
2606
- f"Use {self.attention_backend} as attention backend for GptOssForCausalLM"
2607
- )
2608
- assert (
2609
- self.attention_backend in supported_backends
2610
- ), f"GptOssForCausalLM requires one of {supported_backends} attention backend, but got '{self.attention_backend}'"
2611
-
2612
- if is_sm100_supported():
2613
- if not self.enable_dp_attention:
2614
- self.enable_flashinfer_allreduce_fusion = True
2615
- logger.info(
2616
- "Enable FlashInfer AllReduce Fusion on sm100 for GptOssForCausalLM"
2617
- )
2618
- quantization_config = getattr(hf_config, "quantization_config", None)
2619
- is_mxfp4_quant_format = (
2620
- quantization_config is not None
2621
- and quantization_config.get("quant_method") == "mxfp4"
2622
- )
2623
-
2624
- if is_sm100_supported() and is_mxfp4_quant_format:
2625
- self.moe_runner_backend = "flashinfer_mxfp4"
2626
- logger.warning(
2627
- "Detected SM100 and MXFP4 quantization format for GPT-OSS model, enabling FlashInfer MXFP4 MOE kernel."
2628
- )
2629
- else:
2630
- if self.moe_runner_backend == "triton_kernel":
2631
- assert (
2632
- self.ep_size == 1
2633
- ), "Triton kernel MoE is only supported when ep_size == 1"
2634
- if (
2635
- self.moe_runner_backend == "auto"
2636
- and self.ep_size == 1
2637
- and is_triton_kernels_available()
2638
- ):
2639
- self.moe_runner_backend = "triton_kernel"
2640
- logger.warning(
2641
- "Detected GPT-OSS model, enabling triton_kernels MOE kernel."
2642
- )
2643
- self.disable_hybrid_swa_memory = True
2644
- if is_mxfp4_quant_format:
2645
- # use bf16 for mxfp4 triton kernels
2646
- self.dtype = "bfloat16"
2647
-
2648
- elif "Llama4" in model_arch:
2649
- assert self.attention_backend in {
2650
- "fa3",
2651
- "aiter",
2652
- "triton",
2653
- }, "fa3, aiter, or triton is required for Llama4 model"
2654
- elif model_arch in [
2655
- "Gemma2ForCausalLM",
2656
- "Gemma3ForCausalLM",
2657
- "Gemma3ForConditionalGeneration",
2658
- "Gemma3nForCausalLM",
2659
- "Gemma3nForConditionalGeneration",
2660
- ]:
2661
- # FIXME: https://github.com/sgl-project/sglang/pull/7367 is not compatible with gemma2 model.
2662
- # It failed at this test: https://github.com/sgl-project/sglang/actions/runs/16255155597/job/45890331952#step:4:736
2663
- logger.warning(
2664
- f"Disable hybrid SWA memory for {model_arch} as it is not yet supported."
2665
- )
2666
- self.disable_hybrid_swa_memory = True
3103
+ ), f"{arg_name} custom rule bucket values should be non-negative"
2667
3104
 
2668
3105
  def adjust_mem_fraction_for_vlm(self, model_config):
2669
3106
  vision_config = getattr(model_config.hf_config, "vision_config", None)
@@ -2715,6 +3152,26 @@ def prepare_server_args(argv: List[str]) -> ServerArgs:
2715
3152
  Returns:
2716
3153
  The server arguments.
2717
3154
  """
3155
+ # Import here to avoid circular imports
3156
+ from sglang.srt.server_args_config_parser import ConfigArgumentMerger
3157
+
3158
+ # Check for config file and merge arguments if present
3159
+ if "--config" in argv:
3160
+ # Extract boolean actions from the parser to handle them correctly
3161
+ parser = argparse.ArgumentParser()
3162
+ ServerArgs.add_cli_args(parser)
3163
+
3164
+ # Get boolean action destinations
3165
+ boolean_actions = []
3166
+ for action in parser._actions:
3167
+ # argparse Action objects don't store the "action" string, so detect boolean flags by class.
3168
+ if isinstance(action, (argparse._StoreTrueAction, argparse._StoreFalseAction)):
3169
+ boolean_actions.append(action.dest)
3170
+
3171
+ # Merge config file arguments with CLI arguments
3172
+ config_merger = ConfigArgumentMerger(boolean_actions=boolean_actions)
3173
+ argv = config_merger.merge_config_with_args(argv)
3174
+
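
ConfigArgumentMerger is sglang's own helper, and its exact key naming and precedence rules are not shown here. The sketch below only illustrates the general shape of such a merge under assumed conventions (underscore keys mapping one-to-one to flags, booleans emitted as bare flags, CLI arguments overriding the file); it is not the actual implementation.

    import yaml  # assumed dependency for reading the YAML config

    def merge_config_into_argv(config_path, argv, boolean_dests):
        with open(config_path) as f:
            config = yaml.safe_load(f) or {}
        extra = []
        for key, value in config.items():
            flag = "--" + key.replace("_", "-")
            if key in boolean_dests:
                if value:
                    extra.append(flag)  # store_true flags take no value
            else:
                extra.extend([flag, str(value)])
        return extra + argv  # CLI arguments come last so they can override the file
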
2718
3175
  parser = argparse.ArgumentParser()
2719
3176
  ServerArgs.add_cli_args(parser)
2720
3177
  raw_args = parser.parse_args(argv)
@@ -2856,6 +3313,7 @@ def auto_choose_speculative_params(self: ServerArgs):
2856
3313
  # The default value for llama
2857
3314
  return (5, 4, 8)
2858
3315
  elif arch in [
3316
+ "DeepseekV32ForCausalLM",
2859
3317
  "DeepseekV3ForCausalLM",
2860
3318
  "DeepseekV2ForCausalLM",
2861
3319
  "GptOssForCausalLM",