sglang 0.4.9.post2__py3-none-any.whl → 0.4.9.post4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (200)
  1. sglang/bench_one_batch.py +2 -1
  2. sglang/eval/loogle_eval.py +7 -0
  3. sglang/srt/_custom_ops.py +29 -1
  4. sglang/srt/configs/deepseekvl2.py +11 -2
  5. sglang/srt/configs/internvl.py +3 -0
  6. sglang/srt/configs/janus_pro.py +3 -0
  7. sglang/srt/configs/model_config.py +10 -8
  8. sglang/srt/configs/update_config.py +3 -1
  9. sglang/srt/conversation.py +2 -1
  10. sglang/srt/custom_op.py +5 -2
  11. sglang/srt/disaggregation/common/conn.py +34 -6
  12. sglang/srt/disaggregation/decode.py +9 -1
  13. sglang/srt/disaggregation/mini_lb.py +3 -2
  14. sglang/srt/disaggregation/mooncake/conn.py +93 -76
  15. sglang/srt/disaggregation/mooncake/transfer_engine.py +4 -2
  16. sglang/srt/disaggregation/nixl/conn.py +17 -13
  17. sglang/srt/distributed/device_communicators/custom_all_reduce.py +3 -91
  18. sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +96 -1
  19. sglang/srt/distributed/device_communicators/quick_all_reduce.py +273 -0
  20. sglang/srt/distributed/device_communicators/shm_broadcast.py +12 -5
  21. sglang/srt/distributed/parallel_state.py +103 -15
  22. sglang/srt/entrypoints/engine.py +31 -33
  23. sglang/srt/entrypoints/http_server.py +20 -32
  24. sglang/srt/entrypoints/openai/protocol.py +3 -3
  25. sglang/srt/entrypoints/openai/serving_chat.py +48 -6
  26. sglang/srt/eplb/expert_location_dispatch.py +1 -1
  27. sglang/srt/function_call/base_format_detector.py +74 -12
  28. sglang/srt/function_call/deepseekv3_detector.py +26 -11
  29. sglang/srt/function_call/ebnf_composer.py +95 -63
  30. sglang/srt/function_call/function_call_parser.py +4 -2
  31. sglang/srt/function_call/kimik2_detector.py +41 -16
  32. sglang/srt/function_call/llama32_detector.py +6 -3
  33. sglang/srt/function_call/mistral_detector.py +11 -3
  34. sglang/srt/function_call/pythonic_detector.py +16 -14
  35. sglang/srt/function_call/qwen25_detector.py +12 -3
  36. sglang/srt/function_call/qwen3_coder_detector.py +151 -0
  37. sglang/srt/hf_transformers_utils.py +0 -1
  38. sglang/srt/layers/activation.py +24 -3
  39. sglang/srt/layers/attention/base_attn_backend.py +3 -1
  40. sglang/srt/layers/attention/flashattention_backend.py +3 -3
  41. sglang/srt/layers/attention/flashinfer_backend.py +40 -1
  42. sglang/srt/layers/communicator.py +12 -12
  43. sglang/srt/layers/dp_attention.py +72 -24
  44. sglang/srt/layers/linear.py +13 -102
  45. sglang/srt/layers/logits_processor.py +34 -24
  46. sglang/srt/layers/moe/ep_moe/kernels.py +4 -2
  47. sglang/srt/layers/moe/ep_moe/layer.py +23 -402
  48. sglang/srt/layers/moe/fused_moe_native.py +7 -47
  49. sglang/srt/layers/moe/fused_moe_triton/__init__.py +4 -4
  50. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=320,device_name=NVIDIA_H20-3e.json +146 -0
  51. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=384,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  52. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=384,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  53. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=384,N=256,device_name=NVIDIA_H20-3e,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  54. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=385,N=128,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  55. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=385,N=128,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  56. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +54 -263
  57. sglang/srt/layers/moe/fused_moe_triton/layer.py +14 -396
  58. sglang/srt/layers/moe/topk.py +190 -23
  59. sglang/srt/layers/quantization/__init__.py +20 -134
  60. sglang/srt/layers/quantization/awq.py +578 -11
  61. sglang/srt/layers/quantization/awq_triton.py +339 -0
  62. sglang/srt/layers/quantization/base_config.py +85 -10
  63. sglang/srt/layers/quantization/blockwise_int8.py +17 -55
  64. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors.py +13 -11
  65. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +23 -79
  66. sglang/srt/layers/quantization/fp8.py +273 -62
  67. sglang/srt/layers/quantization/fp8_kernel.py +210 -46
  68. sglang/srt/layers/quantization/fp8_utils.py +2 -2
  69. sglang/srt/layers/quantization/gptq.py +501 -143
  70. sglang/srt/layers/quantization/marlin_utils.py +790 -0
  71. sglang/srt/layers/quantization/modelopt_quant.py +34 -112
  72. sglang/srt/layers/quantization/moe_wna16.py +45 -49
  73. sglang/srt/layers/quantization/petit.py +252 -0
  74. sglang/srt/layers/quantization/petit_utils.py +104 -0
  75. sglang/srt/layers/quantization/qoq.py +7 -6
  76. sglang/srt/layers/quantization/scalar_type.py +352 -0
  77. sglang/srt/layers/quantization/unquant.py +422 -0
  78. sglang/srt/layers/quantization/utils.py +340 -9
  79. sglang/srt/layers/quantization/w4afp8.py +8 -4
  80. sglang/srt/layers/quantization/w8a8_fp8.py +17 -51
  81. sglang/srt/layers/quantization/w8a8_int8.py +51 -115
  82. sglang/srt/layers/radix_attention.py +5 -3
  83. sglang/srt/layers/vocab_parallel_embedding.py +1 -41
  84. sglang/srt/lora/lora.py +0 -4
  85. sglang/srt/lora/lora_manager.py +162 -164
  86. sglang/srt/lora/lora_registry.py +124 -0
  87. sglang/srt/lora/mem_pool.py +83 -35
  88. sglang/srt/lora/utils.py +12 -5
  89. sglang/srt/managers/cache_controller.py +288 -0
  90. sglang/srt/managers/io_struct.py +60 -30
  91. sglang/srt/managers/mm_utils.py +7 -8
  92. sglang/srt/managers/schedule_batch.py +163 -113
  93. sglang/srt/managers/schedule_policy.py +68 -27
  94. sglang/srt/managers/scheduler.py +256 -86
  95. sglang/srt/managers/scheduler_output_processor_mixin.py +22 -4
  96. sglang/srt/managers/tokenizer_manager.py +38 -27
  97. sglang/srt/managers/tp_worker.py +16 -4
  98. sglang/srt/managers/tp_worker_overlap_thread.py +11 -0
  99. sglang/srt/mem_cache/allocator.py +74 -23
  100. sglang/srt/mem_cache/base_prefix_cache.py +14 -2
  101. sglang/srt/mem_cache/chunk_cache.py +5 -2
  102. sglang/srt/mem_cache/hicache_storage.py +168 -0
  103. sglang/srt/mem_cache/hiradix_cache.py +194 -5
  104. sglang/srt/mem_cache/memory_pool.py +16 -1
  105. sglang/srt/mem_cache/memory_pool_host.py +44 -2
  106. sglang/srt/mem_cache/radix_cache.py +26 -0
  107. sglang/srt/mem_cache/swa_radix_cache.py +1025 -0
  108. sglang/srt/metrics/collector.py +9 -0
  109. sglang/srt/model_executor/cuda_graph_runner.py +66 -31
  110. sglang/srt/model_executor/forward_batch_info.py +210 -25
  111. sglang/srt/model_executor/model_runner.py +147 -42
  112. sglang/srt/model_loader/loader.py +7 -1
  113. sglang/srt/model_loader/utils.py +4 -4
  114. sglang/srt/models/clip.py +1 -1
  115. sglang/srt/models/deepseek.py +9 -6
  116. sglang/srt/models/deepseek_janus_pro.py +1 -1
  117. sglang/srt/models/deepseek_v2.py +192 -173
  118. sglang/srt/models/deepseek_vl2.py +5 -5
  119. sglang/srt/models/gemma.py +48 -0
  120. sglang/srt/models/gemma2.py +52 -0
  121. sglang/srt/models/gemma3_causal.py +63 -0
  122. sglang/srt/models/gemma3_mm.py +1 -1
  123. sglang/srt/models/gemma3n_mm.py +2 -4
  124. sglang/srt/models/granitemoe.py +385 -0
  125. sglang/srt/models/grok.py +9 -3
  126. sglang/srt/models/hunyuan.py +63 -16
  127. sglang/srt/models/internvl.py +1 -1
  128. sglang/srt/models/kimi_vl.py +1 -1
  129. sglang/srt/models/llama.py +41 -0
  130. sglang/srt/models/llama4.py +11 -11
  131. sglang/srt/models/llava.py +2 -2
  132. sglang/srt/models/llavavid.py +1 -1
  133. sglang/srt/models/minicpm.py +0 -2
  134. sglang/srt/models/minicpmo.py +3 -7
  135. sglang/srt/models/minicpmv.py +1 -1
  136. sglang/srt/models/mistral.py +1 -1
  137. sglang/srt/models/mixtral.py +9 -2
  138. sglang/srt/models/mllama.py +3 -5
  139. sglang/srt/models/mllama4.py +13 -6
  140. sglang/srt/models/olmoe.py +8 -5
  141. sglang/srt/models/persimmon.py +330 -0
  142. sglang/srt/models/phi.py +321 -0
  143. sglang/srt/models/phi4mm.py +44 -4
  144. sglang/srt/models/phi4mm_audio.py +1260 -0
  145. sglang/srt/models/phi4mm_utils.py +1917 -0
  146. sglang/srt/models/phimoe.py +9 -3
  147. sglang/srt/models/qwen.py +37 -0
  148. sglang/srt/models/qwen2.py +41 -0
  149. sglang/srt/models/qwen2_5_vl.py +4 -4
  150. sglang/srt/models/qwen2_audio.py +1 -1
  151. sglang/srt/models/qwen2_moe.py +53 -9
  152. sglang/srt/models/qwen2_vl.py +4 -4
  153. sglang/srt/models/qwen3.py +65 -1
  154. sglang/srt/models/qwen3_moe.py +57 -24
  155. sglang/srt/models/vila.py +1 -1
  156. sglang/srt/multimodal/processors/base_processor.py +91 -97
  157. sglang/srt/multimodal/processors/clip.py +21 -19
  158. sglang/srt/multimodal/processors/deepseek_vl_v2.py +8 -26
  159. sglang/srt/multimodal/processors/gemma3.py +13 -17
  160. sglang/srt/multimodal/processors/gemma3n.py +19 -23
  161. sglang/srt/multimodal/processors/internvl.py +9 -10
  162. sglang/srt/multimodal/processors/janus_pro.py +12 -27
  163. sglang/srt/multimodal/processors/kimi_vl.py +12 -14
  164. sglang/srt/multimodal/processors/llava.py +4 -2
  165. sglang/srt/multimodal/processors/minicpm.py +35 -44
  166. sglang/srt/multimodal/processors/mlama.py +21 -18
  167. sglang/srt/multimodal/processors/mllama4.py +4 -5
  168. sglang/srt/multimodal/processors/phi4mm.py +63 -39
  169. sglang/srt/multimodal/processors/pixtral.py +14 -35
  170. sglang/srt/multimodal/processors/qwen_audio.py +65 -0
  171. sglang/srt/multimodal/processors/qwen_vl.py +16 -21
  172. sglang/srt/multimodal/processors/vila.py +14 -14
  173. sglang/srt/reasoning_parser.py +46 -4
  174. sglang/srt/sampling/sampling_batch_info.py +6 -5
  175. sglang/srt/sampling/sampling_params.py +8 -1
  176. sglang/srt/server_args.py +454 -270
  177. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +33 -28
  178. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +46 -37
  179. sglang/srt/speculative/eagle_utils.py +51 -23
  180. sglang/srt/speculative/eagle_worker.py +59 -44
  181. sglang/srt/two_batch_overlap.py +10 -5
  182. sglang/srt/utils.py +44 -69
  183. sglang/test/runners.py +14 -3
  184. sglang/test/test_activation.py +50 -1
  185. sglang/test/test_block_fp8.py +8 -3
  186. sglang/test/test_block_fp8_ep.py +1 -1
  187. sglang/test/test_custom_ops.py +12 -7
  188. sglang/test/test_cutlass_w4a8_moe.py +1 -3
  189. sglang/test/test_fp4_moe.py +1 -3
  190. sglang/test/test_marlin_moe.py +286 -0
  191. sglang/test/test_marlin_utils.py +171 -0
  192. sglang/test/test_utils.py +35 -0
  193. sglang/version.py +1 -1
  194. {sglang-0.4.9.post2.dist-info → sglang-0.4.9.post4.dist-info}/METADATA +10 -10
  195. {sglang-0.4.9.post2.dist-info → sglang-0.4.9.post4.dist-info}/RECORD +198 -175
  196. sglang/srt/layers/quantization/quant_utils.py +0 -166
  197. sglang/srt/managers/multimodal_processors/qwen_audio.py +0 -94
  198. {sglang-0.4.9.post2.dist-info → sglang-0.4.9.post4.dist-info}/WHEEL +0 -0
  199. {sglang-0.4.9.post2.dist-info → sglang-0.4.9.post4.dist-info}/licenses/LICENSE +0 -0
  200. {sglang-0.4.9.post2.dist-info → sglang-0.4.9.post4.dist-info}/top_level.txt +0 -0
sglang/srt/server_args.py CHANGED
@@ -23,8 +23,11 @@ import tempfile
 from typing import List, Literal, Optional, Union
 
 from sglang.srt.hf_transformers_utils import check_gguf_file, get_config
+from sglang.srt.lora.lora_registry import LoRARef
 from sglang.srt.reasoning_parser import ReasoningParser
 from sglang.srt.utils import (
+    LORA_TARGET_ALL_MODULES,
+    SUPPORTED_LORA_TARGET_MODULES,
     configure_ipv6,
     get_device,
     get_device_memory_capacity,
@@ -46,30 +49,28 @@ class ServerArgs:
     tokenizer_path: Optional[str] = None
     tokenizer_mode: str = "auto"
     skip_tokenizer_init: bool = False
-    skip_server_warmup: bool = False
     load_format: str = "auto"
     model_loader_extra_config: str = "{}"
     trust_remote_code: bool = False
-    dtype: str = "auto"
-    kv_cache_dtype: str = "auto"
-    quantization: Optional[str] = None
-    quantization_param_path: Optional[str] = None
     context_length: Optional[int] = None
-    device: Optional[str] = None
-    served_model_name: Optional[str] = None
-    chat_template: Optional[str] = None
-    completion_template: Optional[str] = None
     is_embedding: bool = False
    enable_multimodal: Optional[bool] = None
     revision: Optional[str] = None
-    hybrid_kvcache_ratio: Optional[float] = None
-    impl: str = "auto"
+    model_impl: str = "auto"
 
-    # Port for the HTTP server
+    # HTTP server
     host: str = "127.0.0.1"
     port: int = 30000
+    skip_server_warmup: bool = False
+    warmups: Optional[str] = None
     nccl_port: Optional[int] = None
 
+    # Quantization and data type
+    dtype: str = "auto"
+    quantization: Optional[str] = None
+    quantization_param_path: Optional[str] = None
+    kv_cache_dtype: str = "auto"
+
     # Memory and scheduling
     mem_fraction_static: Optional[float] = None
     max_running_requests: Optional[int] = None
@@ -79,9 +80,13 @@ class ServerArgs:
     schedule_policy: str = "fcfs"
     schedule_conservativeness: float = 1.0
     cpu_offload_gb: int = 0
-    page_size: int = 1
+    page_size: Optional[int] = None
+    hybrid_kvcache_ratio: Optional[float] = None
+    swa_full_tokens_ratio: float = 0.8
+    disable_hybrid_swa_memory: bool = False
 
-    # Other runtime options
+    # Runtime options
+    device: Optional[str] = None
     tp_size: int = 1
     pp_size: int = 1
     max_micro_batch_size: Optional[int] = None
@@ -104,9 +109,10 @@ class ServerArgs:
     crash_dump_folder: Optional[str] = None
     show_time_cost: bool = False
     enable_metrics: bool = False
+    enable_metrics_for_all_schedulers: bool = False
     bucket_time_to_first_token: Optional[List[float]] = None
-    bucket_e2e_request_latency: Optional[List[float]] = None
     bucket_inter_token_latency: Optional[List[float]] = None
+    bucket_e2e_request_latency: Optional[List[float]] = None
     collect_tokens_histogram: bool = False
     decode_log_interval: int = 40
     enable_request_time_stats_logging: bool = False
@@ -114,6 +120,9 @@ class ServerArgs:
 
     # API related
     api_key: Optional[str] = None
+    served_model_name: Optional[str] = None
+    chat_template: Optional[str] = None
+    completion_template: Optional[str] = None
     file_storage_path: str = "sglang_storage"
     enable_cache_report: bool = False
     reasoning_parser: Optional[str] = None
@@ -133,7 +142,10 @@ class ServerArgs:
     preferred_sampling_params: Optional[str] = None
 
     # LoRA
-    lora_paths: Optional[Union[dict[str, str], List[str]]] = None
+    enable_lora: Optional[bool] = None
+    max_lora_rank: Optional[int] = None
+    lora_target_modules: Optional[Union[set[str], List[str]]] = None
+    lora_paths: Optional[Union[dict[str, str], dict[str, LoRARef], List[str]]] = None
     max_loras_per_batch: int = 8
     lora_backend: str = "triton"
 
@@ -175,6 +187,14 @@ class ServerArgs:
     deepep_config: Optional[str] = None
     moe_dense_tp_size: Optional[int] = None
 
+    # Hierarchical cache
+    enable_hierarchical_cache: bool = False
+    hicache_ratio: float = 2.0
+    hicache_size: int = 0
+    hicache_write_policy: str = "write_through_selective"
+    hicache_io_backend: str = ""
+    hicache_storage_backend: Optional[str] = None
+
     # Double Sparsity
     enable_double_sparsity: bool = False
     ds_channel_config_path: Optional[str] = None
@@ -196,7 +216,6 @@ class ServerArgs:
     disable_custom_all_reduce: bool = False
     enable_mscclpp: bool = False
     disable_overlap_schedule: bool = False
-    disable_overlap_cg_plan: bool = False
     enable_mixed_chunk: bool = False
     enable_dp_attention: bool = False
     enable_dp_lm_head: bool = False
@@ -213,18 +232,12 @@ class ServerArgs:
     enable_memory_saver: bool = False
     allow_auto_truncate: bool = False
     enable_custom_logit_processor: bool = False
-    enable_hierarchical_cache: bool = False
-    hicache_ratio: float = 2.0
-    hicache_size: int = 0
-    hicache_write_policy: str = "write_through_selective"
-    hicache_io_backend: str = ""
     flashinfer_mla_disable_ragged: bool = False
     disable_shared_experts_fusion: bool = False
     disable_chunked_prefix_cache: bool = False
     disable_fast_image_processor: bool = False
     enable_return_hidden_states: bool = False
     enable_triton_kernel_moe: bool = False
-    warmups: Optional[str] = None
 
     # Debug tensor dumps
     debug_tensor_dump_output_folder: Optional[str] = None
@@ -232,7 +245,7 @@ class ServerArgs:
     debug_tensor_dump_inject: bool = False
     debug_tensor_dump_prefill_only: bool = False
 
-    # For PD disaggregation: can be "null" (not disaggregated), "prefill" (prefill-only), or "decode" (decode-only)
+    # PD disaggregation: can be "null" (not disaggregated), "prefill" (prefill-only), or "decode" (decode-only)
     disaggregation_mode: str = "null"
     disaggregation_transfer_backend: str = "mooncake"
     disaggregation_bootstrap_port: int = 8998
@@ -247,32 +260,26 @@ class ServerArgs:
     custom_weight_loader: Optional[List[str]] = None
     weight_loader_disable_mmap: bool = False
 
+    # For PD-Multiplexing
+    enable_pdmux: bool = False
+    sm_group_num: int = 3
+
     def __post_init__(self):
         # Expert parallelism
+        # We put it here first due to some internal ckpt conversation issues.
         if self.enable_ep_moe:
             self.ep_size = self.tp_size
             logger.warning(
                 f"EP MoE is enabled. The expert parallel size is adjusted to be the same as the tensor parallel size[{self.tp_size}]."
             )
-        if self.enable_flashinfer_moe:
-            assert (
-                self.quantization == "modelopt_fp4"
-            ), "modelopt_fp4 quantization is required for Flashinfer MOE"
-            os.environ["TRTLLM_ENABLE_PDL"] = "1"
-            self.disable_shared_experts_fusion = True
-            logger.warning(
-                f"Flashinfer MoE is enabled. Shared expert fusion is disabled."
-            )
+
         # Set missing default values
         if self.tokenizer_path is None:
             self.tokenizer_path = self.model_path
-
-        if self.device is None:
-            self.device = get_device()
-
         if self.served_model_name is None:
             self.served_model_name = self.model_path
-
+        if self.device is None:
+            self.device = get_device()
         if self.random_seed is None:
             self.random_seed = random.randint(0, 1 << 30)
 
@@ -323,12 +330,12 @@ class ServerArgs:
             self.mem_fraction_static = 0.88
 
         # Lazy init to avoid circular import
+        # Multimodal models need more memory for the image processor
        from sglang.srt.configs.model_config import ModelConfig
 
-        # Multimodal models need more memory for the image processor
         model_config = ModelConfig.from_server_args(self)
         if model_config.is_multimodal:
-            self.mem_fraction_static *= 0.90
+            self.adjust_mem_fraction_for_vlm(model_config)
 
         # Set chunked prefill size, which depends on the gpu memory capacity
         if self.chunked_prefill_size is None:
@@ -341,7 +348,6 @@ class ServerArgs:
                 self.chunked_prefill_size = 16384
             else:
                 self.chunked_prefill_size = 4096
-        assert self.chunked_prefill_size % self.page_size == 0
 
         # Set cuda graph max batch size
         if self.cuda_graph_max_bs is None:
@@ -352,23 +358,6 @@ class ServerArgs:
             else:
                 self.cuda_graph_max_bs = 80
 
-        assert self.moe_dense_tp_size in {
-            1,
-            None,
-        }, "moe_dense_tp_size only support 1 and None currently"
-
-        if self.attention_backend == "flashmla":
-            logger.warning(
-                "FlashMLA only supports a page_size of 64, change page_size to 64."
-            )
-            self.page_size = 64
-
-        if self.attention_backend == "cutlass_mla":
-            logger.warning(
-                "Cutlass MLA only supports a page_size of 128, change page_size to 128."
-            )
-            self.page_size = 128
-
         # Set kernel backends for hpu device
         if self.device == "hpu":
             self.attention_backend = "torch_native"
@@ -397,6 +386,26 @@ class ServerArgs:
             )
             self.page_size = 128
 
+        if self.attention_backend == "flashmla":
+            logger.warning(
+                "FlashMLA only supports a page_size of 64, change page_size to 64."
+            )
+            self.page_size = 64
+
+        if self.attention_backend == "cutlass_mla":
+            logger.warning(
+                "Cutlass MLA only supports a page_size of 128, change page_size to 128."
+            )
+            self.page_size = 128
+
+        # Set page size
+        if self.page_size is None:
+            self.page_size = 1
+
+        # AMD-specific Triton attention KV splits default number
+        if is_hip():
+            self.triton_attention_num_kv_splits = 16
+
         # Choose grammar backend
         if self.grammar_backend is None:
             self.grammar_backend = "xgrammar"
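
The reordered __post_init__ logic above resolves the page size in a fixed order: backend-specific overrides first (FlashMLA forces 64, Cutlass MLA forces 128), and the new page_size default of None falls back to 1 afterwards. A minimal standalone sketch of that precedence (not the sglang API, just the selection order visible in this hunk):

    from typing import Optional

    def resolve_page_size(attention_backend: Optional[str], requested: Optional[int]) -> int:
        # Mirrors the precedence in ServerArgs.__post_init__ (sketch only).
        if attention_backend == "flashmla":
            return 64   # FlashMLA only supports a page size of 64
        if attention_backend == "cutlass_mla":
            return 128  # Cutlass MLA only supports a page size of 128
        return requested if requested is not None else 1

    assert resolve_page_size("flashmla", None) == 64
    assert resolve_page_size(None, None) == 1
    assert resolve_page_size(None, 32) == 32
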
@@ -418,6 +427,13 @@ class ServerArgs:
                 self.enable_dp_attention
             ), "Please enable dp attention when setting enable_dp_lm_head. "
 
+        # MoE kernel
+        if self.enable_flashinfer_moe:
+            assert (
+                self.quantization == "modelopt_fp4"
+            ), "modelopt_fp4 quantization is required for Flashinfer MOE"
+            os.environ["TRTLLM_ENABLE_PDL"] = "1"
+
 
         # DeepEP MoE
         if self.enable_deepep_moe:
@@ -428,12 +444,6 @@ class ServerArgs:
                 f"DeepEP MoE is enabled. The expert parallel size is adjusted to be the same as the tensor parallel size[{self.tp_size}]."
             )
 
-        if self.pp_size > 1:
-            self.disable_overlap_schedule = True
-            logger.warning(
-                "Pipeline parallelism is incompatible with overlap schedule."
-            )
-
         if self.enable_eplb and (self.expert_distribution_recorder_mode is None):
             self.expert_distribution_recorder_mode = "stat"
             logger.info(
@@ -459,6 +469,13 @@ class ServerArgs:
         elif self.expert_distribution_recorder_mode is not None:
             self.expert_distribution_recorder_buffer_size = 1000
 
+        # Pipeline parallelism
+        if self.pp_size > 1:
+            self.disable_overlap_schedule = True
+            logger.warning(
+                "Pipeline parallelism is incompatible with overlap schedule."
+            )
+
         # Speculative Decoding
         if self.speculative_algorithm == "NEXTN":
             # NEXTN shares the same implementation of EAGLE
@@ -479,10 +496,9 @@ class ServerArgs:
                 "eagle speculative decoding."
             )
 
-        model_arch = get_model_arch(self)
-
-        # Auto set draft_model_path DeepSeek-V3/R1
+        model_arch = self.get_hf_config().architectures[0]
         if model_arch == "DeepseekV3ForCausalLM":
+            # Auto set draft_model_path DeepSeek-V3/R1
             if self.speculative_draft_model_path is None:
                 self.speculative_draft_model_path = self.model_path
             else:
@@ -521,12 +537,11 @@ class ServerArgs:
         ) and check_gguf_file(self.model_path):
             self.quantization = self.load_format = "gguf"
 
+        # Model loading
         if is_remote_url(self.model_path):
             self.load_format = "remote"
-
-        # AMD-specific Triton attention KV splits default number
-        if is_hip():
-            self.triton_attention_num_kv_splits = 16
+        if self.custom_weight_loader is None:
+            self.custom_weight_loader = []
 
         # PD disaggregation
         if self.disaggregation_mode == "decode":
@@ -551,6 +566,7 @@ class ServerArgs:
             self.disable_cuda_graph = True
             logger.warning("Cuda graph is disabled for prefill server")
 
+        # Propagate env vars
         os.environ["SGLANG_ENABLE_TORCH_COMPILE"] = (
             "1" if self.enable_torch_compile else "0"
         )
@@ -559,20 +575,9 @@ class ServerArgs:
             "1" if self.disable_outlines_disk_cache else "0"
         )
 
-        if self.custom_weight_loader is None:
-            self.custom_weight_loader = []
-
-    def validate_disagg_tp_size(self, prefill_tp: int, decode_tp: int):
-        larger_tp = max(decode_tp, prefill_tp)
-        smaller_tp = min(decode_tp, prefill_tp)
-        assert larger_tp % smaller_tp == 0, (
-            "Different tp size is supported only when one tp is multiple of the other. "
-            f"decode_tp={decode_tp}, prefill_tp={prefill_tp}"
-        )
-
     @staticmethod
     def add_cli_args(parser: argparse.ArgumentParser):
-        # Model and port args
+        # Model and tokenizer
         parser.add_argument(
             "--model-path",
             "--model",
@@ -586,24 +591,6 @@ class ServerArgs:
             default=ServerArgs.tokenizer_path,
             help="The path of the tokenizer.",
         )
-        parser.add_argument(
-            "--host",
-            type=str,
-            default=ServerArgs.host,
-            help="The host of the HTTP server.",
-        )
-        parser.add_argument(
-            "--port",
-            type=int,
-            default=ServerArgs.port,
-            help="The port of the HTTP server.",
-        )
-        parser.add_argument(
-            "--nccl-port",
-            type=int,
-            default=ServerArgs.nccl_port,
-            help="The port for NCCL distributed environment setup. Defaults to a random port.",
-        )
         parser.add_argument(
             "--tokenizer-mode",
             type=str,
@@ -618,11 +605,6 @@ class ServerArgs:
             action="store_true",
             help="If set, skip init tokenizer and pass input_ids in generate request.",
         )
-        parser.add_argument(
-            "--skip-server-warmup",
-            action="store_true",
-            help="If set, skip warmup.",
-        )
         parser.add_argument(
             "--load-format",
             type=str,
@@ -668,6 +650,77 @@ class ServerArgs:
             action="store_true",
             help="Whether or not to allow for custom models defined on the Hub in their own modeling files.",
         )
+        parser.add_argument(
+            "--context-length",
+            type=int,
+            default=ServerArgs.context_length,
+            help="The model's maximum context length. Defaults to None (will use the value from the model's config.json instead).",
+        )
+        parser.add_argument(
+            "--is-embedding",
+            action="store_true",
+            help="Whether to use a CausalLM as an embedding model.",
+        )
+        parser.add_argument(
+            "--enable-multimodal",
+            default=ServerArgs.enable_multimodal,
+            action="store_true",
+            help="Enable the multimodal functionality for the served model. If the model being served is not multimodal, nothing will happen",
+        )
+        parser.add_argument(
+            "--revision",
+            type=str,
+            default=None,
+            help="The specific model version to use. It can be a branch "
+            "name, a tag name, or a commit id. If unspecified, will use "
+            "the default version.",
+        )
+        parser.add_argument(
+            "--model-impl",
+            type=str,
+            default=ServerArgs.model_impl,
+            help="Which implementation of the model to use.\n\n"
+            '* "auto" will try to use the SGLang implementation if it exists '
+            "and fall back to the Transformers implementation if no SGLang "
+            "implementation is available.\n"
+            '* "sglang" will use the SGLang model implementation.\n'
+            '* "transformers" will use the Transformers model '
+            "implementation.\n",
+        )
+
+        # HTTP server
+        parser.add_argument(
+            "--host",
+            type=str,
+            default=ServerArgs.host,
+            help="The host of the HTTP server.",
+        )
+        parser.add_argument(
+            "--port",
+            type=int,
+            default=ServerArgs.port,
+            help="The port of the HTTP server.",
+        )
+        parser.add_argument(
+            "--skip-server-warmup",
+            action="store_true",
+            help="If set, skip warmup.",
+        )
+        parser.add_argument(
+            "--warmups",
+            type=str,
+            required=False,
+            help="Specify custom warmup functions (csv) to run before server starts eg. --warmups=warmup_name1,warmup_name2 "
+            "will run the functions `warmup_name1` and `warmup_name2` specified in warmup.py before the server starts listening for requests",
+        )
+        parser.add_argument(
+            "--nccl-port",
+            type=int,
+            default=ServerArgs.nccl_port,
+            help="The port for NCCL distributed environment setup. Defaults to a random port.",
+        )
+
+        # Quantization and data type
         parser.add_argument(
             "--dtype",
             type=str,
@@ -682,13 +735,6 @@ class ServerArgs:
             '* "float" is shorthand for FP32 precision.\n'
             '* "float32" for FP32 precision.',
         )
-        parser.add_argument(
-            "--kv-cache-dtype",
-            type=str,
-            default=ServerArgs.kv_cache_dtype,
-            choices=["auto", "fp8_e5m2", "fp8_e4m3"],
-            help='Data type for kv cache storage. "auto" will use model data type. "fp8_e5m2" and "fp8_e4m3" is supported for CUDA 11.8+.',
-        )
         parser.add_argument(
             "--quantization",
             type=str,
@@ -704,6 +750,7 @@ class ServerArgs:
                 "gguf",
                 "modelopt",
                 "modelopt_fp4",
+                "petit_nvfp4",
                 "w8a8_int8",
                 "w8a8_fp8",
                 "moe_wna16",
@@ -722,65 +769,11 @@ class ServerArgs:
             "default to 1.0, which may cause accuracy issues. ",
         )
         parser.add_argument(
-            "--context-length",
-            type=int,
-            default=ServerArgs.context_length,
-            help="The model's maximum context length. Defaults to None (will use the value from the model's config.json instead).",
-        )
-        parser.add_argument(
-            "--device",
-            type=str,
-            default=ServerArgs.device,
-            help="The device to use ('cuda', 'xpu', 'hpu', 'npu', 'cpu'). Defaults to auto-detection if not specified.",
-        )
-        parser.add_argument(
-            "--served-model-name",
-            type=str,
-            default=ServerArgs.served_model_name,
-            help="Override the model name returned by the v1/models endpoint in OpenAI API server.",
-        )
-        parser.add_argument(
-            "--chat-template",
-            type=str,
-            default=ServerArgs.chat_template,
-            help="The buliltin chat template name or the path of the chat template file. This is only used for OpenAI-compatible API server.",
-        )
-        parser.add_argument(
-            "--completion-template",
-            type=str,
-            default=ServerArgs.completion_template,
-            help="The buliltin completion template name or the path of the completion template file. This is only used for OpenAI-compatible API server. only for code completion currently.",
-        )
-        parser.add_argument(
-            "--is-embedding",
-            action="store_true",
-            help="Whether to use a CausalLM as an embedding model.",
-        )
-        parser.add_argument(
-            "--enable-multimodal",
-            default=ServerArgs.enable_multimodal,
-            action="store_true",
-            help="Enable the multimodal functionality for the served model. If the model being served is not multimodal, nothing will happen",
-        )
-        parser.add_argument(
-            "--revision",
-            type=str,
-            default=None,
-            help="The specific model version to use. It can be a branch "
-            "name, a tag name, or a commit id. If unspecified, will use "
-            "the default version.",
-        )
-        parser.add_argument(
-            "--impl",
+            "--kv-cache-dtype",
             type=str,
-            default=ServerArgs.impl,
-            help="Which implementation of the model to use.\n\n"
-            '* "auto" will try to use the SGLang implementation if it exists '
-            "and fall back to the Transformers implementation if no SGLang "
-            "implementation is available.\n"
-            '* "sglang" will use the SGLang model implementation.\n'
-            '* "transformers" will use the Transformers model '
-            "implementation.\n",
+            default=ServerArgs.kv_cache_dtype,
+            choices=["auto", "fp8_e5m2", "fp8_e4m3"],
+            help='Data type for kv cache storage. "auto" will use model data type. "fp8_e5m2" and "fp8_e4m3" is supported for CUDA 11.8+.',
         )
 
         # Memory and scheduling
@@ -852,8 +845,26 @@ class ServerArgs:
                 "(1.0 = pure hybrid: swa_size / full_size = local_attention_size / context_length)"
             ),
         )
+        parser.add_argument(
+            "--swa-full-tokens-ratio",
+            type=float,
+            default=ServerArgs.swa_full_tokens_ratio,
+            help="The ratio of SWA layer KV tokens / full layer KV tokens, regardless of the number of swa:full layers. It should be between 0 and 1. "
+            "E.g. 0.5 means if each swa layer has 50 tokens, then each full layer has 100 tokens.",
+        )
+        parser.add_argument(
+            "--disable-hybrid-swa-memory",
+            action="store_true",
+            help="Disable the hybrid SWA memory.",
+        )
 
-        # Other runtime options
+        # Runtime options
+        parser.add_argument(
+            "--device",
+            type=str,
+            default=ServerArgs.device,
+            help="The device to use ('cuda', 'xpu', 'hpu', 'npu', 'cpu'). Defaults to auto-detection if not specified.",
+        )
         parser.add_argument(
             "--tensor-parallel-size",
             "--tp-size",
@@ -895,7 +906,7 @@ class ServerArgs:
             "--constrained-json-whitespace-pattern",
             type=str,
             default=ServerArgs.constrained_json_whitespace_pattern,
-            help=r"Regex pattern for syntactic whitespaces allowed in JSON constrained output. For example, to allow the model generate consecutive whitespaces, set the pattern to [\n\t ]*",
+            help="(outlines backend only) Regex pattern for syntactic whitespaces allowed in JSON constrained output. For example, to allow the model generate consecutive whitespaces, set the pattern to [\n\t ]*",
         )
         parser.add_argument(
             "--watchdog-timeout",
@@ -974,6 +985,13 @@ class ServerArgs:
             action="store_true",
             help="Enable log prometheus metrics.",
         )
+        parser.add_argument(
+            "--enable-metrics-for-all-schedulers",
+            action="store_true",
+            help="Enable --enable-metrics-for-all-schedulers when you want schedulers on all TP ranks (not just TP 0) "
+            "to record request metrics separately. This is especially useful when dp_attention is enabled, as "
+            "otherwise all metrics appear to come from TP 0.",
+        )
         parser.add_argument(
             "--bucket-time-to-first-token",
             type=float,
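
The --swa-full-tokens-ratio flag added a few hunks above expresses a plain proportion between the per-layer KV budgets of SWA and full-attention layers. A tiny worked example of that reading (standalone Python with made-up token counts):

    swa_full_tokens_ratio = 0.5   # example value; the new default is 0.8
    full_layer_tokens = 100       # hypothetical per-layer KV budget for full-attention layers
    swa_layer_tokens = int(full_layer_tokens * swa_full_tokens_ratio)
    print(swa_layer_tokens)       # 50, matching the "50 tokens vs. 100 tokens" example in the help text
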
@@ -1001,12 +1019,6 @@ class ServerArgs:
             default=ServerArgs.collect_tokens_histogram,
             help="Collect prompt/generation tokens histogram.",
         )
-        parser.add_argument(
-            "--kv-events-config",
-            type=str,
-            default=None,
-            help="Config in json format for NVIDIA dynamo KV event publishing. Publishing will be enabled if this flag is used.",
-        )
         parser.add_argument(
             "--decode-log-interval",
             type=int,
@@ -1019,6 +1031,12 @@ class ServerArgs:
             default=ServerArgs.enable_request_time_stats_logging,
             help="Enable per request time stats logging",
         )
+        parser.add_argument(
+            "--kv-events-config",
+            type=str,
+            default=None,
+            help="Config in json format for NVIDIA dynamo KV event publishing. Publishing will be enabled if this flag is used.",
+        )
 
         # API related
         parser.add_argument(
@@ -1027,6 +1045,24 @@ class ServerArgs:
             default=ServerArgs.api_key,
             help="Set API key of the server. It is also used in the OpenAI API compatible server.",
         )
+        parser.add_argument(
+            "--served-model-name",
+            type=str,
+            default=ServerArgs.served_model_name,
+            help="Override the model name returned by the v1/models endpoint in OpenAI API server.",
+        )
+        parser.add_argument(
+            "--chat-template",
+            type=str,
+            default=ServerArgs.chat_template,
+            help="The buliltin chat template name or the path of the chat template file. This is only used for OpenAI-compatible API server.",
+        )
+        parser.add_argument(
+            "--completion-template",
+            type=str,
+            default=ServerArgs.completion_template,
+            help="The buliltin completion template name or the path of the completion template file. This is only used for OpenAI-compatible API server. only for code completion currently.",
+        )
         parser.add_argument(
             "--file-storage-path",
             type=str,
@@ -1055,9 +1091,10 @@ class ServerArgs:
                 "deepseekv3",
                 "pythonic",
                 "kimi_k2",
+                "qwen3_coder",
             ],
             default=ServerArgs.tool_call_parser,
-            help="Specify the parser for handling tool-call interactions. Options include: 'qwen25', 'mistral', 'llama3', 'deepseekv3', 'pythonic', and 'kimi_k2'.",
+            help="Specify the parser for handling tool-call interactions. Options include: 'qwen25', 'mistral', 'llama3', 'deepseekv3', 'pythonic', 'kimi_k2', and 'qwen3_coder'.",
         )
 
         # Data parallelism
@@ -1107,6 +1144,28 @@ class ServerArgs:
         )
 
         # LoRA
+        parser.add_argument(
+            "--enable-lora",
+            default=ServerArgs.enable_lora,
+            action="store_true",
+            help="Enable LoRA support for the model. This argument is automatically set to True if `--lora-paths` is provided for backward compatibility.",
+        )
+        parser.add_argument(
+            "--max-lora-rank",
+            default=ServerArgs.max_lora_rank,
+            type=int,
+            help="The maximum rank of LoRA adapters. If not specified, it will be automatically inferred from the adapters provided in --lora-paths.",
+        )
+        parser.add_argument(
+            "--lora-target-modules",
+            type=str,
+            choices=SUPPORTED_LORA_TARGET_MODULES + [LORA_TARGET_ALL_MODULES],
+            nargs="*",
+            default=None,
+            help="The union set of all target modules where LoRA should be applied. If not specified, "
+            "it will be automatically inferred from the adapters provided in --lora-paths. If 'all' is specified, "
+            "all supported modules will be targeted.",
+        )
         parser.add_argument(
             "--lora-paths",
             type=str,
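
The three LoRA flags above work together with --lora-paths, whose entries may be plain paths or name=path pairs; later in this diff (check_lora_server_args) those entries are normalized into LoRARef objects. A standalone sketch of that normalization, using a stand-in dataclass rather than sglang's LoRARef:

    from dataclasses import dataclass

    @dataclass
    class LoRARefSketch:  # stand-in for sglang.srt.lora.lora_registry.LoRARef
        lora_name: str
        lora_path: str

    def normalize_lora_paths(entries):
        # Turn ["adapter_a=/tmp/a", "/tmp/b"] into {name: LoRARefSketch(...)} (sketch).
        refs = {}
        for entry in entries:
            if "=" in entry:
                name, path = entry.split("=", 1)
            else:
                name = path = entry
            refs[name] = LoRARefSketch(lora_name=name, lora_path=path)
        return refs

    print(normalize_lora_paths(["adapter_a=/tmp/a", "/tmp/b"]))
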
@@ -1160,6 +1219,13 @@ class ServerArgs:
             default=ServerArgs.grammar_backend,
             help="Choose the backend for grammar-guided decoding.",
         )
+        parser.add_argument(
+            "--mm-attention-backend",
+            type=str,
+            choices=["sdpa", "fa3", "triton_attn"],
+            default=ServerArgs.mm_attention_backend,
+            help="Set multimodal attention backend.",
+        )
 
         # Speculative decoding
         parser.add_argument(
@@ -1209,13 +1275,6 @@ class ServerArgs:
             help="The path of the draft model's small vocab table.",
             default=ServerArgs.speculative_token_map,
         )
-        parser.add_argument(
-            "--mm-attention-backend",
-            type=str,
-            choices=["sdpa", "fa3", "triton_attn"],
-            default=ServerArgs.mm_attention_backend,
-            help="Set multimodal attention backend.",
-        )
 
         # Expert parallelism
         parser.add_argument(
@@ -1323,6 +1382,46 @@ class ServerArgs:
             help="TP size for MoE dense MLP layers. This flag is useful when, with large TP size, there are errors caused by weights in MLP layers having dimension smaller than the min dimension GEMM supports.",
         )
 
+        # Hierarchical cache
+        parser.add_argument(
+            "--enable-hierarchical-cache",
+            action="store_true",
+            help="Enable hierarchical cache",
+        )
+        parser.add_argument(
+            "--hicache-ratio",
+            type=float,
+            default=ServerArgs.hicache_ratio,
+            help="The ratio of the size of host KV cache memory pool to the size of device pool.",
+        )
+        parser.add_argument(
+            "--hicache-size",
+            type=int,
+            default=ServerArgs.hicache_size,
+            help="The size of host KV cache memory pool in gigabytes, which will override the hicache_ratio if set.",
+        )
+        parser.add_argument(
+            "--hicache-write-policy",
+            type=str,
+            choices=["write_back", "write_through", "write_through_selective"],
+            default=ServerArgs.hicache_write_policy,
+            help="The write policy of hierarchical cache.",
+        )
+        parser.add_argument(
+            "--hicache-io-backend",
+            type=str,
+            choices=["direct", "kernel"],
+            default=ServerArgs.hicache_io_backend,
+            help="The IO backend for KV cache transfer between CPU and GPU",
+        )
+        parser.add_argument(
+            "--hicache-storage-backend",
+            type=str,
+            choices=["file"],  # todo, mooncake
+            default=ServerArgs.hicache_storage_backend,
+            help="The storage backend for hierarchical KV cache.",
+        )
+
         # Double Sparsity
         parser.add_argument(
             "--enable-double-sparsity",
@@ -1423,11 +1522,6 @@ class ServerArgs:
             action="store_true",
             help="Disable the overlap scheduler, which overlaps the CPU scheduler with GPU model worker.",
         )
-        parser.add_argument(
-            "--disable-overlap-cg-plan",
-            action="store_true",
-            help="Disable the overlap optimization for cudagraph preparation in eagle verify.",
-        )
         parser.add_argument(
             "--enable-mixed-chunk",
             action="store_true",
@@ -1515,37 +1609,6 @@ class ServerArgs:
             action="store_true",
             help="Enable users to pass custom logit processors to the server (disabled by default for security)",
         )
-        parser.add_argument(
-            "--enable-hierarchical-cache",
-            action="store_true",
-            help="Enable hierarchical cache",
-        )
-        parser.add_argument(
-            "--hicache-ratio",
-            type=float,
-            default=ServerArgs.hicache_ratio,
-            help="The ratio of the size of host KV cache memory pool to the size of device pool.",
-        )
-        parser.add_argument(
-            "--hicache-size",
-            type=int,
-            default=ServerArgs.hicache_size,
-            help="The size of host KV cache memory pool in gigabytes, which will override the hicache_ratio if set.",
-        )
-        parser.add_argument(
-            "--hicache-write-policy",
-            type=str,
-            choices=["write_back", "write_through", "write_through_selective"],
-            default=ServerArgs.hicache_write_policy,
-            help="The write policy of hierarchical cache.",
-        )
-        parser.add_argument(
-            "--hicache-io-backend",
-            type=str,
-            choices=["direct", "kernel"],
-            default=ServerArgs.hicache_io_backend,
-            help="The IO backend for KV cache transfer between CPU and GPU",
-        )
         parser.add_argument(
             "--flashinfer-mla-disable-ragged",
             action="store_true",
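
The hierarchical-cache flags are now grouped in their own section earlier in add_cli_args; the copies removed in this hunk were their old location. Per the help text, --hicache-size (in gigabytes) overrides --hicache-ratio when set. A rough sketch of that sizing rule, based only on the flag descriptions rather than the actual allocator code:

    def host_kv_pool_bytes(device_pool_bytes, hicache_ratio=2.0, hicache_size_gb=0):
        # Assumption from the help text: a nonzero hicache_size wins over hicache_ratio.
        if hicache_size_gb > 0:
            return hicache_size_gb * (1 << 30)
        return int(device_pool_bytes * hicache_ratio)

    # 8 GiB device KV pool with the default ratio of 2.0 -> 16 GiB host pool
    print(host_kv_pool_bytes(8 * (1 << 30)))
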
@@ -1576,13 +1639,6 @@ class ServerArgs:
             action="store_true",
             help="Use triton moe grouped gemm kernel.",
         )
-        parser.add_argument(
-            "--warmups",
-            type=str,
-            required=False,
-            help="Specify custom warmup functions (csv) to run before server starts eg. --warmups=warmup_name1,warmup_name2 "
-            "will run the functions `warmup_name1` and `warmup_name2` specified in warmup.py before the server starts listening for requests",
-        )
 
         # Debug tensor dumps
         parser.add_argument(
@@ -1609,7 +1665,7 @@ class ServerArgs:
             help="Only dump the tensors for prefill requests (i.e. batch size > 1).",
         )
 
-        # Disaggregation
+        # PD disaggregation
         parser.add_argument(
             "--disaggregation-mode",
             type=str,
@@ -1668,6 +1724,8 @@ class ServerArgs:
             default=None,
             help="The URL of the PD disaggregation load balancer. If set, the prefill/decode server will register with the load balancer.",
         )
+
+        # Custom weight loader
         parser.add_argument(
             "--custom-weight-loader",
             type=str,
@@ -1675,6 +1733,19 @@ class ServerArgs:
             default=None,
             help="The custom dataloader which used to update the model. Should be set with a valid import path, such as my_package.weight_load_func",
         )
+        parser.add_argument(
+            "--enable-pdmux",
+            action="store_true",
+            help="Enable PD-Multiplexing, PD running on greenctx stream.",
+        )
+
+        # For PD-Multiplexing
+        parser.add_argument(
+            "--sm-group-num",
+            type=int,
+            default=ServerArgs.sm_group_num,
+            help="Number of sm partition groups.",
+        )
         parser.add_argument(
             "--weight-loader-disable-mmap",
             action="store_true",
@@ -1696,12 +1767,23 @@ class ServerArgs:
         else:
             return f"http://{self.host}:{self.port}"
 
+    def get_hf_config(self):
+        kwargs = {}
+        hf_config = get_config(
+            self.model_path,
+            trust_remote_code=self.trust_remote_code,
+            revision=self.revision,
+            model_override_args=json.loads(self.json_model_override_args),
+            **kwargs,
+        )
+        return hf_config
+
     def check_server_args(self):
+        # Check parallel size constraints
         assert (
             self.tp_size * self.pp_size
         ) % self.nnodes == 0, "tp_size must be divisible by number of nodes"
 
-        # FIXME pp constraints
         if self.pp_size > 1:
             assert (
                 self.disable_overlap_schedule
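
The new ServerArgs.get_hf_config() helper centralizes the Hugging Face config lookup that check_server_args and auto_choose_speculative_params now share. A rough standalone equivalent using transformers.AutoConfig in place of sglang's get_config (the model path is a placeholder; requires the transformers package and access to the model files):

    from transformers import AutoConfig

    config = AutoConfig.from_pretrained(
        "meta-llama/Llama-3.1-8B-Instruct",  # hypothetical model path
        trust_remote_code=False,             # mirrors ServerArgs.trust_remote_code
        revision=None,                       # mirrors ServerArgs.revision
    )
    print(config.architectures[0])           # e.g. "LlamaForCausalLM", as used by the Llama4/DeepSeek checks
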
@@ -1712,23 +1794,143 @@ class ServerArgs:
         assert not (
             self.dp_size > 1 and self.nnodes != 1 and not self.enable_dp_attention
         ), "multi-node data parallel is not supported unless dp attention!"
+
+        assert self.base_gpu_id >= 0, "base_gpu_id must be non-negative"
+        assert self.gpu_id_step >= 1, "gpu_id_step must be positive"
+
+        assert self.moe_dense_tp_size in {
+            1,
+            None,
+        }, "moe_dense_tp_size only support 1 and None currently"
+
+        # Check model architecture
+        model_arch = self.get_hf_config().architectures[0]
+        if "Llama4" in model_arch:
+            assert self.attention_backend == "fa3", "fa3 is required for Llama4 model"
+
+        # Check LoRA
+        self.check_lora_server_args()
+
+        # Check speculative decoding
+        if self.speculative_algorithm is not None:
+            assert (
+                not self.enable_mixed_chunk
+            ), "enable_mixed_chunk is required for speculative decoding"
+
+        # Check chunked prefill
+        assert (
+            self.chunked_prefill_size % self.page_size == 0
+        ), "chunked_prefill_size must be divisible by page_size"
+
+    def check_lora_server_args(self):
         assert (
             self.max_loras_per_batch > 0
             # FIXME
             and (self.lora_paths is None or self.disable_radix_cache)
         ), "compatibility of lora and radix attention is in progress"
-        assert self.base_gpu_id >= 0, "base_gpu_id must be non-negative"
-        assert self.gpu_id_step >= 1, "gpu_id_step must be positive"
 
-        if isinstance(self.lora_paths, list):
-            lora_paths = self.lora_paths
-            self.lora_paths = {}
-            for lora_path in lora_paths:
-                if "=" in lora_path:
-                    name, path = lora_path.split("=", 1)
-                    self.lora_paths[name] = path
-                else:
-                    self.lora_paths[lora_path] = lora_path
+        # Enable LoRA if any LoRA paths are provided for backward compatibility.
+        if self.lora_paths:
+            if self.enable_lora is None:
+                self.enable_lora = True
+                logger.info(
+                    "--enable-lora is set to True because --lora-paths is provided."
+                )
+            elif self.enable_lora is False:
+                logger.warning(
+                    "--enable-lora is set to False, any provided lora_paths will be ignored."
+                )
+
+        if self.enable_lora:
+            # Normalize lora_paths to a dictionary if it is a list.
+            if isinstance(self.lora_paths, list):
+                lora_paths = self.lora_paths
+                self.lora_paths = {}
+                for lora_path in lora_paths:
+                    if "=" in lora_path:
+                        name, path = lora_path.split("=", 1)
+                        self.lora_paths[name] = LoRARef(lora_name=name, lora_path=path)
+                    else:
+                        self.lora_paths[lora_path] = LoRARef(
+                            lora_name=lora_path,
+                            lora_path=lora_path,
+                        )
+            elif isinstance(self.lora_paths, dict):
+                self.lora_paths = {
+                    k: LoRARef(lora_name=k, lora_path=v)
+                    for k, v in self.lora_paths.items()
+                }
+            elif self.lora_paths is None:
+                self.lora_paths = {}
+            else:
+                raise ValueError(
+                    f"Invalid type for --lora-paths: {type(self.lora_paths)}. "
+                    "Expected a list or a dictionary."
+                )
+
+            # Expand target modules
+            if self.lora_target_modules:
+                self.lora_target_modules = set(self.lora_target_modules)
+                if "all" in self.lora_target_modules:
+                    assert (
+                        len(self.lora_target_modules) == 1
+                    ), "If 'all' is specified in --lora-target-modules, it should be the only module specified."
+                    self.lora_target_modules = set(SUPPORTED_LORA_TARGET_MODULES)
+
+            # Ensure sufficient information is provided for LoRA initialization.
+            assert self.lora_paths or (
+                self.max_lora_rank and self.lora_target_modules
+            ), "When no initial --lora-paths is provided, you need to specify both --max-lora-rank and --lora-target-modules for LoRA initialization."
+
+    def validate_disagg_tp_size(self, prefill_tp: int, decode_tp: int):
+        larger_tp = max(decode_tp, prefill_tp)
+        smaller_tp = min(decode_tp, prefill_tp)
+        assert larger_tp % smaller_tp == 0, (
+            "Different tp size is supported only when one tp is multiple of the other. "
+            f"decode_tp={decode_tp}, prefill_tp={prefill_tp}"
+        )
+
+    def adjust_mem_fraction_for_vlm(self, model_config):
+        vision_config = getattr(model_config.hf_config, "vision_config", None)
+        if vision_config is None:
+            return
+
+        # roughly reduce the mem_fraction_static base on params of Vit
+        original_server_arg_mem_fraction = self.mem_fraction_static
+        # a base mem_fraction_static factor for regular Vit
+        base_mem_fraction_reduction_ratio = 0.95
+
+        vit_num_layers = getattr(vision_config, "num_hidden_layers", 24)
+        vit_hidden_size = getattr(vision_config, "hidden_size", 1024)
+
+        # baseline ViT params (ViT-L/14)
+        baseline_vit_layers = 24
+        baseline_vit_hidden_size = 1024
+
+        # weight params count
+        current_complexity_score = vit_num_layers * (vit_hidden_size**2)
+        baseline_complexity_score = baseline_vit_layers * (baseline_vit_hidden_size**2)
+        complexity_ratio = (
+            current_complexity_score / baseline_complexity_score
+            if baseline_complexity_score > 0
+            else 1.0
+        )
+
+        # every time the complexity grows 100%, adjust final factor for 10%
+        sensitivity_scale = 0.1
+        dynamic_adjustment_factor = 1.0 - sensitivity_scale * (complexity_ratio - 1.0)
+        dynamic_adjustment_factor = max(0.8, min(1.05, dynamic_adjustment_factor))
+
+        final_overall_factor = (
+            base_mem_fraction_reduction_ratio * dynamic_adjustment_factor
+        )
+        self.mem_fraction_static = (
+            original_server_arg_mem_fraction * final_overall_factor
+        )
+        logger.warning(
+            f"Multimodal model: Dynamically adjusted --mem-fraction-static "
+            f"from: {original_server_arg_mem_fraction:.3f} to: {self.mem_fraction_static:.3f}."
+        )
 
 
 def prepare_server_args(argv: List[str]) -> ServerArgs:
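
The adjust_mem_fraction_for_vlm method added above replaces the old flat "*= 0.90" multiplier with a ViT-size-aware factor. A standalone rerun of its arithmetic for a ViT-L/14-sized vision tower (24 layers, hidden size 1024, i.e. exactly the baseline), starting from an example mem_fraction_static of 0.88:

    mem_fraction_static = 0.88            # example starting value
    base_reduction = 0.95                 # base factor for a regular ViT
    vit_layers, vit_hidden = 24, 1024     # the method's own baseline numbers
    baseline_score = 24 * 1024**2

    complexity_ratio = (vit_layers * vit_hidden**2) / baseline_score     # 1.0 for the baseline tower
    dynamic = max(0.8, min(1.05, 1.0 - 0.1 * (complexity_ratio - 1.0)))  # stays at 1.0
    adjusted = mem_fraction_static * base_reduction * dynamic
    print(round(adjusted, 3))             # 0.836, i.e. roughly a 5% reduction for a baseline-sized ViT
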
@@ -1773,16 +1975,16 @@ class PortArgs:
     @staticmethod
     def init_new(server_args, dp_rank: Optional[int] = None) -> "PortArgs":
         if server_args.nccl_port is None:
-            port = server_args.port + random.randint(100, 1000)
+            nccl_port = server_args.port + random.randint(100, 1000)
             while True:
-                if is_port_available(port):
+                if is_port_available(nccl_port):
                     break
-                if port < 60000:
-                    port += 42
+                if nccl_port < 60000:
+                    nccl_port += 42
                 else:
-                    port -= 43
+                    nccl_port -= 43
         else:
-            port = server_args.nccl_port
+            nccl_port = server_args.nccl_port
 
         if not server_args.enable_dp_attention:
             # Normal case, use IPC within a single node
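
The renamed nccl_port probing loop above starts from a random offset above the HTTP port and steps by +42 (or -43 once past 60000) until a free port is found. A standalone sketch of that loop, using a plain socket bind check in place of sglang's is_port_available helper:

    import random
    import socket

    def is_port_available(port):
        # Stand-in availability check: try to bind the port on localhost.
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
            s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
            try:
                s.bind(("127.0.0.1", port))
                return True
            except OSError:
                return False

    def pick_nccl_port(http_port=30000):
        nccl_port = http_port + random.randint(100, 1000)
        while not is_port_available(nccl_port):
            nccl_port = nccl_port + 42 if nccl_port < 60000 else nccl_port - 43
        return nccl_port

    print(pick_nccl_port())
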
@@ -1790,7 +1992,7 @@ class PortArgs:
                 tokenizer_ipc_name=f"ipc://{tempfile.NamedTemporaryFile(delete=False).name}",
                 scheduler_input_ipc_name=f"ipc://{tempfile.NamedTemporaryFile(delete=False).name}",
                 detokenizer_ipc_name=f"ipc://{tempfile.NamedTemporaryFile(delete=False).name}",
-                nccl_port=port,
+                nccl_port=nccl_port,
                 rpc_ipc_name=f"ipc://{tempfile.NamedTemporaryFile(delete=False).name}",
                 metrics_ipc_name=f"ipc://{tempfile.NamedTemporaryFile(delete=False).name}",
             )
@@ -1820,7 +2022,7 @@ class PortArgs:
             tokenizer_ipc_name=f"tcp://{dist_init_host}:{port_base}",
             scheduler_input_ipc_name=f"tcp://{dist_init_host}:{scheduler_input_port}",
             detokenizer_ipc_name=f"tcp://{dist_init_host}:{port_base + 1}",
-            nccl_port=port,
+            nccl_port=nccl_port,
             rpc_ipc_name=f"tcp://{dist_init_host}:{port_base + 2}",
             metrics_ipc_name=f"tcp://{dist_init_host}:{port_base + 3}",
         )
@@ -1847,31 +2049,13 @@ class DeprecatedAction(argparse.Action):
         raise ValueError(self.help)
 
 
-def get_model_arch(args: ServerArgs):
-    hf_config = get_config(
-        args.model_path,
-        trust_remote_code=args.trust_remote_code,
-        revision=args.revision,
-        model_override_args=json.loads(args.json_model_override_args),
-    )
-    return hf_config.architectures[0]
-
-
 def auto_choose_speculative_params(self: ServerArgs):
     """
     Automatically choose the parameters for speculative decoding.
 
     You can tune them on your own models and prompts with scripts/playground/bench_speculative.py
     """
-    kwargs = {}
-
-    hf_config = get_config(
-        self.model_path,
-        trust_remote_code=self.trust_remote_code,
-        revision=self.revision,
-        model_override_args=json.loads(self.json_model_override_args),
-        **kwargs,
-    )
+    hf_config = self.get_hf_config()
     arch = hf_config.architectures[0]
 
     if arch in ["LlamaForCausalLM"]: