sglang 0.5.4__py3-none-any.whl → 0.5.4.post2__py3-none-any.whl

This diff shows the changes between publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (195)
  1. sglang/bench_one_batch.py +149 -34
  2. sglang/bench_serving.py +73 -14
  3. sglang/compile_deep_gemm.py +13 -7
  4. sglang/launch_server.py +2 -0
  5. sglang/srt/batch_invariant_ops/__init__.py +2 -0
  6. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +221 -4
  7. sglang/srt/checkpoint_engine/__init__.py +9 -0
  8. sglang/srt/checkpoint_engine/update.py +317 -0
  9. sglang/srt/compilation/backend.py +1 -1
  10. sglang/srt/configs/__init__.py +2 -0
  11. sglang/srt/configs/deepseek_ocr.py +542 -10
  12. sglang/srt/configs/deepseekvl2.py +95 -194
  13. sglang/srt/configs/kimi_linear.py +160 -0
  14. sglang/srt/configs/mamba_utils.py +66 -0
  15. sglang/srt/configs/model_config.py +30 -7
  16. sglang/srt/constants.py +7 -0
  17. sglang/srt/debug_utils/tensor_dump_forward_hook.py +149 -0
  18. sglang/srt/disaggregation/decode.py +34 -6
  19. sglang/srt/disaggregation/nixl/conn.py +2 -2
  20. sglang/srt/disaggregation/prefill.py +25 -3
  21. sglang/srt/distributed/device_communicators/custom_all_reduce.py +3 -1
  22. sglang/srt/distributed/parallel_state.py +9 -12
  23. sglang/srt/entrypoints/engine.py +31 -20
  24. sglang/srt/entrypoints/grpc_server.py +0 -1
  25. sglang/srt/entrypoints/http_server.py +94 -94
  26. sglang/srt/entrypoints/openai/protocol.py +7 -1
  27. sglang/srt/entrypoints/openai/serving_chat.py +42 -0
  28. sglang/srt/entrypoints/openai/serving_completions.py +10 -0
  29. sglang/srt/entrypoints/openai/serving_embedding.py +1 -0
  30. sglang/srt/environ.py +23 -2
  31. sglang/srt/eplb/expert_distribution.py +64 -1
  32. sglang/srt/eplb/expert_location.py +106 -36
  33. sglang/srt/function_call/function_call_parser.py +2 -0
  34. sglang/srt/function_call/minimax_m2.py +367 -0
  35. sglang/srt/grpc/compile_proto.py +3 -0
  36. sglang/srt/layers/activation.py +6 -0
  37. sglang/srt/layers/attention/ascend_backend.py +233 -5
  38. sglang/srt/layers/attention/attention_registry.py +3 -0
  39. sglang/srt/layers/attention/fla/chunk_delta_h.py +61 -32
  40. sglang/srt/layers/attention/fla/fused_recurrent.py +17 -4
  41. sglang/srt/layers/attention/fla/kda.py +1359 -0
  42. sglang/srt/layers/attention/fla/layernorm_gated.py +7 -1
  43. sglang/srt/layers/attention/flashattention_backend.py +19 -8
  44. sglang/srt/layers/attention/flashinfer_backend.py +10 -1
  45. sglang/srt/layers/attention/flashinfer_mla_backend.py +21 -11
  46. sglang/srt/layers/attention/flashmla_backend.py +1 -1
  47. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +223 -0
  48. sglang/srt/layers/attention/mamba/mamba.py +20 -11
  49. sglang/srt/layers/attention/nsa/dequant_k_cache.py +138 -6
  50. sglang/srt/layers/attention/nsa/nsa_indexer.py +45 -22
  51. sglang/srt/layers/attention/nsa/quant_k_cache.py +44 -12
  52. sglang/srt/layers/attention/nsa/transform_index.py +1 -1
  53. sglang/srt/layers/attention/nsa_backend.py +157 -23
  54. sglang/srt/layers/attention/triton_backend.py +4 -1
  55. sglang/srt/layers/attention/trtllm_mha_backend.py +10 -4
  56. sglang/srt/layers/attention/trtllm_mla_backend.py +11 -15
  57. sglang/srt/layers/attention/utils.py +78 -0
  58. sglang/srt/layers/communicator.py +24 -1
  59. sglang/srt/layers/deep_gemm_wrapper/compile_utils.py +1 -1
  60. sglang/srt/layers/layernorm.py +35 -6
  61. sglang/srt/layers/logits_processor.py +9 -20
  62. sglang/srt/layers/moe/cutlass_w4a8_moe.py +138 -0
  63. sglang/srt/layers/moe/ep_moe/kernels.py +194 -0
  64. sglang/srt/layers/moe/ep_moe/layer.py +78 -289
  65. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  66. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=257,N=256,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128]_down.json +164 -0
  67. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +68 -22
  68. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_config.py +43 -3
  69. sglang/srt/layers/moe/fused_moe_triton/fused_moe_triton_kernels.py +106 -26
  70. sglang/srt/layers/moe/fused_moe_triton/layer.py +3 -3
  71. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +7 -4
  72. sglang/srt/layers/moe/moe_runner/deep_gemm.py +340 -55
  73. sglang/srt/layers/moe/moe_runner/runner.py +3 -0
  74. sglang/srt/layers/moe/moe_runner/triton_kernels.py +194 -0
  75. sglang/srt/layers/moe/token_dispatcher/__init__.py +4 -4
  76. sglang/srt/layers/moe/token_dispatcher/base.py +11 -5
  77. sglang/srt/layers/moe/token_dispatcher/deepep.py +25 -18
  78. sglang/srt/layers/moe/token_dispatcher/standard.py +1 -1
  79. sglang/srt/layers/moe/topk.py +35 -10
  80. sglang/srt/layers/moe/utils.py +3 -4
  81. sglang/srt/layers/pooler.py +21 -2
  82. sglang/srt/layers/quantization/__init__.py +13 -84
  83. sglang/srt/layers/quantization/auto_round.py +394 -0
  84. sglang/srt/layers/quantization/awq.py +0 -3
  85. sglang/srt/layers/quantization/base_config.py +7 -0
  86. sglang/srt/layers/quantization/fp8.py +68 -63
  87. sglang/srt/layers/quantization/fp8_kernel.py +1 -1
  88. sglang/srt/layers/quantization/fp8_utils.py +2 -2
  89. sglang/srt/layers/quantization/gguf.py +566 -0
  90. sglang/srt/layers/quantization/modelopt_quant.py +168 -11
  91. sglang/srt/layers/quantization/mxfp4.py +30 -38
  92. sglang/srt/layers/quantization/unquant.py +23 -45
  93. sglang/srt/layers/quantization/w4afp8.py +38 -2
  94. sglang/srt/layers/radix_attention.py +5 -2
  95. sglang/srt/layers/rotary_embedding.py +130 -46
  96. sglang/srt/layers/sampler.py +12 -1
  97. sglang/srt/lora/lora_registry.py +9 -0
  98. sglang/srt/managers/async_mm_data_processor.py +122 -0
  99. sglang/srt/managers/data_parallel_controller.py +30 -3
  100. sglang/srt/managers/detokenizer_manager.py +3 -0
  101. sglang/srt/managers/io_struct.py +29 -4
  102. sglang/srt/managers/multi_tokenizer_mixin.py +22 -1
  103. sglang/srt/managers/schedule_batch.py +74 -15
  104. sglang/srt/managers/scheduler.py +185 -144
  105. sglang/srt/managers/scheduler_metrics_mixin.py +22 -14
  106. sglang/srt/managers/scheduler_output_processor_mixin.py +40 -3
  107. sglang/srt/managers/scheduler_pp_mixin.py +7 -2
  108. sglang/srt/managers/scheduler_profiler_mixin.py +3 -4
  109. sglang/srt/managers/scheduler_runtime_checker_mixin.py +45 -0
  110. sglang/srt/managers/scheduler_update_weights_mixin.py +18 -3
  111. sglang/srt/managers/session_controller.py +6 -5
  112. sglang/srt/managers/tokenizer_manager.py +165 -78
  113. sglang/srt/managers/tp_worker.py +24 -1
  114. sglang/srt/mem_cache/base_prefix_cache.py +23 -4
  115. sglang/srt/mem_cache/common.py +1 -0
  116. sglang/srt/mem_cache/hicache_storage.py +7 -1
  117. sglang/srt/mem_cache/memory_pool.py +253 -57
  118. sglang/srt/mem_cache/memory_pool_host.py +12 -5
  119. sglang/srt/mem_cache/radix_cache.py +4 -0
  120. sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +3 -2
  121. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +1 -1
  122. sglang/srt/metrics/collector.py +46 -3
  123. sglang/srt/model_executor/cuda_graph_runner.py +15 -3
  124. sglang/srt/model_executor/forward_batch_info.py +55 -14
  125. sglang/srt/model_executor/model_runner.py +77 -170
  126. sglang/srt/model_executor/npu_graph_runner.py +7 -3
  127. sglang/srt/model_executor/piecewise_cuda_graph_runner.py +22 -12
  128. sglang/srt/model_loader/weight_utils.py +1 -1
  129. sglang/srt/models/bailing_moe.py +9 -2
  130. sglang/srt/models/deepseek_nextn.py +11 -2
  131. sglang/srt/models/deepseek_v2.py +296 -78
  132. sglang/srt/models/glm4.py +391 -77
  133. sglang/srt/models/glm4_moe.py +322 -354
  134. sglang/srt/models/glm4_moe_nextn.py +4 -14
  135. sglang/srt/models/glm4v.py +196 -55
  136. sglang/srt/models/glm4v_moe.py +29 -197
  137. sglang/srt/models/gpt_oss.py +1 -10
  138. sglang/srt/models/kimi_linear.py +678 -0
  139. sglang/srt/models/llama4.py +1 -1
  140. sglang/srt/models/llama_eagle3.py +11 -1
  141. sglang/srt/models/longcat_flash.py +2 -2
  142. sglang/srt/models/minimax_m2.py +922 -0
  143. sglang/srt/models/nvila.py +355 -0
  144. sglang/srt/models/nvila_lite.py +184 -0
  145. sglang/srt/models/qwen2.py +23 -2
  146. sglang/srt/models/qwen2_moe.py +30 -15
  147. sglang/srt/models/qwen3.py +35 -5
  148. sglang/srt/models/qwen3_moe.py +18 -12
  149. sglang/srt/models/qwen3_next.py +7 -0
  150. sglang/srt/multimodal/customized_mm_processor_utils.py +35 -0
  151. sglang/srt/multimodal/processors/base_processor.py +1 -0
  152. sglang/srt/multimodal/processors/glm4v.py +1 -1
  153. sglang/srt/multimodal/processors/{vila.py → nvila.py} +32 -24
  154. sglang/srt/multimodal/processors/points_v15_chat.py +2 -2
  155. sglang/srt/multiplex/multiplexing_mixin.py +209 -0
  156. sglang/srt/multiplex/pdmux_context.py +164 -0
  157. sglang/srt/parser/conversation.py +7 -1
  158. sglang/srt/parser/reasoning_parser.py +28 -1
  159. sglang/srt/sampling/custom_logit_processor.py +67 -1
  160. sglang/srt/sampling/penaltylib/frequency_penalty.py +6 -8
  161. sglang/srt/sampling/penaltylib/min_new_tokens.py +7 -8
  162. sglang/srt/sampling/penaltylib/orchestrator.py +43 -3
  163. sglang/srt/sampling/penaltylib/presence_penalty.py +6 -8
  164. sglang/srt/server_args.py +459 -199
  165. sglang/srt/single_batch_overlap.py +2 -4
  166. sglang/srt/speculative/draft_utils.py +16 -0
  167. sglang/srt/speculative/eagle_info.py +42 -36
  168. sglang/srt/speculative/eagle_info_v2.py +68 -25
  169. sglang/srt/speculative/eagle_utils.py +261 -16
  170. sglang/srt/speculative/eagle_worker.py +11 -3
  171. sglang/srt/speculative/eagle_worker_v2.py +15 -9
  172. sglang/srt/speculative/spec_info.py +305 -31
  173. sglang/srt/speculative/spec_utils.py +44 -8
  174. sglang/srt/tracing/trace.py +121 -12
  175. sglang/srt/utils/common.py +142 -74
  176. sglang/srt/utils/hf_transformers_utils.py +38 -12
  177. sglang/srt/utils/torch_memory_saver_adapter.py +20 -0
  178. sglang/test/kits/radix_cache_server_kit.py +50 -0
  179. sglang/test/runners.py +31 -7
  180. sglang/test/simple_eval_common.py +5 -3
  181. sglang/test/simple_eval_humaneval.py +1 -0
  182. sglang/test/simple_eval_math.py +1 -0
  183. sglang/test/simple_eval_mmlu.py +1 -0
  184. sglang/test/simple_eval_mmmu_vlm.py +1 -0
  185. sglang/test/test_deterministic.py +235 -12
  186. sglang/test/test_deterministic_utils.py +2 -1
  187. sglang/test/test_utils.py +7 -1
  188. sglang/version.py +1 -1
  189. {sglang-0.5.4.dist-info → sglang-0.5.4.post2.dist-info}/METADATA +15 -28
  190. {sglang-0.5.4.dist-info → sglang-0.5.4.post2.dist-info}/RECORD +194 -175
  191. sglang/srt/models/vila.py +0 -306
  192. /sglang/test/{kit_matched_stop.py → kits/matched_stop_kit.py} +0 -0
  193. {sglang-0.5.4.dist-info → sglang-0.5.4.post2.dist-info}/WHEEL +0 -0
  194. {sglang-0.5.4.dist-info → sglang-0.5.4.post2.dist-info}/licenses/LICENSE +0 -0
  195. {sglang-0.5.4.dist-info → sglang-0.5.4.post2.dist-info}/top_level.txt +0 -0
sglang/srt/server_args.py CHANGED
@@ -27,19 +27,25 @@ from typing import Dict, List, Literal, Optional, Union
27
27
  import orjson
28
28
 
29
29
  from sglang.srt.connector import ConnectorType
30
+ from sglang.srt.environ import envs
30
31
  from sglang.srt.function_call.function_call_parser import FunctionCallParser
31
32
  from sglang.srt.lora.lora_registry import LoRARef
32
33
  from sglang.srt.parser.reasoning_parser import ReasoningParser
33
- from sglang.srt.utils import (
34
+ from sglang.srt.utils.common import (
34
35
  LORA_TARGET_ALL_MODULES,
35
36
  SUPPORTED_LORA_TARGET_MODULES,
36
37
  configure_ipv6,
38
+ cpu_has_amx_support,
37
39
  get_device,
38
40
  get_device_memory_capacity,
39
41
  get_device_sm,
42
+ is_blackwell_supported,
40
43
  is_cuda,
44
+ is_fa3_default_architecture,
41
45
  is_flashinfer_available,
42
46
  is_hip,
47
+ is_hopper_with_cuda_12_3,
48
+ is_no_spec_infer_or_topk_one,
43
49
  is_npu,
44
50
  is_port_available,
45
51
  is_remote_url,
@@ -51,6 +57,7 @@ from sglang.srt.utils import (
51
57
  json_list_type,
52
58
  nullable_str,
53
59
  parse_connector_type,
60
+ xpu_has_xmx_support,
54
61
  )
55
62
  from sglang.srt.utils.hf_transformers_utils import check_gguf_file, get_config
56
63
  from sglang.utils import is_in_ci
@@ -92,6 +99,7 @@ QUANTIZATION_CHOICES = [
92
99
  "qoq",
93
100
  "w4afp8",
94
101
  "mxfp4",
102
+ "auto-round",
95
103
  "compressed-tensors", # for Ktransformers
96
104
  ]
97
105
 
@@ -127,9 +135,18 @@ GRAMMAR_BACKEND_CHOICES = ["xgrammar", "outlines", "llguidance", "none"]
127
135
 
128
136
  DETERMINISTIC_ATTENTION_BACKEND_CHOICES = ["flashinfer", "fa3", "triton"]
129
137
 
138
+ RADIX_SUPPORTED_DETERMINISTIC_ATTENTION_BACKEND = ["fa3", "triton"]
139
+
130
140
  DEFAULT_LORA_EVICTION_POLICY = "lru"
131
141
 
132
- NSA_CHOICES = ["flashmla_sparse", "flashmla_kv", "fa3", "tilelang", "aiter"]
142
+ NSA_CHOICES = [
143
+ "flashmla_sparse",
144
+ "flashmla_kv",
145
+ "flashmla_auto",
146
+ "fa3",
147
+ "tilelang",
148
+ "aiter",
149
+ ]
133
150
 
134
151
  RADIX_EVICTION_POLICY_CHOICES = ["lru", "lfu"]
135
152
 
@@ -175,12 +192,25 @@ def add_deterministic_attention_backend_choices(choices):
175
192
  DETERMINISTIC_ATTENTION_BACKEND_CHOICES.extend(choices)
176
193
 
177
194
 
195
+ def add_radix_supported_deterministic_attention_backend_choices(choices):
196
+ RADIX_SUPPORTED_DETERMINISTIC_ATTENTION_BACKEND.extend(choices)
197
+
198
+
178
199
  def add_radix_eviction_policy_choices(choices):
179
200
  RADIX_EVICTION_POLICY_CHOICES.extend(choices)
180
201
 
181
202
 
182
203
  @dataclasses.dataclass
183
204
  class ServerArgs:
205
+ """
206
+ The arguments of the server.
207
+
208
+ NOTE: When you add new arguments, please make sure the order
209
+ in this class definition the same as the order in the the function
210
+ `ServerArgs.add_cli_args`.
211
+ Please follow the existing style to group the new arguments into related groups or create new groups.
212
+ """
213
+
184
214
  # Model and tokenizer
185
215
  model_path: str
186
216
  tokenizer_path: Optional[str] = None
@@ -190,11 +220,6 @@ class ServerArgs:
190
220
  load_format: str = "auto"
191
221
  model_loader_extra_config: str = "{}"
192
222
  trust_remote_code: bool = False
193
- modelopt_quant: Optional[Union[str, Dict]] = None
194
- modelopt_checkpoint_restore_path: Optional[str] = None
195
- modelopt_checkpoint_save_path: Optional[str] = None
196
- modelopt_export_path: Optional[str] = None
197
- quantize_and_serve: bool = False
198
223
  context_length: Optional[int] = None
199
224
  is_embedding: bool = False
200
225
  enable_multimodal: Optional[bool] = None
@@ -216,6 +241,11 @@ class ServerArgs:
216
241
  quantization_param_path: Optional[str] = None
217
242
  kv_cache_dtype: str = "auto"
218
243
  enable_fp32_lm_head: bool = False
244
+ modelopt_quant: Optional[Union[str, Dict]] = None
245
+ modelopt_checkpoint_restore_path: Optional[str] = None
246
+ modelopt_checkpoint_save_path: Optional[str] = None
247
+ modelopt_export_path: Optional[str] = None
248
+ quantize_and_serve: bool = False
219
249
 
220
250
  # Memory and scheduling
221
251
  mem_fraction_static: Optional[float] = None
@@ -238,8 +268,6 @@ class ServerArgs:
238
268
 
239
269
  # Runtime options
240
270
  device: Optional[str] = None
241
- elastic_ep_backend: Literal[None, "mooncake"] = None
242
- mooncake_ib_device: Optional[str] = None
243
271
  tp_size: int = 1
244
272
  pp_size: int = 1
245
273
  pp_max_micro_batch_size: Optional[int] = None
@@ -272,12 +300,12 @@ class ServerArgs:
272
300
  collect_tokens_histogram: bool = False
273
301
  prompt_tokens_buckets: Optional[List[str]] = None
274
302
  generation_tokens_buckets: Optional[List[str]] = None
303
+ gc_warning_threshold_secs: float = 0.0
275
304
  decode_log_interval: int = 40
276
305
  enable_request_time_stats_logging: bool = False
277
306
  kv_events_config: Optional[str] = None
278
- gc_warning_threshold_secs: float = 0.0
279
307
  enable_trace: bool = False
280
- oltp_traces_endpoint: str = "localhost:4317"
308
+ otlp_traces_endpoint: str = "localhost:4317"
281
309
 
282
310
  # API related
283
311
  api_key: Optional[str] = None
@@ -317,8 +345,8 @@ class ServerArgs:
317
345
  ] = None
318
346
  max_loaded_loras: Optional[int] = None
319
347
  max_loras_per_batch: int = 8
320
- lora_eviction_policy: str = DEFAULT_LORA_EVICTION_POLICY
321
- lora_backend: str = "triton"
348
+ lora_eviction_policy: str = "lru"
349
+ lora_backend: str = "csgmv"
322
350
  max_lora_chunk_size: Optional[int] = 16
323
351
 
324
352
  # Kernel backend
@@ -332,7 +360,6 @@ class ServerArgs:
332
360
  nsa_decode_backend: str = "fa3"
333
361
 
334
362
  # Speculative decoding
335
- enable_beta_spec: bool = False
336
363
  speculative_algorithm: Optional[str] = None
337
364
  speculative_draft_model_path: Optional[str] = None
338
365
  speculative_draft_model_revision: Optional[str] = None
@@ -375,6 +402,8 @@ class ServerArgs:
375
402
  enable_expert_distribution_metrics: bool = False
376
403
  deepep_config: Optional[str] = None
377
404
  moe_dense_tp_size: Optional[int] = None
405
+ elastic_ep_backend: Literal[None, "mooncake"] = None
406
+ mooncake_ib_device: Optional[str] = None
378
407
 
379
408
  # Mamba cache
380
409
  max_mamba_cache_size: Optional[int] = None
@@ -473,6 +502,7 @@ class ServerArgs:
473
502
  scheduler_recv_interval: int = 1
474
503
  numa_node: Optional[List[int]] = None
475
504
  enable_deterministic_inference: bool = False
505
+ rl_on_policy_target: Optional[str] = None
476
506
 
477
507
  # Dynamic batch tokenizer
478
508
  enable_dynamic_batch_tokenizer: bool = False
@@ -481,6 +511,9 @@ class ServerArgs:
481
511
 
482
512
  # Debug tensor dumps
483
513
  debug_tensor_dump_output_folder: Optional[str] = None
514
+ # -1 mean dump all layers.
515
+ debug_tensor_dump_layers: int = -1
516
+ # TODO(guoyuhong): clean the old dumper code.
484
517
  debug_tensor_dump_input_file: Optional[str] = None
485
518
  debug_tensor_dump_inject: bool = False
486
519
 
@@ -509,18 +542,9 @@ class ServerArgs:
509
542
  pdmux_config_path: Optional[str] = None
510
543
  sm_group_num: int = 8
511
544
 
512
- def get_attention_backends(server_args):
513
- prefill_attention_backend_str = (
514
- server_args.prefill_attention_backend
515
- if server_args.prefill_attention_backend
516
- else server_args.attention_backend
517
- )
518
- decode_attention_backend_str = (
519
- server_args.decode_attention_backend
520
- if server_args.decode_attention_backend
521
- else server_args.attention_backend
522
- )
523
- return prefill_attention_backend_str, decode_attention_backend_str
545
+ # For Multi-Modal
546
+ mm_max_concurrent_calls: int = 32
547
+ mm_per_request_timeout: float = 10.0
524
548
 
525
549
  def __post_init__(self):
526
550
  """
@@ -550,6 +574,9 @@ class ServerArgs:
550
574
  # Apply model-specific adjustments.
551
575
  self._handle_model_specific_adjustments()
552
576
 
577
+ # Handle Hicache settings.
578
+ self._handle_hicache()
579
+
553
580
  # Set kernel backends.
554
581
  self._handle_sampling_backend()
555
582
  self._handle_attention_backend_compatibility()
@@ -572,9 +599,6 @@ class ServerArgs:
572
599
  # Handle pipeline parallelism.
573
600
  self._handle_pipeline_parallelism()
574
601
 
575
- # Handle Hicache settings.
576
- self._handle_hicache()
577
-
578
602
  # Handle speculative decoding logic.
579
603
  self._handle_speculative_decoding()
580
604
 
@@ -614,22 +638,6 @@ class ServerArgs:
614
638
  )
615
639
  self.tool_call_parser = deprecated_tool_call_parsers[self.tool_call_parser]
616
640
 
617
- def _handle_ktransformers_configs(self):
618
- from sglang.srt.layers.quantization.compressed_tensors.compressed_tensors_moe import (
619
- CompressedTensorsWNA16AMXEPMoEMethod,
620
- override_config,
621
- )
622
-
623
- override_config(
624
- CompressedTensorsWNA16AMXEPMoEMethod,
625
- self.kt_num_gpu_experts,
626
- self.kt_cpuinfer,
627
- self.kt_threadpool_count,
628
- self.kt_amx_weight_path,
629
- self.kt_amx_method,
630
- self.chunked_prefill_size,
631
- )
632
-
633
641
  def _handle_missing_default_values(self):
634
642
  if self.tokenizer_path is None:
635
643
  self.tokenizer_path = self.model_path
@@ -684,7 +692,7 @@ class ServerArgs:
684
692
  self.cuda_graph_max_bs = 64
685
693
  elif gpu_mem < 35 * 1024:
686
694
  # A10, 4090, 5090
687
- # (chunked_prefill_size 2k, cuda_graph_max_bs 16 if tp < 4 else 80)
695
+ # (chunked_prefill_size 2k, cuda_graph_max_bs 24 if tp < 4 else 80)
688
696
  if self.chunked_prefill_size is None:
689
697
  self.chunked_prefill_size = 2048
690
698
  if self.cuda_graph_max_bs is None:
@@ -692,7 +700,7 @@ class ServerArgs:
692
700
  # However, when serving models with TP4 or TP8, we need to enable cuda graph to maintain high performance. In this case, we can set `cuda_graph_max_bs` to 80 (half of the default value 160) to reduce the memory overhead of creating cuda graphs. Looking at the logs
693
701
  # from TP4 serving of qwen2-72b, a value of 80 is sufficient and can reduce the memory overhead of creating cuda graphs on lower-end GPUs compared to the original 160, avoiding OOM issues.
694
702
  if self.tp_size < 4:
695
- self.cuda_graph_max_bs = 16
703
+ self.cuda_graph_max_bs = 24
696
704
  else:
697
705
  self.cuda_graph_max_bs = 80
698
706
  elif gpu_mem < 60 * 1024:
@@ -800,11 +808,9 @@ class ServerArgs:
800
808
  else 0.88
801
809
  )
802
810
 
803
- # Lazy init to avoid circular import
804
- # Multimodal models need more memory for the image processor
805
- from sglang.srt.configs.model_config import ModelConfig
806
-
807
- model_config = ModelConfig.from_server_args(self)
811
+ # Multimodal models need more memory for the image processing,
812
+ # so we adjust the mem_fraction_static accordingly.
813
+ model_config = self.get_model_config()
808
814
  if model_config.is_multimodal:
809
815
  self.adjust_mem_fraction_for_vlm(model_config)
810
816
 
@@ -829,7 +835,7 @@ class ServerArgs:
829
835
  capture_bs = (
830
836
  list(range(1, 9, 1))
831
837
  + list(range(10, 33, 2))
832
- + list(range(40, 64, 4))
838
+ + list(range(40, 65, 4))
833
839
  + list(range(72, 257, 8))
834
840
  + list(range(272, self.cuda_graph_max_bs + 1, 16))
835
841
  )
@@ -892,14 +898,19 @@ class ServerArgs:
892
898
  logger.info(
893
899
  "Enable FlashInfer AllReduce Fusion on sm100 for DeepseekV3ForCausalLM"
894
900
  )
895
- if (
896
- self.quantization == "modelopt_fp4"
897
- and self.moe_runner_backend == "auto"
898
- ):
901
+ if self.moe_a2a_backend == "none" and self.moe_runner_backend == "auto":
899
902
  self.moe_runner_backend = "flashinfer_trtllm"
900
903
  logger.info(
901
- "Use flashinfer_trtllm as moe runner backend on sm100 for DeepseekV3ForCausalLM"
904
+ "Use flashinfer_trtllm as MoE runner backend on sm100 for DeepseekV3ForCausalLM"
902
905
  )
906
+ if self.quantization is None:
907
+ # Default DeepSeek V3/R1 native FP8 when not explicitly set,
908
+ # Because we need this condition for an assertion in
909
+ # flashinfer_trtllm MoE runner backend.
910
+ self.quantization = "fp8"
911
+ logger.info(
912
+ "Quantization not specified, default to fp8 for DeepSeek on sm100"
913
+ )
903
914
 
904
915
  elif model_arch in ["GptOssForCausalLM"]:
905
916
  if (
@@ -925,7 +936,7 @@ class ServerArgs:
925
936
  f"- Decode: {decode_attn_backend}\n"
926
937
  )
927
938
 
928
- if is_sm100_supported():
939
+ if is_blackwell_supported():
929
940
  if not self.enable_dp_attention:
930
941
  self.enable_flashinfer_allreduce_fusion = True
931
942
  logger.info(
@@ -937,7 +948,7 @@ class ServerArgs:
937
948
  and quantization_config.get("quant_method") == "mxfp4"
938
949
  )
939
950
 
940
- if is_sm100_supported() and is_mxfp4_quant_format:
951
+ if is_blackwell_supported() and is_mxfp4_quant_format:
941
952
  self.moe_runner_backend = "flashinfer_mxfp4"
942
953
  logger.warning(
943
954
  "Detected SM100 and MXFP4 quantization format for GPT-OSS model, enabling FlashInfer MXFP4 MOE kernel."
@@ -966,7 +977,19 @@ class ServerArgs:
966
977
  "fa3",
967
978
  "aiter",
968
979
  "triton",
969
- }, "fa3, aiter, or triton is required for Llama4 model"
980
+ "trtllm_mha",
981
+ }, "fa3, aiter, triton, or trtllm_mha is required for Llama4 model"
982
+ if is_sm100_supported() and self.attention_backend is None:
983
+ self.attention_backend = "trtllm_mha"
984
+ logger.warning(
985
+ "Use trtllm_mha as attention backend on sm100 for Llama4 model"
986
+ )
987
+ if is_sm100_supported() and self.moe_runner_backend == "auto":
988
+ if self.quantization in {"fp8", "modelopt_fp8"}:
989
+ self.moe_runner_backend = "flashinfer_trtllm"
990
+ logger.info(
991
+ "Use flashinfer_trtllm as MoE runner backend on SM100 for Llama4"
992
+ )
970
993
  elif model_arch in [
971
994
  "Gemma2ForCausalLM",
972
995
  "Gemma3ForCausalLM",
@@ -1005,6 +1028,11 @@ class ServerArgs:
1005
1028
  logger.info(
1006
1029
  f"Using {self.attention_backend} as attention backend for {model_arch}."
1007
1030
  )
1031
+ elif model_arch in ["KimiLinearForCausalLM"]:
1032
+ logger.warning(
1033
+ f"Disabling Radix Cache for {model_arch} as it is not yet supported."
1034
+ )
1035
+ self.disable_radix_cache = True
1008
1036
 
1009
1037
  if is_deepseek_nsa(hf_config):
1010
1038
  if (
@@ -1027,16 +1055,30 @@ class ServerArgs:
1027
1055
  import torch
1028
1056
 
1029
1057
  major, _ = torch.cuda.get_device_capability()
1030
- if major >= 10:
1031
- self.kv_cache_dtype = "fp8_e4m3"
1032
- logger.warning("Setting KV cache dtype to fp8.")
1058
+ if self.kv_cache_dtype == "auto":
1059
+ self.kv_cache_dtype = "fp8_e4m3" if major >= 10 else "bfloat16"
1060
+ logger.warning(
1061
+ f"Setting KV cache dtype to {self.kv_cache_dtype} for DeepSeek NSA."
1062
+ )
1063
+ if self.kv_cache_dtype == "bf16":
1064
+ self.kv_cache_dtype = "bfloat16"
1065
+ assert self.kv_cache_dtype in [
1066
+ "bfloat16",
1067
+ "fp8_e4m3",
1068
+ ], "DeepSeek NSA only supports bf16/bfloat16 or fp8_e4m3 kv_cache_dtype"
1033
1069
 
1034
1070
  if self.kv_cache_dtype == "fp8_e4m3":
1035
- self.nsa_prefill_backend = "flashmla_kv"
1071
+ # flashmla_auto dispatches to flashmla_sparse/flashmla_kv based on hardware and heuristics
1072
+ self.nsa_prefill_backend = "flashmla_auto"
1036
1073
  self.nsa_decode_backend = "flashmla_kv"
1037
1074
  logger.warning(
1038
- "Setting NSA backend to flashmla_kv for FP8 KV Cache."
1075
+ "Setting NSA backend to flashmla_auto for prefill and flashmla_kv for decode for FP8 KV Cache."
1039
1076
  )
1077
+ else:
1078
+ # set prefill/decode backends for Blackwell. The default settings are for Hopper.
1079
+ if major >= 10:
1080
+ self.nsa_prefill_backend = "flashmla_sparse"
1081
+ self.nsa_decode_backend = "flashmla_sparse"
1040
1082
 
1041
1083
  # Logging env vars for NSA
1042
1084
  from sglang.srt.layers.attention.nsa.utils import (
@@ -1052,6 +1094,67 @@ class ServerArgs:
1052
1094
  )
1053
1095
 
1054
1096
  def _handle_attention_backend_compatibility(self):
1097
+ model_config = self.get_model_config()
1098
+ use_mla_backend = self.use_mla_backend()
1099
+
1100
+ if self.prefill_attention_backend is not None and (
1101
+ self.prefill_attention_backend == self.decode_attention_backend
1102
+ ): # override the default attention backend
1103
+ self.attention_backend = self.prefill_attention_backend
1104
+
1105
+ # Pick the default attention backend if not specified
1106
+ if self.attention_backend is None:
1107
+ """
1108
+ Auto select the fastest attention backend.
1109
+
1110
+ 1. Models with MHA Architecture (e.g: Llama, QWen)
1111
+ 1.1 We will turn on FA3 on hopper unless user use spec decode with topk > 1 or page_size > 1.
1112
+ 1.2 In other cases, we will use flashinfer if available, otherwise use triton.
1113
+ 2. Models with MLA Architecture and using FA3
1114
+ 2.1 We will use FA3 backend on hopper.
1115
+ 2.2 We will use Flashinfer backend on blackwell.
1116
+ 2.3 Otherwise, we will use triton backend.
1117
+ """
1118
+
1119
+ if not use_mla_backend:
1120
+ # MHA architecture
1121
+ if (
1122
+ is_hopper_with_cuda_12_3()
1123
+ and is_no_spec_infer_or_topk_one(self)
1124
+ and is_fa3_default_architecture(self.model_config.hf_config)
1125
+ ):
1126
+ self.attention_backend = "fa3"
1127
+ elif is_hip():
1128
+ self.attention_backend = "aiter"
1129
+ elif is_npu():
1130
+ self.attention_backend = "ascend"
1131
+ else:
1132
+ self.attention_backend = (
1133
+ "flashinfer" if is_flashinfer_available() else "triton"
1134
+ )
1135
+ else:
1136
+ # MLA architecture
1137
+ if is_hopper_with_cuda_12_3():
1138
+ self.attention_backend = "fa3"
1139
+ elif is_sm100_supported():
1140
+ self.attention_backend = "flashinfer"
1141
+ elif is_hip():
1142
+ head_num = model_config.get_num_kv_heads(self.tp_size)
1143
+ # TODO current aiter only support head number 16 or 128 head number
1144
+ if head_num == 128 or head_num == 16:
1145
+ self.attention_backend = "aiter"
1146
+ else:
1147
+ self.attention_backend = "triton"
1148
+ elif is_npu():
1149
+ self.attention_backend = "ascend"
1150
+ else:
1151
+ self.attention_backend = "triton"
1152
+
1153
+ logger.warning(
1154
+ f"Attention backend not explicitly specified. Use {self.attention_backend} backend by default."
1155
+ )
1156
+
1157
+ # Torch native and flex attention backends
1055
1158
  if self.attention_backend == "torch_native":
1056
1159
  logger.warning(
1057
1160
  "Cuda graph is disabled because of using torch native attention backend"
@@ -1067,12 +1170,7 @@ class ServerArgs:
1067
1170
  self.speculative_algorithm is None
1068
1171
  ), "Speculative decoding is currently not supported with Flex Attention backend"
1069
1172
 
1070
- if is_npu() and self.attention_backend in ["ascend"]:
1071
- logger.warning(
1072
- "At this moment Ascend attention backend only supports a page_size of 128, change page_size to 128."
1073
- )
1074
- self.page_size = 128
1075
-
1173
+ # Major NVIDIA platforms backends
1076
1174
  if (
1077
1175
  self.attention_backend == "flashmla"
1078
1176
  or self.decode_attention_backend == "flashmla"
@@ -1095,7 +1193,7 @@ class ServerArgs:
1095
1193
  self.attention_backend == "trtllm_mla"
1096
1194
  or self.decode_attention_backend == "trtllm_mla"
1097
1195
  ):
1098
- if not is_sm100_supported():
1196
+ if not is_blackwell_supported():
1099
1197
  raise ValueError(
1100
1198
  "TRTLLM MLA backend is only supported on Blackwell GPUs (SM100). Please use a different backend."
1101
1199
  )
@@ -1127,19 +1225,13 @@ class ServerArgs:
1127
1225
  )
1128
1226
  self.page_size = 64
1129
1227
 
1130
- if self.attention_backend == "dual_chunk_flash_attn":
1228
+ if self.attention_backend == "fa3" and self.kv_cache_dtype == "fp8_e5m2":
1131
1229
  logger.warning(
1132
- "Mixed chunk and radix cache are disabled when using dual-chunk flash attention backend"
1230
+ "FlashAttention3 only supports fp8_e4m3 if using FP8; "
1231
+ "Setting attention backend to triton."
1133
1232
  )
1134
- self.enable_mixed_chunk = False
1135
- self.disable_radix_cache = True
1233
+ self.attention_backend = "triton"
1136
1234
 
1137
- if self.attention_backend == "intel_xpu":
1138
- if self.page_size not in [32, 64, 128]:
1139
- logger.warning(
1140
- f"Intel XPU attention backend only supports page_size of 32, 64 or 128, changing page_size from {self.page_size} to 128."
1141
- )
1142
- self.page_size = 128
1143
1235
  if self.attention_backend == "fa4" or self.decode_attention_backend == "fa4":
1144
1236
  raise ValueError(
1145
1237
  "FA4 backend is only supported for prefill. Please use `--prefill-attention-backend fa4` instead."
@@ -1150,6 +1242,66 @@ class ServerArgs:
1150
1242
  )
1151
1243
  self.page_size = 128
1152
1244
 
1245
+ # AMD platforms backends
1246
+ if self.attention_backend == "aiter":
1247
+ if model_config.context_len > 8192:
1248
+ self.mem_fraction_static *= 0.85
1249
+
1250
+ # NPU platforms backends
1251
+ if is_npu() and self.attention_backend in ["ascend"]:
1252
+ logger.warning(
1253
+ "At this moment Ascend attention backend only supports a page_size of 128, change page_size to 128."
1254
+ )
1255
+ self.page_size = 128
1256
+
1257
+ # Other platforms backends
1258
+ if (
1259
+ self.attention_backend == "intel_amx"
1260
+ and self.device == "cpu"
1261
+ and not cpu_has_amx_support()
1262
+ ):
1263
+ logger.warning(
1264
+ "The current platform does not support Intel AMX, will fallback to torch_native backend."
1265
+ )
1266
+ self.attention_backend = "torch_native"
1267
+
1268
+ if (
1269
+ self.attention_backend == "intel_xpu"
1270
+ and self.device == "xpu"
1271
+ and not xpu_has_xmx_support()
1272
+ ):
1273
+ logger.warning(
1274
+ "The current platform does not support Intel XMX, will fallback to triton backend."
1275
+ )
1276
+ self.attention_backend = "triton"
1277
+
1278
+ if self.attention_backend == "intel_xpu":
1279
+ if self.page_size not in [32, 64, 128]:
1280
+ logger.warning(
1281
+ f"Intel XPU attention backend only supports page_size of 32, 64 or 128, changing page_size from {self.page_size} to 128."
1282
+ )
1283
+ self.page_size = 128
1284
+
1285
+ # Dual chunk flash attention backend
1286
+ if (
1287
+ getattr(model_config.hf_config, "dual_chunk_attention_config", None)
1288
+ is not None
1289
+ ):
1290
+ if self.attention_backend is None:
1291
+ self.attention_backend = "dual_chunk_flash_attn"
1292
+ logger.info("Dual chunk attention is turned on by default.")
1293
+ elif self.attention_backend != "dual_chunk_flash_attn":
1294
+ raise ValueError(
1295
+ "Dual chunk attention is enabled, but attention backend is set to "
1296
+ f"{self.attention_backend}. Please set it to 'dual_chunk_flash_attn'."
1297
+ )
1298
+ if self.attention_backend == "dual_chunk_flash_attn":
1299
+ logger.warning(
1300
+ "Mixed chunk and radix cache are disabled when using dual-chunk flash attention backend"
1301
+ )
1302
+ self.enable_mixed_chunk = False
1303
+ self.disable_radix_cache = True
1304
+
1153
1305
  def _handle_page_size(self):
1154
1306
  if self.page_size is None:
1155
1307
  self.page_size = 1
@@ -1162,6 +1314,22 @@ class ServerArgs:
1162
1314
  if self.grammar_backend is None:
1163
1315
  self.grammar_backend = "xgrammar"
1164
1316
 
1317
+ def _handle_ktransformers_configs(self):
1318
+ from sglang.srt.layers.quantization.compressed_tensors.compressed_tensors_moe import (
1319
+ CompressedTensorsWNA16AMXEPMoEMethod,
1320
+ override_config,
1321
+ )
1322
+
1323
+ override_config(
1324
+ CompressedTensorsWNA16AMXEPMoEMethod,
1325
+ self.kt_num_gpu_experts,
1326
+ self.kt_cpuinfer,
1327
+ self.kt_threadpool_count,
1328
+ self.kt_amx_weight_path,
1329
+ self.kt_amx_method,
1330
+ self.chunked_prefill_size,
1331
+ )
1332
+
1165
1333
  def _handle_data_parallelism(self):
1166
1334
  if self.dp_size == 1:
1167
1335
  self.enable_dp_attention = False
@@ -1192,8 +1360,10 @@ class ServerArgs:
1192
1360
 
1193
1361
  if self.moe_runner_backend == "flashinfer_trtllm":
1194
1362
  assert (
1195
- self.quantization == "modelopt_fp4" or self.quantization == "fp8"
1196
- ), "modelopt_fp4 or fp8 quantization is required for Flashinfer TRTLLM MoE"
1363
+ self.quantization == "modelopt_fp4"
1364
+ or self.quantization == "modelopt_fp8"
1365
+ or self.quantization == "fp8"
1366
+ ), "modelopt_fp4, modelopt_fp8 or fp8 quantization is required for Flashinfer TRTLLM MoE"
1197
1367
  self.disable_shared_experts_fusion = True
1198
1368
  logger.warning(
1199
1369
  "FlashInfer TRTLLM MoE is enabled. --disable-shared-experts-fusion is automatically set."
@@ -1277,6 +1447,24 @@ class ServerArgs:
1277
1447
  "Page first direct layout only support direct io backend"
1278
1448
  )
1279
1449
 
1450
+ if self.enable_hierarchical_cache and self.hicache_io_backend == "kernel":
1451
+ # fix for the compatibility issue with FlashAttention3 decoding and HiCache kernel backend
1452
+ if self.decode_attention_backend is None:
1453
+ if not self.use_mla_backend():
1454
+ self.decode_attention_backend = (
1455
+ "flashinfer" if is_flashinfer_available() else "triton"
1456
+ )
1457
+ else:
1458
+ self.decode_attention_backend = (
1459
+ "flashinfer" if is_sm100_supported() else "triton"
1460
+ )
1461
+ elif self.decode_attention_backend == "fa3":
1462
+ self.hicache_io_backend = "direct"
1463
+ logger.warning(
1464
+ "FlashAttention3 decode backend is not compatible with hierarchical cache. "
1465
+ "Setting hicache_io_backend to vanilla I/O, which may lead to suboptimal performance with small page sizes."
1466
+ )
1467
+
1280
1468
  def _handle_speculative_decoding(self):
1281
1469
  if self.speculative_algorithm == "NEXTN":
1282
1470
  self.speculative_algorithm = "EAGLE"
@@ -1287,22 +1475,26 @@ class ServerArgs:
1287
1475
  raise ValueError(
1288
1476
  "Currently standalone speculative decoding does not support dp attention."
1289
1477
  )
1478
+
1290
1479
  if self.max_running_requests is None:
1291
1480
  self.max_running_requests = 48
1292
1481
  logger.warning(
1293
- "Max running requests is reset to 48 for speculative decoding."
1482
+ "Max running requests is reset to 48 for speculative decoding. You can override this by explicitly setting --max-running-requests."
1294
1483
  )
1295
1484
 
1296
- if self.speculative_algorithm == "EAGLE" and self.enable_beta_spec:
1485
+ if (
1486
+ self.speculative_algorithm == "EAGLE"
1487
+ and envs.SGLANG_ENABLE_SPEC_V2.get()
1488
+ ):
1297
1489
  self.disable_overlap_schedule = False
1298
1490
  logger.warning(
1299
1491
  "Beta spec is enabled for eagle speculative decoding and overlap schedule is turned on."
1300
1492
  )
1301
1493
 
1302
- if not self.enable_beta_spec:
1494
+ if not envs.SGLANG_ENABLE_SPEC_V2.get():
1303
1495
  self.disable_overlap_schedule = True
1304
1496
  logger.warning(
1305
- "Overlap scheduler is disabled because of using eagle3 and standalone speculative decoding."
1497
+ "Overlap scheduler is disabled because of using eagle3 or standalone speculative decoding."
1306
1498
  )
1307
1499
 
1308
1500
  if self.enable_mixed_chunk:
@@ -1371,8 +1563,13 @@ class ServerArgs:
1371
1563
  raise ValueError(
1372
1564
  "Ngram speculative decoding only supports CUDA device."
1373
1565
  )
1566
+
1374
1567
  if self.max_running_requests is None:
1375
1568
  self.max_running_requests = 48
1569
+ logger.warning(
1570
+ "Max running requests is reset to 48 for speculative decoding. You can override this by explicitly setting --max-running-requests."
1571
+ )
1572
+
1376
1573
  self.disable_overlap_schedule = True
1377
1574
  self.enable_mixed_chunk = False
1378
1575
  self.speculative_eagle_topk = self.speculative_ngram_max_bfs_breadth
@@ -1515,19 +1712,44 @@ class ServerArgs:
1515
1712
  )
1516
1713
 
1517
1714
  def _handle_deterministic_inference(self):
1715
+ if self.rl_on_policy_target is not None:
1716
+ logger.warning(
1717
+ "Enable deterministic inference because of rl_on_policy_target."
1718
+ )
1719
+ self.enable_deterministic_inference = True
1720
+ # TODO remove this environment variable as a whole
1721
+ os.environ["SGLANG_ENABLE_DETERMINISTIC_INFERENCE"] = "1"
1722
+
1518
1723
  if self.enable_deterministic_inference:
1519
1724
  # Check sampling backend
1520
1725
  self.sampling_backend = "pytorch"
1521
1726
  logger.warning(
1522
1727
  "Sampling backend is set to pytorch for deterministic inference."
1523
1728
  )
1729
+ is_deepseek_model = False
1730
+ if parse_connector_type(self.model_path) != ConnectorType.INSTANCE:
1731
+ try:
1732
+ hf_config = self.get_hf_config()
1733
+ model_arch = hf_config.architectures[0]
1734
+ is_deepseek_model = model_arch in [
1735
+ "DeepseekV2ForCausalLM",
1736
+ "DeepseekV3ForCausalLM",
1737
+ "DeepseekV32ForCausalLM",
1738
+ ]
1739
+ except Exception:
1740
+ pass
1524
1741
 
1525
1742
  # Check attention backend
1526
1743
  if self.attention_backend is None:
1527
1744
  # User didn't specify attention backend, fallback based on GPU architecture
1528
1745
  if is_sm100_supported() or is_sm120_supported():
1529
1746
  # Blackwell and newer architectures
1530
- self.attention_backend = "flashinfer"
1747
+ if is_deepseek_model:
1748
+ # fallback to triton for DeepSeek models because flashinfer doesn't support deterministic inference for DeepSeek models yet
1749
+ self.attention_backend = "triton"
1750
+ else:
1751
+ # fallback to flashinfer on Blackwell for non-DeepSeek models
1752
+ self.attention_backend = "flashinfer"
1531
1753
  else:
1532
1754
  # Hopper (SM90) and older architectures
1533
1755
  self.attention_backend = "fa3"
@@ -1542,8 +1764,17 @@ class ServerArgs:
1542
1764
  f"but you explicitly specified '{self.attention_backend}'."
1543
1765
  )
1544
1766
 
1545
- # Currently, only FA3 and Triton supports radix cache. Support for other backends is in progress
1546
- if self.attention_backend not in ["fa3", "triton"]:
1767
+ if is_deepseek_model:
1768
+ if self.attention_backend not in ["fa3", "triton"]:
1769
+ raise ValueError(
1770
+ f"Currently only {RADIX_SUPPORTED_DETERMINISTIC_ATTENTION_BACKEND} attention backends are supported for deterministic inference with DeepSeek models. But you're using {self.attention_backend}."
1771
+ )
1772
+
1773
+ if (
1774
+ self.attention_backend
1775
+ not in RADIX_SUPPORTED_DETERMINISTIC_ATTENTION_BACKEND
1776
+ ):
1777
+ # Currently, only certain backends support radix cache. Support for other backends is in progress
1547
1778
  self.disable_radix_cache = True
1548
1779
  logger.warning(
1549
1780
  f"Currently radix cache is not compatible with {self.attention_backend} attention backend for deterministic inference. It will be supported in the future."
@@ -1558,7 +1789,13 @@ class ServerArgs:
1558
1789
  )
1559
1790
 
1560
1791
  def _handle_other_validations(self):
1561
- pass
1792
+ # Handle model inference tensor dump.
1793
+ if self.debug_tensor_dump_output_folder is not None:
1794
+ logger.warning(
1795
+ "Cuda graph and server warmup are disabled because of using tensor dump mode"
1796
+ )
1797
+ self.disable_cuda_graph = True
1798
+ self.skip_server_warmup = True
1562
1799
 
1563
1800
  @staticmethod
1564
1801
  def add_cli_args(parser: argparse.ArgumentParser):
@@ -1743,6 +1980,18 @@ class ServerArgs:
1743
1980
  "KV cache dtype is FP8. Otherwise, KV cache scaling factors "
1744
1981
  "default to 1.0, which may cause accuracy issues. ",
1745
1982
  )
1983
+ parser.add_argument(
1984
+ "--kv-cache-dtype",
1985
+ type=str,
1986
+ default=ServerArgs.kv_cache_dtype,
1987
+ choices=["auto", "fp8_e5m2", "fp8_e4m3", "bf16", "bfloat16"],
1988
+ help='Data type for kv cache storage. "auto" will use model data type. "bf16" or "bfloat16" for BF16 KV cache. "fp8_e5m2" and "fp8_e4m3" are supported for CUDA 11.8+.',
1989
+ )
1990
+ parser.add_argument(
1991
+ "--enable-fp32-lm-head",
1992
+ action="store_true",
1993
+ help="If set, the LM head outputs (logits) are in FP32.",
1994
+ )
1746
1995
  parser.add_argument(
1747
1996
  "--modelopt-quant",
1748
1997
  type=str,
@@ -1782,18 +2031,6 @@ class ServerArgs:
1782
2031
  "This is useful for development and prototyping. For production, it's recommended "
1783
2032
  "to use separate quantization and deployment steps.",
1784
2033
  )
1785
- parser.add_argument(
1786
- "--kv-cache-dtype",
1787
- type=str,
1788
- default=ServerArgs.kv_cache_dtype,
1789
- choices=["auto", "fp8_e5m2", "fp8_e4m3", "bf16", "bfloat16"],
1790
- help='Data type for kv cache storage. "auto" will use model data type. "bf16" or "bfloat16" for BF16 KV cache. "fp8_e5m2" and "fp8_e4m3" are supported for CUDA 11.8+.',
1791
- )
1792
- parser.add_argument(
1793
- "--enable-fp32-lm-head",
1794
- action="store_true",
1795
- help="If set, the LM head outputs (logits) are in FP32.",
1796
- )
1797
2034
 
1798
2035
  # Memory and scheduling
1799
2036
  parser.add_argument(
@@ -1898,7 +2135,14 @@ class ServerArgs:
1898
2135
  parser.add_argument(
1899
2136
  "--disable-hybrid-swa-memory",
1900
2137
  action="store_true",
1901
- help="Disable the hybrid SWA memory.",
2138
+ help="Disable the hybrid SWA memory pool.",
2139
+ )
2140
+ parser.add_argument(
2141
+ "--radix-eviction-policy",
2142
+ type=str,
2143
+ choices=RADIX_EVICTION_POLICY_CHOICES,
2144
+ default=ServerArgs.radix_eviction_policy,
2145
+ help="The eviction policy of radix trees. 'lru' stands for Least Recently Used, 'lfu' stands for Least Frequently Used.",
1902
2146
  )
1903
2147
 
1904
2148
  # Runtime options
@@ -1908,21 +2152,6 @@ class ServerArgs:
1908
2152
  default=ServerArgs.device,
1909
2153
  help="The device to use ('cuda', 'xpu', 'hpu', 'npu', 'cpu'). Defaults to auto-detection if not specified.",
1910
2154
  )
1911
- parser.add_argument(
1912
- "--elastic-ep-backend",
1913
- type=str,
1914
- default=ServerArgs.elastic_ep_backend,
1915
- choices=["none", "mooncake"],
1916
- help="Specify the collective communication backend for elastic EP. Currently supports 'mooncake'.",
1917
- )
1918
- parser.add_argument(
1919
- "--mooncake-ib-device",
1920
- type=str,
1921
- default=ServerArgs.mooncake_ib_device,
1922
- help="The InfiniBand devices for Mooncake Backend transfer, accepts multiple comma-separated devices "
1923
- "(e.g., --mooncake-ib-device mlx5_0,mlx5_1). "
1924
- "Default is None, which triggers automatic device detection when Mooncake Backend is enabled.",
1925
- )
1926
2155
  parser.add_argument(
1927
2156
  "--tensor-parallel-size",
1928
2157
  "--tp-size",
@@ -2147,7 +2376,7 @@ class ServerArgs:
2147
2376
  help="Enable opentelemetry trace",
2148
2377
  )
2149
2378
  parser.add_argument(
2150
- "--oltp-traces-endpoint",
2379
+ "--otlp-traces-endpoint",
2151
2380
  type=str,
2152
2381
  default="localhost:4317",
2153
2382
  help="Config opentelemetry collector endpoint if --enable-trace is set. format: <ip>:<port>",
@@ -2210,6 +2439,12 @@ class ServerArgs:
2210
2439
  default=ServerArgs.tool_call_parser,
2211
2440
  help=f"Specify the parser for handling tool-call interactions. Options include: {tool_call_parser_choices}.",
2212
2441
  )
2442
+ parser.add_argument(
2443
+ "--tool-server",
2444
+ type=str,
2445
+ default=None,
2446
+ help="Either 'demo' or a comma-separated list of tool server urls to use for the model. If not specified, no tool server will be used.",
2447
+ )
2213
2448
  parser.add_argument(
2214
2449
  "--sampling-defaults",
2215
2450
  type=str,
@@ -2220,12 +2455,6 @@ class ServerArgs:
2220
2455
  "'model' uses the model's generation_config.json to get the recommended "
2221
2456
  "sampling parameters if available. Default is 'model'.",
2222
2457
  )
2223
- parser.add_argument(
2224
- "--tool-server",
2225
- type=str,
2226
- default=None,
2227
- help="Either 'demo' or a comma-separated list of tool server urls to use for the model. If not specified, no tool server will be used.",
2228
- )
2229
2458
 
2230
2459
  # Data parallelism
2231
2460
  parser.add_argument(
@@ -2332,7 +2561,7 @@ class ServerArgs:
2332
2561
  parser.add_argument(
2333
2562
  "--lora-eviction-policy",
2334
2563
  type=str,
2335
- default=DEFAULT_LORA_EVICTION_POLICY,
2564
+ default=ServerArgs.lora_eviction_policy,
2336
2565
  choices=["lru", "fifo"],
2337
2566
  help="LoRA adapter eviction policy when memory pool is full. 'lru': Least Recently Used (default, better cache efficiency). 'fifo': First-In-First-Out.",
2338
2567
  )
@@ -2408,7 +2637,6 @@ class ServerArgs:
2408
2637
  )
2409
2638
 
2410
2639
  # Speculative decoding
2411
- parser.add_argument("--enable-beta-spec", action="store_true")
2412
2640
  parser.add_argument(
2413
2641
  "--speculative-algorithm",
2414
2642
  type=str,
@@ -2644,6 +2872,21 @@ class ServerArgs:
2644
2872
  default=ServerArgs.moe_dense_tp_size,
2645
2873
  help="TP size for MoE dense MLP layers. This flag is useful when, with large TP size, there are errors caused by weights in MLP layers having dimension smaller than the min dimension GEMM supports.",
2646
2874
  )
2875
+ parser.add_argument(
2876
+ "--elastic-ep-backend",
2877
+ type=str,
2878
+ default=ServerArgs.elastic_ep_backend,
2879
+ choices=["none", "mooncake"],
2880
+ help="Specify the collective communication backend for elastic EP. Currently supports 'mooncake'.",
2881
+ )
2882
+ parser.add_argument(
2883
+ "--mooncake-ib-device",
2884
+ type=str,
2885
+ default=ServerArgs.mooncake_ib_device,
2886
+ help="The InfiniBand devices for Mooncake Backend transfer, accepts multiple comma-separated devices "
2887
+ "(e.g., --mooncake-ib-device mlx5_0,mlx5_1). "
2888
+ "Default is None, which triggers automatic device detection when Mooncake Backend is enabled.",
2889
+ )
2647
2890
 
2648
2891
  # Mamba Cache
2649
2892
  parser.add_argument(
@@ -2691,13 +2934,6 @@ class ServerArgs:
2691
2934
  default=ServerArgs.hicache_write_policy,
2692
2935
  help="The write policy of hierarchical cache.",
2693
2936
  )
2694
- parser.add_argument(
2695
- "--radix-eviction-policy",
2696
- type=str,
2697
- choices=RADIX_EVICTION_POLICY_CHOICES,
2698
- default=ServerArgs.radix_eviction_policy,
2699
- help="The eviction policy of radix trees. 'lru' stands for Least Recently Used, 'lfu' stands for Least Frequently Used.",
2700
- )
2701
2937
  parser.add_argument(
2702
2938
  "--hicache-io-backend",
2703
2939
  type=str,
@@ -2805,7 +3041,7 @@ class ServerArgs:
2805
3041
  "--ds-sparse-decode-threshold",
2806
3042
  type=int,
2807
3043
  default=ServerArgs.ds_sparse_decode_threshold,
2808
- help="The type of heavy channels in double sparsity attention",
3044
+ help="The minimum decode sequence length required before the double-sparsity backend switches from the dense fallback to the sparse decode kernel.",
2809
3045
  )
2810
3046
 
2811
3047
  # Offloading
@@ -3111,26 +3347,20 @@ class ServerArgs:
3111
3347
  nargs="+",
3112
3348
  help="Sets the numa node for the subprocesses. i-th element corresponds to i-th subprocess.",
3113
3349
  )
3114
-
3115
- # Debug tensor dumps
3116
- parser.add_argument(
3117
- "--debug-tensor-dump-output-folder",
3118
- type=str,
3119
- default=ServerArgs.debug_tensor_dump_output_folder,
3120
- help="The output folder for dumping tensors.",
3121
- )
3122
3350
  parser.add_argument(
3123
- "--debug-tensor-dump-input-file",
3124
- type=str,
3125
- default=ServerArgs.debug_tensor_dump_input_file,
3126
- help="The input filename for dumping tensors",
3351
+ "--enable-deterministic-inference",
3352
+ action="store_true",
3353
+ help="Enable deterministic inference mode with batch invariant ops.",
3127
3354
  )
3128
3355
  parser.add_argument(
3129
- "--debug-tensor-dump-inject",
3356
+ "--rl-on-policy-target",
3130
3357
  type=str,
3131
- default=ServerArgs.debug_tensor_dump_inject,
3132
- help="Inject the outputs from jax as the input of every layer.",
3358
+ default=ServerArgs.rl_on_policy_target,
3359
+ choices=["fsdp"],
3360
+ help="The training system that SGLang needs to match for true on-policy.",
3133
3361
  )
3362
+
3363
+ # Dynamic batch tokenizer
3134
3364
  parser.add_argument(
3135
3365
  "--enable-dynamic-batch-tokenizer",
3136
3366
  action="store_true",
@@ -3149,6 +3379,32 @@ class ServerArgs:
3149
3379
  help="[Only used if --enable-dynamic-batch-tokenizer is set] Timeout in seconds for batching tokenization requests.",
3150
3380
  )
3151
3381
 
3382
+ # Debug tensor dumps
3383
+ parser.add_argument(
3384
+ "--debug-tensor-dump-output-folder",
3385
+ type=str,
3386
+ default=ServerArgs.debug_tensor_dump_output_folder,
3387
+ help="The output folder for dumping tensors.",
3388
+ )
3389
+ parser.add_argument(
3390
+ "--debug-tensor-dump-layers",
3391
+ type=int,
3392
+ default=-1,
3393
+ help="The layer number for dumping tensors.",
3394
+ )
3395
+ parser.add_argument(
3396
+ "--debug-tensor-dump-input-file",
3397
+ type=str,
3398
+ default=ServerArgs.debug_tensor_dump_input_file,
3399
+ help="The input filename for dumping tensors",
3400
+ )
3401
+ parser.add_argument(
3402
+ "--debug-tensor-dump-inject",
3403
+ type=str,
3404
+ default=ServerArgs.debug_tensor_dump_inject,
3405
+ help="Inject the outputs from jax as the input of every layer.",
3406
+ )
3407
+
3152
3408
  # PD disaggregation
3153
3409
  parser.add_argument(
3154
3410
  "--disaggregation-mode",
@@ -3258,7 +3514,6 @@ class ServerArgs:
3258
3514
  default=None,
3259
3515
  help="The path of the PD-Multiplexing config file.",
3260
3516
  )
3261
-
3262
3517
  parser.add_argument(
3263
3518
  "--sm-group-num",
3264
3519
  type=int,
@@ -3266,55 +3521,25 @@ class ServerArgs:
3266
3521
  help="Number of sm partition groups.",
3267
3522
  )
3268
3523
 
3269
- # For deterministic inference
3524
+ # Configuration file support
3270
3525
  parser.add_argument(
3271
- "--enable-deterministic-inference",
3272
- action="store_true",
3273
- help="Enable deterministic inference mode with batch invariant ops.",
3526
+ "--config",
3527
+ type=str,
3528
+ help="Read CLI options from a config file. Must be a YAML file with configuration options.",
3274
3529
  )
3275
3530
 
3276
- # Deprecated arguments
3531
+ # For Multi-Modal
3277
3532
  parser.add_argument(
3278
- "--enable-ep-moe",
3279
- action=DeprecatedAction,
3280
- help="NOTE: --enable-ep-moe is deprecated. Please set `--ep-size` to the same value as `--tp-size` instead.",
3281
- )
3282
- parser.add_argument(
3283
- "--enable-deepep-moe",
3284
- action=DeprecatedAction,
3285
- help="NOTE: --enable-deepep-moe is deprecated. Please set `--moe-a2a-backend` to 'deepep' instead.",
3286
- )
3287
- parser.add_argument(
3288
- "--enable-flashinfer-cutlass-moe",
3289
- action=DeprecatedAction,
3290
- help="NOTE: --enable-flashinfer-cutlass-moe is deprecated. Please set `--moe-runner-backend` to 'flashinfer_cutlass' instead.",
3291
- )
3292
- parser.add_argument(
3293
- "--enable-flashinfer-cutedsl-moe",
3294
- action=DeprecatedAction,
3295
- help="NOTE: --enable-flashinfer-cutedsl-moe is deprecated. Please set `--moe-runner-backend` to 'flashinfer_cutedsl' instead.",
3296
- )
3297
- parser.add_argument(
3298
- "--enable-flashinfer-trtllm-moe",
3299
- action=DeprecatedAction,
3300
- help="NOTE: --enable-flashinfer-trtllm-moe is deprecated. Please set `--moe-runner-backend` to 'flashinfer_trtllm' instead.",
3301
- )
3302
- parser.add_argument(
3303
- "--enable-triton-kernel-moe",
3304
- action=DeprecatedAction,
3305
- help="NOTE: --enable-triton-kernel-moe is deprecated. Please set `--moe-runner-backend` to 'triton_kernel' instead.",
3306
- )
3307
- parser.add_argument(
3308
- "--enable-flashinfer-mxfp4-moe",
3309
- action=DeprecatedAction,
3310
- help="NOTE: --enable-flashinfer-mxfp4-moe is deprecated. Please set `--moe-runner-backend` to 'flashinfer_mxfp4' instead.",
3533
+ "--mm-max-concurrent-calls",
3534
+ type=int,
3535
+ default=ServerArgs.mm_max_concurrent_calls,
3536
+ help="The max concurrent calls for async mm data processing.",
3311
3537
  )
3312
-
3313
- # Configuration file support
3314
3538
  parser.add_argument(
3315
- "--config",
3316
- type=str,
3317
- help="Read CLI options from a config file. Must be a YAML file with configuration options.",
3539
+ "--mm-per-request-timeout",
3540
+ type=int,
3541
+ default=ServerArgs.mm_per_request_timeout,
3542
+ help="The timeout for each multi-modal request in seconds.",
3318
3543
  )
3319
3544
 
3320
3545
  @classmethod
@@ -3344,6 +3569,34 @@ class ServerArgs:
3344
3569
  )
3345
3570
  return hf_config
3346
3571
 
3572
+ def get_model_config(self):
3573
+ # Lazy init to avoid circular import
3574
+ from sglang.srt.configs.model_config import ModelConfig
3575
+
3576
+ if hasattr(self, "model_config"):
3577
+ return self.model_config
3578
+ self.model_config = ModelConfig.from_server_args(self)
3579
+ return self.model_config
3580
+
3581
+ def get_attention_backends(self):
3582
+ prefill_attention_backend_str = (
3583
+ self.prefill_attention_backend
3584
+ if self.prefill_attention_backend
3585
+ else self.attention_backend
3586
+ )
3587
+ decode_attention_backend_str = (
3588
+ self.decode_attention_backend
3589
+ if self.decode_attention_backend
3590
+ else self.attention_backend
3591
+ )
3592
+ return prefill_attention_backend_str, decode_attention_backend_str
3593
+
3594
+ def use_mla_backend(self):
3595
+ from sglang.srt.configs.model_config import AttentionArch
3596
+
3597
+ model_config = self.get_model_config()
3598
+ return model_config.attention_arch == AttentionArch.MLA
3599
+
3347
3600
  def check_server_args(self):
3348
3601
  # Check parallel size constraints
3349
3602
  assert (
@@ -3721,6 +3974,13 @@ class PortArgs:
3721
3974
  else:
3722
3975
  nccl_port = server_args.nccl_port
3723
3976
 
3977
+ if server_args.tokenizer_worker_num > 1:
3978
+ tokenizer_worker_ipc_name = (
3979
+ f"ipc://{tempfile.NamedTemporaryFile(delete=False).name}"
3980
+ )
3981
+ else:
3982
+ tokenizer_worker_ipc_name = None
3983
+
3724
3984
  if not server_args.enable_dp_attention:
3725
3985
  # Normal case, use IPC within a single node
3726
3986
  return PortArgs(
@@ -3730,7 +3990,7 @@ class PortArgs:
3730
3990
  nccl_port=nccl_port,
3731
3991
  rpc_ipc_name=f"ipc://{tempfile.NamedTemporaryFile(delete=False).name}",
3732
3992
  metrics_ipc_name=f"ipc://{tempfile.NamedTemporaryFile(delete=False).name}",
3733
- tokenizer_worker_ipc_name=None,
3993
+ tokenizer_worker_ipc_name=tokenizer_worker_ipc_name,
3734
3994
  )
3735
3995
  else:
3736
3996
  # DP attention. Use TCP + port to handle both single-node and multi-node.
@@ -3764,7 +4024,7 @@ class PortArgs:
3764
4024
  nccl_port=nccl_port,
3765
4025
  rpc_ipc_name=f"tcp://{dist_init_host}:{rpc_port}",
3766
4026
  metrics_ipc_name=f"tcp://{dist_init_host}:{metrics_ipc_name}",
3767
- tokenizer_worker_ipc_name=None,
4027
+ tokenizer_worker_ipc_name=tokenizer_worker_ipc_name,
3768
4028
  )
3769
4029
 
3770
4030
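
Below is a minimal, hypothetical sketch (not part of the diff) that exercises the renamed and newly added CLI flags from the hunks above purely through argument parsing, without starting a server. It assumes sglang 0.5.4.post2 is installed; the model path and flag values are placeholders chosen for illustration.

    import argparse

    from sglang.srt.server_args import ServerArgs

    parser = argparse.ArgumentParser()
    ServerArgs.add_cli_args(parser)

    # Placeholder model path; the flag names come from the server_args.py hunks above.
    args = parser.parse_args(
        [
            "--model-path", "deepseek-ai/DeepSeek-V3",
            "--otlp-traces-endpoint", "localhost:4317",   # renamed from --oltp-traces-endpoint
            "--mm-max-concurrent-calls", "32",            # new async multi-modal processing limit
            "--mm-per-request-timeout", "10",             # new per-request timeout in seconds
        ]
    )
    print(args.otlp_traces_endpoint, args.mm_max_concurrent_calls)

Parsing alone does not trigger ServerArgs.__post_init__, so this runs without a GPU; backend resolution such as get_attention_backends() only happens once a ServerArgs instance is actually constructed.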