sglang 0.5.4__py3-none-any.whl → 0.5.4.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (88)
  1. sglang/bench_serving.py +56 -12
  2. sglang/launch_server.py +2 -0
  3. sglang/srt/batch_invariant_ops/batch_invariant_ops.py +101 -4
  4. sglang/srt/compilation/backend.py +1 -1
  5. sglang/srt/configs/model_config.py +5 -5
  6. sglang/srt/distributed/parallel_state.py +0 -7
  7. sglang/srt/entrypoints/engine.py +18 -15
  8. sglang/srt/entrypoints/grpc_server.py +0 -1
  9. sglang/srt/entrypoints/http_server.py +75 -94
  10. sglang/srt/environ.py +16 -2
  11. sglang/srt/eplb/expert_distribution.py +30 -0
  12. sglang/srt/function_call/function_call_parser.py +2 -0
  13. sglang/srt/function_call/minimax_m2.py +367 -0
  14. sglang/srt/layers/activation.py +6 -0
  15. sglang/srt/layers/attention/flashattention_backend.py +12 -2
  16. sglang/srt/layers/attention/flashinfer_backend.py +10 -1
  17. sglang/srt/layers/attention/flashinfer_mla_backend.py +18 -10
  18. sglang/srt/layers/attention/trtllm_mla_backend.py +1 -13
  19. sglang/srt/layers/attention/utils.py +78 -0
  20. sglang/srt/layers/communicator.py +1 -0
  21. sglang/srt/layers/deep_gemm_wrapper/compile_utils.py +1 -1
  22. sglang/srt/layers/layernorm.py +19 -4
  23. sglang/srt/layers/logits_processor.py +5 -0
  24. sglang/srt/layers/moe/cutlass_w4a8_moe.py +138 -0
  25. sglang/srt/layers/moe/ep_moe/kernels.py +194 -0
  26. sglang/srt/layers/moe/ep_moe/layer.py +79 -272
  27. sglang/srt/layers/moe/fused_moe_triton/layer.py +3 -3
  28. sglang/srt/layers/moe/fused_moe_triton/triton_kernels_moe.py +7 -4
  29. sglang/srt/layers/moe/moe_runner/deep_gemm.py +287 -22
  30. sglang/srt/layers/moe/moe_runner/runner.py +3 -0
  31. sglang/srt/layers/moe/moe_runner/triton_kernels.py +194 -0
  32. sglang/srt/layers/moe/token_dispatcher/__init__.py +4 -4
  33. sglang/srt/layers/moe/token_dispatcher/base.py +11 -5
  34. sglang/srt/layers/moe/token_dispatcher/deepep.py +18 -14
  35. sglang/srt/layers/moe/token_dispatcher/standard.py +1 -1
  36. sglang/srt/layers/moe/topk.py +4 -4
  37. sglang/srt/layers/moe/utils.py +3 -4
  38. sglang/srt/layers/quantization/__init__.py +3 -5
  39. sglang/srt/layers/quantization/awq.py +0 -3
  40. sglang/srt/layers/quantization/base_config.py +7 -0
  41. sglang/srt/layers/quantization/fp8.py +68 -63
  42. sglang/srt/layers/quantization/gguf.py +566 -0
  43. sglang/srt/layers/quantization/mxfp4.py +30 -38
  44. sglang/srt/layers/quantization/unquant.py +23 -45
  45. sglang/srt/layers/quantization/w4afp8.py +38 -2
  46. sglang/srt/layers/radix_attention.py +5 -2
  47. sglang/srt/layers/rotary_embedding.py +13 -1
  48. sglang/srt/layers/sampler.py +12 -1
  49. sglang/srt/managers/io_struct.py +3 -0
  50. sglang/srt/managers/multi_tokenizer_mixin.py +17 -1
  51. sglang/srt/managers/scheduler.py +21 -15
  52. sglang/srt/managers/scheduler_metrics_mixin.py +22 -14
  53. sglang/srt/managers/scheduler_profiler_mixin.py +3 -4
  54. sglang/srt/managers/tokenizer_manager.py +11 -19
  55. sglang/srt/mem_cache/hicache_storage.py +7 -1
  56. sglang/srt/mem_cache/memory_pool.py +82 -0
  57. sglang/srt/mem_cache/storage/hf3fs/mini_3fs_metadata_server.py +3 -2
  58. sglang/srt/model_executor/forward_batch_info.py +44 -3
  59. sglang/srt/model_executor/model_runner.py +1 -149
  60. sglang/srt/model_executor/piecewise_cuda_graph_runner.py +22 -12
  61. sglang/srt/models/deepseek_v2.py +147 -44
  62. sglang/srt/models/glm4_moe.py +322 -354
  63. sglang/srt/models/glm4_moe_nextn.py +4 -14
  64. sglang/srt/models/glm4v_moe.py +29 -196
  65. sglang/srt/models/minimax_m2.py +922 -0
  66. sglang/srt/models/nvila.py +355 -0
  67. sglang/srt/models/nvila_lite.py +184 -0
  68. sglang/srt/models/qwen2.py +22 -1
  69. sglang/srt/models/qwen3.py +34 -4
  70. sglang/srt/models/qwen3_moe.py +2 -4
  71. sglang/srt/multimodal/processors/base_processor.py +1 -0
  72. sglang/srt/multimodal/processors/glm4v.py +1 -1
  73. sglang/srt/multimodal/processors/{vila.py → nvila.py} +32 -24
  74. sglang/srt/multimodal/processors/points_v15_chat.py +2 -2
  75. sglang/srt/parser/reasoning_parser.py +28 -1
  76. sglang/srt/server_args.py +365 -186
  77. sglang/srt/single_batch_overlap.py +2 -7
  78. sglang/srt/utils/common.py +87 -42
  79. sglang/srt/utils/hf_transformers_utils.py +7 -3
  80. sglang/test/test_deterministic.py +235 -12
  81. sglang/test/test_deterministic_utils.py +2 -1
  82. sglang/version.py +1 -1
  83. {sglang-0.5.4.dist-info → sglang-0.5.4.post1.dist-info}/METADATA +7 -6
  84. {sglang-0.5.4.dist-info → sglang-0.5.4.post1.dist-info}/RECORD +87 -82
  85. sglang/srt/models/vila.py +0 -306
  86. {sglang-0.5.4.dist-info → sglang-0.5.4.post1.dist-info}/WHEEL +0 -0
  87. {sglang-0.5.4.dist-info → sglang-0.5.4.post1.dist-info}/licenses/LICENSE +0 -0
  88. {sglang-0.5.4.dist-info → sglang-0.5.4.post1.dist-info}/top_level.txt +0 -0
sglang/srt/server_args.py CHANGED
@@ -27,19 +27,24 @@ from typing import Dict, List, Literal, Optional, Union
27
27
  import orjson
28
28
 
29
29
  from sglang.srt.connector import ConnectorType
30
+ from sglang.srt.environ import envs
30
31
  from sglang.srt.function_call.function_call_parser import FunctionCallParser
31
32
  from sglang.srt.lora.lora_registry import LoRARef
32
33
  from sglang.srt.parser.reasoning_parser import ReasoningParser
33
- from sglang.srt.utils import (
34
+ from sglang.srt.utils.common import (
34
35
  LORA_TARGET_ALL_MODULES,
35
36
  SUPPORTED_LORA_TARGET_MODULES,
36
37
  configure_ipv6,
38
+ cpu_has_amx_support,
37
39
  get_device,
38
40
  get_device_memory_capacity,
39
41
  get_device_sm,
40
42
  is_cuda,
43
+ is_fa3_default_architecture,
41
44
  is_flashinfer_available,
42
45
  is_hip,
46
+ is_hopper_with_cuda_12_3,
47
+ is_no_spec_infer_or_topk_one,
43
48
  is_npu,
44
49
  is_port_available,
45
50
  is_remote_url,
@@ -51,6 +56,7 @@ from sglang.srt.utils import (
51
56
  json_list_type,
52
57
  nullable_str,
53
58
  parse_connector_type,
59
+ xpu_has_xmx_support,
54
60
  )
55
61
  from sglang.srt.utils.hf_transformers_utils import check_gguf_file, get_config
56
62
  from sglang.utils import is_in_ci
@@ -127,8 +133,6 @@ GRAMMAR_BACKEND_CHOICES = ["xgrammar", "outlines", "llguidance", "none"]
127
133
 
128
134
  DETERMINISTIC_ATTENTION_BACKEND_CHOICES = ["flashinfer", "fa3", "triton"]
129
135
 
130
- DEFAULT_LORA_EVICTION_POLICY = "lru"
131
-
132
136
  NSA_CHOICES = ["flashmla_sparse", "flashmla_kv", "fa3", "tilelang", "aiter"]
133
137
 
134
138
  RADIX_EVICTION_POLICY_CHOICES = ["lru", "lfu"]
@@ -181,6 +185,15 @@ def add_radix_eviction_policy_choices(choices):
181
185
 
182
186
  @dataclasses.dataclass
183
187
  class ServerArgs:
188
+ """
189
+ The arguments of the server.
190
+
191
+ NOTE: When you add new arguments, please make sure the order
192
+ in this class definition is the same as the order in the function
193
+ `ServerArgs.add_cli_args`.
194
+ Please follow the existing style to group the new arguments into related groups or create new groups.
195
+ """
196
+
184
197
  # Model and tokenizer
185
198
  model_path: str
186
199
  tokenizer_path: Optional[str] = None
@@ -190,11 +203,6 @@ class ServerArgs:
190
203
  load_format: str = "auto"
191
204
  model_loader_extra_config: str = "{}"
192
205
  trust_remote_code: bool = False
193
- modelopt_quant: Optional[Union[str, Dict]] = None
194
- modelopt_checkpoint_restore_path: Optional[str] = None
195
- modelopt_checkpoint_save_path: Optional[str] = None
196
- modelopt_export_path: Optional[str] = None
197
- quantize_and_serve: bool = False
198
206
  context_length: Optional[int] = None
199
207
  is_embedding: bool = False
200
208
  enable_multimodal: Optional[bool] = None
@@ -216,6 +224,11 @@ class ServerArgs:
216
224
  quantization_param_path: Optional[str] = None
217
225
  kv_cache_dtype: str = "auto"
218
226
  enable_fp32_lm_head: bool = False
227
+ modelopt_quant: Optional[Union[str, Dict]] = None
228
+ modelopt_checkpoint_restore_path: Optional[str] = None
229
+ modelopt_checkpoint_save_path: Optional[str] = None
230
+ modelopt_export_path: Optional[str] = None
231
+ quantize_and_serve: bool = False
219
232
 
220
233
  # Memory and scheduling
221
234
  mem_fraction_static: Optional[float] = None
@@ -238,8 +251,6 @@ class ServerArgs:
238
251
 
239
252
  # Runtime options
240
253
  device: Optional[str] = None
241
- elastic_ep_backend: Literal[None, "mooncake"] = None
242
- mooncake_ib_device: Optional[str] = None
243
254
  tp_size: int = 1
244
255
  pp_size: int = 1
245
256
  pp_max_micro_batch_size: Optional[int] = None
@@ -272,10 +283,10 @@ class ServerArgs:
272
283
  collect_tokens_histogram: bool = False
273
284
  prompt_tokens_buckets: Optional[List[str]] = None
274
285
  generation_tokens_buckets: Optional[List[str]] = None
286
+ gc_warning_threshold_secs: float = 0.0
275
287
  decode_log_interval: int = 40
276
288
  enable_request_time_stats_logging: bool = False
277
289
  kv_events_config: Optional[str] = None
278
- gc_warning_threshold_secs: float = 0.0
279
290
  enable_trace: bool = False
280
291
  oltp_traces_endpoint: str = "localhost:4317"
281
292
 
@@ -317,7 +328,7 @@ class ServerArgs:
317
328
  ] = None
318
329
  max_loaded_loras: Optional[int] = None
319
330
  max_loras_per_batch: int = 8
320
- lora_eviction_policy: str = DEFAULT_LORA_EVICTION_POLICY
331
+ lora_eviction_policy: str = "lru"
321
332
  lora_backend: str = "triton"
322
333
  max_lora_chunk_size: Optional[int] = 16
323
334
 
@@ -332,7 +343,6 @@ class ServerArgs:
332
343
  nsa_decode_backend: str = "fa3"
333
344
 
334
345
  # Speculative decoding
335
- enable_beta_spec: bool = False
336
346
  speculative_algorithm: Optional[str] = None
337
347
  speculative_draft_model_path: Optional[str] = None
338
348
  speculative_draft_model_revision: Optional[str] = None
@@ -375,6 +385,8 @@ class ServerArgs:
375
385
  enable_expert_distribution_metrics: bool = False
376
386
  deepep_config: Optional[str] = None
377
387
  moe_dense_tp_size: Optional[int] = None
388
+ elastic_ep_backend: Literal[None, "mooncake"] = None
389
+ mooncake_ib_device: Optional[str] = None
378
390
 
379
391
  # Mamba cache
380
392
  max_mamba_cache_size: Optional[int] = None
@@ -473,6 +485,7 @@ class ServerArgs:
473
485
  scheduler_recv_interval: int = 1
474
486
  numa_node: Optional[List[int]] = None
475
487
  enable_deterministic_inference: bool = False
488
+ rl_on_policy_target: Optional[str] = None
476
489
 
477
490
  # Dynamic batch tokenizer
478
491
  enable_dynamic_batch_tokenizer: bool = False
@@ -509,19 +522,6 @@ class ServerArgs:
509
522
  pdmux_config_path: Optional[str] = None
510
523
  sm_group_num: int = 8
511
524
 
512
- def get_attention_backends(server_args):
513
- prefill_attention_backend_str = (
514
- server_args.prefill_attention_backend
515
- if server_args.prefill_attention_backend
516
- else server_args.attention_backend
517
- )
518
- decode_attention_backend_str = (
519
- server_args.decode_attention_backend
520
- if server_args.decode_attention_backend
521
- else server_args.attention_backend
522
- )
523
- return prefill_attention_backend_str, decode_attention_backend_str
524
-
525
525
  def __post_init__(self):
526
526
  """
527
527
  Orchestrates the handling of various server arguments, ensuring proper configuration and validation.
@@ -550,6 +550,9 @@ class ServerArgs:
550
550
  # Apply model-specific adjustments.
551
551
  self._handle_model_specific_adjustments()
552
552
 
553
+ # Handle Hicache settings.
554
+ self._handle_hicache()
555
+
553
556
  # Set kernel backends.
554
557
  self._handle_sampling_backend()
555
558
  self._handle_attention_backend_compatibility()
@@ -572,9 +575,6 @@ class ServerArgs:
572
575
  # Handle pipeline parallelism.
573
576
  self._handle_pipeline_parallelism()
574
577
 
575
- # Handle Hicache settings.
576
- self._handle_hicache()
577
-
578
578
  # Handle speculative decoding logic.
579
579
  self._handle_speculative_decoding()
580
580
 
@@ -614,22 +614,6 @@ class ServerArgs:
614
614
  )
615
615
  self.tool_call_parser = deprecated_tool_call_parsers[self.tool_call_parser]
616
616
 
617
- def _handle_ktransformers_configs(self):
618
- from sglang.srt.layers.quantization.compressed_tensors.compressed_tensors_moe import (
619
- CompressedTensorsWNA16AMXEPMoEMethod,
620
- override_config,
621
- )
622
-
623
- override_config(
624
- CompressedTensorsWNA16AMXEPMoEMethod,
625
- self.kt_num_gpu_experts,
626
- self.kt_cpuinfer,
627
- self.kt_threadpool_count,
628
- self.kt_amx_weight_path,
629
- self.kt_amx_method,
630
- self.chunked_prefill_size,
631
- )
632
-
633
617
  def _handle_missing_default_values(self):
634
618
  if self.tokenizer_path is None:
635
619
  self.tokenizer_path = self.model_path
@@ -684,7 +668,7 @@ class ServerArgs:
684
668
  self.cuda_graph_max_bs = 64
685
669
  elif gpu_mem < 35 * 1024:
686
670
  # A10, 4090, 5090
687
- # (chunked_prefill_size 2k, cuda_graph_max_bs 16 if tp < 4 else 80)
671
+ # (chunked_prefill_size 2k, cuda_graph_max_bs 24 if tp < 4 else 80)
688
672
  if self.chunked_prefill_size is None:
689
673
  self.chunked_prefill_size = 2048
690
674
  if self.cuda_graph_max_bs is None:
@@ -692,7 +676,7 @@ class ServerArgs:
692
676
  # However, when serving models with TP4 or TP8, we need to enable cuda graph to maintain high performance. In this case, we can set `cuda_graph_max_bs` to 80 (half of the default value 160) to reduce the memory overhead of creating cuda graphs. Looking at the logs
693
677
  # from TP4 serving of qwen2-72b, a value of 80 is sufficient and can reduce the memory overhead of creating cuda graphs on lower-end GPUs compared to the original 160, avoiding OOM issues.
694
678
  if self.tp_size < 4:
695
- self.cuda_graph_max_bs = 16
679
+ self.cuda_graph_max_bs = 24
696
680
  else:
697
681
  self.cuda_graph_max_bs = 80
698
682
  elif gpu_mem < 60 * 1024:
@@ -800,11 +784,9 @@ class ServerArgs:
800
784
  else 0.88
801
785
  )
802
786
 
803
- # Lazy init to avoid circular import
804
- # Multimodal models need more memory for the image processor
805
- from sglang.srt.configs.model_config import ModelConfig
806
-
807
- model_config = ModelConfig.from_server_args(self)
787
+ # Multimodal models need more memory for the image processing,
788
+ # so we adjust the mem_fraction_static accordingly.
789
+ model_config = self.get_model_config()
808
790
  if model_config.is_multimodal:
809
791
  self.adjust_mem_fraction_for_vlm(model_config)
810
792
 
@@ -892,14 +874,19 @@ class ServerArgs:
892
874
  logger.info(
893
875
  "Enable FlashInfer AllReduce Fusion on sm100 for DeepseekV3ForCausalLM"
894
876
  )
895
- if (
896
- self.quantization == "modelopt_fp4"
897
- and self.moe_runner_backend == "auto"
898
- ):
877
+ if self.moe_runner_backend == "auto":
899
878
  self.moe_runner_backend = "flashinfer_trtllm"
900
879
  logger.info(
901
- "Use flashinfer_trtllm as moe runner backend on sm100 for DeepseekV3ForCausalLM"
880
+ "Use flashinfer_trtllm as MoE runner backend on sm100 for DeepseekV3ForCausalLM"
902
881
  )
882
+ if self.quantization is None:
883
+ # Default to DeepSeek V3/R1 native FP8 when not explicitly set,
884
+ # because we need this condition for an assertion in the
885
+ # flashinfer_trtllm MoE runner backend.
886
+ self.quantization = "fp8"
887
+ logger.info(
888
+ "Quantization not specified, default to fp8 for DeepSeek on sm100"
889
+ )
903
890
 
904
891
  elif model_arch in ["GptOssForCausalLM"]:
905
892
  if (
@@ -966,7 +953,13 @@ class ServerArgs:
966
953
  "fa3",
967
954
  "aiter",
968
955
  "triton",
969
- }, "fa3, aiter, or triton is required for Llama4 model"
956
+ "trtllm_mha",
957
+ }, "fa3, aiter, triton, or trtllm_mha is required for Llama4 model"
958
+ if is_sm100_supported() and self.attention_backend is None:
959
+ self.attention_backend = "trtllm_mha"
960
+ logger.warning(
961
+ "Use trtllm_mha as attention backend on sm100 for Llama4 model"
962
+ )
970
963
  elif model_arch in [
971
964
  "Gemma2ForCausalLM",
972
965
  "Gemma3ForCausalLM",
@@ -1052,6 +1045,67 @@ class ServerArgs:
1052
1045
  )
1053
1046
 
1054
1047
  def _handle_attention_backend_compatibility(self):
1048
+ model_config = self.get_model_config()
1049
+ use_mla_backend = self.use_mla_backend()
1050
+
1051
+ if self.prefill_attention_backend is not None and (
1052
+ self.prefill_attention_backend == self.decode_attention_backend
1053
+ ): # override the default attention backend
1054
+ self.attention_backend = self.prefill_attention_backend
1055
+
1056
+ # Pick the default attention backend if not specified
1057
+ if self.attention_backend is None:
1058
+ """
1059
+ Auto select the fastest attention backend.
1060
+
1061
+ 1. Models with MHA Architecture (e.g: Llama, QWen)
1062
+ 1.1 We will turn on FA3 on hopper unless the user uses spec decode with topk > 1 or page_size > 1.
1063
+ 1.2 In other cases, we will use flashinfer if available, otherwise use triton.
1064
+ 2. Models with MLA Architecture
1065
+ 2.1 We will use FA3 backend on hopper.
1066
+ 2.2 We will use Flashinfer backend on blackwell.
1067
+ 2.3 Otherwise, we will use triton backend.
1068
+ """
1069
+
1070
+ if not use_mla_backend:
1071
+ # MHA architecture
1072
+ if (
1073
+ is_hopper_with_cuda_12_3()
1074
+ and is_no_spec_infer_or_topk_one(self)
1075
+ and is_fa3_default_architecture(self.model_config.hf_config)
1076
+ ):
1077
+ self.attention_backend = "fa3"
1078
+ elif is_hip():
1079
+ self.attention_backend = "aiter"
1080
+ elif is_npu():
1081
+ self.attention_backend = "ascend"
1082
+ else:
1083
+ self.attention_backend = (
1084
+ "flashinfer" if is_flashinfer_available() else "triton"
1085
+ )
1086
+ else:
1087
+ # MLA architecture
1088
+ if is_hopper_with_cuda_12_3():
1089
+ self.attention_backend = "fa3"
1090
+ elif is_sm100_supported():
1091
+ self.attention_backend = "flashinfer"
1092
+ elif is_hip():
1093
+ head_num = model_config.get_num_kv_heads(self.tp_size)
1094
+ # TODO: currently aiter only supports head number 16 or 128
1095
+ if head_num == 128 or head_num == 16:
1096
+ self.attention_backend = "aiter"
1097
+ else:
1098
+ self.attention_backend = "triton"
1099
+ elif is_npu():
1100
+ self.attention_backend = "ascend"
1101
+ else:
1102
+ self.attention_backend = "triton"
1103
+
1104
+ logger.warning(
1105
+ f"Attention backend not explicitly specified. Use {self.attention_backend} backend by default."
1106
+ )
1107
+
1108
+ # Torch native and flex attention backends
1055
1109
  if self.attention_backend == "torch_native":
1056
1110
  logger.warning(
1057
1111
  "Cuda graph is disabled because of using torch native attention backend"
@@ -1067,12 +1121,7 @@ class ServerArgs:
1067
1121
  self.speculative_algorithm is None
1068
1122
  ), "Speculative decoding is currently not supported with Flex Attention backend"
1069
1123
 
1070
- if is_npu() and self.attention_backend in ["ascend"]:
1071
- logger.warning(
1072
- "At this moment Ascend attention backend only supports a page_size of 128, change page_size to 128."
1073
- )
1074
- self.page_size = 128
1075
-
1124
+ # Major NVIDIA platform backends
1076
1125
  if (
1077
1126
  self.attention_backend == "flashmla"
1078
1127
  or self.decode_attention_backend == "flashmla"
@@ -1127,19 +1176,13 @@ class ServerArgs:
1127
1176
  )
1128
1177
  self.page_size = 64
1129
1178
 
1130
- if self.attention_backend == "dual_chunk_flash_attn":
1179
+ if self.attention_backend == "fa3" and self.kv_cache_dtype == "fp8_e5m2":
1131
1180
  logger.warning(
1132
- "Mixed chunk and radix cache are disabled when using dual-chunk flash attention backend"
1181
+ "FlashAttention3 only supports fp8_e4m3 if using FP8; "
1182
+ "Setting attention backend to triton."
1133
1183
  )
1134
- self.enable_mixed_chunk = False
1135
- self.disable_radix_cache = True
1184
+ self.attention_backend = "triton"
1136
1185
 
1137
- if self.attention_backend == "intel_xpu":
1138
- if self.page_size not in [32, 64, 128]:
1139
- logger.warning(
1140
- f"Intel XPU attention backend only supports page_size of 32, 64 or 128, changing page_size from {self.page_size} to 128."
1141
- )
1142
- self.page_size = 128
1143
1186
  if self.attention_backend == "fa4" or self.decode_attention_backend == "fa4":
1144
1187
  raise ValueError(
1145
1188
  "FA4 backend is only supported for prefill. Please use `--prefill-attention-backend fa4` instead."
@@ -1150,6 +1193,66 @@ class ServerArgs:
1150
1193
  )
1151
1194
  self.page_size = 128
1152
1195
 
1196
+ # AMD platform backends
1197
+ if self.attention_backend == "aiter":
1198
+ if model_config.context_len > 8192:
1199
+ self.mem_fraction_static *= 0.90
1200
+
1201
+ # NPU platform backends
1202
+ if is_npu() and self.attention_backend in ["ascend"]:
1203
+ logger.warning(
1204
+ "At this moment Ascend attention backend only supports a page_size of 128, change page_size to 128."
1205
+ )
1206
+ self.page_size = 128
1207
+
1208
+ # Other platform backends
1209
+ if (
1210
+ self.attention_backend == "intel_amx"
1211
+ and self.device == "cpu"
1212
+ and not cpu_has_amx_support()
1213
+ ):
1214
+ logger.warning(
1215
+ "The current platform does not support Intel AMX, will fallback to torch_native backend."
1216
+ )
1217
+ self.attention_backend = "torch_native"
1218
+
1219
+ if (
1220
+ self.attention_backend == "intel_xpu"
1221
+ and self.device == "xpu"
1222
+ and not xpu_has_xmx_support()
1223
+ ):
1224
+ logger.warning(
1225
+ "The current platform does not support Intel XMX, will fallback to triton backend."
1226
+ )
1227
+ self.attention_backend = "triton"
1228
+
1229
+ if self.attention_backend == "intel_xpu":
1230
+ if self.page_size not in [32, 64, 128]:
1231
+ logger.warning(
1232
+ f"Intel XPU attention backend only supports page_size of 32, 64 or 128, changing page_size from {self.page_size} to 128."
1233
+ )
1234
+ self.page_size = 128
1235
+
1236
+ # Dual chunk flash attention backend
1237
+ if (
1238
+ getattr(model_config.hf_config, "dual_chunk_attention_config", None)
1239
+ is not None
1240
+ ):
1241
+ if self.attention_backend is None:
1242
+ self.attention_backend = "dual_chunk_flash_attn"
1243
+ logger.info("Dual chunk attention is turned on by default.")
1244
+ elif self.attention_backend != "dual_chunk_flash_attn":
1245
+ raise ValueError(
1246
+ "Dual chunk attention is enabled, but attention backend is set to "
1247
+ f"{self.attention_backend}. Please set it to 'dual_chunk_flash_attn'."
1248
+ )
1249
+ if self.attention_backend == "dual_chunk_flash_attn":
1250
+ logger.warning(
1251
+ "Mixed chunk and radix cache are disabled when using dual-chunk flash attention backend"
1252
+ )
1253
+ self.enable_mixed_chunk = False
1254
+ self.disable_radix_cache = True
1255
+
1153
1256
  def _handle_page_size(self):
1154
1257
  if self.page_size is None:
1155
1258
  self.page_size = 1
@@ -1162,6 +1265,22 @@ class ServerArgs:
1162
1265
  if self.grammar_backend is None:
1163
1266
  self.grammar_backend = "xgrammar"
1164
1267
 
1268
+ def _handle_ktransformers_configs(self):
1269
+ from sglang.srt.layers.quantization.compressed_tensors.compressed_tensors_moe import (
1270
+ CompressedTensorsWNA16AMXEPMoEMethod,
1271
+ override_config,
1272
+ )
1273
+
1274
+ override_config(
1275
+ CompressedTensorsWNA16AMXEPMoEMethod,
1276
+ self.kt_num_gpu_experts,
1277
+ self.kt_cpuinfer,
1278
+ self.kt_threadpool_count,
1279
+ self.kt_amx_weight_path,
1280
+ self.kt_amx_method,
1281
+ self.chunked_prefill_size,
1282
+ )
1283
+
1165
1284
  def _handle_data_parallelism(self):
1166
1285
  if self.dp_size == 1:
1167
1286
  self.enable_dp_attention = False
@@ -1277,6 +1396,24 @@ class ServerArgs:
1277
1396
  "Page first direct layout only support direct io backend"
1278
1397
  )
1279
1398
 
1399
+ if self.enable_hierarchical_cache and self.hicache_io_backend == "kernel":
1400
+ # fix for the compatibility issue with FlashAttention3 decoding and HiCache kernel backend
1401
+ if self.decode_attention_backend is None:
1402
+ if not self.use_mla_backend():
1403
+ self.decode_attention_backend = (
1404
+ "flashinfer" if is_flashinfer_available() else "triton"
1405
+ )
1406
+ else:
1407
+ self.decode_attention_backend = (
1408
+ "flashinfer" if is_sm100_supported() else "triton"
1409
+ )
1410
+ elif self.decode_attention_backend == "fa3":
1411
+ self.hicache_io_backend = "direct"
1412
+ logger.warning(
1413
+ "FlashAttention3 decode backend is not compatible with hierarchical cache. "
1414
+ "Setting hicache_io_backend to vanilla I/O, which may lead to suboptimal performance with small page sizes."
1415
+ )
1416
+
1280
1417
  def _handle_speculative_decoding(self):
1281
1418
  if self.speculative_algorithm == "NEXTN":
1282
1419
  self.speculative_algorithm = "EAGLE"
@@ -1287,22 +1424,26 @@ class ServerArgs:
1287
1424
  raise ValueError(
1288
1425
  "Currently standalone speculative decoding does not support dp attention."
1289
1426
  )
1427
+
1290
1428
  if self.max_running_requests is None:
1291
1429
  self.max_running_requests = 48
1292
1430
  logger.warning(
1293
- "Max running requests is reset to 48 for speculative decoding."
1431
+ "Max running requests is reset to 48 for speculative decoding. You can override this by explicitly setting --max-running-requests."
1294
1432
  )
1295
1433
 
1296
- if self.speculative_algorithm == "EAGLE" and self.enable_beta_spec:
1434
+ if (
1435
+ self.speculative_algorithm == "EAGLE"
1436
+ and envs.SGLANG_ENABLE_SPEC_V2.get()
1437
+ ):
1297
1438
  self.disable_overlap_schedule = False
1298
1439
  logger.warning(
1299
1440
  "Beta spec is enabled for eagle speculative decoding and overlap schedule is turned on."
1300
1441
  )
1301
1442
 
1302
- if not self.enable_beta_spec:
1443
+ if not envs.SGLANG_ENABLE_SPEC_V2.get():
1303
1444
  self.disable_overlap_schedule = True
1304
1445
  logger.warning(
1305
- "Overlap scheduler is disabled because of using eagle3 and standalone speculative decoding."
1446
+ "Overlap scheduler is disabled because of using eagle3 or standalone speculative decoding."
1306
1447
  )
1307
1448
 
1308
1449
  if self.enable_mixed_chunk:
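
With --enable-beta-spec removed, the overlap-capable speculative path is now gated on the SGLANG_ENABLE_SPEC_V2 flag read through envs.SGLANG_ENABLE_SPEC_V2.get(). A minimal way to opt in before launching; treating "1" as the enabling value is an assumption here, the exact parsing lives in sglang/srt/environ.py:

    import os

    # Opt into the spec-v2 overlap scheduling path for EAGLE speculative decoding.
    # "1" as the enabling value is assumed; see sglang/srt/environ.py for the
    # accepted values of SGLANG_ENABLE_SPEC_V2.
    os.environ["SGLANG_ENABLE_SPEC_V2"] = "1"
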
@@ -1371,8 +1512,13 @@ class ServerArgs:
1371
1512
  raise ValueError(
1372
1513
  "Ngram speculative decoding only supports CUDA device."
1373
1514
  )
1515
+
1374
1516
  if self.max_running_requests is None:
1375
1517
  self.max_running_requests = 48
1518
+ logger.warning(
1519
+ "Max running requests is reset to 48 for speculative decoding. You can override this by explicitly setting --max-running-requests."
1520
+ )
1521
+
1376
1522
  self.disable_overlap_schedule = True
1377
1523
  self.enable_mixed_chunk = False
1378
1524
  self.speculative_eagle_topk = self.speculative_ngram_max_bfs_breadth
@@ -1515,19 +1661,44 @@ class ServerArgs:
1515
1661
  )
1516
1662
 
1517
1663
  def _handle_deterministic_inference(self):
1664
+ if self.rl_on_policy_target is not None:
1665
+ logger.warning(
1666
+ "Enable deterministic inference because of rl_on_policy_target."
1667
+ )
1668
+ self.enable_deterministic_inference = True
1669
+ # TODO remove this environment variable as a whole
1670
+ os.environ["SGLANG_ENABLE_DETERMINISTIC_INFERENCE"] = "1"
1671
+
1518
1672
  if self.enable_deterministic_inference:
1519
1673
  # Check sampling backend
1520
1674
  self.sampling_backend = "pytorch"
1521
1675
  logger.warning(
1522
1676
  "Sampling backend is set to pytorch for deterministic inference."
1523
1677
  )
1678
+ is_deepseek_model = False
1679
+ if parse_connector_type(self.model_path) != ConnectorType.INSTANCE:
1680
+ try:
1681
+ hf_config = self.get_hf_config()
1682
+ model_arch = hf_config.architectures[0]
1683
+ is_deepseek_model = model_arch in [
1684
+ "DeepseekV2ForCausalLM",
1685
+ "DeepseekV3ForCausalLM",
1686
+ "DeepseekV32ForCausalLM",
1687
+ ]
1688
+ except Exception:
1689
+ pass
1524
1690
 
1525
1691
  # Check attention backend
1526
1692
  if self.attention_backend is None:
1527
1693
  # User didn't specify attention backend, fallback based on GPU architecture
1528
1694
  if is_sm100_supported() or is_sm120_supported():
1529
1695
  # Blackwell and newer architectures
1530
- self.attention_backend = "flashinfer"
1696
+ if is_deepseek_model:
1697
+ # fallback to triton for DeepSeek models because flashinfer doesn't support deterministic inference for DeepSeek models yet
1698
+ self.attention_backend = "triton"
1699
+ else:
1700
+ # fallback to flashinfer on Blackwell for non-DeepSeek models
1701
+ self.attention_backend = "flashinfer"
1531
1702
  else:
1532
1703
  # Hopper (SM90) and older architectures
1533
1704
  self.attention_backend = "fa3"
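
The deterministic-inference hunk above special-cases DeepSeek-family models by inspecting architectures[0] of the Hugging Face config. A self-contained sketch of the same check against a plain dict (the real code goes through self.get_hf_config() and guards against instance connectors):

    # Classify a model as DeepSeek-family the way the hunk above does, by
    # looking at the first entry of the Hugging Face config's architectures list.
    DEEPSEEK_DETERMINISTIC_ARCHS = {
        "DeepseekV2ForCausalLM",
        "DeepseekV3ForCausalLM",
        "DeepseekV32ForCausalLM",
    }

    def is_deepseek_arch(hf_config: dict) -> bool:
        architectures = hf_config.get("architectures") or []
        return bool(architectures) and architectures[0] in DEEPSEEK_DETERMINISTIC_ARCHS

    print(is_deepseek_arch({"architectures": ["DeepseekV3ForCausalLM"]}))  # True
    print(is_deepseek_arch({"architectures": ["Qwen3ForCausalLM"]}))       # False
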
@@ -1542,8 +1713,13 @@ class ServerArgs:
1542
1713
  f"but you explicitly specified '{self.attention_backend}'."
1543
1714
  )
1544
1715
 
1545
- # Currently, only FA3 and Triton supports radix cache. Support for other backends is in progress
1546
1716
  if self.attention_backend not in ["fa3", "triton"]:
1717
+ if is_deepseek_model:
1718
+ raise ValueError(
1719
+ f"Currently only fa3 and triton attention backends are supported for deterministic inference with DeepSeek models. But you're using {self.attention_backend}."
1720
+ )
1721
+
1722
+ # Currently, only FA3 and Triton supports radix cache. Support for other backends is in progress
1547
1723
  self.disable_radix_cache = True
1548
1724
  logger.warning(
1549
1725
  f"Currently radix cache is not compatible with {self.attention_backend} attention backend for deterministic inference. It will be supported in the future."
@@ -1743,6 +1919,18 @@ class ServerArgs:
1743
1919
  "KV cache dtype is FP8. Otherwise, KV cache scaling factors "
1744
1920
  "default to 1.0, which may cause accuracy issues. ",
1745
1921
  )
1922
+ parser.add_argument(
1923
+ "--kv-cache-dtype",
1924
+ type=str,
1925
+ default=ServerArgs.kv_cache_dtype,
1926
+ choices=["auto", "fp8_e5m2", "fp8_e4m3", "bf16", "bfloat16"],
1927
+ help='Data type for kv cache storage. "auto" will use model data type. "bf16" or "bfloat16" for BF16 KV cache. "fp8_e5m2" and "fp8_e4m3" are supported for CUDA 11.8+.',
1928
+ )
1929
+ parser.add_argument(
1930
+ "--enable-fp32-lm-head",
1931
+ action="store_true",
1932
+ help="If set, the LM head outputs (logits) are in FP32.",
1933
+ )
1746
1934
  parser.add_argument(
1747
1935
  "--modelopt-quant",
1748
1936
  type=str,
@@ -1782,18 +1970,6 @@ class ServerArgs:
1782
1970
  "This is useful for development and prototyping. For production, it's recommended "
1783
1971
  "to use separate quantization and deployment steps.",
1784
1972
  )
1785
- parser.add_argument(
1786
- "--kv-cache-dtype",
1787
- type=str,
1788
- default=ServerArgs.kv_cache_dtype,
1789
- choices=["auto", "fp8_e5m2", "fp8_e4m3", "bf16", "bfloat16"],
1790
- help='Data type for kv cache storage. "auto" will use model data type. "bf16" or "bfloat16" for BF16 KV cache. "fp8_e5m2" and "fp8_e4m3" are supported for CUDA 11.8+.',
1791
- )
1792
- parser.add_argument(
1793
- "--enable-fp32-lm-head",
1794
- action="store_true",
1795
- help="If set, the LM head outputs (logits) are in FP32.",
1796
- )
1797
1973
 
1798
1974
  # Memory and scheduling
1799
1975
  parser.add_argument(
@@ -1898,7 +2074,14 @@ class ServerArgs:
1898
2074
  parser.add_argument(
1899
2075
  "--disable-hybrid-swa-memory",
1900
2076
  action="store_true",
1901
- help="Disable the hybrid SWA memory.",
2077
+ help="Disable the hybrid SWA memory pool.",
2078
+ )
2079
+ parser.add_argument(
2080
+ "--radix-eviction-policy",
2081
+ type=str,
2082
+ choices=RADIX_EVICTION_POLICY_CHOICES,
2083
+ default=ServerArgs.radix_eviction_policy,
2084
+ help="The eviction policy of radix trees. 'lru' stands for Least Recently Used, 'lfu' stands for Least Frequently Used.",
1902
2085
  )
1903
2086
 
1904
2087
  # Runtime options
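
To get a feel for the new flag, here is a self-contained argparse mirror of --radix-eviction-policy (choices copied from RADIX_EVICTION_POLICY_CHOICES earlier in this diff; the "lru" default is assumed for illustration, the real default comes from ServerArgs.radix_eviction_policy):

    import argparse

    # Stand-alone illustration, not the real sglang parser.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--radix-eviction-policy",
        type=str,
        choices=["lru", "lfu"],  # RADIX_EVICTION_POLICY_CHOICES
        default="lru",           # assumed default for this sketch
        help="Eviction policy of radix trees: Least Recently Used or Least Frequently Used.",
    )
    args = parser.parse_args(["--radix-eviction-policy", "lfu"])
    print(args.radix_eviction_policy)  # lfu
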
@@ -1908,21 +2091,6 @@ class ServerArgs:
1908
2091
  default=ServerArgs.device,
1909
2092
  help="The device to use ('cuda', 'xpu', 'hpu', 'npu', 'cpu'). Defaults to auto-detection if not specified.",
1910
2093
  )
1911
- parser.add_argument(
1912
- "--elastic-ep-backend",
1913
- type=str,
1914
- default=ServerArgs.elastic_ep_backend,
1915
- choices=["none", "mooncake"],
1916
- help="Specify the collective communication backend for elastic EP. Currently supports 'mooncake'.",
1917
- )
1918
- parser.add_argument(
1919
- "--mooncake-ib-device",
1920
- type=str,
1921
- default=ServerArgs.mooncake_ib_device,
1922
- help="The InfiniBand devices for Mooncake Backend transfer, accepts multiple comma-separated devices "
1923
- "(e.g., --mooncake-ib-device mlx5_0,mlx5_1). "
1924
- "Default is None, which triggers automatic device detection when Mooncake Backend is enabled.",
1925
- )
1926
2094
  parser.add_argument(
1927
2095
  "--tensor-parallel-size",
1928
2096
  "--tp-size",
@@ -2210,6 +2378,12 @@ class ServerArgs:
2210
2378
  default=ServerArgs.tool_call_parser,
2211
2379
  help=f"Specify the parser for handling tool-call interactions. Options include: {tool_call_parser_choices}.",
2212
2380
  )
2381
+ parser.add_argument(
2382
+ "--tool-server",
2383
+ type=str,
2384
+ default=None,
2385
+ help="Either 'demo' or a comma-separated list of tool server urls to use for the model. If not specified, no tool server will be used.",
2386
+ )
2213
2387
  parser.add_argument(
2214
2388
  "--sampling-defaults",
2215
2389
  type=str,
@@ -2220,12 +2394,6 @@ class ServerArgs:
2220
2394
  "'model' uses the model's generation_config.json to get the recommended "
2221
2395
  "sampling parameters if available. Default is 'model'.",
2222
2396
  )
2223
- parser.add_argument(
2224
- "--tool-server",
2225
- type=str,
2226
- default=None,
2227
- help="Either 'demo' or a comma-separated list of tool server urls to use for the model. If not specified, no tool server will be used.",
2228
- )
2229
2397
 
2230
2398
  # Data parallelism
2231
2399
  parser.add_argument(
@@ -2332,7 +2500,7 @@ class ServerArgs:
2332
2500
  parser.add_argument(
2333
2501
  "--lora-eviction-policy",
2334
2502
  type=str,
2335
- default=DEFAULT_LORA_EVICTION_POLICY,
2503
+ default=ServerArgs.lora_eviction_policy,
2336
2504
  choices=["lru", "fifo"],
2337
2505
  help="LoRA adapter eviction policy when memory pool is full. 'lru': Least Recently Used (default, better cache efficiency). 'fifo': First-In-First-Out.",
2338
2506
  )
@@ -2408,7 +2576,6 @@ class ServerArgs:
2408
2576
  )
2409
2577
 
2410
2578
  # Speculative decoding
2411
- parser.add_argument("--enable-beta-spec", action="store_true")
2412
2579
  parser.add_argument(
2413
2580
  "--speculative-algorithm",
2414
2581
  type=str,
@@ -2644,6 +2811,21 @@ class ServerArgs:
2644
2811
  default=ServerArgs.moe_dense_tp_size,
2645
2812
  help="TP size for MoE dense MLP layers. This flag is useful when, with large TP size, there are errors caused by weights in MLP layers having dimension smaller than the min dimension GEMM supports.",
2646
2813
  )
2814
+ parser.add_argument(
2815
+ "--elastic-ep-backend",
2816
+ type=str,
2817
+ default=ServerArgs.elastic_ep_backend,
2818
+ choices=["none", "mooncake"],
2819
+ help="Specify the collective communication backend for elastic EP. Currently supports 'mooncake'.",
2820
+ )
2821
+ parser.add_argument(
2822
+ "--mooncake-ib-device",
2823
+ type=str,
2824
+ default=ServerArgs.mooncake_ib_device,
2825
+ help="The InfiniBand devices for Mooncake Backend transfer, accepts multiple comma-separated devices "
2826
+ "(e.g., --mooncake-ib-device mlx5_0,mlx5_1). "
2827
+ "Default is None, which triggers automatic device detection when Mooncake Backend is enabled.",
2828
+ )
2647
2829
 
2648
2830
  # Mamba Cache
2649
2831
  parser.add_argument(
@@ -2691,13 +2873,6 @@ class ServerArgs:
2691
2873
  default=ServerArgs.hicache_write_policy,
2692
2874
  help="The write policy of hierarchical cache.",
2693
2875
  )
2694
- parser.add_argument(
2695
- "--radix-eviction-policy",
2696
- type=str,
2697
- choices=RADIX_EVICTION_POLICY_CHOICES,
2698
- default=ServerArgs.radix_eviction_policy,
2699
- help="The eviction policy of radix trees. 'lru' stands for Least Recently Used, 'lfu' stands for Least Frequently Used.",
2700
- )
2701
2876
  parser.add_argument(
2702
2877
  "--hicache-io-backend",
2703
2878
  type=str,
@@ -2805,7 +2980,7 @@ class ServerArgs:
2805
2980
  "--ds-sparse-decode-threshold",
2806
2981
  type=int,
2807
2982
  default=ServerArgs.ds_sparse_decode_threshold,
2808
- help="The type of heavy channels in double sparsity attention",
2983
+ help="The minimum decode sequence length required before the double-sparsity backend switches from the dense fallback to the sparse decode kernel.",
2809
2984
  )
2810
2985
 
2811
2986
  # Offloading
@@ -3111,26 +3286,20 @@ class ServerArgs:
3111
3286
  nargs="+",
3112
3287
  help="Sets the numa node for the subprocesses. i-th element corresponds to i-th subprocess.",
3113
3288
  )
3114
-
3115
- # Debug tensor dumps
3116
- parser.add_argument(
3117
- "--debug-tensor-dump-output-folder",
3118
- type=str,
3119
- default=ServerArgs.debug_tensor_dump_output_folder,
3120
- help="The output folder for dumping tensors.",
3121
- )
3122
3289
  parser.add_argument(
3123
- "--debug-tensor-dump-input-file",
3124
- type=str,
3125
- default=ServerArgs.debug_tensor_dump_input_file,
3126
- help="The input filename for dumping tensors",
3290
+ "--enable-deterministic-inference",
3291
+ action="store_true",
3292
+ help="Enable deterministic inference mode with batch invariant ops.",
3127
3293
  )
3128
3294
  parser.add_argument(
3129
- "--debug-tensor-dump-inject",
3295
+ "--rl-on-policy-target",
3130
3296
  type=str,
3131
- default=ServerArgs.debug_tensor_dump_inject,
3132
- help="Inject the outputs from jax as the input of every layer.",
3297
+ default=ServerArgs.rl_on_policy_target,
3298
+ choices=["fsdp"],
3299
+ help="The training system that SGLang needs to match for true on-policy.",
3133
3300
  )
3301
+
3302
+ # Dynamic batch tokenizer
3134
3303
  parser.add_argument(
3135
3304
  "--enable-dynamic-batch-tokenizer",
3136
3305
  action="store_true",
@@ -3149,6 +3318,26 @@ class ServerArgs:
3149
3318
  help="[Only used if --enable-dynamic-batch-tokenizer is set] Timeout in seconds for batching tokenization requests.",
3150
3319
  )
3151
3320
 
3321
+ # Debug tensor dumps
3322
+ parser.add_argument(
3323
+ "--debug-tensor-dump-output-folder",
3324
+ type=str,
3325
+ default=ServerArgs.debug_tensor_dump_output_folder,
3326
+ help="The output folder for dumping tensors.",
3327
+ )
3328
+ parser.add_argument(
3329
+ "--debug-tensor-dump-input-file",
3330
+ type=str,
3331
+ default=ServerArgs.debug_tensor_dump_input_file,
3332
+ help="The input filename for dumping tensors",
3333
+ )
3334
+ parser.add_argument(
3335
+ "--debug-tensor-dump-inject",
3336
+ type=str,
3337
+ default=ServerArgs.debug_tensor_dump_inject,
3338
+ help="Inject the outputs from jax as the input of every layer.",
3339
+ )
3340
+
3152
3341
  # PD disaggregation
3153
3342
  parser.add_argument(
3154
3343
  "--disaggregation-mode",
@@ -3258,7 +3447,6 @@ class ServerArgs:
3258
3447
  default=None,
3259
3448
  help="The path of the PD-Multiplexing config file.",
3260
3449
  )
3261
-
3262
3450
  parser.add_argument(
3263
3451
  "--sm-group-num",
3264
3452
  type=int,
@@ -3266,50 +3454,6 @@ class ServerArgs:
3266
3454
  help="Number of sm partition groups.",
3267
3455
  )
3268
3456
 
3269
- # For deterministic inference
3270
- parser.add_argument(
3271
- "--enable-deterministic-inference",
3272
- action="store_true",
3273
- help="Enable deterministic inference mode with batch invariant ops.",
3274
- )
3275
-
3276
- # Deprecated arguments
3277
- parser.add_argument(
3278
- "--enable-ep-moe",
3279
- action=DeprecatedAction,
3280
- help="NOTE: --enable-ep-moe is deprecated. Please set `--ep-size` to the same value as `--tp-size` instead.",
3281
- )
3282
- parser.add_argument(
3283
- "--enable-deepep-moe",
3284
- action=DeprecatedAction,
3285
- help="NOTE: --enable-deepep-moe is deprecated. Please set `--moe-a2a-backend` to 'deepep' instead.",
3286
- )
3287
- parser.add_argument(
3288
- "--enable-flashinfer-cutlass-moe",
3289
- action=DeprecatedAction,
3290
- help="NOTE: --enable-flashinfer-cutlass-moe is deprecated. Please set `--moe-runner-backend` to 'flashinfer_cutlass' instead.",
3291
- )
3292
- parser.add_argument(
3293
- "--enable-flashinfer-cutedsl-moe",
3294
- action=DeprecatedAction,
3295
- help="NOTE: --enable-flashinfer-cutedsl-moe is deprecated. Please set `--moe-runner-backend` to 'flashinfer_cutedsl' instead.",
3296
- )
3297
- parser.add_argument(
3298
- "--enable-flashinfer-trtllm-moe",
3299
- action=DeprecatedAction,
3300
- help="NOTE: --enable-flashinfer-trtllm-moe is deprecated. Please set `--moe-runner-backend` to 'flashinfer_trtllm' instead.",
3301
- )
3302
- parser.add_argument(
3303
- "--enable-triton-kernel-moe",
3304
- action=DeprecatedAction,
3305
- help="NOTE: --enable-triton-kernel-moe is deprecated. Please set `--moe-runner-backend` to 'triton_kernel' instead.",
3306
- )
3307
- parser.add_argument(
3308
- "--enable-flashinfer-mxfp4-moe",
3309
- action=DeprecatedAction,
3310
- help="NOTE: --enable-flashinfer-mxfp4-moe is deprecated. Please set `--moe-runner-backend` to 'flashinfer_mxfp4' instead.",
3311
- )
3312
-
3313
3457
  # Configuration file support
3314
3458
  parser.add_argument(
3315
3459
  "--config",
@@ -3344,6 +3488,34 @@ class ServerArgs:
3344
3488
  )
3345
3489
  return hf_config
3346
3490
 
3491
+ def get_model_config(self):
3492
+ # Lazy init to avoid circular import
3493
+ from sglang.srt.configs.model_config import ModelConfig
3494
+
3495
+ if hasattr(self, "model_config"):
3496
+ return self.model_config
3497
+ self.model_config = ModelConfig.from_server_args(self)
3498
+ return self.model_config
3499
+
3500
+ def get_attention_backends(self):
3501
+ prefill_attention_backend_str = (
3502
+ self.prefill_attention_backend
3503
+ if self.prefill_attention_backend
3504
+ else self.attention_backend
3505
+ )
3506
+ decode_attention_backend_str = (
3507
+ self.decode_attention_backend
3508
+ if self.decode_attention_backend
3509
+ else self.attention_backend
3510
+ )
3511
+ return prefill_attention_backend_str, decode_attention_backend_str
3512
+
3513
+ def use_mla_backend(self):
3514
+ from sglang.srt.configs.model_config import AttentionArch
3515
+
3516
+ model_config = self.get_model_config()
3517
+ return model_config.attention_arch == AttentionArch.MLA
3518
+
3347
3519
  def check_server_args(self):
3348
3520
  # Check parallel size constraints
3349
3521
  assert (
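
The new get_model_config helper replaces the repeated ModelConfig.from_server_args(self) constructions with a lazily built, cached attribute, which is why __post_init__, _handle_attention_backend_compatibility, and use_mla_backend can all call it cheaply. A generic sketch of the same pattern, with a dict standing in for the real ModelConfig:

    # Generic version of the lazy, cached accessor used by get_model_config():
    # build the expensive object on first access, then reuse the cached attribute.
    class ArgsSketch:
        def __init__(self, model_path: str):
            self.model_path = model_path

        def get_model_config(self):
            if hasattr(self, "model_config"):
                return self.model_config
            # Stand-in for ModelConfig.from_server_args(self).
            self.model_config = {"model_path": self.model_path, "is_multimodal": False}
            return self.model_config

    args = ArgsSketch("dummy/model")
    assert args.get_model_config() is args.get_model_config()  # built only once
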
@@ -3721,6 +3893,13 @@ class PortArgs:
3721
3893
  else:
3722
3894
  nccl_port = server_args.nccl_port
3723
3895
 
3896
+ if server_args.tokenizer_worker_num > 1:
3897
+ tokenizer_worker_ipc_name = (
3898
+ f"ipc://{tempfile.NamedTemporaryFile(delete=False).name}"
3899
+ )
3900
+ else:
3901
+ tokenizer_worker_ipc_name = None
3902
+
3724
3903
  if not server_args.enable_dp_attention:
3725
3904
  # Normal case, use IPC within a single node
3726
3905
  return PortArgs(
@@ -3730,7 +3909,7 @@ class PortArgs:
3730
3909
  nccl_port=nccl_port,
3731
3910
  rpc_ipc_name=f"ipc://{tempfile.NamedTemporaryFile(delete=False).name}",
3732
3911
  metrics_ipc_name=f"ipc://{tempfile.NamedTemporaryFile(delete=False).name}",
3733
- tokenizer_worker_ipc_name=None,
3912
+ tokenizer_worker_ipc_name=tokenizer_worker_ipc_name,
3734
3913
  )
3735
3914
  else:
3736
3915
  # DP attention. Use TCP + port to handle both single-node and multi-node.
@@ -3764,7 +3943,7 @@ class PortArgs:
3764
3943
  nccl_port=nccl_port,
3765
3944
  rpc_ipc_name=f"tcp://{dist_init_host}:{rpc_port}",
3766
3945
  metrics_ipc_name=f"tcp://{dist_init_host}:{metrics_ipc_name}",
3767
- tokenizer_worker_ipc_name=None,
3946
+ tokenizer_worker_ipc_name=tokenizer_worker_ipc_name,
3768
3947
  )
3769
3948
 
3770
3949
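
Finally, the PortArgs change above allocates one extra ZMQ IPC endpoint when tokenizer_worker_num > 1, using a throwaway temp file to obtain a unique filesystem path. A small standalone demonstration of how such an ipc:// address is formed (illustrative only; the real assignment is in the hunks above):

    import tempfile

    # A uniquely named temp file provides the filesystem path that is wrapped
    # into a zmq-style ipc:// endpoint, mirroring the diff above.
    def make_tokenizer_worker_ipc_name(tokenizer_worker_num: int):
        if tokenizer_worker_num > 1:
            return f"ipc://{tempfile.NamedTemporaryFile(delete=False).name}"
        return None

    print(make_tokenizer_worker_ipc_name(2))  # e.g. ipc:///tmp/tmpabc123
    print(make_tokenizer_worker_ipc_name(1))  # None
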