sglang 0.4.9.post3__py3-none-any.whl → 0.4.9.post4__py3-none-any.whl

This diff covers publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
Files changed (75)
  1. sglang/srt/_custom_ops.py +29 -1
  2. sglang/srt/configs/model_config.py +1 -1
  3. sglang/srt/conversation.py +1 -1
  4. sglang/srt/disaggregation/common/conn.py +34 -6
  5. sglang/srt/disaggregation/mini_lb.py +3 -2
  6. sglang/srt/disaggregation/mooncake/conn.py +49 -20
  7. sglang/srt/disaggregation/mooncake/transfer_engine.py +4 -2
  8. sglang/srt/disaggregation/nixl/conn.py +17 -13
  9. sglang/srt/distributed/device_communicators/custom_all_reduce.py +3 -91
  10. sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +96 -1
  11. sglang/srt/distributed/device_communicators/quick_all_reduce.py +273 -0
  12. sglang/srt/distributed/device_communicators/shm_broadcast.py +12 -5
  13. sglang/srt/distributed/parallel_state.py +70 -15
  14. sglang/srt/entrypoints/engine.py +2 -8
  15. sglang/srt/entrypoints/http_server.py +20 -32
  16. sglang/srt/entrypoints/openai/protocol.py +3 -3
  17. sglang/srt/entrypoints/openai/serving_chat.py +27 -4
  18. sglang/srt/function_call/base_format_detector.py +74 -12
  19. sglang/srt/function_call/deepseekv3_detector.py +26 -11
  20. sglang/srt/function_call/ebnf_composer.py +95 -63
  21. sglang/srt/function_call/function_call_parser.py +4 -4
  22. sglang/srt/function_call/kimik2_detector.py +41 -16
  23. sglang/srt/function_call/llama32_detector.py +6 -3
  24. sglang/srt/function_call/mistral_detector.py +11 -3
  25. sglang/srt/function_call/pythonic_detector.py +16 -14
  26. sglang/srt/function_call/qwen25_detector.py +12 -3
  27. sglang/srt/function_call/{qwen3_detector.py → qwen3_coder_detector.py} +10 -9
  28. sglang/srt/layers/activation.py +11 -3
  29. sglang/srt/layers/attention/base_attn_backend.py +3 -1
  30. sglang/srt/layers/communicator.py +12 -12
  31. sglang/srt/layers/dp_attention.py +72 -24
  32. sglang/srt/layers/logits_processor.py +34 -24
  33. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=320,device_name=NVIDIA_H20-3e.json +146 -0
  34. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +25 -224
  35. sglang/srt/layers/moe/topk.py +5 -13
  36. sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +2 -9
  37. sglang/srt/layers/quantization/modelopt_quant.py +8 -4
  38. sglang/srt/layers/quantization/utils.py +0 -9
  39. sglang/srt/layers/radix_attention.py +5 -3
  40. sglang/srt/lora/lora_manager.py +133 -169
  41. sglang/srt/lora/lora_registry.py +124 -0
  42. sglang/srt/lora/mem_pool.py +2 -2
  43. sglang/srt/managers/cache_controller.py +53 -6
  44. sglang/srt/managers/io_struct.py +19 -1
  45. sglang/srt/managers/schedule_batch.py +13 -3
  46. sglang/srt/managers/scheduler.py +13 -25
  47. sglang/srt/managers/tokenizer_manager.py +28 -25
  48. sglang/srt/managers/tp_worker.py +2 -4
  49. sglang/srt/mem_cache/allocator.py +67 -7
  50. sglang/srt/mem_cache/hicache_storage.py +17 -1
  51. sglang/srt/mem_cache/hiradix_cache.py +30 -16
  52. sglang/srt/mem_cache/memory_pool_host.py +3 -0
  53. sglang/srt/model_executor/cuda_graph_runner.py +61 -25
  54. sglang/srt/model_executor/forward_batch_info.py +201 -29
  55. sglang/srt/model_executor/model_runner.py +41 -23
  56. sglang/srt/models/deepseek_v2.py +1 -2
  57. sglang/srt/models/mllama4.py +10 -3
  58. sglang/srt/models/qwen2_moe.py +0 -4
  59. sglang/srt/models/qwen3_moe.py +1 -6
  60. sglang/srt/reasoning_parser.py +46 -4
  61. sglang/srt/sampling/sampling_batch_info.py +6 -5
  62. sglang/srt/server_args.py +76 -55
  63. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +33 -28
  64. sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +37 -36
  65. sglang/srt/speculative/eagle_utils.py +51 -23
  66. sglang/srt/speculative/eagle_worker.py +59 -44
  67. sglang/srt/two_batch_overlap.py +9 -5
  68. sglang/srt/utils.py +17 -68
  69. sglang/test/test_activation.py +50 -1
  70. sglang/version.py +1 -1
  71. {sglang-0.4.9.post3.dist-info → sglang-0.4.9.post4.dist-info}/METADATA +5 -5
  72. {sglang-0.4.9.post3.dist-info → sglang-0.4.9.post4.dist-info}/RECORD +75 -72
  73. {sglang-0.4.9.post3.dist-info → sglang-0.4.9.post4.dist-info}/WHEEL +0 -0
  74. {sglang-0.4.9.post3.dist-info → sglang-0.4.9.post4.dist-info}/licenses/LICENSE +0 -0
  75. {sglang-0.4.9.post3.dist-info → sglang-0.4.9.post4.dist-info}/top_level.txt +0 -0
sglang/srt/model_executor/model_runner.py CHANGED
@@ -68,6 +68,7 @@ from sglang.srt.layers.sampler import Sampler
 from sglang.srt.layers.torchao_utils import apply_torchao_config_to_model
 from sglang.srt.layers.utils import is_sm100_supported
 from sglang.srt.lora.lora_manager import LoRAManager
+from sglang.srt.lora.lora_registry import LoRARef
 from sglang.srt.managers.schedule_batch import (
     GLOBAL_SERVER_ARGS_KEYS,
     global_server_args_dict,
@@ -108,7 +109,6 @@ from sglang.srt.utils import (
     get_bool_env_var,
     get_cpu_ids_by_node,
     init_custom_process_group,
-    is_cuda,
     is_fa3_default_architecture,
     is_flashinfer_available,
     is_hip,
@@ -275,6 +275,7 @@ class ModelRunner:
         self.sampler = Sampler()
         self.load_model()

+        # Check if the model is using hybrid SWA
         if (
             not self.server_args.disable_hybrid_swa_memory
             and self.sliding_window_size is not None
@@ -377,6 +378,7 @@ class ModelRunner:
                 is_hopper_with_cuda_12_3()
                 and is_no_spec_infer_or_topk_one(server_args)
                 and is_fa3_default_architecture(self.model_config.hf_config)
+                and (not server_args.enable_hierarchical_cache)
             ):
                 server_args.attention_backend = "fa3"
             elif _is_hip:
@@ -389,7 +391,9 @@ class ModelRunner:
                 )
         else:
             # MLA architecture
-            if is_hopper_with_cuda_12_3():
+            if is_hopper_with_cuda_12_3() and (
+                not server_args.enable_hierarchical_cache
+            ):
                 server_args.attention_backend = "fa3"
             elif is_sm100_supported():
                 server_args.attention_backend = "flashinfer"
@@ -890,44 +894,38 @@ class ModelRunner:
             tp_rank=self.tp_rank,
             max_lora_rank=self.server_args.max_lora_rank,
             target_modules=self.server_args.lora_target_modules,
+            lora_paths=self.server_args.lora_paths,
         )
-        result = self.lora_manager.load_lora_adapters(self.server_args.lora_paths or {})
-        if result.success:
-            logger.info(
-                f"LoRA manager ready. Loaded LoRA adapters: {', '.join(result.loaded_adapters)}"
-            )
-        else:
-            raise RuntimeError(f"Failed to load LoRA adapters: {result.error_message}")

-    def load_lora_adapter(self, lora_name: str, lora_path: str):
+    def load_lora_adapter(self, lora_ref: LoRARef):
         """Load a new lora adapter from disk or huggingface."""

         logger.info(
-            f"LoRA adapter loading starts: name={lora_name}, path={lora_path}. "
+            f"LoRA adapter loading starts: {lora_ref}. "
             f"avail mem={get_available_gpu_memory(self.device, self.gpu_id):.2f} GB"
         )

-        result = self.lora_manager.load_lora_adapter(lora_name, lora_path)
+        result = self.lora_manager.load_lora_adapter(lora_ref)

         logger.info(
-            f"LoRA adapter loading completes: name={lora_name}, path={lora_path}. "
+            f"LoRA adapter loading completes: {lora_ref}. "
             f"avail mem={get_available_gpu_memory(self.device, self.gpu_id):.2f} GB"
        )

         return result

-    def unload_lora_adapter(self, lora_name: str):
+    def unload_lora_adapter(self, lora_ref: LoRARef):
         """Unload a lora adapter that was previously loaded during initialization or dynamic loading."""

         logger.info(
-            f"LoRA adapter unloading starts: name={lora_name}. "
+            f"LoRA adapter unloading starts: {lora_ref}. "
             f"avail mem={get_available_gpu_memory(self.device, self.gpu_id):.2f} GB"
         )

-        result = self.lora_manager.unload_lora_adapter(lora_name)
+        result = self.lora_manager.unload_lora_adapter(lora_ref)

         logger.info(
-            f"LoRA adapter unloading completes: name={lora_name}. "
+            f"LoRA adapter unloading completes: {lora_ref}. "
             f"avail mem={get_available_gpu_memory(self.device, self.gpu_id):.2f} GB"
         )

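The ModelRunner hunks above replace the separate (lora_name, lora_path) arguments with a single LoRARef object from the new sglang/srt/lora/lora_registry.py module. Below is a minimal sketch of what such a reference could look like, assuming only the lora_name/lora_path keyword fields visible in this diff; the auto-generated lora_id is an illustrative assumption, not something the hunks show.

```python
from dataclasses import dataclass, field
from uuid import uuid4


@dataclass(frozen=True)
class LoRARef:
    """Hypothetical stand-in for sglang.srt.lora.lora_registry.LoRARef.

    lora_name/lora_path are the keyword fields visible in this diff;
    the auto-generated lora_id is an illustrative assumption.
    """

    lora_name: str
    lora_path: str
    lora_id: str = field(default_factory=lambda: uuid4().hex)


# The ModelRunner entry points above now take the whole reference object:
#   runner.load_lora_adapter(LoRARef(lora_name="sql", lora_path="/adapters/sql"))
#   runner.unload_lora_adapter(ref)
print(LoRARef(lora_name="sql", lora_path="/adapters/sql"))
```

Logging the whole object, as the new log lines do, prints the adapter's name, path, and id in one place.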
@@ -1010,8 +1008,11 @@ class ModelRunner:
         try:
             layers = self.model.language_model.model.layers
         except:
-            self.is_hybrid = False
-            return
+            try:
+                layers = self.model.language_model.layers
+            except:
+                self.is_hybrid = False
+                return

         for layer in layers:
             if (
@@ -1462,9 +1463,13 @@ class ModelRunner:
             tensor_parallel(self.model, device_mesh)

     def forward_decode(
-        self, forward_batch: ForwardBatch, pp_proxy_tensors=None
+        self,
+        forward_batch: ForwardBatch,
+        skip_attn_backend_init: bool = False,
+        pp_proxy_tensors=None,
     ) -> LogitsProcessorOutput:
-        self.attn_backend.init_forward_metadata(forward_batch)
+        if not skip_attn_backend_init:
+            self.attn_backend.init_forward_metadata(forward_batch)
         # FIXME: add pp_proxy_tensors arg to all models
         kwargs = {}
         if self.support_pp:
@@ -1576,8 +1581,18 @@ class ModelRunner:
                 skip_attn_backend_init=skip_attn_backend_init,
                 pp_proxy_tensors=pp_proxy_tensors,
             )
-        elif forward_batch.forward_mode.is_decode():
-            ret = self.forward_decode(forward_batch, pp_proxy_tensors=pp_proxy_tensors)
+            return ret, can_run_cuda_graph
+
+        # For MLP sync
+        if forward_batch.global_num_tokens_cpu is not None:
+            forward_batch.prepare_mlp_sync_batch(self)
+
+        if forward_batch.forward_mode.is_decode():
+            ret = self.forward_decode(
+                forward_batch,
+                skip_attn_backend_init=skip_attn_backend_init,
+                pp_proxy_tensors=pp_proxy_tensors,
+            )
         elif forward_batch.forward_mode.is_extend():
             ret = self.forward_extend(
                 forward_batch,
@@ -1595,6 +1610,9 @@ class ModelRunner:
         else:
             raise ValueError(f"Invalid forward mode: {forward_batch.forward_mode}")

+        if forward_batch.global_num_tokens_cpu is not None:
+            forward_batch.post_forward_mlp_sync_batch(ret)
+
         return ret, can_run_cuda_graph

     def _preprocess_logits(
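The dispatch in forward() now brackets the decode/extend call with prepare_mlp_sync_batch and post_forward_mlp_sync_batch whenever forward_batch.global_num_tokens_cpu is set, and forwards skip_attn_backend_init into forward_decode. A toy sketch of that control flow, with the batch methods stubbed out (their real bodies live in forward_batch_info.py and are not reproduced here):

```python
from typing import Any, List, Optional


class ToyForwardBatch:
    """Stand-in that models only the fields used by the new dispatch logic."""

    def __init__(self, mode: str, global_num_tokens_cpu: Optional[List[int]] = None):
        self.mode = mode
        # Assumed to be set only when token counts were gathered across DP ranks.
        self.global_num_tokens_cpu = global_num_tokens_cpu

    def prepare_mlp_sync_batch(self, runner: Any) -> None:
        print("pre-hook: prepare_mlp_sync_batch")

    def post_forward_mlp_sync_batch(self, ret: Any) -> None:
        print("post-hook: post_forward_mlp_sync_batch")


def forward(runner: Any, batch: ToyForwardBatch, skip_attn_backend_init: bool = False):
    # Pre-hook runs only when DP-wide token counts are available.
    if batch.global_num_tokens_cpu is not None:
        batch.prepare_mlp_sync_batch(runner)

    if batch.mode == "decode":
        ret = f"decode(skip_attn_backend_init={skip_attn_backend_init})"
    elif batch.mode == "extend":
        ret = "extend"
    else:
        raise ValueError(f"Invalid forward mode: {batch.mode}")

    # Post-hook mirrors the pre-hook after the model ran.
    if batch.global_num_tokens_cpu is not None:
        batch.post_forward_mlp_sync_batch(ret)
    return ret


print(forward(None, ToyForwardBatch("decode", global_num_tokens_cpu=[8, 8])))
```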
sglang/srt/models/deepseek_v2.py CHANGED
@@ -550,9 +550,8 @@ class DeepseekV2MoE(nn.Module):
     def forward_deepep(
         self, hidden_states: torch.Tensor, forward_batch: ForwardBatch
     ) -> torch.Tensor:
-        forward_mode = forward_batch.forward_mode
         shared_output = None
-        if is_non_idle_and_non_empty(forward_mode, hidden_states):
+        if hidden_states.shape[0] > 0:
             # router_logits: (num_tokens, n_experts)
             router_logits = self.gate(hidden_states)
             shared_output = self._forward_shared_experts(hidden_states)
sglang/srt/models/mllama4.py CHANGED
@@ -23,6 +23,7 @@ from sglang.srt.managers.schedule_batch import (
     Modality,
     MultimodalDataItem,
     MultimodalInputs,
+    global_server_args_dict,
 )
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
 from sglang.srt.model_loader.weight_utils import default_weight_loader
@@ -55,13 +56,17 @@ class Llama4ForConditionalGeneration(nn.Module):
         self.quant_config = quant_config

         # Check if this is a text-only model (modelopt fp8 llama4 has no vision components)
-        self.has_vision = self._has_vision_weights(config)
-        if not self.has_vision:
+        self.has_vision_weights = self._has_vision_weights(config)
+        if not self.has_vision_weights:
             logger.warning(
                 "No vision weights found in checkpoint. Model will run in text-only mode. "
                 "Multimodal capabilities (image processing) will be unavailable."
             )

+        self.has_vision = (
+            self.has_vision_weights and global_server_args_dict["enable_multimodal"]
+        )
+
         if self.has_vision:
             self.vision_model = Llama4VisionModel(config.vision_config)
             self.multi_modal_projector = Llama4MultiModalProjector(config)
@@ -269,7 +274,9 @@ class Llama4ForConditionalGeneration(nn.Module):

     def _should_skip_weight(self, name: str) -> bool:
         """Check if we should skip loading this weight."""
-        return "vision" in name and not self.has_vision
+        return not self.has_vision and (
+            "vision" in name or "multi_modal_projector" in name
+        )

     def _transform_weight_name(self, name: str) -> str:
         """Transform weight name by adding language_model prefix if needed."""
sglang/srt/models/qwen2_moe.py CHANGED
@@ -43,10 +43,6 @@ from sglang.srt.layers.communicator import (
     ScatterMode,
 )
 from sglang.srt.layers.dp_attention import (
-    attn_tp_all_gather,
-    attn_tp_reduce_scatter,
-    dp_gather_partial,
-    dp_scatter,
     get_attention_tp_rank,
     get_attention_tp_size,
     get_local_attention_dp_size,
sglang/srt/models/qwen3_moe.py CHANGED
@@ -38,10 +38,6 @@ from sglang.srt.eplb.expert_location_dispatch import ExpertLocationDispatchInfo
 from sglang.srt.layers.activation import SiluAndMul
 from sglang.srt.layers.communicator import LayerCommunicator, LayerScatterModes
 from sglang.srt.layers.dp_attention import (
-    attn_tp_all_gather,
-    attn_tp_reduce_scatter,
-    dp_gather_partial,
-    dp_scatter,
     get_attention_tp_rank,
     get_attention_tp_size,
     get_local_attention_dp_size,
@@ -193,8 +189,7 @@ class Qwen3MoeSparseMoeBlock(nn.Module):
     def forward_deepep(
         self, hidden_states: torch.Tensor, forward_batch: ForwardBatch
     ) -> torch.Tensor:
-        forward_mode = forward_batch.forward_mode
-        if is_non_idle_and_non_empty(forward_mode, hidden_states):
+        if hidden_states.shape[0] > 0:
             # router_logits: (num_tokens, n_experts)
             router_logits, _ = self.gate(hidden_states)
             topk_weights, topk_idx, _ = self.topk(
sglang/srt/reasoning_parser.py CHANGED
@@ -118,6 +118,14 @@ class DeepSeekR1Detector(BaseReasoningFormatDetector):
     Returns all the text before the </think> tag as `reasoning_text`
     and the rest of the text as `normal_text`.

+    Supported models:
+    - DeepSeek-R1: Always generates thinking content without <think> start tag
+    - DeepSeek-R1-0528: Generates thinking content with <think> start tag
+
+    Format patterns:
+    - DeepSeek-R1: "I need to think about this...</think>The answer is 42."
+    - DeepSeek-R1-0528: "<think>I need to think about this...</think>The answer is 42."
+
     Args:
         stream_reasoning (bool): If False, accumulates reasoning content until the end tag.
             If True, streams reasoning content as it arrives.
@@ -136,11 +144,20 @@ class DeepSeekR1Detector(BaseReasoningFormatDetector):

 class Qwen3Detector(BaseReasoningFormatDetector):
     """
-    Detector for Qwen3 model.
+    Detector for standard Qwen3 models (e.g., Qwen/Qwen3-235B-A22B).
     Assumes reasoning format:
       (<think>)*(.*)</think>
-    Returns all the text before the </think> tag as `reasoning_text`
-    and the rest of the text as `normal_text`.
+
+    Qwen3 models released before 07/2025 supports switching between thinking mode and normal
+    mode using `enable_thinking` parameter in the request parameter.
+    - enable_thinking=True: "<think>reasoning content</think>The answer is 42."
+    - enable_thinking=False: "The answer is 42." (no thinking tokens)
+
+    This detector handles both cases.
+
+    NOTE: Do NOT use this detector for Qwen3-Thinking models (e.g., Qwen3-Thinking-2507).
+    Those models always generate thinking content without <think> start tags.
+    Use "qwen3-thinking" parser type for those models instead.

     Args:
         stream_reasoning (bool): If False, accumulates reasoning content until the end tag.
@@ -148,7 +165,6 @@ class Qwen3Detector(BaseReasoningFormatDetector):
     """

     def __init__(self, stream_reasoning: bool = True):
-        # Qwen3 won't be in reasoning mode when user passes `enable_thinking=False`
         super().__init__(
             "<think>",
             "</think>",
@@ -157,6 +173,31 @@ class Qwen3Detector(BaseReasoningFormatDetector):
         )


+class Qwen3ThinkingDetector(BaseReasoningFormatDetector):
+    """
+    Detector for Qwen3-Thinking models (e.g., Qwen3-Thinking-2507).
+    Assumes reasoning format:
+      *(.*)</think>
+
+    These models always generate thinking content without <think> start tag.
+    They do not support the enable_thinking parameter and always think.
+
+    Format: "I need to think about this...</think>The answer is 42."
+
+    Args:
+        stream_reasoning (bool): If False, accumulates reasoning content until the end tag.
+            If True, streams reasoning content as it arrives.
+    """
+
+    def __init__(self, stream_reasoning: bool = True):
+        super().__init__(
+            "<think>",
+            "</think>",
+            force_reasoning=True,
+            stream_reasoning=stream_reasoning,
+        )
+
+
 class KimiDetector(BaseReasoningFormatDetector):
     """
     Detector for Kimi Thinking model.
@@ -189,6 +230,7 @@ class ReasoningParser:
     DetectorMap: Dict[str, Type[BaseReasoningFormatDetector]] = {
         "deepseek-r1": DeepSeekR1Detector,
         "qwen3": Qwen3Detector,
+        "qwen3-thinking": Qwen3ThinkingDetector,
         "kimi": KimiDetector,
     }

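The new "qwen3-thinking" entry maps to a detector constructed with force_reasoning=True, for models that emit reasoning without an opening <think> tag. A standalone toy splitter (not the library's detector) illustrating the difference between the two parser types:

```python
def split_reasoning(text: str, force_reasoning: bool) -> tuple[str, str]:
    """Toy splitter illustrating the two Qwen3 formats; not sglang's implementation.

    force_reasoning=False -> standard Qwen3: reasoning only if the text opens with <think>.
    force_reasoning=True  -> Qwen3-Thinking: everything before </think> is reasoning,
                             even without an opening <think> tag.
    """
    in_reasoning = force_reasoning or text.lstrip().startswith("<think>")
    if not in_reasoning:
        return "", text
    body = text.replace("<think>", "", 1)
    reasoning, _sep, normal = body.partition("</think>")
    return reasoning.strip(), normal.strip()


# Standard Qwen3 with enable_thinking=False: no thinking tokens at all.
print(split_reasoning("The answer is 42.", force_reasoning=False))
# ('', 'The answer is 42.')

# Qwen3-Thinking: no opening tag, but the prefix is still reasoning.
print(split_reasoning("I need to think...</think>The answer is 42.", force_reasoning=True))
# ('I need to think...', 'The answer is 42.')
```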
sglang/srt/sampling/sampling_batch_info.py CHANGED
@@ -322,6 +322,12 @@ class SamplingBatchInfo:
             # Set the flag to True if any of the two has custom logit processor
             self.has_custom_logit_processor = True

+        # Merge logit bias - note this has to come before the temperatures tensor update! Otherwise will cause crashes.
+        # See note below on len(self) and len(other).
+        self.logit_bias = merge_bias_tensor(
+            self.logit_bias, other.logit_bias, len(self), len(other), self.device, 0.0
+        )
+
         # Note: because the __len()__ operator is defined on the temperatures tensor,
         # please make sure any merge operation with len(self) or len(other) is done before
         # the merge operation of the temperatures tensor below.
@@ -340,11 +346,6 @@
         self.need_top_k_sampling |= other.need_top_k_sampling
         self.need_min_p_sampling |= other.need_min_p_sampling

-        # Merge logit bias
-        self.logit_bias = merge_bias_tensor(
-            self.logit_bias, other.logit_bias, len(self), len(other), self.device, 0.0
-        )
-

 def merge_bias_tensor(
     lhs: Optional[torch.Tensor],
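The merge was moved because SamplingBatchInfo.__len__ is defined on the temperatures tensor: once temperatures are concatenated, len(self) already reports the merged size, so the lengths handed to merge_bias_tensor would be wrong whenever one side has no logit bias. A toy reproduction of the ordering constraint; the padding semantics of merge_bias_tensor are simplified to 1-D here and are an assumption:

```python
import torch


def merge_bias_tensor(lhs, rhs, lhs_len, rhs_len, device, default):
    """Toy 1-D version: materialize a missing side with `default` so shapes line up."""
    if lhs is None and rhs is None:
        return None
    if lhs is None:
        lhs = torch.full((lhs_len,), default, device=device)
    if rhs is None:
        rhs = torch.full((rhs_len,), default, device=device)
    return torch.cat([lhs, rhs])


class ToyBatch:
    def __init__(self, temperatures, logit_bias=None, device="cpu"):
        self.temperatures = temperatures
        self.logit_bias = logit_bias
        self.device = device

    def __len__(self):
        # As in SamplingBatchInfo, the batch length is the temperatures length.
        return len(self.temperatures)

    def merge(self, other):
        # Correct order: read len(self)/len(other) BEFORE temperatures are merged.
        self.logit_bias = merge_bias_tensor(
            self.logit_bias, other.logit_bias, len(self), len(other), self.device, 0.0
        )
        self.temperatures = torch.cat([self.temperatures, other.temperatures])


a = ToyBatch(torch.ones(2))                                    # no logit bias on this side
b = ToyBatch(torch.ones(3), logit_bias=torch.full((3,), 0.5))
a.merge(b)
print(a.logit_bias.shape)  # torch.Size([5]); merging after the temperatures cat would
                           # pad the missing side to the merged length and yield 8
```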
sglang/srt/server_args.py CHANGED
@@ -20,10 +20,10 @@ import logging
 import os
 import random
 import tempfile
-from token import OP
 from typing import List, Literal, Optional, Union

 from sglang.srt.hf_transformers_utils import check_gguf_file, get_config
+from sglang.srt.lora.lora_registry import LoRARef
 from sglang.srt.reasoning_parser import ReasoningParser
 from sglang.srt.utils import (
     LORA_TARGET_ALL_MODULES,
@@ -80,7 +80,7 @@ class ServerArgs:
     schedule_policy: str = "fcfs"
     schedule_conservativeness: float = 1.0
     cpu_offload_gb: int = 0
-    page_size: int = 1
+    page_size: Optional[int] = None
     hybrid_kvcache_ratio: Optional[float] = None
     swa_full_tokens_ratio: float = 0.8
     disable_hybrid_swa_memory: bool = False
@@ -145,7 +145,7 @@ class ServerArgs:
     enable_lora: Optional[bool] = None
     max_lora_rank: Optional[int] = None
     lora_target_modules: Optional[Union[set[str], List[str]]] = None
-    lora_paths: Optional[Union[dict[str, str], List[str]]] = None
+    lora_paths: Optional[Union[dict[str, str], dict[str, LoRARef], List[str]]] = None
     max_loras_per_batch: int = 8
     lora_backend: str = "triton"

@@ -266,31 +266,20 @@ class ServerArgs:

     def __post_init__(self):
         # Expert parallelism
+        # We put it here first due to some internal ckpt conversation issues.
         if self.enable_ep_moe:
             self.ep_size = self.tp_size
             logger.warning(
                 f"EP MoE is enabled. The expert parallel size is adjusted to be the same as the tensor parallel size[{self.tp_size}]."
             )
-        if self.enable_flashinfer_moe:
-            assert (
-                self.quantization == "modelopt_fp4"
-            ), "modelopt_fp4 quantization is required for Flashinfer MOE"
-            os.environ["TRTLLM_ENABLE_PDL"] = "1"
-            self.disable_shared_experts_fusion = True
-            logger.warning(
-                f"Flashinfer MoE is enabled. Shared expert fusion is disabled."
-            )

         # Set missing default values
         if self.tokenizer_path is None:
             self.tokenizer_path = self.model_path
-
-        if self.device is None:
-            self.device = get_device()
-
         if self.served_model_name is None:
             self.served_model_name = self.model_path
-
+        if self.device is None:
+            self.device = get_device()
         if self.random_seed is None:
             self.random_seed = random.randint(0, 1 << 30)

@@ -359,7 +348,6 @@ class ServerArgs:
             self.chunked_prefill_size = 16384
         else:
             self.chunked_prefill_size = 4096
-        assert self.chunked_prefill_size % self.page_size == 0

         # Set cuda graph max batch size
         if self.cuda_graph_max_bs is None:
@@ -410,6 +398,14 @@ class ServerArgs:
             )
             self.page_size = 128

+        # Set page size
+        if self.page_size is None:
+            self.page_size = 1
+
+        # AMD-specific Triton attention KV splits default number
+        if is_hip():
+            self.triton_attention_num_kv_splits = 16
+
         # Choose grammar backend
         if self.grammar_backend is None:
             self.grammar_backend = "xgrammar"
@@ -431,6 +427,13 @@ class ServerArgs:
                 self.enable_dp_attention
             ), "Please enable dp attention when setting enable_dp_lm_head. "

+        # MoE kernel
+        if self.enable_flashinfer_moe:
+            assert (
+                self.quantization == "modelopt_fp4"
+            ), "modelopt_fp4 quantization is required for Flashinfer MOE"
+            os.environ["TRTLLM_ENABLE_PDL"] = "1"
+
         # DeepEP MoE
         if self.enable_deepep_moe:
             if self.deepep_mode == "normal":
@@ -502,14 +505,6 @@ class ServerArgs:
                 logger.warning(
                     "DeepSeek MTP does not require setting speculative_draft_model_path."
                 )
-            elif "Llama4" in model_arch:
-                # TODO: remove this after Llama4 supports in other backends
-                if self.attention_backend != "fa3":
-                    self.attention_backend = "fa3"
-                    logger.warning(
-                        "Llama4 requires using fa3 attention backend. "
-                        "Attention backend is automatically set to fa3."
-                    )

             # Auto choose parameters
             if self.speculative_num_steps is None:
@@ -542,12 +537,11 @@ class ServerArgs:
         ) and check_gguf_file(self.model_path):
             self.quantization = self.load_format = "gguf"

+        # Model loading
         if is_remote_url(self.model_path):
             self.load_format = "remote"
-
-        # AMD-specific Triton attention KV splits default number
-        if is_hip():
-            self.triton_attention_num_kv_splits = 16
+        if self.custom_weight_loader is None:
+            self.custom_weight_loader = []

         # PD disaggregation
         if self.disaggregation_mode == "decode":
@@ -572,6 +566,7 @@ class ServerArgs:
             self.disable_cuda_graph = True
             logger.warning("Cuda graph is disabled for prefill server")

+        # Propagate env vars
         os.environ["SGLANG_ENABLE_TORCH_COMPILE"] = (
             "1" if self.enable_torch_compile else "0"
         )
@@ -580,9 +575,6 @@ class ServerArgs:
             "1" if self.disable_outlines_disk_cache else "0"
         )

-        if self.custom_weight_loader is None:
-            self.custom_weight_loader = []
-
     @staticmethod
     def add_cli_args(parser: argparse.ArgumentParser):
         # Model and tokenizer
@@ -1099,10 +1091,10 @@ class ServerArgs:
                 "deepseekv3",
                 "pythonic",
                 "kimi_k2",
-                "qwen3",
+                "qwen3_coder",
             ],
             default=ServerArgs.tool_call_parser,
-            help="Specify the parser for handling tool-call interactions. Options include: 'qwen25', 'mistral', 'llama3', 'deepseekv3', 'pythonic', and 'kimi_k2'.",
+            help="Specify the parser for handling tool-call interactions. Options include: 'qwen25', 'mistral', 'llama3', 'deepseekv3', 'pythonic', 'kimi_k2', and 'qwen3_coder'.",
         )

         # Data parallelism
@@ -1227,6 +1219,13 @@ class ServerArgs:
             default=ServerArgs.grammar_backend,
             help="Choose the backend for grammar-guided decoding.",
         )
+        parser.add_argument(
+            "--mm-attention-backend",
+            type=str,
+            choices=["sdpa", "fa3", "triton_attn"],
+            default=ServerArgs.mm_attention_backend,
+            help="Set multimodal attention backend.",
+        )

         # Speculative decoding
         parser.add_argument(
@@ -1276,13 +1275,6 @@ class ServerArgs:
             help="The path of the draft model's small vocab table.",
             default=ServerArgs.speculative_token_map,
         )
-        parser.add_argument(
-            "--mm-attention-backend",
-            type=str,
-            choices=["sdpa", "fa3", "triton_attn"],
-            default=ServerArgs.mm_attention_backend,
-            help="Set multimodal attention backend.",
-        )

         # Expert parallelism
         parser.add_argument(
@@ -1530,11 +1522,6 @@ class ServerArgs:
             action="store_true",
             help="Disable the overlap scheduler, which overlaps the CPU scheduler with GPU model worker.",
         )
-        parser.add_argument(
-            "--disable-overlap-cg-plan",
-            action="store_true",
-            help="Disable the overlap optimization for cudagraph preparation in eagle verify.",
-        )
         parser.add_argument(
             "--enable-mixed-chunk",
             action="store_true",
@@ -1792,11 +1779,11 @@ class ServerArgs:
         return hf_config

     def check_server_args(self):
+        # Check parallel size constraints
         assert (
             self.tp_size * self.pp_size
         ) % self.nnodes == 0, "tp_size must be divisible by number of nodes"

-        # FIXME pp constraints
         if self.pp_size > 1:
             assert (
                 self.disable_overlap_schedule
@@ -1807,11 +1794,7 @@ class ServerArgs:
         assert not (
             self.dp_size > 1 and self.nnodes != 1 and not self.enable_dp_attention
         ), "multi-node data parallel is not supported unless dp attention!"
-        assert (
-            self.max_loras_per_batch > 0
-            # FIXME
-            and (self.lora_paths is None or self.disable_radix_cache)
-        ), "compatibility of lora and radix attention is in progress"
+
         assert self.base_gpu_id >= 0, "base_gpu_id must be non-negative"
         assert self.gpu_id_step >= 1, "gpu_id_step must be positive"

@@ -1820,9 +1803,32 @@ class ServerArgs:
             None,
         }, "moe_dense_tp_size only support 1 and None currently"

+        # Check model architecture
+        model_arch = self.get_hf_config().architectures[0]
+        if "Llama4" in model_arch:
+            assert self.attention_backend == "fa3", "fa3 is required for Llama4 model"
+
+        # Check LoRA
         self.check_lora_server_args()

+        # Check speculative decoding
+        if self.speculative_algorithm is not None:
+            assert (
+                not self.enable_mixed_chunk
+            ), "enable_mixed_chunk is required for speculative decoding"
+
+        # Check chunked prefill
+        assert (
+            self.chunked_prefill_size % self.page_size == 0
+        ), "chunked_prefill_size must be divisible by page_size"
+
     def check_lora_server_args(self):
+        assert (
+            self.max_loras_per_batch > 0
+            # FIXME
+            and (self.lora_paths is None or self.disable_radix_cache)
+        ), "compatibility of lora and radix attention is in progress"
+
         # Enable LoRA if any LoRA paths are provided for backward compatibility.
         if self.lora_paths:
             if self.enable_lora is None:
@@ -1843,9 +1849,24 @@ class ServerArgs:
             for lora_path in lora_paths:
                 if "=" in lora_path:
                     name, path = lora_path.split("=", 1)
-                    self.lora_paths[name] = path
+                    self.lora_paths[name] = LoRARef(lora_name=name, lora_path=path)
                 else:
-                    self.lora_paths[lora_path] = lora_path
+                    self.lora_paths[lora_path] = LoRARef(
+                        lora_name=lora_path,
+                        lora_path=lora_path,
+                    )
+        elif isinstance(self.lora_paths, dict):
+            self.lora_paths = {
+                k: LoRARef(lora_name=k, lora_path=v)
+                for k, v in self.lora_paths.items()
+            }
+        elif self.lora_paths is None:
+            self.lora_paths = {}
+        else:
+            raise ValueError(
+                f"Invalid type for --lora-paths: {type(self.lora_paths)}. "
+                "Expected a list or a dictionary."
+            )

         # Expand target modules
         if self.lora_target_modules:
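check_lora_server_args now normalizes every accepted --lora-paths form (a list of "name=path" or bare paths, a plain dict, or None) into a dict keyed by adapter name with LoRARef values. A small sketch mirroring that normalization outside the ServerArgs class, using a hypothetical two-field LoRARef:

```python
from dataclasses import dataclass


@dataclass
class LoRARef:
    # Hypothetical two-field stand-in; the real class lives in sglang.srt.lora.lora_registry.
    lora_name: str
    lora_path: str


def normalize_lora_paths(lora_paths):
    """Sketch of the normalization done in check_lora_server_args (not the real code)."""
    if isinstance(lora_paths, list):
        out = {}
        for item in lora_paths:
            if "=" in item:
                name, path = item.split("=", 1)
                out[name] = LoRARef(lora_name=name, lora_path=path)
            else:
                out[item] = LoRARef(lora_name=item, lora_path=item)
        return out
    if isinstance(lora_paths, dict):
        return {k: LoRARef(lora_name=k, lora_path=v) for k, v in lora_paths.items()}
    if lora_paths is None:
        return {}
    raise ValueError(
        f"Invalid type for --lora-paths: {type(lora_paths)}. Expected a list or a dictionary."
    )


print(normalize_lora_paths(["sql=/adapters/sql", "/adapters/chat"]))  # CLI list forms
print(normalize_lora_paths({"sql": "/adapters/sql"}))                 # dict form
print(normalize_lora_paths(None))                                     # {}
```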