sglang 0.4.6.post3__py3-none-any.whl → 0.4.6.post5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (180)
  1. sglang/bench_offline_throughput.py +10 -8
  2. sglang/bench_one_batch.py +7 -6
  3. sglang/bench_one_batch_server.py +157 -21
  4. sglang/bench_serving.py +137 -59
  5. sglang/compile_deep_gemm.py +5 -5
  6. sglang/eval/loogle_eval.py +157 -0
  7. sglang/lang/chat_template.py +78 -78
  8. sglang/lang/tracer.py +1 -1
  9. sglang/srt/code_completion_parser.py +1 -1
  10. sglang/srt/configs/deepseekvl2.py +2 -2
  11. sglang/srt/configs/model_config.py +40 -28
  12. sglang/srt/constrained/base_grammar_backend.py +55 -72
  13. sglang/srt/constrained/llguidance_backend.py +25 -21
  14. sglang/srt/constrained/outlines_backend.py +27 -26
  15. sglang/srt/constrained/reasoner_grammar_backend.py +22 -33
  16. sglang/srt/constrained/xgrammar_backend.py +69 -43
  17. sglang/srt/conversation.py +49 -44
  18. sglang/srt/disaggregation/base/conn.py +1 -0
  19. sglang/srt/disaggregation/decode.py +129 -135
  20. sglang/srt/disaggregation/decode_schedule_batch_mixin.py +142 -0
  21. sglang/srt/disaggregation/fake/conn.py +3 -13
  22. sglang/srt/disaggregation/kv_events.py +357 -0
  23. sglang/srt/disaggregation/mini_lb.py +57 -24
  24. sglang/srt/disaggregation/mooncake/conn.py +238 -122
  25. sglang/srt/disaggregation/mooncake/transfer_engine.py +2 -1
  26. sglang/srt/disaggregation/nixl/conn.py +10 -19
  27. sglang/srt/disaggregation/prefill.py +132 -47
  28. sglang/srt/disaggregation/utils.py +123 -6
  29. sglang/srt/distributed/utils.py +3 -3
  30. sglang/srt/entrypoints/EngineBase.py +5 -0
  31. sglang/srt/entrypoints/engine.py +44 -9
  32. sglang/srt/entrypoints/http_server.py +23 -6
  33. sglang/srt/entrypoints/http_server_engine.py +5 -2
  34. sglang/srt/function_call/base_format_detector.py +250 -0
  35. sglang/srt/function_call/core_types.py +34 -0
  36. sglang/srt/function_call/deepseekv3_detector.py +157 -0
  37. sglang/srt/function_call/ebnf_composer.py +234 -0
  38. sglang/srt/function_call/function_call_parser.py +175 -0
  39. sglang/srt/function_call/llama32_detector.py +74 -0
  40. sglang/srt/function_call/mistral_detector.py +84 -0
  41. sglang/srt/function_call/pythonic_detector.py +163 -0
  42. sglang/srt/function_call/qwen25_detector.py +67 -0
  43. sglang/srt/function_call/utils.py +35 -0
  44. sglang/srt/hf_transformers_utils.py +46 -7
  45. sglang/srt/layers/attention/aiter_backend.py +513 -0
  46. sglang/srt/layers/attention/flashattention_backend.py +64 -18
  47. sglang/srt/layers/attention/flashinfer_mla_backend.py +8 -4
  48. sglang/srt/layers/attention/flashmla_backend.py +340 -78
  49. sglang/srt/layers/attention/triton_backend.py +3 -0
  50. sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +1 -1
  51. sglang/srt/layers/attention/utils.py +6 -4
  52. sglang/srt/layers/attention/vision.py +1 -1
  53. sglang/srt/layers/communicator.py +451 -0
  54. sglang/srt/layers/dp_attention.py +61 -21
  55. sglang/srt/layers/layernorm.py +1 -1
  56. sglang/srt/layers/logits_processor.py +46 -11
  57. sglang/srt/layers/moe/cutlass_moe.py +207 -0
  58. sglang/srt/layers/moe/ep_moe/kernels.py +34 -12
  59. sglang/srt/layers/moe/ep_moe/layer.py +105 -51
  60. sglang/srt/layers/moe/ep_moe/token_dispatcher.py +82 -7
  61. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +1 -1
  62. sglang/srt/layers/moe/fused_moe_triton/layer.py +14 -0
  63. sglang/srt/layers/moe/topk.py +67 -10
  64. sglang/srt/layers/multimodal.py +70 -0
  65. sglang/srt/layers/quantization/__init__.py +8 -3
  66. sglang/srt/layers/quantization/blockwise_int8.py +2 -2
  67. sglang/srt/layers/quantization/deep_gemm.py +77 -74
  68. sglang/srt/layers/quantization/fp8.py +92 -2
  69. sglang/srt/layers/quantization/fp8_kernel.py +3 -3
  70. sglang/srt/layers/quantization/fp8_utils.py +6 -0
  71. sglang/srt/layers/quantization/gptq.py +298 -6
  72. sglang/srt/layers/quantization/int8_kernel.py +20 -7
  73. sglang/srt/layers/quantization/qoq.py +244 -0
  74. sglang/srt/layers/sampler.py +0 -4
  75. sglang/srt/layers/vocab_parallel_embedding.py +18 -7
  76. sglang/srt/lora/lora_manager.py +2 -4
  77. sglang/srt/lora/mem_pool.py +4 -4
  78. sglang/srt/lora/triton_ops/gate_up_lora_b.py +1 -1
  79. sglang/srt/lora/triton_ops/qkv_lora_b.py +1 -1
  80. sglang/srt/lora/triton_ops/sgemm_lora_a.py +1 -1
  81. sglang/srt/lora/triton_ops/sgemm_lora_b.py +1 -1
  82. sglang/srt/lora/utils.py +1 -1
  83. sglang/srt/managers/data_parallel_controller.py +3 -3
  84. sglang/srt/managers/deepseek_eplb.py +278 -0
  85. sglang/srt/managers/detokenizer_manager.py +21 -8
  86. sglang/srt/managers/eplb_manager.py +55 -0
  87. sglang/srt/managers/expert_distribution.py +704 -56
  88. sglang/srt/managers/expert_location.py +394 -0
  89. sglang/srt/managers/expert_location_dispatch.py +91 -0
  90. sglang/srt/managers/io_struct.py +19 -4
  91. sglang/srt/managers/mm_utils.py +294 -140
  92. sglang/srt/managers/multimodal_processors/base_processor.py +127 -42
  93. sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +6 -1
  94. sglang/srt/managers/multimodal_processors/gemma3.py +31 -6
  95. sglang/srt/managers/multimodal_processors/internvl.py +14 -5
  96. sglang/srt/managers/multimodal_processors/janus_pro.py +7 -1
  97. sglang/srt/managers/multimodal_processors/kimi_vl.py +7 -6
  98. sglang/srt/managers/multimodal_processors/llava.py +46 -0
  99. sglang/srt/managers/multimodal_processors/minicpm.py +25 -31
  100. sglang/srt/managers/multimodal_processors/mllama4.py +6 -0
  101. sglang/srt/managers/multimodal_processors/pixtral.py +127 -0
  102. sglang/srt/managers/multimodal_processors/qwen_vl.py +58 -16
  103. sglang/srt/managers/schedule_batch.py +122 -42
  104. sglang/srt/managers/schedule_policy.py +1 -5
  105. sglang/srt/managers/scheduler.py +205 -138
  106. sglang/srt/managers/scheduler_output_processor_mixin.py +124 -55
  107. sglang/srt/managers/session_controller.py +1 -1
  108. sglang/srt/managers/tokenizer_manager.py +232 -58
  109. sglang/srt/managers/tp_worker.py +12 -9
  110. sglang/srt/managers/tp_worker_overlap_thread.py +22 -11
  111. sglang/srt/mem_cache/base_prefix_cache.py +3 -0
  112. sglang/srt/mem_cache/chunk_cache.py +3 -1
  113. sglang/srt/mem_cache/hiradix_cache.py +4 -4
  114. sglang/srt/mem_cache/memory_pool.py +76 -52
  115. sglang/srt/mem_cache/multimodal_cache.py +45 -0
  116. sglang/srt/mem_cache/radix_cache.py +58 -5
  117. sglang/srt/metrics/collector.py +314 -39
  118. sglang/srt/mm_utils.py +10 -0
  119. sglang/srt/model_executor/cuda_graph_runner.py +29 -19
  120. sglang/srt/model_executor/expert_location_updater.py +422 -0
  121. sglang/srt/model_executor/forward_batch_info.py +5 -1
  122. sglang/srt/model_executor/model_runner.py +163 -68
  123. sglang/srt/model_loader/loader.py +10 -6
  124. sglang/srt/models/clip.py +5 -1
  125. sglang/srt/models/deepseek_janus_pro.py +2 -2
  126. sglang/srt/models/deepseek_v2.py +308 -351
  127. sglang/srt/models/exaone.py +8 -3
  128. sglang/srt/models/gemma3_mm.py +70 -33
  129. sglang/srt/models/llama.py +2 -0
  130. sglang/srt/models/llama4.py +15 -8
  131. sglang/srt/models/llava.py +258 -7
  132. sglang/srt/models/mimo_mtp.py +220 -0
  133. sglang/srt/models/minicpmo.py +5 -12
  134. sglang/srt/models/mistral.py +71 -1
  135. sglang/srt/models/mixtral.py +98 -34
  136. sglang/srt/models/mllama.py +3 -3
  137. sglang/srt/models/pixtral.py +467 -0
  138. sglang/srt/models/qwen2.py +95 -26
  139. sglang/srt/models/qwen2_5_vl.py +8 -0
  140. sglang/srt/models/qwen2_moe.py +330 -60
  141. sglang/srt/models/qwen2_vl.py +6 -0
  142. sglang/srt/models/qwen3.py +52 -10
  143. sglang/srt/models/qwen3_moe.py +411 -48
  144. sglang/srt/models/roberta.py +1 -1
  145. sglang/srt/models/siglip.py +294 -0
  146. sglang/srt/models/torch_native_llama.py +1 -1
  147. sglang/srt/openai_api/adapter.py +58 -20
  148. sglang/srt/openai_api/protocol.py +6 -8
  149. sglang/srt/operations.py +154 -0
  150. sglang/srt/operations_strategy.py +31 -0
  151. sglang/srt/reasoning_parser.py +3 -3
  152. sglang/srt/sampling/custom_logit_processor.py +18 -3
  153. sglang/srt/sampling/sampling_batch_info.py +4 -56
  154. sglang/srt/sampling/sampling_params.py +2 -2
  155. sglang/srt/server_args.py +162 -22
  156. sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +3 -3
  157. sglang/srt/speculative/eagle_utils.py +138 -7
  158. sglang/srt/speculative/eagle_worker.py +69 -21
  159. sglang/srt/utils.py +74 -17
  160. sglang/test/few_shot_gsm8k.py +2 -2
  161. sglang/test/few_shot_gsm8k_engine.py +2 -2
  162. sglang/test/run_eval.py +2 -2
  163. sglang/test/runners.py +8 -1
  164. sglang/test/send_one.py +13 -3
  165. sglang/test/simple_eval_common.py +1 -1
  166. sglang/test/simple_eval_humaneval.py +1 -1
  167. sglang/test/test_cutlass_moe.py +278 -0
  168. sglang/test/test_programs.py +5 -5
  169. sglang/test/test_utils.py +55 -14
  170. sglang/utils.py +3 -3
  171. sglang/version.py +1 -1
  172. {sglang-0.4.6.post3.dist-info → sglang-0.4.6.post5.dist-info}/METADATA +23 -13
  173. {sglang-0.4.6.post3.dist-info → sglang-0.4.6.post5.dist-info}/RECORD +178 -149
  174. {sglang-0.4.6.post3.dist-info → sglang-0.4.6.post5.dist-info}/WHEEL +1 -1
  175. sglang/srt/function_call_parser.py +0 -858
  176. sglang/srt/platforms/interface.py +0 -371
  177. /sglang/{llama3_eval.py → eval/llama3_eval.py} +0 -0
  178. /sglang/srt/models/{xiaomi_mimo.py → mimo.py} +0 -0
  179. {sglang-0.4.6.post3.dist-info → sglang-0.4.6.post5.dist-info}/licenses/LICENSE +0 -0
  180. {sglang-0.4.6.post3.dist-info → sglang-0.4.6.post5.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,31 @@
+import torch
+
+
+def compute_layer_operations(
+    layer: torch.nn.Module,
+):
+    if not layer.is_layer_sparse:
+        return [
+            layer.op_comm_prepare_attn,
+            layer.op_attn,
+            layer.op_comm_prepare_mlp,
+            layer.op_mlp,
+            layer.op_comm_postprocess_layer,
+        ]
+
+    # Will add TBO operation orders here
+    return [
+        layer.op_comm_prepare_attn,
+        layer.op_attn,
+        layer.op_comm_prepare_mlp,
+        layer.mlp.op_gate,
+        layer.mlp.op_shared_experts,
+        layer.mlp.op_select_experts,
+        layer.mlp.op_dispatch_a,
+        layer.mlp.op_dispatch_b,
+        layer.mlp.op_experts,
+        layer.mlp.op_combine_a,
+        layer.mlp.op_combine_b,
+        layer.mlp.op_output,
+        layer.op_comm_postprocess_layer,
+    ]
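The new helper above only assembles an ordered list of bound op_* methods; running a layer then amounts to calling those operations in sequence, which is what the "Will add TBO operation orders here" comment appears to be preparing for (interleaving the operation lists of two micro-batches). A minimal, self-contained sketch of that pattern, assuming a hypothetical DenseLayer with no-op stages (none of these names are sglang APIs):

from typing import Callable, Dict, List


class DenseLayer:
    """Hypothetical stand-in for a transformer layer exposing op_* callables."""

    is_layer_sparse = False

    def op_comm_prepare_attn(self, state: Dict) -> None:
        state.setdefault("trace", []).append("comm_prepare_attn")

    def op_attn(self, state: Dict) -> None:
        state["trace"].append("attn")

    def op_comm_prepare_mlp(self, state: Dict) -> None:
        state["trace"].append("comm_prepare_mlp")

    def op_mlp(self, state: Dict) -> None:
        state["trace"].append("mlp")

    def op_comm_postprocess_layer(self, state: Dict) -> None:
        state["trace"].append("comm_postprocess_layer")


def dense_layer_operations(layer: DenseLayer) -> List[Callable[[Dict], None]]:
    # Mirrors the non-sparse branch of compute_layer_operations above.
    return [
        layer.op_comm_prepare_attn,
        layer.op_attn,
        layer.op_comm_prepare_mlp,
        layer.op_mlp,
        layer.op_comm_postprocess_layer,
    ]


if __name__ == "__main__":
    state: Dict = {}
    for op in dense_layer_operations(DenseLayer()):
        op(state)  # a scheduler could interleave ops from two micro-batches here
    print(state["trace"])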
@@ -32,7 +32,7 @@ class BaseReasoningFormatDetector:
         One-time parsing: Detects and parses reasoning sections in the provided text.
         Returns both reasoning content and normal text separately.
         """
-        text = text.replace(self.think_start_token, "")
+        text = text.replace(self.think_start_token, "").strip()
         if self.think_end_token not in text:
             # Assume reasoning was truncated before `</think>` token
             return StreamingParseResult(reasoning_text=text)
@@ -73,7 +73,7 @@ class BaseReasoningFormatDetector:
             normal_text = current_text[end_idx + len(self.think_end_token) :]
 
             return StreamingParseResult(
-                normal_text=normal_text, reasoning_text=reasoning_text
+                normal_text=normal_text, reasoning_text=reasoning_text.rstrip()
             )
 
         # Continue with reasoning content
@@ -147,7 +147,7 @@ class ReasoningParser:
 
         Args:
             model_type (str): Type of model to parse reasoning from
-            stream_reasoning (bool): If Flase, accumulates reasoning content until complete.
+            stream_reasoning (bool): If False, accumulates reasoning content until complete.
                 If True, streams reasoning content as it arrives.
         """
 
@@ -28,11 +28,26 @@ class CustomLogitProcessor(ABC):
         """Define the callable behavior."""
         raise NotImplementedError
 
-    def to_str(self) -> str:
+    @classmethod
+    def to_str(cls) -> str:
         """Serialize the callable function to a JSON-compatible string."""
-        return json.dumps({"callable": dill.dumps(self).hex()})
+        return json.dumps({"callable": dill.dumps(cls).hex()})
 
     @classmethod
     def from_str(cls, json_str: str):
         """Deserialize a callable function from a JSON string."""
-        return _cache_from_str(json_str)
+        return _cache_from_str(json_str)()
+
+
+class DisallowedTokensLogitsProcessor(CustomLogitProcessor):
+    def __call__(
+        self,
+        logits: torch.Tensor,
+        custom_param_list: Optional[List[Dict[str, Any]]] = None,
+    ) -> torch.Tensor:
+        disallowed_token_ids = custom_param_list[0]["token_ids"]
+        assert all(
+            disallowed_token_ids == c["token_ids"] for c in custom_param_list
+        ), f"{custom_param_list=}"
+        logits[..., disallowed_token_ids] = -float("inf")
+        return logits
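The new DisallowedTokensLogitsProcessor above simply writes -inf into the banned vocabulary columns before sampling, so those tokens receive zero probability. A small, self-contained illustration of that masking step (toy vocabulary and values; not the sglang invocation path, which supplies the token ids via custom_param_list):

import torch

# Toy batch of logits: 2 requests, vocabulary of 6 tokens.
logits = torch.tensor(
    [
        [2.0, 1.0, 0.5, 0.0, -1.0, 3.0],
        [0.1, 0.2, 0.3, 0.4, 0.5, 0.6],
    ]
)
disallowed_token_ids = [0, 5]

# Same masking as DisallowedTokensLogitsProcessor.__call__ above.
logits[..., disallowed_token_ids] = -float("inf")

probs = torch.softmax(logits, dim=-1)
assert torch.all(probs[:, disallowed_token_ids] == 0)  # banned tokens can never be sampled
print(probs)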
@@ -30,13 +30,8 @@ class SamplingBatchInfo:
     # Whether any request needs min_p sampling
     need_min_p_sampling: bool
 
-    # Use thinking_budget to truncate thinking
-    num_thinking_tokens: Optional[torch.Tensor] = None
-    think_end_ids: Optional[torch.Tensor] = None
-    thinking_budgets: Optional[torch.Tensor] = None
-
     # Masking tensors for grammar-guided structured outputs
-    vocab_size: int = 0
+    vocab_size: int
     grammars: Optional[List] = None
     vocab_mask: Optional[torch.Tensor] = None
     apply_mask_func: Optional[Callable[[torch.Tensor, torch.Tensor], None]] = None
@@ -81,22 +76,7 @@ class SamplingBatchInfo:
         min_ps = torch.tensor(
             [r.sampling_params.min_p for r in reqs], dtype=torch.float
         ).to(device, non_blocking=True)
-        if any(hasattr(r.tokenizer, "think_end_id") for r in reqs):
-            think_end_ids = torch.tensor(
-                [getattr(r.tokenizer, "think_end_id", -1) for r in reqs],
-                dtype=torch.int64,
-            ).to(device, non_blocking=True)
-            num_thinking_tokens = torch.tensor([0 for _ in reqs], dtype=torch.int64).to(
-                device, non_blocking=True
-            )
-            thinking_budgets = torch.tensor(
-                [r.sampling_params.thinking_budget or -1 for r in reqs],
-                dtype=torch.int64,
-            ).to(device, non_blocking=True)
-        else:
-            think_end_ids = None
-            num_thinking_tokens = None
-            thinking_budgets = None
+
         # Check if any request has custom logit processor
         has_custom_logit_processor = (
             batch.enable_custom_logit_processor  # check the flag first.
@@ -152,9 +132,6 @@ class SamplingBatchInfo:
             top_ps=top_ps,
             top_ks=top_ks,
             min_ps=min_ps,
-            think_end_ids=think_end_ids,
-            num_thinking_tokens=num_thinking_tokens,
-            thinking_budgets=thinking_budgets,
             is_all_greedy=all(r.sampling_params.top_k <= 1 for r in reqs),
             need_min_p_sampling=any(r.sampling_params.min_p > 0 for r in reqs),
             vocab_size=vocab_size,
@@ -169,35 +146,6 @@ class SamplingBatchInfo:
     def __len__(self):
         return len(self.temperatures)
 
-    def apply_thinking_budgets(self, next_token_logits: torch.Tensor):
-        has_budget = self.thinking_budgets > 0
-        if not has_budget.any():
-            return
-        torch.where(
-            has_budget,
-            self.num_thinking_tokens + 1,
-            self.num_thinking_tokens,
-            out=self.num_thinking_tokens,
-        )
-        should_stop = has_budget & (
-            self.num_thinking_tokens - 1 > self.thinking_budgets
-        )
-        next_token_logits.masked_fill_(should_stop.unsqueeze(0), float("-inf"))
-        batch_indices = torch.nonzero(should_stop, as_tuple=True)[0]
-        if len(batch_indices) > 0:
-            end_token_indices = self.think_end_ids[batch_indices]
-            next_token_logits[batch_indices, end_token_indices] = 0.0
-
-    def update_thinking_budgets(self, next_token_ids: torch.Tensor):
-        if not torch.any(self.thinking_budgets > 0):
-            return
-        torch.where(
-            next_token_ids == self.think_end_ids,
-            torch.tensor(-1, device=self.thinking_budgets.device),
-            self.thinking_budgets,
-            out=self.thinking_budgets,
-        )
-
     def update_regex_vocab_mask(self):
         if not self.grammars:
             self.vocab_mask = None
@@ -346,7 +294,7 @@ class SamplingBatchInfo:
         # Set the flag to True if any of the two has custom logit processor
         self.has_custom_logit_processor = True
 
-        # Note: becasue the __len()__ operator is defined on the temperatures tensor,
+        # Note: because the __len()__ operator is defined on the temperatures tensor,
         # please make sure any merge operation with len(self) or len(other) is done before
         # the merge operation of the temperatures tensor below.
         for item in [
@@ -359,5 +307,5 @@ class SamplingBatchInfo:
             other_val = getattr(other, item, None)
             setattr(self, item, torch.cat([self_val, other_val]))
 
-        self.is_all_greedy |= other.is_all_greedy
+        self.is_all_greedy &= other.is_all_greedy
         self.need_min_p_sampling |= other.need_min_p_sampling
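The |= to &= change in the merge above matters because is_all_greedy describes the whole merged batch: it should stay True only if every request in both batches is greedy. A one-line illustration of the difference:

a_all_greedy, b_all_greedy = True, False          # batch A is all-greedy, batch B is not
assert (a_all_greedy or b_all_greedy) is True     # old |= behavior: wrongly stays True
assert (a_all_greedy and b_all_greedy) is False   # new &= behavior: merged batch is not all-greedy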
@@ -30,7 +30,6 @@ class SamplingParams:
     def __init__(
         self,
         max_new_tokens: int = 128,
-        thinking_budget: Optional[int] = None,
        stop: Optional[Union[str, List[str]]] = None,
        stop_token_ids: Optional[List[int]] = None,
        temperature: float = 1.0,
@@ -51,6 +50,7 @@ class SamplingParams:
        spaces_between_special_tokens: bool = True,
        no_stop_trim: bool = False,
        custom_params: Optional[Dict[str, Any]] = None,
+        stream_interval: Optional[int] = None,
     ) -> None:
         self.max_new_tokens = max_new_tokens
         self.stop_strs = stop
@@ -58,7 +58,6 @@ class SamplingParams:
             self.stop_token_ids = set(stop_token_ids)
         else:
             self.stop_token_ids = None
-        self.thinking_budget = thinking_budget
         self.temperature = temperature
         self.top_p = top_p
         self.top_k = top_k
@@ -77,6 +76,7 @@ class SamplingParams:
         self.spaces_between_special_tokens = spaces_between_special_tokens
         self.no_stop_trim = no_stop_trim
         self.custom_params = custom_params
+        self.stream_interval = stream_interval
 
         # Process some special cases
         if 0 <= self.temperature < _SAMPLING_EPS:
sglang/srt/server_args.py CHANGED
@@ -46,7 +46,6 @@ class ServerArgs:
     tokenizer_path: Optional[str] = None
     tokenizer_mode: str = "auto"
     skip_tokenizer_init: bool = False
-    enable_tokenizer_batch_encode: bool = False
     load_format: str = "auto"
     trust_remote_code: bool = False
     dtype: str = "auto"
@@ -59,6 +58,7 @@ class ServerArgs:
     chat_template: Optional[str] = None
     completion_template: Optional[str] = None
     is_embedding: bool = False
+    enable_multimodal: Optional[bool] = None
     revision: Optional[str] = None
 
     # Port for the HTTP server
@@ -97,7 +97,13 @@ class ServerArgs:
     log_requests_level: int = 0
     show_time_cost: bool = False
     enable_metrics: bool = False
+    bucket_time_to_first_token: Optional[List[float]] = None
+    bucket_e2e_request_latency: Optional[List[float]] = None
+    bucket_inter_token_latency: Optional[List[float]] = None
+    collect_tokens_histogram: bool = False
     decode_log_interval: int = 40
+    enable_request_time_stats_logging: bool = False
+    kv_events_config: Optional[str] = None
 
     # API related
     api_key: Optional[str] = None
@@ -119,6 +125,7 @@ class ServerArgs:
 
     # Model override args in JSON
     json_model_override_args: str = "{}"
+    preferred_sampling_params: Optional[str] = None
 
     # LoRA
     lora_paths: Optional[List[str]] = None
@@ -153,15 +160,27 @@ class ServerArgs:
     disable_cuda_graph: bool = False
     disable_cuda_graph_padding: bool = False
     enable_nccl_nvls: bool = False
+    enable_tokenizer_batch_encode: bool = False
     disable_outlines_disk_cache: bool = False
     disable_custom_all_reduce: bool = False
-    enable_multimodal: Optional[bool] = None
     disable_overlap_schedule: bool = False
     enable_mixed_chunk: bool = False
     enable_dp_attention: bool = False
+    enable_dp_lm_head: bool = False
     enable_ep_moe: bool = False
     enable_deepep_moe: bool = False
     deepep_mode: Optional[Literal["auto", "normal", "low_latency"]] = "auto"
+    ep_num_redundant_experts: int = 0
+    ep_dispatch_algorithm: Optional[Literal["static", "dynamic"]] = None
+    init_expert_location: str = "trivial"
+    enable_eplb: bool = False
+    eplb_rebalance_num_iterations: int = 1000
+    expert_distribution_recorder_mode: Optional[
+        Literal["stat", "per_pass", "per_token"]
+    ] = None
+    expert_distribution_recorder_buffer_size: Optional[int] = None
+    enable_expert_distribution_metrics: bool = False
+    deepep_config: Optional[str] = None
     enable_torch_compile: bool = False
     torch_compile_max_bs: int = 32
     cuda_graph_max_bs: Optional[int] = None
@@ -227,7 +246,7 @@ class ServerArgs:
         # Set mem fraction static, which depends on the tensor parallelism size
         if self.mem_fraction_static is None:
             parallel_size = self.tp_size * self.pp_size
-            if gpu_mem <= 81920:
+            if gpu_mem is not None and gpu_mem <= 81920:
                 if parallel_size >= 16:
                     self.mem_fraction_static = 0.79
                 elif parallel_size >= 8:
@@ -240,7 +259,7 @@ class ServerArgs:
                     self.mem_fraction_static = 0.88
             else:
                 self.mem_fraction_static = 0.88
-            if gpu_mem > 96 * 1024:
+            if gpu_mem is not None and gpu_mem > 96 * 1024:
                 mem_fraction = self.mem_fraction_static
                 self.mem_fraction_static = min(
                     mem_fraction + 48 * 1024 * (1 - mem_fraction) / gpu_mem,
@@ -317,6 +336,11 @@ class ServerArgs:
                 f"DP attention is enabled. The chunked prefill size is adjusted to {self.chunked_prefill_size} to avoid MoE kernel issues. "
             )
 
+        if self.enable_dp_lm_head:
+            assert (
+                self.enable_dp_attention
+            ), "Please enable dp attention when setting enable_dp_lm_head. "
+
         # DeepEP MoE
         self.enable_sp_layernorm = False
         if self.enable_deepep_moe:
@@ -335,6 +359,21 @@ class ServerArgs:
                 f"DeepEP MoE is enabled. The expert parallel size is adjusted to be the same as the tensor parallel size[{self.tp_size}]."
             )
 
+        if self.pp_size > 1:
+            self.disable_overlap_schedule = True
+            logger.warning(
+                "Pipeline parallelism is incompatible with overlap schedule."
+            )
+
+        if self.expert_distribution_recorder_buffer_size is None:
+            # TODO pr-chain: enable this later
+            # if (x := self.eplb_rebalance_num_iterations) is not None:
+            #     self.expert_distribution_recorder_buffer_size = x
+            if False:
+                pass
+            elif self.expert_distribution_recorder_mode is not None:
+                self.expert_distribution_recorder_buffer_size = 1000
+
         # Speculative Decoding
         if self.speculative_algorithm == "NEXTN":
             # NEXTN shares the same implementation of EAGLE
@@ -455,11 +494,6 @@ class ServerArgs:
             action="store_true",
             help="If set, skip init tokenizer and pass input_ids in generate request.",
         )
-        parser.add_argument(
-            "--enable-tokenizer-batch-encode",
-            action="store_true",
-            help="Enable batch tokenization for improved performance when processing multiple text inputs. Do not use with image inputs, pre-tokenized input_ids, or input_embeds.",
-        )
         parser.add_argument(
             "--load-format",
             type=str,
@@ -537,6 +571,7 @@ class ServerArgs:
                 "w8a8_int8",
                 "w8a8_fp8",
                 "moe_wna16",
+                "qoq",
             ],
             help="The quantization method.",
         )
@@ -584,6 +619,12 @@ class ServerArgs:
             action="store_true",
             help="Whether to use a CausalLM as an embedding model.",
         )
+        parser.add_argument(
+            "--enable-multimodal",
+            default=ServerArgs.enable_multimodal,
+            action="store_true",
+            help="Enable the multimodal functionality for the served model. If the model being served is not multimodal, nothing will happen",
+        )
         parser.add_argument(
             "--revision",
             type=str,
@@ -761,12 +802,51 @@ class ServerArgs:
            action="store_true",
            help="Enable log prometheus metrics.",
        )
+        parser.add_argument(
+            "--bucket-time-to-first-token",
+            type=float,
+            nargs="+",
+            default=ServerArgs.bucket_time_to_first_token,
+            help="The buckets of time to first token, specified as a list of floats.",
+        )
+        parser.add_argument(
+            "--bucket-inter-token-latency",
+            type=float,
+            nargs="+",
+            default=ServerArgs.bucket_inter_token_latency,
+            help="The buckets of inter-token latency, specified as a list of floats.",
+        )
+        parser.add_argument(
+            "--bucket-e2e-request-latency",
+            type=float,
+            nargs="+",
+            default=ServerArgs.bucket_e2e_request_latency,
+            help="The buckets of end-to-end request latency, specified as a list of floats.",
+        )
+        parser.add_argument(
+            "--collect-tokens-histogram",
+            action="store_true",
+            default=ServerArgs.collect_tokens_histogram,
+            help="Collect prompt/generation tokens histogram.",
+        )
+        parser.add_argument(
+            "--kv-events-config",
+            type=str,
+            default=None,
+            help="Config in json format for NVIDIA dynamo KV event publishing. Publishing will be enabled if this flag is used.",
+        )
         parser.add_argument(
             "--decode-log-interval",
             type=int,
             default=ServerArgs.decode_log_interval,
             help="The log interval of decode batch.",
         )
+        parser.add_argument(
+            "--enable-request-time-stats-logging",
+            action="store_true",
+            default=ServerArgs.enable_request_time_stats_logging,
+            help="Enable per request time stats logging",
+        )
 
         # API related
         parser.add_argument(
@@ -825,7 +905,7 @@ class ServerArgs:
         # Multi-node distributed serving
         parser.add_argument(
             "--dist-init-addr",
-            "--nccl-init-addr",  # For backward compatbility. This will be removed in the future.
+            "--nccl-init-addr",  # For backward compatibility. This will be removed in the future.
             type=str,
             help="The host address for initializing distributed backend (e.g., `192.168.0.2:25000`).",
         )
@@ -843,6 +923,11 @@ class ServerArgs:
             help="A dictionary in JSON string format used to override default model configurations.",
             default=ServerArgs.json_model_override_args,
         )
+        parser.add_argument(
+            "--preferred-sampling-params",
+            type=str,
+            help="json-formatted sampling settings that will be returned in /get_model_info",
+        )
 
         # LoRA
         parser.add_argument(
@@ -871,6 +956,7 @@ class ServerArgs:
             "--attention-backend",
             type=str,
             choices=[
+                "aiter",
                 "flashinfer",
                 "triton",
                 "torch_native",
@@ -1018,6 +1104,11 @@ class ServerArgs:
             action="store_true",
             help="Enable NCCL NVLS for prefill heavy requests when available.",
         )
+        parser.add_argument(
+            "--enable-tokenizer-batch-encode",
+            action="store_true",
+            help="Enable batch tokenization for improved performance when processing multiple text inputs. Do not use with image inputs, pre-tokenized input_ids, or input_embeds.",
+        )
         parser.add_argument(
             "--disable-outlines-disk-cache",
             action="store_true",
@@ -1028,12 +1119,6 @@ class ServerArgs:
             action="store_true",
             help="Disable the custom all-reduce kernel and fall back to NCCL.",
         )
-        parser.add_argument(
-            "--enable-multimodal",
-            default=ServerArgs.enable_multimodal,
-            action="store_true",
-            help="Enable the multimodal functionality for the served model. If the model being served is not multimodal, nothing will happen",
-        )
         parser.add_argument(
             "--disable-overlap-schedule",
             action="store_true",
@@ -1047,7 +1132,12 @@ class ServerArgs:
         parser.add_argument(
             "--enable-dp-attention",
             action="store_true",
-            help="Enabling data parallelism for attention and tensor parallelism for FFN. The dp size should be equal to the tp size. Currently only DeepSeek-V2 is supported.",
+            help="Enabling data parallelism for attention and tensor parallelism for FFN. The dp size should be equal to the tp size. Currently DeepSeek-V2 and Qwen 2/3 MoE models are supported.",
+        )
+        parser.add_argument(
+            "--enable-dp-lm-head",
+            action="store_true",
+            help="Enable vocabulary parallel across the attention TP group to avoid all-gather across DP groups, optimizing performance under DP attention.",
         )
         parser.add_argument(
             "--enable-ep-moe",
@@ -1069,7 +1159,7 @@ class ServerArgs:
             "--cuda-graph-max-bs",
             type=int,
             default=ServerArgs.cuda_graph_max_bs,
-            help="Set the maximum batch size for cuda graph.",
+            help="Set the maximum batch size for cuda graph. It will extend the cuda graph capture batch size to this value.",
         )
         parser.add_argument(
             "--cuda-graph-bs",
@@ -1096,7 +1186,7 @@ class ServerArgs:
         parser.add_argument(
             "--triton-attention-reduce-in-fp32",
             action="store_true",
-            help="Cast the intermidiate attention results to fp32 to avoid possible crashes related to fp16."
+            help="Cast the intermediate attention results to fp32 to avoid possible crashes related to fp16."
             "This only affects Triton attention kernels.",
         )
         parser.add_argument(
@@ -1182,13 +1272,65 @@ class ServerArgs:
             default="auto",
             help="Select the mode when enable DeepEP MoE, could be `normal`, `low_latency` or `auto`. Default is `auto`, which means `low_latency` for decode batch and `normal` for prefill batch.",
         )
+        parser.add_argument(
+            "--ep-num-redundant-experts",
+            type=int,
+            default=ServerArgs.ep_num_redundant_experts,
+            help="Allocate this number of redundant experts in expert parallel.",
+        )
+        parser.add_argument(
+            "--ep-dispatch-algorithm",
+            type=str,
+            default=ServerArgs.ep_dispatch_algorithm,
+            help="The algorithm to choose ranks for redundant experts in expert parallel.",
+        )
+        parser.add_argument(
+            "--init-expert-location",
+            type=str,
+            default=ServerArgs.init_expert_location,
+            help="Initial location of EP experts.",
+        )
+        parser.add_argument(
+            "--enable-eplb",
+            action="store_true",
+            help="Enable EPLB algorithm",
+        )
+        parser.add_argument(
+            "--eplb-rebalance-num-iterations",
+            type=int,
+            default=ServerArgs.eplb_rebalance_num_iterations,
+            help="Number of iterations to automatically trigger a EPLB re-balance.",
+        )
+        parser.add_argument(
+            "--expert-distribution-recorder-mode",
+            type=str,
+            default=ServerArgs.expert_distribution_recorder_mode,
+            help="Mode of expert distribution recorder.",
+        )
+        parser.add_argument(
+            "--expert-distribution-recorder-buffer-size",
+            type=int,
+            default=ServerArgs.expert_distribution_recorder_buffer_size,
+            help="Circular buffer size of expert distribution recorder. Set to -1 to denote infinite buffer.",
+        )
+        parser.add_argument(
+            "--enable-expert-distribution-metrics",
+            action="store_true",
+            help="Enable logging metrics for expert balancedness",
+        )
+        parser.add_argument(
+            "--deepep-config",
+            type=str,
+            default=ServerArgs.deepep_config,
+            help="Tuned DeepEP config suitable for your own cluster.",
+        )
 
         parser.add_argument(
             "--n-share-experts-fusion",
             type=int,
             default=0,
             help="The number of shared_experts need to be replicated to fuse with normal experts in deepseek v3/r1, "
-            "set it to tp_size can get best optimized performace.",
+            "set it to tp_size can get best optimized performance. Note that for architectures with SM==90, we have enabled the shared experts fusion optimization by default for DeepSeek V3/R1, with n_share_experts_fusion automatically set to the TP size.",
         )
         parser.add_argument(
             "--disable-chunked-prefix-cache",
@@ -1296,8 +1438,6 @@ class ServerArgs:
 
         # FIXME pp constraints
         if self.pp_size > 1:
-            logger.warning(f"Turn off overlap scheule for pipeline parallelism.")
-            self.disable_overlap_schedule = True
             assert (
                 self.disable_overlap_schedule
                 and self.speculative_algorithm is None
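Several of the new server flags above (--bucket-time-to-first-token, --bucket-inter-token-latency, --bucket-e2e-request-latency) pass user-defined bucket boundaries to the Prometheus metrics enabled by --enable-metrics. For orientation, a minimal prometheus_client sketch of what custom buckets mean (the metric name and values are illustrative, not the collector's actual metric names):

from prometheus_client import Histogram

# Illustrative only: a TTFT histogram with user-supplied bucket boundaries (seconds).
ttft_buckets = [0.1, 0.25, 0.5, 1.0, 2.0, 5.0]
ttft_histogram = Histogram(
    "example_time_to_first_token_seconds",
    "Time to first token of example requests",
    buckets=ttft_buckets,
)

ttft_histogram.observe(0.42)  # counted in the le=0.5 bucket (and, cumulatively, all larger ones)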
@@ -82,12 +82,12 @@ class EAGLEDraftCudaGraphRunner:
             self.capture()
         except RuntimeError as e:
             raise Exception(
-                f"Capture cuda graph failed: {e}\n"
+                f"Capture CUDA graph failed: {e}\n"
                 "Possible solutions:\n"
                 "1. set --mem-fraction-static to a smaller value (e.g., 0.8 or 0.7)\n"
                 "2. set --cuda-graph-max-bs to a smaller value (e.g., 16)\n"
                 "3. disable torch compile by not using --enable-torch-compile\n"
-                "4. disable cuda graph by --disable-cuda-graph. (Not recommonded. Huge perf loss)\n"
+                "4. disable CUDA graph by --disable-cuda-graph. (Not recommended. Huge performance loss)\n"
                 "Open an issue on GitHub https://github.com/sgl-project/sglang/issues/new/choose \n"
             )
 
@@ -149,7 +149,7 @@ class EAGLEDraftCudaGraphRunner:
 
         # Run and capture
         def run_once():
-            # Backup two fileds, which will be modified in-place in `draft_forward`.
+            # Backup two fields, which will be modified in-place in `draft_forward`.
             output_cache_loc_backup = forward_batch.out_cache_loc
             hidden_states_backup = forward_batch.spec_info.hidden_states