sglang 0.4.6.post3__py3-none-any.whl → 0.4.6.post5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_offline_throughput.py +10 -8
- sglang/bench_one_batch.py +7 -6
- sglang/bench_one_batch_server.py +157 -21
- sglang/bench_serving.py +137 -59
- sglang/compile_deep_gemm.py +5 -5
- sglang/eval/loogle_eval.py +157 -0
- sglang/lang/chat_template.py +78 -78
- sglang/lang/tracer.py +1 -1
- sglang/srt/code_completion_parser.py +1 -1
- sglang/srt/configs/deepseekvl2.py +2 -2
- sglang/srt/configs/model_config.py +40 -28
- sglang/srt/constrained/base_grammar_backend.py +55 -72
- sglang/srt/constrained/llguidance_backend.py +25 -21
- sglang/srt/constrained/outlines_backend.py +27 -26
- sglang/srt/constrained/reasoner_grammar_backend.py +22 -33
- sglang/srt/constrained/xgrammar_backend.py +69 -43
- sglang/srt/conversation.py +49 -44
- sglang/srt/disaggregation/base/conn.py +1 -0
- sglang/srt/disaggregation/decode.py +129 -135
- sglang/srt/disaggregation/decode_schedule_batch_mixin.py +142 -0
- sglang/srt/disaggregation/fake/conn.py +3 -13
- sglang/srt/disaggregation/kv_events.py +357 -0
- sglang/srt/disaggregation/mini_lb.py +57 -24
- sglang/srt/disaggregation/mooncake/conn.py +238 -122
- sglang/srt/disaggregation/mooncake/transfer_engine.py +2 -1
- sglang/srt/disaggregation/nixl/conn.py +10 -19
- sglang/srt/disaggregation/prefill.py +132 -47
- sglang/srt/disaggregation/utils.py +123 -6
- sglang/srt/distributed/utils.py +3 -3
- sglang/srt/entrypoints/EngineBase.py +5 -0
- sglang/srt/entrypoints/engine.py +44 -9
- sglang/srt/entrypoints/http_server.py +23 -6
- sglang/srt/entrypoints/http_server_engine.py +5 -2
- sglang/srt/function_call/base_format_detector.py +250 -0
- sglang/srt/function_call/core_types.py +34 -0
- sglang/srt/function_call/deepseekv3_detector.py +157 -0
- sglang/srt/function_call/ebnf_composer.py +234 -0
- sglang/srt/function_call/function_call_parser.py +175 -0
- sglang/srt/function_call/llama32_detector.py +74 -0
- sglang/srt/function_call/mistral_detector.py +84 -0
- sglang/srt/function_call/pythonic_detector.py +163 -0
- sglang/srt/function_call/qwen25_detector.py +67 -0
- sglang/srt/function_call/utils.py +35 -0
- sglang/srt/hf_transformers_utils.py +46 -7
- sglang/srt/layers/attention/aiter_backend.py +513 -0
- sglang/srt/layers/attention/flashattention_backend.py +64 -18
- sglang/srt/layers/attention/flashinfer_mla_backend.py +8 -4
- sglang/srt/layers/attention/flashmla_backend.py +340 -78
- sglang/srt/layers/attention/triton_backend.py +3 -0
- sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +1 -1
- sglang/srt/layers/attention/utils.py +6 -4
- sglang/srt/layers/attention/vision.py +1 -1
- sglang/srt/layers/communicator.py +451 -0
- sglang/srt/layers/dp_attention.py +61 -21
- sglang/srt/layers/layernorm.py +1 -1
- sglang/srt/layers/logits_processor.py +46 -11
- sglang/srt/layers/moe/cutlass_moe.py +207 -0
- sglang/srt/layers/moe/ep_moe/kernels.py +34 -12
- sglang/srt/layers/moe/ep_moe/layer.py +105 -51
- sglang/srt/layers/moe/ep_moe/token_dispatcher.py +82 -7
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +1 -1
- sglang/srt/layers/moe/fused_moe_triton/layer.py +14 -0
- sglang/srt/layers/moe/topk.py +67 -10
- sglang/srt/layers/multimodal.py +70 -0
- sglang/srt/layers/quantization/__init__.py +8 -3
- sglang/srt/layers/quantization/blockwise_int8.py +2 -2
- sglang/srt/layers/quantization/deep_gemm.py +77 -74
- sglang/srt/layers/quantization/fp8.py +92 -2
- sglang/srt/layers/quantization/fp8_kernel.py +3 -3
- sglang/srt/layers/quantization/fp8_utils.py +6 -0
- sglang/srt/layers/quantization/gptq.py +298 -6
- sglang/srt/layers/quantization/int8_kernel.py +20 -7
- sglang/srt/layers/quantization/qoq.py +244 -0
- sglang/srt/layers/sampler.py +0 -4
- sglang/srt/layers/vocab_parallel_embedding.py +18 -7
- sglang/srt/lora/lora_manager.py +2 -4
- sglang/srt/lora/mem_pool.py +4 -4
- sglang/srt/lora/triton_ops/gate_up_lora_b.py +1 -1
- sglang/srt/lora/triton_ops/qkv_lora_b.py +1 -1
- sglang/srt/lora/triton_ops/sgemm_lora_a.py +1 -1
- sglang/srt/lora/triton_ops/sgemm_lora_b.py +1 -1
- sglang/srt/lora/utils.py +1 -1
- sglang/srt/managers/data_parallel_controller.py +3 -3
- sglang/srt/managers/deepseek_eplb.py +278 -0
- sglang/srt/managers/detokenizer_manager.py +21 -8
- sglang/srt/managers/eplb_manager.py +55 -0
- sglang/srt/managers/expert_distribution.py +704 -56
- sglang/srt/managers/expert_location.py +394 -0
- sglang/srt/managers/expert_location_dispatch.py +91 -0
- sglang/srt/managers/io_struct.py +19 -4
- sglang/srt/managers/mm_utils.py +294 -140
- sglang/srt/managers/multimodal_processors/base_processor.py +127 -42
- sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +6 -1
- sglang/srt/managers/multimodal_processors/gemma3.py +31 -6
- sglang/srt/managers/multimodal_processors/internvl.py +14 -5
- sglang/srt/managers/multimodal_processors/janus_pro.py +7 -1
- sglang/srt/managers/multimodal_processors/kimi_vl.py +7 -6
- sglang/srt/managers/multimodal_processors/llava.py +46 -0
- sglang/srt/managers/multimodal_processors/minicpm.py +25 -31
- sglang/srt/managers/multimodal_processors/mllama4.py +6 -0
- sglang/srt/managers/multimodal_processors/pixtral.py +127 -0
- sglang/srt/managers/multimodal_processors/qwen_vl.py +58 -16
- sglang/srt/managers/schedule_batch.py +122 -42
- sglang/srt/managers/schedule_policy.py +1 -5
- sglang/srt/managers/scheduler.py +205 -138
- sglang/srt/managers/scheduler_output_processor_mixin.py +124 -55
- sglang/srt/managers/session_controller.py +1 -1
- sglang/srt/managers/tokenizer_manager.py +232 -58
- sglang/srt/managers/tp_worker.py +12 -9
- sglang/srt/managers/tp_worker_overlap_thread.py +22 -11
- sglang/srt/mem_cache/base_prefix_cache.py +3 -0
- sglang/srt/mem_cache/chunk_cache.py +3 -1
- sglang/srt/mem_cache/hiradix_cache.py +4 -4
- sglang/srt/mem_cache/memory_pool.py +76 -52
- sglang/srt/mem_cache/multimodal_cache.py +45 -0
- sglang/srt/mem_cache/radix_cache.py +58 -5
- sglang/srt/metrics/collector.py +314 -39
- sglang/srt/mm_utils.py +10 -0
- sglang/srt/model_executor/cuda_graph_runner.py +29 -19
- sglang/srt/model_executor/expert_location_updater.py +422 -0
- sglang/srt/model_executor/forward_batch_info.py +5 -1
- sglang/srt/model_executor/model_runner.py +163 -68
- sglang/srt/model_loader/loader.py +10 -6
- sglang/srt/models/clip.py +5 -1
- sglang/srt/models/deepseek_janus_pro.py +2 -2
- sglang/srt/models/deepseek_v2.py +308 -351
- sglang/srt/models/exaone.py +8 -3
- sglang/srt/models/gemma3_mm.py +70 -33
- sglang/srt/models/llama.py +2 -0
- sglang/srt/models/llama4.py +15 -8
- sglang/srt/models/llava.py +258 -7
- sglang/srt/models/mimo_mtp.py +220 -0
- sglang/srt/models/minicpmo.py +5 -12
- sglang/srt/models/mistral.py +71 -1
- sglang/srt/models/mixtral.py +98 -34
- sglang/srt/models/mllama.py +3 -3
- sglang/srt/models/pixtral.py +467 -0
- sglang/srt/models/qwen2.py +95 -26
- sglang/srt/models/qwen2_5_vl.py +8 -0
- sglang/srt/models/qwen2_moe.py +330 -60
- sglang/srt/models/qwen2_vl.py +6 -0
- sglang/srt/models/qwen3.py +52 -10
- sglang/srt/models/qwen3_moe.py +411 -48
- sglang/srt/models/roberta.py +1 -1
- sglang/srt/models/siglip.py +294 -0
- sglang/srt/models/torch_native_llama.py +1 -1
- sglang/srt/openai_api/adapter.py +58 -20
- sglang/srt/openai_api/protocol.py +6 -8
- sglang/srt/operations.py +154 -0
- sglang/srt/operations_strategy.py +31 -0
- sglang/srt/reasoning_parser.py +3 -3
- sglang/srt/sampling/custom_logit_processor.py +18 -3
- sglang/srt/sampling/sampling_batch_info.py +4 -56
- sglang/srt/sampling/sampling_params.py +2 -2
- sglang/srt/server_args.py +162 -22
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +3 -3
- sglang/srt/speculative/eagle_utils.py +138 -7
- sglang/srt/speculative/eagle_worker.py +69 -21
- sglang/srt/utils.py +74 -17
- sglang/test/few_shot_gsm8k.py +2 -2
- sglang/test/few_shot_gsm8k_engine.py +2 -2
- sglang/test/run_eval.py +2 -2
- sglang/test/runners.py +8 -1
- sglang/test/send_one.py +13 -3
- sglang/test/simple_eval_common.py +1 -1
- sglang/test/simple_eval_humaneval.py +1 -1
- sglang/test/test_cutlass_moe.py +278 -0
- sglang/test/test_programs.py +5 -5
- sglang/test/test_utils.py +55 -14
- sglang/utils.py +3 -3
- sglang/version.py +1 -1
- {sglang-0.4.6.post3.dist-info → sglang-0.4.6.post5.dist-info}/METADATA +23 -13
- {sglang-0.4.6.post3.dist-info → sglang-0.4.6.post5.dist-info}/RECORD +178 -149
- {sglang-0.4.6.post3.dist-info → sglang-0.4.6.post5.dist-info}/WHEEL +1 -1
- sglang/srt/function_call_parser.py +0 -858
- sglang/srt/platforms/interface.py +0 -371
- /sglang/{llama3_eval.py → eval/llama3_eval.py} +0 -0
- /sglang/srt/models/{xiaomi_mimo.py → mimo.py} +0 -0
- {sglang-0.4.6.post3.dist-info → sglang-0.4.6.post5.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.6.post3.dist-info → sglang-0.4.6.post5.dist-info}/top_level.txt +0 -0
sglang/srt/operations_strategy.py
ADDED
@@ -0,0 +1,31 @@
+import torch
+
+
+def compute_layer_operations(
+    layer: torch.nn.Module,
+):
+    if not layer.is_layer_sparse:
+        return [
+            layer.op_comm_prepare_attn,
+            layer.op_attn,
+            layer.op_comm_prepare_mlp,
+            layer.op_mlp,
+            layer.op_comm_postprocess_layer,
+        ]
+
+    # Will add TBO operation orders here
+    return [
+        layer.op_comm_prepare_attn,
+        layer.op_attn,
+        layer.op_comm_prepare_mlp,
+        layer.mlp.op_gate,
+        layer.mlp.op_shared_experts,
+        layer.mlp.op_select_experts,
+        layer.mlp.op_dispatch_a,
+        layer.mlp.op_dispatch_b,
+        layer.mlp.op_experts,
+        layer.mlp.op_combine_a,
+        layer.mlp.op_combine_b,
+        layer.mlp.op_output,
+        layer.op_comm_postprocess_layer,
+    ]
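The list returned by compute_layer_operations is an ordered sequence of bound methods; the real driver for these operation lists lives in the new sglang/srt/operations.py added in this release. Below is only a toy sketch of how such a list could be executed; the run_layer helper and its state-dict convention are assumptions for illustration, not code from the package:

from typing import Any, Callable, Dict, List


def run_layer(operations: List[Callable[..., Any]], state: Dict[str, Any]) -> Dict[str, Any]:
    # Execute each operation in order; every op reads the shared state
    # (hidden states, router logits, dispatch handles, ...) and may update it.
    for op in operations:
        result = op(state)
        if isinstance(result, dict):
            state.update(result)
    return state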
sglang/srt/reasoning_parser.py
CHANGED
@@ -32,7 +32,7 @@ class BaseReasoningFormatDetector:
         One-time parsing: Detects and parses reasoning sections in the provided text.
         Returns both reasoning content and normal text separately.
         """
-        text = text.replace(self.think_start_token, "")
+        text = text.replace(self.think_start_token, "").strip()
         if self.think_end_token not in text:
             # Assume reasoning was truncated before `</think>` token
             return StreamingParseResult(reasoning_text=text)
@@ -73,7 +73,7 @@ class BaseReasoningFormatDetector:
            normal_text = current_text[end_idx + len(self.think_end_token) :]

            return StreamingParseResult(
-                normal_text=normal_text, reasoning_text=reasoning_text
+                normal_text=normal_text, reasoning_text=reasoning_text.rstrip()
            )

        # Continue with reasoning content
@@ -147,7 +147,7 @@ class ReasoningParser:

        Args:
            model_type (str): Type of model to parse reasoning from
-            stream_reasoning (bool): If
+            stream_reasoning (bool): If False, accumulates reasoning content until complete.
                If True, streams reasoning content as it arrives.
        """
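Both whitespace tweaks above only change how the text around the think tokens is trimmed. A small self-contained illustration of the intended behavior follows; this is not the library code, and the literal token strings and the helper name are assumed for the example:

def split_reasoning(text):
    # Mirrors the edited logic: drop "<think>", strip outer whitespace,
    # and right-strip the reasoning part once "</think>" is found.
    text = text.replace("<think>", "").strip()
    if "</think>" not in text:
        return text, ""  # reasoning truncated before the end token
    reasoning, normal = text.split("</think>", 1)
    return reasoning.rstrip(), normal

print(split_reasoning("<think> plan the answer \n</think>\nFinal answer."))
# -> ('plan the answer', '\nFinal answer.')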
sglang/srt/sampling/custom_logit_processor.py
CHANGED
@@ -28,11 +28,26 @@ class CustomLogitProcessor(ABC):
        """Define the callable behavior."""
        raise NotImplementedError

-
+    @classmethod
+    def to_str(cls) -> str:
        """Serialize the callable function to a JSON-compatible string."""
-        return json.dumps({"callable": dill.dumps(
+        return json.dumps({"callable": dill.dumps(cls).hex()})

    @classmethod
    def from_str(cls, json_str: str):
        """Deserialize a callable function from a JSON string."""
-        return _cache_from_str(json_str)
+        return _cache_from_str(json_str)()
+
+
+class DisallowedTokensLogitsProcessor(CustomLogitProcessor):
+    def __call__(
+        self,
+        logits: torch.Tensor,
+        custom_param_list: Optional[List[Dict[str, Any]]] = None,
+    ) -> torch.Tensor:
+        disallowed_token_ids = custom_param_list[0]["token_ids"]
+        assert all(
+            disallowed_token_ids == c["token_ids"] for c in custom_param_list
+        ), f"{custom_param_list=}"
+        logits[..., disallowed_token_ids] = -float("inf")
+        return logits
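A hedged usage sketch of the new DisallowedTokensLogitsProcessor follows. It exercises only the class added above, outside the server request path; the batch shape and token ids are made up, and it assumes dill can round-trip the class in the local process:

import torch

from sglang.srt.sampling.custom_logit_processor import DisallowedTokensLogitsProcessor

# Serialize the processor class and rebuild an instance, as a caller would
# when passing it along with a request.
payload = DisallowedTokensLogitsProcessor().to_str()
processor = DisallowedTokensLogitsProcessor.from_str(payload)

logits = torch.zeros(2, 8)  # (batch, vocab), dummy values
params = [{"token_ids": [1, 5]}, {"token_ids": [1, 5]}]  # one dict per request
out = processor(logits, custom_param_list=params)
assert torch.isinf(out[:, [1, 5]]).all() and (out[:, [1, 5]] < 0).all()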
sglang/srt/sampling/sampling_batch_info.py
CHANGED
@@ -30,13 +30,8 @@ class SamplingBatchInfo:
    # Whether any request needs min_p sampling
    need_min_p_sampling: bool

-    # Use thinking_budget to truncate thinking
-    num_thinking_tokens: Optional[torch.Tensor] = None
-    think_end_ids: Optional[torch.Tensor] = None
-    thinking_budgets: Optional[torch.Tensor] = None
-
    # Masking tensors for grammar-guided structured outputs
-    vocab_size: int
+    vocab_size: int
    grammars: Optional[List] = None
    vocab_mask: Optional[torch.Tensor] = None
    apply_mask_func: Optional[Callable[[torch.Tensor, torch.Tensor], None]] = None
@@ -81,22 +76,7 @@ class SamplingBatchInfo:
        min_ps = torch.tensor(
            [r.sampling_params.min_p for r in reqs], dtype=torch.float
        ).to(device, non_blocking=True)
-
-            think_end_ids = torch.tensor(
-                [getattr(r.tokenizer, "think_end_id", -1) for r in reqs],
-                dtype=torch.int64,
-            ).to(device, non_blocking=True)
-            num_thinking_tokens = torch.tensor([0 for _ in reqs], dtype=torch.int64).to(
-                device, non_blocking=True
-            )
-            thinking_budgets = torch.tensor(
-                [r.sampling_params.thinking_budget or -1 for r in reqs],
-                dtype=torch.int64,
-            ).to(device, non_blocking=True)
-        else:
-            think_end_ids = None
-            num_thinking_tokens = None
-            thinking_budgets = None
+
        # Check if any request has custom logit processor
        has_custom_logit_processor = (
            batch.enable_custom_logit_processor  # check the flag first.
@@ -152,9 +132,6 @@ class SamplingBatchInfo:
            top_ps=top_ps,
            top_ks=top_ks,
            min_ps=min_ps,
-            think_end_ids=think_end_ids,
-            num_thinking_tokens=num_thinking_tokens,
-            thinking_budgets=thinking_budgets,
            is_all_greedy=all(r.sampling_params.top_k <= 1 for r in reqs),
            need_min_p_sampling=any(r.sampling_params.min_p > 0 for r in reqs),
            vocab_size=vocab_size,
@@ -169,35 +146,6 @@ class SamplingBatchInfo:
    def __len__(self):
        return len(self.temperatures)

-    def apply_thinking_budgets(self, next_token_logits: torch.Tensor):
-        has_budget = self.thinking_budgets > 0
-        if not has_budget.any():
-            return
-        torch.where(
-            has_budget,
-            self.num_thinking_tokens + 1,
-            self.num_thinking_tokens,
-            out=self.num_thinking_tokens,
-        )
-        should_stop = has_budget & (
-            self.num_thinking_tokens - 1 > self.thinking_budgets
-        )
-        next_token_logits.masked_fill_(should_stop.unsqueeze(0), float("-inf"))
-        batch_indices = torch.nonzero(should_stop, as_tuple=True)[0]
-        if len(batch_indices) > 0:
-            end_token_indices = self.think_end_ids[batch_indices]
-            next_token_logits[batch_indices, end_token_indices] = 0.0
-
-    def update_thinking_budgets(self, next_token_ids: torch.Tensor):
-        if not torch.any(self.thinking_budgets > 0):
-            return
-        torch.where(
-            next_token_ids == self.think_end_ids,
-            torch.tensor(-1, device=self.thinking_budgets.device),
-            self.thinking_budgets,
-            out=self.thinking_budgets,
-        )
-
    def update_regex_vocab_mask(self):
        if not self.grammars:
            self.vocab_mask = None
@@ -346,7 +294,7 @@ class SamplingBatchInfo:
        # Set the flag to True if any of the two has custom logit processor
        self.has_custom_logit_processor = True

-        # Note:
+        # Note: because the __len()__ operator is defined on the temperatures tensor,
        # please make sure any merge operation with len(self) or len(other) is done before
        # the merge operation of the temperatures tensor below.
        for item in [
@@ -359,5 +307,5 @@ class SamplingBatchInfo:
            other_val = getattr(other, item, None)
            setattr(self, item, torch.cat([self_val, other_val]))

-        self.is_all_greedy
+        self.is_all_greedy &= other.is_all_greedy
        self.need_min_p_sampling |= other.need_min_p_sampling
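The final hunk fixes a silent merge bug: the old bare `self.is_all_greedy` line was an expression with no effect, so merging a non-greedy batch into a greedy one left the flag True. A tiny plain-Python illustration of the corrected flag semantics, not the class itself:

# Flags of two batches being merged: batch A is fully greedy, batch B is not.
a_all_greedy, a_need_min_p = True, False
b_all_greedy, b_need_min_p = False, True

# The old code evaluated `a_all_greedy` and discarded it; the fix combines the flags.
a_all_greedy &= b_all_greedy   # merged batch is all-greedy only if both were
a_need_min_p |= b_need_min_p   # merged batch needs min_p if either did

assert (a_all_greedy, a_need_min_p) == (False, True)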
sglang/srt/sampling/sampling_params.py
CHANGED
@@ -30,7 +30,6 @@ class SamplingParams:
    def __init__(
        self,
        max_new_tokens: int = 128,
-        thinking_budget: Optional[int] = None,
        stop: Optional[Union[str, List[str]]] = None,
        stop_token_ids: Optional[List[int]] = None,
        temperature: float = 1.0,
@@ -51,6 +50,7 @@ class SamplingParams:
        spaces_between_special_tokens: bool = True,
        no_stop_trim: bool = False,
        custom_params: Optional[Dict[str, Any]] = None,
+        stream_interval: Optional[int] = None,
    ) -> None:
        self.max_new_tokens = max_new_tokens
        self.stop_strs = stop
@@ -58,7 +58,6 @@ class SamplingParams:
            self.stop_token_ids = set(stop_token_ids)
        else:
            self.stop_token_ids = None
-        self.thinking_budget = thinking_budget
        self.temperature = temperature
        self.top_p = top_p
        self.top_k = top_k
@@ -77,6 +76,7 @@ class SamplingParams:
        self.spaces_between_special_tokens = spaces_between_special_tokens
        self.no_stop_trim = no_stop_trim
        self.custom_params = custom_params
+        self.stream_interval = stream_interval

        # Process some special cases
        if 0 <= self.temperature < _SAMPLING_EPS:
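A hedged usage sketch of the new per-request stream_interval parameter. It only constructs the object, leaves every other argument at the defaults shown above, and assumes the field overrides the server-wide stream interval for a single request:

from sglang.srt.sampling.sampling_params import SamplingParams

# Assumed semantics: emit streamed output roughly every 4 decode steps for this
# request instead of the server default. All other constructor arguments keep
# the defaults visible in the diff (max_new_tokens=128, temperature=1.0, ...).
params = SamplingParams(max_new_tokens=256, temperature=0.7, stream_interval=4)
print(params.stream_interval)  # 4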
sglang/srt/server_args.py
CHANGED
@@ -46,7 +46,6 @@ class ServerArgs:
    tokenizer_path: Optional[str] = None
    tokenizer_mode: str = "auto"
    skip_tokenizer_init: bool = False
-    enable_tokenizer_batch_encode: bool = False
    load_format: str = "auto"
    trust_remote_code: bool = False
    dtype: str = "auto"
@@ -59,6 +58,7 @@ class ServerArgs:
    chat_template: Optional[str] = None
    completion_template: Optional[str] = None
    is_embedding: bool = False
+    enable_multimodal: Optional[bool] = None
    revision: Optional[str] = None

    # Port for the HTTP server
@@ -97,7 +97,13 @@ class ServerArgs:
    log_requests_level: int = 0
    show_time_cost: bool = False
    enable_metrics: bool = False
+    bucket_time_to_first_token: Optional[List[float]] = None
+    bucket_e2e_request_latency: Optional[List[float]] = None
+    bucket_inter_token_latency: Optional[List[float]] = None
+    collect_tokens_histogram: bool = False
    decode_log_interval: int = 40
+    enable_request_time_stats_logging: bool = False
+    kv_events_config: Optional[str] = None

    # API related
    api_key: Optional[str] = None
@@ -119,6 +125,7 @@ class ServerArgs:

    # Model override args in JSON
    json_model_override_args: str = "{}"
+    preferred_sampling_params: Optional[str] = None

    # LoRA
    lora_paths: Optional[List[str]] = None
@@ -153,15 +160,27 @@ class ServerArgs:
    disable_cuda_graph: bool = False
    disable_cuda_graph_padding: bool = False
    enable_nccl_nvls: bool = False
+    enable_tokenizer_batch_encode: bool = False
    disable_outlines_disk_cache: bool = False
    disable_custom_all_reduce: bool = False
-    enable_multimodal: Optional[bool] = None
    disable_overlap_schedule: bool = False
    enable_mixed_chunk: bool = False
    enable_dp_attention: bool = False
+    enable_dp_lm_head: bool = False
    enable_ep_moe: bool = False
    enable_deepep_moe: bool = False
    deepep_mode: Optional[Literal["auto", "normal", "low_latency"]] = "auto"
+    ep_num_redundant_experts: int = 0
+    ep_dispatch_algorithm: Optional[Literal["static", "dynamic"]] = None
+    init_expert_location: str = "trivial"
+    enable_eplb: bool = False
+    eplb_rebalance_num_iterations: int = 1000
+    expert_distribution_recorder_mode: Optional[
+        Literal["stat", "per_pass", "per_token"]
+    ] = None
+    expert_distribution_recorder_buffer_size: Optional[int] = None
+    enable_expert_distribution_metrics: bool = False
+    deepep_config: Optional[str] = None
    enable_torch_compile: bool = False
    torch_compile_max_bs: int = 32
    cuda_graph_max_bs: Optional[int] = None
@@ -227,7 +246,7 @@ class ServerArgs:
        # Set mem fraction static, which depends on the tensor parallelism size
        if self.mem_fraction_static is None:
            parallel_size = self.tp_size * self.pp_size
-            if gpu_mem <= 81920:
+            if gpu_mem is not None and gpu_mem <= 81920:
                if parallel_size >= 16:
                    self.mem_fraction_static = 0.79
                elif parallel_size >= 8:
@@ -240,7 +259,7 @@ class ServerArgs:
                    self.mem_fraction_static = 0.88
            else:
                self.mem_fraction_static = 0.88
-            if gpu_mem > 96 * 1024:
+            if gpu_mem is not None and gpu_mem > 96 * 1024:
                mem_fraction = self.mem_fraction_static
                self.mem_fraction_static = min(
                    mem_fraction + 48 * 1024 * (1 - mem_fraction) / gpu_mem,
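The two added `is not None` guards matter when GPU memory detection fails and gpu_mem is None, which would otherwise raise a TypeError on comparison. A standalone sketch of the guarded adjustment for large-memory GPUs follows; the 0.95 cap, the helper name, and the 141 GiB figure are illustrative values, not taken from the diff:

def adjust_mem_fraction(mem_fraction, gpu_mem_mib):
    # gpu_mem_mib may be None if detection failed; comparing None with an int
    # raises TypeError on Python 3, hence the added guard.
    if gpu_mem_mib is not None and gpu_mem_mib > 96 * 1024:
        mem_fraction = min(
            mem_fraction + 48 * 1024 * (1 - mem_fraction) / gpu_mem_mib,
            0.95,  # assumed upper bound for this sketch
        )
    return round(mem_fraction, 4)

print(adjust_mem_fraction(0.88, 141 * 1024))  # ~0.9209 on a ~141 GiB GPU
print(adjust_mem_fraction(0.88, None))        # 0.88, detection failed, unchanged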
@@ -317,6 +336,11 @@ class ServerArgs:
                f"DP attention is enabled. The chunked prefill size is adjusted to {self.chunked_prefill_size} to avoid MoE kernel issues. "
            )

+        if self.enable_dp_lm_head:
+            assert (
+                self.enable_dp_attention
+            ), "Please enable dp attention when setting enable_dp_attention. "
+
        # DeepEP MoE
        self.enable_sp_layernorm = False
        if self.enable_deepep_moe:
@@ -335,6 +359,21 @@ class ServerArgs:
                f"DeepEP MoE is enabled. The expert parallel size is adjusted to be the same as the tensor parallel size[{self.tp_size}]."
            )

+        if self.pp_size > 1:
+            self.disable_overlap_schedule = True
+            logger.warning(
+                "Pipeline parallelism is incompatible with overlap schedule."
+            )
+
+        if self.expert_distribution_recorder_buffer_size is None:
+            # TODO pr-chain: enable this later
+            # if (x := self.eplb_rebalance_num_iterations) is not None:
+            #     self.expert_distribution_recorder_buffer_size = x
+            if False:
+                pass
+            elif self.expert_distribution_recorder_mode is not None:
+                self.expert_distribution_recorder_buffer_size = 1000
+
        # Speculative Decoding
        if self.speculative_algorithm == "NEXTN":
            # NEXTN shares the same implementation of EAGLE
@@ -455,11 +494,6 @@ class ServerArgs:
            action="store_true",
            help="If set, skip init tokenizer and pass input_ids in generate request.",
        )
-        parser.add_argument(
-            "--enable-tokenizer-batch-encode",
-            action="store_true",
-            help="Enable batch tokenization for improved performance when processing multiple text inputs. Do not use with image inputs, pre-tokenized input_ids, or input_embeds.",
-        )
        parser.add_argument(
            "--load-format",
            type=str,
@@ -537,6 +571,7 @@ class ServerArgs:
                "w8a8_int8",
                "w8a8_fp8",
                "moe_wna16",
+                "qoq",
            ],
            help="The quantization method.",
        )
@@ -584,6 +619,12 @@ class ServerArgs:
            action="store_true",
            help="Whether to use a CausalLM as an embedding model.",
        )
+        parser.add_argument(
+            "--enable-multimodal",
+            default=ServerArgs.enable_multimodal,
+            action="store_true",
+            help="Enable the multimodal functionality for the served model. If the model being served is not multimodal, nothing will happen",
+        )
        parser.add_argument(
            "--revision",
            type=str,
@@ -761,12 +802,51 @@ class ServerArgs:
            action="store_true",
            help="Enable log prometheus metrics.",
        )
+        parser.add_argument(
+            "--bucket-time-to-first-token",
+            type=float,
+            nargs="+",
+            default=ServerArgs.bucket_time_to_first_token,
+            help="The buckets of time to first token, specified as a list of floats.",
+        )
+        parser.add_argument(
+            "--bucket-inter-token-latency",
+            type=float,
+            nargs="+",
+            default=ServerArgs.bucket_inter_token_latency,
+            help="The buckets of inter-token latency, specified as a list of floats.",
+        )
+        parser.add_argument(
+            "--bucket-e2e-request-latency",
+            type=float,
+            nargs="+",
+            default=ServerArgs.bucket_e2e_request_latency,
+            help="The buckets of end-to-end request latency, specified as a list of floats.",
+        )
+        parser.add_argument(
+            "--collect-tokens-histogram",
+            action="store_true",
+            default=ServerArgs.collect_tokens_histogram,
+            help="Collect prompt/generation tokens histogram.",
+        )
+        parser.add_argument(
+            "--kv-events-config",
+            type=str,
+            default=None,
+            help="Config in json format for NVIDIA dynamo KV event publishing. Publishing will be enabled if this flag is used.",
+        )
        parser.add_argument(
            "--decode-log-interval",
            type=int,
            default=ServerArgs.decode_log_interval,
            help="The log interval of decode batch.",
        )
+        parser.add_argument(
+            "--enable-request-time-stats-logging",
+            action="store_true",
+            default=ServerArgs.enable_request_time_stats_logging,
+            help="Enable per request time stats logging",
+        )

        # API related
        parser.add_argument(
@@ -825,7 +905,7 @@ class ServerArgs:
        # Multi-node distributed serving
        parser.add_argument(
            "--dist-init-addr",
-            "--nccl-init-addr",  # For backward
+            "--nccl-init-addr",  # For backward compatibility. This will be removed in the future.
            type=str,
            help="The host address for initializing distributed backend (e.g., `192.168.0.2:25000`).",
        )
@@ -843,6 +923,11 @@ class ServerArgs:
            help="A dictionary in JSON string format used to override default model configurations.",
            default=ServerArgs.json_model_override_args,
        )
+        parser.add_argument(
+            "--preferred-sampling-params",
+            type=str,
+            help="json-formatted sampling settings that will be returned in /get_model_info",
+        )

        # LoRA
        parser.add_argument(
@@ -871,6 +956,7 @@ class ServerArgs:
            "--attention-backend",
            type=str,
            choices=[
+                "aiter",
                "flashinfer",
                "triton",
                "torch_native",
@@ -1018,6 +1104,11 @@ class ServerArgs:
            action="store_true",
            help="Enable NCCL NVLS for prefill heavy requests when available.",
        )
+        parser.add_argument(
+            "--enable-tokenizer-batch-encode",
+            action="store_true",
+            help="Enable batch tokenization for improved performance when processing multiple text inputs. Do not use with image inputs, pre-tokenized input_ids, or input_embeds.",
+        )
        parser.add_argument(
            "--disable-outlines-disk-cache",
            action="store_true",
@@ -1028,12 +1119,6 @@ class ServerArgs:
            action="store_true",
            help="Disable the custom all-reduce kernel and fall back to NCCL.",
        )
-        parser.add_argument(
-            "--enable-multimodal",
-            default=ServerArgs.enable_multimodal,
-            action="store_true",
-            help="Enable the multimodal functionality for the served model. If the model being served is not multimodal, nothing will happen",
-        )
        parser.add_argument(
            "--disable-overlap-schedule",
            action="store_true",
@@ -1047,7 +1132,12 @@ class ServerArgs:
        parser.add_argument(
            "--enable-dp-attention",
            action="store_true",
-            help="Enabling data parallelism for attention and tensor parallelism for FFN. The dp size should be equal to the tp size. Currently
+            help="Enabling data parallelism for attention and tensor parallelism for FFN. The dp size should be equal to the tp size. Currently DeepSeek-V2 and Qwen 2/3 MoE models are supported.",
+        )
+        parser.add_argument(
+            "--enable-dp-lm-head",
+            action="store_true",
+            help="Enable vocabulary parallel across the attention TP group to avoid all-gather across DP groups, optimizing performance under DP attention.",
        )
        parser.add_argument(
            "--enable-ep-moe",
@@ -1069,7 +1159,7 @@ class ServerArgs:
            "--cuda-graph-max-bs",
            type=int,
            default=ServerArgs.cuda_graph_max_bs,
-            help="Set the maximum batch size for cuda graph.",
+            help="Set the maximum batch size for cuda graph. It will extend the cuda graph capture batch size to this value.",
        )
        parser.add_argument(
            "--cuda-graph-bs",
@@ -1096,7 +1186,7 @@ class ServerArgs:
        parser.add_argument(
            "--triton-attention-reduce-in-fp32",
            action="store_true",
-            help="Cast the
+            help="Cast the intermediate attention results to fp32 to avoid possible crashes related to fp16."
            "This only affects Triton attention kernels.",
        )
        parser.add_argument(
@@ -1182,13 +1272,65 @@ class ServerArgs:
            default="auto",
            help="Select the mode when enable DeepEP MoE, could be `normal`, `low_latency` or `auto`. Default is `auto`, which means `low_latency` for decode batch and `normal` for prefill batch.",
        )
+        parser.add_argument(
+            "--ep-num-redundant-experts",
+            type=int,
+            default=ServerArgs.ep_num_redundant_experts,
+            help="Allocate this number of redundant experts in expert parallel.",
+        )
+        parser.add_argument(
+            "--ep-dispatch-algorithm",
+            type=str,
+            default=ServerArgs.ep_dispatch_algorithm,
+            help="The algorithm to choose ranks for redundant experts in expert parallel.",
+        )
+        parser.add_argument(
+            "--init-expert-location",
+            type=str,
+            default=ServerArgs.init_expert_location,
+            help="Initial location of EP experts.",
+        )
+        parser.add_argument(
+            "--enable-eplb",
+            action="store_true",
+            help="Enable EPLB algorithm",
+        )
+        parser.add_argument(
+            "--eplb-rebalance-num-iterations",
+            type=int,
+            default=ServerArgs.eplb_rebalance_num_iterations,
+            help="Number of iterations to automatically trigger a EPLB re-balance.",
+        )
+        parser.add_argument(
+            "--expert-distribution-recorder-mode",
+            type=str,
+            default=ServerArgs.expert_distribution_recorder_mode,
+            help="Mode of expert distribution recorder.",
+        )
+        parser.add_argument(
+            "--expert-distribution-recorder-buffer-size",
+            type=int,
+            default=ServerArgs.expert_distribution_recorder_buffer_size,
+            help="Circular buffer size of expert distribution recorder. Set to -1 to denote infinite buffer.",
+        )
+        parser.add_argument(
+            "--enable-expert-distribution-metrics",
+            action="store_true",
+            help="Enable logging metrics for expert balancedness",
+        )
+        parser.add_argument(
+            "--deepep-config",
+            type=str,
+            default=ServerArgs.deepep_config,
+            help="Tuned DeepEP config suitable for your own cluster.",
+        )

        parser.add_argument(
            "--n-share-experts-fusion",
            type=int,
            default=0,
            help="The number of shared_experts need to be replicated to fuse with normal experts in deepseek v3/r1, "
-            "set it to tp_size can get best optimized
+            "set it to tp_size can get best optimized performance. Note that for architectures with SM==90, we have enabled the shared experts fusion optimization by default for DeepSeek V3/R1, with n_share_experts_fusion automatically set to the TP size.",
        )
        parser.add_argument(
            "--disable-chunked-prefix-cache",
@@ -1296,8 +1438,6 @@ class ServerArgs:

        # FIXME pp constraints
        if self.pp_size > 1:
-            logger.warning(f"Turn off overlap scheule for pipeline parallelism.")
-            self.disable_overlap_schedule = True
            assert (
                self.disable_overlap_schedule
                and self.speculative_algorithm is None
sglang/srt/speculative/eagle_draft_cuda_graph_runner.py
CHANGED
@@ -82,12 +82,12 @@ class EAGLEDraftCudaGraphRunner:
            self.capture()
        except RuntimeError as e:
            raise Exception(
-                f"Capture
+                f"Capture CUDA graph failed: {e}\n"
                "Possible solutions:\n"
                "1. set --mem-fraction-static to a smaller value (e.g., 0.8 or 0.7)\n"
                "2. set --cuda-graph-max-bs to a smaller value (e.g., 16)\n"
                "3. disable torch compile by not using --enable-torch-compile\n"
-                "4. disable
+                "4. disable CUDA graph by --disable-cuda-graph. (Not recommended. Huge performance loss)\n"
                "Open an issue on GitHub https://github.com/sgl-project/sglang/issues/new/choose \n"
            )

@@ -149,7 +149,7 @@ class EAGLEDraftCudaGraphRunner:

        # Run and capture
        def run_once():
-            # Backup two
+            # Backup two fields, which will be modified in-place in `draft_forward`.
            output_cache_loc_backup = forward_batch.out_cache_loc
            hidden_states_backup = forward_batch.spec_info.hidden_states
