sglang 0.4.9.post3__py3-none-any.whl → 0.4.9.post4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/srt/_custom_ops.py +29 -1
- sglang/srt/configs/model_config.py +1 -1
- sglang/srt/conversation.py +1 -1
- sglang/srt/disaggregation/common/conn.py +34 -6
- sglang/srt/disaggregation/mini_lb.py +3 -2
- sglang/srt/disaggregation/mooncake/conn.py +49 -20
- sglang/srt/disaggregation/mooncake/transfer_engine.py +4 -2
- sglang/srt/disaggregation/nixl/conn.py +17 -13
- sglang/srt/distributed/device_communicators/custom_all_reduce.py +3 -91
- sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +96 -1
- sglang/srt/distributed/device_communicators/quick_all_reduce.py +273 -0
- sglang/srt/distributed/device_communicators/shm_broadcast.py +12 -5
- sglang/srt/distributed/parallel_state.py +70 -15
- sglang/srt/entrypoints/engine.py +2 -8
- sglang/srt/entrypoints/http_server.py +20 -32
- sglang/srt/entrypoints/openai/protocol.py +3 -3
- sglang/srt/entrypoints/openai/serving_chat.py +27 -4
- sglang/srt/function_call/base_format_detector.py +74 -12
- sglang/srt/function_call/deepseekv3_detector.py +26 -11
- sglang/srt/function_call/ebnf_composer.py +95 -63
- sglang/srt/function_call/function_call_parser.py +4 -4
- sglang/srt/function_call/kimik2_detector.py +41 -16
- sglang/srt/function_call/llama32_detector.py +6 -3
- sglang/srt/function_call/mistral_detector.py +11 -3
- sglang/srt/function_call/pythonic_detector.py +16 -14
- sglang/srt/function_call/qwen25_detector.py +12 -3
- sglang/srt/function_call/{qwen3_detector.py → qwen3_coder_detector.py} +10 -9
- sglang/srt/layers/activation.py +11 -3
- sglang/srt/layers/attention/base_attn_backend.py +3 -1
- sglang/srt/layers/communicator.py +12 -12
- sglang/srt/layers/dp_attention.py +72 -24
- sglang/srt/layers/logits_processor.py +34 -24
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_3_1/E=160,N=320,device_name=NVIDIA_H20-3e.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +25 -224
- sglang/srt/layers/moe/topk.py +5 -13
- sglang/srt/layers/quantization/compressed_tensors/compressed_tensors_moe.py +2 -9
- sglang/srt/layers/quantization/modelopt_quant.py +8 -4
- sglang/srt/layers/quantization/utils.py +0 -9
- sglang/srt/layers/radix_attention.py +5 -3
- sglang/srt/lora/lora_manager.py +133 -169
- sglang/srt/lora/lora_registry.py +124 -0
- sglang/srt/lora/mem_pool.py +2 -2
- sglang/srt/managers/cache_controller.py +53 -6
- sglang/srt/managers/io_struct.py +19 -1
- sglang/srt/managers/schedule_batch.py +13 -3
- sglang/srt/managers/scheduler.py +13 -25
- sglang/srt/managers/tokenizer_manager.py +28 -25
- sglang/srt/managers/tp_worker.py +2 -4
- sglang/srt/mem_cache/allocator.py +67 -7
- sglang/srt/mem_cache/hicache_storage.py +17 -1
- sglang/srt/mem_cache/hiradix_cache.py +30 -16
- sglang/srt/mem_cache/memory_pool_host.py +3 -0
- sglang/srt/model_executor/cuda_graph_runner.py +61 -25
- sglang/srt/model_executor/forward_batch_info.py +201 -29
- sglang/srt/model_executor/model_runner.py +41 -23
- sglang/srt/models/deepseek_v2.py +1 -2
- sglang/srt/models/mllama4.py +10 -3
- sglang/srt/models/qwen2_moe.py +0 -4
- sglang/srt/models/qwen3_moe.py +1 -6
- sglang/srt/reasoning_parser.py +46 -4
- sglang/srt/sampling/sampling_batch_info.py +6 -5
- sglang/srt/server_args.py +76 -55
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +33 -28
- sglang/srt/speculative/eagle_draft_extend_cuda_graph_runner.py +37 -36
- sglang/srt/speculative/eagle_utils.py +51 -23
- sglang/srt/speculative/eagle_worker.py +59 -44
- sglang/srt/two_batch_overlap.py +9 -5
- sglang/srt/utils.py +17 -68
- sglang/test/test_activation.py +50 -1
- sglang/version.py +1 -1
- {sglang-0.4.9.post3.dist-info → sglang-0.4.9.post4.dist-info}/METADATA +5 -5
- {sglang-0.4.9.post3.dist-info → sglang-0.4.9.post4.dist-info}/RECORD +75 -72
- {sglang-0.4.9.post3.dist-info → sglang-0.4.9.post4.dist-info}/WHEEL +0 -0
- {sglang-0.4.9.post3.dist-info → sglang-0.4.9.post4.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.9.post3.dist-info → sglang-0.4.9.post4.dist-info}/top_level.txt +0 -0
sglang/srt/model_executor/model_runner.py
CHANGED
```diff
@@ -68,6 +68,7 @@ from sglang.srt.layers.sampler import Sampler
 from sglang.srt.layers.torchao_utils import apply_torchao_config_to_model
 from sglang.srt.layers.utils import is_sm100_supported
 from sglang.srt.lora.lora_manager import LoRAManager
+from sglang.srt.lora.lora_registry import LoRARef
 from sglang.srt.managers.schedule_batch import (
     GLOBAL_SERVER_ARGS_KEYS,
     global_server_args_dict,
@@ -108,7 +109,6 @@ from sglang.srt.utils import (
     get_bool_env_var,
     get_cpu_ids_by_node,
     init_custom_process_group,
-    is_cuda,
     is_fa3_default_architecture,
     is_flashinfer_available,
     is_hip,
@@ -275,6 +275,7 @@ class ModelRunner:
         self.sampler = Sampler()
         self.load_model()
 
+        # Check if the model is using hybrid SWA
         if (
             not self.server_args.disable_hybrid_swa_memory
             and self.sliding_window_size is not None
@@ -377,6 +378,7 @@ class ModelRunner:
                 is_hopper_with_cuda_12_3()
                 and is_no_spec_infer_or_topk_one(server_args)
                 and is_fa3_default_architecture(self.model_config.hf_config)
+                and (not server_args.enable_hierarchical_cache)
             ):
                 server_args.attention_backend = "fa3"
             elif _is_hip:
@@ -389,7 +391,9 @@ class ModelRunner:
                 )
             else:
                 # MLA architecture
-                if is_hopper_with_cuda_12_3():
+                if is_hopper_with_cuda_12_3() and (
+                    not server_args.enable_hierarchical_cache
+                ):
                     server_args.attention_backend = "fa3"
                 elif is_sm100_supported():
                     server_args.attention_backend = "flashinfer"
@@ -890,44 +894,38 @@ class ModelRunner:
             tp_rank=self.tp_rank,
             max_lora_rank=self.server_args.max_lora_rank,
             target_modules=self.server_args.lora_target_modules,
+            lora_paths=self.server_args.lora_paths,
         )
-        result = self.lora_manager.load_lora_adapters(self.server_args.lora_paths or {})
-        if result.success:
-            logger.info(
-                f"LoRA manager ready. Loaded LoRA adapters: {', '.join(result.loaded_adapters)}"
-            )
-        else:
-            raise RuntimeError(f"Failed to load LoRA adapters: {result.error_message}")
 
-    def load_lora_adapter(self,
+    def load_lora_adapter(self, lora_ref: LoRARef):
         """Load a new lora adapter from disk or huggingface."""
 
         logger.info(
-            f"LoRA adapter loading starts:
+            f"LoRA adapter loading starts: {lora_ref}. "
             f"avail mem={get_available_gpu_memory(self.device, self.gpu_id):.2f} GB"
         )
 
-        result = self.lora_manager.load_lora_adapter(
+        result = self.lora_manager.load_lora_adapter(lora_ref)
 
         logger.info(
-            f"LoRA adapter loading completes:
+            f"LoRA adapter loading completes: {lora_ref}. "
             f"avail mem={get_available_gpu_memory(self.device, self.gpu_id):.2f} GB"
         )
 
         return result
 
-    def unload_lora_adapter(self,
+    def unload_lora_adapter(self, lora_ref: LoRARef):
         """Unload a lora adapter that was previously loaded during initialization or dynamic loading."""
 
         logger.info(
-            f"LoRA adapter unloading starts:
+            f"LoRA adapter unloading starts: {lora_ref}. "
            f"avail mem={get_available_gpu_memory(self.device, self.gpu_id):.2f} GB"
        )
 
-        result = self.lora_manager.unload_lora_adapter(
+        result = self.lora_manager.unload_lora_adapter(lora_ref)
 
         logger.info(
-            f"LoRA adapter unloading completes:
+            f"LoRA adapter unloading completes: {lora_ref}. "
             f"avail mem={get_available_gpu_memory(self.device, self.gpu_id):.2f} GB"
         )
 
@@ -1010,8 +1008,11 @@ class ModelRunner:
         try:
             layers = self.model.language_model.model.layers
         except:
-            self.is_hybrid = False
-            return
+            try:
+                layers = self.model.language_model.layers
+            except:
+                self.is_hybrid = False
+                return
 
         for layer in layers:
             if (
@@ -1462,9 +1463,13 @@ class ModelRunner:
         tensor_parallel(self.model, device_mesh)
 
     def forward_decode(
-        self,
+        self,
+        forward_batch: ForwardBatch,
+        skip_attn_backend_init: bool = False,
+        pp_proxy_tensors=None,
     ) -> LogitsProcessorOutput:
-        self.attn_backend.init_forward_metadata(forward_batch)
+        if not skip_attn_backend_init:
+            self.attn_backend.init_forward_metadata(forward_batch)
         # FIXME: add pp_proxy_tensors arg to all models
         kwargs = {}
         if self.support_pp:
@@ -1576,8 +1581,18 @@ class ModelRunner:
                 skip_attn_backend_init=skip_attn_backend_init,
                 pp_proxy_tensors=pp_proxy_tensors,
             )
-
-
+            return ret, can_run_cuda_graph
+
+        # For MLP sync
+        if forward_batch.global_num_tokens_cpu is not None:
+            forward_batch.prepare_mlp_sync_batch(self)
+
+        if forward_batch.forward_mode.is_decode():
+            ret = self.forward_decode(
+                forward_batch,
+                skip_attn_backend_init=skip_attn_backend_init,
+                pp_proxy_tensors=pp_proxy_tensors,
+            )
         elif forward_batch.forward_mode.is_extend():
             ret = self.forward_extend(
                 forward_batch,
@@ -1595,6 +1610,9 @@ class ModelRunner:
         else:
             raise ValueError(f"Invalid forward mode: {forward_batch.forward_mode}")
 
+        if forward_batch.global_num_tokens_cpu is not None:
+            forward_batch.post_forward_mlp_sync_batch(ret)
+
         return ret, can_run_cuda_graph
 
     def _preprocess_logits(
```
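For orientation, the dynamic LoRA entry points on `ModelRunner` now take a single `LoRARef` object instead of separate name/path arguments. A minimal sketch of the caller side, using only the constructor fields that appear in the server_args changes later in this diff (`lora_name`, `lora_path`); the `model_runner` instance is a hypothetical stand-in:

```python
# Sketch only: LoRARef fields come from the server_args changes in this release;
# the model_runner instance is assumed to be constructed elsewhere.
from sglang.srt.lora.lora_registry import LoRARef

adapter = LoRARef(lora_name="my-adapter", lora_path="/models/loras/my-adapter")

# Hypothetical call sites on an existing ModelRunner:
# model_runner.load_lora_adapter(adapter)
# model_runner.unload_lora_adapter(adapter)   # the same LoRARef identifies the adapter
```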
sglang/srt/models/deepseek_v2.py
CHANGED
```diff
@@ -550,9 +550,8 @@ class DeepseekV2MoE(nn.Module):
     def forward_deepep(
         self, hidden_states: torch.Tensor, forward_batch: ForwardBatch
     ) -> torch.Tensor:
-        forward_mode = forward_batch.forward_mode
         shared_output = None
-        if
+        if hidden_states.shape[0] > 0:
             # router_logits: (num_tokens, n_experts)
             router_logits = self.gate(hidden_states)
             shared_output = self._forward_shared_experts(hidden_states)
```
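The DeepEP path now gates router and shared-expert work on a plain shape check instead of a forward-mode helper. A small illustrative sketch of that guard (tensor shapes are made up):

```python
# Illustrative only: an idle data-parallel rank receives zero tokens, so the
# gate/shared-expert work is skipped entirely by the shape check.
import torch

hidden_states = torch.empty(0, 7168)   # zero tokens on this rank; hidden size arbitrary
shared_output = None
if hidden_states.shape[0] > 0:         # replaces the forward_mode-based idle/empty check
    ...                                # router + shared experts would run here
```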
sglang/srt/models/mllama4.py
CHANGED
```diff
@@ -23,6 +23,7 @@ from sglang.srt.managers.schedule_batch import (
     Modality,
     MultimodalDataItem,
     MultimodalInputs,
+    global_server_args_dict,
 )
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
 from sglang.srt.model_loader.weight_utils import default_weight_loader
@@ -55,13 +56,17 @@ class Llama4ForConditionalGeneration(nn.Module):
         self.quant_config = quant_config
 
         # Check if this is a text-only model (modelopt fp8 llama4 has no vision components)
-        self.
-        if not self.
+        self.has_vision_weights = self._has_vision_weights(config)
+        if not self.has_vision_weights:
             logger.warning(
                 "No vision weights found in checkpoint. Model will run in text-only mode. "
                 "Multimodal capabilities (image processing) will be unavailable."
             )
 
+        self.has_vision = (
+            self.has_vision_weights and global_server_args_dict["enable_multimodal"]
+        )
+
         if self.has_vision:
             self.vision_model = Llama4VisionModel(config.vision_config)
             self.multi_modal_projector = Llama4MultiModalProjector(config)
@@ -269,7 +274,9 @@ class Llama4ForConditionalGeneration(nn.Module):
 
     def _should_skip_weight(self, name: str) -> bool:
         """Check if we should skip loading this weight."""
-        return
+        return not self.has_vision and (
+            "vision" in name or "multi_modal_projector" in name
+        )
 
     def _transform_weight_name(self, name: str) -> str:
         """Transform weight name by adding language_model prefix if needed."""
```
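Net effect of the mllama4 change: the vision tower is only instantiated when the checkpoint actually ships vision weights and multimodal is enabled at the server level (via the `enable_multimodal` key in `global_server_args_dict`). A toy sketch of the gating with illustrative values:

```python
# Illustrative values; in the real model these come from checkpoint inspection
# and from global_server_args_dict["enable_multimodal"].
has_vision_weights = True        # checkpoint contains vision tensors
enable_multimodal = False        # e.g. server launched without multimodal enabled

has_vision = has_vision_weights and enable_multimodal
print(has_vision)                # False -> text-only; vision/projector weights are skipped at load
```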
sglang/srt/models/qwen2_moe.py
CHANGED
```diff
@@ -43,10 +43,6 @@ from sglang.srt.layers.communicator import (
     ScatterMode,
 )
 from sglang.srt.layers.dp_attention import (
-    attn_tp_all_gather,
-    attn_tp_reduce_scatter,
-    dp_gather_partial,
-    dp_scatter,
     get_attention_tp_rank,
     get_attention_tp_size,
     get_local_attention_dp_size,
```
sglang/srt/models/qwen3_moe.py
CHANGED
```diff
@@ -38,10 +38,6 @@ from sglang.srt.eplb.expert_location_dispatch import ExpertLocationDispatchInfo
 from sglang.srt.layers.activation import SiluAndMul
 from sglang.srt.layers.communicator import LayerCommunicator, LayerScatterModes
 from sglang.srt.layers.dp_attention import (
-    attn_tp_all_gather,
-    attn_tp_reduce_scatter,
-    dp_gather_partial,
-    dp_scatter,
     get_attention_tp_rank,
     get_attention_tp_size,
     get_local_attention_dp_size,
@@ -193,8 +189,7 @@ class Qwen3MoeSparseMoeBlock(nn.Module):
     def forward_deepep(
         self, hidden_states: torch.Tensor, forward_batch: ForwardBatch
     ) -> torch.Tensor:
-
-        if is_non_idle_and_non_empty(forward_mode, hidden_states):
+        if hidden_states.shape[0] > 0:
             # router_logits: (num_tokens, n_experts)
             router_logits, _ = self.gate(hidden_states)
             topk_weights, topk_idx, _ = self.topk(
```
sglang/srt/reasoning_parser.py
CHANGED
```diff
@@ -118,6 +118,14 @@ class DeepSeekR1Detector(BaseReasoningFormatDetector):
     Returns all the text before the </think> tag as `reasoning_text`
     and the rest of the text as `normal_text`.
 
+    Supported models:
+    - DeepSeek-R1: Always generates thinking content without <think> start tag
+    - DeepSeek-R1-0528: Generates thinking content with <think> start tag
+
+    Format patterns:
+    - DeepSeek-R1: "I need to think about this...</think>The answer is 42."
+    - DeepSeek-R1-0528: "<think>I need to think about this...</think>The answer is 42."
+
     Args:
         stream_reasoning (bool): If False, accumulates reasoning content until the end tag.
             If True, streams reasoning content as it arrives.
@@ -136,11 +144,20 @@ class DeepSeekR1Detector(BaseReasoningFormatDetector):
 
 class Qwen3Detector(BaseReasoningFormatDetector):
     """
-    Detector for Qwen3
+    Detector for standard Qwen3 models (e.g., Qwen/Qwen3-235B-A22B).
     Assumes reasoning format:
       (<think>)*(.*)</think>
-
-
+
+    Qwen3 models released before 07/2025 supports switching between thinking mode and normal
+    mode using `enable_thinking` parameter in the request parameter.
+    - enable_thinking=True: "<think>reasoning content</think>The answer is 42."
+    - enable_thinking=False: "The answer is 42." (no thinking tokens)
+
+    This detector handles both cases.
+
+    NOTE: Do NOT use this detector for Qwen3-Thinking models (e.g., Qwen3-Thinking-2507).
+    Those models always generate thinking content without <think> start tags.
+    Use "qwen3-thinking" parser type for those models instead.
 
     Args:
         stream_reasoning (bool): If False, accumulates reasoning content until the end tag.
@@ -148,7 +165,6 @@ class Qwen3Detector(BaseReasoningFormatDetector):
     """
 
     def __init__(self, stream_reasoning: bool = True):
-        # Qwen3 won't be in reasoning mode when user passes `enable_thinking=False`
         super().__init__(
             "<think>",
             "</think>",
@@ -157,6 +173,31 @@ class Qwen3Detector(BaseReasoningFormatDetector):
         )
 
 
+class Qwen3ThinkingDetector(BaseReasoningFormatDetector):
+    """
+    Detector for Qwen3-Thinking models (e.g., Qwen3-Thinking-2507).
+    Assumes reasoning format:
+      *(.*)</think>
+
+    These models always generate thinking content without <think> start tag.
+    They do not support the enable_thinking parameter and always think.
+
+    Format: "I need to think about this...</think>The answer is 42."
+
+    Args:
+        stream_reasoning (bool): If False, accumulates reasoning content until the end tag.
+            If True, streams reasoning content as it arrives.
+    """
+
+    def __init__(self, stream_reasoning: bool = True):
+        super().__init__(
+            "<think>",
+            "</think>",
+            force_reasoning=True,
+            stream_reasoning=stream_reasoning,
+        )
+
+
 class KimiDetector(BaseReasoningFormatDetector):
     """
     Detector for Kimi Thinking model.
@@ -189,6 +230,7 @@ class ReasoningParser:
     DetectorMap: Dict[str, Type[BaseReasoningFormatDetector]] = {
         "deepseek-r1": DeepSeekR1Detector,
         "qwen3": Qwen3Detector,
+        "qwen3-thinking": Qwen3ThinkingDetector,
         "kimi": KimiDetector,
     }
 
```
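For reference, the new detector is selected through the `qwen3-thinking` key in `ReasoningParser.DetectorMap`. A hedged sketch of using it directly; only the constructor arguments are shown in the diff above, so the parse method name below is an assumption and is left commented out:

```python
# Sketch only: Qwen3-Thinking output carries no <think> start tag, so the detector
# is built with force_reasoning=True and splits on </think>.
from sglang.srt.reasoning_parser import Qwen3ThinkingDetector

detector = Qwen3ThinkingDetector(stream_reasoning=False)

text = "First I compare the two options...</think>The answer is 42."
# result = detector.detect_and_parse(text)   # hypothetical method name, for illustration
# expected split: reasoning_text="First I compare the two options...",
#                 normal_text="The answer is 42."
```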
sglang/srt/sampling/sampling_batch_info.py
CHANGED
```diff
@@ -322,6 +322,12 @@ class SamplingBatchInfo:
             # Set the flag to True if any of the two has custom logit processor
             self.has_custom_logit_processor = True
 
+        # Merge logit bias - note this has to come before the temperatures tensor update! Otherwise will cause crashes.
+        # See note below on len(self) and len(other).
+        self.logit_bias = merge_bias_tensor(
+            self.logit_bias, other.logit_bias, len(self), len(other), self.device, 0.0
+        )
+
         # Note: because the __len()__ operator is defined on the temperatures tensor,
         # please make sure any merge operation with len(self) or len(other) is done before
         # the merge operation of the temperatures tensor below.
@@ -340,11 +346,6 @@
         self.need_top_k_sampling |= other.need_top_k_sampling
         self.need_min_p_sampling |= other.need_min_p_sampling
 
-        # Merge logit bias
-        self.logit_bias = merge_bias_tensor(
-            self.logit_bias, other.logit_bias, len(self), len(other), self.device, 0.0
-        )
-
 
 def merge_bias_tensor(
     lhs: Optional[torch.Tensor],
```
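The only functional change here is ordering: `logit_bias` is now merged before the temperatures tensor is concatenated, because `len(self)` and `len(other)` are backed by the temperatures tensor and must still reflect the pre-merge batch sizes. A stand-alone illustration of the hazard; the class below is a stand-in, not the real `SamplingBatchInfo`:

```python
# Stand-in class to show why merge order matters: __len__ reads the
# temperatures tensor, so lengths taken after the concat are already merged.
import torch

class FakeBatch:
    def __init__(self, temperatures: torch.Tensor):
        self.temperatures = temperatures

    def __len__(self) -> int:
        return self.temperatures.shape[0]

a = FakeBatch(torch.full((3, 1), 0.7))
b = FakeBatch(torch.full((2, 1), 1.0))

len_a, len_b = len(a), len(b)                  # 3 and 2: correct sizes for padding logit_bias
a.temperatures = torch.cat([a.temperatures, b.temperatures], dim=0)
assert (len(a), len_a, len_b) == (5, 3, 2)     # after the concat, len(a) no longer equals the old size
```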
sglang/srt/server_args.py
CHANGED
```diff
@@ -20,10 +20,10 @@ import logging
 import os
 import random
 import tempfile
-from token import OP
 from typing import List, Literal, Optional, Union
 
 from sglang.srt.hf_transformers_utils import check_gguf_file, get_config
+from sglang.srt.lora.lora_registry import LoRARef
 from sglang.srt.reasoning_parser import ReasoningParser
 from sglang.srt.utils import (
     LORA_TARGET_ALL_MODULES,
@@ -80,7 +80,7 @@ class ServerArgs:
     schedule_policy: str = "fcfs"
     schedule_conservativeness: float = 1.0
     cpu_offload_gb: int = 0
-    page_size: int =
+    page_size: Optional[int] = None
     hybrid_kvcache_ratio: Optional[float] = None
     swa_full_tokens_ratio: float = 0.8
     disable_hybrid_swa_memory: bool = False
@@ -145,7 +145,7 @@ class ServerArgs:
     enable_lora: Optional[bool] = None
     max_lora_rank: Optional[int] = None
     lora_target_modules: Optional[Union[set[str], List[str]]] = None
-    lora_paths: Optional[Union[dict[str, str], List[str]]] = None
+    lora_paths: Optional[Union[dict[str, str], dict[str, LoRARef], List[str]]] = None
     max_loras_per_batch: int = 8
     lora_backend: str = "triton"
 
@@ -266,31 +266,20 @@ class ServerArgs:
 
     def __post_init__(self):
         # Expert parallelism
+        # We put it here first due to some internal ckpt conversation issues.
         if self.enable_ep_moe:
             self.ep_size = self.tp_size
             logger.warning(
                 f"EP MoE is enabled. The expert parallel size is adjusted to be the same as the tensor parallel size[{self.tp_size}]."
             )
-        if self.enable_flashinfer_moe:
-            assert (
-                self.quantization == "modelopt_fp4"
-            ), "modelopt_fp4 quantization is required for Flashinfer MOE"
-            os.environ["TRTLLM_ENABLE_PDL"] = "1"
-            self.disable_shared_experts_fusion = True
-            logger.warning(
-                f"Flashinfer MoE is enabled. Shared expert fusion is disabled."
-            )
 
         # Set missing default values
         if self.tokenizer_path is None:
             self.tokenizer_path = self.model_path
-
-        if self.device is None:
-            self.device = get_device()
-
         if self.served_model_name is None:
             self.served_model_name = self.model_path
-
+        if self.device is None:
+            self.device = get_device()
         if self.random_seed is None:
             self.random_seed = random.randint(0, 1 << 30)
 
@@ -359,7 +348,6 @@ class ServerArgs:
             self.chunked_prefill_size = 16384
         else:
             self.chunked_prefill_size = 4096
-        assert self.chunked_prefill_size % self.page_size == 0
 
         # Set cuda graph max batch size
         if self.cuda_graph_max_bs is None:
@@ -410,6 +398,14 @@ class ServerArgs:
             )
             self.page_size = 128
 
+        # Set page size
+        if self.page_size is None:
+            self.page_size = 1
+
+        # AMD-specific Triton attention KV splits default number
+        if is_hip():
+            self.triton_attention_num_kv_splits = 16
+
         # Choose grammar backend
         if self.grammar_backend is None:
             self.grammar_backend = "xgrammar"
@@ -431,6 +427,13 @@ class ServerArgs:
             self.enable_dp_attention
         ), "Please enable dp attention when setting enable_dp_lm_head. "
 
+        # MoE kernel
+        if self.enable_flashinfer_moe:
+            assert (
+                self.quantization == "modelopt_fp4"
+            ), "modelopt_fp4 quantization is required for Flashinfer MOE"
+            os.environ["TRTLLM_ENABLE_PDL"] = "1"
+
         # DeepEP MoE
         if self.enable_deepep_moe:
             if self.deepep_mode == "normal":
@@ -502,14 +505,6 @@ class ServerArgs:
                 logger.warning(
                     "DeepSeek MTP does not require setting speculative_draft_model_path."
                 )
-            elif "Llama4" in model_arch:
-                # TODO: remove this after Llama4 supports in other backends
-                if self.attention_backend != "fa3":
-                    self.attention_backend = "fa3"
-                    logger.warning(
-                        "Llama4 requires using fa3 attention backend. "
-                        "Attention backend is automatically set to fa3."
-                    )
 
             # Auto choose parameters
             if self.speculative_num_steps is None:
@@ -542,12 +537,11 @@ class ServerArgs:
         ) and check_gguf_file(self.model_path):
             self.quantization = self.load_format = "gguf"
 
+        # Model loading
         if is_remote_url(self.model_path):
             self.load_format = "remote"
-
-
-        if is_hip():
-            self.triton_attention_num_kv_splits = 16
+        if self.custom_weight_loader is None:
+            self.custom_weight_loader = []
 
         # PD disaggregation
         if self.disaggregation_mode == "decode":
@@ -572,6 +566,7 @@ class ServerArgs:
             self.disable_cuda_graph = True
             logger.warning("Cuda graph is disabled for prefill server")
 
+        # Propagate env vars
         os.environ["SGLANG_ENABLE_TORCH_COMPILE"] = (
             "1" if self.enable_torch_compile else "0"
         )
@@ -580,9 +575,6 @@ class ServerArgs:
             "1" if self.disable_outlines_disk_cache else "0"
        )
 
-        if self.custom_weight_loader is None:
-            self.custom_weight_loader = []
-
     @staticmethod
     def add_cli_args(parser: argparse.ArgumentParser):
         # Model and tokenizer
@@ -1099,10 +1091,10 @@ class ServerArgs:
                 "deepseekv3",
                 "pythonic",
                 "kimi_k2",
-                "
+                "qwen3_coder",
             ],
             default=ServerArgs.tool_call_parser,
-            help="Specify the parser for handling tool-call interactions. Options include: 'qwen25', 'mistral', 'llama3', 'deepseekv3', 'pythonic', and '
+            help="Specify the parser for handling tool-call interactions. Options include: 'qwen25', 'mistral', 'llama3', 'deepseekv3', 'pythonic', 'kimi_k2', and 'qwen3_coder'.",
         )
 
         # Data parallelism
```
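The `--tool-call-parser` choices gain `qwen3_coder` (matching the rename of `qwen3_detector.py` to `qwen3_coder_detector.py` in the file list above). A hedged sketch of selecting it; the construction is commented out because a real `ServerArgs` needs a resolvable model path and device, and the CLI form is assumed to follow the existing launch pattern:

```python
# Sketch: the new parser value as it is passed at launch time.
from sglang.srt.server_args import ServerArgs

# args = ServerArgs(model_path="/path/to/model", tool_call_parser="qwen3_coder")
# CLI equivalent (assumed unchanged from existing usage):
#   python -m sglang.launch_server --model-path /path/to/model --tool-call-parser qwen3_coder
print(ServerArgs.tool_call_parser)  # class-level default referenced by add_cli_args above
```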
```diff
@@ -1227,6 +1219,13 @@ class ServerArgs:
             default=ServerArgs.grammar_backend,
             help="Choose the backend for grammar-guided decoding.",
         )
+        parser.add_argument(
+            "--mm-attention-backend",
+            type=str,
+            choices=["sdpa", "fa3", "triton_attn"],
+            default=ServerArgs.mm_attention_backend,
+            help="Set multimodal attention backend.",
+        )
 
         # Speculative decoding
         parser.add_argument(
@@ -1276,13 +1275,6 @@ class ServerArgs:
             help="The path of the draft model's small vocab table.",
             default=ServerArgs.speculative_token_map,
         )
-        parser.add_argument(
-            "--mm-attention-backend",
-            type=str,
-            choices=["sdpa", "fa3", "triton_attn"],
-            default=ServerArgs.mm_attention_backend,
-            help="Set multimodal attention backend.",
-        )
 
         # Expert parallelism
         parser.add_argument(
@@ -1530,11 +1522,6 @@ class ServerArgs:
             action="store_true",
             help="Disable the overlap scheduler, which overlaps the CPU scheduler with GPU model worker.",
         )
-        parser.add_argument(
-            "--disable-overlap-cg-plan",
-            action="store_true",
-            help="Disable the overlap optimization for cudagraph preparation in eagle verify.",
-        )
         parser.add_argument(
             "--enable-mixed-chunk",
             action="store_true",
@@ -1792,11 +1779,11 @@ class ServerArgs:
         return hf_config
 
     def check_server_args(self):
+        # Check parallel size constraints
         assert (
             self.tp_size * self.pp_size
         ) % self.nnodes == 0, "tp_size must be divisible by number of nodes"
 
-        # FIXME pp constraints
         if self.pp_size > 1:
             assert (
                 self.disable_overlap_schedule
@@ -1807,11 +1794,7 @@ class ServerArgs:
         assert not (
             self.dp_size > 1 and self.nnodes != 1 and not self.enable_dp_attention
         ), "multi-node data parallel is not supported unless dp attention!"
-        assert (
-            self.max_loras_per_batch > 0
-            # FIXME
-            and (self.lora_paths is None or self.disable_radix_cache)
-        ), "compatibility of lora and radix attention is in progress"
+
         assert self.base_gpu_id >= 0, "base_gpu_id must be non-negative"
         assert self.gpu_id_step >= 1, "gpu_id_step must be positive"
 
@@ -1820,9 +1803,32 @@ class ServerArgs:
             None,
         }, "moe_dense_tp_size only support 1 and None currently"
 
+        # Check model architecture
+        model_arch = self.get_hf_config().architectures[0]
+        if "Llama4" in model_arch:
+            assert self.attention_backend == "fa3", "fa3 is required for Llama4 model"
+
+        # Check LoRA
         self.check_lora_server_args()
 
+        # Check speculative decoding
+        if self.speculative_algorithm is not None:
+            assert (
+                not self.enable_mixed_chunk
+            ), "enable_mixed_chunk is required for speculative decoding"
+
+        # Check chunked prefill
+        assert (
+            self.chunked_prefill_size % self.page_size == 0
+        ), "chunked_prefill_size must be divisible by page_size"
+
     def check_lora_server_args(self):
+        assert (
+            self.max_loras_per_batch > 0
+            # FIXME
+            and (self.lora_paths is None or self.disable_radix_cache)
+        ), "compatibility of lora and radix attention is in progress"
+
         # Enable LoRA if any LoRA paths are provided for backward compatibility.
         if self.lora_paths:
             if self.enable_lora is None:
```
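The divisibility assertion between `chunked_prefill_size` and `page_size` moved from `__post_init__` (where `page_size` could still be `None`) into `check_server_args`, after both values have been resolved. Worked numbers, using defaults that appear earlier in this diff:

```python
# With the defaults set in __post_init__ above, page_size falls back to 1 and the
# check is trivially satisfied; a backend that forces 128-token pages also passes.
chunked_prefill_size, page_size = 16384, 128
assert chunked_prefill_size % page_size == 0   # 16384 / 128 = 128 pages per prefill chunk
```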
```diff
@@ -1843,9 +1849,24 @@ class ServerArgs:
             for lora_path in lora_paths:
                 if "=" in lora_path:
                     name, path = lora_path.split("=", 1)
-                    self.lora_paths[name] = path
+                    self.lora_paths[name] = LoRARef(lora_name=name, lora_path=path)
                 else:
-                    self.lora_paths[lora_path] =
+                    self.lora_paths[lora_path] = LoRARef(
+                        lora_name=lora_path,
+                        lora_path=lora_path,
+                    )
+        elif isinstance(self.lora_paths, dict):
+            self.lora_paths = {
+                k: LoRARef(lora_name=k, lora_path=v)
+                for k, v in self.lora_paths.items()
+            }
+        elif self.lora_paths is None:
+            self.lora_paths = {}
+        else:
+            raise ValueError(
+                f"Invalid type for --lora-paths: {type(self.lora_paths)}. "
+                "Expected a list or a dictionary."
+            )
 
         # Expand target modules
         if self.lora_target_modules:
```
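After this normalization, every accepted `--lora-paths` spelling ends up as a `dict[str, LoRARef]`. A small sketch of the input shapes and the resulting mapping; adapter names and paths are made up:

```python
# Input spellings (illustrative):
#   --lora-paths sql-adapter=/loras/sql /loras/chat   (list form, with and without "name=")
#   {"sql-adapter": "/loras/sql"}                     (dict form)
#   None                                              (no adapters -> {})
from sglang.srt.lora.lora_registry import LoRARef

normalized = {
    "sql-adapter": LoRARef(lora_name="sql-adapter", lora_path="/loras/sql"),
    "/loras/chat": LoRARef(lora_name="/loras/chat", lora_path="/loras/chat"),
}
```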