sglang 0.4.5.post3__py3-none-any.whl → 0.4.6.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +19 -3
- sglang/bench_serving.py +8 -9
- sglang/compile_deep_gemm.py +45 -4
- sglang/srt/code_completion_parser.py +1 -1
- sglang/srt/configs/deepseekvl2.py +1 -1
- sglang/srt/configs/model_config.py +9 -3
- sglang/srt/constrained/llguidance_backend.py +78 -61
- sglang/srt/conversation.py +34 -1
- sglang/srt/disaggregation/decode.py +67 -13
- sglang/srt/disaggregation/fake/__init__.py +1 -0
- sglang/srt/disaggregation/fake/conn.py +88 -0
- sglang/srt/disaggregation/mini_lb.py +45 -8
- sglang/srt/disaggregation/mooncake/conn.py +198 -31
- sglang/srt/disaggregation/prefill.py +36 -12
- sglang/srt/disaggregation/utils.py +16 -2
- sglang/srt/entrypoints/engine.py +9 -0
- sglang/srt/entrypoints/http_server.py +35 -4
- sglang/srt/function_call_parser.py +77 -5
- sglang/srt/layers/attention/base_attn_backend.py +3 -0
- sglang/srt/layers/attention/cutlass_mla_backend.py +278 -0
- sglang/srt/layers/attention/flashattention_backend.py +28 -10
- sglang/srt/layers/attention/flashmla_backend.py +8 -11
- sglang/srt/layers/attention/utils.py +1 -1
- sglang/srt/layers/attention/vision.py +2 -0
- sglang/srt/layers/layernorm.py +38 -16
- sglang/srt/layers/logits_processor.py +2 -2
- sglang/srt/layers/moe/fused_moe_native.py +2 -4
- sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=192,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=192,device_name=NVIDIA_H20.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=192,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=384,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=384,device_name=NVIDIA_H20.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=384,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=768,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=768,device_name=NVIDIA_H20.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=768,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=768,device_name=NVIDIA_H200.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=128,N=96,device_name=NVIDIA_H20.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +41 -41
- sglang/srt/layers/moe/fused_moe_triton/configs/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +20 -17
- sglang/srt/layers/moe/fused_moe_triton/layer.py +15 -17
- sglang/srt/layers/pooler.py +6 -0
- sglang/srt/layers/quantization/awq.py +5 -1
- sglang/srt/layers/quantization/deep_gemm.py +17 -10
- sglang/srt/layers/quantization/fp8.py +20 -22
- sglang/srt/layers/quantization/fp8_utils.py +2 -2
- sglang/srt/layers/quantization/int8_kernel.py +32 -1
- sglang/srt/layers/radix_attention.py +13 -3
- sglang/srt/layers/rotary_embedding.py +170 -126
- sglang/srt/managers/data_parallel_controller.py +10 -3
- sglang/srt/managers/io_struct.py +7 -0
- sglang/srt/managers/mm_utils.py +85 -28
- sglang/srt/managers/multimodal_processors/base_processor.py +14 -1
- sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +9 -2
- sglang/srt/managers/multimodal_processors/gemma3.py +2 -5
- sglang/srt/managers/multimodal_processors/janus_pro.py +2 -2
- sglang/srt/managers/multimodal_processors/minicpm.py +4 -3
- sglang/srt/managers/multimodal_processors/qwen_vl.py +38 -13
- sglang/srt/managers/schedule_batch.py +38 -12
- sglang/srt/managers/scheduler.py +41 -28
- sglang/srt/managers/scheduler_output_processor_mixin.py +25 -9
- sglang/srt/managers/tokenizer_manager.py +5 -1
- sglang/srt/managers/tp_worker.py +3 -3
- sglang/srt/managers/tp_worker_overlap_thread.py +9 -4
- sglang/srt/mem_cache/memory_pool.py +87 -0
- sglang/srt/model_executor/cuda_graph_runner.py +4 -3
- sglang/srt/model_executor/forward_batch_info.py +51 -95
- sglang/srt/model_executor/model_runner.py +19 -25
- sglang/srt/models/deepseek.py +12 -2
- sglang/srt/models/deepseek_nextn.py +101 -6
- sglang/srt/models/deepseek_v2.py +144 -70
- sglang/srt/models/deepseek_vl2.py +9 -4
- sglang/srt/models/gemma3_causal.py +1 -1
- sglang/srt/models/llama4.py +0 -1
- sglang/srt/models/minicpmo.py +5 -1
- sglang/srt/models/mllama4.py +2 -2
- sglang/srt/models/qwen2_5_vl.py +3 -6
- sglang/srt/models/qwen2_vl.py +3 -7
- sglang/srt/models/roberta.py +178 -0
- sglang/srt/openai_api/adapter.py +50 -11
- sglang/srt/openai_api/protocol.py +2 -0
- sglang/srt/reasoning_parser.py +25 -1
- sglang/srt/server_args.py +31 -24
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +3 -3
- sglang/srt/torch_memory_saver_adapter.py +10 -1
- sglang/srt/utils.py +5 -1
- sglang/test/runners.py +6 -13
- sglang/test/send_one.py +84 -28
- sglang/test/test_utils.py +74 -18
- sglang/version.py +1 -1
- {sglang-0.4.5.post3.dist-info → sglang-0.4.6.post1.dist-info}/METADATA +5 -6
- {sglang-0.4.5.post3.dist-info → sglang-0.4.6.post1.dist-info}/RECORD +97 -80
- {sglang-0.4.5.post3.dist-info → sglang-0.4.6.post1.dist-info}/WHEEL +1 -1
- {sglang-0.4.5.post3.dist-info → sglang-0.4.6.post1.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.5.post3.dist-info → sglang-0.4.6.post1.dist-info}/top_level.txt +0 -0
sglang/srt/models/roberta.py
ADDED
@@ -0,0 +1,178 @@
+# SPDX-License-Identifier: Apache-2.0
+
+import itertools
+from typing import Iterable, Optional, Tuple
+
+import torch
+from torch import nn
+
+from sglang.srt.layers.pooler import Pooler, PoolingType
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.vocab_parallel_embedding import VocabParallelEmbedding
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.models.bert import BertEncoder
+
+RobertaConfig = None
+
+
+class RobertaEmbedding(nn.Module):
+
+    def __init__(self, config: RobertaConfig):
+        super().__init__()
+        self.size = config.hidden_size
+        self.word_embeddings = VocabParallelEmbedding(
+            config.vocab_size, config.hidden_size
+        )
+        self.padding_idx = config.pad_token_id
+        self.position_embeddings = nn.Embedding(
+            config.max_position_embeddings,
+            config.hidden_size,
+            padding_idx=self.padding_idx,
+        )
+
+        self.token_type_embeddings = nn.Embedding(
+            config.type_vocab_size, config.hidden_size
+        )
+        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+
+        self.position_ids = nn.Parameter(
+            torch.empty((1, config.max_position_embeddings)),
+        )
+
+        self.position_embedding_type = config.position_embedding_type
+        if self.position_embedding_type != "absolute":
+            raise ValueError(
+                "Only 'absolute' position_embedding_type" + " is supported"
+            )
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        seq_lens: torch.Tensor,
+        position_ids: torch.Tensor,
+        inputs_embeds=None,
+        token_type_ids: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        input_shape = input_ids.size()
+        inputs_embeds = self.word_embeddings(input_ids)
+
+        # adapted from vllm: https://github.com/vllm-project/vllm/commit/4a18fd14ba4a349291c798a16bf62fa8a9af0b6b/vllm/model_executor/models/roberta.py
+
+        pos_list = []
+        token_list = []
+        offset = 0
+        for seq_len in seq_lens:
+            pos_list.append(position_ids[offset : offset + seq_len])
+            token_list.append(input_ids[offset : offset + seq_len])
+            offset += seq_len
+
+        new_pos_list = []
+        for positions, tokens in zip(pos_list, token_list):
+            # Verify assumption that incoming positions are
+            # always a sequence from 0 to N.
+            expected_pos = torch.arange(
+                positions.size()[0], dtype=torch.long, device=inputs_embeds.device
+            )
+            assert torch.equal(positions, expected_pos)
+            new_pos_list.append(
+                create_position_ids_from_input_ids(tokens, self.padding_idx)
+            )
+        position_ids = torch.cat(new_pos_list)
+
+        # Position embeddings.
+        position_embeddings = self.position_embeddings(position_ids)
+        if token_type_ids is None:
+            token_type_ids = torch.zeros(
+                input_shape, dtype=torch.long, device=inputs_embeds.device
+            )
+
+        token_type_embeddings = self.token_type_embeddings(token_type_ids)
+        embeddings = inputs_embeds + token_type_embeddings + position_embeddings
+        embeddings = self.LayerNorm(embeddings)
+        return embeddings
+
+
+class XLMRobertaModel(nn.Module):
+    def __init__(
+        self,
+        *,
+        config: RobertaConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+
+        self.config = config
+        self.embeddings = RobertaEmbedding(config)
+        self.encoder = BertEncoder(config=config, quant_config=quant_config, prefix="")
+        self.pooler = Pooler(pooling_type=PoolingType.CLS, normalize=True)
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+        get_embedding: bool = False,
+    ) -> torch.Tensor:
+        assert get_embedding == True
+        # Your tokenized IDs
+
+        hidden_states = self.embeddings(
+            input_ids=input_ids,
+            position_ids=positions,
+            seq_lens=forward_batch.seq_lens,
+        )
+
+        hidden_states = self.encoder(hidden_states, forward_batch=forward_batch)
+        pooler_out = self.pooler(hidden_states, forward_batch)
+        return pooler_out
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "query", "q"),
+            ("qkv_proj", "key", "k"),
+            ("qkv_proj", "value", "v"),
+        ]
+
+        params_dict = dict(self.named_parameters())
+        for name, loaded_weight in weights:
+            name = name.replace("self", "self_attn")
+            if "pooler" in name:
+                continue
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+
+
+# Adapted from transformers
+def create_position_ids_from_input_ids(
+    input_ids, padding_idx, past_key_values_length=0
+):
+    mask = input_ids.ne(padding_idx).int()
+    incremental_indices = (
+        torch.cumsum(mask, dim=0).type_as(mask) + past_key_values_length
+    ) * mask
+    return incremental_indices.long() + padding_idx
+
+
+EntryClass = [XLMRobertaModel]
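The position-id handling above follows the RoBERTa convention: non-padding tokens are numbered starting at padding_idx + 1 while padding positions stay at padding_idx. A small standalone sketch of the helper's behavior; the token values below are made up for illustration:

import torch

def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0):
    # Same logic as the helper added in sglang/srt/models/roberta.py.
    mask = input_ids.ne(padding_idx).int()
    incremental_indices = (
        torch.cumsum(mask, dim=0).type_as(mask) + past_key_values_length
    ) * mask
    return incremental_indices.long() + padding_idx

tokens = torch.tensor([0, 42, 43, 44, 2, 1, 1])  # hypothetical ids; 1 is the pad id
print(create_position_ids_from_input_ids(tokens, padding_idx=1))
# tensor([2, 3, 4, 5, 6, 1, 1])  -> real tokens count up from padding_idx + 1, pads stay put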
sglang/srt/openai_api/adapter.py
CHANGED
@@ -715,7 +715,10 @@ def v1_generate_response(
 
 
 async def v1_completions(tokenizer_manager, raw_request: Request):
-    request_json = await raw_request.json()
+    try:
+        request_json = await raw_request.json()
+    except Exception as e:
+        return create_error_response("Invalid request body, error: ", str(e))
     all_requests = [CompletionRequest(**request_json)]
     created = int(time.time())
     adapted_request, request = v1_generate_request(all_requests)
@@ -909,6 +912,7 @@ def v1_chat_generate_request(
 
     # NOTE: with openai API, the prompt's logprobs are always not computed
 
+    is_multimodal = tokenizer_manager.model_config.is_multimodal
    for request in all_requests:
        # Prep the data needed for the underlying GenerateReqInput:
        # - prompt: The full prompt string.
@@ -918,6 +922,7 @@ def v1_chat_generate_request(
        # None skips any image processing in GenerateReqInput.
        strict_tag = None
        prompt = ""
+        prompt_ids = []
        if not isinstance(request.messages, str):
            # Apply chat template and its stop strings.
            tools = None
@@ -964,10 +969,10 @@ def v1_chat_generate_request(
                        ),
                    }
                )
-                # TODO fix the compatible issues with xgrammar
-                strict_tag = None
 
            for message in request.messages:
+                if message.content is None:
+                    message.content = ""
                if isinstance(message.content, str):
                    openai_compatible_messages.append(
                        {"role": message.role, "content": message.content}
@@ -998,6 +1003,11 @@ def v1_chat_generate_request(
                    tokenize=True,
                    add_generation_prompt=True,
                    tools=tools,
+                    **(
+                        request.chat_template_kwargs
+                        if request.chat_template_kwargs
+                        else {}
+                    ),
                )
            except:
                # This except branch will be triggered when the chosen model
@@ -1009,6 +1019,11 @@ def v1_chat_generate_request(
                    tokenize=True,
                    add_generation_prompt=True,
                    tools=tools,
+                    **(
+                        request.chat_template_kwargs
+                        if request.chat_template_kwargs
+                        else {}
+                    ),
                )
 
            if assistant_prefix:
@@ -1019,7 +1034,7 @@ def v1_chat_generate_request(
                ):
                    encoded = encoded[1:]
                prompt_ids += encoded
-            if tokenizer_manager.model_config.is_multimodal:
+            if is_multimodal:
                prompt = tokenizer_manager.tokenizer.decode(prompt_ids)
            stop = request.stop
            image_data = None
@@ -1064,8 +1079,9 @@ def v1_chat_generate_request(
                    stop.append(request.stop)
                else:
                    stop.extend(request.stop)
-            prompt_ids = tokenizer_manager.tokenizer.encode(prompt)
 
+            if not is_multimodal:
+                prompt_ids = tokenizer_manager.tokenizer.encode(prompt)
        else:
            # Use the raw prompt and stop strings if the messages is already a string.
            prompt_ids = request.messages
@@ -1135,7 +1151,7 @@ def v1_chat_generate_request(
        audio_data_list.append(audio_data)
        modalities_list.append(modalities)
    if len(all_requests) == 1:
-        if tokenizer_manager.model_config.is_multimodal:
+        if is_multimodal:
            # processor will need text input
            prompt_kwargs = {"text": prompts[0]}
        else:
@@ -1175,6 +1191,7 @@ def v1_chat_generate_request(
        modalities=modalities_list,
        lora_path=lora_paths,
        bootstrap_host=all_requests[0].bootstrap_host,
+        bootstrap_port=all_requests[0].bootstrap_port,
        bootstrap_room=all_requests[0].bootstrap_room,
    )
 
@@ -1241,16 +1258,34 @@ def v1_chat_generate_response(
        tool_calls = None
        text = ret_item["text"]
 
+        enable_thinking = True
        if isinstance(request, list):
            tool_choice = request[idx].tool_choice
            tools = request[idx].tools
            separate_reasoning = request[idx].separate_reasoning
+
+            if (
+                request[idx].chat_template_kwargs
+                and request[idx].chat_template_kwargs.get("enable_thinking") is not None
+            ):
+                enable_thinking = request[idx].chat_template_kwargs.get(
+                    "enable_thinking", True
+                )
        else:
            tool_choice = request.tool_choice
            tools = request.tools
            separate_reasoning = request.separate_reasoning
 
-        if reasoning_parser and separate_reasoning:
+            if (
+                request.chat_template_kwargs
+                and request.chat_template_kwargs.get("enable_thinking") is not None
+            ):
+                enable_thinking = request.chat_template_kwargs.get(
+                    "enable_thinking", True
+                )
+
+        reasoning_text = None
+        if reasoning_parser and separate_reasoning and enable_thinking:
            try:
                parser = ReasoningParser(
                    model_type=reasoning_parser, stream_reasoning=False
@@ -1262,8 +1297,6 @@ def v1_chat_generate_response(
                    HTTPStatus.BAD_REQUEST,
                    "Failed to parse reasoning related info to json format!",
                )
-        else:
-            reasoning_text = None
 
        if tool_choice != "none" and tools:
            parser = FunctionCallParser(tools, tool_call_parser)
@@ -1378,7 +1411,10 @@ def v1_chat_generate_response(
 async def v1_chat_completions(
     tokenizer_manager, raw_request: Request, cache_report=False
 ):
-    request_json = await raw_request.json()
+    try:
+        request_json = await raw_request.json()
+    except Exception as e:
+        return create_error_response("Invalid request body, error: ", str(e))
     all_requests = [ChatCompletionRequest(**request_json)]
     created = int(time.time())
     adapted_request, request = v1_chat_generate_request(all_requests, tokenizer_manager)
@@ -1799,7 +1835,10 @@ def v1_embedding_response(ret, model_path, to_file=False):
 
 
 async def v1_embeddings(tokenizer_manager, raw_request: Request):
-    request_json = await raw_request.json()
+    try:
+        request_json = await raw_request.json()
+    except Exception as e:
+        return create_error_response("Invalid request body, error: ", str(e))
     all_requests = [EmbeddingRequest(**request_json)]
     adapted_request, request = v1_embedding_request(all_requests, tokenizer_manager)
 
sglang/srt/openai_api/protocol.py
CHANGED
@@ -361,9 +361,11 @@ class ChatCompletionRequest(BaseModel):
     session_params: Optional[Dict] = None
     separate_reasoning: bool = True
     stream_reasoning: bool = True
+    chat_template_kwargs: Optional[Dict] = None
 
     # For PD disaggregation
     bootstrap_host: Optional[str] = None
+    bootstrap_port: Optional[int] = None
     bootstrap_room: Optional[int] = None
 
 
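Taken together, these changes let OpenAI-compatible clients pass template options through the API: `chat_template_kwargs` is forwarded as keyword arguments into `tokenizer.apply_chat_template(...)`, and an `enable_thinking` key inside it additionally gates the reasoning parser in `v1_chat_generate_response`. A hedged sketch of a client call; the endpoint, model name, and prompt are placeholders, not taken from the diff:

import requests

resp = requests.post(
    "http://localhost:30000/v1/chat/completions",  # assumed local sglang server
    json={
        "model": "Qwen/Qwen3-8B",                  # placeholder model name
        "messages": [{"role": "user", "content": "Why is the sky blue?"}],
        "separate_reasoning": True,
        # Forwarded to apply_chat_template; enable_thinking=False also skips
        # reasoning separation per the change above.
        "chat_template_kwargs": {"enable_thinking": False},
    },
)
print(resp.json()["choices"][0]["message"]["content"])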
sglang/srt/reasoning_parser.py
CHANGED
@@ -117,6 +117,29 @@ class DeepSeekR1Detector(BaseReasoningFormatDetector):
     # https://github.com/sgl-project/sglang/pull/3202#discussion_r1950153599
 
 
+class Qwen3Detector(BaseReasoningFormatDetector):
+    """
+    Detector for Qwen3 model.
+    Assumes reasoning format:
+      (<think>)*(.*)</think>
+    Returns all the text before the </think> tag as `reasoning_text`
+    and the rest of the text as `normal_text`.
+
+    Args:
+        stream_reasoning (bool): If False, accumulates reasoning content until the end tag.
+            If True, streams reasoning content as it arrives.
+    """
+
+    def __init__(self, stream_reasoning: bool = True):
+        # Qwen3 is assumed to be reasoning until `</think>` token
+        super().__init__(
+            "<think>",
+            "</think>",
+            force_reasoning=True,
+            stream_reasoning=stream_reasoning,
+        )
+
+
 class ReasoningParser:
     """
     Parser that handles both streaming and non-streaming scenarios for extracting
@@ -129,7 +152,8 @@ class ReasoningParser:
     """
 
     DetectorMap: Dict[str, BaseReasoningFormatDetector] = {
-        "deepseek-r1": DeepSeekR1Detector
+        "deepseek-r1": DeepSeekR1Detector,
+        "qwen3": Qwen3Detector,
     }
 
     def __init__(self, model_type: str = None, stream_reasoning: bool = True):
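The new detector registers under the `qwen3` key, so it can presumably be selected the same way as `deepseek-r1` (for example via the existing `--reasoning-parser` server flag). A minimal standalone sketch of the `</think>` split it describes; this mirrors the documented behavior, not sglang's internal `BaseReasoningFormatDetector` API:

def split_reasoning(text: str, end_tag: str = "</think>"):
    # Qwen3 is treated as reasoning-until-</think> (force_reasoning=True),
    # so output without the end tag is all reasoning text.
    if end_tag not in text:
        return text.replace("<think>", "").strip(), ""
    reasoning, normal = text.split(end_tag, 1)
    return reasoning.replace("<think>", "").strip(), normal.strip()

reasoning_text, normal_text = split_reasoning("<think>2 + 2 = 4</think>The answer is 4.")
print(reasoning_text)  # 2 + 2 = 4
print(normal_text)     # The answer is 4.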
sglang/srt/server_args.py
CHANGED
@@ -153,7 +153,7 @@ class ServerArgs:
     enable_nccl_nvls: bool = False
     disable_outlines_disk_cache: bool = False
     disable_custom_all_reduce: bool = False
-    enable_llama4_multimodal: Optional[bool] = None
+    enable_multimodal: Optional[bool] = None
     disable_overlap_schedule: bool = False
     enable_mixed_chunk: bool = False
     enable_dp_attention: bool = False
@@ -201,7 +201,7 @@ class ServerArgs:
        # Expert parallelism
        if self.enable_ep_moe:
            self.ep_size = self.tp_size
-            logger.info(
+            logger.warning(
                f"EP MoE is enabled. The expert parallel size is adjusted to be the same as the tensor parallel size[{self.tp_size}]."
            )
 
@@ -243,19 +243,25 @@ class ServerArgs:
                self.chunked_prefill_size = 2048
            else:
                self.chunked_prefill_size = 8192
-
        assert self.chunked_prefill_size % self.page_size == 0
 
        assert self.moe_dense_tp_size in {
            1,
            None,
-        },
+        }, "moe_dense_tp_size only support 1 and None currently"
 
        if self.attention_backend == "flashmla":
            logger.warning(
                "FlashMLA only supports a page_size of 64, change page_size to 64."
            )
            self.page_size = 64
+
+        if self.attention_backend == "cutlass_mla":
+            logger.warning(
+                "Cutlass MLA only supports a page_size of 128, change page_size to 128."
+            )
+            self.page_size = 128
+
        # Set cuda graph max batch size
        if self.cuda_graph_max_bs is None:
            # Based on detailed statistics, when serving TP1/TP2 models on lower-end GPUs with HBM<25G, you can either disable cuda graph or set `cuda_graph_max_bs` to a very small value to reduce the memory overhead of creating cuda graphs, with almost no impact on performance. However, when serving models with TP4 or TP8, we need to enable cuda graph to maintain high performance. In this case, we can set `cuda_graph_max_bs` to 80 (half of the default value 160) to reduce the memory overhead of creating cuda graphs. Looking at the logs from TP4 serving of qwen2-72b, a value of 80 is sufficient and can reduce the memory overhead of creating cuda graphs on lower-end GPUs compared to the original 160, avoiding OOM issues.
@@ -270,6 +276,7 @@ class ServerArgs:
            self.attention_backend = "torch_native"
            self.sampling_backend = "pytorch"
 
+        # Set kernel backends
        if self.sampling_backend is None:
            self.sampling_backend = (
                "flashinfer" if is_flashinfer_available() else "pytorch"
@@ -285,8 +292,6 @@ class ServerArgs:
        if self.grammar_backend is None:
            self.grammar_backend = "xgrammar"
 
-        self.enable_multimodal: Optional[bool] = self.enable_llama4_multimodal
-
        # Data parallelism attention
        if self.enable_dp_attention:
            self.schedule_conservativeness = self.schedule_conservativeness * 0.3
@@ -299,8 +304,8 @@ class ServerArgs:
                f"DP attention is enabled. The chunked prefill size is adjusted to {self.chunked_prefill_size} to avoid MoE kernel issues. "
            )
 
-        self.enable_sp_layernorm = False
        # DeepEP MoE
+        self.enable_sp_layernorm = False
        if self.enable_deepep_moe:
            if self.deepep_mode == "auto":
                assert (
@@ -310,7 +315,7 @@ class ServerArgs:
            self.enable_sp_layernorm = (
                self.dp_size < self.tp_size if self.enable_dp_attention else True
            )
-            logger.info(
+            logger.warning(
                f"DeepEP MoE is enabled. The expert parallel size is adjusted to be the same as the tensor parallel size[{self.tp_size}]."
            )
 
@@ -319,14 +324,11 @@ class ServerArgs:
            # NEXTN shares the same implementation of EAGLE
            self.speculative_algorithm = "EAGLE"
 
-        if (
-            self.speculative_algorithm == "EAGLE"
-            or self.speculative_algorithm == "EAGLE3"
-        ):
+        if self.speculative_algorithm in ("EAGLE", "EAGLE3"):
            if self.max_running_requests is None:
                self.max_running_requests = 48
            self.disable_overlap_schedule = True
-            logger.info(
+            logger.warning(
                "Overlap scheduler is disabled because of using "
                "eagle speculative decoding."
            )
@@ -345,7 +347,7 @@ class ServerArgs:
 
            if self.page_size > 1 and self.speculative_eagle_topk > 1:
                self.speculative_eagle_topk = 1
-                logger.info(
+                logger.warning(
                    "speculative_eagle_topk is adjusted to 1 when page_size > 1"
                )
 
@@ -353,7 +355,7 @@ class ServerArgs:
                self.speculative_eagle_topk == 1
                and self.speculative_num_draft_tokens != self.speculative_num_steps + 1
            ):
-                logger.info(
+                logger.warning(
                    "speculative_num_draft_tokens is adjusted to speculative_num_steps + 1 when speculative_eagle_topk == 1"
                )
                self.speculative_num_draft_tokens = self.speculative_num_steps + 1
@@ -424,7 +426,7 @@ class ServerArgs:
        parser.add_argument(
            "--skip-tokenizer-init",
            action="store_true",
-            help="If set, skip init tokenizer and pass input_ids in generate request",
+            help="If set, skip init tokenizer and pass input_ids in generate request.",
        )
        parser.add_argument(
            "--enable-tokenizer-batch-encode",
@@ -563,6 +565,7 @@ class ServerArgs:
            "name, a tag name, or a commit id. If unspecified, will use "
            "the default version.",
        )
+
        # Memory and scheduling
        parser.add_argument(
            "--mem-fraction-static",
@@ -827,7 +830,14 @@ class ServerArgs:
        parser.add_argument(
            "--attention-backend",
            type=str,
-            choices=[
+            choices=[
+                "flashinfer",
+                "triton",
+                "torch_native",
+                "fa3",
+                "flashmla",
+                "cutlass_mla",
+            ],
            default=ServerArgs.attention_backend,
            help="Choose the kernels for attention layers.",
        )
@@ -979,10 +989,10 @@ class ServerArgs:
            help="Disable the custom all-reduce kernel and fall back to NCCL.",
        )
        parser.add_argument(
-            "--enable-llama4-multimodal",
-            default=ServerArgs.enable_llama4_multimodal,
+            "--enable-multimodal",
+            default=ServerArgs.enable_multimodal,
            action="store_true",
-            help="Enable the multimodal functionality for
+            help="Enable the multimodal functionality for the served model. If the model being served is not multimodal, nothing will happen",
        )
        parser.add_argument(
            "--disable-overlap-schedule",
@@ -1364,10 +1374,7 @@ def auto_choose_speculative_params(self: ServerArgs):
 
    You can tune them on your own models and prompts with scripts/playground/bench_speculative.py
    """
-    if self.decrypted_config_file:
-        config_path = self.decrypted_config_file
-    else:
-        config_path = os.path.join(self.model_path, "config.json")
+    config_path = os.path.join(self.model_path, "config.json")
    if not os.path.exists(config_path):
        raise ValueError(f"{config_path} is not found.")
 
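With the new choice in place, `cutlass_mla` can be selected like any other attention backend, and the check above forces `page_size` to 128 during argument post-processing. A hedged sketch, assuming the usual `ServerArgs.add_cli_args` / `ServerArgs.from_cli_args` helpers and a placeholder model path:

import argparse
from sglang.srt.server_args import ServerArgs

parser = argparse.ArgumentParser()
ServerArgs.add_cli_args(parser)
args = parser.parse_args(
    ["--model-path", "deepseek-ai/DeepSeek-V3",  # placeholder model path
     "--attention-backend", "cutlass_mla"]
)
server_args = ServerArgs.from_cli_args(args)
print(server_args.attention_backend)  # "cutlass_mla"
print(server_args.page_size)          # 128, forced by the cutlass_mla check shown above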
sglang/srt/speculative/eagle_draft_cuda_graph_runner.py
CHANGED
@@ -85,9 +85,9 @@ class EAGLEDraftCudaGraphRunner:
                f"Capture cuda graph failed: {e}\n"
                "Possible solutions:\n"
                "1. set --mem-fraction-static to a smaller value (e.g., 0.8 or 0.7)\n"
-                "2.
-                "3.
-                "4. disable cuda graph by --disable-cuda-graph\n"
+                "2. set --cuda-graph-max-bs to a smaller value (e.g., 16)\n"
+                "3. disable torch compile by not using --enable-torch-compile\n"
+                "4. disable cuda graph by --disable-cuda-graph. (Not recommonded. Huge perf loss)\n"
                "Open an issue on GitHub https://github.com/sgl-project/sglang/issues/new/choose \n"
            )
 
sglang/srt/torch_memory_saver_adapter.py
CHANGED
@@ -6,7 +6,9 @@ try:
     import torch_memory_saver
 
     _primary_memory_saver = torch_memory_saver.TorchMemorySaver()
-except ImportError:
+    import_error = None
+except ImportError as e:
+    import_error = e
     pass
 
 logger = logging.getLogger(__name__)
@@ -15,6 +17,13 @@ logger = logging.getLogger(__name__)
 class TorchMemorySaverAdapter(ABC):
     @staticmethod
     def create(enable: bool):
+        if enable and import_error is not None:
+            logger.warning(
+                "enable_memory_saver is enabled, but "
+                "torch-memory-saver is not installed. Please install it "
+                "via `pip3 install torch-memory-saver`. "
+            )
+            raise import_error
         return (
             _TorchMemorySaverAdapterReal() if enable else _TorchMemorySaverAdapterNoop()
         )
sglang/srt/utils.py
CHANGED
@@ -1944,7 +1944,7 @@ def get_local_ip_by_remote() -> str:
         s.connect(("2001:4860:4860::8888", 80))  # Doesn't need to be reachable
         return s.getsockname()[0]
     except Exception:
-        raise ValueError(
+        raise ValueError("Can not get local ip")
 
 
 def is_page_size_one(server_args):
@@ -1970,7 +1970,11 @@ def is_fa3_default_architecture(hf_config):
        "Llama4ForConditionalGeneration",
        "LlamaForCausalLM",
        "MistralForCausalLM",
+        "MixtralForCausalLM",
        "Gemma2ForCausalLM",
+        "Gemma3ForConditionalGeneration",
+        "Qwen3ForCausalLM",
+        "Qwen3MoeForCausalLM",
    }
    return architectures[0] in default_archs
 
sglang/test/runners.py
CHANGED
@@ -190,25 +190,18 @@ class HFRunner:
            if attention_mask is not None:
                attention_mask = attention_mask.to(inputs_embeds.device)
 
-            outputs = self.model
-                input_ids=
+            outputs = self.model(
+                input_ids=input_ids,
                position_ids=position_ids,
                attention_mask=attention_mask,
                past_key_values=past_key_values,
+                output_hidden_states=True,
+                return_dict=True,
                inputs_embeds=inputs_embeds,
+                image_grid_thw=image_grid_thw,
            )
 
-
-            left_padding = pooling_mask[:, -1].sum() == pooling_mask.shape[0]  # TODO
-            if left_padding:
-                embeddings = outputs.last_hidden_state[:, -1]
-            else:
-                sequence_lengths = pooling_mask.sum(dim=1) - 1
-                batch_size = outputs.last_hidden_state.shape[0]
-                embeddings = outputs.last_hidden_state[
-                    torch.arange(batch_size, device=outputs.last_hidden_state.device),
-                    sequence_lengths,
-                ]
+            embeddings = outputs.hidden_states[-1][:, -1]
            embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)
            return embeddings.contiguous()
 
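The updated HFRunner requests `output_hidden_states=True` and pools the embedding from the last position of the final hidden state instead of the old padding-aware gather. A hedged standalone sketch of that pooling with plain transformers; the model name is a placeholder and this is not the HFRunner itself:

import torch
from transformers import AutoModel, AutoTokenizer

name = "intfloat/e5-small-v2"  # placeholder embedding model
tokenizer = AutoTokenizer.from_pretrained(name)
model = AutoModel.from_pretrained(name)

inputs = tokenizer("hello world", return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs, output_hidden_states=True, return_dict=True)

# Last layer, last position, then L2-normalize, mirroring the new HFRunner path.
embeddings = outputs.hidden_states[-1][:, -1]
embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)
print(embeddings.shape)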