sglang 0.4.5.post3__py3-none-any.whl → 0.4.6__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
- sglang/bench_one_batch.py +19 -3
- sglang/bench_serving.py +8 -9
- sglang/compile_deep_gemm.py +45 -4
- sglang/srt/code_completion_parser.py +1 -1
- sglang/srt/configs/deepseekvl2.py +1 -1
- sglang/srt/configs/model_config.py +9 -3
- sglang/srt/constrained/llguidance_backend.py +78 -61
- sglang/srt/conversation.py +34 -1
- sglang/srt/disaggregation/decode.py +59 -11
- sglang/srt/disaggregation/mini_lb.py +45 -8
- sglang/srt/disaggregation/mooncake/conn.py +198 -31
- sglang/srt/disaggregation/prefill.py +24 -9
- sglang/srt/entrypoints/http_server.py +8 -2
- sglang/srt/function_call_parser.py +77 -5
- sglang/srt/layers/attention/base_attn_backend.py +3 -0
- sglang/srt/layers/attention/flashattention_backend.py +28 -10
- sglang/srt/layers/attention/flashmla_backend.py +8 -11
- sglang/srt/layers/attention/vision.py +2 -0
- sglang/srt/layers/layernorm.py +38 -16
- sglang/srt/layers/logits_processor.py +2 -2
- sglang/srt/layers/moe/fused_moe_native.py +2 -4
- sglang/srt/layers/moe/fused_moe_triton/configs/E=264,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +41 -41
- sglang/srt/layers/moe/fused_moe_triton/configs/E=272,N=128,device_name=NVIDIA_H20,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +18 -15
- sglang/srt/layers/pooler.py +6 -0
- sglang/srt/layers/quantization/awq.py +5 -1
- sglang/srt/layers/quantization/deep_gemm.py +17 -10
- sglang/srt/layers/quantization/int8_kernel.py +32 -1
- sglang/srt/layers/radix_attention.py +13 -3
- sglang/srt/layers/rotary_embedding.py +170 -126
- sglang/srt/managers/data_parallel_controller.py +10 -3
- sglang/srt/managers/io_struct.py +7 -0
- sglang/srt/managers/mm_utils.py +85 -28
- sglang/srt/managers/multimodal_processors/base_processor.py +14 -1
- sglang/srt/managers/multimodal_processors/deepseek_vl_v2.py +9 -2
- sglang/srt/managers/multimodal_processors/gemma3.py +2 -5
- sglang/srt/managers/multimodal_processors/janus_pro.py +2 -2
- sglang/srt/managers/multimodal_processors/minicpm.py +4 -3
- sglang/srt/managers/multimodal_processors/qwen_vl.py +38 -13
- sglang/srt/managers/schedule_batch.py +29 -12
- sglang/srt/managers/scheduler.py +31 -20
- sglang/srt/managers/tokenizer_manager.py +5 -1
- sglang/srt/mem_cache/memory_pool.py +87 -0
- sglang/srt/model_executor/cuda_graph_runner.py +4 -3
- sglang/srt/model_executor/forward_batch_info.py +51 -95
- sglang/srt/model_executor/model_runner.py +11 -24
- sglang/srt/models/deepseek.py +12 -2
- sglang/srt/models/deepseek_nextn.py +101 -6
- sglang/srt/models/deepseek_v2.py +144 -70
- sglang/srt/models/deepseek_vl2.py +9 -4
- sglang/srt/models/gemma3_causal.py +1 -1
- sglang/srt/models/llama4.py +0 -1
- sglang/srt/models/minicpmo.py +5 -1
- sglang/srt/models/mllama4.py +2 -2
- sglang/srt/models/qwen2_5_vl.py +3 -6
- sglang/srt/models/qwen2_vl.py +3 -7
- sglang/srt/models/roberta.py +178 -0
- sglang/srt/openai_api/adapter.py +18 -8
- sglang/srt/server_args.py +15 -22
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +3 -3
- sglang/srt/torch_memory_saver_adapter.py +10 -1
- sglang/srt/utils.py +2 -1
- sglang/test/runners.py +6 -13
- sglang/test/test_utils.py +36 -18
- sglang/version.py +1 -1
- {sglang-0.4.5.post3.dist-info → sglang-0.4.6.dist-info}/METADATA +4 -5
- {sglang-0.4.5.post3.dist-info → sglang-0.4.6.dist-info}/RECORD +70 -68
- {sglang-0.4.5.post3.dist-info → sglang-0.4.6.dist-info}/WHEEL +1 -1
- {sglang-0.4.5.post3.dist-info → sglang-0.4.6.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.4.5.post3.dist-info → sglang-0.4.6.dist-info}/top_level.txt +0 -0
sglang/srt/models/roberta.py
ADDED
@@ -0,0 +1,178 @@
+# SPDX-License-Identifier: Apache-2.0
+
+import itertools
+from typing import Iterable, Optional, Tuple
+
+import torch
+from torch import nn
+
+from sglang.srt.layers.pooler import Pooler, PoolingType
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.vocab_parallel_embedding import VocabParallelEmbedding
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.model_loader.weight_utils import default_weight_loader
+from sglang.srt.models.bert import BertEncoder
+
+RobertaConfig = None
+
+
+class RobertaEmbedding(nn.Module):
+
+    def __init__(self, config: RobertaConfig):
+        super().__init__()
+        self.size = config.hidden_size
+        self.word_embeddings = VocabParallelEmbedding(
+            config.vocab_size, config.hidden_size
+        )
+        self.padding_idx = config.pad_token_id
+        self.position_embeddings = nn.Embedding(
+            config.max_position_embeddings,
+            config.hidden_size,
+            padding_idx=self.padding_idx,
+        )
+
+        self.token_type_embeddings = nn.Embedding(
+            config.type_vocab_size, config.hidden_size
+        )
+        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+
+        self.position_ids = nn.Parameter(
+            torch.empty((1, config.max_position_embeddings)),
+        )
+
+        self.position_embedding_type = config.position_embedding_type
+        if self.position_embedding_type != "absolute":
+            raise ValueError(
+                "Only 'absolute' position_embedding_type" + " is supported"
+            )
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        seq_lens: torch.Tensor,
+        position_ids: torch.Tensor,
+        inputs_embeds=None,
+        token_type_ids: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        input_shape = input_ids.size()
+        inputs_embeds = self.word_embeddings(input_ids)
+
+        # adpated from vllm: https://github.com/vllm-project/vllm/commit/4a18fd14ba4a349291c798a16bf62fa8a9af0b6b/vllm/model_executor/models/roberta.py
+
+        pos_list = []
+        token_list = []
+        offset = 0
+        for seq_len in seq_lens:
+            pos_list.append(position_ids[offset : offset + seq_len])
+            token_list.append(input_ids[offset : offset + seq_len])
+            offset += seq_len
+
+        new_pos_list = []
+        for positions, tokens in zip(pos_list, token_list):
+            # Verify assumption that incoming position are
+            # always a sequence from 0 to N.
+            expected_pos = torch.arange(
+                positions.size()[0], dtype=torch.long, device=inputs_embeds.device
+            )
+            assert torch.equal(positions, expected_pos)
+            new_pos_list.append(
+                create_position_ids_from_input_ids(tokens, self.padding_idx)
+            )
+        position_ids = torch.cat(new_pos_list)
+
+        # Position embeddings.
+        position_embeddings = self.position_embeddings(position_ids)
+        if token_type_ids is None:
+            token_type_ids = torch.zeros(
+                input_shape, dtype=torch.long, device=inputs_embeds.device
+            )
+
+        token_type_embeddings = self.token_type_embeddings(token_type_ids)
+        embeddings = inputs_embeds + token_type_embeddings + position_embeddings
+        embeddings = self.LayerNorm(embeddings)
+        return embeddings
+
+
+class XLMRobertaModel(nn.Module):
+    def __init__(
+        self,
+        *,
+        config: RobertaConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ):
+        super().__init__()
+
+        self.config = config
+        self.embeddings = RobertaEmbedding(config)
+        self.encoder = BertEncoder(config=config, quant_config=quant_config, prefix="")
+        self.pooler = Pooler(pooling_type=PoolingType.CLS, normalize=True)
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+        get_embedding: bool = False,
+    ) -> torch.Tensor:
+        assert get_embedding == True
+        # Your tokenized IDs
+
+        hidden_states = self.embeddings(
+            input_ids=input_ids,
+            position_ids=positions,
+            seq_lens=forward_batch.seq_lens,
+        )
+
+        hidden_states = self.encoder(hidden_states, forward_batch=forward_batch)
+        pooler_out = self.pooler(hidden_states, forward_batch)
+        return pooler_out
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "query", "q"),
+            ("qkv_proj", "key", "k"),
+            ("qkv_proj", "value", "v"),
+        ]
+
+        params_dict = dict(self.named_parameters())
+        for name, loaded_weight in weights:
+            name = name.replace("self", "self_attn")
+            if "pooler" in name:
+                continue
+            for param_name, weight_name, shard_id in stacked_params_mapping:
+
+                if weight_name not in name:
+                    continue
+                name = name.replace(weight_name, param_name)
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
+
+
+# Adapted from transformers
+def create_position_ids_from_input_ids(
+    input_ids, padding_idx, past_key_values_length=0
+):
+    mask = input_ids.ne(padding_idx).int()
+    incremental_indices = (
+        torch.cumsum(mask, dim=0).type_as(mask) + past_key_values_length
+    ) * mask
+    return incremental_indices.long() + padding_idx
+
+
+EntryClass = [XLMRobertaModel]
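For context (not part of the diff): the new `create_position_ids_from_input_ids` helper follows the RoBERTa convention that real tokens are numbered upward from `padding_idx + 1` while padding positions stay at `padding_idx`. A minimal self-contained sketch of that behavior, with an assumed padding id of 1:

import torch

def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0):
    # 1 for real tokens, 0 for padding tokens.
    mask = input_ids.ne(padding_idx).int()
    # Running count of real tokens (optionally shifted past a cached prefix), zeroed at padding.
    incremental_indices = (
        torch.cumsum(mask, dim=0).type_as(mask) + past_key_values_length
    ) * mask
    return incremental_indices.long() + padding_idx

tokens = torch.tensor([5, 7, 9, 1, 1])  # 1 is the assumed padding id
print(create_position_ids_from_input_ids(tokens, padding_idx=1))
# tensor([2, 3, 4, 1, 1]) -- real tokens count up from 2, padding stays at 1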
sglang/srt/openai_api/adapter.py
CHANGED
@@ -715,7 +715,10 @@ def v1_generate_response(
 
 
 async def v1_completions(tokenizer_manager, raw_request: Request):
-    request_json = await raw_request.json()
+    try:
+        request_json = await raw_request.json()
+    except Exception as e:
+        return create_error_response("Invalid request body, error: ", str(e))
     all_requests = [CompletionRequest(**request_json)]
     created = int(time.time())
     adapted_request, request = v1_generate_request(all_requests)
@@ -909,6 +912,7 @@ def v1_chat_generate_request(
 
     # NOTE: with openai API, the prompt's logprobs are always not computed
 
+    is_multimodal = tokenizer_manager.model_config.is_multimodal
     for request in all_requests:
         # Prep the data needed for the underlying GenerateReqInput:
         # - prompt: The full prompt string.
@@ -918,6 +922,7 @@ def v1_chat_generate_request(
         # None skips any image processing in GenerateReqInput.
         strict_tag = None
         prompt = ""
+        prompt_ids = []
         if not isinstance(request.messages, str):
             # Apply chat template and its stop strings.
             tools = None
@@ -964,8 +969,6 @@
                     ),
                 }
             )
-            # TODO fix the compatible issues with xgrammar
-            strict_tag = None
 
             for message in request.messages:
                 if isinstance(message.content, str):
@@ -1019,7 +1022,7 @@
                 ):
                     encoded = encoded[1:]
                 prompt_ids += encoded
-            if tokenizer_manager.model_config.is_multimodal:
+            if is_multimodal:
                 prompt = tokenizer_manager.tokenizer.decode(prompt_ids)
             stop = request.stop
             image_data = None
@@ -1064,8 +1067,9 @@
                 stop.append(request.stop)
             else:
                 stop.extend(request.stop)
-            prompt_ids = tokenizer_manager.tokenizer.encode(prompt)
 
+            if not is_multimodal:
+                prompt_ids = tokenizer_manager.tokenizer.encode(prompt)
         else:
             # Use the raw prompt and stop strings if the messages is already a string.
             prompt_ids = request.messages
@@ -1135,7 +1139,7 @@
         audio_data_list.append(audio_data)
         modalities_list.append(modalities)
     if len(all_requests) == 1:
-        if tokenizer_manager.model_config.is_multimodal:
+        if is_multimodal:
            # processor will need text input
            prompt_kwargs = {"text": prompts[0]}
        else:
@@ -1378,7 +1382,10 @@ def v1_chat_generate_response(
 async def v1_chat_completions(
     tokenizer_manager, raw_request: Request, cache_report=False
 ):
-    request_json = await raw_request.json()
+    try:
+        request_json = await raw_request.json()
+    except Exception as e:
+        return create_error_response("Invalid request body, error: ", str(e))
     all_requests = [ChatCompletionRequest(**request_json)]
     created = int(time.time())
     adapted_request, request = v1_chat_generate_request(all_requests, tokenizer_manager)
@@ -1799,7 +1806,10 @@ def v1_embedding_response(ret, model_path, to_file=False):
 
 
 async def v1_embeddings(tokenizer_manager, raw_request: Request):
-    request_json = await raw_request.json()
+    try:
+        request_json = await raw_request.json()
+    except Exception as e:
+        return create_error_response("Invalid request body, error: ", str(e))
     all_requests = [EmbeddingRequest(**request_json)]
     adapted_request, request = v1_embedding_request(all_requests, tokenizer_manager)
 
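For context (not part of the diff): the three endpoints above now share the same guard around body parsing, so a malformed JSON body yields an error response instead of an unhandled exception. A minimal sketch of the pattern under assumed names (`create_error_response` here is a hypothetical stand-in for sglang's helper):

from fastapi import FastAPI, Request
from fastapi.responses import JSONResponse

app = FastAPI()

def create_error_response(message: str, detail: str) -> JSONResponse:
    # Hypothetical stand-in; sglang's real helper builds an OpenAI-style error object.
    return JSONResponse(status_code=400, content={"error": message + detail})

@app.post("/v1/completions")
async def v1_completions(raw_request: Request):
    try:
        request_json = await raw_request.json()
    except Exception as e:
        # Malformed JSON is reported to the client instead of surfacing as a 500.
        return create_error_response("Invalid request body, error: ", str(e))
    return {"received": request_json}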
sglang/srt/server_args.py
CHANGED
@@ -153,7 +153,7 @@ class ServerArgs:
     enable_nccl_nvls: bool = False
     disable_outlines_disk_cache: bool = False
     disable_custom_all_reduce: bool = False
-
+    enable_multimodal: Optional[bool] = None
     disable_overlap_schedule: bool = False
     enable_mixed_chunk: bool = False
     enable_dp_attention: bool = False
@@ -201,7 +201,7 @@ class ServerArgs:
         # Expert parallelism
         if self.enable_ep_moe:
             self.ep_size = self.tp_size
-            logger.
+            logger.warning(
                 f"EP MoE is enabled. The expert parallel size is adjusted to be the same as the tensor parallel size[{self.tp_size}]."
             )
 
@@ -243,19 +243,19 @@ class ServerArgs:
                 self.chunked_prefill_size = 2048
             else:
                 self.chunked_prefill_size = 8192
-
         assert self.chunked_prefill_size % self.page_size == 0
 
         assert self.moe_dense_tp_size in {
             1,
             None,
-        },
+        }, "moe_dense_tp_size only support 1 and None currently"
 
         if self.attention_backend == "flashmla":
             logger.warning(
                 "FlashMLA only supports a page_size of 64, change page_size to 64."
             )
             self.page_size = 64
+
         # Set cuda graph max batch size
         if self.cuda_graph_max_bs is None:
             # Based on detailed statistics, when serving TP1/TP2 models on lower-end GPUs with HBM<25G, you can either disable cuda graph or set `cuda_graph_max_bs` to a very small value to reduce the memory overhead of creating cuda graphs, with almost no impact on performance. However, when serving models with TP4 or TP8, we need to enable cuda graph to maintain high performance. In this case, we can set `cuda_graph_max_bs` to 80 (half of the default value 160) to reduce the memory overhead of creating cuda graphs. Looking at the logs from TP4 serving of qwen2-72b, a value of 80 is sufficient and can reduce the memory overhead of creating cuda graphs on lower-end GPUs compared to the original 160, avoiding OOM issues.
@@ -270,6 +270,7 @@ class ServerArgs:
             self.attention_backend = "torch_native"
             self.sampling_backend = "pytorch"
 
+        # Set kernel backends
         if self.sampling_backend is None:
             self.sampling_backend = (
                 "flashinfer" if is_flashinfer_available() else "pytorch"
@@ -285,8 +286,6 @@ class ServerArgs:
         if self.grammar_backend is None:
             self.grammar_backend = "xgrammar"
 
-        self.enable_multimodal: Optional[bool] = self.enable_llama4_multimodal
-
         # Data parallelism attention
         if self.enable_dp_attention:
             self.schedule_conservativeness = self.schedule_conservativeness * 0.3
@@ -299,8 +298,8 @@ class ServerArgs:
                 f"DP attention is enabled. The chunked prefill size is adjusted to {self.chunked_prefill_size} to avoid MoE kernel issues. "
             )
 
-        self.enable_sp_layernorm = False
         # DeepEP MoE
+        self.enable_sp_layernorm = False
         if self.enable_deepep_moe:
             if self.deepep_mode == "auto":
                 assert (
@@ -310,7 +309,7 @@ class ServerArgs:
             self.enable_sp_layernorm = (
                 self.dp_size < self.tp_size if self.enable_dp_attention else True
             )
-            logger.
+            logger.warning(
                 f"DeepEP MoE is enabled. The expert parallel size is adjusted to be the same as the tensor parallel size[{self.tp_size}]."
             )
 
@@ -319,14 +318,11 @@ class ServerArgs:
             # NEXTN shares the same implementation of EAGLE
             self.speculative_algorithm = "EAGLE"
 
-        if (
-            self.speculative_algorithm == "EAGLE"
-            or self.speculative_algorithm == "EAGLE3"
-        ):
+        if self.speculative_algorithm in ("EAGLE", "EAGLE3"):
             if self.max_running_requests is None:
                 self.max_running_requests = 48
             self.disable_overlap_schedule = True
-            logger.
+            logger.warning(
                 "Overlap scheduler is disabled because of using "
                 "eagle speculative decoding."
             )
@@ -345,7 +341,7 @@ class ServerArgs:
 
         if self.page_size > 1 and self.speculative_eagle_topk > 1:
             self.speculative_eagle_topk = 1
-            logger.
+            logger.warning(
                 "speculative_eagle_topk is adjusted to 1 when page_size > 1"
             )
 
@@ -353,7 +349,7 @@ class ServerArgs:
             self.speculative_eagle_topk == 1
             and self.speculative_num_draft_tokens != self.speculative_num_steps + 1
         ):
-            logger.
+            logger.warning(
                 "speculative_num_draft_tokens is adjusted to speculative_num_steps + 1 when speculative_eagle_topk == 1"
             )
             self.speculative_num_draft_tokens = self.speculative_num_steps + 1
@@ -979,10 +975,10 @@ class ServerArgs:
             help="Disable the custom all-reduce kernel and fall back to NCCL.",
         )
         parser.add_argument(
-            "--enable-
-            default=ServerArgs.
+            "--enable-multimodal",
+            default=ServerArgs.enable_multimodal,
             action="store_true",
-            help="Enable the multimodal functionality for
+            help="Enable the multimodal functionality for the served model. If the model being served is not multimodal, nothing will happen",
         )
         parser.add_argument(
             "--disable-overlap-schedule",
@@ -1364,10 +1360,7 @@ def auto_choose_speculative_params(self: ServerArgs):
 
     You can tune them on your own models and prompts with scripts/playground/bench_speculative.py
     """
-
-        config_path = self.decrypted_config_file
-    else:
-        config_path = os.path.join(self.model_path, "config.json")
+    config_path = os.path.join(self.model_path, "config.json")
     if not os.path.exists(config_path):
         raise ValueError(f"{config_path} is not found.")
 
sglang/srt/speculative/eagle_draft_cuda_graph_runner.py
CHANGED
@@ -85,9 +85,9 @@ class EAGLEDraftCudaGraphRunner:
                 f"Capture cuda graph failed: {e}\n"
                 "Possible solutions:\n"
                 "1. set --mem-fraction-static to a smaller value (e.g., 0.8 or 0.7)\n"
-                "2.
-                "3.
-                "4. disable cuda graph by --disable-cuda-graph\n"
+                "2. set --cuda-graph-max-bs to a smaller value (e.g., 16)\n"
+                "3. disable torch compile by not using --enable-torch-compile\n"
+                "4. disable cuda graph by --disable-cuda-graph. (Not recommonded. Huge perf loss)\n"
                 "Open an issue on GitHub https://github.com/sgl-project/sglang/issues/new/choose \n"
             )
 
sglang/srt/torch_memory_saver_adapter.py
CHANGED
@@ -6,7 +6,9 @@ try:
     import torch_memory_saver
 
     _primary_memory_saver = torch_memory_saver.TorchMemorySaver()
-except ImportError:
+    import_error = None
+except ImportError as e:
+    import_error = e
     pass
 
 logger = logging.getLogger(__name__)
@@ -15,6 +17,13 @@ logger = logging.getLogger(__name__)
 class TorchMemorySaverAdapter(ABC):
     @staticmethod
     def create(enable: bool):
+        if enable and import_error is not None:
+            logger.warning(
+                "enable_memory_saver is enabled, but "
+                "torch-memory-saver is not installed. Please install it "
+                "via `pip3 install torch-memory-saver`. "
+            )
+            raise import_error
         return (
             _TorchMemorySaverAdapterReal() if enable else _TorchMemorySaverAdapterNoop()
         )
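For context (not part of the diff): the adapter now remembers the ImportError raised at import time and re-raises it only when the memory saver is actually requested. A minimal sketch of that deferred-import pattern, using a hypothetical optional_dep module:

import logging

logger = logging.getLogger(__name__)

try:
    import optional_dep  # hypothetical optional dependency
    import_error = None
except ImportError as e:
    import_error = e

def create(enable: bool):
    # Fail loudly only when the optional feature is actually requested.
    if enable and import_error is not None:
        logger.warning("The feature is enabled, but its optional dependency is not installed.")
        raise import_error
    return "real adapter" if enable else "noop adapter"

Importing such a module never fails on its own; calling create(enable=True) surfaces the original ImportError with its full message.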
sglang/srt/utils.py
CHANGED
@@ -1944,7 +1944,7 @@ def get_local_ip_by_remote() -> str:
         s.connect(("2001:4860:4860::8888", 80))  # Doesn't need to be reachable
         return s.getsockname()[0]
     except Exception:
-        raise ValueError(
+        raise ValueError("Can not get local ip")
 
 
 def is_page_size_one(server_args):
@@ -1971,6 +1971,7 @@ def is_fa3_default_architecture(hf_config):
         "LlamaForCausalLM",
         "MistralForCausalLM",
         "Gemma2ForCausalLM",
+        "Gemma3ForConditionalGeneration",
     }
     return architectures[0] in default_archs
 
sglang/test/runners.py
CHANGED
@@ -190,25 +190,18 @@ class HFRunner:
             if attention_mask is not None:
                 attention_mask = attention_mask.to(inputs_embeds.device)
 
-            outputs = self.model
-                input_ids=
+            outputs = self.model(
+                input_ids=input_ids,
                 position_ids=position_ids,
                 attention_mask=attention_mask,
                 past_key_values=past_key_values,
+                output_hidden_states=True,
+                return_dict=True,
                 inputs_embeds=inputs_embeds,
+                image_grid_thw=image_grid_thw,
             )
 
-
-            left_padding = pooling_mask[:, -1].sum() == pooling_mask.shape[0]  # TODO
-            if left_padding:
-                embeddings = outputs.last_hidden_state[:, -1]
-            else:
-                sequence_lengths = pooling_mask.sum(dim=1) - 1
-                batch_size = outputs.last_hidden_state.shape[0]
-                embeddings = outputs.last_hidden_state[
-                    torch.arange(batch_size, device=outputs.last_hidden_state.device),
-                    sequence_lengths,
-                ]
+            embeddings = outputs.hidden_states[-1][:, -1]
             embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)
             return embeddings.contiguous()
 
sglang/test/test_utils.py
CHANGED
@@ -8,7 +8,6 @@ import random
 import subprocess
 import threading
 import time
-import traceback
 import unittest
 from concurrent.futures import ThreadPoolExecutor
 from dataclasses import dataclass
@@ -34,27 +33,44 @@ from sglang.srt.utils import (
 from sglang.test.run_eval import run_eval
 from sglang.utils import get_exception_traceback
 
-
-DEFAULT_FP8_MODEL_NAME_FOR_ACCURACY_TEST = "neuralmagic/Meta-Llama-3-8B-Instruct-FP8"
-DEFAULT_FP8_MODEL_NAME_FOR_DYNAMIC_QUANT_ACCURACY_TEST = (
-    "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8-dynamic"
-)
-DEFAULT_FP8_MODEL_NAME_FOR_MODELOPT_QUANT_ACCURACY_TEST = (
-    "nvidia/Llama-3.1-8B-Instruct-FP8"
-)
-
+# General test models
 DEFAULT_MODEL_NAME_FOR_TEST = "meta-llama/Llama-3.1-8B-Instruct"
 DEFAULT_SMALL_MODEL_NAME_FOR_TEST = "meta-llama/Llama-3.2-1B-Instruct"
 DEFAULT_MOE_MODEL_NAME_FOR_TEST = "mistralai/Mixtral-8x7B-Instruct-v0.1"
 DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST = "Qwen/Qwen1.5-MoE-A2.7B"
-
+
+# MLA test models
 DEFAULT_MLA_MODEL_NAME_FOR_TEST = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct"
 DEFAULT_MLA_FP8_MODEL_NAME_FOR_TEST = "neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8"
+DEFAULT_MODEL_NAME_FOR_TEST_MLA = "lmsys/sglang-ci-dsv3-test"
+DEFAULT_MODEL_NAME_FOR_TEST_MLA_NEXTN = "lmsys/sglang-ci-dsv3-test-NextN"
+
+# FP8 models
+DEFAULT_MODEL_NAME_FOR_TEST_FP8 = "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"
+DEFAULT_MODEL_NAME_FOR_ACCURACY_TEST_FP8 = "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8"
+DEFAULT_MODEL_NAME_FOR_DYNAMIC_QUANT_ACCURACY_TEST_FP8 = (
+    "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8-dynamic"
+)
+DEFAULT_MODEL_NAME_FOR_MODELOPT_QUANT_ACCURACY_TEST_FP8 = (
+    "nvidia/Llama-3.1-8B-Instruct-FP8"
+)
+
+# EAGLE
+DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST = "meta-llama/Llama-2-7b-chat-hf"
+DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST = "lmsys/sglang-EAGLE-llama2-chat-7B"
+DEFAULT_MODEL_NAME_FOR_TEST_EAGLE3 = "jamesliu1/sglang-EAGLE3-Llama-3.1-Instruct-8B"
+
+# Other use cases
+DEFAULT_MODEL_NAME_FOR_TEST_LOCAL_ATTENTION = (
+    "meta-llama/Llama-4-Scout-17B-16E-Instruct"
+)
+DEFAULT_SMALL_EMBEDDING_MODEL_NAME_FOR_TEST = "Alibaba-NLP/gte-Qwen2-1.5B-instruct"
 DEFAULT_REASONING_MODEL_NAME_FOR_TEST = "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
 DEFAULT_AWQ_MOE_MODEL_NAME_FOR_TEST = (
     "hugging-quants/Mixtral-8x7B-Instruct-v0.1-AWQ-INT4"
 )
-
+
+# Nightly tests
 DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1 = "meta-llama/Llama-3.1-8B-Instruct,mistralai/Mistral-7B-Instruct-v0.3,deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct,google/gemma-2-27b-it"
 DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2 = "meta-llama/Llama-3.1-70B-Instruct,mistralai/Mixtral-8x7B-Instruct-v0.1,Qwen/Qwen2-57B-A14B-Instruct"
 DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1 = "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8,neuralmagic/Mistral-7B-Instruct-v0.3-FP8,neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8,neuralmagic/gemma-2-2b-it-FP8"
@@ -63,12 +79,11 @@ DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_QUANT_TP1 = "hugging-quants/Meta-Llama-3.1-8
 DEFAULT_SMALL_MODEL_NAME_FOR_TEST_QWEN = "Qwen/Qwen2.5-1.5B-Instruct"
 DEFAULT_SMALL_VLM_MODEL_NAME = "Qwen/Qwen2-VL-2B"
 
-DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST = "meta-llama/Llama-2-7b-chat-hf"
-DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST = "lmsys/sglang-EAGLE-llama2-chat-7B"
-
 DEFAULT_IMAGE_URL = "https://github.com/sgl-project/sglang/blob/main/test/lang/example_image.png?raw=true"
 DEFAULT_VIDEO_URL = "https://raw.githubusercontent.com/EvolvingLMMs-Lab/sglang/dev/onevision_local/assets/jobs.mp4"
 
+DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH = 1000
+
 
 def is_in_ci():
     """Return whether it is in CI runner."""
@@ -494,7 +509,7 @@ def run_unittest_files(files: List[TestFile], timeout_per_file: float):
     tic = time.time()
     success = True
 
-    for file in files:
+    for i, file in enumerate(files):
         filename, estimated_time = file.name, file.estimated_time
         process = None
 
@@ -502,7 +517,10 @@ def run_unittest_files(files: List[TestFile], timeout_per_file: float):
         nonlocal process
 
         filename = os.path.join(os.getcwd(), filename)
-        print(
+        print(
+            f".\n.\nBegin ({i}/{len(files) - 1}):\npython3 {filename}\n.\n.\n",
+            flush=True,
+        )
         tic = time.time()
 
         process = subprocess.Popen(
@@ -512,7 +530,7 @@ def run_unittest_files(files: List[TestFile], timeout_per_file: float):
         elapsed = time.time() - tic
 
         print(
-            f".\n.\nEnd:\n{filename=}, {elapsed=:.0f}, {estimated_time=}\n.\n.\n",
+            f".\n.\nEnd ({i}/{len(files) - 1}):\n{filename=}, {elapsed=:.0f}, {estimated_time=}\n.\n.\n",
            flush=True,
        )
        return process.returncode
sglang/version.py
CHANGED
@@ -1 +1 @@
-__version__ = "0.4.5.post3"
+__version__ = "0.4.6"
{sglang-0.4.5.post3.dist-info → sglang-0.4.6.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sglang
-Version: 0.4.5.post3
+Version: 0.4.6
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
                                  Version 2.0, January 2004
@@ -225,7 +225,7 @@ Requires-Dist: fastapi; extra == "runtime-common"
 Requires-Dist: hf_transfer; extra == "runtime-common"
 Requires-Dist: huggingface_hub; extra == "runtime-common"
 Requires-Dist: interegular; extra == "runtime-common"
-Requires-Dist: llguidance
+Requires-Dist: llguidance<0.8.0,>=0.7.11; extra == "runtime-common"
 Requires-Dist: modelscope; extra == "runtime-common"
 Requires-Dist: ninja; extra == "runtime-common"
 Requires-Dist: orjson; extra == "runtime-common"
@@ -242,7 +242,6 @@ Requires-Dist: torchao>=0.7.0; extra == "runtime-common"
 Requires-Dist: transformers==4.51.1; extra == "runtime-common"
 Requires-Dist: uvicorn; extra == "runtime-common"
 Requires-Dist: uvloop; extra == "runtime-common"
-Requires-Dist: compressed-tensors; extra == "runtime-common"
 Requires-Dist: xgrammar==0.1.17; extra == "runtime-common"
 Provides-Extra: srt
 Requires-Dist: sglang[runtime_common]; extra == "srt"
@@ -409,5 +408,5 @@ It is supported by the following institutions: AMD, Atlas Cloud, Baseten, Cursor
 
 For enterprises interested in adopting or deploying SGLang at scale, including technical consulting, sponsorship opportunities, or partnership inquiries, please contact us at contact@sglang.ai.
 
-## Acknowledgment
-We learned the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).
+## Acknowledgment
+We learned the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).