sglang 0.1.19__py3-none-any.whl → 0.1.21__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +1 -1
- sglang/backend/runtime_endpoint.py +14 -4
- sglang/bench_latency.py +6 -3
- sglang/global_config.py +22 -16
- sglang/lang/chat_template.py +2 -2
- sglang/lang/ir.py +3 -3
- sglang/srt/layers/radix_attention.py +14 -37
- sglang/srt/layers/token_attention.py +2 -9
- sglang/srt/managers/controller/cuda_graph_runner.py +196 -0
- sglang/srt/managers/controller/infer_batch.py +256 -42
- sglang/srt/managers/controller/manager_multi.py +6 -2
- sglang/srt/managers/controller/manager_single.py +125 -50
- sglang/srt/managers/controller/model_runner.py +69 -284
- sglang/srt/managers/controller/radix_cache.py +4 -3
- sglang/srt/managers/controller/schedule_heuristic.py +4 -0
- sglang/srt/managers/controller/tp_worker.py +44 -44
- sglang/srt/memory_pool.py +52 -50
- sglang/srt/models/minicpm.py +1 -8
- sglang/srt/models/qwen2_moe.py +126 -107
- sglang/srt/server.py +11 -15
- sglang/srt/server_args.py +12 -4
- sglang/srt/utils.py +1 -1
- {sglang-0.1.19.dist-info → sglang-0.1.21.dist-info}/METADATA +9 -1
- {sglang-0.1.19.dist-info → sglang-0.1.21.dist-info}/RECORD +27 -26
- {sglang-0.1.19.dist-info → sglang-0.1.21.dist-info}/WHEEL +1 -1
- {sglang-0.1.19.dist-info → sglang-0.1.21.dist-info}/LICENSE +0 -0
- {sglang-0.1.19.dist-info → sglang-0.1.21.dist-info}/top_level.txt +0 -0
sglang/srt/models/qwen2_moe.py
CHANGED
@@ -8,24 +8,28 @@ import torch
 import torch.nn.functional as F
 from torch import nn
 from transformers import PretrainedConfig
-
 from vllm.config import CacheConfig
-from vllm.distributed import (get_tensor_model_parallel_world_size,
-                              tensor_model_parallel_all_reduce)
+from vllm.distributed import (
+    get_tensor_model_parallel_world_size,
+    tensor_model_parallel_all_reduce,
+)
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.layernorm import RMSNorm
-from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
-                                               QKVParallelLinear,
-                                               ReplicatedLinear,
-                                               RowParallelLinear)
+from vllm.model_executor.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
-from vllm.model_executor.layers.quantization.base_config import (
-    QuantizationConfig)
+from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.sampler import Sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
-    ParallelLMHead, VocabParallelEmbedding)
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors, SamplerOutput
@@ -34,8 +38,8 @@ from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.radix_attention import RadixAttention
 from sglang.srt.managers.controller.model_runner import InputMetadata
 
-class Qwen2MoeMLP(nn.Module):
 
+class Qwen2MoeMLP(nn.Module):
     def __init__(
         self,
         hidden_size: int,
@@ -46,17 +50,20 @@ class Qwen2MoeMLP(nn.Module):
     ) -> None:
         super().__init__()
         self.gate_up_proj = MergedColumnParallelLinear(
-            hidden_size, [intermediate_size] * 2,
+            hidden_size, [intermediate_size] * 2, bias=False, quant_config=quant_config
+        )
+        self.down_proj = RowParallelLinear(
+            intermediate_size,
+            hidden_size,
             bias=False,
-            quant_config=quant_config)
-        self.down_proj = RowParallelLinear(intermediate_size,
-                                           hidden_size,
-                                           bias=False,
-                                           quant_config=quant_config,
-                                           reduce_results=reduce_results)
+            quant_config=quant_config,
+            reduce_results=reduce_results,
+        )
         if hidden_act != "silu":
-            raise ValueError(f"Unsupported activation: {hidden_act}. "
-                             "Only silu is supported for now.")
+            raise ValueError(
+                f"Unsupported activation: {hidden_act}. "
+                "Only silu is supported for now."
+            )
         self.act_fn = SiluAndMul()
 
     def forward(self, x):
@@ -67,7 +74,6 @@ class Qwen2MoeMLP(nn.Module):
 
 
 class Qwen2MoeSparseMoeBlock(nn.Module):
-
     def __init__(
         self,
         config: PretrainedConfig,
@@ -79,20 +85,22 @@ class Qwen2MoeSparseMoeBlock(nn.Module):
         if self.tp_size > config.num_experts:
             raise ValueError(
                 f"Tensor parallel size {self.tp_size} is greater than "
-                f"the number of experts {config.num_experts}.")
-
-        self.experts = FusedMoE(num_experts=config.num_experts,
-                                top_k=config.num_experts_per_tok,
-                                hidden_size=config.hidden_size,
-                                intermediate_size=config.moe_intermediate_size,
-                                reduce_results=False,
-                                renormalize=config.norm_topk_prob,
-                                quant_config=quant_config)
-
-        self.gate = ReplicatedLinear(config.hidden_size,
-                                     config.num_experts,
-                                     bias=False,
-                                     quant_config=None)
+                f"the number of experts {config.num_experts}."
+            )
+
+        self.experts = FusedMoE(
+            num_experts=config.num_experts,
+            top_k=config.num_experts_per_tok,
+            hidden_size=config.hidden_size,
+            intermediate_size=config.moe_intermediate_size,
+            reduce_results=False,
+            renormalize=config.norm_topk_prob,
+            quant_config=quant_config,
+        )
+
+        self.gate = ReplicatedLinear(
+            config.hidden_size, config.num_experts, bias=False, quant_config=None
+        )
         if config.shared_expert_intermediate_size > 0:
             self.shared_expert = Qwen2MoeMLP(
                 hidden_size=config.hidden_size,
@@ -103,9 +111,7 @@ class Qwen2MoeSparseMoeBlock(nn.Module):
             )
         else:
             self.shared_expert = None
-        self.shared_expert_gate = torch.nn.Linear(config.hidden_size,
-                                                  1,
-                                                  bias=False)
+        self.shared_expert_gate = torch.nn.Linear(config.hidden_size, 1, bias=False)
 
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         num_tokens, hidden_dim = hidden_states.shape
@@ -114,24 +120,24 @@ class Qwen2MoeSparseMoeBlock(nn.Module):
         if self.shared_expert is not None:
             shared_output = self.shared_expert(hidden_states)
             if self.shared_expert_gate is not None:
-                shared_output = F.sigmoid(
-                    self.shared_expert_gate(hidden_states)) * shared_output
+                shared_output = (
+                    F.sigmoid(self.shared_expert_gate(hidden_states)) * shared_output
+                )
 
         # router_logits: (num_tokens, n_experts)
         router_logits, _ = self.gate(hidden_states)
-        final_hidden_states = self.experts(hidden_states=hidden_states,
-                                           router_logits=router_logits)
+        final_hidden_states = self.experts(
+            hidden_states=hidden_states, router_logits=router_logits
+        )
         if shared_output is not None:
             final_hidden_states = final_hidden_states + shared_output
         if self.tp_size > 1:
-            final_hidden_states = tensor_model_parallel_all_reduce(
-                final_hidden_states)
+            final_hidden_states = tensor_model_parallel_all_reduce(final_hidden_states)
 
         return final_hidden_states.view(num_tokens, hidden_dim)
 
 
 class Qwen2MoeAttention(nn.Module):
-
     def __init__(
         self,
         hidden_size: int,
@@ -190,17 +196,19 @@ class Qwen2MoeAttention(nn.Module):
             base=rope_theta,
             rope_scaling=rope_scaling,
         )
-        self.attn = RadixAttention(self.num_heads,
-                                   self.head_dim,
-                                   self.scaling,
-                                   num_kv_heads=self.num_kv_heads,
-                                   layer_id=layer_id)
+        self.attn = RadixAttention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            layer_id=layer_id,
+        )
 
     def forward(
         self,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        input_metadata: InputMetadata
+        input_metadata: InputMetadata,
     ) -> torch.Tensor:
         qkv, _ = self.qkv_proj(hidden_states)
         q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
@@ -211,7 +219,6 @@ class Qwen2MoeAttention(nn.Module):
 
 
 class Qwen2MoeDecoderLayer(nn.Module):
-
     def __init__(
         self,
         config: PretrainedConfig,
@@ -223,8 +230,7 @@ class Qwen2MoeDecoderLayer(nn.Module):
         self.hidden_size = config.hidden_size
         rope_theta = getattr(config, "rope_theta", 10000)
         rope_scaling = getattr(config, "rope_scaling", None)
-        max_position_embeddings = getattr(config, "max_position_embeddings",
-                                          8192)
+        max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
         self.self_attn = Qwen2MoeAttention(
             hidden_size=self.hidden_size,
             num_heads=config.num_attention_heads,
@@ -239,13 +245,13 @@ class Qwen2MoeDecoderLayer(nn.Module):
 
         # Note: Qwen/Qwen2-57B-A14B-Instruct does not have
         # `mlp_only_layers` in the config.
-        mlp_only_layers = ([] if not hasattr(config, "mlp_only_layers") else
-                           config.mlp_only_layers)
+        mlp_only_layers = (
+            [] if not hasattr(config, "mlp_only_layers") else config.mlp_only_layers
+        )
         if (layer_id not in mlp_only_layers) and (
-                config.num_experts > 0 and
-                (layer_id + 1) % config.decoder_sparse_step == 0):
-            self.mlp = Qwen2MoeSparseMoeBlock(config=config,
-                                              quant_config=quant_config)
+            config.num_experts > 0 and (layer_id + 1) % config.decoder_sparse_step == 0
+        ):
+            self.mlp = Qwen2MoeSparseMoeBlock(config=config, quant_config=quant_config)
         else:
             self.mlp = Qwen2MoeMLP(
                 hidden_size=config.hidden_size,
@@ -253,10 +259,10 @@ class Qwen2MoeDecoderLayer(nn.Module):
                 hidden_act=config.hidden_act,
                 quant_config=quant_config,
             )
-        self.input_layernorm = RMSNorm(config.hidden_size,
-                                       eps=config.rms_norm_eps)
-        self.post_attention_layernorm = RMSNorm(config.hidden_size,
-                                                eps=config.rms_norm_eps)
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
 
     def forward(
         self,
@@ -270,23 +276,20 @@ class Qwen2MoeDecoderLayer(nn.Module):
             residual = hidden_states
             hidden_states = self.input_layernorm(hidden_states)
         else:
-            hidden_states, residual = self.input_layernorm(
-                hidden_states, residual)
+            hidden_states, residual = self.input_layernorm(hidden_states, residual)
         hidden_states = self.self_attn(
             positions=positions,
             hidden_states=hidden_states,
-            input_metadata=input_metadata
+            input_metadata=input_metadata,
         )
 
         # Fully Connected
-        hidden_states, residual = self.post_attention_layernorm(
-            hidden_states, residual)
+        hidden_states, residual = self.post_attention_layernorm(hidden_states, residual)
         hidden_states = self.mlp(hidden_states)
         return hidden_states, residual
 
 
 class Qwen2MoeModel(nn.Module):
-
     def __init__(
         self,
         config: PretrainedConfig,
@@ -301,13 +304,14 @@ class Qwen2MoeModel(nn.Module):
             config.vocab_size,
             config.hidden_size,
         )
-        self.layers = nn.ModuleList([
-            Qwen2MoeDecoderLayer(config,
-                                 layer_id,
-                                 cache_config,
-                                 quant_config=quant_config)
-            for layer_id in range(config.num_hidden_layers)
-        ])
+        self.layers = nn.ModuleList(
+            [
+                Qwen2MoeDecoderLayer(
+                    config, layer_id, cache_config, quant_config=quant_config
+                )
+                for layer_id in range(config.num_hidden_layers)
+            ]
+        )
         self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
 
     def forward(
@@ -315,7 +319,7 @@ class Qwen2MoeModel(nn.Module):
         input_ids: torch.Tensor,
         positions: torch.Tensor,
         input_metadata: InputMetadata,
-        input_embeds: torch.Tensor = None
+        input_embeds: torch.Tensor = None,
     ) -> torch.Tensor:
         if input_embeds is None:
             hidden_states = self.embed_tokens(input_ids)
@@ -324,10 +328,9 @@ class Qwen2MoeModel(nn.Module):
         residual = None
         for i in range(len(self.layers)):
             layer = self.layers[i]
-            hidden_states, residual = layer(positions,
-                                            hidden_states,
-                                            input_metadata,
-                                            residual)
+            hidden_states, residual = layer(
+                positions, hidden_states, input_metadata, residual
+            )
         hidden_states, _ = self.norm(hidden_states, residual)
         return hidden_states
 
@@ -346,9 +349,9 @@ class Qwen2MoeForCausalLM(nn.Module):
         self.config = config
         self.quant_config = quant_config
         self.model = Qwen2MoeModel(config, cache_config, quant_config)
-        self.lm_head = ParallelLMHead(config.vocab_size,
-                                      config.hidden_size,
-                                      quant_config=quant_config)
+        self.lm_head = ParallelLMHead(
+            config.vocab_size, config.hidden_size, quant_config=quant_config
+        )
         self.logits_processor = LogitsProcessor(config)
         self.sampler = Sampler()
 
@@ -357,17 +360,22 @@ class Qwen2MoeForCausalLM(nn.Module):
         input_ids: torch.Tensor,
         positions: torch.Tensor,
         input_metadata: InputMetadata,
-        input_embeds: torch.Tensor = None
+        input_embeds: torch.Tensor = None,
+    ) -> torch.Tensor:
+        hidden_states = self.model(input_ids, positions, input_metadata, input_embeds)
+        return self.logits_processor(
+            input_ids, hidden_states, self.lm_head.weight, input_metadata
+        )
+
+    def compute_logits(
+        self,
+        input_ids: torch.Tensor,
+        hidden_states: torch.Tensor,
+        input_metadata: InputMetadata,
     ) -> torch.Tensor:
-        hidden_states = self.model(input_ids, positions, input_metadata,
-                                   input_embeds)
-        return self.logits_processor(input_ids, hidden_states, self.lm_head.weight,
-                                     input_metadata)
-
-    def compute_logits(self, input_ids: torch.Tensor, hidden_states: torch.Tensor,
-                       input_metadata: InputMetadata) -> torch.Tensor:
-        logits = self.logits_processor(input_ids, hidden_states, self.lm_head.weight,
-                                       input_metadata)
+        logits = self.logits_processor(
+            input_ids, hidden_states, self.lm_head.weight, input_metadata
+        )
         return logits
 
     def sample(
@@ -391,11 +399,18 @@ class Qwen2MoeForCausalLM(nn.Module):
         expert_params_mapping = [
             # These are the weights for the experts
            # (param_name, weight_name, expert_id, shard_id)
-            ("experts.w13_weight" if weight_name in ["gate_proj", "up_proj"]
-             else "experts.w2_weight",
-             f"experts.{expert_id}.{weight_name}.weight", expert_id, shard_id)
-            for expert_id in range(self.config.num_experts)
-            for shard_id, weight_name in enumerate(["gate_proj", "down_proj", "up_proj"])
+            (
+                "experts.w13_weight"
+                if weight_name in ["gate_proj", "up_proj"]
+                else "experts.w2_weight",
+                f"experts.{expert_id}.{weight_name}.weight",
+                expert_id,
+                shard_id,
+            )
+            for expert_id in range(self.config.num_experts)
+            for shard_id, weight_name in enumerate(
+                ["gate_proj", "down_proj", "up_proj"]
+            )
         ]
 
         params_dict = dict(self.named_parameters())
@@ -433,11 +448,13 @@ class Qwen2MoeForCausalLM(nn.Module):
                 name = name.replace(weight_name, param_name)
                 param = params_dict[name]
                 weight_loader = param.weight_loader
-                weight_loader(param,
-                              loaded_weight,
-                              weight_name,
-                              shard_id=shard_id,
-                              expert_id=expert_id)
+                weight_loader(
+                    param,
+                    loaded_weight,
+                    weight_name,
+                    shard_id=shard_id,
+                    expert_id=expert_id,
+                )
                 break
             else:
                 # Skip loading extra bias for GPTQ models.
@@ -447,8 +464,10 @@ class Qwen2MoeForCausalLM(nn.Module):
                     continue
 
                 param = params_dict[name]
-                weight_loader = getattr(param, "weight_loader",
-                                        default_weight_loader)
+                weight_loader = getattr(
+                    param, "weight_loader", default_weight_loader
+                )
                 weight_loader(param, loaded_weight)
 
+
 EntryClass = Qwen2MoeForCausalLM
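The reformatted `expert_params_mapping` comprehension near the end of this file is easier to follow once its output is written out. Below is a minimal standalone sketch of the same comprehension, with a hypothetical `num_experts = 2` standing in for `self.config.num_experts`:

```python
# Sketch of the expert_params_mapping comprehension from the diff above,
# with num_experts = 2 as a stand-in for self.config.num_experts.
num_experts = 2

expert_params_mapping = [
    # (param_name, weight_name, expert_id, shard_id)
    (
        "experts.w13_weight"
        if weight_name in ["gate_proj", "up_proj"]
        else "experts.w2_weight",
        f"experts.{expert_id}.{weight_name}.weight",
        expert_id,
        shard_id,
    )
    for expert_id in range(num_experts)
    for shard_id, weight_name in enumerate(["gate_proj", "down_proj", "up_proj"])
]

for entry in expert_params_mapping:
    print(entry)
# ('experts.w13_weight', 'experts.0.gate_proj.weight', 0, 0)
# ('experts.w2_weight',  'experts.0.down_proj.weight', 0, 1)
# ('experts.w13_weight', 'experts.0.up_proj.weight',   0, 2)
# ... and the same three entries again for expert 1
```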
sglang/srt/server.py
CHANGED
@@ -33,9 +33,9 @@ from sglang.srt.managers.controller.manager_multi import (
     start_controller_process as start_controller_process_multi,
 )
 from sglang.srt.managers.controller.manager_single import (
+    launch_tp_servers,
     start_controller_process as start_controller_process_single,
 )
-from sglang.srt.managers.controller.tp_worker import ModelTpService
 from sglang.srt.managers.detokenizer_manager import start_detokenizer_process
 from sglang.srt.managers.io_struct import GenerateReqInput
 from sglang.srt.managers.tokenizer_manager import TokenizerManager
@@ -53,7 +53,6 @@ from sglang.srt.utils import (
     enable_show_time_cost,
     receive_addrs,
     send_addrs_to_rank_0,
-    start_rpyc_service_process,
 )
 from sglang.utils import get_exception_traceback
 
@@ -146,6 +145,7 @@ def launch_server(server_args: ServerArgs, pipe_finish_writer, model_overide_arg
 
     # Set global environments
     os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
+    os.environ["NCCL_CUMEM_ENABLE"] = "0"
     if server_args.show_time_cost:
         enable_show_time_cost()
     if server_args.disable_disk_cache:
@@ -191,21 +191,17 @@ def launch_server(server_args: ServerArgs, pipe_finish_writer, model_overide_arg
         model_port_args=model_port_args,
     )
 
-    #
-    assert not (server_args.dp_size > 1 and server_args.node_rank is not None)
+    # Handle multi-node tp
     if server_args.nnodes > 1:
+        assert server_args.dp_size == 1, "Multi-node dp is not supported."
+
         if server_args.node_rank != 0:
-            send_addrs_to_rank_0(model_port_args[0], server_args)
-        else:
-            receive_addrs(model_port_args[0], server_args)
-        for i in range(server_args.tp_size // server_args.nnodes):
-            start_rpyc_service_process(
-                ModelTpService, model_port_args[0].model_tp_ports[i]
-            )
-        if server_args.node_rank != 0:
-            logger.info(
-                f"[node_rank={server_args.node_rank}]: Listen for connections..."
-            )
+            tp_size_local = server_args.tp_size // server_args.nnodes
+            gpu_ids = [i for _ in range(server_args.nnodes) for i in range(tp_size_local)]
+            tp_rank_range = list(range(server_args.node_rank * tp_size_local,
+                                       (server_args.node_rank + 1) * tp_size_local))
+            procs = launch_tp_servers(gpu_ids, tp_rank_range, server_args,
+                                      port_args.model_port_args[0], model_overide_args)
             while True:
                 pass
 
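The arithmetic in the new multi-node branch of `launch_server` is compact, so here is a small sketch that simply evaluates it for a hypothetical two-node, `--tp 4` deployment; the values of `nnodes`, `tp_size`, and `node_rank` below are illustrative, not taken from the package:

```python
# Rank/GPU assignment from the new multi-node branch above, evaluated for a
# hypothetical 2-node, tp_size=4 deployment; node_rank=1 is the second node.
nnodes, tp_size, node_rank = 2, 4, 1

tp_size_local = tp_size // nnodes  # 2 TP ranks hosted per node
gpu_ids = [i for _ in range(nnodes) for i in range(tp_size_local)]
tp_rank_range = list(range(node_rank * tp_size_local,
                           (node_rank + 1) * tp_size_local))

print(gpu_ids)        # [0, 1, 0, 1] -> global TP rank r uses local GPU gpu_ids[r]
print(tp_rank_range)  # [2, 3]       -> TP ranks served by this node
```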
sglang/srt/server_args.py
CHANGED
@@ -29,7 +29,7 @@ class ServerArgs:
     max_prefill_tokens: Optional[int] = None
     max_running_requests: Optional[int] = None
     schedule_heuristic: str = "lpm"
-    schedule_conservativeness: float =
+    schedule_conservativeness: float = 0.8
 
     # Other runtime options
     tp_size: int = 1
@@ -53,6 +53,7 @@ class ServerArgs:
     disable_flashinfer: bool = False
     disable_radix_cache: bool = False
     disable_regex_jump_forward: bool = False
+    disable_cuda_graph: bool = False
     disable_disk_cache: bool = False
     attention_reduce_in_fp32: bool = False
     enable_p2p_check: bool = False
@@ -66,14 +67,16 @@ class ServerArgs:
         if self.tokenizer_path is None:
             self.tokenizer_path = self.model_path
         if self.mem_fraction_static is None:
-            if self.tp_size >=
-                self.mem_fraction_static = 0.
+            if self.tp_size >= 16:
+                self.mem_fraction_static = 0.74
+            elif self.tp_size >= 8:
+                self.mem_fraction_static = 0.78
             elif self.tp_size >= 4:
                 self.mem_fraction_static = 0.82
             elif self.tp_size >= 2:
                 self.mem_fraction_static = 0.85
             else:
-                self.mem_fraction_static = 0.
+                self.mem_fraction_static = 0.88
         if isinstance(self.additional_ports, int):
             self.additional_ports = [self.additional_ports]
         elif self.additional_ports is None:
@@ -294,6 +297,11 @@ class ServerArgs:
             action="store_true",
             help="Disable regex jump-forward",
         )
+        parser.add_argument(
+            "--disable-cuda-graph",
+            action="store_true",
+            help="Disable cuda graph.",
+        )
         parser.add_argument(
             "--disable-disk-cache",
             action="store_true",
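To make the new `mem_fraction_static` defaults easy to check, here is the ladder from `__post_init__` pulled out as a standalone helper; the function name `default_mem_fraction_static` is only for illustration and does not exist in the package:

```python
# Sketch of the new mem_fraction_static defaults from the diff above.
def default_mem_fraction_static(tp_size: int) -> float:
    if tp_size >= 16:
        return 0.74
    elif tp_size >= 8:
        return 0.78
    elif tp_size >= 4:
        return 0.82
    elif tp_size >= 2:
        return 0.85
    return 0.88

print(default_mem_fraction_static(1))  # 0.88
print(default_mem_fraction_static(8))  # 0.78
```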
sglang/srt/utils.py
CHANGED
@@ -474,9 +474,9 @@ def monkey_patch_vllm_dummy_weight_loader():
         DummyModelLoader,
         LoRAConfig,
         ModelConfig,
+        MultiModalConfig,
         ParallelConfig,
         SchedulerConfig,
-        MultiModalConfig,
         _initialize_model,
         initialize_dummy_weights,
         nn,
{sglang-0.1.19.dist-info → sglang-0.1.21.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.1.19
+Version: 0.1.21
 Summary: A structured generation langauge for LLMs.
 License: Apache License
                                  Version 2.0, January 2004
@@ -623,6 +623,14 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
 python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --mem-fraction-static 0.7
 ```
 - See [hyperparameter_tuning.md](docs/hyperparameter_tuning.md) on tuning hyperparameters for better performance.
+- Add `--nnodes 2` to run tensor parallelism on multiple nodes. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-1` be the hostname of the first node and `50000` be an available port.
+```
+# Node 0
+python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --tp 4 --nccl-init sgl-dev-1:50000 --nnodes 2 --node-rank 0
+
+# Node 1
+python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --tp 4 --nccl-init sgl-dev-1:50000 --nnodes 2 --node-rank 1
+```
 
 ### Supported Models
 - Llama