sglang 0.1.19__py3-none-any.whl → 0.1.21__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -8,24 +8,28 @@ import torch
 import torch.nn.functional as F
 from torch import nn
 from transformers import PretrainedConfig
-
 from vllm.config import CacheConfig
-from vllm.distributed import (get_tensor_model_parallel_world_size,
-                              tensor_model_parallel_all_reduce)
+from vllm.distributed import (
+    get_tensor_model_parallel_world_size,
+    tensor_model_parallel_all_reduce,
+)
 from vllm.model_executor.layers.activation import SiluAndMul
 from vllm.model_executor.layers.fused_moe import FusedMoE
 from vllm.model_executor.layers.layernorm import RMSNorm
-from vllm.model_executor.layers.linear import (MergedColumnParallelLinear,
-                                               QKVParallelLinear,
-                                               ReplicatedLinear,
-                                               RowParallelLinear)
+from vllm.model_executor.layers.linear import (
+    MergedColumnParallelLinear,
+    QKVParallelLinear,
+    ReplicatedLinear,
+    RowParallelLinear,
+)
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
-from vllm.model_executor.layers.quantization.base_config import (
-    QuantizationConfig)
+from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.sampler import Sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
-    ParallelLMHead, VocabParallelEmbedding)
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors, SamplerOutput
@@ -34,8 +38,8 @@ from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.radix_attention import RadixAttention
 from sglang.srt.managers.controller.model_runner import InputMetadata
 
-class Qwen2MoeMLP(nn.Module):
 
+class Qwen2MoeMLP(nn.Module):
     def __init__(
         self,
         hidden_size: int,
@@ -46,17 +50,20 @@ class Qwen2MoeMLP(nn.Module):
     ) -> None:
         super().__init__()
         self.gate_up_proj = MergedColumnParallelLinear(
-            hidden_size, [intermediate_size] * 2,
+            hidden_size, [intermediate_size] * 2, bias=False, quant_config=quant_config
+        )
+        self.down_proj = RowParallelLinear(
+            intermediate_size,
+            hidden_size,
             bias=False,
-            quant_config=quant_config)
-        self.down_proj = RowParallelLinear(intermediate_size,
-                                           hidden_size,
-                                           bias=False,
-                                           quant_config=quant_config,
-                                           reduce_results=reduce_results)
+            quant_config=quant_config,
+            reduce_results=reduce_results,
+        )
         if hidden_act != "silu":
-            raise ValueError(f"Unsupported activation: {hidden_act}. "
-                             "Only silu is supported for now.")
+            raise ValueError(
+                f"Unsupported activation: {hidden_act}. "
+                "Only silu is supported for now."
+            )
         self.act_fn = SiluAndMul()
 
     def forward(self, x):
@@ -67,7 +74,6 @@ class Qwen2MoeMLP(nn.Module):
 
 
 class Qwen2MoeSparseMoeBlock(nn.Module):
-
     def __init__(
         self,
         config: PretrainedConfig,
@@ -79,20 +85,22 @@ class Qwen2MoeSparseMoeBlock(nn.Module):
         if self.tp_size > config.num_experts:
             raise ValueError(
                 f"Tensor parallel size {self.tp_size} is greater than "
-                f"the number of experts {config.num_experts}.")
-
-        self.experts = FusedMoE(num_experts=config.num_experts,
-                                top_k=config.num_experts_per_tok,
-                                hidden_size=config.hidden_size,
-                                intermediate_size=config.moe_intermediate_size,
-                                reduce_results=False,
-                                renormalize=config.norm_topk_prob,
-                                quant_config=quant_config)
-
-        self.gate = ReplicatedLinear(config.hidden_size,
-                                     config.num_experts,
-                                     bias=False,
-                                     quant_config=None)
+                f"the number of experts {config.num_experts}."
+            )
+
+        self.experts = FusedMoE(
+            num_experts=config.num_experts,
+            top_k=config.num_experts_per_tok,
+            hidden_size=config.hidden_size,
+            intermediate_size=config.moe_intermediate_size,
+            reduce_results=False,
+            renormalize=config.norm_topk_prob,
+            quant_config=quant_config,
+        )
+
+        self.gate = ReplicatedLinear(
+            config.hidden_size, config.num_experts, bias=False, quant_config=None
+        )
         if config.shared_expert_intermediate_size > 0:
             self.shared_expert = Qwen2MoeMLP(
                 hidden_size=config.hidden_size,
@@ -103,9 +111,7 @@ class Qwen2MoeSparseMoeBlock(nn.Module):
             )
         else:
             self.shared_expert = None
-        self.shared_expert_gate = torch.nn.Linear(config.hidden_size,
-                                                  1,
-                                                  bias=False)
+        self.shared_expert_gate = torch.nn.Linear(config.hidden_size, 1, bias=False)
 
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         num_tokens, hidden_dim = hidden_states.shape
@@ -114,24 +120,24 @@ class Qwen2MoeSparseMoeBlock(nn.Module):
         if self.shared_expert is not None:
             shared_output = self.shared_expert(hidden_states)
             if self.shared_expert_gate is not None:
-                shared_output = F.sigmoid(
-                    self.shared_expert_gate(hidden_states)) * shared_output
+                shared_output = (
+                    F.sigmoid(self.shared_expert_gate(hidden_states)) * shared_output
+                )
 
         # router_logits: (num_tokens, n_experts)
         router_logits, _ = self.gate(hidden_states)
-        final_hidden_states = self.experts(hidden_states=hidden_states,
-                                           router_logits=router_logits)
+        final_hidden_states = self.experts(
+            hidden_states=hidden_states, router_logits=router_logits
+        )
         if shared_output is not None:
             final_hidden_states = final_hidden_states + shared_output
         if self.tp_size > 1:
-            final_hidden_states = tensor_model_parallel_all_reduce(
-                final_hidden_states)
+            final_hidden_states = tensor_model_parallel_all_reduce(final_hidden_states)
 
         return final_hidden_states.view(num_tokens, hidden_dim)
 
 
 class Qwen2MoeAttention(nn.Module):
-
     def __init__(
         self,
         hidden_size: int,
@@ -190,17 +196,19 @@ class Qwen2MoeAttention(nn.Module):
             base=rope_theta,
             rope_scaling=rope_scaling,
         )
-        self.attn = RadixAttention(self.num_heads,
-                                   self.head_dim,
-                                   self.scaling,
-                                   num_kv_heads=self.num_kv_heads,
-                                   layer_id=layer_id)
+        self.attn = RadixAttention(
+            self.num_heads,
+            self.head_dim,
+            self.scaling,
+            num_kv_heads=self.num_kv_heads,
+            layer_id=layer_id,
+        )
 
     def forward(
         self,
         positions: torch.Tensor,
         hidden_states: torch.Tensor,
-        input_metadata: InputMetadata
+        input_metadata: InputMetadata,
     ) -> torch.Tensor:
         qkv, _ = self.qkv_proj(hidden_states)
         q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
@@ -211,7 +219,6 @@ class Qwen2MoeAttention(nn.Module):
 
 
 class Qwen2MoeDecoderLayer(nn.Module):
-
     def __init__(
         self,
         config: PretrainedConfig,
@@ -223,8 +230,7 @@ class Qwen2MoeDecoderLayer(nn.Module):
         self.hidden_size = config.hidden_size
         rope_theta = getattr(config, "rope_theta", 10000)
         rope_scaling = getattr(config, "rope_scaling", None)
-        max_position_embeddings = getattr(config, "max_position_embeddings",
-                                          8192)
+        max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
         self.self_attn = Qwen2MoeAttention(
             hidden_size=self.hidden_size,
             num_heads=config.num_attention_heads,
@@ -239,13 +245,13 @@ class Qwen2MoeDecoderLayer(nn.Module):
 
         # Note: Qwen/Qwen2-57B-A14B-Instruct does not have
         # `mlp_only_layers` in the config.
-        mlp_only_layers = ([] if not hasattr(config, "mlp_only_layers") else
-                           config.mlp_only_layers)
+        mlp_only_layers = (
+            [] if not hasattr(config, "mlp_only_layers") else config.mlp_only_layers
+        )
         if (layer_id not in mlp_only_layers) and (
-                config.num_experts > 0 and
-                (layer_id + 1) % config.decoder_sparse_step == 0):
-            self.mlp = Qwen2MoeSparseMoeBlock(config=config,
-                                              quant_config=quant_config)
+            config.num_experts > 0 and (layer_id + 1) % config.decoder_sparse_step == 0
+        ):
+            self.mlp = Qwen2MoeSparseMoeBlock(config=config, quant_config=quant_config)
         else:
             self.mlp = Qwen2MoeMLP(
                 hidden_size=config.hidden_size,
@@ -253,10 +259,10 @@ class Qwen2MoeDecoderLayer(nn.Module):
                 hidden_act=config.hidden_act,
                 quant_config=quant_config,
             )
-        self.input_layernorm = RMSNorm(config.hidden_size,
-                                       eps=config.rms_norm_eps)
-        self.post_attention_layernorm = RMSNorm(config.hidden_size,
-                                                eps=config.rms_norm_eps)
+        self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = RMSNorm(
+            config.hidden_size, eps=config.rms_norm_eps
+        )
 
     def forward(
         self,
@@ -270,23 +276,20 @@ class Qwen2MoeDecoderLayer(nn.Module):
             residual = hidden_states
             hidden_states = self.input_layernorm(hidden_states)
         else:
-            hidden_states, residual = self.input_layernorm(
-                hidden_states, residual)
+            hidden_states, residual = self.input_layernorm(hidden_states, residual)
         hidden_states = self.self_attn(
             positions=positions,
             hidden_states=hidden_states,
-            input_metadata=input_metadata
+            input_metadata=input_metadata,
         )
 
         # Fully Connected
-        hidden_states, residual = self.post_attention_layernorm(
-            hidden_states, residual)
+        hidden_states, residual = self.post_attention_layernorm(hidden_states, residual)
         hidden_states = self.mlp(hidden_states)
         return hidden_states, residual
 
 
 class Qwen2MoeModel(nn.Module):
-
     def __init__(
         self,
         config: PretrainedConfig,
@@ -301,13 +304,14 @@ class Qwen2MoeModel(nn.Module):
             config.vocab_size,
             config.hidden_size,
         )
-        self.layers = nn.ModuleList([
-            Qwen2MoeDecoderLayer(config,
-                                 layer_id,
-                                 cache_config,
-                                 quant_config=quant_config)
-            for layer_id in range(config.num_hidden_layers)
-        ])
+        self.layers = nn.ModuleList(
+            [
+                Qwen2MoeDecoderLayer(
+                    config, layer_id, cache_config, quant_config=quant_config
+                )
+                for layer_id in range(config.num_hidden_layers)
+            ]
+        )
         self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
 
     def forward(
@@ -315,7 +319,7 @@ class Qwen2MoeModel(nn.Module):
         input_ids: torch.Tensor,
         positions: torch.Tensor,
         input_metadata: InputMetadata,
-        input_embeds: torch.Tensor = None
+        input_embeds: torch.Tensor = None,
     ) -> torch.Tensor:
         if input_embeds is None:
             hidden_states = self.embed_tokens(input_ids)
@@ -324,10 +328,9 @@ class Qwen2MoeModel(nn.Module):
         residual = None
         for i in range(len(self.layers)):
             layer = self.layers[i]
-            hidden_states, residual = layer(positions,
-                                            hidden_states,
-                                            input_metadata,
-                                            residual)
+            hidden_states, residual = layer(
+                positions, hidden_states, input_metadata, residual
+            )
         hidden_states, _ = self.norm(hidden_states, residual)
         return hidden_states
 
@@ -346,9 +349,9 @@ class Qwen2MoeForCausalLM(nn.Module):
         self.config = config
         self.quant_config = quant_config
        self.model = Qwen2MoeModel(config, cache_config, quant_config)
-        self.lm_head = ParallelLMHead(config.vocab_size,
-                                      config.hidden_size,
-                                      quant_config=quant_config)
+        self.lm_head = ParallelLMHead(
+            config.vocab_size, config.hidden_size, quant_config=quant_config
+        )
         self.logits_processor = LogitsProcessor(config)
         self.sampler = Sampler()
 
@@ -357,17 +360,22 @@ class Qwen2MoeForCausalLM(nn.Module):
         input_ids: torch.Tensor,
         positions: torch.Tensor,
         input_metadata: InputMetadata,
-        input_embeds: torch.Tensor = None
+        input_embeds: torch.Tensor = None,
+    ) -> torch.Tensor:
+        hidden_states = self.model(input_ids, positions, input_metadata, input_embeds)
+        return self.logits_processor(
+            input_ids, hidden_states, self.lm_head.weight, input_metadata
+        )
+
+    def compute_logits(
+        self,
+        input_ids: torch.Tensor,
+        hidden_states: torch.Tensor,
+        input_metadata: InputMetadata,
     ) -> torch.Tensor:
-        hidden_states = self.model(input_ids, positions, input_metadata,
-                                   input_embeds)
-        return self.logits_processor(input_ids, hidden_states, self.lm_head.weight,
-                                     input_metadata)
-
-    def compute_logits(self, input_ids: torch.Tensor, hidden_states: torch.Tensor,
-                       input_metadata: InputMetadata) -> torch.Tensor:
-        logits = self.logits_processor(input_ids, hidden_states, self.lm_head.weight,
-                                       input_metadata)
+        logits = self.logits_processor(
+            input_ids, hidden_states, self.lm_head.weight, input_metadata
+        )
         return logits
 
     def sample(
@@ -391,11 +399,18 @@ class Qwen2MoeForCausalLM(nn.Module):
        expert_params_mapping = [
             # These are the weights for the experts
             # (param_name, weight_name, expert_id, shard_id)
-            ("experts.w13_weight" if weight_name in ["gate_proj", "up_proj"]
-             else "experts.w2_weight",
-             f"experts.{expert_id}.{weight_name}.weight", expert_id, shard_id)
-            for expert_id in range(self.config.num_experts) for shard_id,
-            weight_name in enumerate(["gate_proj", "down_proj", "up_proj"])
+            (
+                "experts.w13_weight"
+                if weight_name in ["gate_proj", "up_proj"]
+                else "experts.w2_weight",
+                f"experts.{expert_id}.{weight_name}.weight",
+                expert_id,
+                shard_id,
+            )
+            for expert_id in range(self.config.num_experts)
+            for shard_id, weight_name in enumerate(
+                ["gate_proj", "down_proj", "up_proj"]
+            )
         ]
 
         params_dict = dict(self.named_parameters())
@@ -433,11 +448,13 @@ class Qwen2MoeForCausalLM(nn.Module):
                     name = name.replace(weight_name, param_name)
                     param = params_dict[name]
                     weight_loader = param.weight_loader
-                    weight_loader(param,
-                                  loaded_weight,
-                                  weight_name,
-                                  shard_id=shard_id,
-                                  expert_id=expert_id)
+                    weight_loader(
+                        param,
+                        loaded_weight,
+                        weight_name,
+                        shard_id=shard_id,
+                        expert_id=expert_id,
+                    )
                     break
                 else:
                     # Skip loading extra bias for GPTQ models.
@@ -447,8 +464,10 @@ class Qwen2MoeForCausalLM(nn.Module):
                         continue
 
                     param = params_dict[name]
-                    weight_loader = getattr(param, "weight_loader",
-                                            default_weight_loader)
+                    weight_loader = getattr(
+                        param, "weight_loader", default_weight_loader
+                    )
                     weight_loader(param, loaded_weight)
 
+
 EntryClass = Qwen2MoeForCausalLM
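
As a reading aid (not part of the diff), the reformatted `expert_params_mapping` comprehension in `load_weights` above produces one tuple per expert and per projection. A minimal, self-contained sketch with a hypothetical two-expert config shows what it expands to:

```python
# Hypothetical expansion of expert_params_mapping for num_experts=2.
# gate_proj/up_proj shards map to the fused "experts.w13_weight" parameter,
# down_proj maps to "experts.w2_weight"; shard_id is the position in the list below.
num_experts = 2
expert_params_mapping = [
    (
        "experts.w13_weight"
        if weight_name in ["gate_proj", "up_proj"]
        else "experts.w2_weight",
        f"experts.{expert_id}.{weight_name}.weight",
        expert_id,
        shard_id,
    )
    for expert_id in range(num_experts)
    for shard_id, weight_name in enumerate(["gate_proj", "down_proj", "up_proj"])
]
# First entries:
#   ("experts.w13_weight", "experts.0.gate_proj.weight", 0, 0)
#   ("experts.w2_weight",  "experts.0.down_proj.weight", 0, 1)
#   ("experts.w13_weight", "experts.0.up_proj.weight",   0, 2)
print(expert_params_mapping)
```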
sglang/srt/server.py CHANGED
@@ -33,9 +33,9 @@ from sglang.srt.managers.controller.manager_multi import (
     start_controller_process as start_controller_process_multi,
 )
 from sglang.srt.managers.controller.manager_single import (
+    launch_tp_servers,
     start_controller_process as start_controller_process_single,
 )
-from sglang.srt.managers.controller.tp_worker import ModelTpService
 from sglang.srt.managers.detokenizer_manager import start_detokenizer_process
 from sglang.srt.managers.io_struct import GenerateReqInput
 from sglang.srt.managers.tokenizer_manager import TokenizerManager
@@ -53,7 +53,6 @@ from sglang.srt.utils import (
     enable_show_time_cost,
     receive_addrs,
     send_addrs_to_rank_0,
-    start_rpyc_service_process,
 )
 from sglang.utils import get_exception_traceback
 
@@ -146,6 +145,7 @@ def launch_server(server_args: ServerArgs, pipe_finish_writer, model_overide_arg
 
     # Set global environments
     os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
+    os.environ["NCCL_CUMEM_ENABLE"] = "0"
     if server_args.show_time_cost:
         enable_show_time_cost()
     if server_args.disable_disk_cache:
@@ -191,21 +191,17 @@ def launch_server(server_args: ServerArgs, pipe_finish_writer, model_overide_arg
         model_port_args=model_port_args,
     )
 
-    # TODO multi-node dp is not supported
-    assert not (server_args.dp_size > 1 and server_args.node_rank is not None)
+    # Handle multi-node tp
     if server_args.nnodes > 1:
+        assert server_args.dp_size == 1, "Multi-node dp is not supported."
+
         if server_args.node_rank != 0:
-            send_addrs_to_rank_0(model_port_args[0], server_args)
-        else:
-            receive_addrs(model_port_args[0], server_args)
-        for i in range(tp_size_local):
-            start_rpyc_service_process(
-                ModelTpService, model_port_args[0].model_tp_ports[i]
-            )
-        if server_args.node_rank != 0:
-            logger.info(
-                f"[node_rank={server_args.node_rank}]: Listen for connections..."
-            )
+            tp_size_local = server_args.tp_size // server_args.nnodes
+            gpu_ids = [i for _ in range(server_args.nnodes) for i in range(tp_size_local)]
+            tp_rank_range = list(range(server_args.node_rank * tp_size_local,
+                                       (server_args.node_rank + 1) * tp_size_local))
+            procs = launch_tp_servers(gpu_ids, tp_rank_range, server_args,
+                                      port_args.model_port_args[0], model_overide_args)
             while True:
                 pass
 
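
For orientation (a sketch, not part of the diff), the new multi-node branch above makes each non-zero node rank compute which global tensor-parallel ranks it should host and hands them to `launch_tp_servers`. With the README's example of `--tp 4 --nnodes 2`, the partitioning works out as follows:

```python
# Standalone sketch of the rank/GPU partitioning shown in the hunk above.
# Assumes tp_size=4 and nnodes=2 (the values used in the README example below).
tp_size, nnodes = 4, 2

tp_size_local = tp_size // nnodes  # 2 tensor-parallel ranks per node
# Each node addresses its local GPUs as 0..tp_size_local-1.
gpu_ids = [i for _ in range(nnodes) for i in range(tp_size_local)]  # [0, 1, 0, 1]

for node_rank in range(nnodes):
    # Global tensor-parallel ranks hosted by this node.
    tp_rank_range = list(
        range(node_rank * tp_size_local, (node_rank + 1) * tp_size_local)
    )
    print(node_rank, tp_rank_range)  # 0 -> [0, 1], 1 -> [2, 3]
```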
sglang/srt/server_args.py CHANGED
@@ -29,7 +29,7 @@ class ServerArgs:
     max_prefill_tokens: Optional[int] = None
     max_running_requests: Optional[int] = None
     schedule_heuristic: str = "lpm"
-    schedule_conservativeness: float = 1.0
+    schedule_conservativeness: float = 0.8
 
     # Other runtime options
     tp_size: int = 1
@@ -53,6 +53,7 @@ class ServerArgs:
     disable_flashinfer: bool = False
     disable_radix_cache: bool = False
     disable_regex_jump_forward: bool = False
+    disable_cuda_graph: bool = False
     disable_disk_cache: bool = False
     attention_reduce_in_fp32: bool = False
     enable_p2p_check: bool = False
@@ -66,14 +67,16 @@ class ServerArgs:
         if self.tokenizer_path is None:
             self.tokenizer_path = self.model_path
         if self.mem_fraction_static is None:
-            if self.tp_size >= 8:
-                self.mem_fraction_static = 0.80
+            if self.tp_size >= 16:
+                self.mem_fraction_static = 0.74
+            elif self.tp_size >= 8:
+                self.mem_fraction_static = 0.78
             elif self.tp_size >= 4:
                 self.mem_fraction_static = 0.82
             elif self.tp_size >= 2:
                 self.mem_fraction_static = 0.85
             else:
-                self.mem_fraction_static = 0.90
+                self.mem_fraction_static = 0.88
         if isinstance(self.additional_ports, int):
             self.additional_ports = [self.additional_ports]
         elif self.additional_ports is None:
@@ -294,6 +297,11 @@ class ServerArgs:
             action="store_true",
             help="Disable regex jump-forward",
         )
+        parser.add_argument(
+            "--disable-cuda-graph",
+            action="store_true",
+            help="Disable cuda graph.",
+        )
         parser.add_argument(
             "--disable-disk-cache",
             action="store_true",
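
For quick reference (a sketch, not part of the diff), the retuned `mem_fraction_static` defaults from the `__post_init__` hunk above resolve by tensor-parallel size as in this hypothetical helper:

```python
# Hypothetical helper mirroring the new default selection in ServerArgs.__post_init__.
def default_mem_fraction_static(tp_size: int) -> float:
    if tp_size >= 16:
        return 0.74
    elif tp_size >= 8:
        return 0.78
    elif tp_size >= 4:
        return 0.82
    elif tp_size >= 2:
        return 0.85
    return 0.88


assert default_mem_fraction_static(8) == 0.78
assert default_mem_fraction_static(1) == 0.88
```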
sglang/srt/utils.py CHANGED
@@ -474,9 +474,9 @@ def monkey_patch_vllm_dummy_weight_loader():
         DummyModelLoader,
         LoRAConfig,
         ModelConfig,
+        MultiModalConfig,
         ParallelConfig,
         SchedulerConfig,
-        MultiModalConfig,
         _initialize_model,
         initialize_dummy_weights,
         nn,
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.1.19
+Version: 0.1.21
 Summary: A structured generation langauge for LLMs.
 License: Apache License
                         Version 2.0, January 2004
@@ -623,6 +623,14 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
 python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --mem-fraction-static 0.7
 ```
 - See [hyperparameter_tuning.md](docs/hyperparameter_tuning.md) on tuning hyperparameters for better performance.
+- Add `--nnodes 2` to run tensor parallelism on multiple nodes. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-1` be the hostname of the first node and `50000` be an available port.
+```
+# Node 0
+python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --tp 4 --nccl-init sgl-dev-1:50000 --nnodes 2 --node-rank 0
+
+# Node 1
+python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --tp 4 --nccl-init sgl-dev-1:50000 --nnodes 2 --node-rank 1
+```
 
 ### Supported Models
 - Llama