sglang 0.3.1.post3__py3-none-any.whl → 0.3.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sglang/bench_latency.py CHANGED
@@ -260,7 +260,7 @@ def correctness_test(
 
     # Decode
    output_ids = [input_ids[i] + [next_token_ids[i]] for i in range(len(input_ids))]
-    for _ in range(bench_args.output_len[0]):
+    for _ in range(bench_args.output_len[0] - 1):
        next_token_ids, _ = decode(next_token_ids, batch, model_runner)
        for i in range(len(reqs)):
            output_ids[i].append(next_token_ids[i])
@@ -311,7 +311,7 @@ def latency_test_run_once(
 
    # Decode
    decode_latencies = []
-    for i in range(output_len):
+    for i in range(output_len - 1):
        torch.cuda.synchronize()
        tic = time.time()
        next_token_ids, _ = decode(next_token_ids, batch, model_runner)
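Both loops above now run `output_len - 1` times, which is consistent with the extend (prefill) step just before them: that step already samples the first output token and appends it to `output_ids` before decoding begins. A small arithmetic sketch, with made-up numbers:

```python
# Illustrative only: one token comes from the prefill/extend pass, the decode
# loop produces the remainder, so output_len - 1 iterations suffice.
output_len = 8
tokens_from_prefill = 1
decode_steps = output_len - 1
assert tokens_from_prefill + decode_steps == output_len
```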
@@ -491,18 +491,10 @@ def main(server_args, bench_args):
 
 
 if __name__ == "__main__":
-    multiprocessing.set_start_method("spawn", force=True)
-
     parser = argparse.ArgumentParser()
     ServerArgs.add_cli_args(parser)
     BenchArgs.add_cli_args(parser)
-    # For this script, model-path is not required
-    assert (
-        parser._actions[1].option_strings[0] == "--model-path"
-    ), "options changed, this code need to be updated"
-    parser._actions[1].required = False
     args = parser.parse_args()
-
     server_args = ServerArgs.from_cli_args(args)
     bench_args = BenchArgs.from_cli_args(args)
 
@@ -511,6 +503,8 @@ if __name__ == "__main__":
         format="%(message)s",
     )
 
+    multiprocessing.set_start_method("spawn", force=True)
+
     try:
         main(server_args, bench_args)
     except Exception as e:
@@ -174,13 +174,7 @@ if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     ServerArgs.add_cli_args(parser)
     BenchArgs.add_cli_args(parser)
-    # For this script, model-path is not required
-    assert (
-        parser._actions[1].option_strings[0] == "--model-path"
-    ), "options changed, this code need to be updated"
-    parser._actions[1].required = False
     args = parser.parse_args()
-
     server_args = ServerArgs.from_cli_args(args)
     bench_args = BenchArgs.from_cli_args(args)
 
@@ -129,6 +129,7 @@ def get_tokenizer(
             *args,
             trust_remote_code=trust_remote_code,
             tokenizer_revision=tokenizer_revision,
+            clean_up_tokenization_spaces=False,
             **kwargs,
         )
     except TypeError as e:
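For context, a hedged sketch of what passing `clean_up_tokenization_spaces=False` changes on the Hugging Face side: decoded text keeps the tokenizer's literal spacing instead of having spaces before punctuation collapsed. The model name below is only an example; any tokenizer accepts the same keyword.

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("gpt2", clean_up_tokenization_spaces=False)
ids = tok.encode("Hello , world !")
# With cleanup disabled the decoded string keeps the original spacing;
# with cleanup enabled it would typically come back as "Hello, world!".
print(tok.decode(ids))
```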
@@ -86,17 +86,9 @@ class FlashInferAttnBackend(AttentionBackend):
         super().__init__()
         self.model_runner = model_runner
 
-        local_num_qo_heads = (
-            model_runner.model_config.num_attention_heads // model_runner.tp_size
-        )
-        local_num_kv_heads = model_runner.model_config.get_num_kv_heads(
-            model_runner.tp_size
-        )
-        if (
-            not _grouped_size_compiled_for_decode_kernels(
-                local_num_qo_heads, local_num_kv_heads
-            )
-            or local_num_qo_heads // local_num_kv_heads > 4
+        if not _grouped_size_compiled_for_decode_kernels(
+            model_runner.model_config.num_attention_heads // model_runner.tp_size,
+            model_runner.model_config.get_num_kv_heads(model_runner.tp_size),
         ):
             self.decode_use_tensor_cores = True
         else:
@@ -0,0 +1,117 @@
+from typing import Optional
+
+import torch
+from torch.nn import functional as F
+
+
+def fused_topk_native(
+    hidden_states: torch.Tensor,
+    gating_output: torch.Tensor,
+    topk: int,
+    renormalize: bool,
+):
+    assert hidden_states.shape[0] == gating_output.shape[0], "Number of tokens mismatch"
+    M, _ = hidden_states.shape
+    topk_weights = torch.empty(
+        M, topk, dtype=torch.float32, device=hidden_states.device
+    )
+    topk_ids = torch.empty(M, topk, dtype=torch.int32, device=hidden_states.device)
+    topk_weights = F.softmax(gating_output.float(), dim=-1)
+    topk_weights, topk_ids = torch.topk(topk_weights, topk, dim=-1)
+    if renormalize:
+        topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)
+    return topk_weights, topk_ids
+
+
+# This is used by the Deepseek-V2 model
+def grouped_topk(
+    hidden_states: torch.Tensor,
+    gating_output: torch.Tensor,
+    topk: int,
+    renormalize: bool,
+    num_expert_group: int = 0,
+    topk_group: int = 0,
+):
+
+    assert hidden_states.shape[0] == gating_output.shape[0], "Number of tokens mismatch"
+
+    scores = torch.softmax(gating_output, dim=-1)
+    num_token = scores.shape[0]
+    group_scores = (
+        scores.view(num_token, num_expert_group, -1).max(dim=-1).values
+    )  # [n, n_group]
+    group_idx = torch.topk(group_scores, k=topk_group, dim=-1, sorted=False)[
+        1
+    ]  # [n, top_k_group]
+    group_mask = torch.zeros_like(group_scores)  # [n, n_group]
+    group_mask.scatter_(1, group_idx, 1)  # [n, n_group]
+    score_mask = (
+        group_mask.unsqueeze(-1)
+        .expand(num_token, num_expert_group, scores.shape[-1] // num_expert_group)
+        .reshape(num_token, -1)
+    )  # [n, e]
+    tmp_scores = scores.masked_fill(~score_mask.bool(), 0.0)  # [n, e]
+    topk_weights, topk_ids = torch.topk(tmp_scores, k=topk, dim=-1, sorted=False)
+
+    if renormalize:
+        topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)
+    return topk_weights, topk_ids
+
+
+def select_experts_native(
+    hidden_states: torch.Tensor,
+    router_logits: torch.Tensor,
+    top_k: int,
+    use_grouped_topk: bool,
+    renormalize: bool,
+    topk_group: Optional[int] = None,
+    num_expert_group: Optional[int] = None,
+):
+    # DeekSeekv2 uses grouped_top_k
+    if use_grouped_topk:
+        assert topk_group is not None
+        assert num_expert_group is not None
+        topk_weights, topk_ids = grouped_topk(
+            hidden_states=hidden_states,
+            gating_output=router_logits,
+            topk=top_k,
+            renormalize=renormalize,
+            num_expert_group=num_expert_group,
+            topk_group=topk_group,
+        )
+    else:
+        topk_weights, topk_ids = fused_topk_native(
+            hidden_states=hidden_states,
+            gating_output=router_logits,
+            topk=top_k,
+            renormalize=renormalize,
+        )
+    return topk_weights, topk_ids
+
+
+def fused_moe_forward_native(
+    layer: torch.nn.Module,
+    x: torch.Tensor,
+    use_grouped_topk: bool,
+    top_k: int,
+    router_logits: torch.Tensor,
+    renormalize: bool,
+    topk_group: Optional[int] = None,
+    num_expert_group: Optional[int] = None,
+) -> torch.Tensor:
+    topk_weights, topk_ids = select_experts_native(
+        hidden_states=x,
+        router_logits=router_logits,
+        use_grouped_topk=use_grouped_topk,
+        top_k=top_k,
+        renormalize=renormalize,
+        topk_group=topk_group,
+        num_expert_group=num_expert_group,
+    )
+    w13_weights = layer.w13_weight[topk_ids]
+    w1_weights, w3_weights = torch.chunk(w13_weights, 2, dim=2)
+    w2_weights = layer.w2_weight[topk_ids]
+    x1 = F.silu(torch.einsum("ti,taoi -> tao", x, w1_weights))
+    x3 = torch.einsum("ti, taoi -> tao", x, w3_weights)
+    expert_outs = torch.einsum("tao, taio -> tai", (x1 * x3), w2_weights)
+    return torch.einsum("tai,ta -> ti", expert_outs, topk_weights)
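A minimal usage sketch (not part of the diff) for `fused_moe_forward_native`, using dummy tensors; `SimpleNamespace` stands in for the real FusedMoE layer and all sizes are made up:

```python
from types import SimpleNamespace

import torch

from sglang.srt.layers.fused_moe.patch import fused_moe_forward_native

T, H, FF, E, K = 4, 8, 16, 6, 2  # tokens, hidden, intermediate, experts, top-k
layer = SimpleNamespace(
    w13_weight=torch.randn(E, 2 * FF, H),  # stacked gate/up projections
    w2_weight=torch.randn(E, H, FF),       # down projection
)
x = torch.randn(T, H)
router_logits = torch.randn(T, E)

out = fused_moe_forward_native(
    layer, x, use_grouped_topk=False, top_k=K,
    router_logits=router_logits, renormalize=True,
)
print(out.shape)  # torch.Size([4, 8])
```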
@@ -429,7 +429,7 @@ class ScheduleBatch:
     def prepare_for_extend(self, vocab_size: int):
         self.forward_mode = ForwardMode.EXTEND
 
-        bs = self.batch_size()
+        bs = len(self.reqs)
         reqs = self.reqs
         input_ids = [r.fill_ids[len(r.prefix_indices) :] for r in reqs]
         extend_num_tokens = sum(len(ids) for ids in input_ids)
@@ -509,7 +509,7 @@ class ScheduleBatch:
         self.extend_logprob_start_lens_cpu.extend([0] * running_bs)
 
     def check_decode_mem(self):
-        bs = self.batch_size()
+        bs = len(self.reqs)
         if self.token_to_kv_pool.available_size() >= bs:
             return True
 
@@ -680,14 +680,12 @@ class ScheduleBatch:
                 r.output_ids[-1] if r.output_ids else r.origin_input_ids[-1]
                 for r in self.reqs
             ]
-        else:
-            self.sampling_info.penalizer_orchestrator.cumulate_input_tokens(input_ids)
 
         self.input_ids = torch.tensor(input_ids, dtype=torch.int32, device="cuda")
         self.seq_lens.add_(1)
 
         # Alloc mem
-        bs = self.batch_size()
+        bs = len(self.reqs)
         self.out_cache_loc = self.alloc_token_slots(bs)
 
         self.req_to_token_pool.req_to_token[
@@ -123,6 +123,7 @@ class TokenizerManager:
                     initializer=init_global_processor,
                     mp_context=mp.get_context("fork"),
                     initargs=(server_args,),
+                    max_workers=os.environ.get("SGLANG_CPU_COUNT", os.cpu_count()),
                 )
             else:
                 self.tokenizer = get_tokenizer(
@@ -215,6 +215,7 @@ class ModelTpServer:
         self.new_token_ratio_decay = global_config.new_token_ratio_decay
         self.do_not_get_new_batch = False
 
+    @torch.inference_mode()
     def exposed_step(self, recv_reqs: List):
         try:
             # Recv requests
@@ -246,7 +247,6 @@
         self.out_pyobjs = []
         return ret
 
-    @torch.inference_mode()
     def forward_step(self):
         if self.do_not_get_new_batch and self.current_inflight_req is None:
             new_batch = None
@@ -291,15 +291,15 @@ class RadixCache(BasePrefixCache):
 
     def _collect_leaves(self):
         ret_list = []
+        stack = [self.root_node]
 
-        def dfs_(cur_node):
+        while stack:
+            cur_node = stack.pop()
             if len(cur_node.children) == 0:
                 ret_list.append(cur_node)
+            else:
+                stack.extend(cur_node.children.values())
 
-            for x in cur_node.children.values():
-                dfs_(x)
-
-        dfs_(self.root_node)
         return ret_list
 
 
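A toy illustration (not sglang code) of the same rewrite: the explicit stack visits the same nodes as the old recursive `dfs_` helper but cannot hit Python's recursion limit on very deep radix trees.

```python
class Node:
    def __init__(self, children=None):
        self.children = children or {}

def collect_leaves(root):
    ret_list, stack = [], [root]
    while stack:
        cur_node = stack.pop()
        if len(cur_node.children) == 0:
            ret_list.append(cur_node)
        else:
            stack.extend(cur_node.children.values())
    return ret_list

root = Node({"a": Node({"b": Node()}), "c": Node()})
print(len(collect_leaves(root)))  # 2 leaves
```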
@@ -25,6 +25,7 @@ import torch
 from vllm.distributed.parallel_state import graph_capture
 from vllm.model_executor.custom_op import CustomOp
 
+from sglang.srt.layers.fused_moe.patch import fused_moe_forward_native
 from sglang.srt.layers.logits_processor import (
     LogitsMetadata,
     LogitsProcessor,
@@ -41,14 +42,15 @@ if TYPE_CHECKING:
 def _to_torch(model: torch.nn.Module, reverse: bool = False):
     for sub in model._modules.values():
         if isinstance(sub, CustomOp):
-            # NOTE: FusedMoE torch native implementaiton is not efficient
-            if "FusedMoE" in sub.__class__.__name__:
-                continue
             if reverse:
                 sub._forward_method = sub.forward_cuda
                 setattr(sub, "is_torch_compile", False)
             else:
-                sub._forward_method = sub.forward_native
+                # NOTE: Temporarily workaround MoE
+                if "FusedMoE" in sub.__class__.__name__:
+                    sub._forward_method = fused_moe_forward_native
+                else:
+                    sub._forward_method = sub.forward_native
                 setattr(sub, "is_torch_compile", True)
         if isinstance(sub, torch.nn.Module):
             _to_torch(sub, reverse)
@@ -67,7 +69,9 @@ def patch_model(
             monkey_patch_vllm_all_gather()
             backup_ca_comm = tp_group.ca_comm
             tp_group.ca_comm = None
-            yield torch.compile(model.forward, mode="max-autotune-no-cudagraphs")
+            yield torch.compile(
+                torch.no_grad()(model.forward), mode="max-autotune-no-cudagraphs"
+            )
         else:
             yield model.forward
     finally:
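A small sketch of the wrapping pattern used above, assuming the point is simply that the compiled callable runs without autograd tracking: `torch.no_grad()` acts as a decorator as well as a context manager, so it can wrap the bound forward before `torch.compile`.

```python
import torch

def forward(x):
    return x * 2

compiled = torch.compile(torch.no_grad()(forward), mode="max-autotune-no-cudagraphs")
y = compiled(torch.ones(2, requires_grad=True))
print(y.requires_grad)  # False: no gradient graph is built through the call
```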
@@ -150,7 +154,7 @@ class CudaGraphRunner:
                 f"Capture cuda graph failed: {e}\n"
                 "Possible solutions:\n"
                 "1. disable cuda graph by --disable-cuda-graph\n"
-                "2. set --mem-fraction-static to a smaller value\n"
+                "2. set --mem-fraction-static to a smaller value (e.g., 0.8 or 0.7)\n"
                 "3. disable torch compile by not using --enable-torch-compile\n"
                 "Open an issue on GitHub https://github.com/sgl-project/sglang/issues/new/choose \n"
             )
@@ -97,14 +97,12 @@ class InputMetadata:
         self.modalities = [r.modalities for r in reqs]
 
     def compute_positions(self, batch: ScheduleBatch):
-        position_ids_offsets = batch.position_ids_offsets
-
         if self.forward_mode.is_decode():
             if True:
                 self.positions = self.seq_lens - 1
             else:
                 # Deprecated
-                self.positions = (self.seq_lens - 1) + position_ids_offsets
+                self.positions = (self.seq_lens - 1) + batch.position_ids_offsets
         else:
             if True:
                 self.positions = torch.tensor(
@@ -119,7 +117,7 @@
                 )
             else:
                 # Deprecated
-                position_ids_offsets_cpu = position_ids_offsets.cpu().numpy()
+                position_ids_offsets_cpu = batch.position_ids_offsets.cpu().numpy()
                 self.positions = torch.tensor(
                     np.concatenate(
                         [
@@ -467,7 +467,6 @@ class ModelRunner:
         logger.info("Capture cuda graph begin. This can take up to several minutes.")
         self.cuda_graph_runner = CudaGraphRunner(self)
 
-    @torch.inference_mode()
     def forward_decode(self, batch: ScheduleBatch):
         if self.server_args.lora_paths is not None:
             self.lora_manager.prepare_lora_batch(batch)
@@ -481,7 +480,6 @@
             batch.input_ids, input_metadata.positions, input_metadata
         )
 
-    @torch.inference_mode()
     def forward_extend(self, batch: ScheduleBatch):
         input_metadata = InputMetadata.from_schedule_batch(self, batch)
         if self.server_args.lora_paths is not None:
@@ -500,7 +498,6 @@
             get_embedding=True,
         )
 
-    @torch.inference_mode()
     def forward_extend_multi_modal(self, batch: ScheduleBatch):
         input_metadata = InputMetadata.from_schedule_batch(self, batch)
         return self.model.forward(
@@ -403,6 +403,14 @@ class LlamaForCausalLM(nn.Module):
                 weight_loader = getattr(param, "weight_loader", default_weight_loader)
                 weight_loader(param, loaded_weight)
 
+        if (
+            hasattr(self.config, "tie_word_embeddings")
+            and self.config.tie_word_embeddings
+        ):
+            # Tie output embedding layer to input embedding layer, to solve issues where lm_head.weight is missing
+            param = self.lm_head.weight
+            weight_loader = getattr(param, "weight_loader", default_weight_loader)
+            weight_loader(param, self.model.embed_tokens.weight)
         apply_torchao_config_(self, params_dict, set(["proj.weight"]))
 
 
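For reference, a hedged illustration of the checkpoints this path targets: configs with `tie_word_embeddings` set to true (common for small Llama-style models) ship no separate `lm_head.weight`, so the loader copies the input embedding into the output head instead. The model id below is only an example.

```python
from transformers import AutoConfig

# Whether a given checkpoint ties embeddings depends on its own config.
cfg = AutoConfig.from_pretrained("HuggingFaceTB/SmolLM-135M")
print(getattr(cfg, "tie_word_embeddings", False))
```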
@@ -858,11 +858,18 @@ def v1_chat_generate_request(
                     openai_compatible_messages.append(
                         {"role": message.role, "content": content["text"]}
                     )
+            if openai_compatible_messages[-1]["role"] == "assistant":
+                assistant_prefix = openai_compatible_messages[-1]["content"]
+                openai_compatible_messages = openai_compatible_messages[:-1]
+            else:
+                assistant_prefix = None
             prompt_ids = tokenizer_manager.tokenizer.apply_chat_template(
                 openai_compatible_messages,
                 tokenize=True,
                 add_generation_prompt=True,
             )
+            if assistant_prefix:
+                prompt_ids += tokenizer_manager.tokenizer.encode(assistant_prefix)
             stop = request.stop
             image_data = None
             modalities = []
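A sketch of the request shape this change enables, assuming a local server on the default port: when the final chat message comes from the assistant, it is dropped from the templated conversation and its text is re-encoded as a prefix for the model to continue.

```python
import openai

client = openai.Client(base_url="http://127.0.0.1:30000/v1", api_key="EMPTY")
response = client.chat.completions.create(
    model="default",
    messages=[
        {"role": "user", "content": "Name three primary colors."},
        # Trailing assistant message: treated as a completion prefix, not a turn.
        {"role": "assistant", "content": "Sure. The three primary colors are"},
    ],
    temperature=0,
)
print(response.choices[0].message.content)
```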
sglang/test/runners.py CHANGED
@@ -21,19 +21,19 @@ from typing import List, Union
 
 import torch
 import torch.nn.functional as F
-from peft import PeftModel
-from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers import AutoModelForCausalLM
 
+from sglang.srt.hf_transformers_utils import get_tokenizer
 from sglang.srt.server import Runtime
 from sglang.test.test_utils import DEFAULT_PORT_FOR_SRT_TEST_RUNNER
 
 DEFAULT_PROMPTS = [
-    # the output of gemma-2-2b from SRT is unstable on the commented prompt
-    # "The capital of France is",
     "Apple is red. Banana is Yellow. " * 800 + "Apple is",
     "The capital of the United Kingdom is",
     "Today is a sunny day and I like",
     "AI is a field of computer science focused on",
+    # the output of gemma-2-2b from SRT is unstable on the commented prompt
+    # "The capital of France is",
 ]
 
 dirpath = os.path.dirname(__file__)
@@ -93,11 +93,7 @@ class HFRunner:
         self.model_proc.start()
 
     def start_model_process(self, in_queue, out_queue, model_path, torch_dtype):
-        self.tokenizer = AutoTokenizer.from_pretrained(
-            model_path,
-            torch_dtype=torch_dtype,
-        )
-
+        self.tokenizer = get_tokenizer(model_path)
         if self.is_generation:
             self.base_model = AutoModelForCausalLM.from_pretrained(
                 model_path,
@@ -132,6 +128,8 @@ class HFRunner:
                 input_ids = torch.tensor([p], device="cuda")
 
                 if lora_paths is not None and lora_paths[i] is not None:
+                    from peft import PeftModel
+
                     self.model = PeftModel.from_pretrained(
                         self.base_model,
                         lora_paths[i],
sglang/test/test_utils.py CHANGED
@@ -26,6 +26,7 @@ DEFAULT_FP8_MODEL_NAME_FOR_TEST = "neuralmagic/Meta-Llama-3.1-8B-FP8"
 DEFAULT_MODEL_NAME_FOR_TEST = "meta-llama/Meta-Llama-3.1-8B-Instruct"
 DEFAULT_MOE_MODEL_NAME_FOR_TEST = "mistralai/Mixtral-8x7B-Instruct-v0.1"
 DEFAULT_MLA_MODEL_NAME_FOR_TEST = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct"
+DEFAULT_MLA_FP8_MODEL_NAME_FOR_TEST = "neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8"
 DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH = 600
 DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1 = "meta-llama/Meta-Llama-3.1-8B-Instruct,mistralai/Mistral-7B-Instruct-v0.3,deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct,google/gemma-2-27b-it"
 DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2 = "meta-llama/Meta-Llama-3.1-70B-Instruct,mistralai/Mixtral-8x7B-Instruct-v0.1,Qwen/Qwen2-57B-A14B-Instruct,deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct"
@@ -587,3 +588,37 @@ def run_bench_latency(model, other_args):
         kill_child_process(process.pid)
 
     return output_throughput
+
+
+def lcs(X, Y):
+    m = len(X)
+    n = len(Y)
+    L = [[0] * (n + 1) for _ in range(m + 1)]
+
+    for i in range(m + 1):
+        for j in range(n + 1):
+            if i == 0 or j == 0:
+                L[i][j] = 0
+            elif X[i - 1] == Y[j - 1]:
+                L[i][j] = L[i - 1][j - 1] + 1
+            else:
+                L[i][j] = max(L[i - 1][j], L[i][j - 1])
+
+    return L[m][n]
+
+
+def calculate_rouge_l(output_strs_list1, output_strs_list2):
+    """calculate the ROUGE-L score"""
+    rouge_l_scores = []
+
+    for s1, s2 in zip(output_strs_list1, output_strs_list2):
+        lcs_len = lcs(s1, s2)
+        precision = lcs_len / len(s1) if len(s1) > 0 else 0
+        recall = lcs_len / len(s2) if len(s2) > 0 else 0
+        if precision + recall > 0:
+            fmeasure = (2 * precision * recall) / (precision + recall)
+        else:
+            fmeasure = 0.0
+        rouge_l_scores.append(fmeasure)
+
+    return rouge_l_scores
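A brief usage sketch for the new helpers: `calculate_rouge_l` accepts any pair of sequence lists (token-id lists or strings) and returns one LCS-based F-measure per pair.

```python
reference = ["the", "cat", "sat", "on", "the", "mat"]
candidate = ["the", "cat", "lay", "on", "a", "mat"]
# The LCS is ["the", "cat", "on", "mat"] (length 4), so precision = recall = 4/6.
print(calculate_rouge_l([candidate], [reference]))  # [0.666...]
```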
sglang/version.py CHANGED
@@ -1 +1 @@
-__version__ = "0.3.1.post3"
+__version__ = "0.3.2"
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.3.1.post3
+Version: 0.3.2
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
                                    Version 2.0, January 2004
@@ -318,7 +318,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
 ### Method 2: From source
 ```
 # Use the last release branch
-git clone -b v0.3.1.post3 https://github.com/sgl-project/sglang.git
+git clone -b v0.3.2 https://github.com/sgl-project/sglang.git
 cd sglang
 
 pip install --upgrade pip
@@ -348,9 +348,9 @@ docker run --gpus all \
 <summary>More</summary>
 
 > This method is recommended if you plan to serve it as a service.
-> A better approach is to use the [k8s-sglang-service.yaml](./docker/k8s-sglang-service.yaml).
+> A better approach is to use the [k8s-sglang-service.yaml](docker/k8s-sglang-service.yaml).
 
-1. Copy the [compose.yml](./docker/compose.yaml) to your local machine
+1. Copy the [compose.yml](docker/compose.yaml) to your local machine
 2. Execute the command `docker compose up -d` in your terminal.
 </details>
 
@@ -521,6 +521,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 - BaiChuan2
 - MiniCPM / MiniCPM 3
 - XVERSE / XVERSE MoE
+- SmolLM
 
 
 **Embedding Models**
@@ -529,7 +530,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 - gte-Qwen2
   - `python -m sglang.launch_server --model-path Alibaba-NLP/gte-Qwen2-7B-instruct --is-embedding`
 
-Instructions for supporting a new model are [here](https://github.com/sgl-project/sglang/blob/main/docs/en/model_support.md).
+Instructions for supporting a new model are [here](docs/en/model_support.md).
 
 #### Use Models From ModelScope
 <details>
@@ -824,7 +825,7 @@ def chat_example(s):
 Learn more at this [blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/).
 
 ## Roadmap
-[Development Roadmap (2024 Q3)](https://github.com/sgl-project/sglang/issues/634)
+[Development Roadmap (2024 Q4)](https://github.com/sgl-project/sglang/issues/1487)
 
 ## Citation And Acknowledgment
 Please cite our paper, [SGLang: Efficient Execution of Structured Language Model Programs](https://arxiv.org/abs/2312.07104), if you find the project useful.
@@ -1,14 +1,14 @@
 sglang/__init__.py,sha256=T8MYdFfKFPZcgFKHMBpOCIlFbhjwmr77Nqm6mdE6bCY,1590
 sglang/api.py,sha256=pH4CjwOXUweL5MF1sIkFMddDxfnF7PyUxEHC5kvNVbI,6468
-sglang/bench_latency.py,sha256=lyA_AwlhDbLMrH9Ca5_X3NUYQdwbHn_vpNbMyvqOZic,17342
-sglang/bench_server_latency.py,sha256=KvFJgKQTSons7KOG0CBqnnOOx1gW29bBM1Z3GQO_6-E,5599
+sglang/bench_latency.py,sha256=8Mb_Z8jZk7pDD9OisGfZapyJOsbcwtfxURy2lQ7bNYI,17128
+sglang/bench_server_latency.py,sha256=rRSDqjJ5jan9AzppOGx75KRUjZCU2dUG2h06CQOdJgk,5377
 sglang/bench_serving.py,sha256=3gIJ1O2x51Fwd4wYJjgwluTbWKXL-azckQte7YC5zIc,36261
 sglang/check_env.py,sha256=rGRABCgt-0SfUrow4px28b2P59aMn8eVTnN5eZc_a8s,5397
 sglang/global_config.py,sha256=38id86i3tRGCSOFZlN1LM01a3xt-V98xuNgKGG9boCk,1058
 sglang/launch_server.py,sha256=UnjNjYuZ8TtvmRtgYEsFImkbvCwvn_tQjk0V7cHy67E,450
 sglang/launch_server_llavavid.py,sha256=olPKyhozi1coCwoRMwBRYWsTFByrgus9CwPSeNmskgc,1002
 sglang/utils.py,sha256=NA_4xUrTI7KICQ3PEACfNWKE3nxSA5QvQZJNd4TQrDc,9395
-sglang/version.py,sha256=vtapUd7gvia5JFNpZOX5Q2A4TqgNWABeKFK66x_VeZU,28
+sglang/version.py,sha256=vNiWJ14r_cw5t_7UDqDQIVZvladKFGyHH2avsLpN7Vg,22
 sglang/lang/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sglang/lang/chat_template.py,sha256=uqI_I9zIKXGXg7-W-yjqvx1ZeS_TuwFCms6wkmC2QmY,13411
 sglang/lang/choices.py,sha256=-W1DVw9N9ZliVpvmWrzIXG4cswAah8eMQrHWzkS3D8o,6234
@@ -24,7 +24,7 @@ sglang/lang/backend/openai.py,sha256=qM7eVH_kMxnDd2rpxOH0v76KxtOJFlAwgLgWIKvFGCI
 sglang/lang/backend/runtime_endpoint.py,sha256=MEyMl5cIAMwaWmp4j0HtuCOQ_XdJoyywztvAOGsicao,9832
 sglang/lang/backend/vertexai.py,sha256=O-iBLD-y3vq80UxnrAoJri7bxpgd-_eakZ88Cf8bEGA,4855
 sglang/srt/conversation.py,sha256=S5w5V6G1xigNxa3UQoSxRcMpQLWWDT9EPBoHBvHkSAk,19663
-sglang/srt/hf_transformers_utils.py,sha256=6HlqcmGPIvnSGaEEICeuzwag1QylSoSGbXRVvUdIMDo,6016
+sglang/srt/hf_transformers_utils.py,sha256=rt6flb6BoYTO8fw7AKCXmQLJx5XuSUuRmZX-VJHmuLQ,6064
 sglang/srt/mm_utils.py,sha256=zox644S3IHUWmADdK4MnIbdTS2DWHOy0_Dq0gCU38QQ,12273
 sglang/srt/server.py,sha256=n4QRn36_t-HAH-lSME3tiZSCUGRQwqMUckgs0paHq5g,20179
 sglang/srt/server_args.py,sha256=3XjDt6SSjTfbOe0HSXA--2aUvrpWSnQmAHYwmeS1-M0,23159
@@ -37,7 +37,7 @@ sglang/srt/constrained/base_tool_cache.py,sha256=5sazBMHHDpHMoqOjuY6itCxwTmIFCfl
 sglang/srt/constrained/fsm_cache.py,sha256=k7DRUAaiLTEX5UarfJ17gEYQ-QWQAGfvykjYFkM_Y2U,2982
 sglang/srt/constrained/jump_forward.py,sha256=9_HxmXtWjr5S6a5e0cBimbY3ZhiLiJC74V6jIqDXfuo,6575
 sglang/srt/layers/activation.py,sha256=tRWHxIjcIopkOremkb5Jy5O0rgdB1PAhHfIEONfyj6Y,5166
-sglang/srt/layers/attention_backend.py,sha256=TMxsN1HwgqAURD1i77c-TN-3Xy53H9Kbg6HgpRHHoj0,18167
+sglang/srt/layers/attention_backend.py,sha256=ySiSEHQnhZdQ6kV_9gkAOAP_UEANXSxaSOuLx3rZGzk,17946
 sglang/srt/layers/flashinfer_utils.py,sha256=jyaO7XiEisFZg_dfaCbfRCHSHSKYoM1wOzfHa0h1q14,7413
 sglang/srt/layers/layernorm.py,sha256=p_7bnmSpJ_slpoP0Gk5wQPpHtLllUu3imSIRBqGqTP0,3737
 sglang/srt/layers/linear.py,sha256=9rjCiSb_QOn5RgpVjIhEKdReRvSYVfcTSjbWBEbApLI,45173
@@ -49,6 +49,7 @@ sglang/srt/layers/torchao_utils.py,sha256=rTECwKSXhj_ylh_iSzfbopz9_lZOFHatquQrNJ
 sglang/srt/layers/fused_moe/__init__.py,sha256=bWCrDdOy2ANEXTb8CHYO63O3Iu3eZnn0PJbgl0z5vvE,75
 sglang/srt/layers/fused_moe/fused_moe.py,sha256=1WM2cObWXcFWtqh_utGJFPnrT344rORwuQ9hJDaH2s0,23104
 sglang/srt/layers/fused_moe/layer.py,sha256=raFyvPzjYz-Fv8B3IcOxQYKKCWqXis5mXwg1GFE61y4,22243
+sglang/srt/layers/fused_moe/patch.py,sha256=B9cDtHqHfnWE0QqZAffvUi6cVRKcMBMKDGJWGIaKh3U,3898
 sglang/srt/layers/quantization/__init__.py,sha256=wl9mIOeA6mtKIaW1LWUJABWPdqOb-2uZ-kSijWoxLtU,3095
 sglang/srt/layers/quantization/base_config.py,sha256=vlpSPvSrFmUe65ETg4SoPocQ9bVNY6As3QuHdr_3Dr4,4023
 sglang/srt/layers/triton_attention/decode_attention.py,sha256=XCQTX0kUttT1AG5FRMgfQbiXgvoempYD0UR2r6D_vJg,16711
@@ -62,17 +63,17 @@ sglang/srt/managers/controller_single.py,sha256=DiZALP_iIPZQMRx09a-LwT5_Dg7p-WU8
 sglang/srt/managers/detokenizer_manager.py,sha256=yQkL5gLomLiy1qc6e9HNz8hcj7JQFHm1AfIrzpXaWJE,6852
 sglang/srt/managers/io_struct.py,sha256=yNV5BmeUzLPqv19j79kXQ50Iaqdk4vP-_TciiRf4OEE,11396
 sglang/srt/managers/policy_scheduler.py,sha256=PVo0DV0-5ODNN7FkPkeF1Y8BQ6uuLldPETOlB_YvvL4,11560
-sglang/srt/managers/schedule_batch.py,sha256=ns2qkaYAvzul-LCV1BEB6q1t5jKyftNsReMv62PC8M0,27386
-sglang/srt/managers/tokenizer_manager.py,sha256=ql-sObjl1oRigJwnLtqqTaaw-i7gPTDMoNXDEMftr40,29643
-sglang/srt/managers/tp_worker.py,sha256=0Y0k-roDrBxWZxD0axv5CCvUUW8vsJ8n78TANHLzEFs,39503
+sglang/srt/managers/schedule_batch.py,sha256=rbBwX-Yy98WhaNfazgyyx4p5L3CaTOKMTOOYqpzEWng,27276
+sglang/srt/managers/tokenizer_manager.py,sha256=oo6UwyHMUGWMyWnGVlbpgzh-kiq3QSA1XU3eGGQNcA8,29727
+sglang/srt/managers/tp_worker.py,sha256=qTzR773tJdssLENqdkAcfAD0gn0c1Tlgx2IynJDlQcU,39503
 sglang/srt/mem_cache/base_prefix_cache.py,sha256=qEQwEkG4E5rab2ZoTqcesf5pR_J4nV2jBxIHsBJHtIM,924
 sglang/srt/mem_cache/chunk_cache.py,sha256=CjZZYlqQzq7mYOiBMLWA5XNb6HIyh5lIMdY-K0OUZEc,2368
 sglang/srt/mem_cache/flush_cache.py,sha256=pTLKPRB17U6vl5RFJJvuJ4jCL2SyomgkUBNlkDpGRqo,978
 sglang/srt/mem_cache/memory_pool.py,sha256=4br3Ea2bfA-YsF_sPOVHlF2zQzYGd8fVaYTp197yZsE,7871
-sglang/srt/mem_cache/radix_cache.py,sha256=0AVr1BKKDOtTyybUkwxrz6PT8khDx-DpzgN5MgL27IE,10088
-sglang/srt/model_executor/cuda_graph_runner.py,sha256=gZ0Wukqz6u67MMIj4MC8JET9jcHdh0rotYzpuPlHruY,10512
-sglang/srt/model_executor/forward_batch_info.py,sha256=yvkhayY9Zu6gysoojcGT73lADGOtfHKkFKWdJLRyACI,6141
-sglang/srt/model_executor/model_runner.py,sha256=X7AG1k9AI_kqS8q1i5Bfv-kFysIdqJAVWMGGZoAPThY,22726
+sglang/srt/mem_cache/radix_cache.py,sha256=00bghOihUm7lA1i4gxxMYQLept9LaHg2ZSXZryuFZZI,10121
+sglang/srt/model_executor/cuda_graph_runner.py,sha256=GgD0iIzJQ6xmyTIozOQCluBkM58EcsXHXaP-wpbkHYQ,10698
+sglang/srt/model_executor/forward_batch_info.py,sha256=eDARLwjSnUGXzsLprTEQRtwC5kiRCk3NpbbfqkFDwS8,6094
+sglang/srt/model_executor/model_runner.py,sha256=CxBX35i7epmdVBFCoSl57JTZz8yOLxEj5WjSPs88tus,22642
 sglang/srt/models/baichuan.py,sha256=d2PFmyLBXjzS7X7FL9uz139_CpBPb5WYhzcHgF--gRE,15115
 sglang/srt/models/chatglm.py,sha256=chDkgLTRU3bPxTUilhW_FGnsUWj_2fkvulCi9pdDxBY,13353
 sglang/srt/models/commandr.py,sha256=FspSRkMRAXUjD3xzAkxkMiGiRg91czn9T5bagrf3l9M,14136
@@ -85,7 +86,7 @@ sglang/srt/models/gemma2.py,sha256=8wGqNQPaPjuTtgHiKsUP4nowOukPvXwRywD4lkAW9Dg,1
 sglang/srt/models/gpt_bigcode.py,sha256=k_pZa4Sg5GEsr4ln0kjP765moGUPNs5a6iANPjE2W8U,10177
 sglang/srt/models/grok.py,sha256=71Zx-4Q3wggNMtRYlXuPMA-auK-sHBYukI1Usn8LVrE,14911
 sglang/srt/models/internlm2.py,sha256=nEr6MSHFkTjPLvWl1jQQdGFO7iOHex6YtE-I4rYuLao,12184
-sglang/srt/models/llama.py,sha256=bdIt9IfZBgsg6CoZT3lvB-dqXhfxempdRHLkY3Su_VU,15198
+sglang/srt/models/llama.py,sha256=hTEi7Ce1RkbrTaAe_JuCdQprTbD1XkvglD1t9YecyvM,15629
 sglang/srt/models/llama_classification.py,sha256=UpwYsgNVS1065t7Yjmi2XGbk9Or8bq2cF82zH1Yx2Mg,3385
 sglang/srt/models/llama_embedding.py,sha256=RI2mpYheP5WwhuTINU-6IrU61usuMyCK9h2zDEyLW4g,3458
 sglang/srt/models/llava.py,sha256=1MG1JDDQb7xc67BSimDo98Gmvza6PmrHQHmKybsDui4,24872
@@ -103,7 +104,7 @@ sglang/srt/models/stablelm.py,sha256=v67JM1SHb-LinrsX598WMsLVeyzjoKquW6G5G30X5fQ
 sglang/srt/models/xverse.py,sha256=VThXXKg3DzepcEP1JHcqSyhRBvq6yL14oh4uj5TJOEM,13649
 sglang/srt/models/xverse_moe.py,sha256=BqmV-uk9ipp4nrj6-lnFfvkwUcuKmV7yfGAYB6Ob-UQ,15833
 sglang/srt/models/yivl.py,sha256=N3noJ5M-FiZS-E_zfaJs4prQOu_ineRt11MWloYgOR8,4826
-sglang/srt/openai_api/adapter.py,sha256=CJ47YftRHAip1FMcHIhtCorBtzlIkv7F0Wz_JUcI4T4,51032
+sglang/srt/openai_api/adapter.py,sha256=ULX1lo23r6semogKcbUOXGSgPJi8NJ7IuC0WVvEbVbs,51458
 sglang/srt/openai_api/protocol.py,sha256=rdSwUAoO5-KLemJOE50xwSUagxY4T1QIiNyCYsTtCi0,9868
 sglang/srt/sampling/sampling_batch_info.py,sha256=GewqyxCrW2PFwuzGHaCR59Pvw6j0n2dKGrlJWYQWwW4,6149
 sglang/srt/sampling/sampling_params.py,sha256=ggOXxafqfCD-xrGYcM57byLZ79CIeBP4AD5F44L_CW0,5635
@@ -115,7 +116,7 @@ sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py,sha256=0PlANTrR959
 sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py,sha256=v9jOgA0-I31WcrhIydiFbpy2ZJPLytFLGM98NRPd2sU,2820
 sglang/test/few_shot_gsm8k.py,sha256=To7Sdg-DLF8poIQLwiOBYKbkz-1C_gn6H79vIbyPR-o,3860
 sglang/test/run_eval.py,sha256=NWxeLWmInBgkCvC9Jr_QzF7GfAiBve3Gf1JQrEOlNlU,3899
-sglang/test/runners.py,sha256=ZoWhT1TDXfLBVdbivXx1KUu9dhPlGjL_xrP18WLzVLo,11404
+sglang/test/runners.py,sha256=ZKNGNxlXsgqIEatXO1xwnDkcybfNZ1U3sLfcMZRECdY,11400
 sglang/test/simple_eval_common.py,sha256=r0G-9QLycs2ax3RMc44T_61fzMxlpTzv6pececC7lyY,12379
 sglang/test/simple_eval_gpqa.py,sha256=8Xt9Bw05c7SZTYrCZgB68OZUqUbLo69ywiyx0bTvSUk,3220
 sglang/test/simple_eval_humaneval.py,sha256=7lTi841NT58smNOtRwCedrdX9IWWypdLkOtaQOBy-GI,5687
@@ -125,10 +126,10 @@ sglang/test/simple_eval_mmlu.py,sha256=FkwamjGMjueTixymkedF-YiPloSLiy4ftILFUrKZ9
 sglang/test/test_activation.py,sha256=jkdNRzJnbd5OgZliQaIXpxovlcky17UrweomcOcMxoE,1442
 sglang/test/test_layernorm.py,sha256=IacByD5d-stXjzBz8Ypamc7povlcedpKPbb_4JLgo3c,3720
 sglang/test/test_programs.py,sha256=3-XKnppQdCNWjaJb6jwib5Z9OSpgKvH8SFLJbE4J9qI,17001
-sglang/test/test_utils.py,sha256=OnAFpTA94GmQCHCV5XpaYImn11U7Cg4yfSw0nC17GRs,17504
+sglang/test/test_utils.py,sha256=6hVc0r_7bj1BTPeBPBwM1_rDJPqJElL9xfctvSJCrAI,18532
 sglang/test/srt/sampling/penaltylib/utils.py,sha256=-0p0rV-P4lNo7xAe3rQSBHTubc50a-DFyOQmLGAkgkQ,12515
-sglang-0.3.1.post3.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-sglang-0.3.1.post3.dist-info/METADATA,sha256=uhvB-z9UZsAafHaPfU9qYU6oKxrC6BLcyBspbtoFAY8,38122
-sglang-0.3.1.post3.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
-sglang-0.3.1.post3.dist-info/top_level.txt,sha256=yxhh3pYQkcnA7v3Bg889C2jZhvtJdEincysO7PEB09M,7
-sglang-0.3.1.post3.dist-info/RECORD,,
+sglang-0.3.2.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+sglang-0.3.2.dist-info/METADATA,sha256=9jaNpOSL-vIIWWpigGVUKX-mSoTY6OiVYg0VhwnDwiI,38068
+sglang-0.3.2.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
+sglang-0.3.2.dist-info/top_level.txt,sha256=yxhh3pYQkcnA7v3Bg889C2jZhvtJdEincysO7PEB09M,7
+sglang-0.3.2.dist-info/RECORD,,