sglang 0.3.0__py3-none-any.whl → 0.3.1.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (83)
  1. sglang/bench_latency.py +17 -8
  2. sglang/bench_serving.py +33 -38
  3. sglang/global_config.py +5 -17
  4. sglang/lang/backend/runtime_endpoint.py +5 -2
  5. sglang/lang/interpreter.py +1 -4
  6. sglang/launch_server.py +3 -6
  7. sglang/launch_server_llavavid.py +7 -8
  8. sglang/srt/{model_config.py → configs/model_config.py} +5 -0
  9. sglang/srt/constrained/__init__.py +2 -0
  10. sglang/srt/constrained/fsm_cache.py +33 -38
  11. sglang/srt/constrained/jump_forward.py +0 -1
  12. sglang/srt/conversation.py +4 -1
  13. sglang/srt/hf_transformers_utils.py +1 -3
  14. sglang/srt/layers/activation.py +12 -0
  15. sglang/srt/layers/attention_backend.py +480 -0
  16. sglang/srt/layers/flashinfer_utils.py +235 -0
  17. sglang/srt/layers/fused_moe/layer.py +27 -7
  18. sglang/srt/layers/layernorm.py +12 -0
  19. sglang/srt/layers/logits_processor.py +64 -77
  20. sglang/srt/layers/radix_attention.py +11 -161
  21. sglang/srt/layers/sampler.py +38 -122
  22. sglang/srt/layers/torchao_utils.py +75 -0
  23. sglang/srt/layers/{decode_attention.py → triton_attention/decode_attention.py} +67 -63
  24. sglang/srt/layers/{extend_attention.py → triton_attention/extend_attention.py} +40 -132
  25. sglang/srt/layers/{prefill_attention.py → triton_attention/prefill_attention.py} +13 -7
  26. sglang/srt/lora/lora.py +403 -0
  27. sglang/srt/lora/lora_config.py +43 -0
  28. sglang/srt/lora/lora_manager.py +259 -0
  29. sglang/srt/managers/controller_multi.py +1 -5
  30. sglang/srt/managers/controller_single.py +0 -5
  31. sglang/srt/managers/io_struct.py +16 -1
  32. sglang/srt/managers/policy_scheduler.py +122 -5
  33. sglang/srt/managers/schedule_batch.py +105 -71
  34. sglang/srt/managers/tokenizer_manager.py +17 -8
  35. sglang/srt/managers/tp_worker.py +188 -121
  36. sglang/srt/model_executor/cuda_graph_runner.py +69 -133
  37. sglang/srt/model_executor/forward_batch_info.py +35 -312
  38. sglang/srt/model_executor/model_runner.py +123 -154
  39. sglang/srt/models/baichuan.py +416 -0
  40. sglang/srt/models/chatglm.py +1 -5
  41. sglang/srt/models/commandr.py +1 -5
  42. sglang/srt/models/dbrx.py +1 -5
  43. sglang/srt/models/deepseek.py +1 -5
  44. sglang/srt/models/deepseek_v2.py +7 -6
  45. sglang/srt/models/exaone.py +1 -5
  46. sglang/srt/models/gemma.py +1 -5
  47. sglang/srt/models/gemma2.py +1 -5
  48. sglang/srt/models/gpt_bigcode.py +1 -5
  49. sglang/srt/models/grok.py +1 -5
  50. sglang/srt/models/internlm2.py +1 -5
  51. sglang/srt/models/llama.py +51 -5
  52. sglang/srt/models/llama_classification.py +1 -20
  53. sglang/srt/models/llava.py +30 -5
  54. sglang/srt/models/llavavid.py +2 -2
  55. sglang/srt/models/minicpm.py +1 -5
  56. sglang/srt/models/minicpm3.py +669 -0
  57. sglang/srt/models/mixtral.py +6 -5
  58. sglang/srt/models/mixtral_quant.py +1 -5
  59. sglang/srt/models/olmoe.py +415 -0
  60. sglang/srt/models/qwen.py +1 -5
  61. sglang/srt/models/qwen2.py +1 -5
  62. sglang/srt/models/qwen2_moe.py +6 -5
  63. sglang/srt/models/stablelm.py +1 -5
  64. sglang/srt/models/xverse.py +375 -0
  65. sglang/srt/models/xverse_moe.py +445 -0
  66. sglang/srt/openai_api/adapter.py +65 -46
  67. sglang/srt/openai_api/protocol.py +11 -3
  68. sglang/srt/sampling/sampling_batch_info.py +46 -80
  69. sglang/srt/server.py +30 -15
  70. sglang/srt/server_args.py +163 -28
  71. sglang/srt/utils.py +19 -51
  72. sglang/test/few_shot_gsm8k.py +132 -0
  73. sglang/test/runners.py +114 -22
  74. sglang/test/test_programs.py +7 -5
  75. sglang/test/test_utils.py +85 -2
  76. sglang/utils.py +32 -37
  77. sglang/version.py +1 -1
  78. {sglang-0.3.0.dist-info → sglang-0.3.1.post1.dist-info}/METADATA +30 -18
  79. sglang-0.3.1.post1.dist-info/RECORD +130 -0
  80. {sglang-0.3.0.dist-info → sglang-0.3.1.post1.dist-info}/WHEEL +1 -1
  81. sglang-0.3.0.dist-info/RECORD +0 -118
  82. {sglang-0.3.0.dist-info → sglang-0.3.1.post1.dist-info}/LICENSE +0 -0
  83. {sglang-0.3.0.dist-info → sglang-0.3.1.post1.dist-info}/top_level.txt +0 -0
sglang/test/runners.py CHANGED
@@ -21,6 +21,7 @@ from typing import List, Union
 
  import torch
  import torch.nn.functional as F
+ from peft import PeftModel
  from transformers import AutoModelForCausalLM, AutoTokenizer
 
  from sglang.srt.server import Runtime
@@ -50,6 +51,13 @@ def get_dtype_str(torch_dtype):
  raise NotImplementedError()
 
 
+ def get_top_logprobs(logits, k):
+ logprobs = F.log_softmax(logits, dim=-1, dtype=torch.float32)
+ del logits
+ logprobs, top_indices = torch.topk(logprobs, k=k, dim=-1)
+ return logprobs
+
+
  @dataclass
  class ModelOutput:
  output_strs: List[str] = None
@@ -65,8 +73,10 @@ class HFRunner:
  model_path,
  torch_dtype,
  is_generation,
+ output_str_only=False,
  ):
  self.is_generation = is_generation
+ self.output_str_only = output_str_only
 
  self.in_queue = mp.Queue()
  self.out_queue = mp.Queue()
@@ -89,7 +99,7 @@ class HFRunner:
  )
 
  if self.is_generation:
- self.model = AutoModelForCausalLM.from_pretrained(
+ self.base_model = AutoModelForCausalLM.from_pretrained(
  model_path,
  torch_dtype=torch_dtype,
  trust_remote_code=False,
@@ -104,12 +114,16 @@ class HFRunner:
  )
 
  while True:
- prompts, max_new_tokens = in_queue.get()
+ prompts, max_new_tokens, lora_paths = in_queue.get()
+ if lora_paths is not None:
+ assert len(prompts) == len(lora_paths)
+
  if prompts is not None:
  if self.is_generation:
  output_strs = []
- prefill_logprobs = []
- for p in prompts:
+ top_input_logprobs = []
+ top_output_logprobs = []
+ for i, p in enumerate(prompts):
  if isinstance(p, str):
  input_ids = self.tokenizer.encode(
  p, return_tensors="pt"
@@ -117,40 +131,68 @@ class HFRunner:
  else:
  input_ids = torch.tensor([p], device="cuda")
 
- output_ids = self.model.generate(
- input_ids, do_sample=False, max_new_tokens=max_new_tokens
+ if lora_paths is not None and lora_paths[i] is not None:
+ self.model = PeftModel.from_pretrained(
+ self.base_model,
+ lora_paths[i],
+ torch_dtype=torch_dtype,
+ is_trainable=False,
+ )
+ else:
+ self.model = self.base_model
+
+ outputs = self.model.generate(
+ input_ids,
+ do_sample=False,
+ temperature=None,
+ top_p=None,
+ max_new_tokens=max_new_tokens,
+ return_dict_in_generate=True,
+ output_scores=(not self.output_str_only),
  )
  output_strs.append(
- self.tokenizer.decode(output_ids[0][len(input_ids[0]) :])
+ self.tokenizer.decode(outputs[0][0][len(input_ids[0]) :])
  )
-
- logits = self.model.forward(input_ids).logits[0]
- logprobs = F.log_softmax(logits, dim=-1, dtype=torch.float32)
- logprobs, top_indices = torch.topk(
- logprobs, k=NUM_TOP_LOGPROBS, dim=-1
- )
- # print("index", top_indices)
- prefill_logprobs.append(logprobs.tolist())
- del logits
- del logprobs
+ if not self.output_str_only:
+ # outputs.scores: (num_token, 1, vocab_size)
+ top_output_logprobs.append(
+ [
+ get_top_logprobs(
+ logits[0], NUM_TOP_LOGPROBS
+ ).tolist()
+ for logits in outputs.scores
+ ]
+ )
+ del outputs
+
+ input_logits = self.model.forward(input_ids).logits[0]
+ top_input_logprobs.append(
+ get_top_logprobs(
+ input_logits, NUM_TOP_LOGPROBS
+ ).tolist()
+ )
+ del input_logits
 
  out_queue.put(
  ModelOutput(
- output_strs=output_strs, top_input_logprobs=prefill_logprobs
+ output_strs=output_strs,
+ top_input_logprobs=top_input_logprobs,
+ top_output_logprobs=top_output_logprobs,
  )
  )
 
  else:
+ assert not self.output_str_only
  logits = self.model.encode(prompts).tolist()
-
  out_queue.put(ModelOutput(embed_logits=logits))
 
  def forward(
  self,
  prompts: Union[List[str], List[torch.Tensor]] = DEFAULT_PROMPTS,
  max_new_tokens=8,
+ lora_paths=None,
  ):
- self.in_queue.put((prompts, max_new_tokens))
+ self.in_queue.put((prompts, max_new_tokens, lora_paths))
  return self.out_queue.get()
 
  def terminate(self):
@@ -173,6 +215,10 @@ class SRTRunner:
  is_generation,
  tp_size=1,
  port=DEFAULT_PORT_FOR_SRT_TEST_RUNNER,
+ lora_paths=None,
+ max_loras_per_batch=4,
+ disable_cuda_graph=False,
+ disable_radix_cache=False,
  ):
  self.is_generation = is_generation
  self.runtime = Runtime(
@@ -183,21 +229,28 @@ class SRTRunner:
  mem_fraction_static=0.69,
  trust_remote_code=False,
  is_embedding=not self.is_generation,
+ lora_paths=lora_paths,
+ max_loras_per_batch=max_loras_per_batch,
+ disable_cuda_graph=disable_cuda_graph,
+ disable_radix_cache=disable_radix_cache,
  )
 
  def forward(
  self,
  prompts: Union[List[str], List[torch.Tensor]] = DEFAULT_PROMPTS,
  max_new_tokens=8,
+ lora_paths=None,
  ):
  if self.is_generation:
  # the return value contains logprobs from prefill
  output_strs = []
  top_input_logprobs = []
+ top_output_logprobs = []
  sampling_params = {"max_new_tokens": max_new_tokens, "temperature": 0}
- for prompt in prompts:
+ for i, prompt in enumerate(prompts):
  response = self.runtime.generate(
  prompt,
+ lora_path=lora_paths[i] if lora_paths else None,
  sampling_params=sampling_params,
  return_logprob=True,
  logprob_start_len=0,
@@ -219,9 +272,48 @@ class SRTRunner:
  ]
  ]
  )
+ top_output_logprobs.append(
+ [
+ [tup[0] for tup in x[:NUM_TOP_LOGPROBS]]
+ for x in response["meta_info"]["output_top_logprobs"]
+ ]
+ )
+
+ return ModelOutput(
+ output_strs=output_strs,
+ top_input_logprobs=top_input_logprobs,
+ top_output_logprobs=top_output_logprobs,
+ )
+ else:
+ response = self.runtime.encode(prompts)
+ response = json.loads(response)
+ logits = [x["embedding"] for x in response]
+ return ModelOutput(embed_logits=logits)
+
+ def batch_forward(
+ self,
+ prompts: Union[List[str], List[torch.Tensor]] = DEFAULT_PROMPTS,
+ max_new_tokens=8,
+ lora_paths=None,
+ ):
+ """
+ testing serving by sending all prompts once
+ only return output strings and no logprobs
+ """
+ if self.is_generation:
+ # the return value contains logprobs from prefill
+ output_strs = []
+ sampling_params = {"max_new_tokens": max_new_tokens, "temperature": 0}
+ response = self.runtime.generate(
+ prompts,
+ lora_path=lora_paths if lora_paths else None,
+ sampling_params=sampling_params,
+ )
+ response = json.loads(response)
+ output_strs = [r["text"] for r in response]
 
  return ModelOutput(
- output_strs=output_strs, top_input_logprobs=top_input_logprobs
+ output_strs=output_strs,
  )
  else:
  response = self.runtime.encode(prompts)
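The runner changes above add per-prompt LoRA adapters (applied through `peft.PeftModel` on the HF side and `lora_path`/`lora_paths` on the SRT side) and a `top_output_logprobs` field alongside the existing prefill logprobs. A minimal sketch of how a comparison test might drive the two runners, assuming a placeholder base model and adapter directory (neither is taken from this diff):

```python
# Sketch only: the model name and LoRA path are illustrative placeholders.
import torch

from sglang.test.runners import HFRunner, SRTRunner

prompts = ["The capital of France is"]
lora_paths = ["/path/to/a-lora-adapter"]  # hypothetical adapter directory

# HF reference: loads the base model once, then wraps it with PeftModel per prompt.
hf = HFRunner("meta-llama/Llama-2-7b-hf", torch_dtype=torch.float16, is_generation=True)
hf_out = hf.forward(prompts, max_new_tokens=8, lora_paths=lora_paths)
hf.terminate()

# SRT runner: adapters are registered at launch and selected per request.
srt = SRTRunner(
    "meta-llama/Llama-2-7b-hf",
    torch_dtype=torch.float16,
    is_generation=True,
    lora_paths=lora_paths,
    max_loras_per_batch=4,
)
srt_out = srt.forward(prompts, max_new_tokens=8, lora_paths=lora_paths)

# Both ModelOutput values now carry output_strs, top_input_logprobs, and
# top_output_logprobs, so decoded text and per-token top-k logprobs can be compared.
print(hf_out.output_strs[0], srt_out.output_strs[0])
```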
sglang/test/test_programs.py CHANGED
@@ -7,7 +7,7 @@ import time
  import numpy as np
 
  import sglang as sgl
- from sglang.utils import fetch_and_cache_jsonl
+ from sglang.utils import download_and_cache_file, read_jsonl
 
 
  def test_few_shot_qa():
@@ -456,10 +456,6 @@ def test_chat_completion_speculative():
  def test_hellaswag_select():
  """Benchmark the accuracy of sgl.select on the HellaSwag dataset."""
 
- url = "https://raw.githubusercontent.com/rowanz/hellaswag/master/data/hellaswag_val.jsonl"
- lines = fetch_and_cache_jsonl(url)
-
- # Construct prompts
  def get_one_example(lines, i, include_answer):
  ret = lines[i]["activity_label"] + ": " + lines[i]["ctx"] + " "
  if include_answer:
@@ -472,6 +468,12 @@ def test_hellaswag_select():
  ret += get_one_example(lines, i, True) + "\n\n"
  return ret
 
+ # Read data
+ url = "https://raw.githubusercontent.com/rowanz/hellaswag/master/data/hellaswag_val.jsonl"
+ filename = download_and_cache_file(url)
+ lines = list(read_jsonl(filename))
+
+ # Construct prompts
  num_questions = 200
  num_shots = 20
  few_shot_examples = get_few_shot_examples(lines, num_shots)
sglang/test/test_utils.py CHANGED
@@ -7,6 +7,7 @@ import subprocess
  import threading
  import time
  from functools import partial
+ from types import SimpleNamespace
  from typing import Callable, List, Optional
 
  import numpy as np
@@ -14,6 +15,7 @@ import requests
  import torch
  import torch.nn.functional as F
 
+ from sglang.bench_serving import run_benchmark
  from sglang.global_config import global_config
  from sglang.lang.backend.openai import OpenAI
  from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
@@ -28,7 +30,13 @@ DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2 = "meta-llama/Meta-Llama-3.1-70B-Instruc
  DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1 = "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8,neuralmagic/Mistral-7B-Instruct-v0.3-FP8,neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8,neuralmagic/gemma-2-2b-it-FP8"
  DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2 = "neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8,neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8,neuralmagic/Qwen2-72B-Instruct-FP8,neuralmagic/Qwen2-57B-A14B-Instruct-FP8"
 
- if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
+
+ def is_in_ci():
+ """Return whether it is in CI runner."""
+ return os.getenv("SGLANG_IS_IN_CI", "false") == "true"
+
+
+ if is_in_ci():
  DEFAULT_PORT_FOR_SRT_TEST_RUNNER = 5157
  DEFAULT_URL_FOR_TEST = "http://127.0.0.1:6157"
  else:
@@ -296,7 +304,6 @@ def add_common_sglang_args_and_parse(parser: argparse.ArgumentParser):
  def select_sglang_backend(args: argparse.Namespace):
  if args.backend.startswith("srt"):
  if args.backend == "srt-no-parallel":
- global_config.enable_parallel_decoding = False
  global_config.enable_parallel_encoding = False
  backend = RuntimeEndpoint(f"{args.host}:{args.port}")
  elif args.backend.startswith("gpt-"):
@@ -501,3 +508,79 @@ def run_unittest_files(files: List[str], timeout_per_file: float):
 
  def get_similarities(vec1, vec2):
  return F.cosine_similarity(torch.tensor(vec1), torch.tensor(vec2), dim=0)
+
+
+ def run_bench_serving(model, num_prompts, request_rate, other_server_args):
+ # Launch the server
+ base_url = DEFAULT_URL_FOR_TEST
+ process = popen_launch_server(
+ model,
+ base_url,
+ timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+ other_args=other_server_args,
+ )
+
+ # Run benchmark
+ args = SimpleNamespace(
+ backend="sglang",
+ base_url=base_url,
+ host=None,
+ port=None,
+ dataset_name="random",
+ dataset_path="",
+ model=None,
+ tokenizer=None,
+ num_prompts=num_prompts,
+ sharegpt_output_len=None,
+ random_input_len=4096,
+ random_output_len=2048,
+ random_range_ratio=0.0,
+ request_rate=request_rate,
+ multi=None,
+ seed=0,
+ output_file=None,
+ disable_tqdm=False,
+ disable_stream=False,
+ disable_ignore_eos=False,
+ extra_request_body=None,
+ )
+
+ try:
+ res = run_benchmark(args)
+ finally:
+ kill_child_process(process.pid)
+
+ assert res["completed"] == num_prompts
+ return res
+
+
+ def run_bench_latency(model, other_args):
+ command = [
+ "python3",
+ "-m",
+ "sglang.bench_latency",
+ "--model-path",
+ model,
+ "--batch-size",
+ "1",
+ "--input",
+ "128",
+ "--output",
+ "8",
+ *other_args,
+ ]
+ process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+
+ try:
+ stdout, stderr = process.communicate()
+ output = stdout.decode()
+ error = stderr.decode()
+ print(f"Output: {output}", flush=True)
+ print(f"Error: {error}", flush=True)
+
+ lastline = output.split("\n")[-3]
+ output_throughput = float(lastline.split(" ")[-2])
+ finally:
+ kill_child_process(process.pid)
+
+ return output_throughput
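The new `run_bench_serving` and `run_bench_latency` helpers bundle server launch, benchmark execution, and process cleanup so CI tests can assert directly on throughput. A hedged sketch of how a test could call them; the model name and thresholds below are placeholders, not values from this diff:

```python
# Sketch only: model name and throughput expectations are illustrative.
from sglang.test.test_utils import run_bench_latency, run_bench_serving


def test_offline_throughput():
    # Spawns `python3 -m sglang.bench_latency` and parses tokens/s from its output.
    throughput = run_bench_latency("meta-llama/Meta-Llama-3-8B-Instruct", [])
    assert throughput > 0


def test_online_serving():
    # Launches a server, runs sglang.bench_serving on random prompts, then kills it.
    res = run_bench_serving(
        model="meta-llama/Meta-Llama-3-8B-Instruct",
        num_prompts=10,
        request_rate=float("inf"),
        other_server_args=[],
    )
    assert res["completed"] == 10
```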
sglang/utils.py CHANGED
@@ -12,7 +12,7 @@ import urllib.request
  from concurrent.futures import ThreadPoolExecutor
  from io import BytesIO
  from json import dumps
- from typing import Union
+ from typing import Optional, Union
 
  import numpy as np
  import requests
@@ -38,13 +38,11 @@ def is_same_type(values: list):
 
  def read_jsonl(filename: str):
  """Read a JSONL file."""
- rets = []
  with open(filename) as fin:
  for line in fin:
  if line.startswith("#"):
  continue
- rets.append(json.loads(line))
- return rets
+ yield json.loads(line)
 
 
  def dump_state_text(filename: str, states: list, mode: str = "w"):
@@ -264,38 +262,35 @@ class LazyImport:
  return module(*args, **kwargs)
 
 
- def fetch_and_cache_jsonl(url, cache_file="cached_data.jsonl"):
- """Read and cache a jsonl file from a url."""
+ def download_and_cache_file(url: str, filename: Optional[str] = None):
+ """Read and cache a file from a url."""
+ if filename is None:
+ filename = os.path.join("/tmp", url.split("/")[-1])
 
  # Check if the cache file already exists
- if os.path.exists(cache_file):
- print("Loading data from cache...")
- with open(cache_file, "r") as f:
- data = [json.loads(line) for line in f]
- else:
- print("Downloading data from URL...")
- # Stream the response to show the progress bar
- response = requests.get(url, stream=True)
- response.raise_for_status() # Check for request errors
-
- # Total size of the file in bytes
- total_size = int(response.headers.get("content-length", 0))
- chunk_size = 1024 # Download in chunks of 1KB
-
- # Use tqdm to display the progress bar
- with open(cache_file, "wb") as f, tqdm(
- desc=cache_file,
- total=total_size,
- unit="B",
- unit_scale=True,
- unit_divisor=1024,
- ) as bar:
- for chunk in response.iter_content(chunk_size=chunk_size):
- f.write(chunk)
- bar.update(len(chunk))
-
- # Convert the data to a list of dictionaries
- with open(cache_file, "r") as f:
- data = [json.loads(line) for line in f]
-
- return data
+ if os.path.exists(filename):
+ return filename
+
+ print(f"Downloading from {url} to {filename}")
+
+ # Stream the response to show the progress bar
+ response = requests.get(url, stream=True)
+ response.raise_for_status() # Check for request errors
+
+ # Total size of the file in bytes
+ total_size = int(response.headers.get("content-length", 0))
+ chunk_size = 1024 # Download in chunks of 1KB
+
+ # Use tqdm to display the progress bar
+ with open(filename, "wb") as f, tqdm(
+ desc=filename,
+ total=total_size,
+ unit="B",
+ unit_scale=True,
+ unit_divisor=1024,
+ ) as bar:
+ for chunk in response.iter_content(chunk_size=chunk_size):
+ f.write(chunk)
+ bar.update(len(chunk))
+
+ return filename
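`fetch_and_cache_jsonl` is replaced by the more general `download_and_cache_file`, which caches to `/tmp/<basename>` and returns a path, while `read_jsonl` now yields rows lazily. A short sketch of the new calling pattern, mirroring the updated HellaSwag test above; callers that need indexing must materialize the generator with `list()`:

```python
from sglang.utils import download_and_cache_file, read_jsonl

url = "https://raw.githubusercontent.com/rowanz/hellaswag/master/data/hellaswag_val.jsonl"

# Downloads to /tmp/hellaswag_val.jsonl on the first call; later calls reuse the cache.
filename = download_and_cache_file(url)

# read_jsonl is now a generator, so wrap it in list() when random access is needed.
lines = list(read_jsonl(filename))
print(len(lines), lines[0]["activity_label"])
```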
sglang/version.py CHANGED
@@ -1 +1 @@
- __version__ = "0.3.0"
+ __version__ = "0.3.1.post1"
{sglang-0.3.0.dist-info → sglang-0.3.1.post1.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: sglang
- Version: 0.3.0
+ Version: 0.3.1.post1
  Summary: SGLang is yet another fast serving framework for large language models and vision language models.
  License: Apache License
  Version 2.0, January 2004
@@ -242,6 +242,7 @@ Requires-Dist: psutil; extra == "srt"
  Requires-Dist: pydantic; extra == "srt"
  Requires-Dist: python-multipart; extra == "srt"
  Requires-Dist: torch; extra == "srt"
+ Requires-Dist: torchao; extra == "srt"
  Requires-Dist: uvicorn; extra == "srt"
  Requires-Dist: uvloop; extra == "srt"
  Requires-Dist: zmq; extra == "srt"
@@ -253,6 +254,7 @@ Requires-Dist: matplotlib; extra == "test"
  Requires-Dist: pandas; extra == "test"
  Requires-Dist: sentence-transformers; extra == "test"
  Requires-Dist: accelerate; extra == "test"
+ Requires-Dist: peft; extra == "test"
 
  <div align="center">
  <img src="https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png" alt="logo" width="400"></img>
@@ -271,14 +273,16 @@ Requires-Dist: accelerate; extra == "test"
 
  SGLang is a fast serving framework for large language models and vision language models.
  It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.
-
  The core features include:
- - **Fast Backend Runtime**: Efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, continuous batching, token attention (paged attention), tensor parallelism, FlashInfer kernels, and quantization (AWQ/FP8/GPTQ/Marlin).
- - **Flexible Frontend Language**: Enables easy programming of LLM applications with chained generation calls, advanced prompting, control flow, multiple modalities, parallelism, and external interactions.
+
+ - **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, continuous batching, token attention (paged attention), tensor parallelism, FlashInfer kernels, chunked prefill, and quantization (INT4/FP8/AWQ/GPTQ).
+ - **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
+ - **Extensive Model Support**: Supports a wide range of generative models (Llama 3, Gemma 2, Mistral, QWen, DeepSeek, LLaVA, etc.) and embedding models (e5-mistral), with easy extensibility for integrating new models.
+ - **Active Community**: SGLang is open-source and backed by an active community with industry adoption, welcoming contributions to improve LLM and VLM serving.
 
  ## News
+ - [2024/09] 🔥 SGLang v0.3 Release: 7x Faster DeepSeek MLA, 1.5x Faster torch.compile, Multi-Image/Video LLaVA-OneVision ([blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)).
  - [2024/07] 🔥 Faster Llama3 Serving with SGLang Runtime (vs. TensorRT-LLM, vLLM) ([blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/)).
- - [2024/08] 🔥 LLaVA-OneVision with single-image, multi-image and video are supported ([blog](https://llava-vl.github.io/blog/2024-08-05-llava-onevision/)).
  - [2024/02] SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).
 
  <details>
@@ -300,6 +304,8 @@ The core features include:
 
  ## Install
 
+ You can install SGLang using any of the methods below.
+
  ### Method 1: With pip
  ```
  pip install --upgrade pip
@@ -312,7 +318,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
  ### Method 2: From source
  ```
  # Use the last release branch
- git clone -b v0.3.0 https://github.com/sgl-project/sglang.git
+ git clone -b v0.3.1.post1 https://github.com/sgl-project/sglang.git
  cd sglang
 
  pip install --upgrade pip
@@ -323,7 +329,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
  ```
 
  ### Method 3: Using docker
- The docker images are available on Docker Hub as [lmsysorg/sglang](https://hub.docker.com/r/lmsysorg/sglang/tags), built from [Dockerfile](docker).
+ The docker images are available on Docker Hub as [lmsysorg/sglang](https://hub.docker.com/r/lmsysorg/sglang/tags), built from [Dockerfile](https://github.com/sgl-project/sglang/tree/main/docker).
  Replace `<secret>` below with your huggingface hub [token](https://huggingface.co/docs/hub/en/security-tokens).
 
  ```bash
@@ -391,7 +397,7 @@ sky status --endpoint 30000 sglang
 
 
  ### Common Notes
- - [FlashInfer](https://github.com/flashinfer-ai/flashinfer) is currently one of the dependencies that must be installed for SGLang. It only supports sm75 and above. If you encounter any FlashInfer-related issues on sm75+ devices (e.g., T4, A10, A100, L4, L40S, H100), consider using Triton's kernel by `--disable-flashinfer --disable-flashinfer-sampling` and raise an issue.
+ - [FlashInfer](https://github.com/flashinfer-ai/flashinfer) is the default attention kernel backend. It only supports sm75 and above. If you encounter any FlashInfer-related issues on sm75+ devices (e.g., T4, A10, A100, L4, L40S, H100), please switch to other kernels by adding `--attention-backend triton --sampling-backend pytorch` and open an issue on GitHub.
  - If you only need to use the OpenAI backend, you can avoid installing other dependencies by using `pip install "sglang[openai]"`.
 
  ## Backend: SGLang Runtime (SRT)
@@ -457,24 +463,29 @@ print(response)
  It supports streaming, vision, and most features of the Chat/Completions/Models/Batch endpoints specified by the [OpenAI API Reference](https://platform.openai.com/docs/api-reference/).
 
  ### Additional Server Arguments
- - Add `--tp 2` to enable multi-GPU tensor parallelism. If it reports the error "peer access is not supported between these two devices", add `--enable-p2p-check` to the server launch command.
+ - To enable multi-GPU tensor parallelism, add `--tp 2`. If it reports the error "peer access is not supported between these two devices", add `--enable-p2p-check` to the server launch command.
  ```
- python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --tp 2
+ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --tp 2
  ```
- - Add `--dp 2` to enable multi-GPU data parallelism. Data parallelism is better for throughput if there is enough memory. It can also be used together with tensor parallelism. The following command uses 4 GPUs in total.
+ - To enable multi-GPU data parallelism, add `--dp 2`. Data parallelism is better for throughput if there is enough memory. It can also be used together with tensor parallelism. The following command uses 4 GPUs in total.
  ```
- python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --dp 2 --tp 2
+ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --dp 2 --tp 2
  ```
  - If you see out-of-memory errors during serving, try to reduce the memory usage of the KV cache pool by setting a smaller value of `--mem-fraction-static`. The default value is `0.9`.
  ```
- python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --mem-fraction-static 0.7
+ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --mem-fraction-static 0.7
  ```
  - See [hyperparameter_tuning.md](docs/en/hyperparameter_tuning.md) on tuning hyperparameters for better performance.
  - If you see out-of-memory errors during prefill for long prompts, try to set a smaller chunked prefill size.
  ```
- python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --chunked-prefill-size 4096
+ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --chunked-prefill-size 4096
  ```
- - Add `--nnodes 2` to run tensor parallelism on multiple nodes. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-0` be the hostname of the first node and `50000` be an available port.
+ - To enable torch.compile acceleration, add `--enable-torch-compile`. It accelerates small models on small batch sizes.
+ - To enable fp8 weight quantization, add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
+ - To enable fp8 kv cache quantization, add `--kv-cache-dtype fp8_e5m2`.
+ - To enable DeepSeek MLA acceleration, add `--enable-mla`.
+ - If the model does not have a chat template in the Hugging Face tokenizer, you can specify a [custom chat template](docs/en/custom_chat_template.md).
+ - To run tensor parallelism on multiple nodes, add `--nnodes 2`. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-0` be the hostname of the first node and `50000` be an available port.
  ```
  # Node 0
  python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --tp 4 --nccl-init sgl-dev-0:50000 --nnodes 2 --node-rank 0
@@ -482,9 +493,6 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
  # Node 1
  python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --tp 4 --nccl-init sgl-dev-0:50000 --nnodes 2 --node-rank 1
  ```
- - If the model does not have a template in the Hugging Face tokenizer, you can specify a [custom chat template](docs/en/custom_chat_template.md).
- - To enable experimental torch.compile support, you can add `--enable-torch-compile`. It accelerates small models on small batch sizes.
- - To enable fp8 quantization, you can add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
 
  ### Supported Models
 
@@ -510,6 +518,10 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
  - ChatGLM
  - InternLM 2
  - Exaone 3
+ - BaiChuan2
+ - MiniCPM / MiniCPM 3
+ - XVERSE / XVERSE MoE
+
 
  **Embedding Models**
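The README excerpt above notes that the server exposes OpenAI-compatible Chat/Completions endpoints. As a hedged illustration of that claim (the model name is a placeholder, and this assumes a server launched with `python -m sglang.launch_server` on its default port 30000 plus the `openai` Python package):

```python
# Sketch only: assumes a locally running SGLang server on the default port 30000.
import openai

client = openai.Client(base_url="http://127.0.0.1:30000/v1", api_key="EMPTY")

response = client.chat.completions.create(
    model="meta-llama/Meta-Llama-3-8B-Instruct",  # placeholder model name
    messages=[{"role": "user", "content": "List 3 countries and their capitals."}],
    temperature=0,
    max_tokens=64,
)
print(response.choices[0].message.content)
```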