sglang 0.3.4.post1__py3-none-any.whl → 0.3.4.post2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30)
  1. sglang/srt/configs/model_config.py +25 -2
  2. sglang/srt/constrained/fsm_cache.py +10 -3
  3. sglang/srt/hf_transformers_utils.py +14 -0
  4. sglang/srt/layers/attention/flashinfer_backend.py +5 -5
  5. sglang/srt/layers/logits_processor.py +5 -5
  6. sglang/srt/layers/rotary_embedding.py +15 -48
  7. sglang/srt/layers/sampler.py +51 -39
  8. sglang/srt/managers/data_parallel_controller.py +1 -1
  9. sglang/srt/managers/detokenizer_manager.py +4 -0
  10. sglang/srt/managers/io_struct.py +10 -0
  11. sglang/srt/managers/schedule_batch.py +13 -3
  12. sglang/srt/managers/scheduler.py +8 -2
  13. sglang/srt/managers/tokenizer_manager.py +14 -0
  14. sglang/srt/managers/tp_worker_overlap_thread.py +58 -21
  15. sglang/srt/mem_cache/memory_pool.py +10 -3
  16. sglang/srt/model_executor/cuda_graph_runner.py +29 -21
  17. sglang/srt/model_executor/forward_batch_info.py +6 -9
  18. sglang/srt/model_executor/model_runner.py +2 -2
  19. sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +6 -3
  20. sglang/srt/sampling/sampling_params.py +5 -7
  21. sglang/srt/server.py +12 -0
  22. sglang/test/run_eval.py +2 -0
  23. sglang/test/srt/sampling/penaltylib/utils.py +1 -0
  24. sglang/test/test_utils.py +100 -3
  25. sglang/version.py +1 -1
  26. {sglang-0.3.4.post1.dist-info → sglang-0.3.4.post2.dist-info}/METADATA +13 -14
  27. {sglang-0.3.4.post1.dist-info → sglang-0.3.4.post2.dist-info}/RECORD +30 -30
  28. {sglang-0.3.4.post1.dist-info → sglang-0.3.4.post2.dist-info}/LICENSE +0 -0
  29. {sglang-0.3.4.post1.dist-info → sglang-0.3.4.post2.dist-info}/WHEEL +0 -0
  30. {sglang-0.3.4.post1.dist-info → sglang-0.3.4.post2.dist-info}/top_level.txt +0 -0
sglang/srt/mem_cache/memory_pool.py CHANGED
@@ -38,7 +38,7 @@ class ReqToTokenPool:
         self.size = size
         self.max_context_len = max_context_len
         self.device = device
-        self.req_to_token = torch.empty(
+        self.req_to_token = torch.zeros(
             (size, max_context_len), dtype=torch.int32, device=device
         )
         self.free_slots = list(range(size))
@@ -51,7 +51,7 @@ class ReqToTokenPool:
             self.write = self.write_without_records

     def write(self, indices, values):
-        # Keep the signature for type checking, will be initialized during runtime
+        # Keep the signature for type checking. It will be assigned during runtime.
         raise NotImplementedError()

     def available_size(self):
@@ -223,7 +223,6 @@ class MHATokenToKVPool(BaseTokenToKVPool):
         layer_id = layer.layer_id
         if cache_k.dtype != self.dtype:
             cache_k = cache_k.to(self.dtype)
-        if cache_v.dtype != self.dtype:
             cache_v = cache_v.to(self.dtype)
         if self.store_dtype != self.dtype:
             self.k_buffer[layer_id][loc] = cache_k.view(self.store_dtype)
@@ -233,6 +232,14 @@ class MHATokenToKVPool(BaseTokenToKVPool):
             self.v_buffer[layer_id][loc] = cache_v


+# This compiled version is slower in the unit test
+# python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_non_stream_small_batch_size
+@torch.compile(dynamic=True)
+def copy_two_array(loc, dst_1, src_1, dst_2, src_2, dtype, store_dtype):
+    dst_1[loc] = src_1.to(dtype).view(store_dtype)
+    dst_2[loc] = src_2.to(dtype).view(store_dtype)
+
+
 class MLATokenToKVPool(BaseTokenToKVPool):

     def __init__(
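For orientation, here is a minimal sketch of how the new module-level helper is meant to be called. The shapes and the fp16/int16 pairing below are made up for illustration (both are 2-byte types, so the `.view(store_dtype)` bit-reinterpretation works without fp8 support); the pool itself passes its configured `self.dtype` / `self.store_dtype`.

```python
import torch

from sglang.srt.mem_cache.memory_pool import copy_two_array

# Illustrative only: fp16 values reinterpreted into an int16 backing buffer.
dtype, store_dtype = torch.float16, torch.int16
k_buffer = torch.zeros(1024, 64, dtype=store_dtype)
v_buffer = torch.zeros(1024, 64, dtype=store_dtype)
loc = torch.tensor([3, 7])                        # cache slots to write
cache_k = torch.randn(2, 64, dtype=torch.float16)
cache_v = torch.randn(2, 64, dtype=torch.float16)

# Casts both tensors to `dtype`, reinterprets them as `store_dtype`, and scatters
# them into the two buffers at `loc` in one compiled call.
copy_two_array(loc, k_buffer, cache_k, v_buffer, cache_v, dtype, store_dtype)
```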
sglang/srt/model_executor/cuda_graph_runner.py CHANGED
@@ -92,6 +92,11 @@ def set_torch_compile_config():
     torch._dynamo.config.accumulated_cache_size_limit = 1024


+@torch.compile(dynamic=True)
+def clamp_position(seq_lens):
+    return torch.clamp((seq_lens - 1), min=0).to(torch.int64)
+
+
 class CudaGraphRunner:
     """A CudaGraphRunner runs the forward pass of a model with cuda graph and torch.compile."""

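The helper simply factors out the decode-position computation that the capture path previously inlined (see the `positions=clamp_position(seq_lens)` change further down). Evaluated eagerly, the expression it wraps behaves like this:

```python
import torch

# The position of the token being decoded is seq_len - 1, clamped at 0 so that
# padded zero-length slots still produce a valid index.
seq_lens = torch.tensor([5, 1, 0])
print(torch.clamp(seq_lens - 1, min=0).to(torch.int64))  # tensor([4, 0, 0])
```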
@@ -111,8 +116,7 @@ class CudaGraphRunner:
         if self.model_runner.server_args.disable_cuda_graph_padding:
             self.capture_bs = list(range(1, 32)) + [64, 128]
         else:
-            self.capture_bs = [1, 2, 4] + [i * 8 for i in range(1, 21)]
-
+            self.capture_bs = [1, 2, 3, 4] + [i * 8 for i in range(1, 21)]
         self.capture_bs = [
             bs for bs in self.capture_bs if bs <= model_runner.req_to_token_pool.size
         ]
@@ -129,6 +133,7 @@
         # Attention backend
         self.max_bs = max(self.capture_bs)
         self.model_runner.attn_backend.init_cuda_graph_state(self.max_bs)
+
         self.seq_len_fill_value = (
             self.model_runner.attn_backend.get_cuda_graph_seq_len_fill_value()
         )
@@ -147,6 +152,7 @@
             (self.max_bs,), self.seq_len_fill_value, dtype=torch.int32
         )
         self.out_cache_loc = torch.zeros((self.max_bs,), dtype=torch.int32)
+        self.mrope_positions = torch.zeros((3, self.max_bs), dtype=torch.int32)

         if self.is_encoder_decoder:
             # NOTE: encoder_lens can influence the full_text_row_masked_out_mask tensor when doing mixed batch
@@ -228,6 +234,7 @@
             encoder_lens = None

         seq_lens_sum = seq_lens.sum().item()
+        mrope_positions = self.mrope_positions[:, :bs]

         # Attention backend
         self.model_runner.attn_backend.init_forward_metadata_capture_cuda_graph(
@@ -253,9 +260,11 @@
                 encoder_lens=encoder_lens,
                 return_logprob=False,
                 top_logprobs_nums=[0] * bs,
-                positions=torch.clamp((seq_lens - 1), min=0).to(torch.int64),
+                positions=clamp_position(seq_lens),
+                mrope_positions=mrope_positions,
             )
-            return forward(input_ids, forward_batch.positions, forward_batch)
+            logits_output = forward(input_ids, forward_batch.positions, forward_batch)
+            return logits_output.next_token_logits

         for _ in range(2):
             torch.cuda.synchronize()
@@ -286,7 +295,7 @@
         index = bisect.bisect_left(self.capture_bs, raw_bs)
         bs = self.capture_bs[index]
         if bs != raw_bs:
-            self.seq_lens.fill_(self.seq_len_fill_value)
+            self.seq_lens.fill_(1)
             self.out_cache_loc.zero_()

         # Common inputs
@@ -296,35 +305,30 @@
         self.out_cache_loc[:raw_bs].copy_(forward_batch.out_cache_loc)
         if self.is_encoder_decoder:
             self.encoder_lens[:raw_bs].copy_(forward_batch.encoder_lens)
+        if forward_batch.mrope_positions is not None:
+            self.mrope_positions[:, :raw_bs].copy_(forward_batch.mrope_positions)

         # Attention backend
         self.model_runner.attn_backend.init_forward_metadata_replay_cuda_graph(
             bs,
             self.req_pool_indices,
             self.seq_lens,
-            forward_batch.seq_lens_sum,
+            forward_batch.seq_lens_sum + (bs - raw_bs),
             self.encoder_lens,
         )

         # Replay
         self.graphs[bs].replay()
-        logits_output = self.output_buffers[bs]
-
-        # Unpad
-        if bs != raw_bs:
-            logits_output = LogitsProcessorOutput(
-                next_token_logits=logits_output.next_token_logits[:raw_bs],
-                next_token_logprobs=None,
-                normalized_prompt_logprobs=None,
-                input_token_logprobs=None,
-                input_top_logprobs=None,
-                output_top_logprobs=None,
-            )
+        next_token_logits = self.output_buffers[bs][:raw_bs]

         # Extract logprobs
         if forward_batch.return_logprob:
-            logits_output.next_token_logprobs = torch.nn.functional.log_softmax(
-                logits_output.next_token_logits, dim=-1
+            next_token_logprobs = torch.nn.functional.log_softmax(
+                next_token_logits, dim=-1
+            )
+            logits_output = LogitsProcessorOutput(
+                next_token_logits=next_token_logits,
+                next_token_logprobs=next_token_logprobs,
             )
             return_top_logprob = any(x > 0 for x in forward_batch.top_logprobs_nums)
             if return_top_logprob:
@@ -333,7 +337,11 @@
                     top_logprobs_nums=forward_batch.top_logprobs_nums,
                 )
                 logits_output.output_top_logprobs = LogitsProcessor.get_top_logprobs(
-                    logits_output.next_token_logprobs, logits_metadata
+                    next_token_logprobs, logits_metadata
                 )[1]
+        else:
+            logits_output = LogitsProcessorOutput(
+                next_token_logits=next_token_logits,
+            )

         return logits_output
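Putting these changes together: when a raw batch size has no captured graph, replay rounds it up to the next captured size with `bisect` and slices the output buffer back down to `raw_bs`. A standalone sketch of just that selection rule, using the capture list from this release:

```python
import bisect

# Capture list from the diff above: bs 3 is now captured directly.
capture_bs = [1, 2, 3, 4] + [i * 8 for i in range(1, 21)]

def padded_bs(raw_bs: int) -> int:
    """Smallest captured batch size that can hold raw_bs requests."""
    return capture_bs[bisect.bisect_left(capture_bs, raw_bs)]

assert padded_bs(3) == 3   # previously padded up to 4, now replayed exactly
assert padded_bs(5) == 8   # 3 dummy slots get seq_len 1 and a zeroed cache loc
```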
sglang/srt/model_executor/forward_batch_info.py CHANGED
@@ -142,11 +142,12 @@ class ForwardBatch:
                     int(self.seq_lens[i]),
                 )
         elif self.forward_mode.is_extend():
+            extend_start_loc_cpu = self.extend_start_loc.cpu().numpy()
             for i, image_inputs in enumerate(batch.image_inputs):
                 extend_start_loc, extend_seq_len, extend_prefix_len = (
-                    self.extend_start_loc[i],
-                    self.extend_seq_lens[i],
-                    self.extend_prefix_lens[i],
+                    extend_start_loc_cpu[i],
+                    batch.extend_seq_lens[i],
+                    batch.extend_prefix_lens[i],
                 )
                 if image_inputs is None:
                     # text only
@@ -160,20 +161,16 @@ class ForwardBatch:
                     ] * 3
                     mrope_position_delta = 0
                 else:
+                    # TODO: current qwen2-vl do not support radix cache since mrope position calculation
                     mrope_positions, mrope_position_delta = (
                         MRotaryEmbedding.get_input_positions(
                             input_tokens=self.input_ids[
                                 extend_start_loc : extend_start_loc + extend_seq_len
-                            ].tolist(),
+                            ],
                             image_grid_thw=image_inputs.image_grid_thws,
-                            video_grid_thw=None,
-                            image_token_id=hf_config.image_token_id,
-                            video_token_id=hf_config.video_token_id,
                             vision_start_token_id=hf_config.vision_start_token_id,
-                            vision_end_token_id=hf_config.vision_end_token_id,
                             spatial_merge_size=hf_config.vision_config.spatial_merge_size,
                             context_len=0,
-                            extend_prefix_len=extend_prefix_len.item(),
                         )
                     )
                     mrope_positions_list[i] = mrope_positions
sglang/srt/model_executor/model_runner.py CHANGED
@@ -125,11 +125,11 @@ class ModelRunner:
             )
             server_args.chunked_prefill_size = None
             server_args.mem_fraction_static *= 0.95
-        # TODO: qwen2-vl does not support cuda graph now, set disable-graph=True automatically
+        # TODO: qwen2-vl does not support radix cache now, set disable_radix_cache=True automatically
         if self.model_config.hf_config.architectures == [
             "Qwen2VLForConditionalGeneration"
         ]:
-            server_args.disable_cuda_graph = True
+            server_args.disable_radix_cache = True

         # Global vars
         if server_args.show_time_cost:
sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py CHANGED
@@ -31,9 +31,12 @@ class BatchedMinNewTokensPenalizer(_BatchedPenalizer):
         padded_stop_token_ids = torch.nn.utils.rnn.pad_sequence(
             sequences=[
                 torch.tensor(
-                    data=list(
-                        req.sampling_params.stop_token_ids
-                        | {req.tokenizer.eos_token_id}
+                    data=(
+                        list(
+                            (req.sampling_params.stop_token_ids or set())
+                            | (req.tokenizer.additional_stop_token_ids or set())
+                            | {req.tokenizer.eos_token_id}
+                        )
                     ),
                     dtype=torch.int64,
                     device=self.orchestrator.device,
sglang/srt/sampling/sampling_params.py CHANGED
@@ -50,9 +50,10 @@ class SamplingParams:
         self.presence_penalty = presence_penalty
         self.repetition_penalty = repetition_penalty
         self.stop_strs = stop
-        if stop_token_ids is None:
-            stop_token_ids = []
-        self.stop_token_ids = {*stop_token_ids}
+        if stop_token_ids:
+            self.stop_token_ids = set(stop_token_ids)
+        else:
+            self.stop_token_ids = None
         self.max_new_tokens = max_new_tokens
         self.min_new_tokens = min_new_tokens
         self.ignore_eos = ignore_eos
@@ -119,10 +120,7 @@ class SamplingParams:
         # Process stop strings
         if self.stop_strs is None:
             self.stop_strs = []
-            if self.stop_token_ids is None:
-                self.stop_str_max_len = 0
-            else:
-                self.stop_str_max_len = 1
+            self.stop_str_max_len = 0
         else:
             if isinstance(self.stop_strs, str):
                 self.stop_strs = [self.stop_strs]
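Taken together, the two `sampling_params.py` hunks change the "no stop tokens" representation from an empty set to `None`, which is why the min-new-tokens penalizer above now guards each operand with `or set()`. A minimal sketch of the new behaviour, assuming the constructor's other defaults are left as-is:

```python
from sglang.srt.sampling.sampling_params import SamplingParams

# An omitted or empty stop_token_ids is now stored as None ...
assert SamplingParams().stop_token_ids is None

# ... while a non-empty iterable is still normalized to a set.
assert SamplingParams(stop_token_ids=[7, 13]).stop_token_ids == {7, 13}
```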
sglang/srt/server.py CHANGED
@@ -172,6 +172,18 @@ async def stop_profile():
     )


+@app.api_route("/get_memory_pool_size", methods=["GET", "POST"])
+async def get_memory_pool_size():
+    """Get the memory pool size in number of tokens"""
+    try:
+        ret = await tokenizer_manager.get_memory_pool_size()
+        return ret.size
+    except Exception as e:
+        return JSONResponse(
+            {"error": {"message": str(e)}}, status_code=HTTPStatus.BAD_REQUEST
+        )
+
+
 @app.post("/update_weights")
 async def update_weights(obj: UpdateWeightReqInput, request: Request):
     """Update the weights inplace without re-launching the server."""
sglang/test/run_eval.py CHANGED
@@ -67,6 +67,7 @@ def run_eval(args):
         model=args.model,
         max_tokens=2048,
         base_url=base_url,
+        temperature=getattr(args, "temperature", 0.0),
     )

     # Run eval
@@ -119,6 +120,7 @@ if __name__ == "__main__":
     parser.add_argument("--eval-name", type=str, default="mmlu")
     parser.add_argument("--num-examples", type=int)
     parser.add_argument("--num-threads", type=int, default=512)
+    parser.add_argument("--temperature", type=float, default=0.0)
     args = parser.parse_args()

     run_eval(args)
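Besides the new `--temperature` flag, the eval can also be driven programmatically, mirroring the `SimpleNamespace` used by `run_mmlu_test` later in this diff; the base URL and model name below are placeholders for whatever server you already have running:

```python
from types import SimpleNamespace

from sglang.test.run_eval import run_eval

args = SimpleNamespace(
    base_url="http://127.0.0.1:30000",            # placeholder server address
    model="meta-llama/Meta-Llama-3-8B-Instruct",  # placeholder model name
    eval_name="mmlu",
    num_examples=64,
    num_threads=64,
    temperature=0.0,  # newly plumbed through; run_eval falls back to 0.0 via getattr
)
metrics = run_eval(args)
print(metrics["score"])
```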
sglang/test/srt/sampling/penaltylib/utils.py CHANGED
@@ -24,6 +24,7 @@ class MockSamplingParams:
 @dataclasses.dataclass
 class MockTokenizer:
     eos_token_id: int
+    additional_stop_token_ids: typing.Optional[typing.List[int]] = None


 @dataclasses.dataclass
sglang/test/test_utils.py CHANGED
@@ -3,6 +3,7 @@
 import argparse
 import asyncio
 import os
+import random
 import subprocess
 import threading
 import time
@@ -20,6 +21,7 @@ from sglang.global_config import global_config
 from sglang.lang.backend.openai import OpenAI
 from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
 from sglang.srt.utils import kill_child_process
+from sglang.test.run_eval import run_eval
 from sglang.utils import get_exception_traceback

 DEFAULT_FP8_MODEL_NAME_FOR_TEST = "neuralmagic/Meta-Llama-3.1-8B-FP8"
@@ -400,7 +402,7 @@ def popen_launch_server(
     api_key: Optional[str] = None,
     other_args: tuple = (),
     env: Optional[dict] = None,
-    return_stdout_stderr: bool = False,
+    return_stdout_stderr: Optional[tuple] = None,
 ):
     _, host, port = base_url.split(":")
     host = host[2:]
@@ -423,8 +425,8 @@
     if return_stdout_stderr:
         process = subprocess.Popen(
             command,
-            stdout=subprocess.PIPE,
-            stderr=subprocess.PIPE,
+            stdout=return_stdout_stderr[0],
+            stderr=return_stdout_stderr[1],
             env=env,
             text=True,
         )
@@ -631,3 +633,98 @@ def calculate_rouge_l(output_strs_list1, output_strs_list2):
         rouge_l_scores.append(fmeasure)

     return rouge_l_scores
+
+
+STDOUT_FILENAME = "stdout.txt"
+STDERR_FILENAME = "stderr.txt"
+
+
+def read_output(output_lines):
+    """Print the output in real time with another thread."""
+    while not os.path.exists(STDERR_FILENAME):
+        time.sleep(1)
+
+    pt = 0
+    while pt >= 0:
+        if pt > 0 and not os.path.exists(STDERR_FILENAME):
+            break
+        lines = open(STDERR_FILENAME).readlines()
+        for line in lines[pt:]:
+            print(line, end="", flush=True)
+            output_lines.append(line)
+            pt += 1
+        time.sleep(0.1)
+
+
+def run_mmlu_test(
+    disable_radix_cache,
+    enable_mixed_chunk=False,
+    enable_overlap=False,
+    chunked_prefill_size=32,
+):
+    other_args = ["--chunked-prefill-size", str(chunked_prefill_size)]
+    if disable_radix_cache:
+        other_args += ["--disable-radix-cache"]
+    if enable_mixed_chunk:
+        other_args += ["--enable-mixed-chunk"]
+    if enable_overlap:
+        other_args += ["--enable-overlap-scheduler"]
+
+    model = DEFAULT_MODEL_NAME_FOR_TEST
+    port = random.randint(4000, 5000)
+    base_url = f"http://127.0.0.1:{port}"
+
+    # Create files and launch the server
+    stdout = open(STDOUT_FILENAME, "w")
+    stderr = open(STDERR_FILENAME, "w")
+    process = popen_launch_server(
+        model,
+        base_url,
+        timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+        other_args=other_args,
+        return_stdout_stderr=(stdout, stderr),
+    )
+
+    # Launch a thread to stream the output
+    output_lines = []
+    t = threading.Thread(target=read_output, args=(output_lines,))
+    t.start()
+
+    # Run the eval
+    args = SimpleNamespace(
+        base_url=base_url,
+        model=model,
+        eval_name="mmlu",
+        num_examples=128,
+        num_threads=128,
+    )
+
+    try:
+        metrics = run_eval(args)
+        print(f"{metrics=}")
+        assert metrics["score"] >= 0.65
+    finally:
+        pass
+
+    # Clean up everything
+    kill_child_process(process.pid)
+    kill_child_process(process.pid)
+    stdout.close()
+    stderr.close()
+    if os.path.exists(STDOUT_FILENAME):
+        os.remove(STDOUT_FILENAME)
+    if os.path.exists(STDERR_FILENAME):
+        os.remove(STDERR_FILENAME)
+    t.join()
+
+    # Assert success
+    has_new_server = False
+    has_leak = False
+    for line in output_lines:
+        if "The server is fired" in line:
+            has_new_server = True
+        if "leak" in line:
+            has_leak = True
+
+    assert has_new_server
+    # assert not has_leak
sglang/version.py CHANGED
@@ -1 +1 @@
-__version__ = "0.3.4.post1"
+__version__ = "0.3.4.post2"
{sglang-0.3.4.post1.dist-info → sglang-0.3.4.post2.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.3.4.post1
+Version: 0.3.4.post2
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
         Version 2.0, January 2004
@@ -328,23 +328,27 @@ You can install SGLang using any of the methods below.
 pip install --upgrade pip
 pip install "sglang[all]"

-# Install FlashInfer CUDA kernels
+# Install FlashInfer accelerated kernels
 pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
 ```

+Note: Please check the [FlashInfer installation doc](https://docs.flashinfer.ai/installation.html) to install the proper version according to your PyTorch and CUDA versions.
+
 ### Method 2: From source
 ```
 # Use the last release branch
-git clone -b v0.3.4.post1 https://github.com/sgl-project/sglang.git
+git clone -b v0.3.4.post2 https://github.com/sgl-project/sglang.git
 cd sglang

 pip install --upgrade pip
 pip install -e "python[all]"

-# Install FlashInfer CUDA kernels
+# Install FlashInfer accelerated kernels
 pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
 ```

+Note: Please check the [FlashInfer installation doc](https://docs.flashinfer.ai/installation.html) to install the proper version according to your PyTorch and CUDA versions.
+
 ### Method 3: Using docker
 The docker images are available on Docker Hub as [lmsysorg/sglang](https://hub.docker.com/r/lmsysorg/sglang/tags), built from [Dockerfile](https://github.com/sgl-project/sglang/tree/main/docker).
 Replace `<secret>` below with your huggingface hub [token](https://huggingface.co/docs/hub/en/security-tokens).
@@ -498,7 +502,8 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 ```
 python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --chunked-prefill-size 4096
 ```
-- To enable torch.compile acceleration, add `--enable-torch-compile`. It accelerates small models on small batch sizes.
+- To enable the experimental overlapped scheduler, add `--enable-overlap-scheduler`. It overlaps the CPU scheduler with GPU computation and can accelerate almost all workloads. This does not work for constrained decoding currently.
+- To enable torch.compile acceleration, add `--enable-torch-compile`. It accelerates small models on small batch sizes. This does not work for FP8 currently.
 - To enable torchao quantization, add `--torchao-config int4wo-128`. It supports various quantization strategies.
 - To enable fp8 weight quantization, add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
 - To enable fp8 kv cache quantization, add `--kv-cache-dtype fp8_e5m2`.
@@ -519,7 +524,6 @@ We also provide an inference engine **without a HTTP server**. For example,
 ```python
 import sglang as sgl

-
 def main():
     prompts = [
         "Hello, my name is",
@@ -539,12 +543,8 @@ if __name__ == "__main__":
     main()
 ```

-This can be used for:
-
-1. **Offline Batch Inference**
-2. **Building Custom Servers**
-
-You can view the full example [here](https://github.com/sgl-project/sglang/tree/main/examples/runtime/engine)
+This can be used for offline batch inference and building custom servers.
+You can view the full example [here](https://github.com/sgl-project/sglang/tree/main/examples/runtime/engine).

 ### Supported Models

@@ -552,7 +552,7 @@ You can view the full example [here](https://github.com/sgl-project/sglang/tree/
 - Llama / Llama 2 / Llama 3 / Llama 3.1
 - Mistral / Mixtral / Mistral NeMo
 - Gemma / Gemma 2
-- Qwen / Qwen 2 / Qwen 2 MoE
+- Qwen / Qwen 2 / Qwen 2 MoE / Qwen 2 VL
 - DeepSeek / DeepSeek 2
 - OLMoE
 - [LLaVA-OneVision](https://llava-vl.github.io/blog/2024-08-05-llava-onevision/)
@@ -712,7 +712,6 @@ print(state["answer_1"])
 ```

 #### More Examples
-
 Anthropic and VertexAI (Gemini) models are also supported.
 You can find more examples at [examples/quick_start](examples/frontend_language/quick_start).