sglang 0.3.4__py3-none-any.whl → 0.3.4.post2__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
Files changed (53)
  1. sglang/bench_latency.py +2 -1
  2. sglang/lang/chat_template.py +17 -0
  3. sglang/launch_server_llavavid.py +1 -1
  4. sglang/srt/configs/__init__.py +3 -0
  5. sglang/srt/configs/model_config.py +27 -2
  6. sglang/srt/configs/qwen2vl.py +133 -0
  7. sglang/srt/constrained/fsm_cache.py +10 -3
  8. sglang/srt/conversation.py +27 -0
  9. sglang/srt/hf_transformers_utils.py +16 -1
  10. sglang/srt/layers/attention/__init__.py +16 -5
  11. sglang/srt/layers/attention/double_sparsity_backend.py +22 -6
  12. sglang/srt/layers/attention/flashinfer_backend.py +174 -54
  13. sglang/srt/layers/attention/triton_backend.py +22 -6
  14. sglang/srt/layers/attention/triton_ops/prefill_attention.py +26 -4
  15. sglang/srt/layers/linear.py +89 -63
  16. sglang/srt/layers/logits_processor.py +5 -5
  17. sglang/srt/layers/rotary_embedding.py +112 -0
  18. sglang/srt/layers/sampler.py +51 -39
  19. sglang/srt/lora/lora.py +3 -1
  20. sglang/srt/managers/data_parallel_controller.py +1 -1
  21. sglang/srt/managers/detokenizer_manager.py +4 -0
  22. sglang/srt/managers/image_processor.py +186 -13
  23. sglang/srt/managers/io_struct.py +10 -0
  24. sglang/srt/managers/schedule_batch.py +238 -68
  25. sglang/srt/managers/scheduler.py +69 -50
  26. sglang/srt/managers/tokenizer_manager.py +24 -4
  27. sglang/srt/managers/tp_worker.py +26 -111
  28. sglang/srt/managers/tp_worker_overlap_thread.py +209 -0
  29. sglang/srt/mem_cache/memory_pool.py +56 -10
  30. sglang/srt/mem_cache/radix_cache.py +4 -3
  31. sglang/srt/model_executor/cuda_graph_runner.py +87 -28
  32. sglang/srt/model_executor/forward_batch_info.py +83 -3
  33. sglang/srt/model_executor/model_runner.py +32 -11
  34. sglang/srt/models/chatglm.py +3 -3
  35. sglang/srt/models/deepseek_v2.py +2 -2
  36. sglang/srt/models/mllama.py +1004 -0
  37. sglang/srt/models/qwen2_vl.py +724 -0
  38. sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +6 -3
  39. sglang/srt/sampling/sampling_batch_info.py +13 -3
  40. sglang/srt/sampling/sampling_params.py +5 -7
  41. sglang/srt/server.py +12 -0
  42. sglang/srt/server_args.py +10 -0
  43. sglang/srt/utils.py +22 -0
  44. sglang/test/run_eval.py +2 -0
  45. sglang/test/runners.py +20 -1
  46. sglang/test/srt/sampling/penaltylib/utils.py +1 -0
  47. sglang/test/test_utils.py +100 -3
  48. sglang/version.py +1 -1
  49. {sglang-0.3.4.dist-info → sglang-0.3.4.post2.dist-info}/METADATA +17 -18
  50. {sglang-0.3.4.dist-info → sglang-0.3.4.post2.dist-info}/RECORD +53 -48
  51. {sglang-0.3.4.dist-info → sglang-0.3.4.post2.dist-info}/LICENSE +0 -0
  52. {sglang-0.3.4.dist-info → sglang-0.3.4.post2.dist-info}/WHEEL +0 -0
  53. {sglang-0.3.4.dist-info → sglang-0.3.4.post2.dist-info}/top_level.txt +0 -0
sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py CHANGED
@@ -31,9 +31,12 @@ class BatchedMinNewTokensPenalizer(_BatchedPenalizer):
         padded_stop_token_ids = torch.nn.utils.rnn.pad_sequence(
             sequences=[
                 torch.tensor(
-                    data=list(
-                        req.sampling_params.stop_token_ids
-                        | {req.tokenizer.eos_token_id}
+                    data=(
+                        list(
+                            (req.sampling_params.stop_token_ids or set())
+                            | (req.tokenizer.additional_stop_token_ids or set())
+                            | {req.tokenizer.eos_token_id}
+                        )
                     ),
                     dtype=torch.int64,
                     device=self.orchestrator.device,
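
The change above makes the stop-token union robust to `stop_token_ids` being `None` and folds in tokenizer-level `additional_stop_token_ids`. A minimal standalone sketch of the resulting computation, using hypothetical `SimpleNamespace` mocks in place of the real request and tokenizer objects:

```python
# Illustrative only: SimpleNamespace stands in for the real Req/tokenizer objects.
from types import SimpleNamespace

import torch

reqs = [
    SimpleNamespace(
        sampling_params=SimpleNamespace(stop_token_ids=None),
        tokenizer=SimpleNamespace(eos_token_id=2, additional_stop_token_ids=None),
    ),
    SimpleNamespace(
        sampling_params=SimpleNamespace(stop_token_ids={7, 11}),
        tokenizer=SimpleNamespace(eos_token_id=2, additional_stop_token_ids={13}),
    ),
]

padded_stop_token_ids = torch.nn.utils.rnn.pad_sequence(
    sequences=[
        torch.tensor(
            data=list(
                (req.sampling_params.stop_token_ids or set())
                | (req.tokenizer.additional_stop_token_ids or set())
                | {req.tokenizer.eos_token_id}
            ),
            dtype=torch.int64,
        )
        for req in reqs
    ],
    batch_first=True,
    padding_value=0,
)
print(padded_stop_token_ids.shape)  # torch.Size([2, 4]): row 0 is just [2], padded with zeros
```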
sglang/srt/sampling/sampling_batch_info.py CHANGED
@@ -51,7 +51,7 @@ class SamplingBatchInfo:
         disable_penalizer: bool,
     ):
         reqs = batch.reqs
-        device = batch.input_ids.device
+        device = batch.device
         temperatures = (
             torch.tensor(
                 [r.sampling_params.temperature for r in reqs],
@@ -78,7 +78,7 @@ class SamplingBatchInfo:
             need_min_p_sampling=any(r.sampling_params.min_p > 0 for r in reqs),
             is_all_greedy=top_ks.max().item() <= 1,
             vocab_size=vocab_size,
-            device=batch.input_ids.device,
+            device=device,
         )
         # TODO (lianmin): `need_min_p_sampling` needs to be updated in filter and merge.
 
@@ -95,7 +95,7 @@ class SamplingBatchInfo:
         ret.penalizer_orchestrator = penaltylib.BatchedPenalizerOrchestrator(
             vocab_size=vocab_size,
             batch=batch,
-            device=batch.input_ids.device,
+            device=batch.device,
             Penalizers={
                 penaltylib.BatchedFrequencyPenalizer,
                 penaltylib.BatchedMinNewTokensPenalizer,
@@ -224,3 +224,13 @@ class SamplingBatchInfo:
             vocab_size=self.vocab_size,
             device=self.device,
         )
+
+    def to(self, device: str):
+        for item in [
+            "temperatures",
+            "top_ps",
+            "top_ks",
+            "min_ps",
+        ]:
+            value = getattr(self, item)
+            setattr(self, item, value.to(device, non_blocking=True))
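
The new `SamplingBatchInfo.to` moves the sampling tensors with `non_blocking=True`. As a reminder of why that flag matters, host-to-device copies only overlap with other GPU work when the source tensor lives in pinned (page-locked) host memory; a simplified, hedged sketch (not the real scheduler path):

```python
# Simplified illustration of an asynchronous host-to-device copy; requires CUDA.
import torch

if torch.cuda.is_available():
    # Pinned (page-locked) host memory is what makes non_blocking copies truly async.
    temperatures = torch.rand(4096, 1).pin_memory()
    temperatures_cuda = temperatures.to("cuda", non_blocking=True)
    torch.cuda.synchronize()  # wait for the copy before using the result
    print(temperatures_cuda.device)  # cuda:0
```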
sglang/srt/sampling/sampling_params.py CHANGED
@@ -50,9 +50,10 @@ class SamplingParams:
         self.presence_penalty = presence_penalty
         self.repetition_penalty = repetition_penalty
         self.stop_strs = stop
-        if stop_token_ids is None:
-            stop_token_ids = []
-        self.stop_token_ids = {*stop_token_ids}
+        if stop_token_ids:
+            self.stop_token_ids = set(stop_token_ids)
+        else:
+            self.stop_token_ids = None
         self.max_new_tokens = max_new_tokens
         self.min_new_tokens = min_new_tokens
         self.ignore_eos = ignore_eos
@@ -119,10 +120,7 @@ class SamplingParams:
         # Process stop strings
         if self.stop_strs is None:
             self.stop_strs = []
-            if self.stop_token_ids is None:
-                self.stop_str_max_len = 0
-            else:
-                self.stop_str_max_len = 1
+            self.stop_str_max_len = 0
         else:
             if isinstance(self.stop_strs, str):
                 self.stop_strs = [self.stop_strs]
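
With this change an empty or missing `stop_token_ids` collapses to `None` rather than an empty set, so downstream code can use a plain truthiness check. A small standalone mirror of the new logic (illustrative, not the real class):

```python
# Illustrative mirror of the new SamplingParams normalization, not the real class.
from typing import Iterable, Optional, Set


def normalize_stop_token_ids(stop_token_ids: Optional[Iterable[int]]) -> Optional[Set[int]]:
    return set(stop_token_ids) if stop_token_ids else None


print(normalize_stop_token_ids(None))         # None
print(normalize_stop_token_ids([]))           # None (previously this became an empty set)
print(normalize_stop_token_ids([2, 128009]))  # {2, 128009}
```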
sglang/srt/server.py CHANGED
@@ -172,6 +172,18 @@ async def stop_profile():
     )
 
 
+@app.api_route("/get_memory_pool_size", methods=["GET", "POST"])
+async def get_memory_pool_size():
+    """Get the memory pool size in number of tokens"""
+    try:
+        ret = await tokenizer_manager.get_memory_pool_size()
+        return ret.size
+    except Exception as e:
+        return JSONResponse(
+            {"error": {"message": str(e)}}, status_code=HTTPStatus.BAD_REQUEST
+        )
+
+
 @app.post("/update_weights")
 async def update_weights(obj: UpdateWeightReqInput, request: Request):
     """Update the weights inplace without re-launching the server."""
sglang/srt/server_args.py CHANGED
@@ -177,6 +177,16 @@ class ServerArgs:
         if self.sampling_backend is None:
             self.sampling_backend = "flashinfer"
 
+        if self.enable_overlap_schedule:
+            logger.warning(
+                "Overlap scheduler mode is enabled. This is an experimental feature. "
+                "Sampling penalizer (e.g., frequency and repetition penalty), constrained decoding (e.g., regex, JSON), "
+                "and embedding APIs are not supported and will lead to wrong results. "
+                "The NaN detection is also disabled."
+            )
+            self.disable_penalizer = True
+            self.disable_nan_detection = True
+
         # Model-specific patches
         if "Alibaba-NLP/gte-Qwen2-1.5B-instruct" == self.model_path:
             logger.info(
sglang/srt/utils.py CHANGED
@@ -209,6 +209,28 @@ def is_multimodal_model(model_architectures):
         or "LlavaQwenForCausalLM" in model_architectures
         or "LlavaMistralForCausalLM" in model_architectures
         or "LlavaVidForCausalLM" in model_architectures
+        or "MllamaForConditionalGeneration" in model_architectures
+        or "Qwen2VLForConditionalGeneration" in model_architectures
+    ):
+        return True
+    else:
+        return False
+
+
+def is_attention_free_model(model_architectures):
+    return False
+
+
+def model_has_inner_state(model_architectures):
+    return False
+
+
+def is_embedding_model(model_architectures):
+    if (
+        "LlamaEmbeddingModel" in model_architectures
+        or "MistralModel" in model_architectures
+        or "LlamaForSequenceClassification" in model_architectures
+        or "LlamaForSequenceClassificationWithNormal_Weights" in model_architectures
     ):
         return True
     else:
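
The two new entries make the multimodal code path trigger for Mllama and Qwen2-VL checkpoints as well. A quick check, assuming sglang 0.3.4.post2 is installed, using the architecture names found in a model's config:

```python
# The check works on the architecture list found in a model's config.json.
from sglang.srt.utils import is_multimodal_model

print(is_multimodal_model(["Qwen2VLForConditionalGeneration"]))  # True (newly added)
print(is_multimodal_model(["MllamaForConditionalGeneration"]))   # True (newly added)
print(is_multimodal_model(["LlamaForCausalLM"]))                 # False
```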
sglang/test/run_eval.py CHANGED
@@ -67,6 +67,7 @@ def run_eval(args):
         model=args.model,
         max_tokens=2048,
         base_url=base_url,
+        temperature=getattr(args, "temperature", 0.0),
     )
 
     # Run eval
@@ -119,6 +120,7 @@ if __name__ == "__main__":
     parser.add_argument("--eval-name", type=str, default="mmlu")
     parser.add_argument("--num-examples", type=int)
    parser.add_argument("--num-threads", type=int, default=512)
+    parser.add_argument("--temperature", type=float, default=0.0)
     args = parser.parse_args()
 
     run_eval(args)
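
Because the new argument is read with `getattr(args, "temperature", 0.0)`, existing callers that build the args namespace by hand keep working without it. A hedged sketch of programmatic use against an already-running server (the base URL, model name, and example counts are assumptions, not values from this diff):

```python
# Programmatic use of run_eval; base_url/model/example counts are placeholder assumptions.
from types import SimpleNamespace

from sglang.test.run_eval import run_eval

args = SimpleNamespace(
    base_url="http://127.0.0.1:30000",
    model="meta-llama/Meta-Llama-3-8B-Instruct",
    eval_name="mmlu",
    num_examples=64,
    num_threads=64,
    temperature=0.0,  # optional: run_eval falls back to 0.0 when omitted
)
metrics = run_eval(args)
print(metrics)
```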
sglang/test/runners.py CHANGED
@@ -102,8 +102,10 @@ class HFRunner:
             return False
 
     def start_model_process(self, in_queue, out_queue, model_path, torch_dtype):
-        self.tokenizer = get_tokenizer(model_path, torch_dtype=torch.dtype)
+        # Apply model-specific patches
+        monkey_patch_gemma2_sdpa()
 
+        # Load the model and tokenizer
         if self.model_type == "generation":
             self.base_model = AutoModelForCausalLM.from_pretrained(
                 model_path,
@@ -128,7 +130,9 @@
             ).cuda()
         else:
             raise Exception(f"Unrecognized model type {self.model_type}")
+        self.tokenizer = get_tokenizer(model_path, torch_dtype=torch.dtype)
 
+        # Run forward
         while True:
             prompts, max_new_tokens, lora_paths = in_queue.get()
             if lora_paths is not None:
@@ -370,3 +374,18 @@ class SRTRunner:
     def __exit__(self, exc_type, exc_value, traceback):
         self.runtime.shutdown()
         del self.runtime
+
+
+def monkey_patch_gemma2_sdpa():
+    """
+    Use sdpa by default to fix the OOM issue.
+    Revert this commit:
+    https://github.com/huggingface/transformers/commit/975b988bfe6e7ebb47390cd9a1556c6888804883#diff-5f76eac6f18f4b491521314c318a9692318feb4d19228e9576cce7bde4240834R660
+    """
+    from transformers.models.gemma2.modeling_gemma2 import Gemma2PreTrainedModel
+
+    def _check_and_enable_sdpa(config, hard_check_only: bool = False):
+        config._attn_implementation = "sdpa"
+        return config
+
+    setattr(Gemma2PreTrainedModel, "_check_and_enable_sdpa", _check_and_enable_sdpa)
sglang/test/srt/sampling/penaltylib/utils.py CHANGED
@@ -24,6 +24,7 @@ class MockSamplingParams:
 @dataclasses.dataclass
 class MockTokenizer:
     eos_token_id: int
+    additional_stop_token_ids: typing.Optional[typing.List[int]] = None
 
 
 @dataclasses.dataclass
sglang/test/test_utils.py CHANGED
@@ -3,6 +3,7 @@
 import argparse
 import asyncio
 import os
+import random
 import subprocess
 import threading
 import time
@@ -20,6 +21,7 @@ from sglang.global_config import global_config
 from sglang.lang.backend.openai import OpenAI
 from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
 from sglang.srt.utils import kill_child_process
+from sglang.test.run_eval import run_eval
 from sglang.utils import get_exception_traceback
 
 DEFAULT_FP8_MODEL_NAME_FOR_TEST = "neuralmagic/Meta-Llama-3.1-8B-FP8"
@@ -400,7 +402,7 @@ def popen_launch_server(
     api_key: Optional[str] = None,
     other_args: tuple = (),
     env: Optional[dict] = None,
-    return_stdout_stderr: bool = False,
+    return_stdout_stderr: Optional[tuple] = None,
 ):
     _, host, port = base_url.split(":")
     host = host[2:]
@@ -423,8 +425,8 @@
     if return_stdout_stderr:
         process = subprocess.Popen(
             command,
-            stdout=subprocess.PIPE,
-            stderr=subprocess.PIPE,
+            stdout=return_stdout_stderr[0],
+            stderr=return_stdout_stderr[1],
             env=env,
             text=True,
         )
@@ -631,3 +633,98 @@ def calculate_rouge_l(output_strs_list1, output_strs_list2):
         rouge_l_scores.append(fmeasure)
 
     return rouge_l_scores
+
+
+STDOUT_FILENAME = "stdout.txt"
+STDERR_FILENAME = "stderr.txt"
+
+
+def read_output(output_lines):
+    """Print the output in real time with another thread."""
+    while not os.path.exists(STDERR_FILENAME):
+        time.sleep(1)
+
+    pt = 0
+    while pt >= 0:
+        if pt > 0 and not os.path.exists(STDERR_FILENAME):
+            break
+        lines = open(STDERR_FILENAME).readlines()
+        for line in lines[pt:]:
+            print(line, end="", flush=True)
+            output_lines.append(line)
+            pt += 1
+        time.sleep(0.1)
+
+
+def run_mmlu_test(
+    disable_radix_cache,
+    enable_mixed_chunk=False,
+    enable_overlap=False,
+    chunked_prefill_size=32,
+):
+    other_args = ["--chunked-prefill-size", str(chunked_prefill_size)]
+    if disable_radix_cache:
+        other_args += ["--disable-radix-cache"]
+    if enable_mixed_chunk:
+        other_args += ["--enable-mixed-chunk"]
+    if enable_overlap:
+        other_args += ["--enable-overlap-scheduler"]
+
+    model = DEFAULT_MODEL_NAME_FOR_TEST
+    port = random.randint(4000, 5000)
+    base_url = f"http://127.0.0.1:{port}"
+
+    # Create files and launch the server
+    stdout = open(STDOUT_FILENAME, "w")
+    stderr = open(STDERR_FILENAME, "w")
+    process = popen_launch_server(
+        model,
+        base_url,
+        timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+        other_args=other_args,
+        return_stdout_stderr=(stdout, stderr),
+    )
+
+    # Launch a thread to stream the output
+    output_lines = []
+    t = threading.Thread(target=read_output, args=(output_lines,))
+    t.start()
+
+    # Run the eval
+    args = SimpleNamespace(
+        base_url=base_url,
+        model=model,
+        eval_name="mmlu",
+        num_examples=128,
+        num_threads=128,
+    )
+
+    try:
+        metrics = run_eval(args)
+        print(f"{metrics=}")
+        assert metrics["score"] >= 0.65
+    finally:
+        pass
+
+    # Clean up everything
+    kill_child_process(process.pid)
+    kill_child_process(process.pid)
+    stdout.close()
+    stderr.close()
+    if os.path.exists(STDOUT_FILENAME):
+        os.remove(STDOUT_FILENAME)
+    if os.path.exists(STDERR_FILENAME):
+        os.remove(STDERR_FILENAME)
+    t.join()
+
+    # Assert success
+    has_new_server = False
+    has_leak = False
+    for line in output_lines:
+        if "The server is fired" in line:
+            has_new_server = True
+        if "leak" in line:
+            has_leak = True
+
+    assert has_new_server
+    # assert not has_leak
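
The new `run_mmlu_test` helper ties the pieces above together: it launches a disposable server on a random port, streams its stderr from a background thread, runs a 128-example MMLU eval, and asserts the score. A sketch of how a test might call it (a CUDA machine with access to the default test model is assumed):

```python
# Assumes a CUDA machine and access to DEFAULT_MODEL_NAME_FOR_TEST.
from sglang.test.test_utils import run_mmlu_test

# Baseline run with the radix cache enabled.
run_mmlu_test(disable_radix_cache=False)

# Same eval exercising the experimental overlap scheduler.
run_mmlu_test(disable_radix_cache=False, enable_overlap=True)
```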
sglang/version.py CHANGED
@@ -1 +1 @@
-__version__ = "0.3.4"
+__version__ = "0.3.4.post2"
{sglang-0.3.4.dist-info → sglang-0.3.4.post2.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.3.4
+Version: 0.3.4.post2
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
                            Version 2.0, January 2004
@@ -259,7 +259,7 @@ Requires-Dist: modelscope; extra == "runtime-common"
 Provides-Extra: srt
 Requires-Dist: sglang[runtime_common]; extra == "srt"
 Requires-Dist: torch; extra == "srt"
-Requires-Dist: vllm==0.5.5; extra == "srt"
+Requires-Dist: vllm==0.6.3.post1; extra == "srt"
 Provides-Extra: srt_xpu
 Requires-Dist: sglang[runtime_common]; extra == "srt-xpu"
 Provides-Extra: test
@@ -284,17 +284,17 @@ Requires-Dist: peft; extra == "test"
 --------------------------------------------------------------------------------
 
 | [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Paper**](https://arxiv.org/abs/2312.07104) | [**Slides**](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_dev_day_v2.pdf) | [**Learn More**](https://github.com/sgl-project/sgl-learning-materials) | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2ngly9muu-t37XiH87qvD~6rVBTkTEHw) |
-[**Join Bi-Weekly Development Meeting (Oct. 19)**](https://calendar.app.google/GYW7S8QGoanCuaxW6) |
+[**Join Bi-Weekly Development Meeting**](https://docs.google.com/document/d/1xEow4eIM152xNcRxqZz9VEcOiTQo8-CEuuQ5qTmkt-E/edit?usp=sharing) |
 
 ## News
 - [2024/10] 🔥 The First SGLang Online Meetup ([slides](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#the-first-sglang-online-meetup)).
 - [2024/09] SGLang v0.3 Release: 7x Faster DeepSeek MLA, 1.5x Faster torch.compile, Multi-Image/Video LLaVA-OneVision ([blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)).
 - [2024/07] Faster Llama3 Serving with SGLang Runtime (vs. TensorRT-LLM, vLLM) ([blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/)).
-- [2024/02] SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).
 
 <details>
 <summary>More</summary>
 
+- [2024/02] SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).
 - [2024/04] SGLang is used by the official **LLaVA-NeXT (video)** release ([blog](https://llava-vl.github.io/blog/2024-04-30-llava-next-video/)).
 - [2024/01] SGLang provides up to **5x faster inference** with RadixAttention ([blog](https://lmsys.org/blog/2024-01-17-sglang/)).
 - [2024/01] SGLang powers the serving of the official **LLaVA v1.6** release demo ([usage](https://github.com/haotian-liu/LLaVA?tab=readme-ov-file#demo)).
@@ -328,23 +328,27 @@ You can install SGLang using any of the methods below.
 pip install --upgrade pip
 pip install "sglang[all]"
 
-# Install FlashInfer CUDA kernels
+# Install FlashInfer accelerated kernels
 pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
 ```
 
+Note: Please check the [FlashInfer installation doc](https://docs.flashinfer.ai/installation.html) to install the proper version according to your PyTorch and CUDA versions.
+
 ### Method 2: From source
 ```
 # Use the last release branch
-git clone -b v0.3.4 https://github.com/sgl-project/sglang.git
+git clone -b v0.3.4.post2 https://github.com/sgl-project/sglang.git
 cd sglang
 
 pip install --upgrade pip
 pip install -e "python[all]"
 
-# Install FlashInfer CUDA kernels
+# Install FlashInfer accelerated kernels
 pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
 ```
 
+Note: Please check the [FlashInfer installation doc](https://docs.flashinfer.ai/installation.html) to install the proper version according to your PyTorch and CUDA versions.
+
 ### Method 3: Using docker
 The docker images are available on Docker Hub as [lmsysorg/sglang](https://hub.docker.com/r/lmsysorg/sglang/tags), built from [Dockerfile](https://github.com/sgl-project/sglang/tree/main/docker).
 Replace `<secret>` below with your huggingface hub [token](https://huggingface.co/docs/hub/en/security-tokens).
@@ -498,7 +502,8 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 ```
 python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --chunked-prefill-size 4096
 ```
-- To enable torch.compile acceleration, add `--enable-torch-compile`. It accelerates small models on small batch sizes.
+- To enable the experimental overlapped scheduler, add `--enable-overlap-scheduler`. It overlaps the CPU scheduler with GPU computation and can accelerate almost all workloads. This does not work for constrained decoding currently.
+- To enable torch.compile acceleration, add `--enable-torch-compile`. It accelerates small models on small batch sizes. This does not work for FP8 currently.
 - To enable torchao quantization, add `--torchao-config int4wo-128`. It supports various quantization strategies.
 - To enable fp8 weight quantization, add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
 - To enable fp8 kv cache quantization, add `--kv-cache-dtype fp8_e5m2`.
@@ -519,7 +524,6 @@ We also provide an inference engine **without a HTTP server**. For example,
 ```python
 import sglang as sgl
 
-
 def main():
     prompts = [
         "Hello, my name is",
@@ -539,12 +543,8 @@ if __name__ == "__main__":
     main()
 ```
 
-This can be used for:
-
-1. **Offline Batch Inference**
-2. **Building Custom Servers**
-
-You can view the full example [here](https://github.com/sgl-project/sglang/tree/main/examples/runtime/engine)
+This can be used for offline batch inference and building custom servers.
+You can view the full example [here](https://github.com/sgl-project/sglang/tree/main/examples/runtime/engine).
 
 ### Supported Models
 
@@ -552,7 +552,7 @@ You can view the full example [here](https://github.com/sgl-project/sglang/tree/
 - Llama / Llama 2 / Llama 3 / Llama 3.1
 - Mistral / Mixtral / Mistral NeMo
 - Gemma / Gemma 2
-- Qwen / Qwen 2 / Qwen 2 MoE
+- Qwen / Qwen 2 / Qwen 2 MoE / Qwen 2 VL
 - DeepSeek / DeepSeek 2
 - OLMoE
 - [LLaVA-OneVision](https://llava-vl.github.io/blog/2024-08-05-llava-onevision/)
@@ -575,6 +575,7 @@ You can view the full example [here](https://github.com/sgl-project/sglang/tree/
 - MiniCPM / MiniCPM 3
 - XVERSE / XVERSE MoE
 - SmolLM
+- GLM-4
 
 **Embedding Models**
 
@@ -711,7 +712,6 @@ print(state["answer_1"])
 ```
 
 #### More Examples
-
 Anthropic and VertexAI (Gemini) models are also supported.
 You can find more examples at [examples/quick_start](examples/frontend_language/quick_start).
 
@@ -892,7 +892,6 @@ Please cite our paper, [SGLang: Efficient Execution of Structured Language Model
 We also learned from the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).
 
 
-
 <p align="center">
   <a href="#sglangtop" target="_blank">
     <bold>Back To Top </bold>