sglang-0.3.4-py3-none-any.whl → sglang-0.3.4.post2-py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- sglang/bench_latency.py +2 -1
- sglang/lang/chat_template.py +17 -0
- sglang/launch_server_llavavid.py +1 -1
- sglang/srt/configs/__init__.py +3 -0
- sglang/srt/configs/model_config.py +27 -2
- sglang/srt/configs/qwen2vl.py +133 -0
- sglang/srt/constrained/fsm_cache.py +10 -3
- sglang/srt/conversation.py +27 -0
- sglang/srt/hf_transformers_utils.py +16 -1
- sglang/srt/layers/attention/__init__.py +16 -5
- sglang/srt/layers/attention/double_sparsity_backend.py +22 -6
- sglang/srt/layers/attention/flashinfer_backend.py +174 -54
- sglang/srt/layers/attention/triton_backend.py +22 -6
- sglang/srt/layers/attention/triton_ops/prefill_attention.py +26 -4
- sglang/srt/layers/linear.py +89 -63
- sglang/srt/layers/logits_processor.py +5 -5
- sglang/srt/layers/rotary_embedding.py +112 -0
- sglang/srt/layers/sampler.py +51 -39
- sglang/srt/lora/lora.py +3 -1
- sglang/srt/managers/data_parallel_controller.py +1 -1
- sglang/srt/managers/detokenizer_manager.py +4 -0
- sglang/srt/managers/image_processor.py +186 -13
- sglang/srt/managers/io_struct.py +10 -0
- sglang/srt/managers/schedule_batch.py +238 -68
- sglang/srt/managers/scheduler.py +69 -50
- sglang/srt/managers/tokenizer_manager.py +24 -4
- sglang/srt/managers/tp_worker.py +26 -111
- sglang/srt/managers/tp_worker_overlap_thread.py +209 -0
- sglang/srt/mem_cache/memory_pool.py +56 -10
- sglang/srt/mem_cache/radix_cache.py +4 -3
- sglang/srt/model_executor/cuda_graph_runner.py +87 -28
- sglang/srt/model_executor/forward_batch_info.py +83 -3
- sglang/srt/model_executor/model_runner.py +32 -11
- sglang/srt/models/chatglm.py +3 -3
- sglang/srt/models/deepseek_v2.py +2 -2
- sglang/srt/models/mllama.py +1004 -0
- sglang/srt/models/qwen2_vl.py +724 -0
- sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +6 -3
- sglang/srt/sampling/sampling_batch_info.py +13 -3
- sglang/srt/sampling/sampling_params.py +5 -7
- sglang/srt/server.py +12 -0
- sglang/srt/server_args.py +10 -0
- sglang/srt/utils.py +22 -0
- sglang/test/run_eval.py +2 -0
- sglang/test/runners.py +20 -1
- sglang/test/srt/sampling/penaltylib/utils.py +1 -0
- sglang/test/test_utils.py +100 -3
- sglang/version.py +1 -1
- {sglang-0.3.4.dist-info → sglang-0.3.4.post2.dist-info}/METADATA +17 -18
- {sglang-0.3.4.dist-info → sglang-0.3.4.post2.dist-info}/RECORD +53 -48
- {sglang-0.3.4.dist-info → sglang-0.3.4.post2.dist-info}/LICENSE +0 -0
- {sglang-0.3.4.dist-info → sglang-0.3.4.post2.dist-info}/WHEEL +0 -0
- {sglang-0.3.4.dist-info → sglang-0.3.4.post2.dist-info}/top_level.txt +0 -0
sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py
CHANGED
@@ -31,9 +31,12 @@ class BatchedMinNewTokensPenalizer(_BatchedPenalizer):
         padded_stop_token_ids = torch.nn.utils.rnn.pad_sequence(
             sequences=[
                 torch.tensor(
-                    data=
-
-
+                    data=(
+                        list(
+                            (req.sampling_params.stop_token_ids or set())
+                            | (req.tokenizer.additional_stop_token_ids or set())
+                            | {req.tokenizer.eos_token_id}
+                        )
                     ),
                     dtype=torch.int64,
                     device=self.orchestrator.device,
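The new code builds the padded stop-token tensor from the union of the per-request stop token ids, the tokenizer's additional stop token ids, and the EOS token. A minimal illustrative sketch of that union logic (the ids and objects below are stand-ins, not values from the diff):

```python
# Illustrative sketch only: how the effective stop-token set is formed.
sampling_stop_ids = {128009}    # stand-in for req.sampling_params.stop_token_ids
additional_stop_ids = None      # stand-in for req.tokenizer.additional_stop_token_ids
eos_token_id = 128001           # stand-in for req.tokenizer.eos_token_id

effective_stop_ids = list(
    (sampling_stop_ids or set())
    | (additional_stop_ids or set())
    | {eos_token_id}
)
print(sorted(effective_stop_ids))  # [128001, 128009]
```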
sglang/srt/sampling/sampling_batch_info.py
CHANGED
@@ -51,7 +51,7 @@ class SamplingBatchInfo:
         disable_penalizer: bool,
     ):
         reqs = batch.reqs
-        device = batch.
+        device = batch.device
         temperatures = (
             torch.tensor(
                 [r.sampling_params.temperature for r in reqs],
@@ -78,7 +78,7 @@ class SamplingBatchInfo:
             need_min_p_sampling=any(r.sampling_params.min_p > 0 for r in reqs),
             is_all_greedy=top_ks.max().item() <= 1,
             vocab_size=vocab_size,
-            device=
+            device=device,
         )
         # TODO (lianmin): `need_min_p_sampling` needs to be updated in filter and merge.

@@ -95,7 +95,7 @@ class SamplingBatchInfo:
         ret.penalizer_orchestrator = penaltylib.BatchedPenalizerOrchestrator(
             vocab_size=vocab_size,
             batch=batch,
-            device=batch.
+            device=batch.device,
             Penalizers={
                 penaltylib.BatchedFrequencyPenalizer,
                 penaltylib.BatchedMinNewTokensPenalizer,
@@ -224,3 +224,13 @@ class SamplingBatchInfo:
             vocab_size=self.vocab_size,
             device=self.device,
         )
+
+    def to(self, device: str):
+        for item in [
+            "temperatures",
+            "top_ps",
+            "top_ks",
+            "min_ps",
+        ]:
+            value = getattr(self, item)
+            setattr(self, item, value.to(device, non_blocking=True))
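The new `SamplingBatchInfo.to()` moves the per-batch sampling tensors to a target device with non-blocking copies. A minimal standalone sketch of the same getattr/setattr pattern; the class and fields below are illustrative, not the real SamplingBatchInfo:

```python
import torch


class TinySamplingInfo:
    """Toy stand-in showing the tensor-moving pattern added in the diff."""

    def __init__(self):
        self.temperatures = torch.ones(4, 1)
        self.top_ps = torch.full((4,), 0.9)

    def to(self, device: str):
        for item in ["temperatures", "top_ps"]:
            value = getattr(self, item)
            # non_blocking=True lets the copy overlap with other GPU work
            setattr(self, item, value.to(device, non_blocking=True))


info = TinySamplingInfo()
info.to("cuda" if torch.cuda.is_available() else "cpu")
print(info.temperatures.device)
```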
sglang/srt/sampling/sampling_params.py
CHANGED
@@ -50,9 +50,10 @@ class SamplingParams:
         self.presence_penalty = presence_penalty
         self.repetition_penalty = repetition_penalty
         self.stop_strs = stop
-        if stop_token_ids
-            stop_token_ids =
-
+        if stop_token_ids:
+            self.stop_token_ids = set(stop_token_ids)
+        else:
+            self.stop_token_ids = None
         self.max_new_tokens = max_new_tokens
         self.min_new_tokens = min_new_tokens
         self.ignore_eos = ignore_eos
@@ -119,10 +120,7 @@ class SamplingParams:
         # Process stop strings
         if self.stop_strs is None:
             self.stop_strs = []
-
-            self.stop_str_max_len = 0
-        else:
-            self.stop_str_max_len = 1
+            self.stop_str_max_len = 0
         else:
             if isinstance(self.stop_strs, str):
                 self.stop_strs = [self.stop_strs]
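With this change, `stop_token_ids` is normalized to a Python set when a non-empty value is given and to `None` otherwise. A hedged sketch of that normalization in isolation (the helper name below is made up for illustration, it is not part of the diff):

```python
# Hedged sketch: the stop_token_ids normalization now done in the constructor.
def normalize_stop_token_ids(stop_token_ids):
    # An empty list or None becomes None; anything else becomes a set.
    return set(stop_token_ids) if stop_token_ids else None


assert normalize_stop_token_ids([1, 2, 2]) == {1, 2}
assert normalize_stop_token_ids([]) is None
assert normalize_stop_token_ids(None) is None
```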
sglang/srt/server.py
CHANGED
@@ -172,6 +172,18 @@ async def stop_profile():
     )


+@app.api_route("/get_memory_pool_size", methods=["GET", "POST"])
+async def get_memory_pool_size():
+    """Get the memory pool size in number of tokens"""
+    try:
+        ret = await tokenizer_manager.get_memory_pool_size()
+        return ret.size
+    except Exception as e:
+        return JSONResponse(
+            {"error": {"message": str(e)}}, status_code=HTTPStatus.BAD_REQUEST
+        )
+
+
 @app.post("/update_weights")
 async def update_weights(obj: UpdateWeightReqInput, request: Request):
     """Update the weights inplace without re-launching the server."""
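The new route accepts GET or POST and returns the memory pool size in tokens. A hedged client-side sketch, assuming a locally launched server on the default port (the host/port are assumptions, not from the diff):

```python
# Hedged usage sketch: querying the new endpoint with requests.
import requests

base_url = "http://127.0.0.1:30000"  # assumed local sglang.launch_server address
resp = requests.get(f"{base_url}/get_memory_pool_size")
resp.raise_for_status()
print("memory pool size (tokens):", resp.json())
```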
sglang/srt/server_args.py
CHANGED
@@ -177,6 +177,16 @@ class ServerArgs:
         if self.sampling_backend is None:
             self.sampling_backend = "flashinfer"

+        if self.enable_overlap_schedule:
+            logger.warning(
+                "Overlap scheduler mode is enabled. This is an experimental feature. "
+                "Sampling penalizer (e.g., frequency and repetition penalty), constrained decoding (e.g., regex, JSON), "
+                "and embedding APIs are not supported and will lead to wrong results. "
+                "The NaN detection is also disabled."
+            )
+            self.disable_penalizer = True
+            self.disable_nan_detection = True
+
         # Model-specific patches
         if "Alibaba-NLP/gte-Qwen2-1.5B-instruct" == self.model_path:
             logger.info(
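Enabling the experimental overlap scheduler forces the penalizer and NaN detection off. The README hunk later in this diff documents the matching `--enable-overlap-scheduler` CLI flag; a hedged launch sketch from Python (the model path is just an example):

```python
# Hedged sketch: launching the server with the experimental overlap scheduler.
import subprocess

proc = subprocess.Popen(
    [
        "python", "-m", "sglang.launch_server",
        "--model-path", "meta-llama/Meta-Llama-3-8B-Instruct",
        "--enable-overlap-scheduler",
    ]
)
# ... send requests to the server, then shut it down:
proc.terminate()
```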
sglang/srt/utils.py
CHANGED
@@ -209,6 +209,28 @@ def is_multimodal_model(model_architectures):
         or "LlavaQwenForCausalLM" in model_architectures
         or "LlavaMistralForCausalLM" in model_architectures
         or "LlavaVidForCausalLM" in model_architectures
+        or "MllamaForConditionalGeneration" in model_architectures
+        or "Qwen2VLForConditionalGeneration" in model_architectures
+    ):
+        return True
+    else:
+        return False
+
+
+def is_attention_free_model(model_architectures):
+    return False
+
+
+def model_has_inner_state(model_architectures):
+    return False
+
+
+def is_embedding_model(model_architectures):
+    if (
+        "LlamaEmbeddingModel" in model_architectures
+        or "MistralModel" in model_architectures
+        or "LlamaForSequenceClassification" in model_architectures
+        or "LlamaForSequenceClassificationWithNormal_Weights" in model_architectures
     ):
         return True
     else:
sglang/test/run_eval.py
CHANGED
@@ -67,6 +67,7 @@ def run_eval(args):
         model=args.model,
         max_tokens=2048,
         base_url=base_url,
+        temperature=getattr(args, "temperature", 0.0),
     )

     # Run eval
@@ -119,6 +120,7 @@ if __name__ == "__main__":
     parser.add_argument("--eval-name", type=str, default="mmlu")
     parser.add_argument("--num-examples", type=int)
     parser.add_argument("--num-threads", type=int, default=512)
+    parser.add_argument("--temperature", type=float, default=0.0)
     args = parser.parse_args()

     run_eval(args)
sglang/test/runners.py
CHANGED
@@ -102,8 +102,10 @@ class HFRunner:
         return False

     def start_model_process(self, in_queue, out_queue, model_path, torch_dtype):
-
+        # Apply model-specific patches
+        monkey_patch_gemma2_sdpa()

+        # Load the model and tokenizer
         if self.model_type == "generation":
             self.base_model = AutoModelForCausalLM.from_pretrained(
                 model_path,
@@ -128,7 +130,9 @@
             ).cuda()
         else:
             raise Exception(f"Unrecognized model type {self.model_type}")
+        self.tokenizer = get_tokenizer(model_path, torch_dtype=torch.dtype)

+        # Run forward
         while True:
             prompts, max_new_tokens, lora_paths = in_queue.get()
             if lora_paths is not None:
@@ -370,3 +374,18 @@ class SRTRunner:
     def __exit__(self, exc_type, exc_value, traceback):
         self.runtime.shutdown()
         del self.runtime
+
+
+def monkey_patch_gemma2_sdpa():
+    """
+    Use sdpa by default to fix the OOM issue.
+    Revert this commit:
+    https://github.com/huggingface/transformers/commit/975b988bfe6e7ebb47390cd9a1556c6888804883#diff-5f76eac6f18f4b491521314c318a9692318feb4d19228e9576cce7bde4240834R660
+    """
+    from transformers.models.gemma2.modeling_gemma2 import Gemma2PreTrainedModel
+
+    def _check_and_enable_sdpa(config, hard_check_only: bool = False):
+        config._attn_implementation = "sdpa"
+        return config
+
+    setattr(Gemma2PreTrainedModel, "_check_and_enable_sdpa", _check_and_enable_sdpa)
sglang/test/test_utils.py
CHANGED
@@ -3,6 +3,7 @@
 import argparse
 import asyncio
 import os
+import random
 import subprocess
 import threading
 import time
@@ -20,6 +21,7 @@ from sglang.global_config import global_config
 from sglang.lang.backend.openai import OpenAI
 from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
 from sglang.srt.utils import kill_child_process
+from sglang.test.run_eval import run_eval
 from sglang.utils import get_exception_traceback

 DEFAULT_FP8_MODEL_NAME_FOR_TEST = "neuralmagic/Meta-Llama-3.1-8B-FP8"
@@ -400,7 +402,7 @@ def popen_launch_server(
     api_key: Optional[str] = None,
     other_args: tuple = (),
     env: Optional[dict] = None,
-    return_stdout_stderr:
+    return_stdout_stderr: Optional[tuple] = None,
 ):
     _, host, port = base_url.split(":")
     host = host[2:]
@@ -423,8 +425,8 @@
     if return_stdout_stderr:
         process = subprocess.Popen(
             command,
-            stdout=
-            stderr=
+            stdout=return_stdout_stderr[0],
+            stderr=return_stdout_stderr[1],
             env=env,
             text=True,
         )
@@ -631,3 +633,98 @@ def calculate_rouge_l(output_strs_list1, output_strs_list2):
         rouge_l_scores.append(fmeasure)

     return rouge_l_scores
+
+
+STDOUT_FILENAME = "stdout.txt"
+STDERR_FILENAME = "stderr.txt"
+
+
+def read_output(output_lines):
+    """Print the output in real time with another thread."""
+    while not os.path.exists(STDERR_FILENAME):
+        time.sleep(1)
+
+    pt = 0
+    while pt >= 0:
+        if pt > 0 and not os.path.exists(STDERR_FILENAME):
+            break
+        lines = open(STDERR_FILENAME).readlines()
+        for line in lines[pt:]:
+            print(line, end="", flush=True)
+            output_lines.append(line)
+            pt += 1
+        time.sleep(0.1)
+
+
+def run_mmlu_test(
+    disable_radix_cache,
+    enable_mixed_chunk=False,
+    enable_overlap=False,
+    chunked_prefill_size=32,
+):
+    other_args = ["--chunked-prefill-size", str(chunked_prefill_size)]
+    if disable_radix_cache:
+        other_args += ["--disable-radix-cache"]
+    if enable_mixed_chunk:
+        other_args += ["--enable-mixed-chunk"]
+    if enable_overlap:
+        other_args += ["--enable-overlap-scheduler"]
+
+    model = DEFAULT_MODEL_NAME_FOR_TEST
+    port = random.randint(4000, 5000)
+    base_url = f"http://127.0.0.1:{port}"
+
+    # Create files and launch the server
+    stdout = open(STDOUT_FILENAME, "w")
+    stderr = open(STDERR_FILENAME, "w")
+    process = popen_launch_server(
+        model,
+        base_url,
+        timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+        other_args=other_args,
+        return_stdout_stderr=(stdout, stderr),
+    )
+
+    # Launch a thread to stream the output
+    output_lines = []
+    t = threading.Thread(target=read_output, args=(output_lines,))
+    t.start()
+
+    # Run the eval
+    args = SimpleNamespace(
+        base_url=base_url,
+        model=model,
+        eval_name="mmlu",
+        num_examples=128,
+        num_threads=128,
+    )
+
+    try:
+        metrics = run_eval(args)
+        print(f"{metrics=}")
+        assert metrics["score"] >= 0.65
+    finally:
+        pass
+
+    # Clean up everything
+    kill_child_process(process.pid)
+    kill_child_process(process.pid)
+    stdout.close()
+    stderr.close()
+    if os.path.exists(STDOUT_FILENAME):
+        os.remove(STDOUT_FILENAME)
+    if os.path.exists(STDERR_FILENAME):
+        os.remove(STDERR_FILENAME)
+    t.join()
+
+    # Assert success
+    has_new_server = False
+    has_leak = False
+    for line in output_lines:
+        if "The server is fired" in line:
+            has_new_server = True
+        if "leak" in line:
+            has_leak = True
+
+    assert has_new_server
+    # assert not has_leak
sglang/version.py
CHANGED
@@ -1 +1 @@
-__version__ = "0.3.4"
+__version__ = "0.3.4.post2"
{sglang-0.3.4.dist-info → sglang-0.3.4.post2.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.3.4
+Version: 0.3.4.post2
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
 Version 2.0, January 2004
@@ -259,7 +259,7 @@ Requires-Dist: modelscope; extra == "runtime-common"
 Provides-Extra: srt
 Requires-Dist: sglang[runtime_common]; extra == "srt"
 Requires-Dist: torch; extra == "srt"
-Requires-Dist: vllm==0.
+Requires-Dist: vllm==0.6.3.post1; extra == "srt"
 Provides-Extra: srt_xpu
 Requires-Dist: sglang[runtime_common]; extra == "srt-xpu"
 Provides-Extra: test
@@ -284,17 +284,17 @@ Requires-Dist: peft; extra == "test"
 --------------------------------------------------------------------------------

 | [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Paper**](https://arxiv.org/abs/2312.07104) | [**Slides**](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_dev_day_v2.pdf) | [**Learn More**](https://github.com/sgl-project/sgl-learning-materials) | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2ngly9muu-t37XiH87qvD~6rVBTkTEHw) |
-[**Join Bi-Weekly Development Meeting
+[**Join Bi-Weekly Development Meeting**](https://docs.google.com/document/d/1xEow4eIM152xNcRxqZz9VEcOiTQo8-CEuuQ5qTmkt-E/edit?usp=sharing) |

 ## News
 - [2024/10] 🔥 The First SGLang Online Meetup ([slides](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#the-first-sglang-online-meetup)).
 - [2024/09] SGLang v0.3 Release: 7x Faster DeepSeek MLA, 1.5x Faster torch.compile, Multi-Image/Video LLaVA-OneVision ([blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)).
 - [2024/07] Faster Llama3 Serving with SGLang Runtime (vs. TensorRT-LLM, vLLM) ([blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/)).
-- [2024/02] SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).

 <details>
 <summary>More</summary>

+- [2024/02] SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).
 - [2024/04] SGLang is used by the official **LLaVA-NeXT (video)** release ([blog](https://llava-vl.github.io/blog/2024-04-30-llava-next-video/)).
 - [2024/01] SGLang provides up to **5x faster inference** with RadixAttention ([blog](https://lmsys.org/blog/2024-01-17-sglang/)).
 - [2024/01] SGLang powers the serving of the official **LLaVA v1.6** release demo ([usage](https://github.com/haotian-liu/LLaVA?tab=readme-ov-file#demo)).
@@ -328,23 +328,27 @@ You can install SGLang using any of the methods below.
 pip install --upgrade pip
 pip install "sglang[all]"

-# Install FlashInfer
+# Install FlashInfer accelerated kernels
 pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
 ```

+Note: Please check the [FlashInfer installation doc](https://docs.flashinfer.ai/installation.html) to install the proper version according to your PyTorch and CUDA versions.
+
 ### Method 2: From source
 ```
 # Use the last release branch
-git clone -b v0.3.4 https://github.com/sgl-project/sglang.git
+git clone -b v0.3.4.post2 https://github.com/sgl-project/sglang.git
 cd sglang

 pip install --upgrade pip
 pip install -e "python[all]"

-# Install FlashInfer
+# Install FlashInfer accelerated kernels
 pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
 ```

+Note: Please check the [FlashInfer installation doc](https://docs.flashinfer.ai/installation.html) to install the proper version according to your PyTorch and CUDA versions.
+
 ### Method 3: Using docker
 The docker images are available on Docker Hub as [lmsysorg/sglang](https://hub.docker.com/r/lmsysorg/sglang/tags), built from [Dockerfile](https://github.com/sgl-project/sglang/tree/main/docker).
 Replace `<secret>` below with your huggingface hub [token](https://huggingface.co/docs/hub/en/security-tokens).
@@ -498,7 +502,8 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 ```
 python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --chunked-prefill-size 4096
 ```
-- To enable
+- To enable the experimental overlapped scheduler, add `--enable-overlap-scheduler`. It overlaps CPU scheduler with GPU computation and can accelerate almost all workloads. This does not work for constrained decoding currenly.
+- To enable torch.compile acceleration, add `--enable-torch-compile`. It accelerates small models on small batch sizes. This does not work for FP8 currenly.
 - To enable torchao quantization, add `--torchao-config int4wo-128`. It supports various quantization strategies.
 - To enable fp8 weight quantization, add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
 - To enable fp8 kv cache quantization, add `--kv-cache-dtype fp8_e5m2`.
@@ -519,7 +524,6 @@ We also provide an inference engine **without a HTTP server**. For example,
 ```python
 import sglang as sgl

-
 def main():
     prompts = [
         "Hello, my name is",
@@ -539,12 +543,8 @@ if __name__ == "__main__":
     main()
 ```

-This can be used for
-
-1. **Offline Batch Inference**
-2. **Building Custom Servers**
-
-You can view the full example [here](https://github.com/sgl-project/sglang/tree/main/examples/runtime/engine)
+This can be used for offline batch inference and building custom servers.
+You can view the full example [here](https://github.com/sgl-project/sglang/tree/main/examples/runtime/engine).

 ### Supported Models

@@ -552,7 +552,7 @@ You can view the full example [here](https://github.com/sgl-project/sglang/tree/
 - Llama / Llama 2 / Llama 3 / Llama 3.1
 - Mistral / Mixtral / Mistral NeMo
 - Gemma / Gemma 2
-- Qwen / Qwen 2 / Qwen 2 MoE
+- Qwen / Qwen 2 / Qwen 2 MoE / Qwen 2 VL
 - DeepSeek / DeepSeek 2
 - OLMoE
 - [LLaVA-OneVision](https://llava-vl.github.io/blog/2024-08-05-llava-onevision/)
@@ -575,6 +575,7 @@ You can view the full example [here](https://github.com/sgl-project/sglang/tree/
 - MiniCPM / MiniCPM 3
 - XVERSE / XVERSE MoE
 - SmolLM
+- GLM-4

 **Embedding Models**

@@ -711,7 +712,6 @@ print(state["answer_1"])
 ```

 #### More Examples
-
 Anthropic and VertexAI (Gemini) models are also supported.
 You can find more examples at [examples/quick_start](examples/frontend_language/quick_start).

@@ -892,7 +892,6 @@ Please cite our paper, [SGLang: Efficient Execution of Structured Language Model
 We also learned from the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).


-
 <p align="center">
   <a href="#sglangtop" target="_blank">
     <bold>Back To Top </bold>