sglang 0.2.13__py3-none-any.whl → 0.2.14.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/api.py +6 -0
- sglang/bench_latency.py +7 -3
- sglang/bench_serving.py +50 -26
- sglang/check_env.py +15 -0
- sglang/lang/chat_template.py +10 -5
- sglang/lang/compiler.py +4 -0
- sglang/lang/interpreter.py +1 -0
- sglang/lang/ir.py +9 -0
- sglang/launch_server.py +8 -1
- sglang/srt/constrained/fsm_cache.py +11 -2
- sglang/srt/constrained/jump_forward.py +1 -0
- sglang/srt/conversation.py +50 -1
- sglang/srt/hf_transformers_utils.py +22 -23
- sglang/srt/layers/activation.py +100 -1
- sglang/srt/layers/decode_attention.py +338 -50
- sglang/srt/layers/fused_moe/layer.py +2 -2
- sglang/srt/layers/logits_processor.py +56 -19
- sglang/srt/layers/radix_attention.py +3 -4
- sglang/srt/layers/sampler.py +101 -0
- sglang/srt/managers/controller_multi.py +2 -8
- sglang/srt/managers/controller_single.py +7 -10
- sglang/srt/managers/detokenizer_manager.py +20 -9
- sglang/srt/managers/io_struct.py +44 -11
- sglang/srt/managers/policy_scheduler.py +5 -2
- sglang/srt/managers/schedule_batch.py +46 -166
- sglang/srt/managers/tokenizer_manager.py +192 -83
- sglang/srt/managers/tp_worker.py +118 -24
- sglang/srt/mem_cache/memory_pool.py +82 -8
- sglang/srt/mm_utils.py +79 -7
- sglang/srt/model_executor/cuda_graph_runner.py +32 -8
- sglang/srt/model_executor/forward_batch_info.py +51 -26
- sglang/srt/model_executor/model_runner.py +201 -58
- sglang/srt/models/gemma2.py +10 -6
- sglang/srt/models/gpt_bigcode.py +1 -1
- sglang/srt/models/grok.py +11 -1
- sglang/srt/models/llama_embedding.py +4 -0
- sglang/srt/models/llava.py +176 -59
- sglang/srt/models/qwen2.py +9 -3
- sglang/srt/openai_api/adapter.py +200 -39
- sglang/srt/openai_api/protocol.py +2 -0
- sglang/srt/sampling/sampling_batch_info.py +136 -0
- sglang/srt/{sampling_params.py → sampling/sampling_params.py} +22 -0
- sglang/srt/server.py +92 -57
- sglang/srt/server_args.py +43 -15
- sglang/srt/utils.py +26 -16
- sglang/test/runners.py +22 -30
- sglang/test/simple_eval_common.py +9 -10
- sglang/test/simple_eval_gpqa.py +2 -1
- sglang/test/simple_eval_humaneval.py +2 -2
- sglang/test/simple_eval_math.py +2 -1
- sglang/test/simple_eval_mmlu.py +2 -1
- sglang/test/test_activation.py +55 -0
- sglang/test/test_utils.py +36 -53
- sglang/version.py +1 -1
- {sglang-0.2.13.dist-info → sglang-0.2.14.post1.dist-info}/METADATA +100 -27
- sglang-0.2.14.post1.dist-info/RECORD +114 -0
- {sglang-0.2.13.dist-info → sglang-0.2.14.post1.dist-info}/WHEEL +1 -1
- sglang/launch_server_llavavid.py +0 -29
- sglang-0.2.13.dist-info/RECORD +0 -112
- {sglang-0.2.13.dist-info → sglang-0.2.14.post1.dist-info}/LICENSE +0 -0
- {sglang-0.2.13.dist-info → sglang-0.2.14.post1.dist-info}/top_level.txt +0 -0
sglang/test/simple_eval_common.py
CHANGED
@@ -1,13 +1,12 @@
 # Adapted from https://github.com/openai/simple-evals/

-import base64
 import os
 import resource
 import time
 from collections import defaultdict
 from dataclasses import dataclass, field
 from multiprocessing.pool import ThreadPool
-from typing import Any, Dict, List, Tuple
+from typing import Any, Dict, List, Optional, Tuple

 import httpx
 import jinja2
@@ -44,8 +43,8 @@ class EvalResult:
     Result of running an evaluation (usually consisting of many samples)
     """

-    score: float
-    metrics: Dict[str, float]
+    score: Optional[float]  # top-line metric
+    metrics: Optional[Dict[str, float]]  # other metrics
     htmls: List[str]  # strings of valid HTML
     convos: List[MessageList]  # sampled conversations

@@ -56,10 +55,10 @@ class SingleEvalResult:
     Result of evaluating a single sample
     """

-    score: float
+    score: Optional[float]
     metrics: Dict[str, float] = field(default_factory=dict)
-    html: str
-    convo: MessageList
+    html: Optional[str] = None
+    convo: Optional[MessageList] = None  # sampled conversation


 class Eval:
@@ -89,8 +88,8 @@ class ChatCompletionSampler(SamplerBase):
     def __init__(
         self,
         base_url: str = None,
-        model: str
-        system_message: str
+        model: Optional[str] = None,
+        system_message: Optional[str] = None,
         temperature: float = 0.0,
         max_tokens: int = 2048,
     ):
@@ -272,7 +271,7 @@ def _compute_stat(values: list, stat: str):
 def aggregate_results(
     single_eval_results: List[SingleEvalResult],
     default_stats: Tuple[str] = ("mean", "std"),
-    name2stats: Dict[str, Tuple[str]]
+    name2stats: Optional[Dict[str, Tuple[str]]] = None,
 ) -> EvalResult:
     """
     Aggregate results from multiple evaluations into a single EvalResult.
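In `SingleEvalResult`, `html` and `convo` sit after the defaulted `metrics` field, so when they become `Optional` they also need `None` defaults (Python dataclasses require a default on every field that follows a defaulted one). A minimal standalone sketch of the pattern, with illustrative names rather than the sglang classes themselves:

```python
from dataclasses import dataclass, field
from typing import Dict, List, Optional


@dataclass
class SingleResult:  # illustrative stand-in, not the sglang class
    score: Optional[float]                                   # may be missing for a failed sample
    metrics: Dict[str, float] = field(default_factory=dict)
    html: Optional[str] = None                               # defaults required after `metrics`
    convo: Optional[List[dict]] = None


r = SingleResult(score=None, metrics={"accuracy": 1.0})
print(r.html, r.convo)  # None None
```
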
sglang/test/simple_eval_gpqa.py
CHANGED
@@ -8,6 +8,7 @@ https://arxiv.org/abs/2311.12022

 import random
 import re
+from typing import Optional

 import pandas

@@ -28,7 +29,7 @@ class GPQAEval(Eval):
     def __init__(
         self,
         filename: str,
-        num_examples: int
+        num_examples: Optional[int],
         num_threads: int,
         n_repeats: int = 1,
     ):
sglang/test/simple_eval_humaneval.py
CHANGED
@@ -9,7 +9,7 @@ https://arxiv.org/abs/2107.03374 https://github.com/openai/human-eval/
 import random
 import re
 from concurrent.futures import ThreadPoolExecutor, as_completed
-from typing import Dict, List
+from typing import Dict, List, Optional

 import tqdm

@@ -61,7 +61,7 @@ def evaluate_functional_correctness(
 class HumanEval(Eval):
     def __init__(
         self,
-        num_examples: int
+        num_examples: Optional[int],
         num_threads: int,
         num_samples_per_task: int = 5,
         ks_passes: List[int] = [1, 2, 5],
sglang/test/simple_eval_math.py
CHANGED
@@ -8,6 +8,7 @@ https://arxiv.org/abs/2103.03874

 import random
 import re
+from typing import Optional

 import pandas

@@ -36,7 +37,7 @@ class MathEval(Eval):
         self,
         filename: str,
         equality_checker: SamplerBase,
-        num_examples: int
+        num_examples: Optional[int],
         num_threads: int,
     ):
         df = pandas.read_csv(filename)
sglang/test/simple_eval_mmlu.py
CHANGED
@@ -8,6 +8,7 @@ https://arxiv.org/abs/2009.03300

 import random
 import re
+from typing import Optional

 import pandas

@@ -84,7 +85,7 @@ subject2category = {


 class MMLUEval(Eval):
-    def __init__(self, filename: str, num_examples: int
+    def __init__(self, filename: str, num_examples: Optional[int], num_threads: int):
         df = pandas.read_csv(filename)
         examples = [row.to_dict() for _, row in df.iterrows()]
         if num_examples:
sglang/test/test_activation.py
ADDED
@@ -0,0 +1,55 @@
+import itertools
+import unittest
+
+import torch
+
+from sglang.srt.layers.activation import GeluAndMul
+
+
+class TestGeluAndMul(unittest.TestCase):
+    DTYPES = [torch.half, torch.bfloat16]
+    NUM_TOKENS = [7, 83, 2048]
+    D = [512, 4096, 5120, 13824]
+    SEEDS = [0]
+
+    @classmethod
+    def setUpClass(cls):
+        if not torch.cuda.is_available():
+            raise unittest.SkipTest("CUDA is not available")
+        torch.set_default_device("cuda")
+
+    def _run_gelu_and_mul_test(self, num_tokens, d, dtype, seed):
+        torch.manual_seed(seed)
+
+        layer = GeluAndMul().to(dtype=dtype)
+        x = torch.randn(num_tokens, 2 * d, dtype=dtype)
+
+        with torch.inference_mode():
+            ref_out = layer.forward_native(x)
+            out = layer.forward_cuda(x)
+
+        if dtype == torch.bfloat16:
+            atol = rtol = 1e-2
+        else:
+            atol = rtol = 1e-3
+
+        self.assertTrue(torch.allclose(out, ref_out, atol=atol, rtol=rtol))
+
+    def test_gelu_and_mul(self):
+        for params in itertools.product(
+            self.NUM_TOKENS,
+            self.D,
+            self.DTYPES,
+            self.SEEDS,
+        ):
+            with self.subTest(
+                num_tokens=params[0],
+                d=params[1],
+                dtype=params[2],
+                seed=params[3],
+            ):
+                self._run_gelu_and_mul_test(*params)
+
+
+if __name__ == "__main__":
+    unittest.main(verbosity=2)
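The new test compares `GeluAndMul.forward_native` against `forward_cuda` on random inputs. Assuming the layer follows the usual gated-GELU convention (GELU over the first half of the last dimension, multiplied elementwise by the second half), a pure-PyTorch reference for what is being checked might look like this sketch; the exact GELU approximation used by the kernel is an assumption here:

```python
import torch
import torch.nn.functional as F


def gelu_and_mul_reference(x: torch.Tensor) -> torch.Tensor:
    # Split the last dimension in half: gate = GELU(first half), value = second half.
    d = x.shape[-1] // 2
    return F.gelu(x[..., :d], approximate="tanh") * x[..., d:]


x = torch.randn(7, 2 * 512)
print(gelu_and_mul_reference(x).shape)  # torch.Size([7, 512])
```
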
sglang/test/test_utils.py
CHANGED
@@ -2,11 +2,10 @@

 import argparse
 import asyncio
-import
+import os
 import subprocess
 import threading
 import time
-import unittest
 from functools import partial
 from typing import Callable, List, Optional

@@ -18,14 +17,19 @@ import torch.nn.functional as F
 from sglang.global_config import global_config
 from sglang.lang.backend.openai import OpenAI
 from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
+from sglang.srt.utils import kill_child_process
 from sglang.utils import get_exception_traceback

 DEFAULT_MODEL_NAME_FOR_TEST = "meta-llama/Meta-Llama-3.1-8B-Instruct"
 DEFAULT_MOE_MODEL_NAME_FOR_TEST = "mistralai/Mixtral-8x7B-Instruct-v0.1"
-
-
-
-
+DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH = 600
+
+if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
+    DEFAULT_PORT_FOR_SRT_TEST_RUNNER = 5157
+    DEFAULT_URL_FOR_TEST = "http://127.0.0.1:6157"
+else:
+    DEFAULT_PORT_FOR_SRT_TEST_RUNNER = 1157
+    DEFAULT_URL_FOR_TEST = "http://127.0.0.1:2157"


 def call_generate_lightllm(prompt, temperature, max_tokens, stop=None, url=None):
@@ -104,31 +108,8 @@ def call_generate_srt_raw(prompt, temperature, max_tokens, stop=None, url=None):
     return pred


-def
-
-    from ginfer import sampler_pb2, sampler_pb2_grpc
-
-    sampler_channel = grpc.insecure_channel(url.replace("http://", ""))
-    sampler = sampler_pb2_grpc.SamplerStub(sampler_channel)
-
-    if stop is None:
-        stop_strings = None
-    else:
-        stop_strings = [stop]
-
-    sample_request = sampler_pb2.SampleTextRequest(
-        prompt=prompt,
-        settings=sampler_pb2.SampleSettings(
-            max_len=max_tokens,
-            rng_seed=0,
-            temperature=max(temperature, 1e-7),
-            nucleus_p=1,
-            stop_strings=stop_strings,
-        ),
-    )
-    stream = sampler.SampleText(sample_request)
-    response = "".join([x.text for x in stream])
-    return response
+def call_generate_gserver(prompt, temperature, max_tokens, stop=None, url=None):
+    raise NotImplementedError()


 def call_generate_guidance(
@@ -271,7 +252,7 @@ def add_common_other_args_and_parse(parser: argparse.ArgumentParser):
             "vllm",
             "outlines",
             "lightllm",
-            "
+            "gserver",
             "guidance",
             "lmql",
             "srt-raw",
@@ -292,7 +273,7 @@ def add_common_other_args_and_parse(parser: argparse.ArgumentParser):
         "lightllm": 22000,
         "lmql": 23000,
         "srt-raw": 30000,
-        "
+        "gserver": 9988,
     }
     args.port = default_port.get(args.backend, None)
     return args
@@ -328,8 +309,8 @@ def _get_call_generate(args: argparse.Namespace):
         return partial(call_generate_vllm, url=f"{args.host}:{args.port}/generate")
     elif args.backend == "srt-raw":
         return partial(call_generate_srt_raw, url=f"{args.host}:{args.port}/generate")
-    elif args.backend == "
-        return partial(
+    elif args.backend == "gserver":
+        return partial(call_generate_gserver, url=f"{args.host}:{args.port}")
    elif args.backend == "outlines":
         return partial(call_generate_outlines, url=f"{args.host}:{args.port}/generate")
     elif args.backend == "guidance":
@@ -480,34 +461,36 @@ def run_unittest_files(files: List[str], timeout_per_file: float):
     success = True

     for filename in files:
+        global process

-        def
-
-
-
-
-
-
-
-            p.join()
+        def run_one_file(filename):
+            filename = os.path.join(os.getcwd(), filename)
+            print(f"\n\nRun:\npython3 {filename}\n\n", flush=True)
+            process = subprocess.Popen(
+                ["python3", filename], stdout=None, stderr=None, env=os.environ
+            )
+            process.wait()
+            return process.returncode

         try:
-            run_with_timeout(
-
-
-
+            ret_code = run_with_timeout(
+                run_one_file, args=(filename,), timeout=timeout_per_file
+            )
+            assert ret_code == 0
         except TimeoutError:
-
+            kill_child_process(process.pid)
             time.sleep(5)
             print(
-                f"\nTimeout after {timeout_per_file} seconds when running {filename}\n"
+                f"\nTimeout after {timeout_per_file} seconds when running {filename}\n",
+                flush=True,
             )
-
+            success = False
+            break

     if success:
-        print(f"Success. Time elapsed: {time.time() - tic:.2f}s")
+        print(f"Success. Time elapsed: {time.time() - tic:.2f}s", flush=True)
     else:
-        print(f"Fail. Time elapsed: {time.time() - tic:.2f}s")
+        print(f"Fail. Time elapsed: {time.time() - tic:.2f}s", flush=True)

     return 0 if success else -1

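The rewritten `run_unittest_files` launches each test file as a child process and enforces the per-file timeout from outside, killing the child when it expires. A self-contained sketch of that pattern using only the standard library (the sglang helpers `run_with_timeout` and `kill_child_process` are not reproduced here):

```python
import os
import subprocess
import sys


def run_file_with_timeout(filename: str, timeout: float) -> int:
    """Run one test file as a subprocess; kill it if the timeout expires."""
    proc = subprocess.Popen([sys.executable, filename], env=os.environ)
    try:
        return proc.wait(timeout=timeout)
    except subprocess.TimeoutExpired:
        proc.kill()  # sglang's kill_child_process also reaps grandchild processes
        proc.wait()
        return -1
```
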
sglang/version.py
CHANGED
@@ -1 +1 @@
-__version__ = "0.2.13"
+__version__ = "0.2.14.post1"
{sglang-0.2.13.dist-info → sglang-0.2.14.post1.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.2.13
+Version: 0.2.14.post1
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
                 Version 2.0, January 2004
@@ -231,6 +231,7 @@ Requires-Dist: openai>=1.0; extra == "openai"
 Requires-Dist: tiktoken; extra == "openai"
 Provides-Extra: srt
 Requires-Dist: aiohttp; extra == "srt"
+Requires-Dist: decord; extra == "srt"
 Requires-Dist: fastapi; extra == "srt"
 Requires-Dist: hf-transfer; extra == "srt"
 Requires-Dist: huggingface-hub; extra == "srt"
@@ -244,12 +245,14 @@ Requires-Dist: torch; extra == "srt"
 Requires-Dist: uvicorn; extra == "srt"
 Requires-Dist: uvloop; extra == "srt"
 Requires-Dist: zmq; extra == "srt"
-Requires-Dist: vllm==0.5.
+Requires-Dist: vllm==0.5.5; extra == "srt"
 Requires-Dist: outlines>=0.0.44; extra == "srt"
 Provides-Extra: test
 Requires-Dist: jsonlines; extra == "test"
 Requires-Dist: matplotlib; extra == "test"
 Requires-Dist: pandas; extra == "test"
+Requires-Dist: sentence-transformers; extra == "test"
+Requires-Dist: accelerate; extra == "test"

 <div align="center">
 <img src="https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png" alt="logo" width="400"></img>
@@ -270,17 +273,18 @@ SGLang is a fast serving framework for large language models and vision language
 It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.

 The core features include:
-- **Fast Backend Runtime**: Efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, continuous batching, token attention (paged attention), tensor parallelism,
+- **Fast Backend Runtime**: Efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, continuous batching, token attention (paged attention), tensor parallelism, FlashInfer kernels, and quantization (AWQ/FP8/GPTQ/Marlin).
 - **Flexible Frontend Language**: Enables easy programming of LLM applications with chained generation calls, advanced prompting, control flow, multiple modalities, parallelism, and external interactions.

 ## News
 - [2024/07] 🔥 Faster Llama3 Serving with SGLang Runtime (vs. TensorRT-LLM, vLLM) ([blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/)).
-- [2024/
+- [2024/08] 🔥 LLaVA-OneVision with single-image, multi-image and video are supported ([blog](https://llava-vl.github.io/blog/2024-08-05-llava-onevision/)).
 - [2024/02] SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).

 <details>
 <summary>More</summary>

+- [2024/04] SGLang is used by the official **LLaVA-NeXT (video)** release ([blog](https://llava-vl.github.io/blog/2024-04-30-llava-next-video/)).
 - [2024/01] SGLang provides up to **5x faster inference** with RadixAttention ([blog](https://lmsys.org/blog/2024-01-17-sglang/)).
 - [2024/01] SGLang powers the serving of the official **LLaVA v1.6** release demo ([usage](https://github.com/haotian-liu/LLaVA?tab=readme-ov-file#demo)).

@@ -308,7 +312,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
 ### Method 2: From source
 ```
 # Use the last release branch
-git clone -b v0.2.13 https://github.com/sgl-project/sglang.git
+git clone -b v0.2.14.post1 https://github.com/sgl-project/sglang.git
 cd sglang

 pip install --upgrade pip
@@ -334,14 +338,60 @@ docker run --gpus all \

 ### Method 4: Using docker compose

+<details>
+<summary>More</summary>
+
 > This method is recommended if you plan to serve it as a service.
 > A better approach is to use the [k8s-sglang-service.yaml](./docker/k8s-sglang-service.yaml).

 1. Copy the [compose.yml](./docker/compose.yaml) to your local machine
 2. Execute the command `docker compose up -d` in your terminal.
+</details>
+
+### Method 5: Run on Kubernetes or Clouds with SkyPilot
+
+<details>
+<summary>More</summary>
+
+To deploy on Kubernetes or 12+ clouds, you can use [SkyPilot](https://github.com/skypilot-org/skypilot).
+
+1. Install SkyPilot and set up Kubernetes cluster or cloud access: see [SkyPilot's documentation](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html).
+2. Deploy on your own infra with a single command and get the HTTP API endpoint:
+<details>
+<summary>SkyPilot YAML: <code>sglang.yaml</code></summary>
+
+```yaml
+# sglang.yaml
+envs:
+  HF_TOKEN: null
+
+resources:
+  image_id: docker:lmsysorg/sglang:latest
+  accelerators: A100
+  ports: 30000
+
+run: |
+  conda deactivate
+  python3 -m sglang.launch_server \
+    --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \
+    --host 0.0.0.0 \
+    --port 30000
+```
+</details>
+
+```bash
+# Deploy on any cloud or Kubernetes cluster. Use --cloud <cloud> to select a specific cloud provider.
+HF_TOKEN=<secret> sky launch -c sglang --env HF_TOKEN sglang.yaml
+
+# Get the HTTP API endpoint
+sky status --endpoint 30000 sglang
+```
+3. To further scale up your deployment with autoscaling and failure recovery, check out the [SkyServe + SGLang guide](https://github.com/skypilot-org/skypilot/tree/master/llm/sglang#serving-llama-2-with-sglang-for-more-traffic-using-skyserve).
+</details>
+

 ### Common Notes
-- [FlashInfer](https://github.com/flashinfer-ai/flashinfer) is currently one of the dependencies that must be installed for SGLang.
+- [FlashInfer](https://github.com/flashinfer-ai/flashinfer) is currently one of the dependencies that must be installed for SGLang. It only supports sm75 and above. If you encounter any FlashInfer-related issues on sm75+ devices (e.g., T4, A10, A100, L4, L40S, H100), consider using Triton's kernel by `--disable-flashinfer --disable-flashinfer-sampling` and raise an issue.
 - If you only need to use the OpenAI backend, you can avoid installing other dependencies by using `pip install "sglang[openai]"`.

 ## Backend: SGLang Runtime (SRT)
@@ -395,6 +445,13 @@ response = client.chat.completions.create(
     max_tokens=64,
 )
 print(response)
+
+# Text embedding
+response = client.embeddings.create(
+    model="default",
+    input="How are you today",
+)
+print(response)
 ```

 It supports streaming, vision, and most features of the Chat/Completions/Models/Batch endpoints specified by the [OpenAI API Reference](https://platform.openai.com/docs/api-reference/).
@@ -431,19 +488,21 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct

 ### Supported Models

+**Generative Models**
+
 - Llama / Llama 2 / Llama 3 / Llama 3.1
 - Mistral / Mixtral / Mistral NeMo
 - Gemma / Gemma 2
 - Qwen / Qwen 2 / Qwen 2 MoE
 - DeepSeek / DeepSeek 2
-- LLaVA
-`
--
-
--
--
+- [LLaVA-OneVision](https://llava-vl.github.io/blog/2024-08-05-llava-onevision/)
+  - `python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-72b-ov --port=30000 --tp-size=8 --chat-template=chatml-llava --chunked-prefill-size=16384`
+  - Query the server with the [OpenAI Vision API](https://platform.openai.com/docs/guides/vision). See examples at [test/srt/test_vision_openai_server.py](test/srt/test_vision_openai_server.py)
+- LLaVA 1.5 / 1.6 / NeXT
+  - `python -m sglang.launch_server --model-path lmms-lab/llama3-llava-next-8b --port=30000 --tp-size=1 --chat-template=llava_llama_3`
+  - `python -m sglang.launch_server --model-path lmms-lab/llava-next-72b --port=30000 --tp-size=8 --chat-template=chatml-llava`
+  - Query the server with the [OpenAI Vision API](https://platform.openai.com/docs/guides/vision). See examples at [test/srt/test_vision_openai_server.py](test/srt/test_vision_openai_server.py)
 - Yi-VL
-- see [srt_example_yi_vl.py](examples/quick_start/srt_example_yi_vl.py).
 - StableLM
 - Command-R
 - DBRX
@@ -451,37 +510,52 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 - ChatGLM
 - InternLM 2

+**Embedding Models**
+
+- e5-mistral
+- gte-Qwen2
+  - `python -m sglang.launch_server --model-path Alibaba-NLP/gte-Qwen2-7B-instruct --is-embedding`
+
 Instructions for supporting a new model are [here](https://github.com/sgl-project/sglang/blob/main/docs/en/model_support.md).

 #### Use Models From ModelScope
-
+<details>
+<summary>More</summary>
+
+To use a model from [ModelScope](https://www.modelscope.cn), set the environment variable SGLANG_USE_MODELSCOPE.
 ```
 export SGLANG_USE_MODELSCOPE=true
 ```
 Launch [Qwen2-7B-Instruct](https://www.modelscope.cn/models/qwen/qwen2-7b-instruct) Server
 ```
 SGLANG_USE_MODELSCOPE=true python -m sglang.launch_server --model-path qwen/Qwen2-7B-Instruct --port 30000
-```
+```
+
+</details>

 #### Run Llama 3.1 405B
+<details>
+<summary>More</summary>

 ```bash
-
+# Run 405B (fp8) on a single node
 python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct-FP8 --tp 8

-
-
-
-# on the first node
-GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 0 --disable-cuda-graph --mem-frac 0.75
+# Run 405B (fp16) on two nodes
+## on the first node, replace the `172.16.4.52:20000` with your own first node ip address and port
+GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 0 --disable-cuda-graph

-
-GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 1 --disable-cuda-graph
+## on the first node, replace the `172.16.4.52:20000` with your own first node ip address and port
+GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 1 --disable-cuda-graph
 ```

+</details>
+
 ### Benchmark Performance

-- Benchmark a single static batch by running the following command without launching a server. The arguments are the same as for `launch_server.py`.
+- Benchmark a single static batch by running the following command without launching a server. The arguments are the same as for `launch_server.py`.
+  Note that this is not a dynamic batching server, so it may run out of memory for a batch size that a real server can handle.
+  A real server truncates the prefill into several batches, while this unit test does not. For accurate large batch testing, please use `sglang.bench_serving` instead.
 ```
 python -m sglang.bench_latency --model-path meta-llama/Meta-Llama-3-8B-Instruct --batch 32 --input-len 256 --output-len 32
 ```
@@ -614,7 +688,7 @@ def tip_suggestion(s):
     s += "In summary" + sgl.gen("summary")
 ```

-#### Multi
+#### Multi-Modality
 Use `sgl.image` to pass an image as input.

 ```python
@@ -668,7 +742,7 @@ def character_gen(s, name):
     s += sgl.gen("json_output", max_tokens=256, regex=character_regex)
 ```

-See also [json_decode.py](examples/usage/json_decode.py) for an additional example
+See also [json_decode.py](examples/usage/json_decode.py) for an additional example of specifying formats with Pydantic models.

 #### Batching
 Use `run_batch` to run a batch of requests with continuous batching.
@@ -730,7 +804,6 @@ def chat_example(s):
 - The `choices` argument in `sgl.gen` is implemented by computing the [token-length normalized log probabilities](https://blog.eleuther.ai/multiple-choice-normalization/) of all choices and selecting the one with the highest probability.
 - The `regex` argument in `sgl.gen` is implemented through autoregressive decoding with logit bias masking, according to the constraints set by the regex. It is compatible with `temperature=0` and `temperature != 0`.

-
 ## Benchmark And Performance
 ![8b_throughput](https://lmsys.org/images/blog/sglang_llama3/8b_throughput.svg)
 ![70b_fp8_throughput](https://lmsys.org/images/blog/sglang_llama3/70b_fp8_throughput.svg)
|