sglang 0.3.1.post1__py3-none-any.whl → 0.3.1.post2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_latency.py +3 -1
- sglang/bench_server_latency.py +187 -0
- sglang/bench_serving.py +1 -1
- sglang/srt/layers/activation.py +6 -3
- sglang/srt/layers/layernorm.py +10 -7
- sglang/srt/layers/sampler.py +9 -2
- sglang/srt/managers/io_struct.py +3 -0
- sglang/srt/managers/policy_scheduler.py +49 -93
- sglang/srt/managers/schedule_batch.py +1 -1
- sglang/srt/managers/tp_worker.py +11 -6
- sglang/srt/model_executor/cuda_graph_runner.py +15 -14
- sglang/srt/model_executor/model_runner.py +13 -5
- sglang/srt/models/deepseek_v2.py +2 -2
- sglang/srt/models/llama.py +1 -3
- sglang/srt/models/llama_classification.py +2 -3
- sglang/srt/models/minicpm3.py +2 -2
- sglang/srt/models/xverse.py +1 -3
- sglang/srt/models/xverse_moe.py +1 -4
- sglang/srt/server_args.py +17 -21
- sglang/test/few_shot_gsm8k.py +8 -2
- sglang/test/test_utils.py +1 -0
- sglang/version.py +1 -1
- {sglang-0.3.1.post1.dist-info → sglang-0.3.1.post2.dist-info}/METADATA +4 -5
- {sglang-0.3.1.post1.dist-info → sglang-0.3.1.post2.dist-info}/RECORD +27 -26
- {sglang-0.3.1.post1.dist-info → sglang-0.3.1.post2.dist-info}/LICENSE +0 -0
- {sglang-0.3.1.post1.dist-info → sglang-0.3.1.post2.dist-info}/WHEEL +0 -0
- {sglang-0.3.1.post1.dist-info → sglang-0.3.1.post2.dist-info}/top_level.txt +0 -0
sglang/bench_latency.py
CHANGED
@@ -1,5 +1,7 @@
 """
-Benchmark the latency of a
+Benchmark the latency of running a single static batch.
+This script does not launch a server and uses the low-level APIs.
+It accepts arguments similar to those of launch_server.py.
 
 # Usage (latency test)
 ## with dummy weights:
sglang/bench_server_latency.py
ADDED
@@ -0,0 +1,187 @@
+"""
+Benchmark the latency of serving a single batch with a real server.
+This script launches a server and uses the HTTP interface.
+It accepts arguments similar to those of launch_server.py.
+
+Usage:
+
+python3 -m sglang.bench_server_latency --model meta-llama/Meta-Llama-3.1-8B --batch-size 1 16 64 --input-len 1024 --output-len 8
+"""
+
+import argparse
+import dataclasses
+import itertools
+import json
+import multiprocessing
+import os
+import time
+from typing import Tuple
+
+import numpy as np
+import requests
+
+from sglang.srt.server import launch_server
+from sglang.srt.server_args import ServerArgs
+from sglang.srt.utils import kill_child_process
+
+
+@dataclasses.dataclass
+class BenchArgs:
+    run_name: str = "default"
+    batch_size: Tuple[int] = (1,)
+    input_len: Tuple[int] = (1024,)
+    output_len: Tuple[int] = (16,)
+    result_filename: str = "result.jsonl"
+
+    @staticmethod
+    def add_cli_args(parser: argparse.ArgumentParser):
+        parser.add_argument("--run-name", type=str, default=BenchArgs.run_name)
+        parser.add_argument(
+            "--batch-size", type=int, nargs="+", default=BenchArgs.batch_size
+        )
+        parser.add_argument(
+            "--input-len", type=int, nargs="+", default=BenchArgs.input_len
+        )
+        parser.add_argument(
+            "--output-len", type=int, nargs="+", default=BenchArgs.output_len
+        )
+        parser.add_argument(
+            "--result-filename", type=str, default=BenchArgs.result_filename
+        )
+
+    @classmethod
+    def from_cli_args(cls, args: argparse.Namespace):
+        # use the default value's type to case the args into correct types.
+        attrs = [(attr.name, type(attr.default)) for attr in dataclasses.fields(cls)]
+        return cls(
+            **{attr: attr_type(getattr(args, attr)) for attr, attr_type in attrs}
+        )
+
+
+def launch_server_internal(server_args):
+    try:
+        launch_server(server_args)
+    except Exception as e:
+        raise e
+    finally:
+        kill_child_process(os.getpid(), including_parent=False)
+
+
+def launch_server_process(server_args: ServerArgs):
+    proc = multiprocessing.Process(target=launch_server_internal, args=(server_args,))
+    proc.start()
+    base_url = f"http://{server_args.host}:{server_args.port}"
+    timeout = 600
+
+    start_time = time.time()
+    while time.time() - start_time < timeout:
+        try:
+            headers = {
+                "Content-Type": "application/json; charset=utf-8",
+            }
+            response = requests.get(f"{base_url}/v1/models", headers=headers)
+            if response.status_code == 200:
+                return proc, base_url
+        except requests.RequestException:
+            pass
+        time.sleep(10)
+    raise TimeoutError("Server failed to start within the timeout period.")
+
+
+def run_one_case(
+    url: str,
+    batch_size: int,
+    input_len: int,
+    output_len: int,
+    run_name: str,
+    result_filename: str,
+):
+    input_ids = [
+        [int(x) for x in np.random.randint(0, high=16384, size=(input_len,))]
+        for _ in range(batch_size)
+    ]
+
+    tic = time.time()
+    response = requests.post(
+        url + "/generate",
+        json={
+            "input_ids": input_ids,
+            "sampling_params": {
+                "temperature": 0,
+                "max_new_tokens": output_len,
+                "ignore_eos": True,
+            },
+        },
+    )
+    latency = time.time() - tic
+
+    _ = response.json()
+    output_throughput = batch_size * output_len / latency
+    overall_throughput = batch_size * (input_len + output_len) / latency
+
+    print(f"batch size: {batch_size}")
+    print(f"latency: {latency:.2f} s")
+    print(f"output throughput: {output_throughput:.2f} token/s")
+    print(f"(input + output) throughput: {overall_throughput:.2f} token/s")
+
+    if result_filename:
+        with open(result_filename, "a") as fout:
+            res = {
+                "run_name": run_name,
+                "batch_size": batch_size,
+                "input_len": input_len,
+                "output_len": output_len,
+                "latency": round(latency, 4),
+                "output_throughput": round(output_throughput, 2),
+                "overall_throughput": round(overall_throughput, 2),
+            }
+            fout.write(json.dumps(res) + "\n")
+
+
+def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
+    proc, base_url = launch_server_process(server_args)
+
+    # warmup
+    run_one_case(
+        base_url,
+        batch_size=16,
+        input_len=1024,
+        output_len=16,
+        run_name="",
+        result_filename="",
+    )
+
+    # benchmark
+    try:
+        for bs, il, ol in itertools.product(
+            bench_args.batch_size, bench_args.input_len, bench_args.output_len
+        ):
+            run_one_case(
+                base_url,
+                bs,
+                il,
+                ol,
+                bench_args.run_name,
+                bench_args.result_filename,
+            )
+    finally:
+        kill_child_process(proc.pid)
+
+    print(f"\nResults are saved to {bench_args.result_filename}")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    ServerArgs.add_cli_args(parser)
+    BenchArgs.add_cli_args(parser)
+    # For this script, model-path is not required
+    assert (
+        parser._actions[1].option_strings[0] == "--model-path"
+    ), "options changed, this code need to be updated"
+    parser._actions[1].required = False
+    args = parser.parse_args()
+
+    server_args = ServerArgs.from_cli_args(args)
+    bench_args = BenchArgs.from_cli_args(args)
+
+    run_benchmark(server_args, bench_args)
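For reference, the new benchmark can also be driven from Python instead of the CLI. The sketch below only uses names that appear in the diff above (BenchArgs, run_benchmark, ServerArgs); the model path and sweep values are illustrative placeholders, not defaults taken from the package.

# Minimal sketch: run the new server-latency benchmark programmatically.
# Assumes a GPU machine with the model weights available; the model path
# and batch/length values below are placeholders.
from sglang.bench_server_latency import BenchArgs, run_benchmark
from sglang.srt.server_args import ServerArgs

if __name__ == "__main__":
    server_args = ServerArgs(model_path="meta-llama/Meta-Llama-3.1-8B")
    bench_args = BenchArgs(
        run_name="demo",
        batch_size=(1, 16),
        input_len=(1024,),
        output_len=(8,),
        result_filename="result.jsonl",
    )
    run_benchmark(server_args, bench_args)  # appends one JSON line per case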
sglang/bench_serving.py
CHANGED
@@ -2,7 +2,7 @@
 # Adapted from https://github.com/vllm-project/vllm/blob/6366efc67b0aedd2c1721c14385370e50b297fb3/benchmarks/benchmark_serving.py
 
 """
-Benchmark online serving.
+Benchmark online serving with dynamic requests.
 
 Usage:
 python3 -m sglang.bench_serving --backend sglang --num-prompt 10
sglang/srt/layers/activation.py
CHANGED
@@ -19,7 +19,12 @@ from typing import Optional
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-
+
+from sglang.srt.utils import is_hip
+
+if not is_hip():
+    from flashinfer.activation import gelu_and_mul, gelu_tanh_and_mul, silu_and_mul
+
 from vllm.distributed import (
     divide,
     get_tensor_model_parallel_rank,
@@ -29,8 +34,6 @@ from vllm.model_executor.custom_op import CustomOp
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.utils import set_weight_attrs
 
-from sglang.srt.utils import is_hip
-
 logger = logging.getLogger(__name__)
 
 
sglang/srt/layers/layernorm.py
CHANGED
@@ -20,16 +20,19 @@ from typing import Optional, Tuple, Union
 
 import torch
 import torch.nn as nn
-from flashinfer.norm import (
-    fused_add_rmsnorm,
-    gemma_fused_add_rmsnorm,
-    gemma_rmsnorm,
-    rmsnorm,
-)
-from vllm.model_executor.custom_op import CustomOp
 
 from sglang.srt.utils import is_hip
 
+if not is_hip():
+    from flashinfer.norm import (
+        fused_add_rmsnorm,
+        gemma_fused_add_rmsnorm,
+        gemma_rmsnorm,
+        rmsnorm,
+    )
+
+from vllm.model_executor.custom_op import CustomOp
+
 logger = logging.getLogger(__name__)
 
 
sglang/srt/layers/sampler.py
CHANGED
@@ -31,8 +31,11 @@ class Sampler(nn.Module):
         logits = logits.next_token_logits
 
         # Post process logits
+        logits = logits.contiguous()
         logits.div_(sampling_info.temperatures)
-        probs =
+        probs = torch.softmax(logits, dim=-1)
+        logits = None
+        del logits
 
         if torch.any(torch.isnan(probs)):
             logger.warning("Detected errors during sampling! NaN in the probability.")
@@ -53,7 +56,11 @@ class Sampler(nn.Module):
                 )
             else:
                 batch_next_token_ids, success = top_k_top_p_sampling_from_probs(
-                    probs,
+                    probs,
+                    uniform_samples,
+                    sampling_info.top_ks,
+                    sampling_info.top_ps,
+                    filter_apply_order="joint",
                 )
 
         if not torch.all(success):
sglang/srt/managers/io_struct.py
CHANGED
@@ -133,6 +133,9 @@ class GenerateReqInput:
                 self.image_data = [None] * num
             elif not isinstance(self.image_data, list):
                 self.image_data = [self.image_data] * num
+            elif isinstance(self.image_data, list):
+                # multi-image with n > 1
+                self.image_data = self.image_data * num
 
             if self.sampling_params is None:
                 self.sampling_params = [{}] * num
sglang/srt/managers/policy_scheduler.py
CHANGED
@@ -119,19 +119,32 @@ class PrefillAdder:
         self.running_batch = running_batch
         self.new_token_ratio = new_token_ratio
         self.rem_total_tokens = rem_total_tokens - mixed_with_decode_tokens
-        self.rem_total_tokens_ = self.rem_total_tokens
-        self.total_tokens = rem_total_tokens
         self.rem_input_tokens = rem_input_tokens - mixed_with_decode_tokens
         self.rem_chunk_tokens = rem_chunk_tokens
         if self.rem_chunk_tokens is not None:
             self.rem_chunk_tokens -= mixed_with_decode_tokens
 
+        self.cur_rem_tokens = rem_total_tokens - mixed_with_decode_tokens
+
         self.req_states = None
         self.can_run_list = []
         self.new_inflight_req = None
         self.log_hit_tokens = 0
         self.log_input_tokens = 0
 
+        if running_batch is not None:
+            # Pre-remove the tokens which will be occupied by the running requests
+            self.rem_total_tokens -= sum(
+                [
+                    min(
+                        (r.sampling_params.max_new_tokens - len(r.output_ids)),
+                        CLIP_MAX_NEW_TOKENS,
+                    )
+                    * self.new_token_ratio
+                    for r in running_batch.reqs
+                ]
+            )
+
     def no_remaining_tokens(self):
         return (
             self.rem_total_tokens <= 0
@@ -141,31 +154,14 @@ class PrefillAdder:
                 if self.rem_chunk_tokens is not None
                 else False
             )
-
-
-    def remove_running_tokens(self, running_batch: ScheduleBatch):
-        self.rem_total_tokens -= sum(
-            [
-                min(
-                    (r.sampling_params.max_new_tokens - len(r.output_ids)),
-                    CLIP_MAX_NEW_TOKENS,
-                )
-                * self.new_token_ratio
-                for r in running_batch.reqs
-            ]
-        )
-        self.rem_total_tokens_ -= sum(
-            [
-                r.sampling_params.max_new_tokens - len(r.output_ids)
-                for r in running_batch.reqs
-            ]
+            or self.cur_rem_tokens <= 0
         )
 
     def _prefill_one_req(
         self, prefix_len: int, extend_input_len: int, max_new_tokens: int
     ):
         self.rem_total_tokens -= extend_input_len + max_new_tokens
-        self.
+        self.cur_rem_tokens -= extend_input_len
         self.rem_input_tokens -= extend_input_len
         if self.rem_chunk_tokens is not None:
             self.rem_chunk_tokens -= extend_input_len
@@ -173,29 +169,7 @@ class PrefillAdder:
         self.log_hit_tokens += prefix_len
         self.log_input_tokens += extend_input_len
 
-    def add_inflight_req_ignore_eos(self, req: Req):
-        truncated = req.extend_input_len > self.rem_chunk_tokens
-        req.extend_input_len = min(req.extend_input_len, self.rem_chunk_tokens)
-        req.fill_ids = req.fill_ids[: len(req.prefix_indices) + req.extend_input_len]
-        self.can_run_list.append(req)
-
-        self._prefill_one_req(
-            0,
-            req.extend_input_len,
-            (
-                min(req.sampling_params.max_new_tokens, CLIP_MAX_NEW_TOKENS)
-                if not truncated
-                else 0
-            ),
-        )
-
-        # Return if chunked prefill not finished
-        return req if truncated else None
-
     def add_inflight_req(self, req: Req):
-        if req.sampling_params.ignore_eos:
-            return self.add_inflight_req_ignore_eos(req)
-
         truncated = req.extend_input_len > self.rem_chunk_tokens
         req.extend_input_len = min(req.extend_input_len, self.rem_chunk_tokens)
         req.fill_ids = req.fill_ids[: len(req.prefix_indices) + req.extend_input_len]
@@ -225,7 +199,7 @@ class PrefillAdder:
         self.rem_total_tokens += delta
 
     def add_one_req_ignore_eos(self, req: Req):
-        def
+        def add_req_state(r, insert_sort=False):
             new_token_ratio = (
                 1.0 if r.sampling_params.ignore_eos else self.new_token_ratio
             )
@@ -235,56 +209,38 @@ class PrefillAdder:
             tokens_occupied = len(r.origin_input_ids) + len(r.output_ids)
 
             if tokens_left > 0:
-
-
-
-
-
-        can_run = False
-        if (
-            req.extend_input_len + req.sampling_params.max_new_tokens
-            <= self.rem_total_tokens
-        ):
-            can_run = True
-
-        if not can_run:
-            if self.req_states is None:
-                self.req_states = []
-                if self.running_batch is not None:
-                    for r in self.running_batch.reqs:
-                        state = get_req_state(r)
-                        if state is not None:
-                            self.req_states.append(state)
-                for r in self.can_run_list:
-                    state = get_req_state(r)
-                    if state is not None:
-                        self.req_states.append(state)
-                state = get_req_state(req)
-                if state is not None:
-                    self.req_states.append(state)
-
-                self.req_states.sort(key=lambda x: x[0])
-            else:
-                state = get_req_state(req)
-                if state is not None:
-                    for i, (tokens_left, tokens_occupied) in enumerate(self.req_states):
-                        if tokens_left >= state[0]:
-                            self.req_states.insert(i, state)
+                if not insert_sort:
+                    self.req_states.append((tokens_left, tokens_occupied))
+                else:
+                    for i in range(len(self.req_states)):
+                        if tokens_left <= self.req_states[i][0]:
                             break
-
-
-
-
-
-
-
-
-
-        )
-
-
-
-
+                    self.req_states.insert(i, (tokens_left, tokens_occupied))
+
+        if self.req_states is None:
+            self.req_states = []
+            add_req_state(req)
+            if self.running_batch is not None:
+                for r in self.running_batch.reqs:
+                    add_req_state(r)
+            for r in self.can_run_list:
+                add_req_state(r)
+            self.req_states.sort(key=lambda x: x[0])
+        else:
+            add_req_state(req, insert_sort=True)
+
+        cur_rem_tokens = self.cur_rem_tokens - len(req.origin_input_ids)
+        tokens_freed = 0
+        for i, (tokens_left, tokens_occupied) in enumerate(self.req_states):
+            decode_steps = (
+                self.req_states[i + 1][0]
+                if i + 1 < len(self.req_states)
+                else tokens_left
+            )
+            bs = len(self.req_states) - i
+            if cur_rem_tokens + tokens_freed - decode_steps * bs <= 0:
+                return False
+            tokens_freed += tokens_occupied
 
         if req.extend_input_len <= self.rem_chunk_tokens:
             self.can_run_list.append(req)
sglang/srt/managers/schedule_batch.py
CHANGED
@@ -40,7 +40,7 @@ global_server_args_dict = {
     "attention_backend": ServerArgs.attention_backend,
     "sampling_backend": ServerArgs.sampling_backend,
     "triton_attention_reduce_in_fp32": ServerArgs.triton_attention_reduce_in_fp32,
-    "
+    "disable_mla": ServerArgs.disable_mla,
     "torchao_config": ServerArgs.torchao_config,
 }
 
sglang/srt/managers/tp_worker.py
CHANGED
@@ -445,9 +445,6 @@ class ModelTpServer:
             num_mixed_running,
         )
 
-        if self.running_batch is not None:
-            adder.remove_running_tokens(self.running_batch)
-
         has_inflight = self.current_inflight_req is not None
         if self.current_inflight_req is not None:
             self.current_inflight_req.init_next_round_input(
@@ -465,9 +462,6 @@ class ModelTpServer:
             )
 
         for req in self.waiting_queue:
-            if adder.no_remaining_tokens():
-                break
-            req.init_next_round_input(None if prefix_computed else self.tree_cache)
             if (
                 self.lora_paths is not None
                 and len(
@@ -478,6 +472,10 @@ class ModelTpServer:
                 > self.max_loras_per_batch
             ):
                 break
+
+            if adder.no_remaining_tokens():
+                break
+            req.init_next_round_input(None if prefix_computed else self.tree_cache)
             res = adder.add_one_req(req)
             if (
                 not res
@@ -507,6 +505,11 @@ class ModelTpServer:
         else:
             tree_cache_hit_rate = 0.0
 
+        num_used = self.max_total_num_tokens - (
+            self.token_to_kv_pool.available_size()
+            + self.tree_cache.evictable_size()
+        )
+
         if num_mixed_running > 0:
             logger.info(
                 f"Prefill batch"
@@ -515,6 +518,7 @@ class ModelTpServer:
                 f"#new-token: {adder.log_input_tokens}, "
                 f"#cached-token: {adder.log_hit_tokens}, "
                 f"cache hit rate: {100.0 * tree_cache_hit_rate:.2f}%, "
+                f"token usage: {num_used / self.max_total_num_tokens:.2f}, "
                 f"#queue-req: {len(self.waiting_queue) - len(can_run_list) + has_inflight}"
             )
         else:
@@ -524,6 +528,7 @@ class ModelTpServer:
                 f"#new-token: {adder.log_input_tokens}, "
                 f"#cached-token: {adder.log_hit_tokens}, "
                 f"cache hit rate: {100.0 * tree_cache_hit_rate:.2f}%, "
+                f"token usage: {num_used / self.max_total_num_tokens:.2f}, "
                 f"#running-req: {running_bs}, "
                 f"#queue-req: {len(self.waiting_queue) - len(can_run_list) + has_inflight}"
             )
sglang/srt/model_executor/cuda_graph_runner.py
CHANGED
@@ -108,6 +108,10 @@ class CudaGraphRunner:
             self.capture_bs = list(range(1, 32)) + [64, 128]
         else:
             self.capture_bs = [1, 2, 4] + [i * 8 for i in range(1, 21)]
+
+        self.capture_bs = [
+            bs for bs in self.capture_bs if bs <= model_runner.req_to_token_pool.size
+        ]
         self.compile_bs = (
             [
                 bs
@@ -118,21 +122,8 @@ class CudaGraphRunner:
             else []
         )
 
-        # Common inputs
-        self.max_bs = max(self.capture_bs)
-        self.input_ids = torch.zeros((self.max_bs,), dtype=torch.int32, device="cuda")
-        self.req_pool_indices = torch.zeros(
-            (self.max_bs,), dtype=torch.int32, device="cuda"
-        )
-        self.seq_lens = torch.ones((self.max_bs,), dtype=torch.int32, device="cuda")
-        self.position_ids_offsets = torch.ones(
-            (self.max_bs,), dtype=torch.int32, device="cuda"
-        )
-        self.out_cache_loc = torch.zeros(
-            (self.max_bs,), dtype=torch.int32, device="cuda"
-        )
-
         # Attention backend
+        self.max_bs = max(self.capture_bs)
         self.model_runner.attn_backend.init_cuda_graph_state(self.max_bs)
         self.seq_len_fill_value = (
             self.model_runner.attn_backend.get_cuda_graph_seq_len_fill_value()
@@ -141,6 +132,16 @@ class CudaGraphRunner:
         if self.use_torch_compile:
             set_torch_compile_config()
 
+        # Common inputs
+        with torch.device("cuda"):
+            self.input_ids = torch.zeros((self.max_bs,), dtype=torch.int32)
+            self.req_pool_indices = torch.zeros((self.max_bs,), dtype=torch.int32)
+            self.seq_lens = torch.full(
+                (self.max_bs,), self.seq_len_fill_value, dtype=torch.int32
+            )
+            self.position_ids_offsets = torch.ones((self.max_bs,), dtype=torch.int32)
+            self.out_cache_loc = torch.zeros((self.max_bs,), dtype=torch.int32)
+
         # Capture
         try:
             self.capture()
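The rewritten buffer allocation above relies on torch.device("cuda") as a context manager, so the per-tensor device="cuda" arguments can be dropped. A small standalone illustration of that PyTorch pattern (independent of sglang; requires a CUDA build of torch):

import torch

# Tensors created inside the context are placed on the given device by default,
# which is why the rewritten code above omits device="cuda" on each call.
with torch.device("cuda"):
    seq_lens = torch.full((8,), 1, dtype=torch.int32)
    out_cache_loc = torch.zeros((8,), dtype=torch.int32)

print(seq_lens.device, out_cache_loc.device)  # cuda:0 cuda:0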
sglang/srt/model_executor/model_runner.py
CHANGED
@@ -86,12 +86,20 @@ class ModelRunner:
         self.is_multimodal_model = is_multimodal_model(
             self.model_config.hf_config.architectures
         )
+
+        if (
+            self.model_config.attention_arch == AttentionArch.MLA
+            and not self.server_args.disable_mla
+        ):
+            logger.info("MLA optimization is tunred on. Use triton backend.")
+            self.server_args.attention_backend = "triton"
+
         global_server_args_dict.update(
             {
                 "attention_backend": server_args.attention_backend,
                 "sampling_backend": server_args.sampling_backend,
                 "triton_attention_reduce_in_fp32": server_args.triton_attention_reduce_in_fp32,
-                "
+                "disable_mla": server_args.disable_mla,
                 "torchao_config": server_args.torchao_config,
             }
         )
@@ -329,7 +337,7 @@ class ModelRunner:
         )
         if (
             self.model_config.attention_arch == AttentionArch.MLA
-            and self.server_args.
+            and not self.server_args.disable_mla
         ):
             cell_size = (
                 (self.model_config.kv_lora_rank + self.model_config.qk_rope_head_dim)
@@ -392,12 +400,12 @@ class ModelRunner:
         )
 
         self.req_to_token_pool = ReqToTokenPool(
-            max_num_reqs,
-            self.model_config.context_len +
+            max_num_reqs + 1,
+            self.model_config.context_len + 4,
         )
         if (
             self.model_config.attention_arch == AttentionArch.MLA
-            and self.server_args.
+            and not self.server_args.disable_mla
         ):
             self.token_to_kv_pool = MLATokenToKVPool(
                 self.max_total_num_tokens,
sglang/srt/models/deepseek_v2.py
CHANGED
@@ -507,7 +507,7 @@ class DeepseekV2DecoderLayer(nn.Module):
         rope_theta = getattr(config, "rope_theta", 10000)
         rope_scaling = getattr(config, "rope_scaling", None)
         max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
-        if global_server_args_dict["
+        if not global_server_args_dict["disable_mla"]:
             self.self_attn = DeepseekV2AttentionMLA(
                 config=config,
                 hidden_size=self.hidden_size,
@@ -732,7 +732,7 @@ class DeepseekV2ForCausalLM(nn.Module):
                 )
                 weight_loader(param, loaded_weight)
 
-        if global_server_args_dict["
+        if not global_server_args_dict["disable_mla"]:
             for layer_id in range(self.config.num_hidden_layers):
                 self_attn = self.model.layers[layer_id].self_attn
                 w_kc, w_vc = self_attn.kv_b_proj.weight.unflatten(
sglang/srt/models/llama.py
CHANGED
@@ -305,8 +305,6 @@ class LlamaForCausalLM(nn.Module):
         self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size)
         self.logits_processor = LogitsProcessor(config)
 
-        self.param_dict = dict(self.named_parameters())
-
     @torch.no_grad()
     def forward(
         self,
@@ -374,7 +372,7 @@ class LlamaForCausalLM(nn.Module):
             (".gate_up_proj", ".gate_proj", 0),
             (".gate_up_proj", ".up_proj", 1),
         ]
-        params_dict = self.
+        params_dict = dict(self.named_parameters())
 
         for name, loaded_weight in weights:
             if "rotary_emb.inv_freq" in name or "projector" in name:
sglang/srt/models/llama_classification.py
CHANGED
@@ -36,6 +36,7 @@ class LlamaForClassification(nn.Module):
     ) -> None:
         super().__init__()
         self.config = config
+        self.torchao_config = None
         self.quant_config = quant_config
         self.model = LlamaModel(config, quant_config=quant_config)
 
@@ -44,8 +45,6 @@ class LlamaForClassification(nn.Module):
         )
         self.eos_token_id = config.eos_token_id
 
-        self.param_dict = dict(self.named_parameters())
-
     @torch.no_grad()
     def forward(
         self,
@@ -77,7 +76,7 @@ class LlamaForClassification(nn.Module):
         return logits_output
 
     def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
-        params_dict = self.
+        params_dict = dict(self.named_parameters())
 
         for name, loaded_weight in weights:
             if "classification_head" in name:
sglang/srt/models/minicpm3.py
CHANGED
@@ -419,7 +419,7 @@ class MiniCPM3DecoderLayer(nn.Module):
         rope_theta = getattr(config, "rope_theta", 10000)
         rope_scaling = getattr(config, "rope_scaling", None)
         max_position_embeddings = getattr(config, "max_position_embeddings", 8192)
-        if global_server_args_dict["
+        if not global_server_args_dict["disable_mla"]:
             self.self_attn = MiniCPM3AttentionMLA(
                 config=config,
                 hidden_size=self.hidden_size,
@@ -653,7 +653,7 @@ class MiniCPM3ForCausalLM(nn.Module):
                 )
                 weight_loader(param, loaded_weight)
 
-        if global_server_args_dict["
+        if not global_server_args_dict["disable_mla"]:
             for layer_id in range(self.config.num_hidden_layers):
                 self_attn = self.model.layers[layer_id].self_attn
                 w_kc, w_vc = self_attn.kv_b_proj.weight.unflatten(
sglang/srt/models/xverse.py
CHANGED
@@ -307,8 +307,6 @@ class XverseForCausalLM(nn.Module):
         self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size)
         self.logits_processor = LogitsProcessor(config)
 
-        self.param_dict = dict(self.named_parameters())
-
     @torch.no_grad()
     def forward(
         self,
@@ -333,7 +331,7 @@ class XverseForCausalLM(nn.Module):
             ("gate_up_proj", "gate_proj", 0),
             ("gate_up_proj", "up_proj", 1),
         ]
-        params_dict = self.
+        params_dict = dict(self.named_parameters())
 
         def load_weights_per_param(name, loaded_weight):
             if "rotary_emb.inv_freq" in name or "projector" in name:
sglang/srt/models/xverse_moe.py
CHANGED
@@ -383,8 +383,6 @@ class XverseMoeForCausalLM(nn.Module):
         )
         self.logits_processor = LogitsProcessor(config)
 
-        self.param_dict = dict(self.named_parameters())
-
     @torch.no_grad()
     def forward(
         self,
@@ -406,8 +404,7 @@ class XverseMoeForCausalLM(nn.Module):
             ("gate_up_proj", "gate_proj", 0),
             ("gate_up_proj", "up_proj", 1),
         ]
-
-        params_dict = self.param_dict
+        params_dict = dict(self.named_parameters())
 
         for name, loaded_weight in weights:
             if "rotary_emb.inv_freq" in name:
sglang/srt/server_args.py
CHANGED
@@ -26,17 +26,6 @@ from sglang.srt.utils import is_hip
 logger = logging.getLogger(__name__)
 
 
-class LoRAPathAction(argparse.Action):
-    def __call__(self, parser, namespace, values, option_string=None):
-        setattr(namespace, self.dest, {})
-        for lora_path in values:
-            if "=" in lora_path:
-                name, path = lora_path.split("=", 1)
-                getattr(namespace, self.dest)[name] = path
-            else:
-                getattr(namespace, self.dest)[lora_path] = lora_path
-
-
 @dataclasses.dataclass
 class ServerArgs:
     # Model and tokenizer
@@ -108,12 +97,12 @@ class ServerArgs:
     disable_cuda_graph_padding: bool = False
     disable_disk_cache: bool = False
    disable_custom_all_reduce: bool = False
+    disable_mla: bool = False
     enable_mixed_chunk: bool = False
     enable_torch_compile: bool = False
     max_torch_compile_bs: int = 32
     torchao_config: str = ""
     enable_p2p_check: bool = False
-    enable_mla: bool = False
     triton_attention_reduce_in_fp32: bool = False
 
     # LoRA
@@ -173,10 +162,6 @@ class ServerArgs:
             self.sampling_backend = "pytorch"
 
         # Default kernel backends
-        if self.enable_mla:
-            logger.info("MLA optimization is tunred on. Use triton backend.")
-            self.attention_backend = "triton"
-
        if self.attention_backend is None:
             self.attention_backend = "flashinfer"
 
@@ -514,6 +499,11 @@ class ServerArgs:
             default=False,
             help="Disable the custom all-reduce kernel and fall back to NCCL.",
         )
+        parser.add_argument(
+            "--disable-mla",
+            action="store_true",
+            help="Disable Multi-head Latent Attention (MLA) for DeepSeek-V2.",
+        )
         parser.add_argument(
             "--enable-mixed-chunk",
             action="store_true",
@@ -541,11 +531,6 @@ class ServerArgs:
             action="store_true",
             help="Enable P2P check for GPU access, otherwise the p2p access is allowed by default.",
         )
-        parser.add_argument(
-            "--enable-mla",
-            action="store_true",
-            help="Enable Multi-head Latent Attention (MLA) for DeepSeek-V2.",
-        )
         parser.add_argument(
             "--triton-attention-reduce-in-fp32",
             action="store_true",
@@ -623,3 +608,14 @@ class PortArgs:
     controller_port: int
     detokenizer_port: int
     nccl_ports: List[int]
+
+
+class LoRAPathAction(argparse.Action):
+    def __call__(self, parser, namespace, values, option_string=None):
+        setattr(namespace, self.dest, {})
+        for lora_path in values:
+            if "=" in lora_path:
+                name, path = lora_path.split("=", 1)
+                getattr(namespace, self.dest)[name] = path
+            else:
+                getattr(namespace, self.dest)[lora_path] = lora_path
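Taken together with the model_runner.py hunks above, `--enable-mla` is replaced by `--disable-mla`, so MLA is now on by default for MLA-capable checkpoints such as DeepSeek-V2. A minimal sketch of the new switch (the model path is illustrative, not taken from the package):

from sglang.srt.server_args import ServerArgs

# MLA is assumed on unless explicitly disabled; the old enable_mla field is gone.
args_default = ServerArgs(model_path="deepseek-ai/DeepSeek-V2-Lite")
args_no_mla = ServerArgs(model_path="deepseek-ai/DeepSeek-V2-Lite", disable_mla=True)
print(args_default.disable_mla, args_no_mla.disable_mla)  # False True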
sglang/test/few_shot_gsm8k.py
CHANGED
@@ -44,7 +44,7 @@ def get_answer_value(answer_str):
     return INVALID
 
 
-def
+def run_eval(args):
     # Select backend
     set_default_backend(RuntimeEndpoint(f"{args.host}:{args.port}"))
 
@@ -119,6 +119,12 @@ def main(args):
     # Dump results
     dump_state_text("tmp_output_gsm8k.txt", states)
 
+    return {
+        "accuracy": acc,
+        "latency": latency,
+        "output_throughput": output_throughput,
+    }
+
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
@@ -129,4 +135,4 @@ if __name__ == "__main__":
     parser.add_argument("--host", type=str, default="http://127.0.0.1")
     parser.add_argument("--port", type=int, default=30000)
     args = parser.parse_args()
-
+    run_eval(args)
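Because the entry point is now `run_eval(args)` and it returns its metrics, other test code can call it directly. A sketch under the assumption that the Namespace carries the same fields as the script's CLI flags (only `--host` and `--port` are visible in this diff; the remaining field names and values are assumptions for illustration):

import argparse

from sglang.test.few_shot_gsm8k import run_eval

# Field names other than host/port are assumed to mirror the script's CLI flags.
args = argparse.Namespace(
    num_shots=5,
    num_questions=200,
    max_new_tokens=512,
    parallel=128,
    data_path=None,
    host="http://127.0.0.1",
    port=30000,
)
metrics = run_eval(args)
print(metrics)  # {"accuracy": ..., "latency": ..., "output_throughput": ...}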
sglang/test/test_utils.py
CHANGED
@@ -22,6 +22,7 @@ from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
 from sglang.srt.utils import kill_child_process
 from sglang.utils import get_exception_traceback
 
+DEFAULT_FP8_MODEL_NAME_FOR_TEST = "neuralmagic/Meta-Llama-3.1-8B-FP8"
 DEFAULT_MODEL_NAME_FOR_TEST = "meta-llama/Meta-Llama-3.1-8B-Instruct"
 DEFAULT_MOE_MODEL_NAME_FOR_TEST = "mistralai/Mixtral-8x7B-Instruct-v0.1"
 DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH = 600
sglang/version.py
CHANGED
@@ -1 +1 @@
-__version__ = "0.3.1.post1"
+__version__ = "0.3.1.post2"
{sglang-0.3.1.post1.dist-info → sglang-0.3.1.post2.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.3.1.post1
+Version: 0.3.1.post2
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
                         Version 2.0, January 2004
@@ -269,7 +269,7 @@ Requires-Dist: peft; extra == "test"
 
 --------------------------------------------------------------------------------
 
-| [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Paper**](https://arxiv.org/abs/2312.07104) | [**Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2ngly9muu-t37XiH87qvD~6rVBTkTEHw) |
+| [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Paper**](https://arxiv.org/abs/2312.07104) | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2ngly9muu-t37XiH87qvD~6rVBTkTEHw) | [**Join Weekly Development Meeting**](https://calendar.app.google/v2Tw3kuHkKYyp8VV7) |
 
 SGLang is a fast serving framework for large language models and vision language models.
 It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.
@@ -278,7 +278,7 @@ The core features include:
 - **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, continuous batching, token attention (paged attention), tensor parallelism, FlashInfer kernels, chunked prefill, and quantization (INT4/FP8/AWQ/GPTQ).
 - **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
 - **Extensive Model Support**: Supports a wide range of generative models (Llama 3, Gemma 2, Mistral, QWen, DeepSeek, LLaVA, etc.) and embedding models (e5-mistral), with easy extensibility for integrating new models.
-- **Active Community**: SGLang is open-source and backed by an active community with industry adoption
+- **Active Community**: SGLang is open-source and backed by an active community with industry adoption.
 
 ## News
 - [2024/09] 🔥 SGLang v0.3 Release: 7x Faster DeepSeek MLA, 1.5x Faster torch.compile, Multi-Image/Video LLaVA-OneVision ([blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)).
@@ -318,7 +318,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
 ### Method 2: From source
 ```
 # Use the last release branch
-git clone -b v0.3.1.post1 https://github.com/sgl-project/sglang.git
+git clone -b v0.3.1.post2 https://github.com/sgl-project/sglang.git
 cd sglang
 
 pip install --upgrade pip
@@ -483,7 +483,6 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 - To enable torch.compile acceleration, add `--enable-torch-compile`. It accelerates small models on small batch sizes.
 - To enable fp8 weight quantization, add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
 - To enable fp8 kv cache quantization, add `--kv-cache-dtype fp8_e5m2`.
-- To enable DeepSeek MLA acceleration, add `--enable-mla`.
 - If the model does not have a chat template in the Hugging Face tokenizer, you can specify a [custom chat template](docs/en/custom_chat_template.md).
 - To run tensor parallelism on multiple nodes, add `--nnodes 2`. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-0` be the hostname of the first node and `50000` be an available port.
 ```
{sglang-0.3.1.post1.dist-info → sglang-0.3.1.post2.dist-info}/RECORD
CHANGED
@@ -1,13 +1,14 @@
 sglang/__init__.py,sha256=T8MYdFfKFPZcgFKHMBpOCIlFbhjwmr77Nqm6mdE6bCY,1590
 sglang/api.py,sha256=pH4CjwOXUweL5MF1sIkFMddDxfnF7PyUxEHC5kvNVbI,6468
-sglang/bench_latency.py,sha256=
-sglang/
+sglang/bench_latency.py,sha256=bA50iUYOxEnLjzY2S4AgwxtSAqujUbGfQFwbLZj5XNc,17160
+sglang/bench_server_latency.py,sha256=KvFJgKQTSons7KOG0CBqnnOOx1gW29bBM1Z3GQO_6-E,5599
+sglang/bench_serving.py,sha256=3gIJ1O2x51Fwd4wYJjgwluTbWKXL-azckQte7YC5zIc,36261
 sglang/check_env.py,sha256=rGRABCgt-0SfUrow4px28b2P59aMn8eVTnN5eZc_a8s,5397
 sglang/global_config.py,sha256=38id86i3tRGCSOFZlN1LM01a3xt-V98xuNgKGG9boCk,1058
 sglang/launch_server.py,sha256=UnjNjYuZ8TtvmRtgYEsFImkbvCwvn_tQjk0V7cHy67E,450
 sglang/launch_server_llavavid.py,sha256=olPKyhozi1coCwoRMwBRYWsTFByrgus9CwPSeNmskgc,1002
 sglang/utils.py,sha256=NA_4xUrTI7KICQ3PEACfNWKE3nxSA5QvQZJNd4TQrDc,9395
-sglang/version.py,sha256=
+sglang/version.py,sha256=U9F0UlFDynnYN5dX-kxehylWCwXo9a6E6W4FfDusfRg,28
 sglang/lang/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sglang/lang/chat_template.py,sha256=uqI_I9zIKXGXg7-W-yjqvx1ZeS_TuwFCms6wkmC2QmY,13411
 sglang/lang/choices.py,sha256=-W1DVw9N9ZliVpvmWrzIXG4cswAah8eMQrHWzkS3D8o,6234
@@ -26,7 +27,7 @@ sglang/srt/conversation.py,sha256=S5w5V6G1xigNxa3UQoSxRcMpQLWWDT9EPBoHBvHkSAk,19
 sglang/srt/hf_transformers_utils.py,sha256=6HlqcmGPIvnSGaEEICeuzwag1QylSoSGbXRVvUdIMDo,6016
 sglang/srt/mm_utils.py,sha256=zox644S3IHUWmADdK4MnIbdTS2DWHOy0_Dq0gCU38QQ,12273
 sglang/srt/server.py,sha256=n4QRn36_t-HAH-lSME3tiZSCUGRQwqMUckgs0paHq5g,20179
-sglang/srt/server_args.py,sha256=
+sglang/srt/server_args.py,sha256=3XjDt6SSjTfbOe0HSXA--2aUvrpWSnQmAHYwmeS1-M0,23159
 sglang/srt/utils.py,sha256=8yxiMRttCcfswynkNPWD3yZFNAGFz2P1PzSuxHCBGns,22340
 sglang/srt/configs/__init__.py,sha256=292SuEorST-lAq2Uvsv2M7yC28uYZlssVvRDsF-bZCQ,86
 sglang/srt/configs/exaone.py,sha256=Duxd4yQoKy8GWEzZD_kCY_OzmN_67CTJL_Kgn0eXk3g,10731
@@ -35,14 +36,14 @@ sglang/srt/constrained/__init__.py,sha256=ze8awDPvwAzdeMwzJ-25kXOQ4nVWoaP55jBDt5
 sglang/srt/constrained/base_tool_cache.py,sha256=5sazBMHHDpHMoqOjuY6itCxwTmIFCflIWEDXMtmrPVs,2006
 sglang/srt/constrained/fsm_cache.py,sha256=k7DRUAaiLTEX5UarfJ17gEYQ-QWQAGfvykjYFkM_Y2U,2982
 sglang/srt/constrained/jump_forward.py,sha256=9_HxmXtWjr5S6a5e0cBimbY3ZhiLiJC74V6jIqDXfuo,6575
-sglang/srt/layers/activation.py,sha256=
+sglang/srt/layers/activation.py,sha256=i3omgj3GdUIZBqJNUjpdJsMc2UM3Lx07FT2J1WICrqA,5171
 sglang/srt/layers/attention_backend.py,sha256=lqMsY4VaOO_szIWoTAinXf1DnP2UsbF32kzvwFySz9w,18119
 sglang/srt/layers/flashinfer_utils.py,sha256=jyaO7XiEisFZg_dfaCbfRCHSHSKYoM1wOzfHa0h1q14,7413
-sglang/srt/layers/layernorm.py,sha256
+sglang/srt/layers/layernorm.py,sha256=p_7bnmSpJ_slpoP0Gk5wQPpHtLllUu3imSIRBqGqTP0,3737
 sglang/srt/layers/logits_processor.py,sha256=Js2qSk1Z3uPL2cYO1ARai51f2i8OedV3qdwByQVSJtI,12439
 sglang/srt/layers/pooler.py,sha256=qNMG3Ycvt2yf9mk1Lcs-2K7oPeCuVeDYoHAxkMu9b_Q,1610
 sglang/srt/layers/radix_attention.py,sha256=EcVO0fUSmgvE_9R-MlpgJq0O_uT8ACuHzbMi19bANYc,1874
-sglang/srt/layers/sampler.py,sha256=
+sglang/srt/layers/sampler.py,sha256=Y0o1bndTGRD713fHMbN5-LRUiyneBkb7bH_QlkkeqSs,3836
 sglang/srt/layers/torchao_utils.py,sha256=rTECwKSXhj_ylh_iSzfbopz9_lZOFHatquQrNJNLZlE,2703
 sglang/srt/layers/fused_moe/__init__.py,sha256=bWCrDdOy2ANEXTb8CHYO63O3Iu3eZnn0PJbgl0z5vvE,75
 sglang/srt/layers/fused_moe/fused_moe.py,sha256=1WM2cObWXcFWtqh_utGJFPnrT344rORwuQ9hJDaH2s0,23104
@@ -56,38 +57,38 @@ sglang/srt/lora/lora_manager.py,sha256=7J7cGmyy1Ph4HCvLdM-ViAizAbV1snZqD-S7JLWXa
 sglang/srt/managers/controller_multi.py,sha256=KolZDso2WqH1ZhQw9p1eTmlFRgo4bcvzBxE44_sNE_o,6300
 sglang/srt/managers/controller_single.py,sha256=DiZALP_iIPZQMRx09a-LwT5_Dg7p-WU8HXyMoxJ9sRA,4955
 sglang/srt/managers/detokenizer_manager.py,sha256=yQkL5gLomLiy1qc6e9HNz8hcj7JQFHm1AfIrzpXaWJE,6852
-sglang/srt/managers/io_struct.py,sha256=
-sglang/srt/managers/policy_scheduler.py,sha256=
-sglang/srt/managers/schedule_batch.py,sha256=
+sglang/srt/managers/io_struct.py,sha256=yNV5BmeUzLPqv19j79kXQ50Iaqdk4vP-_TciiRf4OEE,11396
+sglang/srt/managers/policy_scheduler.py,sha256=PVo0DV0-5ODNN7FkPkeF1Y8BQ6uuLldPETOlB_YvvL4,11560
+sglang/srt/managers/schedule_batch.py,sha256=ns2qkaYAvzul-LCV1BEB6q1t5jKyftNsReMv62PC8M0,27386
 sglang/srt/managers/tokenizer_manager.py,sha256=ql-sObjl1oRigJwnLtqqTaaw-i7gPTDMoNXDEMftr40,29643
-sglang/srt/managers/tp_worker.py,sha256=
+sglang/srt/managers/tp_worker.py,sha256=0Y0k-roDrBxWZxD0axv5CCvUUW8vsJ8n78TANHLzEFs,39503
 sglang/srt/mem_cache/base_prefix_cache.py,sha256=qEQwEkG4E5rab2ZoTqcesf5pR_J4nV2jBxIHsBJHtIM,924
 sglang/srt/mem_cache/chunk_cache.py,sha256=CjZZYlqQzq7mYOiBMLWA5XNb6HIyh5lIMdY-K0OUZEc,2368
 sglang/srt/mem_cache/flush_cache.py,sha256=pTLKPRB17U6vl5RFJJvuJ4jCL2SyomgkUBNlkDpGRqo,978
 sglang/srt/mem_cache/memory_pool.py,sha256=4br3Ea2bfA-YsF_sPOVHlF2zQzYGd8fVaYTp197yZsE,7871
 sglang/srt/mem_cache/radix_cache.py,sha256=0AVr1BKKDOtTyybUkwxrz6PT8khDx-DpzgN5MgL27IE,10088
-sglang/srt/model_executor/cuda_graph_runner.py,sha256=
+sglang/srt/model_executor/cuda_graph_runner.py,sha256=gZ0Wukqz6u67MMIj4MC8JET9jcHdh0rotYzpuPlHruY,10512
 sglang/srt/model_executor/forward_batch_info.py,sha256=yvkhayY9Zu6gysoojcGT73lADGOtfHKkFKWdJLRyACI,6141
-sglang/srt/model_executor/model_runner.py,sha256=
+sglang/srt/model_executor/model_runner.py,sha256=X7AG1k9AI_kqS8q1i5Bfv-kFysIdqJAVWMGGZoAPThY,22726
 sglang/srt/models/baichuan.py,sha256=NrG1rMJXhemkrUCEf8xKOSDQVsOD-nN8RQz6MWHOg84,15124
 sglang/srt/models/chatglm.py,sha256=KwxLHBEvK02McXDvBS0gnRxfIvOAu2QP7lgibrj9Nbc,13371
 sglang/srt/models/commandr.py,sha256=2rAXRZRb4PkJZ4NWEqP_rIgsjxbdZyHpuoMOarqTWzQ,14163
 sglang/srt/models/dbrx.py,sha256=N_0Ku_p1NCsc29NktUBNqPv7Z33XhYxOZK5xN7nzW4s,14661
 sglang/srt/models/deepseek.py,sha256=7UJgde1EV9ey6d-CKRcEyTKh1_WhZdatpZiltIuqpik,16006
-sglang/srt/models/deepseek_v2.py,sha256=
+sglang/srt/models/deepseek_v2.py,sha256=1J0pt1jZRcBBGYbgt1wGiuxPcrdpfTEUEaGFqju6TVA,28431
 sglang/srt/models/exaone.py,sha256=3I5ZoiLotf7U-8c9QJRubpgf6JDx9I_z-ViXQlCC-x8,13087
 sglang/srt/models/gemma.py,sha256=GkwgGFHgGlXgBZN7s7Wooz5tMyCp1YtgLahU2NOo66M,12273
 sglang/srt/models/gemma2.py,sha256=sFfCNEm0_OOWElRSTDuroRv8wNMX8v_81Uko9m546KA,14923
 sglang/srt/models/gpt_bigcode.py,sha256=kzHYogeGXZF4KHpkXA-RGqvs016mA-6klWxD2QJTi9E,10195
 sglang/srt/models/grok.py,sha256=6I4OwQwNyAbh5GF24_SRm12XYBvM9iGWB-T4TSTJ0wU,14929
 sglang/srt/models/internlm2.py,sha256=6j7JH0p3yib8GZDH8Cmrs-pgwfH3eOlAK6V3Cq64O7w,12202
-sglang/srt/models/llama.py,sha256=
-sglang/srt/models/llama_classification.py,sha256=
+sglang/srt/models/llama.py,sha256=nbJwRcG9DnurVNSGLKJjnmBmTXP1_5WZpudth_0PVpw,15216
+sglang/srt/models/llama_classification.py,sha256=HF-69J9qIYdfX0R5wEtIgvafMzprKcXdvF3W_orl_kA,3394
 sglang/srt/models/llama_embedding.py,sha256=RI2mpYheP5WwhuTINU-6IrU61usuMyCK9h2zDEyLW4g,3458
 sglang/srt/models/llava.py,sha256=O4XGdl70Hh4tM_OHapFGHbReC82mbe9xLw6GELKWKhU,24881
 sglang/srt/models/llavavid.py,sha256=ou5uIuskBoBo0lXvqFFfDLBYYVfehx27n-Lu8X9gpLs,11992
 sglang/srt/models/minicpm.py,sha256=ioqCsTCE_oF8xqGF5fm5cK9dclK5Y0EQ1UJfyteIDDo,13825
-sglang/srt/models/minicpm3.py,sha256=
+sglang/srt/models/minicpm3.py,sha256=McPWyy2fQqfHUhi9Nk36rkvvPAS8RmLOY7Vh4ah5c1w,25216
 sglang/srt/models/mistral.py,sha256=tiYoKjyYVzlQl52QUZ33odD2yCxj9dxcqln474VuZOw,744
 sglang/srt/models/mixtral.py,sha256=oRC7mKBrPJhvzkWSabrbeQQQac-jtF4EV6H2Sgjc5JY,13897
 sglang/srt/models/mixtral_quant.py,sha256=wMACJq78OTWj7HlqPDRNEh8cjrVAjKqJEsOG3CO5xow,14072
@@ -96,8 +97,8 @@ sglang/srt/models/qwen.py,sha256=nqSRzkiZzpRVG6WGQ1MBUclQnXyw8jlvoOq-euM8j5s,995
 sglang/srt/models/qwen2.py,sha256=9_M-VkHN1_T1XN-gsl_L636QMQ9BLF2WqvTcx_1L6aw,12432
 sglang/srt/models/qwen2_moe.py,sha256=s7b5XnSvsBYtZZUkjPp442m59CqPJ3HxGUIwXBVWsXw,17153
 sglang/srt/models/stablelm.py,sha256=30ngpc0Xq3VxzXJlf6svP1oax8Q3krMJkxM8PVKtZWU,11359
-sglang/srt/models/xverse.py,sha256=
-sglang/srt/models/xverse_moe.py,sha256=
+sglang/srt/models/xverse.py,sha256=L3g32-je_7JmzF2-hztaIVshHYCIv7jOM3oFs-fb2MY,13658
+sglang/srt/models/xverse_moe.py,sha256=CgDD9cR83UVfTsPU6WcbHVYBrkYKv_kTdwncTIx7Q7U,15842
 sglang/srt/models/yivl.py,sha256=B6MELthWIm5KdSzX3o2tbbpApY8XdjUdmcQSD4dQe_I,4835
 sglang/srt/openai_api/adapter.py,sha256=CJ47YftRHAip1FMcHIhtCorBtzlIkv7F0Wz_JUcI4T4,51032
 sglang/srt/openai_api/protocol.py,sha256=rdSwUAoO5-KLemJOE50xwSUagxY4T1QIiNyCYsTtCi0,9868
@@ -109,7 +110,7 @@ sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py,sha256=IvYioX53Vq
 sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py,sha256=XJZP0C4NFyXgcODbIWXxrgVEjmRgqLdZuVAtoN-LveY,3565
 sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py,sha256=0PlANTrR959foTA3Nj5qBE7ndaOZgG-9X6LhzlmEUc8,2533
 sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py,sha256=v9jOgA0-I31WcrhIydiFbpy2ZJPLytFLGM98NRPd2sU,2820
-sglang/test/few_shot_gsm8k.py,sha256=
+sglang/test/few_shot_gsm8k.py,sha256=To7Sdg-DLF8poIQLwiOBYKbkz-1C_gn6H79vIbyPR-o,3860
 sglang/test/run_eval.py,sha256=NWxeLWmInBgkCvC9Jr_QzF7GfAiBve3Gf1JQrEOlNlU,3899
 sglang/test/runners.py,sha256=ZoWhT1TDXfLBVdbivXx1KUu9dhPlGjL_xrP18WLzVLo,11404
 sglang/test/simple_eval_common.py,sha256=r0G-9QLycs2ax3RMc44T_61fzMxlpTzv6pececC7lyY,12379
@@ -121,10 +122,10 @@ sglang/test/simple_eval_mmlu.py,sha256=FkwamjGMjueTixymkedF-YiPloSLiy4ftILFUrKZ9
 sglang/test/test_activation.py,sha256=jkdNRzJnbd5OgZliQaIXpxovlcky17UrweomcOcMxoE,1442
 sglang/test/test_layernorm.py,sha256=IacByD5d-stXjzBz8Ypamc7povlcedpKPbb_4JLgo3c,3720
 sglang/test/test_programs.py,sha256=3-XKnppQdCNWjaJb6jwib5Z9OSpgKvH8SFLJbE4J9qI,17001
-sglang/test/test_utils.py,sha256=
+sglang/test/test_utils.py,sha256=dsHRd1xLzcjlarxUnDIz2XEHfut7HvqVPwx2Fn7vf10,17179
 sglang/test/srt/sampling/penaltylib/utils.py,sha256=-0p0rV-P4lNo7xAe3rQSBHTubc50a-DFyOQmLGAkgkQ,12515
-sglang-0.3.1.
-sglang-0.3.1.
-sglang-0.3.1.
-sglang-0.3.1.
-sglang-0.3.1.
+sglang-0.3.1.post2.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+sglang-0.3.1.post2.dist-info/METADATA,sha256=WxMy8Ur_rjPxqVOoWSFoM3eBHWt0cKGyrtwOUfWL-Vc,38114
+sglang-0.3.1.post2.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
+sglang-0.3.1.post2.dist-info/top_level.txt,sha256=yxhh3pYQkcnA7v3Bg889C2jZhvtJdEincysO7PEB09M,7
+sglang-0.3.1.post2.dist-info/RECORD,,
{sglang-0.3.1.post1.dist-info → sglang-0.3.1.post2.dist-info}/LICENSE
File without changes
{sglang-0.3.1.post1.dist-info → sglang-0.3.1.post2.dist-info}/WHEEL
File without changes
{sglang-0.3.1.post1.dist-info → sglang-0.3.1.post2.dist-info}/top_level.txt
File without changes