sglang 0.3.4.post1__py3-none-any.whl → 0.3.4.post2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/srt/configs/model_config.py +25 -2
- sglang/srt/constrained/fsm_cache.py +10 -3
- sglang/srt/hf_transformers_utils.py +14 -0
- sglang/srt/layers/attention/flashinfer_backend.py +5 -5
- sglang/srt/layers/logits_processor.py +5 -5
- sglang/srt/layers/rotary_embedding.py +15 -48
- sglang/srt/layers/sampler.py +51 -39
- sglang/srt/managers/data_parallel_controller.py +1 -1
- sglang/srt/managers/detokenizer_manager.py +4 -0
- sglang/srt/managers/io_struct.py +10 -0
- sglang/srt/managers/schedule_batch.py +13 -3
- sglang/srt/managers/scheduler.py +8 -2
- sglang/srt/managers/tokenizer_manager.py +14 -0
- sglang/srt/managers/tp_worker_overlap_thread.py +58 -21
- sglang/srt/mem_cache/memory_pool.py +10 -3
- sglang/srt/model_executor/cuda_graph_runner.py +29 -21
- sglang/srt/model_executor/forward_batch_info.py +6 -9
- sglang/srt/model_executor/model_runner.py +2 -2
- sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +6 -3
- sglang/srt/sampling/sampling_params.py +5 -7
- sglang/srt/server.py +12 -0
- sglang/test/run_eval.py +2 -0
- sglang/test/srt/sampling/penaltylib/utils.py +1 -0
- sglang/test/test_utils.py +100 -3
- sglang/version.py +1 -1
- {sglang-0.3.4.post1.dist-info → sglang-0.3.4.post2.dist-info}/METADATA +13 -14
- {sglang-0.3.4.post1.dist-info → sglang-0.3.4.post2.dist-info}/RECORD +30 -30
- {sglang-0.3.4.post1.dist-info → sglang-0.3.4.post2.dist-info}/LICENSE +0 -0
- {sglang-0.3.4.post1.dist-info → sglang-0.3.4.post2.dist-info}/WHEEL +0 -0
- {sglang-0.3.4.post1.dist-info → sglang-0.3.4.post2.dist-info}/top_level.txt +0 -0
sglang/srt/mem_cache/memory_pool.py
CHANGED
@@ -38,7 +38,7 @@ class ReqToTokenPool:
         self.size = size
         self.max_context_len = max_context_len
         self.device = device
-        self.req_to_token = torch.
+        self.req_to_token = torch.zeros(
             (size, max_context_len), dtype=torch.int32, device=device
         )
         self.free_slots = list(range(size))
@@ -51,7 +51,7 @@ class ReqToTokenPool:
         self.write = self.write_without_records

     def write(self, indices, values):
-        # Keep the signature for type checking
+        # Keep the signature for type checking. It will be assigned during runtime.
         raise NotImplementedError()

     def available_size(self):
@@ -223,7 +223,6 @@ class MHATokenToKVPool(BaseTokenToKVPool):
         layer_id = layer.layer_id
         if cache_k.dtype != self.dtype:
             cache_k = cache_k.to(self.dtype)
-        if cache_v.dtype != self.dtype:
             cache_v = cache_v.to(self.dtype)
         if self.store_dtype != self.dtype:
             self.k_buffer[layer_id][loc] = cache_k.view(self.store_dtype)
@@ -233,6 +232,14 @@ class MHATokenToKVPool(BaseTokenToKVPool):
             self.v_buffer[layer_id][loc] = cache_v


+# This compiled version is slower in the unit test
+# python3 -m unittest test_bench_serving.TestBenchServing.test_offline_throughput_non_stream_small_batch_size
+@torch.compile(dynamic=True)
+def copy_two_array(loc, dst_1, src_1, dst_2, src_2, dtype, store_dtype):
+    dst_1[loc] = src_1.to(dtype).view(store_dtype)
+    dst_2[loc] = src_2.to(dtype).view(store_dtype)
+
+
 class MLATokenToKVPool(BaseTokenToKVPool):

     def __init__(
sglang/srt/model_executor/cuda_graph_runner.py
CHANGED
@@ -92,6 +92,11 @@ def set_torch_compile_config():
     torch._dynamo.config.accumulated_cache_size_limit = 1024


+@torch.compile(dynamic=True)
+def clamp_position(seq_lens):
+    return torch.clamp((seq_lens - 1), min=0).to(torch.int64)
+
+
 class CudaGraphRunner:
     """A CudaGraphRunner runs the forward pass of a model with cuda graph and torch.compile."""

@@ -111,8 +116,7 @@ class CudaGraphRunner:
         if self.model_runner.server_args.disable_cuda_graph_padding:
             self.capture_bs = list(range(1, 32)) + [64, 128]
         else:
-            self.capture_bs = [1, 2, 4] + [i * 8 for i in range(1, 21)]
-
+            self.capture_bs = [1, 2, 3, 4] + [i * 8 for i in range(1, 21)]
         self.capture_bs = [
             bs for bs in self.capture_bs if bs <= model_runner.req_to_token_pool.size
         ]
@@ -129,6 +133,7 @@ class CudaGraphRunner:
         # Attention backend
         self.max_bs = max(self.capture_bs)
         self.model_runner.attn_backend.init_cuda_graph_state(self.max_bs)
+
         self.seq_len_fill_value = (
             self.model_runner.attn_backend.get_cuda_graph_seq_len_fill_value()
         )
@@ -147,6 +152,7 @@ class CudaGraphRunner:
             (self.max_bs,), self.seq_len_fill_value, dtype=torch.int32
         )
         self.out_cache_loc = torch.zeros((self.max_bs,), dtype=torch.int32)
+        self.mrope_positions = torch.zeros((3, self.max_bs), dtype=torch.int32)

         if self.is_encoder_decoder:
             # NOTE: encoder_lens can influence the full_text_row_masked_out_mask tensor when doing mixed batch
@@ -228,6 +234,7 @@ class CudaGraphRunner:
             encoder_lens = None

         seq_lens_sum = seq_lens.sum().item()
+        mrope_positions = self.mrope_positions[:, :bs]

         # Attention backend
         self.model_runner.attn_backend.init_forward_metadata_capture_cuda_graph(
@@ -253,9 +260,11 @@ class CudaGraphRunner:
                 encoder_lens=encoder_lens,
                 return_logprob=False,
                 top_logprobs_nums=[0] * bs,
-                positions=
+                positions=clamp_position(seq_lens),
+                mrope_positions=mrope_positions,
             )
-
+            logits_output = forward(input_ids, forward_batch.positions, forward_batch)
+            return logits_output.next_token_logits

         for _ in range(2):
             torch.cuda.synchronize()
@@ -286,7 +295,7 @@ class CudaGraphRunner:
         index = bisect.bisect_left(self.capture_bs, raw_bs)
         bs = self.capture_bs[index]
         if bs != raw_bs:
-            self.seq_lens.fill_(
+            self.seq_lens.fill_(1)
             self.out_cache_loc.zero_()

         # Common inputs
@@ -296,35 +305,30 @@ class CudaGraphRunner:
         self.out_cache_loc[:raw_bs].copy_(forward_batch.out_cache_loc)
         if self.is_encoder_decoder:
             self.encoder_lens[:raw_bs].copy_(forward_batch.encoder_lens)
+        if forward_batch.mrope_positions is not None:
+            self.mrope_positions[:, :raw_bs].copy_(forward_batch.mrope_positions)

         # Attention backend
         self.model_runner.attn_backend.init_forward_metadata_replay_cuda_graph(
             bs,
             self.req_pool_indices,
             self.seq_lens,
-            forward_batch.seq_lens_sum,
+            forward_batch.seq_lens_sum + (bs - raw_bs),
             self.encoder_lens,
         )

         # Replay
         self.graphs[bs].replay()
-
-
-        # Unpad
-        if bs != raw_bs:
-            logits_output = LogitsProcessorOutput(
-                next_token_logits=logits_output.next_token_logits[:raw_bs],
-                next_token_logprobs=None,
-                normalized_prompt_logprobs=None,
-                input_token_logprobs=None,
-                input_top_logprobs=None,
-                output_top_logprobs=None,
-            )
+        next_token_logits = self.output_buffers[bs][:raw_bs]

         # Extract logprobs
         if forward_batch.return_logprob:
-
-
+            next_token_logprobs = torch.nn.functional.log_softmax(
+                next_token_logits, dim=-1
+            )
+            logits_output = LogitsProcessorOutput(
+                next_token_logits=next_token_logits,
+                next_token_logprobs=next_token_logprobs,
             )
             return_top_logprob = any(x > 0 for x in forward_batch.top_logprobs_nums)
             if return_top_logprob:
@@ -333,7 +337,11 @@ class CudaGraphRunner:
                     top_logprobs_nums=forward_batch.top_logprobs_nums,
                 )
                 logits_output.output_top_logprobs = LogitsProcessor.get_top_logprobs(
-
+                    next_token_logprobs, logits_metadata
                 )[1]
+        else:
+            logits_output = LogitsProcessorOutput(
+                next_token_logits=next_token_logits,
+            )

         return logits_output
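The `clamp_position` helper introduced above keeps padded CUDA-graph slots valid: a dummy slot whose sequence length is 0 would otherwise get decode position -1. The snippet below is a small illustration, not part of the diff; the `@torch.compile(dynamic=True)` decorator is omitted for brevity and the example `seq_lens` values are made up.

```python
# Illustration of the clamp_position helper from the diff above.
# The @torch.compile(dynamic=True) decorator is dropped here for brevity;
# the example seq_lens values are arbitrary.
import torch


def clamp_position(seq_lens):
    # Decode position is seq_len - 1, clamped so padded slots stay at 0.
    return torch.clamp((seq_lens - 1), min=0).to(torch.int64)


seq_lens = torch.tensor([5, 1, 0, 0], dtype=torch.int32)  # last two are padding
print(clamp_position(seq_lens))  # tensor([4, 0, 0, 0])
```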
sglang/srt/model_executor/forward_batch_info.py
CHANGED
@@ -142,11 +142,12 @@ class ForwardBatch:
                     int(self.seq_lens[i]),
                 )
         elif self.forward_mode.is_extend():
+            extend_start_loc_cpu = self.extend_start_loc.cpu().numpy()
             for i, image_inputs in enumerate(batch.image_inputs):
                 extend_start_loc, extend_seq_len, extend_prefix_len = (
-
-
-
+                    extend_start_loc_cpu[i],
+                    batch.extend_seq_lens[i],
+                    batch.extend_prefix_lens[i],
                 )
                 if image_inputs is None:
                     # text only
@@ -160,20 +161,16 @@ class ForwardBatch:
                     ] * 3
                     mrope_position_delta = 0
                 else:
+                    # TODO: current qwen2-vl do not support radix cache since mrope position calculation
                     mrope_positions, mrope_position_delta = (
                         MRotaryEmbedding.get_input_positions(
                             input_tokens=self.input_ids[
                                 extend_start_loc : extend_start_loc + extend_seq_len
-                            ]
+                            ],
                             image_grid_thw=image_inputs.image_grid_thws,
-                            video_grid_thw=None,
-                            image_token_id=hf_config.image_token_id,
-                            video_token_id=hf_config.video_token_id,
                             vision_start_token_id=hf_config.vision_start_token_id,
-                            vision_end_token_id=hf_config.vision_end_token_id,
                             spatial_merge_size=hf_config.vision_config.spatial_merge_size,
                             context_len=0,
-                            extend_prefix_len=extend_prefix_len.item(),
                         )
                     )
                     mrope_positions_list[i] = mrope_positions
sglang/srt/model_executor/model_runner.py
CHANGED
@@ -125,11 +125,11 @@ class ModelRunner:
             )
             server_args.chunked_prefill_size = None
             server_args.mem_fraction_static *= 0.95
-            # TODO: qwen2-vl does not support
+            # TODO: qwen2-vl does not support radix cache now, set disable_radix_cache=True automatically
             if self.model_config.hf_config.architectures == [
                 "Qwen2VLForConditionalGeneration"
             ]:
-                server_args.
+                server_args.disable_radix_cache = True

         # Global vars
         if server_args.show_time_cost:
sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py
CHANGED
@@ -31,9 +31,12 @@ class BatchedMinNewTokensPenalizer(_BatchedPenalizer):
         padded_stop_token_ids = torch.nn.utils.rnn.pad_sequence(
             sequences=[
                 torch.tensor(
-                    data=
-
-
+                    data=(
+                        list(
+                            (req.sampling_params.stop_token_ids or set())
+                            | (req.tokenizer.additional_stop_token_ids or set())
+                            | {req.tokenizer.eos_token_id}
+                        )
                     ),
                     dtype=torch.int64,
                     device=self.orchestrator.device,
sglang/srt/sampling/sampling_params.py
CHANGED
@@ -50,9 +50,10 @@ class SamplingParams:
         self.presence_penalty = presence_penalty
         self.repetition_penalty = repetition_penalty
         self.stop_strs = stop
-        if stop_token_ids
-        stop_token_ids =
-
+        if stop_token_ids:
+            self.stop_token_ids = set(stop_token_ids)
+        else:
+            self.stop_token_ids = None
         self.max_new_tokens = max_new_tokens
         self.min_new_tokens = min_new_tokens
         self.ignore_eos = ignore_eos
@@ -119,10 +120,7 @@ class SamplingParams:
         # Process stop strings
         if self.stop_strs is None:
             self.stop_strs = []
-
-                self.stop_str_max_len = 0
-            else:
-                self.stop_str_max_len = 1
+            self.stop_str_max_len = 0
         else:
             if isinstance(self.stop_strs, str):
                 self.stop_strs = [self.stop_strs]
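As a quick check of the `stop_token_ids` handling shown above, a non-empty list is normalized to a set while an empty or missing value becomes `None`. The snippet below is a hedged sketch, not part of the diff; it assumes `stop_token_ids` is a keyword argument of `SamplingParams`, and the token id values are made up.

```python
# Sketch of the stop_token_ids normalization added in the diff above.
# Assumes SamplingParams accepts stop_token_ids as a keyword argument;
# the token id values are arbitrary examples.
from sglang.srt.sampling.sampling_params import SamplingParams

p = SamplingParams(stop_token_ids=[128001, 128009])
assert p.stop_token_ids == {128001, 128009}  # normalized to a set

q = SamplingParams(stop_token_ids=None)
assert q.stop_token_ids is None  # falsy input stays None
```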
sglang/srt/server.py
CHANGED
@@ -172,6 +172,18 @@ async def stop_profile():
     )


+@app.api_route("/get_memory_pool_size", methods=["GET", "POST"])
+async def get_memory_pool_size():
+    """Get the memory pool size in number of tokens"""
+    try:
+        ret = await tokenizer_manager.get_memory_pool_size()
+        return ret.size
+    except Exception as e:
+        return JSONResponse(
+            {"error": {"message": str(e)}}, status_code=HTTPStatus.BAD_REQUEST
+        )
+
+
 @app.post("/update_weights")
 async def update_weights(obj: UpdateWeightReqInput, request: Request):
     """Update the weights inplace without re-launching the server."""
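For reference, the new `/get_memory_pool_size` route can be queried with any HTTP client once a server is running. The snippet below is a minimal sketch, not part of the diff; the host and port are placeholder assumptions.

```python
# Minimal sketch (not part of the diff): query the /get_memory_pool_size route.
# The host/port below are placeholders for a locally running sglang server.
import requests

resp = requests.get("http://127.0.0.1:30000/get_memory_pool_size")
resp.raise_for_status()
print("memory pool size (tokens):", resp.json())
```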
sglang/test/run_eval.py
CHANGED
@@ -67,6 +67,7 @@ def run_eval(args):
         model=args.model,
         max_tokens=2048,
         base_url=base_url,
+        temperature=getattr(args, "temperature", 0.0),
     )

     # Run eval
@@ -119,6 +120,7 @@ if __name__ == "__main__":
     parser.add_argument("--eval-name", type=str, default="mmlu")
     parser.add_argument("--num-examples", type=int)
     parser.add_argument("--num-threads", type=int, default=512)
+    parser.add_argument("--temperature", type=float, default=0.0)
     args = parser.parse_args()

     run_eval(args)
sglang/test/test_utils.py
CHANGED
@@ -3,6 +3,7 @@
 import argparse
 import asyncio
 import os
+import random
 import subprocess
 import threading
 import time
@@ -20,6 +21,7 @@ from sglang.global_config import global_config
 from sglang.lang.backend.openai import OpenAI
 from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
 from sglang.srt.utils import kill_child_process
+from sglang.test.run_eval import run_eval
 from sglang.utils import get_exception_traceback

 DEFAULT_FP8_MODEL_NAME_FOR_TEST = "neuralmagic/Meta-Llama-3.1-8B-FP8"
@@ -400,7 +402,7 @@ def popen_launch_server(
     api_key: Optional[str] = None,
     other_args: tuple = (),
     env: Optional[dict] = None,
-    return_stdout_stderr:
+    return_stdout_stderr: Optional[tuple] = None,
 ):
     _, host, port = base_url.split(":")
     host = host[2:]
@@ -423,8 +425,8 @@
     if return_stdout_stderr:
         process = subprocess.Popen(
             command,
-            stdout=
-            stderr=
+            stdout=return_stdout_stderr[0],
+            stderr=return_stdout_stderr[1],
             env=env,
             text=True,
         )
@@ -631,3 +633,98 @@ def calculate_rouge_l(output_strs_list1, output_strs_list2):
         rouge_l_scores.append(fmeasure)

     return rouge_l_scores
+
+
+STDOUT_FILENAME = "stdout.txt"
+STDERR_FILENAME = "stderr.txt"
+
+
+def read_output(output_lines):
+    """Print the output in real time with another thread."""
+    while not os.path.exists(STDERR_FILENAME):
+        time.sleep(1)
+
+    pt = 0
+    while pt >= 0:
+        if pt > 0 and not os.path.exists(STDERR_FILENAME):
+            break
+        lines = open(STDERR_FILENAME).readlines()
+        for line in lines[pt:]:
+            print(line, end="", flush=True)
+            output_lines.append(line)
+            pt += 1
+        time.sleep(0.1)
+
+
+def run_mmlu_test(
+    disable_radix_cache,
+    enable_mixed_chunk=False,
+    enable_overlap=False,
+    chunked_prefill_size=32,
+):
+    other_args = ["--chunked-prefill-size", str(chunked_prefill_size)]
+    if disable_radix_cache:
+        other_args += ["--disable-radix-cache"]
+    if enable_mixed_chunk:
+        other_args += ["--enable-mixed-chunk"]
+    if enable_overlap:
+        other_args += ["--enable-overlap-scheduler"]
+
+    model = DEFAULT_MODEL_NAME_FOR_TEST
+    port = random.randint(4000, 5000)
+    base_url = f"http://127.0.0.1:{port}"
+
+    # Create files and launch the server
+    stdout = open(STDOUT_FILENAME, "w")
+    stderr = open(STDERR_FILENAME, "w")
+    process = popen_launch_server(
+        model,
+        base_url,
+        timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+        other_args=other_args,
+        return_stdout_stderr=(stdout, stderr),
+    )
+
+    # Launch a thread to stream the output
+    output_lines = []
+    t = threading.Thread(target=read_output, args=(output_lines,))
+    t.start()
+
+    # Run the eval
+    args = SimpleNamespace(
+        base_url=base_url,
+        model=model,
+        eval_name="mmlu",
+        num_examples=128,
+        num_threads=128,
+    )
+
+    try:
+        metrics = run_eval(args)
+        print(f"{metrics=}")
+        assert metrics["score"] >= 0.65
+    finally:
+        pass
+
+    # Clean up everything
+    kill_child_process(process.pid)
+    kill_child_process(process.pid)
+    stdout.close()
+    stderr.close()
+    if os.path.exists(STDOUT_FILENAME):
+        os.remove(STDOUT_FILENAME)
+    if os.path.exists(STDERR_FILENAME):
+        os.remove(STDERR_FILENAME)
+    t.join()
+
+    # Assert success
+    has_new_server = False
+    has_leak = False
+    for line in output_lines:
+        if "The server is fired" in line:
+            has_new_server = True
+        if "leak" in line:
+            has_leak = True
+
+    assert has_new_server
+    # assert not has_leak
sglang/version.py
CHANGED
@@ -1 +1 @@
-__version__ = "0.3.4.post1"
+__version__ = "0.3.4.post2"
{sglang-0.3.4.post1.dist-info → sglang-0.3.4.post2.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.3.4.post1
+Version: 0.3.4.post2
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
 Version 2.0, January 2004
@@ -328,23 +328,27 @@ You can install SGLang using any of the methods below.
 pip install --upgrade pip
 pip install "sglang[all]"

-# Install FlashInfer
+# Install FlashInfer accelerated kernels
 pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
 ```

+Note: Please check the [FlashInfer installation doc](https://docs.flashinfer.ai/installation.html) to install the proper version according to your PyTorch and CUDA versions.
+
 ### Method 2: From source
 ```
 # Use the last release branch
-git clone -b v0.3.4.post1 https://github.com/sgl-project/sglang.git
+git clone -b v0.3.4.post2 https://github.com/sgl-project/sglang.git
 cd sglang

 pip install --upgrade pip
 pip install -e "python[all]"

-# Install FlashInfer
+# Install FlashInfer accelerated kernels
 pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
 ```

+Note: Please check the [FlashInfer installation doc](https://docs.flashinfer.ai/installation.html) to install the proper version according to your PyTorch and CUDA versions.
+
 ### Method 3: Using docker
 The docker images are available on Docker Hub as [lmsysorg/sglang](https://hub.docker.com/r/lmsysorg/sglang/tags), built from [Dockerfile](https://github.com/sgl-project/sglang/tree/main/docker).
 Replace `<secret>` below with your huggingface hub [token](https://huggingface.co/docs/hub/en/security-tokens).
@@ -498,7 +502,8 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 ```
 python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --chunked-prefill-size 4096
 ```
-- To enable
+- To enable the experimental overlapped scheduler, add `--enable-overlap-scheduler`. It overlaps CPU scheduler with GPU computation and can accelerate almost all workloads. This does not work for constrained decoding currenly.
+- To enable torch.compile acceleration, add `--enable-torch-compile`. It accelerates small models on small batch sizes. This does not work for FP8 currenly.
 - To enable torchao quantization, add `--torchao-config int4wo-128`. It supports various quantization strategies.
 - To enable fp8 weight quantization, add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
 - To enable fp8 kv cache quantization, add `--kv-cache-dtype fp8_e5m2`.
@@ -519,7 +524,6 @@ We also provide an inference engine **without a HTTP server**. For example,
 ```python
 import sglang as sgl

-
 def main():
     prompts = [
         "Hello, my name is",
@@ -539,12 +543,8 @@ if __name__ == "__main__":
     main()
 ```

-This can be used for
-
-1. **Offline Batch Inference**
-2. **Building Custom Servers**
-
-You can view the full example [here](https://github.com/sgl-project/sglang/tree/main/examples/runtime/engine)
+This can be used for offline batch inference and building custom servers.
+You can view the full example [here](https://github.com/sgl-project/sglang/tree/main/examples/runtime/engine).

 ### Supported Models

@@ -552,7 +552,7 @@ You can view the full example [here](https://github.com/sgl-project/sglang/tree/
 - Llama / Llama 2 / Llama 3 / Llama 3.1
 - Mistral / Mixtral / Mistral NeMo
 - Gemma / Gemma 2
-- Qwen / Qwen 2 / Qwen 2 MoE
+- Qwen / Qwen 2 / Qwen 2 MoE / Qwen 2 VL
 - DeepSeek / DeepSeek 2
 - OLMoE
 - [LLaVA-OneVision](https://llava-vl.github.io/blog/2024-08-05-llava-onevision/)
@@ -712,7 +712,6 @@ print(state["answer_1"])
 ```

 #### More Examples
-
 Anthropic and VertexAI (Gemini) models are also supported.
 You can find more examples at [examples/quick_start](examples/frontend_language/quick_start).