sglang 0.1.3__py3-none-any.whl → 0.1.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +1 -1
- sglang/srt/layers/context_flashattention_nopad.py +8 -1
- sglang/srt/layers/extend_attention.py +47 -1
- sglang/srt/managers/router/model_rpc.py +2 -1
- sglang/srt/utils.py +1 -1
- {sglang-0.1.3.dist-info → sglang-0.1.4.dist-info}/METADATA +20 -6
- {sglang-0.1.3.dist-info → sglang-0.1.4.dist-info}/RECORD +10 -10
- {sglang-0.1.3.dist-info → sglang-0.1.4.dist-info}/LICENSE +0 -0
- {sglang-0.1.3.dist-info → sglang-0.1.4.dist-info}/WHEEL +0 -0
- {sglang-0.1.3.dist-info → sglang-0.1.4.dist-info}/top_level.txt +0 -0
sglang/__init__.py
CHANGED
(hunk not shown in this view)
sglang/srt/layers/context_flashattention_nopad.py
CHANGED
@@ -6,6 +6,9 @@ import triton.language as tl
|
|
6
6
|
from sglang.srt.utils import wrap_kernel_launcher
|
7
7
|
|
8
8
|
|
9
|
+
CUDA_CAPABILITY = torch.cuda.get_device_capability()
|
10
|
+
|
11
|
+
|
9
12
|
@triton.jit
|
10
13
|
def _fwd_kernel(
|
11
14
|
Q,
|
@@ -120,7 +123,11 @@ cached_kernel = None
|
|
120
123
|
|
121
124
|
|
122
125
|
def context_attention_fwd(q, k, v, o, b_start_loc, b_seq_len, max_input_len):
|
123
|
-
|
126
|
+
if CUDA_CAPABILITY[0] >= 8:
|
127
|
+
BLOCK = 128
|
128
|
+
else:
|
129
|
+
BLOCK = 64
|
130
|
+
|
124
131
|
Lq, Lk, Lv = q.shape[-1], k.shape[-1], v.shape[-1]
|
125
132
|
assert Lq == Lk and Lk == Lv
|
126
133
|
assert Lk in {16, 32, 64, 128}
|
sglang/srt/layers/extend_attention.py
CHANGED
@@ -2,6 +2,10 @@ import torch
|
|
2
2
|
import triton
|
3
3
|
import triton.language as tl
|
4
4
|
from sglang.srt.layers.context_flashattention_nopad import context_attention_fwd
|
5
|
+
from sglang.srt.utils import wrap_kernel_launcher
|
6
|
+
|
7
|
+
|
8
|
+
CUDA_CAPABILITY = torch.cuda.get_device_capability()
|
5
9
|
|
6
10
|
|
7
11
|
@triton.jit
|
@@ -153,6 +157,9 @@ def _fwd_kernel(
|
|
153
157
|
tl.store(O_Extend + offs_o, acc / deno[:, None], mask=mask_m[:, None])
|
154
158
|
|
155
159
|
|
160
|
+
cached_kernel = None
|
161
|
+
|
162
|
+
|
156
163
|
def extend_attention_fwd(
|
157
164
|
q_extend,
|
158
165
|
k_extend,
|
@@ -175,7 +182,11 @@ def extend_attention_fwd(
|
|
175
182
|
|
176
183
|
k_buffer, v_buffer: (prefix + extend) tensors in mem_manager
|
177
184
|
"""
|
178
|
-
|
185
|
+
if CUDA_CAPABILITY[0] >= 8:
|
186
|
+
BLOCK_M, BLOCK_N = 128, 128
|
187
|
+
else:
|
188
|
+
BLOCK_M, BLOCK_N = 64, 64
|
189
|
+
|
179
190
|
Lq, Lk, Lv, Lo = (
|
180
191
|
q_extend.shape[-1],
|
181
192
|
k_extend.shape[-1],
|
@@ -193,6 +204,40 @@ def extend_attention_fwd(
|
|
193
204
|
num_warps = 4 if Lk <= 64 else 8
|
194
205
|
num_stages = 1
|
195
206
|
|
207
|
+
global cached_kernel
|
208
|
+
if cached_kernel:
|
209
|
+
cached_kernel(
|
210
|
+
grid,
|
211
|
+
num_warps,
|
212
|
+
q_extend,
|
213
|
+
k_extend,
|
214
|
+
v_extend,
|
215
|
+
o_extend,
|
216
|
+
k_buffer,
|
217
|
+
v_buffer,
|
218
|
+
req_to_tokens,
|
219
|
+
b_req_idx,
|
220
|
+
b_seq_len,
|
221
|
+
b_start_loc_extend,
|
222
|
+
b_seq_len_extend,
|
223
|
+
sm_scale,
|
224
|
+
kv_group_num,
|
225
|
+
q_extend.stride(0),
|
226
|
+
q_extend.stride(1),
|
227
|
+
k_extend.stride(0),
|
228
|
+
k_extend.stride(1),
|
229
|
+
v_extend.stride(0),
|
230
|
+
v_extend.stride(1),
|
231
|
+
o_extend.stride(0),
|
232
|
+
o_extend.stride(1),
|
233
|
+
k_buffer.stride(0),
|
234
|
+
k_buffer.stride(1),
|
235
|
+
v_buffer.stride(0),
|
236
|
+
v_buffer.stride(1),
|
237
|
+
req_to_tokens.stride(0),
|
238
|
+
)
|
239
|
+
return
|
240
|
+
|
196
241
|
_fwd_kernel[grid](
|
197
242
|
q_extend,
|
198
243
|
k_extend,
|
@@ -226,6 +271,7 @@ def extend_attention_fwd(
|
|
226
271
|
num_warps=num_warps,
|
227
272
|
num_stages=num_stages,
|
228
273
|
)
|
274
|
+
cached_kernel = wrap_kernel_launcher(_fwd_kernel)
|
229
275
|
|
230
276
|
|
231
277
|
def redundant_attention(
|
sglang/srt/managers/router/model_rpc.py
CHANGED
@@ -5,6 +5,7 @@ import time
|
|
5
5
|
from concurrent.futures import ThreadPoolExecutor
|
6
6
|
from enum import Enum, auto
|
7
7
|
from typing import Dict, List, Optional, Tuple, Union
|
8
|
+
import warnings
|
8
9
|
|
9
10
|
import numpy as np
|
10
11
|
import rpyc
|
@@ -164,7 +165,7 @@ class ModelRpcServer(rpyc.Service):
|
|
164
165
|
+ self.tree_cache.evictable_size()
|
165
166
|
)
|
166
167
|
if available_size != self.max_total_num_token:
|
167
|
-
|
168
|
+
warnings.warn(
|
168
169
|
"Warning: "
|
169
170
|
f"available_size={available_size}, max_total_num_token={self.max_total_num_token}\n"
|
170
171
|
"KV cache pool leak detected!"
|
sglang/srt/utils.py
CHANGED
@@ -209,7 +209,7 @@ def load_image(image_file):
|
|
209
209
|
elif image_file.lower().endswith(("png", "jpg", "jpeg", "webp", "gif")):
|
210
210
|
image = Image.open(image_file)
|
211
211
|
elif image_file.startswith("data:"):
|
212
|
-
image_file =
|
212
|
+
image_file = image_file.split(",")[1]
|
213
213
|
image = Image.open(BytesIO(base64.b64decode(image_file)))
|
214
214
|
else:
|
215
215
|
image = Image.open(BytesIO(base64.b64decode(image_file)))
|
{sglang-0.1.3.dist-info → sglang-0.1.4.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: sglang
|
3
|
-
Version: 0.1.3
|
3
|
+
Version: 0.1.4
|
4
4
|
Summary: A structured generation language for LLMs.
|
5
5
|
License: Apache License
|
6
6
|
Version 2.0, January 2004
|
@@ -267,10 +267,20 @@ pip install --upgrade pip
|
|
267
267
|
pip install -e "python[all]"
|
268
268
|
```
|
269
269
|
|
270
|
+
### Notes
|
271
|
+
- If you are using older GPUs (NVIDIA T4, V100), please use `pip install "triton>=2.2.0"` to avoid some bugs in the triton compiler
|
272
|
+
- If you only need to use the OpenAI backend, you can avoid installing other dependencies by using `pip install sglang[openai]`
|
273
|
+
|
270
274
|
## Quick Start
|
271
275
|
The example below shows how to use sglang to answer a multi-turn question.
|
272
276
|
|
273
277
|
### Using OpenAI Models
|
278
|
+
Set the OpenAI API Key
|
279
|
+
```
|
280
|
+
export OPENAI_API_KEY=sk-xxxxxx
|
281
|
+
```
|
282
|
+
|
283
|
+
Then, answer a multi-turn question.
|
274
284
|
```python
|
275
285
|
from sglang import function, system, user, assistant, gen, set_default_backend, OpenAI
|
276
286
|
|
@@ -334,7 +344,7 @@ To begin with, import sglang.
|
|
334
344
|
import sglang as sgl
|
335
345
|
```
|
336
346
|
|
337
|
-
`sglang` provides some simple primitives such as `gen`, `select`, `fork`.
|
347
|
+
`sglang` provides some simple primitives such as `gen`, `select`, `fork`, `image`.
|
338
348
|
You can implement your prompt flow in a function decorated by `sgl.function`.
|
339
349
|
You can then invoke the function with `run` or `run_batch`.
|
340
350
|
The system will manage the state, chat template, and parallelism for you.
|
@@ -382,10 +392,10 @@ def image_qa(s, image_file, question):
|
|
382
392
|
|
383
393
|
### Constrained Decoding
|
384
394
|
```python
|
385
|
-
@function
|
395
|
+
@sgl.function
|
386
396
|
def regular_expression_gen(s):
|
387
397
|
s += "Q: What is the IP address of the Google DNS servers?\n"
|
388
|
-
s += "A: " + gen(
|
398
|
+
s += "A: " + sgl.gen(
|
389
399
|
"answer",
|
390
400
|
temperature=0,
|
391
401
|
regex=r"((25[0-5]|2[0-4]\d|[01]?\d\d?).){3}(25[0-5]|2[0-4]\d|[01]?\d\d?)",
|
@@ -426,7 +436,7 @@ for out in state.text_iter():
|
|
426
436
|
## Backend: SGLang Runtime (SRT)
|
427
437
|
The SGLang Runtime (SRT) is designed to work best with the SGLang frontend.
|
428
438
|
However, it can also be used as a standalone API server.
|
429
|
-
In this case, the RadixAttention can still greatly accelerate many use cases.
|
439
|
+
In this case, the [RadixAttention](https://arxiv.org/abs/2312.07104) can still greatly accelerate many use cases with automatic KV cache reuse.
|
430
440
|
|
431
441
|
### Usage
|
432
442
|
Launch a server
|
@@ -450,6 +460,10 @@ curl http://localhost:30000/v1/completions \
|
|
450
460
|
```
|
451
461
|
python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --tp 2
|
452
462
|
```
|
463
|
+
- If you see out-of-memory errors during serving, please try to reduce the memory usage of the KV cache pool by setting a smaller value of `--mem-fraction-static`. The default value is `0.9`
|
464
|
+
```
|
465
|
+
python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --mem-fraction-static 0.7
|
466
|
+
```
|
453
467
|
|
454
468
|
### Supported Models
|
455
469
|
- Llama
|
@@ -466,7 +480,7 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
|
|
466
480
|
- Mixtral-8x7B on NVIDIA A10G, FP16, Tensor Parallelism=8
|
467
481
|

|
468
482
|
|
469
|
-
Learn more [here]().
|
483
|
+
Learn more [here](docs/benchmark_results.md).
|
470
484
|
|
471
485
|
## Roadmap
|
472
486
|
- [ ] Function call
|
{sglang-0.1.3.dist-info → sglang-0.1.4.dist-info}/RECORD
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
sglang/__init__.py,sha256=
|
1
|
+
sglang/__init__.py,sha256=lfYPLrb_Fy-J-l7NGMhnRDk0hlvAkCIzJEjzN6AsV0g,95
|
2
2
|
sglang/api.py,sha256=tJuEyB28BUQfl0-dQr4vi6UMHBhUbmyu9Z3iAE5xFcU,3883
|
3
3
|
sglang/flush_cache.py,sha256=cCD_MTlQ5qEv__w0nOthDnVitdAfyscYjksBljwC5Mw,1835
|
4
4
|
sglang/global_config.py,sha256=PAX7TWeFcq0HBzNUWyCONAOjqIokWqw8vT7I6sBSKTc,797
|
@@ -24,13 +24,13 @@ sglang/srt/model_config.py,sha256=R7YaR8H8AmCJl_1XcSP0zII_5ebZNl0wMXNVANGWd2c,99
|
|
24
24
|
sglang/srt/sampling_params.py,sha256=Sd9l_uIIuS_mhbzljKwTGDO9ESMviNOYGxOifc71RrY,2895
|
25
25
|
sglang/srt/server.py,sha256=XxTS1K4N5y-ZknLBQefxk1UxC50l6DABVqJOrJ-NG74,6388
|
26
26
|
sglang/srt/server_args.py,sha256=Fpj3To5hEgmWn9qCS-pfypOEh34x9xVmiHBoEx5Smbo,4932
|
27
|
-
sglang/srt/utils.py,sha256
|
27
|
+
sglang/srt/utils.py,sha256=-2F99bqYT99x1jScMjciJxgQec6CaH6PcCHSmrKHhhY,5692
|
28
28
|
sglang/srt/constrained/fsm.py,sha256=H4kXSsV4IX2ow5TMmnmd-8ho4qqJ5mpVZ4MOH5FUtnY,12900
|
29
29
|
sglang/srt/constrained/fsm_cache.py,sha256=KX4bFX5hj0W66SC9pSvst1ew7etaOMTtTC75z0enRME,1087
|
30
30
|
sglang/srt/constrained/regex.py,sha256=CcV7KBOKS2ZxGoEr6BHG5okagNIGEXYvGvhKXu5gtDA,18689
|
31
31
|
sglang/srt/constrained/tokenizer.py,sha256=rei9yKHFETcbDPOpI7bpIYdrBFgIBhGr_U-zb3r5Beo,7951
|
32
|
-
sglang/srt/layers/context_flashattention_nopad.py,sha256=
|
33
|
-
sglang/srt/layers/extend_attention.py,sha256=
|
32
|
+
sglang/srt/layers/context_flashattention_nopad.py,sha256=yTQBOo-kKKu5F-YFBo26lCWKtyaGae1M5gsn2GZpfAE,5205
|
33
|
+
sglang/srt/layers/extend_attention.py,sha256=nERsTpimVwdF-gHXmjy3D7zbSb4RrbVswmlzuA2NpWA,12559
|
34
34
|
sglang/srt/layers/get_selected_logprob.py,sha256=CpMXM9WXMSB-AVaxBB_aVl1Qx_ZtAFFnjDTm4CgNDpU,2199
|
35
35
|
sglang/srt/layers/logits_processor.py,sha256=rwcXwdZ7-dW9zvJX3MF_EHSxMLbU7TIQ9xUIYRu-WAs,3013
|
36
36
|
sglang/srt/layers/radix_attention.py,sha256=hmPNFg2TkN4EAVUj376N_89RRtUYRwFgUpjj5SydnRk,6170
|
@@ -41,7 +41,7 @@ sglang/srt/managers/openai_protocol.py,sha256=Eid_734Wup4jsL1ZS2Op0vwRuzvNbF4mV2
|
|
41
41
|
sglang/srt/managers/tokenizer_manager.py,sha256=jVwr0lM18RFJLhDb5TWlUpQ4Q8tALT4L6GY0jmaZkLw,7861
|
42
42
|
sglang/srt/managers/router/infer_batch.py,sha256=UfS1uVhGnM-62Xv1cfu_IoTeIUxkjkKc4W3trtGbadc,11541
|
43
43
|
sglang/srt/managers/router/manager.py,sha256=H-T-LlnIssHw-FXMHbs3yDQewkTMBCqG6jTYjugopCA,2527
|
44
|
-
sglang/srt/managers/router/model_rpc.py,sha256=
|
44
|
+
sglang/srt/managers/router/model_rpc.py,sha256=G7NvEDguSNj-ZAXBo7GpNQJJHW5WAy_1-qQ7bzqltTU,19286
|
45
45
|
sglang/srt/managers/router/model_runner.py,sha256=U-SBnEeLvwolLcaxyxrPgVG7PnR2rRvuXWV50t9y0Fo,16480
|
46
46
|
sglang/srt/managers/router/radix_cache.py,sha256=ZQPm9HhQ7vD3Gl5nhuvw3ZW4ZRARcplqWed1GYUvHCg,6441
|
47
47
|
sglang/srt/managers/router/scheduler.py,sha256=ejuIRwqqMZVXFKUionRJxy5AtNvK25YoGRO9rFY-rc8,2926
|
@@ -50,8 +50,8 @@ sglang/srt/models/llava.py,sha256=COS0IC6Yo-QiwKe5emgCbtEe9HgaSu5tt6CQA7UtV38,85
|
|
50
50
|
sglang/srt/models/mixtral.py,sha256=j91xOt6NZ5tJiyTPqmUSzgJqFAw7vTDnfBtEs5x0jDM,13714
|
51
51
|
sglang/test/test_programs.py,sha256=ua3wufnS3x6d_U3aboY4ivqoglrRPZj18j96vuiUtiE,11348
|
52
52
|
sglang/test/test_utils.py,sha256=Knxg3BTA6d_7XSlprbBCdvfDr2SN5x7LhkT-tZFk5EQ,4828
|
53
|
-
sglang-0.1.
|
54
|
-
sglang-0.1.
|
55
|
-
sglang-0.1.
|
56
|
-
sglang-0.1.
|
57
|
-
sglang-0.1.
|
53
|
+
sglang-0.1.4.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
54
|
+
sglang-0.1.4.dist-info/METADATA,sha256=qYnmtS2k2ddncYRUOCZoE_SXEli5cFD7yh_JkP7IVWk,22676
|
55
|
+
sglang-0.1.4.dist-info/WHEEL,sha256=oiQVh_5PnQM0E3gPdiz09WCNmwiHDMaGer_elqB3coM,92
|
56
|
+
sglang-0.1.4.dist-info/top_level.txt,sha256=yxhh3pYQkcnA7v3Bg889C2jZhvtJdEincysO7PEB09M,7
|
57
|
+
sglang-0.1.4.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|