sglang 0.1.2__tar.gz → 0.1.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sglang-0.1.2 → sglang-0.1.4}/PKG-INFO +101 -5
- {sglang-0.1.2 → sglang-0.1.4}/README.md +100 -4
- {sglang-0.1.2 → sglang-0.1.4}/pyproject.toml +1 -1
- {sglang-0.1.2 → sglang-0.1.4}/sglang/__init__.py +1 -1
- {sglang-0.1.2 → sglang-0.1.4}/sglang/srt/layers/context_flashattention_nopad.py +8 -1
- {sglang-0.1.2 → sglang-0.1.4}/sglang/srt/layers/extend_attention.py +47 -1
- {sglang-0.1.2 → sglang-0.1.4}/sglang/srt/managers/router/model_rpc.py +2 -1
- {sglang-0.1.2 → sglang-0.1.4}/sglang/srt/utils.py +1 -1
- {sglang-0.1.2 → sglang-0.1.4}/sglang.egg-info/PKG-INFO +101 -5
- {sglang-0.1.2 → sglang-0.1.4}/LICENSE +0 -0
- {sglang-0.1.2 → sglang-0.1.4}/setup.cfg +0 -0
- {sglang-0.1.2 → sglang-0.1.4}/sglang/api.py +0 -0
- {sglang-0.1.2 → sglang-0.1.4}/sglang/backend/__init__.py +0 -0
- {sglang-0.1.2 → sglang-0.1.4}/sglang/backend/anthropic.py +0 -0
- {sglang-0.1.2 → sglang-0.1.4}/sglang/backend/base_backend.py +0 -0
- {sglang-0.1.2 → sglang-0.1.4}/sglang/backend/huggingface.py +0 -0
- {sglang-0.1.2 → sglang-0.1.4}/sglang/backend/openai.py +0 -0
- {sglang-0.1.2 → sglang-0.1.4}/sglang/backend/runtime_endpoint.py +0 -0
- {sglang-0.1.2 → sglang-0.1.4}/sglang/backend/tgi.py +0 -0
- {sglang-0.1.2 → sglang-0.1.4}/sglang/flush_cache.py +0 -0
- {sglang-0.1.2 → sglang-0.1.4}/sglang/global_config.py +0 -0
- {sglang-0.1.2 → sglang-0.1.4}/sglang/lang/__init__.py +0 -0
- {sglang-0.1.2 → sglang-0.1.4}/sglang/lang/chat_template.py +0 -0
- {sglang-0.1.2 → sglang-0.1.4}/sglang/lang/compiler.py +0 -0
- {sglang-0.1.2 → sglang-0.1.4}/sglang/lang/interpreter.py +0 -0
- {sglang-0.1.2 → sglang-0.1.4}/sglang/lang/ir.py +0 -0
- {sglang-0.1.2 → sglang-0.1.4}/sglang/lang/tracer.py +0 -0
- {sglang-0.1.2 → sglang-0.1.4}/sglang/launch_server.py +0 -0
- {sglang-0.1.2 → sglang-0.1.4}/sglang/srt/backend_config.py +0 -0
- {sglang-0.1.2 → sglang-0.1.4}/sglang/srt/constrained/fsm.py +0 -0
- {sglang-0.1.2 → sglang-0.1.4}/sglang/srt/constrained/fsm_cache.py +0 -0
- {sglang-0.1.2 → sglang-0.1.4}/sglang/srt/constrained/regex.py +0 -0
- {sglang-0.1.2 → sglang-0.1.4}/sglang/srt/constrained/tokenizer.py +0 -0
- {sglang-0.1.2 → sglang-0.1.4}/sglang/srt/hf_transformers_utils.py +0 -0
- {sglang-0.1.2 → sglang-0.1.4}/sglang/srt/layers/get_selected_logprob.py +0 -0
- {sglang-0.1.2 → sglang-0.1.4}/sglang/srt/layers/logits_processor.py +0 -0
- {sglang-0.1.2 → sglang-0.1.4}/sglang/srt/layers/radix_attention.py +0 -0
- {sglang-0.1.2 → sglang-0.1.4}/sglang/srt/layers/token_attention.py +0 -0
- {sglang-0.1.2 → sglang-0.1.4}/sglang/srt/managers/detokenizer_manager.py +0 -0
- {sglang-0.1.2 → sglang-0.1.4}/sglang/srt/managers/io_struct.py +0 -0
- {sglang-0.1.2 → sglang-0.1.4}/sglang/srt/managers/openai_protocol.py +0 -0
- {sglang-0.1.2 → sglang-0.1.4}/sglang/srt/managers/router/infer_batch.py +0 -0
- {sglang-0.1.2 → sglang-0.1.4}/sglang/srt/managers/router/manager.py +0 -0
- {sglang-0.1.2 → sglang-0.1.4}/sglang/srt/managers/router/model_runner.py +0 -0
- {sglang-0.1.2 → sglang-0.1.4}/sglang/srt/managers/router/radix_cache.py +0 -0
- {sglang-0.1.2 → sglang-0.1.4}/sglang/srt/managers/router/scheduler.py +0 -0
- {sglang-0.1.2 → sglang-0.1.4}/sglang/srt/managers/tokenizer_manager.py +0 -0
- {sglang-0.1.2 → sglang-0.1.4}/sglang/srt/memory_pool.py +0 -0
- {sglang-0.1.2 → sglang-0.1.4}/sglang/srt/model_config.py +0 -0
- {sglang-0.1.2 → sglang-0.1.4}/sglang/srt/models/llama2.py +0 -0
- {sglang-0.1.2 → sglang-0.1.4}/sglang/srt/models/llava.py +0 -0
- {sglang-0.1.2 → sglang-0.1.4}/sglang/srt/models/mixtral.py +0 -0
- {sglang-0.1.2 → sglang-0.1.4}/sglang/srt/sampling_params.py +0 -0
- {sglang-0.1.2 → sglang-0.1.4}/sglang/srt/server.py +0 -0
- {sglang-0.1.2 → sglang-0.1.4}/sglang/srt/server_args.py +0 -0
- {sglang-0.1.2 → sglang-0.1.4}/sglang/test/test_programs.py +0 -0
- {sglang-0.1.2 → sglang-0.1.4}/sglang/test/test_utils.py +0 -0
- {sglang-0.1.2 → sglang-0.1.4}/sglang/utils.py +0 -0
- {sglang-0.1.2 → sglang-0.1.4}/sglang.egg-info/SOURCES.txt +0 -0
- {sglang-0.1.2 → sglang-0.1.4}/sglang.egg-info/dependency_links.txt +0 -0
- {sglang-0.1.2 → sglang-0.1.4}/sglang.egg-info/requires.txt +0 -0
- {sglang-0.1.2 → sglang-0.1.4}/sglang.egg-info/top_level.txt +0 -0

{sglang-0.1.2 → sglang-0.1.4}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.1.2
+Version: 0.1.4
 Summary: A structured generation langauge for LLMs.
 License: Apache License
                         Version 2.0, January 2004
@@ -267,10 +267,20 @@ pip install --upgrade pip
 pip install -e "python[all]"
 ```

+### Notes
+- If you are using older GPUs (NVIDIA T4, V100), please use `pip install "triton>=2.2.0"` to avoid some bugs in the triton compiler
+- If you only need to use the OpenAI backend, you can avoid installing other dependencies by using `pip install sglang[openai]`
+
 ## Quick Start
 The example below shows how to use sglang to answer a mulit-turn question.

 ### Using OpenAI Models
+Set the OpenAI API Key
+```
+export OPENAI_API_KEY=sk-xxxxxx
+```
+
+Then, answer a multi-turn question.
 ```python
 from sglang import function, system, user, assistant, gen, set_default_backend, OpenAI

@@ -329,30 +339,104 @@ You can find more examples at [examples/quick_start](examples/quick_start).

 ## Frontend: Structured Generation Langauge (SGLang)

+To begin with, import sglang.
+```python
+import sglang as sgl
+```
+
+`sglang` provides some simple primitives such as `gen`, `select`, `fork`, `image`.
+You can implement your prompt flow in a function decorated by `sgl.function`.
+You can then invoke the function with `run` or `run_batch`.
+The system will manage the state, chat template, and parallelism for you.
+
 ### Control Flow
+```python
+@sgl.function
+def control_flow(s, question):
+    s += "To answer this question: " + question + ", "
+    s += "I need to use a " + sgl.gen("tool", choices=["calculator", "web browser"]) + ". "
+
+    # You can use if or nested function calls
+    if s["tool"] == "calculator":
+        s += "The math expression is" + sgl.gen("expression")
+    elif s["tool"] == "web browser":
+        s += "The website url is" + sgl.gen("url")
+```

 ### Parallelism
+```python
+@sgl.function
+def tip_suggestion(s):
+    s += (
+        "Here are two tips for staying healthy: "
+        "1. Balanced Diet. 2. Regular Exercise.\n\n"
+    )
+
+    forks = s.fork(2) # Launch parallel prompts
+    for i, f in enumerate(forks):
+        f += f"Now, expand tip {i+1} into a paragraph:\n"
+        f += sgl.gen(f"detailed_tip", max_tokens=256, stop="\n\n")
+
+    s += "Tip 1:" + forks[0]["detailed_tip"] + "\n"
+    s += "Tip 2:" + forks[1]["detailed_tip"] + "\n"
+    s += "In summary" + sgl.gen("summary")
+```

 ### Multi Modality
 ```python
 @sgl.function
 def image_qa(s, image_file, question):
     s += sgl.user(sgl.image(image_file) + question)
-    s += sgl.assistant(sgl.gen("
+    s += sgl.assistant(sgl.gen("answer", max_tokens=256)
 ```

-### Constrained
+### Constrained Decoding
+```python
+@sgl.function
+def regular_expression_gen(s):
+    s += "Q: What is the IP address of the Google DNS servers?\n"
+    s += "A: " + sgl.gen(
+        "answer",
+        temperature=0,
+        regex=r"((25[0-5]|2[0-4]\d|[01]?\d\d?).){3}(25[0-5]|2[0-4]\d|[01]?\d\d?)",
+    )
+```

 ### Batching
+```python
+@sgl.function
+def text_qa(s, question):
+    s += "Q: " + question + "\n"
+    s += "A:" + sgl.gen("answer", stop="\n")
+
+states = text_qa.run_batch(
+    [
+        {"question": "What is the capital of the United Kingdom?"},
+        {"question": "What is the capital of France?"},
+        {"question": "What is the capital of Japan?"},
+    ],
+)
+```

 ### Streaming
+```python
+@sgl.function
+def text_qa(s, question):
+    s += "Q: " + question + "\n"
+    s += "A:" + sgl.gen("answer", stop="\n")

-
+states = text_qa.run(
+    question="What is the capital of France?",
+    temperature=0.1)
+
+for out in state.text_iter():
+    print(out, end="", flush=True)
+```

 ## Backend: SGLang Runtime (SRT)
 The SGLang Runtime (SRT) is designed to work best with the SGLang frontend.
 However, it can also be used as a standalone API server.
-In this case, the RadixAttention can still greatly accelerate many use cases.
+In this case, the [RadixAttention](https://arxiv.org/abs/2312.07104) can still greatly accelerate many use cases with automatic KV cache reuse.

 ### Usage
 Launch a server
@@ -376,6 +460,10 @@ curl http://localhost:30000/v1/completions \
 ```
 python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --tp 2
 ```
+- If you see out-of-memory errors during serving, please try to reduce the memory usage of the KV cache pool by setting a smaller value of `--mem-fraction-static`. The default value is `0.9`
+```
+python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --mem-fraction-static 0.7
+```

 ### Supported Models
 - Llama
@@ -386,6 +474,14 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port

 ## Benchmark And Performance

+- Llama-7B on NVIDIA A10G, FP16, Tensor Parallelism=1
+![llama_7b](assets/llama_7b.jpg)
+
+- Mixtral-8x7B on NVIDIA A10G, FP16, Tensor Parallelism=8
+![mixtral_8x7b](assets/mixtral_8x7b.jpg)
+
+Learn more [here](docs/benchmark_results.md).
+
 ## Roadmap
 - [ ] Function call
 - [ ] Quantization

{sglang-0.1.2 → sglang-0.1.4}/README.md

@@ -32,10 +32,20 @@ pip install --upgrade pip
 pip install -e "python[all]"
 ```

+### Notes
+- If you are using older GPUs (NVIDIA T4, V100), please use `pip install "triton>=2.2.0"` to avoid some bugs in the triton compiler
+- If you only need to use the OpenAI backend, you can avoid installing other dependencies by using `pip install sglang[openai]`
+
 ## Quick Start
 The example below shows how to use sglang to answer a mulit-turn question.

 ### Using OpenAI Models
+Set the OpenAI API Key
+```
+export OPENAI_API_KEY=sk-xxxxxx
+```
+
+Then, answer a multi-turn question.
 ```python
 from sglang import function, system, user, assistant, gen, set_default_backend, OpenAI

@@ -94,30 +104,104 @@ You can find more examples at [examples/quick_start](examples/quick_start).

 ## Frontend: Structured Generation Langauge (SGLang)

+To begin with, import sglang.
+```python
+import sglang as sgl
+```
+
+`sglang` provides some simple primitives such as `gen`, `select`, `fork`, `image`.
+You can implement your prompt flow in a function decorated by `sgl.function`.
+You can then invoke the function with `run` or `run_batch`.
+The system will manage the state, chat template, and parallelism for you.
+
 ### Control Flow
+```python
+@sgl.function
+def control_flow(s, question):
+    s += "To answer this question: " + question + ", "
+    s += "I need to use a " + sgl.gen("tool", choices=["calculator", "web browser"]) + ". "
+
+    # You can use if or nested function calls
+    if s["tool"] == "calculator":
+        s += "The math expression is" + sgl.gen("expression")
+    elif s["tool"] == "web browser":
+        s += "The website url is" + sgl.gen("url")
+```

 ### Parallelism
+```python
+@sgl.function
+def tip_suggestion(s):
+    s += (
+        "Here are two tips for staying healthy: "
+        "1. Balanced Diet. 2. Regular Exercise.\n\n"
+    )
+
+    forks = s.fork(2) # Launch parallel prompts
+    for i, f in enumerate(forks):
+        f += f"Now, expand tip {i+1} into a paragraph:\n"
+        f += sgl.gen(f"detailed_tip", max_tokens=256, stop="\n\n")
+
+    s += "Tip 1:" + forks[0]["detailed_tip"] + "\n"
+    s += "Tip 2:" + forks[1]["detailed_tip"] + "\n"
+    s += "In summary" + sgl.gen("summary")
+```

 ### Multi Modality
 ```python
 @sgl.function
 def image_qa(s, image_file, question):
     s += sgl.user(sgl.image(image_file) + question)
-    s += sgl.assistant(sgl.gen("
+    s += sgl.assistant(sgl.gen("answer", max_tokens=256)
 ```

-### Constrained
+### Constrained Decoding
+```python
+@sgl.function
+def regular_expression_gen(s):
+    s += "Q: What is the IP address of the Google DNS servers?\n"
+    s += "A: " + sgl.gen(
+        "answer",
+        temperature=0,
+        regex=r"((25[0-5]|2[0-4]\d|[01]?\d\d?).){3}(25[0-5]|2[0-4]\d|[01]?\d\d?)",
+    )
+```

 ### Batching
+```python
+@sgl.function
+def text_qa(s, question):
+    s += "Q: " + question + "\n"
+    s += "A:" + sgl.gen("answer", stop="\n")
+
+states = text_qa.run_batch(
+    [
+        {"question": "What is the capital of the United Kingdom?"},
+        {"question": "What is the capital of France?"},
+        {"question": "What is the capital of Japan?"},
+    ],
+)
+```

 ### Streaming
+```python
+@sgl.function
+def text_qa(s, question):
+    s += "Q: " + question + "\n"
+    s += "A:" + sgl.gen("answer", stop="\n")

-
+states = text_qa.run(
+    question="What is the capital of France?",
+    temperature=0.1)
+
+for out in state.text_iter():
+    print(out, end="", flush=True)
+```

 ## Backend: SGLang Runtime (SRT)
 The SGLang Runtime (SRT) is designed to work best with the SGLang frontend.
 However, it can also be used as a standalone API server.
-In this case, the RadixAttention can still greatly accelerate many use cases.
+In this case, the [RadixAttention](https://arxiv.org/abs/2312.07104) can still greatly accelerate many use cases with automatic KV cache reuse.

 ### Usage
 Launch a server
@@ -141,6 +225,10 @@ curl http://localhost:30000/v1/completions \
 ```
 python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --tp 2
 ```
+- If you see out-of-memory errors during serving, please try to reduce the memory usage of the KV cache pool by setting a smaller value of `--mem-fraction-static`. The default value is `0.9`
+```
+python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --mem-fraction-static 0.7
+```

 ### Supported Models
 - Llama
@@ -151,6 +239,14 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port

 ## Benchmark And Performance

+- Llama-7B on NVIDIA A10G, FP16, Tensor Parallelism=1
+![llama_7b](assets/llama_7b.jpg)
+
+- Mixtral-8x7B on NVIDIA A10G, FP16, Tensor Parallelism=8
+![mixtral_8x7b](assets/mixtral_8x7b.jpg)
+
+Learn more [here](docs/benchmark_results.md).
+
 ## Roadmap
 - [ ] Function call
 - [ ] Quantization

{sglang-0.1.2 → sglang-0.1.4}/sglang/srt/layers/context_flashattention_nopad.py

@@ -6,6 +6,9 @@ import triton.language as tl
 from sglang.srt.utils import wrap_kernel_launcher


+CUDA_CAPABILITY = torch.cuda.get_device_capability()
+
+
 @triton.jit
 def _fwd_kernel(
     Q,
@@ -120,7 +123,11 @@ cached_kernel = None


 def context_attention_fwd(q, k, v, o, b_start_loc, b_seq_len, max_input_len):
-
+    if CUDA_CAPABILITY[0] >= 8:
+        BLOCK = 128
+    else:
+        BLOCK = 64
+
     Lq, Lk, Lv = q.shape[-1], k.shape[-1], v.shape[-1]
     assert Lq == Lk and Lk == Lv
     assert Lk in {16, 32, 64, 128}
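
The change above keys the Triton tile size off the GPU's compute capability: Ampere-class and newer cards (capability 8.x or higher) keep the 128-wide block, while older parts such as T4 or V100 drop to 64, which is the safer default on GPUs with less shared memory per SM. A minimal sketch of the same pattern; the `pick_block_size` helper is illustrative and not part of sglang:

```python
import torch

# (major, minor) compute capability, e.g. (7, 5) for T4, (8, 0) for A100.
# Guarded so the snippet also runs on a machine without a GPU.
CUDA_CAPABILITY = torch.cuda.get_device_capability() if torch.cuda.is_available() else (0, 0)


def pick_block_size() -> int:
    """Pick a Triton tile width based on the GPU generation."""
    # sm_80+ keeps the larger tile; pre-Ampere GPUs fall back to 64.
    return 128 if CUDA_CAPABILITY[0] >= 8 else 64
```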

{sglang-0.1.2 → sglang-0.1.4}/sglang/srt/layers/extend_attention.py

@@ -2,6 +2,10 @@ import torch
 import triton
 import triton.language as tl
 from sglang.srt.layers.context_flashattention_nopad import context_attention_fwd
+from sglang.srt.utils import wrap_kernel_launcher
+
+
+CUDA_CAPABILITY = torch.cuda.get_device_capability()


 @triton.jit
@@ -153,6 +157,9 @@ def _fwd_kernel(
     tl.store(O_Extend + offs_o, acc / deno[:, None], mask=mask_m[:, None])


+cached_kernel = None
+
+
 def extend_attention_fwd(
     q_extend,
     k_extend,
@@ -175,7 +182,11 @@ def extend_attention_fwd(

     k_buffer, v_buffer: (prefix + extend) tensors in mem_manager
     """
-
+    if CUDA_CAPABILITY[0] >= 8:
+        BLOCK_M, BLOCK_N = 128, 128
+    else:
+        BLOCK_M, BLOCK_N = 64, 64
+
     Lq, Lk, Lv, Lo = (
         q_extend.shape[-1],
         k_extend.shape[-1],
@@ -193,6 +204,40 @@ def extend_attention_fwd(
     num_warps = 4 if Lk <= 64 else 8
     num_stages = 1

+    global cached_kernel
+    if cached_kernel:
+        cached_kernel(
+            grid,
+            num_warps,
+            q_extend,
+            k_extend,
+            v_extend,
+            o_extend,
+            k_buffer,
+            v_buffer,
+            req_to_tokens,
+            b_req_idx,
+            b_seq_len,
+            b_start_loc_extend,
+            b_seq_len_extend,
+            sm_scale,
+            kv_group_num,
+            q_extend.stride(0),
+            q_extend.stride(1),
+            k_extend.stride(0),
+            k_extend.stride(1),
+            v_extend.stride(0),
+            v_extend.stride(1),
+            o_extend.stride(0),
+            o_extend.stride(1),
+            k_buffer.stride(0),
+            k_buffer.stride(1),
+            v_buffer.stride(0),
+            v_buffer.stride(1),
+            req_to_tokens.stride(0),
+        )
+        return
+
     _fwd_kernel[grid](
         q_extend,
         k_extend,
@@ -226,6 +271,7 @@ def extend_attention_fwd(
         num_warps=num_warps,
         num_stages=num_stages,
     )
+    cached_kernel = wrap_kernel_launcher(_fwd_kernel)


 def redundant_attention(
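
The other change to this file caches the kernel launcher after the first call (`cached_kernel = wrap_kernel_launcher(_fwd_kernel)`), so later calls to `extend_attention_fwd` re-launch the already-compiled Triton kernel directly instead of going through the JIT dispatch path again. A rough, self-contained sketch of that caching pattern; `compile_kernel` below is a stand-in and not sglang's actual `wrap_kernel_launcher`:

```python
from typing import Callable, Optional

# Module-level cache, mirroring `cached_kernel = None` in extend_attention.py.
cached_kernel: Optional[Callable] = None


def compile_kernel() -> Callable:
    """Stand-in for Triton JIT compilation: expensive once, cheap to call afterwards."""
    def launcher(grid, num_warps, *args):
        # A real launcher would enqueue the compiled GPU kernel with these arguments.
        return ("launched", grid, num_warps, len(args))
    return launcher


def attention_fwd(grid, num_warps, *args):
    global cached_kernel
    if cached_kernel is not None:
        # Fast path: reuse the cached launcher and skip dispatch/compilation.
        return cached_kernel(grid, num_warps, *args)

    launcher = compile_kernel()   # slow path, analogous to `_fwd_kernel[grid](...)`
    cached_kernel = launcher      # analogous to `wrap_kernel_launcher(_fwd_kernel)`
    return launcher(grid, num_warps, *args)


print(attention_fwd((4,), 8, "q", "k", "v"))  # first call compiles and caches
print(attention_fwd((4,), 8, "q", "k", "v"))  # later calls hit the cache
```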

{sglang-0.1.2 → sglang-0.1.4}/sglang/srt/managers/router/model_rpc.py

@@ -5,6 +5,7 @@ import time
 from concurrent.futures import ThreadPoolExecutor
 from enum import Enum, auto
 from typing import Dict, List, Optional, Tuple, Union
+import warnings

 import numpy as np
 import rpyc
@@ -164,7 +165,7 @@ class ModelRpcServer(rpyc.Service):
             + self.tree_cache.evictable_size()
         )
         if available_size != self.max_total_num_token:
-
+            warnings.warn(
                 "Warning: "
                 f"available_size={available_size}, max_total_num_token={self.max_total_num_token}\n"
                 "KV cache pool leak detected!"
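
Besides importing `warnings`, this hunk routes the KV cache leak report through `warnings.warn`, so it goes through the standard warning machinery (filterable, reported with a source location) rather than plain output. A simplified sketch of the accounting check, with the pool and cache sizes passed in as plain integers instead of being read from `token_to_kv_pool` and `tree_cache`:

```python
import warnings


def check_kv_cache_leak(free_slots: int, evictable_slots: int, max_total_num_token: int) -> None:
    """Warn if free + evictable KV cache slots no longer add up to the pool size."""
    available_size = free_slots + evictable_slots
    if available_size != max_total_num_token:
        warnings.warn(
            "Warning: "
            f"available_size={available_size}, max_total_num_token={max_total_num_token}\n"
            "KV cache pool leak detected!"
        )


# Example: a pool of 1000 token slots where 10 slots went missing.
check_kv_cache_leak(free_slots=600, evictable_slots=390, max_total_num_token=1000)
```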

{sglang-0.1.2 → sglang-0.1.4}/sglang/srt/utils.py

@@ -209,7 +209,7 @@ def load_image(image_file):
     elif image_file.lower().endswith(("png", "jpg", "jpeg", "webp", "gif")):
         image = Image.open(image_file)
     elif image_file.startswith("data:"):
-        image_file =
+        image_file = image_file.split(",")[1]
         image = Image.open(BytesIO(base64.b64decode(image_file)))
     else:
         image = Image.open(BytesIO(base64.b64decode(image_file)))
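
This one-line fix makes `load_image` handle `data:` URIs such as `data:image/png;base64,<payload>`: everything up to the first comma is stripped before base64-decoding. A standalone version of just that branch (the helper name is illustrative; the real logic lives inside `load_image` in `sglang/srt/utils.py`):

```python
import base64
from io import BytesIO

from PIL import Image


def load_image_from_data_uri(image_file: str) -> Image.Image:
    """Decode a `data:image/...;base64,<payload>` URI into a PIL image."""
    assert image_file.startswith("data:")
    payload = image_file.split(",")[1]  # drop the "data:image/png;base64," prefix
    return Image.open(BytesIO(base64.b64decode(payload)))
```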

{sglang-0.1.2 → sglang-0.1.4}/sglang.egg-info/PKG-INFO

The changes to sglang.egg-info/PKG-INFO are identical to the PKG-INFO changes shown above: the version bump to 0.1.4 and the regenerated README content.