sglang 0.1.3__tar.gz → 0.1.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sglang-0.1.3/sglang.egg-info → sglang-0.1.5}/PKG-INFO +44 -12
- {sglang-0.1.3 → sglang-0.1.5}/README.md +43 -11
- {sglang-0.1.3 → sglang-0.1.5}/pyproject.toml +1 -1
- {sglang-0.1.3 → sglang-0.1.5}/sglang/__init__.py +1 -1
- {sglang-0.1.3 → sglang-0.1.5}/sglang/api.py +1 -0
- sglang-0.1.5/sglang/backend/vertexai.py +147 -0
- {sglang-0.1.3 → sglang-0.1.5}/sglang/lang/interpreter.py +8 -9
- {sglang-0.1.3 → sglang-0.1.5}/sglang/lang/ir.py +21 -0
- {sglang-0.1.3 → sglang-0.1.5}/sglang/srt/layers/context_flashattention_nopad.py +7 -1
- {sglang-0.1.3 → sglang-0.1.5}/sglang/srt/layers/extend_attention.py +46 -1
- {sglang-0.1.3 → sglang-0.1.5}/sglang/srt/managers/router/manager.py +2 -2
- {sglang-0.1.3 → sglang-0.1.5}/sglang/srt/managers/router/model_rpc.py +7 -3
- {sglang-0.1.3 → sglang-0.1.5}/sglang/srt/managers/router/model_runner.py +1 -1
- {sglang-0.1.3 → sglang-0.1.5}/sglang/srt/models/mixtral.py +1 -1
- {sglang-0.1.3 → sglang-0.1.5}/sglang/srt/server_args.py +22 -4
- {sglang-0.1.3 → sglang-0.1.5}/sglang/srt/utils.py +1 -1
- {sglang-0.1.3 → sglang-0.1.5}/sglang/test/test_programs.py +4 -1
- {sglang-0.1.3 → sglang-0.1.5/sglang.egg-info}/PKG-INFO +44 -12
- {sglang-0.1.3 → sglang-0.1.5}/sglang.egg-info/SOURCES.txt +1 -2
- sglang-0.1.3/sglang/backend/huggingface.py +0 -349
- sglang-0.1.3/sglang/backend/tgi.py +0 -190
- {sglang-0.1.3 → sglang-0.1.5}/LICENSE +0 -0
- {sglang-0.1.3 → sglang-0.1.5}/setup.cfg +0 -0
- {sglang-0.1.3 → sglang-0.1.5}/sglang/backend/__init__.py +0 -0
- {sglang-0.1.3 → sglang-0.1.5}/sglang/backend/anthropic.py +0 -0
- {sglang-0.1.3 → sglang-0.1.5}/sglang/backend/base_backend.py +0 -0
- {sglang-0.1.3 → sglang-0.1.5}/sglang/backend/openai.py +0 -0
- {sglang-0.1.3 → sglang-0.1.5}/sglang/backend/runtime_endpoint.py +0 -0
- {sglang-0.1.3 → sglang-0.1.5}/sglang/flush_cache.py +0 -0
- {sglang-0.1.3 → sglang-0.1.5}/sglang/global_config.py +0 -0
- {sglang-0.1.3 → sglang-0.1.5}/sglang/lang/__init__.py +0 -0
- {sglang-0.1.3 → sglang-0.1.5}/sglang/lang/chat_template.py +0 -0
- {sglang-0.1.3 → sglang-0.1.5}/sglang/lang/compiler.py +0 -0
- {sglang-0.1.3 → sglang-0.1.5}/sglang/lang/tracer.py +0 -0
- {sglang-0.1.3 → sglang-0.1.5}/sglang/launch_server.py +0 -0
- {sglang-0.1.3 → sglang-0.1.5}/sglang/srt/backend_config.py +0 -0
- {sglang-0.1.3 → sglang-0.1.5}/sglang/srt/constrained/fsm.py +0 -0
- {sglang-0.1.3 → sglang-0.1.5}/sglang/srt/constrained/fsm_cache.py +0 -0
- {sglang-0.1.3 → sglang-0.1.5}/sglang/srt/constrained/regex.py +0 -0
- {sglang-0.1.3 → sglang-0.1.5}/sglang/srt/constrained/tokenizer.py +0 -0
- {sglang-0.1.3 → sglang-0.1.5}/sglang/srt/hf_transformers_utils.py +0 -0
- {sglang-0.1.3 → sglang-0.1.5}/sglang/srt/layers/get_selected_logprob.py +0 -0
- {sglang-0.1.3 → sglang-0.1.5}/sglang/srt/layers/logits_processor.py +0 -0
- {sglang-0.1.3 → sglang-0.1.5}/sglang/srt/layers/radix_attention.py +0 -0
- {sglang-0.1.3 → sglang-0.1.5}/sglang/srt/layers/token_attention.py +0 -0
- {sglang-0.1.3 → sglang-0.1.5}/sglang/srt/managers/detokenizer_manager.py +0 -0
- {sglang-0.1.3 → sglang-0.1.5}/sglang/srt/managers/io_struct.py +0 -0
- {sglang-0.1.3 → sglang-0.1.5}/sglang/srt/managers/openai_protocol.py +0 -0
- {sglang-0.1.3 → sglang-0.1.5}/sglang/srt/managers/router/infer_batch.py +0 -0
- {sglang-0.1.3 → sglang-0.1.5}/sglang/srt/managers/router/radix_cache.py +0 -0
- {sglang-0.1.3 → sglang-0.1.5}/sglang/srt/managers/router/scheduler.py +0 -0
- {sglang-0.1.3 → sglang-0.1.5}/sglang/srt/managers/tokenizer_manager.py +0 -0
- {sglang-0.1.3 → sglang-0.1.5}/sglang/srt/memory_pool.py +0 -0
- {sglang-0.1.3 → sglang-0.1.5}/sglang/srt/model_config.py +0 -0
- {sglang-0.1.3 → sglang-0.1.5}/sglang/srt/models/llama2.py +0 -0
- {sglang-0.1.3 → sglang-0.1.5}/sglang/srt/models/llava.py +0 -0
- {sglang-0.1.3 → sglang-0.1.5}/sglang/srt/sampling_params.py +0 -0
- {sglang-0.1.3 → sglang-0.1.5}/sglang/srt/server.py +0 -0
- {sglang-0.1.3 → sglang-0.1.5}/sglang/test/test_utils.py +0 -0
- {sglang-0.1.3 → sglang-0.1.5}/sglang/utils.py +0 -0
- {sglang-0.1.3 → sglang-0.1.5}/sglang.egg-info/dependency_links.txt +0 -0
- {sglang-0.1.3 → sglang-0.1.5}/sglang.egg-info/requires.txt +0 -0
- {sglang-0.1.3 → sglang-0.1.5}/sglang.egg-info/top_level.txt +0 -0
{sglang-0.1.3/sglang.egg-info → sglang-0.1.5}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.1.3
+Version: 0.1.5
 Summary: A structured generation langauge for LLMs.
 License: Apache License
                                 Version 2.0, January 2004
@@ -234,6 +234,7 @@ Requires-Dist: sglang[openai]; extra == "all"
 Requires-Dist: sglang[anthropic]; extra == "all"
 
 # SGLang
+| [**Blog**](https://lmsys.org/blog/2024-01-17-sglang/) | [**Paper**](https://arxiv.org/abs/2312.07104) |
 
 SGLang is a structured generation language designed for large language models (LLMs).
 It makes your interaction with LLMs faster and more controllable by co-designing the frontend language and the runtime system.
@@ -267,10 +268,20 @@ pip install --upgrade pip
 pip install -e "python[all]"
 ```
 
+### Notes
+- If you are using older GPUs (NVIDIA T4, V100), please use `pip install "triton>=2.2.0"` to avoid some bugs in the triton compiler
+- If you only need to use the OpenAI backend, you can avoid installing other dependencies by using `pip install sglang[openai]`
+
 ## Quick Start
 The example below shows how to use sglang to answer a mulit-turn question.
 
 ### Using OpenAI Models
+Set the OpenAI API Key
+```
+export OPENAI_API_KEY=sk-******
+```
+
+Then, answer a multi-turn question.
 ```python
 from sglang import function, system, user, assistant, gen, set_default_backend, OpenAI
 
@@ -325,6 +336,7 @@ for m in state.messages():
 
 ### More Examples
 
+Anthropic and VertexAI (Gemini) models are also supported.
 You can find more examples at [examples/quick_start](examples/quick_start).
 
 ## Frontend: Structured Generation Langauge (SGLang)
@@ -334,19 +346,20 @@ To begin with, import sglang.
 import sglang as sgl
 ```
 
-`sglang` provides some simple primitives such as `gen`, `select`, `fork`.
+`sglang` provides some simple primitives such as `gen`, `select`, `fork`, `image`.
 You can implement your prompt flow in a function decorated by `sgl.function`.
 You can then invoke the function with `run` or `run_batch`.
 The system will manage the state, chat template, and parallelism for you.
 
 ### Control Flow
+You can use any Python code within the function body, including control flow, nested function calls, and external libraries.
+
 ```python
 @sgl.function
 def control_flow(s, question):
     s += "To answer this question: " + question + ", "
     s += "I need to use a " + sgl.gen("tool", choices=["calculator", "web browser"]) + ". "
 
-    # You can use if or nested function calls
     if s["tool"] == "calculator":
         s += "The math expression is" + sgl.gen("expression")
     elif s["tool"] == "web browser":
@@ -354,6 +367,9 @@ def control_flow(s, question):
 ```
 
 ### Parallelism
+Use `fork` to launch parallel prompts.
+Because `sgl.gen` is non-blocking, the for loop below issues two generation calls in parallel.
+
 ```python
 @sgl.function
 def tip_suggestion(s):
@@ -362,7 +378,7 @@ def tip_suggestion(s):
         "1. Balanced Diet. 2. Regular Exercise.\n\n"
     )
 
-    forks = s.fork(2)
+    forks = s.fork(2)
     for i, f in enumerate(forks):
         f += f"Now, expand tip {i+1} into a paragraph:\n"
         f += sgl.gen(f"detailed_tip", max_tokens=256, stop="\n\n")
@@ -373,6 +389,8 @@ def tip_suggestion(s):
 ```
 
 ### Multi Modality
+Use `sgl.image` to pass an image as input.
+
 ```python
 @sgl.function
 def image_qa(s, image_file, question):
@@ -381,11 +399,13 @@ def image_qa(s, image_file, question):
 ```
 
 ### Constrained Decoding
+Use `regex=` to specify a regular expression as a decoding constraint.
+
 ```python
-@function
+@sgl.function
 def regular_expression_gen(s):
     s += "Q: What is the IP address of the Google DNS servers?\n"
-    s += "A: " + gen(
+    s += "A: " + sgl.gen(
         "answer",
         temperature=0,
         regex=r"((25[0-5]|2[0-4]\d|[01]?\d\d?).){3}(25[0-5]|2[0-4]\d|[01]?\d\d?)",
@@ -393,6 +413,8 @@ def regular_expression_gen(s):
 ```
 
 ### Batching
+Use `run_batch` to run a batch of requests with continuous batching.
+
 ```python
 @sgl.function
 def text_qa(s, question):
@@ -405,10 +427,13 @@ states = text_qa.run_batch(
         {"question": "What is the capital of France?"},
         {"question": "What is the capital of Japan?"},
     ],
+    progress_bar=True
 )
 ```
 
 ### Streaming
+Add `stream=True` to enable streaming.
+
 ```python
 @sgl.function
 def text_qa(s, question):
@@ -417,7 +442,9 @@ def text_qa(s, question):
 
 states = text_qa.run(
     question="What is the capital of France?",
-    temperature=0.1
+    temperature=0.1,
+    stream=True
+)
 
 for out in state.text_iter():
     print(out, end="", flush=True)
@@ -426,7 +453,7 @@ for out in state.text_iter():
 ## Backend: SGLang Runtime (SRT)
 The SGLang Runtime (SRT) is designed to work best with the SGLang frontend.
 However, it can also be used as a standalone API server.
-In this case, the RadixAttention can still greatly accelerate many use cases.
+In this case, the [RadixAttention](https://arxiv.org/abs/2312.07104) can still greatly accelerate many use cases with automatic KV cache reuse.
 
 ### Usage
 Launch a server
@@ -450,6 +477,10 @@ curl http://localhost:30000/v1/completions \
 ```
 python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --tp 2
 ```
+- If you see out-of-memory errors during serving, please try to reduce the memory usage of the KV cache pool by setting a smaller value of `--mem-fraction-static`. The default value is `0.9`
+```
+python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --mem-fraction-static 0.7
+```
 
 ### Supported Models
 - Llama
@@ -457,6 +488,7 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
 - Mixtral
 - LLaVA
   - `python3 -m sglang.launch_server --model-path liuhaotian/llava-v1.5-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --port 30000`
+- AWQ quantization
 
 ## Benchmark And Performance
 
@@ -466,13 +498,13 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
 - Mixtral-8x7B on NVIDIA A10G, FP16, Tensor Parallelism=8
 
 
-Learn more [here]().
+Learn more [here](docs/benchmark_results.md).
 
 ## Roadmap
-- [ ] Function call
-- [ ] Quantization
+- [ ] Function call APIs
 - [ ] S-LoRA
-- [ ]
+- [ ] Support more models
+- [ ] Support more hardware backends
 
 ## Citation And Acknowledgment
 ```
{sglang-0.1.3 → sglang-0.1.5}/README.md

@@ -1,4 +1,5 @@
 # SGLang
+| [**Blog**](https://lmsys.org/blog/2024-01-17-sglang/) | [**Paper**](https://arxiv.org/abs/2312.07104) |
 
 SGLang is a structured generation language designed for large language models (LLMs).
 It makes your interaction with LLMs faster and more controllable by co-designing the frontend language and the runtime system.
@@ -32,10 +33,20 @@ pip install --upgrade pip
 pip install -e "python[all]"
 ```
 
+### Notes
+- If you are using older GPUs (NVIDIA T4, V100), please use `pip install "triton>=2.2.0"` to avoid some bugs in the triton compiler
+- If you only need to use the OpenAI backend, you can avoid installing other dependencies by using `pip install sglang[openai]`
+
 ## Quick Start
 The example below shows how to use sglang to answer a mulit-turn question.
 
 ### Using OpenAI Models
+Set the OpenAI API Key
+```
+export OPENAI_API_KEY=sk-******
+```
+
+Then, answer a multi-turn question.
 ```python
 from sglang import function, system, user, assistant, gen, set_default_backend, OpenAI
 
@@ -90,6 +101,7 @@ for m in state.messages():
 
 ### More Examples
 
+Anthropic and VertexAI (Gemini) models are also supported.
 You can find more examples at [examples/quick_start](examples/quick_start).
 
 ## Frontend: Structured Generation Langauge (SGLang)
@@ -99,19 +111,20 @@ To begin with, import sglang.
 import sglang as sgl
 ```
 
-`sglang` provides some simple primitives such as `gen`, `select`, `fork`.
+`sglang` provides some simple primitives such as `gen`, `select`, `fork`, `image`.
 You can implement your prompt flow in a function decorated by `sgl.function`.
 You can then invoke the function with `run` or `run_batch`.
 The system will manage the state, chat template, and parallelism for you.
 
 ### Control Flow
+You can use any Python code within the function body, including control flow, nested function calls, and external libraries.
+
 ```python
 @sgl.function
 def control_flow(s, question):
     s += "To answer this question: " + question + ", "
     s += "I need to use a " + sgl.gen("tool", choices=["calculator", "web browser"]) + ". "
 
-    # You can use if or nested function calls
     if s["tool"] == "calculator":
         s += "The math expression is" + sgl.gen("expression")
     elif s["tool"] == "web browser":
@@ -119,6 +132,9 @@ def control_flow(s, question):
 ```
 
 ### Parallelism
+Use `fork` to launch parallel prompts.
+Because `sgl.gen` is non-blocking, the for loop below issues two generation calls in parallel.
+
 ```python
 @sgl.function
 def tip_suggestion(s):
@@ -127,7 +143,7 @@ def tip_suggestion(s):
         "1. Balanced Diet. 2. Regular Exercise.\n\n"
     )
 
-    forks = s.fork(2)
+    forks = s.fork(2)
     for i, f in enumerate(forks):
         f += f"Now, expand tip {i+1} into a paragraph:\n"
         f += sgl.gen(f"detailed_tip", max_tokens=256, stop="\n\n")
@@ -138,6 +154,8 @@ def tip_suggestion(s):
 ```
 
 ### Multi Modality
+Use `sgl.image` to pass an image as input.
+
 ```python
 @sgl.function
 def image_qa(s, image_file, question):
@@ -146,11 +164,13 @@ def image_qa(s, image_file, question):
 ```
 
 ### Constrained Decoding
+Use `regex=` to specify a regular expression as a decoding constraint.
+
 ```python
-@function
+@sgl.function
 def regular_expression_gen(s):
     s += "Q: What is the IP address of the Google DNS servers?\n"
-    s += "A: " + gen(
+    s += "A: " + sgl.gen(
         "answer",
         temperature=0,
         regex=r"((25[0-5]|2[0-4]\d|[01]?\d\d?).){3}(25[0-5]|2[0-4]\d|[01]?\d\d?)",
@@ -158,6 +178,8 @@ def regular_expression_gen(s):
 ```
 
 ### Batching
+Use `run_batch` to run a batch of requests with continuous batching.
+
 ```python
 @sgl.function
 def text_qa(s, question):
@@ -170,10 +192,13 @@ states = text_qa.run_batch(
         {"question": "What is the capital of France?"},
         {"question": "What is the capital of Japan?"},
     ],
+    progress_bar=True
 )
 ```
 
 ### Streaming
+Add `stream=True` to enable streaming.
+
 ```python
 @sgl.function
 def text_qa(s, question):
@@ -182,7 +207,9 @@ def text_qa(s, question):
 
 states = text_qa.run(
     question="What is the capital of France?",
-    temperature=0.1
+    temperature=0.1,
+    stream=True
+)
 
 for out in state.text_iter():
     print(out, end="", flush=True)
@@ -191,7 +218,7 @@ for out in state.text_iter():
 ## Backend: SGLang Runtime (SRT)
 The SGLang Runtime (SRT) is designed to work best with the SGLang frontend.
 However, it can also be used as a standalone API server.
-In this case, the RadixAttention can still greatly accelerate many use cases.
+In this case, the [RadixAttention](https://arxiv.org/abs/2312.07104) can still greatly accelerate many use cases with automatic KV cache reuse.
 
 ### Usage
 Launch a server
@@ -215,6 +242,10 @@ curl http://localhost:30000/v1/completions \
 ```
 python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --tp 2
 ```
+- If you see out-of-memory errors during serving, please try to reduce the memory usage of the KV cache pool by setting a smaller value of `--mem-fraction-static`. The default value is `0.9`
+```
+python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --mem-fraction-static 0.7
+```
 
 ### Supported Models
 - Llama
@@ -222,6 +253,7 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
 - Mixtral
 - LLaVA
   - `python3 -m sglang.launch_server --model-path liuhaotian/llava-v1.5-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --port 30000`
+- AWQ quantization
 
 ## Benchmark And Performance
 
@@ -231,13 +263,13 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
 - Mixtral-8x7B on NVIDIA A10G, FP16, Tensor Parallelism=8
 
 
-Learn more [here]().
+Learn more [here](docs/benchmark_results.md).
 
 ## Roadmap
-- [ ] Function call
-- [ ] Quantization
+- [ ] Function call APIs
 - [ ] S-LoRA
-- [ ]
+- [ ] Support more models
+- [ ] Support more hardware backends
 
 ## Citation And Acknowledgment
 ```
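The Quick Start hunks above only show the import line of the multi-turn example; the rest of that example is unchanged and therefore absent from the diff. As a rough sketch of the flow the README describes, assuming the `multi_turn_question` function name, the questions, and the `"gpt-3.5-turbo"` model name (all illustrative, not taken from this diff):

```python
from sglang import function, system, user, assistant, gen, set_default_backend, OpenAI

@function
def multi_turn_question(s, question_1, question_2):
    # Roles are appended as chat-template-aware segments.
    s += system("You are a helpful assistant.")
    s += user(question_1)
    s += assistant(gen("answer_1", max_tokens=256))
    s += user(question_2)
    s += assistant(gen("answer_2", max_tokens=256))

# Requires OPENAI_API_KEY, as noted in the new "Set the OpenAI API Key" section.
set_default_backend(OpenAI("gpt-3.5-turbo"))

state = multi_turn_question.run(
    question_1="What is the capital of the United States?",
    question_2="List two local attractions there.",
)

for m in state.messages():
    print(m["role"], ":", m["content"])
```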
{sglang-0.1.3 → sglang-0.1.5}/sglang/api.py

@@ -6,6 +6,7 @@ from sglang.backend.anthropic import Anthropic
 from sglang.backend.base_backend import BaseBackend
 from sglang.backend.openai import OpenAI
 from sglang.backend.runtime_endpoint import RuntimeEndpoint
+from sglang.backend.vertexai import VertexAI
 from sglang.global_config import global_config
 from sglang.lang.ir import (
     SglExpr,
sglang-0.1.5/sglang/backend/vertexai.py (new file)

@@ -0,0 +1,147 @@
+import os
+import warnings
+from typing import List, Optional, Union
+
+import numpy as np
+from sglang.backend.base_backend import BaseBackend
+from sglang.lang.chat_template import get_chat_template
+from sglang.lang.interpreter import StreamExecutor
+from sglang.lang.ir import SglSamplingParams
+
+try:
+    import vertexai
+    from vertexai.preview.generative_models import (
+        GenerationConfig,
+        GenerativeModel,
+        Image,
+    )
+except ImportError as e:
+    GenerativeModel = e
+
+
+class VertexAI(BaseBackend):
+    def __init__(self, model_name):
+        super().__init__()
+
+        if isinstance(GenerativeModel, Exception):
+            raise GenerativeModel
+
+        project_id = os.environ["GCP_PROJECT_ID"]
+        location = os.environ.get("GCP_LOCATION")
+        vertexai.init(project=project_id, location=location)
+
+        self.model_name = model_name
+        self.chat_template = get_chat_template("default")
+
+    def get_chat_template(self):
+        return self.chat_template
+
+    def generate(
+        self,
+        s: StreamExecutor,
+        sampling_params: SglSamplingParams,
+    ):
+        if s.messages_:
+            prompt = self.messages_to_vertexai_input(s.messages_)
+        else:
+            # single-turn
+            prompt = (
+                self.text_to_vertexai_input(s.text_, s.cur_images)
+                if s.cur_images
+                else s.text_
+            )
+        ret = GenerativeModel(self.model_name).generate_content(
+            prompt,
+            generation_config=GenerationConfig(**sampling_params.to_vertexai_kwargs()),
+        )
+
+        comp = ret.text
+
+        return comp, {}
+
+    def generate_stream(
+        self,
+        s: StreamExecutor,
+        sampling_params: SglSamplingParams,
+    ):
+        if s.messages_:
+            prompt = self.messages_to_vertexai_input(s.messages_)
+        else:
+            # single-turn
+            prompt = (
+                self.text_to_vertexai_input(s.text_, s.cur_images)
+                if s.cur_images
+                else s.text_
+            )
+        generator = GenerativeModel(self.model_name).generate_content(
+            prompt,
+            stream=True,
+            generation_config=GenerationConfig(**sampling_params.to_vertexai_kwargs()),
+        )
+        for ret in generator:
+            yield ret.text, {}
+
+    def text_to_vertexai_input(self, text, images):
+        input = []
+        # split with image token
+        text_segs = text.split(self.chat_template.image_token)
+        for image_path, image_base64_data in images:
+            text_seg = text_segs.pop(0)
+            if text_seg != "":
+                input.append(text_seg)
+            input.append(Image.from_bytes(image_base64_data))
+        text_seg = text_segs.pop(0)
+        if text_seg != "":
+            input.append(text_seg)
+        return input
+
+    def messages_to_vertexai_input(self, messages):
+        vertexai_message = []
+        # from openai message format to vertexai message format
+        for msg in messages:
+            if isinstance(msg["content"], str):
+                text = msg["content"]
+            else:
+                text = msg["content"][0]["text"]
+
+            if msg["role"] == "system":
+                warnings.warn("Warning: system prompt is not supported in VertexAI.")
+                vertexai_message.append(
+                    {
+                        "role": "user",
+                        "parts": [{"text": "System prompt: " + text}],
+                    }
+                )
+                vertexai_message.append(
+                    {
+                        "role": "model",
+                        "parts": [{"text": "Understood."}],
+                    }
+                )
+                continue
+            if msg["role"] == "user":
+                vertexai_msg = {
+                    "role": "user",
+                    "parts": [{"text": text}],
+                }
+            elif msg["role"] == "assistant":
+                vertexai_msg = {
+                    "role": "model",
+                    "parts": [{"text": text}],
+                }
+
+            # images
+            if isinstance(msg["content"], list) and len(msg["content"]) > 1:
+                for image in msg["content"][1:]:
+                    assert image["type"] == "image_url"
+                    vertexai_msg["parts"].append(
+                        {
+                            "inline_data": {
+                                "data": image["image_url"]["url"].split(",")[1],
+                                "mime_type": "image/jpeg",
+                            }
+                        }
+                    )
+
+            vertexai_message.append(vertexai_msg)
+        return vertexai_message
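A short usage sketch for the new backend added above. It assumes the `"gemini-pro"` model name and that `GCP_PROJECT_ID` (and optionally `GCP_LOCATION`) are set in the environment, since `VertexAI.__init__` reads them before calling `vertexai.init`; the function body and question are illustrative only:

```python
import sglang as sgl
from sglang import set_default_backend
from sglang.backend.vertexai import VertexAI

# The constructor initializes the Vertex AI SDK from GCP_PROJECT_ID / GCP_LOCATION.
set_default_backend(VertexAI("gemini-pro"))

@sgl.function
def capital_qa(s, country):
    # user/assistant turns go through messages_to_vertexai_input();
    # note the backend warns that system prompts are not supported.
    s += sgl.user(f"What is the capital of {country}? Answer in one word.")
    s += sgl.assistant(sgl.gen("answer", max_tokens=16))

state = capital_qa.run(country="France")
print(state["answer"])
```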
{sglang-0.1.3 → sglang-0.1.5}/sglang/lang/interpreter.py

@@ -365,11 +365,10 @@ class StreamExecutor:
             for comp, meta_info in generator:
                 self.text_ += comp
                 self.variables[name] += comp
+                self.meta_info[name] = meta_info
                 self.stream_var_event[name].set()
                 self.stream_text_event.set()
 
-            self.meta_info[name] = meta_info
-
             self.variable_event[name].set()
             self.stream_var_event[name].set()
 
@@ -428,6 +427,7 @@ class StreamExecutor:
             self.messages_.append(last_msg)
             self.cur_images = []
         else:
+            # OpenAI chat API format
            self.messages_.append({"role": expr.role, "content": new_text})
 
         self.cur_role = None
@@ -582,7 +582,7 @@ class ProgramState:
         else:
             yield self.get_var(name)
 
-    async def text_async_iter(self, var_name=None):
+    async def text_async_iter(self, var_name=None, return_meta_data=False):
         loop = asyncio.get_running_loop()
 
         if self.stream_executor.stream:
@@ -606,7 +606,10 @@ class ProgramState:
                 out = str(self.stream_executor.variables[var_name][prev:])
                 prev += len(out)
                 if out:
-                    yield out
+                    if return_meta_data:
+                        yield out, self.stream_executor.meta_info[var_name]
+                    else:
+                        yield out
                 if self.stream_executor.variable_event[var_name].is_set():
                     break
             else:
@@ -632,11 +635,7 @@ class ProgramState:
         self.stream_executor.end()
 
     def __repr__(self) -> str:
-        msgs = self.messages()
-        ret = ""
-        for msg in msgs:
-            ret += msg["role"] + ":\n" + msg["content"] + "\n"
-        return ret
+        return f"ProgramState({self.text()})"
 
 
 class ProgramStateGroup:
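The interpreter changes above record `meta_info` per variable inside the streaming loop and add a `return_meta_data` flag to `text_async_iter`. A sketch of how a caller might consume it, assuming a streaming-capable backend (e.g. a local SGLang runtime endpoint) has already been set as the default and a `text_qa` function along the lines of the README example:

```python
import asyncio
import sglang as sgl

@sgl.function
def text_qa(s, question):
    s += "Q: " + question + "\n"
    s += "A: " + sgl.gen("answer", max_tokens=64, stop="\n")

async def main():
    state = text_qa.run(question="What is the capital of France?", stream=True)
    # With return_meta_data=True, each streamed chunk of the "answer" variable
    # is yielded together with the latest meta_info recorded for it.
    async for chunk, meta_info in state.text_async_iter("answer", return_meta_data=True):
        print(chunk, end="", flush=True)

asyncio.run(main())
```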
{sglang-0.1.3 → sglang-0.1.5}/sglang/lang/ir.py

@@ -2,6 +2,7 @@
 
 import dataclasses
 import inspect
+import warnings
 from typing import List, Optional, Union
 
 from sglang.global_config import global_config
@@ -40,6 +41,8 @@ class SglSamplingParams:
 
     def to_openai_kwargs(self):
         # OpenAI does not support top_k, so we drop it here
+        if self.regex is not None:
+            warnings.warn("Regular expression is not supported in the OpenAI backend.")
         return {
             "max_tokens": self.max_new_tokens,
             "stop": self.stop or None,
@@ -49,8 +52,26 @@ class SglSamplingParams:
             "presence_penalty": self.presence_penalty,
         }
 
+    def to_vertexai_kwargs(self):
+        if self.regex is not None:
+            warnings.warn(
+                "Regular expression is not supported in the VertexAI backend."
+            )
+        return {
+            "candidate_count": 1,
+            "max_output_tokens": self.max_new_tokens,
+            "stop_sequences": self.stop,
+            "temperature": self.temperature,
+            "top_p": self.top_p,
+            "top_k": self.top_k if self.top_k > 0 else None,
+        }
+
     def to_anthropic_kwargs(self):
         # Anthropic does not support frequency_penalty or presence_penalty, so we drop it here
+        if self.regex is not None:
+            warnings.warn(
+                "Regular expression is not supported in the Anthropic backend."
+            )
         return {
             "max_tokens_to_sample": self.max_new_tokens,
             "stop_sequences": self.stop,
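To make the new mapping concrete, here is a small sketch that builds a sampling configuration and translates it for the Vertex AI backend, and shows how the new warnings fire when `regex` is set for a backend that cannot enforce it. The keyword arguments to `SglSamplingParams` are assumed from the dataclass fields referenced in this diff:

```python
from sglang.lang.ir import SglSamplingParams

params = SglSamplingParams(
    max_new_tokens=128,
    stop="\n",
    temperature=0.7,
    top_p=0.9,
    top_k=40,
    regex=r"\d+",  # constrained decoding request
)

# Translated into kwargs for vertexai GenerationConfig; a UserWarning is emitted
# because regex cannot be honored there. With top_k <= 0, top_k would be None.
print(params.to_vertexai_kwargs())
# roughly: {'candidate_count': 1, 'max_output_tokens': 128, 'stop_sequences': '\n',
#           'temperature': 0.7, 'top_p': 0.9, 'top_k': 40}

# The OpenAI and Anthropic translations now warn in the same way.
print(params.to_openai_kwargs())
```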
{sglang-0.1.3 → sglang-0.1.5}/sglang/srt/layers/context_flashattention_nopad.py

@@ -5,6 +5,8 @@ import triton
 import triton.language as tl
 from sglang.srt.utils import wrap_kernel_launcher
 
+CUDA_CAPABILITY = torch.cuda.get_device_capability()
+
 
 @triton.jit
 def _fwd_kernel(
@@ -120,7 +122,11 @@ cached_kernel = None
 
 
 def context_attention_fwd(q, k, v, o, b_start_loc, b_seq_len, max_input_len):
-    BLOCK = 128
+    if CUDA_CAPABILITY[0] >= 8:
+        BLOCK = 128
+    else:
+        BLOCK = 64
+
     Lq, Lk, Lv = q.shape[-1], k.shape[-1], v.shape[-1]
     assert Lq == Lk and Lk == Lv
     assert Lk in {16, 32, 64, 128}
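The same capability gate shown standalone: `torch.cuda.get_device_capability()` returns a `(major, minor)` tuple, and the attention kernel now picks a smaller Triton block size on pre-Ampere GPUs instead of always using 128. A minimal sketch of the selection logic (the helper name is illustrative):

```python
import torch

def pick_block_size() -> int:
    # Ampere (SM 8.x) and newer keep the 128-wide block; older GPUs such as
    # T4 (SM 7.5) and V100 (SM 7.0) fall back to a 64-wide block.
    major, _minor = torch.cuda.get_device_capability()
    return 128 if major >= 8 else 64
```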