sglang 0.1.16__tar.gz → 0.1.18__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sglang-0.1.16/sglang.egg-info → sglang-0.1.18}/PKG-INFO +40 -27
- {sglang-0.1.16 → sglang-0.1.18}/README.md +27 -16
- {sglang-0.1.16 → sglang-0.1.18}/pyproject.toml +9 -7
- {sglang-0.1.16 → sglang-0.1.18}/sglang/__init__.py +3 -1
- {sglang-0.1.16 → sglang-0.1.18}/sglang/api.py +7 -7
- {sglang-0.1.16 → sglang-0.1.18}/sglang/backend/anthropic.py +1 -1
- sglang-0.1.18/sglang/backend/litellm.py +90 -0
- {sglang-0.1.16 → sglang-0.1.18}/sglang/backend/openai.py +158 -11
- {sglang-0.1.16 → sglang-0.1.18}/sglang/backend/runtime_endpoint.py +18 -10
- sglang-0.1.18/sglang/bench_latency.py +299 -0
- {sglang-0.1.16 → sglang-0.1.18}/sglang/global_config.py +12 -2
- {sglang-0.1.16 → sglang-0.1.18}/sglang/lang/compiler.py +2 -2
- {sglang-0.1.16 → sglang-0.1.18}/sglang/lang/interpreter.py +114 -67
- {sglang-0.1.16 → sglang-0.1.18}/sglang/lang/ir.py +28 -3
- {sglang-0.1.16 → sglang-0.1.18}/sglang/launch_server.py +4 -1
- {sglang-0.1.16 → sglang-0.1.18}/sglang/launch_server_llavavid.py +2 -1
- {sglang-0.1.16 → sglang-0.1.18}/sglang/srt/constrained/__init__.py +13 -6
- {sglang-0.1.16 → sglang-0.1.18}/sglang/srt/constrained/fsm_cache.py +8 -2
- sglang-0.1.18/sglang/srt/constrained/jump_forward.py +164 -0
- {sglang-0.1.16 → sglang-0.1.18}/sglang/srt/conversation.py +2 -0
- {sglang-0.1.16 → sglang-0.1.18}/sglang/srt/flush_cache.py +3 -1
- {sglang-0.1.16 → sglang-0.1.18}/sglang/srt/hf_transformers_utils.py +130 -1
- {sglang-0.1.16 → sglang-0.1.18}/sglang/srt/layers/extend_attention.py +17 -0
- sglang-0.1.18/sglang/srt/layers/fused_moe.py +582 -0
- {sglang-0.1.16 → sglang-0.1.18}/sglang/srt/layers/logits_processor.py +65 -32
- {sglang-0.1.16 → sglang-0.1.18}/sglang/srt/layers/radix_attention.py +41 -7
- {sglang-0.1.16 → sglang-0.1.18}/sglang/srt/layers/token_attention.py +16 -1
- sglang-0.1.18/sglang/srt/managers/controller/dp_worker.py +113 -0
- {sglang-0.1.16/sglang/srt/managers/router → sglang-0.1.18/sglang/srt/managers/controller}/infer_batch.py +242 -100
- sglang-0.1.18/sglang/srt/managers/controller/manager_multi.py +191 -0
- sglang-0.1.16/sglang/srt/managers/router/manager.py → sglang-0.1.18/sglang/srt/managers/controller/manager_single.py +34 -14
- {sglang-0.1.16/sglang/srt/managers/router → sglang-0.1.18/sglang/srt/managers/controller}/model_runner.py +262 -158
- {sglang-0.1.16/sglang/srt/managers/router → sglang-0.1.18/sglang/srt/managers/controller}/radix_cache.py +11 -1
- sglang-0.1.16/sglang/srt/managers/router/scheduler.py → sglang-0.1.18/sglang/srt/managers/controller/schedule_heuristic.py +9 -7
- sglang-0.1.16/sglang/srt/managers/router/model_rpc.py → sglang-0.1.18/sglang/srt/managers/controller/tp_worker.py +298 -267
- sglang-0.1.18/sglang/srt/managers/detokenizer_manager.py +91 -0
- {sglang-0.1.16 → sglang-0.1.18}/sglang/srt/managers/io_struct.py +22 -12
- {sglang-0.1.16 → sglang-0.1.18}/sglang/srt/managers/tokenizer_manager.py +151 -87
- sglang-0.1.18/sglang/srt/model_config.py +125 -0
- sglang-0.1.18/sglang/srt/models/chatglm.py +399 -0
- {sglang-0.1.16 → sglang-0.1.18}/sglang/srt/models/commandr.py +10 -13
- {sglang-0.1.16 → sglang-0.1.18}/sglang/srt/models/dbrx.py +9 -15
- {sglang-0.1.16 → sglang-0.1.18}/sglang/srt/models/gemma.py +12 -15
- sglang-0.1.18/sglang/srt/models/grok.py +738 -0
- {sglang-0.1.16 → sglang-0.1.18}/sglang/srt/models/llama2.py +26 -15
- sglang-0.1.18/sglang/srt/models/llama_classification.py +104 -0
- {sglang-0.1.16 → sglang-0.1.18}/sglang/srt/models/llava.py +86 -19
- {sglang-0.1.16 → sglang-0.1.18}/sglang/srt/models/llavavid.py +11 -20
- sglang-0.1.18/sglang/srt/models/mixtral.py +562 -0
- sglang-0.1.16/sglang/srt/models/mixtral.py → sglang-0.1.18/sglang/srt/models/mixtral_quant.py +11 -22
- {sglang-0.1.16 → sglang-0.1.18}/sglang/srt/models/qwen.py +9 -13
- {sglang-0.1.16 → sglang-0.1.18}/sglang/srt/models/qwen2.py +11 -13
- {sglang-0.1.16 → sglang-0.1.18}/sglang/srt/models/stablelm.py +9 -15
- {sglang-0.1.16 → sglang-0.1.18}/sglang/srt/models/yivl.py +17 -22
- {sglang-0.1.16 → sglang-0.1.18}/sglang/srt/openai_api_adapter.py +150 -95
- {sglang-0.1.16 → sglang-0.1.18}/sglang/srt/openai_protocol.py +11 -2
- {sglang-0.1.16 → sglang-0.1.18}/sglang/srt/server.py +124 -48
- {sglang-0.1.16 → sglang-0.1.18}/sglang/srt/server_args.py +128 -48
- {sglang-0.1.16 → sglang-0.1.18}/sglang/srt/utils.py +234 -67
- {sglang-0.1.16 → sglang-0.1.18}/sglang/test/test_programs.py +65 -3
- {sglang-0.1.16 → sglang-0.1.18}/sglang/test/test_utils.py +32 -1
- {sglang-0.1.16 → sglang-0.1.18}/sglang/utils.py +23 -4
- {sglang-0.1.16 → sglang-0.1.18/sglang.egg-info}/PKG-INFO +40 -27
- {sglang-0.1.16 → sglang-0.1.18}/sglang.egg-info/SOURCES.txt +15 -9
- {sglang-0.1.16 → sglang-0.1.18}/sglang.egg-info/requires.txt +14 -11
- sglang-0.1.16/sglang/srt/backend_config.py +0 -13
- sglang-0.1.16/sglang/srt/constrained/jump_forward.py +0 -76
- sglang-0.1.16/sglang/srt/managers/detokenizer_manager.py +0 -95
- sglang-0.1.16/sglang/srt/model_config.py +0 -47
- sglang-0.1.16/sglang/srt/models/dbrx_config.py +0 -281
- sglang-0.1.16/sglang/srt/weight_utils.py +0 -417
- {sglang-0.1.16 → sglang-0.1.18}/LICENSE +0 -0
- {sglang-0.1.16 → sglang-0.1.18}/setup.cfg +0 -0
- {sglang-0.1.16 → sglang-0.1.18}/sglang/backend/__init__.py +0 -0
- {sglang-0.1.16 → sglang-0.1.18}/sglang/backend/base_backend.py +0 -0
- {sglang-0.1.16 → sglang-0.1.18}/sglang/backend/vertexai.py +0 -0
- {sglang-0.1.16 → sglang-0.1.18}/sglang/lang/__init__.py +0 -0
- {sglang-0.1.16 → sglang-0.1.18}/sglang/lang/chat_template.py +0 -0
- {sglang-0.1.16 → sglang-0.1.18}/sglang/lang/tracer.py +0 -0
- {sglang-0.1.16 → sglang-0.1.18}/sglang/srt/constrained/base_cache.py +0 -0
- {sglang-0.1.16 → sglang-0.1.18}/sglang/srt/layers/context_flashattention_nopad.py +0 -0
- {sglang-0.1.16 → sglang-0.1.18}/sglang/srt/memory_pool.py +0 -0
- {sglang-0.1.16 → sglang-0.1.18}/sglang/srt/mm_utils.py +0 -0
- {sglang-0.1.16 → sglang-0.1.18}/sglang/srt/models/mistral.py +0 -0
- {sglang-0.1.16 → sglang-0.1.18}/sglang/srt/sampling_params.py +0 -0
- {sglang-0.1.16 → sglang-0.1.18}/sglang/test/test_conversation.py +0 -0
- {sglang-0.1.16 → sglang-0.1.18}/sglang/test/test_openai_protocol.py +0 -0
- {sglang-0.1.16 → sglang-0.1.18}/sglang.egg-info/dependency_links.txt +0 -0
- {sglang-0.1.16 → sglang-0.1.18}/sglang.egg-info/top_level.txt +0 -0
{sglang-0.1.16/sglang.egg-info → sglang-0.1.18}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.1.16
+Version: 0.1.18
 Summary: A structured generation langauge for LLMs.
 License: Apache License
         Version 2.0, January 2004
@@ -213,34 +213,36 @@ Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: requests
 Requires-Dist: tqdm
+Requires-Dist: numpy
 Provides-Extra: srt
 Requires-Dist: aiohttp; extra == "srt"
 Requires-Dist: fastapi; extra == "srt"
+Requires-Dist: hf_transfer; extra == "srt"
+Requires-Dist: huggingface_hub; extra == "srt"
+Requires-Dist: interegular; extra == "srt"
+Requires-Dist: packaging; extra == "srt"
+Requires-Dist: pillow; extra == "srt"
 Requires-Dist: psutil; extra == "srt"
+Requires-Dist: pydantic; extra == "srt"
 Requires-Dist: rpyc; extra == "srt"
 Requires-Dist: torch; extra == "srt"
-Requires-Dist: uvloop; extra == "srt"
 Requires-Dist: uvicorn; extra == "srt"
+Requires-Dist: uvloop; extra == "srt"
 Requires-Dist: zmq; extra == "srt"
-Requires-Dist: vllm
-Requires-Dist:
-Requires-Dist: pydantic; extra == "srt"
-Requires-Dist: pillow; extra == "srt"
-Requires-Dist: packaging; extra == "srt"
-Requires-Dist: huggingface_hub; extra == "srt"
-Requires-Dist: hf_transfer; extra == "srt"
-Requires-Dist: outlines>=0.0.34; extra == "srt"
+Requires-Dist: vllm==0.5.0; extra == "srt"
+Requires-Dist: outlines>=0.0.44; extra == "srt"
 Provides-Extra: openai
 Requires-Dist: openai>=1.0; extra == "openai"
-Requires-Dist: numpy; extra == "openai"
 Requires-Dist: tiktoken; extra == "openai"
 Provides-Extra: anthropic
 Requires-Dist: anthropic>=0.20.0; extra == "anthropic"
-
+Provides-Extra: litellm
+Requires-Dist: litellm>=1.0.0; extra == "litellm"
 Provides-Extra: all
 Requires-Dist: sglang[srt]; extra == "all"
 Requires-Dist: sglang[openai]; extra == "all"
 Requires-Dist: sglang[anthropic]; extra == "all"
+Requires-Dist: sglang[litellm]; extra == "all"

 <div align="center">
 <img src="assets/logo.png" alt="logo" width="400"></img>
@@ -253,9 +255,9 @@ Requires-Dist: sglang[anthropic]; extra == "all"
 SGLang is a structured generation language designed for large language models (LLMs).
 It makes your interaction with LLMs faster and more controllable by co-designing the frontend language and the runtime system.

-The core features of SGLang include:
-- **
-- **
+The core features include:
+- **Flexible Frontend Language**: Enables easy programming of LLM applications with chained generation calls, advanced prompting, control flow, multiple modalities, parallelism, and external interactions.
+- **High-Performance Backend Runtime**: Features RadixAttention for accelerating complex LLM programs by reusing the KV cache across multiple calls. It can also serve as a standalone engine with all common techniques implemented (e.g., continuous batching and tensor parallelism).

 ## News
 - [2024/02] 🔥 SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).
@@ -276,23 +278,27 @@ The core features of SGLang include:
 ### Method 1: With pip
 ```
 pip install "sglang[all]"
+
+# Install FlashInfer CUDA kernels
+pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
 ```

 ### Method 2: From source
 ```
-git clone
+git clone https://github.com/sgl-project/sglang.git
 cd sglang

 pip install --upgrade pip
 pip install -e "python[all]"
+
+# Install FlashInfer CUDA kernels
+pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
 ```

 ### Notes
-- If you
-
-
-- If you only need to use the OpenAI backend, you can avoid installing other dependencies by using `pip install "sglang[openai]"`
-
+- If you see errors from the Triton compiler, please install the [Triton Nightly](https://triton-lang.org/main/getting-started/installation.html).
+- If you cannot install FlashInfer, check out its [installation](https://docs.flashinfer.ai/installation.html#) page. If you still cannot install it, you can use the slower Triton kernels by adding `--disable-flashinfer` when launching the server.
+- If you only need to use the OpenAI backend, you can avoid installing other dependencies by using `pip install "sglang[openai]"`.

 ## Quick Start
 The example below shows how to use sglang to answer a mulit-turn question.
@@ -603,11 +609,15 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
 ```
 python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --tp 2
 ```
+- Add `--dp 2` to enable data parallelism. It can also be used together with tp. Data parallelism is better for throughput if there is enough memory.
+```
+python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --dp 2 --tp 2
+```
 - If you see out-of-memory errors during serving, please try to reduce the memory usage of the KV cache pool by setting a smaller value of `--mem-fraction-static`. The default value is `0.9`
 ```
 python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --mem-fraction-static 0.7
 ```
--
+- See [hyperparameter_tuning.md](docs/hyperparameter_tuning.md) on tuning hyperparameters for better performance.

 ### Supported Models
 - Llama
@@ -621,6 +631,8 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
 - `python3 -m sglang.launch_server --model-path liuhaotian/llava-v1.5-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --chat-template vicuna_v1.1 --port 30000`
 - `python3 -m sglang.launch_server --model-path liuhaotian/llava-v1.6-vicuna-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --chat-template vicuna_v1.1 --port 30000`
 - `python3 -m sglang.launch_server --model-path liuhaotian/llava-v1.6-34b --tokenizer-path liuhaotian/llava-v1.6-34b-tokenizer --port 3000`
+- LLaVA-NeXT-Video
+- see [srt_example_llava_v.sh](examples/usage/llava_video/srt_example_llava_v.sh)
 - Yi-VL
 - see [srt_example_yi_vl.py](examples/quick_start/srt_example_yi_vl.py).
 - StableLM
@@ -637,17 +649,18 @@ Instructions for supporting a new model are [here](https://github.com/sgl-projec
 - Mixtral-8x7B on NVIDIA A10G, FP16, Tensor Parallelism=8
 

-Learn more [
+- Learn more about the above [results](docs/benchmark_results.md).
+- Synthetic latency and throughput benchmark [scripts](https://github.com/sgl-project/sglang/tree/main/benchmark/latency_throughput).

 ## Roadmap
 https://github.com/sgl-project/sglang/issues/157

 ## Citation And Acknowledgment
 ```
-@misc{
-title={
-author={Lianmin Zheng and Liangsheng Yin and Zhiqiang Xie and
-year={
+@misc{zheng2024sglang,
+title={SGLang: Efficient Execution of Structured Language Model Programs},
+author={Lianmin Zheng and Liangsheng Yin and Zhiqiang Xie and Chuyue Sun and Jeff Huang and Cody Hao Yu and Shiyi Cao and Christos Kozyrakis and Ion Stoica and Joseph E. Gonzalez and Clark Barrett and Ying Sheng},
+year={2024},
 eprint={2312.07104},
 archivePrefix={arXiv},
 primaryClass={cs.AI}
{sglang-0.1.16 → sglang-0.1.18}/README.md

@@ -9,9 +9,9 @@
 SGLang is a structured generation language designed for large language models (LLMs).
 It makes your interaction with LLMs faster and more controllable by co-designing the frontend language and the runtime system.

-The core features of SGLang include:
-- **
-- **
+The core features include:
+- **Flexible Frontend Language**: Enables easy programming of LLM applications with chained generation calls, advanced prompting, control flow, multiple modalities, parallelism, and external interactions.
+- **High-Performance Backend Runtime**: Features RadixAttention for accelerating complex LLM programs by reusing the KV cache across multiple calls. It can also serve as a standalone engine with all common techniques implemented (e.g., continuous batching and tensor parallelism).

 ## News
 - [2024/02] 🔥 SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).
@@ -32,23 +32,27 @@ The core features of SGLang include:
 ### Method 1: With pip
 ```
 pip install "sglang[all]"
+
+# Install FlashInfer CUDA kernels
+pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
 ```

 ### Method 2: From source
 ```
-git clone
+git clone https://github.com/sgl-project/sglang.git
 cd sglang

 pip install --upgrade pip
 pip install -e "python[all]"
+
+# Install FlashInfer CUDA kernels
+pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
 ```

 ### Notes
-- If you
-
-
-- If you only need to use the OpenAI backend, you can avoid installing other dependencies by using `pip install "sglang[openai]"`
-
+- If you see errors from the Triton compiler, please install the [Triton Nightly](https://triton-lang.org/main/getting-started/installation.html).
+- If you cannot install FlashInfer, check out its [installation](https://docs.flashinfer.ai/installation.html#) page. If you still cannot install it, you can use the slower Triton kernels by adding `--disable-flashinfer` when launching the server.
+- If you only need to use the OpenAI backend, you can avoid installing other dependencies by using `pip install "sglang[openai]"`.

 ## Quick Start
 The example below shows how to use sglang to answer a mulit-turn question.
@@ -359,11 +363,15 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
 ```
 python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --tp 2
 ```
+- Add `--dp 2` to enable data parallelism. It can also be used together with tp. Data parallelism is better for throughput if there is enough memory.
+```
+python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --dp 2 --tp 2
+```
 - If you see out-of-memory errors during serving, please try to reduce the memory usage of the KV cache pool by setting a smaller value of `--mem-fraction-static`. The default value is `0.9`
 ```
 python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --mem-fraction-static 0.7
 ```
--
+- See [hyperparameter_tuning.md](docs/hyperparameter_tuning.md) on tuning hyperparameters for better performance.

 ### Supported Models
 - Llama
@@ -377,6 +385,8 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
 - `python3 -m sglang.launch_server --model-path liuhaotian/llava-v1.5-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --chat-template vicuna_v1.1 --port 30000`
 - `python3 -m sglang.launch_server --model-path liuhaotian/llava-v1.6-vicuna-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --chat-template vicuna_v1.1 --port 30000`
 - `python3 -m sglang.launch_server --model-path liuhaotian/llava-v1.6-34b --tokenizer-path liuhaotian/llava-v1.6-34b-tokenizer --port 3000`
+- LLaVA-NeXT-Video
+- see [srt_example_llava_v.sh](examples/usage/llava_video/srt_example_llava_v.sh)
 - Yi-VL
 - see [srt_example_yi_vl.py](examples/quick_start/srt_example_yi_vl.py).
 - StableLM
@@ -393,21 +403,22 @@ Instructions for supporting a new model are [here](https://github.com/sgl-projec
 - Mixtral-8x7B on NVIDIA A10G, FP16, Tensor Parallelism=8
 

-Learn more [
+- Learn more about the above [results](docs/benchmark_results.md).
+- Synthetic latency and throughput benchmark [scripts](https://github.com/sgl-project/sglang/tree/main/benchmark/latency_throughput).

 ## Roadmap
 https://github.com/sgl-project/sglang/issues/157

 ## Citation And Acknowledgment
 ```
-@misc{
-title={
-author={Lianmin Zheng and Liangsheng Yin and Zhiqiang Xie and
-year={
+@misc{zheng2024sglang,
+title={SGLang: Efficient Execution of Structured Language Model Programs},
+author={Lianmin Zheng and Liangsheng Yin and Zhiqiang Xie and Chuyue Sun and Jeff Huang and Cody Hao Yu and Shiyi Cao and Christos Kozyrakis and Ion Stoica and Joseph E. Gonzalez and Clark Barrett and Ying Sheng},
+year={2024},
 eprint={2312.07104},
 archivePrefix={arXiv},
 primaryClass={cs.AI}
 }
 ```

-We learned from the design and reused some code of the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), [LMQL](https://github.com/eth-sri/lmql).
+We learned from the design and reused some code of the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), [LMQL](https://github.com/eth-sri/lmql).
{sglang-0.1.16 → sglang-0.1.18}/pyproject.toml

@@ -4,8 +4,8 @@ build-backend = "setuptools.build_meta"

 [project]
 name = "sglang"
-version = "0.1.16"
-description = "A structured generation langauge for LLMs."
+version = "0.1.18"
+description = "A structured generation langauge for LLMs."
 readme = "README.md"
 requires-python = ">=3.8"
 license = {file = "LICENSE"}
@@ -16,14 +16,16 @@ classifiers = [
 dependencies = [
     "requests",
     "tqdm",
+    "numpy",
 ]

 [project.optional-dependencies]
-srt = ["aiohttp", "fastapi", "
-"
-openai = ["openai>=1.0", "
-anthropic = ["anthropic>=0.20.0"
-
+srt = ["aiohttp", "fastapi", "hf_transfer", "huggingface_hub", "interegular", "packaging", "pillow",
+       "psutil", "pydantic", "rpyc", "torch", "uvicorn", "uvloop", "zmq", "vllm==0.5.0", "outlines>=0.0.44"]
+openai = ["openai>=1.0", "tiktoken"]
+anthropic = ["anthropic>=0.20.0"]
+litellm = ["litellm>=1.0.0"]
+all = ["sglang[srt]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]

 [project.urls]
 "Homepage" = "https://github.com/sgl-project/sglang"
{sglang-0.1.16 → sglang-0.1.18}/sglang/__init__.py

@@ -1,4 +1,4 @@
-__version__ = "0.1.16"
+__version__ = "0.1.18"

 # SGL API Components
 from sglang.api import (
@@ -24,6 +24,7 @@ from sglang.api import (

 # SGL Backends
 from sglang.backend.anthropic import Anthropic
+from sglang.backend.litellm import LiteLLM
 from sglang.backend.openai import OpenAI
 from sglang.backend.runtime_endpoint import RuntimeEndpoint
 from sglang.backend.vertexai import VertexAI
@@ -35,6 +36,7 @@ from sglang.global_config import global_config
 __all__ = [
     "global_config",
     "Anthropic",
+    "LiteLLM",
     "OpenAI",
     "RuntimeEndpoint",
     "VertexAI",
{sglang-0.1.16 → sglang-0.1.18}/sglang/api.py

@@ -1,4 +1,4 @@
-"""
+"""Public APIs of the language."""

 import os
 import re
@@ -20,13 +20,13 @@ from sglang.lang.ir import (


 def function(
-    func: Optional[Callable] = None,
+    func: Optional[Callable] = None, num_api_spec_tokens: Optional[int] = None
 ):
     if func:
-        return SglFunction(func,
+        return SglFunction(func, num_api_spec_tokens=num_api_spec_tokens)

     def decorator(func):
-        return SglFunction(func,
+        return SglFunction(func, num_api_spec_tokens=num_api_spec_tokens)

     return decorator

@@ -43,14 +43,14 @@ def set_default_backend(backend: BaseBackend):
     global_config.default_backend = backend


-def flush_cache(backend: BaseBackend = None):
+def flush_cache(backend: Optional[BaseBackend] = None):
     backend = backend or global_config.default_backend
     if backend is None:
         return False
     return backend.flush_cache()


-def get_server_args(backend: BaseBackend = None):
+def get_server_args(backend: Optional[BaseBackend] = None):
     backend = backend or global_config.default_backend
     if backend is None:
         return None
@@ -158,7 +158,7 @@ def video(path: str, num_frames: int):

 def select(
     name: Optional[str] = None,
-    choices: List[str] = None,
+    choices: Optional[List[str]] = None,
     temperature: float = 0.0,
 ):
     assert choices is not None
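The only functional change in `sglang/api.py` is the new `num_api_spec_tokens` argument threaded through `sgl.function` into `SglFunction`. A minimal sketch of how a program might pass it is shown below; the decorator signature comes from the diff above, but the prompt, the model, and the idea that the argument controls speculative generation against API backends are assumptions rather than documented behavior.

```python
# Hedged sketch: passing the new num_api_spec_tokens argument (from the diff above).
# The prompt and backend are placeholders; what the backend does with the value
# (presumably speculative API generation) is an assumption here.
import sglang as sgl

@sgl.function(num_api_spec_tokens=64)
def short_answer(s, question):
    s += "Question: " + question + "\n"
    s += "Answer: " + sgl.gen("answer", max_tokens=32)

# short_answer.run(question="What is RadixAttention?", backend=sgl.OpenAI("gpt-3.5-turbo"))
```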
sglang-0.1.18/sglang/backend/litellm.py (new file)

@@ -0,0 +1,90 @@
+from typing import Mapping, Optional
+
+from sglang.backend.base_backend import BaseBackend
+from sglang.lang.chat_template import get_chat_template_by_model_path
+from sglang.lang.interpreter import StreamExecutor
+from sglang.lang.ir import SglSamplingParams
+
+try:
+    import litellm
+except ImportError as e:
+    litellm = e
+litellm.num_retries = 1
+
+
+class LiteLLM(BaseBackend):
+    def __init__(
+        self,
+        model_name,
+        chat_template=None,
+        api_key=None,
+        organization: Optional[str] = None,
+        base_url: Optional[str] = None,
+        timeout: Optional[float] = 600,
+        max_retries: Optional[int] = litellm.num_retries,
+        default_headers: Optional[Mapping[str, str]] = None,
+    ):
+        super().__init__()
+
+        if isinstance(litellm, Exception):
+            raise litellm
+
+        self.model_name = model_name
+
+        self.chat_template = chat_template or get_chat_template_by_model_path(
+            model_name
+        )
+
+        self.client_params = {
+            "api_key": api_key,
+            "organization": organization,
+            "base_url": base_url,
+            "timeout": timeout,
+            "max_retries": max_retries,
+            "default_headers": default_headers,
+        }
+
+    def get_chat_template(self):
+        return self.chat_template
+
+    def generate(
+        self,
+        s: StreamExecutor,
+        sampling_params: SglSamplingParams,
+    ):
+        if s.messages_:
+            messages = s.messages_
+        else:
+            messages = [{"role": "user", "content": s.text_}]
+
+        ret = litellm.completion(
+            model=self.model_name,
+            messages=messages,
+            **self.client_params,
+            **sampling_params.to_anthropic_kwargs(),
+        )
+        comp = ret.choices[0].message.content
+
+        return comp, {}
+
+    def generate_stream(
+        self,
+        s: StreamExecutor,
+        sampling_params: SglSamplingParams,
+    ):
+        if s.messages_:
+            messages = s.messages_
+        else:
+            messages = [{"role": "user", "content": s.text_}]
+
+        ret = litellm.completion(
+            model=self.model_name,
+            messages=messages,
+            stream=True,
+            **self.client_params,
+            **sampling_params.to_litellm_kwargs(),
+        )
+        for chunk in ret:
+            text = chunk.choices[0].delta.content
+            if text is not None:
+                yield text, {}
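For orientation, here is a minimal usage sketch (not part of the diff) that drives the new LiteLLM backend through the frontend exports added in `sglang/__init__.py`. The model name is a placeholder, and provider credentials (e.g., `OPENAI_API_KEY` for an OpenAI-backed LiteLLM route) are assumed to be configured in the environment.

```python
# Hedged sketch: using the new LiteLLM backend added in 0.1.18.
# "gpt-3.5-turbo" is a placeholder model; any LiteLLM-supported provider
# with credentials in the environment should work the same way.
import sglang as sgl

sgl.set_default_backend(sgl.LiteLLM("gpt-3.5-turbo"))

@sgl.function
def qa(s, question):
    s += "Q: " + question + "\n"
    s += "A: " + sgl.gen("answer", max_tokens=64, temperature=0)

state = qa.run(question="What does the srt extra install?")
print(state["answer"])
```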