sglang 0.1.17__tar.gz → 0.1.19__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sglang-0.1.17/sglang.egg-info → sglang-0.1.19}/PKG-INFO +44 -31
- {sglang-0.1.17 → sglang-0.1.19}/README.md +33 -19
- {sglang-0.1.17 → sglang-0.1.19}/pyproject.toml +6 -5
- {sglang-0.1.17 → sglang-0.1.19}/sglang/__init__.py +2 -2
- {sglang-0.1.17 → sglang-0.1.19}/sglang/api.py +30 -4
- {sglang-0.1.17 → sglang-0.1.19}/sglang/backend/litellm.py +2 -2
- {sglang-0.1.17 → sglang-0.1.19}/sglang/backend/openai.py +26 -15
- {sglang-0.1.17 → sglang-0.1.19}/sglang/backend/runtime_endpoint.py +18 -14
- sglang-0.1.19/sglang/bench_latency.py +317 -0
- {sglang-0.1.17 → sglang-0.1.19}/sglang/global_config.py +5 -1
- {sglang-0.1.17 → sglang-0.1.19}/sglang/lang/chat_template.py +41 -6
- {sglang-0.1.17 → sglang-0.1.19}/sglang/lang/compiler.py +2 -2
- {sglang-0.1.17 → sglang-0.1.19}/sglang/lang/interpreter.py +6 -2
- {sglang-0.1.17 → sglang-0.1.19}/sglang/lang/ir.py +74 -28
- {sglang-0.1.17 → sglang-0.1.19}/sglang/launch_server.py +4 -1
- {sglang-0.1.17 → sglang-0.1.19}/sglang/launch_server_llavavid.py +2 -1
- {sglang-0.1.17 → sglang-0.1.19}/sglang/srt/constrained/__init__.py +14 -6
- {sglang-0.1.17 → sglang-0.1.19}/sglang/srt/constrained/fsm_cache.py +6 -3
- sglang-0.1.19/sglang/srt/constrained/jump_forward.py +164 -0
- {sglang-0.1.17 → sglang-0.1.19}/sglang/srt/conversation.py +2 -0
- {sglang-0.1.17 → sglang-0.1.19}/sglang/srt/flush_cache.py +2 -0
- {sglang-0.1.17 → sglang-0.1.19}/sglang/srt/hf_transformers_utils.py +68 -9
- {sglang-0.1.17 → sglang-0.1.19}/sglang/srt/layers/extend_attention.py +2 -1
- {sglang-0.1.17 → sglang-0.1.19}/sglang/srt/layers/fused_moe.py +280 -169
- {sglang-0.1.17 → sglang-0.1.19}/sglang/srt/layers/logits_processor.py +106 -42
- {sglang-0.1.17 → sglang-0.1.19}/sglang/srt/layers/radix_attention.py +53 -29
- {sglang-0.1.17 → sglang-0.1.19}/sglang/srt/layers/token_attention.py +4 -1
- {sglang-0.1.17 → sglang-0.1.19}/sglang/srt/managers/controller/dp_worker.py +6 -3
- {sglang-0.1.17 → sglang-0.1.19}/sglang/srt/managers/controller/infer_batch.py +144 -69
- {sglang-0.1.17 → sglang-0.1.19}/sglang/srt/managers/controller/manager_multi.py +5 -5
- {sglang-0.1.17 → sglang-0.1.19}/sglang/srt/managers/controller/manager_single.py +9 -4
- {sglang-0.1.17 → sglang-0.1.19}/sglang/srt/managers/controller/model_runner.py +167 -55
- {sglang-0.1.17 → sglang-0.1.19}/sglang/srt/managers/controller/radix_cache.py +4 -0
- {sglang-0.1.17 → sglang-0.1.19}/sglang/srt/managers/controller/schedule_heuristic.py +2 -0
- {sglang-0.1.17 → sglang-0.1.19}/sglang/srt/managers/controller/tp_worker.py +156 -134
- {sglang-0.1.17 → sglang-0.1.19}/sglang/srt/managers/detokenizer_manager.py +19 -21
- {sglang-0.1.17 → sglang-0.1.19}/sglang/srt/managers/io_struct.py +11 -5
- {sglang-0.1.17 → sglang-0.1.19}/sglang/srt/managers/tokenizer_manager.py +16 -14
- sglang-0.1.19/sglang/srt/model_config.py +131 -0
- sglang-0.1.19/sglang/srt/models/chatglm.py +399 -0
- {sglang-0.1.17 → sglang-0.1.19}/sglang/srt/models/commandr.py +2 -2
- {sglang-0.1.17 → sglang-0.1.19}/sglang/srt/models/gemma.py +5 -1
- sglang-0.1.19/sglang/srt/models/gemma2.py +436 -0
- {sglang-0.1.17 → sglang-0.1.19}/sglang/srt/models/grok.py +204 -137
- {sglang-0.1.17 → sglang-0.1.19}/sglang/srt/models/llama2.py +12 -5
- sglang-0.1.19/sglang/srt/models/llama_classification.py +107 -0
- {sglang-0.1.17 → sglang-0.1.19}/sglang/srt/models/llava.py +11 -8
- {sglang-0.1.17 → sglang-0.1.19}/sglang/srt/models/llavavid.py +1 -1
- sglang-0.1.19/sglang/srt/models/minicpm.py +373 -0
- {sglang-0.1.17 → sglang-0.1.19}/sglang/srt/models/mixtral.py +164 -115
- {sglang-0.1.17 → sglang-0.1.19}/sglang/srt/models/mixtral_quant.py +0 -1
- {sglang-0.1.17 → sglang-0.1.19}/sglang/srt/models/qwen.py +1 -1
- {sglang-0.1.17 → sglang-0.1.19}/sglang/srt/models/qwen2.py +1 -1
- sglang-0.1.19/sglang/srt/models/qwen2_moe.py +454 -0
- {sglang-0.1.17 → sglang-0.1.19}/sglang/srt/models/stablelm.py +1 -1
- {sglang-0.1.17 → sglang-0.1.19}/sglang/srt/models/yivl.py +2 -2
- {sglang-0.1.17 → sglang-0.1.19}/sglang/srt/openai_api_adapter.py +35 -25
- {sglang-0.1.17 → sglang-0.1.19}/sglang/srt/openai_protocol.py +2 -2
- {sglang-0.1.17 → sglang-0.1.19}/sglang/srt/server.py +69 -19
- {sglang-0.1.17 → sglang-0.1.19}/sglang/srt/server_args.py +76 -43
- {sglang-0.1.17 → sglang-0.1.19}/sglang/srt/utils.py +177 -35
- {sglang-0.1.17 → sglang-0.1.19}/sglang/test/test_programs.py +28 -10
- {sglang-0.1.17 → sglang-0.1.19}/sglang/utils.py +4 -3
- {sglang-0.1.17 → sglang-0.1.19/sglang.egg-info}/PKG-INFO +44 -31
- {sglang-0.1.17 → sglang-0.1.19}/sglang.egg-info/SOURCES.txt +6 -6
- {sglang-0.1.17 → sglang-0.1.19}/sglang.egg-info/requires.txt +10 -11
- sglang-0.1.17/sglang/srt/constrained/jump_forward.py +0 -76
- sglang-0.1.17/sglang/srt/managers/router/infer_batch.py +0 -596
- sglang-0.1.17/sglang/srt/managers/router/manager.py +0 -82
- sglang-0.1.17/sglang/srt/managers/router/model_rpc.py +0 -818
- sglang-0.1.17/sglang/srt/managers/router/model_runner.py +0 -445
- sglang-0.1.17/sglang/srt/managers/router/radix_cache.py +0 -267
- sglang-0.1.17/sglang/srt/managers/router/scheduler.py +0 -59
- sglang-0.1.17/sglang/srt/model_config.py +0 -46
- {sglang-0.1.17 → sglang-0.1.19}/LICENSE +0 -0
- {sglang-0.1.17 → sglang-0.1.19}/setup.cfg +0 -0
- {sglang-0.1.17 → sglang-0.1.19}/sglang/backend/__init__.py +0 -0
- {sglang-0.1.17 → sglang-0.1.19}/sglang/backend/anthropic.py +0 -0
- {sglang-0.1.17 → sglang-0.1.19}/sglang/backend/base_backend.py +0 -0
- {sglang-0.1.17 → sglang-0.1.19}/sglang/backend/vertexai.py +0 -0
- {sglang-0.1.17 → sglang-0.1.19}/sglang/lang/__init__.py +0 -0
- {sglang-0.1.17 → sglang-0.1.19}/sglang/lang/tracer.py +0 -0
- {sglang-0.1.17 → sglang-0.1.19}/sglang/srt/constrained/base_cache.py +0 -0
- {sglang-0.1.17 → sglang-0.1.19}/sglang/srt/layers/context_flashattention_nopad.py +0 -0
- {sglang-0.1.17 → sglang-0.1.19}/sglang/srt/memory_pool.py +0 -0
- {sglang-0.1.17 → sglang-0.1.19}/sglang/srt/mm_utils.py +0 -0
- {sglang-0.1.17 → sglang-0.1.19}/sglang/srt/models/dbrx.py +1 -1
- {sglang-0.1.17 → sglang-0.1.19}/sglang/srt/models/mistral.py +0 -0
- {sglang-0.1.17 → sglang-0.1.19}/sglang/srt/sampling_params.py +0 -0
- {sglang-0.1.17 → sglang-0.1.19}/sglang/test/test_conversation.py +0 -0
- {sglang-0.1.17 → sglang-0.1.19}/sglang/test/test_openai_protocol.py +0 -0
- {sglang-0.1.17 → sglang-0.1.19}/sglang/test/test_utils.py +0 -0
- {sglang-0.1.17 → sglang-0.1.19}/sglang.egg-info/dependency_links.txt +0 -0
- {sglang-0.1.17 → sglang-0.1.19}/sglang.egg-info/top_level.txt +0 -0
{sglang-0.1.17/sglang.egg-info → sglang-0.1.19}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.1.17
+Version: 0.1.19
 Summary: A structured generation langauge for LLMs.
 License: Apache License
 Version 2.0, January 2004
@@ -213,30 +213,29 @@ Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: requests
 Requires-Dist: tqdm
+Requires-Dist: numpy
 Provides-Extra: srt
 Requires-Dist: aiohttp; extra == "srt"
 Requires-Dist: fastapi; extra == "srt"
+Requires-Dist: hf_transfer; extra == "srt"
+Requires-Dist: huggingface_hub; extra == "srt"
+Requires-Dist: interegular; extra == "srt"
+Requires-Dist: packaging; extra == "srt"
+Requires-Dist: pillow; extra == "srt"
 Requires-Dist: psutil; extra == "srt"
+Requires-Dist: pydantic; extra == "srt"
 Requires-Dist: rpyc; extra == "srt"
 Requires-Dist: torch; extra == "srt"
-Requires-Dist: uvloop; extra == "srt"
 Requires-Dist: uvicorn; extra == "srt"
+Requires-Dist: uvloop; extra == "srt"
 Requires-Dist: zmq; extra == "srt"
-Requires-Dist: vllm==0.
-Requires-Dist:
-Requires-Dist: pydantic; extra == "srt"
-Requires-Dist: pillow; extra == "srt"
-Requires-Dist: packaging; extra == "srt"
-Requires-Dist: huggingface_hub; extra == "srt"
-Requires-Dist: hf_transfer; extra == "srt"
-Requires-Dist: outlines>=0.0.34; extra == "srt"
+Requires-Dist: vllm==0.5.1; extra == "srt"
+Requires-Dist: outlines>=0.0.44; extra == "srt"
 Provides-Extra: openai
 Requires-Dist: openai>=1.0; extra == "openai"
-Requires-Dist: numpy; extra == "openai"
 Requires-Dist: tiktoken; extra == "openai"
 Provides-Extra: anthropic
 Requires-Dist: anthropic>=0.20.0; extra == "anthropic"
-Requires-Dist: numpy; extra == "anthropic"
 Provides-Extra: litellm
 Requires-Dist: litellm>=1.0.0; extra == "litellm"
 Provides-Extra: all
@@ -257,8 +256,8 @@ SGLang is a structured generation language designed for large language models (L
 It makes your interaction with LLMs faster and more controllable by co-designing the frontend language and the runtime system.
 
 The core features include:
-- **
-- **
+- **Flexible Frontend Language**: Enables easy programming of LLM applications with chained generation calls, advanced prompting, control flow, multiple modalities, parallelism, and external interactions.
+- **High-Performance Backend Runtime**: Features RadixAttention for accelerating complex LLM programs by reusing the KV cache across multiple calls. It can also serve as a standalone inference engine with all common techniques implemented (e.g., continuous batching and tensor parallelism).
 
 ## News
 - [2024/02] 🔥 SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).
@@ -279,19 +278,33 @@ The core features include:
 ### Method 1: With pip
 ```
 pip install "sglang[all]"
+
+# Install FlashInfer CUDA kernels
+pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
 ```
 
 ### Method 2: From source
 ```
-git clone
+git clone https://github.com/sgl-project/sglang.git
 cd sglang
 
-pip install --upgrade pip
 pip install -e "python[all]"
+
+# Install FlashInfer CUDA kernels
+pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
 ```
 
-###
-
+### Method 3: Using docker
+The docker images are available on Docker Hub as [lmsysorg/sglang](https://hub.docker.com/r/lmsysorg/sglang/tags).
+
+### Common Notes
+- If you see errors from the Triton compiler, please install the [Triton Nightly](https://triton-lang.org/main/getting-started/installation.html) by
+```
+pip uninstall -y triton triton-nightly
+pip install -U --index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/Triton-Nightly/pypi/simple/ triton-nightly
+```
+- If you cannot install FlashInfer, check out its [installation](https://docs.flashinfer.ai/installation.html#) page. If you still cannot install it, you can use the slower Triton kernels by adding `--disable-flashinfer` when launching the server.
+- If you only need to use the OpenAI backend, you can avoid installing other dependencies by using `pip install "sglang[openai]"`.
 
 ## Quick Start
 The example below shows how to use sglang to answer a mulit-turn question.
@@ -511,8 +524,8 @@ for out in state.text_iter():
 ```
 
 ### Tips and Implementation Details
-- The `choices` argument in `sgl.gen` is implemented by computing the normalized log probabilities of all choices and selecting the one with the highest probability.
-- The `regex` argument in `sgl.gen` is implemented through autoregressive decoding with logit bias masking, according to the constraints set by the regex.
+- The `choices` argument in `sgl.gen` is implemented by computing the [token-length normalized log probabilities](https://blog.eleuther.ai/multiple-choice-normalization/) of all choices and selecting the one with the highest probability.
+- The `regex` argument in `sgl.gen` is implemented through autoregressive decoding with logit bias masking, according to the constraints set by the regex. It is compatible with `temperature=0` and `temperature != 0`.
 
 ## Backend: SGLang Runtime (SRT)
 The SGLang Runtime (SRT) is designed to work best with the SGLang frontend.
@@ -569,7 +582,6 @@ response = client.chat.completions.create(
 print(response)
 ```
 
-
 By default, the server uses the chat template specified in the model tokenizer from Hugging Face. It should just work for most official models such as Llama-2/Llama-3.
 
 If needed, you can also override the chat template when launching the server:
@@ -598,7 +610,7 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
 ```
 
 ### Additional Arguments
-- Add `--tp 2` to enable tensor parallelism.
+- Add `--tp 2` to enable tensor parallelism. If it indicates `peer access is not supported between these two devices`, add `--enable-p2p-check` option.
 ```
 python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --tp 2
 ```
@@ -610,16 +622,14 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
 ```
 python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --mem-fraction-static 0.7
 ```
-- See [flashinfer.md](docs/flashinfer.md) on accelerating inference using highly optimized CUDA kernels.
 - See [hyperparameter_tuning.md](docs/hyperparameter_tuning.md) on tuning hyperparameters for better performance.
 
 ### Supported Models
 - Llama
 - Mistral
 - Mixtral
-- Qwen / Qwen 2
-- Gemma
-  - Please add a new flag `--attention-reduce-in-fp32` to avoid some precision errors.
+- Qwen / Qwen 2 / Qwen 2 MoE
+- Gemma / Gemma 2
   - `python -m sglang.launch_server --model-path google/gemma-7b-it --port 30000 --attention-reduce-in-fp32`
 - LLaVA
   - `python3 -m sglang.launch_server --model-path liuhaotian/llava-v1.5-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --chat-template vicuna_v1.1 --port 30000`
@@ -632,6 +642,8 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
 - StableLM
 - Command-R
 - DBRX
+- Grok
+- ChatGLM
 - AWQ/GPTQ/Marlin quantization
 
 Instructions for supporting a new model are [here](https://github.com/sgl-project/sglang/blob/main/docs/model_support.md).
@@ -643,17 +655,18 @@ Instructions for supporting a new model are [here](https://github.com/sgl-projec
 - Mixtral-8x7B on NVIDIA A10G, FP16, Tensor Parallelism=8
 
 
-Learn more [
+- Learn more about the above [results](docs/benchmark_results.md).
+- Synthetic latency and throughput benchmark [scripts](https://github.com/sgl-project/sglang/tree/main/benchmark/latency_throughput).
 
 ## Roadmap
 https://github.com/sgl-project/sglang/issues/157
 
 ## Citation And Acknowledgment
 ```
-@misc{
-title={
-author={Lianmin Zheng and Liangsheng Yin and Zhiqiang Xie and
-year={
+@misc{zheng2024sglang,
+title={SGLang: Efficient Execution of Structured Language Model Programs},
+author={Lianmin Zheng and Liangsheng Yin and Zhiqiang Xie and Chuyue Sun and Jeff Huang and Cody Hao Yu and Shiyi Cao and Christos Kozyrakis and Ion Stoica and Joseph E. Gonzalez and Clark Barrett and Ying Sheng},
+year={2024},
 eprint={2312.07104},
 archivePrefix={arXiv},
 primaryClass={cs.AI}
{sglang-0.1.17 → sglang-0.1.19}/README.md

@@ -10,8 +10,8 @@ SGLang is a structured generation language designed for large language models (L
 It makes your interaction with LLMs faster and more controllable by co-designing the frontend language and the runtime system.
 
 The core features include:
-- **
-- **
+- **Flexible Frontend Language**: Enables easy programming of LLM applications with chained generation calls, advanced prompting, control flow, multiple modalities, parallelism, and external interactions.
+- **High-Performance Backend Runtime**: Features RadixAttention for accelerating complex LLM programs by reusing the KV cache across multiple calls. It can also serve as a standalone inference engine with all common techniques implemented (e.g., continuous batching and tensor parallelism).
 
 ## News
 - [2024/02] 🔥 SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).
@@ -32,19 +32,33 @@ The core features include:
 ### Method 1: With pip
 ```
 pip install "sglang[all]"
+
+# Install FlashInfer CUDA kernels
+pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
 ```
 
 ### Method 2: From source
 ```
-git clone
+git clone https://github.com/sgl-project/sglang.git
 cd sglang
 
-pip install --upgrade pip
 pip install -e "python[all]"
+
+# Install FlashInfer CUDA kernels
+pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
 ```
 
-###
-
+### Method 3: Using docker
+The docker images are available on Docker Hub as [lmsysorg/sglang](https://hub.docker.com/r/lmsysorg/sglang/tags).
+
+### Common Notes
+- If you see errors from the Triton compiler, please install the [Triton Nightly](https://triton-lang.org/main/getting-started/installation.html) by
+```
+pip uninstall -y triton triton-nightly
+pip install -U --index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/Triton-Nightly/pypi/simple/ triton-nightly
+```
+- If you cannot install FlashInfer, check out its [installation](https://docs.flashinfer.ai/installation.html#) page. If you still cannot install it, you can use the slower Triton kernels by adding `--disable-flashinfer` when launching the server.
+- If you only need to use the OpenAI backend, you can avoid installing other dependencies by using `pip install "sglang[openai]"`.
 
 ## Quick Start
 The example below shows how to use sglang to answer a mulit-turn question.
@@ -264,8 +278,8 @@ for out in state.text_iter():
 ```
 
 ### Tips and Implementation Details
-- The `choices` argument in `sgl.gen` is implemented by computing the normalized log probabilities of all choices and selecting the one with the highest probability.
-- The `regex` argument in `sgl.gen` is implemented through autoregressive decoding with logit bias masking, according to the constraints set by the regex.
+- The `choices` argument in `sgl.gen` is implemented by computing the [token-length normalized log probabilities](https://blog.eleuther.ai/multiple-choice-normalization/) of all choices and selecting the one with the highest probability.
+- The `regex` argument in `sgl.gen` is implemented through autoregressive decoding with logit bias masking, according to the constraints set by the regex. It is compatible with `temperature=0` and `temperature != 0`.
 
 ## Backend: SGLang Runtime (SRT)
 The SGLang Runtime (SRT) is designed to work best with the SGLang frontend.
@@ -322,7 +336,6 @@ response = client.chat.completions.create(
 print(response)
 ```
 
-
 By default, the server uses the chat template specified in the model tokenizer from Hugging Face. It should just work for most official models such as Llama-2/Llama-3.
 
 If needed, you can also override the chat template when launching the server:
@@ -351,7 +364,7 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
 ```
 
 ### Additional Arguments
-- Add `--tp 2` to enable tensor parallelism.
+- Add `--tp 2` to enable tensor parallelism. If it indicates `peer access is not supported between these two devices`, add `--enable-p2p-check` option.
 ```
 python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --tp 2
 ```
@@ -363,16 +376,14 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
 ```
 python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --mem-fraction-static 0.7
 ```
-- See [flashinfer.md](docs/flashinfer.md) on accelerating inference using highly optimized CUDA kernels.
 - See [hyperparameter_tuning.md](docs/hyperparameter_tuning.md) on tuning hyperparameters for better performance.
 
 ### Supported Models
 - Llama
 - Mistral
 - Mixtral
-- Qwen / Qwen 2
-- Gemma
-  - Please add a new flag `--attention-reduce-in-fp32` to avoid some precision errors.
+- Qwen / Qwen 2 / Qwen 2 MoE
+- Gemma / Gemma 2
   - `python -m sglang.launch_server --model-path google/gemma-7b-it --port 30000 --attention-reduce-in-fp32`
 - LLaVA
   - `python3 -m sglang.launch_server --model-path liuhaotian/llava-v1.5-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --chat-template vicuna_v1.1 --port 30000`
@@ -385,6 +396,8 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
 - StableLM
 - Command-R
 - DBRX
+- Grok
+- ChatGLM
 - AWQ/GPTQ/Marlin quantization
 
 Instructions for supporting a new model are [here](https://github.com/sgl-project/sglang/blob/main/docs/model_support.md).
@@ -396,17 +409,18 @@ Instructions for supporting a new model are [here](https://github.com/sgl-projec
 - Mixtral-8x7B on NVIDIA A10G, FP16, Tensor Parallelism=8
 
 
-Learn more [
+- Learn more about the above [results](docs/benchmark_results.md).
+- Synthetic latency and throughput benchmark [scripts](https://github.com/sgl-project/sglang/tree/main/benchmark/latency_throughput).
 
 ## Roadmap
 https://github.com/sgl-project/sglang/issues/157
 
 ## Citation And Acknowledgment
 ```
-@misc{
-title={
-author={Lianmin Zheng and Liangsheng Yin and Zhiqiang Xie and
-year={
+@misc{zheng2024sglang,
+title={SGLang: Efficient Execution of Structured Language Model Programs},
+author={Lianmin Zheng and Liangsheng Yin and Zhiqiang Xie and Chuyue Sun and Jeff Huang and Cody Hao Yu and Shiyi Cao and Christos Kozyrakis and Ion Stoica and Joseph E. Gonzalez and Clark Barrett and Ying Sheng},
+year={2024},
 eprint={2312.07104},
 archivePrefix={arXiv},
 primaryClass={cs.AI}
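The `choices` and `regex` behaviors described in the Tips hunk above map directly onto the frontend language. Below is a minimal sketch; the endpoint URL, question, and regex are illustrative placeholders, not taken from this diff.

```python
import sglang as sgl

# Assumes an SRT server was launched locally, e.g. with sglang.launch_server.
sgl.set_default_backend(sgl.RuntimeEndpoint("http://localhost:30000"))


@sgl.function
def tool_use(s, question):
    s += "Question: " + question + "\n"
    # `choices` picks the option with the highest normalized log probability.
    s += "Tool: " + sgl.gen("tool", choices=["calculator", "search engine"]) + "\n"
    # `regex` constrains decoding via logit bias masking.
    s += "Answer: " + sgl.gen("answer", max_tokens=16, regex=r"[0-9]+")


state = tool_use.run(question="What is 2 + 2?")
print(state["tool"], state["answer"])
```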
{sglang-0.1.17 → sglang-0.1.19}/pyproject.toml

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "sglang"
-version = "0.1.17"
+version = "0.1.19"
 description = "A structured generation langauge for LLMs."
 readme = "README.md"
 requires-python = ">=3.8"
@@ -16,13 +16,14 @@ classifiers = [
 dependencies = [
     "requests",
     "tqdm",
+    "numpy",
 ]
 
 [project.optional-dependencies]
-srt = ["aiohttp", "fastapi", "
-"
-openai = ["openai>=1.0", "
-anthropic = ["anthropic>=0.20.0"
+srt = ["aiohttp", "fastapi", "hf_transfer", "huggingface_hub", "interegular", "packaging", "pillow",
+       "psutil", "pydantic", "rpyc", "torch", "uvicorn", "uvloop", "zmq", "vllm==0.5.1", "outlines>=0.0.44"]
+openai = ["openai>=1.0", "tiktoken"]
+anthropic = ["anthropic>=0.20.0"]
 litellm = ["litellm>=1.0.0"]
 all = ["sglang[srt]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
 
{sglang-0.1.17 → sglang-0.1.19}/sglang/__init__.py

@@ -1,4 +1,4 @@
-__version__ = "0.1.17"
+__version__ = "0.1.19"
 
 # SGL API Components
 from sglang.api import (
@@ -24,10 +24,10 @@ from sglang.api import (
 
 # SGL Backends
 from sglang.backend.anthropic import Anthropic
+from sglang.backend.litellm import LiteLLM
 from sglang.backend.openai import OpenAI
 from sglang.backend.runtime_endpoint import RuntimeEndpoint
 from sglang.backend.vertexai import VertexAI
-from sglang.backend.litellm import LiteLLM
 
 # Global Configurations
 from sglang.global_config import global_config
{sglang-0.1.17 → sglang-0.1.19}/sglang/api.py

@@ -1,4 +1,4 @@
-"""
+"""Public APIs of the language."""
 
 import os
 import re
@@ -43,14 +43,14 @@ def set_default_backend(backend: BaseBackend):
     global_config.default_backend = backend
 
 
-def flush_cache(backend: BaseBackend = None):
+def flush_cache(backend: Optional[BaseBackend] = None):
     backend = backend or global_config.default_backend
     if backend is None:
         return False
     return backend.flush_cache()
 
 
-def get_server_args(backend: BaseBackend = None):
+def get_server_args(backend: Optional[BaseBackend] = None):
     backend = backend or global_config.default_backend
     if backend is None:
         return None
@@ -67,10 +67,16 @@ def gen(
     frequency_penalty: Optional[float] = None,
     presence_penalty: Optional[float] = None,
     ignore_eos: Optional[bool] = None,
+    return_logprob: Optional[bool] = None,
+    logprob_start_len: Optional[int] = None,
+    top_logprobs_num: Optional[int] = None,
+    return_text_in_logprobs: Optional[bool] = None,
     dtype: Optional[type] = None,
     choices: Optional[List[str]] = None,
     regex: Optional[str] = None,
 ):
+    """Call the model to generate. See the meaning of the arguments in docs/sampling_params.md"""
+
     if choices:
         return SglSelect(name, choices, 0.0 if temperature is None else temperature)
 
@@ -91,6 +97,10 @@ def gen(
         frequency_penalty,
         presence_penalty,
         ignore_eos,
+        return_logprob,
+        logprob_start_len,
+        top_logprobs_num,
+        return_text_in_logprobs,
         dtype,
         regex,
     )
@@ -106,6 +116,10 @@ def gen_int(
     frequency_penalty: Optional[float] = None,
     presence_penalty: Optional[float] = None,
     ignore_eos: Optional[bool] = None,
+    return_logprob: Optional[bool] = None,
+    logprob_start_len: Optional[int] = None,
+    top_logprobs_num: Optional[int] = None,
+    return_text_in_logprobs: Optional[bool] = None,
 ):
     return SglGen(
         name,
@@ -117,6 +131,10 @@ def gen_int(
         frequency_penalty,
         presence_penalty,
         ignore_eos,
+        return_logprob,
+        logprob_start_len,
+        top_logprobs_num,
+        return_text_in_logprobs,
         int,
         None,
     )
@@ -132,6 +150,10 @@ def gen_string(
     frequency_penalty: Optional[float] = None,
     presence_penalty: Optional[float] = None,
     ignore_eos: Optional[bool] = None,
+    return_logprob: Optional[bool] = None,
+    logprob_start_len: Optional[int] = None,
+    top_logprobs_num: Optional[int] = None,
+    return_text_in_logprobs: Optional[bool] = None,
 ):
     return SglGen(
         name,
@@ -143,6 +165,10 @@ def gen_string(
         frequency_penalty,
         presence_penalty,
         ignore_eos,
+        return_logprob,
+        logprob_start_len,
+        top_logprobs_num,
+        return_text_in_logprobs,
         str,
         None,
     )
@@ -158,7 +184,7 @@ def video(path: str, num_frames: int):
 
 def select(
     name: Optional[str] = None,
-    choices: List[str] = None,
+    choices: Optional[List[str]] = None,
     temperature: float = 0.0,
 ):
     assert choices is not None
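The four new logprob-related arguments added to `gen`, `gen_int`, and `gen_string` above are plain per-call options. A minimal sketch of requesting them follows; the endpoint URL and prompt are placeholders, and how the returned log probabilities are surfaced is handled by the RuntimeEndpoint changes further down in this diff.

```python
import sglang as sgl

sgl.set_default_backend(sgl.RuntimeEndpoint("http://localhost:30000"))


@sgl.function
def answer(s, question):
    s += "Q: " + question + "\nA:"
    # The new arguments are forwarded to the runtime together with the
    # other sampling parameters (see the runtime_endpoint.py hunks below).
    s += sgl.gen(
        "answer",
        max_tokens=32,
        return_logprob=True,
        logprob_start_len=0,
        top_logprobs_num=5,
        return_text_in_logprobs=True,
    )


state = answer.run(question="What is the capital of France?")
print(state["answer"])
```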
{sglang-0.1.17 → sglang-0.1.19}/sglang/backend/litellm.py

@@ -13,7 +13,6 @@ except ImportError as e:
 
 
 class LiteLLM(BaseBackend):
-
     def __init__(
         self,
         model_name,
@@ -33,7 +32,8 @@ class LiteLLM(BaseBackend):
         self.model_name = model_name
 
         self.chat_template = chat_template or get_chat_template_by_model_path(
-            model_name)
+            model_name
+        )
 
         self.client_params = {
             "api_key": api_key,
{sglang-0.1.17 → sglang-0.1.19}/sglang/backend/openai.py

@@ -1,7 +1,7 @@
+import dataclasses
 import logging
 import time
 import warnings
-import dataclasses
 from typing import Callable, List, Optional, Union
 
 import numpy as np
@@ -105,14 +105,16 @@ class OpenAI(BaseBackend):
     def get_chat_template(self):
         return self.chat_template
 
-    def _prepare_spec_execution(
-
+    def _prepare_spec_execution(
+        self,
+        sampling_params: SglSamplingParams,
+        num_api_spec_tokens: int,
+        spec_var_name: str,
+    ):
         if "max_tokens" not in self.spec_kwargs:
             self.spec_kwargs["max_tokens"] = num_api_spec_tokens
         else:
-            assert (
-                self.spec_kwargs["max_tokens"] == num_api_spec_tokens
-            )
+            assert self.spec_kwargs["max_tokens"] == num_api_spec_tokens
 
         params = sampling_params.to_openai_kwargs()
         for key, value in params.items():
@@ -151,8 +153,9 @@ class OpenAI(BaseBackend):
                 )
                 prompt = s.messages_
             else:
-                return self._prepare_spec_execution(
-                    s.num_api_spec_tokens, spec_var_name
+                return self._prepare_spec_execution(
+                    sampling_params, s.num_api_spec_tokens, spec_var_name
+                )
         else:
             prompt = s.text_
 
@@ -325,7 +328,7 @@ class OpenAI(BaseBackend):
         ret_str = ret.choices[0].text
         ret_token = self.tokenizer.encode(ret_str)[0]
         self.token_usage.prompt_tokens += ret.usage.prompt_tokens
-        self.token_usage.completion_tokens= ret.usage.completion_tokens
+        self.token_usage.completion_tokens = ret.usage.completion_tokens
 
         # TODO:
         # 1. return logits as the scores
@@ -355,7 +358,9 @@ class OpenAI(BaseBackend):
     return decision, scores, None, None
 
 
-def openai_completion(
+def openai_completion(
+    client, token_usage, is_chat=None, retries=3, prompt=None, **kwargs
+):
     for attempt in range(retries):
         try:
             if is_chat:
@@ -385,15 +390,19 @@ def openai_completion(client, token_usage, is_chat=None, retries=3, prompt=None,
             return comp
 
 
-def openai_completion_stream(
+def openai_completion_stream(
+    client, token_usage, is_chat=None, retries=3, prompt=None, **kwargs
+):
     for attempt in range(retries):
         try:
             if is_chat:
                 if "stop" in kwargs and kwargs["stop"] is None:
                     kwargs.pop("stop")
                 generator = client.chat.completions.create(
-                    messages=prompt,
-
+                    messages=prompt,
+                    stream=True,
+                    stream_options={"include_usage": True},
+                    **kwargs,
                 )
                 for ret in generator:
                     if len(ret.choices) == 0:
@@ -405,8 +414,10 @@ def openai_completion_stream(client, token_usage, is_chat=None, retries=3, promp
                     yield content or "", {}
             else:
                 generator = client.completions.create(
-                    prompt=prompt,
-
+                    prompt=prompt,
+                    stream=True,
+                    stream_options={"include_usage": True},
+                    **kwargs,
                 )
                 for ret in generator:
                     if len(ret.choices) == 0:
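For context, `stream_options={"include_usage": True}` is a standard OpenAI API option: the stream then ends with one extra chunk whose `choices` list is empty and whose `usage` field is populated, which is what the `len(ret.choices) == 0` checks above account for. A small standalone sketch (the model name is a placeholder):

```python
from openai import OpenAI

client = OpenAI()
stream = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Say hello."}],
    stream=True,
    stream_options={"include_usage": True},
)
for chunk in stream:
    if len(chunk.choices) == 0:
        # The usage-only chunk arrives after the last content chunk.
        print("\nprompt_tokens:", chunk.usage.prompt_tokens,
              "completion_tokens:", chunk.usage.completion_tokens)
    else:
        print(chunk.choices[0].delta.content or "", end="")
```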
{sglang-0.1.17 → sglang-0.1.19}/sglang/backend/runtime_endpoint.py

@@ -1,18 +1,18 @@
 import json
-from typing import
+from typing import List, Optional
 
 import numpy as np
-import requests
 
 from sglang.backend.base_backend import BaseBackend
 from sglang.global_config import global_config
 from sglang.lang.chat_template import get_chat_template_by_model_path
 from sglang.lang.interpreter import StreamExecutor
-from sglang.lang.ir import
-from sglang.utils import
+from sglang.lang.ir import SglSamplingParams
+from sglang.utils import http_request
 
 
 class RuntimeEndpoint(BaseBackend):
+
     def __init__(
         self,
         base_url: str,
@@ -38,8 +38,7 @@ class RuntimeEndpoint(BaseBackend):
         self.model_info = res.json()
 
         self.chat_template = get_chat_template_by_model_path(
-            self.model_info["model_path"]
-        )
+            self.model_info["model_path"])
 
     def get_model_name(self):
         return self.model_info["model_path"]
@@ -125,6 +124,11 @@ class RuntimeEndpoint(BaseBackend):
         else:
             raise RuntimeError(f"Invalid dtype: {sampling_params.dtype}")
 
+        for item in ["return_logprob", "logprob_start_len", "top_logprobs_num", "return_text_in_logprobs"]:
+            value = getattr(sampling_params, item, None)
+            if value is not None:
+                data[item] = value
+
         self._add_images(s, data)
 
         res = http_request(
@@ -167,6 +171,11 @@ class RuntimeEndpoint(BaseBackend):
         else:
             raise RuntimeError(f"Invalid dtype: {sampling_params.dtype}")
 
+        for item in ["return_logprob", "logprob_start_len", "top_logprobs_num", "return_text_in_logprobs"]:
+            value = getattr(sampling_params, item, None)
+            if value is not None:
+                data[item] = value
+
         data["stream"] = True
         self._add_images(s, data)
 
@@ -181,21 +190,16 @@ class RuntimeEndpoint(BaseBackend):
         self._assert_success(res)
         pos = 0
 
-        incomplete_text = ""
         for chunk in res.iter_lines(decode_unicode=False):
             chunk = chunk.decode("utf-8")
             if chunk and chunk.startswith("data:"):
                 if chunk == "data: [DONE]":
                     break
                 data = json.loads(chunk[5:].strip("\n"))
-
+                chunk_text = data["text"][pos:]
                 meta_info = data["meta_info"]
-                pos += len(
-
-                yield text, meta_info
-
-        if len(incomplete_text) > 0:
-            yield incomplete_text, meta_info
+                pos += len(chunk_text)
+                yield chunk_text, meta_info
 
     def select(
         self,