sglang 0.1.17__tar.gz → 0.1.21__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sglang-0.1.17/sglang.egg-info → sglang-0.1.21}/PKG-INFO +52 -31
- {sglang-0.1.17 → sglang-0.1.21}/README.md +41 -19
- {sglang-0.1.17 → sglang-0.1.21}/pyproject.toml +6 -5
- {sglang-0.1.17 → sglang-0.1.21}/sglang/__init__.py +2 -2
- {sglang-0.1.17 → sglang-0.1.21}/sglang/api.py +30 -4
- {sglang-0.1.17 → sglang-0.1.21}/sglang/backend/litellm.py +2 -2
- {sglang-0.1.17 → sglang-0.1.21}/sglang/backend/openai.py +26 -15
- {sglang-0.1.17 → sglang-0.1.21}/sglang/backend/runtime_endpoint.py +26 -12
- sglang-0.1.21/sglang/bench_latency.py +320 -0
- {sglang-0.1.17 → sglang-0.1.21}/sglang/global_config.py +22 -12
- {sglang-0.1.17 → sglang-0.1.21}/sglang/lang/chat_template.py +40 -5
- {sglang-0.1.17 → sglang-0.1.21}/sglang/lang/compiler.py +2 -2
- {sglang-0.1.17 → sglang-0.1.21}/sglang/lang/interpreter.py +6 -2
- {sglang-0.1.17 → sglang-0.1.21}/sglang/lang/ir.py +74 -28
- {sglang-0.1.17 → sglang-0.1.21}/sglang/launch_server.py +4 -1
- {sglang-0.1.17 → sglang-0.1.21}/sglang/launch_server_llavavid.py +2 -1
- {sglang-0.1.17 → sglang-0.1.21}/sglang/srt/constrained/__init__.py +14 -6
- {sglang-0.1.17 → sglang-0.1.21}/sglang/srt/constrained/fsm_cache.py +6 -3
- sglang-0.1.21/sglang/srt/constrained/jump_forward.py +164 -0
- {sglang-0.1.17 → sglang-0.1.21}/sglang/srt/conversation.py +2 -0
- {sglang-0.1.17 → sglang-0.1.21}/sglang/srt/flush_cache.py +2 -0
- {sglang-0.1.17 → sglang-0.1.21}/sglang/srt/hf_transformers_utils.py +68 -9
- {sglang-0.1.17 → sglang-0.1.21}/sglang/srt/layers/extend_attention.py +2 -1
- {sglang-0.1.17 → sglang-0.1.21}/sglang/srt/layers/fused_moe.py +280 -169
- {sglang-0.1.17 → sglang-0.1.21}/sglang/srt/layers/logits_processor.py +106 -42
- {sglang-0.1.17 → sglang-0.1.21}/sglang/srt/layers/radix_attention.py +59 -58
- {sglang-0.1.17 → sglang-0.1.21}/sglang/srt/layers/token_attention.py +4 -8
- sglang-0.1.21/sglang/srt/managers/controller/cuda_graph_runner.py +196 -0
- {sglang-0.1.17 → sglang-0.1.21}/sglang/srt/managers/controller/dp_worker.py +6 -3
- {sglang-0.1.17 → sglang-0.1.21}/sglang/srt/managers/controller/infer_batch.py +397 -108
- {sglang-0.1.17 → sglang-0.1.21}/sglang/srt/managers/controller/manager_multi.py +11 -7
- sglang-0.1.21/sglang/srt/managers/controller/manager_single.py +177 -0
- sglang-0.1.21/sglang/srt/managers/controller/model_runner.py +359 -0
- {sglang-0.1.17 → sglang-0.1.21}/sglang/srt/managers/controller/radix_cache.py +8 -3
- {sglang-0.1.17 → sglang-0.1.21}/sglang/srt/managers/controller/schedule_heuristic.py +6 -0
- {sglang-0.1.17 → sglang-0.1.21}/sglang/srt/managers/controller/tp_worker.py +198 -176
- {sglang-0.1.17 → sglang-0.1.21}/sglang/srt/managers/detokenizer_manager.py +19 -21
- {sglang-0.1.17 → sglang-0.1.21}/sglang/srt/managers/io_struct.py +11 -5
- {sglang-0.1.17 → sglang-0.1.21}/sglang/srt/managers/tokenizer_manager.py +16 -14
- sglang-0.1.21/sglang/srt/memory_pool.py +105 -0
- sglang-0.1.21/sglang/srt/model_config.py +131 -0
- sglang-0.1.21/sglang/srt/models/chatglm.py +399 -0
- {sglang-0.1.17 → sglang-0.1.21}/sglang/srt/models/commandr.py +2 -2
- {sglang-0.1.17 → sglang-0.1.21}/sglang/srt/models/gemma.py +5 -1
- sglang-0.1.21/sglang/srt/models/gemma2.py +436 -0
- {sglang-0.1.17 → sglang-0.1.21}/sglang/srt/models/grok.py +204 -137
- {sglang-0.1.17 → sglang-0.1.21}/sglang/srt/models/llama2.py +12 -5
- sglang-0.1.21/sglang/srt/models/llama_classification.py +107 -0
- {sglang-0.1.17 → sglang-0.1.21}/sglang/srt/models/llava.py +11 -8
- {sglang-0.1.17 → sglang-0.1.21}/sglang/srt/models/llavavid.py +1 -1
- sglang-0.1.21/sglang/srt/models/minicpm.py +366 -0
- {sglang-0.1.17 → sglang-0.1.21}/sglang/srt/models/mixtral.py +164 -115
- {sglang-0.1.17 → sglang-0.1.21}/sglang/srt/models/mixtral_quant.py +0 -1
- {sglang-0.1.17 → sglang-0.1.21}/sglang/srt/models/qwen.py +1 -1
- {sglang-0.1.17 → sglang-0.1.21}/sglang/srt/models/qwen2.py +1 -1
- sglang-0.1.21/sglang/srt/models/qwen2_moe.py +473 -0
- {sglang-0.1.17 → sglang-0.1.21}/sglang/srt/models/stablelm.py +1 -1
- {sglang-0.1.17 → sglang-0.1.21}/sglang/srt/models/yivl.py +2 -2
- {sglang-0.1.17 → sglang-0.1.21}/sglang/srt/openai_api_adapter.py +35 -25
- {sglang-0.1.17 → sglang-0.1.21}/sglang/srt/openai_protocol.py +2 -2
- {sglang-0.1.17 → sglang-0.1.21}/sglang/srt/server.py +65 -19
- {sglang-0.1.17 → sglang-0.1.21}/sglang/srt/server_args.py +88 -47
- {sglang-0.1.17 → sglang-0.1.21}/sglang/srt/utils.py +177 -35
- {sglang-0.1.17 → sglang-0.1.21}/sglang/test/test_programs.py +28 -10
- {sglang-0.1.17 → sglang-0.1.21}/sglang/utils.py +4 -3
- {sglang-0.1.17 → sglang-0.1.21/sglang.egg-info}/PKG-INFO +52 -31
- {sglang-0.1.17 → sglang-0.1.21}/sglang.egg-info/SOURCES.txt +7 -6
- {sglang-0.1.17 → sglang-0.1.21}/sglang.egg-info/requires.txt +10 -11
- sglang-0.1.17/sglang/srt/constrained/jump_forward.py +0 -76
- sglang-0.1.17/sglang/srt/managers/controller/manager_single.py +0 -97
- sglang-0.1.17/sglang/srt/managers/controller/model_runner.py +0 -462
- sglang-0.1.17/sglang/srt/managers/router/infer_batch.py +0 -596
- sglang-0.1.17/sglang/srt/managers/router/manager.py +0 -82
- sglang-0.1.17/sglang/srt/managers/router/model_rpc.py +0 -818
- sglang-0.1.17/sglang/srt/managers/router/model_runner.py +0 -445
- sglang-0.1.17/sglang/srt/managers/router/radix_cache.py +0 -267
- sglang-0.1.17/sglang/srt/managers/router/scheduler.py +0 -59
- sglang-0.1.17/sglang/srt/memory_pool.py +0 -103
- sglang-0.1.17/sglang/srt/model_config.py +0 -46
- {sglang-0.1.17 → sglang-0.1.21}/LICENSE +0 -0
- {sglang-0.1.17 → sglang-0.1.21}/setup.cfg +0 -0
- {sglang-0.1.17 → sglang-0.1.21}/sglang/backend/__init__.py +0 -0
- {sglang-0.1.17 → sglang-0.1.21}/sglang/backend/anthropic.py +0 -0
- {sglang-0.1.17 → sglang-0.1.21}/sglang/backend/base_backend.py +0 -0
- {sglang-0.1.17 → sglang-0.1.21}/sglang/backend/vertexai.py +0 -0
- {sglang-0.1.17 → sglang-0.1.21}/sglang/lang/__init__.py +0 -0
- {sglang-0.1.17 → sglang-0.1.21}/sglang/lang/tracer.py +0 -0
- {sglang-0.1.17 → sglang-0.1.21}/sglang/srt/constrained/base_cache.py +0 -0
- {sglang-0.1.17 → sglang-0.1.21}/sglang/srt/layers/context_flashattention_nopad.py +0 -0
- {sglang-0.1.17 → sglang-0.1.21}/sglang/srt/mm_utils.py +0 -0
- {sglang-0.1.17 → sglang-0.1.21}/sglang/srt/models/dbrx.py +1 -1
- {sglang-0.1.17 → sglang-0.1.21}/sglang/srt/models/mistral.py +0 -0
- {sglang-0.1.17 → sglang-0.1.21}/sglang/srt/sampling_params.py +0 -0
- {sglang-0.1.17 → sglang-0.1.21}/sglang/test/test_conversation.py +0 -0
- {sglang-0.1.17 → sglang-0.1.21}/sglang/test/test_openai_protocol.py +0 -0
- {sglang-0.1.17 → sglang-0.1.21}/sglang/test/test_utils.py +0 -0
- {sglang-0.1.17 → sglang-0.1.21}/sglang.egg-info/dependency_links.txt +0 -0
- {sglang-0.1.17 → sglang-0.1.21}/sglang.egg-info/top_level.txt +0 -0
{sglang-0.1.17/sglang.egg-info → sglang-0.1.21}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.1.17
+Version: 0.1.21
 Summary: A structured generation langauge for LLMs.
 License: Apache License
          Version 2.0, January 2004
@@ -213,30 +213,29 @@ Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: requests
 Requires-Dist: tqdm
+Requires-Dist: numpy
 Provides-Extra: srt
 Requires-Dist: aiohttp; extra == "srt"
 Requires-Dist: fastapi; extra == "srt"
+Requires-Dist: hf_transfer; extra == "srt"
+Requires-Dist: huggingface_hub; extra == "srt"
+Requires-Dist: interegular; extra == "srt"
+Requires-Dist: packaging; extra == "srt"
+Requires-Dist: pillow; extra == "srt"
 Requires-Dist: psutil; extra == "srt"
+Requires-Dist: pydantic; extra == "srt"
 Requires-Dist: rpyc; extra == "srt"
 Requires-Dist: torch; extra == "srt"
-Requires-Dist: uvloop; extra == "srt"
 Requires-Dist: uvicorn; extra == "srt"
+Requires-Dist: uvloop; extra == "srt"
 Requires-Dist: zmq; extra == "srt"
-Requires-Dist: vllm==0.
-Requires-Dist:
-Requires-Dist: pydantic; extra == "srt"
-Requires-Dist: pillow; extra == "srt"
-Requires-Dist: packaging; extra == "srt"
-Requires-Dist: huggingface_hub; extra == "srt"
-Requires-Dist: hf_transfer; extra == "srt"
-Requires-Dist: outlines>=0.0.34; extra == "srt"
+Requires-Dist: vllm==0.5.1; extra == "srt"
+Requires-Dist: outlines>=0.0.44; extra == "srt"
 Provides-Extra: openai
 Requires-Dist: openai>=1.0; extra == "openai"
-Requires-Dist: numpy; extra == "openai"
 Requires-Dist: tiktoken; extra == "openai"
 Provides-Extra: anthropic
 Requires-Dist: anthropic>=0.20.0; extra == "anthropic"
-Requires-Dist: numpy; extra == "anthropic"
 Provides-Extra: litellm
 Requires-Dist: litellm>=1.0.0; extra == "litellm"
 Provides-Extra: all
@@ -257,8 +256,8 @@ SGLang is a structured generation language designed for large language models (L
 It makes your interaction with LLMs faster and more controllable by co-designing the frontend language and the runtime system.

 The core features include:
-- **
-- **
+- **Flexible Frontend Language**: Enables easy programming of LLM applications with chained generation calls, advanced prompting, control flow, multiple modalities, parallelism, and external interactions.
+- **High-Performance Backend Runtime**: Features RadixAttention for accelerating complex LLM programs by reusing the KV cache across multiple calls. It can also serve as a standalone inference engine with all common techniques implemented (e.g., continuous batching and tensor parallelism).

 ## News
 - [2024/02] 🔥 SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).
@@ -279,19 +278,33 @@ The core features include:
 ### Method 1: With pip
 ```
 pip install "sglang[all]"
+
+# Install FlashInfer CUDA kernels
+pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
 ```

 ### Method 2: From source
 ```
-git clone
+git clone https://github.com/sgl-project/sglang.git
 cd sglang

-pip install --upgrade pip
 pip install -e "python[all]"
+
+# Install FlashInfer CUDA kernels
+pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
 ```

-###
-
+### Method 3: Using docker
+The docker images are available on Docker Hub as [lmsysorg/sglang](https://hub.docker.com/r/lmsysorg/sglang/tags).
+
+### Common Notes
+- If you see errors from the Triton compiler, please install the [Triton Nightly](https://triton-lang.org/main/getting-started/installation.html) by
+```
+pip uninstall -y triton triton-nightly
+pip install -U --index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/Triton-Nightly/pypi/simple/ triton-nightly
+```
+- If you cannot install FlashInfer, check out its [installation](https://docs.flashinfer.ai/installation.html#) page. If you still cannot install it, you can use the slower Triton kernels by adding `--disable-flashinfer` when launching the server.
+- If you only need to use the OpenAI backend, you can avoid installing other dependencies by using `pip install "sglang[openai]"`.

 ## Quick Start
 The example below shows how to use sglang to answer a mulit-turn question.
@@ -511,8 +524,8 @@ for out in state.text_iter():
 ```

 ### Tips and Implementation Details
-- The `choices` argument in `sgl.gen` is implemented by computing the normalized log probabilities of all choices and selecting the one with the highest probability.
-- The `regex` argument in `sgl.gen` is implemented through autoregressive decoding with logit bias masking, according to the constraints set by the regex.
+- The `choices` argument in `sgl.gen` is implemented by computing the [token-length normalized log probabilities](https://blog.eleuther.ai/multiple-choice-normalization/) of all choices and selecting the one with the highest probability.
+- The `regex` argument in `sgl.gen` is implemented through autoregressive decoding with logit bias masking, according to the constraints set by the regex. It is compatible with `temperature=0` and `temperature != 0`.

 ## Backend: SGLang Runtime (SRT)
 The SGLang Runtime (SRT) is designed to work best with the SGLang frontend.
@@ -569,7 +582,6 @@ response = client.chat.completions.create(
 print(response)
 ```

-
 By default, the server uses the chat template specified in the model tokenizer from Hugging Face. It should just work for most official models such as Llama-2/Llama-3.

 If needed, you can also override the chat template when launching the server:
@@ -598,7 +610,7 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
 ```

 ### Additional Arguments
-- Add `--tp 2` to enable tensor parallelism.
+- Add `--tp 2` to enable tensor parallelism. If it indicates `peer access is not supported between these two devices`, add `--enable-p2p-check` option.
 ```
 python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --tp 2
 ```
@@ -610,16 +622,22 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
 ```
 python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --mem-fraction-static 0.7
 ```
-- See [flashinfer.md](docs/flashinfer.md) on accelerating inference using highly optimized CUDA kernels.
 - See [hyperparameter_tuning.md](docs/hyperparameter_tuning.md) on tuning hyperparameters for better performance.
+- Add `--nnodes 2` to run tensor parallelism on multiple nodes. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-1` be the hostname of the first node and `50000` be an available port.
+```
+# Node 0
+python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --tp 4 --nccl-init sgl-dev-1:50000 --nnodes 2 --node-rank 0
+
+# Node 1
+python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --tp 4 --nccl-init sgl-dev-1:50000 --nnodes 2 --node-rank 1
+```

 ### Supported Models
 - Llama
 - Mistral
 - Mixtral
-- Qwen / Qwen 2
-- Gemma
-  - Please add a new flag `--attention-reduce-in-fp32` to avoid some precision errors.
+- Qwen / Qwen 2 / Qwen 2 MoE
+- Gemma / Gemma 2
   - `python -m sglang.launch_server --model-path google/gemma-7b-it --port 30000 --attention-reduce-in-fp32`
 - LLaVA
   - `python3 -m sglang.launch_server --model-path liuhaotian/llava-v1.5-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --chat-template vicuna_v1.1 --port 30000`
@@ -632,6 +650,8 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
 - StableLM
 - Command-R
 - DBRX
+- Grok
+- ChatGLM
 - AWQ/GPTQ/Marlin quantization

 Instructions for supporting a new model are [here](https://github.com/sgl-project/sglang/blob/main/docs/model_support.md).
@@ -643,17 +663,18 @@ Instructions for supporting a new model are [here](https://github.com/sgl-projec
 - Mixtral-8x7B on NVIDIA A10G, FP16, Tensor Parallelism=8
 

-Learn more [
+- Learn more about the above [results](docs/benchmark_results.md).
+- Synthetic latency and throughput benchmark [scripts](https://github.com/sgl-project/sglang/tree/main/benchmark/latency_throughput).

 ## Roadmap
 https://github.com/sgl-project/sglang/issues/157

 ## Citation And Acknowledgment
 ```
-@misc{
-  title={
-  author={Lianmin Zheng and Liangsheng Yin and Zhiqiang Xie and
-  year={
+@misc{zheng2024sglang,
+      title={SGLang: Efficient Execution of Structured Language Model Programs},
+      author={Lianmin Zheng and Liangsheng Yin and Zhiqiang Xie and Chuyue Sun and Jeff Huang and Cody Hao Yu and Shiyi Cao and Christos Kozyrakis and Ion Stoica and Joseph E. Gonzalez and Clark Barrett and Ying Sheng},
+      year={2024},
       eprint={2312.07104},
       archivePrefix={arXiv},
       primaryClass={cs.AI}
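The updated "Tips and Implementation Details" hunk above notes that `choices` is scored with token-length normalized log probabilities. A small illustrative sketch of that normalization follows; the numbers are made up and not taken from the package.

```python
# Hypothetical per-token log probabilities for two candidate answers.
choice_token_logprobs = {
    "Paris": [-0.2],                                  # 1 token
    "The city of Paris": [-0.9, -0.3, -0.4, -0.1],    # 4 tokens
}

def normalized_logprob(token_logprobs):
    # Sum of token log probabilities divided by token count, so longer
    # choices are not penalized simply for containing more tokens.
    return sum(token_logprobs) / len(token_logprobs)

scores = {text: normalized_logprob(lps) for text, lps in choice_token_logprobs.items()}
best = max(scores, key=scores.get)
print(scores, "->", best)
```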
{sglang-0.1.17 → sglang-0.1.21}/README.md

@@ -10,8 +10,8 @@ SGLang is a structured generation language designed for large language models (L
 It makes your interaction with LLMs faster and more controllable by co-designing the frontend language and the runtime system.

 The core features include:
-- **
-- **
+- **Flexible Frontend Language**: Enables easy programming of LLM applications with chained generation calls, advanced prompting, control flow, multiple modalities, parallelism, and external interactions.
+- **High-Performance Backend Runtime**: Features RadixAttention for accelerating complex LLM programs by reusing the KV cache across multiple calls. It can also serve as a standalone inference engine with all common techniques implemented (e.g., continuous batching and tensor parallelism).

 ## News
 - [2024/02] 🔥 SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).
@@ -32,19 +32,33 @@ The core features include:
 ### Method 1: With pip
 ```
 pip install "sglang[all]"
+
+# Install FlashInfer CUDA kernels
+pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
 ```

 ### Method 2: From source
 ```
-git clone
+git clone https://github.com/sgl-project/sglang.git
 cd sglang

-pip install --upgrade pip
 pip install -e "python[all]"
+
+# Install FlashInfer CUDA kernels
+pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
 ```

-###
-
+### Method 3: Using docker
+The docker images are available on Docker Hub as [lmsysorg/sglang](https://hub.docker.com/r/lmsysorg/sglang/tags).
+
+### Common Notes
+- If you see errors from the Triton compiler, please install the [Triton Nightly](https://triton-lang.org/main/getting-started/installation.html) by
+```
+pip uninstall -y triton triton-nightly
+pip install -U --index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/Triton-Nightly/pypi/simple/ triton-nightly
+```
+- If you cannot install FlashInfer, check out its [installation](https://docs.flashinfer.ai/installation.html#) page. If you still cannot install it, you can use the slower Triton kernels by adding `--disable-flashinfer` when launching the server.
+- If you only need to use the OpenAI backend, you can avoid installing other dependencies by using `pip install "sglang[openai]"`.

 ## Quick Start
 The example below shows how to use sglang to answer a mulit-turn question.
@@ -264,8 +278,8 @@ for out in state.text_iter():
 ```

 ### Tips and Implementation Details
-- The `choices` argument in `sgl.gen` is implemented by computing the normalized log probabilities of all choices and selecting the one with the highest probability.
-- The `regex` argument in `sgl.gen` is implemented through autoregressive decoding with logit bias masking, according to the constraints set by the regex.
+- The `choices` argument in `sgl.gen` is implemented by computing the [token-length normalized log probabilities](https://blog.eleuther.ai/multiple-choice-normalization/) of all choices and selecting the one with the highest probability.
+- The `regex` argument in `sgl.gen` is implemented through autoregressive decoding with logit bias masking, according to the constraints set by the regex. It is compatible with `temperature=0` and `temperature != 0`.

 ## Backend: SGLang Runtime (SRT)
 The SGLang Runtime (SRT) is designed to work best with the SGLang frontend.
@@ -322,7 +336,6 @@ response = client.chat.completions.create(
 print(response)
 ```

-
 By default, the server uses the chat template specified in the model tokenizer from Hugging Face. It should just work for most official models such as Llama-2/Llama-3.

 If needed, you can also override the chat template when launching the server:
@@ -351,7 +364,7 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
 ```

 ### Additional Arguments
-- Add `--tp 2` to enable tensor parallelism.
+- Add `--tp 2` to enable tensor parallelism. If it indicates `peer access is not supported between these two devices`, add `--enable-p2p-check` option.
 ```
 python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --tp 2
 ```
@@ -363,16 +376,22 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
 ```
 python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --mem-fraction-static 0.7
 ```
-- See [flashinfer.md](docs/flashinfer.md) on accelerating inference using highly optimized CUDA kernels.
 - See [hyperparameter_tuning.md](docs/hyperparameter_tuning.md) on tuning hyperparameters for better performance.
+- Add `--nnodes 2` to run tensor parallelism on multiple nodes. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-1` be the hostname of the first node and `50000` be an available port.
+```
+# Node 0
+python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --tp 4 --nccl-init sgl-dev-1:50000 --nnodes 2 --node-rank 0
+
+# Node 1
+python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --tp 4 --nccl-init sgl-dev-1:50000 --nnodes 2 --node-rank 1
+```

 ### Supported Models
 - Llama
 - Mistral
 - Mixtral
-- Qwen / Qwen 2
-- Gemma
-  - Please add a new flag `--attention-reduce-in-fp32` to avoid some precision errors.
+- Qwen / Qwen 2 / Qwen 2 MoE
+- Gemma / Gemma 2
   - `python -m sglang.launch_server --model-path google/gemma-7b-it --port 30000 --attention-reduce-in-fp32`
 - LLaVA
   - `python3 -m sglang.launch_server --model-path liuhaotian/llava-v1.5-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --chat-template vicuna_v1.1 --port 30000`
@@ -385,6 +404,8 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
 - StableLM
 - Command-R
 - DBRX
+- Grok
+- ChatGLM
 - AWQ/GPTQ/Marlin quantization

 Instructions for supporting a new model are [here](https://github.com/sgl-project/sglang/blob/main/docs/model_support.md).
@@ -396,17 +417,18 @@ Instructions for supporting a new model are [here](https://github.com/sgl-projec
 - Mixtral-8x7B on NVIDIA A10G, FP16, Tensor Parallelism=8
 

-Learn more [
+- Learn more about the above [results](docs/benchmark_results.md).
+- Synthetic latency and throughput benchmark [scripts](https://github.com/sgl-project/sglang/tree/main/benchmark/latency_throughput).

 ## Roadmap
 https://github.com/sgl-project/sglang/issues/157

 ## Citation And Acknowledgment
 ```
-@misc{
-  title={
-  author={Lianmin Zheng and Liangsheng Yin and Zhiqiang Xie and
-  year={
+@misc{zheng2024sglang,
+      title={SGLang: Efficient Execution of Structured Language Model Programs},
+      author={Lianmin Zheng and Liangsheng Yin and Zhiqiang Xie and Chuyue Sun and Jeff Huang and Cody Hao Yu and Shiyi Cao and Christos Kozyrakis and Ion Stoica and Joseph E. Gonzalez and Clark Barrett and Ying Sheng},
+      year={2024},
       eprint={2312.07104},
       archivePrefix={arXiv},
       primaryClass={cs.AI}
{sglang-0.1.17 → sglang-0.1.21}/pyproject.toml

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

 [project]
 name = "sglang"
-version = "0.1.17"
+version = "0.1.21"
 description = "A structured generation langauge for LLMs."
 readme = "README.md"
 requires-python = ">=3.8"
@@ -16,13 +16,14 @@ classifiers = [
 dependencies = [
     "requests",
     "tqdm",
+    "numpy",
 ]

 [project.optional-dependencies]
-srt = ["aiohttp", "fastapi", "
-    "
-openai = ["openai>=1.0", "
-anthropic = ["anthropic>=0.20.0"
+srt = ["aiohttp", "fastapi", "hf_transfer", "huggingface_hub", "interegular", "packaging", "pillow",
+       "psutil", "pydantic", "rpyc", "torch", "uvicorn", "uvloop", "zmq", "vllm==0.5.1", "outlines>=0.0.44"]
+openai = ["openai>=1.0", "tiktoken"]
+anthropic = ["anthropic>=0.20.0"]
 litellm = ["litellm>=1.0.0"]
 all = ["sglang[srt]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]

{sglang-0.1.17 → sglang-0.1.21}/sglang/__init__.py

@@ -1,4 +1,4 @@
-__version__ = "0.1.17"
+__version__ = "0.1.21"

 # SGL API Components
 from sglang.api import (
@@ -24,10 +24,10 @@ from sglang.api import (

 # SGL Backends
 from sglang.backend.anthropic import Anthropic
+from sglang.backend.litellm import LiteLLM
 from sglang.backend.openai import OpenAI
 from sglang.backend.runtime_endpoint import RuntimeEndpoint
 from sglang.backend.vertexai import VertexAI
-from sglang.backend.litellm import LiteLLM

 # Global Configurations
 from sglang.global_config import global_config
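The reordered imports above are the backend classes re-exported from the top-level package. A minimal sketch of how one of them is typically selected and registered as the default backend; the model name and endpoint URL are placeholders:

```python
import sglang as sgl

# Either a hosted API backend or a local SGLang runtime (SRT) endpoint.
backend = sgl.OpenAI("gpt-3.5-turbo")
# backend = sgl.RuntimeEndpoint("http://localhost:30000")

sgl.set_default_backend(backend)
```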
{sglang-0.1.17 → sglang-0.1.21}/sglang/api.py

@@ -1,4 +1,4 @@
-"""
+"""Public APIs of the language."""

 import os
 import re
@@ -43,14 +43,14 @@ def set_default_backend(backend: BaseBackend):
     global_config.default_backend = backend


-def flush_cache(backend: BaseBackend = None):
+def flush_cache(backend: Optional[BaseBackend] = None):
     backend = backend or global_config.default_backend
     if backend is None:
         return False
     return backend.flush_cache()


-def get_server_args(backend: BaseBackend = None):
+def get_server_args(backend: Optional[BaseBackend] = None):
     backend = backend or global_config.default_backend
     if backend is None:
         return None
@@ -67,10 +67,16 @@ def gen(
     frequency_penalty: Optional[float] = None,
     presence_penalty: Optional[float] = None,
     ignore_eos: Optional[bool] = None,
+    return_logprob: Optional[bool] = None,
+    logprob_start_len: Optional[int] = None,
+    top_logprobs_num: Optional[int] = None,
+    return_text_in_logprobs: Optional[bool] = None,
     dtype: Optional[type] = None,
     choices: Optional[List[str]] = None,
     regex: Optional[str] = None,
 ):
+    """Call the model to generate. See the meaning of the arguments in docs/sampling_params.md"""
+
     if choices:
         return SglSelect(name, choices, 0.0 if temperature is None else temperature)

@@ -91,6 +97,10 @@ def gen(
         frequency_penalty,
         presence_penalty,
         ignore_eos,
+        return_logprob,
+        logprob_start_len,
+        top_logprobs_num,
+        return_text_in_logprobs,
         dtype,
         regex,
     )
@@ -106,6 +116,10 @@ def gen_int(
     frequency_penalty: Optional[float] = None,
     presence_penalty: Optional[float] = None,
     ignore_eos: Optional[bool] = None,
+    return_logprob: Optional[bool] = None,
+    logprob_start_len: Optional[int] = None,
+    top_logprobs_num: Optional[int] = None,
+    return_text_in_logprobs: Optional[bool] = None,
 ):
     return SglGen(
         name,
@@ -117,6 +131,10 @@ def gen_int(
         frequency_penalty,
         presence_penalty,
         ignore_eos,
+        return_logprob,
+        logprob_start_len,
+        top_logprobs_num,
+        return_text_in_logprobs,
         int,
         None,
     )
@@ -132,6 +150,10 @@ def gen_string(
     frequency_penalty: Optional[float] = None,
     presence_penalty: Optional[float] = None,
     ignore_eos: Optional[bool] = None,
+    return_logprob: Optional[bool] = None,
+    logprob_start_len: Optional[int] = None,
+    top_logprobs_num: Optional[int] = None,
+    return_text_in_logprobs: Optional[bool] = None,
 ):
     return SglGen(
         name,
@@ -143,6 +165,10 @@ def gen_string(
         frequency_penalty,
         presence_penalty,
         ignore_eos,
+        return_logprob,
+        logprob_start_len,
+        top_logprobs_num,
+        return_text_in_logprobs,
         str,
         None,
     )
@@ -158,7 +184,7 @@ def video(path: str, num_frames: int):

 def select(
     name: Optional[str] = None,
-    choices: List[str] = None,
+    choices: Optional[List[str]] = None,
     temperature: float = 0.0,
 ):
     assert choices is not None
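The new `gen()` parameters above (`return_logprob`, `logprob_start_len`, `top_logprobs_num`, `return_text_in_logprobs`) thread log-probability options from the frontend to the runtime. A minimal sketch of requesting them against a local SRT server; the prompt and server address are placeholders, and the exact shape of the returned meta info is not shown in this diff:

```python
import sglang as sgl

sgl.set_default_backend(sgl.RuntimeEndpoint("http://localhost:30000"))

@sgl.function
def answer(s, question):
    s += "Q: " + question + "\n"
    s += "A: " + sgl.gen(
        "ans",
        max_tokens=32,
        return_logprob=True,           # attach token logprobs to the output
        top_logprobs_num=5,            # also return the top-5 alternatives per position
        return_text_in_logprobs=True,  # include decoded token text in the logprob entries
    )

state = answer.run(question="What is the capital of France?")
print(state["ans"])  # the logprobs travel back in the generation's meta info
```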
{sglang-0.1.17 → sglang-0.1.21}/sglang/backend/litellm.py

@@ -13,7 +13,6 @@ except ImportError as e:


 class LiteLLM(BaseBackend):
-
     def __init__(
         self,
         model_name,
@@ -33,7 +32,8 @@ class LiteLLM(BaseBackend):
         self.model_name = model_name

         self.chat_template = chat_template or get_chat_template_by_model_path(
-            model_name)
+            model_name
+        )

         self.client_params = {
             "api_key": api_key,
{sglang-0.1.17 → sglang-0.1.21}/sglang/backend/openai.py

@@ -1,7 +1,7 @@
+import dataclasses
 import logging
 import time
 import warnings
-import dataclasses
 from typing import Callable, List, Optional, Union

 import numpy as np
@@ -105,14 +105,16 @@ class OpenAI(BaseBackend):
     def get_chat_template(self):
         return self.chat_template

-    def _prepare_spec_execution(
-
+    def _prepare_spec_execution(
+        self,
+        sampling_params: SglSamplingParams,
+        num_api_spec_tokens: int,
+        spec_var_name: str,
+    ):
         if "max_tokens" not in self.spec_kwargs:
             self.spec_kwargs["max_tokens"] = num_api_spec_tokens
         else:
-            assert
-                self.spec_kwargs["max_tokens"] == num_api_spec_tokens
-            )
+            assert self.spec_kwargs["max_tokens"] == num_api_spec_tokens

         params = sampling_params.to_openai_kwargs()
         for key, value in params.items():
@@ -151,8 +153,9 @@ class OpenAI(BaseBackend):
                 )
                 prompt = s.messages_
             else:
-                return self._prepare_spec_execution(
-                    s.num_api_spec_tokens, spec_var_name
+                return self._prepare_spec_execution(
+                    sampling_params, s.num_api_spec_tokens, spec_var_name
+                )
         else:
             prompt = s.text_

@@ -325,7 +328,7 @@ class OpenAI(BaseBackend):
         ret_str = ret.choices[0].text
         ret_token = self.tokenizer.encode(ret_str)[0]
         self.token_usage.prompt_tokens += ret.usage.prompt_tokens
-        self.token_usage.completion_tokens= ret.usage.completion_tokens
+        self.token_usage.completion_tokens = ret.usage.completion_tokens

         # TODO:
         # 1. return logits as the scores
@@ -355,7 +358,9 @@ class OpenAI(BaseBackend):
     return decision, scores, None, None


-def openai_completion(
+def openai_completion(
+    client, token_usage, is_chat=None, retries=3, prompt=None, **kwargs
+):
     for attempt in range(retries):
         try:
             if is_chat:
@@ -385,15 +390,19 @@ def openai_completion(client, token_usage, is_chat=None, retries=3, prompt=None,
     return comp


-def openai_completion_stream(
+def openai_completion_stream(
+    client, token_usage, is_chat=None, retries=3, prompt=None, **kwargs
+):
     for attempt in range(retries):
         try:
             if is_chat:
                 if "stop" in kwargs and kwargs["stop"] is None:
                     kwargs.pop("stop")
                 generator = client.chat.completions.create(
-                    messages=prompt,
-
+                    messages=prompt,
+                    stream=True,
+                    stream_options={"include_usage": True},
+                    **kwargs,
                 )
                 for ret in generator:
                     if len(ret.choices) == 0:
@@ -405,8 +414,10 @@ def openai_completion_stream(client, token_usage, is_chat=None, retries=3, promp
                     yield content or "", {}
             else:
                 generator = client.completions.create(
-                    prompt=prompt,
-
+                    prompt=prompt,
+                    stream=True,
+                    stream_options={"include_usage": True},
+                    **kwargs,
                 )
                 for ret in generator:
                     if len(ret.choices) == 0:
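The streaming changes above add `stream_options={"include_usage": True}` so token usage arrives on the final chunk of a stream, which is why the loop skips chunks with no choices. A standalone sketch of the same pattern with the `openai` Python client; the model and prompt are placeholders:

```python
from openai import OpenAI

client = OpenAI()

stream = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Say hello."}],
    stream=True,
    stream_options={"include_usage": True},
)

for chunk in stream:
    if len(chunk.choices) == 0:
        # With include_usage, the last chunk carries usage stats and no choices.
        print("\nusage:", chunk.usage)
    else:
        print(chunk.choices[0].delta.content or "", end="")
```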
{sglang-0.1.17 → sglang-0.1.21}/sglang/backend/runtime_endpoint.py

@@ -1,15 +1,14 @@
 import json
-from typing import
+from typing import List, Optional

 import numpy as np
-import requests

 from sglang.backend.base_backend import BaseBackend
 from sglang.global_config import global_config
 from sglang.lang.chat_template import get_chat_template_by_model_path
 from sglang.lang.interpreter import StreamExecutor
-from sglang.lang.ir import
-from sglang.utils import
+from sglang.lang.ir import SglSamplingParams
+from sglang.utils import http_request


 class RuntimeEndpoint(BaseBackend):
@@ -125,6 +124,16 @@ class RuntimeEndpoint(BaseBackend):
         else:
             raise RuntimeError(f"Invalid dtype: {sampling_params.dtype}")

+        for item in [
+            "return_logprob",
+            "logprob_start_len",
+            "top_logprobs_num",
+            "return_text_in_logprobs",
+        ]:
+            value = getattr(sampling_params, item, None)
+            if value is not None:
+                data[item] = value
+
         self._add_images(s, data)

         res = http_request(
@@ -167,6 +176,16 @@ class RuntimeEndpoint(BaseBackend):
         else:
             raise RuntimeError(f"Invalid dtype: {sampling_params.dtype}")

+        for item in [
+            "return_logprob",
+            "logprob_start_len",
+            "top_logprobs_num",
+            "return_text_in_logprobs",
+        ]:
+            value = getattr(sampling_params, item, None)
+            if value is not None:
+                data[item] = value
+
         data["stream"] = True
         self._add_images(s, data)

@@ -181,21 +200,16 @@ class RuntimeEndpoint(BaseBackend):
         self._assert_success(res)
         pos = 0

-        incomplete_text = ""
         for chunk in res.iter_lines(decode_unicode=False):
             chunk = chunk.decode("utf-8")
             if chunk and chunk.startswith("data:"):
                 if chunk == "data: [DONE]":
                     break
                 data = json.loads(chunk[5:].strip("\n"))
-
+                chunk_text = data["text"][pos:]
                 meta_info = data["meta_info"]
-                pos += len(
-
-                yield text, meta_info
-
-        if len(incomplete_text) > 0:
-            yield incomplete_text, meta_info
+                pos += len(chunk_text)
+                yield chunk_text, meta_info

     def select(
         self,