sglang 0.2.0__tar.gz → 0.2.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sglang-0.2.0/sglang.egg-info → sglang-0.2.2}/PKG-INFO +28 -14
- {sglang-0.2.0 → sglang-0.2.2}/README.md +27 -13
- {sglang-0.2.0 → sglang-0.2.2}/pyproject.toml +1 -1
- {sglang-0.2.0 → sglang-0.2.2}/sglang/bench_serving.py +3 -3
- {sglang-0.2.0 → sglang-0.2.2}/sglang/global_config.py +1 -1
- {sglang-0.2.0 → sglang-0.2.2}/sglang/srt/managers/controller/model_runner.py +1 -1
- {sglang-0.2.0 → sglang-0.2.2}/sglang/srt/managers/io_struct.py +4 -1
- {sglang-0.2.0 → sglang-0.2.2}/sglang/srt/openai_api/adapter.py +6 -1
- {sglang-0.2.0 → sglang-0.2.2}/sglang/srt/utils.py +1 -0
- sglang-0.2.2/sglang/version.py +1 -0
- {sglang-0.2.0 → sglang-0.2.2/sglang.egg-info}/PKG-INFO +28 -14
- sglang-0.2.0/sglang/version.py +0 -1
- {sglang-0.2.0 → sglang-0.2.2}/LICENSE +0 -0
- {sglang-0.2.0 → sglang-0.2.2}/setup.cfg +0 -0
- {sglang-0.2.0 → sglang-0.2.2}/sglang/__init__.py +0 -0
- {sglang-0.2.0 → sglang-0.2.2}/sglang/api.py +0 -0
- {sglang-0.2.0 → sglang-0.2.2}/sglang/bench_latency.py +0 -0
- {sglang-0.2.0 → sglang-0.2.2}/sglang/check_env.py +0 -0
- {sglang-0.2.0 → sglang-0.2.2}/sglang/lang/__init__.py +0 -0
- {sglang-0.2.0 → sglang-0.2.2}/sglang/lang/backend/__init__.py +0 -0
- {sglang-0.2.0 → sglang-0.2.2}/sglang/lang/backend/anthropic.py +0 -0
- {sglang-0.2.0 → sglang-0.2.2}/sglang/lang/backend/base_backend.py +0 -0
- {sglang-0.2.0 → sglang-0.2.2}/sglang/lang/backend/litellm.py +0 -0
- {sglang-0.2.0 → sglang-0.2.2}/sglang/lang/backend/openai.py +0 -0
- {sglang-0.2.0 → sglang-0.2.2}/sglang/lang/backend/runtime_endpoint.py +0 -0
- {sglang-0.2.0 → sglang-0.2.2}/sglang/lang/backend/vertexai.py +0 -0
- {sglang-0.2.0 → sglang-0.2.2}/sglang/lang/chat_template.py +0 -0
- {sglang-0.2.0 → sglang-0.2.2}/sglang/lang/compiler.py +0 -0
- {sglang-0.2.0 → sglang-0.2.2}/sglang/lang/interpreter.py +0 -0
- {sglang-0.2.0 → sglang-0.2.2}/sglang/lang/ir.py +0 -0
- {sglang-0.2.0 → sglang-0.2.2}/sglang/lang/tracer.py +0 -0
- {sglang-0.2.0 → sglang-0.2.2}/sglang/launch_server.py +0 -0
- {sglang-0.2.0 → sglang-0.2.2}/sglang/launch_server_llavavid.py +0 -0
- {sglang-0.2.0 → sglang-0.2.2}/sglang/srt/constrained/__init__.py +0 -0
- {sglang-0.2.0 → sglang-0.2.2}/sglang/srt/constrained/base_cache.py +0 -0
- {sglang-0.2.0 → sglang-0.2.2}/sglang/srt/constrained/fsm_cache.py +0 -0
- {sglang-0.2.0 → sglang-0.2.2}/sglang/srt/constrained/jump_forward.py +0 -0
- {sglang-0.2.0 → sglang-0.2.2}/sglang/srt/conversation.py +0 -0
- {sglang-0.2.0 → sglang-0.2.2}/sglang/srt/flush_cache.py +0 -0
- {sglang-0.2.0 → sglang-0.2.2}/sglang/srt/hf_transformers_utils.py +0 -0
- {sglang-0.2.0 → sglang-0.2.2}/sglang/srt/layers/context_flashattention_nopad.py +0 -0
- {sglang-0.2.0 → sglang-0.2.2}/sglang/srt/layers/extend_attention.py +0 -0
- {sglang-0.2.0 → sglang-0.2.2}/sglang/srt/layers/fused_moe.py +0 -0
- {sglang-0.2.0 → sglang-0.2.2}/sglang/srt/layers/linear.py +0 -0
- {sglang-0.2.0 → sglang-0.2.2}/sglang/srt/layers/logits_processor.py +0 -0
- {sglang-0.2.0 → sglang-0.2.2}/sglang/srt/layers/quantization/__init__.py +0 -0
- {sglang-0.2.0 → sglang-0.2.2}/sglang/srt/layers/quantization/fp8.py +0 -0
- {sglang-0.2.0 → sglang-0.2.2}/sglang/srt/layers/radix_attention.py +0 -0
- {sglang-0.2.0 → sglang-0.2.2}/sglang/srt/layers/token_attention.py +0 -0
- {sglang-0.2.0 → sglang-0.2.2}/sglang/srt/managers/controller/cuda_graph_runner.py +0 -0
- {sglang-0.2.0 → sglang-0.2.2}/sglang/srt/managers/controller/infer_batch.py +0 -0
- {sglang-0.2.0 → sglang-0.2.2}/sglang/srt/managers/controller/manager_multi.py +0 -0
- {sglang-0.2.0 → sglang-0.2.2}/sglang/srt/managers/controller/manager_single.py +0 -0
- {sglang-0.2.0 → sglang-0.2.2}/sglang/srt/managers/controller/radix_cache.py +0 -0
- {sglang-0.2.0 → sglang-0.2.2}/sglang/srt/managers/controller/schedule_heuristic.py +0 -0
- {sglang-0.2.0 → sglang-0.2.2}/sglang/srt/managers/controller/tp_worker.py +0 -0
- {sglang-0.2.0 → sglang-0.2.2}/sglang/srt/managers/detokenizer_manager.py +0 -0
- {sglang-0.2.0 → sglang-0.2.2}/sglang/srt/managers/tokenizer_manager.py +0 -0
- {sglang-0.2.0 → sglang-0.2.2}/sglang/srt/memory_pool.py +0 -0
- {sglang-0.2.0 → sglang-0.2.2}/sglang/srt/mm_utils.py +0 -0
- {sglang-0.2.0 → sglang-0.2.2}/sglang/srt/model_config.py +0 -0
- {sglang-0.2.0 → sglang-0.2.2}/sglang/srt/model_loader/model_loader.py +0 -0
- {sglang-0.2.0 → sglang-0.2.2}/sglang/srt/model_loader/utils.py +0 -0
- {sglang-0.2.0 → sglang-0.2.2}/sglang/srt/models/chatglm.py +0 -0
- {sglang-0.2.0 → sglang-0.2.2}/sglang/srt/models/commandr.py +0 -0
- {sglang-0.2.0 → sglang-0.2.2}/sglang/srt/models/dbrx.py +0 -0
- {sglang-0.2.0 → sglang-0.2.2}/sglang/srt/models/deepseek.py +0 -0
- {sglang-0.2.0 → sglang-0.2.2}/sglang/srt/models/gemma.py +0 -0
- {sglang-0.2.0 → sglang-0.2.2}/sglang/srt/models/gemma2.py +0 -0
- {sglang-0.2.0 → sglang-0.2.2}/sglang/srt/models/gpt_bigcode.py +0 -0
- {sglang-0.2.0 → sglang-0.2.2}/sglang/srt/models/grok.py +0 -0
- {sglang-0.2.0 → sglang-0.2.2}/sglang/srt/models/internlm2.py +0 -0
- {sglang-0.2.0 → sglang-0.2.2}/sglang/srt/models/llama2.py +0 -0
- {sglang-0.2.0 → sglang-0.2.2}/sglang/srt/models/llama_classification.py +0 -0
- {sglang-0.2.0 → sglang-0.2.2}/sglang/srt/models/llava.py +0 -0
- {sglang-0.2.0 → sglang-0.2.2}/sglang/srt/models/llavavid.py +0 -0
- {sglang-0.2.0 → sglang-0.2.2}/sglang/srt/models/minicpm.py +0 -0
- {sglang-0.2.0 → sglang-0.2.2}/sglang/srt/models/mistral.py +0 -0
- {sglang-0.2.0 → sglang-0.2.2}/sglang/srt/models/mixtral.py +0 -0
- {sglang-0.2.0 → sglang-0.2.2}/sglang/srt/models/mixtral_quant.py +0 -0
- {sglang-0.2.0 → sglang-0.2.2}/sglang/srt/models/qwen.py +0 -0
- {sglang-0.2.0 → sglang-0.2.2}/sglang/srt/models/qwen2.py +0 -0
- {sglang-0.2.0 → sglang-0.2.2}/sglang/srt/models/qwen2_moe.py +0 -0
- {sglang-0.2.0 → sglang-0.2.2}/sglang/srt/models/stablelm.py +0 -0
- {sglang-0.2.0 → sglang-0.2.2}/sglang/srt/models/yivl.py +0 -0
- {sglang-0.2.0 → sglang-0.2.2}/sglang/srt/openai_api/protocol.py +0 -0
- {sglang-0.2.0 → sglang-0.2.2}/sglang/srt/sampling_params.py +0 -0
- {sglang-0.2.0 → sglang-0.2.2}/sglang/srt/server.py +0 -0
- {sglang-0.2.0 → sglang-0.2.2}/sglang/srt/server_args.py +0 -0
- {sglang-0.2.0 → sglang-0.2.2}/sglang/test/test_conversation.py +0 -0
- {sglang-0.2.0 → sglang-0.2.2}/sglang/test/test_openai_protocol.py +0 -0
- {sglang-0.2.0 → sglang-0.2.2}/sglang/test/test_programs.py +0 -0
- {sglang-0.2.0 → sglang-0.2.2}/sglang/test/test_utils.py +0 -0
- {sglang-0.2.0 → sglang-0.2.2}/sglang/utils.py +0 -0
- {sglang-0.2.0 → sglang-0.2.2}/sglang.egg-info/SOURCES.txt +0 -0
- {sglang-0.2.0 → sglang-0.2.2}/sglang.egg-info/dependency_links.txt +0 -0
- {sglang-0.2.0 → sglang-0.2.2}/sglang.egg-info/requires.txt +0 -0
- {sglang-0.2.0 → sglang-0.2.2}/sglang.egg-info/top_level.txt +0 -0
{sglang-0.2.0/sglang.egg-info → sglang-0.2.2}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.2.0
+Version: 0.2.2
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
 Version 2.0, January 2004

@@ -249,7 +249,7 @@ Requires-Dist: sglang[litellm]; extra == "all"
 
 --------------------------------------------------------------------------------
 
-| [**Blog**](https://lmsys.org/blog/2024-
+| [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Paper**](https://arxiv.org/abs/2312.07104) |
 
 SGLang is a fast serving framework for large language models and vision language models.
 It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.

@@ -259,13 +259,14 @@ The core features include:
 - **Flexible Frontend Language**: Enables easy programming of LLM applications with chained generation calls, advanced prompting, control flow, multiple modalities, parallelism, and external interactions.
 
 ## News
-- [2024/
-- [2024/
-- [2024/
+- [2024/07] 🔥 Faster Llama3 Serving with SGLang Runtime (vs. TensorRT-LLM, vLLM) ([blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/)).
+- [2024/04] SGLang is used by the official **LLaVA-NeXT (video)** release ([blog](https://llava-vl.github.io/blog/2024-04-30-llava-next-video/)).
+- [2024/02] SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).
 
 <details>
 <summary>More</summary>
 
+- [2024/01] SGLang provides up to **5x faster inference** with RadixAttention ([blog](https://lmsys.org/blog/2024-01-17-sglang/)).
 - [2024/01] SGLang powers the serving of the official **LLaVA v1.6** release demo ([usage](https://github.com/haotian-liu/LLaVA?tab=readme-ov-file#demo)).
 
 </details>

@@ -302,7 +303,8 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
 ```
 
 ### Method 3: Using docker
-The docker images are available on Docker Hub as [lmsysorg/sglang](https://hub.docker.com/r/lmsysorg/sglang/tags).
+The docker images are available on Docker Hub as [lmsysorg/sglang](https://hub.docker.com/r/lmsysorg/sglang/tags), built from [Dockerfile](docker).
+Repalce `<secret>` below with your huggingface hub [token](https://huggingface.co/docs/hub/en/security-tokens).
 
 ```bash
 docker run --gpus all \

@@ -311,7 +313,7 @@ docker run --gpus all \
     --env "HUGGING_FACE_HUB_TOKEN=<secret>" \
     --ipc=host \
     lmsysorg/sglang:latest \
-    python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B --host 0.0.0.0 --port 30000
+    python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --host 0.0.0.0 --port 30000
 ```
 
 ### Common Notes

@@ -399,6 +401,21 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 - To enable fp8 quantization, you can add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
 - To enable experimental torch.compile support, you can add `--enable-torch-compile`. It accelerates small models on small batch sizes.
 
+### Run Llama 3.1 405B
+
+```bash
+# 2 nodes run 405B fp16
+# replace the `172.16.4.52:20000` with your own first node ip address and port, disable CUDA Graph temporarily
+# on the first node
+GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 0 --disable-cuda-graph --mem-frac 0.75
+
+# on the second
+GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 1 --disable-cuda-graph --mem-frac 0.75
+
+# single node run 405B fp8
+python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct-FP8 --tp 8
+```
+
 ### Supported Models
 
 - Llama / Llama 2 / Llama 3 / Llama 3.1

@@ -656,15 +673,12 @@ for out in state.text_iter():
 - The `choices` argument in `sgl.gen` is implemented by computing the [token-length normalized log probabilities](https://blog.eleuther.ai/multiple-choice-normalization/) of all choices and selecting the one with the highest probability.
 - The `regex` argument in `sgl.gen` is implemented through autoregressive decoding with logit bias masking, according to the constraints set by the regex. It is compatible with `temperature=0` and `temperature != 0`.
 
-## Benchmark And Performance
-- Llama-7B on NVIDIA A10G, FP16, Tensor Parallelism=1
-
 
-
-
+
 
-
-- Synthetic latency and throughput benchmark [scripts](https://github.com/sgl-project/sglang/tree/main/benchmark/latency_throughput).
+Learn more at this [blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/).
 
 ## Roadmap
 [Development Roadmap (2024 Q3)](https://github.com/sgl-project/sglang/issues/634)
{sglang-0.2.0 → sglang-0.2.2}/README.md

@@ -4,7 +4,7 @@
 
 --------------------------------------------------------------------------------
 
-| [**Blog**](https://lmsys.org/blog/2024-
+| [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Paper**](https://arxiv.org/abs/2312.07104) |
 
 SGLang is a fast serving framework for large language models and vision language models.
 It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.

@@ -14,13 +14,14 @@ The core features include:
 - **Flexible Frontend Language**: Enables easy programming of LLM applications with chained generation calls, advanced prompting, control flow, multiple modalities, parallelism, and external interactions.
 
 ## News
-- [2024/
-- [2024/
-- [2024/
+- [2024/07] 🔥 Faster Llama3 Serving with SGLang Runtime (vs. TensorRT-LLM, vLLM) ([blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/)).
+- [2024/04] SGLang is used by the official **LLaVA-NeXT (video)** release ([blog](https://llava-vl.github.io/blog/2024-04-30-llava-next-video/)).
+- [2024/02] SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).
 
 <details>
 <summary>More</summary>
 
+- [2024/01] SGLang provides up to **5x faster inference** with RadixAttention ([blog](https://lmsys.org/blog/2024-01-17-sglang/)).
 - [2024/01] SGLang powers the serving of the official **LLaVA v1.6** release demo ([usage](https://github.com/haotian-liu/LLaVA?tab=readme-ov-file#demo)).
 
 </details>

@@ -57,7 +58,8 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
 ```
 
 ### Method 3: Using docker
-The docker images are available on Docker Hub as [lmsysorg/sglang](https://hub.docker.com/r/lmsysorg/sglang/tags).
+The docker images are available on Docker Hub as [lmsysorg/sglang](https://hub.docker.com/r/lmsysorg/sglang/tags), built from [Dockerfile](docker).
+Repalce `<secret>` below with your huggingface hub [token](https://huggingface.co/docs/hub/en/security-tokens).
 
 ```bash
 docker run --gpus all \

@@ -66,7 +68,7 @@ docker run --gpus all \
     --env "HUGGING_FACE_HUB_TOKEN=<secret>" \
     --ipc=host \
     lmsysorg/sglang:latest \
-    python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B --host 0.0.0.0 --port 30000
+    python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --host 0.0.0.0 --port 30000
 ```
 
 ### Common Notes

@@ -154,6 +156,21 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 - To enable fp8 quantization, you can add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
 - To enable experimental torch.compile support, you can add `--enable-torch-compile`. It accelerates small models on small batch sizes.
 
+### Run Llama 3.1 405B
+
+```bash
+# 2 nodes run 405B fp16
+# replace the `172.16.4.52:20000` with your own first node ip address and port, disable CUDA Graph temporarily
+# on the first node
+GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 0 --disable-cuda-graph --mem-frac 0.75
+
+# on the second
+GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 1 --disable-cuda-graph --mem-frac 0.75
+
+# single node run 405B fp8
+python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct-FP8 --tp 8
+```
+
 ### Supported Models
 
 - Llama / Llama 2 / Llama 3 / Llama 3.1

@@ -411,15 +428,12 @@ for out in state.text_iter():
 - The `choices` argument in `sgl.gen` is implemented by computing the [token-length normalized log probabilities](https://blog.eleuther.ai/multiple-choice-normalization/) of all choices and selecting the one with the highest probability.
 - The `regex` argument in `sgl.gen` is implemented through autoregressive decoding with logit bias masking, according to the constraints set by the regex. It is compatible with `temperature=0` and `temperature != 0`.
 
-## Benchmark And Performance
-- Llama-7B on NVIDIA A10G, FP16, Tensor Parallelism=1
-
 
-
-
+
 
-
-- Synthetic latency and throughput benchmark [scripts](https://github.com/sgl-project/sglang/tree/main/benchmark/latency_throughput).
+Learn more at this [blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/).
 
 ## Roadmap
 [Development Roadmap (2024 Q3)](https://github.com/sgl-project/sglang/issues/634)
{sglang-0.2.0 → sglang-0.2.2}/pyproject.toml

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "sglang"
-version = "0.2.0"
+version = "0.2.2"
 description = "SGLang is yet another fast serving framework for large language models and vision language models."
 readme = "README.md"
 requires-python = ">=3.8"
{sglang-0.2.0 → sglang-0.2.2}/sglang/bench_serving.py

@@ -369,7 +369,7 @@ def sample_random_requests(
 ) -> List[Tuple[str, int, int]]:
 
     input_lens = np.random.randint(
-        int(input_len * range_ratio),
+        max(int(input_len * range_ratio), 1),
         input_len + 1,
         size=num_prompts,
     )

@@ -415,7 +415,7 @@ def sample_random_requests(
         prompt_token_ids = tokenizer(prompt).input_ids
         prompt_len = len(prompt_token_ids)
 
-        if prompt_len
+        if prompt_len > input_lens[i]:
             input_ids = prompt_token_ids[: input_lens[i]]
         else:
             ratio = (input_lens[i] + prompt_len - 1) // prompt_len

@@ -935,7 +935,7 @@ if __name__ == "__main__":
     parser.add_argument(
         "--random-range-ratio",
         type=float,
-        default=
+        default=0.0,
         help="Range of sampled ratio of input/output length, "
         "used only for random dataset.",
     )
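
The first hunk clamps the lower bound of the randomly sampled input lengths to at least one token, which matters now that `--random-range-ratio` defaults to 0.0. A minimal sketch of just that sampling expression (not the full `bench_serving.py` script; names mirror the diff, everything else is omitted):

```python
import numpy as np

# Standalone sketch of the changed sampling call from the diff above.
def sample_input_lens(input_len: int, range_ratio: float, num_prompts: int) -> np.ndarray:
    return np.random.randint(
        max(int(input_len * range_ratio), 1),  # lower bound clamped to >= 1 token
        input_len + 1,                          # upper bound is still input_len
        size=num_prompts,
    )

# With range_ratio = 0.0 (the new default), the old lower bound int(1024 * 0.0) == 0
# could yield zero-length prompts; the clamp guarantees every sampled length is >= 1.
lens = sample_input_lens(input_len=1024, range_ratio=0.0, num_prompts=8)
assert (lens >= 1).all()
```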
{sglang-0.2.0 → sglang-0.2.2}/sglang/global_config.py

@@ -17,7 +17,7 @@ class GlobalConfig:
 
         # Runtime constants: New generation token ratio estimation
         self.init_new_token_ratio = 0.7
-        self.base_min_new_token_ratio = 0.
+        self.base_min_new_token_ratio = 0.1
         self.new_token_ratio_decay = 0.001
         self.new_token_ratio_recovery = 0.05
 
{sglang-0.2.0 → sglang-0.2.2}/sglang/srt/managers/controller/model_runner.py

@@ -121,7 +121,7 @@ class ModelRunner:
             skip_tokenizer_init=True,
         )
 
-        if is_llama3_405b_fp8(self.model_config):
+        if is_llama3_405b_fp8(self.model_config) and self.tp_size <= 8:
             # A temporary hack to fix the num_heads for meta-llama/Meta-Llama-3.1-405B-FP8 checkpoints
             self.model_config.hf_config.num_key_value_heads = 8
             vllm_model_config.hf_config.num_key_value_heads = 8
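
The guard around the 405B FP8 workaround is narrowed so the key-value-head override only applies for tensor parallel sizes of at most 8. A hedged sketch of that condition as a standalone predicate (the real code runs inside `ModelRunner` and patches the configs in place):

```python
from sglang.srt.utils import is_llama3_405b_fp8  # helper shown in the utils.py diff later in this page

# Sketch only: when this returns True, ModelRunner sets num_key_value_heads = 8 on
# both model_config.hf_config and vllm_model_config.hf_config.
def should_patch_kv_heads(model_config, tp_size: int) -> bool:
    # The temporary num_key_value_heads hack for the Meta-Llama-3.1-405B-FP8
    # checkpoint is now applied only when the tensor parallel size is at most 8.
    return is_llama3_405b_fp8(model_config) and tp_size <= 8
```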
{sglang-0.2.0 → sglang-0.2.2}/sglang/srt/managers/io_struct.py

@@ -40,7 +40,10 @@ class GenerateReqInput:
             self.text is not None and self.input_ids is not None
         ):
             raise ValueError("Either text or input_ids should be provided.")
-        if
+        if (
+            isinstance(self.sampling_params, dict)
+            and self.sampling_params.get("n", 1) != 1
+        ):
             is_single = False
         else:
             if self.text is not None:
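
With this change, a request is routed to the batched (non-single) path only when `sampling_params` is a single dict that asks for more than one output (`n != 1`); list-valued or absent sampling params no longer trigger the check. A small self-contained sketch of that condition, using a simplified stand-in for `GenerateReqInput`:

```python
from dataclasses import dataclass
from typing import Dict, List, Optional, Union

@dataclass
class Req:
    # Simplified stand-in for GenerateReqInput (the real class has more fields).
    text: Optional[Union[str, List[str]]] = None
    sampling_params: Optional[Union[Dict, List[Dict]]] = None

def wants_multiple_outputs(req: Req) -> bool:
    # Mirrors the added condition: only a dict sampling_params with n != 1
    # forces the request onto the non-single path.
    return isinstance(req.sampling_params, dict) and req.sampling_params.get("n", 1) != 1

assert wants_multiple_outputs(Req(text="hi", sampling_params={"n": 4}))
assert not wants_multiple_outputs(Req(text="hi", sampling_params={"temperature": 0}))
assert not wants_multiple_outputs(Req(text=["a", "b"], sampling_params=[{}, {}]))
```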
{sglang-0.2.0 → sglang-0.2.2}/sglang/srt/openai_api/adapter.py

@@ -94,9 +94,14 @@ def load_chat_template_for_openai_api(chat_template_arg):
 async def v1_completions(tokenizer_manager, raw_request: Request):
     request_json = await raw_request.json()
     request = CompletionRequest(**request_json)
+    prompt = request.prompt
+    if isinstance(prompt, str) or isinstance(prompt[0], str):
+        prompt_kwargs = {"text": prompt}
+    else:
+        prompt_kwargs = {"input_ids": prompt}
 
     adapted_request = GenerateReqInput(
-
+        **prompt_kwargs,
         sampling_params={
             "temperature": request.temperature,
             "max_new_tokens": request.max_tokens,
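
The added branch lets the OpenAI-compatible `/v1/completions` endpoint accept either text prompts or pre-tokenized prompts (lists of token ids), forwarding them as `text` or `input_ids` respectively. A minimal sketch of that dispatch, pulled out of `v1_completions` as a plain function:

```python
from typing import Dict, List, Union

Prompt = Union[str, List[str], List[int], List[List[int]]]

def build_prompt_kwargs(prompt: Prompt) -> Dict:
    # Strings (or lists of strings) pass through as `text`; anything else is
    # treated as token ids and passed as `input_ids`, matching the added branch.
    if isinstance(prompt, str) or isinstance(prompt[0], str):
        return {"text": prompt}
    return {"input_ids": prompt}

assert build_prompt_kwargs("Hello") == {"text": "Hello"}
assert build_prompt_kwargs(["Hello", "World"]) == {"text": ["Hello", "World"]}
assert build_prompt_kwargs([1, 2, 3]) == {"input_ids": [1, 2, 3]}
```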
{sglang-0.2.0 → sglang-0.2.2}/sglang/srt/utils.py

@@ -626,6 +626,7 @@ def is_llama3_405b_fp8(model_config):
         and model_config.hf_config.intermediate_size == 53248
         and model_config.hf_config.num_hidden_layers == 126
         and model_config.hf_config.num_key_value_heads == 16
+        and hasattr(model_config.hf_config, "quantization_config")
         and model_config.hf_config.quantization_config["quant_method"] == "fbgemm_fp8"
     ):
         return True
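
The extra `hasattr` check makes the FP8-checkpoint detection safe for configs that have no `quantization_config` attribute at all (for example a bf16 checkpoint), instead of raising `AttributeError` on the next clause. A hedged sketch of just that part of the check, using a throwaway config object rather than the real transformers config class:

```python
from types import SimpleNamespace

def quant_method_is_fbgemm_fp8(hf_config) -> bool:
    # Without the hasattr guard, configs lacking quantization_config would raise
    # AttributeError here instead of simply failing the match.
    return (
        hasattr(hf_config, "quantization_config")
        and hf_config.quantization_config["quant_method"] == "fbgemm_fp8"
    )

fp8_cfg = SimpleNamespace(quantization_config={"quant_method": "fbgemm_fp8"})
bf16_cfg = SimpleNamespace()  # no quantization_config attribute at all
assert quant_method_is_fbgemm_fp8(fp8_cfg)
assert not quant_method_is_fbgemm_fp8(bf16_cfg)
```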
sglang-0.2.2/sglang/version.py
ADDED

@@ -0,0 +1 @@
+__version__ = "0.2.2"
{sglang-0.2.0 → sglang-0.2.2/sglang.egg-info}/PKG-INFO

Same changes as the PKG-INFO diff above: the version bump from 0.2.0 to 0.2.2 plus the identical README updates (new blog and paper links, refreshed news items, docker instructions, the Llama 3.1 405B launch section, and the updated benchmark section).
sglang-0.2.0/sglang/version.py
DELETED

@@ -1 +0,0 @@
-__version__ = "0.2.0"