sglang 0.1.22__tar.gz → 0.1.24__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sglang-0.1.22/sglang.egg-info → sglang-0.1.24}/PKG-INFO +9 -5
- {sglang-0.1.22 → sglang-0.1.24}/README.md +7 -3
- {sglang-0.1.22 → sglang-0.1.24}/pyproject.toml +2 -2
- {sglang-0.1.22 → sglang-0.1.24}/sglang/__init__.py +1 -1
- {sglang-0.1.22 → sglang-0.1.24}/sglang/bench_serving.py +243 -25
- {sglang-0.1.22 → sglang-0.1.24}/sglang/global_config.py +3 -2
- {sglang-0.1.22 → sglang-0.1.24}/sglang/lang/interpreter.py +1 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/hf_transformers_utils.py +13 -1
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/layers/logits_processor.py +4 -5
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/layers/radix_attention.py +38 -49
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/managers/controller/cuda_graph_runner.py +58 -16
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/managers/controller/infer_batch.py +51 -22
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/managers/controller/model_runner.py +7 -4
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/managers/controller/schedule_heuristic.py +8 -3
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/managers/controller/tp_worker.py +9 -11
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/memory_pool.py +13 -5
- sglang-0.1.24/sglang/srt/models/deepseek.py +430 -0
- sglang-0.1.24/sglang/srt/models/gpt_bigcode.py +282 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/models/llama2.py +19 -10
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/server.py +20 -1
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/server_args.py +12 -6
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/utils.py +49 -0
- {sglang-0.1.22 → sglang-0.1.24/sglang.egg-info}/PKG-INFO +9 -5
- {sglang-0.1.22 → sglang-0.1.24}/sglang.egg-info/SOURCES.txt +2 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang.egg-info/requires.txt +1 -1
- {sglang-0.1.22 → sglang-0.1.24}/LICENSE +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/setup.cfg +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/api.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/backend/__init__.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/backend/anthropic.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/backend/base_backend.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/backend/litellm.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/backend/openai.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/backend/runtime_endpoint.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/backend/vertexai.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/bench.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/bench_latency.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/check_env.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/lang/__init__.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/lang/backend/__init__.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/lang/backend/anthropic.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/lang/backend/base_backend.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/lang/backend/litellm.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/lang/backend/openai.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/lang/backend/runtime_endpoint.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/lang/backend/vertexai.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/lang/chat_template.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/lang/compiler.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/lang/ir.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/lang/tracer.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/launch_server.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/launch_server_llavavid.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/constrained/__init__.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/constrained/base_cache.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/constrained/fsm_cache.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/constrained/jump_forward.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/conversation.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/flush_cache.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/layers/context_flashattention_nopad.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/layers/extend_attention.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/layers/fused_moe.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/layers/linear.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/layers/quantization/__init__.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/layers/quantization/fp8.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/layers/token_attention.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/managers/controller/dp_worker.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/managers/controller/manager_multi.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/managers/controller/manager_single.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/managers/controller/radix_cache.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/managers/detokenizer_manager.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/managers/io_struct.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/managers/tokenizer_manager.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/mm_utils.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/model_config.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/model_loader/model_loader.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/model_loader/utils.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/models/chatglm.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/models/commandr.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/models/dbrx.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/models/gemma.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/models/gemma2.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/models/grok.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/models/internlm2.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/models/llama_classification.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/models/llava.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/models/llavavid.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/models/minicpm.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/models/mistral.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/models/mixtral.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/models/mixtral_quant.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/models/qwen.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/models/qwen2.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/models/qwen2_moe.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/models/stablelm.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/models/yivl.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/openai_api/adapter.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/openai_api/api_adapter.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/openai_api/openai_api_adapter.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/openai_api/openai_protocol.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/openai_api/protocol.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/openai_api_adapter.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/openai_protocol.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/srt/sampling_params.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/test/test_conversation.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/test/test_openai_protocol.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/test/test_programs.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/test/test_utils.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang/utils.py +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang.egg-info/dependency_links.txt +0 -0
- {sglang-0.1.22 → sglang-0.1.24}/sglang.egg-info/top_level.txt +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.1.22
+Version: 0.1.24
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
         Version 2.0, January 2004
@@ -228,7 +228,7 @@ Requires-Dist: torch; extra == "srt"
 Requires-Dist: uvicorn; extra == "srt"
 Requires-Dist: uvloop; extra == "srt"
 Requires-Dist: zmq; extra == "srt"
-Requires-Dist: vllm==0.5.
+Requires-Dist: vllm==0.5.3.post1; extra == "srt"
 Requires-Dist: outlines>=0.0.44; extra == "srt"
 Provides-Extra: openai
 Requires-Dist: openai>=1.0; extra == "openai"
@@ -282,6 +282,7 @@ The core features include:
 
 ### Method 1: With pip
 ```
+pip install --upgrade pip setuptools wheel
 pip install "sglang[all]"
 
 # Install FlashInfer CUDA kernels
@@ -293,6 +294,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
 git clone https://github.com/sgl-project/sglang.git
 cd sglang
 
+pip install --upgrade pip
 pip install -e "python[all]"
 
 # Install FlashInfer CUDA kernels
@@ -390,15 +392,16 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --mem-fraction-static 0.7
 ```
 - See [hyperparameter_tuning.md](docs/hyperparameter_tuning.md) on tuning hyperparameters for better performance.
-- Add `--nnodes 2` to run tensor parallelism on multiple nodes. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-
+- Add `--nnodes 2` to run tensor parallelism on multiple nodes. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-0` be the hostname of the first node and `50000` be an available port.
 ```
 # Node 0
-python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --tp 4 --nccl-init sgl-dev-
+python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --tp 4 --nccl-init sgl-dev-0:50000 --nnodes 2 --node-rank 0
 
 # Node 1
-python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --tp 4 --nccl-init sgl-dev-
+python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --tp 4 --nccl-init sgl-dev-0:50000 --nnodes 2 --node-rank 1
 ```
 - If the model does not have a template in the Hugging Face tokenizer, you can specify a [custom chat template](docs/custom_chat_template.md).
+- To enable fp8 quantization, you can add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
 
 ### Supported Models
 
@@ -420,6 +423,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 - Grok
 - ChatGLM
 - InternLM 2
+- Mistral NeMo
 
 Instructions for supporting a new model are [here](https://github.com/sgl-project/sglang/blob/main/docs/model_support.md).
 
README.md
@@ -37,6 +37,7 @@ The core features include:
 
 ### Method 1: With pip
 ```
+pip install --upgrade pip setuptools wheel
 pip install "sglang[all]"
 
 # Install FlashInfer CUDA kernels
@@ -48,6 +49,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
 git clone https://github.com/sgl-project/sglang.git
 cd sglang
 
+pip install --upgrade pip
 pip install -e "python[all]"
 
 # Install FlashInfer CUDA kernels
@@ -145,15 +147,16 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --mem-fraction-static 0.7
 ```
 - See [hyperparameter_tuning.md](docs/hyperparameter_tuning.md) on tuning hyperparameters for better performance.
-- Add `--nnodes 2` to run tensor parallelism on multiple nodes. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-
+- Add `--nnodes 2` to run tensor parallelism on multiple nodes. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-0` be the hostname of the first node and `50000` be an available port.
 ```
 # Node 0
-python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --tp 4 --nccl-init sgl-dev-
+python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --tp 4 --nccl-init sgl-dev-0:50000 --nnodes 2 --node-rank 0
 
 # Node 1
-python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --tp 4 --nccl-init sgl-dev-
+python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --tp 4 --nccl-init sgl-dev-0:50000 --nnodes 2 --node-rank 1
 ```
 - If the model does not have a template in the Hugging Face tokenizer, you can specify a [custom chat template](docs/custom_chat_template.md).
+- To enable fp8 quantization, you can add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
 
 ### Supported Models
 
@@ -175,6 +178,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 - Grok
 - ChatGLM
 - InternLM 2
+- Mistral NeMo
 
 Instructions for supporting a new model are [here](https://github.com/sgl-project/sglang/blob/main/docs/model_support.md).
 
pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "sglang"
-version = "0.1.22"
+version = "0.1.24"
 description = "SGLang is yet another fast serving framework for large language models and vision language models."
 readme = "README.md"
 requires-python = ">=3.8"
@@ -21,7 +21,7 @@ dependencies = [
 
 [project.optional-dependencies]
 srt = ["aiohttp", "fastapi", "hf_transfer", "huggingface_hub", "interegular", "packaging", "pillow",
-       "psutil", "pydantic", "torch", "uvicorn", "uvloop", "zmq", "vllm==0.5.
+       "psutil", "pydantic", "torch", "uvicorn", "uvloop", "zmq", "vllm==0.5.3.post1", "outlines>=0.0.44"]
 openai = ["openai>=1.0", "tiktoken"]
 anthropic = ["anthropic>=0.20.0"]
 litellm = ["litellm>=1.0.0"]
sglang/bench_serving.py
@@ -5,6 +5,9 @@ Benchmark online serving.
 
 Usage:
 python3 -m sglang.bench_serving --backend sglang --num-prompt 10
+
+python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompts 3000 --random-input 1024 --random-output 1024 --random-range-ratio 0.5
+python3 -m sglang.bench_serving --backend sglang --dataset-name random --request-rate-range 1,2,4,8,16,32 --random-input 4096 --random-output 1024 --random-range-ratio 0.125 --multi
 """
 
 import argparse
@@ -19,6 +22,7 @@ import traceback
 import warnings
 from argparse import ArgumentParser as FlexibleArgumentParser
 from dataclasses import dataclass, field
+from datetime import datetime
 from typing import AsyncGenerator, List, Optional, Tuple, Union
 
 import aiohttp
@@ -53,12 +57,80 @@ class RequestFuncOutput:
     itl: List[float] = field(default_factory=list)  # List of inter-token latencies
     prompt_len: int = 0
     error: str = ""
+    output_len: int = 0
 
 
 def remove_prefix(text: str, prefix: str) -> str:
     return text[len(prefix) :] if text.startswith(prefix) else text
 
 
+# trt llm not support ignore_eos
+# https://github.com/triton-inference-server/tensorrtllm_backend/issues/505
+async def async_request_trt_llm(
+    request_func_input: RequestFuncInput,
+    pbar: Optional[tqdm] = None,
+) -> RequestFuncOutput:
+    api_url = request_func_input.api_url
+    assert api_url.endswith("generate_stream")
+
+    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
+        payload = {
+            "accumulate_tokens": True,
+            "text_input": request_func_input.prompt,
+            "temperature": 0.000001,
+            "top_p": 1.0,
+            "max_tokens": request_func_input.output_len,
+            "stream": True,
+            "min_length": request_func_input.output_len,
+            "end_id": 1048576,
+        }
+        output = RequestFuncOutput()
+        output.prompt_len = request_func_input.prompt_len
+
+        ttft = 0.0
+        st = time.perf_counter()
+        most_recent_timestamp = st
+        try:
+            async with session.post(url=api_url, json=payload) as response:
+                if response.status == 200:
+                    async for chunk_bytes in response.content:
+                        chunk_bytes = chunk_bytes.strip()
+                        if not chunk_bytes:
+                            continue
+
+                        chunk = remove_prefix(chunk_bytes.decode("utf-8"), "data:")
+
+                        data = json.loads(chunk)
+                        output.generated_text += data["text_output"]
+                        timestamp = time.perf_counter()
+                        # First token
+                        if ttft == 0.0:
+                            ttft = time.perf_counter() - st
+                            output.ttft = ttft
+
+                        # Decoding phase
+                        else:
+                            output.itl.append(timestamp - most_recent_timestamp)
+
+                        most_recent_timestamp = timestamp
+
+                    output.latency = most_recent_timestamp - st
+                    output.success = True
+                    output.output_len = request_func_input.output_len
+
+                else:
+                    output.error = response.reason or ""
+                    output.success = False
+        except Exception:
+            output.success = False
+            exc_info = sys.exc_info()
+            output.error = "".join(traceback.format_exception(*exc_info))
+
+    if pbar:
+        pbar.update(1)
+    return output
+
+
 # set ignore_eos True by default
 async def async_request_openai_completions(
     request_func_input: RequestFuncInput,
@@ -76,7 +148,7 @@ async def async_request_openai_completions(
             "temperature": 0.0,
             "best_of": 1,
             "max_tokens": request_func_input.output_len,
-            "stream":
+            "stream": not args.disable_stream,
            "ignore_eos": True,
        }
        headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}
@@ -99,8 +171,9 @@ async def async_request_openai_completions(
                             continue
 
                         chunk = remove_prefix(chunk_bytes.decode("utf-8"), "data: ")
+                        latency = time.perf_counter() - st
                         if chunk == "[DONE]":
-
+                            pass
                         else:
                             data = json.loads(chunk)
 
@@ -123,6 +196,7 @@ async def async_request_openai_completions(
                     output.generated_text = generated_text
                     output.success = True
                     output.latency = latency
+                    output.output_len = request_func_input.output_len
                 else:
                     output.error = response.reason or ""
                     output.success = False
@@ -167,6 +241,7 @@ ASYNC_REQUEST_FUNCS = {
     "sglang": async_request_openai_completions,
     "vllm": async_request_openai_completions,
     "lmdeploy": async_request_openai_completions,
+    "trt": async_request_trt_llm,
 }
 
 
@@ -175,9 +250,11 @@ class BenchmarkMetrics:
     completed: int
     total_input: int
     total_output: int
+    total_output_retokenized: int
     request_throughput: float
     input_throughput: float
     output_throughput: float
+    output_throughput_retokenized: float
     mean_ttft_ms: float
     median_ttft_ms: float
     std_ttft_ms: float
@@ -190,6 +267,8 @@ class BenchmarkMetrics:
     median_itl_ms: float
     std_itl_ms: float
     p99_itl_ms: float
+    mean_e2e_latency_ms: float
+    median_e2e_latency_ms: float
 
 
 default_sharegpt_path = "ShareGPT_V3_unfiltered_cleaned_split.json"
@@ -384,31 +463,36 @@ def calculate_metrics(
     outputs: List[RequestFuncOutput],
     dur_s: float,
     tokenizer: PreTrainedTokenizerBase,
+    backend: str,
 ) -> Tuple[BenchmarkMetrics, List[int]]:
-
+    output_lens: List[int] = []
+    retokenized_output_lens: List[int] = []
     total_input = 0
     completed = 0
     itls: List[float] = []
     tpots: List[float] = []
     ttfts: List[float] = []
+    e2e_latencies: List[float] = []
     for i in range(len(outputs)):
         if outputs[i].success:
-
-
-
-            # Note : this may inflate the output token count slightly
-            output_len = len(
+            output_len = outputs[i].output_len
+            output_lens.append(output_len)
+            retokenized_output_len = len(
                 tokenizer(outputs[i].generated_text, add_special_tokens=False).input_ids
             )
-
+            retokenized_output_lens.append(retokenized_output_len)
             total_input += input_requests[i][1]
             if output_len > 1:
                 tpots.append((outputs[i].latency - outputs[i].ttft) / (output_len - 1))
             itls += outputs[i].itl
             ttfts.append(outputs[i].ttft)
+
+            e2e_latencies.append(outputs[i].latency)
+
             completed += 1
         else:
-
+            output_lens.append(0)
+            retokenized_output_lens.append(0)
 
     if completed == 0:
         warnings.warn(
@@ -419,10 +503,12 @@ def calculate_metrics(
     metrics = BenchmarkMetrics(
         completed=completed,
         total_input=total_input,
-        total_output=sum(
+        total_output=sum(output_lens),
+        total_output_retokenized=sum(retokenized_output_lens),
         request_throughput=completed / dur_s,
         input_throughput=total_input / dur_s,
-        output_throughput=sum(
+        output_throughput=sum(output_lens) / dur_s,
+        output_throughput_retokenized=sum(retokenized_output_lens) / dur_s,
         mean_ttft_ms=np.mean(ttfts or 0)
         * 1000,  # ttfts is empty if streaming is not supported by backend
         median_ttft_ms=np.median(ttfts or 0) * 1000,
@@ -436,9 +522,11 @@ def calculate_metrics(
         median_itl_ms=np.median(itls or 0) * 1000,
         std_itl_ms=np.std(itls or 0) * 1000,
         p99_itl_ms=np.percentile(itls or 0, 99) * 1000,
+        mean_e2e_latency_ms=np.mean(e2e_latencies) * 1000,
+        median_e2e_latency_ms=np.median(e2e_latencies) * 1000,
     )
 
-    return metrics,
+    return metrics, output_lens
 
 
 async def benchmark(
@@ -449,6 +537,7 @@ async def benchmark(
     input_requests: List[Tuple[str, int, int]],
     request_rate: float,
     disable_tqdm: bool,
+    enable_multi: bool,
 ):
     if backend in ASYNC_REQUEST_FUNCS:
         request_func = ASYNC_REQUEST_FUNCS[backend]
@@ -498,19 +587,26 @@ async def benchmark(
 
     benchmark_duration = time.perf_counter() - benchmark_start_time
 
-    metrics,
+    metrics, output_lens = calculate_metrics(
         input_requests=input_requests,
         outputs=outputs,
         dur_s=benchmark_duration,
         tokenizer=tokenizer,
+        backend=backend,
     )
 
     print("\n{s:{c}^{n}}".format(s=" Serving Benchmark Result ", n=50, c="="))
+    print("{:<40} {:<10}".format("Backend:", backend))
     print("{:<40} {:<10}".format("Traffic request rate:", request_rate))
     print("{:<40} {:<10}".format("Successful requests:", metrics.completed))
     print("{:<40} {:<10.2f}".format("Benchmark duration (s):", benchmark_duration))
     print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input))
     print("{:<40} {:<10}".format("Total generated tokens:", metrics.total_output))
+    print(
+        "{:<40} {:<10}".format(
+            "Total generated tokens (retokenized):", metrics.total_output_retokenized
+        )
+    )
     print(
         "{:<40} {:<10.2f}".format(
             "Request throughput (req/s):", metrics.request_throughput
@@ -526,6 +622,15 @@ async def benchmark(
             "Output token throughput (tok/s):", metrics.output_throughput
         )
     )
+    print("{s:{c}^{n}}".format(s="End-to-End Latency", n=50, c="-"))
+    print(
+        "{:<40} {:<10.2f}".format("Mean E2E Latency (ms):", metrics.mean_e2e_latency_ms)
+    )
+    print(
+        "{:<40} {:<10.2f}".format(
+            "Median E2E Latency (ms):", metrics.median_e2e_latency_ms
+        )
+    )
     print("{s:{c}^{n}}".format(s="Time to First Token", n=50, c="-"))
     print("{:<40} {:<10.2f}".format("Mean TTFT (ms):", metrics.mean_ttft_ms))
     print("{:<40} {:<10.2f}".format("Median TTFT (ms):", metrics.median_ttft_ms))
@@ -542,11 +647,53 @@ async def benchmark(
     print("{:<40} {:<10.2f}".format("P99 ITL (ms):", metrics.p99_itl_ms))
     print("=" * 50)
 
+    if (
+        metrics.median_ttft_ms is not None
+        and metrics.mean_itl_ms is not None
+        and metrics.output_throughput is not None
+    ):
+        result = {
+            "backend": args.backend,
+            "dataset_name": args.dataset_name,
+            "request_rate": request_rate,
+            "total_input": metrics.total_input,
+            "total_output": metrics.total_output,
+            "total_output_retokenized": metrics.total_output_retokenized,
+            "mean_e2e_latency": metrics.mean_e2e_latency_ms,
+            "median_e2e_latency": metrics.median_e2e_latency_ms,
+            "median_ttft": metrics.median_ttft_ms,
+            "median_itl": metrics.median_itl_ms,
+            "output_token_throughput": metrics.output_throughput,
+            "sharegpt_output_len": args.sharegpt_output_len,
+            "random_input_len": args.random_input_len,
+            "random_output_len": args.random_output_len,
+            "random_range_ratio": args.random_range_ratio,
+            "benchmark_duration": benchmark_duration,
+        }
+    else:
+        print(f"Error running benchmark for request rate: {request_rate}")
+        print("-" * 30)
+
+    # Determine output file name
+    if args.output_file:
+        output_file_name = args.output_file
+    else:
+        now = datetime.now().strftime("%m%d")
+        if args.dataset_name == "random":
+            output_file_name = f"{args.backend}_{now}_{args.num_prompts}_{args.random_input_len}_{args.random_output_len}.jsonl"
+        else:
+            output_file_name = f"{args.backend}_{now}_{args.num_prompts}_sharegpt.jsonl"
+
+    # Append results to a JSONL file
+    with open(output_file_name, "a") as file:
+        file.write(json.dumps(result) + "\n")
+
     result = {
         "duration": benchmark_duration,
         "completed": metrics.completed,
         "total_input_tokens": metrics.total_input,
         "total_output_tokens": metrics.total_output,
+        "total_output_tokens_retokenized": metrics.total_output_retokenized,
         "request_throughput": metrics.request_throughput,
         "input_throughput": metrics.input_throughput,
         "output_throughput": metrics.output_throughput,
@@ -563,15 +710,34 @@ async def benchmark(
         "std_itl_ms": metrics.std_itl_ms,
         "p99_itl_ms": metrics.p99_itl_ms,
         "input_lens": [output.prompt_len for output in outputs],
-        "output_lens":
+        "output_lens": output_lens,
         "ttfts": [output.ttft for output in outputs],
         "itls": [output.itl for output in outputs],
         "generated_texts": [output.generated_text for output in outputs],
         "errors": [output.error for output in outputs],
+        "mean_e2e_latency_ms": metrics.mean_e2e_latency_ms,
+        "median_e2e_latency_ms": metrics.median_e2e_latency_ms,
     }
     return result
 
 
+def parse_request_rate_range(request_rate_range):
+    if len(request_rate_range.split(",")) == 3:
+        start, stop, step = map(int, request_rate_range.split(","))
+        return list(range(start, stop, step))
+    else:
+        return list(map(int, request_rate_range.split(",")))
+
+
+def check_chat_template(model_path):
+    try:
+        tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+        return "chat_template" in tokenizer.init_kwargs
+    except Exception as e:
+        print(f"Fail to load tokenizer config with error={e}")
+        return False
+
+
 def fire(args: argparse.Namespace):
     random.seed(args.seed)
     np.random.seed(args.seed)
@@ -581,6 +747,7 @@ def fire(args: argparse.Namespace):
         "sglang": 30000,
         "lmdeploy": 23333,
         "vllm": 8000,
+        "trt": 8000,
     }.get(args.backend, 30000)
 
     api_url = (
@@ -594,6 +761,16 @@ def fire(args: argparse.Namespace):
         else f"http://{args.host}:{args.port}/v1/models"
     )
 
+    if args.backend == "trt":
+        api_url = (
+            f"{args.base_url}/v2/models/ensemble/generate_stream"
+            if args.base_url
+            else f"http://{args.host}:{args.port}/v2/models/ensemble/generate_stream"
+        )
+        if args.model is None:
+            print("Please provide a model using `--model` when using `trt` backend.")
+            sys.exit(1)
+
     if args.model is None:
         try:
             response = requests.get(model_url)
@@ -610,6 +787,12 @@ def fire(args: argparse.Namespace):
         print("No model specified or found. Please provide a model using `--model`.")
         sys.exit(1)
 
+    if not check_chat_template(args.model):
+        print(
+            "\nWARNING It is recommended to use the `Chat` or `Instruct` model for benchmarking.\n"
+            "Because when the tokenizer counts the output tokens, if there is gibberish, it might count incorrectly.\n"
+        )
+
     print(f"{args}\n")
 
     backend = args.backend
@@ -637,17 +820,35 @@ def fire(args: argparse.Namespace):
     else:
         raise ValueError(f"Unknown dataset: {args.dataset_name}")
 
-
-
-
-
-
-
-
-
-
+    if args.multi:
+        request_rates = parse_request_rate_range(args.request_rate_range)
+
+        for rate in request_rates:
+            asyncio.run(
+                benchmark(
+                    backend=backend,
+                    api_url=api_url,
+                    model_id=model_id,
+                    tokenizer=tokenizer,
+                    input_requests=input_requests,
+                    request_rate=rate,
+                    disable_tqdm=args.disable_tqdm,
+                    enable_multi=args.multi,
+                )
+            )
+    else:
+        asyncio.run(
+            benchmark(
+                backend=backend,
+                api_url=api_url,
+                model_id=model_id,
+                tokenizer=tokenizer,
+                input_requests=input_requests,
+                request_rate=args.request_rate,
+                disable_tqdm=args.disable_tqdm,
+                enable_multi=args.multi,
+            )
         )
-    )
 
 
 # to avoid relying on SGLang's components
@@ -751,6 +952,23 @@ if __name__ == "__main__":
         action="store_true",
         help="Specify to disable tqdm progress bar.",
     )
+    parser.add_argument(
+        "--multi",
+        action="store_true",
+        help="Use request rate range rather than single value.",
+    )
+    parser.add_argument(
+        "--request-rate-range",
+        type=str,
+        default="2,34,2",
+        help="Range of request rates in the format start,stop,step. Default is 2,34,2. It also supports a list of request rates, requiring the parameters to not equal three.",
+    )
+    parser.add_argument("--output-file", type=str, help="Output JSONL file name.")
+    parser.add_argument(
+        "--disable-stream",
+        action="store_true",
+        help="Disable streaming mode.",
+    )
 
     set_ulimit()
 
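The new `--multi` mode sweeps a set of request rates parsed by the `parse_request_rate_range` helper added above. A minimal sketch of that parsing rule, mirroring the function from the diff (the example inputs shown here are illustrative, not taken from the package):

```python
# Sketch of the request-rate parsing rule added in sglang/bench_serving.py.
# Three comma-separated values are treated as start,stop,step; any other
# count is treated as an explicit list of request rates.
def parse_request_rate_range(request_rate_range: str):
    if len(request_rate_range.split(",")) == 3:
        start, stop, step = map(int, request_rate_range.split(","))
        return list(range(start, stop, step))
    else:
        return list(map(int, request_rate_range.split(",")))


print(parse_request_rate_range("2,34,2"))         # default: [2, 4, 6, ..., 32]
print(parse_request_rate_range("1,2,4,8,16,32"))  # explicit list: [1, 2, 4, 8, 16, 32]
```

With `--multi`, the benchmark loop runs once per parsed rate and appends one JSON line per run to the output file named by `--output-file` (or the auto-generated `*.jsonl` name).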
sglang/global_config.py
@@ -16,9 +16,9 @@ class GlobalConfig:
         self.wait_for_new_request_delay = 0.0006
 
         # Runtime constants: New generation token ratio estimation
-        self.
+        self.init_new_token_ratio = 0.7
         self.base_min_new_token_ratio = 0.2
-        self.new_token_ratio_decay = 0.
+        self.new_token_ratio_decay = 0.001
         self.new_token_ratio_recovery = 0.05
 
         # Runtime constants: The threshold (number of tokens) to trigger layer-wise cuda sync.
@@ -27,6 +27,7 @@ class GlobalConfig:
 
         # Runtime constants: others
         self.num_continue_decode_steps = 10
+        self.retract_decode_steps = 20
         self.flashinfer_workspace_size = 192 * 1024 * 1024
 
         # Output tokenization configs
sglang/lang/interpreter.py
@@ -288,6 +288,7 @@ class StreamExecutor:
             exes[i].text_ = str(self.text_)
             exes[i].messages_ = list(self.messages_)
             exes[i].cur_role = self.cur_role
+            exes[i].cur_role_begin_pos = self.cur_role_begin_pos
             exes[i].fork_start_text_pos = len(self.text_)
             exes[i].images_ = list(self.images_)
 
sglang/srt/hf_transformers_utils.py
@@ -4,19 +4,26 @@ import functools
 import json
 import os
 import warnings
-from typing import AbstractSet, Collection, Literal, Optional, Union
+from typing import AbstractSet, Collection, Dict, Literal, Optional, Type, Union
 
 from huggingface_hub import snapshot_download
 from transformers import (
     AutoConfig,
     AutoProcessor,
     AutoTokenizer,
+    PretrainedConfig,
     PreTrainedTokenizer,
     PreTrainedTokenizerFast,
 )
+from vllm.transformers_utils.configs import ChatGLMConfig, DbrxConfig
 
 from sglang.srt.utils import is_multimodal_model
 
+_CONFIG_REGISTRY: Dict[str, Type[PretrainedConfig]] = {
+    ChatGLMConfig.model_type: ChatGLMConfig,
+    DbrxConfig.model_type: DbrxConfig,
+}
+
 
 def download_from_hf(model_path: str):
     if os.path.exists(model_path):
@@ -40,6 +47,9 @@ def get_config(
     config = AutoConfig.from_pretrained(
         model, trust_remote_code=trust_remote_code, revision=revision
     )
+    if config.model_type in _CONFIG_REGISTRY:
+        config_class = _CONFIG_REGISTRY[config.model_type]
+        config = config_class.from_pretrained(model, revision=revision)
     if model_overide_args:
         config.update(model_overide_args)
     return config
@@ -63,6 +73,8 @@ def get_context_length(config):
     rope_scaling = getattr(config, "rope_scaling", None)
     if rope_scaling:
         rope_scaling_factor = config.rope_scaling["factor"]
+        if config.rope_scaling["rope_type"] == "llama3":
+            rope_scaling_factor = 1
     else:
         rope_scaling_factor = 1
 
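For readers skimming the `hf_transformers_utils.py` hunks: the new `_CONFIG_REGISTRY` lets `get_config` replace a generically loaded Hugging Face config with a model-specific class imported from vLLM when the `model_type` matches. A minimal standalone sketch of that dispatch pattern follows; it assumes `transformers` and `vllm` are installed, and the `load_config` name is illustrative rather than the packaged function itself.

```python
# Illustrative sketch of the config-registry dispatch added to get_config();
# not the packaged function itself.
from typing import Dict, Optional, Type

from transformers import AutoConfig, PretrainedConfig
from vllm.transformers_utils.configs import ChatGLMConfig, DbrxConfig

_CONFIG_REGISTRY: Dict[str, Type[PretrainedConfig]] = {
    ChatGLMConfig.model_type: ChatGLMConfig,
    DbrxConfig.model_type: DbrxConfig,
}


def load_config(model: str, trust_remote_code: bool = False, revision: Optional[str] = None):
    # Resolve the config generically first, then swap in the registered
    # class when the model_type is one of the special cases above.
    config = AutoConfig.from_pretrained(
        model, trust_remote_code=trust_remote_code, revision=revision
    )
    if config.model_type in _CONFIG_REGISTRY:
        config_class = _CONFIG_REGISTRY[config.model_type]
        config = config_class.from_pretrained(model, revision=revision)
    return config
```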