sglang 0.2.10.tar.gz → 0.2.11.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sglang-0.2.10/sglang.egg-info → sglang-0.2.11}/PKG-INFO +34 -24
- {sglang-0.2.10 → sglang-0.2.11}/README.md +25 -21
- {sglang-0.2.10 → sglang-0.2.11}/pyproject.toml +5 -3
- {sglang-0.2.10 → sglang-0.2.11}/sglang/__init__.py +8 -0
- {sglang-0.2.10 → sglang-0.2.11}/sglang/api.py +10 -2
- {sglang-0.2.10 → sglang-0.2.11}/sglang/bench_latency.py +145 -36
- {sglang-0.2.10 → sglang-0.2.11}/sglang/check_env.py +24 -2
- {sglang-0.2.10 → sglang-0.2.11}/sglang/global_config.py +0 -1
- {sglang-0.2.10 → sglang-0.2.11}/sglang/lang/backend/base_backend.py +3 -1
- {sglang-0.2.10 → sglang-0.2.11}/sglang/lang/backend/openai.py +8 -3
- {sglang-0.2.10 → sglang-0.2.11}/sglang/lang/backend/runtime_endpoint.py +46 -29
- sglang-0.2.11/sglang/lang/choices.py +164 -0
- {sglang-0.2.10 → sglang-0.2.11}/sglang/lang/interpreter.py +6 -13
- {sglang-0.2.10 → sglang-0.2.11}/sglang/lang/ir.py +11 -2
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/layers/logits_processor.py +1 -1
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/layers/radix_attention.py +2 -5
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/managers/schedule_batch.py +95 -324
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/managers/tokenizer_manager.py +6 -3
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/managers/tp_worker.py +20 -22
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/mem_cache/memory_pool.py +9 -14
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/model_executor/cuda_graph_runner.py +3 -3
- sglang-0.2.11/sglang/srt/model_executor/forward_batch_info.py +256 -0
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/model_executor/model_runner.py +6 -10
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/models/chatglm.py +1 -1
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/models/commandr.py +1 -1
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/models/dbrx.py +1 -1
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/models/deepseek.py +1 -1
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/models/deepseek_v2.py +1 -1
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/models/gemma.py +1 -1
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/models/gemma2.py +1 -1
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/models/gpt_bigcode.py +1 -1
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/models/grok.py +1 -1
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/models/internlm2.py +1 -1
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/models/llama2.py +1 -1
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/models/llama_classification.py +1 -1
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/models/llava.py +1 -2
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/models/llavavid.py +1 -2
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/models/minicpm.py +1 -1
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/models/mixtral.py +1 -1
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/models/mixtral_quant.py +1 -1
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/models/qwen.py +1 -1
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/models/qwen2.py +1 -1
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/models/qwen2_moe.py +1 -1
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/models/stablelm.py +1 -1
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/openai_api/adapter.py +34 -12
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/openai_api/protocol.py +6 -0
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/server.py +24 -6
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/server_args.py +4 -0
- {sglang-0.2.10 → sglang-0.2.11}/sglang/test/test_utils.py +1 -1
- sglang-0.2.11/sglang/version.py +1 -0
- {sglang-0.2.10 → sglang-0.2.11/sglang.egg-info}/PKG-INFO +34 -24
- {sglang-0.2.10 → sglang-0.2.11}/sglang.egg-info/SOURCES.txt +2 -0
- {sglang-0.2.10 → sglang-0.2.11}/sglang.egg-info/requires.txt +10 -2
- sglang-0.2.10/sglang/version.py +0 -1
- {sglang-0.2.10 → sglang-0.2.11}/LICENSE +0 -0
- {sglang-0.2.10 → sglang-0.2.11}/setup.cfg +0 -0
- {sglang-0.2.10 → sglang-0.2.11}/sglang/bench_serving.py +0 -0
- {sglang-0.2.10 → sglang-0.2.11}/sglang/lang/__init__.py +0 -0
- {sglang-0.2.10 → sglang-0.2.11}/sglang/lang/backend/__init__.py +0 -0
- {sglang-0.2.10 → sglang-0.2.11}/sglang/lang/backend/anthropic.py +0 -0
- {sglang-0.2.10 → sglang-0.2.11}/sglang/lang/backend/litellm.py +0 -0
- {sglang-0.2.10 → sglang-0.2.11}/sglang/lang/backend/vertexai.py +0 -0
- {sglang-0.2.10 → sglang-0.2.11}/sglang/lang/chat_template.py +0 -0
- {sglang-0.2.10 → sglang-0.2.11}/sglang/lang/compiler.py +0 -0
- {sglang-0.2.10 → sglang-0.2.11}/sglang/lang/tracer.py +0 -0
- {sglang-0.2.10 → sglang-0.2.11}/sglang/launch_server.py +0 -0
- {sglang-0.2.10 → sglang-0.2.11}/sglang/launch_server_llavavid.py +0 -0
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/constrained/__init__.py +0 -0
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/constrained/base_tool_cache.py +0 -0
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/constrained/fsm_cache.py +0 -0
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/constrained/jump_forward.py +0 -0
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/conversation.py +0 -0
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/hf_transformers_utils.py +0 -0
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/layers/context_flashattention_nopad.py +0 -0
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/layers/extend_attention.py +0 -0
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/layers/fused_moe.py +0 -0
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/layers/linear.py +0 -0
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/layers/quantization/__init__.py +0 -0
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/layers/quantization/fp8.py +0 -0
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/layers/token_attention.py +0 -0
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/managers/controller_multi.py +0 -0
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/managers/controller_single.py +0 -0
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/managers/detokenizer_manager.py +0 -0
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/managers/io_struct.py +0 -0
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/managers/policy_scheduler.py +0 -0
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/mem_cache/base_cache.py +0 -0
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/mem_cache/chunk_cache.py +0 -0
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/mem_cache/flush_cache.py +0 -0
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/mem_cache/radix_cache.py +0 -0
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/mm_utils.py +0 -0
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/model_config.py +0 -0
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/model_loader/model_loader.py +0 -0
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/model_loader/utils.py +0 -0
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/models/mistral.py +0 -0
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/models/yivl.py +0 -0
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/sampling_params.py +0 -0
- {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/utils.py +0 -0
- {sglang-0.2.10 → sglang-0.2.11}/sglang/test/run_eval.py +0 -0
- {sglang-0.2.10 → sglang-0.2.11}/sglang/test/runners.py +0 -0
- {sglang-0.2.10 → sglang-0.2.11}/sglang/test/simple_eval_common.py +0 -0
- {sglang-0.2.10 → sglang-0.2.11}/sglang/test/simple_eval_gpqa.py +0 -0
- {sglang-0.2.10 → sglang-0.2.11}/sglang/test/simple_eval_humaneval.py +0 -0
- {sglang-0.2.10 → sglang-0.2.11}/sglang/test/simple_eval_math.py +0 -0
- {sglang-0.2.10 → sglang-0.2.11}/sglang/test/simple_eval_mmlu.py +0 -0
- {sglang-0.2.10 → sglang-0.2.11}/sglang/test/test_programs.py +0 -0
- {sglang-0.2.10 → sglang-0.2.11}/sglang/utils.py +0 -0
- {sglang-0.2.10 → sglang-0.2.11}/sglang.egg-info/dependency_links.txt +0 -0
- {sglang-0.2.10 → sglang-0.2.11}/sglang.egg-info/top_level.txt +0 -0
{sglang-0.2.10/sglang.egg-info → sglang-0.2.11}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.2.10
+Version: 0.2.11
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
                            Version 2.0, January 2004
@@ -220,7 +220,6 @@ Requires-Dist: fastapi; extra == "srt"
 Requires-Dist: hf_transfer; extra == "srt"
 Requires-Dist: huggingface_hub; extra == "srt"
 Requires-Dist: interegular; extra == "srt"
-Requires-Dist: jsonlines; extra == "srt"
 Requires-Dist: packaging; extra == "srt"
 Requires-Dist: pillow; extra == "srt"
 Requires-Dist: psutil; extra == "srt"
@@ -230,7 +229,7 @@ Requires-Dist: torch; extra == "srt"
 Requires-Dist: uvicorn; extra == "srt"
 Requires-Dist: uvloop; extra == "srt"
 Requires-Dist: zmq; extra == "srt"
-Requires-Dist: vllm==0.5.
+Requires-Dist: vllm==0.5.4; extra == "srt"
 Requires-Dist: outlines>=0.0.44; extra == "srt"
 Provides-Extra: openai
 Requires-Dist: openai>=1.0; extra == "openai"
@@ -239,11 +238,18 @@ Provides-Extra: anthropic
 Requires-Dist: anthropic>=0.20.0; extra == "anthropic"
 Provides-Extra: litellm
 Requires-Dist: litellm>=1.0.0; extra == "litellm"
+Provides-Extra: test
+Requires-Dist: jsonlines; extra == "test"
+Requires-Dist: matplotlib; extra == "test"
+Requires-Dist: pandas; extra == "test"
 Provides-Extra: all
 Requires-Dist: sglang[srt]; extra == "all"
 Requires-Dist: sglang[openai]; extra == "all"
 Requires-Dist: sglang[anthropic]; extra == "all"
 Requires-Dist: sglang[litellm]; extra == "all"
+Provides-Extra: dev
+Requires-Dist: sglang[all]; extra == "dev"
+Requires-Dist: sglang[test]; extra == "dev"
 
 <div align="center">
 <img src="https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png" alt="logo" width="400"></img>
@@ -296,20 +302,20 @@ pip install --upgrade pip
 pip install "sglang[all]"
 
 # Install FlashInfer CUDA kernels
-pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.
+pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
 ```
 
 ### Method 2: From source
 ```
 # Use the last release branch
-git clone -b v0.2.
+git clone -b v0.2.11 https://github.com/sgl-project/sglang.git
 cd sglang
 
 pip install --upgrade pip
 pip install -e "python[all]"
 
 # Install FlashInfer CUDA kernels
-pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.
+pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
 ```
 
 ### Method 3: Using docker
@@ -383,7 +389,7 @@ response = client.chat.completions.create(
 print(response)
 ```
 
-It supports streaming, vision, and most features of the Chat/Completions/Models endpoints specified by the [OpenAI API Reference](https://platform.openai.com/docs/api-reference/).
+It supports streaming, vision, and most features of the Chat/Completions/Models/Batch endpoints specified by the [OpenAI API Reference](https://platform.openai.com/docs/api-reference/).
 
 ### Additional Server Arguments
 - Add `--tp 2` to enable tensor parallelism. If it indicates `peer access is not supported between these two devices`, add `--enable-p2p-check` option.
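For context, the new Batch mention above corresponds to the `sglang/srt/openai_api/adapter.py` and `protocol.py` changes in the file list. Below is a minimal sketch of driving it through the official `openai` client; the base URL, model name, file name, and JSONL payload are illustrative assumptions that follow the OpenAI Batch reference, not something spelled out in this diff.

```python
# Hedged sketch: assumes a local SGLang server on port 30000 exposing
# OpenAI-compatible /v1/files and /v1/batches routes, per the note above.
import json

from openai import OpenAI

client = OpenAI(base_url="http://localhost:30000/v1", api_key="EMPTY")

# One chat-completion request per line, in the OpenAI batch input format.
with open("batch_input.jsonl", "w") as f:
    f.write(
        json.dumps(
            {
                "custom_id": "req-1",
                "method": "POST",
                "url": "/v1/chat/completions",
                "body": {
                    "model": "meta-llama/Meta-Llama-3-8B-Instruct",
                    "messages": [{"role": "user", "content": "Say hello."}],
                },
            }
        )
        + "\n"
    )

# Upload the file, create the batch job, then poll its status.
uploaded = client.files.create(file=open("batch_input.jsonl", "rb"), purpose="batch")
batch = client.batches.create(
    input_file_id=uploaded.id,
    endpoint="/v1/chat/completions",
    completion_window="24h",
)
print(client.batches.retrieve(batch.id).status)
```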
@@ -394,10 +400,14 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 ```
 python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --dp 2 --tp 2
 ```
-- If you see out-of-memory errors during serving, please try to reduce the memory usage of the KV cache pool by setting a smaller value of `--mem-fraction-static`. The default value is `0.9
+- If you see out-of-memory errors during serving, please try to reduce the memory usage of the KV cache pool by setting a smaller value of `--mem-fraction-static`. The default value is `0.9`.
 ```
 python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --mem-fraction-static 0.7
 ```
+- If you see out-of-memory errors during prefill for long prompts on a model that supports long context, consider using chunked prefill.
+```
+python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --port 30000 --chunked-prefill-size 8192
+```
 - See [hyperparameter_tuning.md](docs/en/hyperparameter_tuning.md) on tuning hyperparameters for better performance.
 - Add `--nnodes 2` to run tensor parallelism on multiple nodes. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-0` be the hostname of the first node and `50000` be an available port.
 ```
@@ -411,22 +421,6 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 - To enable fp8 quantization, you can add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
 - To enable experimental torch.compile support, you can add `--enable-torch-compile`. It accelerates small models on small batch sizes.
 
-### Run Llama 3.1 405B
-
-```bash
-## Run 405B (fp8) on a single node
-python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct-FP8 --tp 8
-
-## Run 405B (fp16) on two nodes
-# replace the `172.16.4.52:20000` with your own first node ip address and port, disable CUDA Graph temporarily
-
-# on the first node
-GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 0 --disable-cuda-graph --mem-frac 0.75
-
-# on the second
-GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 1 --disable-cuda-graph --mem-frac 0.75
-```
-
 ### Supported Models
 
 - Llama / Llama 2 / Llama 3 / Llama 3.1
@@ -452,6 +446,22 @@ GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/
 
 Instructions for supporting a new model are [here](https://github.com/sgl-project/sglang/blob/main/docs/en/model_support.md).
 
+### Run Llama 3.1 405B
+
+```bash
+## Run 405B (fp8) on a single node
+python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct-FP8 --tp 8
+
+## Run 405B (fp16) on two nodes
+# replace the `172.16.4.52:20000` with your own first node ip address and port, disable CUDA Graph temporarily
+
+# on the first node
+GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 0 --disable-cuda-graph --mem-frac 0.75
+
+# on the second
+GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 1 --disable-cuda-graph --mem-frac 0.75
+```
+
 ### Benchmark Performance
 
 - Benchmark a single static batch by running the following command without launching a server. The arguments are the same as for `launch_server.py`. Note that this is not a dynamic batching server, so it may run out of memory for a batch size that a real server can handle. A real server truncates the prefill into several batches, while this unit test does not. For accurate large batch testing, consider using `sglang.bench_serving`.
{sglang-0.2.10 → sglang-0.2.11}/README.md

@@ -49,20 +49,20 @@ pip install --upgrade pip
 pip install "sglang[all]"
 
 # Install FlashInfer CUDA kernels
-pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.
+pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
 ```
 
 ### Method 2: From source
 ```
 # Use the last release branch
-git clone -b v0.2.
+git clone -b v0.2.11 https://github.com/sgl-project/sglang.git
 cd sglang
 
 pip install --upgrade pip
 pip install -e "python[all]"
 
 # Install FlashInfer CUDA kernels
-pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.
+pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
 ```
 
 ### Method 3: Using docker
@@ -136,7 +136,7 @@ response = client.chat.completions.create(
 print(response)
 ```
 
-It supports streaming, vision, and most features of the Chat/Completions/Models endpoints specified by the [OpenAI API Reference](https://platform.openai.com/docs/api-reference/).
+It supports streaming, vision, and most features of the Chat/Completions/Models/Batch endpoints specified by the [OpenAI API Reference](https://platform.openai.com/docs/api-reference/).
 
 ### Additional Server Arguments
 - Add `--tp 2` to enable tensor parallelism. If it indicates `peer access is not supported between these two devices`, add `--enable-p2p-check` option.
@@ -147,10 +147,14 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 ```
 python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --dp 2 --tp 2
 ```
-- If you see out-of-memory errors during serving, please try to reduce the memory usage of the KV cache pool by setting a smaller value of `--mem-fraction-static`. The default value is `0.9
+- If you see out-of-memory errors during serving, please try to reduce the memory usage of the KV cache pool by setting a smaller value of `--mem-fraction-static`. The default value is `0.9`.
 ```
 python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --mem-fraction-static 0.7
 ```
+- If you see out-of-memory errors during prefill for long prompts on a model that supports long context, consider using chunked prefill.
+```
+python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --port 30000 --chunked-prefill-size 8192
+```
 - See [hyperparameter_tuning.md](docs/en/hyperparameter_tuning.md) on tuning hyperparameters for better performance.
 - Add `--nnodes 2` to run tensor parallelism on multiple nodes. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-0` be the hostname of the first node and `50000` be an available port.
 ```
@@ -164,22 +168,6 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 - To enable fp8 quantization, you can add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
 - To enable experimental torch.compile support, you can add `--enable-torch-compile`. It accelerates small models on small batch sizes.
 
-### Run Llama 3.1 405B
-
-```bash
-## Run 405B (fp8) on a single node
-python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct-FP8 --tp 8
-
-## Run 405B (fp16) on two nodes
-# replace the `172.16.4.52:20000` with your own first node ip address and port, disable CUDA Graph temporarily
-
-# on the first node
-GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 0 --disable-cuda-graph --mem-frac 0.75
-
-# on the second
-GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 1 --disable-cuda-graph --mem-frac 0.75
-```
-
 ### Supported Models
 
 - Llama / Llama 2 / Llama 3 / Llama 3.1
@@ -205,6 +193,22 @@ GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/
 
 Instructions for supporting a new model are [here](https://github.com/sgl-project/sglang/blob/main/docs/en/model_support.md).
 
+### Run Llama 3.1 405B
+
+```bash
+## Run 405B (fp8) on a single node
+python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct-FP8 --tp 8
+
+## Run 405B (fp16) on two nodes
+# replace the `172.16.4.52:20000` with your own first node ip address and port, disable CUDA Graph temporarily
+
+# on the first node
+GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 0 --disable-cuda-graph --mem-frac 0.75
+
+# on the second
+GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 1 --disable-cuda-graph --mem-frac 0.75
+```
+
 ### Benchmark Performance
 
 - Benchmark a single static batch by running the following command without launching a server. The arguments are the same as for `launch_server.py`. Note that this is not a dynamic batching server, so it may run out of memory for a batch size that a real server can handle. A real server truncates the prefill into several batches, while this unit test does not. For accurate large batch testing, consider using `sglang.bench_serving`.
{sglang-0.2.10 → sglang-0.2.11}/pyproject.toml

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "sglang"
-version = "0.2.10"
+version = "0.2.11"
 description = "SGLang is yet another fast serving framework for large language models and vision language models."
 readme = "README.md"
 requires-python = ">=3.8"
@@ -20,14 +20,16 @@ dependencies = [
 ]
 
 [project.optional-dependencies]
-srt = ["aiohttp", "fastapi", "hf_transfer", "huggingface_hub", "interegular",
+srt = ["aiohttp", "fastapi", "hf_transfer", "huggingface_hub", "interegular",
        "packaging", "pillow", "psutil", "pydantic", "python-multipart",
        "torch", "uvicorn", "uvloop", "zmq",
-       "vllm==0.5.
+       "vllm==0.5.4", "outlines>=0.0.44"]
 openai = ["openai>=1.0", "tiktoken"]
 anthropic = ["anthropic>=0.20.0"]
 litellm = ["litellm>=1.0.0"]
+test = ["jsonlines", "matplotlib", "pandas"]
 all = ["sglang[srt]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
+dev = ["sglang[all]", "sglang[test]"]
 
 [project.urls]
 "Homepage" = "https://github.com/sgl-project/sglang"
{sglang-0.2.10 → sglang-0.2.11}/sglang/__init__.py

@@ -22,6 +22,11 @@ from sglang.api import (
     user_end,
     video,
 )
+from sglang.lang.choices import (
+    greedy_token_selection,
+    token_length_normalized,
+    unconditional_likelihood_normalized,
+)
 
 # SGLang DSL APIs
 __all__ = [
@@ -45,6 +50,9 @@ __all__ = [
     "user_begin",
     "user_end",
     "video",
+    "greedy_token_selection",
+    "token_length_normalized",
+    "unconditional_likelihood_normalized",
 ]
 
 # Global Configurations
{sglang-0.2.10 → sglang-0.2.11}/sglang/api.py

@@ -6,6 +6,7 @@ from typing import Callable, List, Optional, Union
 
 from sglang.global_config import global_config
 from sglang.lang.backend.base_backend import BaseBackend
+from sglang.lang.choices import ChoicesSamplingMethod, token_length_normalized
 from sglang.lang.ir import (
     SglExpr,
     SglExprList,
@@ -73,12 +74,18 @@ def gen(
     return_text_in_logprobs: Optional[bool] = None,
     dtype: Optional[type] = None,
     choices: Optional[List[str]] = None,
+    choices_method: Optional[ChoicesSamplingMethod] = None,
     regex: Optional[str] = None,
 ):
     """Call the model to generate. See the meaning of the arguments in docs/en/sampling_params.md"""
 
     if choices:
-        return SglSelect(
+        return SglSelect(
+            name,
+            choices,
+            0.0 if temperature is None else temperature,
+            token_length_normalized if choices_method is None else choices_method,
+        )
 
     # check regex is valid
     if regex is not None:
@@ -186,9 +193,10 @@ def select(
     name: Optional[str] = None,
     choices: Optional[List[str]] = None,
     temperature: float = 0.0,
+    choices_method: ChoicesSamplingMethod = token_length_normalized,
 ):
     assert choices is not None
-    return SglSelect(name, choices, temperature)
+    return SglSelect(name, choices, temperature, choices_method)
 
 
 def _role_common(name: str, expr: Optional[SglExpr] = None):
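For context, the `choices_method` parameter added to `gen()` and `select()` above is backed by the new `sglang/lang/choices.py` module from the file list, which exports `greedy_token_selection`, `token_length_normalized`, and `unconditional_likelihood_normalized`. A minimal sketch of picking one from the frontend DSL follows; the endpoint URL and prompt are placeholders, and omitting `choices_method` keeps the previous token-length-normalized behavior, as the `gen()` change shows.

```python
# Hedged sketch: assumes an SGLang runtime is already serving at localhost:30000.
import sglang as sgl
from sglang import greedy_token_selection


@sgl.function
def yes_no(s, question):
    s += "Question: " + question + "\n"
    s += "Answer: " + sgl.select(
        "answer",
        choices=["Yes", "No", "Maybe"],
        # New in 0.2.11; leave it out to keep the default token_length_normalized scoring.
        choices_method=greedy_token_selection,
    )


sgl.set_default_backend(sgl.RuntimeEndpoint("http://localhost:30000"))
state = yes_no.run(question="Is the sky blue on a clear day?")
print(state["answer"])
```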
{sglang-0.2.10 → sglang-0.2.11}/sglang/bench_latency.py

@@ -1,13 +1,21 @@
 """
 Benchmark the latency of a given model. It accepts arguments similar to those of launch_server.py.
 
-# Usage (latency test)
+# Usage (latency test)
+## with dummy weights:
 python -m sglang.bench_latency --model-path meta-llama/Meta-Llama-3-8B-Instruct --load-format dummy
+## sweep through multiple data points and store (append) the results in a jsonl file:
+python -m sglang.bench_latency --model-path meta-llama/Meta-Llama-3-8B-Instruct --batch 1 12 14 --input-len 256 512 --output-len 32 256 --result-filename out.jsonl
+## do some changes, and store the results under a different run_name:
+python -m sglang.bench_latency --model-path meta-llama/Meta-Llama-3-8B-Instruct --batch 1 12 14 --input-len 256 512 --output-len 32 256 --result-filename out.jsonl --run-name after
+## plot the results in series of lines:
+python -m sglang.bench_latency --result-filename out.jsonl --graph-sql="select run_name, batch_size, prefill_throughput from results"
+
 
 # Usage (correctness test):
 python -m sglang.bench_latency --model-path TinyLlama/TinyLlama-1.1B-Chat-v0.4 --correct
 
-
+## Reference output (of the correctness test above, can be gpu dependent):
 prefill logits (first half) tensor([[-10.0312, -9.5000, 0.8936, ..., -4.9414, -3.2402, -3.3633],
         [-10.0312, -9.5000, 0.8936, ..., -4.9414, -3.2402, -3.3633],
         [ -9.1875, -10.2500, 2.7109, ..., -4.3359, -4.0664, -4.1328]],
@@ -28,19 +36,23 @@ I'm going to the park
 
 import argparse
 import dataclasses
+import itertools
 import logging
 import multiprocessing
+import os
+import sqlite3
 import time
 from typing import Tuple
 
-import jsonlines
 import numpy as np
+import pandas as pd
 import torch
 import torch.distributed as dist
 
 from sglang.srt.hf_transformers_utils import get_tokenizer
-from sglang.srt.managers.schedule_batch import
+from sglang.srt.managers.schedule_batch import Req, ScheduleBatch
 from sglang.srt.model_config import ModelConfig
+from sglang.srt.model_executor.forward_batch_info import ForwardMode
 from sglang.srt.model_executor.model_runner import ModelRunner
 from sglang.srt.sampling_params import SamplingParams
 from sglang.srt.server_args import ServerArgs
@@ -49,26 +61,42 @@ from sglang.srt.utils import suppress_other_loggers
 
 @dataclasses.dataclass
 class BenchArgs:
+    run_name: str = "before"
     batch_size: Tuple[int] = (1,)
-    input_len: int = 1024
-    output_len: int = 4
+    input_len: Tuple[int] = (1024,)
+    output_len: Tuple[int] = (4,)
     result_filename: str = ""
     correctness_test: bool = False
     # This is only used for correctness test
     cut_len: int = 4
+    # Plotting args
+    graph_sql: str = (
+        "select run_name, batch_size, prefill_throughput from results where run_name='before'"
+    )
+    graph_filename: str = "out.png"
 
     @staticmethod
     def add_cli_args(parser: argparse.ArgumentParser):
+        parser.add_argument("--run-name", type=str, default=BenchArgs.run_name)
         parser.add_argument(
             "--batch-size", type=int, nargs="+", default=BenchArgs.batch_size
         )
-        parser.add_argument(
-
+        parser.add_argument(
+            "--input-len", type=int, nargs="+", default=BenchArgs.input_len
+        )
+        parser.add_argument(
+            "--output-len", type=int, nargs="+", default=BenchArgs.output_len
+        )
         parser.add_argument(
             "--result-filename", type=str, default=BenchArgs.result_filename
         )
         parser.add_argument("--correctness-test", action="store_true")
         parser.add_argument("--cut-len", type=int, default=BenchArgs.cut_len)
+        # graphing
+        parser.add_argument("--graph-sql", type=str, default=BenchArgs.graph_sql)
+        parser.add_argument(
+            "--graph-filename", type=str, default=BenchArgs.graph_filename
+        )
 
     @classmethod
     def from_cli_args(cls, args: argparse.Namespace):
|
|
161
189
|
|
162
190
|
|
163
191
|
def extend(reqs, model_runner):
|
164
|
-
batch =
|
192
|
+
batch = ScheduleBatch.init_new(
|
165
193
|
reqs=reqs,
|
166
194
|
req_to_token_pool=model_runner.req_to_token_pool,
|
167
195
|
token_to_kv_pool=model_runner.token_to_kv_pool,
|
@@ -222,15 +250,21 @@ def correctness_test(
|
|
222
250
|
|
223
251
|
@torch.inference_mode()
|
224
252
|
def latency_test_run_once(
|
225
|
-
model_runner, rank_print, reqs, batch_size, input_len, output_len
|
253
|
+
run_name, model_runner, rank_print, reqs, batch_size, input_len, output_len
|
226
254
|
):
|
255
|
+
max_batch_size = model_runner.max_total_num_tokens // (input_len + output_len)
|
256
|
+
if batch_size > max_batch_size:
|
257
|
+
rank_print(
|
258
|
+
f"skipping ({batch_size}, {input_len}, {output_len}) due to max batch size limit"
|
259
|
+
)
|
260
|
+
return
|
227
261
|
|
228
262
|
# Clear the pools.
|
229
263
|
model_runner.req_to_token_pool.clear()
|
230
264
|
model_runner.token_to_kv_pool.clear()
|
231
265
|
|
232
266
|
measurement_results = {
|
233
|
-
"run_name":
|
267
|
+
"run_name": run_name,
|
234
268
|
"batch_size": batch_size,
|
235
269
|
"input_len": input_len,
|
236
270
|
"output_len": output_len,
|
@@ -291,49 +325,119 @@ def latency_test(
 
     # Load the model
     model_runner, tokenizer = load_model(server_args, tp_rank)
-    rank_print(
-        f"max_batch_size={model_runner.max_total_num_tokens // (bench_args.input_len + bench_args.output_len)}"
-    )
 
-    #
-    bench_args.batch_size = bench_args.batch_size[0]
-
-    # Prepare inputs
+    # Prepare inputs for warm up
     reqs = prepare_synthetic_inputs_for_latency_test(
-        bench_args.batch_size, bench_args.input_len
+        bench_args.batch_size[0], bench_args.input_len[0]
     )
 
     # Warm up
     latency_test_run_once(
-
+        bench_args.run_name,
+        model_runner,
+        rank_print,
+        reqs,
+        bench_args.batch_size[0],
+        bench_args.input_len[0],
+        4,  # shorter decoding to speed up the warmup
     )
 
-    # Run
+    # Run the sweep
     result_list = []
-
-
-
-
-
-        bench_args.
-        bench_args.input_len,
-        bench_args.output_len,
+    for bs, il, ol in itertools.product(
+        bench_args.batch_size, bench_args.input_len, bench_args.output_len
+    ):
+        req = prepare_synthetic_inputs_for_latency_test(bs, il)
+        ret = latency_test_run_once(
+            bench_args.run_name, model_runner, rank_print, reqs, bs, il, ol
        )
-
+        if ret is not None:
+            result_list.append(ret)
+
+    # Write results in jsonlines format on rank 0.
+    if tp_rank == 0 and bench_args.result_filename:
+        import jsonlines
 
-    # Write results in jsonlines format.
-    if bench_args.result_filename:
         with jsonlines.open(bench_args.result_filename, "a") as f:
             f.write_all(result_list)
 
 
+def plot_latency_test(
+    server_args,
+    bench_args,
+    tp_rank,
+):
+    assert tp_rank == 0
+
+    # read the jsonl file and put in sqlite
+    df = pd.read_json(bench_args.result_filename, lines=True)
+    conn = sqlite3.connect(":memory:")
+    cur = conn.cursor()
+
+    # get the columns and their types
+    column_names = list(df.iloc[0].keys())
+    type_dict = {
+        str: "TEXT",
+        np.int64: "INTEGER",
+        np.float64: "FLOAT",
+    }
+    column_types = [type_dict[type(i)] for i in list(df.iloc[0])]
+
+    # create the table
+    cur.execute(
+        f"""
+        CREATE TABLE IF NOT EXISTS results (
+            {", ".join([f"{name} {type}" for name, type in zip(column_names, column_types)])}
+        )
+        """
+    )
+    conn.commit()
+
+    # write the results to DB
+    df.to_sql("results", conn, if_exists="replace", index=False)
+    conn.commit()
+
+    # read it back using sql
+    df = pd.read_sql_query(bench_args.graph_sql, conn)
+    conn.close()
+
+    # plot it and save to a file
+    import matplotlib.pyplot as plt
+
+    assert (
+        len(df.columns) == 3
+    ), f"The sql should have fetched <series, x, y> columns, not {df.columns}"
+    for label in df[df.columns[0]].unique():
+        q = f"{df.columns[0]}=='{label}'"
+        series = df.query(q)
+        plt.plot(series[df.columns[1]], series[df.columns[2]], label=q, marker="o")
+    plt.xlabel(df.columns[1])
+    plt.ylabel(df.columns[2])
+    plt.legend()
+    plt.savefig(bench_args.graph_filename, dpi=300)
+
+    # if in kitty, just dump it to the terminal
+    if os.environ["TERM"] == "xterm-kitty":
+        os.system(
+            f"kitty icat --use-window-size 1,1,600,600 {bench_args.graph_filename}"
+        )
+
+
 def main(server_args, bench_args):
-    print(bench_args)
 
-    if
-
+    if server_args.model_path:
+        if bench_args.correctness_test:
+            work_func = correctness_test
+        else:
+            work_func = latency_test
+    elif os.path.isfile(bench_args.result_filename):
+        assert bench_args.graph_filename, "please provide a filename for the graph"
+        work_func = plot_latency_test
     else:
-
+        raise ValueError(
+            "Provide --model-path for running the tests or "
+            "provide --result-filename for plotting the results"
+        )
 
     if server_args.tp_size == 1:
         work_func(server_args, bench_args, 0)
@@ -361,6 +465,11 @@ if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     ServerArgs.add_cli_args(parser)
     BenchArgs.add_cli_args(parser)
+    # For this script, model-path is not required
+    assert (
+        parser._actions[1].option_strings[0] == "--model-path"
+    ), "options changed, this code need to be updated"
+    parser._actions[1].required = False
     args = parser.parse_args()
 
     server_args = ServerArgs.from_cli_args(args)
{sglang-0.2.10 → sglang-0.2.11}/sglang/check_env.py

@@ -14,6 +14,7 @@ PACKAGE_LIST = [
     "sglang",
     "flashinfer",
     "triton",
+    "transformers",
     "requests",
     "tqdm",
     "numpy",
@@ -73,10 +74,26 @@ def _get_gpu_info():
     Get information about available GPUs.
     """
     devices = defaultdict(list)
+    capabilities = defaultdict(list)
     for k in range(torch.cuda.device_count()):
         devices[torch.cuda.get_device_name(k)].append(str(k))
+        capability = torch.cuda.get_device_capability(k)
+        capabilities[f"{capability[0]}.{capability[1]}"].append(str(k))
 
-
+    gpu_info = {}
+    for name, device_ids in devices.items():
+        gpu_info[f"GPU {','.join(device_ids)}"] = name
+
+    if len(capabilities) == 1:
+        # All GPUs have the same compute capability
+        cap, gpu_ids = list(capabilities.items())[0]
+        gpu_info[f"GPU {','.join(gpu_ids)} Compute Capability"] = cap
+    else:
+        # GPUs have different compute capabilities
+        for cap, gpu_ids in capabilities.items():
+            gpu_info[f"GPU {','.join(gpu_ids)} Compute Capability"] = cap
+
+    return gpu_info
 
 
 def _get_cuda_version_info():
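For reference, the reworked helper above now groups GPUs by name and adds their compute capability as extra entries. The sketch below only illustrates the rough shape of the returned dict on a hypothetical machine with two identical GPUs; the device name and capability value are made up.

```python
# Hedged illustration: _get_gpu_info is an internal helper of sglang.check_env,
# imported here only to make the new output shape concrete.
from sglang.check_env import _get_gpu_info

print(_get_gpu_info())
# Possible output on a box with two identical GPUs (values are hypothetical):
# {'GPU 0,1': 'NVIDIA A100-SXM4-80GB', 'GPU 0,1 Compute Capability': '8.0'}
```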
@@ -118,6 +135,7 @@ def _get_cuda_driver_version():
     """
     Get CUDA driver version.
     """
+    versions = set()
     try:
         output = subprocess.check_output(
             [
@@ -126,7 +144,11 @@ def _get_cuda_driver_version():
                 "--format=csv,noheader,nounits",
             ]
         )
-
+        versions = set(output.decode().strip().split("\n"))
+        if len(versions) == 1:
+            return {"CUDA Driver Version": versions.pop()}
+        else:
+            return {"CUDA Driver Versions": ", ".join(sorted(versions))}
     except subprocess.SubprocessError:
         return {"CUDA Driver Version": "Not Available"}
 
{sglang-0.2.10 → sglang-0.2.11}/sglang/global_config.py

@@ -19,7 +19,6 @@ class GlobalConfig:
         self.init_new_token_ratio = 0.7
         self.base_min_new_token_ratio = 0.1
         self.new_token_ratio_decay = 0.001
-        self.new_token_ratio_recovery = 0.05
 
         # Runtime constants: The threshold (number of tokens) to trigger layer-wise cuda sync.
         # This can improve the speed for large batch sizes during prefill.