sglang 0.2.9.post1__tar.gz → 0.2.11__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sglang-0.2.9.post1/sglang.egg-info → sglang-0.2.11}/PKG-INFO +37 -26
- {sglang-0.2.9.post1 → sglang-0.2.11}/README.md +27 -23
- {sglang-0.2.9.post1 → sglang-0.2.11}/pyproject.toml +7 -3
- {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/__init__.py +8 -0
- {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/api.py +10 -2
- sglang-0.2.11/sglang/bench_latency.py +483 -0
- {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/check_env.py +25 -2
- {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/global_config.py +0 -1
- {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/lang/backend/base_backend.py +3 -1
- {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/lang/backend/openai.py +8 -3
- {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/lang/backend/runtime_endpoint.py +46 -40
- sglang-0.2.11/sglang/lang/choices.py +164 -0
- {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/lang/interpreter.py +6 -13
- {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/lang/ir.py +11 -2
- {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/hf_transformers_utils.py +2 -2
- {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/layers/extend_attention.py +59 -7
- {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/layers/logits_processor.py +1 -1
- {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/layers/radix_attention.py +24 -14
- {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/layers/token_attention.py +28 -2
- {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/managers/io_struct.py +9 -4
- {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/managers/schedule_batch.py +98 -323
- {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/managers/tokenizer_manager.py +34 -16
- {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/managers/tp_worker.py +20 -22
- {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/mem_cache/memory_pool.py +74 -38
- {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/model_config.py +11 -0
- {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/model_executor/cuda_graph_runner.py +3 -3
- sglang-0.2.11/sglang/srt/model_executor/forward_batch_info.py +256 -0
- {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/model_executor/model_runner.py +51 -26
- {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/models/chatglm.py +1 -1
- {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/models/commandr.py +1 -1
- {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/models/dbrx.py +1 -1
- {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/models/deepseek.py +1 -1
- {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/models/deepseek_v2.py +199 -17
- {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/models/gemma.py +1 -1
- {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/models/gemma2.py +1 -1
- {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/models/gpt_bigcode.py +1 -1
- {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/models/grok.py +1 -1
- {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/models/internlm2.py +1 -1
- {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/models/llama2.py +1 -1
- {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/models/llama_classification.py +1 -1
- {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/models/llava.py +1 -2
- {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/models/llavavid.py +1 -2
- {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/models/minicpm.py +1 -1
- {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/models/mixtral.py +1 -1
- {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/models/mixtral_quant.py +1 -1
- {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/models/qwen.py +1 -1
- {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/models/qwen2.py +1 -1
- {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/models/qwen2_moe.py +1 -1
- {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/models/stablelm.py +1 -1
- {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/openai_api/adapter.py +151 -29
- {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/openai_api/protocol.py +7 -1
- {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/server.py +111 -84
- {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/server_args.py +12 -2
- {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/utils.py +25 -20
- {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/test/run_eval.py +21 -10
- sglang-0.2.11/sglang/test/runners.py +237 -0
- {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/test/simple_eval_common.py +12 -12
- sglang-0.2.11/sglang/test/simple_eval_gpqa.py +92 -0
- {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/test/simple_eval_humaneval.py +5 -5
- sglang-0.2.11/sglang/test/simple_eval_math.py +72 -0
- {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/test/test_utils.py +95 -14
- {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/utils.py +15 -37
- sglang-0.2.11/sglang/version.py +1 -0
- {sglang-0.2.9.post1 → sglang-0.2.11/sglang.egg-info}/PKG-INFO +37 -26
- {sglang-0.2.9.post1 → sglang-0.2.11}/sglang.egg-info/SOURCES.txt +5 -0
- {sglang-0.2.9.post1 → sglang-0.2.11}/sglang.egg-info/requires.txt +11 -2
- sglang-0.2.9.post1/sglang/bench_latency.py +0 -323
- sglang-0.2.9.post1/sglang/version.py +0 -1
- {sglang-0.2.9.post1 → sglang-0.2.11}/LICENSE +0 -0
- {sglang-0.2.9.post1 → sglang-0.2.11}/setup.cfg +0 -0
- {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/bench_serving.py +0 -0
- {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/lang/__init__.py +0 -0
- {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/lang/backend/__init__.py +0 -0
- {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/lang/backend/anthropic.py +0 -0
- {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/lang/backend/litellm.py +0 -0
- {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/lang/backend/vertexai.py +0 -0
- {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/lang/chat_template.py +0 -0
- {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/lang/compiler.py +0 -0
- {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/lang/tracer.py +0 -0
- {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/launch_server.py +0 -0
- {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/launch_server_llavavid.py +0 -0
- {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/constrained/__init__.py +0 -0
- {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/constrained/base_tool_cache.py +0 -0
- {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/constrained/fsm_cache.py +0 -0
- {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/constrained/jump_forward.py +0 -0
- {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/conversation.py +0 -0
- {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/layers/context_flashattention_nopad.py +0 -0
- {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/layers/fused_moe.py +0 -0
- {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/layers/linear.py +0 -0
- {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/layers/quantization/__init__.py +0 -0
- {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/layers/quantization/fp8.py +0 -0
- {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/managers/controller_multi.py +0 -0
- {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/managers/controller_single.py +0 -0
- {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/managers/detokenizer_manager.py +0 -0
- {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/managers/policy_scheduler.py +0 -0
- {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/mem_cache/base_cache.py +0 -0
- {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/mem_cache/chunk_cache.py +0 -0
- {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/mem_cache/flush_cache.py +0 -0
- {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/mem_cache/radix_cache.py +0 -0
- {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/mm_utils.py +0 -0
- {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/model_loader/model_loader.py +0 -0
- {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/model_loader/utils.py +0 -0
- {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/models/mistral.py +0 -0
- {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/models/yivl.py +0 -0
- {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/sampling_params.py +0 -0
- {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/test/simple_eval_mmlu.py +0 -0
- {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/test/test_programs.py +0 -0
- {sglang-0.2.9.post1 → sglang-0.2.11}/sglang.egg-info/dependency_links.txt +0 -0
- {sglang-0.2.9.post1 → sglang-0.2.11}/sglang.egg-info/top_level.txt +0 -0
{sglang-0.2.9.post1/sglang.egg-info → sglang-0.2.11}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.2.9.post1
+Version: 0.2.11
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
     Version 2.0, January 2004
@@ -224,13 +224,13 @@ Requires-Dist: packaging; extra == "srt"
 Requires-Dist: pillow; extra == "srt"
 Requires-Dist: psutil; extra == "srt"
 Requires-Dist: pydantic; extra == "srt"
+Requires-Dist: python-multipart; extra == "srt"
 Requires-Dist: torch; extra == "srt"
 Requires-Dist: uvicorn; extra == "srt"
 Requires-Dist: uvloop; extra == "srt"
 Requires-Dist: zmq; extra == "srt"
-Requires-Dist: vllm==0.5.
+Requires-Dist: vllm==0.5.4; extra == "srt"
 Requires-Dist: outlines>=0.0.44; extra == "srt"
-Requires-Dist: python-multipart; extra == "srt"
 Provides-Extra: openai
 Requires-Dist: openai>=1.0; extra == "openai"
 Requires-Dist: tiktoken; extra == "openai"
@@ -238,11 +238,18 @@ Provides-Extra: anthropic
 Requires-Dist: anthropic>=0.20.0; extra == "anthropic"
 Provides-Extra: litellm
 Requires-Dist: litellm>=1.0.0; extra == "litellm"
+Provides-Extra: test
+Requires-Dist: jsonlines; extra == "test"
+Requires-Dist: matplotlib; extra == "test"
+Requires-Dist: pandas; extra == "test"
 Provides-Extra: all
 Requires-Dist: sglang[srt]; extra == "all"
 Requires-Dist: sglang[openai]; extra == "all"
 Requires-Dist: sglang[anthropic]; extra == "all"
 Requires-Dist: sglang[litellm]; extra == "all"
+Provides-Extra: dev
+Requires-Dist: sglang[all]; extra == "dev"
+Requires-Dist: sglang[test]; extra == "dev"
 
 <div align="center">
 <img src="https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png" alt="logo" width="400"></img>
@@ -295,20 +302,20 @@ pip install --upgrade pip
 pip install "sglang[all]"
 
 # Install FlashInfer CUDA kernels
-pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.
+pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
 ```
 
 ### Method 2: From source
 ```
-# Use the
-git clone -b v0.2.
+# Use the last release branch
+git clone -b v0.2.11 https://github.com/sgl-project/sglang.git
 cd sglang
 
 pip install --upgrade pip
 pip install -e "python[all]"
 
 # Install FlashInfer CUDA kernels
-pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.
+pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
 ```
 
 ### Method 3: Using docker
@@ -382,7 +389,7 @@ response = client.chat.completions.create(
 print(response)
 ```
 
-It supports streaming, vision, and most features of the Chat/Completions/Models endpoints specified by the [OpenAI API Reference](https://platform.openai.com/docs/api-reference/).
+It supports streaming, vision, and most features of the Chat/Completions/Models/Batch endpoints specified by the [OpenAI API Reference](https://platform.openai.com/docs/api-reference/).
 
 ### Additional Server Arguments
 - Add `--tp 2` to enable tensor parallelism. If it indicates `peer access is not supported between these two devices`, add `--enable-p2p-check` option.
@@ -393,10 +400,14 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 ```
 python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --dp 2 --tp 2
 ```
-- If you see out-of-memory errors during serving, please try to reduce the memory usage of the KV cache pool by setting a smaller value of `--mem-fraction-static`. The default value is `0.9
+- If you see out-of-memory errors during serving, please try to reduce the memory usage of the KV cache pool by setting a smaller value of `--mem-fraction-static`. The default value is `0.9`.
 ```
 python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --mem-fraction-static 0.7
 ```
+- If you see out-of-memory errors during prefill for long prompts on a model that supports long context, consider using chunked prefill.
+```
+python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --port 30000 --chunked-prefill-size 8192
+```
 - See [hyperparameter_tuning.md](docs/en/hyperparameter_tuning.md) on tuning hyperparameters for better performance.
 - Add `--nnodes 2` to run tensor parallelism on multiple nodes. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-0` be the hostname of the first node and `50000` be an available port.
 ```
@@ -410,22 +421,6 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 - To enable fp8 quantization, you can add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
 - To enable experimental torch.compile support, you can add `--enable-torch-compile`. It accelerates small models on small batch sizes.
 
-### Run Llama 3.1 405B
-
-```bash
-## Run 405B (fp8) on a single node
-python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct-FP8 --tp 8
-
-## Run 405B (fp16) on two nodes
-# replace the `172.16.4.52:20000` with your own first node ip address and port, disable CUDA Graph temporarily
-
-# on the first node
-GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 0 --disable-cuda-graph --mem-frac 0.75
-
-# on the second
-GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 1 --disable-cuda-graph --mem-frac 0.75
-```
-
 ### Supported Models
 
 - Llama / Llama 2 / Llama 3 / Llama 3.1
@@ -451,9 +446,25 @@ GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/
 
 Instructions for supporting a new model are [here](https://github.com/sgl-project/sglang/blob/main/docs/en/model_support.md).
 
+### Run Llama 3.1 405B
+
+```bash
+## Run 405B (fp8) on a single node
+python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct-FP8 --tp 8
+
+## Run 405B (fp16) on two nodes
+# replace the `172.16.4.52:20000` with your own first node ip address and port, disable CUDA Graph temporarily
+
+# on the first node
+GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 0 --disable-cuda-graph --mem-frac 0.75
+
+# on the second
+GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 1 --disable-cuda-graph --mem-frac 0.75
+```
+
 ### Benchmark Performance
 
-- Benchmark a single static batch by running the following command without launching a server. The arguments are the same as
+- Benchmark a single static batch by running the following command without launching a server. The arguments are the same as for `launch_server.py`. Note that this is not a dynamic batching server, so it may run out of memory for a batch size that a real server can handle. A real server truncates the prefill into several batches, while this unit test does not. For accurate large batch testing, consider using `sglang.bench_serving`.
 ```
 python -m sglang.bench_latency --model-path meta-llama/Meta-Llama-3-8B-Instruct --batch 32 --input-len 256 --output-len 32
 ```
{sglang-0.2.9.post1 → sglang-0.2.11}/README.md

@@ -49,20 +49,20 @@ pip install --upgrade pip
 pip install "sglang[all]"
 
 # Install FlashInfer CUDA kernels
-pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.
+pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
 ```
 
 ### Method 2: From source
 ```
-# Use the
-git clone -b v0.2.
+# Use the last release branch
+git clone -b v0.2.11 https://github.com/sgl-project/sglang.git
 cd sglang
 
 pip install --upgrade pip
 pip install -e "python[all]"
 
 # Install FlashInfer CUDA kernels
-pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.
+pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
 ```
 
 ### Method 3: Using docker
@@ -136,7 +136,7 @@ response = client.chat.completions.create(
 print(response)
 ```
 
-It supports streaming, vision, and most features of the Chat/Completions/Models endpoints specified by the [OpenAI API Reference](https://platform.openai.com/docs/api-reference/).
+It supports streaming, vision, and most features of the Chat/Completions/Models/Batch endpoints specified by the [OpenAI API Reference](https://platform.openai.com/docs/api-reference/).
 
 ### Additional Server Arguments
 - Add `--tp 2` to enable tensor parallelism. If it indicates `peer access is not supported between these two devices`, add `--enable-p2p-check` option.
@@ -147,10 +147,14 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 ```
 python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --dp 2 --tp 2
 ```
-- If you see out-of-memory errors during serving, please try to reduce the memory usage of the KV cache pool by setting a smaller value of `--mem-fraction-static`. The default value is `0.9
+- If you see out-of-memory errors during serving, please try to reduce the memory usage of the KV cache pool by setting a smaller value of `--mem-fraction-static`. The default value is `0.9`.
 ```
 python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --mem-fraction-static 0.7
 ```
+- If you see out-of-memory errors during prefill for long prompts on a model that supports long context, consider using chunked prefill.
+```
+python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --port 30000 --chunked-prefill-size 8192
+```
 - See [hyperparameter_tuning.md](docs/en/hyperparameter_tuning.md) on tuning hyperparameters for better performance.
 - Add `--nnodes 2` to run tensor parallelism on multiple nodes. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-0` be the hostname of the first node and `50000` be an available port.
 ```
@@ -164,22 +168,6 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 - To enable fp8 quantization, you can add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
 - To enable experimental torch.compile support, you can add `--enable-torch-compile`. It accelerates small models on small batch sizes.
 
-### Run Llama 3.1 405B
-
-```bash
-## Run 405B (fp8) on a single node
-python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct-FP8 --tp 8
-
-## Run 405B (fp16) on two nodes
-# replace the `172.16.4.52:20000` with your own first node ip address and port, disable CUDA Graph temporarily
-
-# on the first node
-GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 0 --disable-cuda-graph --mem-frac 0.75
-
-# on the second
-GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 1 --disable-cuda-graph --mem-frac 0.75
-```
-
 ### Supported Models
 
 - Llama / Llama 2 / Llama 3 / Llama 3.1
@@ -205,9 +193,25 @@ GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/
 
 Instructions for supporting a new model are [here](https://github.com/sgl-project/sglang/blob/main/docs/en/model_support.md).
 
+### Run Llama 3.1 405B
+
+```bash
+## Run 405B (fp8) on a single node
+python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct-FP8 --tp 8
+
+## Run 405B (fp16) on two nodes
+# replace the `172.16.4.52:20000` with your own first node ip address and port, disable CUDA Graph temporarily
+
+# on the first node
+GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 0 --disable-cuda-graph --mem-frac 0.75
+
+# on the second
+GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 1 --disable-cuda-graph --mem-frac 0.75
+```
+
 ### Benchmark Performance
 
-- Benchmark a single static batch by running the following command without launching a server. The arguments are the same as
+- Benchmark a single static batch by running the following command without launching a server. The arguments are the same as for `launch_server.py`. Note that this is not a dynamic batching server, so it may run out of memory for a batch size that a real server can handle. A real server truncates the prefill into several batches, while this unit test does not. For accurate large batch testing, consider using `sglang.bench_serving`.
 ```
 python -m sglang.bench_latency --model-path meta-llama/Meta-Llama-3-8B-Instruct --batch 32 --input-len 256 --output-len 32
 ```
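For context on the OpenAI-compatible endpoints touched in the README diff above, a minimal client sketch looks roughly like this (the base URL, `api_key`, prompt, and model name `default` are illustrative assumptions, not taken from this diff):

```python
# Minimal sketch: querying a locally launched SGLang server through the
# official OpenAI Python client. Assumes a server started with
# `python -m sglang.launch_server ... --port 30000` as in the README above.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:30000/v1", api_key="EMPTY")

response = client.chat.completions.create(
    model="default",  # assumption: the served model is addressed as "default"
    messages=[{"role": "user", "content": "List three US cities."}],
    temperature=0.0,
    max_tokens=64,
)
print(response.choices[0].message.content)
```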
{sglang-0.2.9.post1 → sglang-0.2.11}/pyproject.toml

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "sglang"
-version = "0.2.9.post1"
+version = "0.2.11"
 description = "SGLang is yet another fast serving framework for large language models and vision language models."
 readme = "README.md"
 requires-python = ">=3.8"
@@ -20,12 +20,16 @@ dependencies = [
 ]
 
 [project.optional-dependencies]
-srt = ["aiohttp", "fastapi", "hf_transfer", "huggingface_hub", "interegular",
-       "
+srt = ["aiohttp", "fastapi", "hf_transfer", "huggingface_hub", "interegular",
+       "packaging", "pillow", "psutil", "pydantic", "python-multipart",
+       "torch", "uvicorn", "uvloop", "zmq",
+       "vllm==0.5.4", "outlines>=0.0.44"]
 openai = ["openai>=1.0", "tiktoken"]
 anthropic = ["anthropic>=0.20.0"]
 litellm = ["litellm>=1.0.0"]
+test = ["jsonlines", "matplotlib", "pandas"]
 all = ["sglang[srt]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
+dev = ["sglang[all]", "sglang[test]"]
 
 [project.urls]
 "Homepage" = "https://github.com/sgl-project/sglang"
{sglang-0.2.9.post1 → sglang-0.2.11}/sglang/__init__.py

@@ -22,6 +22,11 @@ from sglang.api import (
     user_end,
     video,
 )
+from sglang.lang.choices import (
+    greedy_token_selection,
+    token_length_normalized,
+    unconditional_likelihood_normalized,
+)
 
 # SGLang DSL APIs
 __all__ = [
@@ -45,6 +50,9 @@ __all__ = [
     "user_begin",
     "user_end",
     "video",
+    "greedy_token_selection",
+    "token_length_normalized",
+    "unconditional_likelihood_normalized",
 ]
 
 # Global Configurations
{sglang-0.2.9.post1 → sglang-0.2.11}/sglang/api.py

@@ -6,6 +6,7 @@ from typing import Callable, List, Optional, Union
 
 from sglang.global_config import global_config
 from sglang.lang.backend.base_backend import BaseBackend
+from sglang.lang.choices import ChoicesSamplingMethod, token_length_normalized
 from sglang.lang.ir import (
     SglExpr,
     SglExprList,
@@ -73,12 +74,18 @@ def gen(
     return_text_in_logprobs: Optional[bool] = None,
     dtype: Optional[type] = None,
     choices: Optional[List[str]] = None,
+    choices_method: Optional[ChoicesSamplingMethod] = None,
     regex: Optional[str] = None,
 ):
     """Call the model to generate. See the meaning of the arguments in docs/en/sampling_params.md"""
 
     if choices:
-        return SglSelect(
+        return SglSelect(
+            name,
+            choices,
+            0.0 if temperature is None else temperature,
+            token_length_normalized if choices_method is None else choices_method,
+        )
 
     # check regex is valid
     if regex is not None:
@@ -186,9 +193,10 @@ def select(
     name: Optional[str] = None,
     choices: Optional[List[str]] = None,
     temperature: float = 0.0,
+    choices_method: ChoicesSamplingMethod = token_length_normalized,
 ):
     assert choices is not None
-    return SglSelect(name, choices, temperature)
+    return SglSelect(name, choices, temperature, choices_method)
 
 
 def _role_common(name: str, expr: Optional[SglExpr] = None):
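The new `choices_method` argument added to `gen()` and `select()` above, together with the methods exported from the new `sglang/lang/choices.py` module, can be exercised along these lines (a minimal sketch; the example program and the runtime endpoint URL are illustrative assumptions, not taken from this diff):

```python
# Minimal sketch of the choices-sampling API introduced in 0.2.11.
# Assumes a running SGLang server at localhost:30000 (see the README diff above).
import sglang as sgl

# greedy_token_selection, token_length_normalized, and
# unconditional_likelihood_normalized are the newly exported sampling methods.
from sglang import RuntimeEndpoint, greedy_token_selection


@sgl.function
def yes_no(s, question):
    s += "Question: " + question + "\n"
    s += "Answer: " + sgl.select(
        "answer",
        choices=["yes", "no"],
        # New in 0.2.11: choose how the candidate choices are scored;
        # token_length_normalized remains the default when omitted.
        choices_method=greedy_token_selection,
    )


sgl.set_default_backend(RuntimeEndpoint("http://localhost:30000"))
state = yes_no.run(question="Is SGLang an LLM serving framework?")
print(state["answer"])
```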