sglang 0.2.10__tar.gz → 0.2.12__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sglang-0.2.10/sglang.egg-info → sglang-0.2.12}/PKG-INFO +50 -31
- {sglang-0.2.10 → sglang-0.2.12}/README.md +41 -28
- {sglang-0.2.10 → sglang-0.2.12}/pyproject.toml +5 -3
- {sglang-0.2.10 → sglang-0.2.12}/sglang/__init__.py +8 -0
- {sglang-0.2.10 → sglang-0.2.12}/sglang/api.py +10 -2
- {sglang-0.2.10 → sglang-0.2.12}/sglang/bench_latency.py +151 -40
- {sglang-0.2.10 → sglang-0.2.12}/sglang/bench_serving.py +46 -22
- {sglang-0.2.10 → sglang-0.2.12}/sglang/check_env.py +24 -2
- {sglang-0.2.10 → sglang-0.2.12}/sglang/global_config.py +0 -1
- {sglang-0.2.10 → sglang-0.2.12}/sglang/lang/backend/base_backend.py +3 -1
- {sglang-0.2.10 → sglang-0.2.12}/sglang/lang/backend/openai.py +8 -3
- {sglang-0.2.10 → sglang-0.2.12}/sglang/lang/backend/runtime_endpoint.py +46 -29
- sglang-0.2.12/sglang/lang/choices.py +164 -0
- {sglang-0.2.10 → sglang-0.2.12}/sglang/lang/compiler.py +2 -2
- {sglang-0.2.10 → sglang-0.2.12}/sglang/lang/interpreter.py +6 -13
- {sglang-0.2.10 → sglang-0.2.12}/sglang/lang/ir.py +14 -5
- {sglang-0.2.10 → sglang-0.2.12}/sglang/srt/constrained/base_tool_cache.py +1 -1
- {sglang-0.2.10 → sglang-0.2.12}/sglang/srt/constrained/fsm_cache.py +12 -2
- sglang-0.2.12/sglang/srt/layers/activation.py +33 -0
- sglang-0.2.10/sglang/srt/layers/token_attention.py → sglang-0.2.12/sglang/srt/layers/decode_attention.py +9 -5
- {sglang-0.2.10 → sglang-0.2.12}/sglang/srt/layers/extend_attention.py +6 -1
- sglang-0.2.12/sglang/srt/layers/layernorm.py +65 -0
- {sglang-0.2.10 → sglang-0.2.12}/sglang/srt/layers/logits_processor.py +6 -1
- sglang-0.2.12/sglang/srt/layers/pooler.py +50 -0
- sglang-0.2.10/sglang/srt/layers/context_flashattention_nopad.py → sglang-0.2.12/sglang/srt/layers/prefill_attention.py +5 -0
- {sglang-0.2.10 → sglang-0.2.12}/sglang/srt/layers/radix_attention.py +4 -7
- {sglang-0.2.10 → sglang-0.2.12}/sglang/srt/managers/detokenizer_manager.py +31 -9
- {sglang-0.2.10 → sglang-0.2.12}/sglang/srt/managers/io_struct.py +63 -0
- sglang-0.2.12/sglang/srt/managers/policy_scheduler.py +233 -0
- {sglang-0.2.10 → sglang-0.2.12}/sglang/srt/managers/schedule_batch.py +174 -380
- {sglang-0.2.10 → sglang-0.2.12}/sglang/srt/managers/tokenizer_manager.py +197 -112
- {sglang-0.2.10 → sglang-0.2.12}/sglang/srt/managers/tp_worker.py +299 -364
- sglang-0.2.10/sglang/srt/mem_cache/base_cache.py → sglang-0.2.12/sglang/srt/mem_cache/base_prefix_cache.py +9 -4
- sglang-0.2.12/sglang/srt/mem_cache/chunk_cache.py +83 -0
- {sglang-0.2.10 → sglang-0.2.12}/sglang/srt/mem_cache/memory_pool.py +10 -15
- {sglang-0.2.10 → sglang-0.2.12}/sglang/srt/mem_cache/radix_cache.py +74 -40
- {sglang-0.2.10 → sglang-0.2.12}/sglang/srt/model_executor/cuda_graph_runner.py +27 -12
- sglang-0.2.12/sglang/srt/model_executor/forward_batch_info.py +319 -0
- {sglang-0.2.10 → sglang-0.2.12}/sglang/srt/model_executor/model_runner.py +30 -47
- {sglang-0.2.10 → sglang-0.2.12}/sglang/srt/models/chatglm.py +1 -1
- {sglang-0.2.10 → sglang-0.2.12}/sglang/srt/models/commandr.py +1 -1
- {sglang-0.2.10 → sglang-0.2.12}/sglang/srt/models/dbrx.py +1 -1
- {sglang-0.2.10 → sglang-0.2.12}/sglang/srt/models/deepseek.py +1 -1
- {sglang-0.2.10 → sglang-0.2.12}/sglang/srt/models/deepseek_v2.py +1 -1
- {sglang-0.2.10 → sglang-0.2.12}/sglang/srt/models/gemma.py +1 -1
- {sglang-0.2.10 → sglang-0.2.12}/sglang/srt/models/gemma2.py +1 -2
- {sglang-0.2.10 → sglang-0.2.12}/sglang/srt/models/gpt_bigcode.py +1 -1
- {sglang-0.2.10 → sglang-0.2.12}/sglang/srt/models/grok.py +1 -1
- {sglang-0.2.10 → sglang-0.2.12}/sglang/srt/models/internlm2.py +3 -8
- {sglang-0.2.10 → sglang-0.2.12}/sglang/srt/models/llama2.py +5 -5
- {sglang-0.2.10 → sglang-0.2.12}/sglang/srt/models/llama_classification.py +1 -1
- sglang-0.2.12/sglang/srt/models/llama_embedding.py +88 -0
- {sglang-0.2.10 → sglang-0.2.12}/sglang/srt/models/llava.py +1 -2
- {sglang-0.2.10 → sglang-0.2.12}/sglang/srt/models/llavavid.py +1 -2
- {sglang-0.2.10 → sglang-0.2.12}/sglang/srt/models/minicpm.py +1 -1
- {sglang-0.2.10 → sglang-0.2.12}/sglang/srt/models/mixtral.py +1 -1
- {sglang-0.2.10 → sglang-0.2.12}/sglang/srt/models/mixtral_quant.py +1 -1
- {sglang-0.2.10 → sglang-0.2.12}/sglang/srt/models/qwen.py +1 -1
- {sglang-0.2.10 → sglang-0.2.12}/sglang/srt/models/qwen2.py +1 -1
- {sglang-0.2.10 → sglang-0.2.12}/sglang/srt/models/qwen2_moe.py +1 -12
- {sglang-0.2.10 → sglang-0.2.12}/sglang/srt/models/stablelm.py +1 -1
- {sglang-0.2.10 → sglang-0.2.12}/sglang/srt/openai_api/adapter.py +189 -39
- {sglang-0.2.10 → sglang-0.2.12}/sglang/srt/openai_api/protocol.py +43 -1
- sglang-0.2.12/sglang/srt/sampling/penaltylib/__init__.py +13 -0
- sglang-0.2.12/sglang/srt/sampling/penaltylib/orchestrator.py +357 -0
- sglang-0.2.12/sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +80 -0
- sglang-0.2.12/sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +105 -0
- sglang-0.2.12/sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +79 -0
- sglang-0.2.12/sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +83 -0
- {sglang-0.2.10 → sglang-0.2.12}/sglang/srt/sampling_params.py +31 -4
- {sglang-0.2.10 → sglang-0.2.12}/sglang/srt/server.py +93 -21
- {sglang-0.2.10 → sglang-0.2.12}/sglang/srt/server_args.py +30 -19
- {sglang-0.2.10 → sglang-0.2.12}/sglang/srt/utils.py +31 -13
- {sglang-0.2.10 → sglang-0.2.12}/sglang/test/run_eval.py +10 -1
- {sglang-0.2.10 → sglang-0.2.12}/sglang/test/runners.py +63 -63
- {sglang-0.2.10 → sglang-0.2.12}/sglang/test/simple_eval_humaneval.py +2 -8
- sglang-0.2.12/sglang/test/simple_eval_mgsm.py +203 -0
- sglang-0.2.12/sglang/test/srt/sampling/penaltylib/utils.py +337 -0
- sglang-0.2.12/sglang/test/test_layernorm.py +60 -0
- {sglang-0.2.10 → sglang-0.2.12}/sglang/test/test_programs.py +4 -2
- {sglang-0.2.10 → sglang-0.2.12}/sglang/test/test_utils.py +21 -3
- {sglang-0.2.10 → sglang-0.2.12}/sglang/utils.py +0 -1
- sglang-0.2.12/sglang/version.py +1 -0
- {sglang-0.2.10 → sglang-0.2.12/sglang.egg-info}/PKG-INFO +50 -31
- {sglang-0.2.10 → sglang-0.2.12}/sglang.egg-info/SOURCES.txt +19 -7
- {sglang-0.2.10 → sglang-0.2.12}/sglang.egg-info/requires.txt +10 -2
- sglang-0.2.10/sglang/srt/layers/linear.py +0 -884
- sglang-0.2.10/sglang/srt/layers/quantization/__init__.py +0 -64
- sglang-0.2.10/sglang/srt/layers/quantization/fp8.py +0 -677
- sglang-0.2.10/sglang/srt/managers/policy_scheduler.py +0 -85
- sglang-0.2.10/sglang/srt/mem_cache/chunk_cache.py +0 -60
- sglang-0.2.10/sglang/version.py +0 -1
- {sglang-0.2.10 → sglang-0.2.12}/LICENSE +0 -0
- {sglang-0.2.10 → sglang-0.2.12}/setup.cfg +0 -0
- {sglang-0.2.10 → sglang-0.2.12}/sglang/lang/__init__.py +0 -0
- {sglang-0.2.10 → sglang-0.2.12}/sglang/lang/backend/__init__.py +0 -0
- {sglang-0.2.10 → sglang-0.2.12}/sglang/lang/backend/anthropic.py +0 -0
- {sglang-0.2.10 → sglang-0.2.12}/sglang/lang/backend/litellm.py +0 -0
- {sglang-0.2.10 → sglang-0.2.12}/sglang/lang/backend/vertexai.py +0 -0
- {sglang-0.2.10 → sglang-0.2.12}/sglang/lang/chat_template.py +0 -0
- {sglang-0.2.10 → sglang-0.2.12}/sglang/lang/tracer.py +0 -0
- {sglang-0.2.10 → sglang-0.2.12}/sglang/launch_server.py +0 -0
- {sglang-0.2.10 → sglang-0.2.12}/sglang/launch_server_llavavid.py +0 -0
- {sglang-0.2.10 → sglang-0.2.12}/sglang/srt/constrained/__init__.py +0 -0
- {sglang-0.2.10 → sglang-0.2.12}/sglang/srt/constrained/jump_forward.py +0 -0
- {sglang-0.2.10 → sglang-0.2.12}/sglang/srt/conversation.py +0 -0
- {sglang-0.2.10 → sglang-0.2.12}/sglang/srt/hf_transformers_utils.py +0 -0
- {sglang-0.2.10 → sglang-0.2.12}/sglang/srt/layers/fused_moe.py +0 -0
- {sglang-0.2.10 → sglang-0.2.12}/sglang/srt/managers/controller_multi.py +0 -0
- {sglang-0.2.10 → sglang-0.2.12}/sglang/srt/managers/controller_single.py +0 -0
- {sglang-0.2.10 → sglang-0.2.12}/sglang/srt/mem_cache/flush_cache.py +0 -0
- {sglang-0.2.10 → sglang-0.2.12}/sglang/srt/mm_utils.py +0 -0
- {sglang-0.2.10 → sglang-0.2.12}/sglang/srt/model_config.py +0 -0
- {sglang-0.2.10 → sglang-0.2.12}/sglang/srt/model_loader/model_loader.py +0 -0
- {sglang-0.2.10 → sglang-0.2.12}/sglang/srt/model_loader/utils.py +0 -0
- {sglang-0.2.10 → sglang-0.2.12}/sglang/srt/models/mistral.py +0 -0
- {sglang-0.2.10 → sglang-0.2.12}/sglang/srt/models/yivl.py +0 -0
- {sglang-0.2.10 → sglang-0.2.12}/sglang/test/simple_eval_common.py +0 -0
- {sglang-0.2.10 → sglang-0.2.12}/sglang/test/simple_eval_gpqa.py +0 -0
- {sglang-0.2.10 → sglang-0.2.12}/sglang/test/simple_eval_math.py +0 -0
- {sglang-0.2.10 → sglang-0.2.12}/sglang/test/simple_eval_mmlu.py +0 -0
- {sglang-0.2.10 → sglang-0.2.12}/sglang.egg-info/dependency_links.txt +0 -0
- {sglang-0.2.10 → sglang-0.2.12}/sglang.egg-info/top_level.txt +0 -0
PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.2.10
+Version: 0.2.12
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
 Version 2.0, January 2004
@@ -220,7 +220,6 @@ Requires-Dist: fastapi; extra == "srt"
 Requires-Dist: hf_transfer; extra == "srt"
 Requires-Dist: huggingface_hub; extra == "srt"
 Requires-Dist: interegular; extra == "srt"
-Requires-Dist: jsonlines; extra == "srt"
 Requires-Dist: packaging; extra == "srt"
 Requires-Dist: pillow; extra == "srt"
 Requires-Dist: psutil; extra == "srt"
@@ -230,7 +229,7 @@ Requires-Dist: torch; extra == "srt"
 Requires-Dist: uvicorn; extra == "srt"
 Requires-Dist: uvloop; extra == "srt"
 Requires-Dist: zmq; extra == "srt"
-Requires-Dist: vllm==0.5.
+Requires-Dist: vllm==0.5.4; extra == "srt"
 Requires-Dist: outlines>=0.0.44; extra == "srt"
 Provides-Extra: openai
 Requires-Dist: openai>=1.0; extra == "openai"
@@ -239,11 +238,18 @@ Provides-Extra: anthropic
 Requires-Dist: anthropic>=0.20.0; extra == "anthropic"
 Provides-Extra: litellm
 Requires-Dist: litellm>=1.0.0; extra == "litellm"
+Provides-Extra: test
+Requires-Dist: jsonlines; extra == "test"
+Requires-Dist: matplotlib; extra == "test"
+Requires-Dist: pandas; extra == "test"
 Provides-Extra: all
 Requires-Dist: sglang[srt]; extra == "all"
 Requires-Dist: sglang[openai]; extra == "all"
 Requires-Dist: sglang[anthropic]; extra == "all"
 Requires-Dist: sglang[litellm]; extra == "all"
+Provides-Extra: dev
+Requires-Dist: sglang[all]; extra == "dev"
+Requires-Dist: sglang[test]; extra == "dev"

 <div align="center">
 <img src="https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png" alt="logo" width="400"></img>
@@ -296,20 +302,20 @@ pip install --upgrade pip
 pip install "sglang[all]"

 # Install FlashInfer CUDA kernels
-pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.
+pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
 ```

 ### Method 2: From source
 ```
 # Use the last release branch
-git clone -b v0.2.
+git clone -b v0.2.12 https://github.com/sgl-project/sglang.git
 cd sglang

 pip install --upgrade pip
 pip install -e "python[all]"

 # Install FlashInfer CUDA kernels
-pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.
+pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
 ```

 ### Method 3: Using docker
@@ -383,22 +389,26 @@ response = client.chat.completions.create(
 print(response)
 ```

-It supports streaming, vision, and most features of the Chat/Completions/Models endpoints specified by the [OpenAI API Reference](https://platform.openai.com/docs/api-reference/).
+It supports streaming, vision, and most features of the Chat/Completions/Models/Batch endpoints specified by the [OpenAI API Reference](https://platform.openai.com/docs/api-reference/).

 ### Additional Server Arguments
-- Add `--tp 2` to enable tensor parallelism. If it
+- Add `--tp 2` to enable multi-GPU tensor parallelism. If it reports the error "peer access is not supported between these two devices", add `--enable-p2p-check` to the server launch command.
 ```
 python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --tp 2
 ```
-- Add `--dp 2` to enable data parallelism. It can also be used together with
+- Add `--dp 2` to enable multi-GPU data parallelism. It can also be used together with tensor parallelism. Data parallelism is better for throughput if there is enough memory.
 ```
 python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --dp 2 --tp 2
 ```
-- If you see out-of-memory errors during serving,
+- If you see out-of-memory errors during serving, try to reduce the memory usage of the KV cache pool by setting a smaller value of `--mem-fraction-static`. The default value is `0.9`.
 ```
 python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --mem-fraction-static 0.7
 ```
 - See [hyperparameter_tuning.md](docs/en/hyperparameter_tuning.md) on tuning hyperparameters for better performance.
+- If you see out-of-memory errors during prefill for long prompts, try to set a smaller chunked prefill size.
+```
+python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --chunked-prefill-size 4096
+```
 - Add `--nnodes 2` to run tensor parallelism on multiple nodes. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-0` be the hostname of the first node and `50000` be an available port.
 ```
 # Node 0
@@ -408,29 +418,13 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --tp 4 --nccl-init sgl-dev-0:50000 --nnodes 2 --node-rank 1
 ```
 - If the model does not have a template in the Hugging Face tokenizer, you can specify a [custom chat template](docs/en/custom_chat_template.md).
-- To enable fp8 quantization, you can add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
 - To enable experimental torch.compile support, you can add `--enable-torch-compile`. It accelerates small models on small batch sizes.
-
-
-
-```bash
-## Run 405B (fp8) on a single node
-python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct-FP8 --tp 8
-
-## Run 405B (fp16) on two nodes
-# replace the `172.16.4.52:20000` with your own first node ip address and port, disable CUDA Graph temporarily
-
-# on the first node
-GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 0 --disable-cuda-graph --mem-frac 0.75
-
-# on the second
-GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 1 --disable-cuda-graph --mem-frac 0.75
-```
-
+- To enable fp8 quantization, you can add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
+
 ### Supported Models

 - Llama / Llama 2 / Llama 3 / Llama 3.1
-- Mistral / Mixtral
+- Mistral / Mixtral / Mistral NeMo
 - Gemma / Gemma 2
 - Qwen / Qwen 2 / Qwen 2 MoE
 - DeepSeek / DeepSeek 2
@@ -448,10 +442,35 @@ GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/
 - Grok
 - ChatGLM
 - InternLM 2
-- Mistral NeMo

 Instructions for supporting a new model are [here](https://github.com/sgl-project/sglang/blob/main/docs/en/model_support.md).

+#### Use Models From ModelScope
+To use model from [ModelScope](https://www.modelscope.cn), setting environment variable SGLANG_USE_MODELSCOPE.
+```
+export SGLANG_USE_MODELSCOPE=true
+```
+Launch [Qwen2-7B-Instruct](https://www.modelscope.cn/models/qwen/qwen2-7b-instruct) Server
+```
+SGLANG_USE_MODELSCOPE=true python -m sglang.launch_server --model-path qwen/Qwen2-7B-Instruct --port 30000
+```
+
+#### Run Llama 3.1 405B
+
+```bash
+## Run 405B (fp8) on a single node
+python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct-FP8 --tp 8
+
+## Run 405B (fp16) on two nodes
+# replace the `172.16.4.52:20000` with your own first node ip address and port, disable CUDA Graph temporarily
+
+# on the first node
+GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 0 --disable-cuda-graph --mem-frac 0.75
+
+# on the second
+GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 1 --disable-cuda-graph --mem-frac 0.75
+```
+
 ### Benchmark Performance

 - Benchmark a single static batch by running the following command without launching a server. The arguments are the same as for `launch_server.py`. Note that this is not a dynamic batching server, so it may run out of memory for a batch size that a real server can handle. A real server truncates the prefill into several batches, while this unit test does not. For accurate large batch testing, consider using `sglang.bench_serving`.
@@ -464,7 +483,7 @@ Instructions for supporting a new model are [here](https://github.com/sgl-projec
 ```

 ## Frontend: Structured Generation Language (SGLang)
-The frontend language can be used with local models or API models.
+The frontend language can be used with local models or API models. It is an alternative to the OpenAI API. You may found it easier to use for complex prompting workflow.

 ### Quick Start
 The example below shows how to use sglang to answer a mulit-turn question.
README.md

@@ -49,20 +49,20 @@ pip install --upgrade pip
 pip install "sglang[all]"

 # Install FlashInfer CUDA kernels
-pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.
+pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
 ```

 ### Method 2: From source
 ```
 # Use the last release branch
-git clone -b v0.2.
+git clone -b v0.2.12 https://github.com/sgl-project/sglang.git
 cd sglang

 pip install --upgrade pip
 pip install -e "python[all]"

 # Install FlashInfer CUDA kernels
-pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.
+pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
 ```

 ### Method 3: Using docker
@@ -136,22 +136,26 @@ response = client.chat.completions.create(
 print(response)
 ```

-It supports streaming, vision, and most features of the Chat/Completions/Models endpoints specified by the [OpenAI API Reference](https://platform.openai.com/docs/api-reference/).
+It supports streaming, vision, and most features of the Chat/Completions/Models/Batch endpoints specified by the [OpenAI API Reference](https://platform.openai.com/docs/api-reference/).

 ### Additional Server Arguments
-- Add `--tp 2` to enable tensor parallelism. If it
+- Add `--tp 2` to enable multi-GPU tensor parallelism. If it reports the error "peer access is not supported between these two devices", add `--enable-p2p-check` to the server launch command.
 ```
 python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --tp 2
 ```
-- Add `--dp 2` to enable data parallelism. It can also be used together with
+- Add `--dp 2` to enable multi-GPU data parallelism. It can also be used together with tensor parallelism. Data parallelism is better for throughput if there is enough memory.
 ```
 python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --dp 2 --tp 2
 ```
-- If you see out-of-memory errors during serving,
+- If you see out-of-memory errors during serving, try to reduce the memory usage of the KV cache pool by setting a smaller value of `--mem-fraction-static`. The default value is `0.9`.
 ```
 python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --mem-fraction-static 0.7
 ```
 - See [hyperparameter_tuning.md](docs/en/hyperparameter_tuning.md) on tuning hyperparameters for better performance.
+- If you see out-of-memory errors during prefill for long prompts, try to set a smaller chunked prefill size.
+```
+python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --chunked-prefill-size 4096
+```
 - Add `--nnodes 2` to run tensor parallelism on multiple nodes. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-0` be the hostname of the first node and `50000` be an available port.
 ```
 # Node 0
@@ -161,29 +165,13 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --tp 4 --nccl-init sgl-dev-0:50000 --nnodes 2 --node-rank 1
 ```
 - If the model does not have a template in the Hugging Face tokenizer, you can specify a [custom chat template](docs/en/custom_chat_template.md).
-- To enable fp8 quantization, you can add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
 - To enable experimental torch.compile support, you can add `--enable-torch-compile`. It accelerates small models on small batch sizes.
-
-
-
-```bash
-## Run 405B (fp8) on a single node
-python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct-FP8 --tp 8
-
-## Run 405B (fp16) on two nodes
-# replace the `172.16.4.52:20000` with your own first node ip address and port, disable CUDA Graph temporarily
-
-# on the first node
-GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 0 --disable-cuda-graph --mem-frac 0.75
-
-# on the second
-GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 1 --disable-cuda-graph --mem-frac 0.75
-```
-
+- To enable fp8 quantization, you can add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
+
 ### Supported Models

 - Llama / Llama 2 / Llama 3 / Llama 3.1
-- Mistral / Mixtral
+- Mistral / Mixtral / Mistral NeMo
 - Gemma / Gemma 2
 - Qwen / Qwen 2 / Qwen 2 MoE
 - DeepSeek / DeepSeek 2
@@ -201,10 +189,35 @@ GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/
 - Grok
 - ChatGLM
 - InternLM 2
-- Mistral NeMo

 Instructions for supporting a new model are [here](https://github.com/sgl-project/sglang/blob/main/docs/en/model_support.md).

+#### Use Models From ModelScope
+To use model from [ModelScope](https://www.modelscope.cn), setting environment variable SGLANG_USE_MODELSCOPE.
+```
+export SGLANG_USE_MODELSCOPE=true
+```
+Launch [Qwen2-7B-Instruct](https://www.modelscope.cn/models/qwen/qwen2-7b-instruct) Server
+```
+SGLANG_USE_MODELSCOPE=true python -m sglang.launch_server --model-path qwen/Qwen2-7B-Instruct --port 30000
+```
+
+#### Run Llama 3.1 405B
+
+```bash
+## Run 405B (fp8) on a single node
+python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct-FP8 --tp 8
+
+## Run 405B (fp16) on two nodes
+# replace the `172.16.4.52:20000` with your own first node ip address and port, disable CUDA Graph temporarily
+
+# on the first node
+GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 0 --disable-cuda-graph --mem-frac 0.75
+
+# on the second
+GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 1 --disable-cuda-graph --mem-frac 0.75
+```
+
 ### Benchmark Performance

 - Benchmark a single static batch by running the following command without launching a server. The arguments are the same as for `launch_server.py`. Note that this is not a dynamic batching server, so it may run out of memory for a batch size that a real server can handle. A real server truncates the prefill into several batches, while this unit test does not. For accurate large batch testing, consider using `sglang.bench_serving`.
@@ -217,7 +230,7 @@ Instructions for supporting a new model are [here](https://github.com/sgl-projec
 ```

 ## Frontend: Structured Generation Language (SGLang)
-The frontend language can be used with local models or API models.
+The frontend language can be used with local models or API models. It is an alternative to the OpenAI API. You may found it easier to use for complex prompting workflow.

 ### Quick Start
 The example below shows how to use sglang to answer a mulit-turn question.
pyproject.toml

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

 [project]
 name = "sglang"
-version = "0.2.10"
+version = "0.2.12"
 description = "SGLang is yet another fast serving framework for large language models and vision language models."
 readme = "README.md"
 requires-python = ">=3.8"
@@ -20,14 +20,16 @@ dependencies = [
 ]

 [project.optional-dependencies]
-srt = ["aiohttp", "fastapi", "hf_transfer", "huggingface_hub", "interegular",
+srt = ["aiohttp", "fastapi", "hf_transfer", "huggingface_hub", "interegular",
 "packaging", "pillow", "psutil", "pydantic", "python-multipart",
 "torch", "uvicorn", "uvloop", "zmq",
-"vllm==0.5.
+"vllm==0.5.4", "outlines>=0.0.44"]
 openai = ["openai>=1.0", "tiktoken"]
 anthropic = ["anthropic>=0.20.0"]
 litellm = ["litellm>=1.0.0"]
+test = ["jsonlines", "matplotlib", "pandas"]
 all = ["sglang[srt]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
+dev = ["sglang[all]", "sglang[test]"]

 [project.urls]
 "Homepage" = "https://github.com/sgl-project/sglang"
sglang/__init__.py

@@ -22,6 +22,11 @@ from sglang.api import (
     user_end,
     video,
 )
+from sglang.lang.choices import (
+    greedy_token_selection,
+    token_length_normalized,
+    unconditional_likelihood_normalized,
+)

 # SGLang DSL APIs
 __all__ = [
@@ -45,6 +50,9 @@ __all__ = [
     "user_begin",
     "user_end",
     "video",
+    "greedy_token_selection",
+    "token_length_normalized",
+    "unconditional_likelihood_normalized",
 ]

 # Global Configurations
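For orientation, a minimal hedged sketch of what these re-exports enable once sglang 0.2.12 is installed: the three choices sampling methods, implemented in the new `sglang/lang/choices.py` module listed above, become importable from the package root. The snippet below is illustrative only and is not part of the diff.

```python
# Hedged sketch: the names added to __all__ above are importable from the
# package root in 0.2.12; their implementations live in sglang/lang/choices.py.
import sglang as sgl

methods = {
    "token_length_normalized": sgl.token_length_normalized,  # default for select() per the api.py diff below
    "greedy_token_selection": sgl.greedy_token_selection,
    "unconditional_likelihood_normalized": sgl.unconditional_likelihood_normalized,
}
for name, method in methods.items():
    print(name, type(method).__name__)
```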
sglang/api.py

@@ -6,6 +6,7 @@ from typing import Callable, List, Optional, Union

 from sglang.global_config import global_config
 from sglang.lang.backend.base_backend import BaseBackend
+from sglang.lang.choices import ChoicesSamplingMethod, token_length_normalized
 from sglang.lang.ir import (
     SglExpr,
     SglExprList,
@@ -73,12 +74,18 @@ def gen(
     return_text_in_logprobs: Optional[bool] = None,
     dtype: Optional[type] = None,
     choices: Optional[List[str]] = None,
+    choices_method: Optional[ChoicesSamplingMethod] = None,
     regex: Optional[str] = None,
 ):
     """Call the model to generate. See the meaning of the arguments in docs/en/sampling_params.md"""

     if choices:
-        return SglSelect(
+        return SglSelect(
+            name,
+            choices,
+            0.0 if temperature is None else temperature,
+            token_length_normalized if choices_method is None else choices_method,
+        )

     # check regex is valid
     if regex is not None:
@@ -186,9 +193,10 @@ def select(
     name: Optional[str] = None,
     choices: Optional[List[str]] = None,
     temperature: float = 0.0,
+    choices_method: ChoicesSamplingMethod = token_length_normalized,
 ):
     assert choices is not None
-    return SglSelect(name, choices, temperature)
+    return SglSelect(name, choices, temperature, choices_method)


 def _role_common(name: str, expr: Optional[SglExpr] = None):
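The hunks above add a `choices_method` argument to both `gen` and `select`, defaulting to `token_length_normalized`. Below is a hedged usage sketch, not taken from the package: the endpoint URL, model, prompt, and variable names are assumptions, and it presumes a server was launched separately with `python -m sglang.launch_server`.

```python
# Hedged usage sketch for the new choices_method argument (sglang 0.2.12).
# Assumes an SGLang server is already running at http://localhost:30000;
# the review text and the "label" variable name are illustrative only.
import sglang as sgl
from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint

sgl.set_default_backend(RuntimeEndpoint("http://localhost:30000"))

@sgl.function
def sentiment(s, review):
    s += "Review: " + review + "\n"
    # select() now accepts choices_method; greedy_token_selection is one of the
    # methods re-exported at the package root (see the __init__.py diff above).
    s += "Sentiment: " + sgl.select(
        "label",
        choices=["positive", "negative", "neutral"],
        choices_method=sgl.greedy_token_selection,
    )

state = sentiment.run(review="The new release is noticeably faster.")
print(state["label"])
```

Omitting `choices_method` keeps the previous behavior, since `token_length_normalized` remains the default in the `select` signature.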
|