sglang 0.2.11__tar.gz → 0.2.12__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sglang-0.2.11/sglang.egg-info → sglang-0.2.12}/PKG-INFO +23 -14
- {sglang-0.2.11 → sglang-0.2.12}/README.md +22 -13
- {sglang-0.2.11 → sglang-0.2.12}/pyproject.toml +1 -1
- {sglang-0.2.11 → sglang-0.2.12}/sglang/bench_latency.py +6 -4
- {sglang-0.2.11 → sglang-0.2.12}/sglang/bench_serving.py +46 -22
- {sglang-0.2.11 → sglang-0.2.12}/sglang/lang/compiler.py +2 -2
- {sglang-0.2.11 → sglang-0.2.12}/sglang/lang/ir.py +3 -3
- {sglang-0.2.11 → sglang-0.2.12}/sglang/srt/constrained/base_tool_cache.py +1 -1
- {sglang-0.2.11 → sglang-0.2.12}/sglang/srt/constrained/fsm_cache.py +12 -2
- sglang-0.2.12/sglang/srt/layers/activation.py +33 -0
- sglang-0.2.11/sglang/srt/layers/token_attention.py → sglang-0.2.12/sglang/srt/layers/decode_attention.py +9 -5
- {sglang-0.2.11 → sglang-0.2.12}/sglang/srt/layers/extend_attention.py +6 -1
- sglang-0.2.12/sglang/srt/layers/layernorm.py +65 -0
- {sglang-0.2.11 → sglang-0.2.12}/sglang/srt/layers/logits_processor.py +5 -0
- sglang-0.2.12/sglang/srt/layers/pooler.py +50 -0
- sglang-0.2.11/sglang/srt/layers/context_flashattention_nopad.py → sglang-0.2.12/sglang/srt/layers/prefill_attention.py +5 -0
- {sglang-0.2.11 → sglang-0.2.12}/sglang/srt/layers/radix_attention.py +2 -2
- {sglang-0.2.11 → sglang-0.2.12}/sglang/srt/managers/detokenizer_manager.py +31 -9
- {sglang-0.2.11 → sglang-0.2.12}/sglang/srt/managers/io_struct.py +63 -0
- sglang-0.2.12/sglang/srt/managers/policy_scheduler.py +233 -0
- {sglang-0.2.11 → sglang-0.2.12}/sglang/srt/managers/schedule_batch.py +110 -87
- {sglang-0.2.11 → sglang-0.2.12}/sglang/srt/managers/tokenizer_manager.py +193 -111
- {sglang-0.2.11 → sglang-0.2.12}/sglang/srt/managers/tp_worker.py +289 -352
- sglang-0.2.11/sglang/srt/mem_cache/base_cache.py → sglang-0.2.12/sglang/srt/mem_cache/base_prefix_cache.py +9 -4
- sglang-0.2.12/sglang/srt/mem_cache/chunk_cache.py +83 -0
- {sglang-0.2.11 → sglang-0.2.12}/sglang/srt/mem_cache/memory_pool.py +2 -2
- {sglang-0.2.11 → sglang-0.2.12}/sglang/srt/mem_cache/radix_cache.py +74 -40
- {sglang-0.2.11 → sglang-0.2.12}/sglang/srt/model_executor/cuda_graph_runner.py +24 -9
- sglang-0.2.12/sglang/srt/model_executor/forward_batch_info.py +319 -0
- {sglang-0.2.11 → sglang-0.2.12}/sglang/srt/model_executor/model_runner.py +24 -37
- {sglang-0.2.11 → sglang-0.2.12}/sglang/srt/models/gemma2.py +0 -1
- {sglang-0.2.11 → sglang-0.2.12}/sglang/srt/models/internlm2.py +2 -7
- {sglang-0.2.11 → sglang-0.2.12}/sglang/srt/models/llama2.py +4 -4
- sglang-0.2.12/sglang/srt/models/llama_embedding.py +88 -0
- {sglang-0.2.11 → sglang-0.2.12}/sglang/srt/models/qwen2_moe.py +0 -11
- {sglang-0.2.11 → sglang-0.2.12}/sglang/srt/openai_api/adapter.py +155 -27
- {sglang-0.2.11 → sglang-0.2.12}/sglang/srt/openai_api/protocol.py +37 -1
- sglang-0.2.12/sglang/srt/sampling/penaltylib/__init__.py +13 -0
- sglang-0.2.12/sglang/srt/sampling/penaltylib/orchestrator.py +357 -0
- sglang-0.2.12/sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +80 -0
- sglang-0.2.12/sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +105 -0
- sglang-0.2.12/sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +79 -0
- sglang-0.2.12/sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +83 -0
- {sglang-0.2.11 → sglang-0.2.12}/sglang/srt/sampling_params.py +31 -4
- {sglang-0.2.11 → sglang-0.2.12}/sglang/srt/server.py +69 -15
- {sglang-0.2.11 → sglang-0.2.12}/sglang/srt/server_args.py +26 -19
- {sglang-0.2.11 → sglang-0.2.12}/sglang/srt/utils.py +31 -13
- {sglang-0.2.11 → sglang-0.2.12}/sglang/test/run_eval.py +10 -1
- {sglang-0.2.11 → sglang-0.2.12}/sglang/test/runners.py +63 -63
- {sglang-0.2.11 → sglang-0.2.12}/sglang/test/simple_eval_humaneval.py +2 -8
- sglang-0.2.12/sglang/test/simple_eval_mgsm.py +203 -0
- sglang-0.2.12/sglang/test/srt/sampling/penaltylib/utils.py +337 -0
- sglang-0.2.12/sglang/test/test_layernorm.py +60 -0
- {sglang-0.2.11 → sglang-0.2.12}/sglang/test/test_programs.py +4 -2
- {sglang-0.2.11 → sglang-0.2.12}/sglang/test/test_utils.py +20 -2
- {sglang-0.2.11 → sglang-0.2.12}/sglang/utils.py +0 -1
- sglang-0.2.12/sglang/version.py +1 -0
- {sglang-0.2.11 → sglang-0.2.12/sglang.egg-info}/PKG-INFO +23 -14
- {sglang-0.2.11 → sglang-0.2.12}/sglang.egg-info/SOURCES.txt +17 -7
- sglang-0.2.11/sglang/srt/layers/linear.py +0 -884
- sglang-0.2.11/sglang/srt/layers/quantization/__init__.py +0 -64
- sglang-0.2.11/sglang/srt/layers/quantization/fp8.py +0 -677
- sglang-0.2.11/sglang/srt/managers/policy_scheduler.py +0 -85
- sglang-0.2.11/sglang/srt/mem_cache/chunk_cache.py +0 -60
- sglang-0.2.11/sglang/srt/model_executor/forward_batch_info.py +0 -256
- sglang-0.2.11/sglang/version.py +0 -1
- {sglang-0.2.11 → sglang-0.2.12}/LICENSE +0 -0
- {sglang-0.2.11 → sglang-0.2.12}/setup.cfg +0 -0
- {sglang-0.2.11 → sglang-0.2.12}/sglang/__init__.py +0 -0
- {sglang-0.2.11 → sglang-0.2.12}/sglang/api.py +0 -0
- {sglang-0.2.11 → sglang-0.2.12}/sglang/check_env.py +0 -0
- {sglang-0.2.11 → sglang-0.2.12}/sglang/global_config.py +0 -0
- {sglang-0.2.11 → sglang-0.2.12}/sglang/lang/__init__.py +0 -0
- {sglang-0.2.11 → sglang-0.2.12}/sglang/lang/backend/__init__.py +0 -0
- {sglang-0.2.11 → sglang-0.2.12}/sglang/lang/backend/anthropic.py +0 -0
- {sglang-0.2.11 → sglang-0.2.12}/sglang/lang/backend/base_backend.py +0 -0
- {sglang-0.2.11 → sglang-0.2.12}/sglang/lang/backend/litellm.py +0 -0
- {sglang-0.2.11 → sglang-0.2.12}/sglang/lang/backend/openai.py +0 -0
- {sglang-0.2.11 → sglang-0.2.12}/sglang/lang/backend/runtime_endpoint.py +0 -0
- {sglang-0.2.11 → sglang-0.2.12}/sglang/lang/backend/vertexai.py +0 -0
- {sglang-0.2.11 → sglang-0.2.12}/sglang/lang/chat_template.py +0 -0
- {sglang-0.2.11 → sglang-0.2.12}/sglang/lang/choices.py +0 -0
- {sglang-0.2.11 → sglang-0.2.12}/sglang/lang/interpreter.py +0 -0
- {sglang-0.2.11 → sglang-0.2.12}/sglang/lang/tracer.py +0 -0
- {sglang-0.2.11 → sglang-0.2.12}/sglang/launch_server.py +0 -0
- {sglang-0.2.11 → sglang-0.2.12}/sglang/launch_server_llavavid.py +0 -0
- {sglang-0.2.11 → sglang-0.2.12}/sglang/srt/constrained/__init__.py +0 -0
- {sglang-0.2.11 → sglang-0.2.12}/sglang/srt/constrained/jump_forward.py +0 -0
- {sglang-0.2.11 → sglang-0.2.12}/sglang/srt/conversation.py +0 -0
- {sglang-0.2.11 → sglang-0.2.12}/sglang/srt/hf_transformers_utils.py +0 -0
- {sglang-0.2.11 → sglang-0.2.12}/sglang/srt/layers/fused_moe.py +0 -0
- {sglang-0.2.11 → sglang-0.2.12}/sglang/srt/managers/controller_multi.py +0 -0
- {sglang-0.2.11 → sglang-0.2.12}/sglang/srt/managers/controller_single.py +0 -0
- {sglang-0.2.11 → sglang-0.2.12}/sglang/srt/mem_cache/flush_cache.py +0 -0
- {sglang-0.2.11 → sglang-0.2.12}/sglang/srt/mm_utils.py +0 -0
- {sglang-0.2.11 → sglang-0.2.12}/sglang/srt/model_config.py +0 -0
- {sglang-0.2.11 → sglang-0.2.12}/sglang/srt/model_loader/model_loader.py +0 -0
- {sglang-0.2.11 → sglang-0.2.12}/sglang/srt/model_loader/utils.py +0 -0
- {sglang-0.2.11 → sglang-0.2.12}/sglang/srt/models/chatglm.py +0 -0
- {sglang-0.2.11 → sglang-0.2.12}/sglang/srt/models/commandr.py +0 -0
- {sglang-0.2.11 → sglang-0.2.12}/sglang/srt/models/dbrx.py +0 -0
- {sglang-0.2.11 → sglang-0.2.12}/sglang/srt/models/deepseek.py +0 -0
- {sglang-0.2.11 → sglang-0.2.12}/sglang/srt/models/deepseek_v2.py +0 -0
- {sglang-0.2.11 → sglang-0.2.12}/sglang/srt/models/gemma.py +0 -0
- {sglang-0.2.11 → sglang-0.2.12}/sglang/srt/models/gpt_bigcode.py +0 -0
- {sglang-0.2.11 → sglang-0.2.12}/sglang/srt/models/grok.py +0 -0
- {sglang-0.2.11 → sglang-0.2.12}/sglang/srt/models/llama_classification.py +0 -0
- {sglang-0.2.11 → sglang-0.2.12}/sglang/srt/models/llava.py +0 -0
- {sglang-0.2.11 → sglang-0.2.12}/sglang/srt/models/llavavid.py +0 -0
- {sglang-0.2.11 → sglang-0.2.12}/sglang/srt/models/minicpm.py +0 -0
- {sglang-0.2.11 → sglang-0.2.12}/sglang/srt/models/mistral.py +0 -0
- {sglang-0.2.11 → sglang-0.2.12}/sglang/srt/models/mixtral.py +0 -0
- {sglang-0.2.11 → sglang-0.2.12}/sglang/srt/models/mixtral_quant.py +0 -0
- {sglang-0.2.11 → sglang-0.2.12}/sglang/srt/models/qwen.py +0 -0
- {sglang-0.2.11 → sglang-0.2.12}/sglang/srt/models/qwen2.py +0 -0
- {sglang-0.2.11 → sglang-0.2.12}/sglang/srt/models/stablelm.py +0 -0
- {sglang-0.2.11 → sglang-0.2.12}/sglang/srt/models/yivl.py +0 -0
- {sglang-0.2.11 → sglang-0.2.12}/sglang/test/simple_eval_common.py +0 -0
- {sglang-0.2.11 → sglang-0.2.12}/sglang/test/simple_eval_gpqa.py +0 -0
- {sglang-0.2.11 → sglang-0.2.12}/sglang/test/simple_eval_math.py +0 -0
- {sglang-0.2.11 → sglang-0.2.12}/sglang/test/simple_eval_mmlu.py +0 -0
- {sglang-0.2.11 → sglang-0.2.12}/sglang.egg-info/dependency_links.txt +0 -0
- {sglang-0.2.11 → sglang-0.2.12}/sglang.egg-info/requires.txt +0 -0
- {sglang-0.2.11 → sglang-0.2.12}/sglang.egg-info/top_level.txt +0 -0

{sglang-0.2.11/sglang.egg-info → sglang-0.2.12}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.2.11
+Version: 0.2.12
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
 Version 2.0, January 2004
@@ -308,7 +308,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
 ### Method 2: From source
 ```
 # Use the last release branch
-git clone -b v0.2.
+git clone -b v0.2.12 https://github.com/sgl-project/sglang.git
 cd sglang
 
 pip install --upgrade pip
@@ -392,23 +392,23 @@ print(response)
 It supports streaming, vision, and most features of the Chat/Completions/Models/Batch endpoints specified by the [OpenAI API Reference](https://platform.openai.com/docs/api-reference/).
 
 ### Additional Server Arguments
-- Add `--tp 2` to enable tensor parallelism. If it
+- Add `--tp 2` to enable multi-GPU tensor parallelism. If it reports the error "peer access is not supported between these two devices", add `--enable-p2p-check` to the server launch command.
 ```
 python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --tp 2
 ```
-- Add `--dp 2` to enable data parallelism. It can also be used together with
+- Add `--dp 2` to enable multi-GPU data parallelism. It can also be used together with tensor parallelism. Data parallelism is better for throughput if there is enough memory.
 ```
 python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --dp 2 --tp 2
 ```
-- If you see out-of-memory errors during serving,
+- If you see out-of-memory errors during serving, try to reduce the memory usage of the KV cache pool by setting a smaller value of `--mem-fraction-static`. The default value is `0.9`.
 ```
 python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --mem-fraction-static 0.7
 ```
--
+- See [hyperparameter_tuning.md](docs/en/hyperparameter_tuning.md) on tuning hyperparameters for better performance.
+- If you see out-of-memory errors during prefill for long prompts, try to set a smaller chunked prefill size.
 ```
-python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3
+python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --chunked-prefill-size 4096
 ```
-- See [hyperparameter_tuning.md](docs/en/hyperparameter_tuning.md) on tuning hyperparameters for better performance.
 - Add `--nnodes 2` to run tensor parallelism on multiple nodes. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-0` be the hostname of the first node and `50000` be an available port.
 ```
 # Node 0
@@ -418,13 +418,13 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --tp 4 --nccl-init sgl-dev-0:50000 --nnodes 2 --node-rank 1
 ```
 - If the model does not have a template in the Hugging Face tokenizer, you can specify a [custom chat template](docs/en/custom_chat_template.md).
-- To enable fp8 quantization, you can add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
 - To enable experimental torch.compile support, you can add `--enable-torch-compile`. It accelerates small models on small batch sizes.
-
+- To enable fp8 quantization, you can add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
+
 ### Supported Models
 
 - Llama / Llama 2 / Llama 3 / Llama 3.1
-- Mistral / Mixtral
+- Mistral / Mixtral / Mistral NeMo
 - Gemma / Gemma 2
 - Qwen / Qwen 2 / Qwen 2 MoE
 - DeepSeek / DeepSeek 2
@@ -442,11 +442,20 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 - Grok
 - ChatGLM
 - InternLM 2
-- Mistral NeMo
 
 Instructions for supporting a new model are [here](https://github.com/sgl-project/sglang/blob/main/docs/en/model_support.md).
 
-
+#### Use Models From ModelScope
+To use model from [ModelScope](https://www.modelscope.cn), setting environment variable SGLANG_USE_MODELSCOPE.
+```
+export SGLANG_USE_MODELSCOPE=true
+```
+Launch [Qwen2-7B-Instruct](https://www.modelscope.cn/models/qwen/qwen2-7b-instruct) Server
+```
+SGLANG_USE_MODELSCOPE=true python -m sglang.launch_server --model-path qwen/Qwen2-7B-Instruct --port 30000
+```
+
+#### Run Llama 3.1 405B
 
 ```bash
 ## Run 405B (fp8) on a single node
@@ -474,7 +483,7 @@ GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/
 ```
 
 ## Frontend: Structured Generation Language (SGLang)
-The frontend language can be used with local models or API models.
+The frontend language can be used with local models or API models. It is an alternative to the OpenAI API. You may found it easier to use for complex prompting workflow.
 
 ### Quick Start
 The example below shows how to use sglang to answer a mulit-turn question.

{sglang-0.2.11 → sglang-0.2.12}/README.md

@@ -55,7 +55,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
 ### Method 2: From source
 ```
 # Use the last release branch
-git clone -b v0.2.
+git clone -b v0.2.12 https://github.com/sgl-project/sglang.git
 cd sglang
 
 pip install --upgrade pip
@@ -139,23 +139,23 @@ print(response)
 It supports streaming, vision, and most features of the Chat/Completions/Models/Batch endpoints specified by the [OpenAI API Reference](https://platform.openai.com/docs/api-reference/).
 
 ### Additional Server Arguments
-- Add `--tp 2` to enable tensor parallelism. If it
+- Add `--tp 2` to enable multi-GPU tensor parallelism. If it reports the error "peer access is not supported between these two devices", add `--enable-p2p-check` to the server launch command.
 ```
 python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --tp 2
 ```
-- Add `--dp 2` to enable data parallelism. It can also be used together with
+- Add `--dp 2` to enable multi-GPU data parallelism. It can also be used together with tensor parallelism. Data parallelism is better for throughput if there is enough memory.
 ```
 python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --dp 2 --tp 2
 ```
-- If you see out-of-memory errors during serving,
+- If you see out-of-memory errors during serving, try to reduce the memory usage of the KV cache pool by setting a smaller value of `--mem-fraction-static`. The default value is `0.9`.
 ```
 python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --mem-fraction-static 0.7
 ```
--
+- See [hyperparameter_tuning.md](docs/en/hyperparameter_tuning.md) on tuning hyperparameters for better performance.
+- If you see out-of-memory errors during prefill for long prompts, try to set a smaller chunked prefill size.
 ```
-python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3
+python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --chunked-prefill-size 4096
 ```
-- See [hyperparameter_tuning.md](docs/en/hyperparameter_tuning.md) on tuning hyperparameters for better performance.
 - Add `--nnodes 2` to run tensor parallelism on multiple nodes. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-0` be the hostname of the first node and `50000` be an available port.
 ```
 # Node 0
@@ -165,13 +165,13 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --tp 4 --nccl-init sgl-dev-0:50000 --nnodes 2 --node-rank 1
 ```
 - If the model does not have a template in the Hugging Face tokenizer, you can specify a [custom chat template](docs/en/custom_chat_template.md).
-- To enable fp8 quantization, you can add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
 - To enable experimental torch.compile support, you can add `--enable-torch-compile`. It accelerates small models on small batch sizes.
-
+- To enable fp8 quantization, you can add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
+
 ### Supported Models
 
 - Llama / Llama 2 / Llama 3 / Llama 3.1
-- Mistral / Mixtral
+- Mistral / Mixtral / Mistral NeMo
 - Gemma / Gemma 2
 - Qwen / Qwen 2 / Qwen 2 MoE
 - DeepSeek / DeepSeek 2
@@ -189,11 +189,20 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 - Grok
 - ChatGLM
 - InternLM 2
-- Mistral NeMo
 
 Instructions for supporting a new model are [here](https://github.com/sgl-project/sglang/blob/main/docs/en/model_support.md).
 
-
+#### Use Models From ModelScope
+To use model from [ModelScope](https://www.modelscope.cn), setting environment variable SGLANG_USE_MODELSCOPE.
+```
+export SGLANG_USE_MODELSCOPE=true
+```
+Launch [Qwen2-7B-Instruct](https://www.modelscope.cn/models/qwen/qwen2-7b-instruct) Server
+```
+SGLANG_USE_MODELSCOPE=true python -m sglang.launch_server --model-path qwen/Qwen2-7B-Instruct --port 30000
+```
+
+#### Run Llama 3.1 405B
 
 ```bash
 ## Run 405B (fp8) on a single node
@@ -221,7 +230,7 @@ GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/
 ```
 
 ## Frontend: Structured Generation Language (SGLang)
-The frontend language can be used with local models or API models.
+The frontend language can be used with local models or API models. It is an alternative to the OpenAI API. You may found it easier to use for complex prompting workflow.
 
 ### Quick Start
 The example below shows how to use sglang to answer a mulit-turn question.

{sglang-0.2.11 → sglang-0.2.12}/pyproject.toml

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "sglang"
-version = "0.2.11"
+version = "0.2.12"
 description = "SGLang is yet another fast serving framework for large language models and vision language models."
 readme = "README.md"
 requires-python = ">=3.8"

{sglang-0.2.11 → sglang-0.2.12}/sglang/bench_latency.py

@@ -152,7 +152,7 @@ def prepare_inputs_for_correctness_test(bench_args, tokenizer):
 req = Req(rid=i, origin_input_text=prompts[i], origin_input_ids=tmp_input_ids)
 req.prefix_indices = []
 req.sampling_params = sampling_params
-req.
+req.fill_ids = req.origin_input_ids
 reqs.append(req)
 
 return input_ids, reqs
@@ -163,7 +163,7 @@ def prepare_extend_inputs_for_correctness_test(
 ):
 for i in range(len(reqs)):
 req = reqs[i]
-req.
+req.fill_ids += input_ids[i][bench_args.cut_len :]
 req.prefix_indices = model_runner.req_to_token_pool.req_to_token[
 i, : bench_args.cut_len
 ]
@@ -182,7 +182,7 @@ def prepare_synthetic_inputs_for_latency_test(batch_size, input_len):
 req = Req(rid=i, origin_input_text="", origin_input_ids=list(input_ids[i]))
 req.prefix_indices = []
 req.sampling_params = sampling_params
-req.
+req.fill_ids = req.origin_input_ids
 reqs.append(req)
 
 return reqs
@@ -238,7 +238,7 @@ def correctness_test(
 
 # Decode
 output_ids = [input_ids[i] + [next_token_ids[i]] for i in range(len(input_ids))]
-for _ in range(bench_args.output_len):
+for _ in range(bench_args.output_len[0]):
 next_token_ids, _ = decode(next_token_ids, batch, model_runner)
 for i in range(len(reqs)):
 output_ids[i].append(next_token_ids[i])
@@ -332,6 +332,7 @@ def latency_test(
 )
 
 # Warm up
+rank_print("Warmup ...")
 latency_test_run_once(
 bench_args.run_name,
 model_runner,
@@ -341,6 +342,7 @@ def latency_test(
 bench_args.input_len[0],
 4, # shorter decoding to speed up the warmup
 )
+rank_print("Benchmark ...")
 
 # Run the sweep
 result_list = []

{sglang-0.2.11 → sglang-0.2.12}/sglang/bench_serving.py

@@ -24,7 +24,7 @@ import warnings
 from argparse import ArgumentParser
 from dataclasses import dataclass, field
 from datetime import datetime
-from typing import AsyncGenerator, List, Optional, Tuple, Union
+from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple, Union
 
 import aiohttp
 import numpy as np
@@ -39,6 +39,8 @@ from transformers import (
 
 AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60)
 
+global args
+
 
 @dataclass
 class RequestFuncInput:
@@ -47,6 +49,7 @@ class RequestFuncInput:
 prompt_len: int
 output_len: int
 model: str
+extra_request_body: Dict[str, Any]
 
 
 @dataclass
@@ -84,6 +87,7 @@ async def async_request_trt_llm(
 "stream": True,
 "min_length": request_func_input.output_len,
 "end_id": 1048576,
+**request_func_input.extra_request_body,
 }
 if args.disable_ignore_eos:
 del payload["min_length"]
@@ -154,6 +158,7 @@ async def async_request_openai_completions(
 "max_tokens": request_func_input.output_len,
 "stream": not args.disable_stream,
 "ignore_eos": not args.disable_ignore_eos,
+**request_func_input.extra_request_body,
 }
 headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}
 
@@ -192,7 +197,8 @@ async def async_request_openai_completions(
 output.ttft = ttft
 
 # Decoding phase
-
+else:
+output.itl.append(timestamp - most_recent_timestamp)
 
 most_recent_timestamp = timestamp
 generated_text += data["choices"][0]["text"]
@@ -542,6 +548,7 @@ async def benchmark(
 request_rate: float,
 disable_tqdm: bool,
 enable_multi: bool,
+extra_request_body: Dict[str, Any],
 ):
 if backend in ASYNC_REQUEST_FUNCS:
 request_func = ASYNC_REQUEST_FUNCS[backend]
@@ -556,6 +563,7 @@ async def benchmark(
 api_url=api_url,
 prompt_len=test_prompt_len,
 output_len=test_output_len,
+extra_request_body=extra_request_body,
 )
 test_output = await request_func(request_func_input=test_input)
 if not test_output.success:
@@ -578,6 +586,7 @@ async def benchmark(
 api_url=api_url,
 prompt_len=prompt_len,
 output_len=output_len,
+extra_request_body=extra_request_body,
 )
 tasks.append(
 asyncio.create_task(
@@ -660,19 +669,20 @@ async def benchmark(
 "backend": args.backend,
 "dataset_name": args.dataset_name,
 "request_rate": request_rate,
-"
-"
-"
-"
-"
-"
-"
-"
+"total_input_tokens": metrics.total_input,
+"total_output_tokens": metrics.total_output,
+"total_output_tokens_retokenized": metrics.total_output_retokenized,
+"mean_e2e_latency_ms": metrics.mean_e2e_latency_ms,
+"median_e2e_latency_ms": metrics.median_e2e_latency_ms,
+"median_ttft_ms": metrics.median_ttft_ms,
+"median_itl_ms": metrics.median_itl_ms,
+"output_throughput": metrics.output_throughput,
 "sharegpt_output_len": args.sharegpt_output_len,
 "random_input_len": args.random_input_len,
 "random_output_len": args.random_output_len,
 "random_range_ratio": args.random_range_ratio,
-"
+"duration": benchmark_duration,
+"completed": metrics.completed,
 }
 else:
 print(f"Error running benchmark for request rate: {request_rate}")
@@ -742,10 +752,18 @@ def check_chat_template(model_path):
 return False
 
 
-def
+def run_benchmark(args_: argparse.Namespace):
+global args
+args = args_
+
+set_ulimit()
 random.seed(args.seed)
 np.random.seed(args.seed)
 
+extra_request_body = {}
+if args.extra_request_body:
+extra_request_body = json.loads(args.extra_request_body)
+
 if args.port is None:
 args.port = {
 "sglang": 30000,
@@ -838,10 +856,11 @@ def fire(args: argparse.Namespace):
 request_rate=rate,
 disable_tqdm=args.disable_tqdm,
 enable_multi=args.multi,
+extra_request_body=extra_request_body,
 )
 )
 else:
-asyncio.run(
+return asyncio.run(
 benchmark(
 backend=backend,
 api_url=api_url,
@@ -851,6 +870,7 @@ def fire(args: argparse.Namespace):
 request_rate=args.request_rate,
 disable_tqdm=args.disable_tqdm,
 enable_multi=args.multi,
+extra_request_body=extra_request_body,
 )
 )
 
@@ -949,11 +969,6 @@ if __name__ == "__main__":
 "Otherwise, we use Poisson process to synthesize the request arrival times. Default is 128.0.",
 )
 parser.add_argument("--seed", type=int, default=0, help="Default is 0.")
-parser.add_argument(
-"--disable-tqdm",
-action="store_true",
-help="Specify to disable tqdm progress bar.",
-)
 parser.add_argument(
 "--multi",
 action="store_true",
@@ -966,6 +981,11 @@ if __name__ == "__main__":
 help="Range of request rates in the format start,stop,step. Default is 2,34,2. It also supports a list of request rates, requiring the parameters to not equal three.",
 )
 parser.add_argument("--output-file", type=str, help="Output JSONL file name.")
+parser.add_argument(
+"--disable-tqdm",
+action="store_true",
+help="Specify to disable tqdm progress bar.",
+)
 parser.add_argument(
 "--disable-stream",
 action="store_true",
@@ -976,8 +996,12 @@ if __name__ == "__main__":
 action="store_true",
 help="Disable ignoring EOS.",
 )
-
-
-
+parser.add_argument(
+"--extra-request-body",
+metavar='{"key1": "value1", "key2": "value2"}',
+type=str,
+help="Append given JSON object to the request payload. You can use this to specify"
+"additional generate params like sampling params.",
+)
 args = parser.parse_args()
-
+run_benchmark(args)
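
The new `--extra-request-body` flag is parsed with `json.loads` and then unpacked into each request payload via `**request_func_input.extra_request_body`, as shown in the hunks above. Below is a minimal sketch of that merging behavior; the concrete keys and values are illustrative assumptions, not taken from the diff.

```python
import json

# Sketch: fold an --extra-request-body JSON string into a request payload,
# mirroring the json.loads + dict-unpacking pattern added in bench_serving.py.
extra_request_body = json.loads('{"temperature": 0.0, "top_p": 1.0}')  # assumed values

payload = {
    "max_tokens": 256,
    "stream": True,
    **extra_request_body,  # extra generate/sampling params appended to the request
}
print(payload)
```

Because the argument-parsing tail now calls a top-level `run_benchmark(args_)` (which returns the result of `asyncio.run(benchmark(...))`), the benchmark can presumably also be driven programmatically instead of only through `python -m sglang.bench_serving`.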

{sglang-0.2.11 → sglang-0.2.12}/sglang/lang/compiler.py

@@ -125,7 +125,7 @@ class CompiledFunction:
 def run(
 self,
 *,
-max_new_tokens: int =
+max_new_tokens: int = 128,
 stop: Union[str, List[str]] = (),
 temperature: float = 1.0,
 top_p: float = 1.0,
@@ -155,7 +155,7 @@ class CompiledFunction:
 self,
 batch_kwargs,
 *,
-max_new_tokens: int =
+max_new_tokens: int = 128,
 stop: Union[str, List[str]] = (),
 temperature: float = 1.0,
 top_p: float = 1.0,

{sglang-0.2.11 → sglang-0.2.12}/sglang/lang/ir.py

@@ -16,7 +16,7 @@ REGEX_STRING = r"\"[\w\d\s]*\"" # bugs with regex r"\".*\"" in interegular pkg
 
 @dataclasses.dataclass
 class SglSamplingParams:
-max_new_tokens: int =
+max_new_tokens: int = 128
 stop: Union[str, List[str]] = ()
 temperature: float = 1.0
 top_p: float = 1.0
@@ -140,7 +140,7 @@ class SglFunction:
 def run(
 self,
 *args,
-max_new_tokens: int =
+max_new_tokens: int = 128,
 stop: Union[str, List[str]] = (),
 temperature: float = 1.0,
 top_p: float = 1.0,
@@ -179,7 +179,7 @@ class SglFunction:
 self,
 batch_kwargs,
 *,
-max_new_tokens: int =
+max_new_tokens: int = 128,
 stop: Union[str, List[str]] = (),
 temperature: float = 1.0,
 top_p: float = 1.0,
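
Both `compiler.py` and `ir.py` raise the default `max_new_tokens` for `run()`, `run_batch()`, and `SglSamplingParams` to 128. A hedged sketch of a frontend program that simply relies on the new default instead of passing `max_new_tokens` explicitly; the function body and the commented backend setup are assumptions, not code from this release.

```python
import sglang as sgl


@sgl.function
def answer(s, question):
    s += "Q: " + question + "\n"
    s += "A: " + sgl.gen("answer")  # inherits the new max_new_tokens=128 default

# Assumes a local server started with `python -m sglang.launch_server ... --port 30000`:
# sgl.set_default_backend(sgl.RuntimeEndpoint("http://localhost:30000"))
# state = answer.run(question="What changed in 0.2.12?")
# print(state["answer"])
```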

{sglang-0.2.11 → sglang-0.2.12}/sglang/srt/constrained/fsm_cache.py

@@ -20,10 +20,20 @@ from sglang.srt.constrained.base_tool_cache import BaseToolCache
 
 
 class FSMCache(BaseToolCache):
-def __init__(
+def __init__(
+self,
+tokenizer_path,
+tokenizer_args_dict,
+enable=True,
+skip_tokenizer_init=False,
+):
 super().__init__(enable=enable)
 
-if
+if (
+skip_tokenizer_init
+or tokenizer_path.endswith(".json")
+or tokenizer_path.endswith(".model")
+):
 # Do not support TiktokenTokenizer or SentencePieceTokenizer
 return
 
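
The `FSMCache` constructor now takes an explicit `skip_tokenizer_init` flag and returns early for tokenizer paths ending in `.json` or `.model`. A minimal sketch of the updated call signature; the argument values below (model path and `tokenizer_args_dict` contents) are illustrative assumptions rather than values from the diff.

```python
from sglang.srt.constrained.fsm_cache import FSMCache

# Matches the new __init__ signature shown above; values are assumptions.
fsm_cache = FSMCache(
    tokenizer_path="meta-llama/Meta-Llama-3-8B-Instruct",
    tokenizer_args_dict={"trust_remote_code": False},
    enable=True,
    skip_tokenizer_init=False,  # True skips building the tokenizer-backed FSM cache
)
```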

sglang-0.2.12/sglang/srt/layers/activation.py (new file)

@@ -0,0 +1,33 @@
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+"""Fused operators for activation layers."""
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from flashinfer.activation import silu_and_mul
+from vllm.model_executor.custom_op import CustomOp
+
+
+class SiluAndMul(CustomOp):
+    def forward_native(self, x: torch.Tensor) -> torch.Tensor:
+        d = x.shape[-1] // 2
+        return F.silu(x[..., :d]) * x[..., d:]
+
+    def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
+        d = x.shape[-1] // 2
+        output_shape = x.shape[:-1] + (d,)
+        out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
+        silu_and_mul(x, out)
+        return out
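
The new `SiluAndMul` operator splits the last dimension in half, applies SiLU to the first half, and multiplies it elementwise by the second half; `forward_cuda` dispatches to the FlashInfer fused kernel, while `forward_native` is the pure-PyTorch reference. A small sketch of the reference semantics that runs without FlashInfer:

```python
import torch
import torch.nn.functional as F

# Reference semantics of SiluAndMul.forward_native from the new activation.py:
# split the last dim into two halves, apply SiLU to the first, multiply by the second.
x = torch.randn(2, 8)            # last dimension must be even (2 * d)
d = x.shape[-1] // 2
out = F.silu(x[..., :d]) * x[..., d:]
print(out.shape)                 # torch.Size([2, 4])
```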

sglang-0.2.11/sglang/srt/layers/token_attention.py → sglang-0.2.12/sglang/srt/layers/decode_attention.py

@@ -13,6 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License.
 """
 
+"""
+Memory-efficient attention for decoding.
+"""
+
 # Adapted from
 # https://github.com/ModelTC/lightllm/blob/f2a54f0912293f683bf1d1695fd12c4098a5bf82/lightllm/models/llama/triton_kernel/token_attention_nopad_att1.py
 # https://github.com/ModelTC/lightllm/blob/f2a54f0912293f683bf1d1695fd12c4098a5bf82/lightllm/models/llama/triton_kernel/token_attention_softmax_and_reducev.py
@@ -194,7 +198,7 @@ def _fwd_kernel_stage2(
 tl.store(out_ptrs, acc)
 
 
-def
+def _decode_att_m_fwd(
 q,
 k_buffer,
 att_out,
@@ -254,7 +258,7 @@ def _token_att_m_fwd(
 )
 
 
-def
+def _decode_softmax_reducev_fwd(
 logics,
 v_buffer,
 o,
@@ -292,7 +296,7 @@ def _token_softmax_reducev_fwd(
 )
 
 
-def
+def decode_attention_fwd(
 q,
 k_buffer,
 v_buffer,
@@ -312,7 +316,7 @@ def token_attention_fwd(
 (q.shape[-2], total_num_tokens), dtype=REDUCE_TORCH_TYPE, device="cuda"
 )
 
-
+_decode_att_m_fwd(
 q,
 k_buffer,
 att_m,
@@ -324,7 +328,7 @@ def token_attention_fwd(
 sm_scale,
 logit_cap,
 )
-
+_decode_softmax_reducev_fwd(
 att_m,
 v_buffer,
 o,
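
Based on the hunks above, the file move from `token_attention.py` to `decode_attention.py` renames the public entry point and its two helpers: `token_attention_fwd → decode_attention_fwd`, `_token_att_m_fwd → _decode_att_m_fwd`, and `_token_softmax_reducev_fwd → _decode_softmax_reducev_fwd`. Callers therefore need the new import path, sketched below.

```python
# New import path after the rename; the old sglang.srt.layers.token_attention module is removed.
from sglang.srt.layers.decode_attention import decode_attention_fwd
```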

{sglang-0.2.11 → sglang-0.2.12}/sglang/srt/layers/extend_attention.py

@@ -13,11 +13,16 @@ See the License for the specific language governing permissions and
 limitations under the License.
 """
 
+"""
+Memory-efficient attention for prefill.
+It supporst page size = 1 and prefill with KV cache (i.e. extend).
+"""
+
 import torch
 import triton
 import triton.language as tl
 
-from sglang.srt.layers.
+from sglang.srt.layers.prefill_attention import context_attention_fwd
 
 CUDA_CAPABILITY = torch.cuda.get_device_capability()
 