sglang 0.2.11__tar.gz → 0.2.13__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sglang-0.2.11/sglang.egg-info → sglang-0.2.13}/PKG-INFO +33 -16
- {sglang-0.2.11 → sglang-0.2.13}/README.md +32 -15
- {sglang-0.2.11 → sglang-0.2.13}/pyproject.toml +1 -1
- {sglang-0.2.11 → sglang-0.2.13}/sglang/api.py +7 -1
- {sglang-0.2.11 → sglang-0.2.13}/sglang/bench_latency.py +9 -6
- {sglang-0.2.11 → sglang-0.2.13}/sglang/bench_serving.py +46 -22
- {sglang-0.2.11 → sglang-0.2.13}/sglang/global_config.py +1 -1
- {sglang-0.2.11 → sglang-0.2.13}/sglang/lang/backend/runtime_endpoint.py +60 -49
- {sglang-0.2.11 → sglang-0.2.13}/sglang/lang/compiler.py +2 -2
- {sglang-0.2.11 → sglang-0.2.13}/sglang/lang/interpreter.py +4 -2
- {sglang-0.2.11 → sglang-0.2.13}/sglang/lang/ir.py +16 -7
- {sglang-0.2.11 → sglang-0.2.13}/sglang/srt/constrained/base_tool_cache.py +1 -1
- {sglang-0.2.11 → sglang-0.2.13}/sglang/srt/constrained/fsm_cache.py +12 -2
- {sglang-0.2.11 → sglang-0.2.13}/sglang/srt/constrained/jump_forward.py +13 -2
- sglang-0.2.13/sglang/srt/layers/activation.py +32 -0
- sglang-0.2.11/sglang/srt/layers/token_attention.py → sglang-0.2.13/sglang/srt/layers/decode_attention.py +9 -5
- {sglang-0.2.11 → sglang-0.2.13}/sglang/srt/layers/extend_attention.py +9 -2
- sglang-0.2.13/sglang/srt/layers/fused_moe/__init__.py +1 -0
- {sglang-0.2.11/sglang/srt/layers → sglang-0.2.13/sglang/srt/layers/fused_moe}/fused_moe.py +165 -108
- sglang-0.2.13/sglang/srt/layers/fused_moe/layer.py +587 -0
- sglang-0.2.13/sglang/srt/layers/layernorm.py +65 -0
- {sglang-0.2.11 → sglang-0.2.13}/sglang/srt/layers/logits_processor.py +7 -2
- sglang-0.2.13/sglang/srt/layers/pooler.py +50 -0
- sglang-0.2.11/sglang/srt/layers/context_flashattention_nopad.py → sglang-0.2.13/sglang/srt/layers/prefill_attention.py +5 -0
- {sglang-0.2.11 → sglang-0.2.13}/sglang/srt/layers/radix_attention.py +40 -16
- {sglang-0.2.11 → sglang-0.2.13}/sglang/srt/managers/detokenizer_manager.py +31 -9
- {sglang-0.2.11 → sglang-0.2.13}/sglang/srt/managers/io_struct.py +63 -0
- sglang-0.2.13/sglang/srt/managers/policy_scheduler.py +233 -0
- {sglang-0.2.11 → sglang-0.2.13}/sglang/srt/managers/schedule_batch.py +115 -97
- {sglang-0.2.11 → sglang-0.2.13}/sglang/srt/managers/tokenizer_manager.py +194 -112
- {sglang-0.2.11 → sglang-0.2.13}/sglang/srt/managers/tp_worker.py +290 -359
- sglang-0.2.11/sglang/srt/mem_cache/base_cache.py → sglang-0.2.13/sglang/srt/mem_cache/base_prefix_cache.py +9 -4
- sglang-0.2.13/sglang/srt/mem_cache/chunk_cache.py +83 -0
- {sglang-0.2.11 → sglang-0.2.13}/sglang/srt/mem_cache/memory_pool.py +2 -2
- {sglang-0.2.11 → sglang-0.2.13}/sglang/srt/mem_cache/radix_cache.py +74 -40
- {sglang-0.2.11 → sglang-0.2.13}/sglang/srt/model_executor/cuda_graph_runner.py +71 -25
- sglang-0.2.13/sglang/srt/model_executor/forward_batch_info.py +393 -0
- {sglang-0.2.11 → sglang-0.2.13}/sglang/srt/model_executor/model_runner.py +77 -57
- {sglang-0.2.11 → sglang-0.2.13}/sglang/srt/models/chatglm.py +2 -2
- {sglang-0.2.11 → sglang-0.2.13}/sglang/srt/models/commandr.py +1 -1
- {sglang-0.2.11 → sglang-0.2.13}/sglang/srt/models/deepseek.py +2 -2
- {sglang-0.2.11 → sglang-0.2.13}/sglang/srt/models/deepseek_v2.py +7 -6
- {sglang-0.2.11 → sglang-0.2.13}/sglang/srt/models/gemma.py +1 -1
- {sglang-0.2.11 → sglang-0.2.13}/sglang/srt/models/gemma2.py +11 -6
- sglang-0.2.13/sglang/srt/models/grok.py +408 -0
- {sglang-0.2.11 → sglang-0.2.13}/sglang/srt/models/internlm2.py +2 -7
- {sglang-0.2.11 → sglang-0.2.13}/sglang/srt/models/llama2.py +4 -4
- sglang-0.2.13/sglang/srt/models/llama_embedding.py +88 -0
- {sglang-0.2.11 → sglang-0.2.13}/sglang/srt/models/minicpm.py +2 -2
- sglang-0.2.13/sglang/srt/models/mixtral.py +380 -0
- {sglang-0.2.11 → sglang-0.2.13}/sglang/srt/models/mixtral_quant.py +1 -4
- {sglang-0.2.11 → sglang-0.2.13}/sglang/srt/models/qwen.py +2 -2
- {sglang-0.2.11 → sglang-0.2.13}/sglang/srt/models/qwen2.py +2 -2
- {sglang-0.2.11 → sglang-0.2.13}/sglang/srt/models/qwen2_moe.py +2 -13
- {sglang-0.2.11 → sglang-0.2.13}/sglang/srt/models/stablelm.py +1 -1
- {sglang-0.2.11 → sglang-0.2.13}/sglang/srt/openai_api/adapter.py +187 -48
- {sglang-0.2.11 → sglang-0.2.13}/sglang/srt/openai_api/protocol.py +37 -1
- sglang-0.2.13/sglang/srt/sampling/penaltylib/__init__.py +13 -0
- sglang-0.2.13/sglang/srt/sampling/penaltylib/orchestrator.py +357 -0
- sglang-0.2.13/sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +80 -0
- sglang-0.2.13/sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +105 -0
- sglang-0.2.13/sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +79 -0
- sglang-0.2.13/sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +83 -0
- {sglang-0.2.11 → sglang-0.2.13}/sglang/srt/sampling_params.py +31 -8
- {sglang-0.2.11 → sglang-0.2.13}/sglang/srt/server.py +91 -29
- {sglang-0.2.11 → sglang-0.2.13}/sglang/srt/server_args.py +32 -19
- {sglang-0.2.11 → sglang-0.2.13}/sglang/srt/utils.py +32 -15
- {sglang-0.2.11 → sglang-0.2.13}/sglang/test/run_eval.py +10 -1
- {sglang-0.2.11 → sglang-0.2.13}/sglang/test/runners.py +81 -73
- {sglang-0.2.11 → sglang-0.2.13}/sglang/test/simple_eval_humaneval.py +2 -8
- sglang-0.2.13/sglang/test/simple_eval_mgsm.py +203 -0
- sglang-0.2.13/sglang/test/srt/sampling/penaltylib/utils.py +337 -0
- sglang-0.2.13/sglang/test/test_layernorm.py +60 -0
- {sglang-0.2.11 → sglang-0.2.13}/sglang/test/test_programs.py +36 -7
- {sglang-0.2.11 → sglang-0.2.13}/sglang/test/test_utils.py +24 -2
- {sglang-0.2.11 → sglang-0.2.13}/sglang/utils.py +0 -1
- sglang-0.2.13/sglang/version.py +1 -0
- {sglang-0.2.11 → sglang-0.2.13/sglang.egg-info}/PKG-INFO +33 -16
- {sglang-0.2.11 → sglang-0.2.13}/sglang.egg-info/SOURCES.txt +20 -10
- sglang-0.2.11/sglang/srt/layers/linear.py +0 -884
- sglang-0.2.11/sglang/srt/layers/quantization/__init__.py +0 -64
- sglang-0.2.11/sglang/srt/layers/quantization/fp8.py +0 -677
- sglang-0.2.11/sglang/srt/managers/policy_scheduler.py +0 -85
- sglang-0.2.11/sglang/srt/mem_cache/chunk_cache.py +0 -60
- sglang-0.2.11/sglang/srt/model_executor/forward_batch_info.py +0 -256
- sglang-0.2.11/sglang/srt/model_loader/model_loader.py +0 -292
- sglang-0.2.11/sglang/srt/model_loader/utils.py +0 -275
- sglang-0.2.11/sglang/srt/models/grok.py +0 -754
- sglang-0.2.11/sglang/srt/models/mixtral.py +0 -578
- sglang-0.2.11/sglang/version.py +0 -1
- {sglang-0.2.11 → sglang-0.2.13}/LICENSE +0 -0
- {sglang-0.2.11 → sglang-0.2.13}/setup.cfg +0 -0
- {sglang-0.2.11 → sglang-0.2.13}/sglang/__init__.py +0 -0
- {sglang-0.2.11 → sglang-0.2.13}/sglang/check_env.py +0 -0
- {sglang-0.2.11 → sglang-0.2.13}/sglang/lang/__init__.py +0 -0
- {sglang-0.2.11 → sglang-0.2.13}/sglang/lang/backend/__init__.py +0 -0
- {sglang-0.2.11 → sglang-0.2.13}/sglang/lang/backend/anthropic.py +0 -0
- {sglang-0.2.11 → sglang-0.2.13}/sglang/lang/backend/base_backend.py +0 -0
- {sglang-0.2.11 → sglang-0.2.13}/sglang/lang/backend/litellm.py +0 -0
- {sglang-0.2.11 → sglang-0.2.13}/sglang/lang/backend/openai.py +0 -0
- {sglang-0.2.11 → sglang-0.2.13}/sglang/lang/backend/vertexai.py +0 -0
- {sglang-0.2.11 → sglang-0.2.13}/sglang/lang/chat_template.py +0 -0
- {sglang-0.2.11 → sglang-0.2.13}/sglang/lang/choices.py +0 -0
- {sglang-0.2.11 → sglang-0.2.13}/sglang/lang/tracer.py +0 -0
- {sglang-0.2.11 → sglang-0.2.13}/sglang/launch_server.py +0 -0
- {sglang-0.2.11 → sglang-0.2.13}/sglang/launch_server_llavavid.py +0 -0
- {sglang-0.2.11 → sglang-0.2.13}/sglang/srt/constrained/__init__.py +0 -0
- {sglang-0.2.11 → sglang-0.2.13}/sglang/srt/conversation.py +0 -0
- {sglang-0.2.11 → sglang-0.2.13}/sglang/srt/hf_transformers_utils.py +0 -0
- {sglang-0.2.11 → sglang-0.2.13}/sglang/srt/managers/controller_multi.py +0 -0
- {sglang-0.2.11 → sglang-0.2.13}/sglang/srt/managers/controller_single.py +0 -0
- {sglang-0.2.11 → sglang-0.2.13}/sglang/srt/mem_cache/flush_cache.py +0 -0
- {sglang-0.2.11 → sglang-0.2.13}/sglang/srt/mm_utils.py +0 -0
- {sglang-0.2.11 → sglang-0.2.13}/sglang/srt/model_config.py +0 -0
- {sglang-0.2.11 → sglang-0.2.13}/sglang/srt/models/dbrx.py +0 -0
- {sglang-0.2.11 → sglang-0.2.13}/sglang/srt/models/gpt_bigcode.py +0 -0
- {sglang-0.2.11 → sglang-0.2.13}/sglang/srt/models/llama_classification.py +0 -0
- {sglang-0.2.11 → sglang-0.2.13}/sglang/srt/models/llava.py +0 -0
- {sglang-0.2.11 → sglang-0.2.13}/sglang/srt/models/llavavid.py +0 -0
- {sglang-0.2.11 → sglang-0.2.13}/sglang/srt/models/mistral.py +0 -0
- {sglang-0.2.11 → sglang-0.2.13}/sglang/srt/models/yivl.py +0 -0
- {sglang-0.2.11 → sglang-0.2.13}/sglang/test/simple_eval_common.py +0 -0
- {sglang-0.2.11 → sglang-0.2.13}/sglang/test/simple_eval_gpqa.py +0 -0
- {sglang-0.2.11 → sglang-0.2.13}/sglang/test/simple_eval_math.py +0 -0
- {sglang-0.2.11 → sglang-0.2.13}/sglang/test/simple_eval_mmlu.py +0 -0
- {sglang-0.2.11 → sglang-0.2.13}/sglang.egg-info/dependency_links.txt +0 -0
- {sglang-0.2.11 → sglang-0.2.13}/sglang.egg-info/requires.txt +0 -0
- {sglang-0.2.11 → sglang-0.2.13}/sglang.egg-info/top_level.txt +0 -0
{sglang-0.2.11/sglang.egg-info → sglang-0.2.13}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.2.11
+Version: 0.2.13
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
                                  Version 2.0, January 2004

@@ -308,7 +308,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
 ### Method 2: From source
 ```
 # Use the last release branch
-git clone -b v0.2.11 https://github.com/sgl-project/sglang.git
+git clone -b v0.2.13 https://github.com/sgl-project/sglang.git
 cd sglang

 pip install --upgrade pip

@@ -329,11 +329,19 @@ docker run --gpus all \
     --env "HF_TOKEN=<secret>" \
     --ipc=host \
     lmsysorg/sglang:latest \
-    python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --host 0.0.0.0 --port 30000
+    python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --host 0.0.0.0 --port 30000
 ```

+### Method 4: Using docker compose
+
+> This method is recommended if you plan to serve it as a service.
+> A better approach is to use the [k8s-sglang-service.yaml](./docker/k8s-sglang-service.yaml).
+
+1. Copy the [compose.yml](./docker/compose.yaml) to your local machine
+2. Execute the command `docker compose up -d` in your terminal.
+
 ### Common Notes
--
+- [FlashInfer](https://github.com/flashinfer-ai/flashinfer) is currently one of the dependencies that must be installed for SGLang. If you are using NVIDIA GPU devices below sm80, such as T4, you can't use SGLang for the time being. We expect to resolve this issue soon, so please stay tuned. If you encounter any FlashInfer-related issues on sm80+ devices (e.g., A100, L40S, H100), consider using Triton's kernel by `--disable-flashinfer --disable-flashinfer-sampling` and raise a issue.
 - If you only need to use the OpenAI backend, you can avoid installing other dependencies by using `pip install "sglang[openai]"`.

 ## Backend: SGLang Runtime (SRT)

@@ -392,23 +400,23 @@ print(response)
 It supports streaming, vision, and most features of the Chat/Completions/Models/Batch endpoints specified by the [OpenAI API Reference](https://platform.openai.com/docs/api-reference/).

 ### Additional Server Arguments
-- Add `--tp 2` to enable tensor parallelism. If it
+- Add `--tp 2` to enable multi-GPU tensor parallelism. If it reports the error "peer access is not supported between these two devices", add `--enable-p2p-check` to the server launch command.
 ```
 python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --tp 2
 ```
-- Add `--dp 2` to enable data parallelism. It can also be used together with
+- Add `--dp 2` to enable multi-GPU data parallelism. It can also be used together with tensor parallelism. Data parallelism is better for throughput if there is enough memory.
 ```
 python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --dp 2 --tp 2
 ```
-- If you see out-of-memory errors during serving,
+- If you see out-of-memory errors during serving, try to reduce the memory usage of the KV cache pool by setting a smaller value of `--mem-fraction-static`. The default value is `0.9`.
 ```
 python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --mem-fraction-static 0.7
 ```
--
+- See [hyperparameter_tuning.md](docs/en/hyperparameter_tuning.md) on tuning hyperparameters for better performance.
+- If you see out-of-memory errors during prefill for long prompts, try to set a smaller chunked prefill size.
 ```
-python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3
+python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --chunked-prefill-size 4096
 ```
-- See [hyperparameter_tuning.md](docs/en/hyperparameter_tuning.md) on tuning hyperparameters for better performance.
 - Add `--nnodes 2` to run tensor parallelism on multiple nodes. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-0` be the hostname of the first node and `50000` be an available port.
 ```
 # Node 0

@@ -418,13 +426,13 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --tp 4 --nccl-init sgl-dev-0:50000 --nnodes 2 --node-rank 1
 ```
 - If the model does not have a template in the Hugging Face tokenizer, you can specify a [custom chat template](docs/en/custom_chat_template.md).
-- To enable fp8 quantization, you can add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
 - To enable experimental torch.compile support, you can add `--enable-torch-compile`. It accelerates small models on small batch sizes.
-
+- To enable fp8 quantization, you can add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
+
 ### Supported Models

 - Llama / Llama 2 / Llama 3 / Llama 3.1
-- Mistral / Mixtral
+- Mistral / Mixtral / Mistral NeMo
 - Gemma / Gemma 2
 - Qwen / Qwen 2 / Qwen 2 MoE
 - DeepSeek / DeepSeek 2

@@ -442,11 +450,20 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 - Grok
 - ChatGLM
 - InternLM 2
-- Mistral NeMo

 Instructions for supporting a new model are [here](https://github.com/sgl-project/sglang/blob/main/docs/en/model_support.md).

-
+#### Use Models From ModelScope
+To use model from [ModelScope](https://www.modelscope.cn), setting environment variable SGLANG_USE_MODELSCOPE.
+```
+export SGLANG_USE_MODELSCOPE=true
+```
+Launch [Qwen2-7B-Instruct](https://www.modelscope.cn/models/qwen/qwen2-7b-instruct) Server
+```
+SGLANG_USE_MODELSCOPE=true python -m sglang.launch_server --model-path qwen/Qwen2-7B-Instruct --port 30000
+```
+
+#### Run Llama 3.1 405B

 ```bash
 ## Run 405B (fp8) on a single node

@@ -474,7 +491,7 @@ GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/
 ```

 ## Frontend: Structured Generation Language (SGLang)
-The frontend language can be used with local models or API models.
+The frontend language can be used with local models or API models. It is an alternative to the OpenAI API. You may found it easier to use for complex prompting workflow.

 ### Quick Start
 The example below shows how to use sglang to answer a mulit-turn question.
{sglang-0.2.11 → sglang-0.2.13}/README.md

The README.md changes are identical in content to the README portion of the PKG-INFO diff above (PKG-INFO embeds the README as the package's long description); only the line numbers differ. The hunks are:

@@ -55,7 +55,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
  (Method 2: From source — release branch bumped from v0.2.11 to v0.2.13)
@@ -76,11 +76,19 @@ docker run --gpus all \
  (Docker example switched to Meta-Llama-3.1-8B-Instruct; "Method 4: Using docker compose" added; FlashInfer note expanded under "Common Notes")
@@ -139,23 +147,23 @@ print(response)
  (Additional Server Arguments: expanded notes for --tp, --dp, and --mem-fraction-static; new --chunked-prefill-size example)
@@ -165,13 +173,13 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
  (fp8 note moved after the torch.compile note; "Mistral / Mixtral" entry now also lists Mistral NeMo)
@@ -189,11 +197,20 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
  (standalone "Mistral NeMo" entry removed; "Use Models From ModelScope" subsection and "Run Llama 3.1 405B" heading added)
@@ -221,7 +238,7 @@ GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/
  (Frontend intro sentence expanded)
{sglang-0.2.11 → sglang-0.2.13}/pyproject.toml

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

 [project]
 name = "sglang"
-version = "0.2.11"
+version = "0.2.13"
 description = "SGLang is yet another fast serving framework for large language models and vision language models."
 readme = "README.md"
 requires-python = ">=3.8"
{sglang-0.2.11 → sglang-0.2.13}/sglang/api.py

@@ -62,6 +62,7 @@ def gen(
     name: Optional[str] = None,
     max_tokens: Optional[int] = None,
     stop: Optional[Union[str, List[str]]] = None,
+    stop_token_ids: Optional[List[int]] = None,
     temperature: Optional[float] = None,
     top_p: Optional[float] = None,
     top_k: Optional[int] = None,
@@ -72,7 +73,7 @@ def gen(
     logprob_start_len: Optional[int] = None,
     top_logprobs_num: Optional[int] = None,
     return_text_in_logprobs: Optional[bool] = None,
-    dtype: Optional[type] = None,
+    dtype: Optional[Union[type, str]] = None,
     choices: Optional[List[str]] = None,
     choices_method: Optional[ChoicesSamplingMethod] = None,
     regex: Optional[str] = None,
@@ -98,6 +99,7 @@ def gen(
         name,
         max_tokens,
         stop,
+        stop_token_ids,
         temperature,
         top_p,
         top_k,
@@ -117,6 +119,7 @@ def gen_int(
     name: Optional[str] = None,
     max_tokens: Optional[int] = None,
     stop: Optional[Union[str, List[str]]] = None,
+    stop_token_ids: Optional[List[int]] = None,
     temperature: Optional[float] = None,
     top_p: Optional[float] = None,
     top_k: Optional[int] = None,
@@ -132,6 +135,7 @@ def gen_int(
         name,
         max_tokens,
         stop,
+        stop_token_ids,
         temperature,
         top_p,
         top_k,
@@ -151,6 +155,7 @@ def gen_string(
     name: Optional[str] = None,
     max_tokens: Optional[int] = None,
     stop: Optional[Union[str, List[str]]] = None,
+    stop_token_ids: Optional[List[int]] = None,
     temperature: Optional[float] = None,
     top_p: Optional[float] = None,
     top_k: Optional[int] = None,
@@ -166,6 +171,7 @@ def gen_string(
         name,
         max_tokens,
         stop,
+        stop_token_ids,
         temperature,
         top_p,
         top_k,
{sglang-0.2.11 → sglang-0.2.13}/sglang/bench_latency.py

@@ -64,7 +64,7 @@ class BenchArgs:
     run_name: str = "before"
     batch_size: Tuple[int] = (1,)
     input_len: Tuple[int] = (1024,)
-    output_len: Tuple[int] = (
+    output_len: Tuple[int] = (16,)
     result_filename: str = ""
     correctness_test: bool = False
     # This is only used for correctness test
@@ -152,7 +152,7 @@ def prepare_inputs_for_correctness_test(bench_args, tokenizer):
         req = Req(rid=i, origin_input_text=prompts[i], origin_input_ids=tmp_input_ids)
         req.prefix_indices = []
         req.sampling_params = sampling_params
-        req.
+        req.fill_ids = req.origin_input_ids
         reqs.append(req)

     return input_ids, reqs
@@ -163,7 +163,7 @@ def prepare_extend_inputs_for_correctness_test(
 ):
     for i in range(len(reqs)):
         req = reqs[i]
-        req.
+        req.fill_ids += input_ids[i][bench_args.cut_len :]
         req.prefix_indices = model_runner.req_to_token_pool.req_to_token[
             i, : bench_args.cut_len
         ]
@@ -182,7 +182,7 @@ def prepare_synthetic_inputs_for_latency_test(batch_size, input_len):
         req = Req(rid=i, origin_input_text="", origin_input_ids=list(input_ids[i]))
         req.prefix_indices = []
         req.sampling_params = sampling_params
-        req.
+        req.fill_ids = req.origin_input_ids
         reqs.append(req)

     return reqs
@@ -195,7 +195,7 @@ def extend(reqs, model_runner):
         token_to_kv_pool=model_runner.token_to_kv_pool,
         tree_cache=None,
     )
-    batch.prepare_for_extend(model_runner.model_config.vocab_size
+    batch.prepare_for_extend(model_runner.model_config.vocab_size)
     output = model_runner.forward(batch, ForwardMode.EXTEND)
     next_token_ids = batch.sample(output.next_token_logits)
     return next_token_ids, output.next_token_logits, batch
@@ -221,6 +221,7 @@ def correctness_test(

     # Prepare inputs
     input_ids, reqs = prepare_inputs_for_correctness_test(bench_args, tokenizer)
+    rank_print(f"{input_ids=}")

     if bench_args.cut_len > 0:
         # Prefill
@@ -238,7 +239,7 @@ def correctness_test(

     # Decode
     output_ids = [input_ids[i] + [next_token_ids[i]] for i in range(len(input_ids))]
-    for _ in range(bench_args.output_len):
+    for _ in range(bench_args.output_len[0]):
         next_token_ids, _ = decode(next_token_ids, batch, model_runner)
         for i in range(len(reqs)):
             output_ids[i].append(next_token_ids[i])
@@ -332,6 +333,7 @@ def latency_test(
     )

     # Warm up
+    rank_print("Warmup ...")
     latency_test_run_once(
         bench_args.run_name,
         model_runner,
@@ -341,6 +343,7 @@ def latency_test(
         bench_args.input_len[0],
         4,  # shorter decoding to speed up the warmup
     )
+    rank_print("Benchmark ...")

     # Run the sweep
     result_list = []
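
In the bench_latency.py hunks above, `BenchArgs.output_len` becomes a one-element tuple (default `(16,)`) and the correctness test indexes it. A small standalone sketch of the changed shape, using only the field names shown in the diff (other fields omitted):

```python
from dataclasses import dataclass
from typing import Tuple

# Mirrors the fields shown in the BenchArgs hunk above; the rest of the
# real dataclass is omitted here.
@dataclass
class BenchArgs:
    batch_size: Tuple[int] = (1,)
    input_len: Tuple[int] = (1024,)
    output_len: Tuple[int] = (16,)

bench_args = BenchArgs()
# The correctness test now decodes output_len[0] tokens, matching
# `for _ in range(bench_args.output_len[0])` in the diff.
for _ in range(bench_args.output_len[0]):
    pass  # one decode step per generated token
```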
{sglang-0.2.11 → sglang-0.2.13}/sglang/bench_serving.py

@@ -24,7 +24,7 @@ import warnings
 from argparse import ArgumentParser
 from dataclasses import dataclass, field
 from datetime import datetime
-from typing import AsyncGenerator, List, Optional, Tuple, Union
+from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple, Union

 import aiohttp
 import numpy as np
@@ -39,6 +39,8 @@ from transformers import (

 AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60)

+global args
+

 @dataclass
 class RequestFuncInput:
@@ -47,6 +49,7 @@ class RequestFuncInput:
     prompt_len: int
     output_len: int
     model: str
+    extra_request_body: Dict[str, Any]


 @dataclass
@@ -84,6 +87,7 @@ async def async_request_trt_llm(
         "stream": True,
         "min_length": request_func_input.output_len,
         "end_id": 1048576,
+        **request_func_input.extra_request_body,
     }
     if args.disable_ignore_eos:
         del payload["min_length"]
@@ -154,6 +158,7 @@ async def async_request_openai_completions(
         "max_tokens": request_func_input.output_len,
         "stream": not args.disable_stream,
         "ignore_eos": not args.disable_ignore_eos,
+        **request_func_input.extra_request_body,
     }
     headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}

@@ -192,7 +197,8 @@ async def async_request_openai_completions(
                         output.ttft = ttft

                     # Decoding phase
-
+                    else:
+                        output.itl.append(timestamp - most_recent_timestamp)

                     most_recent_timestamp = timestamp
                     generated_text += data["choices"][0]["text"]
@@ -542,6 +548,7 @@ async def benchmark(
     request_rate: float,
     disable_tqdm: bool,
     enable_multi: bool,
+    extra_request_body: Dict[str, Any],
 ):
     if backend in ASYNC_REQUEST_FUNCS:
         request_func = ASYNC_REQUEST_FUNCS[backend]
@@ -556,6 +563,7 @@ async def benchmark(
         api_url=api_url,
         prompt_len=test_prompt_len,
         output_len=test_output_len,
+        extra_request_body=extra_request_body,
     )
     test_output = await request_func(request_func_input=test_input)
     if not test_output.success:
@@ -578,6 +586,7 @@ async def benchmark(
             api_url=api_url,
             prompt_len=prompt_len,
             output_len=output_len,
+            extra_request_body=extra_request_body,
         )
         tasks.append(
             asyncio.create_task(
@@ -660,19 +669,20 @@ async def benchmark(
             "backend": args.backend,
             "dataset_name": args.dataset_name,
             "request_rate": request_rate,
-            "
-            "
-            "
-            "
-            "
-            "
-            "
-            "
+            "total_input_tokens": metrics.total_input,
+            "total_output_tokens": metrics.total_output,
+            "total_output_tokens_retokenized": metrics.total_output_retokenized,
+            "mean_e2e_latency_ms": metrics.mean_e2e_latency_ms,
+            "median_e2e_latency_ms": metrics.median_e2e_latency_ms,
+            "median_ttft_ms": metrics.median_ttft_ms,
+            "median_itl_ms": metrics.median_itl_ms,
+            "output_throughput": metrics.output_throughput,
             "sharegpt_output_len": args.sharegpt_output_len,
             "random_input_len": args.random_input_len,
             "random_output_len": args.random_output_len,
             "random_range_ratio": args.random_range_ratio,
-            "
+            "duration": benchmark_duration,
+            "completed": metrics.completed,
         }
     else:
         print(f"Error running benchmark for request rate: {request_rate}")
@@ -742,10 +752,18 @@ def check_chat_template(model_path):
         return False


-def
+def run_benchmark(args_: argparse.Namespace):
+    global args
+    args = args_
+
+    set_ulimit()
     random.seed(args.seed)
     np.random.seed(args.seed)

+    extra_request_body = {}
+    if args.extra_request_body:
+        extra_request_body = json.loads(args.extra_request_body)
+
     if args.port is None:
         args.port = {
             "sglang": 30000,
@@ -838,10 +856,11 @@ def fire(args: argparse.Namespace):
                 request_rate=rate,
                 disable_tqdm=args.disable_tqdm,
                 enable_multi=args.multi,
+                extra_request_body=extra_request_body,
             )
         )
     else:
-        asyncio.run(
+        return asyncio.run(
             benchmark(
                 backend=backend,
                 api_url=api_url,
@@ -851,6 +870,7 @@ def fire(args: argparse.Namespace):
                 request_rate=args.request_rate,
                 disable_tqdm=args.disable_tqdm,
                 enable_multi=args.multi,
+                extra_request_body=extra_request_body,
             )
         )

@@ -949,11 +969,6 @@ if __name__ == "__main__":
         "Otherwise, we use Poisson process to synthesize the request arrival times. Default is 128.0.",
     )
     parser.add_argument("--seed", type=int, default=0, help="Default is 0.")
-    parser.add_argument(
-        "--disable-tqdm",
-        action="store_true",
-        help="Specify to disable tqdm progress bar.",
-    )
     parser.add_argument(
         "--multi",
         action="store_true",
@@ -966,6 +981,11 @@ if __name__ == "__main__":
         help="Range of request rates in the format start,stop,step. Default is 2,34,2. It also supports a list of request rates, requiring the parameters to not equal three.",
     )
     parser.add_argument("--output-file", type=str, help="Output JSONL file name.")
+    parser.add_argument(
+        "--disable-tqdm",
+        action="store_true",
+        help="Specify to disable tqdm progress bar.",
+    )
     parser.add_argument(
         "--disable-stream",
         action="store_true",
@@ -976,8 +996,12 @@ if __name__ == "__main__":
         action="store_true",
         help="Disable ignoring EOS.",
     )
-
-
-
+    parser.add_argument(
+        "--extra-request-body",
+        metavar='{"key1": "value1", "key2": "value2"}',
+        type=str,
+        help="Append given JSON object to the request payload. You can use this to specify"
+        "additional generate params like sampling params.",
+    )
     args = parser.parse_args()
-
+    run_benchmark(args)
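
The bench_serving.py hunks above add an `--extra-request-body` flag whose JSON string is parsed once in `run_benchmark` and then merged into every request payload via `**request_func_input.extra_request_body`. Below is a standalone sketch of that merge; the payload fields and the sampling values are illustrative, not taken verbatim from the script.

```python
import json

# The CLI string is parsed once (json.loads) and reused for every request.
extra_request_body = json.loads('{"temperature": 0.0, "top_p": 1.0}')  # hypothetical sampling params

payload = {
    "model": "default",   # placeholder model name
    "prompt": "Hello",
    "max_tokens": 32,
    "stream": True,
    # Merged last, so user-supplied keys can add or override generate params,
    # mirroring the `**request_func_input.extra_request_body` lines in the diff.
    **extra_request_body,
}
print(json.dumps(payload, indent=2))
```

On the command line this would look something like `python3 -m sglang.bench_serving --backend sglang --extra-request-body '{"temperature": 0.0}'`; only `--extra-request-body` itself is confirmed by this diff, the rest of the invocation is assumed.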
{sglang-0.2.11 → sglang-0.2.13}/sglang/global_config.py

@@ -27,7 +27,7 @@ class GlobalConfig:
         # Runtime constants: others
         self.num_continue_decode_steps = 10
         self.retract_decode_steps = 20
-        self.flashinfer_workspace_size =
+        self.flashinfer_workspace_size = 384 * 1024 * 1024

         # Output tokenization configs
         self.skip_special_tokens_in_output = True