sglang 0.2.5__tar.gz → 0.2.7__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sglang-0.2.5/sglang.egg-info → sglang-0.2.7}/PKG-INFO +40 -12
- {sglang-0.2.5 → sglang-0.2.7}/README.md +39 -11
- {sglang-0.2.5 → sglang-0.2.7}/pyproject.toml +1 -1
- {sglang-0.2.5 → sglang-0.2.7}/sglang/__init__.py +33 -26
- {sglang-0.2.5 → sglang-0.2.7}/sglang/api.py +9 -1
- {sglang-0.2.5 → sglang-0.2.7}/sglang/bench_latency.py +2 -2
- {sglang-0.2.5 → sglang-0.2.7}/sglang/bench_serving.py +10 -1
- {sglang-0.2.5 → sglang-0.2.7}/sglang/check_env.py +1 -1
- {sglang-0.2.5 → sglang-0.2.7}/sglang/lang/backend/litellm.py +1 -1
- {sglang-0.2.5 → sglang-0.2.7}/sglang/lang/backend/openai.py +1 -1
- {sglang-0.2.5 → sglang-0.2.7}/sglang/lang/backend/runtime_endpoint.py +4 -4
- {sglang-0.2.5 → sglang-0.2.7}/sglang/lang/interpreter.py +24 -9
- {sglang-0.2.5 → sglang-0.2.7}/sglang/lang/ir.py +1 -1
- {sglang-0.2.5 → sglang-0.2.7}/sglang/srt/constrained/__init__.py +15 -0
- {sglang-0.2.5 → sglang-0.2.7}/sglang/srt/constrained/base_cache.py +15 -0
- sglang-0.2.7/sglang/srt/constrained/fsm_cache.py +66 -0
- {sglang-0.2.5 → sglang-0.2.7}/sglang/srt/constrained/jump_forward.py +15 -0
- {sglang-0.2.5 → sglang-0.2.7}/sglang/srt/conversation.py +26 -0
- {sglang-0.2.5 → sglang-0.2.7}/sglang/srt/hf_transformers_utils.py +18 -1
- {sglang-0.2.5 → sglang-0.2.7}/sglang/srt/layers/context_flashattention_nopad.py +15 -0
- {sglang-0.2.5 → sglang-0.2.7}/sglang/srt/layers/extend_attention.py +15 -0
- {sglang-0.2.5 → sglang-0.2.7}/sglang/srt/layers/fused_moe.py +15 -0
- {sglang-0.2.5 → sglang-0.2.7}/sglang/srt/layers/linear.py +15 -0
- {sglang-0.2.5 → sglang-0.2.7}/sglang/srt/layers/logits_processor.py +109 -72
- {sglang-0.2.5 → sglang-0.2.7}/sglang/srt/layers/quantization/__init__.py +15 -0
- {sglang-0.2.5 → sglang-0.2.7}/sglang/srt/layers/quantization/fp8.py +15 -0
- {sglang-0.2.5 → sglang-0.2.7}/sglang/srt/layers/radix_attention.py +21 -3
- {sglang-0.2.5 → sglang-0.2.7}/sglang/srt/layers/token_attention.py +16 -1
- sglang-0.2.5/sglang/srt/managers/controller/manager_multi.py → sglang-0.2.7/sglang/srt/managers/controller_multi.py +17 -2
- sglang-0.2.5/sglang/srt/managers/controller/manager_single.py → sglang-0.2.7/sglang/srt/managers/controller_single.py +17 -2
- {sglang-0.2.5 → sglang-0.2.7}/sglang/srt/managers/detokenizer_manager.py +16 -1
- {sglang-0.2.5 → sglang-0.2.7}/sglang/srt/managers/io_struct.py +38 -5
- sglang-0.2.5/sglang/srt/managers/controller/schedule_heuristic.py → sglang-0.2.7/sglang/srt/managers/policy_scheduler.py +37 -22
- sglang-0.2.5/sglang/srt/managers/controller/infer_batch.py → sglang-0.2.7/sglang/srt/managers/schedule_batch.py +85 -25
- {sglang-0.2.5 → sglang-0.2.7}/sglang/srt/managers/tokenizer_manager.py +99 -57
- {sglang-0.2.5/sglang/srt/managers/controller → sglang-0.2.7/sglang/srt/managers}/tp_worker.py +177 -81
- sglang-0.2.7/sglang/srt/mem_cache/flush_cache.py +33 -0
- {sglang-0.2.5/sglang/srt → sglang-0.2.7/sglang/srt/mem_cache}/memory_pool.py +16 -1
- {sglang-0.2.5/sglang/srt/managers/controller → sglang-0.2.7/sglang/srt/mem_cache}/radix_cache.py +15 -0
- {sglang-0.2.5 → sglang-0.2.7}/sglang/srt/mm_utils.py +15 -0
- {sglang-0.2.5 → sglang-0.2.7}/sglang/srt/model_config.py +20 -0
- {sglang-0.2.5/sglang/srt/managers/controller → sglang-0.2.7/sglang/srt/model_executor}/cuda_graph_runner.py +42 -18
- {sglang-0.2.5/sglang/srt/managers/controller → sglang-0.2.7/sglang/srt/model_executor}/model_runner.py +51 -16
- {sglang-0.2.5 → sglang-0.2.7}/sglang/srt/model_loader/model_loader.py +15 -0
- {sglang-0.2.5 → sglang-0.2.7}/sglang/srt/model_loader/utils.py +16 -1
- {sglang-0.2.5 → sglang-0.2.7}/sglang/srt/models/chatglm.py +16 -1
- {sglang-0.2.5 → sglang-0.2.7}/sglang/srt/models/commandr.py +16 -1
- {sglang-0.2.5 → sglang-0.2.7}/sglang/srt/models/dbrx.py +16 -1
- {sglang-0.2.5 → sglang-0.2.7}/sglang/srt/models/deepseek.py +16 -1
- sglang-0.2.7/sglang/srt/models/deepseek_v2.py +532 -0
- {sglang-0.2.5 → sglang-0.2.7}/sglang/srt/models/gemma.py +16 -1
- {sglang-0.2.5 → sglang-0.2.7}/sglang/srt/models/gemma2.py +16 -1
- {sglang-0.2.5 → sglang-0.2.7}/sglang/srt/models/gpt_bigcode.py +16 -1
- {sglang-0.2.5 → sglang-0.2.7}/sglang/srt/models/grok.py +16 -1
- {sglang-0.2.5 → sglang-0.2.7}/sglang/srt/models/internlm2.py +16 -1
- {sglang-0.2.5 → sglang-0.2.7}/sglang/srt/models/llama2.py +16 -1
- {sglang-0.2.5 → sglang-0.2.7}/sglang/srt/models/llama_classification.py +19 -4
- {sglang-0.2.5 → sglang-0.2.7}/sglang/srt/models/llava.py +17 -2
- {sglang-0.2.5 → sglang-0.2.7}/sglang/srt/models/llavavid.py +17 -2
- {sglang-0.2.5 → sglang-0.2.7}/sglang/srt/models/minicpm.py +16 -1
- sglang-0.2.7/sglang/srt/models/mistral.py +26 -0
- {sglang-0.2.5 → sglang-0.2.7}/sglang/srt/models/mixtral.py +16 -1
- {sglang-0.2.5 → sglang-0.2.7}/sglang/srt/models/mixtral_quant.py +16 -1
- {sglang-0.2.5 → sglang-0.2.7}/sglang/srt/models/qwen.py +16 -1
- {sglang-0.2.5 → sglang-0.2.7}/sglang/srt/models/qwen2.py +16 -1
- {sglang-0.2.5 → sglang-0.2.7}/sglang/srt/models/qwen2_moe.py +16 -1
- {sglang-0.2.5 → sglang-0.2.7}/sglang/srt/models/stablelm.py +16 -1
- {sglang-0.2.5 → sglang-0.2.7}/sglang/srt/models/yivl.py +15 -0
- sglang-0.2.7/sglang/srt/openai_api/adapter.py +822 -0
- {sglang-0.2.5 → sglang-0.2.7}/sglang/srt/openai_api/protocol.py +65 -1
- {sglang-0.2.5 → sglang-0.2.7}/sglang/srt/sampling_params.py +20 -4
- {sglang-0.2.5 → sglang-0.2.7}/sglang/srt/server.py +90 -37
- {sglang-0.2.5 → sglang-0.2.7}/sglang/srt/server_args.py +76 -17
- {sglang-0.2.5 → sglang-0.2.7}/sglang/srt/utils.py +15 -0
- {sglang-0.2.5 → sglang-0.2.7}/sglang/test/test_programs.py +5 -1
- {sglang-0.2.5 → sglang-0.2.7}/sglang/utils.py +22 -0
- sglang-0.2.7/sglang/version.py +1 -0
- {sglang-0.2.5 → sglang-0.2.7/sglang.egg-info}/PKG-INFO +40 -12
- {sglang-0.2.5 → sglang-0.2.7}/sglang.egg-info/SOURCES.txt +11 -10
- sglang-0.2.5/sglang/srt/constrained/fsm_cache.py +0 -31
- sglang-0.2.5/sglang/srt/flush_cache.py +0 -18
- sglang-0.2.5/sglang/srt/models/mistral.py +0 -11
- sglang-0.2.5/sglang/srt/openai_api/adapter.py +0 -437
- sglang-0.2.5/sglang/version.py +0 -1
- {sglang-0.2.5 → sglang-0.2.7}/LICENSE +0 -0
- {sglang-0.2.5 → sglang-0.2.7}/setup.cfg +0 -0
- {sglang-0.2.5 → sglang-0.2.7}/sglang/global_config.py +0 -0
- {sglang-0.2.5 → sglang-0.2.7}/sglang/lang/__init__.py +0 -0
- {sglang-0.2.5 → sglang-0.2.7}/sglang/lang/backend/__init__.py +0 -0
- {sglang-0.2.5 → sglang-0.2.7}/sglang/lang/backend/anthropic.py +0 -0
- {sglang-0.2.5 → sglang-0.2.7}/sglang/lang/backend/base_backend.py +0 -0
- {sglang-0.2.5 → sglang-0.2.7}/sglang/lang/backend/vertexai.py +0 -0
- {sglang-0.2.5 → sglang-0.2.7}/sglang/lang/chat_template.py +0 -0
- {sglang-0.2.5 → sglang-0.2.7}/sglang/lang/compiler.py +0 -0
- {sglang-0.2.5 → sglang-0.2.7}/sglang/lang/tracer.py +0 -0
- {sglang-0.2.5 → sglang-0.2.7}/sglang/launch_server.py +0 -0
- {sglang-0.2.5 → sglang-0.2.7}/sglang/launch_server_llavavid.py +0 -0
- {sglang-0.2.5 → sglang-0.2.7}/sglang/test/test_conversation.py +0 -0
- {sglang-0.2.5 → sglang-0.2.7}/sglang/test/test_openai_protocol.py +0 -0
- {sglang-0.2.5 → sglang-0.2.7}/sglang/test/test_utils.py +0 -0
- {sglang-0.2.5 → sglang-0.2.7}/sglang.egg-info/dependency_links.txt +0 -0
- {sglang-0.2.5 → sglang-0.2.7}/sglang.egg-info/requires.txt +0 -0
- {sglang-0.2.5 → sglang-0.2.7}/sglang.egg-info/top_level.txt +0 -0

{sglang-0.2.5/sglang.egg-info → sglang-0.2.7}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.2.5
+Version: 0.2.7
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
                                  Version 2.0, January 2004

@@ -245,11 +245,18 @@ Requires-Dist: sglang[litellm]; extra == "all"
 
 <div align="center">
 <img src="https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png" alt="logo" width="400"></img>
+
+[](https://pypi.org/project/sglang)
+
+[](https://github.com/sgl-project/sglang/tree/main/LICENSE)
+[](https://github.com/sgl-project/sglang/issues)
+[](https://github.com/sgl-project/sglang/issues)
+
 </div>
 
 --------------------------------------------------------------------------------
 
-| [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Paper**](https://arxiv.org/abs/2312.07104) |
+| [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Paper**](https://arxiv.org/abs/2312.07104) | [**Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2ngly9muu-t37XiH87qvD~6rVBTkTEHw) |
 
 SGLang is a fast serving framework for large language models and vision language models.
 It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.

@@ -292,7 +299,8 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
 
 ### Method 2: From source
 ```
-git clone https://github.com/sgl-project/sglang.git
+# Use the stable release branch
+git clone -b release https://github.com/sgl-project/sglang.git
 cd sglang
 
 pip install --upgrade pip

@@ -341,7 +349,7 @@ curl http://localhost:30000/generate \
   }
 }'
 ```
-Learn more about the argument format [here](docs/sampling_params.md).
+Learn more about the argument format [here](docs/en/sampling_params.md).
 
 ### OpenAI Compatible API
 In addition, the server supports OpenAI-compatible APIs.

@@ -388,7 +396,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 ```
 python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --mem-fraction-static 0.7
 ```
-- See [hyperparameter_tuning.md](docs/hyperparameter_tuning.md) on tuning hyperparameters for better performance.
+- See [hyperparameter_tuning.md](docs/en/hyperparameter_tuning.md) on tuning hyperparameters for better performance.
 - Add `--nnodes 2` to run tensor parallelism on multiple nodes. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-0` be the hostname of the first node and `50000` be an available port.
 ```
 # Node 0

@@ -397,23 +405,24 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 # Node 1
 python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --tp 4 --nccl-init sgl-dev-0:50000 --nnodes 2 --node-rank 1
 ```
-- If the model does not have a template in the Hugging Face tokenizer, you can specify a [custom chat template](docs/custom_chat_template.md).
+- If the model does not have a template in the Hugging Face tokenizer, you can specify a [custom chat template](docs/en/custom_chat_template.md).
 - To enable fp8 quantization, you can add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
 - To enable experimental torch.compile support, you can add `--enable-torch-compile`. It accelerates small models on small batch sizes.
 
 ### Run Llama 3.1 405B
 
 ```bash
-
+## Run 405B (fp8) on a single node
+python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct-FP8 --tp 8
+
+## Run 405B (fp16) on two nodes
 # replace the `172.16.4.52:20000` with your own first node ip address and port, disable CUDA Graph temporarily
+
 # on the first node
 GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 0 --disable-cuda-graph --mem-frac 0.75
 
 # on the second
 GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 1 --disable-cuda-graph --mem-frac 0.75
-
-# single node run 405B fp8
-python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct-FP8 --tp 8
 ```
 
 ### Supported Models

@@ -422,6 +431,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instr
 - Mistral / Mixtral
 - Gemma / Gemma 2
 - Qwen / Qwen 2 / Qwen 2 MoE
+- DeepSeek / DeepSeek 2
 - LLaVA 1.5 / 1.6
   - `python -m sglang.launch_server --model-path liuhaotian/llava-v1.5-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --chat-template vicuna_v1.1 --port 30000`
   - `python -m sglang.launch_server --model-path liuhaotian/llava-v1.6-vicuna-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --chat-template vicuna_v1.1 --port 30000`

@@ -438,11 +448,11 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instr
 - InternLM 2
 - Mistral NeMo
 
-Instructions for supporting a new model are [here](https://github.com/sgl-project/sglang/blob/main/docs/model_support.md).
+Instructions for supporting a new model are [here](https://github.com/sgl-project/sglang/blob/main/docs/en/model_support.md).
 
 ### Benchmark Performance
 
-- Benchmark a single static batch
+- Benchmark a single static batch by running the following command without launching a server. The arguments are the same as those for `launch_server.py`. This is not a dynamic batching server, so it may run out of memory for a batch size that can run successfully with a real server. This is because a real server will truncate the prefill into several batches/chunks, while this unit test does not do this.
 ```
 python -m sglang.bench_latency --model-path meta-llama/Meta-Llama-3-8B-Instruct --batch 32 --input-len 256 --output-len 32
 ```

@@ -669,6 +679,24 @@ for out in state.text_iter():
     print(out, end="", flush=True)
 ```
 
+#### Roles
+
+Use `sgl.system`, `sgl.user` and `sgl.assistant` to set roles when using Chat models. You can also define more complex role prompts using begin and end tokens.
+
+```python
+@sgl.function
+def chat_example(s):
+    s += sgl.system("You are a helpful assistant.")
+    # Same as: s += s.system("You are a helpful assistant.")
+
+    with s.user():
+        s += "Question: What is the capital of France?"
+
+    s += sgl.assistant_begin()
+    s += "Answer: " + sgl.gen(max_tokens=100, stop="\n")
+    s += sgl.assistant_end()
+```
+
 #### Tips and Implementation Details
 - The `choices` argument in `sgl.gen` is implemented by computing the [token-length normalized log probabilities](https://blog.eleuther.ai/multiple-choice-normalization/) of all choices and selecting the one with the highest probability.
 - The `regex` argument in `sgl.gen` is implemented through autoregressive decoding with logit bias masking, according to the constraints set by the regex. It is compatible with `temperature=0` and `temperature != 0`.

{sglang-0.2.5 → sglang-0.2.7}/README.md

@@ -1,10 +1,17 @@
 <div align="center">
 <img src="https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png" alt="logo" width="400"></img>
+
+[](https://pypi.org/project/sglang)
+
+[](https://github.com/sgl-project/sglang/tree/main/LICENSE)
+[](https://github.com/sgl-project/sglang/issues)
+[](https://github.com/sgl-project/sglang/issues)
+
 </div>
 
 --------------------------------------------------------------------------------
 
-| [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Paper**](https://arxiv.org/abs/2312.07104) |
+| [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Paper**](https://arxiv.org/abs/2312.07104) | [**Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2ngly9muu-t37XiH87qvD~6rVBTkTEHw) |
 
 SGLang is a fast serving framework for large language models and vision language models.
 It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.

@@ -47,7 +54,8 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
 
 ### Method 2: From source
 ```
-git clone https://github.com/sgl-project/sglang.git
+# Use the stable release branch
+git clone -b release https://github.com/sgl-project/sglang.git
 cd sglang
 
 pip install --upgrade pip

@@ -96,7 +104,7 @@ curl http://localhost:30000/generate \
   }
 }'
 ```
-Learn more about the argument format [here](docs/sampling_params.md).
+Learn more about the argument format [here](docs/en/sampling_params.md).
 
 ### OpenAI Compatible API
 In addition, the server supports OpenAI-compatible APIs.

@@ -143,7 +151,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 ```
 python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --mem-fraction-static 0.7
 ```
-- See [hyperparameter_tuning.md](docs/hyperparameter_tuning.md) on tuning hyperparameters for better performance.
+- See [hyperparameter_tuning.md](docs/en/hyperparameter_tuning.md) on tuning hyperparameters for better performance.
 - Add `--nnodes 2` to run tensor parallelism on multiple nodes. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-0` be the hostname of the first node and `50000` be an available port.
 ```
 # Node 0

@@ -152,23 +160,24 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 # Node 1
 python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --tp 4 --nccl-init sgl-dev-0:50000 --nnodes 2 --node-rank 1
 ```
-- If the model does not have a template in the Hugging Face tokenizer, you can specify a [custom chat template](docs/custom_chat_template.md).
+- If the model does not have a template in the Hugging Face tokenizer, you can specify a [custom chat template](docs/en/custom_chat_template.md).
 - To enable fp8 quantization, you can add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
 - To enable experimental torch.compile support, you can add `--enable-torch-compile`. It accelerates small models on small batch sizes.
 
 ### Run Llama 3.1 405B
 
 ```bash
-
+## Run 405B (fp8) on a single node
+python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct-FP8 --tp 8
+
+## Run 405B (fp16) on two nodes
 # replace the `172.16.4.52:20000` with your own first node ip address and port, disable CUDA Graph temporarily
+
 # on the first node
 GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 0 --disable-cuda-graph --mem-frac 0.75
 
 # on the second
 GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 1 --disable-cuda-graph --mem-frac 0.75
-
-# single node run 405B fp8
-python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct-FP8 --tp 8
 ```
 
 ### Supported Models

@@ -177,6 +186,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instr
 - Mistral / Mixtral
 - Gemma / Gemma 2
 - Qwen / Qwen 2 / Qwen 2 MoE
+- DeepSeek / DeepSeek 2
 - LLaVA 1.5 / 1.6
   - `python -m sglang.launch_server --model-path liuhaotian/llava-v1.5-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --chat-template vicuna_v1.1 --port 30000`
   - `python -m sglang.launch_server --model-path liuhaotian/llava-v1.6-vicuna-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --chat-template vicuna_v1.1 --port 30000`

@@ -193,11 +203,11 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instr
 - InternLM 2
 - Mistral NeMo
 
-Instructions for supporting a new model are [here](https://github.com/sgl-project/sglang/blob/main/docs/model_support.md).
+Instructions for supporting a new model are [here](https://github.com/sgl-project/sglang/blob/main/docs/en/model_support.md).
 
 ### Benchmark Performance
 
-- Benchmark a single static batch
+- Benchmark a single static batch by running the following command without launching a server. The arguments are the same as those for `launch_server.py`. This is not a dynamic batching server, so it may run out of memory for a batch size that can run successfully with a real server. This is because a real server will truncate the prefill into several batches/chunks, while this unit test does not do this.
 ```
 python -m sglang.bench_latency --model-path meta-llama/Meta-Llama-3-8B-Instruct --batch 32 --input-len 256 --output-len 32
 ```

@@ -424,6 +434,24 @@ for out in state.text_iter():
     print(out, end="", flush=True)
 ```
 
+#### Roles
+
+Use `sgl.system`, `sgl.user` and `sgl.assistant` to set roles when using Chat models. You can also define more complex role prompts using begin and end tokens.
+
+```python
+@sgl.function
+def chat_example(s):
+    s += sgl.system("You are a helpful assistant.")
+    # Same as: s += s.system("You are a helpful assistant.")
+
+    with s.user():
+        s += "Question: What is the capital of France?"
+
+    s += sgl.assistant_begin()
+    s += "Answer: " + sgl.gen(max_tokens=100, stop="\n")
+    s += sgl.assistant_end()
+```
+
 #### Tips and Implementation Details
 - The `choices` argument in `sgl.gen` is implemented by computing the [token-length normalized log probabilities](https://blog.eleuther.ai/multiple-choice-normalization/) of all choices and selecting the one with the highest probability.
 - The `regex` argument in `sgl.gen` is implemented through autoregressive decoding with logit bias masking, according to the constraints set by the regex. It is compatible with `temperature=0` and `temperature != 0`.
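
The "Tips and Implementation Details" note above describes `choices` as token-length normalized log-probability selection. As a rough illustration only (this is not SGLang's code; the per-choice token logprob lists are assumed inputs from a prefill pass), the scoring rule looks like:

```python
# Minimal sketch of token-length normalized selection; illustrative names only.
from typing import Dict, List


def select_choice(choice_token_logprobs: Dict[str, List[float]]) -> str:
    """Pick the choice with the highest average per-token logprob."""

    def score(logprobs: List[float]) -> float:
        return sum(logprobs) / max(len(logprobs), 1)  # normalize by token count

    return max(choice_token_logprobs, key=lambda c: score(choice_token_logprobs[c]))


# Length normalization keeps a longer option from being penalized just for
# having more tokens:
print(select_choice({
    "Paris": [-0.20, -0.10],
    "The capital of France is Paris": [-0.25, -0.15, -0.10, -0.20, -0.10, -0.05],
}))
```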

{sglang-0.2.5 → sglang-0.2.7}/pyproject.toml

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "sglang"
-version = "0.2.5"
+version = "0.2.7"
 description = "SGLang is yet another fast serving framework for large language models and vision language models."
 readme = "README.md"
 requires-python = ">=3.8"

{sglang-0.2.5 → sglang-0.2.7}/sglang/__init__.py

@@ -1,4 +1,5 @@
 # SGL API Components
+
 from sglang.api import (
     Runtime,
     assistant,

@@ -14,48 +15,54 @@ from sglang.api import (
     select,
     set_default_backend,
     system,
+    system_begin,
+    system_end,
     user,
     user_begin,
     user_end,
     video,
 )
 
-#
-from sglang.global_config import global_config
-
-# SGL Backends
-from sglang.lang.backend.anthropic import Anthropic
-from sglang.lang.backend.litellm import LiteLLM
-from sglang.lang.backend.openai import OpenAI
-from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
-from sglang.lang.backend.vertexai import VertexAI
-
-from .version import __version__
-
-# public APIs management
+# SGLang DSL APIs
 __all__ = [
-    "global_config",
-    "Anthropic",
-    "LiteLLM",
-    "OpenAI",
-    "RuntimeEndpoint",
-    "VertexAI",
-    "function",
     "Runtime",
-    "set_default_backend",
+    "assistant",
+    "assistant_begin",
+    "assistant_end",
     "flush_cache",
-    "get_server_args",
+    "function",
     "gen",
     "gen_int",
     "gen_string",
+    "get_server_args",
     "image",
-    "video",
     "select",
+    "set_default_backend",
     "system",
+    "system_begin",
+    "system_end",
     "user",
-    "assistant",
     "user_begin",
     "user_end",
-    "assistant_begin",
-    "assistant_end",
+    "video",
 ]
+
+# Global Configurations
+from sglang.global_config import global_config
+
+__all__ += ["global_config"]
+
+from sglang.version import __version__
+
+__all__ += ["__version__"]
+
+# SGL Backends
+from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
+from sglang.utils import LazyImport
+
+Anthropic = LazyImport("sglang.lang.backend.anthropic", "Anthropic")
+LiteLLM = LazyImport("sglang.lang.backend.litellm", "LiteLLM")
+OpenAI = LazyImport("sglang.lang.backend.openai", "OpenAI")
+VertexAI = LazyImport("sglang.lang.backend.vertexai", "VertexAI")
+
+__all__ += ["Anthropic", "LiteLLM", "OpenAI", "VertexAI", "RuntimeEndpoint"]
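
The new `__init__.py` routes the optional backends (Anthropic, LiteLLM, OpenAI, VertexAI) through `LazyImport` from `sglang.utils` (also touched in this release), so importing `sglang` no longer pulls those client libraries in eagerly. A minimal sketch of what such a lazy proxy can look like; the actual `sglang.utils.LazyImport` may be implemented differently.

```python
# Sketch of a LazyImport-style proxy (assumed behavior, not the real implementation).
import importlib


class LazyImport:
    """Defer `from <module_name> import <class_name>` until first use."""

    def __init__(self, module_name: str, class_name: str):
        self.module_name = module_name
        self.class_name = class_name
        self._target = None

    def _load(self):
        if self._target is None:
            module = importlib.import_module(self.module_name)
            self._target = getattr(module, self.class_name)
        return self._target

    def __getattr__(self, name):
        return getattr(self._load(), name)

    def __call__(self, *args, **kwargs):
        # The optional dependency is imported only when the backend is constructed.
        return self._load()(*args, **kwargs)


# Mirrors the new __init__.py wiring shown above:
OpenAI = LazyImport("sglang.lang.backend.openai", "OpenAI")
```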

{sglang-0.2.5 → sglang-0.2.7}/sglang/api.py

@@ -75,7 +75,7 @@ def gen(
     choices: Optional[List[str]] = None,
     regex: Optional[str] = None,
 ):
-    """Call the model to generate. See the meaning of the arguments in docs/sampling_params.md"""
+    """Call the model to generate. See the meaning of the arguments in docs/en/sampling_params.md"""
 
     if choices:
         return SglSelect(name, choices, 0.0 if temperature is None else temperature)

@@ -210,6 +210,14 @@ def assistant(expr: Optional[SglExpr] = None):
     return _role_common("assistant", expr)
 
 
+def system_begin():
+    return SglRoleBegin("system")
+
+
+def system_end():
+    return SglRoleEnd("system")
+
+
 def user_begin():
     return SglRoleBegin("user")
 
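
The new `system_begin()` / `system_end()` helpers mirror the existing `user_begin`/`user_end` and `assistant_begin`/`assistant_end` pair, so a system prompt can be opened and closed explicitly. A small usage sketch following the "Roles" example above; the function name and prompt text are illustrative, not from this release.

```python
# Hypothetical usage of the new system_begin()/system_end() helpers,
# by analogy with the assistant_begin()/assistant_end() example in the README.
import sglang as sgl


@sgl.function
def pirate_qa(s, question):
    s += sgl.system_begin()
    s += "You are a helpful assistant that answers like a pirate."
    s += sgl.system_end()

    s += sgl.user(question)
    s += sgl.assistant(sgl.gen("answer", max_tokens=64))
```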

{sglang-0.2.5 → sglang-0.2.7}/sglang/bench_latency.py

@@ -37,9 +37,9 @@ import torch
 import torch.distributed as dist
 
 from sglang.srt.hf_transformers_utils import get_tokenizer
-from sglang.srt.managers.controller.infer_batch import Batch, ForwardMode, Req
-from sglang.srt.managers.controller.model_runner import ModelRunner
+from sglang.srt.managers.schedule_batch import Batch, ForwardMode, Req
 from sglang.srt.model_config import ModelConfig
+from sglang.srt.model_executor.model_runner import ModelRunner
 from sglang.srt.sampling_params import SamplingParams
 from sglang.srt.server_args import ServerArgs
 from sglang.srt.utils import suppress_other_loggers

{sglang-0.2.5 → sglang-0.2.7}/sglang/bench_serving.py

@@ -1,5 +1,6 @@
 # Adapted from https://github.com/vllm-project/vllm/blob/6366efc67b0aedd2c1721c14385370e50b297fb3/benchmarks/backend_request_func.py
 # Adapted from https://github.com/vllm-project/vllm/blob/6366efc67b0aedd2c1721c14385370e50b297fb3/benchmarks/benchmark_serving.py
+
 """
 Benchmark online serving.
 

@@ -84,6 +85,9 @@ async def async_request_trt_llm(
             "min_length": request_func_input.output_len,
             "end_id": 1048576,
         }
+        if args.disable_ignore_eos:
+            del payload["min_length"]
+            del payload["end_id"]
         output = RequestFuncOutput()
         output.prompt_len = request_func_input.prompt_len
 

@@ -149,7 +153,7 @@ async def async_request_openai_completions(
             "best_of": 1,
             "max_tokens": request_func_input.output_len,
             "stream": not args.disable_stream,
-            "ignore_eos":
+            "ignore_eos": not args.disable_ignore_eos,
         }
         headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}
 

@@ -969,6 +973,11 @@ if __name__ == "__main__":
         action="store_true",
         help="Disable streaming mode.",
     )
+    parser.add_argument(
+        "--disable-ignore-eos",
+        action="store_true",
+        help="Disable ignoring EOS.",
+    )
 
     set_ulimit()
 

{sglang-0.2.5 → sglang-0.2.7}/sglang/lang/backend/runtime_endpoint.py

@@ -253,14 +253,14 @@ class RuntimeEndpoint(BaseBackend):
             r["meta_info"]["normalized_prompt_logprob"] for r in obj
         ]
         decision = choices[np.argmax(normalized_prompt_logprobs)]
-
-
+        input_token_logprobs = [r["meta_info"]["input_token_logprobs"] for r in obj]
+        output_token_logprobs = [r["meta_info"]["output_token_logprobs"] for r in obj]
 
         return (
             decision,
             normalized_prompt_logprobs,
-
-
+            input_token_logprobs,
+            output_token_logprobs,
         )
 
     def concatenate_and_append(self, src_rids: List[str], dst_rid: str):

{sglang-0.2.5 → sglang-0.2.7}/sglang/lang/interpreter.py

@@ -541,18 +541,19 @@ class StreamExecutor:
         (
             decision,
             normalized_prompt_logprobs,
-
-
+            input_token_logprobs,
+            output_token_logprobs,
         ) = self.backend.select(self, expr.choices, expr.temperature)
         if expr.name is not None:
             name = expr.name
             self.variables[name] = decision
             self.meta_info[name] = {
                 "normalized_prompt_logprobs": normalized_prompt_logprobs,
-                "
-                "
+                "input_token_logprobs": input_token_logprobs,
+                "output_token_logprobs": output_token_logprobs,
             }
             self.variable_event[name].set()
+            self.stream_var_event[name].set()
         self.text_ += decision
 
     def _execute_variable(self, expr: SglVariable):

@@ -705,9 +706,9 @@ class ProgramState:
 
     def _role_common(self, name: str, expr: Optional[SglExpr] = None):
         if expr is not None:
-
-
-
+            role_expr = SglExprList([SglRoleBegin(name), expr, SglRoleEnd(name)])
+            self.stream_executor.submit(role_expr)
+            return role_expr
         else:
 
             @contextmanager

@@ -778,7 +779,14 @@ class ProgramState:
                 if self.stream_executor.is_finished:
                     break
         else:
-            event = self.stream_executor.stream_var_event[var_name]
+            event = None
+            while not event:
+                if var_name in self.stream_executor.stream_var_event:
+                    event = self.stream_executor.stream_var_event[var_name]
+                if self.stream_executor.is_finished:
+                    yield ""
+                    return
+
             while True:
                 event.wait()
                 event.clear()

@@ -813,7 +821,14 @@ class ProgramState:
                 if self.stream_executor.is_finished:
                     break
         else:
-            event = self.stream_executor.stream_var_event[var_name]
+            event = None
+            while not event:
+                if var_name in self.stream_executor.stream_var_event:
+                    event = self.stream_executor.stream_var_event[var_name]
+                if self.stream_executor.is_finished:
+                    yield ""
+                    return
+
             while True:
                 await loop.run_in_executor(None, event.wait)
                 event.clear()

{sglang-0.2.5 → sglang-0.2.7}/sglang/lang/ir.py

@@ -410,7 +410,7 @@ class SglGen(SglExpr):
         dtype: Optional[type] = None,
         regex: Optional[str] = None,
     ):
-        """Call the model to generate. See the meaning of the arguments in docs/sampling_params.md"""
+        """Call the model to generate. See the meaning of the arguments in docs/en/sampling_params.md"""
         super().__init__()
         self.name = name
         self.sampling_params = SglSamplingParams(

{sglang-0.2.5 → sglang-0.2.7}/sglang/srt/constrained/__init__.py

@@ -1,3 +1,18 @@
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
 import json
 from typing import Dict, Optional, Union
 

{sglang-0.2.5 → sglang-0.2.7}/sglang/srt/constrained/base_cache.py

@@ -1,3 +1,18 @@
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
 """Base cache class."""
 
 import time

sglang-0.2.7/sglang/srt/constrained/fsm_cache.py (new file)

@@ -0,0 +1,66 @@
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+"""Cache for the compressed finite state machine."""
+
+from sglang.srt.constrained import RegexGuide, TransformerTokenizer
+from sglang.srt.constrained.base_cache import BaseCache
+
+
+class FSMCache(BaseCache):
+    def __init__(self, tokenizer_path, tokenizer_args_dict, enable=True):
+        super().__init__(enable=enable)
+
+        if tokenizer_path.endswith(".json") or tokenizer_path.endswith(".model"):
+            # Do not support TiktokenTokenizer or SentencePieceTokenizer
+            return
+
+        from importlib.metadata import version
+
+        if version("outlines") >= "0.0.35":
+            from transformers import AutoTokenizer
+
+            tokenizer_args_dict.setdefault("padding_side", "left")
+            tokenizer = AutoTokenizer.from_pretrained(
+                tokenizer_path, **tokenizer_args_dict
+            )
+            try:
+                self.outlines_tokenizer = TransformerTokenizer(tokenizer)
+            except AttributeError:
+                # FIXME: tmp fix for chatglm2 & chatglm3 (pad_token_id=0)
+                origin_pad_token_id = tokenizer.pad_token_id
+
+                def fset(self, value):
+                    self._value = value
+
+                type(tokenizer).pad_token_id = property(
+                    fget=type(tokenizer).pad_token_id.fget, fset=fset
+                )
+                self.outlines_tokenizer = TransformerTokenizer(tokenizer)
+                self.outlines_tokenizer.tokenizer.pad_token_id = origin_pad_token_id
+                self.outlines_tokenizer.pad_token_id = origin_pad_token_id
+                self.outlines_tokenizer.pad_token = (
+                    self.outlines_tokenizer.tokenizer.pad_token
+                )
+                self.outlines_tokenizer.vocabulary = (
+                    self.outlines_tokenizer.tokenizer.get_vocab()
+                )
+        else:
+            self.outlines_tokenizer = TransformerTokenizer(
+                tokenizer_path, **tokenizer_args_dict
+            )
+
+    def init_value(self, regex):
+        return RegexGuide(regex, self.outlines_tokenizer)
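
The rewritten `fsm_cache.py` compiles an outlines `RegexGuide` per regex, so the compilation cost is paid once per distinct `regex` argument and reused across requests through the `BaseCache` machinery. A hedged sketch of how such a compiled guide is typically consumed during constrained decoding: at each step the guide exposes the token ids allowed by the current FSM state, and the sampler masks logits accordingly. The `get_next_instruction` / `get_next_state` method names follow outlines' Guide interface and are assumptions here, not taken from this diff.

```python
# Sketch of constrained greedy decoding with a compiled guide; assumed API.
import torch


def constrained_greedy_step(logits: torch.Tensor, guide, state: int):
    """Mask logits so only tokens allowed by the current FSM state can be chosen."""
    allowed = guide.get_next_instruction(state).tokens  # token ids legal in this state
    mask = torch.full_like(logits, float("-inf"))
    mask[allowed] = 0.0
    next_token = int(torch.argmax(logits + mask))
    next_state = guide.get_next_state(state, next_token)
    return next_token, next_state
```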