sglang 0.1.18__tar.gz → 0.1.20__tar.gz
This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- {sglang-0.1.18/sglang.egg-info → sglang-0.1.20}/PKG-INFO +19 -13
- {sglang-0.1.18 → sglang-0.1.20}/README.md +17 -11
- {sglang-0.1.18 → sglang-0.1.20}/pyproject.toml +2 -2
- {sglang-0.1.18 → sglang-0.1.20}/sglang/__init__.py +1 -1
- {sglang-0.1.18 → sglang-0.1.20}/sglang/api.py +26 -0
- {sglang-0.1.18 → sglang-0.1.20}/sglang/backend/runtime_endpoint.py +18 -14
- {sglang-0.1.18 → sglang-0.1.20}/sglang/bench_latency.py +40 -18
- {sglang-0.1.18 → sglang-0.1.20}/sglang/global_config.py +21 -16
- {sglang-0.1.18 → sglang-0.1.20}/sglang/lang/chat_template.py +41 -6
- {sglang-0.1.18 → sglang-0.1.20}/sglang/lang/interpreter.py +5 -1
- {sglang-0.1.18 → sglang-0.1.20}/sglang/lang/ir.py +61 -25
- {sglang-0.1.18 → sglang-0.1.20}/sglang/srt/constrained/__init__.py +3 -2
- {sglang-0.1.18 → sglang-0.1.20}/sglang/srt/hf_transformers_utils.py +7 -3
- {sglang-0.1.18 → sglang-0.1.20}/sglang/srt/layers/extend_attention.py +2 -1
- {sglang-0.1.18 → sglang-0.1.20}/sglang/srt/layers/fused_moe.py +181 -167
- {sglang-0.1.18 → sglang-0.1.20}/sglang/srt/layers/logits_processor.py +55 -19
- {sglang-0.1.18 → sglang-0.1.20}/sglang/srt/layers/radix_attention.py +33 -59
- {sglang-0.1.18 → sglang-0.1.20}/sglang/srt/layers/token_attention.py +4 -8
- sglang-0.1.20/sglang/srt/managers/controller/cuda_graph_runner.py +172 -0
- {sglang-0.1.18 → sglang-0.1.20}/sglang/srt/managers/controller/infer_batch.py +244 -36
- {sglang-0.1.18 → sglang-0.1.20}/sglang/srt/managers/controller/manager_single.py +1 -1
- sglang-0.1.20/sglang/srt/managers/controller/model_runner.py +347 -0
- {sglang-0.1.18 → sglang-0.1.20}/sglang/srt/managers/controller/tp_worker.py +39 -20
- {sglang-0.1.18 → sglang-0.1.20}/sglang/srt/managers/detokenizer_manager.py +4 -2
- {sglang-0.1.18 → sglang-0.1.20}/sglang/srt/managers/io_struct.py +1 -1
- {sglang-0.1.18 → sglang-0.1.20}/sglang/srt/managers/tokenizer_manager.py +14 -13
- {sglang-0.1.18 → sglang-0.1.20}/sglang/srt/memory_pool.py +33 -6
- {sglang-0.1.18 → sglang-0.1.20}/sglang/srt/model_config.py +6 -0
- sglang-0.1.20/sglang/srt/models/gemma2.py +436 -0
- {sglang-0.1.18 → sglang-0.1.20}/sglang/srt/models/llama2.py +3 -3
- {sglang-0.1.18 → sglang-0.1.20}/sglang/srt/models/llama_classification.py +10 -7
- sglang-0.1.20/sglang/srt/models/minicpm.py +373 -0
- sglang-0.1.20/sglang/srt/models/qwen2_moe.py +454 -0
- {sglang-0.1.18 → sglang-0.1.20}/sglang/srt/openai_api_adapter.py +2 -2
- {sglang-0.1.18 → sglang-0.1.20}/sglang/srt/openai_protocol.py +1 -1
- {sglang-0.1.18 → sglang-0.1.20}/sglang/srt/server.py +18 -8
- {sglang-0.1.18 → sglang-0.1.20}/sglang/srt/server_args.py +24 -20
- {sglang-0.1.18 → sglang-0.1.20}/sglang/srt/utils.py +68 -35
- {sglang-0.1.18 → sglang-0.1.20/sglang.egg-info}/PKG-INFO +19 -13
- {sglang-0.1.18 → sglang-0.1.20}/sglang.egg-info/SOURCES.txt +4 -0
- {sglang-0.1.18 → sglang-0.1.20}/sglang.egg-info/requires.txt +1 -1
- sglang-0.1.18/sglang/srt/managers/controller/model_runner.py +0 -562
- {sglang-0.1.18 → sglang-0.1.20}/LICENSE +0 -0
- {sglang-0.1.18 → sglang-0.1.20}/setup.cfg +0 -0
- {sglang-0.1.18 → sglang-0.1.20}/sglang/backend/__init__.py +0 -0
- {sglang-0.1.18 → sglang-0.1.20}/sglang/backend/anthropic.py +0 -0
- {sglang-0.1.18 → sglang-0.1.20}/sglang/backend/base_backend.py +0 -0
- {sglang-0.1.18 → sglang-0.1.20}/sglang/backend/litellm.py +0 -0
- {sglang-0.1.18 → sglang-0.1.20}/sglang/backend/openai.py +0 -0
- {sglang-0.1.18 → sglang-0.1.20}/sglang/backend/vertexai.py +0 -0
- {sglang-0.1.18 → sglang-0.1.20}/sglang/lang/__init__.py +0 -0
- {sglang-0.1.18 → sglang-0.1.20}/sglang/lang/compiler.py +0 -0
- {sglang-0.1.18 → sglang-0.1.20}/sglang/lang/tracer.py +0 -0
- {sglang-0.1.18 → sglang-0.1.20}/sglang/launch_server.py +0 -0
- {sglang-0.1.18 → sglang-0.1.20}/sglang/launch_server_llavavid.py +0 -0
- {sglang-0.1.18 → sglang-0.1.20}/sglang/srt/constrained/base_cache.py +0 -0
- {sglang-0.1.18 → sglang-0.1.20}/sglang/srt/constrained/fsm_cache.py +0 -0
- {sglang-0.1.18 → sglang-0.1.20}/sglang/srt/constrained/jump_forward.py +0 -0
- {sglang-0.1.18 → sglang-0.1.20}/sglang/srt/conversation.py +0 -0
- {sglang-0.1.18 → sglang-0.1.20}/sglang/srt/flush_cache.py +0 -0
- {sglang-0.1.18 → sglang-0.1.20}/sglang/srt/layers/context_flashattention_nopad.py +0 -0
- {sglang-0.1.18 → sglang-0.1.20}/sglang/srt/managers/controller/dp_worker.py +0 -0
- {sglang-0.1.18 → sglang-0.1.20}/sglang/srt/managers/controller/manager_multi.py +0 -0
- {sglang-0.1.18 → sglang-0.1.20}/sglang/srt/managers/controller/radix_cache.py +0 -0
- {sglang-0.1.18 → sglang-0.1.20}/sglang/srt/managers/controller/schedule_heuristic.py +0 -0
- {sglang-0.1.18 → sglang-0.1.20}/sglang/srt/mm_utils.py +0 -0
- {sglang-0.1.18 → sglang-0.1.20}/sglang/srt/models/chatglm.py +0 -0
- {sglang-0.1.18 → sglang-0.1.20}/sglang/srt/models/commandr.py +0 -0
- {sglang-0.1.18 → sglang-0.1.20}/sglang/srt/models/dbrx.py +0 -0
- {sglang-0.1.18 → sglang-0.1.20}/sglang/srt/models/gemma.py +0 -0
- {sglang-0.1.18 → sglang-0.1.20}/sglang/srt/models/grok.py +0 -0
- {sglang-0.1.18 → sglang-0.1.20}/sglang/srt/models/llava.py +0 -0
- {sglang-0.1.18 → sglang-0.1.20}/sglang/srt/models/llavavid.py +0 -0
- {sglang-0.1.18 → sglang-0.1.20}/sglang/srt/models/mistral.py +0 -0
- {sglang-0.1.18 → sglang-0.1.20}/sglang/srt/models/mixtral.py +0 -0
- {sglang-0.1.18 → sglang-0.1.20}/sglang/srt/models/mixtral_quant.py +0 -0
- {sglang-0.1.18 → sglang-0.1.20}/sglang/srt/models/qwen.py +0 -0
- {sglang-0.1.18 → sglang-0.1.20}/sglang/srt/models/qwen2.py +0 -0
- {sglang-0.1.18 → sglang-0.1.20}/sglang/srt/models/stablelm.py +0 -0
- {sglang-0.1.18 → sglang-0.1.20}/sglang/srt/models/yivl.py +0 -0
- {sglang-0.1.18 → sglang-0.1.20}/sglang/srt/sampling_params.py +0 -0
- {sglang-0.1.18 → sglang-0.1.20}/sglang/test/test_conversation.py +0 -0
- {sglang-0.1.18 → sglang-0.1.20}/sglang/test/test_openai_protocol.py +0 -0
- {sglang-0.1.18 → sglang-0.1.20}/sglang/test/test_programs.py +0 -0
- {sglang-0.1.18 → sglang-0.1.20}/sglang/test/test_utils.py +0 -0
- {sglang-0.1.18 → sglang-0.1.20}/sglang/utils.py +0 -0
- {sglang-0.1.18 → sglang-0.1.20}/sglang.egg-info/dependency_links.txt +0 -0
- {sglang-0.1.18 → sglang-0.1.20}/sglang.egg-info/top_level.txt +0 -0
{sglang-0.1.18/sglang.egg-info → sglang-0.1.20}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.1.18
+Version: 0.1.20
 Summary: A structured generation langauge for LLMs.
 License: Apache License
 Version 2.0, January 2004
@@ -229,7 +229,7 @@ Requires-Dist: torch; extra == "srt"
 Requires-Dist: uvicorn; extra == "srt"
 Requires-Dist: uvloop; extra == "srt"
 Requires-Dist: zmq; extra == "srt"
-Requires-Dist: vllm==0.5.
+Requires-Dist: vllm==0.5.1; extra == "srt"
 Requires-Dist: outlines>=0.0.44; extra == "srt"
 Provides-Extra: openai
 Requires-Dist: openai>=1.0; extra == "openai"
@@ -257,7 +257,7 @@ It makes your interaction with LLMs faster and more controllable by co-designing

 The core features include:
 - **Flexible Frontend Language**: Enables easy programming of LLM applications with chained generation calls, advanced prompting, control flow, multiple modalities, parallelism, and external interactions.
-- **High-Performance Backend Runtime**: Features RadixAttention for accelerating complex LLM programs by reusing the KV cache across multiple calls. It can also serve as a standalone engine with all common techniques implemented (e.g., continuous batching and tensor parallelism).
+- **High-Performance Backend Runtime**: Features RadixAttention for accelerating complex LLM programs by reusing the KV cache across multiple calls. It can also serve as a standalone inference engine with all common techniques implemented (e.g., continuous batching and tensor parallelism).

 ## News
 - [2024/02] 🔥 SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).
@@ -288,15 +288,21 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
 git clone https://github.com/sgl-project/sglang.git
 cd sglang

-pip install --upgrade pip
 pip install -e "python[all]"

 # Install FlashInfer CUDA kernels
 pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
 ```

-###
-
+### Method 3: Using docker
+The docker images are available on Docker Hub as [lmsysorg/sglang](https://hub.docker.com/r/lmsysorg/sglang/tags).
+
+### Common Notes
+- If you see errors from the Triton compiler, please install the [Triton Nightly](https://triton-lang.org/main/getting-started/installation.html) by
+```
+pip uninstall -y triton triton-nightly
+pip install -U --index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/Triton-Nightly/pypi/simple/ triton-nightly
+```
 - If you cannot install FlashInfer, check out its [installation](https://docs.flashinfer.ai/installation.html#) page. If you still cannot install it, you can use the slower Triton kernels by adding `--disable-flashinfer` when launching the server.
 - If you only need to use the OpenAI backend, you can avoid installing other dependencies by using `pip install "sglang[openai]"`.

@@ -518,8 +524,8 @@ for out in state.text_iter():
 ```

 ### Tips and Implementation Details
-- The `choices` argument in `sgl.gen` is implemented by computing the normalized log probabilities of all choices and selecting the one with the highest probability.
-- The `regex` argument in `sgl.gen` is implemented through autoregressive decoding with logit bias masking, according to the constraints set by the regex.
+- The `choices` argument in `sgl.gen` is implemented by computing the [token-length normalized log probabilities](https://blog.eleuther.ai/multiple-choice-normalization/) of all choices and selecting the one with the highest probability.
+- The `regex` argument in `sgl.gen` is implemented through autoregressive decoding with logit bias masking, according to the constraints set by the regex. It is compatible with `temperature=0` and `temperature != 0`.

 ## Backend: SGLang Runtime (SRT)
 The SGLang Runtime (SRT) is designed to work best with the SGLang frontend.
@@ -576,7 +582,6 @@ response = client.chat.completions.create(
 print(response)
 ```

-
 By default, the server uses the chat template specified in the model tokenizer from Hugging Face. It should just work for most official models such as Llama-2/Llama-3.

 If needed, you can also override the chat template when launching the server:
@@ -605,7 +610,7 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
 ```

 ### Additional Arguments
-- Add `--tp 2` to enable tensor parallelism.
+- Add `--tp 2` to enable tensor parallelism. If it indicates `peer access is not supported between these two devices`, add `--enable-p2p-check` option.
 ```
 python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --tp 2
 ```
@@ -623,9 +628,8 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
 - Llama
 - Mistral
 - Mixtral
-- Qwen / Qwen 2
-- Gemma
-- Please add a new flag `--attention-reduce-in-fp32` to avoid some precision errors.
+- Qwen / Qwen 2 / Qwen 2 MoE
+- Gemma / Gemma 2
 - `python -m sglang.launch_server --model-path google/gemma-7b-it --port 30000 --attention-reduce-in-fp32`
 - LLaVA
 - `python3 -m sglang.launch_server --model-path liuhaotian/llava-v1.5-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --chat-template vicuna_v1.1 --port 30000`
@@ -638,6 +642,8 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
 - StableLM
 - Command-R
 - DBRX
+- Grok
+- ChatGLM
 - AWQ/GPTQ/Marlin quantization

 Instructions for supporting a new model are [here](https://github.com/sgl-project/sglang/blob/main/docs/model_support.md).
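The updated `choices` bullet above refers to token-length normalized log probabilities: each candidate's summed token log probabilities are divided by its token count before taking the argmax, so longer choices are not penalized for length alone. A minimal sketch of that selection rule with hypothetical per-token values (an illustration, not SGLang's actual implementation):

```python
def select_choice(choice_token_logprobs: dict) -> str:
    """Pick the choice whose length-normalized log probability is highest.

    `choice_token_logprobs` maps each candidate string to the per-token log
    probabilities a model assigned to it (hypothetical numbers below).
    """
    def normalized(logprobs):
        # Sum of token log probs divided by token count, so longer choices
        # are not penalized merely for containing more tokens.
        return sum(logprobs) / len(logprobs)

    return max(choice_token_logprobs, key=lambda c: normalized(choice_token_logprobs[c]))


print(select_choice({
    "Paris": [-0.20],
    "the city of Paris": [-0.30, -0.10, -0.20, -0.10],
}))  # -> "the city of Paris" (average -0.175 beats -0.20)
```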
{sglang-0.1.18 → sglang-0.1.20}/README.md

@@ -11,7 +11,7 @@ It makes your interaction with LLMs faster and more controllable by co-designing

 The core features include:
 - **Flexible Frontend Language**: Enables easy programming of LLM applications with chained generation calls, advanced prompting, control flow, multiple modalities, parallelism, and external interactions.
-- **High-Performance Backend Runtime**: Features RadixAttention for accelerating complex LLM programs by reusing the KV cache across multiple calls. It can also serve as a standalone engine with all common techniques implemented (e.g., continuous batching and tensor parallelism).
+- **High-Performance Backend Runtime**: Features RadixAttention for accelerating complex LLM programs by reusing the KV cache across multiple calls. It can also serve as a standalone inference engine with all common techniques implemented (e.g., continuous batching and tensor parallelism).

 ## News
 - [2024/02] 🔥 SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).
@@ -42,15 +42,21 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
 git clone https://github.com/sgl-project/sglang.git
 cd sglang

-pip install --upgrade pip
 pip install -e "python[all]"

 # Install FlashInfer CUDA kernels
 pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
 ```

-###
-
+### Method 3: Using docker
+The docker images are available on Docker Hub as [lmsysorg/sglang](https://hub.docker.com/r/lmsysorg/sglang/tags).
+
+### Common Notes
+- If you see errors from the Triton compiler, please install the [Triton Nightly](https://triton-lang.org/main/getting-started/installation.html) by
+```
+pip uninstall -y triton triton-nightly
+pip install -U --index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/Triton-Nightly/pypi/simple/ triton-nightly
+```
 - If you cannot install FlashInfer, check out its [installation](https://docs.flashinfer.ai/installation.html#) page. If you still cannot install it, you can use the slower Triton kernels by adding `--disable-flashinfer` when launching the server.
 - If you only need to use the OpenAI backend, you can avoid installing other dependencies by using `pip install "sglang[openai]"`.

@@ -272,8 +278,8 @@ for out in state.text_iter():
 ```

 ### Tips and Implementation Details
-- The `choices` argument in `sgl.gen` is implemented by computing the normalized log probabilities of all choices and selecting the one with the highest probability.
-- The `regex` argument in `sgl.gen` is implemented through autoregressive decoding with logit bias masking, according to the constraints set by the regex.
+- The `choices` argument in `sgl.gen` is implemented by computing the [token-length normalized log probabilities](https://blog.eleuther.ai/multiple-choice-normalization/) of all choices and selecting the one with the highest probability.
+- The `regex` argument in `sgl.gen` is implemented through autoregressive decoding with logit bias masking, according to the constraints set by the regex. It is compatible with `temperature=0` and `temperature != 0`.

 ## Backend: SGLang Runtime (SRT)
 The SGLang Runtime (SRT) is designed to work best with the SGLang frontend.
@@ -330,7 +336,6 @@ response = client.chat.completions.create(
 print(response)
 ```

-
 By default, the server uses the chat template specified in the model tokenizer from Hugging Face. It should just work for most official models such as Llama-2/Llama-3.

 If needed, you can also override the chat template when launching the server:
@@ -359,7 +364,7 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
 ```

 ### Additional Arguments
-- Add `--tp 2` to enable tensor parallelism.
+- Add `--tp 2` to enable tensor parallelism. If it indicates `peer access is not supported between these two devices`, add `--enable-p2p-check` option.
 ```
 python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --tp 2
 ```
@@ -377,9 +382,8 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
 - Llama
 - Mistral
 - Mixtral
-- Qwen / Qwen 2
-- Gemma
-- Please add a new flag `--attention-reduce-in-fp32` to avoid some precision errors.
+- Qwen / Qwen 2 / Qwen 2 MoE
+- Gemma / Gemma 2
 - `python -m sglang.launch_server --model-path google/gemma-7b-it --port 30000 --attention-reduce-in-fp32`
 - LLaVA
 - `python3 -m sglang.launch_server --model-path liuhaotian/llava-v1.5-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --chat-template vicuna_v1.1 --port 30000`
@@ -392,6 +396,8 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
 - StableLM
 - Command-R
 - DBRX
+- Grok
+- ChatGLM
 - AWQ/GPTQ/Marlin quantization

 Instructions for supporting a new model are [here](https://github.com/sgl-project/sglang/blob/main/docs/model_support.md).
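The updated `regex` bullet above works by masking logits during autoregressive decoding so that only tokens consistent with the regex can be produced, which is why it works for both greedy (`temperature=0`) and sampled decoding. A minimal sketch of the masking step, assuming the set of allowed token ids at the current position has already been derived from the regex (the FSM that computes that set is not shown):

```python
import torch

def mask_disallowed_tokens(logits: torch.Tensor, allowed_token_ids: list) -> torch.Tensor:
    """Return logits with every token outside `allowed_token_ids` set to -inf."""
    bias = torch.full_like(logits, float("-inf"))
    bias[allowed_token_ids] = 0.0
    return logits + bias

# Toy vocabulary of 8 tokens; suppose the regex only allows tokens 2 and 5 next.
logits = torch.randn(8)
masked = mask_disallowed_tokens(logits, [2, 5])
greedy_token = torch.argmax(masked).item()  # temperature = 0
sampled_token = torch.multinomial(torch.softmax(masked / 0.7, dim=-1), 1).item()  # temperature != 0
```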
{sglang-0.1.18 → sglang-0.1.20}/pyproject.toml

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

 [project]
 name = "sglang"
-version = "0.1.18"
+version = "0.1.20"
 description = "A structured generation langauge for LLMs."
 readme = "README.md"
 requires-python = ">=3.8"
@@ -21,7 +21,7 @@ dependencies = [

 [project.optional-dependencies]
 srt = ["aiohttp", "fastapi", "hf_transfer", "huggingface_hub", "interegular", "packaging", "pillow",
-       "psutil", "pydantic", "rpyc", "torch", "uvicorn", "uvloop", "zmq", "vllm==0.5.
+       "psutil", "pydantic", "rpyc", "torch", "uvicorn", "uvloop", "zmq", "vllm==0.5.1", "outlines>=0.0.44"]
 openai = ["openai>=1.0", "tiktoken"]
 anthropic = ["anthropic>=0.20.0"]
 litellm = ["litellm>=1.0.0"]
{sglang-0.1.18 → sglang-0.1.20}/sglang/api.py

@@ -67,10 +67,16 @@ def gen(
     frequency_penalty: Optional[float] = None,
     presence_penalty: Optional[float] = None,
     ignore_eos: Optional[bool] = None,
+    return_logprob: Optional[bool] = None,
+    logprob_start_len: Optional[int] = None,
+    top_logprobs_num: Optional[int] = None,
+    return_text_in_logprobs: Optional[bool] = None,
     dtype: Optional[type] = None,
     choices: Optional[List[str]] = None,
     regex: Optional[str] = None,
 ):
+    """Call the model to generate. See the meaning of the arguments in docs/sampling_params.md"""
+
     if choices:
         return SglSelect(name, choices, 0.0 if temperature is None else temperature)

@@ -91,6 +97,10 @@ def gen(
         frequency_penalty,
         presence_penalty,
         ignore_eos,
+        return_logprob,
+        logprob_start_len,
+        top_logprobs_num,
+        return_text_in_logprobs,
         dtype,
         regex,
     )
@@ -106,6 +116,10 @@ def gen_int(
     frequency_penalty: Optional[float] = None,
     presence_penalty: Optional[float] = None,
     ignore_eos: Optional[bool] = None,
+    return_logprob: Optional[bool] = None,
+    logprob_start_len: Optional[int] = None,
+    top_logprobs_num: Optional[int] = None,
+    return_text_in_logprobs: Optional[bool] = None,
 ):
     return SglGen(
         name,
@@ -117,6 +131,10 @@ def gen_int(
         frequency_penalty,
         presence_penalty,
         ignore_eos,
+        return_logprob,
+        logprob_start_len,
+        top_logprobs_num,
+        return_text_in_logprobs,
         int,
         None,
     )
@@ -132,6 +150,10 @@ def gen_string(
     frequency_penalty: Optional[float] = None,
     presence_penalty: Optional[float] = None,
     ignore_eos: Optional[bool] = None,
+    return_logprob: Optional[bool] = None,
+    logprob_start_len: Optional[int] = None,
+    top_logprobs_num: Optional[int] = None,
+    return_text_in_logprobs: Optional[bool] = None,
 ):
     return SglGen(
         name,
@@ -143,6 +165,10 @@ def gen_string(
         frequency_penalty,
         presence_penalty,
         ignore_eos,
+        return_logprob,
+        logprob_start_len,
+        top_logprobs_num,
+        return_text_in_logprobs,
         str,
         None,
     )
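The four new keyword arguments on `gen` (and `gen_int`/`gen_string`) expose log-probability outputs from the runtime. A hedged usage sketch: the argument names come from the diff, while the surrounding program, the endpoint URL, and the `get_meta_info` accessor are assumptions for illustration.

```python
import sglang as sgl

# Assumes a local SRT server, e.g. launched with `python -m sglang.launch_server ... --port 30000`.
sgl.set_default_backend(sgl.RuntimeEndpoint("http://localhost:30000"))

@sgl.function
def capital(s, country):
    s += "The capital of " + country + " is"
    s += sgl.gen(
        "answer",
        max_tokens=16,
        return_logprob=True,         # new in 0.1.20
        top_logprobs_num=5,          # new in 0.1.20
        return_text_in_logprobs=True,
    )

state = capital.run(country="France")
print(state["answer"])
# The logprob fields come back in the generation's meta info (assumed accessor).
print(state.get_meta_info("answer"))
```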
{sglang-0.1.18 → sglang-0.1.20}/sglang/backend/runtime_endpoint.py

@@ -1,18 +1,18 @@
 import json
-from typing import
+from typing import List, Optional

 import numpy as np
-import requests

 from sglang.backend.base_backend import BaseBackend
 from sglang.global_config import global_config
 from sglang.lang.chat_template import get_chat_template_by_model_path
 from sglang.lang.interpreter import StreamExecutor
-from sglang.lang.ir import
-from sglang.utils import
+from sglang.lang.ir import SglSamplingParams
+from sglang.utils import http_request


 class RuntimeEndpoint(BaseBackend):
+
     def __init__(
         self,
         base_url: str,
@@ -38,8 +38,7 @@ class RuntimeEndpoint(BaseBackend):
         self.model_info = res.json()

         self.chat_template = get_chat_template_by_model_path(
-            self.model_info["model_path"]
-        )
+            self.model_info["model_path"])

     def get_model_name(self):
         return self.model_info["model_path"]
@@ -125,6 +124,11 @@ class RuntimeEndpoint(BaseBackend):
         else:
             raise RuntimeError(f"Invalid dtype: {sampling_params.dtype}")

+        for item in ["return_logprob", "logprob_start_len", "top_logprobs_num", "return_text_in_logprobs"]:
+            value = getattr(sampling_params, item, None)
+            if value is not None:
+                data[item] = value
+
         self._add_images(s, data)

         res = http_request(
@@ -167,6 +171,11 @@ class RuntimeEndpoint(BaseBackend):
         else:
             raise RuntimeError(f"Invalid dtype: {sampling_params.dtype}")

+        for item in ["return_logprob", "logprob_start_len", "top_logprobs_num", "return_text_in_logprobs"]:
+            value = getattr(sampling_params, item, None)
+            if value is not None:
+                data[item] = value
+
         data["stream"] = True
         self._add_images(s, data)

@@ -181,21 +190,16 @@ class RuntimeEndpoint(BaseBackend):
         self._assert_success(res)
         pos = 0

-        incomplete_text = ""
         for chunk in res.iter_lines(decode_unicode=False):
             chunk = chunk.decode("utf-8")
             if chunk and chunk.startswith("data:"):
                 if chunk == "data: [DONE]":
                     break
                 data = json.loads(chunk[5:].strip("\n"))
-
+                chunk_text = data["text"][pos:]
                 meta_info = data["meta_info"]
-                pos += len(
-
-                yield text, meta_info
-
-        if len(incomplete_text) > 0:
-            yield incomplete_text, meta_info
+                pos += len(chunk_text)
+                yield chunk_text, meta_info

     def select(
         self,
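The rewritten streaming loop above yields each new `chunk_text` together with its `meta_info` instead of tracking an `incomplete_text` tail. A small sketch of consuming that stream from the frontend, assuming a server is already running at the placeholder URL and the prompt is illustrative:

```python
import sglang as sgl

sgl.set_default_backend(sgl.RuntimeEndpoint("http://localhost:30000"))

@sgl.function
def short_story(s, topic):
    s += "Write two sentences about " + topic + ".\n"
    s += sgl.gen("story", max_tokens=64)

state = short_story.run(topic="radix trees", stream=True)
# text_iter() surfaces the chunks yielded by the RuntimeEndpoint streaming code above.
for chunk in state.text_iter():
    print(chunk, end="", flush=True)
```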
{sglang-0.1.18 → sglang-0.1.20}/sglang/bench_latency.py

@@ -32,6 +32,7 @@ import logging
 import multiprocessing
 import time

+
 import numpy as np
 import torch
 import torch.distributed as dist
@@ -70,6 +71,7 @@ class BenchArgs:

 def load_model(server_args, tp_rank):
     suppress_other_loggers()
+    rank_print = print if tp_rank == 0 else lambda *args, **kwargs: None

     model_config = ModelConfig(path=server_args.model_path)
     model_runner = ModelRunner(
@@ -81,7 +83,7 @@ def load_model(server_args, tp_rank):
         nccl_port=28888,
         server_args=server_args,
     )
-
+    rank_print(f"max_total_num_tokens={model_runner.max_total_num_tokens}")
     tokenizer = get_tokenizer(
         server_args.tokenizer_path,
         tokenizer_mode=server_args.tokenizer_mode,
@@ -108,7 +110,7 @@ def prepare_inputs(bench_args, tokenizer):
     for i in range(len(prompts)):
         assert len(input_ids[i]) > bench_args.cut_len

-        tmp_input_ids = input_ids[i][:bench_args.cut_len]
+        tmp_input_ids = input_ids[i][: bench_args.cut_len]
         req = Req(rid=i, origin_input_text=prompts[i], origin_input_ids=tmp_input_ids)
         req.prefix_indices = []
         req.sampling_params = sampling_params
@@ -121,9 +123,9 @@ def prepare_inputs(bench_args, tokenizer):
 def prepare_extend_inputs(bench_args, input_ids, reqs, model_runner):
     for i in range(len(reqs)):
         req = reqs[i]
-        req.input_ids += input_ids[i][bench_args.cut_len:]
+        req.input_ids += input_ids[i][bench_args.cut_len :]
         req.prefix_indices = model_runner.req_to_token_pool.req_to_token[
-            i, :bench_args.cut_len
+            i, : bench_args.cut_len
         ]
     return reqs

@@ -151,7 +153,8 @@ def extend(reqs, model_runner):
         reqs=reqs,
         req_to_token_pool=model_runner.req_to_token_pool,
         token_to_kv_pool=model_runner.token_to_kv_pool,
-        tree_cache=None
+        tree_cache=None,
+    )
     batch.prepare_for_extend(model_runner.model_config.vocab_size, None)
     output = model_runner.forward(batch, ForwardMode.EXTEND)
     next_token_ids, _ = batch.sample(output.next_token_logits)
@@ -165,6 +168,7 @@ def decode(input_token_ids, batch, model_runner):
     return next_token_ids, output.next_token_logits


+@torch.inference_mode()
 def correctness_test(
     server_args,
     bench_args,
@@ -178,9 +182,10 @@ def correctness_test(
     # Prepare inputs
     input_ids, reqs = prepare_inputs(bench_args, tokenizer)

-
-
-
+    if bench_args.cut_len > 0:
+        # Prefill
+        next_token_ids, next_token_logits, batch = extend(reqs, model_runner)
+        rank_print("prefill logits (first half)", next_token_logits)

     # Prepare extend inputs
     reqs = prepare_extend_inputs(bench_args, input_ids, reqs, model_runner)
@@ -190,7 +195,7 @@ def correctness_test(
     rank_print("prefill logits (final)", next_token_logits)

     # Decode
-    output_ids = [
+    output_ids = [input_ids[i] + [next_token_ids[i]] for i in range(len(input_ids))]
     for _ in range(bench_args.output_len):
         next_token_ids, _ = decode(next_token_ids, batch, model_runner)
         for i in range(len(reqs)):
@@ -198,7 +203,7 @@ def correctness_test(

     # Print
     for i in range(len(reqs)):
-
+        rank_print(tokenizer.decode(output_ids[i]))


 def latency_test(
@@ -210,7 +215,9 @@ def latency_test(

     # Load the model
     model_runner, tokenizer = load_model(server_args, tp_rank)
-
+    rank_print(
+        f"max_batch_size={model_runner.max_total_num_tokens // (bench_args.input_len + bench_args.output_len)}"
+    )

     # Prepare inputs
     reqs = prepare_synthetic_inputs(bench_args, tokenizer)
@@ -230,7 +237,9 @@ def latency_test(
         prefill_latency = time.time() - tic
         tot_latency += prefill_latency
         throughput = bench_args.input_len * bench_args.batch_size / prefill_latency
-        rank_print(
+        rank_print(
+            f"Prefill. latency: {prefill_latency:6.5f} s, throughput: {throughput:9.2f} token/s"
+        )

         # Decode
         for i in range(output_len):
@@ -241,13 +250,24 @@ def latency_test(
             latency = time.time() - tic
             tot_latency += latency
             throughput = bench_args.batch_size / latency
-            if i < 5:
+            if i < 5:
+                rank_print(
+                    f"Decode. latency: {latency:6.5f} s, throughput: {throughput:9.2f} token/s"
+                )
         avg_decode_latency = (tot_latency - prefill_latency) / output_len
         avg_decode_throughput = bench_args.batch_size / avg_decode_latency
-        rank_print(
-
-
-
+        rank_print(
+            f"Decode. avg latency: {avg_decode_latency:6.5f} s, avg throughput: {avg_decode_throughput:9.2f} token/s"
+        )
+
+        throughput = (
+            (bench_args.input_len + bench_args.output_len)
+            * bench_args.batch_size
+            / tot_latency
+        )
+        rank_print(
+            f"Total. latency: {tot_latency:6.3f} s, throughput: {throughput:9.2f} token/s"
+        )

     # Warm up
     run_once(4)
@@ -281,6 +301,8 @@ def main(server_args, bench_args):
     for proc in workers:
         proc.join()

+    proc.terminate()
+

 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
@@ -296,4 +318,4 @@ if __name__ == "__main__":
         format="%(message)s",
     )

-    main(server_args, bench_args)
+    main(server_args, bench_args)
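Two small patterns recur in the bench_latency changes: a rank-gated printer so only tensor-parallel rank 0 logs, and overall throughput computed as total tokens over total latency. A standalone sketch with placeholder numbers (not the benchmark script itself):

```python
def make_rank_print(tp_rank: int):
    # Same idea as in the diff: rank 0 prints, all other ranks get a no-op.
    return print if tp_rank == 0 else (lambda *args, **kwargs: None)

rank_print = make_rank_print(tp_rank=0)

# Placeholder benchmark numbers.
input_len, output_len, batch_size = 1024, 16, 8
tot_latency = 2.5  # seconds for prefill plus all decode steps

throughput = (input_len + output_len) * batch_size / tot_latency
rank_print(f"Total. latency: {tot_latency:6.3f} s, throughput: {throughput:9.2f} token/s")
```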
{sglang-0.1.18 → sglang-0.1.20}/sglang/global_config.py

@@ -8,35 +8,40 @@ class GlobalConfig:
         # 2: output final text after every run
         self.verbosity = 0

+        # Default backend of the language
         self.default_backend = None

-        #
+        # Runtime constants: Request dependency time due to network delay
+        self.request_dependency_delay = 0.02
+        self.wait_for_new_request_delay = 0.0006
+
+        # Runtime constants: New generation token ratio estimation
+        self.base_new_token_ratio = 0.4
+        self.base_min_new_token_ratio = 0.2
+        self.new_token_ratio_decay = 0.0001
+        self.new_token_ratio_recovery = 0.05
+
+        # Runtime constants: The threshold (number of tokens) to trigger layer-wise cuda sync.
+        # This can improve the speed for large batch sizes during prefill.
+        self.layer_sync_threshold = 8192
+
+        # Runtime constants: Flashinfer
+        self.flashinfer_workspace_size = 192 * 1024 * 1024
+
+        # Output tokenization configs
         self.skip_special_tokens_in_output = True
         self.spaces_between_special_tokens_in_out = True

-        #
+        # Interpreter optimization configs
         self.eager_fill_image = False
         self.enable_precache_with_tracing = True
         self.enable_parallel_encoding = True
         self.enable_parallel_decoding = True

+        # Deprecated
         # Choices: ["no_adjust", "adjust_cache"]
         # no_adjust: Do not adjust the position embedding of KV cache.
         # adjust_cache: Adjust the position embedding of KV cache.
         self.concate_and_append_mode = "no_adjust"

-        # Request dependency time due to network delay
-        self.request_dependency_delay = 0.02
-        self.wait_for_new_request_delay = 0.0006
-
-        # New generation token ratio estimation
-        self.base_new_token_ratio = 0.4
-        self.base_min_new_token_ratio = 0.2
-        self.new_token_ratio_decay = 0.0001
-        self.new_token_ratio_recovery = 0.05
-
-        # The threshold (number of tokens) to trigger layer-wise cuda sync.
-        # This can improve the speed for large batch sizes during prefill.
-        self.layer_sync_threshold = 8192
-
 global_config = GlobalConfig()
{sglang-0.1.18 → sglang-0.1.20}/sglang/lang/chat_template.py

@@ -84,7 +84,7 @@ register_chat_template(
             "system": ("SYSTEM:", "\n"),
             "user": ("USER:", "\n"),
             "assistant": ("ASSISTANT:", "\n"),
-        }
+        }
     )
 )

@@ -116,6 +116,23 @@ register_chat_template(
     )
 )

+# There is default system prompt for qwen
+# reference: https://modelscope.cn/models/qwen/Qwen2-72B-Instruct/file/view/master?fileName=tokenizer_config.json&status=1
+# The chat template is: "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
+register_chat_template(
+    ChatTemplate(
+        name="qwen",
+        default_system_prompt="You are a helpful assistant.",
+        role_prefix_and_suffix={
+            "system": ("<|im_start|>system\n", "<|im_end|>\n"),
+            "user": ("<|im_start|>user\n", "<|im_end|>\n"),
+            "assistant": ("<|im_start|>assistant\n", "<|im_end|>\n"),
+        },
+        style=ChatTemplateStyle.PLAIN,
+        stop_str=("<|im_end|>",),
+    )
+)
+

 register_chat_template(
     ChatTemplate(
@@ -132,6 +149,7 @@ register_chat_template(
     )
 )

+# Reference: https://github.com/lm-sys/FastChat/blob/main/docs/vicuna_weights_version.md#prompt-template
 register_chat_template(
     ChatTemplate(
         name="vicuna_v1.1",
@@ -148,6 +166,20 @@ register_chat_template(
     )
 )

+# Reference: https://modelscope.cn/models/01ai/Yi-1.5-34B-Chat/file/view/master?fileName=tokenizer_config.json&status=1
+register_chat_template(
+    ChatTemplate(
+        name="yi-1.5",
+        default_system_prompt=None,
+        role_prefix_and_suffix={
+            "system": ("", ""),
+            "user": ("<|im_start|>user\n", "<|im_end|>\n<|im_start|>assistant\n"),
+            "assistant": ("", "<|im_end|>\n"),
+        },
+        style=ChatTemplateStyle.PLAIN,
+        stop_str=("<|im_end|>",)
+    )
+)

 register_chat_template(
     ChatTemplate(
@@ -187,7 +219,7 @@ register_chat_template(
 # Reference: https://github.com/01-ai/Yi/tree/main/VL#major-difference-with-llava
 register_chat_template(
     ChatTemplate(
-        name="yi",
+        name="yi-vl",
         default_system_prompt=(
             "This is a chat between an inquisitive human and an AI assistant. Assume the role of the AI assistant. Read all the images carefully, and respond to the human's questions with informative, helpful, detailed and polite answers."
             "这是一个好奇的人类和一个人工智能助手之间的对话。假设你扮演这个AI助手的角色。仔细阅读所有的图像,并对人类的问题做出信息丰富、有帮助、详细的和礼貌的回答。"
@@ -289,8 +321,9 @@ def match_chat_ml(model_path: str):
     model_path = model_path.lower()
     if "tinyllama" in model_path:
         return get_chat_template("chatml")
-
-
+    # Now the suffix for qwen2 chat model is "instruct"
+    if "qwen" in model_path and ("chat" in model_path or "instruct" in model_path):
+        return get_chat_template("qwen")
     if (
         "llava-v1.6-34b" in model_path
         or "llava-v1.6-yi-34b" in model_path
@@ -302,8 +335,10 @@ def match_chat_ml(model_path: str):
 @register_chat_template_matching_function
 def match_chat_yi(model_path: str):
     model_path = model_path.lower()
-    if "yi" in model_path and "llava" not in model_path:
-        return get_chat_template("yi")
+    if "yi-vl" in model_path and "llava" not in model_path:
+        return get_chat_template("yi-vl")
+    elif "yi-1.5" in model_path and "chat" in model_path:
+        return get_chat_template("yi-1.5")


 @register_chat_template_matching_function
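Since the new "qwen" template uses the PLAIN style, a conversation renders by simple prefix/suffix concatenation plus a trailing assistant prefix, matching the Jinja template quoted in the comment above. A hand-rolled sketch of the resulting prompt string (this bypasses SGLang's `ChatTemplate` machinery and only mirrors the registered prefixes and suffixes):

```python
# Mirrors role_prefix_and_suffix from the "qwen" template registered above.
QWEN_ROLES = {
    "system": ("<|im_start|>system\n", "<|im_end|>\n"),
    "user": ("<|im_start|>user\n", "<|im_end|>\n"),
    "assistant": ("<|im_start|>assistant\n", "<|im_end|>\n"),
}

def render_qwen(messages):
    """Concatenate prefix + content + suffix per message, then open an assistant turn."""
    parts = []
    for m in messages:
        prefix, suffix = QWEN_ROLES[m["role"]]
        parts.append(prefix + m["content"] + suffix)
    parts.append("<|im_start|>assistant\n")  # generation prompt
    return "".join(parts)

print(render_qwen([
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Hello!"},
]))
```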