sglang 0.1.15__tar.gz → 0.1.17__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sglang-0.1.15/sglang.egg-info → sglang-0.1.17}/PKG-INFO +23 -13
- {sglang-0.1.15 → sglang-0.1.17}/README.md +15 -10
- {sglang-0.1.15 → sglang-0.1.17}/pyproject.toml +5 -4
- {sglang-0.1.15 → sglang-0.1.17}/sglang/__init__.py +5 -1
- {sglang-0.1.15 → sglang-0.1.17}/sglang/api.py +8 -3
- {sglang-0.1.15 → sglang-0.1.17}/sglang/backend/anthropic.py +1 -1
- sglang-0.1.17/sglang/backend/litellm.py +90 -0
- {sglang-0.1.15 → sglang-0.1.17}/sglang/backend/openai.py +148 -12
- {sglang-0.1.15 → sglang-0.1.17}/sglang/backend/runtime_endpoint.py +18 -10
- {sglang-0.1.15 → sglang-0.1.17}/sglang/global_config.py +11 -1
- {sglang-0.1.15 → sglang-0.1.17}/sglang/lang/chat_template.py +9 -2
- {sglang-0.1.15 → sglang-0.1.17}/sglang/lang/interpreter.py +161 -81
- {sglang-0.1.15 → sglang-0.1.17}/sglang/lang/ir.py +29 -11
- {sglang-0.1.15 → sglang-0.1.17}/sglang/lang/tracer.py +1 -1
- {sglang-0.1.15 → sglang-0.1.17}/sglang/launch_server.py +1 -2
- sglang-0.1.17/sglang/launch_server_llavavid.py +31 -0
- {sglang-0.1.15 → sglang-0.1.17}/sglang/srt/constrained/fsm_cache.py +3 -0
- sglang-0.1.17/sglang/srt/flush_cache.py +16 -0
- {sglang-0.1.15 → sglang-0.1.17}/sglang/srt/hf_transformers_utils.py +83 -2
- {sglang-0.1.15 → sglang-0.1.17}/sglang/srt/layers/extend_attention.py +17 -0
- sglang-0.1.17/sglang/srt/layers/fused_moe.py +485 -0
- {sglang-0.1.15 → sglang-0.1.17}/sglang/srt/layers/logits_processor.py +12 -7
- {sglang-0.1.15 → sglang-0.1.17}/sglang/srt/layers/radix_attention.py +10 -3
- {sglang-0.1.15 → sglang-0.1.17}/sglang/srt/layers/token_attention.py +16 -1
- sglang-0.1.17/sglang/srt/managers/controller/dp_worker.py +110 -0
- sglang-0.1.17/sglang/srt/managers/controller/infer_batch.py +619 -0
- sglang-0.1.17/sglang/srt/managers/controller/manager_multi.py +191 -0
- sglang-0.1.17/sglang/srt/managers/controller/manager_single.py +97 -0
- sglang-0.1.17/sglang/srt/managers/controller/model_runner.py +462 -0
- {sglang-0.1.15/sglang/srt/managers/router → sglang-0.1.17/sglang/srt/managers/controller}/radix_cache.py +54 -18
- sglang-0.1.17/sglang/srt/managers/controller/schedule_heuristic.py +59 -0
- sglang-0.1.17/sglang/srt/managers/controller/tp_worker.py +791 -0
- sglang-0.1.17/sglang/srt/managers/detokenizer_manager.py +95 -0
- {sglang-0.1.15 → sglang-0.1.17}/sglang/srt/managers/io_struct.py +26 -10
- {sglang-0.1.15 → sglang-0.1.17}/sglang/srt/managers/router/infer_batch.py +130 -74
- {sglang-0.1.15 → sglang-0.1.17}/sglang/srt/managers/router/manager.py +7 -9
- {sglang-0.1.15 → sglang-0.1.17}/sglang/srt/managers/router/model_rpc.py +224 -135
- {sglang-0.1.15 → sglang-0.1.17}/sglang/srt/managers/router/model_runner.py +94 -107
- sglang-0.1.17/sglang/srt/managers/router/radix_cache.py +267 -0
- sglang-0.1.17/sglang/srt/managers/router/scheduler.py +59 -0
- {sglang-0.1.15 → sglang-0.1.17}/sglang/srt/managers/tokenizer_manager.py +183 -88
- {sglang-0.1.15 → sglang-0.1.17}/sglang/srt/model_config.py +5 -2
- {sglang-0.1.15 → sglang-0.1.17}/sglang/srt/models/commandr.py +15 -22
- {sglang-0.1.15 → sglang-0.1.17}/sglang/srt/models/dbrx.py +22 -29
- {sglang-0.1.15 → sglang-0.1.17}/sglang/srt/models/gemma.py +14 -24
- sglang-0.1.17/sglang/srt/models/grok.py +671 -0
- {sglang-0.1.15 → sglang-0.1.17}/sglang/srt/models/llama2.py +24 -23
- {sglang-0.1.15 → sglang-0.1.17}/sglang/srt/models/llava.py +85 -25
- sglang-0.1.17/sglang/srt/models/llavavid.py +298 -0
- sglang-0.1.17/sglang/srt/models/mixtral.py +513 -0
- sglang-0.1.15/sglang/srt/models/mixtral.py → sglang-0.1.17/sglang/srt/models/mixtral_quant.py +18 -34
- {sglang-0.1.15 → sglang-0.1.17}/sglang/srt/models/qwen.py +28 -25
- {sglang-0.1.15 → sglang-0.1.17}/sglang/srt/models/qwen2.py +17 -22
- {sglang-0.1.15 → sglang-0.1.17}/sglang/srt/models/stablelm.py +21 -26
- {sglang-0.1.15 → sglang-0.1.17}/sglang/srt/models/yivl.py +17 -25
- {sglang-0.1.15 → sglang-0.1.17}/sglang/srt/openai_api_adapter.py +140 -95
- {sglang-0.1.15 → sglang-0.1.17}/sglang/srt/openai_protocol.py +10 -1
- {sglang-0.1.15 → sglang-0.1.17}/sglang/srt/server.py +101 -52
- {sglang-0.1.15 → sglang-0.1.17}/sglang/srt/server_args.py +59 -11
- sglang-0.1.17/sglang/srt/utils.py +484 -0
- {sglang-0.1.15 → sglang-0.1.17}/sglang/test/test_programs.py +44 -0
- {sglang-0.1.15 → sglang-0.1.17}/sglang/test/test_utils.py +32 -1
- {sglang-0.1.15 → sglang-0.1.17}/sglang/utils.py +95 -26
- {sglang-0.1.15 → sglang-0.1.17/sglang.egg-info}/PKG-INFO +23 -13
- {sglang-0.1.15 → sglang-0.1.17}/sglang.egg-info/SOURCES.txt +15 -3
- {sglang-0.1.15 → sglang-0.1.17}/sglang.egg-info/requires.txt +8 -2
- sglang-0.1.15/sglang/srt/backend_config.py +0 -13
- sglang-0.1.15/sglang/srt/managers/detokenizer_manager.py +0 -95
- sglang-0.1.15/sglang/srt/managers/router/scheduler.py +0 -70
- sglang-0.1.15/sglang/srt/models/dbrx_config.py +0 -281
- sglang-0.1.15/sglang/srt/utils.py +0 -317
- sglang-0.1.15/sglang/srt/weight_utils.py +0 -402
- {sglang-0.1.15 → sglang-0.1.17}/LICENSE +0 -0
- {sglang-0.1.15 → sglang-0.1.17}/setup.cfg +0 -0
- {sglang-0.1.15 → sglang-0.1.17}/sglang/backend/__init__.py +0 -0
- {sglang-0.1.15 → sglang-0.1.17}/sglang/backend/base_backend.py +0 -0
- {sglang-0.1.15 → sglang-0.1.17}/sglang/backend/vertexai.py +0 -0
- {sglang-0.1.15 → sglang-0.1.17}/sglang/lang/__init__.py +0 -0
- {sglang-0.1.15 → sglang-0.1.17}/sglang/lang/compiler.py +0 -0
- {sglang-0.1.15 → sglang-0.1.17}/sglang/srt/constrained/__init__.py +0 -0
- {sglang-0.1.15 → sglang-0.1.17}/sglang/srt/constrained/base_cache.py +0 -0
- {sglang-0.1.15 → sglang-0.1.17}/sglang/srt/constrained/jump_forward.py +0 -0
- {sglang-0.1.15 → sglang-0.1.17}/sglang/srt/conversation.py +0 -0
- {sglang-0.1.15 → sglang-0.1.17}/sglang/srt/layers/context_flashattention_nopad.py +0 -0
- {sglang-0.1.15 → sglang-0.1.17}/sglang/srt/memory_pool.py +0 -0
- {sglang-0.1.15 → sglang-0.1.17}/sglang/srt/mm_utils.py +0 -0
- {sglang-0.1.15 → sglang-0.1.17}/sglang/srt/models/mistral.py +0 -0
- {sglang-0.1.15 → sglang-0.1.17}/sglang/srt/sampling_params.py +0 -0
- {sglang-0.1.15 → sglang-0.1.17}/sglang/test/test_conversation.py +0 -0
- {sglang-0.1.15 → sglang-0.1.17}/sglang/test/test_openai_protocol.py +0 -0
- {sglang-0.1.15 → sglang-0.1.17}/sglang.egg-info/dependency_links.txt +0 -0
- {sglang-0.1.15 → sglang-0.1.17}/sglang.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.1.15
+Version: 0.1.17
 Summary: A structured generation langauge for LLMs.
 License: Apache License
         Version 2.0, January 2004
@@ -222,12 +222,14 @@ Requires-Dist: torch; extra == "srt"
 Requires-Dist: uvloop; extra == "srt"
 Requires-Dist: uvicorn; extra == "srt"
 Requires-Dist: zmq; extra == "srt"
-Requires-Dist: vllm
+Requires-Dist: vllm==0.4.3; extra == "srt"
 Requires-Dist: interegular; extra == "srt"
 Requires-Dist: pydantic; extra == "srt"
 Requires-Dist: pillow; extra == "srt"
-Requires-Dist: outlines>=0.0.27; extra == "srt"
 Requires-Dist: packaging; extra == "srt"
+Requires-Dist: huggingface_hub; extra == "srt"
+Requires-Dist: hf_transfer; extra == "srt"
+Requires-Dist: outlines>=0.0.34; extra == "srt"
 Provides-Extra: openai
 Requires-Dist: openai>=1.0; extra == "openai"
 Requires-Dist: numpy; extra == "openai"
@@ -235,10 +237,13 @@ Requires-Dist: tiktoken; extra == "openai"
 Provides-Extra: anthropic
 Requires-Dist: anthropic>=0.20.0; extra == "anthropic"
 Requires-Dist: numpy; extra == "anthropic"
+Provides-Extra: litellm
+Requires-Dist: litellm>=1.0.0; extra == "litellm"
 Provides-Extra: all
 Requires-Dist: sglang[srt]; extra == "all"
 Requires-Dist: sglang[openai]; extra == "all"
 Requires-Dist: sglang[anthropic]; extra == "all"
+Requires-Dist: sglang[litellm]; extra == "all"
 
 <div align="center">
 <img src="assets/logo.png" alt="logo" width="400"></img>
@@ -251,9 +256,9 @@ Requires-Dist: sglang[anthropic]; extra == "all"
 SGLang is a structured generation language designed for large language models (LLMs).
 It makes your interaction with LLMs faster and more controllable by co-designing the frontend language and the runtime system.
 
-The core features
+The core features include:
 - **A Flexible Front-End Language**: This allows for easy programming of LLM applications with multiple chained generation calls, advanced prompting techniques, control flow, multiple modalities, parallelism, and external interaction.
-- **A High-Performance Runtime with RadixAttention**: This feature significantly accelerates the execution of complex LLM programs by
+- **A High-Performance Runtime with RadixAttention**: This feature significantly accelerates the execution of complex LLM programs by automatically reusing the KV cache across multiple calls. It can also be used as a standalone serving engine with all common techniques implemented, such as continuous batching and tensor parallelism.
 
 ## News
 - [2024/02] 🔥 SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).
@@ -286,12 +291,8 @@ pip install -e "python[all]"
 ```
 
 ### Notes
-- If you are using older GPUs (NVIDIA V100, T4), please pick the correct triton compiler version to avoid some known bugs.
-  - For NVIDIA T4, please use `pip install "triton>=2.2.0"`.
-  - For NVIDIA V100, please install the [nightly](https://triton-lang.org/main/getting-started/installation.html) version.
 - If you only need to use the OpenAI backend, you can avoid installing other dependencies by using `pip install "sglang[openai]"`
 
-
 ## Quick Start
 The example below shows how to use sglang to answer a mulit-turn question.
 
@@ -568,15 +569,17 @@ response = client.chat.completions.create(
 print(response)
 ```
 
-
-
+
+By default, the server uses the chat template specified in the model tokenizer from Hugging Face. It should just work for most official models such as Llama-2/Llama-3.
+
+If needed, you can also override the chat template when launching the server:
 
 ```
 python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --chat-template llama-2
 ```
 
 If the chat template you are looking for is missing, you are welcome to contribute it.
-Meanwhile, you can also
+Meanwhile, you can also temporarily register your chat template as follows:
 
 ```json
 {
@@ -599,11 +602,16 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
 ```
 python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --tp 2
 ```
+- Add `--dp 2` to enable data parallelism. It can also be used together with tp. Data parallelism is better for throughput if there is enough memory.
+```
+python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --dp 2 --tp 2
+```
 - If you see out-of-memory errors during serving, please try to reduce the memory usage of the KV cache pool by setting a smaller value of `--mem-fraction-static`. The default value is `0.9`
 ```
 python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --mem-fraction-static 0.7
 ```
-
+- See [flashinfer.md](docs/flashinfer.md) on accelerating inference using highly optimized CUDA kernels.
+- See [hyperparameter_tuning.md](docs/hyperparameter_tuning.md) on tuning hyperparameters for better performance.
 
 ### Supported Models
 - Llama
@@ -617,6 +625,8 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
   - `python3 -m sglang.launch_server --model-path liuhaotian/llava-v1.5-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --chat-template vicuna_v1.1 --port 30000`
   - `python3 -m sglang.launch_server --model-path liuhaotian/llava-v1.6-vicuna-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --chat-template vicuna_v1.1 --port 30000`
   - `python3 -m sglang.launch_server --model-path liuhaotian/llava-v1.6-34b --tokenizer-path liuhaotian/llava-v1.6-34b-tokenizer --port 3000`
+- LLaVA-NeXT-Video
+  - see [srt_example_llava_v.sh](examples/usage/llava_video/srt_example_llava_v.sh)
 - Yi-VL
   - see [srt_example_yi_vl.py](examples/quick_start/srt_example_yi_vl.py).
 - StableLM
@@ -9,9 +9,9 @@
 SGLang is a structured generation language designed for large language models (LLMs).
 It makes your interaction with LLMs faster and more controllable by co-designing the frontend language and the runtime system.
 
-The core features
+The core features include:
 - **A Flexible Front-End Language**: This allows for easy programming of LLM applications with multiple chained generation calls, advanced prompting techniques, control flow, multiple modalities, parallelism, and external interaction.
-- **A High-Performance Runtime with RadixAttention**: This feature significantly accelerates the execution of complex LLM programs by
+- **A High-Performance Runtime with RadixAttention**: This feature significantly accelerates the execution of complex LLM programs by automatically reusing the KV cache across multiple calls. It can also be used as a standalone serving engine with all common techniques implemented, such as continuous batching and tensor parallelism.
 
 ## News
 - [2024/02] 🔥 SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).
@@ -44,12 +44,8 @@ pip install -e "python[all]"
 ```
 
 ### Notes
-- If you are using older GPUs (NVIDIA V100, T4), please pick the correct triton compiler version to avoid some known bugs.
-  - For NVIDIA T4, please use `pip install "triton>=2.2.0"`.
-  - For NVIDIA V100, please install the [nightly](https://triton-lang.org/main/getting-started/installation.html) version.
 - If you only need to use the OpenAI backend, you can avoid installing other dependencies by using `pip install "sglang[openai]"`
 
-
 ## Quick Start
 The example below shows how to use sglang to answer a mulit-turn question.
 
@@ -326,15 +322,17 @@ response = client.chat.completions.create(
 print(response)
 ```
 
-
-
+
+By default, the server uses the chat template specified in the model tokenizer from Hugging Face. It should just work for most official models such as Llama-2/Llama-3.
+
+If needed, you can also override the chat template when launching the server:
 
 ```
 python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --chat-template llama-2
 ```
 
 If the chat template you are looking for is missing, you are welcome to contribute it.
-Meanwhile, you can also
+Meanwhile, you can also temporarily register your chat template as follows:
 
 ```json
 {
@@ -357,11 +355,16 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
 ```
 python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --tp 2
 ```
+- Add `--dp 2` to enable data parallelism. It can also be used together with tp. Data parallelism is better for throughput if there is enough memory.
+```
+python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --dp 2 --tp 2
+```
 - If you see out-of-memory errors during serving, please try to reduce the memory usage of the KV cache pool by setting a smaller value of `--mem-fraction-static`. The default value is `0.9`
 ```
 python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --mem-fraction-static 0.7
 ```
-
+- See [flashinfer.md](docs/flashinfer.md) on accelerating inference using highly optimized CUDA kernels.
+- See [hyperparameter_tuning.md](docs/hyperparameter_tuning.md) on tuning hyperparameters for better performance.
 
 ### Supported Models
 - Llama
@@ -375,6 +378,8 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
   - `python3 -m sglang.launch_server --model-path liuhaotian/llava-v1.5-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --chat-template vicuna_v1.1 --port 30000`
   - `python3 -m sglang.launch_server --model-path liuhaotian/llava-v1.6-vicuna-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --chat-template vicuna_v1.1 --port 30000`
   - `python3 -m sglang.launch_server --model-path liuhaotian/llava-v1.6-34b --tokenizer-path liuhaotian/llava-v1.6-34b-tokenizer --port 3000`
+- LLaVA-NeXT-Video
+  - see [srt_example_llava_v.sh](examples/usage/llava_video/srt_example_llava_v.sh)
 - Yi-VL
   - see [srt_example_yi_vl.py](examples/quick_start/srt_example_yi_vl.py).
 - StableLM
@@ -4,8 +4,8 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "sglang"
-version = "0.1.15"
-description = "A structured generation langauge for LLMs."
+version = "0.1.17"
+description = "A structured generation langauge for LLMs."
 readme = "README.md"
 requires-python = ">=3.8"
 license = {file = "LICENSE"}
@@ -20,10 +20,11 @@ dependencies = [
 
 [project.optional-dependencies]
 srt = ["aiohttp", "fastapi", "psutil", "rpyc", "torch", "uvloop", "uvicorn",
-       "zmq", "vllm
+       "zmq", "vllm==0.4.3", "interegular", "pydantic", "pillow", "packaging", "huggingface_hub", "hf_transfer", "outlines>=0.0.34"]
 openai = ["openai>=1.0", "numpy", "tiktoken"]
 anthropic = ["anthropic>=0.20.0", "numpy"]
-
+litellm = ["litellm>=1.0.0"]
+all = ["sglang[srt]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
 
 [project.urls]
 "Homepage" = "https://github.com/sgl-project/sglang"
@@ -1,4 +1,4 @@
-__version__ = "0.1.15"
+__version__ = "0.1.17"
 
 # SGL API Components
 from sglang.api import (
@@ -19,6 +19,7 @@ from sglang.api import (
     user,
     user_begin,
     user_end,
+    video,
 )
 
 # SGL Backends
@@ -26,6 +27,7 @@ from sglang.backend.anthropic import Anthropic
 from sglang.backend.openai import OpenAI
 from sglang.backend.runtime_endpoint import RuntimeEndpoint
 from sglang.backend.vertexai import VertexAI
+from sglang.backend.litellm import LiteLLM
 
 # Global Configurations
 from sglang.global_config import global_config
@@ -34,6 +36,7 @@ from sglang.global_config import global_config
 __all__ = [
     "global_config",
     "Anthropic",
+    "LiteLLM",
     "OpenAI",
     "RuntimeEndpoint",
     "VertexAI",
@@ -46,6 +49,7 @@ __all__ = [
     "gen_int",
     "gen_string",
     "image",
+    "video",
     "select",
     "system",
     "user",
@@ -15,17 +15,18 @@ from sglang.lang.ir import (
     SglRoleBegin,
     SglRoleEnd,
     SglSelect,
+    SglVideo,
 )
 
 
 def function(
-    func: Optional[Callable] = None,
+    func: Optional[Callable] = None, num_api_spec_tokens: Optional[int] = None
 ):
     if func:
-        return SglFunction(func,
+        return SglFunction(func, num_api_spec_tokens=num_api_spec_tokens)
 
     def decorator(func):
-        return SglFunction(func,
+        return SglFunction(func, num_api_spec_tokens=num_api_spec_tokens)
 
     return decorator
 
@@ -151,6 +152,10 @@ def image(expr: SglExpr):
     return SglImage(expr)
 
 
+def video(path: str, num_frames: int):
+    return SglVideo(path, num_frames)
+
+
 def select(
     name: Optional[str] = None,
     choices: List[str] = None,
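The new `video` primitive mirrors the existing `image` primitive: it wraps a file path and a frame count into an `SglVideo` expression that can be embedded in a prompt. A minimal usage sketch, assuming a video-capable model (such as LLaVA-NeXT-Video) is served behind a local `RuntimeEndpoint`; the endpoint, clip path, frame count, and prompt below are illustrative placeholders, not taken from the diff:

```python
import sglang as sgl

# Assumed setup: an sglang server with a video-capable model on port 30000.
sgl.set_default_backend(sgl.RuntimeEndpoint("http://localhost:30000"))

@sgl.function
def describe_clip(s, clip_path):
    # video(path, num_frames) samples num_frames frames from the clip.
    s += sgl.user(sgl.video(clip_path, 16) + "What happens in this clip?")
    s += sgl.assistant(sgl.gen("answer", max_tokens=128))

state = describe_clip.run(clip_path="example_clip.mp4")
print(state["answer"])
```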
@@ -0,0 +1,90 @@
+from typing import Mapping, Optional
+
+from sglang.backend.base_backend import BaseBackend
+from sglang.lang.chat_template import get_chat_template_by_model_path
+from sglang.lang.interpreter import StreamExecutor
+from sglang.lang.ir import SglSamplingParams
+
+try:
+    import litellm
+except ImportError as e:
+    litellm = e
+litellm.num_retries = 1
+
+
+class LiteLLM(BaseBackend):
+
+    def __init__(
+        self,
+        model_name,
+        chat_template=None,
+        api_key=None,
+        organization: Optional[str] = None,
+        base_url: Optional[str] = None,
+        timeout: Optional[float] = 600,
+        max_retries: Optional[int] = litellm.num_retries,
+        default_headers: Optional[Mapping[str, str]] = None,
+    ):
+        super().__init__()
+
+        if isinstance(litellm, Exception):
+            raise litellm
+
+        self.model_name = model_name
+
+        self.chat_template = chat_template or get_chat_template_by_model_path(
+            model_name)
+
+        self.client_params = {
+            "api_key": api_key,
+            "organization": organization,
+            "base_url": base_url,
+            "timeout": timeout,
+            "max_retries": max_retries,
+            "default_headers": default_headers,
+        }
+
+    def get_chat_template(self):
+        return self.chat_template
+
+    def generate(
+        self,
+        s: StreamExecutor,
+        sampling_params: SglSamplingParams,
+    ):
+        if s.messages_:
+            messages = s.messages_
+        else:
+            messages = [{"role": "user", "content": s.text_}]
+
+        ret = litellm.completion(
+            model=self.model_name,
+            messages=messages,
+            **self.client_params,
+            **sampling_params.to_anthropic_kwargs(),
+        )
+        comp = ret.choices[0].message.content
+
+        return comp, {}
+
+    def generate_stream(
+        self,
+        s: StreamExecutor,
+        sampling_params: SglSamplingParams,
+    ):
+        if s.messages_:
+            messages = s.messages_
+        else:
+            messages = [{"role": "user", "content": s.text_}]
+
+        ret = litellm.completion(
+            model=self.model_name,
+            messages=messages,
+            stream=True,
+            **self.client_params,
+            **sampling_params.to_litellm_kwargs(),
+        )
+        for chunk in ret:
+            text = chunk.choices[0].delta.content
+            if text is not None:
+                yield text, {}
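Because `LiteLLM` is exported from the package `__init__` (see the import and `__all__` hunks above), it can be dropped in like any other backend. A minimal sketch, assuming the relevant provider credentials are already set in the environment; the model name and prompt are only examples:

```python
import sglang as sgl

# LiteLLM routes the call to whichever provider the model name implies.
sgl.set_default_backend(sgl.LiteLLM("gpt-3.5-turbo"))

@sgl.function
def tell_joke(s):
    s += sgl.user("Tell me a short joke about compilers.")
    s += sgl.assistant(sgl.gen("joke", max_tokens=64))

print(tell_joke.run()["joke"])
```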
@@ -1,5 +1,7 @@
 import logging
 import time
+import warnings
+import dataclasses
 from typing import Callable, List, Optional, Union
 
 import numpy as np
@@ -41,6 +43,15 @@ INSTRUCT_MODEL_NAMES = [
 ]
 
 
+@dataclasses.dataclass
+class TokenUsage:
+    prompt_tokens: int
+    completion_tokens: int
+
+    def reset(self):
+        self.prompt_tokens = self.completion_tokens = 0
+
+
 class OpenAI(BaseBackend):
     def __init__(
         self,
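The `TokenUsage` counters are attached to the backend instance (as `token_usage` in the following hunks) and accumulated from the `usage` field of each API response, so aggregate prompt and completion token counts can be read back after running a program. A small sketch of how that could be used; the model name and prompt are illustrative:

```python
import sglang as sgl

backend = sgl.OpenAI("gpt-3.5-turbo")  # illustrative model name
sgl.set_default_backend(backend)

@sgl.function
def greet(s):
    s += sgl.user("Say hello in five words.")
    s += sgl.assistant(sgl.gen("hello", max_tokens=16))

backend.token_usage.reset()
greet.run()
# Totals accumulated across all API calls made by the program.
print(backend.token_usage.prompt_tokens, backend.token_usage.completion_tokens)
```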
@@ -80,40 +91,89 @@ class OpenAI(BaseBackend):
         else:
             self.is_chat_model = True
 
-        self.
+        self.chat_prefix = self.chat_template.role_prefix_and_suffix["assistant"][0]
+
+        # Usage
+        self.token_usage = TokenUsage(0, 0)
+
+        # API speculative execution
+        # TODO(ying): This does not support multi-threading (run_batch)
+        self.spec_kwargs = {}
+        self.spec_format = []
+        self.spec_max_num_tries = 3
 
     def get_chat_template(self):
         return self.chat_template
 
+    def _prepare_spec_execution(self, sampling_params: SglSamplingParams,
+                                num_api_spec_tokens: int, spec_var_name: str):
+        if "max_tokens" not in self.spec_kwargs:
+            self.spec_kwargs["max_tokens"] = num_api_spec_tokens
+        else:
+            assert (
+                self.spec_kwargs["max_tokens"] == num_api_spec_tokens
+            )
+
+        params = sampling_params.to_openai_kwargs()
+        for key, value in params.items():
+            if key in ["stop"]:
+                continue
+            if key in ["max_tokens"]:
+                warnings.warn(
+                    "The parameter max_tokens will be overwritten by speculated number of tokens."
+                )
+                continue
+            if key not in self.spec_kwargs:
+                self.spec_kwargs[key] = value
+            else:
+                assert (
+                    value == self.spec_kwargs[key]
+                ), "sampling parameters should be consistent if turn on api speculative execution."
+        self.spec_format.append(
+            {"text": "", "stop": params["stop"], "name": spec_var_name}
+        )
+        return "", {}
+
     def generate(
         self,
         s: StreamExecutor,
         sampling_params: SglSamplingParams,
+        spec_var_name: str = None,
     ):
         if sampling_params.dtype is None:
             if self.is_chat_model:
-                if
-
-
-
-
-
+                if s.num_api_spec_tokens is None:
+                    if not s.text_.endswith(self.chat_prefix):
+                        raise RuntimeError(
+                            "This use case is not supported if api speculative execution is off. "
+                            "For OpenAI chat models, sgl.gen must be right after sgl.assistant. "
+                            "Example of adding api speculative execution: @function(num_api_spec_tokens=128)."
+                        )
+                    prompt = s.messages_
+                else:
+                    return self._prepare_spec_execution(sampling_params,
+                        s.num_api_spec_tokens, spec_var_name)
             else:
                 prompt = s.text_
 
             kwargs = sampling_params.to_openai_kwargs()
             comp = openai_completion(
                 client=self.client,
+                token_usage=self.token_usage,
                 is_chat=self.is_chat_model,
                 model=self.model_name,
                 prompt=prompt,
                 **kwargs,
             )
         elif sampling_params.dtype in [str, "str", "string"]:
+            assert (
+                not self.is_chat_model
+            ), "constrained type not supported on chat model"
             kwargs = sampling_params.to_openai_kwargs()
             kwargs.pop("stop")
             comp = openai_completion(
                 client=self.client,
+                token_usage=self.token_usage,
                 is_chat=self.is_chat_model,
                 model=self.model_name,
                 prompt=s.text_ + '"',
@@ -122,10 +182,14 @@
             )
             comp = '"' + comp + '"'
         elif sampling_params.dtype in [int, "int"]:
+            assert (
+                not self.is_chat_model
+            ), "constrained type not supported on chat model"
             kwargs = sampling_params.to_openai_kwargs()
             kwargs.pop("stop")
             comp = openai_completion(
                 client=self.client,
+                token_usage=self.token_usage,
                 is_chat=self.is_chat_model,
                 model=self.model_name,
                 prompt=s.text_,
@@ -138,6 +202,63 @@
 
         return comp, {}
 
+    def spec_fill(self, value: str):
+        assert self.is_chat_model
+        self.spec_format.append({"text": value, "stop": None, "name": None})
+
+    def spec_pattern_match(self, comp):
+        for i, term in enumerate(self.spec_format):
+            text = term["text"]
+            if text != "":
+                if comp.startswith(text):
+                    comp = comp[len(text) :]
+                else:
+                    return False
+            else:
+                pos = comp.find(term["stop"])
+                if pos != -1:
+                    term["text"] = comp[:pos]
+                    comp = comp[pos:]
+                else:
+                    if i == len(self.spec_format) - 1:
+                        term["text"] = comp
+                    else:
+                        return False
+        return True
+
+    def role_end_generate(
+        self,
+        s: StreamExecutor,
+    ):
+        if s.num_api_spec_tokens is None or not s.text_.endswith(self.chat_prefix):
+            return
+
+        comp = ""
+        if not all(x["name"] is None for x in self.spec_format):
+            # TODO(ying): throw errors or warnings
+            for i in range(self.spec_max_num_tries):
+                comp = openai_completion(
+                    client=self.client,
+                    token_usage=self.token_usage,
+                    is_chat=self.is_chat_model,
+                    model=self.model_name,
+                    prompt=s.messages_,
+                    **self.spec_kwargs,
+                )
+                if self.spec_pattern_match(comp):
+                    break
+
+        for term in self.spec_format:
+            s.text_ += term["text"]
+            name = term["name"]
+            if name is not None:
+                s.variables[name] = term["text"]
+                s.meta_info[name] = {}
+                s.variable_event[name].set()
+
+        self.spec_kwargs = {}
+        self.spec_format = []
+
     def generate_stream(
         self,
         s: StreamExecutor,
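These methods implement the API speculative execution path referenced by the new error message: when `num_api_spec_tokens` is set on the function, consecutive `sgl.gen` calls inside a single assistant turn are collected into one speculative API request, and `spec_pattern_match` splits the returned text back into the named variables at the end of the role. A hedged usage sketch; the prompt, variable names, and token budget are illustrative:

```python
import sglang as sgl

sgl.set_default_backend(sgl.OpenAI("gpt-3.5-turbo"))

# num_api_spec_tokens lets one API call speculate the whole assistant turn.
@sgl.function(num_api_spec_tokens=128)
def character_card(s, name):
    s += sgl.user(f"Create a short character card for {name}.")
    s += sgl.assistant(
        "Job: " + sgl.gen("job", stop="\n")
        + "\nHobby: " + sgl.gen("hobby", stop="\n")
    )

state = character_card.run(name="Ada")
print(state["job"], state["hobby"])
```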
@@ -145,7 +266,7 @@
     ):
         if sampling_params.dtype is None:
             if self.is_chat_model:
-                if not s.text_.endswith(self.
+                if not s.text_.endswith(self.chat_prefix):
                     raise RuntimeError(
                         "This use case is not supported. "
                         "For OpenAI chat models, sgl.gen must be right after sgl.assistant"
@@ -157,6 +278,7 @@
         kwargs = sampling_params.to_openai_kwargs()
         generator = openai_completion_stream(
             client=self.client,
+            token_usage=self.token_usage,
             is_chat=self.is_chat_model,
             model=self.model_name,
             prompt=prompt,
@@ -202,6 +324,8 @@
         )
         ret_str = ret.choices[0].text
         ret_token = self.tokenizer.encode(ret_str)[0]
+        self.token_usage.prompt_tokens += ret.usage.prompt_tokens
+        self.token_usage.completion_tokens= ret.usage.completion_tokens
 
         # TODO:
         # 1. return logits as the scores
@@ -231,7 +355,7 @@
         return decision, scores, None, None
 
 
-def openai_completion(client,
+def openai_completion(client, token_usage, is_chat=None, retries=3, prompt=None, **kwargs):
     for attempt in range(retries):
         try:
             if is_chat:
@@ -245,6 +369,9 @@ def openai_completion(client, retries=3, is_chat=None, prompt=None, **kwargs):
                 comp = [c.text for c in ret.choices]
             else:
                 comp = ret.choices[0].text
+
+            token_usage.prompt_tokens += ret.usage.prompt_tokens
+            token_usage.completion_tokens += ret.usage.completion_tokens
             break
         except (openai.APIError, openai.APIConnectionError, openai.RateLimitError) as e:
             logger.error(f"OpenAI Error: {e}. Waiting 5 seconds...")
@@ -258,16 +385,19 @@ def openai_completion(client, retries=3, is_chat=None, prompt=None, **kwargs):
     return comp
 
 
-def openai_completion_stream(client,
+def openai_completion_stream(client, token_usage, is_chat=None, retries=3, prompt=None, **kwargs):
     for attempt in range(retries):
         try:
             if is_chat:
                 if "stop" in kwargs and kwargs["stop"] is None:
                     kwargs.pop("stop")
                 generator = client.chat.completions.create(
-                    messages=prompt, stream=True,
+                    messages=prompt, stream=True, stream_options={"include_usage": True},
+                    **kwargs
                 )
                 for ret in generator:
+                    if len(ret.choices) == 0:
+                        continue
                     try:
                         content = ret.choices[0].delta.content
                     except IndexError:
@@ -275,11 +405,17 @@ def openai_completion_stream(client, retries=3, is_chat=None, prompt=None, **kwargs):
                         yield content or "", {}
             else:
                 generator = client.completions.create(
-                    prompt=prompt, stream=True,
+                    prompt=prompt, stream=True, stream_options={"include_usage": True},
+                    **kwargs
                 )
                 for ret in generator:
+                    if len(ret.choices) == 0:
+                        continue
                     content = ret.choices[0].text
                     yield content or "", {}
+
+            token_usage.prompt_tokens += ret.usage.prompt_tokens
+            token_usage.completion_tokens += ret.usage.completion_tokens
             break
         except (openai.APIError, openai.APIConnectionError, openai.RateLimitError) as e:
             logger.error(f"OpenAI Error: {e}. Waiting 5 seconds...")