sglang 0.1.16__tar.gz → 0.1.17__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sglang-0.1.16/sglang.egg-info → sglang-0.1.17}/PKG-INFO +15 -9
- {sglang-0.1.16 → sglang-0.1.17}/README.md +11 -8
- {sglang-0.1.16 → sglang-0.1.17}/pyproject.toml +5 -4
- {sglang-0.1.16 → sglang-0.1.17}/sglang/__init__.py +3 -1
- {sglang-0.1.16 → sglang-0.1.17}/sglang/api.py +3 -3
- {sglang-0.1.16 → sglang-0.1.17}/sglang/backend/anthropic.py +1 -1
- sglang-0.1.17/sglang/backend/litellm.py +90 -0
- {sglang-0.1.16 → sglang-0.1.17}/sglang/backend/openai.py +148 -12
- {sglang-0.1.16 → sglang-0.1.17}/sglang/backend/runtime_endpoint.py +18 -10
- {sglang-0.1.16 → sglang-0.1.17}/sglang/global_config.py +8 -1
- {sglang-0.1.16 → sglang-0.1.17}/sglang/lang/interpreter.py +114 -67
- {sglang-0.1.16 → sglang-0.1.17}/sglang/lang/ir.py +17 -2
- {sglang-0.1.16 → sglang-0.1.17}/sglang/srt/constrained/fsm_cache.py +3 -0
- {sglang-0.1.16 → sglang-0.1.17}/sglang/srt/flush_cache.py +1 -1
- {sglang-0.1.16 → sglang-0.1.17}/sglang/srt/hf_transformers_utils.py +75 -1
- {sglang-0.1.16 → sglang-0.1.17}/sglang/srt/layers/extend_attention.py +17 -0
- sglang-0.1.17/sglang/srt/layers/fused_moe.py +485 -0
- {sglang-0.1.16 → sglang-0.1.17}/sglang/srt/layers/logits_processor.py +12 -7
- {sglang-0.1.16 → sglang-0.1.17}/sglang/srt/layers/radix_attention.py +10 -3
- {sglang-0.1.16 → sglang-0.1.17}/sglang/srt/layers/token_attention.py +16 -1
- sglang-0.1.17/sglang/srt/managers/controller/dp_worker.py +110 -0
- sglang-0.1.17/sglang/srt/managers/controller/infer_batch.py +619 -0
- sglang-0.1.17/sglang/srt/managers/controller/manager_multi.py +191 -0
- sglang-0.1.17/sglang/srt/managers/controller/manager_single.py +97 -0
- sglang-0.1.17/sglang/srt/managers/controller/model_runner.py +462 -0
- {sglang-0.1.16/sglang/srt/managers/router → sglang-0.1.17/sglang/srt/managers/controller}/radix_cache.py +7 -1
- sglang-0.1.17/sglang/srt/managers/controller/schedule_heuristic.py +59 -0
- sglang-0.1.17/sglang/srt/managers/controller/tp_worker.py +791 -0
- sglang-0.1.17/sglang/srt/managers/detokenizer_manager.py +95 -0
- {sglang-0.1.16 → sglang-0.1.17}/sglang/srt/managers/io_struct.py +15 -11
- {sglang-0.1.16 → sglang-0.1.17}/sglang/srt/managers/router/infer_batch.py +103 -59
- {sglang-0.1.16 → sglang-0.1.17}/sglang/srt/managers/router/manager.py +1 -1
- {sglang-0.1.16 → sglang-0.1.17}/sglang/srt/managers/router/model_rpc.py +175 -122
- {sglang-0.1.16 → sglang-0.1.17}/sglang/srt/managers/router/model_runner.py +91 -104
- sglang-0.1.17/sglang/srt/managers/router/radix_cache.py +267 -0
- {sglang-0.1.16 → sglang-0.1.17}/sglang/srt/managers/router/scheduler.py +6 -6
- {sglang-0.1.16 → sglang-0.1.17}/sglang/srt/managers/tokenizer_manager.py +152 -89
- {sglang-0.1.16 → sglang-0.1.17}/sglang/srt/model_config.py +4 -5
- {sglang-0.1.16 → sglang-0.1.17}/sglang/srt/models/commandr.py +10 -13
- {sglang-0.1.16 → sglang-0.1.17}/sglang/srt/models/dbrx.py +9 -15
- {sglang-0.1.16 → sglang-0.1.17}/sglang/srt/models/gemma.py +8 -15
- sglang-0.1.17/sglang/srt/models/grok.py +671 -0
- {sglang-0.1.16 → sglang-0.1.17}/sglang/srt/models/llama2.py +19 -15
- {sglang-0.1.16 → sglang-0.1.17}/sglang/srt/models/llava.py +84 -20
- {sglang-0.1.16 → sglang-0.1.17}/sglang/srt/models/llavavid.py +11 -20
- sglang-0.1.17/sglang/srt/models/mixtral.py +513 -0
- sglang-0.1.16/sglang/srt/models/mixtral.py → sglang-0.1.17/sglang/srt/models/mixtral_quant.py +12 -22
- {sglang-0.1.16 → sglang-0.1.17}/sglang/srt/models/qwen.py +9 -13
- {sglang-0.1.16 → sglang-0.1.17}/sglang/srt/models/qwen2.py +11 -13
- {sglang-0.1.16 → sglang-0.1.17}/sglang/srt/models/stablelm.py +9 -15
- {sglang-0.1.16 → sglang-0.1.17}/sglang/srt/models/yivl.py +17 -22
- {sglang-0.1.16 → sglang-0.1.17}/sglang/srt/openai_api_adapter.py +140 -95
- {sglang-0.1.16 → sglang-0.1.17}/sglang/srt/openai_protocol.py +10 -1
- {sglang-0.1.16 → sglang-0.1.17}/sglang/srt/server.py +77 -42
- {sglang-0.1.16 → sglang-0.1.17}/sglang/srt/server_args.py +51 -6
- {sglang-0.1.16 → sglang-0.1.17}/sglang/srt/utils.py +124 -66
- {sglang-0.1.16 → sglang-0.1.17}/sglang/test/test_programs.py +44 -0
- {sglang-0.1.16 → sglang-0.1.17}/sglang/test/test_utils.py +32 -1
- {sglang-0.1.16 → sglang-0.1.17}/sglang/utils.py +22 -4
- {sglang-0.1.16 → sglang-0.1.17/sglang.egg-info}/PKG-INFO +15 -9
- {sglang-0.1.16 → sglang-0.1.17}/sglang.egg-info/SOURCES.txt +12 -3
- {sglang-0.1.16 → sglang-0.1.17}/sglang.egg-info/requires.txt +5 -1
- sglang-0.1.16/sglang/srt/backend_config.py +0 -13
- sglang-0.1.16/sglang/srt/managers/detokenizer_manager.py +0 -95
- sglang-0.1.16/sglang/srt/models/dbrx_config.py +0 -281
- sglang-0.1.16/sglang/srt/weight_utils.py +0 -417
- {sglang-0.1.16 → sglang-0.1.17}/LICENSE +0 -0
- {sglang-0.1.16 → sglang-0.1.17}/setup.cfg +0 -0
- {sglang-0.1.16 → sglang-0.1.17}/sglang/backend/__init__.py +0 -0
- {sglang-0.1.16 → sglang-0.1.17}/sglang/backend/base_backend.py +0 -0
- {sglang-0.1.16 → sglang-0.1.17}/sglang/backend/vertexai.py +0 -0
- {sglang-0.1.16 → sglang-0.1.17}/sglang/lang/__init__.py +0 -0
- {sglang-0.1.16 → sglang-0.1.17}/sglang/lang/chat_template.py +0 -0
- {sglang-0.1.16 → sglang-0.1.17}/sglang/lang/compiler.py +0 -0
- {sglang-0.1.16 → sglang-0.1.17}/sglang/lang/tracer.py +0 -0
- {sglang-0.1.16 → sglang-0.1.17}/sglang/launch_server.py +0 -0
- {sglang-0.1.16 → sglang-0.1.17}/sglang/launch_server_llavavid.py +0 -0
- {sglang-0.1.16 → sglang-0.1.17}/sglang/srt/constrained/__init__.py +0 -0
- {sglang-0.1.16 → sglang-0.1.17}/sglang/srt/constrained/base_cache.py +0 -0
- {sglang-0.1.16 → sglang-0.1.17}/sglang/srt/constrained/jump_forward.py +0 -0
- {sglang-0.1.16 → sglang-0.1.17}/sglang/srt/conversation.py +0 -0
- {sglang-0.1.16 → sglang-0.1.17}/sglang/srt/layers/context_flashattention_nopad.py +0 -0
- {sglang-0.1.16 → sglang-0.1.17}/sglang/srt/memory_pool.py +0 -0
- {sglang-0.1.16 → sglang-0.1.17}/sglang/srt/mm_utils.py +0 -0
- {sglang-0.1.16 → sglang-0.1.17}/sglang/srt/models/mistral.py +0 -0
- {sglang-0.1.16 → sglang-0.1.17}/sglang/srt/sampling_params.py +0 -0
- {sglang-0.1.16 → sglang-0.1.17}/sglang/test/test_conversation.py +0 -0
- {sglang-0.1.16 → sglang-0.1.17}/sglang/test/test_openai_protocol.py +0 -0
- {sglang-0.1.16 → sglang-0.1.17}/sglang.egg-info/dependency_links.txt +0 -0
- {sglang-0.1.16 → sglang-0.1.17}/sglang.egg-info/top_level.txt +0 -0
{sglang-0.1.16/sglang.egg-info → sglang-0.1.17}/PKG-INFO

````diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.1.16
+Version: 0.1.17
 Summary: A structured generation langauge for LLMs.
 License: Apache License
         Version 2.0, January 2004
@@ -222,7 +222,7 @@ Requires-Dist: torch; extra == "srt"
 Requires-Dist: uvloop; extra == "srt"
 Requires-Dist: uvicorn; extra == "srt"
 Requires-Dist: zmq; extra == "srt"
-Requires-Dist: vllm
+Requires-Dist: vllm==0.4.3; extra == "srt"
 Requires-Dist: interegular; extra == "srt"
 Requires-Dist: pydantic; extra == "srt"
 Requires-Dist: pillow; extra == "srt"
@@ -237,10 +237,13 @@ Requires-Dist: tiktoken; extra == "openai"
 Provides-Extra: anthropic
 Requires-Dist: anthropic>=0.20.0; extra == "anthropic"
 Requires-Dist: numpy; extra == "anthropic"
+Provides-Extra: litellm
+Requires-Dist: litellm>=1.0.0; extra == "litellm"
 Provides-Extra: all
 Requires-Dist: sglang[srt]; extra == "all"
 Requires-Dist: sglang[openai]; extra == "all"
 Requires-Dist: sglang[anthropic]; extra == "all"
+Requires-Dist: sglang[litellm]; extra == "all"
 
 <div align="center">
 <img src="assets/logo.png" alt="logo" width="400"></img>
@@ -253,9 +256,9 @@ Requires-Dist: sglang[anthropic]; extra == "all"
 SGLang is a structured generation language designed for large language models (LLMs).
 It makes your interaction with LLMs faster and more controllable by co-designing the frontend language and the runtime system.
 
-The core features
+The core features include:
 - **A Flexible Front-End Language**: This allows for easy programming of LLM applications with multiple chained generation calls, advanced prompting techniques, control flow, multiple modalities, parallelism, and external interaction.
-- **A High-Performance Runtime with RadixAttention**: This feature significantly accelerates the execution of complex LLM programs by
+- **A High-Performance Runtime with RadixAttention**: This feature significantly accelerates the execution of complex LLM programs by automatically reusing the KV cache across multiple calls. It can also be used as a standalone serving engine with all common techniques implemented, such as continuous batching and tensor parallelism.
 
 ## News
 - [2024/02] 🔥 SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).
@@ -288,12 +291,8 @@ pip install -e "python[all]"
 ```
 
 ### Notes
-- If you are using older GPUs (NVIDIA V100, T4), please pick the correct triton compiler version to avoid some known bugs.
-  - For NVIDIA T4, please use `pip install "triton>=2.2.0"`.
-  - For NVIDIA V100, please install the [nightly](https://triton-lang.org/main/getting-started/installation.html) version.
 - If you only need to use the OpenAI backend, you can avoid installing other dependencies by using `pip install "sglang[openai]"`
 
-
 ## Quick Start
 The example below shows how to use sglang to answer a mulit-turn question.
 
@@ -603,11 +602,16 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
 ```
 python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --tp 2
 ```
+- Add `--dp 2` to enable data parallelism. It can also be used together with tp. Data parallelism is better for throughput if there is enough memory.
+```
+python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --dp 2 --tp 2
+```
 - If you see out-of-memory errors during serving, please try to reduce the memory usage of the KV cache pool by setting a smaller value of `--mem-fraction-static`. The default value is `0.9`
 ```
 python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --mem-fraction-static 0.7
 ```
--
+- See [flashinfer.md](docs/flashinfer.md) on accelerating inference using highly optimized CUDA kernels.
+- See [hyperparameter_tuning.md](docs/hyperparameter_tuning.md) on tuning hyperparameters for better performance.
 
 ### Supported Models
 - Llama
@@ -621,6 +625,8 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
   - `python3 -m sglang.launch_server --model-path liuhaotian/llava-v1.5-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --chat-template vicuna_v1.1 --port 30000`
   - `python3 -m sglang.launch_server --model-path liuhaotian/llava-v1.6-vicuna-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --chat-template vicuna_v1.1 --port 30000`
   - `python3 -m sglang.launch_server --model-path liuhaotian/llava-v1.6-34b --tokenizer-path liuhaotian/llava-v1.6-34b-tokenizer --port 3000`
+- LLaVA-NeXT-Video
+  - see [srt_example_llava_v.sh](examples/usage/llava_video/srt_example_llava_v.sh)
 - Yi-VL
   - see [srt_example_yi_vl.py](examples/quick_start/srt_example_yi_vl.py).
 - StableLM
````
{sglang-0.1.16 → sglang-0.1.17}/README.md

````diff
@@ -9,9 +9,9 @@
 SGLang is a structured generation language designed for large language models (LLMs).
 It makes your interaction with LLMs faster and more controllable by co-designing the frontend language and the runtime system.
 
-The core features
+The core features include:
 - **A Flexible Front-End Language**: This allows for easy programming of LLM applications with multiple chained generation calls, advanced prompting techniques, control flow, multiple modalities, parallelism, and external interaction.
-- **A High-Performance Runtime with RadixAttention**: This feature significantly accelerates the execution of complex LLM programs by
+- **A High-Performance Runtime with RadixAttention**: This feature significantly accelerates the execution of complex LLM programs by automatically reusing the KV cache across multiple calls. It can also be used as a standalone serving engine with all common techniques implemented, such as continuous batching and tensor parallelism.
 
 ## News
 - [2024/02] 🔥 SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).
@@ -44,12 +44,8 @@ pip install -e "python[all]"
 ```
 
 ### Notes
-- If you are using older GPUs (NVIDIA V100, T4), please pick the correct triton compiler version to avoid some known bugs.
-  - For NVIDIA T4, please use `pip install "triton>=2.2.0"`.
-  - For NVIDIA V100, please install the [nightly](https://triton-lang.org/main/getting-started/installation.html) version.
 - If you only need to use the OpenAI backend, you can avoid installing other dependencies by using `pip install "sglang[openai]"`
 
-
 ## Quick Start
 The example below shows how to use sglang to answer a mulit-turn question.
 
@@ -359,11 +355,16 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
 ```
 python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --tp 2
 ```
+- Add `--dp 2` to enable data parallelism. It can also be used together with tp. Data parallelism is better for throughput if there is enough memory.
+```
+python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --dp 2 --tp 2
+```
 - If you see out-of-memory errors during serving, please try to reduce the memory usage of the KV cache pool by setting a smaller value of `--mem-fraction-static`. The default value is `0.9`
 ```
 python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --mem-fraction-static 0.7
 ```
--
+- See [flashinfer.md](docs/flashinfer.md) on accelerating inference using highly optimized CUDA kernels.
+- See [hyperparameter_tuning.md](docs/hyperparameter_tuning.md) on tuning hyperparameters for better performance.
 
 ### Supported Models
 - Llama
@@ -377,6 +378,8 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
   - `python3 -m sglang.launch_server --model-path liuhaotian/llava-v1.5-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --chat-template vicuna_v1.1 --port 30000`
   - `python3 -m sglang.launch_server --model-path liuhaotian/llava-v1.6-vicuna-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --chat-template vicuna_v1.1 --port 30000`
   - `python3 -m sglang.launch_server --model-path liuhaotian/llava-v1.6-34b --tokenizer-path liuhaotian/llava-v1.6-34b-tokenizer --port 3000`
+- LLaVA-NeXT-Video
+  - see [srt_example_llava_v.sh](examples/usage/llava_video/srt_example_llava_v.sh)
 - Yi-VL
   - see [srt_example_yi_vl.py](examples/quick_start/srt_example_yi_vl.py).
 - StableLM
@@ -410,4 +413,4 @@ https://github.com/sgl-project/sglang/issues/157
 }
 ```
 
-We learned from the design and reused some code of the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), [LMQL](https://github.com/eth-sri/lmql).
+We learned from the design and reused some code of the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), [LMQL](https://github.com/eth-sri/lmql).
````
{sglang-0.1.16 → sglang-0.1.17}/pyproject.toml

````diff
@@ -4,8 +4,8 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "sglang"
-version = "0.1.16"
-description = "A structured generation langauge for LLMs."
+version = "0.1.17"
+description = "A structured generation langauge for LLMs."
 readme = "README.md"
 requires-python = ">=3.8"
 license = {file = "LICENSE"}
@@ -20,10 +20,11 @@ dependencies = [
 
 [project.optional-dependencies]
 srt = ["aiohttp", "fastapi", "psutil", "rpyc", "torch", "uvloop", "uvicorn",
-       "zmq", "vllm
+       "zmq", "vllm==0.4.3", "interegular", "pydantic", "pillow", "packaging", "huggingface_hub", "hf_transfer", "outlines>=0.0.34"]
 openai = ["openai>=1.0", "numpy", "tiktoken"]
 anthropic = ["anthropic>=0.20.0", "numpy"]
-
+litellm = ["litellm>=1.0.0"]
+all = ["sglang[srt]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
 
 [project.urls]
 "Homepage" = "https://github.com/sgl-project/sglang"
````
{sglang-0.1.16 → sglang-0.1.17}/sglang/__init__.py

````diff
@@ -1,4 +1,4 @@
-__version__ = "0.1.16"
+__version__ = "0.1.17"
 
 # SGL API Components
 from sglang.api import (
@@ -27,6 +27,7 @@ from sglang.backend.anthropic import Anthropic
 from sglang.backend.openai import OpenAI
 from sglang.backend.runtime_endpoint import RuntimeEndpoint
 from sglang.backend.vertexai import VertexAI
+from sglang.backend.litellm import LiteLLM
 
 # Global Configurations
 from sglang.global_config import global_config
@@ -35,6 +36,7 @@ from sglang.global_config import global_config
 __all__ = [
     "global_config",
     "Anthropic",
+    "LiteLLM",
     "OpenAI",
     "RuntimeEndpoint",
     "VertexAI",
````
{sglang-0.1.16 → sglang-0.1.17}/sglang/api.py

````diff
@@ -20,13 +20,13 @@ from sglang.lang.ir import (
 
 
 def function(
-    func: Optional[Callable] = None,
+    func: Optional[Callable] = None, num_api_spec_tokens: Optional[int] = None
 ):
     if func:
-        return SglFunction(func,
+        return SglFunction(func, num_api_spec_tokens=num_api_spec_tokens)
 
     def decorator(func):
-        return SglFunction(func,
+        return SglFunction(func, num_api_spec_tokens=num_api_spec_tokens)
 
     return decorator
 
````
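The `@function` decorator now forwards a `num_api_spec_tokens` argument into `SglFunction`, which is what enables API speculative execution in the OpenAI backend further down in this diff. A minimal sketch of opting in, assuming the standard `sgl.system`/`sgl.user`/`sgl.assistant`/`sgl.gen` primitives; the prompts and the budget of 128 tokens are illustrative only:

```python
import sglang as sgl

# Hypothetical program: num_api_spec_tokens asks the backend to speculatively
# generate ahead so that several sgl.gen calls can be matched against fewer
# API requests (see _prepare_spec_execution / role_end_generate below).
@sgl.function(num_api_spec_tokens=128)
def multi_turn(s, question_1, question_2):
    s += sgl.system("You are a helpful assistant.")
    s += sgl.user(question_1)
    s += sgl.assistant(sgl.gen("answer_1"))
    s += sgl.user(question_2)
    s += sgl.assistant(sgl.gen("answer_2"))
```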
sglang-0.1.17/sglang/backend/litellm.py (new file)

````diff
@@ -0,0 +1,90 @@
+from typing import Mapping, Optional
+
+from sglang.backend.base_backend import BaseBackend
+from sglang.lang.chat_template import get_chat_template_by_model_path
+from sglang.lang.interpreter import StreamExecutor
+from sglang.lang.ir import SglSamplingParams
+
+try:
+    import litellm
+except ImportError as e:
+    litellm = e
+litellm.num_retries = 1
+
+
+class LiteLLM(BaseBackend):
+
+    def __init__(
+        self,
+        model_name,
+        chat_template=None,
+        api_key=None,
+        organization: Optional[str] = None,
+        base_url: Optional[str] = None,
+        timeout: Optional[float] = 600,
+        max_retries: Optional[int] = litellm.num_retries,
+        default_headers: Optional[Mapping[str, str]] = None,
+    ):
+        super().__init__()
+
+        if isinstance(litellm, Exception):
+            raise litellm
+
+        self.model_name = model_name
+
+        self.chat_template = chat_template or get_chat_template_by_model_path(
+            model_name)
+
+        self.client_params = {
+            "api_key": api_key,
+            "organization": organization,
+            "base_url": base_url,
+            "timeout": timeout,
+            "max_retries": max_retries,
+            "default_headers": default_headers,
+        }
+
+    def get_chat_template(self):
+        return self.chat_template
+
+    def generate(
+        self,
+        s: StreamExecutor,
+        sampling_params: SglSamplingParams,
+    ):
+        if s.messages_:
+            messages = s.messages_
+        else:
+            messages = [{"role": "user", "content": s.text_}]
+
+        ret = litellm.completion(
+            model=self.model_name,
+            messages=messages,
+            **self.client_params,
+            **sampling_params.to_anthropic_kwargs(),
+        )
+        comp = ret.choices[0].message.content
+
+        return comp, {}
+
+    def generate_stream(
+        self,
+        s: StreamExecutor,
+        sampling_params: SglSamplingParams,
+    ):
+        if s.messages_:
+            messages = s.messages_
+        else:
+            messages = [{"role": "user", "content": s.text_}]
+
+        ret = litellm.completion(
+            model=self.model_name,
+            messages=messages,
+            stream=True,
+            **self.client_params,
+            **sampling_params.to_litellm_kwargs(),
+        )
+        for chunk in ret:
+            text = chunk.choices[0].delta.content
+            if text is not None:
+                yield text, {}
````
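The new backend delegates to `litellm.completion`, so any provider that LiteLLM routes to can be driven from the SGLang frontend. A minimal usage sketch, not taken from the diff; the model name is illustrative and credentials are whatever LiteLLM normally reads from the environment:

```python
import sglang as sgl

# Route frontend programs through the new LiteLLM backend (model string is illustrative).
sgl.set_default_backend(sgl.LiteLLM("gpt-3.5-turbo"))

@sgl.function
def hello(s):
    s += sgl.user("Say hello in one short sentence.")
    s += sgl.assistant(sgl.gen("reply", max_tokens=32))

state = hello.run()
print(state["reply"])
```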
{sglang-0.1.16 → sglang-0.1.17}/sglang/backend/openai.py

````diff
@@ -1,5 +1,7 @@
 import logging
 import time
+import warnings
+import dataclasses
 from typing import Callable, List, Optional, Union
 
 import numpy as np
@@ -41,6 +43,15 @@ INSTRUCT_MODEL_NAMES = [
 ]
 
 
+@dataclasses.dataclass
+class TokenUsage:
+    prompt_tokens: int
+    completion_tokens: int
+
+    def reset(self):
+        self.prompt_tokens = self.completion_tokens = 0
+
+
 class OpenAI(BaseBackend):
     def __init__(
         self,
@@ -80,40 +91,89 @@ class OpenAI(BaseBackend):
         else:
             self.is_chat_model = True
 
-        self.
+        self.chat_prefix = self.chat_template.role_prefix_and_suffix["assistant"][0]
+
+        # Usage
+        self.token_usage = TokenUsage(0, 0)
+
+        # API speculative execution
+        # TODO(ying): This does not support multi-threading (run_batch)
+        self.spec_kwargs = {}
+        self.spec_format = []
+        self.spec_max_num_tries = 3
 
     def get_chat_template(self):
         return self.chat_template
 
+    def _prepare_spec_execution(self, sampling_params: SglSamplingParams,
+                                num_api_spec_tokens: int, spec_var_name: str):
+        if "max_tokens" not in self.spec_kwargs:
+            self.spec_kwargs["max_tokens"] = num_api_spec_tokens
+        else:
+            assert (
+                self.spec_kwargs["max_tokens"] == num_api_spec_tokens
+            )
+
+        params = sampling_params.to_openai_kwargs()
+        for key, value in params.items():
+            if key in ["stop"]:
+                continue
+            if key in ["max_tokens"]:
+                warnings.warn(
+                    "The parameter max_tokens will be overwritten by speculated number of tokens."
+                )
+                continue
+            if key not in self.spec_kwargs:
+                self.spec_kwargs[key] = value
+            else:
+                assert (
+                    value == self.spec_kwargs[key]
+                ), "sampling parameters should be consistent if turn on api speculative execution."
+        self.spec_format.append(
+            {"text": "", "stop": params["stop"], "name": spec_var_name}
+        )
+        return "", {}
+
     def generate(
         self,
         s: StreamExecutor,
         sampling_params: SglSamplingParams,
+        spec_var_name: str = None,
     ):
         if sampling_params.dtype is None:
             if self.is_chat_model:
-                if
-
-
-
-
-
+                if s.num_api_spec_tokens is None:
+                    if not s.text_.endswith(self.chat_prefix):
+                        raise RuntimeError(
+                            "This use case is not supported if api speculative execution is off. "
+                            "For OpenAI chat models, sgl.gen must be right after sgl.assistant. "
+                            "Example of adding api speculative execution: @function(num_api_spec_tokens=128)."
+                        )
+                    prompt = s.messages_
+                else:
+                    return self._prepare_spec_execution(sampling_params,
+                        s.num_api_spec_tokens, spec_var_name)
             else:
                 prompt = s.text_
 
             kwargs = sampling_params.to_openai_kwargs()
             comp = openai_completion(
                 client=self.client,
+                token_usage=self.token_usage,
                 is_chat=self.is_chat_model,
                 model=self.model_name,
                 prompt=prompt,
                 **kwargs,
             )
         elif sampling_params.dtype in [str, "str", "string"]:
+            assert (
+                not self.is_chat_model
+            ), "constrained type not supported on chat model"
             kwargs = sampling_params.to_openai_kwargs()
             kwargs.pop("stop")
             comp = openai_completion(
                 client=self.client,
+                token_usage=self.token_usage,
                 is_chat=self.is_chat_model,
                 model=self.model_name,
                 prompt=s.text_ + '"',
@@ -122,10 +182,14 @@ class OpenAI(BaseBackend):
             )
             comp = '"' + comp + '"'
         elif sampling_params.dtype in [int, "int"]:
+            assert (
+                not self.is_chat_model
+            ), "constrained type not supported on chat model"
             kwargs = sampling_params.to_openai_kwargs()
             kwargs.pop("stop")
             comp = openai_completion(
                 client=self.client,
+                token_usage=self.token_usage,
                 is_chat=self.is_chat_model,
                 model=self.model_name,
                 prompt=s.text_,
@@ -138,6 +202,63 @@ class OpenAI(BaseBackend):
 
         return comp, {}
 
+    def spec_fill(self, value: str):
+        assert self.is_chat_model
+        self.spec_format.append({"text": value, "stop": None, "name": None})
+
+    def spec_pattern_match(self, comp):
+        for i, term in enumerate(self.spec_format):
+            text = term["text"]
+            if text != "":
+                if comp.startswith(text):
+                    comp = comp[len(text) :]
+                else:
+                    return False
+            else:
+                pos = comp.find(term["stop"])
+                if pos != -1:
+                    term["text"] = comp[:pos]
+                    comp = comp[pos:]
+                else:
+                    if i == len(self.spec_format) - 1:
+                        term["text"] = comp
+                    else:
+                        return False
+        return True
+
+    def role_end_generate(
+        self,
+        s: StreamExecutor,
+    ):
+        if s.num_api_spec_tokens is None or not s.text_.endswith(self.chat_prefix):
+            return
+
+        comp = ""
+        if not all(x["name"] is None for x in self.spec_format):
+            # TODO(ying): throw errors or warnings
+            for i in range(self.spec_max_num_tries):
+                comp = openai_completion(
+                    client=self.client,
+                    token_usage=self.token_usage,
+                    is_chat=self.is_chat_model,
+                    model=self.model_name,
+                    prompt=s.messages_,
+                    **self.spec_kwargs,
+                )
+                if self.spec_pattern_match(comp):
+                    break
+
+        for term in self.spec_format:
+            s.text_ += term["text"]
+            name = term["name"]
+            if name is not None:
+                s.variables[name] = term["text"]
+                s.meta_info[name] = {}
+                s.variable_event[name].set()
+
+        self.spec_kwargs = {}
+        self.spec_format = []
+
     def generate_stream(
         self,
         s: StreamExecutor,
@@ -145,7 +266,7 @@ class OpenAI(BaseBackend):
     ):
         if sampling_params.dtype is None:
             if self.is_chat_model:
-                if not s.text_.endswith(self.
+                if not s.text_.endswith(self.chat_prefix):
                     raise RuntimeError(
                         "This use case is not supported. "
                         "For OpenAI chat models, sgl.gen must be right after sgl.assistant"
@@ -157,6 +278,7 @@ class OpenAI(BaseBackend):
             kwargs = sampling_params.to_openai_kwargs()
             generator = openai_completion_stream(
                 client=self.client,
+                token_usage=self.token_usage,
                 is_chat=self.is_chat_model,
                 model=self.model_name,
                 prompt=prompt,
@@ -202,6 +324,8 @@ class OpenAI(BaseBackend):
         )
         ret_str = ret.choices[0].text
         ret_token = self.tokenizer.encode(ret_str)[0]
+        self.token_usage.prompt_tokens += ret.usage.prompt_tokens
+        self.token_usage.completion_tokens= ret.usage.completion_tokens
 
         # TODO:
         # 1. return logits as the scores
@@ -231,7 +355,7 @@ class OpenAI(BaseBackend):
         return decision, scores, None, None
 
 
-def openai_completion(client, retries=3, is_chat=None, prompt=None, **kwargs):
+def openai_completion(client, token_usage, is_chat=None, retries=3, prompt=None, **kwargs):
     for attempt in range(retries):
         try:
             if is_chat:
@@ -245,6 +369,9 @@ def openai_completion(client, retries=3, is_chat=None, prompt=None, **kwargs):
                 comp = [c.text for c in ret.choices]
             else:
                 comp = ret.choices[0].text
+
+            token_usage.prompt_tokens += ret.usage.prompt_tokens
+            token_usage.completion_tokens += ret.usage.completion_tokens
             break
         except (openai.APIError, openai.APIConnectionError, openai.RateLimitError) as e:
             logger.error(f"OpenAI Error: {e}. Waiting 5 seconds...")
@@ -258,16 +385,19 @@ def openai_completion(client, retries=3, is_chat=None, prompt=None, **kwargs):
     return comp
 
 
-def openai_completion_stream(client, retries=3, is_chat=None, prompt=None, **kwargs):
+def openai_completion_stream(client, token_usage, is_chat=None, retries=3, prompt=None, **kwargs):
    for attempt in range(retries):
        try:
            if is_chat:
                if "stop" in kwargs and kwargs["stop"] is None:
                    kwargs.pop("stop")
                generator = client.chat.completions.create(
-                    messages=prompt, stream=True,
+                    messages=prompt, stream=True, stream_options={"include_usage": True},
+                    **kwargs
                )
                for ret in generator:
+                    if len(ret.choices) == 0:
+                        continue
                    try:
                        content = ret.choices[0].delta.content
                    except IndexError:
@@ -275,11 +405,17 @@ def openai_completion_stream(client, retries=3, is_chat=None, prompt=None, **kwargs):
                    yield content or "", {}
            else:
                generator = client.completions.create(
-                    prompt=prompt, stream=True,
+                    prompt=prompt, stream=True, stream_options={"include_usage": True},
+                    **kwargs
                )
                for ret in generator:
+                    if len(ret.choices) == 0:
+                        continue
                    content = ret.choices[0].text
                    yield content or "", {}
+
+            token_usage.prompt_tokens += ret.usage.prompt_tokens
+            token_usage.completion_tokens += ret.usage.completion_tokens
            break
        except (openai.APIError, openai.APIConnectionError, openai.RateLimitError) as e:
            logger.error(f"OpenAI Error: {e}. Waiting 5 seconds...")
````