sglang-0.1.17.tar.gz → sglang-0.1.18.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sglang-0.1.17/sglang.egg-info → sglang-0.1.18}/PKG-INFO +29 -22
- {sglang-0.1.17 → sglang-0.1.18}/README.md +18 -10
- {sglang-0.1.17 → sglang-0.1.18}/pyproject.toml +6 -5
- {sglang-0.1.17 → sglang-0.1.18}/sglang/__init__.py +2 -2
- {sglang-0.1.17 → sglang-0.1.18}/sglang/api.py +4 -4
- {sglang-0.1.17 → sglang-0.1.18}/sglang/backend/litellm.py +2 -2
- {sglang-0.1.17 → sglang-0.1.18}/sglang/backend/openai.py +26 -15
- sglang-0.1.18/sglang/bench_latency.py +299 -0
- {sglang-0.1.17 → sglang-0.1.18}/sglang/global_config.py +4 -1
- {sglang-0.1.17 → sglang-0.1.18}/sglang/lang/compiler.py +2 -2
- {sglang-0.1.17 → sglang-0.1.18}/sglang/lang/interpreter.py +1 -1
- {sglang-0.1.17 → sglang-0.1.18}/sglang/lang/ir.py +15 -5
- {sglang-0.1.17 → sglang-0.1.18}/sglang/launch_server.py +4 -1
- {sglang-0.1.17 → sglang-0.1.18}/sglang/launch_server_llavavid.py +2 -1
- {sglang-0.1.17 → sglang-0.1.18}/sglang/srt/constrained/__init__.py +13 -6
- {sglang-0.1.17 → sglang-0.1.18}/sglang/srt/constrained/fsm_cache.py +6 -3
- sglang-0.1.18/sglang/srt/constrained/jump_forward.py +164 -0
- {sglang-0.1.17 → sglang-0.1.18}/sglang/srt/conversation.py +2 -0
- {sglang-0.1.17 → sglang-0.1.18}/sglang/srt/flush_cache.py +2 -0
- {sglang-0.1.17 → sglang-0.1.18}/sglang/srt/hf_transformers_utils.py +64 -9
- {sglang-0.1.17 → sglang-0.1.18}/sglang/srt/layers/fused_moe.py +186 -89
- {sglang-0.1.17 → sglang-0.1.18}/sglang/srt/layers/logits_processor.py +53 -25
- {sglang-0.1.17 → sglang-0.1.18}/sglang/srt/layers/radix_attention.py +34 -7
- {sglang-0.1.17 → sglang-0.1.18}/sglang/srt/managers/controller/dp_worker.py +6 -3
- {sglang-0.1.17 → sglang-0.1.18}/sglang/srt/managers/controller/infer_batch.py +142 -67
- {sglang-0.1.17 → sglang-0.1.18}/sglang/srt/managers/controller/manager_multi.py +5 -5
- {sglang-0.1.17 → sglang-0.1.18}/sglang/srt/managers/controller/manager_single.py +8 -3
- {sglang-0.1.17 → sglang-0.1.18}/sglang/srt/managers/controller/model_runner.py +154 -54
- {sglang-0.1.17 → sglang-0.1.18}/sglang/srt/managers/controller/radix_cache.py +4 -0
- {sglang-0.1.17 → sglang-0.1.18}/sglang/srt/managers/controller/schedule_heuristic.py +2 -0
- {sglang-0.1.17 → sglang-0.1.18}/sglang/srt/managers/controller/tp_worker.py +140 -135
- {sglang-0.1.17 → sglang-0.1.18}/sglang/srt/managers/detokenizer_manager.py +15 -19
- {sglang-0.1.17 → sglang-0.1.18}/sglang/srt/managers/io_struct.py +10 -4
- {sglang-0.1.17 → sglang-0.1.18}/sglang/srt/managers/tokenizer_manager.py +14 -13
- sglang-0.1.18/sglang/srt/model_config.py +125 -0
- sglang-0.1.18/sglang/srt/models/chatglm.py +399 -0
- {sglang-0.1.17 → sglang-0.1.18}/sglang/srt/models/commandr.py +2 -2
- {sglang-0.1.17 → sglang-0.1.18}/sglang/srt/models/gemma.py +5 -1
- {sglang-0.1.17 → sglang-0.1.18}/sglang/srt/models/grok.py +204 -137
- {sglang-0.1.17 → sglang-0.1.18}/sglang/srt/models/llama2.py +11 -4
- sglang-0.1.18/sglang/srt/models/llama_classification.py +104 -0
- {sglang-0.1.17 → sglang-0.1.18}/sglang/srt/models/llava.py +11 -8
- {sglang-0.1.17 → sglang-0.1.18}/sglang/srt/models/llavavid.py +1 -1
- {sglang-0.1.17 → sglang-0.1.18}/sglang/srt/models/mixtral.py +164 -115
- {sglang-0.1.17 → sglang-0.1.18}/sglang/srt/models/mixtral_quant.py +0 -1
- {sglang-0.1.17 → sglang-0.1.18}/sglang/srt/models/qwen.py +1 -1
- {sglang-0.1.17 → sglang-0.1.18}/sglang/srt/models/qwen2.py +1 -1
- {sglang-0.1.17 → sglang-0.1.18}/sglang/srt/models/stablelm.py +1 -1
- {sglang-0.1.17 → sglang-0.1.18}/sglang/srt/models/yivl.py +2 -2
- {sglang-0.1.17 → sglang-0.1.18}/sglang/srt/openai_api_adapter.py +33 -23
- {sglang-0.1.17 → sglang-0.1.18}/sglang/srt/openai_protocol.py +1 -1
- {sglang-0.1.17 → sglang-0.1.18}/sglang/srt/server.py +60 -19
- {sglang-0.1.17 → sglang-0.1.18}/sglang/srt/server_args.py +79 -44
- {sglang-0.1.17 → sglang-0.1.18}/sglang/srt/utils.py +146 -37
- {sglang-0.1.17 → sglang-0.1.18}/sglang/test/test_programs.py +28 -10
- {sglang-0.1.17 → sglang-0.1.18}/sglang/utils.py +4 -3
- {sglang-0.1.17 → sglang-0.1.18/sglang.egg-info}/PKG-INFO +29 -22
- {sglang-0.1.17 → sglang-0.1.18}/sglang.egg-info/SOURCES.txt +3 -6
- {sglang-0.1.17 → sglang-0.1.18}/sglang.egg-info/requires.txt +10 -11
- sglang-0.1.17/sglang/srt/constrained/jump_forward.py +0 -76
- sglang-0.1.17/sglang/srt/managers/router/infer_batch.py +0 -596
- sglang-0.1.17/sglang/srt/managers/router/manager.py +0 -82
- sglang-0.1.17/sglang/srt/managers/router/model_rpc.py +0 -818
- sglang-0.1.17/sglang/srt/managers/router/model_runner.py +0 -445
- sglang-0.1.17/sglang/srt/managers/router/radix_cache.py +0 -267
- sglang-0.1.17/sglang/srt/managers/router/scheduler.py +0 -59
- sglang-0.1.17/sglang/srt/model_config.py +0 -46
- {sglang-0.1.17 → sglang-0.1.18}/LICENSE +0 -0
- {sglang-0.1.17 → sglang-0.1.18}/setup.cfg +0 -0
- {sglang-0.1.17 → sglang-0.1.18}/sglang/backend/__init__.py +0 -0
- {sglang-0.1.17 → sglang-0.1.18}/sglang/backend/anthropic.py +0 -0
- {sglang-0.1.17 → sglang-0.1.18}/sglang/backend/base_backend.py +0 -0
- {sglang-0.1.17 → sglang-0.1.18}/sglang/backend/runtime_endpoint.py +0 -0
- {sglang-0.1.17 → sglang-0.1.18}/sglang/backend/vertexai.py +0 -0
- {sglang-0.1.17 → sglang-0.1.18}/sglang/lang/__init__.py +0 -0
- {sglang-0.1.17 → sglang-0.1.18}/sglang/lang/chat_template.py +0 -0
- {sglang-0.1.17 → sglang-0.1.18}/sglang/lang/tracer.py +0 -0
- {sglang-0.1.17 → sglang-0.1.18}/sglang/srt/constrained/base_cache.py +0 -0
- {sglang-0.1.17 → sglang-0.1.18}/sglang/srt/layers/context_flashattention_nopad.py +0 -0
- {sglang-0.1.17 → sglang-0.1.18}/sglang/srt/layers/extend_attention.py +0 -0
- {sglang-0.1.17 → sglang-0.1.18}/sglang/srt/layers/token_attention.py +0 -0
- {sglang-0.1.17 → sglang-0.1.18}/sglang/srt/memory_pool.py +0 -0
- {sglang-0.1.17 → sglang-0.1.18}/sglang/srt/mm_utils.py +0 -0
- {sglang-0.1.17 → sglang-0.1.18}/sglang/srt/models/dbrx.py +1 -1
- {sglang-0.1.17 → sglang-0.1.18}/sglang/srt/models/mistral.py +0 -0
- {sglang-0.1.17 → sglang-0.1.18}/sglang/srt/sampling_params.py +0 -0
- {sglang-0.1.17 → sglang-0.1.18}/sglang/test/test_conversation.py +0 -0
- {sglang-0.1.17 → sglang-0.1.18}/sglang/test/test_openai_protocol.py +0 -0
- {sglang-0.1.17 → sglang-0.1.18}/sglang/test/test_utils.py +0 -0
- {sglang-0.1.17 → sglang-0.1.18}/sglang.egg-info/dependency_links.txt +0 -0
- {sglang-0.1.17 → sglang-0.1.18}/sglang.egg-info/top_level.txt +0 -0
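The version bump lands in PKG-INFO, `pyproject.toml`, and `sglang/__init__.py` below; a quick way to confirm which release is installed (a trivial sketch assuming nothing beyond the `__version__` string shown in the diff):

```python
import sglang

# Prints "0.1.18" once the new release is installed, "0.1.17" beforehand.
print(sglang.__version__)
```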
{sglang-0.1.17/sglang.egg-info → sglang-0.1.18}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.1.17
+Version: 0.1.18
 Summary: A structured generation langauge for LLMs.
 License: Apache License
         Version 2.0, January 2004

@@ -213,30 +213,29 @@ Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: requests
 Requires-Dist: tqdm
+Requires-Dist: numpy
 Provides-Extra: srt
 Requires-Dist: aiohttp; extra == "srt"
 Requires-Dist: fastapi; extra == "srt"
+Requires-Dist: hf_transfer; extra == "srt"
+Requires-Dist: huggingface_hub; extra == "srt"
+Requires-Dist: interegular; extra == "srt"
+Requires-Dist: packaging; extra == "srt"
+Requires-Dist: pillow; extra == "srt"
 Requires-Dist: psutil; extra == "srt"
+Requires-Dist: pydantic; extra == "srt"
 Requires-Dist: rpyc; extra == "srt"
 Requires-Dist: torch; extra == "srt"
-Requires-Dist: uvloop; extra == "srt"
 Requires-Dist: uvicorn; extra == "srt"
+Requires-Dist: uvloop; extra == "srt"
 Requires-Dist: zmq; extra == "srt"
-Requires-Dist: vllm==0.
-Requires-Dist:
-Requires-Dist: pydantic; extra == "srt"
-Requires-Dist: pillow; extra == "srt"
-Requires-Dist: packaging; extra == "srt"
-Requires-Dist: huggingface_hub; extra == "srt"
-Requires-Dist: hf_transfer; extra == "srt"
-Requires-Dist: outlines>=0.0.34; extra == "srt"
+Requires-Dist: vllm==0.5.0; extra == "srt"
+Requires-Dist: outlines>=0.0.44; extra == "srt"
 Provides-Extra: openai
 Requires-Dist: openai>=1.0; extra == "openai"
-Requires-Dist: numpy; extra == "openai"
 Requires-Dist: tiktoken; extra == "openai"
 Provides-Extra: anthropic
 Requires-Dist: anthropic>=0.20.0; extra == "anthropic"
-Requires-Dist: numpy; extra == "anthropic"
 Provides-Extra: litellm
 Requires-Dist: litellm>=1.0.0; extra == "litellm"
 Provides-Extra: all

@@ -257,8 +256,8 @@ SGLang is a structured generation language designed for large language models (L
 It makes your interaction with LLMs faster and more controllable by co-designing the frontend language and the runtime system.

 The core features include:
-- **
-- **
+- **Flexible Frontend Language**: Enables easy programming of LLM applications with chained generation calls, advanced prompting, control flow, multiple modalities, parallelism, and external interactions.
+- **High-Performance Backend Runtime**: Features RadixAttention for accelerating complex LLM programs by reusing the KV cache across multiple calls. It can also serve as a standalone engine with all common techniques implemented (e.g., continuous batching and tensor parallelism).

 ## News
 - [2024/02] 🔥 SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).

@@ -279,19 +278,27 @@ The core features include:
 ### Method 1: With pip
 ```
 pip install "sglang[all]"
+
+# Install FlashInfer CUDA kernels
+pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
 ```

 ### Method 2: From source
 ```
-git clone
+git clone https://github.com/sgl-project/sglang.git
 cd sglang

 pip install --upgrade pip
 pip install -e "python[all]"
+
+# Install FlashInfer CUDA kernels
+pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
 ```

 ### Notes
-- If you
+- If you see errors from the Triton compiler, please install the [Triton Nightly](https://triton-lang.org/main/getting-started/installation.html).
+- If you cannot install FlashInfer, check out its [installation](https://docs.flashinfer.ai/installation.html#) page. If you still cannot install it, you can use the slower Triton kernels by adding `--disable-flashinfer` when launching the server.
+- If you only need to use the OpenAI backend, you can avoid installing other dependencies by using `pip install "sglang[openai]"`.

 ## Quick Start
 The example below shows how to use sglang to answer a mulit-turn question.

@@ -610,7 +617,6 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
 ```
 python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --mem-fraction-static 0.7
 ```
-- See [flashinfer.md](docs/flashinfer.md) on accelerating inference using highly optimized CUDA kernels.
 - See [hyperparameter_tuning.md](docs/hyperparameter_tuning.md) on tuning hyperparameters for better performance.

 ### Supported Models

@@ -643,17 +649,18 @@ Instructions for supporting a new model are [here](https://github.com/sgl-projec
 - Mixtral-8x7B on NVIDIA A10G, FP16, Tensor Parallelism=8
 

-Learn more [
+- Learn more about the above [results](docs/benchmark_results.md).
+- Synthetic latency and throughput benchmark [scripts](https://github.com/sgl-project/sglang/tree/main/benchmark/latency_throughput).

 ## Roadmap
 https://github.com/sgl-project/sglang/issues/157

 ## Citation And Acknowledgment
 ```
-@misc{
-title={
-author={Lianmin Zheng and Liangsheng Yin and Zhiqiang Xie and
-year={
+@misc{zheng2024sglang,
+title={SGLang: Efficient Execution of Structured Language Model Programs},
+author={Lianmin Zheng and Liangsheng Yin and Zhiqiang Xie and Chuyue Sun and Jeff Huang and Cody Hao Yu and Shiyi Cao and Christos Kozyrakis and Ion Stoica and Joseph E. Gonzalez and Clark Barrett and Ying Sheng},
+year={2024},
 eprint={2312.07104},
 archivePrefix={arXiv},
 primaryClass={cs.AI}
{sglang-0.1.17 → sglang-0.1.18}/README.md

@@ -10,8 +10,8 @@ SGLang is a structured generation language designed for large language models (L
 It makes your interaction with LLMs faster and more controllable by co-designing the frontend language and the runtime system.

 The core features include:
-- **
-- **
+- **Flexible Frontend Language**: Enables easy programming of LLM applications with chained generation calls, advanced prompting, control flow, multiple modalities, parallelism, and external interactions.
+- **High-Performance Backend Runtime**: Features RadixAttention for accelerating complex LLM programs by reusing the KV cache across multiple calls. It can also serve as a standalone engine with all common techniques implemented (e.g., continuous batching and tensor parallelism).

 ## News
 - [2024/02] 🔥 SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).

@@ -32,19 +32,27 @@ The core features include:
 ### Method 1: With pip
 ```
 pip install "sglang[all]"
+
+# Install FlashInfer CUDA kernels
+pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
 ```

 ### Method 2: From source
 ```
-git clone
+git clone https://github.com/sgl-project/sglang.git
 cd sglang

 pip install --upgrade pip
 pip install -e "python[all]"
+
+# Install FlashInfer CUDA kernels
+pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
 ```

 ### Notes
-- If you
+- If you see errors from the Triton compiler, please install the [Triton Nightly](https://triton-lang.org/main/getting-started/installation.html).
+- If you cannot install FlashInfer, check out its [installation](https://docs.flashinfer.ai/installation.html#) page. If you still cannot install it, you can use the slower Triton kernels by adding `--disable-flashinfer` when launching the server.
+- If you only need to use the OpenAI backend, you can avoid installing other dependencies by using `pip install "sglang[openai]"`.

 ## Quick Start
 The example below shows how to use sglang to answer a mulit-turn question.

@@ -363,7 +371,6 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
 ```
 python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --mem-fraction-static 0.7
 ```
-- See [flashinfer.md](docs/flashinfer.md) on accelerating inference using highly optimized CUDA kernels.
 - See [hyperparameter_tuning.md](docs/hyperparameter_tuning.md) on tuning hyperparameters for better performance.

 ### Supported Models

@@ -396,17 +403,18 @@ Instructions for supporting a new model are [here](https://github.com/sgl-projec
 - Mixtral-8x7B on NVIDIA A10G, FP16, Tensor Parallelism=8
 

-Learn more [
+- Learn more about the above [results](docs/benchmark_results.md).
+- Synthetic latency and throughput benchmark [scripts](https://github.com/sgl-project/sglang/tree/main/benchmark/latency_throughput).

 ## Roadmap
 https://github.com/sgl-project/sglang/issues/157

 ## Citation And Acknowledgment
 ```
-@misc{
-title={
-author={Lianmin Zheng and Liangsheng Yin and Zhiqiang Xie and
-year={
+@misc{zheng2024sglang,
+title={SGLang: Efficient Execution of Structured Language Model Programs},
+author={Lianmin Zheng and Liangsheng Yin and Zhiqiang Xie and Chuyue Sun and Jeff Huang and Cody Hao Yu and Shiyi Cao and Christos Kozyrakis and Ion Stoica and Joseph E. Gonzalez and Clark Barrett and Ying Sheng},
+year={2024},
 eprint={2312.07104},
 archivePrefix={arXiv},
 primaryClass={cs.AI}
{sglang-0.1.17 → sglang-0.1.18}/pyproject.toml

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

 [project]
 name = "sglang"
-version = "0.1.17"
+version = "0.1.18"
 description = "A structured generation langauge for LLMs."
 readme = "README.md"
 requires-python = ">=3.8"

@@ -16,13 +16,14 @@ classifiers = [
 dependencies = [
     "requests",
     "tqdm",
+    "numpy",
 ]

 [project.optional-dependencies]
-srt = ["aiohttp", "fastapi", "
-       "
-openai = ["openai>=1.0", "
-anthropic = ["anthropic>=0.20.0"
+srt = ["aiohttp", "fastapi", "hf_transfer", "huggingface_hub", "interegular", "packaging", "pillow",
+       "psutil", "pydantic", "rpyc", "torch", "uvicorn", "uvloop", "zmq", "vllm==0.5.0", "outlines>=0.0.44"]
+openai = ["openai>=1.0", "tiktoken"]
+anthropic = ["anthropic>=0.20.0"]
 litellm = ["litellm>=1.0.0"]
 all = ["sglang[srt]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]

{sglang-0.1.17 → sglang-0.1.18}/sglang/__init__.py

@@ -1,4 +1,4 @@
-__version__ = "0.1.17"
+__version__ = "0.1.18"

 # SGL API Components
 from sglang.api import (

@@ -24,10 +24,10 @@ from sglang.api import (

 # SGL Backends
 from sglang.backend.anthropic import Anthropic
+from sglang.backend.litellm import LiteLLM
 from sglang.backend.openai import OpenAI
 from sglang.backend.runtime_endpoint import RuntimeEndpoint
 from sglang.backend.vertexai import VertexAI
-from sglang.backend.litellm import LiteLLM

 # Global Configurations
 from sglang.global_config import global_config
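The import reordering above keeps `LiteLLM`, `OpenAI`, `RuntimeEndpoint`, and `VertexAI` re-exported from the package top level. A minimal sketch of how these exports are typically used together with `set_default_backend` (the model name, prompt, and function name are illustrative, not taken from this diff):

```python
import sglang as sgl

# Any of the re-exported backends can be set as the default; OpenAI is used here
# only as an illustration (requires OPENAI_API_KEY). The model name is a placeholder.
sgl.set_default_backend(sgl.OpenAI("gpt-3.5-turbo"))

@sgl.function
def qa(s, question):
    s += sgl.user(question)
    s += sgl.assistant(sgl.gen("answer", max_tokens=64))

state = qa.run(question="What does RadixAttention reuse across calls?")
print(state["answer"])
```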
{sglang-0.1.17 → sglang-0.1.18}/sglang/api.py

@@ -1,4 +1,4 @@
-"""
+"""Public APIs of the language."""

 import os
 import re

@@ -43,14 +43,14 @@ def set_default_backend(backend: BaseBackend):
     global_config.default_backend = backend


-def flush_cache(backend: BaseBackend = None):
+def flush_cache(backend: Optional[BaseBackend] = None):
     backend = backend or global_config.default_backend
     if backend is None:
         return False
     return backend.flush_cache()


-def get_server_args(backend: BaseBackend = None):
+def get_server_args(backend: Optional[BaseBackend] = None):
     backend = backend or global_config.default_backend
     if backend is None:
         return None

@@ -158,7 +158,7 @@ def video(path: str, num_frames: int):

 def select(
     name: Optional[str] = None,
-    choices: List[str] = None,
+    choices: Optional[List[str]] = None,
     temperature: float = 0.0,
 ):
     assert choices is not None
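`select()` above now annotates `choices` as `Optional[List[str]]` but still asserts that a list is actually passed. A short sketch of how the primitive is typically called (function and field names are illustrative; a default backend is assumed to be set already):

```python
import sglang as sgl

@sgl.function
def classify(s, review):
    s += "Review: " + review + "\nSentiment: "
    # choices must still be provided explicitly; the Optional annotation only
    # formalizes that the argument defaults to None.
    s += sgl.select("label", choices=["positive", "negative"], temperature=0.0)

state = classify.run(review="Fast installs and even faster decoding.")
print(state["label"])
```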
{sglang-0.1.17 → sglang-0.1.18}/sglang/backend/litellm.py

@@ -13,7 +13,6 @@ except ImportError as e:


 class LiteLLM(BaseBackend):
-
     def __init__(
         self,
         model_name,

@@ -33,7 +32,8 @@ class LiteLLM(BaseBackend):
         self.model_name = model_name

         self.chat_template = chat_template or get_chat_template_by_model_path(
-            model_name)
+            model_name
+        )

         self.client_params = {
             "api_key": api_key,
{sglang-0.1.17 → sglang-0.1.18}/sglang/backend/openai.py

@@ -1,7 +1,7 @@
+import dataclasses
 import logging
 import time
 import warnings
-import dataclasses
 from typing import Callable, List, Optional, Union

 import numpy as np

@@ -105,14 +105,16 @@ class OpenAI(BaseBackend):
     def get_chat_template(self):
         return self.chat_template

-    def _prepare_spec_execution(
-
+    def _prepare_spec_execution(
+        self,
+        sampling_params: SglSamplingParams,
+        num_api_spec_tokens: int,
+        spec_var_name: str,
+    ):
         if "max_tokens" not in self.spec_kwargs:
             self.spec_kwargs["max_tokens"] = num_api_spec_tokens
         else:
-            assert (
-                self.spec_kwargs["max_tokens"] == num_api_spec_tokens
-            )
+            assert self.spec_kwargs["max_tokens"] == num_api_spec_tokens

         params = sampling_params.to_openai_kwargs()
         for key, value in params.items():

@@ -151,8 +153,9 @@ class OpenAI(BaseBackend):
                 )
                 prompt = s.messages_
             else:
-                return self._prepare_spec_execution(
-                    s.num_api_spec_tokens, spec_var_name
+                return self._prepare_spec_execution(
+                    sampling_params, s.num_api_spec_tokens, spec_var_name
+                )
         else:
             prompt = s.text_

@@ -325,7 +328,7 @@ class OpenAI(BaseBackend):
         ret_str = ret.choices[0].text
         ret_token = self.tokenizer.encode(ret_str)[0]
         self.token_usage.prompt_tokens += ret.usage.prompt_tokens
-        self.token_usage.completion_tokens= ret.usage.completion_tokens
+        self.token_usage.completion_tokens = ret.usage.completion_tokens

         # TODO:
         # 1. return logits as the scores

@@ -355,7 +358,9 @@ class OpenAI(BaseBackend):
         return decision, scores, None, None


-def openai_completion(
+def openai_completion(
+    client, token_usage, is_chat=None, retries=3, prompt=None, **kwargs
+):
     for attempt in range(retries):
         try:
             if is_chat:

@@ -385,15 +390,19 @@ def openai_completion(client, token_usage, is_chat=None, retries=3, prompt=None,
             return comp


-def openai_completion_stream(
+def openai_completion_stream(
+    client, token_usage, is_chat=None, retries=3, prompt=None, **kwargs
+):
     for attempt in range(retries):
         try:
             if is_chat:
                 if "stop" in kwargs and kwargs["stop"] is None:
                     kwargs.pop("stop")
                 generator = client.chat.completions.create(
-                    messages=prompt,
-
+                    messages=prompt,
+                    stream=True,
+                    stream_options={"include_usage": True},
+                    **kwargs,
                 )
                 for ret in generator:
                     if len(ret.choices) == 0:

@@ -405,8 +414,10 @@ def openai_completion_stream(client, token_usage, is_chat=None, retries=3, promp
                     yield content or "", {}
             else:
                 generator = client.completions.create(
-                    prompt=prompt,
-
+                    prompt=prompt,
+                    stream=True,
+                    stream_options={"include_usage": True},
+                    **kwargs,
                 )
                 for ret in generator:
                     if len(ret.choices) == 0: