sglang 0.1.16__py3-none-any.whl → 0.1.18__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +3 -1
- sglang/api.py +7 -7
- sglang/backend/anthropic.py +1 -1
- sglang/backend/litellm.py +90 -0
- sglang/backend/openai.py +158 -11
- sglang/backend/runtime_endpoint.py +18 -10
- sglang/bench_latency.py +299 -0
- sglang/global_config.py +12 -2
- sglang/lang/compiler.py +2 -2
- sglang/lang/interpreter.py +114 -67
- sglang/lang/ir.py +28 -3
- sglang/launch_server.py +4 -1
- sglang/launch_server_llavavid.py +2 -1
- sglang/srt/constrained/__init__.py +13 -6
- sglang/srt/constrained/fsm_cache.py +8 -2
- sglang/srt/constrained/jump_forward.py +113 -25
- sglang/srt/conversation.py +2 -0
- sglang/srt/flush_cache.py +3 -1
- sglang/srt/hf_transformers_utils.py +130 -1
- sglang/srt/layers/extend_attention.py +17 -0
- sglang/srt/layers/fused_moe.py +582 -0
- sglang/srt/layers/logits_processor.py +65 -32
- sglang/srt/layers/radix_attention.py +41 -7
- sglang/srt/layers/token_attention.py +16 -1
- sglang/srt/managers/controller/dp_worker.py +113 -0
- sglang/srt/managers/{router → controller}/infer_batch.py +242 -100
- sglang/srt/managers/controller/manager_multi.py +191 -0
- sglang/srt/managers/{router/manager.py → controller/manager_single.py} +34 -14
- sglang/srt/managers/{router → controller}/model_runner.py +262 -158
- sglang/srt/managers/{router → controller}/radix_cache.py +11 -1
- sglang/srt/managers/{router/scheduler.py → controller/schedule_heuristic.py} +9 -7
- sglang/srt/managers/{router/model_rpc.py → controller/tp_worker.py} +298 -267
- sglang/srt/managers/detokenizer_manager.py +42 -46
- sglang/srt/managers/io_struct.py +22 -12
- sglang/srt/managers/tokenizer_manager.py +151 -87
- sglang/srt/model_config.py +83 -5
- sglang/srt/models/chatglm.py +399 -0
- sglang/srt/models/commandr.py +10 -13
- sglang/srt/models/dbrx.py +9 -15
- sglang/srt/models/gemma.py +12 -15
- sglang/srt/models/grok.py +738 -0
- sglang/srt/models/llama2.py +26 -15
- sglang/srt/models/llama_classification.py +104 -0
- sglang/srt/models/llava.py +86 -19
- sglang/srt/models/llavavid.py +11 -20
- sglang/srt/models/mixtral.py +282 -103
- sglang/srt/models/mixtral_quant.py +372 -0
- sglang/srt/models/qwen.py +9 -13
- sglang/srt/models/qwen2.py +11 -13
- sglang/srt/models/stablelm.py +9 -15
- sglang/srt/models/yivl.py +17 -22
- sglang/srt/openai_api_adapter.py +150 -95
- sglang/srt/openai_protocol.py +11 -2
- sglang/srt/server.py +124 -48
- sglang/srt/server_args.py +128 -48
- sglang/srt/utils.py +234 -67
- sglang/test/test_programs.py +65 -3
- sglang/test/test_utils.py +32 -1
- sglang/utils.py +23 -4
- {sglang-0.1.16.dist-info → sglang-0.1.18.dist-info}/METADATA +40 -27
- sglang-0.1.18.dist-info/RECORD +78 -0
- {sglang-0.1.16.dist-info → sglang-0.1.18.dist-info}/WHEEL +1 -1
- sglang/srt/backend_config.py +0 -13
- sglang/srt/models/dbrx_config.py +0 -281
- sglang/srt/weight_utils.py +0 -417
- sglang-0.1.16.dist-info/RECORD +0 -72
- {sglang-0.1.16.dist-info → sglang-0.1.18.dist-info}/LICENSE +0 -0
- {sglang-0.1.16.dist-info → sglang-0.1.18.dist-info}/top_level.txt +0 -0
{sglang-0.1.16.dist-info → sglang-0.1.18.dist-info}/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.1.16
+Version: 0.1.18
 Summary: A structured generation langauge for LLMs.
 License: Apache License
                                  Version 2.0, January 2004
@@ -213,34 +213,36 @@ Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: requests
 Requires-Dist: tqdm
+Requires-Dist: numpy
 Provides-Extra: all
 Requires-Dist: sglang[srt] ; extra == 'all'
 Requires-Dist: sglang[openai] ; extra == 'all'
 Requires-Dist: sglang[anthropic] ; extra == 'all'
+Requires-Dist: sglang[litellm] ; extra == 'all'
 Provides-Extra: anthropic
 Requires-Dist: anthropic >=0.20.0 ; extra == 'anthropic'
-
+Provides-Extra: litellm
+Requires-Dist: litellm >=1.0.0 ; extra == 'litellm'
 Provides-Extra: openai
 Requires-Dist: openai >=1.0 ; extra == 'openai'
-Requires-Dist: numpy ; extra == 'openai'
 Requires-Dist: tiktoken ; extra == 'openai'
 Provides-Extra: srt
 Requires-Dist: aiohttp ; extra == 'srt'
 Requires-Dist: fastapi ; extra == 'srt'
+Requires-Dist: hf-transfer ; extra == 'srt'
+Requires-Dist: huggingface-hub ; extra == 'srt'
+Requires-Dist: interegular ; extra == 'srt'
+Requires-Dist: packaging ; extra == 'srt'
+Requires-Dist: pillow ; extra == 'srt'
 Requires-Dist: psutil ; extra == 'srt'
+Requires-Dist: pydantic ; extra == 'srt'
 Requires-Dist: rpyc ; extra == 'srt'
 Requires-Dist: torch ; extra == 'srt'
-Requires-Dist: uvloop ; extra == 'srt'
 Requires-Dist: uvicorn ; extra == 'srt'
+Requires-Dist: uvloop ; extra == 'srt'
 Requires-Dist: zmq ; extra == 'srt'
-Requires-Dist: vllm
-Requires-Dist:
-Requires-Dist: pydantic ; extra == 'srt'
-Requires-Dist: pillow ; extra == 'srt'
-Requires-Dist: packaging ; extra == 'srt'
-Requires-Dist: huggingface-hub ; extra == 'srt'
-Requires-Dist: hf-transfer ; extra == 'srt'
-Requires-Dist: outlines >=0.0.34 ; extra == 'srt'
+Requires-Dist: vllm ==0.5.0 ; extra == 'srt'
+Requires-Dist: outlines >=0.0.44 ; extra == 'srt'
 
 <div align="center">
 <img src="assets/logo.png" alt="logo" width="400"></img>
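The new `litellm` extra pairs with the added `sglang/backend/litellm.py` module. A minimal sketch of targeting it from the frontend, assuming the backend class is exported as `sgl.LiteLLM` alongside the existing `sgl.OpenAI` (the export name and constructor argument are inferred from the module name, not verified against this release):

```python
import sglang as sgl

@sgl.function
def greet(s, name):
    s += "Q: Say hello to " + name + "\n"
    s += "A: " + sgl.gen("reply", max_tokens=32)

# Route generation through LiteLLM, which proxies many hosted model APIs.
sgl.set_default_backend(sgl.LiteLLM("gpt-3.5-turbo"))
state = greet.run(name="sglang")
print(state["reply"])
```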
@@ -253,9 +255,9 @@ Requires-Dist: outlines >=0.0.34 ; extra == 'srt'
 SGLang is a structured generation language designed for large language models (LLMs).
 It makes your interaction with LLMs faster and more controllable by co-designing the frontend language and the runtime system.
 
-The core features
-- **
-- **
+The core features include:
+- **Flexible Frontend Language**: Enables easy programming of LLM applications with chained generation calls, advanced prompting, control flow, multiple modalities, parallelism, and external interactions.
+- **High-Performance Backend Runtime**: Features RadixAttention for accelerating complex LLM programs by reusing the KV cache across multiple calls. It can also serve as a standalone engine with all common techniques implemented (e.g., continuous batching and tensor parallelism).
 
 ## News
 - [2024/02] 🔥 SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).
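The RadixAttention claim in the rewritten feature list is easiest to see with the frontend's `fork` primitive: forked branches share a common prompt prefix, and the runtime reuses that prefix's KV cache rather than recomputing it per branch. A minimal sketch (the two-branch program is illustrative, not taken from the package):

```python
import sglang as sgl

@sgl.function
def two_views(s, question):
    # The shared prefix below is decoded once; RadixAttention lets both
    # forked branches reuse its KV cache instead of recomputing the prefix.
    s += "Question: " + question + "\n"
    forks = s.fork(2)
    forks[0] += "Short answer: " + sgl.gen("short", max_tokens=16)
    forks[1] += "Detailed answer: " + sgl.gen("long", max_tokens=128)
    s += "Short: " + forks[0]["short"] + "\nLong: " + forks[1]["long"]
```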
@@ -276,23 +278,27 @@ The core features of SGLang include:
 ### Method 1: With pip
 ```
 pip install "sglang[all]"
+
+# Install FlashInfer CUDA kernels
+pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
 ```
 
 ### Method 2: From source
 ```
-git clone
+git clone https://github.com/sgl-project/sglang.git
 cd sglang
 
 pip install --upgrade pip
 pip install -e "python[all]"
+
+# Install FlashInfer CUDA kernels
+pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
 ```
 
 ### Notes
-- If you
-
-
-- If you only need to use the OpenAI backend, you can avoid installing other dependencies by using `pip install "sglang[openai]"`
-
+- If you see errors from the Triton compiler, please install the [Triton Nightly](https://triton-lang.org/main/getting-started/installation.html).
+- If you cannot install FlashInfer, check out its [installation](https://docs.flashinfer.ai/installation.html#) page. If you still cannot install it, you can use the slower Triton kernels by adding `--disable-flashinfer` when launching the server.
+- If you only need to use the OpenAI backend, you can avoid installing other dependencies by using `pip install "sglang[openai]"`.
 
 ## Quick Start
 The example below shows how to use sglang to answer a mulit-turn question.
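The diff truncates the quick-start snippet that this line introduces. For reference, a sketch of such a multi-turn program with the 0.1.x frontend API, modeled on the project's documented quick start (treat the model name and question strings as illustrative):

```python
import sglang as sgl

@sgl.function
def multi_turn_question(s, question_1, question_2):
    s += sgl.system("You are a helpful assistant.")
    s += sgl.user(question_1)
    s += sgl.assistant(sgl.gen("answer_1", max_tokens=256))
    s += sgl.user(question_2)
    s += sgl.assistant(sgl.gen("answer_2", max_tokens=256))

sgl.set_default_backend(sgl.OpenAI("gpt-3.5-turbo"))
state = multi_turn_question.run(
    question_1="What is the capital of the United States?",
    question_2="List two local attractions there.",
)
# Each turn's generation is stored under the name given to sgl.gen.
print(state["answer_1"])
print(state["answer_2"])
```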
@@ -603,11 +609,15 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
 ```
 python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --tp 2
 ```
+- Add `--dp 2` to enable data parallelism. It can also be used together with tp. Data parallelism is better for throughput if there is enough memory.
+```
+python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --dp 2 --tp 2
+```
 - If you see out-of-memory errors during serving, please try to reduce the memory usage of the KV cache pool by setting a smaller value of `--mem-fraction-static`. The default value is `0.9`
 ```
 python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --mem-fraction-static 0.7
 ```
--
+- See [hyperparameter_tuning.md](docs/hyperparameter_tuning.md) on tuning hyperparameters for better performance.
 
 ### Supported Models
 - Llama
@@ -621,6 +631,8 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
 - `python3 -m sglang.launch_server --model-path liuhaotian/llava-v1.5-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --chat-template vicuna_v1.1 --port 30000`
 - `python3 -m sglang.launch_server --model-path liuhaotian/llava-v1.6-vicuna-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --chat-template vicuna_v1.1 --port 30000`
 - `python3 -m sglang.launch_server --model-path liuhaotian/llava-v1.6-34b --tokenizer-path liuhaotian/llava-v1.6-34b-tokenizer --port 3000`
+- LLaVA-NeXT-Video
+  - see [srt_example_llava_v.sh](examples/usage/llava_video/srt_example_llava_v.sh)
 - Yi-VL
   - see [srt_example_yi_vl.py](examples/quick_start/srt_example_yi_vl.py).
 - StableLM
@@ -637,17 +649,18 @@ Instructions for supporting a new model are [here](https://github.com/sgl-projec
 - Mixtral-8x7B on NVIDIA A10G, FP16, Tensor Parallelism=8
 
 
-Learn more [
+- Learn more about the above [results](docs/benchmark_results.md).
+- Synthetic latency and throughput benchmark [scripts](https://github.com/sgl-project/sglang/tree/main/benchmark/latency_throughput).
 
 ## Roadmap
 https://github.com/sgl-project/sglang/issues/157
 
 ## Citation And Acknowledgment
 ```
-@misc{
-title={
-author={Lianmin Zheng and Liangsheng Yin and Zhiqiang Xie and
-year={
+@misc{zheng2024sglang,
+      title={SGLang: Efficient Execution of Structured Language Model Programs},
+      author={Lianmin Zheng and Liangsheng Yin and Zhiqiang Xie and Chuyue Sun and Jeff Huang and Cody Hao Yu and Shiyi Cao and Christos Kozyrakis and Ion Stoica and Joseph E. Gonzalez and Clark Barrett and Ying Sheng},
+      year={2024},
       eprint={2312.07104},
       archivePrefix={arXiv},
       primaryClass={cs.AI}
sglang-0.1.18.dist-info/RECORD

@@ -0,0 +1,78 @@
+sglang/__init__.py,sha256=PhkN9MopSdHLXHG9_7l5JB-awRDI9CdR6Qht1vWA9C8,1116
+sglang/api.py,sha256=92oqUgVeKq9B9If2A8LHzEhPicZK5Rq3rKUShwPAq0E,4579
+sglang/bench_latency.py,sha256=MNxmVCwBM7ZWFYSFy2m-y8MmEWNWvZO2gUBbuMyWSBI,10018
+sglang/global_config.py,sha256=xMX7JqPgDRwtvcbULkwHJ-bfysNefEN42V3BGss9mlo,1425
+sglang/launch_server.py,sha256=X8TX6M-tv9JWHJkWnJskYNc0IZBooecI_yzpBHVf5KU,364
+sglang/launch_server_llavavid.py,sha256=cxGJICBTYVgHVNy7NWwitY7VXt11kEnh7npkcB-iRf8,1115
+sglang/utils.py,sha256=arJuwOAEX445M2NL9SAOi6jBNu0-cfU04PLAr-hIH3U,8168
+sglang/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+sglang/backend/anthropic.py,sha256=iJjXiDMZbtvX2XNG78MG9kM7SpZq9hmXVuzT_T18elw,2076
+sglang/backend/base_backend.py,sha256=APiMht4WYECLCOGRPCEUF6lX-an1vjVe2dWoMSgymWY,1831
+sglang/backend/litellm.py,sha256=ZqsEZXgxLge-Fh3SMr1XkVPU7z3FKntpRppNwd1a12s,2447
+sglang/backend/openai.py,sha256=Id4vDzfefG9R7AqJBMXqYmKHv2FMu0PBSYEGbK7Q510,14803
+sglang/backend/runtime_endpoint.py,sha256=8NyWgMvhzUcA5VEsPLo1AacZ_UPVSnpxpzt6vYdVQSU,8871
+sglang/backend/vertexai.py,sha256=XNkbUzOdLIz-1qP_BBieYIfUXZf6gsfdghlaulNpBM8,4714
+sglang/lang/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+sglang/lang/chat_template.py,sha256=ogIT8iMlDcSEgcNBTh5pRLoCkdQI_ec5Hc27wFUFDIg,11532
+sglang/lang/compiler.py,sha256=UiXUmPR9wBAPtnORrLcyQX8Uh0ZL0nKeV8ZgBozAJPw,7531
+sglang/lang/interpreter.py,sha256=MMdvli-75ySiKiULlsnoVmb8oEu5bvSkYz8GRdtZoVk,29494
+sglang/lang/ir.py,sha256=KZxXVva2r1UihYOVWRKcU_zILMx05oWV2yLy3SeZfnA,14603
+sglang/lang/tracer.py,sha256=QcslAObEjepk8XmiqCobwzWaDpihofEQXjeRs_3B8NQ,8282
+sglang/srt/conversation.py,sha256=kuMrdYtcpy2F7qACMEYdD1CniP6HHNRSvhqVZe8jj_w,15522
+sglang/srt/flush_cache.py,sha256=SJsbZnmDhH-gb9ch3hIwnI_nuwaOLlKvlXADyLBGENk,403
+sglang/srt/hf_transformers_utils.py,sha256=P6eXfGwH-OeU6hDrlGYL5GACcTNPdYOimpKZ0ZBZUao,10683
+sglang/srt/memory_pool.py,sha256=5bqI8d5_JURbKwIhv1BwlcIO2IDHewHvIqezPG-b_5M,3284
+sglang/srt/mm_utils.py,sha256=OptgAHDX-73Bk4jAdr2BOAJtiEXJNzPrMhaM-dy275c,8889
+sglang/srt/model_config.py,sha256=eGt8hTtipSTqp-AsB-Cl4wfZDb14CTcOtIz-iXgaVk8,4997
+sglang/srt/openai_api_adapter.py,sha256=pqGP0bON-wEZOnZyo85gzrO9MSzeIkHh5xqhpN5RkyY,15120
+sglang/srt/openai_protocol.py,sha256=CNJOMr3PJvoRGI2TIh9t8f_4wYTtT0EF8kzsrYsASYY,5350
+sglang/srt/sampling_params.py,sha256=dQbVr7JmTJ9JEn_sy3clB56yT9kyr9ldWFZ-GaNXOy0,3023
+sglang/srt/server.py,sha256=742w8gn0GgE7w3EwgPhq7MYabaVxcdPpMAovEE6-DaU,13112
+sglang/srt/server_args.py,sha256=j0-Aj8sHQ-zgumd4w0IaezRqDdjDC6MMMG5M8zzITVw,12166
+sglang/srt/utils.py,sha256=V2C4fb93oKS4D3lezlRgHkD7MQDNBZlIy_4ZTNzAC9E,19423
+sglang/srt/constrained/__init__.py,sha256=Q-XnKFChC9q6WDCnJKAKAuXzKHHg4QoFlYODge8ZKCs,1504
+sglang/srt/constrained/base_cache.py,sha256=QQjmFEiT8jlOskJoZobhrDl2TKB-B4b1LPQo9JQCP_w,1405
+sglang/srt/constrained/fsm_cache.py,sha256=P4qNDHHxpKpTnYL_8V1R6OFXlUwbM6ZcBdzddpcBgb4,1135
+sglang/srt/constrained/jump_forward.py,sha256=s60jZ7Ue8zaodgQm7gDpN6pSedpvpUck_waJALUMj60,5615
+sglang/srt/layers/context_flashattention_nopad.py,sha256=bENdVltDozccR5mLY_CcYDjqLob28tHA9f2s03D8UFQ,5210
+sglang/srt/layers/extend_attention.py,sha256=JUYuYSAhfbgOXrwIK5YHJCXPq54a6IZ7vQrze-3VvMQ,12955
+sglang/srt/layers/fused_moe.py,sha256=M_cTHMNSoD-wdh6XjzHseuq3zsdqOmECWxNeEVJklu4,22257
+sglang/srt/layers/logits_processor.py,sha256=t-bZIcGj70KKf2Jcor9K7Va1NsBlDVNrQ4Ktlq0lUlU,8506
+sglang/srt/layers/radix_attention.py,sha256=XsHFf7myNKZwyt3qB5LEXAttTKMY9OP3M3t5CZnyu3g,6911
+sglang/srt/layers/token_attention.py,sha256=rVbPlFpmLoU3nx3qtK2YZdynDxfvMKtQNTPeKi0KNP0,8823
+sglang/srt/managers/detokenizer_manager.py,sha256=tOjURt-XQofPblnGECoJfoRSMPoWFVAH99R05hXeYNw,3353
+sglang/srt/managers/io_struct.py,sha256=O1cz6hDV6BjXbZ0-tk6VaDNjYFuMBUOGswbG3H_GliY,4532
+sglang/srt/managers/tokenizer_manager.py,sha256=TswupFKrlXAvUM5-1eT2cR6uNJoQVivp2MQkEFu4axQ,14848
+sglang/srt/managers/controller/dp_worker.py,sha256=ES3-jyxGfHzpgVoXub_3qjVygwfWYWpfN4vuVWU23Gs,3675
+sglang/srt/managers/controller/infer_batch.py,sha256=-Q17Pk_Mmccobxly7UM8wCC6dYKJ4zmjplMboN1q8b0,25700
+sglang/srt/managers/controller/manager_multi.py,sha256=Z0a-iZzqk8T2Xl7ak2DgE9j00GA6Eb0XoNVx7UlxKa4,6630
+sglang/srt/managers/controller/manager_single.py,sha256=OIm_BjbDaEcYmpb_E_7wv0xfOlb2le0zXjPMqf1pU9U,3468
+sglang/srt/managers/controller/model_runner.py,sha256=HjOHp_Rtdm7OnMmhtnSwPWPmEYHDpnt5LjeKbiYb6mo,21718
+sglang/srt/managers/controller/radix_cache.py,sha256=fMqIm1fTvufI9I_QMoFLfQMkSUWp8VN4wh3-63KJUL0,8193
+sglang/srt/managers/controller/schedule_heuristic.py,sha256=_ne7W2mrpuO794uh5tYLR3q6XBbgTMdNmE6VpzY1sJE,2312
+sglang/srt/managers/controller/tp_worker.py,sha256=VNVQ0oqPGllC00cZCxHB-0LqudxgS74jf-it2zDHzTA,31411
+sglang/srt/models/chatglm.py,sha256=BU0rdp-GCUZcmctBYFFo6i5s5XOUJCQbr-v4EQjwJKo,13275
+sglang/srt/models/commandr.py,sha256=hHsNQWi0X8rNL7_gpcoUxQxdhxtvx5_RVx8u6cLzqYQ,13606
+sglang/srt/models/dbrx.py,sha256=lv0nXFGJnmv6toUBRv7q7M1ZTrI3VACrvLBKHA6xdjE,14074
+sglang/srt/models/gemma.py,sha256=DweoalfWYhLL-ZWLAO5gl4SCZflWmejVeDG3Vky_WNo,11719
+sglang/srt/models/grok.py,sha256=oy-QoCvUKKQO2sR6a_qwHm10Fc0t-ka4I-1uEGGW3j8,27274
+sglang/srt/models/llama2.py,sha256=7aPPSLABRIy7_iy4YvFHV7Beqc2I1-Vc1obSbsgzNzY,12190
+sglang/srt/models/llama_classification.py,sha256=LrPRFB-Yd2haZADNY3uIusbajQwcZNQrOCTd92L2vS0,4304
+sglang/srt/models/llava.py,sha256=M0zQwOvnqYkTQgH2aJqsjLLIXQNkadO61UCPpx8A1zQ,17903
+sglang/srt/models/llavavid.py,sha256=7NQ5IzC8G1yrsNbFYS_8CAUpuh0LxM9vEPKD2IZT99g,13029
+sglang/srt/models/mistral.py,sha256=XSn7fiZqspyWVTYrpVAacAnWdwAybBtyn9-Sh9AvMTM,254
+sglang/srt/models/mixtral.py,sha256=lpasWpwvWPHqSQ1Vskr2kL3e_oBxRxlYK6bk6sf61AQ,20810
+sglang/srt/models/mixtral_quant.py,sha256=SMqOnuToJ8pz_7wb10pn7Uib15cXBcqSrtGsh5sVhw8,13635
+sglang/srt/models/qwen.py,sha256=fTRtEXdYPWIOtmwKb4kVFrq65w7AYxjsYqV8ar5mmac,9419
+sglang/srt/models/qwen2.py,sha256=F3k21F_CCqFJMIkzLC-1mIFQOgtEHbuZfIaautNC8-s,11465
+sglang/srt/models/stablelm.py,sha256=LbO8rruVkvvLng6pVHG4wjbewrGfMLm9vKxK41V2W_s,10781
+sglang/srt/models/yivl.py,sha256=55KPrQ-dVplI0hh2WCSugjc1luE0J2UAafjZxu_7Xuc,4367
+sglang/test/test_conversation.py,sha256=1zIrXcXiwEliPHgDAsqsQUA7JKzZ5fnQEU-U6L887FU,1592
+sglang/test/test_openai_protocol.py,sha256=eePzoskYR3PqfWczSVZvg8ja63qbT8TFUNEMyzDZpa8,1657
+sglang/test/test_programs.py,sha256=g80P0QWO8Jv_87onTCsvJ-2MgSh7I6_lzcfdm43JlNY,13616
+sglang/test/test_utils.py,sha256=Mjn2btfmEQQ7rpsLfNo6VugXCPzUmRpNhssWvxevN4s,11038
+sglang-0.1.18.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+sglang-0.1.18.dist-info/METADATA,sha256=tDdBZo2qvH8wWC4faXxfryjh7-6frEsBnH0vJ_ia1w4,29752
+sglang-0.1.18.dist-info/WHEEL,sha256=y4mX-SOX4fYIkonsAGA5N0Oy-8_gI4FXw5HNI1xqvWg,91
+sglang-0.1.18.dist-info/top_level.txt,sha256=yxhh3pYQkcnA7v3Bg889C2jZhvtJdEincysO7PEB09M,7
+sglang-0.1.18.dist-info/RECORD,,
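Each RECORD row has the form `path,sha256=<digest>,size`, where the digest is the unpadded URL-safe base64 encoding of the file's SHA-256 hash, per the wheel spec. A small sketch for verifying one installed file against its RECORD entry above:

```python
import base64
import hashlib

def record_digest(path: str) -> str:
    """Compute a RECORD-style digest: unpadded URL-safe base64 of SHA-256."""
    with open(path, "rb") as f:
        digest = hashlib.sha256(f.read()).digest()
    return base64.urlsafe_b64encode(digest).rstrip(b"=").decode()

# Example: check sglang/launch_server.py against its RECORD row.
assert record_digest("sglang/launch_server.py") == "X8TX6M-tv9JWHJkWnJskYNc0IZBooecI_yzpBHVf5KU"
```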
sglang/srt/backend_config.py DELETED

sglang/srt/models/dbrx_config.py DELETED
@@ -1,281 +0,0 @@
-# Adapted from:
-# https://github.com/vllm-project/vllm/blob/14ccd94c89d0ffd9da283545d93ab1dfea5da340/vllm/transformers_utils/configs/dbrx.py
-# yapf: disable
-# ruff: noqa: E501
-# coding=utf-8
-# Copied from
-# https://huggingface.co/databricks/dbrx-base/blob/main/configuration_dbrx.py
-"""Dbrx configuration."""
-
-# FIXME: remove this once vllm releases a new version
-
-from typing import Any, Optional
-
-from transformers.configuration_utils import PretrainedConfig
-from transformers.utils import logging
-
-logger = logging.get_logger(__name__)
-
-DBRX_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
-
-
-class DbrxAttentionConfig(PretrainedConfig):
-    """Configuration class for Dbrx Attention.
-
-    [`DbrxAttention`] class. It is used to instantiate attention layers
-    according to the specified arguments, defining the layers architecture.
-
-    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
-    documentation from [`PretrainedConfig`] for more information.
-
-    Args:
-        attn_pdrop (`float`, *optional*, defaults to 0.0):
-            The dropout probability for the attention layers.
-        clip_qkv (`float`, *optional*, defaults to None):
-            If not `None`, clip the queries, keys, and values in the attention layer to this value.
-        kv_n_heads (Optional[int]): For grouped_query_attention only, allow user to specify number of kv heads.
-        rope_theta (float): The base frequency for rope.
-    """
-
-    def __init__(
-        self,
-        attn_pdrop: float = 0,
-        clip_qkv: Optional[float] = None,
-        kv_n_heads: int = 1,
-        rope_theta: float = 10000.0,
-        **kwargs: Any,
-    ):
-        super().__init__(**kwargs)
-        self.attn_pdrop = attn_pdrop
-        self.clip_qkv = clip_qkv
-        self.kv_n_heads = kv_n_heads
-        self.rope_theta = rope_theta
-
-        for k in ["model_type"]:
-            if k in kwargs:
-                kwargs.pop(k)
-        if len(kwargs) != 0:
-            raise ValueError(f"Found unknown {kwargs=}")
-
-    @classmethod
-    def from_pretrained(
-        cls, pretrained_model_name_or_path: str, **kwargs: Any
-    ) -> "PretrainedConfig":
-        cls._set_token_in_kwargs(kwargs)
-
-        config_dict, kwargs = cls.get_config_dict(
-            pretrained_model_name_or_path, **kwargs
-        )
-
-        if config_dict.get("model_type") == "dbrx":
-            config_dict = config_dict["attn_config"]
-
-        if (
-            "model_type" in config_dict
-            and hasattr(cls, "model_type")
-            and config_dict["model_type"] != cls.model_type
-        ):
-            logger.warning(
-                f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
-                + f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
-            )
-
-        return cls.from_dict(config_dict, **kwargs)
-
-
-class DbrxFFNConfig(PretrainedConfig):
-    """Configuration class for Dbrx FFN.
-
-    [`DbrxFFN`] class. It is used to instantiate feedforward layers according to
-    the specified arguments, defining the layers architecture.
-
-    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
-    documentation from [`PretrainedConfig`] for more information.
-
-    Args:
-        ffn_act_fn (dict, optional): A dict specifying activation function for the FFN.
-            The dict should have a key 'name' with the value being the name of
-            the activation function along with any additional keyword arguments.
-        ffn_hidden_size (int, optional): The hidden size of the feedforward network.
-        moe_num_experts (int, optional): The number of experts in the mixture of experts layer.
-        moe_top_k (int, optional): The number of experts to use in the mixture of experts layer.
-        moe_jitter_eps (float, optional): The jitter epsilon for the mixture of experts layer.
-        moe_loss_weight (float, optional): The loss weight for the mixture of experts layer.
-        moe_normalize_expert_weights (float, optional): The normalization factor for the expert weights.
-        uniform_expert_assignment (bool, optional): Whether to use uniform expert assignment.
-            This should only be used for benchmarking purposes.
-    """
-
-    def __init__(
-        self,
-        ffn_act_fn: Optional[dict] = None,
-        ffn_hidden_size: int = 3584,
-        moe_num_experts: int = 4,
-        moe_top_k: int = 1,
-        moe_jitter_eps: Optional[float] = None,
-        moe_loss_weight: float = 0.01,
-        moe_normalize_expert_weights: Optional[float] = 1,
-        uniform_expert_assignment: bool = False,
-        **kwargs: Any,
-    ):
-        super().__init__()
-        if ffn_act_fn is None:
-            ffn_act_fn = {"name": "silu"}
-        self.ffn_act_fn = ffn_act_fn
-        self.ffn_hidden_size = ffn_hidden_size
-        self.moe_num_experts = moe_num_experts
-        self.moe_top_k = moe_top_k
-        self.moe_jitter_eps = moe_jitter_eps
-        self.moe_loss_weight = moe_loss_weight
-        self.moe_normalize_expert_weights = moe_normalize_expert_weights
-        self.uniform_expert_assignment = uniform_expert_assignment
-
-        for k in ["model_type"]:
-            if k in kwargs:
-                kwargs.pop(k)
-        if len(kwargs) != 0:
-            raise ValueError(f"Found unknown {kwargs=}")
-
-    @classmethod
-    def from_pretrained(
-        cls, pretrained_model_name_or_path: str, **kwargs: Any
-    ) -> "PretrainedConfig":
-        cls._set_token_in_kwargs(kwargs)
-
-        config_dict, kwargs = cls.get_config_dict(
-            pretrained_model_name_or_path, **kwargs
-        )
-
-        if config_dict.get("model_type") == "dbrx":
-            config_dict = config_dict["ffn_config"]
-
-        if (
-            "model_type" in config_dict
-            and hasattr(cls, "model_type")
-            and config_dict["model_type"] != cls.model_type
-        ):
-            logger.warning(
-                f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
-                + f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
-            )
-
-        return cls.from_dict(config_dict, **kwargs)
-
-
-class DbrxConfig(PretrainedConfig):
-    """Configuration class for Dbrx.
-
-    [`DbrxModel`]. It is used to instantiate a Dbrx model according to the
-    specified arguments, defining the model architecture.
-
-    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
-    documentation from [`PretrainedConfig`] for more information.
-
-
-    Args:
-        d_model (`int`, *optional*, defaults to 6144):
-            Dimensionality of the embeddings and hidden states.
-        n_heads (`int`, *optional*, defaults to 48):
-            Number of attention heads for each attention layer in the Transformer encoder.
-        n_layers (`int`, *optional*, defaults to 40):
-            Number of hidden layers in the Transformer encoder.
-        max_seq_len (`int`, *optional*, defaults to 32768):
-            The maximum sequence length of the model.
-        vocab_size (`int`, *optional*, defaults to 100352):
-            Vocabulary size of the Dbrx model. Defines the maximum number of different tokens that can be represented by
-            the `inputs_ids` passed when calling [`DbrxModel`].
-        resid_pdrop (`float`, *optional*, defaults to 0.0):
-            The dropout probability applied to the attention output before combining with residual.
-        emb_pdrop (`float`, *optional*, defaults to 0.0):
-            The dropout probability for the embedding layer.
-        attn_config (`dict`, *optional*):
-            A dictionary used to configure the model's attention module.
-        ffn_config (`dict`, *optional*):
-            A dictionary used to configure the model's FFN module.
-        use_cache (`bool`, *optional*, defaults to `False`):
-            Whether or not the model should return the last key/values attentions (not used by all models).
-        initializer_range (`float`, *optional*, defaults to 0.02):
-            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        output_router_logits (`bool`, *optional*, defaults to `False`):
-            Whether or not the router logits should be returned by the model. Enabling this will also
-            allow the model to output the auxiliary loss. See [here]() for more details
-        router_aux_loss_coef (`float`, *optional*, defaults to 0.001):
-            The aux loss factor for the total loss.
-
-
-    Example:
-    ```python
-    >>> from transformers import DbrxConfig, DbrxModel
-
-    >>> # Initializing a Dbrx configuration
-    >>> configuration = DbrxConfig()
-
-    >>> # Initializing a model (with random weights) from the configuration
-    >>> model = DbrxModel(configuration)
-
-    >>> # Accessing the model configuration
-    >>> configuration = model.config
-    ```
-    """
-
-    model_type = "dbrx"
-    attribute_map = {
-        "num_attention_heads": "n_heads",
-        "hidden_size": "d_model",
-        "num_hidden_layers": "n_layers",
-        "max_position_embeddings": "max_seq_len",
-    }
-
-    def __init__(
-        self,
-        d_model: int = 2048,
-        n_heads: int = 16,
-        n_layers: int = 24,
-        max_seq_len: int = 2048,
-        vocab_size: int = 32000,
-        resid_pdrop: float = 0.0,
-        emb_pdrop: float = 0.0,
-        attn_config: Optional[DbrxAttentionConfig] = None,
-        ffn_config: Optional[DbrxFFNConfig] = None,
-        use_cache: bool = True,
-        initializer_range: float = 0.02,
-        output_router_logits: bool = False,
-        router_aux_loss_coef: float = 0.05,
-        **kwargs: Any,
-    ):
-        if attn_config is None:
-            self.attn_config = DbrxAttentionConfig()
-        elif isinstance(attn_config, dict):
-            self.attn_config = DbrxAttentionConfig(**attn_config)
-        else:
-            self.attn_config = attn_config
-
-        if ffn_config is None:
-            self.ffn_config = DbrxFFNConfig()
-        elif isinstance(ffn_config, dict):
-            self.ffn_config = DbrxFFNConfig(**ffn_config)
-        else:
-            self.ffn_config = ffn_config
-
-        self.d_model = d_model
-        self.n_heads = n_heads
-        self.n_layers = n_layers
-        self.max_seq_len = max_seq_len
-        self.vocab_size = vocab_size
-        self.resid_pdrop = resid_pdrop
-        self.emb_pdrop = emb_pdrop
-        self.use_cache = use_cache
-        self.initializer_range = initializer_range
-        self.output_router_logits = output_router_logits
-        self.router_aux_loss_coef = router_aux_loss_coef
-
-        tie_word_embeddings = kwargs.pop("tie_word_embeddings", False)
-        if tie_word_embeddings:
-            raise ValueError(
-                "tie_word_embeddings is not supported for Dbrx models."
-            )
-
-        super().__init__(
-            tie_word_embeddings=tie_word_embeddings,
-            **kwargs,
-        )
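The deleted file's own `# FIXME: remove this once vllm releases a new version` comment suggests why it is gone: with this release pinning `vllm ==0.5.0`, the DBRX configuration ships with the upstream libraries, so the vendored copy is no longer needed. A sketch of the equivalent usage through `transformers`, assuming a version with native DBRX support (the attribute names mirror the deleted class above; the gating-via-vllm rationale is an inference, not stated in the diff):

```python
from transformers import AutoConfig

# AutoConfig resolves the same nested attn_config/ffn_config structure
# that the vendored dbrx_config.py used to provide.
config = AutoConfig.from_pretrained("databricks/dbrx-base")
print(config.d_model, config.n_heads, config.n_layers)
print(config.attn_config.kv_n_heads, config.ffn_config.moe_num_experts)
```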