sglang 0.1.16__py3-none-any.whl → 0.1.18__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68)
  1. sglang/__init__.py +3 -1
  2. sglang/api.py +7 -7
  3. sglang/backend/anthropic.py +1 -1
  4. sglang/backend/litellm.py +90 -0
  5. sglang/backend/openai.py +158 -11
  6. sglang/backend/runtime_endpoint.py +18 -10
  7. sglang/bench_latency.py +299 -0
  8. sglang/global_config.py +12 -2
  9. sglang/lang/compiler.py +2 -2
  10. sglang/lang/interpreter.py +114 -67
  11. sglang/lang/ir.py +28 -3
  12. sglang/launch_server.py +4 -1
  13. sglang/launch_server_llavavid.py +2 -1
  14. sglang/srt/constrained/__init__.py +13 -6
  15. sglang/srt/constrained/fsm_cache.py +8 -2
  16. sglang/srt/constrained/jump_forward.py +113 -25
  17. sglang/srt/conversation.py +2 -0
  18. sglang/srt/flush_cache.py +3 -1
  19. sglang/srt/hf_transformers_utils.py +130 -1
  20. sglang/srt/layers/extend_attention.py +17 -0
  21. sglang/srt/layers/fused_moe.py +582 -0
  22. sglang/srt/layers/logits_processor.py +65 -32
  23. sglang/srt/layers/radix_attention.py +41 -7
  24. sglang/srt/layers/token_attention.py +16 -1
  25. sglang/srt/managers/controller/dp_worker.py +113 -0
  26. sglang/srt/managers/{router → controller}/infer_batch.py +242 -100
  27. sglang/srt/managers/controller/manager_multi.py +191 -0
  28. sglang/srt/managers/{router/manager.py → controller/manager_single.py} +34 -14
  29. sglang/srt/managers/{router → controller}/model_runner.py +262 -158
  30. sglang/srt/managers/{router → controller}/radix_cache.py +11 -1
  31. sglang/srt/managers/{router/scheduler.py → controller/schedule_heuristic.py} +9 -7
  32. sglang/srt/managers/{router/model_rpc.py → controller/tp_worker.py} +298 -267
  33. sglang/srt/managers/detokenizer_manager.py +42 -46
  34. sglang/srt/managers/io_struct.py +22 -12
  35. sglang/srt/managers/tokenizer_manager.py +151 -87
  36. sglang/srt/model_config.py +83 -5
  37. sglang/srt/models/chatglm.py +399 -0
  38. sglang/srt/models/commandr.py +10 -13
  39. sglang/srt/models/dbrx.py +9 -15
  40. sglang/srt/models/gemma.py +12 -15
  41. sglang/srt/models/grok.py +738 -0
  42. sglang/srt/models/llama2.py +26 -15
  43. sglang/srt/models/llama_classification.py +104 -0
  44. sglang/srt/models/llava.py +86 -19
  45. sglang/srt/models/llavavid.py +11 -20
  46. sglang/srt/models/mixtral.py +282 -103
  47. sglang/srt/models/mixtral_quant.py +372 -0
  48. sglang/srt/models/qwen.py +9 -13
  49. sglang/srt/models/qwen2.py +11 -13
  50. sglang/srt/models/stablelm.py +9 -15
  51. sglang/srt/models/yivl.py +17 -22
  52. sglang/srt/openai_api_adapter.py +150 -95
  53. sglang/srt/openai_protocol.py +11 -2
  54. sglang/srt/server.py +124 -48
  55. sglang/srt/server_args.py +128 -48
  56. sglang/srt/utils.py +234 -67
  57. sglang/test/test_programs.py +65 -3
  58. sglang/test/test_utils.py +32 -1
  59. sglang/utils.py +23 -4
  60. {sglang-0.1.16.dist-info → sglang-0.1.18.dist-info}/METADATA +40 -27
  61. sglang-0.1.18.dist-info/RECORD +78 -0
  62. {sglang-0.1.16.dist-info → sglang-0.1.18.dist-info}/WHEEL +1 -1
  63. sglang/srt/backend_config.py +0 -13
  64. sglang/srt/models/dbrx_config.py +0 -281
  65. sglang/srt/weight_utils.py +0 -417
  66. sglang-0.1.16.dist-info/RECORD +0 -72
  67. {sglang-0.1.16.dist-info → sglang-0.1.18.dist-info}/LICENSE +0 -0
  68. {sglang-0.1.16.dist-info → sglang-0.1.18.dist-info}/top_level.txt +0 -0
{sglang-0.1.16.dist-info → sglang-0.1.18.dist-info}/METADATA

@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: sglang
- Version: 0.1.16
+ Version: 0.1.18
  Summary: A structured generation language for LLMs.
  License: Apache License
  Version 2.0, January 2004
@@ -213,34 +213,36 @@ Description-Content-Type: text/markdown
  License-File: LICENSE
  Requires-Dist: requests
  Requires-Dist: tqdm
+ Requires-Dist: numpy
  Provides-Extra: all
  Requires-Dist: sglang[srt] ; extra == 'all'
  Requires-Dist: sglang[openai] ; extra == 'all'
  Requires-Dist: sglang[anthropic] ; extra == 'all'
+ Requires-Dist: sglang[litellm] ; extra == 'all'
  Provides-Extra: anthropic
  Requires-Dist: anthropic >=0.20.0 ; extra == 'anthropic'
- Requires-Dist: numpy ; extra == 'anthropic'
+ Provides-Extra: litellm
+ Requires-Dist: litellm >=1.0.0 ; extra == 'litellm'
  Provides-Extra: openai
  Requires-Dist: openai >=1.0 ; extra == 'openai'
- Requires-Dist: numpy ; extra == 'openai'
  Requires-Dist: tiktoken ; extra == 'openai'
  Provides-Extra: srt
  Requires-Dist: aiohttp ; extra == 'srt'
  Requires-Dist: fastapi ; extra == 'srt'
+ Requires-Dist: hf-transfer ; extra == 'srt'
+ Requires-Dist: huggingface-hub ; extra == 'srt'
+ Requires-Dist: interegular ; extra == 'srt'
+ Requires-Dist: packaging ; extra == 'srt'
+ Requires-Dist: pillow ; extra == 'srt'
  Requires-Dist: psutil ; extra == 'srt'
+ Requires-Dist: pydantic ; extra == 'srt'
  Requires-Dist: rpyc ; extra == 'srt'
  Requires-Dist: torch ; extra == 'srt'
- Requires-Dist: uvloop ; extra == 'srt'
  Requires-Dist: uvicorn ; extra == 'srt'
+ Requires-Dist: uvloop ; extra == 'srt'
  Requires-Dist: zmq ; extra == 'srt'
- Requires-Dist: vllm >=0.4.2 ; extra == 'srt'
- Requires-Dist: interegular ; extra == 'srt'
- Requires-Dist: pydantic ; extra == 'srt'
- Requires-Dist: pillow ; extra == 'srt'
- Requires-Dist: packaging ; extra == 'srt'
- Requires-Dist: huggingface-hub ; extra == 'srt'
- Requires-Dist: hf-transfer ; extra == 'srt'
- Requires-Dist: outlines >=0.0.34 ; extra == 'srt'
+ Requires-Dist: vllm ==0.5.0 ; extra == 'srt'
+ Requires-Dist: outlines >=0.0.44 ; extra == 'srt'

  <div align="center">
  <img src="assets/logo.png" alt="logo" width="400"></img>
@@ -253,9 +255,9 @@ Requires-Dist: outlines >=0.0.34 ; extra == 'srt'
  SGLang is a structured generation language designed for large language models (LLMs).
  It makes your interaction with LLMs faster and more controllable by co-designing the frontend language and the runtime system.

- The core features of SGLang include:
- - **A Flexible Front-End Language**: This allows for easy programming of LLM applications with multiple chained generation calls, advanced prompting techniques, control flow, multiple modalities, parallelism, and external interaction.
- - **A High-Performance Runtime with RadixAttention**: This feature significantly accelerates the execution of complex LLM programs by automatic KV cache reuse across multiple calls. It also supports other common techniques like continuous batching and tensor parallelism.
+ The core features include:
+ - **Flexible Frontend Language**: Enables easy programming of LLM applications with chained generation calls, advanced prompting, control flow, multiple modalities, parallelism, and external interactions.
+ - **High-Performance Backend Runtime**: Features RadixAttention for accelerating complex LLM programs by reusing the KV cache across multiple calls. It can also serve as a standalone engine with all common techniques implemented (e.g., continuous batching and tensor parallelism).

  ## News
  - [2024/02] 🔥 SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).
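For context on the frontend features named in this hunk (chained generation calls, parallelism, and KV cache reuse across calls), here is a minimal sketch of a parallel sglang program, adapted from the project's documented `fork` example. The two forked branches share the prompt prefix, which is exactly where RadixAttention's KV cache reuse applies:

```python
import sglang as sgl

@sgl.function
def tip_suggestion(s):
    s += (
        "Here are two tips for staying healthy: "
        "1. Balanced Diet. 2. Regular Exercise.\n\n"
    )
    # Fork the state into two parallel branches that share the prompt prefix.
    forks = s.fork(2)
    for i, f in enumerate(forks):
        f += f"Now, expand tip {i+1} into a paragraph:\n"
        f += sgl.gen("detailed_tip", max_tokens=256, stop="\n\n")
    # Join the branch outputs back into the main state.
    s += "Tip 1:" + forks[0]["detailed_tip"] + "\n"
    s += "Tip 2:" + forks[1]["detailed_tip"] + "\n"
    s += "In summary" + sgl.gen("summary")

# Requires a running backend; the endpoint URL here is illustrative.
sgl.set_default_backend(sgl.RuntimeEndpoint("http://localhost:30000"))
state = tip_suggestion.run()
```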
@@ -276,23 +278,27 @@ The core features of SGLang include:
  ### Method 1: With pip
  ```
  pip install "sglang[all]"
+
+ # Install FlashInfer CUDA kernels
+ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
  ```

  ### Method 2: From source
  ```
- git clone git@github.com:sgl-project/sglang.git
+ git clone https://github.com/sgl-project/sglang.git
  cd sglang

  pip install --upgrade pip
  pip install -e "python[all]"
+
+ # Install FlashInfer CUDA kernels
+ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
  ```

  ### Notes
- - If you are using older GPUs (NVIDIA V100, T4), please pick the correct triton compiler version to avoid some known bugs.
-   - For NVIDIA T4, please use `pip install "triton>=2.2.0"`.
-   - For NVIDIA V100, please install the [nightly](https://triton-lang.org/main/getting-started/installation.html) version.
- - If you only need to use the OpenAI backend, you can avoid installing other dependencies by using `pip install "sglang[openai]"`
-
+ - If you see errors from the Triton compiler, please install the [Triton Nightly](https://triton-lang.org/main/getting-started/installation.html).
+ - If you cannot install FlashInfer, check out its [installation](https://docs.flashinfer.ai/installation.html#) page. If you still cannot install it, you can use the slower Triton kernels by adding `--disable-flashinfer` when launching the server.
+ - If you only need to use the OpenAI backend, you can avoid installing other dependencies by using `pip install "sglang[openai]"`.

  ## Quick Start
  The example below shows how to use sglang to answer a multi-turn question.
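The multi-turn example that the last context line refers to is not part of this diff (those README lines are unchanged). For orientation, a sketch consistent with the project's quick start; the function name, question strings, and `max_tokens` values here are illustrative:

```python
import sglang as sgl

@sgl.function
def multi_turn_question(s, question_1, question_2):
    s += sgl.system("You are a helpful assistant.")
    s += sgl.user(question_1)
    s += sgl.assistant(sgl.gen("answer_1", max_tokens=256))
    s += sgl.user(question_2)
    s += sgl.assistant(sgl.gen("answer_2", max_tokens=256))

# Assumes the OpenAI backend installed via `pip install "sglang[openai]"`.
sgl.set_default_backend(sgl.OpenAI("gpt-3.5-turbo"))

state = multi_turn_question.run(
    question_1="What is the capital of the United States?",
    question_2="List two local attractions.",
)
for m in state.messages():
    print(m["role"], ":", m["content"])
```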
@@ -603,11 +609,15 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
  ```
  python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --tp 2
  ```
+ - Add `--dp 2` to enable data parallelism. It can also be used together with tp. Data parallelism is better for throughput if there is enough memory.
+ ```
+ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --dp 2 --tp 2
+ ```
  - If you see out-of-memory errors during serving, please try to reduce the memory usage of the KV cache pool by setting a smaller value of `--mem-fraction-static`. The default value is `0.9`
  ```
  python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --mem-fraction-static 0.7
  ```
- - You can turn on [flashinfer](docs/flashinfer.md) to accelerate the inference by using highly optimized CUDA kernels.
+ - See [hyperparameter_tuning.md](docs/hyperparameter_tuning.md) on tuning hyperparameters for better performance.

  ### Supported Models
  - Llama
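The launch flags in the hunk above also have an in-process counterpart. A hedged sketch, assuming `sgl.Runtime` forwards the same server arguments as the CLI (`--tp 2` → `tp_size=2`, `--mem-fraction-static 0.7` → `mem_fraction_static=0.7`):

```python
import sglang as sgl

# Start a local inference runtime instead of a standalone server process.
runtime = sgl.Runtime(
    model_path="meta-llama/Llama-2-7b-chat-hf",
    tp_size=2,                # tensor parallelism across 2 GPUs
    mem_fraction_static=0.7,  # shrink the KV cache pool to avoid OOM
)
sgl.set_default_backend(runtime)

# ... run sglang programs against the runtime ...

runtime.shutdown()
```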
@@ -621,6 +631,8 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
  - `python3 -m sglang.launch_server --model-path liuhaotian/llava-v1.5-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --chat-template vicuna_v1.1 --port 30000`
  - `python3 -m sglang.launch_server --model-path liuhaotian/llava-v1.6-vicuna-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --chat-template vicuna_v1.1 --port 30000`
  - `python3 -m sglang.launch_server --model-path liuhaotian/llava-v1.6-34b --tokenizer-path liuhaotian/llava-v1.6-34b-tokenizer --port 30000`
+ - LLaVA-NeXT-Video
+   - see [srt_example_llava_v.sh](examples/usage/llava_video/srt_example_llava_v.sh)
  - Yi-VL
    - see [srt_example_yi_vl.py](examples/quick_start/srt_example_yi_vl.py).
  - StableLM
@@ -637,17 +649,18 @@ Instructions for supporting a new model are [here](https://github.com/sgl-projec
  - Mixtral-8x7B on NVIDIA A10G, FP16, Tensor Parallelism=8
  ![mixtral_8x7b](assets/mixtral_8x7b.jpg)

- Learn more [here](docs/benchmark_results.md).
+ - Learn more about the above [results](docs/benchmark_results.md).
+ - Synthetic latency and throughput benchmark [scripts](https://github.com/sgl-project/sglang/tree/main/benchmark/latency_throughput).

  ## Roadmap
  https://github.com/sgl-project/sglang/issues/157

  ## Citation And Acknowledgment
  ```
- @misc{zheng2023efficiently,
-   title={Efficiently Programming Large Language Models using SGLang},
-   author={Lianmin Zheng and Liangsheng Yin and Zhiqiang Xie and Jeff Huang and Chuyue Sun and Cody Hao Yu and Shiyi Cao and Christos Kozyrakis and Ion Stoica and Joseph E. Gonzalez and Clark Barrett and Ying Sheng},
-   year={2023},
+ @misc{zheng2024sglang,
+   title={SGLang: Efficient Execution of Structured Language Model Programs},
+   author={Lianmin Zheng and Liangsheng Yin and Zhiqiang Xie and Chuyue Sun and Jeff Huang and Cody Hao Yu and Shiyi Cao and Christos Kozyrakis and Ion Stoica and Joseph E. Gonzalez and Clark Barrett and Ying Sheng},
+   year={2024},
    eprint={2312.07104},
    archivePrefix={arXiv},
    primaryClass={cs.AI}
sglang-0.1.18.dist-info/RECORD (new file)

@@ -0,0 +1,78 @@
+ sglang/__init__.py,sha256=PhkN9MopSdHLXHG9_7l5JB-awRDI9CdR6Qht1vWA9C8,1116
+ sglang/api.py,sha256=92oqUgVeKq9B9If2A8LHzEhPicZK5Rq3rKUShwPAq0E,4579
+ sglang/bench_latency.py,sha256=MNxmVCwBM7ZWFYSFy2m-y8MmEWNWvZO2gUBbuMyWSBI,10018
+ sglang/global_config.py,sha256=xMX7JqPgDRwtvcbULkwHJ-bfysNefEN42V3BGss9mlo,1425
+ sglang/launch_server.py,sha256=X8TX6M-tv9JWHJkWnJskYNc0IZBooecI_yzpBHVf5KU,364
+ sglang/launch_server_llavavid.py,sha256=cxGJICBTYVgHVNy7NWwitY7VXt11kEnh7npkcB-iRf8,1115
+ sglang/utils.py,sha256=arJuwOAEX445M2NL9SAOi6jBNu0-cfU04PLAr-hIH3U,8168
+ sglang/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ sglang/backend/anthropic.py,sha256=iJjXiDMZbtvX2XNG78MG9kM7SpZq9hmXVuzT_T18elw,2076
+ sglang/backend/base_backend.py,sha256=APiMht4WYECLCOGRPCEUF6lX-an1vjVe2dWoMSgymWY,1831
+ sglang/backend/litellm.py,sha256=ZqsEZXgxLge-Fh3SMr1XkVPU7z3FKntpRppNwd1a12s,2447
+ sglang/backend/openai.py,sha256=Id4vDzfefG9R7AqJBMXqYmKHv2FMu0PBSYEGbK7Q510,14803
+ sglang/backend/runtime_endpoint.py,sha256=8NyWgMvhzUcA5VEsPLo1AacZ_UPVSnpxpzt6vYdVQSU,8871
+ sglang/backend/vertexai.py,sha256=XNkbUzOdLIz-1qP_BBieYIfUXZf6gsfdghlaulNpBM8,4714
+ sglang/lang/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ sglang/lang/chat_template.py,sha256=ogIT8iMlDcSEgcNBTh5pRLoCkdQI_ec5Hc27wFUFDIg,11532
+ sglang/lang/compiler.py,sha256=UiXUmPR9wBAPtnORrLcyQX8Uh0ZL0nKeV8ZgBozAJPw,7531
+ sglang/lang/interpreter.py,sha256=MMdvli-75ySiKiULlsnoVmb8oEu5bvSkYz8GRdtZoVk,29494
+ sglang/lang/ir.py,sha256=KZxXVva2r1UihYOVWRKcU_zILMx05oWV2yLy3SeZfnA,14603
+ sglang/lang/tracer.py,sha256=QcslAObEjepk8XmiqCobwzWaDpihofEQXjeRs_3B8NQ,8282
+ sglang/srt/conversation.py,sha256=kuMrdYtcpy2F7qACMEYdD1CniP6HHNRSvhqVZe8jj_w,15522
+ sglang/srt/flush_cache.py,sha256=SJsbZnmDhH-gb9ch3hIwnI_nuwaOLlKvlXADyLBGENk,403
+ sglang/srt/hf_transformers_utils.py,sha256=P6eXfGwH-OeU6hDrlGYL5GACcTNPdYOimpKZ0ZBZUao,10683
+ sglang/srt/memory_pool.py,sha256=5bqI8d5_JURbKwIhv1BwlcIO2IDHewHvIqezPG-b_5M,3284
+ sglang/srt/mm_utils.py,sha256=OptgAHDX-73Bk4jAdr2BOAJtiEXJNzPrMhaM-dy275c,8889
+ sglang/srt/model_config.py,sha256=eGt8hTtipSTqp-AsB-Cl4wfZDb14CTcOtIz-iXgaVk8,4997
+ sglang/srt/openai_api_adapter.py,sha256=pqGP0bON-wEZOnZyo85gzrO9MSzeIkHh5xqhpN5RkyY,15120
+ sglang/srt/openai_protocol.py,sha256=CNJOMr3PJvoRGI2TIh9t8f_4wYTtT0EF8kzsrYsASYY,5350
+ sglang/srt/sampling_params.py,sha256=dQbVr7JmTJ9JEn_sy3clB56yT9kyr9ldWFZ-GaNXOy0,3023
+ sglang/srt/server.py,sha256=742w8gn0GgE7w3EwgPhq7MYabaVxcdPpMAovEE6-DaU,13112
+ sglang/srt/server_args.py,sha256=j0-Aj8sHQ-zgumd4w0IaezRqDdjDC6MMMG5M8zzITVw,12166
+ sglang/srt/utils.py,sha256=V2C4fb93oKS4D3lezlRgHkD7MQDNBZlIy_4ZTNzAC9E,19423
+ sglang/srt/constrained/__init__.py,sha256=Q-XnKFChC9q6WDCnJKAKAuXzKHHg4QoFlYODge8ZKCs,1504
+ sglang/srt/constrained/base_cache.py,sha256=QQjmFEiT8jlOskJoZobhrDl2TKB-B4b1LPQo9JQCP_w,1405
+ sglang/srt/constrained/fsm_cache.py,sha256=P4qNDHHxpKpTnYL_8V1R6OFXlUwbM6ZcBdzddpcBgb4,1135
+ sglang/srt/constrained/jump_forward.py,sha256=s60jZ7Ue8zaodgQm7gDpN6pSedpvpUck_waJALUMj60,5615
+ sglang/srt/layers/context_flashattention_nopad.py,sha256=bENdVltDozccR5mLY_CcYDjqLob28tHA9f2s03D8UFQ,5210
+ sglang/srt/layers/extend_attention.py,sha256=JUYuYSAhfbgOXrwIK5YHJCXPq54a6IZ7vQrze-3VvMQ,12955
+ sglang/srt/layers/fused_moe.py,sha256=M_cTHMNSoD-wdh6XjzHseuq3zsdqOmECWxNeEVJklu4,22257
+ sglang/srt/layers/logits_processor.py,sha256=t-bZIcGj70KKf2Jcor9K7Va1NsBlDVNrQ4Ktlq0lUlU,8506
+ sglang/srt/layers/radix_attention.py,sha256=XsHFf7myNKZwyt3qB5LEXAttTKMY9OP3M3t5CZnyu3g,6911
+ sglang/srt/layers/token_attention.py,sha256=rVbPlFpmLoU3nx3qtK2YZdynDxfvMKtQNTPeKi0KNP0,8823
+ sglang/srt/managers/detokenizer_manager.py,sha256=tOjURt-XQofPblnGECoJfoRSMPoWFVAH99R05hXeYNw,3353
+ sglang/srt/managers/io_struct.py,sha256=O1cz6hDV6BjXbZ0-tk6VaDNjYFuMBUOGswbG3H_GliY,4532
+ sglang/srt/managers/tokenizer_manager.py,sha256=TswupFKrlXAvUM5-1eT2cR6uNJoQVivp2MQkEFu4axQ,14848
+ sglang/srt/managers/controller/dp_worker.py,sha256=ES3-jyxGfHzpgVoXub_3qjVygwfWYWpfN4vuVWU23Gs,3675
+ sglang/srt/managers/controller/infer_batch.py,sha256=-Q17Pk_Mmccobxly7UM8wCC6dYKJ4zmjplMboN1q8b0,25700
+ sglang/srt/managers/controller/manager_multi.py,sha256=Z0a-iZzqk8T2Xl7ak2DgE9j00GA6Eb0XoNVx7UlxKa4,6630
+ sglang/srt/managers/controller/manager_single.py,sha256=OIm_BjbDaEcYmpb_E_7wv0xfOlb2le0zXjPMqf1pU9U,3468
+ sglang/srt/managers/controller/model_runner.py,sha256=HjOHp_Rtdm7OnMmhtnSwPWPmEYHDpnt5LjeKbiYb6mo,21718
+ sglang/srt/managers/controller/radix_cache.py,sha256=fMqIm1fTvufI9I_QMoFLfQMkSUWp8VN4wh3-63KJUL0,8193
+ sglang/srt/managers/controller/schedule_heuristic.py,sha256=_ne7W2mrpuO794uh5tYLR3q6XBbgTMdNmE6VpzY1sJE,2312
+ sglang/srt/managers/controller/tp_worker.py,sha256=VNVQ0oqPGllC00cZCxHB-0LqudxgS74jf-it2zDHzTA,31411
+ sglang/srt/models/chatglm.py,sha256=BU0rdp-GCUZcmctBYFFo6i5s5XOUJCQbr-v4EQjwJKo,13275
+ sglang/srt/models/commandr.py,sha256=hHsNQWi0X8rNL7_gpcoUxQxdhxtvx5_RVx8u6cLzqYQ,13606
+ sglang/srt/models/dbrx.py,sha256=lv0nXFGJnmv6toUBRv7q7M1ZTrI3VACrvLBKHA6xdjE,14074
+ sglang/srt/models/gemma.py,sha256=DweoalfWYhLL-ZWLAO5gl4SCZflWmejVeDG3Vky_WNo,11719
+ sglang/srt/models/grok.py,sha256=oy-QoCvUKKQO2sR6a_qwHm10Fc0t-ka4I-1uEGGW3j8,27274
+ sglang/srt/models/llama2.py,sha256=7aPPSLABRIy7_iy4YvFHV7Beqc2I1-Vc1obSbsgzNzY,12190
+ sglang/srt/models/llama_classification.py,sha256=LrPRFB-Yd2haZADNY3uIusbajQwcZNQrOCTd92L2vS0,4304
+ sglang/srt/models/llava.py,sha256=M0zQwOvnqYkTQgH2aJqsjLLIXQNkadO61UCPpx8A1zQ,17903
+ sglang/srt/models/llavavid.py,sha256=7NQ5IzC8G1yrsNbFYS_8CAUpuh0LxM9vEPKD2IZT99g,13029
+ sglang/srt/models/mistral.py,sha256=XSn7fiZqspyWVTYrpVAacAnWdwAybBtyn9-Sh9AvMTM,254
+ sglang/srt/models/mixtral.py,sha256=lpasWpwvWPHqSQ1Vskr2kL3e_oBxRxlYK6bk6sf61AQ,20810
+ sglang/srt/models/mixtral_quant.py,sha256=SMqOnuToJ8pz_7wb10pn7Uib15cXBcqSrtGsh5sVhw8,13635
+ sglang/srt/models/qwen.py,sha256=fTRtEXdYPWIOtmwKb4kVFrq65w7AYxjsYqV8ar5mmac,9419
+ sglang/srt/models/qwen2.py,sha256=F3k21F_CCqFJMIkzLC-1mIFQOgtEHbuZfIaautNC8-s,11465
+ sglang/srt/models/stablelm.py,sha256=LbO8rruVkvvLng6pVHG4wjbewrGfMLm9vKxK41V2W_s,10781
+ sglang/srt/models/yivl.py,sha256=55KPrQ-dVplI0hh2WCSugjc1luE0J2UAafjZxu_7Xuc,4367
+ sglang/test/test_conversation.py,sha256=1zIrXcXiwEliPHgDAsqsQUA7JKzZ5fnQEU-U6L887FU,1592
+ sglang/test/test_openai_protocol.py,sha256=eePzoskYR3PqfWczSVZvg8ja63qbT8TFUNEMyzDZpa8,1657
+ sglang/test/test_programs.py,sha256=g80P0QWO8Jv_87onTCsvJ-2MgSh7I6_lzcfdm43JlNY,13616
+ sglang/test/test_utils.py,sha256=Mjn2btfmEQQ7rpsLfNo6VugXCPzUmRpNhssWvxevN4s,11038
+ sglang-0.1.18.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+ sglang-0.1.18.dist-info/METADATA,sha256=tDdBZo2qvH8wWC4faXxfryjh7-6frEsBnH0vJ_ia1w4,29752
+ sglang-0.1.18.dist-info/WHEEL,sha256=y4mX-SOX4fYIkonsAGA5N0Oy-8_gI4FXw5HNI1xqvWg,91
+ sglang-0.1.18.dist-info/top_level.txt,sha256=yxhh3pYQkcnA7v3Bg889C2jZhvtJdEincysO7PEB09M,7
+ sglang-0.1.18.dist-info/RECORD,,
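For readers unfamiliar with the RECORD format above: each row is `path,sha256=<urlsafe-base64 digest>,size-in-bytes`, with the digest's `=` padding stripped per the wheel spec, and RECORD listing itself with empty hash and size fields. A small verification sketch (assumes it is run from the unpacked wheel root so the listed paths resolve):

```python
import base64
import csv
import hashlib

def verify_record(record_path: str) -> None:
    """Check every hash and size listed in a wheel RECORD file."""
    with open(record_path, newline="") as f:
        for path, digest, size in csv.reader(f):
            if not digest:  # RECORD lists itself with empty hash/size
                continue
            algo, _, expected = digest.partition("=")  # e.g. "sha256", "PhkN9Mop..."
            data = open(path, "rb").read()
            actual = (
                base64.urlsafe_b64encode(hashlib.new(algo, data).digest())
                .rstrip(b"=")  # the spec strips base64 padding
                .decode()
            )
            assert actual == expected, f"hash mismatch: {path}"
            assert len(data) == int(size), f"size mismatch: {path}"

verify_record("sglang-0.1.18.dist-info/RECORD")
```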
{sglang-0.1.16.dist-info → sglang-0.1.18.dist-info}/WHEEL

@@ -1,5 +1,5 @@
  Wheel-Version: 1.0
- Generator: bdist_wheel (0.43.0)
+ Generator: setuptools (70.2.0)
  Root-Is-Purelib: true
  Tag: py3-none-any

sglang/srt/backend_config.py (removed)

@@ -1,13 +0,0 @@
- """
- Backend configurations, may vary with different serving platforms.
- """
-
- from dataclasses import dataclass
-
-
- @dataclass
- class BackendConfig:
-     extend_dependency_time: float = 0.03
-
-
- GLOBAL_BACKEND_CONFIG = BackendConfig()
sglang/srt/models/dbrx_config.py (removed)

@@ -1,281 +0,0 @@
- # Adapted from:
- # https://github.com/vllm-project/vllm/blob/14ccd94c89d0ffd9da283545d93ab1dfea5da340/vllm/transformers_utils/configs/dbrx.py
- # yapf: disable
- # ruff: noqa: E501
- # coding=utf-8
- # Copied from
- # https://huggingface.co/databricks/dbrx-base/blob/main/configuration_dbrx.py
- """Dbrx configuration."""
-
- # FIXME: remove this once vllm releases a new version
-
- from typing import Any, Optional
-
- from transformers.configuration_utils import PretrainedConfig
- from transformers.utils import logging
-
- logger = logging.get_logger(__name__)
-
- DBRX_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
-
-
- class DbrxAttentionConfig(PretrainedConfig):
-     """Configuration class for Dbrx Attention.
-
-     [`DbrxAttention`] class. It is used to instantiate attention layers
-     according to the specified arguments, defining the layers architecture.
-
-     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
-     documentation from [`PretrainedConfig`] for more information.
-
-     Args:
-         attn_pdrop (`float`, *optional*, defaults to 0.0):
-             The dropout probability for the attention layers.
-         clip_qkv (`float`, *optional*, defaults to None):
-             If not `None`, clip the queries, keys, and values in the attention layer to this value.
-         kv_n_heads (Optional[int]): For grouped_query_attention only, allow user to specify number of kv heads.
-         rope_theta (float): The base frequency for rope.
-     """
-
-     def __init__(
-         self,
-         attn_pdrop: float = 0,
-         clip_qkv: Optional[float] = None,
-         kv_n_heads: int = 1,
-         rope_theta: float = 10000.0,
-         **kwargs: Any,
-     ):
-         super().__init__(**kwargs)
-         self.attn_pdrop = attn_pdrop
-         self.clip_qkv = clip_qkv
-         self.kv_n_heads = kv_n_heads
-         self.rope_theta = rope_theta
-
-         for k in ["model_type"]:
-             if k in kwargs:
-                 kwargs.pop(k)
-         if len(kwargs) != 0:
-             raise ValueError(f"Found unknown {kwargs=}")
-
-     @classmethod
-     def from_pretrained(
-         cls, pretrained_model_name_or_path: str, **kwargs: Any
-     ) -> "PretrainedConfig":
-         cls._set_token_in_kwargs(kwargs)
-
-         config_dict, kwargs = cls.get_config_dict(
-             pretrained_model_name_or_path, **kwargs
-         )
-
-         if config_dict.get("model_type") == "dbrx":
-             config_dict = config_dict["attn_config"]
-
-         if (
-             "model_type" in config_dict
-             and hasattr(cls, "model_type")
-             and config_dict["model_type"] != cls.model_type
-         ):
-             logger.warning(
-                 f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
-                 + f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
-             )
-
-         return cls.from_dict(config_dict, **kwargs)
-
-
- class DbrxFFNConfig(PretrainedConfig):
-     """Configuration class for Dbrx FFN.
-
-     [`DbrxFFN`] class. It is used to instantiate feedforward layers according to
-     the specified arguments, defining the layers architecture.
-
-     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
-     documentation from [`PretrainedConfig`] for more information.
-
-     Args:
-         ffn_act_fn (dict, optional): A dict specifying activation function for the FFN.
-             The dict should have a key 'name' with the value being the name of
-             the activation function along with any additional keyword arguments.
-         ffn_hidden_size (int, optional): The hidden size of the feedforward network.
-         moe_num_experts (int, optional): The number of experts in the mixture of experts layer.
-         moe_top_k (int, optional): The number of experts to use in the mixture of experts layer.
-         moe_jitter_eps (float, optional): The jitter epsilon for the mixture of experts layer.
-         moe_loss_weight (float, optional): The loss weight for the mixture of experts layer.
-         moe_normalize_expert_weights (float, optional): The normalization factor for the expert weights.
-         uniform_expert_assignment (bool, optional): Whether to use uniform expert assignment.
-             This should only be used for benchmarking purposes.
-     """
-
-     def __init__(
-         self,
-         ffn_act_fn: Optional[dict] = None,
-         ffn_hidden_size: int = 3584,
-         moe_num_experts: int = 4,
-         moe_top_k: int = 1,
-         moe_jitter_eps: Optional[float] = None,
-         moe_loss_weight: float = 0.01,
-         moe_normalize_expert_weights: Optional[float] = 1,
-         uniform_expert_assignment: bool = False,
-         **kwargs: Any,
-     ):
-         super().__init__()
-         if ffn_act_fn is None:
-             ffn_act_fn = {"name": "silu"}
-         self.ffn_act_fn = ffn_act_fn
-         self.ffn_hidden_size = ffn_hidden_size
-         self.moe_num_experts = moe_num_experts
-         self.moe_top_k = moe_top_k
-         self.moe_jitter_eps = moe_jitter_eps
-         self.moe_loss_weight = moe_loss_weight
-         self.moe_normalize_expert_weights = moe_normalize_expert_weights
-         self.uniform_expert_assignment = uniform_expert_assignment
-
-         for k in ["model_type"]:
-             if k in kwargs:
-                 kwargs.pop(k)
-         if len(kwargs) != 0:
-             raise ValueError(f"Found unknown {kwargs=}")
-
-     @classmethod
-     def from_pretrained(
-         cls, pretrained_model_name_or_path: str, **kwargs: Any
-     ) -> "PretrainedConfig":
-         cls._set_token_in_kwargs(kwargs)
-
-         config_dict, kwargs = cls.get_config_dict(
-             pretrained_model_name_or_path, **kwargs
-         )
-
-         if config_dict.get("model_type") == "dbrx":
-             config_dict = config_dict["ffn_config"]
-
-         if (
-             "model_type" in config_dict
-             and hasattr(cls, "model_type")
-             and config_dict["model_type"] != cls.model_type
-         ):
-             logger.warning(
-                 f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
-                 + f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
-             )
-
-         return cls.from_dict(config_dict, **kwargs)
-
-
- class DbrxConfig(PretrainedConfig):
-     """Configuration class for Dbrx.
-
-     [`DbrxModel`]. It is used to instantiate a Dbrx model according to the
-     specified arguments, defining the model architecture.
-
-     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
-     documentation from [`PretrainedConfig`] for more information.
-
-
-     Args:
-         d_model (`int`, *optional*, defaults to 6144):
-             Dimensionality of the embeddings and hidden states.
-         n_heads (`int`, *optional*, defaults to 48):
-             Number of attention heads for each attention layer in the Transformer encoder.
-         n_layers (`int`, *optional*, defaults to 40):
-             Number of hidden layers in the Transformer encoder.
-         max_seq_len (`int`, *optional*, defaults to 32768):
-             The maximum sequence length of the model.
-         vocab_size (`int`, *optional*, defaults to 100352):
-             Vocabulary size of the Dbrx model. Defines the maximum number of different tokens that can be represented by
-             the `inputs_ids` passed when calling [`DbrxModel`].
-         resid_pdrop (`float`, *optional*, defaults to 0.0):
-             The dropout probability applied to the attention output before combining with residual.
-         emb_pdrop (`float`, *optional*, defaults to 0.0):
-             The dropout probability for the embedding layer.
-         attn_config (`dict`, *optional*):
-             A dictionary used to configure the model's attention module.
-         ffn_config (`dict`, *optional*):
-             A dictionary used to configure the model's FFN module.
-         use_cache (`bool`, *optional*, defaults to `False`):
-             Whether or not the model should return the last key/values attentions (not used by all models).
-         initializer_range (`float`, *optional*, defaults to 0.02):
-             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-         output_router_logits (`bool`, *optional*, defaults to `False`):
-             Whether or not the router logits should be returned by the model. Enabling this will also
-             allow the model to output the auxiliary loss. See [here]() for more details
-         router_aux_loss_coef (`float`, *optional*, defaults to 0.001):
-             The aux loss factor for the total loss.
-
-
-     Example:
-     ```python
-     >>> from transformers import DbrxConfig, DbrxModel
-
-     >>> # Initializing a Dbrx configuration
-     >>> configuration = DbrxConfig()
-
-     >>> # Initializing a model (with random weights) from the configuration
-     >>> model = DbrxModel(configuration)
-
-     >>> # Accessing the model configuration
-     >>> configuration = model.config
-     ```
-     """
-
-     model_type = "dbrx"
-     attribute_map = {
-         "num_attention_heads": "n_heads",
-         "hidden_size": "d_model",
-         "num_hidden_layers": "n_layers",
-         "max_position_embeddings": "max_seq_len",
-     }
-
-     def __init__(
-         self,
-         d_model: int = 2048,
-         n_heads: int = 16,
-         n_layers: int = 24,
-         max_seq_len: int = 2048,
-         vocab_size: int = 32000,
-         resid_pdrop: float = 0.0,
-         emb_pdrop: float = 0.0,
-         attn_config: Optional[DbrxAttentionConfig] = None,
-         ffn_config: Optional[DbrxFFNConfig] = None,
-         use_cache: bool = True,
-         initializer_range: float = 0.02,
-         output_router_logits: bool = False,
-         router_aux_loss_coef: float = 0.05,
-         **kwargs: Any,
-     ):
-         if attn_config is None:
-             self.attn_config = DbrxAttentionConfig()
-         elif isinstance(attn_config, dict):
-             self.attn_config = DbrxAttentionConfig(**attn_config)
-         else:
-             self.attn_config = attn_config
-
-         if ffn_config is None:
-             self.ffn_config = DbrxFFNConfig()
-         elif isinstance(ffn_config, dict):
-             self.ffn_config = DbrxFFNConfig(**ffn_config)
-         else:
-             self.ffn_config = ffn_config
-
-         self.d_model = d_model
-         self.n_heads = n_heads
-         self.n_layers = n_layers
-         self.max_seq_len = max_seq_len
-         self.vocab_size = vocab_size
-         self.resid_pdrop = resid_pdrop
-         self.emb_pdrop = emb_pdrop
-         self.use_cache = use_cache
-         self.initializer_range = initializer_range
-         self.output_router_logits = output_router_logits
-         self.router_aux_loss_coef = router_aux_loss_coef
-
-         tie_word_embeddings = kwargs.pop("tie_word_embeddings", False)
-         if tie_word_embeddings:
-             raise ValueError(
-                 "tie_word_embeddings is not supported for Dbrx models."
-             )
-
-         super().__init__(
-             tie_word_embeddings=tie_word_embeddings,
-             **kwargs,
-         )
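The removed file's `# FIXME` note explains this deletion: it was a stopgap copy of vllm's DBRX config, and this release pins `vllm ==0.5.0` in METADATA, so the vendored copy is no longer needed. A hedged sketch of relying on the upstream config instead; the import path is taken from the URL in the removed file's header and is assumed to still exist in vllm 0.5.0:

```python
# Assumption: vllm 0.5.0 exposes DbrxConfig at the module path cited in the
# removed file's header comment, making the vendored copy above redundant.
from vllm.transformers_utils.configs.dbrx import DbrxConfig

config = DbrxConfig()  # defaults mirror the deleted sglang copy
print(config.n_heads, config.d_model, config.max_seq_len)
```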