sglang 0.1.16__tar.gz → 0.1.18__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89)
  1. {sglang-0.1.16/sglang.egg-info → sglang-0.1.18}/PKG-INFO +40 -27
  2. {sglang-0.1.16 → sglang-0.1.18}/README.md +27 -16
  3. {sglang-0.1.16 → sglang-0.1.18}/pyproject.toml +9 -7
  4. {sglang-0.1.16 → sglang-0.1.18}/sglang/__init__.py +3 -1
  5. {sglang-0.1.16 → sglang-0.1.18}/sglang/api.py +7 -7
  6. {sglang-0.1.16 → sglang-0.1.18}/sglang/backend/anthropic.py +1 -1
  7. sglang-0.1.18/sglang/backend/litellm.py +90 -0
  8. {sglang-0.1.16 → sglang-0.1.18}/sglang/backend/openai.py +158 -11
  9. {sglang-0.1.16 → sglang-0.1.18}/sglang/backend/runtime_endpoint.py +18 -10
  10. sglang-0.1.18/sglang/bench_latency.py +299 -0
  11. {sglang-0.1.16 → sglang-0.1.18}/sglang/global_config.py +12 -2
  12. {sglang-0.1.16 → sglang-0.1.18}/sglang/lang/compiler.py +2 -2
  13. {sglang-0.1.16 → sglang-0.1.18}/sglang/lang/interpreter.py +114 -67
  14. {sglang-0.1.16 → sglang-0.1.18}/sglang/lang/ir.py +28 -3
  15. {sglang-0.1.16 → sglang-0.1.18}/sglang/launch_server.py +4 -1
  16. {sglang-0.1.16 → sglang-0.1.18}/sglang/launch_server_llavavid.py +2 -1
  17. {sglang-0.1.16 → sglang-0.1.18}/sglang/srt/constrained/__init__.py +13 -6
  18. {sglang-0.1.16 → sglang-0.1.18}/sglang/srt/constrained/fsm_cache.py +8 -2
  19. sglang-0.1.18/sglang/srt/constrained/jump_forward.py +164 -0
  20. {sglang-0.1.16 → sglang-0.1.18}/sglang/srt/conversation.py +2 -0
  21. {sglang-0.1.16 → sglang-0.1.18}/sglang/srt/flush_cache.py +3 -1
  22. {sglang-0.1.16 → sglang-0.1.18}/sglang/srt/hf_transformers_utils.py +130 -1
  23. {sglang-0.1.16 → sglang-0.1.18}/sglang/srt/layers/extend_attention.py +17 -0
  24. sglang-0.1.18/sglang/srt/layers/fused_moe.py +582 -0
  25. {sglang-0.1.16 → sglang-0.1.18}/sglang/srt/layers/logits_processor.py +65 -32
  26. {sglang-0.1.16 → sglang-0.1.18}/sglang/srt/layers/radix_attention.py +41 -7
  27. {sglang-0.1.16 → sglang-0.1.18}/sglang/srt/layers/token_attention.py +16 -1
  28. sglang-0.1.18/sglang/srt/managers/controller/dp_worker.py +113 -0
  29. {sglang-0.1.16/sglang/srt/managers/router → sglang-0.1.18/sglang/srt/managers/controller}/infer_batch.py +242 -100
  30. sglang-0.1.18/sglang/srt/managers/controller/manager_multi.py +191 -0
  31. sglang-0.1.16/sglang/srt/managers/router/manager.py → sglang-0.1.18/sglang/srt/managers/controller/manager_single.py +34 -14
  32. {sglang-0.1.16/sglang/srt/managers/router → sglang-0.1.18/sglang/srt/managers/controller}/model_runner.py +262 -158
  33. {sglang-0.1.16/sglang/srt/managers/router → sglang-0.1.18/sglang/srt/managers/controller}/radix_cache.py +11 -1
  34. sglang-0.1.16/sglang/srt/managers/router/scheduler.py → sglang-0.1.18/sglang/srt/managers/controller/schedule_heuristic.py +9 -7
  35. sglang-0.1.16/sglang/srt/managers/router/model_rpc.py → sglang-0.1.18/sglang/srt/managers/controller/tp_worker.py +298 -267
  36. sglang-0.1.18/sglang/srt/managers/detokenizer_manager.py +91 -0
  37. {sglang-0.1.16 → sglang-0.1.18}/sglang/srt/managers/io_struct.py +22 -12
  38. {sglang-0.1.16 → sglang-0.1.18}/sglang/srt/managers/tokenizer_manager.py +151 -87
  39. sglang-0.1.18/sglang/srt/model_config.py +125 -0
  40. sglang-0.1.18/sglang/srt/models/chatglm.py +399 -0
  41. {sglang-0.1.16 → sglang-0.1.18}/sglang/srt/models/commandr.py +10 -13
  42. {sglang-0.1.16 → sglang-0.1.18}/sglang/srt/models/dbrx.py +9 -15
  43. {sglang-0.1.16 → sglang-0.1.18}/sglang/srt/models/gemma.py +12 -15
  44. sglang-0.1.18/sglang/srt/models/grok.py +738 -0
  45. {sglang-0.1.16 → sglang-0.1.18}/sglang/srt/models/llama2.py +26 -15
  46. sglang-0.1.18/sglang/srt/models/llama_classification.py +104 -0
  47. {sglang-0.1.16 → sglang-0.1.18}/sglang/srt/models/llava.py +86 -19
  48. {sglang-0.1.16 → sglang-0.1.18}/sglang/srt/models/llavavid.py +11 -20
  49. sglang-0.1.18/sglang/srt/models/mixtral.py +562 -0
  50. sglang-0.1.16/sglang/srt/models/mixtral.py → sglang-0.1.18/sglang/srt/models/mixtral_quant.py +11 -22
  51. {sglang-0.1.16 → sglang-0.1.18}/sglang/srt/models/qwen.py +9 -13
  52. {sglang-0.1.16 → sglang-0.1.18}/sglang/srt/models/qwen2.py +11 -13
  53. {sglang-0.1.16 → sglang-0.1.18}/sglang/srt/models/stablelm.py +9 -15
  54. {sglang-0.1.16 → sglang-0.1.18}/sglang/srt/models/yivl.py +17 -22
  55. {sglang-0.1.16 → sglang-0.1.18}/sglang/srt/openai_api_adapter.py +150 -95
  56. {sglang-0.1.16 → sglang-0.1.18}/sglang/srt/openai_protocol.py +11 -2
  57. {sglang-0.1.16 → sglang-0.1.18}/sglang/srt/server.py +124 -48
  58. {sglang-0.1.16 → sglang-0.1.18}/sglang/srt/server_args.py +128 -48
  59. {sglang-0.1.16 → sglang-0.1.18}/sglang/srt/utils.py +234 -67
  60. {sglang-0.1.16 → sglang-0.1.18}/sglang/test/test_programs.py +65 -3
  61. {sglang-0.1.16 → sglang-0.1.18}/sglang/test/test_utils.py +32 -1
  62. {sglang-0.1.16 → sglang-0.1.18}/sglang/utils.py +23 -4
  63. {sglang-0.1.16 → sglang-0.1.18/sglang.egg-info}/PKG-INFO +40 -27
  64. {sglang-0.1.16 → sglang-0.1.18}/sglang.egg-info/SOURCES.txt +15 -9
  65. {sglang-0.1.16 → sglang-0.1.18}/sglang.egg-info/requires.txt +14 -11
  66. sglang-0.1.16/sglang/srt/backend_config.py +0 -13
  67. sglang-0.1.16/sglang/srt/constrained/jump_forward.py +0 -76
  68. sglang-0.1.16/sglang/srt/managers/detokenizer_manager.py +0 -95
  69. sglang-0.1.16/sglang/srt/model_config.py +0 -47
  70. sglang-0.1.16/sglang/srt/models/dbrx_config.py +0 -281
  71. sglang-0.1.16/sglang/srt/weight_utils.py +0 -417
  72. {sglang-0.1.16 → sglang-0.1.18}/LICENSE +0 -0
  73. {sglang-0.1.16 → sglang-0.1.18}/setup.cfg +0 -0
  74. {sglang-0.1.16 → sglang-0.1.18}/sglang/backend/__init__.py +0 -0
  75. {sglang-0.1.16 → sglang-0.1.18}/sglang/backend/base_backend.py +0 -0
  76. {sglang-0.1.16 → sglang-0.1.18}/sglang/backend/vertexai.py +0 -0
  77. {sglang-0.1.16 → sglang-0.1.18}/sglang/lang/__init__.py +0 -0
  78. {sglang-0.1.16 → sglang-0.1.18}/sglang/lang/chat_template.py +0 -0
  79. {sglang-0.1.16 → sglang-0.1.18}/sglang/lang/tracer.py +0 -0
  80. {sglang-0.1.16 → sglang-0.1.18}/sglang/srt/constrained/base_cache.py +0 -0
  81. {sglang-0.1.16 → sglang-0.1.18}/sglang/srt/layers/context_flashattention_nopad.py +0 -0
  82. {sglang-0.1.16 → sglang-0.1.18}/sglang/srt/memory_pool.py +0 -0
  83. {sglang-0.1.16 → sglang-0.1.18}/sglang/srt/mm_utils.py +0 -0
  84. {sglang-0.1.16 → sglang-0.1.18}/sglang/srt/models/mistral.py +0 -0
  85. {sglang-0.1.16 → sglang-0.1.18}/sglang/srt/sampling_params.py +0 -0
  86. {sglang-0.1.16 → sglang-0.1.18}/sglang/test/test_conversation.py +0 -0
  87. {sglang-0.1.16 → sglang-0.1.18}/sglang/test/test_openai_protocol.py +0 -0
  88. {sglang-0.1.16 → sglang-0.1.18}/sglang.egg-info/dependency_links.txt +0 -0
  89. {sglang-0.1.16 → sglang-0.1.18}/sglang.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: sglang
- Version: 0.1.16
+ Version: 0.1.18
  Summary: A structured generation langauge for LLMs.
  License: Apache License
  Version 2.0, January 2004
@@ -213,34 +213,36 @@ Description-Content-Type: text/markdown
  License-File: LICENSE
  Requires-Dist: requests
  Requires-Dist: tqdm
+ Requires-Dist: numpy
  Provides-Extra: srt
  Requires-Dist: aiohttp; extra == "srt"
  Requires-Dist: fastapi; extra == "srt"
+ Requires-Dist: hf_transfer; extra == "srt"
+ Requires-Dist: huggingface_hub; extra == "srt"
+ Requires-Dist: interegular; extra == "srt"
+ Requires-Dist: packaging; extra == "srt"
+ Requires-Dist: pillow; extra == "srt"
  Requires-Dist: psutil; extra == "srt"
+ Requires-Dist: pydantic; extra == "srt"
  Requires-Dist: rpyc; extra == "srt"
  Requires-Dist: torch; extra == "srt"
- Requires-Dist: uvloop; extra == "srt"
  Requires-Dist: uvicorn; extra == "srt"
+ Requires-Dist: uvloop; extra == "srt"
  Requires-Dist: zmq; extra == "srt"
- Requires-Dist: vllm>=0.4.2; extra == "srt"
- Requires-Dist: interegular; extra == "srt"
- Requires-Dist: pydantic; extra == "srt"
- Requires-Dist: pillow; extra == "srt"
- Requires-Dist: packaging; extra == "srt"
- Requires-Dist: huggingface_hub; extra == "srt"
- Requires-Dist: hf_transfer; extra == "srt"
- Requires-Dist: outlines>=0.0.34; extra == "srt"
+ Requires-Dist: vllm==0.5.0; extra == "srt"
+ Requires-Dist: outlines>=0.0.44; extra == "srt"
  Provides-Extra: openai
  Requires-Dist: openai>=1.0; extra == "openai"
- Requires-Dist: numpy; extra == "openai"
  Requires-Dist: tiktoken; extra == "openai"
  Provides-Extra: anthropic
  Requires-Dist: anthropic>=0.20.0; extra == "anthropic"
- Requires-Dist: numpy; extra == "anthropic"
+ Provides-Extra: litellm
+ Requires-Dist: litellm>=1.0.0; extra == "litellm"
  Provides-Extra: all
  Requires-Dist: sglang[srt]; extra == "all"
  Requires-Dist: sglang[openai]; extra == "all"
  Requires-Dist: sglang[anthropic]; extra == "all"
+ Requires-Dist: sglang[litellm]; extra == "all"

  <div align="center">
  <img src="assets/logo.png" alt="logo" width="400"></img>
@@ -253,9 +255,9 @@ Requires-Dist: sglang[anthropic]; extra == "all"
  SGLang is a structured generation language designed for large language models (LLMs).
  It makes your interaction with LLMs faster and more controllable by co-designing the frontend language and the runtime system.

- The core features of SGLang include:
- - **A Flexible Front-End Language**: This allows for easy programming of LLM applications with multiple chained generation calls, advanced prompting techniques, control flow, multiple modalities, parallelism, and external interaction.
- - **A High-Performance Runtime with RadixAttention**: This feature significantly accelerates the execution of complex LLM programs by automatic KV cache reuse across multiple calls. It also supports other common techniques like continuous batching and tensor parallelism.
+ The core features include:
+ - **Flexible Frontend Language**: Enables easy programming of LLM applications with chained generation calls, advanced prompting, control flow, multiple modalities, parallelism, and external interactions.
+ - **High-Performance Backend Runtime**: Features RadixAttention for accelerating complex LLM programs by reusing the KV cache across multiple calls. It can also serve as a standalone engine with all common techniques implemented (e.g., continuous batching and tensor parallelism).

  ## News
  - [2024/02] 🔥 SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).
@@ -276,23 +278,27 @@ The core features of SGLang include:
  ### Method 1: With pip
  ```
  pip install "sglang[all]"
+
+ # Install FlashInfer CUDA kernels
+ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
  ```

  ### Method 2: From source
  ```
- git clone git@github.com:sgl-project/sglang.git
+ git clone https://github.com/sgl-project/sglang.git
  cd sglang

  pip install --upgrade pip
  pip install -e "python[all]"
+
+ # Install FlashInfer CUDA kernels
+ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
  ```

  ### Notes
- - If you are using older GPUs (NVIDIA V100, T4), please pick the correct triton compiler version to avoid some known bugs.
- - For NVIDIA T4, please use `pip install "triton>=2.2.0"`.
- - For NVIDIA V100, please install the [nightly](https://triton-lang.org/main/getting-started/installation.html) version.
- - If you only need to use the OpenAI backend, you can avoid installing other dependencies by using `pip install "sglang[openai]"`
-
+ - If you see errors from the Triton compiler, please install the [Triton Nightly](https://triton-lang.org/main/getting-started/installation.html).
+ - If you cannot install FlashInfer, check out its [installation](https://docs.flashinfer.ai/installation.html#) page. If you still cannot install it, you can use the slower Triton kernels by adding `--disable-flashinfer` when launching the server.
+ - If you only need to use the OpenAI backend, you can avoid installing other dependencies by using `pip install "sglang[openai]"`.

  ## Quick Start
  The example below shows how to use sglang to answer a mulit-turn question.
@@ -603,11 +609,15 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
  ```
  python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --tp 2
  ```
+ - Add `--dp 2` to enable data parallelism. It can also be used together with tp. Data parallelism is better for throughput if there is enough memory.
+ ```
+ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --dp 2 --tp 2
+ ```
  - If you see out-of-memory errors during serving, please try to reduce the memory usage of the KV cache pool by setting a smaller value of `--mem-fraction-static`. The default value is `0.9`
  ```
  python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --mem-fraction-static 0.7
  ```
- - You can turn on [flashinfer](docs/flashinfer.md) to accelerate the inference by using highly optimized CUDA kernels.
+ - See [hyperparameter_tuning.md](docs/hyperparameter_tuning.md) on tuning hyperparameters for better performance.

  ### Supported Models
  - Llama
@@ -621,6 +631,8 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
  - `python3 -m sglang.launch_server --model-path liuhaotian/llava-v1.5-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --chat-template vicuna_v1.1 --port 30000`
  - `python3 -m sglang.launch_server --model-path liuhaotian/llava-v1.6-vicuna-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --chat-template vicuna_v1.1 --port 30000`
  - `python3 -m sglang.launch_server --model-path liuhaotian/llava-v1.6-34b --tokenizer-path liuhaotian/llava-v1.6-34b-tokenizer --port 3000`
+ - LLaVA-NeXT-Video
+ - see [srt_example_llava_v.sh](examples/usage/llava_video/srt_example_llava_v.sh)
  - Yi-VL
  - see [srt_example_yi_vl.py](examples/quick_start/srt_example_yi_vl.py).
  - StableLM
@@ -637,17 +649,18 @@ Instructions for supporting a new model are [here](https://github.com/sgl-projec
  - Mixtral-8x7B on NVIDIA A10G, FP16, Tensor Parallelism=8
  ![mixtral_8x7b](assets/mixtral_8x7b.jpg)

- Learn more [here](docs/benchmark_results.md).
+ - Learn more about the above [results](docs/benchmark_results.md).
+ - Synthetic latency and throughput benchmark [scripts](https://github.com/sgl-project/sglang/tree/main/benchmark/latency_throughput).

  ## Roadmap
  https://github.com/sgl-project/sglang/issues/157

  ## Citation And Acknowledgment
  ```
- @misc{zheng2023efficiently,
- title={Efficiently Programming Large Language Models using SGLang},
- author={Lianmin Zheng and Liangsheng Yin and Zhiqiang Xie and Jeff Huang and Chuyue Sun and Cody Hao Yu and Shiyi Cao and Christos Kozyrakis and Ion Stoica and Joseph E. Gonzalez and Clark Barrett and Ying Sheng},
- year={2023},
+ @misc{zheng2024sglang,
+ title={SGLang: Efficient Execution of Structured Language Model Programs},
+ author={Lianmin Zheng and Liangsheng Yin and Zhiqiang Xie and Chuyue Sun and Jeff Huang and Cody Hao Yu and Shiyi Cao and Christos Kozyrakis and Ion Stoica and Joseph E. Gonzalez and Clark Barrett and Ying Sheng},
+ year={2024},
  eprint={2312.07104},
  archivePrefix={arXiv},
  primaryClass={cs.AI}
@@ -9,9 +9,9 @@
  SGLang is a structured generation language designed for large language models (LLMs).
  It makes your interaction with LLMs faster and more controllable by co-designing the frontend language and the runtime system.

- The core features of SGLang include:
- - **A Flexible Front-End Language**: This allows for easy programming of LLM applications with multiple chained generation calls, advanced prompting techniques, control flow, multiple modalities, parallelism, and external interaction.
- - **A High-Performance Runtime with RadixAttention**: This feature significantly accelerates the execution of complex LLM programs by automatic KV cache reuse across multiple calls. It also supports other common techniques like continuous batching and tensor parallelism.
+ The core features include:
+ - **Flexible Frontend Language**: Enables easy programming of LLM applications with chained generation calls, advanced prompting, control flow, multiple modalities, parallelism, and external interactions.
+ - **High-Performance Backend Runtime**: Features RadixAttention for accelerating complex LLM programs by reusing the KV cache across multiple calls. It can also serve as a standalone engine with all common techniques implemented (e.g., continuous batching and tensor parallelism).

  ## News
  - [2024/02] 🔥 SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).
@@ -32,23 +32,27 @@ The core features of SGLang include:
  ### Method 1: With pip
  ```
  pip install "sglang[all]"
+
+ # Install FlashInfer CUDA kernels
+ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
  ```

  ### Method 2: From source
  ```
- git clone git@github.com:sgl-project/sglang.git
+ git clone https://github.com/sgl-project/sglang.git
  cd sglang

  pip install --upgrade pip
  pip install -e "python[all]"
+
+ # Install FlashInfer CUDA kernels
+ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
  ```

  ### Notes
- - If you are using older GPUs (NVIDIA V100, T4), please pick the correct triton compiler version to avoid some known bugs.
- - For NVIDIA T4, please use `pip install "triton>=2.2.0"`.
- - For NVIDIA V100, please install the [nightly](https://triton-lang.org/main/getting-started/installation.html) version.
- - If you only need to use the OpenAI backend, you can avoid installing other dependencies by using `pip install "sglang[openai]"`
-
+ - If you see errors from the Triton compiler, please install the [Triton Nightly](https://triton-lang.org/main/getting-started/installation.html).
+ - If you cannot install FlashInfer, check out its [installation](https://docs.flashinfer.ai/installation.html#) page. If you still cannot install it, you can use the slower Triton kernels by adding `--disable-flashinfer` when launching the server.
+ - If you only need to use the OpenAI backend, you can avoid installing other dependencies by using `pip install "sglang[openai]"`.

  ## Quick Start
  The example below shows how to use sglang to answer a mulit-turn question.
@@ -359,11 +363,15 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
  ```
  python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --tp 2
  ```
+ - Add `--dp 2` to enable data parallelism. It can also be used together with tp. Data parallelism is better for throughput if there is enough memory.
+ ```
+ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --dp 2 --tp 2
+ ```
  - If you see out-of-memory errors during serving, please try to reduce the memory usage of the KV cache pool by setting a smaller value of `--mem-fraction-static`. The default value is `0.9`
  ```
  python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --mem-fraction-static 0.7
  ```
- - You can turn on [flashinfer](docs/flashinfer.md) to accelerate the inference by using highly optimized CUDA kernels.
+ - See [hyperparameter_tuning.md](docs/hyperparameter_tuning.md) on tuning hyperparameters for better performance.

  ### Supported Models
  - Llama
@@ -377,6 +385,8 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
  - `python3 -m sglang.launch_server --model-path liuhaotian/llava-v1.5-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --chat-template vicuna_v1.1 --port 30000`
  - `python3 -m sglang.launch_server --model-path liuhaotian/llava-v1.6-vicuna-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --chat-template vicuna_v1.1 --port 30000`
  - `python3 -m sglang.launch_server --model-path liuhaotian/llava-v1.6-34b --tokenizer-path liuhaotian/llava-v1.6-34b-tokenizer --port 3000`
+ - LLaVA-NeXT-Video
+ - see [srt_example_llava_v.sh](examples/usage/llava_video/srt_example_llava_v.sh)
  - Yi-VL
  - see [srt_example_yi_vl.py](examples/quick_start/srt_example_yi_vl.py).
  - StableLM
@@ -393,21 +403,22 @@ Instructions for supporting a new model are [here](https://github.com/sgl-projec
  - Mixtral-8x7B on NVIDIA A10G, FP16, Tensor Parallelism=8
  ![mixtral_8x7b](assets/mixtral_8x7b.jpg)

- Learn more [here](docs/benchmark_results.md).
+ - Learn more about the above [results](docs/benchmark_results.md).
+ - Synthetic latency and throughput benchmark [scripts](https://github.com/sgl-project/sglang/tree/main/benchmark/latency_throughput).

  ## Roadmap
  https://github.com/sgl-project/sglang/issues/157

  ## Citation And Acknowledgment
  ```
- @misc{zheng2023efficiently,
- title={Efficiently Programming Large Language Models using SGLang},
- author={Lianmin Zheng and Liangsheng Yin and Zhiqiang Xie and Jeff Huang and Chuyue Sun and Cody Hao Yu and Shiyi Cao and Christos Kozyrakis and Ion Stoica and Joseph E. Gonzalez and Clark Barrett and Ying Sheng},
- year={2023},
+ @misc{zheng2024sglang,
+ title={SGLang: Efficient Execution of Structured Language Model Programs},
+ author={Lianmin Zheng and Liangsheng Yin and Zhiqiang Xie and Chuyue Sun and Jeff Huang and Cody Hao Yu and Shiyi Cao and Christos Kozyrakis and Ion Stoica and Joseph E. Gonzalez and Clark Barrett and Ying Sheng},
+ year={2024},
  eprint={2312.07104},
  archivePrefix={arXiv},
  primaryClass={cs.AI}
  }
  ```

- We learned from the design and reused some code of the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), [LMQL](https://github.com/eth-sri/lmql).
+ We learned from the design and reused some code of the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), [LMQL](https://github.com/eth-sri/lmql).
@@ -4,8 +4,8 @@ build-backend = "setuptools.build_meta"

  [project]
  name = "sglang"
- version = "0.1.16"
- description = "A structured generation langauge for LLMs."
+ version = "0.1.18"
+ description = "A structured generation langauge for LLMs."
  readme = "README.md"
  requires-python = ">=3.8"
  license = {file = "LICENSE"}
@@ -16,14 +16,16 @@ classifiers = [
  dependencies = [
  "requests",
  "tqdm",
+ "numpy",
  ]

  [project.optional-dependencies]
- srt = ["aiohttp", "fastapi", "psutil", "rpyc", "torch", "uvloop", "uvicorn",
- "zmq", "vllm>=0.4.2", "interegular", "pydantic", "pillow", "packaging", "huggingface_hub", "hf_transfer", "outlines>=0.0.34"]
- openai = ["openai>=1.0", "numpy", "tiktoken"]
- anthropic = ["anthropic>=0.20.0", "numpy"]
- all = ["sglang[srt]", "sglang[openai]", "sglang[anthropic]"]
+ srt = ["aiohttp", "fastapi", "hf_transfer", "huggingface_hub", "interegular", "packaging", "pillow",
+ "psutil", "pydantic", "rpyc", "torch", "uvicorn", "uvloop", "zmq", "vllm==0.5.0", "outlines>=0.0.44"]
+ openai = ["openai>=1.0", "tiktoken"]
+ anthropic = ["anthropic>=0.20.0"]
+ litellm = ["litellm>=1.0.0"]
+ all = ["sglang[srt]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]

  [project.urls]
  "Homepage" = "https://github.com/sgl-project/sglang"
@@ -1,4 +1,4 @@
- __version__ = "0.1.16"
+ __version__ = "0.1.18"

  # SGL API Components
  from sglang.api import (
@@ -24,6 +24,7 @@ from sglang.api import (

  # SGL Backends
  from sglang.backend.anthropic import Anthropic
+ from sglang.backend.litellm import LiteLLM
  from sglang.backend.openai import OpenAI
  from sglang.backend.runtime_endpoint import RuntimeEndpoint
  from sglang.backend.vertexai import VertexAI
@@ -35,6 +36,7 @@ from sglang.global_config import global_config
  __all__ = [
  "global_config",
  "Anthropic",
+ "LiteLLM",
  "OpenAI",
  "RuntimeEndpoint",
  "VertexAI",
@@ -1,4 +1,4 @@
- """Some Public API Definitions"""
+ """Public APIs of the language."""

  import os
  import re
@@ -20,13 +20,13 @@ from sglang.lang.ir import (


  def function(
- func: Optional[Callable] = None, api_num_spec_tokens: Optional[int] = None
+ func: Optional[Callable] = None, num_api_spec_tokens: Optional[int] = None
  ):
  if func:
- return SglFunction(func, api_num_spec_tokens=api_num_spec_tokens)
+ return SglFunction(func, num_api_spec_tokens=num_api_spec_tokens)

  def decorator(func):
- return SglFunction(func, api_num_spec_tokens=api_num_spec_tokens)
+ return SglFunction(func, num_api_spec_tokens=num_api_spec_tokens)

  return decorator

@@ -43,14 +43,14 @@ def set_default_backend(backend: BaseBackend):
  global_config.default_backend = backend


- def flush_cache(backend: BaseBackend = None):
+ def flush_cache(backend: Optional[BaseBackend] = None):
  backend = backend or global_config.default_backend
  if backend is None:
  return False
  return backend.flush_cache()


- def get_server_args(backend: BaseBackend = None):
+ def get_server_args(backend: Optional[BaseBackend] = None):
  backend = backend or global_config.default_backend
  if backend is None:
  return None
@@ -158,7 +158,7 @@ def video(path: str, num_frames: int):

  def select(
  name: Optional[str] = None,
- choices: List[str] = None,
+ choices: Optional[List[str]] = None,
  temperature: float = 0.0,
  ):
  assert choices is not None
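
The `sglang/api.py` hunks above rename the speculative-execution keyword from `api_num_spec_tokens` to `num_api_spec_tokens`. As a minimal sketch of what caller code looks like after the rename (the prompt body and token counts below are illustrative placeholders, not taken from the package):

```python
import sglang as sgl

# 0.1.16 spelling: @sgl.function(api_num_spec_tokens=64)
@sgl.function(num_api_spec_tokens=64)  # 0.1.18 spelling of the keyword
def summarize(s, text):
    # Build the prompt and request a generation; values here are placeholders.
    s += "Summarize the following text:\n" + text + "\n"
    s += sgl.gen("summary", max_tokens=128)
```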
@@ -74,4 +74,4 @@ class Anthropic(BaseBackend):
  **sampling_params.to_anthropic_kwargs(),
  ) as stream:
  for text in stream.text_stream:
- yield text, {}
+ yield text, {}
@@ -0,0 +1,90 @@
+ from typing import Mapping, Optional
+
+ from sglang.backend.base_backend import BaseBackend
+ from sglang.lang.chat_template import get_chat_template_by_model_path
+ from sglang.lang.interpreter import StreamExecutor
+ from sglang.lang.ir import SglSamplingParams
+
+ try:
+     import litellm
+ except ImportError as e:
+     litellm = e
+ litellm.num_retries = 1
+
+
+ class LiteLLM(BaseBackend):
+     def __init__(
+         self,
+         model_name,
+         chat_template=None,
+         api_key=None,
+         organization: Optional[str] = None,
+         base_url: Optional[str] = None,
+         timeout: Optional[float] = 600,
+         max_retries: Optional[int] = litellm.num_retries,
+         default_headers: Optional[Mapping[str, str]] = None,
+     ):
+         super().__init__()
+
+         if isinstance(litellm, Exception):
+             raise litellm
+
+         self.model_name = model_name
+
+         self.chat_template = chat_template or get_chat_template_by_model_path(
+             model_name
+         )
+
+         self.client_params = {
+             "api_key": api_key,
+             "organization": organization,
+             "base_url": base_url,
+             "timeout": timeout,
+             "max_retries": max_retries,
+             "default_headers": default_headers,
+         }
+
+     def get_chat_template(self):
+         return self.chat_template
+
+     def generate(
+         self,
+         s: StreamExecutor,
+         sampling_params: SglSamplingParams,
+     ):
+         if s.messages_:
+             messages = s.messages_
+         else:
+             messages = [{"role": "user", "content": s.text_}]
+
+         ret = litellm.completion(
+             model=self.model_name,
+             messages=messages,
+             **self.client_params,
+             **sampling_params.to_anthropic_kwargs(),
+         )
+         comp = ret.choices[0].message.content
+
+         return comp, {}
+
+     def generate_stream(
+         self,
+         s: StreamExecutor,
+         sampling_params: SglSamplingParams,
+     ):
+         if s.messages_:
+             messages = s.messages_
+         else:
+             messages = [{"role": "user", "content": s.text_}]
+
+         ret = litellm.completion(
+             model=self.model_name,
+             messages=messages,
+             stream=True,
+             **self.client_params,
+             **sampling_params.to_litellm_kwargs(),
+         )
+         for chunk in ret:
+             text = chunk.choices[0].delta.content
+             if text is not None:
+                 yield text, {}
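
For context, here is a hypothetical usage sketch of the new `LiteLLM` backend added above (it is exported from `sglang/__init__.py` in this release). The model name and prompt are placeholders; the surrounding calls follow sglang's existing frontend API:

```python
import sglang as sgl

# Route generation through LiteLLM; the model name is a placeholder.
sgl.set_default_backend(sgl.LiteLLM("gpt-3.5-turbo"))

@sgl.function
def answer(s, question):
    s += sgl.user(question)
    s += sgl.assistant(sgl.gen("reply", max_tokens=64))

state = answer.run(question="What does RadixAttention cache?")
print(state["reply"])
```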