sglang 0.1.17__tar.gz → 0.1.18__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91)
  1. {sglang-0.1.17/sglang.egg-info → sglang-0.1.18}/PKG-INFO +29 -22
  2. {sglang-0.1.17 → sglang-0.1.18}/README.md +18 -10
  3. {sglang-0.1.17 → sglang-0.1.18}/pyproject.toml +6 -5
  4. {sglang-0.1.17 → sglang-0.1.18}/sglang/__init__.py +2 -2
  5. {sglang-0.1.17 → sglang-0.1.18}/sglang/api.py +4 -4
  6. {sglang-0.1.17 → sglang-0.1.18}/sglang/backend/litellm.py +2 -2
  7. {sglang-0.1.17 → sglang-0.1.18}/sglang/backend/openai.py +26 -15
  8. sglang-0.1.18/sglang/bench_latency.py +299 -0
  9. {sglang-0.1.17 → sglang-0.1.18}/sglang/global_config.py +4 -1
  10. {sglang-0.1.17 → sglang-0.1.18}/sglang/lang/compiler.py +2 -2
  11. {sglang-0.1.17 → sglang-0.1.18}/sglang/lang/interpreter.py +1 -1
  12. {sglang-0.1.17 → sglang-0.1.18}/sglang/lang/ir.py +15 -5
  13. {sglang-0.1.17 → sglang-0.1.18}/sglang/launch_server.py +4 -1
  14. {sglang-0.1.17 → sglang-0.1.18}/sglang/launch_server_llavavid.py +2 -1
  15. {sglang-0.1.17 → sglang-0.1.18}/sglang/srt/constrained/__init__.py +13 -6
  16. {sglang-0.1.17 → sglang-0.1.18}/sglang/srt/constrained/fsm_cache.py +6 -3
  17. sglang-0.1.18/sglang/srt/constrained/jump_forward.py +164 -0
  18. {sglang-0.1.17 → sglang-0.1.18}/sglang/srt/conversation.py +2 -0
  19. {sglang-0.1.17 → sglang-0.1.18}/sglang/srt/flush_cache.py +2 -0
  20. {sglang-0.1.17 → sglang-0.1.18}/sglang/srt/hf_transformers_utils.py +64 -9
  21. {sglang-0.1.17 → sglang-0.1.18}/sglang/srt/layers/fused_moe.py +186 -89
  22. {sglang-0.1.17 → sglang-0.1.18}/sglang/srt/layers/logits_processor.py +53 -25
  23. {sglang-0.1.17 → sglang-0.1.18}/sglang/srt/layers/radix_attention.py +34 -7
  24. {sglang-0.1.17 → sglang-0.1.18}/sglang/srt/managers/controller/dp_worker.py +6 -3
  25. {sglang-0.1.17 → sglang-0.1.18}/sglang/srt/managers/controller/infer_batch.py +142 -67
  26. {sglang-0.1.17 → sglang-0.1.18}/sglang/srt/managers/controller/manager_multi.py +5 -5
  27. {sglang-0.1.17 → sglang-0.1.18}/sglang/srt/managers/controller/manager_single.py +8 -3
  28. {sglang-0.1.17 → sglang-0.1.18}/sglang/srt/managers/controller/model_runner.py +154 -54
  29. {sglang-0.1.17 → sglang-0.1.18}/sglang/srt/managers/controller/radix_cache.py +4 -0
  30. {sglang-0.1.17 → sglang-0.1.18}/sglang/srt/managers/controller/schedule_heuristic.py +2 -0
  31. {sglang-0.1.17 → sglang-0.1.18}/sglang/srt/managers/controller/tp_worker.py +140 -135
  32. {sglang-0.1.17 → sglang-0.1.18}/sglang/srt/managers/detokenizer_manager.py +15 -19
  33. {sglang-0.1.17 → sglang-0.1.18}/sglang/srt/managers/io_struct.py +10 -4
  34. {sglang-0.1.17 → sglang-0.1.18}/sglang/srt/managers/tokenizer_manager.py +14 -13
  35. sglang-0.1.18/sglang/srt/model_config.py +125 -0
  36. sglang-0.1.18/sglang/srt/models/chatglm.py +399 -0
  37. {sglang-0.1.17 → sglang-0.1.18}/sglang/srt/models/commandr.py +2 -2
  38. {sglang-0.1.17 → sglang-0.1.18}/sglang/srt/models/gemma.py +5 -1
  39. {sglang-0.1.17 → sglang-0.1.18}/sglang/srt/models/grok.py +204 -137
  40. {sglang-0.1.17 → sglang-0.1.18}/sglang/srt/models/llama2.py +11 -4
  41. sglang-0.1.18/sglang/srt/models/llama_classification.py +104 -0
  42. {sglang-0.1.17 → sglang-0.1.18}/sglang/srt/models/llava.py +11 -8
  43. {sglang-0.1.17 → sglang-0.1.18}/sglang/srt/models/llavavid.py +1 -1
  44. {sglang-0.1.17 → sglang-0.1.18}/sglang/srt/models/mixtral.py +164 -115
  45. {sglang-0.1.17 → sglang-0.1.18}/sglang/srt/models/mixtral_quant.py +0 -1
  46. {sglang-0.1.17 → sglang-0.1.18}/sglang/srt/models/qwen.py +1 -1
  47. {sglang-0.1.17 → sglang-0.1.18}/sglang/srt/models/qwen2.py +1 -1
  48. {sglang-0.1.17 → sglang-0.1.18}/sglang/srt/models/stablelm.py +1 -1
  49. {sglang-0.1.17 → sglang-0.1.18}/sglang/srt/models/yivl.py +2 -2
  50. {sglang-0.1.17 → sglang-0.1.18}/sglang/srt/openai_api_adapter.py +33 -23
  51. {sglang-0.1.17 → sglang-0.1.18}/sglang/srt/openai_protocol.py +1 -1
  52. {sglang-0.1.17 → sglang-0.1.18}/sglang/srt/server.py +60 -19
  53. {sglang-0.1.17 → sglang-0.1.18}/sglang/srt/server_args.py +79 -44
  54. {sglang-0.1.17 → sglang-0.1.18}/sglang/srt/utils.py +146 -37
  55. {sglang-0.1.17 → sglang-0.1.18}/sglang/test/test_programs.py +28 -10
  56. {sglang-0.1.17 → sglang-0.1.18}/sglang/utils.py +4 -3
  57. {sglang-0.1.17 → sglang-0.1.18/sglang.egg-info}/PKG-INFO +29 -22
  58. {sglang-0.1.17 → sglang-0.1.18}/sglang.egg-info/SOURCES.txt +3 -6
  59. {sglang-0.1.17 → sglang-0.1.18}/sglang.egg-info/requires.txt +10 -11
  60. sglang-0.1.17/sglang/srt/constrained/jump_forward.py +0 -76
  61. sglang-0.1.17/sglang/srt/managers/router/infer_batch.py +0 -596
  62. sglang-0.1.17/sglang/srt/managers/router/manager.py +0 -82
  63. sglang-0.1.17/sglang/srt/managers/router/model_rpc.py +0 -818
  64. sglang-0.1.17/sglang/srt/managers/router/model_runner.py +0 -445
  65. sglang-0.1.17/sglang/srt/managers/router/radix_cache.py +0 -267
  66. sglang-0.1.17/sglang/srt/managers/router/scheduler.py +0 -59
  67. sglang-0.1.17/sglang/srt/model_config.py +0 -46
  68. {sglang-0.1.17 → sglang-0.1.18}/LICENSE +0 -0
  69. {sglang-0.1.17 → sglang-0.1.18}/setup.cfg +0 -0
  70. {sglang-0.1.17 → sglang-0.1.18}/sglang/backend/__init__.py +0 -0
  71. {sglang-0.1.17 → sglang-0.1.18}/sglang/backend/anthropic.py +0 -0
  72. {sglang-0.1.17 → sglang-0.1.18}/sglang/backend/base_backend.py +0 -0
  73. {sglang-0.1.17 → sglang-0.1.18}/sglang/backend/runtime_endpoint.py +0 -0
  74. {sglang-0.1.17 → sglang-0.1.18}/sglang/backend/vertexai.py +0 -0
  75. {sglang-0.1.17 → sglang-0.1.18}/sglang/lang/__init__.py +0 -0
  76. {sglang-0.1.17 → sglang-0.1.18}/sglang/lang/chat_template.py +0 -0
  77. {sglang-0.1.17 → sglang-0.1.18}/sglang/lang/tracer.py +0 -0
  78. {sglang-0.1.17 → sglang-0.1.18}/sglang/srt/constrained/base_cache.py +0 -0
  79. {sglang-0.1.17 → sglang-0.1.18}/sglang/srt/layers/context_flashattention_nopad.py +0 -0
  80. {sglang-0.1.17 → sglang-0.1.18}/sglang/srt/layers/extend_attention.py +0 -0
  81. {sglang-0.1.17 → sglang-0.1.18}/sglang/srt/layers/token_attention.py +0 -0
  82. {sglang-0.1.17 → sglang-0.1.18}/sglang/srt/memory_pool.py +0 -0
  83. {sglang-0.1.17 → sglang-0.1.18}/sglang/srt/mm_utils.py +0 -0
  84. {sglang-0.1.17 → sglang-0.1.18}/sglang/srt/models/dbrx.py +1 -1
  85. {sglang-0.1.17 → sglang-0.1.18}/sglang/srt/models/mistral.py +0 -0
  86. {sglang-0.1.17 → sglang-0.1.18}/sglang/srt/sampling_params.py +0 -0
  87. {sglang-0.1.17 → sglang-0.1.18}/sglang/test/test_conversation.py +0 -0
  88. {sglang-0.1.17 → sglang-0.1.18}/sglang/test/test_openai_protocol.py +0 -0
  89. {sglang-0.1.17 → sglang-0.1.18}/sglang/test/test_utils.py +0 -0
  90. {sglang-0.1.17 → sglang-0.1.18}/sglang.egg-info/dependency_links.txt +0 -0
  91. {sglang-0.1.17 → sglang-0.1.18}/sglang.egg-info/top_level.txt +0 -0
sglang-0.1.18/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: sglang
- Version: 0.1.17
+ Version: 0.1.18
  Summary: A structured generation langauge for LLMs.
  License: Apache License
  Version 2.0, January 2004
@@ -213,30 +213,29 @@ Description-Content-Type: text/markdown
  License-File: LICENSE
  Requires-Dist: requests
  Requires-Dist: tqdm
+ Requires-Dist: numpy
  Provides-Extra: srt
  Requires-Dist: aiohttp; extra == "srt"
  Requires-Dist: fastapi; extra == "srt"
+ Requires-Dist: hf_transfer; extra == "srt"
+ Requires-Dist: huggingface_hub; extra == "srt"
+ Requires-Dist: interegular; extra == "srt"
+ Requires-Dist: packaging; extra == "srt"
+ Requires-Dist: pillow; extra == "srt"
  Requires-Dist: psutil; extra == "srt"
+ Requires-Dist: pydantic; extra == "srt"
  Requires-Dist: rpyc; extra == "srt"
  Requires-Dist: torch; extra == "srt"
- Requires-Dist: uvloop; extra == "srt"
  Requires-Dist: uvicorn; extra == "srt"
+ Requires-Dist: uvloop; extra == "srt"
  Requires-Dist: zmq; extra == "srt"
- Requires-Dist: vllm==0.4.3; extra == "srt"
- Requires-Dist: interegular; extra == "srt"
- Requires-Dist: pydantic; extra == "srt"
- Requires-Dist: pillow; extra == "srt"
- Requires-Dist: packaging; extra == "srt"
- Requires-Dist: huggingface_hub; extra == "srt"
- Requires-Dist: hf_transfer; extra == "srt"
- Requires-Dist: outlines>=0.0.34; extra == "srt"
+ Requires-Dist: vllm==0.5.0; extra == "srt"
+ Requires-Dist: outlines>=0.0.44; extra == "srt"
  Provides-Extra: openai
  Requires-Dist: openai>=1.0; extra == "openai"
- Requires-Dist: numpy; extra == "openai"
  Requires-Dist: tiktoken; extra == "openai"
  Provides-Extra: anthropic
  Requires-Dist: anthropic>=0.20.0; extra == "anthropic"
- Requires-Dist: numpy; extra == "anthropic"
  Provides-Extra: litellm
  Requires-Dist: litellm>=1.0.0; extra == "litellm"
  Provides-Extra: all
@@ -257,8 +256,8 @@ SGLang is a structured generation language designed for large language models (L
  It makes your interaction with LLMs faster and more controllable by co-designing the frontend language and the runtime system.

  The core features include:
- - **A Flexible Front-End Language**: This allows for easy programming of LLM applications with multiple chained generation calls, advanced prompting techniques, control flow, multiple modalities, parallelism, and external interaction.
- - **A High-Performance Runtime with RadixAttention**: This feature significantly accelerates the execution of complex LLM programs by automatically reusing the KV cache across multiple calls. It can also be used as a standalone serving engine with all common techniques implemented, such as continuous batching and tensor parallelism.
+ - **Flexible Frontend Language**: Enables easy programming of LLM applications with chained generation calls, advanced prompting, control flow, multiple modalities, parallelism, and external interactions.
+ - **High-Performance Backend Runtime**: Features RadixAttention for accelerating complex LLM programs by reusing the KV cache across multiple calls. It can also serve as a standalone engine with all common techniques implemented (e.g., continuous batching and tensor parallelism).

  ## News
  - [2024/02] 🔥 SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).
@@ -279,19 +278,27 @@ The core features include:
  ### Method 1: With pip
  ```
  pip install "sglang[all]"
+
+ # Install FlashInfer CUDA kernels
+ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
  ```

  ### Method 2: From source
  ```
- git clone git@github.com:sgl-project/sglang.git
+ git clone https://github.com/sgl-project/sglang.git
  cd sglang

  pip install --upgrade pip
  pip install -e "python[all]"
+
+ # Install FlashInfer CUDA kernels
+ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
  ```

  ### Notes
- - If you only need to use the OpenAI backend, you can avoid installing other dependencies by using `pip install "sglang[openai]"`
+ - If you see errors from the Triton compiler, please install the [Triton Nightly](https://triton-lang.org/main/getting-started/installation.html).
+ - If you cannot install FlashInfer, check out its [installation](https://docs.flashinfer.ai/installation.html#) page. If you still cannot install it, you can use the slower Triton kernels by adding `--disable-flashinfer` when launching the server.
+ - If you only need to use the OpenAI backend, you can avoid installing other dependencies by using `pip install "sglang[openai]"`.

  ## Quick Start
  The example below shows how to use sglang to answer a mulit-turn question.
@@ -610,7 +617,6 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
  ```
  python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --mem-fraction-static 0.7
  ```
- - See [flashinfer.md](docs/flashinfer.md) on accelerating inference using highly optimized CUDA kernels.
  - See [hyperparameter_tuning.md](docs/hyperparameter_tuning.md) on tuning hyperparameters for better performance.

  ### Supported Models
@@ -643,17 +649,18 @@ Instructions for supporting a new model are [here](https://github.com/sgl-projec
  - Mixtral-8x7B on NVIDIA A10G, FP16, Tensor Parallelism=8
  ![mixtral_8x7b](assets/mixtral_8x7b.jpg)

- Learn more [here](docs/benchmark_results.md).
+ - Learn more about the above [results](docs/benchmark_results.md).
+ - Synthetic latency and throughput benchmark [scripts](https://github.com/sgl-project/sglang/tree/main/benchmark/latency_throughput).

  ## Roadmap
  https://github.com/sgl-project/sglang/issues/157

  ## Citation And Acknowledgment
  ```
- @misc{zheng2023efficiently,
-   title={Efficiently Programming Large Language Models using SGLang},
-   author={Lianmin Zheng and Liangsheng Yin and Zhiqiang Xie and Jeff Huang and Chuyue Sun and Cody Hao Yu and Shiyi Cao and Christos Kozyrakis and Ion Stoica and Joseph E. Gonzalez and Clark Barrett and Ying Sheng},
-   year={2023},
+ @misc{zheng2024sglang,
+   title={SGLang: Efficient Execution of Structured Language Model Programs},
+   author={Lianmin Zheng and Liangsheng Yin and Zhiqiang Xie and Chuyue Sun and Jeff Huang and Cody Hao Yu and Shiyi Cao and Christos Kozyrakis and Ion Stoica and Joseph E. Gonzalez and Clark Barrett and Ying Sheng},
+   year={2024},
    eprint={2312.07104},
    archivePrefix={arXiv},
    primaryClass={cs.AI}
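
The updated docs above reference the server flags `--mem-fraction-static` and `--disable-flashinfer`. As a hedged illustration only (not part of this diff), the same options can likely be passed to the in-process `sglang.Runtime` wrapper, whose keyword arguments are assumed here to mirror the CLI flags:

```python
# Hedged sketch, not taken from the diff: assumes sglang.Runtime forwards its
# keyword arguments to the same server arguments that the CLI flags set.
import sglang as sgl

runtime = sgl.Runtime(
    model_path="meta-llama/Llama-2-7b-chat-hf",
    mem_fraction_static=0.7,   # counterpart of --mem-fraction-static 0.7
    disable_flashinfer=True,   # counterpart of --disable-flashinfer (slower Triton kernels)
)
sgl.set_default_backend(runtime)
# ... run sglang programs against the local runtime ...
runtime.shutdown()
```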
sglang-0.1.18/README.md
@@ -10,8 +10,8 @@ SGLang is a structured generation language designed for large language models (L
  It makes your interaction with LLMs faster and more controllable by co-designing the frontend language and the runtime system.

  The core features include:
- - **A Flexible Front-End Language**: This allows for easy programming of LLM applications with multiple chained generation calls, advanced prompting techniques, control flow, multiple modalities, parallelism, and external interaction.
- - **A High-Performance Runtime with RadixAttention**: This feature significantly accelerates the execution of complex LLM programs by automatically reusing the KV cache across multiple calls. It can also be used as a standalone serving engine with all common techniques implemented, such as continuous batching and tensor parallelism.
+ - **Flexible Frontend Language**: Enables easy programming of LLM applications with chained generation calls, advanced prompting, control flow, multiple modalities, parallelism, and external interactions.
+ - **High-Performance Backend Runtime**: Features RadixAttention for accelerating complex LLM programs by reusing the KV cache across multiple calls. It can also serve as a standalone engine with all common techniques implemented (e.g., continuous batching and tensor parallelism).

  ## News
  - [2024/02] 🔥 SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).
@@ -32,19 +32,27 @@ The core features include:
  ### Method 1: With pip
  ```
  pip install "sglang[all]"
+
+ # Install FlashInfer CUDA kernels
+ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
  ```

  ### Method 2: From source
  ```
- git clone git@github.com:sgl-project/sglang.git
+ git clone https://github.com/sgl-project/sglang.git
  cd sglang

  pip install --upgrade pip
  pip install -e "python[all]"
+
+ # Install FlashInfer CUDA kernels
+ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
  ```

  ### Notes
- - If you only need to use the OpenAI backend, you can avoid installing other dependencies by using `pip install "sglang[openai]"`
+ - If you see errors from the Triton compiler, please install the [Triton Nightly](https://triton-lang.org/main/getting-started/installation.html).
+ - If you cannot install FlashInfer, check out its [installation](https://docs.flashinfer.ai/installation.html#) page. If you still cannot install it, you can use the slower Triton kernels by adding `--disable-flashinfer` when launching the server.
+ - If you only need to use the OpenAI backend, you can avoid installing other dependencies by using `pip install "sglang[openai]"`.

  ## Quick Start
  The example below shows how to use sglang to answer a mulit-turn question.
@@ -363,7 +371,6 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
  ```
  python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --mem-fraction-static 0.7
  ```
- - See [flashinfer.md](docs/flashinfer.md) on accelerating inference using highly optimized CUDA kernels.
  - See [hyperparameter_tuning.md](docs/hyperparameter_tuning.md) on tuning hyperparameters for better performance.

  ### Supported Models
@@ -396,17 +403,18 @@ Instructions for supporting a new model are [here](https://github.com/sgl-projec
  - Mixtral-8x7B on NVIDIA A10G, FP16, Tensor Parallelism=8
  ![mixtral_8x7b](assets/mixtral_8x7b.jpg)

- Learn more [here](docs/benchmark_results.md).
+ - Learn more about the above [results](docs/benchmark_results.md).
+ - Synthetic latency and throughput benchmark [scripts](https://github.com/sgl-project/sglang/tree/main/benchmark/latency_throughput).

  ## Roadmap
  https://github.com/sgl-project/sglang/issues/157

  ## Citation And Acknowledgment
  ```
- @misc{zheng2023efficiently,
-   title={Efficiently Programming Large Language Models using SGLang},
-   author={Lianmin Zheng and Liangsheng Yin and Zhiqiang Xie and Jeff Huang and Chuyue Sun and Cody Hao Yu and Shiyi Cao and Christos Kozyrakis and Ion Stoica and Joseph E. Gonzalez and Clark Barrett and Ying Sheng},
-   year={2023},
+ @misc{zheng2024sglang,
+   title={SGLang: Efficient Execution of Structured Language Model Programs},
+   author={Lianmin Zheng and Liangsheng Yin and Zhiqiang Xie and Chuyue Sun and Jeff Huang and Cody Hao Yu and Shiyi Cao and Christos Kozyrakis and Ion Stoica and Joseph E. Gonzalez and Clark Barrett and Ying Sheng},
+   year={2024},
    eprint={2312.07104},
    archivePrefix={arXiv},
    primaryClass={cs.AI}
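
The Quick Start section referenced in both hunks above points at a multi-turn example that is not shown in this diff. A minimal sketch of such a program with the frontend API, assuming a local server started as in the README (the questions and endpoint are placeholders):

```python
# Hedged sketch of a multi-turn program; not part of the diff itself.
import sglang as sgl

@sgl.function
def multi_turn_question(s, question_1, question_2):
    s += sgl.user(question_1)
    s += sgl.assistant(sgl.gen("answer_1", max_tokens=256))
    s += sgl.user(question_2)
    s += sgl.assistant(sgl.gen("answer_2", max_tokens=256))

# Point the default backend at a server launched with sglang.launch_server.
sgl.set_default_backend(sgl.RuntimeEndpoint("http://localhost:30000"))

state = multi_turn_question.run(
    question_1="What is the capital of France?",
    question_2="Name one landmark there.",
)
print(state["answer_1"])
print(state["answer_2"])
```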
sglang-0.1.18/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

  [project]
  name = "sglang"
- version = "0.1.17"
+ version = "0.1.18"
  description = "A structured generation langauge for LLMs."
  readme = "README.md"
  requires-python = ">=3.8"
@@ -16,13 +16,14 @@ classifiers = [
  dependencies = [
      "requests",
      "tqdm",
+     "numpy",
  ]

  [project.optional-dependencies]
- srt = ["aiohttp", "fastapi", "psutil", "rpyc", "torch", "uvloop", "uvicorn",
- "zmq", "vllm==0.4.3", "interegular", "pydantic", "pillow", "packaging", "huggingface_hub", "hf_transfer", "outlines>=0.0.34"]
- openai = ["openai>=1.0", "numpy", "tiktoken"]
- anthropic = ["anthropic>=0.20.0", "numpy"]
+ srt = ["aiohttp", "fastapi", "hf_transfer", "huggingface_hub", "interegular", "packaging", "pillow",
+ "psutil", "pydantic", "rpyc", "torch", "uvicorn", "uvloop", "zmq", "vllm==0.5.0", "outlines>=0.0.44"]
+ openai = ["openai>=1.0", "tiktoken"]
+ anthropic = ["anthropic>=0.20.0"]
  litellm = ["litellm>=1.0.0"]
  all = ["sglang[srt]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]

sglang-0.1.18/sglang/__init__.py
@@ -1,4 +1,4 @@
- __version__ = "0.1.17"
+ __version__ = "0.1.18"

  # SGL API Components
  from sglang.api import (
@@ -24,10 +24,10 @@ from sglang.api import (

  # SGL Backends
  from sglang.backend.anthropic import Anthropic
+ from sglang.backend.litellm import LiteLLM
  from sglang.backend.openai import OpenAI
  from sglang.backend.runtime_endpoint import RuntimeEndpoint
  from sglang.backend.vertexai import VertexAI
- from sglang.backend.litellm import LiteLLM

  # Global Configurations
  from sglang.global_config import global_config
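
The hunk above bumps `__version__` and keeps the backend classes exported at the package top level, now with `LiteLLM` sorted into place. A small sketch of how those exports are typically used (model names are placeholders):

```python
# Hedged sketch; the exported names below come from the __init__.py hunk above.
import sglang as sgl

print(sgl.__version__)  # "0.1.18" after this release

# Any exported backend can be installed as the process-wide default.
sgl.set_default_backend(sgl.OpenAI("gpt-3.5-turbo"))
# Alternatives exported by the same module:
#   sgl.Anthropic(...), sgl.LiteLLM(...), sgl.VertexAI(...), sgl.RuntimeEndpoint(...)
```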
sglang-0.1.18/sglang/api.py
@@ -1,4 +1,4 @@
- """Some Public API Definitions"""
+ """Public APIs of the language."""

  import os
  import re
@@ -43,14 +43,14 @@ def set_default_backend(backend: BaseBackend):
      global_config.default_backend = backend


- def flush_cache(backend: BaseBackend = None):
+ def flush_cache(backend: Optional[BaseBackend] = None):
      backend = backend or global_config.default_backend
      if backend is None:
          return False
      return backend.flush_cache()


- def get_server_args(backend: BaseBackend = None):
+ def get_server_args(backend: Optional[BaseBackend] = None):
      backend = backend or global_config.default_backend
      if backend is None:
          return None
@@ -158,7 +158,7 @@ def video(path: str, num_frames: int):

  def select(
      name: Optional[str] = None,
-     choices: List[str] = None,
+     choices: Optional[List[str]] = None,
      temperature: float = 0.0,
  ):
      assert choices is not None
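
The api.py hunks tighten optional-argument annotations (`Optional[...]` instead of bare `None` defaults) without changing behavior: `flush_cache()` and `get_server_args()` still fall back to the global default backend, and `select()` still requires an explicit `choices` list. A short sketch of that surface, with the endpoint as a placeholder:

```python
# Hedged sketch illustrating the functions touched above; not part of the diff.
import sglang as sgl

sgl.set_default_backend(sgl.RuntimeEndpoint("http://localhost:30000"))

# With no argument, both calls fall back to the default backend; they return
# False / None when no backend has been configured.
sgl.flush_cache()
print(sgl.get_server_args())

@sgl.function
def yes_or_no(s, question):
    s += question + " Answer with one word: "
    # choices must be given explicitly; select() asserts this.
    s += sgl.select("answer", choices=["yes", "no"])
```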
sglang-0.1.18/sglang/backend/litellm.py
@@ -13,7 +13,6 @@ except ImportError as e:


  class LiteLLM(BaseBackend):
-
      def __init__(
          self,
          model_name,
@@ -33,7 +32,8 @@ class LiteLLM(BaseBackend):
          self.model_name = model_name

          self.chat_template = chat_template or get_chat_template_by_model_path(
-             model_name)
+             model_name
+         )

          self.client_params = {
              "api_key": api_key,
sglang-0.1.18/sglang/backend/openai.py
@@ -1,7 +1,7 @@
+ import dataclasses
  import logging
  import time
  import warnings
- import dataclasses
  from typing import Callable, List, Optional, Union

  import numpy as np
@@ -105,14 +105,16 @@ class OpenAI(BaseBackend):
      def get_chat_template(self):
          return self.chat_template

-     def _prepare_spec_execution(self, sampling_params: SglSamplingParams,
-         num_api_spec_tokens: int, spec_var_name: str):
+     def _prepare_spec_execution(
+         self,
+         sampling_params: SglSamplingParams,
+         num_api_spec_tokens: int,
+         spec_var_name: str,
+     ):
          if "max_tokens" not in self.spec_kwargs:
              self.spec_kwargs["max_tokens"] = num_api_spec_tokens
          else:
-             assert (
-                 self.spec_kwargs["max_tokens"] == num_api_spec_tokens
-             )
+             assert self.spec_kwargs["max_tokens"] == num_api_spec_tokens

          params = sampling_params.to_openai_kwargs()
          for key, value in params.items():
@@ -151,8 +153,9 @@ class OpenAI(BaseBackend):
                  )
                  prompt = s.messages_
              else:
-                 return self._prepare_spec_execution(sampling_params,
-                     s.num_api_spec_tokens, spec_var_name)
+                 return self._prepare_spec_execution(
+                     sampling_params, s.num_api_spec_tokens, spec_var_name
+                 )
          else:
              prompt = s.text_

@@ -325,7 +328,7 @@ class OpenAI(BaseBackend):
          ret_str = ret.choices[0].text
          ret_token = self.tokenizer.encode(ret_str)[0]
          self.token_usage.prompt_tokens += ret.usage.prompt_tokens
-         self.token_usage.completion_tokens= ret.usage.completion_tokens
+         self.token_usage.completion_tokens = ret.usage.completion_tokens

          # TODO:
          # 1. return logits as the scores
@@ -355,7 +358,9 @@ class OpenAI(BaseBackend):
          return decision, scores, None, None


- def openai_completion(client, token_usage, is_chat=None, retries=3, prompt=None, **kwargs):
+ def openai_completion(
+     client, token_usage, is_chat=None, retries=3, prompt=None, **kwargs
+ ):
      for attempt in range(retries):
          try:
              if is_chat:
@@ -385,15 +390,19 @@ def openai_completion(client, token_usage, is_chat=None, retries=3, prompt=None,
      return comp


- def openai_completion_stream(client, token_usage, is_chat=None, retries=3, prompt=None, **kwargs):
+ def openai_completion_stream(
+     client, token_usage, is_chat=None, retries=3, prompt=None, **kwargs
+ ):
      for attempt in range(retries):
          try:
              if is_chat:
                  if "stop" in kwargs and kwargs["stop"] is None:
                      kwargs.pop("stop")
                  generator = client.chat.completions.create(
-                     messages=prompt, stream=True, stream_options={"include_usage": True},
-                     **kwargs
+                     messages=prompt,
+                     stream=True,
+                     stream_options={"include_usage": True},
+                     **kwargs,
                  )
                  for ret in generator:
                      if len(ret.choices) == 0:
@@ -405,8 +414,10 @@ def openai_completion_stream(client, token_usage, is_chat=None, retries=3, promp
                      yield content or "", {}
              else:
                  generator = client.completions.create(
-                     prompt=prompt, stream=True, stream_options={"include_usage": True},
-                     **kwargs
+                     prompt=prompt,
+                     stream=True,
+                     stream_options={"include_usage": True},
+                     **kwargs,
                  )
                  for ret in generator:
                      if len(ret.choices) == 0:
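
The streaming changes above pass `stream_options={"include_usage": True}` to the OpenAI 1.x client, which makes the stream end with one extra chunk that carries token usage and an empty `choices` list; that is why the loops skip chunks with `len(ret.choices) == 0`. A standalone sketch of the same pattern (model name and prompt are placeholders):

```python
# Hedged sketch of the stream_options pattern used in the hunk above.
# Assumes a recent openai>=1.x client; the model name is a placeholder.
import openai

client = openai.OpenAI()
stream = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Say hello."}],
    stream=True,
    stream_options={"include_usage": True},
)

for chunk in stream:
    if len(chunk.choices) == 0:
        # Final chunk: no choices, only aggregate token usage.
        print("\nusage:", chunk.usage)
        continue
    print(chunk.choices[0].delta.content or "", end="")
```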