sglang 0.1.15__tar.gz → 0.1.17__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92)
  1. {sglang-0.1.15/sglang.egg-info → sglang-0.1.17}/PKG-INFO +23 -13
  2. {sglang-0.1.15 → sglang-0.1.17}/README.md +15 -10
  3. {sglang-0.1.15 → sglang-0.1.17}/pyproject.toml +5 -4
  4. {sglang-0.1.15 → sglang-0.1.17}/sglang/__init__.py +5 -1
  5. {sglang-0.1.15 → sglang-0.1.17}/sglang/api.py +8 -3
  6. {sglang-0.1.15 → sglang-0.1.17}/sglang/backend/anthropic.py +1 -1
  7. sglang-0.1.17/sglang/backend/litellm.py +90 -0
  8. {sglang-0.1.15 → sglang-0.1.17}/sglang/backend/openai.py +148 -12
  9. {sglang-0.1.15 → sglang-0.1.17}/sglang/backend/runtime_endpoint.py +18 -10
  10. {sglang-0.1.15 → sglang-0.1.17}/sglang/global_config.py +11 -1
  11. {sglang-0.1.15 → sglang-0.1.17}/sglang/lang/chat_template.py +9 -2
  12. {sglang-0.1.15 → sglang-0.1.17}/sglang/lang/interpreter.py +161 -81
  13. {sglang-0.1.15 → sglang-0.1.17}/sglang/lang/ir.py +29 -11
  14. {sglang-0.1.15 → sglang-0.1.17}/sglang/lang/tracer.py +1 -1
  15. {sglang-0.1.15 → sglang-0.1.17}/sglang/launch_server.py +1 -2
  16. sglang-0.1.17/sglang/launch_server_llavavid.py +31 -0
  17. {sglang-0.1.15 → sglang-0.1.17}/sglang/srt/constrained/fsm_cache.py +3 -0
  18. sglang-0.1.17/sglang/srt/flush_cache.py +16 -0
  19. {sglang-0.1.15 → sglang-0.1.17}/sglang/srt/hf_transformers_utils.py +83 -2
  20. {sglang-0.1.15 → sglang-0.1.17}/sglang/srt/layers/extend_attention.py +17 -0
  21. sglang-0.1.17/sglang/srt/layers/fused_moe.py +485 -0
  22. {sglang-0.1.15 → sglang-0.1.17}/sglang/srt/layers/logits_processor.py +12 -7
  23. {sglang-0.1.15 → sglang-0.1.17}/sglang/srt/layers/radix_attention.py +10 -3
  24. {sglang-0.1.15 → sglang-0.1.17}/sglang/srt/layers/token_attention.py +16 -1
  25. sglang-0.1.17/sglang/srt/managers/controller/dp_worker.py +110 -0
  26. sglang-0.1.17/sglang/srt/managers/controller/infer_batch.py +619 -0
  27. sglang-0.1.17/sglang/srt/managers/controller/manager_multi.py +191 -0
  28. sglang-0.1.17/sglang/srt/managers/controller/manager_single.py +97 -0
  29. sglang-0.1.17/sglang/srt/managers/controller/model_runner.py +462 -0
  30. {sglang-0.1.15/sglang/srt/managers/router → sglang-0.1.17/sglang/srt/managers/controller}/radix_cache.py +54 -18
  31. sglang-0.1.17/sglang/srt/managers/controller/schedule_heuristic.py +59 -0
  32. sglang-0.1.17/sglang/srt/managers/controller/tp_worker.py +791 -0
  33. sglang-0.1.17/sglang/srt/managers/detokenizer_manager.py +95 -0
  34. {sglang-0.1.15 → sglang-0.1.17}/sglang/srt/managers/io_struct.py +26 -10
  35. {sglang-0.1.15 → sglang-0.1.17}/sglang/srt/managers/router/infer_batch.py +130 -74
  36. {sglang-0.1.15 → sglang-0.1.17}/sglang/srt/managers/router/manager.py +7 -9
  37. {sglang-0.1.15 → sglang-0.1.17}/sglang/srt/managers/router/model_rpc.py +224 -135
  38. {sglang-0.1.15 → sglang-0.1.17}/sglang/srt/managers/router/model_runner.py +94 -107
  39. sglang-0.1.17/sglang/srt/managers/router/radix_cache.py +267 -0
  40. sglang-0.1.17/sglang/srt/managers/router/scheduler.py +59 -0
  41. {sglang-0.1.15 → sglang-0.1.17}/sglang/srt/managers/tokenizer_manager.py +183 -88
  42. {sglang-0.1.15 → sglang-0.1.17}/sglang/srt/model_config.py +5 -2
  43. {sglang-0.1.15 → sglang-0.1.17}/sglang/srt/models/commandr.py +15 -22
  44. {sglang-0.1.15 → sglang-0.1.17}/sglang/srt/models/dbrx.py +22 -29
  45. {sglang-0.1.15 → sglang-0.1.17}/sglang/srt/models/gemma.py +14 -24
  46. sglang-0.1.17/sglang/srt/models/grok.py +671 -0
  47. {sglang-0.1.15 → sglang-0.1.17}/sglang/srt/models/llama2.py +24 -23
  48. {sglang-0.1.15 → sglang-0.1.17}/sglang/srt/models/llava.py +85 -25
  49. sglang-0.1.17/sglang/srt/models/llavavid.py +298 -0
  50. sglang-0.1.17/sglang/srt/models/mixtral.py +513 -0
  51. sglang-0.1.15/sglang/srt/models/mixtral.py → sglang-0.1.17/sglang/srt/models/mixtral_quant.py +18 -34
  52. {sglang-0.1.15 → sglang-0.1.17}/sglang/srt/models/qwen.py +28 -25
  53. {sglang-0.1.15 → sglang-0.1.17}/sglang/srt/models/qwen2.py +17 -22
  54. {sglang-0.1.15 → sglang-0.1.17}/sglang/srt/models/stablelm.py +21 -26
  55. {sglang-0.1.15 → sglang-0.1.17}/sglang/srt/models/yivl.py +17 -25
  56. {sglang-0.1.15 → sglang-0.1.17}/sglang/srt/openai_api_adapter.py +140 -95
  57. {sglang-0.1.15 → sglang-0.1.17}/sglang/srt/openai_protocol.py +10 -1
  58. {sglang-0.1.15 → sglang-0.1.17}/sglang/srt/server.py +101 -52
  59. {sglang-0.1.15 → sglang-0.1.17}/sglang/srt/server_args.py +59 -11
  60. sglang-0.1.17/sglang/srt/utils.py +484 -0
  61. {sglang-0.1.15 → sglang-0.1.17}/sglang/test/test_programs.py +44 -0
  62. {sglang-0.1.15 → sglang-0.1.17}/sglang/test/test_utils.py +32 -1
  63. {sglang-0.1.15 → sglang-0.1.17}/sglang/utils.py +95 -26
  64. {sglang-0.1.15 → sglang-0.1.17/sglang.egg-info}/PKG-INFO +23 -13
  65. {sglang-0.1.15 → sglang-0.1.17}/sglang.egg-info/SOURCES.txt +15 -3
  66. {sglang-0.1.15 → sglang-0.1.17}/sglang.egg-info/requires.txt +8 -2
  67. sglang-0.1.15/sglang/srt/backend_config.py +0 -13
  68. sglang-0.1.15/sglang/srt/managers/detokenizer_manager.py +0 -95
  69. sglang-0.1.15/sglang/srt/managers/router/scheduler.py +0 -70
  70. sglang-0.1.15/sglang/srt/models/dbrx_config.py +0 -281
  71. sglang-0.1.15/sglang/srt/utils.py +0 -317
  72. sglang-0.1.15/sglang/srt/weight_utils.py +0 -402
  73. {sglang-0.1.15 → sglang-0.1.17}/LICENSE +0 -0
  74. {sglang-0.1.15 → sglang-0.1.17}/setup.cfg +0 -0
  75. {sglang-0.1.15 → sglang-0.1.17}/sglang/backend/__init__.py +0 -0
  76. {sglang-0.1.15 → sglang-0.1.17}/sglang/backend/base_backend.py +0 -0
  77. {sglang-0.1.15 → sglang-0.1.17}/sglang/backend/vertexai.py +0 -0
  78. {sglang-0.1.15 → sglang-0.1.17}/sglang/lang/__init__.py +0 -0
  79. {sglang-0.1.15 → sglang-0.1.17}/sglang/lang/compiler.py +0 -0
  80. {sglang-0.1.15 → sglang-0.1.17}/sglang/srt/constrained/__init__.py +0 -0
  81. {sglang-0.1.15 → sglang-0.1.17}/sglang/srt/constrained/base_cache.py +0 -0
  82. {sglang-0.1.15 → sglang-0.1.17}/sglang/srt/constrained/jump_forward.py +0 -0
  83. {sglang-0.1.15 → sglang-0.1.17}/sglang/srt/conversation.py +0 -0
  84. {sglang-0.1.15 → sglang-0.1.17}/sglang/srt/layers/context_flashattention_nopad.py +0 -0
  85. {sglang-0.1.15 → sglang-0.1.17}/sglang/srt/memory_pool.py +0 -0
  86. {sglang-0.1.15 → sglang-0.1.17}/sglang/srt/mm_utils.py +0 -0
  87. {sglang-0.1.15 → sglang-0.1.17}/sglang/srt/models/mistral.py +0 -0
  88. {sglang-0.1.15 → sglang-0.1.17}/sglang/srt/sampling_params.py +0 -0
  89. {sglang-0.1.15 → sglang-0.1.17}/sglang/test/test_conversation.py +0 -0
  90. {sglang-0.1.15 → sglang-0.1.17}/sglang/test/test_openai_protocol.py +0 -0
  91. {sglang-0.1.15 → sglang-0.1.17}/sglang.egg-info/dependency_links.txt +0 -0
  92. {sglang-0.1.15 → sglang-0.1.17}/sglang.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: sglang
3
- Version: 0.1.15
3
+ Version: 0.1.17
4
4
  Summary: A structured generation langauge for LLMs.
5
5
  License: Apache License
6
6
  Version 2.0, January 2004
@@ -222,12 +222,14 @@ Requires-Dist: torch; extra == "srt"
222
222
  Requires-Dist: uvloop; extra == "srt"
223
223
  Requires-Dist: uvicorn; extra == "srt"
224
224
  Requires-Dist: zmq; extra == "srt"
225
- Requires-Dist: vllm>=0.4.2; extra == "srt"
225
+ Requires-Dist: vllm==0.4.3; extra == "srt"
226
226
  Requires-Dist: interegular; extra == "srt"
227
227
  Requires-Dist: pydantic; extra == "srt"
228
228
  Requires-Dist: pillow; extra == "srt"
229
- Requires-Dist: outlines>=0.0.27; extra == "srt"
230
229
  Requires-Dist: packaging; extra == "srt"
230
+ Requires-Dist: huggingface_hub; extra == "srt"
231
+ Requires-Dist: hf_transfer; extra == "srt"
232
+ Requires-Dist: outlines>=0.0.34; extra == "srt"
231
233
  Provides-Extra: openai
232
234
  Requires-Dist: openai>=1.0; extra == "openai"
233
235
  Requires-Dist: numpy; extra == "openai"
@@ -235,10 +237,13 @@ Requires-Dist: tiktoken; extra == "openai"
235
237
  Provides-Extra: anthropic
236
238
  Requires-Dist: anthropic>=0.20.0; extra == "anthropic"
237
239
  Requires-Dist: numpy; extra == "anthropic"
240
+ Provides-Extra: litellm
241
+ Requires-Dist: litellm>=1.0.0; extra == "litellm"
238
242
  Provides-Extra: all
239
243
  Requires-Dist: sglang[srt]; extra == "all"
240
244
  Requires-Dist: sglang[openai]; extra == "all"
241
245
  Requires-Dist: sglang[anthropic]; extra == "all"
246
+ Requires-Dist: sglang[litellm]; extra == "all"
242
247
 
243
248
  <div align="center">
244
249
  <img src="assets/logo.png" alt="logo" width="400"></img>
@@ -251,9 +256,9 @@ Requires-Dist: sglang[anthropic]; extra == "all"
251
256
  SGLang is a structured generation language designed for large language models (LLMs).
252
257
  It makes your interaction with LLMs faster and more controllable by co-designing the frontend language and the runtime system.
253
258
 
254
- The core features of SGLang include:
259
+ The core features include:
255
260
  - **A Flexible Front-End Language**: This allows for easy programming of LLM applications with multiple chained generation calls, advanced prompting techniques, control flow, multiple modalities, parallelism, and external interaction.
256
- - **A High-Performance Runtime with RadixAttention**: This feature significantly accelerates the execution of complex LLM programs by automatic KV cache reuse across multiple calls. It also supports other common techniques like continuous batching and tensor parallelism.
261
+ - **A High-Performance Runtime with RadixAttention**: This feature significantly accelerates the execution of complex LLM programs by automatically reusing the KV cache across multiple calls. It can also be used as a standalone serving engine with all common techniques implemented, such as continuous batching and tensor parallelism.
257
262
 
258
263
  ## News
259
264
  - [2024/02] 🔥 SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).
@@ -286,12 +291,8 @@ pip install -e "python[all]"
286
291
  ```
287
292
 
288
293
  ### Notes
289
- - If you are using older GPUs (NVIDIA V100, T4), please pick the correct triton compiler version to avoid some known bugs.
290
- - For NVIDIA T4, please use `pip install "triton>=2.2.0"`.
291
- - For NVIDIA V100, please install the [nightly](https://triton-lang.org/main/getting-started/installation.html) version.
292
294
  - If you only need to use the OpenAI backend, you can avoid installing other dependencies by using `pip install "sglang[openai]"`
293
295
 
294
-
295
296
  ## Quick Start
296
297
  The example below shows how to use sglang to answer a mulit-turn question.
297
298
 
@@ -568,15 +569,17 @@ response = client.chat.completions.create(
568
569
  print(response)
569
570
  ```
570
571
 
571
- In above example, the server uses the chat template specified in the model tokenizer.
572
- You can override the chat template if needed when launching the server:
572
+
573
+ By default, the server uses the chat template specified in the model tokenizer from Hugging Face. It should just work for most official models such as Llama-2/Llama-3.
574
+
575
+ If needed, you can also override the chat template when launching the server:
573
576
 
574
577
  ```
575
578
  python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --chat-template llama-2
576
579
  ```
577
580
 
578
581
  If the chat template you are looking for is missing, you are welcome to contribute it.
579
- Meanwhile, you can also temporary register your chat template as follows:
582
+ Meanwhile, you can also temporarily register your chat template as follows:
580
583
 
581
584
  ```json
582
585
  {
@@ -599,11 +602,16 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
599
602
  ```
600
603
  python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --tp 2
601
604
  ```
605
+ - Add `--dp 2` to enable data parallelism. It can also be used together with tp. Data parallelism is better for throughput if there is enough memory.
606
+ ```
607
+ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --dp 2 --tp 2
608
+ ```
602
609
  - If you see out-of-memory errors during serving, please try to reduce the memory usage of the KV cache pool by setting a smaller value of `--mem-fraction-static`. The default value is `0.9`
603
610
  ```
604
611
  python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --mem-fraction-static 0.7
605
612
  ```
606
- - You can turn on [flashinfer](docs/flashinfer.md) to accelerate the inference by using highly optimized CUDA kernels.
613
+ - See [flashinfer.md](docs/flashinfer.md) on accelerating inference using highly optimized CUDA kernels.
614
+ - See [hyperparameter_tuning.md](docs/hyperparameter_tuning.md) on tuning hyperparameters for better performance.
607
615
 
608
616
  ### Supported Models
609
617
  - Llama
@@ -617,6 +625,8 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
617
625
  - `python3 -m sglang.launch_server --model-path liuhaotian/llava-v1.5-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --chat-template vicuna_v1.1 --port 30000`
618
626
  - `python3 -m sglang.launch_server --model-path liuhaotian/llava-v1.6-vicuna-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --chat-template vicuna_v1.1 --port 30000`
619
627
  - `python3 -m sglang.launch_server --model-path liuhaotian/llava-v1.6-34b --tokenizer-path liuhaotian/llava-v1.6-34b-tokenizer --port 3000`
628
+ - LLaVA-NeXT-Video
629
+ - see [srt_example_llava_v.sh](examples/usage/llava_video/srt_example_llava_v.sh)
620
630
  - Yi-VL
621
631
  - see [srt_example_yi_vl.py](examples/quick_start/srt_example_yi_vl.py).
622
632
  - StableLM
@@ -9,9 +9,9 @@
9
9
  SGLang is a structured generation language designed for large language models (LLMs).
10
10
  It makes your interaction with LLMs faster and more controllable by co-designing the frontend language and the runtime system.
11
11
 
12
- The core features of SGLang include:
12
+ The core features include:
13
13
  - **A Flexible Front-End Language**: This allows for easy programming of LLM applications with multiple chained generation calls, advanced prompting techniques, control flow, multiple modalities, parallelism, and external interaction.
14
- - **A High-Performance Runtime with RadixAttention**: This feature significantly accelerates the execution of complex LLM programs by automatic KV cache reuse across multiple calls. It also supports other common techniques like continuous batching and tensor parallelism.
14
+ - **A High-Performance Runtime with RadixAttention**: This feature significantly accelerates the execution of complex LLM programs by automatically reusing the KV cache across multiple calls. It can also be used as a standalone serving engine with all common techniques implemented, such as continuous batching and tensor parallelism.
15
15
 
16
16
  ## News
17
17
  - [2024/02] 🔥 SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).
@@ -44,12 +44,8 @@ pip install -e "python[all]"
44
44
  ```
45
45
 
46
46
  ### Notes
47
- - If you are using older GPUs (NVIDIA V100, T4), please pick the correct triton compiler version to avoid some known bugs.
48
- - For NVIDIA T4, please use `pip install "triton>=2.2.0"`.
49
- - For NVIDIA V100, please install the [nightly](https://triton-lang.org/main/getting-started/installation.html) version.
50
47
  - If you only need to use the OpenAI backend, you can avoid installing other dependencies by using `pip install "sglang[openai]"`
51
48
 
52
-
53
49
  ## Quick Start
54
50
  The example below shows how to use sglang to answer a mulit-turn question.
55
51
 
@@ -326,15 +322,17 @@ response = client.chat.completions.create(
326
322
  print(response)
327
323
  ```
328
324
 
329
- In above example, the server uses the chat template specified in the model tokenizer.
330
- You can override the chat template if needed when launching the server:
325
+
326
+ By default, the server uses the chat template specified in the model tokenizer from Hugging Face. It should just work for most official models such as Llama-2/Llama-3.
327
+
328
+ If needed, you can also override the chat template when launching the server:
331
329
 
332
330
  ```
333
331
  python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --chat-template llama-2
334
332
  ```
335
333
 
336
334
  If the chat template you are looking for is missing, you are welcome to contribute it.
337
- Meanwhile, you can also temporary register your chat template as follows:
335
+ Meanwhile, you can also temporarily register your chat template as follows:
338
336
 
339
337
  ```json
340
338
  {
@@ -357,11 +355,16 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
357
355
  ```
358
356
  python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --tp 2
359
357
  ```
358
+ - Add `--dp 2` to enable data parallelism. It can also be used together with tp. Data parallelism is better for throughput if there is enough memory.
359
+ ```
360
+ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --dp 2 --tp 2
361
+ ```
360
362
  - If you see out-of-memory errors during serving, please try to reduce the memory usage of the KV cache pool by setting a smaller value of `--mem-fraction-static`. The default value is `0.9`
361
363
  ```
362
364
  python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --mem-fraction-static 0.7
363
365
  ```
364
- - You can turn on [flashinfer](docs/flashinfer.md) to accelerate the inference by using highly optimized CUDA kernels.
366
+ - See [flashinfer.md](docs/flashinfer.md) on accelerating inference using highly optimized CUDA kernels.
367
+ - See [hyperparameter_tuning.md](docs/hyperparameter_tuning.md) on tuning hyperparameters for better performance.
365
368
 
366
369
  ### Supported Models
367
370
  - Llama
@@ -375,6 +378,8 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
375
378
  - `python3 -m sglang.launch_server --model-path liuhaotian/llava-v1.5-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --chat-template vicuna_v1.1 --port 30000`
376
379
  - `python3 -m sglang.launch_server --model-path liuhaotian/llava-v1.6-vicuna-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --chat-template vicuna_v1.1 --port 30000`
377
380
  - `python3 -m sglang.launch_server --model-path liuhaotian/llava-v1.6-34b --tokenizer-path liuhaotian/llava-v1.6-34b-tokenizer --port 3000`
381
+ - LLaVA-NeXT-Video
382
+ - see [srt_example_llava_v.sh](examples/usage/llava_video/srt_example_llava_v.sh)
378
383
  - Yi-VL
379
384
  - see [srt_example_yi_vl.py](examples/quick_start/srt_example_yi_vl.py).
380
385
  - StableLM
@@ -4,8 +4,8 @@ build-backend = "setuptools.build_meta"
4
4
 
5
5
  [project]
6
6
  name = "sglang"
7
- version = "0.1.15"
8
- description = "A structured generation langauge for LLMs."
7
+ version = "0.1.17"
8
+ description = "A structured generation langauge for LLMs."
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.8"
11
11
  license = {file = "LICENSE"}
@@ -20,10 +20,11 @@ dependencies = [
20
20
 
21
21
  [project.optional-dependencies]
22
22
  srt = ["aiohttp", "fastapi", "psutil", "rpyc", "torch", "uvloop", "uvicorn",
23
- "zmq", "vllm>=0.4.2", "interegular", "pydantic", "pillow", "outlines>=0.0.27", "packaging"]
23
+ "zmq", "vllm==0.4.3", "interegular", "pydantic", "pillow", "packaging", "huggingface_hub", "hf_transfer", "outlines>=0.0.34"]
24
24
  openai = ["openai>=1.0", "numpy", "tiktoken"]
25
25
  anthropic = ["anthropic>=0.20.0", "numpy"]
26
- all = ["sglang[srt]", "sglang[openai]", "sglang[anthropic]"]
26
+ litellm = ["litellm>=1.0.0"]
27
+ all = ["sglang[srt]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
27
28
 
28
29
  [project.urls]
29
30
  "Homepage" = "https://github.com/sgl-project/sglang"
@@ -1,4 +1,4 @@
1
- __version__ = "0.1.15"
1
+ __version__ = "0.1.17"
2
2
 
3
3
  # SGL API Components
4
4
  from sglang.api import (
@@ -19,6 +19,7 @@ from sglang.api import (
19
19
  user,
20
20
  user_begin,
21
21
  user_end,
22
+ video,
22
23
  )
23
24
 
24
25
  # SGL Backends
@@ -26,6 +27,7 @@ from sglang.backend.anthropic import Anthropic
26
27
  from sglang.backend.openai import OpenAI
27
28
  from sglang.backend.runtime_endpoint import RuntimeEndpoint
28
29
  from sglang.backend.vertexai import VertexAI
30
+ from sglang.backend.litellm import LiteLLM
29
31
 
30
32
  # Global Configurations
31
33
  from sglang.global_config import global_config
@@ -34,6 +36,7 @@ from sglang.global_config import global_config
34
36
  __all__ = [
35
37
  "global_config",
36
38
  "Anthropic",
39
+ "LiteLLM",
37
40
  "OpenAI",
38
41
  "RuntimeEndpoint",
39
42
  "VertexAI",
@@ -46,6 +49,7 @@ __all__ = [
46
49
  "gen_int",
47
50
  "gen_string",
48
51
  "image",
52
+ "video",
49
53
  "select",
50
54
  "system",
51
55
  "user",
@@ -15,17 +15,18 @@ from sglang.lang.ir import (
15
15
  SglRoleBegin,
16
16
  SglRoleEnd,
17
17
  SglSelect,
18
+ SglVideo,
18
19
  )
19
20
 
20
21
 
21
22
  def function(
22
- func: Optional[Callable] = None, api_num_spec_tokens: Optional[int] = None
23
+ func: Optional[Callable] = None, num_api_spec_tokens: Optional[int] = None
23
24
  ):
24
25
  if func:
25
- return SglFunction(func, api_num_spec_tokens=api_num_spec_tokens)
26
+ return SglFunction(func, num_api_spec_tokens=num_api_spec_tokens)
26
27
 
27
28
  def decorator(func):
28
- return SglFunction(func, api_num_spec_tokens=api_num_spec_tokens)
29
+ return SglFunction(func, num_api_spec_tokens=num_api_spec_tokens)
29
30
 
30
31
  return decorator
31
32
 
@@ -151,6 +152,10 @@ def image(expr: SglExpr):
151
152
  return SglImage(expr)
152
153
 
153
154
 
155
+ def video(path: str, num_frames: int):
156
+ return SglVideo(path, num_frames)
157
+
158
+
154
159
  def select(
155
160
  name: Optional[str] = None,
156
161
  choices: List[str] = None,
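The hunk above adds a `video(path, num_frames)` primitive alongside `image`. A hedged sketch of how it might be used from the frontend DSL; only `sgl.video` and its two arguments come from this diff, while the program, endpoint, and file name are illustrative:

```python
import sglang as sgl

@sgl.function
def describe_clip(s, video_path):
    # video(path, num_frames) mirrors image(); num_frames controls how many
    # frames are sampled from the clip and sent to the vision-language model.
    s += sgl.user(sgl.video(video_path, 16) + "Describe what happens in this clip.")
    s += sgl.assistant(sgl.gen("description", max_tokens=128))

# Assumes a running video-capable server (e.g. LLaVA-NeXT-Video, launched as in
# examples/usage/llava_video/srt_example_llava_v.sh from this release).
sgl.set_default_backend(sgl.RuntimeEndpoint("http://localhost:30000"))
state = describe_clip.run(video_path="clip.mp4")
print(state["description"])
```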
@@ -74,4 +74,4 @@ class Anthropic(BaseBackend):
74
74
  **sampling_params.to_anthropic_kwargs(),
75
75
  ) as stream:
76
76
  for text in stream.text_stream:
77
- yield text, {}
77
+ yield text, {}
@@ -0,0 +1,90 @@
1
+ from typing import Mapping, Optional
2
+
3
+ from sglang.backend.base_backend import BaseBackend
4
+ from sglang.lang.chat_template import get_chat_template_by_model_path
5
+ from sglang.lang.interpreter import StreamExecutor
6
+ from sglang.lang.ir import SglSamplingParams
7
+
8
+ try:
9
+ import litellm
10
+ except ImportError as e:
11
+ litellm = e
12
+ litellm.num_retries = 1
13
+
14
+
15
+ class LiteLLM(BaseBackend):
16
+
17
+ def __init__(
18
+ self,
19
+ model_name,
20
+ chat_template=None,
21
+ api_key=None,
22
+ organization: Optional[str] = None,
23
+ base_url: Optional[str] = None,
24
+ timeout: Optional[float] = 600,
25
+ max_retries: Optional[int] = litellm.num_retries,
26
+ default_headers: Optional[Mapping[str, str]] = None,
27
+ ):
28
+ super().__init__()
29
+
30
+ if isinstance(litellm, Exception):
31
+ raise litellm
32
+
33
+ self.model_name = model_name
34
+
35
+ self.chat_template = chat_template or get_chat_template_by_model_path(
36
+ model_name)
37
+
38
+ self.client_params = {
39
+ "api_key": api_key,
40
+ "organization": organization,
41
+ "base_url": base_url,
42
+ "timeout": timeout,
43
+ "max_retries": max_retries,
44
+ "default_headers": default_headers,
45
+ }
46
+
47
+ def get_chat_template(self):
48
+ return self.chat_template
49
+
50
+ def generate(
51
+ self,
52
+ s: StreamExecutor,
53
+ sampling_params: SglSamplingParams,
54
+ ):
55
+ if s.messages_:
56
+ messages = s.messages_
57
+ else:
58
+ messages = [{"role": "user", "content": s.text_}]
59
+
60
+ ret = litellm.completion(
61
+ model=self.model_name,
62
+ messages=messages,
63
+ **self.client_params,
64
+ **sampling_params.to_anthropic_kwargs(),
65
+ )
66
+ comp = ret.choices[0].message.content
67
+
68
+ return comp, {}
69
+
70
+ def generate_stream(
71
+ self,
72
+ s: StreamExecutor,
73
+ sampling_params: SglSamplingParams,
74
+ ):
75
+ if s.messages_:
76
+ messages = s.messages_
77
+ else:
78
+ messages = [{"role": "user", "content": s.text_}]
79
+
80
+ ret = litellm.completion(
81
+ model=self.model_name,
82
+ messages=messages,
83
+ stream=True,
84
+ **self.client_params,
85
+ **sampling_params.to_litellm_kwargs(),
86
+ )
87
+ for chunk in ret:
88
+ text = chunk.choices[0].delta.content
89
+ if text is not None:
90
+ yield text, {}
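A hedged usage sketch for the new `LiteLLM` backend, assuming the usual `sglang.set_default_backend` entry point; the backend class and its constructor come from the file above, while the model name and program are illustrative:

```python
import sglang as sgl

@sgl.function
def qa(s, question):
    s += sgl.user(question)
    s += sgl.assistant(sgl.gen("answer", max_tokens=64))

# LiteLLM routes the model name to the matching provider; api_key, base_url,
# timeout, etc. are forwarded to litellm.completion via client_params above.
sgl.set_default_backend(sgl.LiteLLM("gpt-3.5-turbo"))
state = qa.run(question="What is RadixAttention?")
print(state["answer"])
```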
@@ -1,5 +1,7 @@
1
1
  import logging
2
2
  import time
3
+ import warnings
4
+ import dataclasses
3
5
  from typing import Callable, List, Optional, Union
4
6
 
5
7
  import numpy as np
@@ -41,6 +43,15 @@ INSTRUCT_MODEL_NAMES = [
41
43
  ]
42
44
 
43
45
 
46
+ @dataclasses.dataclass
47
+ class TokenUsage:
48
+ prompt_tokens: int
49
+ completion_tokens: int
50
+
51
+ def reset(self):
52
+ self.prompt_tokens = self.completion_tokens = 0
53
+
54
+
44
55
  class OpenAI(BaseBackend):
45
56
  def __init__(
46
57
  self,
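The `TokenUsage` dataclass above is attached to the OpenAI backend and accumulated on every API call (see the `token_usage` plumbing further down in this file). A small hedged sketch of reading it; only the field names and `reset()` come from this diff, the rest is illustrative:

```python
import sglang as sgl

backend = sgl.OpenAI("gpt-3.5-turbo")
sgl.set_default_backend(backend)

# ... run one or more sglang programs against this backend ...

# prompt_tokens / completion_tokens are summed over every request made through
# this backend instance; reset() zeroes both counters.
print(backend.token_usage.prompt_tokens, backend.token_usage.completion_tokens)
backend.token_usage.reset()
```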
@@ -80,40 +91,89 @@ class OpenAI(BaseBackend):
80
91
  else:
81
92
  self.is_chat_model = True
82
93
 
83
- self.chat_begin_str = self.chat_template.role_prefix_and_suffix["assistant"][0]
94
+ self.chat_prefix = self.chat_template.role_prefix_and_suffix["assistant"][0]
95
+
96
+ # Usage
97
+ self.token_usage = TokenUsage(0, 0)
98
+
99
+ # API speculative execution
100
+ # TODO(ying): This does not support multi-threading (run_batch)
101
+ self.spec_kwargs = {}
102
+ self.spec_format = []
103
+ self.spec_max_num_tries = 3
84
104
 
85
105
  def get_chat_template(self):
86
106
  return self.chat_template
87
107
 
108
+ def _prepare_spec_execution(self, sampling_params: SglSamplingParams,
109
+ num_api_spec_tokens: int, spec_var_name: str):
110
+ if "max_tokens" not in self.spec_kwargs:
111
+ self.spec_kwargs["max_tokens"] = num_api_spec_tokens
112
+ else:
113
+ assert (
114
+ self.spec_kwargs["max_tokens"] == num_api_spec_tokens
115
+ )
116
+
117
+ params = sampling_params.to_openai_kwargs()
118
+ for key, value in params.items():
119
+ if key in ["stop"]:
120
+ continue
121
+ if key in ["max_tokens"]:
122
+ warnings.warn(
123
+ "The parameter max_tokens will be overwritten by speculated number of tokens."
124
+ )
125
+ continue
126
+ if key not in self.spec_kwargs:
127
+ self.spec_kwargs[key] = value
128
+ else:
129
+ assert (
130
+ value == self.spec_kwargs[key]
131
+ ), "sampling parameters should be consistent if turn on api speculative execution."
132
+ self.spec_format.append(
133
+ {"text": "", "stop": params["stop"], "name": spec_var_name}
134
+ )
135
+ return "", {}
136
+
88
137
  def generate(
89
138
  self,
90
139
  s: StreamExecutor,
91
140
  sampling_params: SglSamplingParams,
141
+ spec_var_name: str = None,
92
142
  ):
93
143
  if sampling_params.dtype is None:
94
144
  if self.is_chat_model:
95
- if not s.text_.endswith(self.chat_begin_str):
96
- raise RuntimeError(
97
- "This use case is not supported. "
98
- "For OpenAI chat models, sgl.gen must be right after sgl.assistant"
99
- )
100
- prompt = s.messages_
145
+ if s.num_api_spec_tokens is None:
146
+ if not s.text_.endswith(self.chat_prefix):
147
+ raise RuntimeError(
148
+ "This use case is not supported if api speculative execution is off. "
149
+ "For OpenAI chat models, sgl.gen must be right after sgl.assistant. "
150
+ "Example of adding api speculative execution: @function(num_api_spec_tokens=128)."
151
+ )
152
+ prompt = s.messages_
153
+ else:
154
+ return self._prepare_spec_execution(sampling_params,
155
+ s.num_api_spec_tokens, spec_var_name)
101
156
  else:
102
157
  prompt = s.text_
103
158
 
104
159
  kwargs = sampling_params.to_openai_kwargs()
105
160
  comp = openai_completion(
106
161
  client=self.client,
162
+ token_usage=self.token_usage,
107
163
  is_chat=self.is_chat_model,
108
164
  model=self.model_name,
109
165
  prompt=prompt,
110
166
  **kwargs,
111
167
  )
112
168
  elif sampling_params.dtype in [str, "str", "string"]:
169
+ assert (
170
+ not self.is_chat_model
171
+ ), "constrained type not supported on chat model"
113
172
  kwargs = sampling_params.to_openai_kwargs()
114
173
  kwargs.pop("stop")
115
174
  comp = openai_completion(
116
175
  client=self.client,
176
+ token_usage=self.token_usage,
117
177
  is_chat=self.is_chat_model,
118
178
  model=self.model_name,
119
179
  prompt=s.text_ + '"',
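The runtime error above spells out how speculative execution is switched on from the frontend: decorate the function with `num_api_spec_tokens`. A hedged sketch of such a program; only the decorator argument comes from this diff, the prompt and variable names are illustrative:

```python
import sglang as sgl

# With num_api_spec_tokens set, the chat backend issues one speculative
# completion for the whole assistant turn and then pattern-matches the result
# back into the individual gen() variables (see spec_pattern_match below).
@sgl.function(num_api_spec_tokens=128)
def character_sheet(s, name):
    s += sgl.user(f"Give a one-line bio and an age for {name}.")
    s += sgl.assistant(
        "Bio: " + sgl.gen("bio", stop="\n") + "\nAge: " + sgl.gen("age", stop="\n")
    )
```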
@@ -122,10 +182,14 @@ class OpenAI(BaseBackend):
122
182
  )
123
183
  comp = '"' + comp + '"'
124
184
  elif sampling_params.dtype in [int, "int"]:
185
+ assert (
186
+ not self.is_chat_model
187
+ ), "constrained type not supported on chat model"
125
188
  kwargs = sampling_params.to_openai_kwargs()
126
189
  kwargs.pop("stop")
127
190
  comp = openai_completion(
128
191
  client=self.client,
192
+ token_usage=self.token_usage,
129
193
  is_chat=self.is_chat_model,
130
194
  model=self.model_name,
131
195
  prompt=s.text_,
@@ -138,6 +202,63 @@ class OpenAI(BaseBackend):
138
202
 
139
203
  return comp, {}
140
204
 
205
+ def spec_fill(self, value: str):
206
+ assert self.is_chat_model
207
+ self.spec_format.append({"text": value, "stop": None, "name": None})
208
+
209
+ def spec_pattern_match(self, comp):
210
+ for i, term in enumerate(self.spec_format):
211
+ text = term["text"]
212
+ if text != "":
213
+ if comp.startswith(text):
214
+ comp = comp[len(text) :]
215
+ else:
216
+ return False
217
+ else:
218
+ pos = comp.find(term["stop"])
219
+ if pos != -1:
220
+ term["text"] = comp[:pos]
221
+ comp = comp[pos:]
222
+ else:
223
+ if i == len(self.spec_format) - 1:
224
+ term["text"] = comp
225
+ else:
226
+ return False
227
+ return True
228
+
229
+ def role_end_generate(
230
+ self,
231
+ s: StreamExecutor,
232
+ ):
233
+ if s.num_api_spec_tokens is None or not s.text_.endswith(self.chat_prefix):
234
+ return
235
+
236
+ comp = ""
237
+ if not all(x["name"] is None for x in self.spec_format):
238
+ # TODO(ying): throw errors or warnings
239
+ for i in range(self.spec_max_num_tries):
240
+ comp = openai_completion(
241
+ client=self.client,
242
+ token_usage=self.token_usage,
243
+ is_chat=self.is_chat_model,
244
+ model=self.model_name,
245
+ prompt=s.messages_,
246
+ **self.spec_kwargs,
247
+ )
248
+ if self.spec_pattern_match(comp):
249
+ break
250
+
251
+ for term in self.spec_format:
252
+ s.text_ += term["text"]
253
+ name = term["name"]
254
+ if name is not None:
255
+ s.variables[name] = term["text"]
256
+ s.meta_info[name] = {}
257
+ s.variable_event[name].set()
258
+
259
+ self.spec_kwargs = {}
260
+ self.spec_format = []
261
+
141
262
  def generate_stream(
142
263
  self,
143
264
  s: StreamExecutor,
@@ -145,7 +266,7 @@ class OpenAI(BaseBackend):
145
266
  ):
146
267
  if sampling_params.dtype is None:
147
268
  if self.is_chat_model:
148
- if not s.text_.endswith(self.chat_begin_str):
269
+ if not s.text_.endswith(self.chat_prefix):
149
270
  raise RuntimeError(
150
271
  "This use case is not supported. "
151
272
  "For OpenAI chat models, sgl.gen must be right after sgl.assistant"
@@ -157,6 +278,7 @@ class OpenAI(BaseBackend):
157
278
  kwargs = sampling_params.to_openai_kwargs()
158
279
  generator = openai_completion_stream(
159
280
  client=self.client,
281
+ token_usage=self.token_usage,
160
282
  is_chat=self.is_chat_model,
161
283
  model=self.model_name,
162
284
  prompt=prompt,
@@ -202,6 +324,8 @@ class OpenAI(BaseBackend):
202
324
  )
203
325
  ret_str = ret.choices[0].text
204
326
  ret_token = self.tokenizer.encode(ret_str)[0]
327
+ self.token_usage.prompt_tokens += ret.usage.prompt_tokens
328
+ self.token_usage.completion_tokens += ret.usage.completion_tokens
205
329
 
206
330
  # TODO:
207
331
  # 1. return logits as the scores
@@ -231,7 +355,7 @@ class OpenAI(BaseBackend):
231
355
  return decision, scores, None, None
232
356
 
233
357
 
234
- def openai_completion(client, retries=3, is_chat=None, prompt=None, **kwargs):
358
+ def openai_completion(client, token_usage, is_chat=None, retries=3, prompt=None, **kwargs):
235
359
  for attempt in range(retries):
236
360
  try:
237
361
  if is_chat:
@@ -245,6 +369,9 @@ def openai_completion(client, retries=3, is_chat=None, prompt=None, **kwargs):
245
369
  comp = [c.text for c in ret.choices]
246
370
  else:
247
371
  comp = ret.choices[0].text
372
+
373
+ token_usage.prompt_tokens += ret.usage.prompt_tokens
374
+ token_usage.completion_tokens += ret.usage.completion_tokens
248
375
  break
249
376
  except (openai.APIError, openai.APIConnectionError, openai.RateLimitError) as e:
250
377
  logger.error(f"OpenAI Error: {e}. Waiting 5 seconds...")
@@ -258,16 +385,19 @@ def openai_completion(client, retries=3, is_chat=None, prompt=None, **kwargs):
258
385
  return comp
259
386
 
260
387
 
261
- def openai_completion_stream(client, retries=3, is_chat=None, prompt=None, **kwargs):
388
+ def openai_completion_stream(client, token_usage, is_chat=None, retries=3, prompt=None, **kwargs):
262
389
  for attempt in range(retries):
263
390
  try:
264
391
  if is_chat:
265
392
  if "stop" in kwargs and kwargs["stop"] is None:
266
393
  kwargs.pop("stop")
267
394
  generator = client.chat.completions.create(
268
- messages=prompt, stream=True, **kwargs
395
+ messages=prompt, stream=True, stream_options={"include_usage": True},
396
+ **kwargs
269
397
  )
270
398
  for ret in generator:
399
+ if len(ret.choices) == 0:
400
+ continue
271
401
  try:
272
402
  content = ret.choices[0].delta.content
273
403
  except IndexError:
@@ -275,11 +405,17 @@ def openai_completion_stream(client, retries=3, is_chat=None, prompt=None, **kwargs):
275
405
  yield content or "", {}
276
406
  else:
277
407
  generator = client.completions.create(
278
- prompt=prompt, stream=True, **kwargs
408
+ prompt=prompt, stream=True, stream_options={"include_usage": True},
409
+ **kwargs
279
410
  )
280
411
  for ret in generator:
412
+ if len(ret.choices) == 0:
413
+ continue
281
414
  content = ret.choices[0].text
282
415
  yield content or "", {}
416
+
417
+ token_usage.prompt_tokens += ret.usage.prompt_tokens
418
+ token_usage.completion_tokens += ret.usage.completion_tokens
283
419
  break
284
420
  except (openai.APIError, openai.APIConnectionError, openai.RateLimitError) as e:
285
421
  logger.error(f"OpenAI Error: {e}. Waiting 5 seconds...")
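For reference, the `stream_options={"include_usage": True}` flag introduced in the streaming paths above makes the OpenAI client emit one final chunk whose `choices` list is empty and whose `usage` field carries the token counts, which is why the loops skip empty-choice chunks and read `ret.usage` after the loop. A standalone hedged sketch against the plain OpenAI client (model name illustrative):

```python
from openai import OpenAI

client = OpenAI()
stream = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Say hi"}],
    stream=True,
    stream_options={"include_usage": True},
)

usage = None
for chunk in stream:
    if chunk.choices:            # regular content chunks
        print(chunk.choices[0].delta.content or "", end="")
    if chunk.usage is not None:  # final usage-only chunk
        usage = chunk.usage
print("\nprompt:", usage.prompt_tokens, "completion:", usage.completion_tokens)
```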