sglang 0.1.17__tar.gz → 0.1.21__tar.gz

This diff shows the changes between two publicly released versions of the package, as published to a supported public registry. It is provided for informational purposes only.
Files changed (98)
  1. {sglang-0.1.17/sglang.egg-info → sglang-0.1.21}/PKG-INFO +52 -31
  2. {sglang-0.1.17 → sglang-0.1.21}/README.md +41 -19
  3. {sglang-0.1.17 → sglang-0.1.21}/pyproject.toml +6 -5
  4. {sglang-0.1.17 → sglang-0.1.21}/sglang/__init__.py +2 -2
  5. {sglang-0.1.17 → sglang-0.1.21}/sglang/api.py +30 -4
  6. {sglang-0.1.17 → sglang-0.1.21}/sglang/backend/litellm.py +2 -2
  7. {sglang-0.1.17 → sglang-0.1.21}/sglang/backend/openai.py +26 -15
  8. {sglang-0.1.17 → sglang-0.1.21}/sglang/backend/runtime_endpoint.py +26 -12
  9. sglang-0.1.21/sglang/bench_latency.py +320 -0
  10. {sglang-0.1.17 → sglang-0.1.21}/sglang/global_config.py +22 -12
  11. {sglang-0.1.17 → sglang-0.1.21}/sglang/lang/chat_template.py +40 -5
  12. {sglang-0.1.17 → sglang-0.1.21}/sglang/lang/compiler.py +2 -2
  13. {sglang-0.1.17 → sglang-0.1.21}/sglang/lang/interpreter.py +6 -2
  14. {sglang-0.1.17 → sglang-0.1.21}/sglang/lang/ir.py +74 -28
  15. {sglang-0.1.17 → sglang-0.1.21}/sglang/launch_server.py +4 -1
  16. {sglang-0.1.17 → sglang-0.1.21}/sglang/launch_server_llavavid.py +2 -1
  17. {sglang-0.1.17 → sglang-0.1.21}/sglang/srt/constrained/__init__.py +14 -6
  18. {sglang-0.1.17 → sglang-0.1.21}/sglang/srt/constrained/fsm_cache.py +6 -3
  19. sglang-0.1.21/sglang/srt/constrained/jump_forward.py +164 -0
  20. {sglang-0.1.17 → sglang-0.1.21}/sglang/srt/conversation.py +2 -0
  21. {sglang-0.1.17 → sglang-0.1.21}/sglang/srt/flush_cache.py +2 -0
  22. {sglang-0.1.17 → sglang-0.1.21}/sglang/srt/hf_transformers_utils.py +68 -9
  23. {sglang-0.1.17 → sglang-0.1.21}/sglang/srt/layers/extend_attention.py +2 -1
  24. {sglang-0.1.17 → sglang-0.1.21}/sglang/srt/layers/fused_moe.py +280 -169
  25. {sglang-0.1.17 → sglang-0.1.21}/sglang/srt/layers/logits_processor.py +106 -42
  26. {sglang-0.1.17 → sglang-0.1.21}/sglang/srt/layers/radix_attention.py +59 -58
  27. {sglang-0.1.17 → sglang-0.1.21}/sglang/srt/layers/token_attention.py +4 -8
  28. sglang-0.1.21/sglang/srt/managers/controller/cuda_graph_runner.py +196 -0
  29. {sglang-0.1.17 → sglang-0.1.21}/sglang/srt/managers/controller/dp_worker.py +6 -3
  30. {sglang-0.1.17 → sglang-0.1.21}/sglang/srt/managers/controller/infer_batch.py +397 -108
  31. {sglang-0.1.17 → sglang-0.1.21}/sglang/srt/managers/controller/manager_multi.py +11 -7
  32. sglang-0.1.21/sglang/srt/managers/controller/manager_single.py +177 -0
  33. sglang-0.1.21/sglang/srt/managers/controller/model_runner.py +359 -0
  34. {sglang-0.1.17 → sglang-0.1.21}/sglang/srt/managers/controller/radix_cache.py +8 -3
  35. {sglang-0.1.17 → sglang-0.1.21}/sglang/srt/managers/controller/schedule_heuristic.py +6 -0
  36. {sglang-0.1.17 → sglang-0.1.21}/sglang/srt/managers/controller/tp_worker.py +198 -176
  37. {sglang-0.1.17 → sglang-0.1.21}/sglang/srt/managers/detokenizer_manager.py +19 -21
  38. {sglang-0.1.17 → sglang-0.1.21}/sglang/srt/managers/io_struct.py +11 -5
  39. {sglang-0.1.17 → sglang-0.1.21}/sglang/srt/managers/tokenizer_manager.py +16 -14
  40. sglang-0.1.21/sglang/srt/memory_pool.py +105 -0
  41. sglang-0.1.21/sglang/srt/model_config.py +131 -0
  42. sglang-0.1.21/sglang/srt/models/chatglm.py +399 -0
  43. {sglang-0.1.17 → sglang-0.1.21}/sglang/srt/models/commandr.py +2 -2
  44. {sglang-0.1.17 → sglang-0.1.21}/sglang/srt/models/gemma.py +5 -1
  45. sglang-0.1.21/sglang/srt/models/gemma2.py +436 -0
  46. {sglang-0.1.17 → sglang-0.1.21}/sglang/srt/models/grok.py +204 -137
  47. {sglang-0.1.17 → sglang-0.1.21}/sglang/srt/models/llama2.py +12 -5
  48. sglang-0.1.21/sglang/srt/models/llama_classification.py +107 -0
  49. {sglang-0.1.17 → sglang-0.1.21}/sglang/srt/models/llava.py +11 -8
  50. {sglang-0.1.17 → sglang-0.1.21}/sglang/srt/models/llavavid.py +1 -1
  51. sglang-0.1.21/sglang/srt/models/minicpm.py +366 -0
  52. {sglang-0.1.17 → sglang-0.1.21}/sglang/srt/models/mixtral.py +164 -115
  53. {sglang-0.1.17 → sglang-0.1.21}/sglang/srt/models/mixtral_quant.py +0 -1
  54. {sglang-0.1.17 → sglang-0.1.21}/sglang/srt/models/qwen.py +1 -1
  55. {sglang-0.1.17 → sglang-0.1.21}/sglang/srt/models/qwen2.py +1 -1
  56. sglang-0.1.21/sglang/srt/models/qwen2_moe.py +473 -0
  57. {sglang-0.1.17 → sglang-0.1.21}/sglang/srt/models/stablelm.py +1 -1
  58. {sglang-0.1.17 → sglang-0.1.21}/sglang/srt/models/yivl.py +2 -2
  59. {sglang-0.1.17 → sglang-0.1.21}/sglang/srt/openai_api_adapter.py +35 -25
  60. {sglang-0.1.17 → sglang-0.1.21}/sglang/srt/openai_protocol.py +2 -2
  61. {sglang-0.1.17 → sglang-0.1.21}/sglang/srt/server.py +65 -19
  62. {sglang-0.1.17 → sglang-0.1.21}/sglang/srt/server_args.py +88 -47
  63. {sglang-0.1.17 → sglang-0.1.21}/sglang/srt/utils.py +177 -35
  64. {sglang-0.1.17 → sglang-0.1.21}/sglang/test/test_programs.py +28 -10
  65. {sglang-0.1.17 → sglang-0.1.21}/sglang/utils.py +4 -3
  66. {sglang-0.1.17 → sglang-0.1.21/sglang.egg-info}/PKG-INFO +52 -31
  67. {sglang-0.1.17 → sglang-0.1.21}/sglang.egg-info/SOURCES.txt +7 -6
  68. {sglang-0.1.17 → sglang-0.1.21}/sglang.egg-info/requires.txt +10 -11
  69. sglang-0.1.17/sglang/srt/constrained/jump_forward.py +0 -76
  70. sglang-0.1.17/sglang/srt/managers/controller/manager_single.py +0 -97
  71. sglang-0.1.17/sglang/srt/managers/controller/model_runner.py +0 -462
  72. sglang-0.1.17/sglang/srt/managers/router/infer_batch.py +0 -596
  73. sglang-0.1.17/sglang/srt/managers/router/manager.py +0 -82
  74. sglang-0.1.17/sglang/srt/managers/router/model_rpc.py +0 -818
  75. sglang-0.1.17/sglang/srt/managers/router/model_runner.py +0 -445
  76. sglang-0.1.17/sglang/srt/managers/router/radix_cache.py +0 -267
  77. sglang-0.1.17/sglang/srt/managers/router/scheduler.py +0 -59
  78. sglang-0.1.17/sglang/srt/memory_pool.py +0 -103
  79. sglang-0.1.17/sglang/srt/model_config.py +0 -46
  80. {sglang-0.1.17 → sglang-0.1.21}/LICENSE +0 -0
  81. {sglang-0.1.17 → sglang-0.1.21}/setup.cfg +0 -0
  82. {sglang-0.1.17 → sglang-0.1.21}/sglang/backend/__init__.py +0 -0
  83. {sglang-0.1.17 → sglang-0.1.21}/sglang/backend/anthropic.py +0 -0
  84. {sglang-0.1.17 → sglang-0.1.21}/sglang/backend/base_backend.py +0 -0
  85. {sglang-0.1.17 → sglang-0.1.21}/sglang/backend/vertexai.py +0 -0
  86. {sglang-0.1.17 → sglang-0.1.21}/sglang/lang/__init__.py +0 -0
  87. {sglang-0.1.17 → sglang-0.1.21}/sglang/lang/tracer.py +0 -0
  88. {sglang-0.1.17 → sglang-0.1.21}/sglang/srt/constrained/base_cache.py +0 -0
  89. {sglang-0.1.17 → sglang-0.1.21}/sglang/srt/layers/context_flashattention_nopad.py +0 -0
  90. {sglang-0.1.17 → sglang-0.1.21}/sglang/srt/mm_utils.py +0 -0
  91. {sglang-0.1.17 → sglang-0.1.21}/sglang/srt/models/dbrx.py +1 -1
  92. {sglang-0.1.17 → sglang-0.1.21}/sglang/srt/models/mistral.py +0 -0
  93. {sglang-0.1.17 → sglang-0.1.21}/sglang/srt/sampling_params.py +0 -0
  94. {sglang-0.1.17 → sglang-0.1.21}/sglang/test/test_conversation.py +0 -0
  95. {sglang-0.1.17 → sglang-0.1.21}/sglang/test/test_openai_protocol.py +0 -0
  96. {sglang-0.1.17 → sglang-0.1.21}/sglang/test/test_utils.py +0 -0
  97. {sglang-0.1.17 → sglang-0.1.21}/sglang.egg-info/dependency_links.txt +0 -0
  98. {sglang-0.1.17 → sglang-0.1.21}/sglang.egg-info/top_level.txt +0 -0
{sglang-0.1.17/sglang.egg-info → sglang-0.1.21}/PKG-INFO

@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: sglang
- Version: 0.1.17
+ Version: 0.1.21
  Summary: A structured generation langauge for LLMs.
  License: Apache License
  Version 2.0, January 2004
@@ -213,30 +213,29 @@ Description-Content-Type: text/markdown
  License-File: LICENSE
  Requires-Dist: requests
  Requires-Dist: tqdm
+ Requires-Dist: numpy
  Provides-Extra: srt
  Requires-Dist: aiohttp; extra == "srt"
  Requires-Dist: fastapi; extra == "srt"
+ Requires-Dist: hf_transfer; extra == "srt"
+ Requires-Dist: huggingface_hub; extra == "srt"
+ Requires-Dist: interegular; extra == "srt"
+ Requires-Dist: packaging; extra == "srt"
+ Requires-Dist: pillow; extra == "srt"
  Requires-Dist: psutil; extra == "srt"
+ Requires-Dist: pydantic; extra == "srt"
  Requires-Dist: rpyc; extra == "srt"
  Requires-Dist: torch; extra == "srt"
- Requires-Dist: uvloop; extra == "srt"
  Requires-Dist: uvicorn; extra == "srt"
+ Requires-Dist: uvloop; extra == "srt"
  Requires-Dist: zmq; extra == "srt"
- Requires-Dist: vllm==0.4.3; extra == "srt"
- Requires-Dist: interegular; extra == "srt"
- Requires-Dist: pydantic; extra == "srt"
- Requires-Dist: pillow; extra == "srt"
- Requires-Dist: packaging; extra == "srt"
- Requires-Dist: huggingface_hub; extra == "srt"
- Requires-Dist: hf_transfer; extra == "srt"
- Requires-Dist: outlines>=0.0.34; extra == "srt"
+ Requires-Dist: vllm==0.5.1; extra == "srt"
+ Requires-Dist: outlines>=0.0.44; extra == "srt"
  Provides-Extra: openai
  Requires-Dist: openai>=1.0; extra == "openai"
- Requires-Dist: numpy; extra == "openai"
  Requires-Dist: tiktoken; extra == "openai"
  Provides-Extra: anthropic
  Requires-Dist: anthropic>=0.20.0; extra == "anthropic"
- Requires-Dist: numpy; extra == "anthropic"
  Provides-Extra: litellm
  Requires-Dist: litellm>=1.0.0; extra == "litellm"
  Provides-Extra: all
@@ -257,8 +256,8 @@ SGLang is a structured generation language designed for large language models (L
  It makes your interaction with LLMs faster and more controllable by co-designing the frontend language and the runtime system.
  
  The core features include:
- - **A Flexible Front-End Language**: This allows for easy programming of LLM applications with multiple chained generation calls, advanced prompting techniques, control flow, multiple modalities, parallelism, and external interaction.
- - **A High-Performance Runtime with RadixAttention**: This feature significantly accelerates the execution of complex LLM programs by automatically reusing the KV cache across multiple calls. It can also be used as a standalone serving engine with all common techniques implemented, such as continuous batching and tensor parallelism.
+ - **Flexible Frontend Language**: Enables easy programming of LLM applications with chained generation calls, advanced prompting, control flow, multiple modalities, parallelism, and external interactions.
+ - **High-Performance Backend Runtime**: Features RadixAttention for accelerating complex LLM programs by reusing the KV cache across multiple calls. It can also serve as a standalone inference engine with all common techniques implemented (e.g., continuous batching and tensor parallelism).
  
  ## News
  - [2024/02] 🔥 SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).
@@ -279,19 +278,33 @@ The core features include:
  ### Method 1: With pip
  ```
  pip install "sglang[all]"
+ 
+ # Install FlashInfer CUDA kernels
+ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
  ```
  
  ### Method 2: From source
  ```
- git clone git@github.com:sgl-project/sglang.git
+ git clone https://github.com/sgl-project/sglang.git
  cd sglang
  
- pip install --upgrade pip
  pip install -e "python[all]"
+ 
+ # Install FlashInfer CUDA kernels
+ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
  ```
  
- ### Notes
- - If you only need to use the OpenAI backend, you can avoid installing other dependencies by using `pip install "sglang[openai]"`
+ ### Method 3: Using docker
+ The docker images are available on Docker Hub as [lmsysorg/sglang](https://hub.docker.com/r/lmsysorg/sglang/tags).
+ 
+ ### Common Notes
+ - If you see errors from the Triton compiler, please install the [Triton Nightly](https://triton-lang.org/main/getting-started/installation.html) by
+ ```
+ pip uninstall -y triton triton-nightly
+ pip install -U --index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/Triton-Nightly/pypi/simple/ triton-nightly
+ ```
+ - If you cannot install FlashInfer, check out its [installation](https://docs.flashinfer.ai/installation.html#) page. If you still cannot install it, you can use the slower Triton kernels by adding `--disable-flashinfer` when launching the server.
+ - If you only need to use the OpenAI backend, you can avoid installing other dependencies by using `pip install "sglang[openai]"`.
  
  ## Quick Start
  The example below shows how to use sglang to answer a mulit-turn question.
@@ -511,8 +524,8 @@ for out in state.text_iter():
  ```
  
  ### Tips and Implementation Details
- - The `choices` argument in `sgl.gen` is implemented by computing the normalized log probabilities of all choices and selecting the one with the highest probability.
- - The `regex` argument in `sgl.gen` is implemented through autoregressive decoding with logit bias masking, according to the constraints set by the regex.
+ - The `choices` argument in `sgl.gen` is implemented by computing the [token-length normalized log probabilities](https://blog.eleuther.ai/multiple-choice-normalization/) of all choices and selecting the one with the highest probability.
+ - The `regex` argument in `sgl.gen` is implemented through autoregressive decoding with logit bias masking, according to the constraints set by the regex. It is compatible with `temperature=0` and `temperature != 0`.
  
  ## Backend: SGLang Runtime (SRT)
  The SGLang Runtime (SRT) is designed to work best with the SGLang frontend.
@@ -569,7 +582,6 @@ response = client.chat.completions.create(
  print(response)
  ```
  
- 
  By default, the server uses the chat template specified in the model tokenizer from Hugging Face. It should just work for most official models such as Llama-2/Llama-3.
  
  If needed, you can also override the chat template when launching the server:
@@ -598,7 +610,7 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
  ```
  
  ### Additional Arguments
- - Add `--tp 2` to enable tensor parallelism.
+ - Add `--tp 2` to enable tensor parallelism. If it indicates `peer access is not supported between these two devices`, add `--enable-p2p-check` option.
  ```
  python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --tp 2
  ```
@@ -610,16 +622,22 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
  ```
  python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --mem-fraction-static 0.7
  ```
- - See [flashinfer.md](docs/flashinfer.md) on accelerating inference using highly optimized CUDA kernels.
  - See [hyperparameter_tuning.md](docs/hyperparameter_tuning.md) on tuning hyperparameters for better performance.
+ - Add `--nnodes 2` to run tensor parallelism on multiple nodes. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-1` be the hostname of the first node and `50000` be an available port.
+ ```
+ # Node 0
+ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --tp 4 --nccl-init sgl-dev-1:50000 --nnodes 2 --node-rank 0
+ 
+ # Node 1
+ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --tp 4 --nccl-init sgl-dev-1:50000 --nnodes 2 --node-rank 1
+ ```
  
  ### Supported Models
  - Llama
  - Mistral
  - Mixtral
- - Qwen / Qwen 2
- - Gemma
-   - Please add a new flag `--attention-reduce-in-fp32` to avoid some precision errors.
+ - Qwen / Qwen 2 / Qwen 2 MoE
+ - Gemma / Gemma 2
    - `python -m sglang.launch_server --model-path google/gemma-7b-it --port 30000 --attention-reduce-in-fp32`
  - LLaVA
    - `python3 -m sglang.launch_server --model-path liuhaotian/llava-v1.5-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --chat-template vicuna_v1.1 --port 30000`
@@ -632,6 +650,8 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
  - StableLM
  - Command-R
  - DBRX
+ - Grok
+ - ChatGLM
  - AWQ/GPTQ/Marlin quantization
  
  Instructions for supporting a new model are [here](https://github.com/sgl-project/sglang/blob/main/docs/model_support.md).
@@ -643,17 +663,18 @@ Instructions for supporting a new model are [here](https://github.com/sgl-projec
  - Mixtral-8x7B on NVIDIA A10G, FP16, Tensor Parallelism=8
  ![mixtral_8x7b](assets/mixtral_8x7b.jpg)
  
- Learn more [here](docs/benchmark_results.md).
+ - Learn more about the above [results](docs/benchmark_results.md).
+ - Synthetic latency and throughput benchmark [scripts](https://github.com/sgl-project/sglang/tree/main/benchmark/latency_throughput).
  
  ## Roadmap
  https://github.com/sgl-project/sglang/issues/157
  
  ## Citation And Acknowledgment
  ```
- @misc{zheng2023efficiently,
-       title={Efficiently Programming Large Language Models using SGLang},
-       author={Lianmin Zheng and Liangsheng Yin and Zhiqiang Xie and Jeff Huang and Chuyue Sun and Cody Hao Yu and Shiyi Cao and Christos Kozyrakis and Ion Stoica and Joseph E. Gonzalez and Clark Barrett and Ying Sheng},
-       year={2023},
+ @misc{zheng2024sglang,
+       title={SGLang: Efficient Execution of Structured Language Model Programs},
+       author={Lianmin Zheng and Liangsheng Yin and Zhiqiang Xie and Chuyue Sun and Jeff Huang and Cody Hao Yu and Shiyi Cao and Christos Kozyrakis and Ion Stoica and Joseph E. Gonzalez and Clark Barrett and Ying Sheng},
+       year={2024},
        eprint={2312.07104},
        archivePrefix={arXiv},
        primaryClass={cs.AI}
{sglang-0.1.17 → sglang-0.1.21}/README.md

@@ -10,8 +10,8 @@ SGLang is a structured generation language designed for large language models (L
  It makes your interaction with LLMs faster and more controllable by co-designing the frontend language and the runtime system.
  
  The core features include:
- - **A Flexible Front-End Language**: This allows for easy programming of LLM applications with multiple chained generation calls, advanced prompting techniques, control flow, multiple modalities, parallelism, and external interaction.
- - **A High-Performance Runtime with RadixAttention**: This feature significantly accelerates the execution of complex LLM programs by automatically reusing the KV cache across multiple calls. It can also be used as a standalone serving engine with all common techniques implemented, such as continuous batching and tensor parallelism.
+ - **Flexible Frontend Language**: Enables easy programming of LLM applications with chained generation calls, advanced prompting, control flow, multiple modalities, parallelism, and external interactions.
+ - **High-Performance Backend Runtime**: Features RadixAttention for accelerating complex LLM programs by reusing the KV cache across multiple calls. It can also serve as a standalone inference engine with all common techniques implemented (e.g., continuous batching and tensor parallelism).
  
  ## News
  - [2024/02] 🔥 SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).
@@ -32,19 +32,33 @@ The core features include:
  ### Method 1: With pip
  ```
  pip install "sglang[all]"
+ 
+ # Install FlashInfer CUDA kernels
+ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
  ```
  
  ### Method 2: From source
  ```
- git clone git@github.com:sgl-project/sglang.git
+ git clone https://github.com/sgl-project/sglang.git
  cd sglang
  
- pip install --upgrade pip
  pip install -e "python[all]"
+ 
+ # Install FlashInfer CUDA kernels
+ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
  ```
  
- ### Notes
- - If you only need to use the OpenAI backend, you can avoid installing other dependencies by using `pip install "sglang[openai]"`
+ ### Method 3: Using docker
+ The docker images are available on Docker Hub as [lmsysorg/sglang](https://hub.docker.com/r/lmsysorg/sglang/tags).
+ 
+ ### Common Notes
+ - If you see errors from the Triton compiler, please install the [Triton Nightly](https://triton-lang.org/main/getting-started/installation.html) by
+ ```
+ pip uninstall -y triton triton-nightly
+ pip install -U --index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/Triton-Nightly/pypi/simple/ triton-nightly
+ ```
+ - If you cannot install FlashInfer, check out its [installation](https://docs.flashinfer.ai/installation.html#) page. If you still cannot install it, you can use the slower Triton kernels by adding `--disable-flashinfer` when launching the server.
+ - If you only need to use the OpenAI backend, you can avoid installing other dependencies by using `pip install "sglang[openai]"`.
  
  ## Quick Start
  The example below shows how to use sglang to answer a mulit-turn question.
@@ -264,8 +278,8 @@ for out in state.text_iter():
  ```
  
  ### Tips and Implementation Details
- - The `choices` argument in `sgl.gen` is implemented by computing the normalized log probabilities of all choices and selecting the one with the highest probability.
- - The `regex` argument in `sgl.gen` is implemented through autoregressive decoding with logit bias masking, according to the constraints set by the regex.
+ - The `choices` argument in `sgl.gen` is implemented by computing the [token-length normalized log probabilities](https://blog.eleuther.ai/multiple-choice-normalization/) of all choices and selecting the one with the highest probability.
+ - The `regex` argument in `sgl.gen` is implemented through autoregressive decoding with logit bias masking, according to the constraints set by the regex. It is compatible with `temperature=0` and `temperature != 0`.
  
  ## Backend: SGLang Runtime (SRT)
  The SGLang Runtime (SRT) is designed to work best with the SGLang frontend.
@@ -322,7 +336,6 @@ response = client.chat.completions.create(
  print(response)
  ```
  
- 
  By default, the server uses the chat template specified in the model tokenizer from Hugging Face. It should just work for most official models such as Llama-2/Llama-3.
  
  If needed, you can also override the chat template when launching the server:
@@ -351,7 +364,7 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
  ```
  
  ### Additional Arguments
- - Add `--tp 2` to enable tensor parallelism.
+ - Add `--tp 2` to enable tensor parallelism. If it indicates `peer access is not supported between these two devices`, add `--enable-p2p-check` option.
  ```
  python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --tp 2
  ```
@@ -363,16 +376,22 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
  ```
  python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port 30000 --mem-fraction-static 0.7
  ```
- - See [flashinfer.md](docs/flashinfer.md) on accelerating inference using highly optimized CUDA kernels.
  - See [hyperparameter_tuning.md](docs/hyperparameter_tuning.md) on tuning hyperparameters for better performance.
+ - Add `--nnodes 2` to run tensor parallelism on multiple nodes. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-1` be the hostname of the first node and `50000` be an available port.
+ ```
+ # Node 0
+ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --tp 4 --nccl-init sgl-dev-1:50000 --nnodes 2 --node-rank 0
+ 
+ # Node 1
+ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --tp 4 --nccl-init sgl-dev-1:50000 --nnodes 2 --node-rank 1
+ ```
  
  ### Supported Models
  - Llama
  - Mistral
  - Mixtral
- - Qwen / Qwen 2
- - Gemma
-   - Please add a new flag `--attention-reduce-in-fp32` to avoid some precision errors.
+ - Qwen / Qwen 2 / Qwen 2 MoE
+ - Gemma / Gemma 2
    - `python -m sglang.launch_server --model-path google/gemma-7b-it --port 30000 --attention-reduce-in-fp32`
  - LLaVA
    - `python3 -m sglang.launch_server --model-path liuhaotian/llava-v1.5-7b --tokenizer-path llava-hf/llava-1.5-7b-hf --chat-template vicuna_v1.1 --port 30000`
@@ -385,6 +404,8 @@ python -m sglang.launch_server --model-path meta-llama/Llama-2-7b-chat-hf --port
  - StableLM
  - Command-R
  - DBRX
+ - Grok
+ - ChatGLM
  - AWQ/GPTQ/Marlin quantization
  
  Instructions for supporting a new model are [here](https://github.com/sgl-project/sglang/blob/main/docs/model_support.md).
@@ -396,17 +417,18 @@ Instructions for supporting a new model are [here](https://github.com/sgl-projec
  - Mixtral-8x7B on NVIDIA A10G, FP16, Tensor Parallelism=8
  ![mixtral_8x7b](assets/mixtral_8x7b.jpg)
  
- Learn more [here](docs/benchmark_results.md).
+ - Learn more about the above [results](docs/benchmark_results.md).
+ - Synthetic latency and throughput benchmark [scripts](https://github.com/sgl-project/sglang/tree/main/benchmark/latency_throughput).
  
  ## Roadmap
  https://github.com/sgl-project/sglang/issues/157
  
  ## Citation And Acknowledgment
  ```
- @misc{zheng2023efficiently,
-       title={Efficiently Programming Large Language Models using SGLang},
-       author={Lianmin Zheng and Liangsheng Yin and Zhiqiang Xie and Jeff Huang and Chuyue Sun and Cody Hao Yu and Shiyi Cao and Christos Kozyrakis and Ion Stoica and Joseph E. Gonzalez and Clark Barrett and Ying Sheng},
-       year={2023},
+ @misc{zheng2024sglang,
+       title={SGLang: Efficient Execution of Structured Language Model Programs},
+       author={Lianmin Zheng and Liangsheng Yin and Zhiqiang Xie and Chuyue Sun and Jeff Huang and Cody Hao Yu and Shiyi Cao and Christos Kozyrakis and Ion Stoica and Joseph E. Gonzalez and Clark Barrett and Ying Sheng},
+       year={2024},
        eprint={2312.07104},
        archivePrefix={arXiv},
        primaryClass={cs.AI}
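The Tips section above (in both the PKG-INFO and README copies) explains how `choices` and `regex` in `sgl.gen` are implemented. A minimal frontend sketch of the two arguments, assuming a local SRT server at `http://localhost:30000` as in the README examples; the prompt wording and option names are illustrative:

```
import sglang as sgl

sgl.set_default_backend(sgl.RuntimeEndpoint("http://localhost:30000"))

@sgl.function
def route(s, question):
    s += "Question: " + question + "\n"
    # choices: selected by token-length normalized log probabilities of each option
    s += "Tool: " + sgl.gen("tool", choices=["calculator", "search", "none"]) + "\n"
    # regex: constrained decoding via logit bias masking (a simple 0-100 integer pattern)
    s += "Confidence (0-100): " + sgl.gen("confidence", regex=r"(100|[0-9]{1,2})")

state = route.run(question="What is 24 * 7?")
print(state["tool"], state["confidence"])
```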
{sglang-0.1.17 → sglang-0.1.21}/pyproject.toml

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
  
  [project]
  name = "sglang"
- version = "0.1.17"
+ version = "0.1.21"
  description = "A structured generation langauge for LLMs."
  readme = "README.md"
  requires-python = ">=3.8"
@@ -16,13 +16,14 @@ classifiers = [
  dependencies = [
      "requests",
      "tqdm",
+     "numpy",
  ]
  
  [project.optional-dependencies]
- srt = ["aiohttp", "fastapi", "psutil", "rpyc", "torch", "uvloop", "uvicorn",
-        "zmq", "vllm==0.4.3", "interegular", "pydantic", "pillow", "packaging", "huggingface_hub", "hf_transfer", "outlines>=0.0.34"]
- openai = ["openai>=1.0", "numpy", "tiktoken"]
- anthropic = ["anthropic>=0.20.0", "numpy"]
+ srt = ["aiohttp", "fastapi", "hf_transfer", "huggingface_hub", "interegular", "packaging", "pillow",
+        "psutil", "pydantic", "rpyc", "torch", "uvicorn", "uvloop", "zmq", "vllm==0.5.1", "outlines>=0.0.44"]
+ openai = ["openai>=1.0", "tiktoken"]
+ anthropic = ["anthropic>=0.20.0"]
  litellm = ["litellm>=1.0.0"]
  all = ["sglang[srt]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
  
{sglang-0.1.17 → sglang-0.1.21}/sglang/__init__.py

@@ -1,4 +1,4 @@
- __version__ = "0.1.17"
+ __version__ = "0.1.21"
  
  # SGL API Components
  from sglang.api import (
@@ -24,10 +24,10 @@ from sglang.api import (
  
  # SGL Backends
  from sglang.backend.anthropic import Anthropic
+ from sglang.backend.litellm import LiteLLM
  from sglang.backend.openai import OpenAI
  from sglang.backend.runtime_endpoint import RuntimeEndpoint
  from sglang.backend.vertexai import VertexAI
- from sglang.backend.litellm import LiteLLM
  
  # Global Configurations
  from sglang.global_config import global_config
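This change only reorders imports; `LiteLLM` remains exported at the package top level, so it can be set as the default backend directly. A hedged sketch, assuming `pip install "sglang[litellm]"` and that the model name and provider credentials (read by litellm from environment variables) are placeholders:

```
import sglang as sgl

# LiteLLM backend routes requests through the litellm package; the model name is an example.
backend = sgl.LiteLLM("gpt-3.5-turbo")
sgl.set_default_backend(backend)
```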
{sglang-0.1.17 → sglang-0.1.21}/sglang/api.py

@@ -1,4 +1,4 @@
- """Some Public API Definitions"""
+ """Public APIs of the language."""
  
  import os
  import re
@@ -43,14 +43,14 @@ def set_default_backend(backend: BaseBackend):
      global_config.default_backend = backend
  
  
- def flush_cache(backend: BaseBackend = None):
+ def flush_cache(backend: Optional[BaseBackend] = None):
      backend = backend or global_config.default_backend
      if backend is None:
          return False
      return backend.flush_cache()
  
  
- def get_server_args(backend: BaseBackend = None):
+ def get_server_args(backend: Optional[BaseBackend] = None):
      backend = backend or global_config.default_backend
      if backend is None:
          return None
@@ -67,10 +67,16 @@ def gen(
      frequency_penalty: Optional[float] = None,
      presence_penalty: Optional[float] = None,
      ignore_eos: Optional[bool] = None,
+     return_logprob: Optional[bool] = None,
+     logprob_start_len: Optional[int] = None,
+     top_logprobs_num: Optional[int] = None,
+     return_text_in_logprobs: Optional[bool] = None,
      dtype: Optional[type] = None,
      choices: Optional[List[str]] = None,
      regex: Optional[str] = None,
  ):
+     """Call the model to generate. See the meaning of the arguments in docs/sampling_params.md"""
+ 
      if choices:
          return SglSelect(name, choices, 0.0 if temperature is None else temperature)
  
@@ -91,6 +97,10 @@ def gen(
          frequency_penalty,
          presence_penalty,
          ignore_eos,
+         return_logprob,
+         logprob_start_len,
+         top_logprobs_num,
+         return_text_in_logprobs,
          dtype,
          regex,
      )
@@ -106,6 +116,10 @@ def gen_int(
      frequency_penalty: Optional[float] = None,
      presence_penalty: Optional[float] = None,
      ignore_eos: Optional[bool] = None,
+     return_logprob: Optional[bool] = None,
+     logprob_start_len: Optional[int] = None,
+     top_logprobs_num: Optional[int] = None,
+     return_text_in_logprobs: Optional[bool] = None,
  ):
      return SglGen(
          name,
@@ -117,6 +131,10 @@ def gen_int(
          frequency_penalty,
          presence_penalty,
          ignore_eos,
+         return_logprob,
+         logprob_start_len,
+         top_logprobs_num,
+         return_text_in_logprobs,
          int,
          None,
      )
@@ -132,6 +150,10 @@ def gen_string(
      frequency_penalty: Optional[float] = None,
      presence_penalty: Optional[float] = None,
      ignore_eos: Optional[bool] = None,
+     return_logprob: Optional[bool] = None,
+     logprob_start_len: Optional[int] = None,
+     top_logprobs_num: Optional[int] = None,
+     return_text_in_logprobs: Optional[bool] = None,
  ):
      return SglGen(
          name,
@@ -143,6 +165,10 @@ def gen_string(
          frequency_penalty,
          presence_penalty,
          ignore_eos,
+         return_logprob,
+         logprob_start_len,
+         top_logprobs_num,
+         return_text_in_logprobs,
          str,
          None,
      )
@@ -158,7 +184,7 @@ def video(path: str, num_frames: int):
  
  def select(
      name: Optional[str] = None,
-     choices: List[str] = None,
+     choices: Optional[List[str]] = None,
      temperature: float = 0.0,
  ):
      assert choices is not None
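The recurring addition in this file is the four logprob-related parameters threaded from `gen`/`gen_int`/`gen_string` into `SglGen`. A hedged usage sketch against a local SRT endpoint; the exact shape of the returned logprob data depends on the server version, and `get_meta_info` as the retrieval path is an assumption:

```
import sglang as sgl

sgl.set_default_backend(sgl.RuntimeEndpoint("http://localhost:30000"))

@sgl.function
def answer(s, question):
    s += "Q: " + question + "\nA:"
    s += sgl.gen(
        "answer",
        max_tokens=64,
        return_logprob=True,          # new parameters in the 0.1.21 gen() signature
        top_logprobs_num=5,
        return_text_in_logprobs=True,
    )

state = answer.run(question="What is RadixAttention?")
print(state["answer"])
# Logprobs are surfaced through the generation's meta info (assumed accessor).
print(state.get_meta_info("answer"))
```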
{sglang-0.1.17 → sglang-0.1.21}/sglang/backend/litellm.py

@@ -13,7 +13,6 @@ except ImportError as e:
  
  
  class LiteLLM(BaseBackend):
- 
      def __init__(
          self,
          model_name,
@@ -33,7 +32,8 @@ class LiteLLM(BaseBackend):
          self.model_name = model_name
  
          self.chat_template = chat_template or get_chat_template_by_model_path(
-             model_name)
+             model_name
+         )
  
          self.client_params = {
              "api_key": api_key,
{sglang-0.1.17 → sglang-0.1.21}/sglang/backend/openai.py

@@ -1,7 +1,7 @@
+ import dataclasses
  import logging
  import time
  import warnings
- import dataclasses
  from typing import Callable, List, Optional, Union
  
  import numpy as np
@@ -105,14 +105,16 @@ class OpenAI(BaseBackend):
      def get_chat_template(self):
          return self.chat_template
  
-     def _prepare_spec_execution(self, sampling_params: SglSamplingParams,
-                                 num_api_spec_tokens: int, spec_var_name: str):
+     def _prepare_spec_execution(
+         self,
+         sampling_params: SglSamplingParams,
+         num_api_spec_tokens: int,
+         spec_var_name: str,
+     ):
          if "max_tokens" not in self.spec_kwargs:
              self.spec_kwargs["max_tokens"] = num_api_spec_tokens
          else:
-             assert (
-                 self.spec_kwargs["max_tokens"] == num_api_spec_tokens
-             )
+             assert self.spec_kwargs["max_tokens"] == num_api_spec_tokens
  
          params = sampling_params.to_openai_kwargs()
          for key, value in params.items():
@@ -151,8 +153,9 @@
                  )
                  prompt = s.messages_
              else:
-                 return self._prepare_spec_execution(sampling_params,
-                     s.num_api_spec_tokens, spec_var_name)
+                 return self._prepare_spec_execution(
+                     sampling_params, s.num_api_spec_tokens, spec_var_name
+                 )
          else:
              prompt = s.text_
  
@@ -325,7 +328,7 @@
          ret_str = ret.choices[0].text
          ret_token = self.tokenizer.encode(ret_str)[0]
          self.token_usage.prompt_tokens += ret.usage.prompt_tokens
-         self.token_usage.completion_tokens= ret.usage.completion_tokens
+         self.token_usage.completion_tokens = ret.usage.completion_tokens
  
          # TODO:
          # 1. return logits as the scores
@@ -355,7 +358,9 @@
          return decision, scores, None, None
  
  
- def openai_completion(client, token_usage, is_chat=None, retries=3, prompt=None, **kwargs):
+ def openai_completion(
+     client, token_usage, is_chat=None, retries=3, prompt=None, **kwargs
+ ):
      for attempt in range(retries):
          try:
              if is_chat:
@@ -385,15 +390,19 @@ def openai_completion(client, token_usage, is_chat=None, retries=3, prompt=None,
      return comp
  
  
- def openai_completion_stream(client, token_usage, is_chat=None, retries=3, prompt=None, **kwargs):
+ def openai_completion_stream(
+     client, token_usage, is_chat=None, retries=3, prompt=None, **kwargs
+ ):
      for attempt in range(retries):
          try:
              if is_chat:
                  if "stop" in kwargs and kwargs["stop"] is None:
                      kwargs.pop("stop")
                  generator = client.chat.completions.create(
-                     messages=prompt, stream=True, stream_options={"include_usage": True},
-                     **kwargs
+                     messages=prompt,
+                     stream=True,
+                     stream_options={"include_usage": True},
+                     **kwargs,
                  )
                  for ret in generator:
                      if len(ret.choices) == 0:
@@ -405,8 +414,10 @@ def openai_completion_stream(client, token_usage, is_chat=None, retries=3, promp
                      yield content or "", {}
              else:
                  generator = client.completions.create(
-                     prompt=prompt, stream=True, stream_options={"include_usage": True},
-                     **kwargs
+                     prompt=prompt,
+                     stream=True,
+                     stream_options={"include_usage": True},
+                     **kwargs,
                  )
                  for ret in generator:
                      if len(ret.choices) == 0:
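Beyond reformatting, the substantive point in this file is that streamed OpenAI calls pass `stream_options={"include_usage": True}` so token usage can be accumulated from the final streamed event. A standalone sketch of that pattern with the `openai` Python client; the model name is an example and an `OPENAI_API_KEY` environment variable is assumed:

```
from openai import OpenAI

client = OpenAI()  # reads OPENAI_API_KEY from the environment

stream = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Say hello."}],
    stream=True,
    stream_options={"include_usage": True},
)
for chunk in stream:
    if chunk.choices:
        # normal content deltas
        print(chunk.choices[0].delta.content or "", end="")
    elif chunk.usage:
        # with include_usage, the final chunk has empty choices and carries usage totals
        print("\ncompletion tokens:", chunk.usage.completion_tokens)
```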
{sglang-0.1.17 → sglang-0.1.21}/sglang/backend/runtime_endpoint.py

@@ -1,15 +1,14 @@
  import json
- from typing import Callable, List, Optional, Union
+ from typing import List, Optional
  
  import numpy as np
- import requests
  
  from sglang.backend.base_backend import BaseBackend
  from sglang.global_config import global_config
  from sglang.lang.chat_template import get_chat_template_by_model_path
  from sglang.lang.interpreter import StreamExecutor
- from sglang.lang.ir import SglArgument, SglSamplingParams
- from sglang.utils import encode_image_base64, find_printable_text, http_request
+ from sglang.lang.ir import SglSamplingParams
+ from sglang.utils import http_request
  
  
  class RuntimeEndpoint(BaseBackend):
@@ -125,6 +124,16 @@ class RuntimeEndpoint(BaseBackend):
          else:
              raise RuntimeError(f"Invalid dtype: {sampling_params.dtype}")
  
+         for item in [
+             "return_logprob",
+             "logprob_start_len",
+             "top_logprobs_num",
+             "return_text_in_logprobs",
+         ]:
+             value = getattr(sampling_params, item, None)
+             if value is not None:
+                 data[item] = value
+ 
          self._add_images(s, data)
  
          res = http_request(
@@ -167,6 +176,16 @@ class RuntimeEndpoint(BaseBackend):
          else:
              raise RuntimeError(f"Invalid dtype: {sampling_params.dtype}")
  
+         for item in [
+             "return_logprob",
+             "logprob_start_len",
+             "top_logprobs_num",
+             "return_text_in_logprobs",
+         ]:
+             value = getattr(sampling_params, item, None)
+             if value is not None:
+                 data[item] = value
+ 
          data["stream"] = True
          self._add_images(s, data)
  
@@ -181,21 +200,16 @@
          self._assert_success(res)
          pos = 0
  
-         incomplete_text = ""
          for chunk in res.iter_lines(decode_unicode=False):
              chunk = chunk.decode("utf-8")
              if chunk and chunk.startswith("data:"):
                  if chunk == "data: [DONE]":
                      break
                  data = json.loads(chunk[5:].strip("\n"))
-                 text = find_printable_text(data["text"][pos:])
+                 chunk_text = data["text"][pos:]
                  meta_info = data["meta_info"]
-                 pos += len(text)
-                 incomplete_text = data["text"][pos:]
-                 yield text, meta_info
- 
-         if len(incomplete_text) > 0:
-             yield incomplete_text, meta_info
+                 pos += len(chunk_text)
+                 yield chunk_text, meta_info
  
      def select(
          self,
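The last hunk changes `generate_stream` to yield each decoded chunk as soon as it arrives, instead of buffering partial text with `find_printable_text`. From the frontend, streaming consumption is unchanged; a minimal sketch, assuming a local SRT server on port 30000:

```
import sglang as sgl

sgl.set_default_backend(sgl.RuntimeEndpoint("http://localhost:30000"))

@sgl.function
def story(s, topic):
    s += "Tell me a short story about " + topic + ".\n"
    s += sgl.gen("story", max_tokens=128)

# stream=True returns immediately; text_iter yields the chunks produced by
# RuntimeEndpoint.generate_stream as they come in.
state = story.run(topic="a GPU cluster", stream=True)
for chunk in state.text_iter():
    print(chunk, end="", flush=True)
```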