sglang 0.2.0__tar.gz → 0.2.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (98)
  1. {sglang-0.2.0/sglang.egg-info → sglang-0.2.1}/PKG-INFO +28 -14
  2. {sglang-0.2.0 → sglang-0.2.1}/README.md +27 -13
  3. {sglang-0.2.0 → sglang-0.2.1}/pyproject.toml +1 -1
  4. {sglang-0.2.0 → sglang-0.2.1}/sglang/bench_serving.py +3 -3
  5. {sglang-0.2.0 → sglang-0.2.1}/sglang/global_config.py +1 -1
  6. {sglang-0.2.0 → sglang-0.2.1}/sglang/srt/managers/controller/model_runner.py +1 -1
  7. {sglang-0.2.0 → sglang-0.2.1}/sglang/srt/managers/io_struct.py +4 -1
  8. {sglang-0.2.0 → sglang-0.2.1}/sglang/srt/openai_api/adapter.py +6 -1
  9. {sglang-0.2.0 → sglang-0.2.1}/sglang/srt/utils.py +1 -0
  10. sglang-0.2.1/sglang/version.py +1 -0
  11. {sglang-0.2.0 → sglang-0.2.1/sglang.egg-info}/PKG-INFO +28 -14
  12. sglang-0.2.0/sglang/version.py +0 -1
  13. {sglang-0.2.0 → sglang-0.2.1}/LICENSE +0 -0
  14. {sglang-0.2.0 → sglang-0.2.1}/setup.cfg +0 -0
  15. {sglang-0.2.0 → sglang-0.2.1}/sglang/__init__.py +0 -0
  16. {sglang-0.2.0 → sglang-0.2.1}/sglang/api.py +0 -0
  17. {sglang-0.2.0 → sglang-0.2.1}/sglang/bench_latency.py +0 -0
  18. {sglang-0.2.0 → sglang-0.2.1}/sglang/check_env.py +0 -0
  19. {sglang-0.2.0 → sglang-0.2.1}/sglang/lang/__init__.py +0 -0
  20. {sglang-0.2.0 → sglang-0.2.1}/sglang/lang/backend/__init__.py +0 -0
  21. {sglang-0.2.0 → sglang-0.2.1}/sglang/lang/backend/anthropic.py +0 -0
  22. {sglang-0.2.0 → sglang-0.2.1}/sglang/lang/backend/base_backend.py +0 -0
  23. {sglang-0.2.0 → sglang-0.2.1}/sglang/lang/backend/litellm.py +0 -0
  24. {sglang-0.2.0 → sglang-0.2.1}/sglang/lang/backend/openai.py +0 -0
  25. {sglang-0.2.0 → sglang-0.2.1}/sglang/lang/backend/runtime_endpoint.py +0 -0
  26. {sglang-0.2.0 → sglang-0.2.1}/sglang/lang/backend/vertexai.py +0 -0
  27. {sglang-0.2.0 → sglang-0.2.1}/sglang/lang/chat_template.py +0 -0
  28. {sglang-0.2.0 → sglang-0.2.1}/sglang/lang/compiler.py +0 -0
  29. {sglang-0.2.0 → sglang-0.2.1}/sglang/lang/interpreter.py +0 -0
  30. {sglang-0.2.0 → sglang-0.2.1}/sglang/lang/ir.py +0 -0
  31. {sglang-0.2.0 → sglang-0.2.1}/sglang/lang/tracer.py +0 -0
  32. {sglang-0.2.0 → sglang-0.2.1}/sglang/launch_server.py +0 -0
  33. {sglang-0.2.0 → sglang-0.2.1}/sglang/launch_server_llavavid.py +0 -0
  34. {sglang-0.2.0 → sglang-0.2.1}/sglang/srt/constrained/__init__.py +0 -0
  35. {sglang-0.2.0 → sglang-0.2.1}/sglang/srt/constrained/base_cache.py +0 -0
  36. {sglang-0.2.0 → sglang-0.2.1}/sglang/srt/constrained/fsm_cache.py +0 -0
  37. {sglang-0.2.0 → sglang-0.2.1}/sglang/srt/constrained/jump_forward.py +0 -0
  38. {sglang-0.2.0 → sglang-0.2.1}/sglang/srt/conversation.py +0 -0
  39. {sglang-0.2.0 → sglang-0.2.1}/sglang/srt/flush_cache.py +0 -0
  40. {sglang-0.2.0 → sglang-0.2.1}/sglang/srt/hf_transformers_utils.py +0 -0
  41. {sglang-0.2.0 → sglang-0.2.1}/sglang/srt/layers/context_flashattention_nopad.py +0 -0
  42. {sglang-0.2.0 → sglang-0.2.1}/sglang/srt/layers/extend_attention.py +0 -0
  43. {sglang-0.2.0 → sglang-0.2.1}/sglang/srt/layers/fused_moe.py +0 -0
  44. {sglang-0.2.0 → sglang-0.2.1}/sglang/srt/layers/linear.py +0 -0
  45. {sglang-0.2.0 → sglang-0.2.1}/sglang/srt/layers/logits_processor.py +0 -0
  46. {sglang-0.2.0 → sglang-0.2.1}/sglang/srt/layers/quantization/__init__.py +0 -0
  47. {sglang-0.2.0 → sglang-0.2.1}/sglang/srt/layers/quantization/fp8.py +0 -0
  48. {sglang-0.2.0 → sglang-0.2.1}/sglang/srt/layers/radix_attention.py +0 -0
  49. {sglang-0.2.0 → sglang-0.2.1}/sglang/srt/layers/token_attention.py +0 -0
  50. {sglang-0.2.0 → sglang-0.2.1}/sglang/srt/managers/controller/cuda_graph_runner.py +0 -0
  51. {sglang-0.2.0 → sglang-0.2.1}/sglang/srt/managers/controller/infer_batch.py +0 -0
  52. {sglang-0.2.0 → sglang-0.2.1}/sglang/srt/managers/controller/manager_multi.py +0 -0
  53. {sglang-0.2.0 → sglang-0.2.1}/sglang/srt/managers/controller/manager_single.py +0 -0
  54. {sglang-0.2.0 → sglang-0.2.1}/sglang/srt/managers/controller/radix_cache.py +0 -0
  55. {sglang-0.2.0 → sglang-0.2.1}/sglang/srt/managers/controller/schedule_heuristic.py +0 -0
  56. {sglang-0.2.0 → sglang-0.2.1}/sglang/srt/managers/controller/tp_worker.py +0 -0
  57. {sglang-0.2.0 → sglang-0.2.1}/sglang/srt/managers/detokenizer_manager.py +0 -0
  58. {sglang-0.2.0 → sglang-0.2.1}/sglang/srt/managers/tokenizer_manager.py +0 -0
  59. {sglang-0.2.0 → sglang-0.2.1}/sglang/srt/memory_pool.py +0 -0
  60. {sglang-0.2.0 → sglang-0.2.1}/sglang/srt/mm_utils.py +0 -0
  61. {sglang-0.2.0 → sglang-0.2.1}/sglang/srt/model_config.py +0 -0
  62. {sglang-0.2.0 → sglang-0.2.1}/sglang/srt/model_loader/model_loader.py +0 -0
  63. {sglang-0.2.0 → sglang-0.2.1}/sglang/srt/model_loader/utils.py +0 -0
  64. {sglang-0.2.0 → sglang-0.2.1}/sglang/srt/models/chatglm.py +0 -0
  65. {sglang-0.2.0 → sglang-0.2.1}/sglang/srt/models/commandr.py +0 -0
  66. {sglang-0.2.0 → sglang-0.2.1}/sglang/srt/models/dbrx.py +0 -0
  67. {sglang-0.2.0 → sglang-0.2.1}/sglang/srt/models/deepseek.py +0 -0
  68. {sglang-0.2.0 → sglang-0.2.1}/sglang/srt/models/gemma.py +0 -0
  69. {sglang-0.2.0 → sglang-0.2.1}/sglang/srt/models/gemma2.py +0 -0
  70. {sglang-0.2.0 → sglang-0.2.1}/sglang/srt/models/gpt_bigcode.py +0 -0
  71. {sglang-0.2.0 → sglang-0.2.1}/sglang/srt/models/grok.py +0 -0
  72. {sglang-0.2.0 → sglang-0.2.1}/sglang/srt/models/internlm2.py +0 -0
  73. {sglang-0.2.0 → sglang-0.2.1}/sglang/srt/models/llama2.py +0 -0
  74. {sglang-0.2.0 → sglang-0.2.1}/sglang/srt/models/llama_classification.py +0 -0
  75. {sglang-0.2.0 → sglang-0.2.1}/sglang/srt/models/llava.py +0 -0
  76. {sglang-0.2.0 → sglang-0.2.1}/sglang/srt/models/llavavid.py +0 -0
  77. {sglang-0.2.0 → sglang-0.2.1}/sglang/srt/models/minicpm.py +0 -0
  78. {sglang-0.2.0 → sglang-0.2.1}/sglang/srt/models/mistral.py +0 -0
  79. {sglang-0.2.0 → sglang-0.2.1}/sglang/srt/models/mixtral.py +0 -0
  80. {sglang-0.2.0 → sglang-0.2.1}/sglang/srt/models/mixtral_quant.py +0 -0
  81. {sglang-0.2.0 → sglang-0.2.1}/sglang/srt/models/qwen.py +0 -0
  82. {sglang-0.2.0 → sglang-0.2.1}/sglang/srt/models/qwen2.py +0 -0
  83. {sglang-0.2.0 → sglang-0.2.1}/sglang/srt/models/qwen2_moe.py +0 -0
  84. {sglang-0.2.0 → sglang-0.2.1}/sglang/srt/models/stablelm.py +0 -0
  85. {sglang-0.2.0 → sglang-0.2.1}/sglang/srt/models/yivl.py +0 -0
  86. {sglang-0.2.0 → sglang-0.2.1}/sglang/srt/openai_api/protocol.py +0 -0
  87. {sglang-0.2.0 → sglang-0.2.1}/sglang/srt/sampling_params.py +0 -0
  88. {sglang-0.2.0 → sglang-0.2.1}/sglang/srt/server.py +0 -0
  89. {sglang-0.2.0 → sglang-0.2.1}/sglang/srt/server_args.py +0 -0
  90. {sglang-0.2.0 → sglang-0.2.1}/sglang/test/test_conversation.py +0 -0
  91. {sglang-0.2.0 → sglang-0.2.1}/sglang/test/test_openai_protocol.py +0 -0
  92. {sglang-0.2.0 → sglang-0.2.1}/sglang/test/test_programs.py +0 -0
  93. {sglang-0.2.0 → sglang-0.2.1}/sglang/test/test_utils.py +0 -0
  94. {sglang-0.2.0 → sglang-0.2.1}/sglang/utils.py +0 -0
  95. {sglang-0.2.0 → sglang-0.2.1}/sglang.egg-info/SOURCES.txt +0 -0
  96. {sglang-0.2.0 → sglang-0.2.1}/sglang.egg-info/dependency_links.txt +0 -0
  97. {sglang-0.2.0 → sglang-0.2.1}/sglang.egg-info/requires.txt +0 -0
  98. {sglang-0.2.0 → sglang-0.2.1}/sglang.egg-info/top_level.txt +0 -0
{sglang-0.2.0/sglang.egg-info → sglang-0.2.1}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.2.0
+Version: 0.2.1
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
                                  Version 2.0, January 2004
@@ -249,7 +249,7 @@ Requires-Dist: sglang[litellm]; extra == "all"
 
 --------------------------------------------------------------------------------
 
-| [**Blog**](https://lmsys.org/blog/2024-01-17-sglang/) | [**Paper**](https://arxiv.org/abs/2312.07104) |
+| [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Paper**](https://arxiv.org/abs/2312.07104) |
 
 SGLang is a fast serving framework for large language models and vision language models.
 It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.
@@ -259,13 +259,14 @@ The core features include:
 - **Flexible Frontend Language**: Enables easy programming of LLM applications with chained generation calls, advanced prompting, control flow, multiple modalities, parallelism, and external interactions.
 
 ## News
-- [2024/04] 🔥 SGLang is used by the official **LLaVA-NeXT (video)** release ([blog](https://llava-vl.github.io/blog/2024-04-30-llava-next-video/)).
-- [2024/02] 🔥 SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).
-- [2024/01] SGLang provides up to **5x faster inference** with RadixAttention ([blog](https://lmsys.org/blog/2024-01-17-sglang/)).
+- [2024/07] 🔥 Faster Llama3 Serving with SGLang Runtime (vs. TensorRT-LLM, vLLM) ([blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/)).
+- [2024/04] SGLang is used by the official **LLaVA-NeXT (video)** release ([blog](https://llava-vl.github.io/blog/2024-04-30-llava-next-video/)).
+- [2024/02] SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).
 
 <details>
 <summary>More</summary>
 
+- [2024/01] SGLang provides up to **5x faster inference** with RadixAttention ([blog](https://lmsys.org/blog/2024-01-17-sglang/)).
 - [2024/01] SGLang powers the serving of the official **LLaVA v1.6** release demo ([usage](https://github.com/haotian-liu/LLaVA?tab=readme-ov-file#demo)).
 
 </details>
@@ -302,7 +303,8 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
 ```
 
 ### Method 3: Using docker
-The docker images are available on Docker Hub as [lmsysorg/sglang](https://hub.docker.com/r/lmsysorg/sglang/tags).
+The docker images are available on Docker Hub as [lmsysorg/sglang](https://hub.docker.com/r/lmsysorg/sglang/tags), built from [Dockerfile](docker).
+Repalce `<secret>` below with your huggingface hub [token](https://huggingface.co/docs/hub/en/security-tokens).
 
 ```bash
 docker run --gpus all \
@@ -311,7 +313,7 @@ docker run --gpus all \
     --env "HUGGING_FACE_HUB_TOKEN=<secret>" \
     --ipc=host \
     lmsysorg/sglang:latest \
-    python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B --host 0.0.0.0 --port 30000
+    python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --host 0.0.0.0 --port 30000
 ```
 
 ### Common Notes
@@ -399,6 +401,21 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 - To enable fp8 quantization, you can add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
 - To enable experimental torch.compile support, you can add `--enable-torch-compile`. It accelerates small models on small batch sizes.
 
+### Run Llama 3.1 405B
+
+```bash
+# 2 nodes run 405B fp16
+# replace the `172.16.4.52:20000` with your own first node ip address and port, disable CUDA Graph temporarily
+# on the first node
+GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 0 --disable-cuda-graph --mem-frac 0.75
+
+# on the second
+GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 1 --disable-cuda-graph --mem-frac 0.75
+
+# single node run 405B fp8
+python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct-FP8 --tp 8
+```
+
 ### Supported Models
 
 - Llama / Llama 2 / Llama 3 / Llama 3.1
@@ -656,15 +673,12 @@ for out in state.text_iter():
 - The `choices` argument in `sgl.gen` is implemented by computing the [token-length normalized log probabilities](https://blog.eleuther.ai/multiple-choice-normalization/) of all choices and selecting the one with the highest probability.
 - The `regex` argument in `sgl.gen` is implemented through autoregressive decoding with logit bias masking, according to the constraints set by the regex. It is compatible with `temperature=0` and `temperature != 0`.
 
-## Benchmark And Performance
-- Llama-7B on NVIDIA A10G, FP16, Tensor Parallelism=1
-![llama_7b](assets/llama_7b.jpg)
 
-- Mixtral-8x7B on NVIDIA A10G, FP16, Tensor Parallelism=8
-![mixtral_8x7b](assets/mixtral_8x7b.jpg)
+## Benchmark And Performance
+![8b_throughput](https://lmsys.org/images/blog/sglang_llama3/8b_throughput.svg)
+![70b_fp8_throughput](https://lmsys.org/images/blog/sglang_llama3/70b_fp8_throughput.svg)
 
-- Learn more about the above [results](docs/benchmark_results.md).
-- Synthetic latency and throughput benchmark [scripts](https://github.com/sgl-project/sglang/tree/main/benchmark/latency_throughput).
+Learn more at this [blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/).
 
 ## Roadmap
 [Development Roadmap (2024 Q3)](https://github.com/sgl-project/sglang/issues/634)
{sglang-0.2.0 → sglang-0.2.1}/README.md

@@ -4,7 +4,7 @@
 
 --------------------------------------------------------------------------------
 
-| [**Blog**](https://lmsys.org/blog/2024-01-17-sglang/) | [**Paper**](https://arxiv.org/abs/2312.07104) |
+| [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Paper**](https://arxiv.org/abs/2312.07104) |
 
 SGLang is a fast serving framework for large language models and vision language models.
 It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.
@@ -14,13 +14,14 @@ The core features include:
 - **Flexible Frontend Language**: Enables easy programming of LLM applications with chained generation calls, advanced prompting, control flow, multiple modalities, parallelism, and external interactions.
 
 ## News
-- [2024/04] 🔥 SGLang is used by the official **LLaVA-NeXT (video)** release ([blog](https://llava-vl.github.io/blog/2024-04-30-llava-next-video/)).
-- [2024/02] 🔥 SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).
-- [2024/01] SGLang provides up to **5x faster inference** with RadixAttention ([blog](https://lmsys.org/blog/2024-01-17-sglang/)).
+- [2024/07] 🔥 Faster Llama3 Serving with SGLang Runtime (vs. TensorRT-LLM, vLLM) ([blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/)).
+- [2024/04] SGLang is used by the official **LLaVA-NeXT (video)** release ([blog](https://llava-vl.github.io/blog/2024-04-30-llava-next-video/)).
+- [2024/02] SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).
 
 <details>
 <summary>More</summary>
 
+- [2024/01] SGLang provides up to **5x faster inference** with RadixAttention ([blog](https://lmsys.org/blog/2024-01-17-sglang/)).
 - [2024/01] SGLang powers the serving of the official **LLaVA v1.6** release demo ([usage](https://github.com/haotian-liu/LLaVA?tab=readme-ov-file#demo)).
 
 </details>
@@ -57,7 +58,8 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
 ```
 
 ### Method 3: Using docker
-The docker images are available on Docker Hub as [lmsysorg/sglang](https://hub.docker.com/r/lmsysorg/sglang/tags).
+The docker images are available on Docker Hub as [lmsysorg/sglang](https://hub.docker.com/r/lmsysorg/sglang/tags), built from [Dockerfile](docker).
+Repalce `<secret>` below with your huggingface hub [token](https://huggingface.co/docs/hub/en/security-tokens).
 
 ```bash
 docker run --gpus all \
@@ -66,7 +68,7 @@ docker run --gpus all \
     --env "HUGGING_FACE_HUB_TOKEN=<secret>" \
     --ipc=host \
     lmsysorg/sglang:latest \
-    python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B --host 0.0.0.0 --port 30000
+    python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --host 0.0.0.0 --port 30000
 ```
 
 ### Common Notes
@@ -154,6 +156,21 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 - To enable fp8 quantization, you can add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
 - To enable experimental torch.compile support, you can add `--enable-torch-compile`. It accelerates small models on small batch sizes.
 
+### Run Llama 3.1 405B
+
+```bash
+# 2 nodes run 405B fp16
+# replace the `172.16.4.52:20000` with your own first node ip address and port, disable CUDA Graph temporarily
+# on the first node
+GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 0 --disable-cuda-graph --mem-frac 0.75
+
+# on the second
+GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 1 --disable-cuda-graph --mem-frac 0.75
+
+# single node run 405B fp8
+python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct-FP8 --tp 8
+```
+
 ### Supported Models
 
 - Llama / Llama 2 / Llama 3 / Llama 3.1
@@ -411,15 +428,12 @@ for out in state.text_iter():
 - The `choices` argument in `sgl.gen` is implemented by computing the [token-length normalized log probabilities](https://blog.eleuther.ai/multiple-choice-normalization/) of all choices and selecting the one with the highest probability.
 - The `regex` argument in `sgl.gen` is implemented through autoregressive decoding with logit bias masking, according to the constraints set by the regex. It is compatible with `temperature=0` and `temperature != 0`.
 
-## Benchmark And Performance
-- Llama-7B on NVIDIA A10G, FP16, Tensor Parallelism=1
-![llama_7b](assets/llama_7b.jpg)
 
-- Mixtral-8x7B on NVIDIA A10G, FP16, Tensor Parallelism=8
-![mixtral_8x7b](assets/mixtral_8x7b.jpg)
+## Benchmark And Performance
+![8b_throughput](https://lmsys.org/images/blog/sglang_llama3/8b_throughput.svg)
+![70b_fp8_throughput](https://lmsys.org/images/blog/sglang_llama3/70b_fp8_throughput.svg)
 
-- Learn more about the above [results](docs/benchmark_results.md).
-- Synthetic latency and throughput benchmark [scripts](https://github.com/sgl-project/sglang/tree/main/benchmark/latency_throughput).
+Learn more at this [blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/).
 
 ## Roadmap
 [Development Roadmap (2024 Q3)](https://github.com/sgl-project/sglang/issues/634)
{sglang-0.2.0 → sglang-0.2.1}/pyproject.toml

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "sglang"
-version = "0.2.0"
+version = "0.2.1"
 description = "SGLang is yet another fast serving framework for large language models and vision language models."
 readme = "README.md"
 requires-python = ">=3.8"
{sglang-0.2.0 → sglang-0.2.1}/sglang/bench_serving.py

@@ -369,7 +369,7 @@ def sample_random_requests(
 ) -> List[Tuple[str, int, int]]:
 
     input_lens = np.random.randint(
-        int(input_len * range_ratio),
+        max(int(input_len * range_ratio), 1),
         input_len + 1,
         size=num_prompts,
     )
@@ -415,7 +415,7 @@ def sample_random_requests(
         prompt_token_ids = tokenizer(prompt).input_ids
         prompt_len = len(prompt_token_ids)
 
-        if prompt_len <= input_lens[i]:
+        if prompt_len > input_lens[i]:
             input_ids = prompt_token_ids[: input_lens[i]]
         else:
             ratio = (input_lens[i] + prompt_len - 1) // prompt_len
@@ -935,7 +935,7 @@ if __name__ == "__main__":
     parser.add_argument(
         "--random-range-ratio",
         type=float,
-        default=1.0,
+        default=0.0,
         help="Range of sampled ratio of input/output length, "
        "used only for random dataset.",
    )
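
Taken together, the `bench_serving.py` changes make the random dataset behave as intended: with the new `--random-range-ratio` default of `0.0`, the old lower bound `int(input_len * range_ratio)` evaluates to `0`, so `np.random.randint` could emit zero-length prompts; clamping it to `1` guarantees at least one input token. A minimal sketch of the fixed sampling (names mirror the diff; this is an illustration, not the full benchmark script):

```python
# Minimal sketch of the fixed length sampling in sample_random_requests
# (illustration only, not the full benchmark script).
import numpy as np

def sample_input_lens(input_len: int, range_ratio: float, num_prompts: int) -> np.ndarray:
    # 0.2.0: low bound was int(input_len * range_ratio) -> 0 when range_ratio == 0.0,
    # which allowed zero-length prompts.
    # 0.2.1: clamp the low bound to 1 so every sampled prompt has at least one token.
    low = max(int(input_len * range_ratio), 1)
    return np.random.randint(low, input_len + 1, size=num_prompts)

print(sample_input_lens(input_len=1024, range_ratio=0.0, num_prompts=5))
```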
{sglang-0.2.0 → sglang-0.2.1}/sglang/global_config.py

@@ -17,7 +17,7 @@ class GlobalConfig:
 
         # Runtime constants: New generation token ratio estimation
         self.init_new_token_ratio = 0.7
-        self.base_min_new_token_ratio = 0.2
+        self.base_min_new_token_ratio = 0.1
         self.new_token_ratio_decay = 0.001
         self.new_token_ratio_recovery = 0.05
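
Lowering `base_min_new_token_ratio` from 0.2 to 0.1 gives the runtime a deeper floor to decay toward when estimating how many new tokens running requests will still generate. The snippet below only illustrates how these constants could interact over scheduling steps; the actual decay and recovery logic lives in the controller and may differ:

```python
# Illustration of the constants from the diff interacting over time;
# not sglang's actual scheduler code.
init_new_token_ratio = 0.7
base_min_new_token_ratio = 0.1   # was 0.2 in 0.2.0
new_token_ratio_decay = 0.001

ratio = init_new_token_ratio
for _ in range(1000):
    ratio = max(ratio - new_token_ratio_decay, base_min_new_token_ratio)
print(f"ratio after 1000 steps: {ratio:.2f}")  # settles at the lower 0.1 floor
```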
 
{sglang-0.2.0 → sglang-0.2.1}/sglang/srt/managers/controller/model_runner.py

@@ -121,7 +121,7 @@ class ModelRunner:
             skip_tokenizer_init=True,
         )
 
-        if is_llama3_405b_fp8(self.model_config):
+        if is_llama3_405b_fp8(self.model_config) and self.tp_size <= 8:
             # A temporary hack to fix the num_heads for meta-llama/Meta-Llama-3.1-405B-FP8 checkpoints
             self.model_config.hf_config.num_key_value_heads = 8
             vllm_model_config.hf_config.num_key_value_heads = 8
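
The extra `self.tp_size <= 8` condition restricts the KV-head override to single-node tensor-parallel layouts. A hedged guess at the arithmetic (an inference from the diff, not documented sglang behaviour): once the head count is forced down to 8, tensor-parallel sizes above 8 can no longer split the KV heads evenly across ranks.

```python
# Inference from the diff, not documented sglang behaviour: the override to
# 8 KV heads only divides evenly across ranks for tensor-parallel sizes <= 8.
def kv_heads_per_rank(num_kv_heads: int, tp_size: int) -> float:
    return num_kv_heads / tp_size

for tp in (4, 8, 16):
    print(tp, kv_heads_per_rank(8, tp))  # 2.0, 1.0, 0.5 -> fractional beyond tp=8
```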
{sglang-0.2.0 → sglang-0.2.1}/sglang/srt/managers/io_struct.py

@@ -40,7 +40,10 @@ class GenerateReqInput:
             self.text is not None and self.input_ids is not None
         ):
             raise ValueError("Either text or input_ids should be provided.")
-        if self.sampling_params.get("n", 1) != 1:
+        if (
+            isinstance(self.sampling_params, dict)
+            and self.sampling_params.get("n", 1) != 1
+        ):
             is_single = False
         else:
             if self.text is not None:
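
The old check called `.get("n", 1)` on `sampling_params` unconditionally, which raises `AttributeError` when a batched request passes a list of per-sample dicts instead of a single dict. A stand-alone sketch of the added guard (the real `GenerateReqInput` carries more fields than shown here):

```python
# Stand-alone sketch of the guard added in GenerateReqInput's init check;
# the real dataclass has more fields, this only isolates the "n" lookup.
from typing import Dict, List, Union

def wants_parallel_sampling(sampling_params: Union[Dict, List[Dict]]) -> bool:
    # 0.2.0: `sampling_params.get("n", 1)` raised AttributeError when a
    # batched request supplied a list of per-sample dicts.
    # 0.2.1: only read "n" when sampling_params is a single dict.
    return isinstance(sampling_params, dict) and sampling_params.get("n", 1) != 1

print(wants_parallel_sampling({"n": 2}))              # True  -> treated as multiple outputs
print(wants_parallel_sampling([{"n": 1}, {"n": 1}]))  # False -> no crash; batching is
                                                      # decided from text/input_ids instead
```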
{sglang-0.2.0 → sglang-0.2.1}/sglang/srt/openai_api/adapter.py

@@ -94,9 +94,14 @@ def load_chat_template_for_openai_api(chat_template_arg):
 async def v1_completions(tokenizer_manager, raw_request: Request):
     request_json = await raw_request.json()
     request = CompletionRequest(**request_json)
+    prompt = request.prompt
+    if isinstance(prompt, str) or isinstance(prompt[0], str):
+        prompt_kwargs = {"text": prompt}
+    else:
+        prompt_kwargs = {"input_ids": prompt}
 
     adapted_request = GenerateReqInput(
-        text=request.prompt,
+        **prompt_kwargs,
         sampling_params={
             "temperature": request.temperature,
             "max_new_tokens": request.max_tokens,
{sglang-0.2.0 → sglang-0.2.1}/sglang/srt/utils.py

@@ -626,6 +626,7 @@ def is_llama3_405b_fp8(model_config):
         and model_config.hf_config.intermediate_size == 53248
         and model_config.hf_config.num_hidden_layers == 126
         and model_config.hf_config.num_key_value_heads == 16
+        and hasattr(model_config.hf_config, "quantization_config")
         and model_config.hf_config.quantization_config["quant_method"] == "fbgemm_fp8"
     ):
         return True
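
The added `hasattr` check keeps `is_llama3_405b_fp8` from raising on checkpoints whose config has no `quantization_config` attribute at all, such as an unquantized checkpoint. A small sketch with a stand-in config object, not the real transformers config class:

```python
# Stand-in config to show why the hasattr guard matters; not the real
# transformers config class, and only attributes from the diff are set.
class FakeHFConfig:
    intermediate_size = 53248
    num_hidden_layers = 126
    num_key_value_heads = 16
    # no `quantization_config` attribute, as on an unquantized checkpoint

cfg = FakeHFConfig()

# 0.2.0 behaviour: reading cfg.quantization_config directly raises AttributeError.
# 0.2.1 behaviour: guard first, then read.
is_fbgemm_fp8 = (
    hasattr(cfg, "quantization_config")
    and cfg.quantization_config["quant_method"] == "fbgemm_fp8"
)
print(is_fbgemm_fp8)  # False, instead of crashing
```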
sglang-0.2.1/sglang/version.py (new file)

@@ -0,0 +1 @@
+__version__ = "0.2.1"
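
With the version string now living in `sglang/version.py`, the bump can be verified after upgrading via the standard library, without assuming any sglang-specific API:

```python
# Confirm the installed sglang version without importing the package itself.
from importlib.metadata import version

print(version("sglang"))  # expected: 0.2.1
```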
{sglang-0.2.0 → sglang-0.2.1/sglang.egg-info}/PKG-INFO

(Identical to the top-level PKG-INFO diff shown above: the version bump to 0.2.1 and the same README updates.)
sglang-0.2.0/sglang/version.py (removed)

@@ -1 +0,0 @@
-__version__ = "0.2.0"