sglang 0.2.10__tar.gz → 0.2.11__tar.gz

This diff shows the changes between two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
Files changed (108)
  1. {sglang-0.2.10/sglang.egg-info → sglang-0.2.11}/PKG-INFO +34 -24
  2. {sglang-0.2.10 → sglang-0.2.11}/README.md +25 -21
  3. {sglang-0.2.10 → sglang-0.2.11}/pyproject.toml +5 -3
  4. {sglang-0.2.10 → sglang-0.2.11}/sglang/__init__.py +8 -0
  5. {sglang-0.2.10 → sglang-0.2.11}/sglang/api.py +10 -2
  6. {sglang-0.2.10 → sglang-0.2.11}/sglang/bench_latency.py +145 -36
  7. {sglang-0.2.10 → sglang-0.2.11}/sglang/check_env.py +24 -2
  8. {sglang-0.2.10 → sglang-0.2.11}/sglang/global_config.py +0 -1
  9. {sglang-0.2.10 → sglang-0.2.11}/sglang/lang/backend/base_backend.py +3 -1
  10. {sglang-0.2.10 → sglang-0.2.11}/sglang/lang/backend/openai.py +8 -3
  11. {sglang-0.2.10 → sglang-0.2.11}/sglang/lang/backend/runtime_endpoint.py +46 -29
  12. sglang-0.2.11/sglang/lang/choices.py +164 -0
  13. {sglang-0.2.10 → sglang-0.2.11}/sglang/lang/interpreter.py +6 -13
  14. {sglang-0.2.10 → sglang-0.2.11}/sglang/lang/ir.py +11 -2
  15. {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/layers/logits_processor.py +1 -1
  16. {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/layers/radix_attention.py +2 -5
  17. {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/managers/schedule_batch.py +95 -324
  18. {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/managers/tokenizer_manager.py +6 -3
  19. {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/managers/tp_worker.py +20 -22
  20. {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/mem_cache/memory_pool.py +9 -14
  21. {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/model_executor/cuda_graph_runner.py +3 -3
  22. sglang-0.2.11/sglang/srt/model_executor/forward_batch_info.py +256 -0
  23. {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/model_executor/model_runner.py +6 -10
  24. {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/models/chatglm.py +1 -1
  25. {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/models/commandr.py +1 -1
  26. {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/models/dbrx.py +1 -1
  27. {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/models/deepseek.py +1 -1
  28. {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/models/deepseek_v2.py +1 -1
  29. {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/models/gemma.py +1 -1
  30. {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/models/gemma2.py +1 -1
  31. {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/models/gpt_bigcode.py +1 -1
  32. {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/models/grok.py +1 -1
  33. {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/models/internlm2.py +1 -1
  34. {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/models/llama2.py +1 -1
  35. {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/models/llama_classification.py +1 -1
  36. {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/models/llava.py +1 -2
  37. {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/models/llavavid.py +1 -2
  38. {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/models/minicpm.py +1 -1
  39. {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/models/mixtral.py +1 -1
  40. {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/models/mixtral_quant.py +1 -1
  41. {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/models/qwen.py +1 -1
  42. {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/models/qwen2.py +1 -1
  43. {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/models/qwen2_moe.py +1 -1
  44. {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/models/stablelm.py +1 -1
  45. {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/openai_api/adapter.py +34 -12
  46. {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/openai_api/protocol.py +6 -0
  47. {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/server.py +24 -6
  48. {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/server_args.py +4 -0
  49. {sglang-0.2.10 → sglang-0.2.11}/sglang/test/test_utils.py +1 -1
  50. sglang-0.2.11/sglang/version.py +1 -0
  51. {sglang-0.2.10 → sglang-0.2.11/sglang.egg-info}/PKG-INFO +34 -24
  52. {sglang-0.2.10 → sglang-0.2.11}/sglang.egg-info/SOURCES.txt +2 -0
  53. {sglang-0.2.10 → sglang-0.2.11}/sglang.egg-info/requires.txt +10 -2
  54. sglang-0.2.10/sglang/version.py +0 -1
  55. {sglang-0.2.10 → sglang-0.2.11}/LICENSE +0 -0
  56. {sglang-0.2.10 → sglang-0.2.11}/setup.cfg +0 -0
  57. {sglang-0.2.10 → sglang-0.2.11}/sglang/bench_serving.py +0 -0
  58. {sglang-0.2.10 → sglang-0.2.11}/sglang/lang/__init__.py +0 -0
  59. {sglang-0.2.10 → sglang-0.2.11}/sglang/lang/backend/__init__.py +0 -0
  60. {sglang-0.2.10 → sglang-0.2.11}/sglang/lang/backend/anthropic.py +0 -0
  61. {sglang-0.2.10 → sglang-0.2.11}/sglang/lang/backend/litellm.py +0 -0
  62. {sglang-0.2.10 → sglang-0.2.11}/sglang/lang/backend/vertexai.py +0 -0
  63. {sglang-0.2.10 → sglang-0.2.11}/sglang/lang/chat_template.py +0 -0
  64. {sglang-0.2.10 → sglang-0.2.11}/sglang/lang/compiler.py +0 -0
  65. {sglang-0.2.10 → sglang-0.2.11}/sglang/lang/tracer.py +0 -0
  66. {sglang-0.2.10 → sglang-0.2.11}/sglang/launch_server.py +0 -0
  67. {sglang-0.2.10 → sglang-0.2.11}/sglang/launch_server_llavavid.py +0 -0
  68. {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/constrained/__init__.py +0 -0
  69. {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/constrained/base_tool_cache.py +0 -0
  70. {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/constrained/fsm_cache.py +0 -0
  71. {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/constrained/jump_forward.py +0 -0
  72. {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/conversation.py +0 -0
  73. {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/hf_transformers_utils.py +0 -0
  74. {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/layers/context_flashattention_nopad.py +0 -0
  75. {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/layers/extend_attention.py +0 -0
  76. {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/layers/fused_moe.py +0 -0
  77. {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/layers/linear.py +0 -0
  78. {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/layers/quantization/__init__.py +0 -0
  79. {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/layers/quantization/fp8.py +0 -0
  80. {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/layers/token_attention.py +0 -0
  81. {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/managers/controller_multi.py +0 -0
  82. {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/managers/controller_single.py +0 -0
  83. {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/managers/detokenizer_manager.py +0 -0
  84. {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/managers/io_struct.py +0 -0
  85. {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/managers/policy_scheduler.py +0 -0
  86. {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/mem_cache/base_cache.py +0 -0
  87. {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/mem_cache/chunk_cache.py +0 -0
  88. {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/mem_cache/flush_cache.py +0 -0
  89. {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/mem_cache/radix_cache.py +0 -0
  90. {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/mm_utils.py +0 -0
  91. {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/model_config.py +0 -0
  92. {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/model_loader/model_loader.py +0 -0
  93. {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/model_loader/utils.py +0 -0
  94. {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/models/mistral.py +0 -0
  95. {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/models/yivl.py +0 -0
  96. {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/sampling_params.py +0 -0
  97. {sglang-0.2.10 → sglang-0.2.11}/sglang/srt/utils.py +0 -0
  98. {sglang-0.2.10 → sglang-0.2.11}/sglang/test/run_eval.py +0 -0
  99. {sglang-0.2.10 → sglang-0.2.11}/sglang/test/runners.py +0 -0
  100. {sglang-0.2.10 → sglang-0.2.11}/sglang/test/simple_eval_common.py +0 -0
  101. {sglang-0.2.10 → sglang-0.2.11}/sglang/test/simple_eval_gpqa.py +0 -0
  102. {sglang-0.2.10 → sglang-0.2.11}/sglang/test/simple_eval_humaneval.py +0 -0
  103. {sglang-0.2.10 → sglang-0.2.11}/sglang/test/simple_eval_math.py +0 -0
  104. {sglang-0.2.10 → sglang-0.2.11}/sglang/test/simple_eval_mmlu.py +0 -0
  105. {sglang-0.2.10 → sglang-0.2.11}/sglang/test/test_programs.py +0 -0
  106. {sglang-0.2.10 → sglang-0.2.11}/sglang/utils.py +0 -0
  107. {sglang-0.2.10 → sglang-0.2.11}/sglang.egg-info/dependency_links.txt +0 -0
  108. {sglang-0.2.10 → sglang-0.2.11}/sglang.egg-info/top_level.txt +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: sglang
- Version: 0.2.10
+ Version: 0.2.11
  Summary: SGLang is yet another fast serving framework for large language models and vision language models.
  License: Apache License
  Version 2.0, January 2004
@@ -220,7 +220,6 @@ Requires-Dist: fastapi; extra == "srt"
  Requires-Dist: hf_transfer; extra == "srt"
  Requires-Dist: huggingface_hub; extra == "srt"
  Requires-Dist: interegular; extra == "srt"
- Requires-Dist: jsonlines; extra == "srt"
  Requires-Dist: packaging; extra == "srt"
  Requires-Dist: pillow; extra == "srt"
  Requires-Dist: psutil; extra == "srt"
@@ -230,7 +229,7 @@ Requires-Dist: torch; extra == "srt"
  Requires-Dist: uvicorn; extra == "srt"
  Requires-Dist: uvloop; extra == "srt"
  Requires-Dist: zmq; extra == "srt"
- Requires-Dist: vllm==0.5.3.post1; extra == "srt"
+ Requires-Dist: vllm==0.5.4; extra == "srt"
  Requires-Dist: outlines>=0.0.44; extra == "srt"
  Provides-Extra: openai
  Requires-Dist: openai>=1.0; extra == "openai"
@@ -239,11 +238,18 @@ Provides-Extra: anthropic
  Requires-Dist: anthropic>=0.20.0; extra == "anthropic"
  Provides-Extra: litellm
  Requires-Dist: litellm>=1.0.0; extra == "litellm"
+ Provides-Extra: test
+ Requires-Dist: jsonlines; extra == "test"
+ Requires-Dist: matplotlib; extra == "test"
+ Requires-Dist: pandas; extra == "test"
  Provides-Extra: all
  Requires-Dist: sglang[srt]; extra == "all"
  Requires-Dist: sglang[openai]; extra == "all"
  Requires-Dist: sglang[anthropic]; extra == "all"
  Requires-Dist: sglang[litellm]; extra == "all"
+ Provides-Extra: dev
+ Requires-Dist: sglang[all]; extra == "dev"
+ Requires-Dist: sglang[test]; extra == "dev"

  <div align="center">
  <img src="https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png" alt="logo" width="400"></img>
@@ -296,20 +302,20 @@ pip install --upgrade pip
  pip install "sglang[all]"

  # Install FlashInfer CUDA kernels
- pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
+ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
  ```

  ### Method 2: From source
  ```
  # Use the last release branch
- git clone -b v0.2.10 https://github.com/sgl-project/sglang.git
+ git clone -b v0.2.11 https://github.com/sgl-project/sglang.git
  cd sglang

  pip install --upgrade pip
  pip install -e "python[all]"

  # Install FlashInfer CUDA kernels
- pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
+ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
  ```

  ### Method 3: Using docker
@@ -383,7 +389,7 @@ response = client.chat.completions.create(
  print(response)
  ```

- It supports streaming, vision, and most features of the Chat/Completions/Models endpoints specified by the [OpenAI API Reference](https://platform.openai.com/docs/api-reference/).
+ It supports streaming, vision, and most features of the Chat/Completions/Models/Batch endpoints specified by the [OpenAI API Reference](https://platform.openai.com/docs/api-reference/).

  ### Additional Server Arguments
  - Add `--tp 2` to enable tensor parallelism. If it indicates `peer access is not supported between these two devices`, add `--enable-p2p-check` option.
@@ -394,10 +400,14 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
  ```
  python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --dp 2 --tp 2
  ```
- - If you see out-of-memory errors during serving, please try to reduce the memory usage of the KV cache pool by setting a smaller value of `--mem-fraction-static`. The default value is `0.9`
+ - If you see out-of-memory errors during serving, please try to reduce the memory usage of the KV cache pool by setting a smaller value of `--mem-fraction-static`. The default value is `0.9`.
  ```
  python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --mem-fraction-static 0.7
  ```
+ - If you see out-of-memory errors during prefill for long prompts on a model that supports long context, consider using chunked prefill.
+ ```
+ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --port 30000 --chunked-prefill-size 8192
+ ```
  - See [hyperparameter_tuning.md](docs/en/hyperparameter_tuning.md) on tuning hyperparameters for better performance.
  - Add `--nnodes 2` to run tensor parallelism on multiple nodes. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-0` be the hostname of the first node and `50000` be an available port.
  ```
@@ -411,22 +421,6 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
  - To enable fp8 quantization, you can add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
  - To enable experimental torch.compile support, you can add `--enable-torch-compile`. It accelerates small models on small batch sizes.

- ### Run Llama 3.1 405B
-
- ```bash
- ## Run 405B (fp8) on a single node
- python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct-FP8 --tp 8
-
- ## Run 405B (fp16) on two nodes
- # replace the `172.16.4.52:20000` with your own first node ip address and port, disable CUDA Graph temporarily
-
- # on the first node
- GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 0 --disable-cuda-graph --mem-frac 0.75
-
- # on the second
- GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 1 --disable-cuda-graph --mem-frac 0.75
- ```
-
  ### Supported Models

  - Llama / Llama 2 / Llama 3 / Llama 3.1
@@ -452,6 +446,22 @@ GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/

  Instructions for supporting a new model are [here](https://github.com/sgl-project/sglang/blob/main/docs/en/model_support.md).

+ ### Run Llama 3.1 405B
+
+ ```bash
+ ## Run 405B (fp8) on a single node
+ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct-FP8 --tp 8
+
+ ## Run 405B (fp16) on two nodes
+ # replace the `172.16.4.52:20000` with your own first node ip address and port, disable CUDA Graph temporarily
+
+ # on the first node
+ GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 0 --disable-cuda-graph --mem-frac 0.75
+
+ # on the second
+ GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 1 --disable-cuda-graph --mem-frac 0.75
+ ```
+
  ### Benchmark Performance

  - Benchmark a single static batch by running the following command without launching a server. The arguments are the same as for `launch_server.py`. Note that this is not a dynamic batching server, so it may run out of memory for a batch size that a real server can handle. A real server truncates the prefill into several batches, while this unit test does not. For accurate large batch testing, consider using `sglang.bench_serving`.
README.md
@@ -49,20 +49,20 @@ pip install --upgrade pip
  pip install "sglang[all]"

  # Install FlashInfer CUDA kernels
- pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
+ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
  ```

  ### Method 2: From source
  ```
  # Use the last release branch
- git clone -b v0.2.10 https://github.com/sgl-project/sglang.git
+ git clone -b v0.2.11 https://github.com/sgl-project/sglang.git
  cd sglang

  pip install --upgrade pip
  pip install -e "python[all]"

  # Install FlashInfer CUDA kernels
- pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
+ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
  ```

  ### Method 3: Using docker
@@ -136,7 +136,7 @@ response = client.chat.completions.create(
  print(response)
  ```

- It supports streaming, vision, and most features of the Chat/Completions/Models endpoints specified by the [OpenAI API Reference](https://platform.openai.com/docs/api-reference/).
+ It supports streaming, vision, and most features of the Chat/Completions/Models/Batch endpoints specified by the [OpenAI API Reference](https://platform.openai.com/docs/api-reference/).

  ### Additional Server Arguments
  - Add `--tp 2` to enable tensor parallelism. If it indicates `peer access is not supported between these two devices`, add `--enable-p2p-check` option.
@@ -147,10 +147,14 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
  ```
  python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --dp 2 --tp 2
  ```
- - If you see out-of-memory errors during serving, please try to reduce the memory usage of the KV cache pool by setting a smaller value of `--mem-fraction-static`. The default value is `0.9`
+ - If you see out-of-memory errors during serving, please try to reduce the memory usage of the KV cache pool by setting a smaller value of `--mem-fraction-static`. The default value is `0.9`.
  ```
  python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --mem-fraction-static 0.7
  ```
+ - If you see out-of-memory errors during prefill for long prompts on a model that supports long context, consider using chunked prefill.
+ ```
+ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --port 30000 --chunked-prefill-size 8192
+ ```
  - See [hyperparameter_tuning.md](docs/en/hyperparameter_tuning.md) on tuning hyperparameters for better performance.
  - Add `--nnodes 2` to run tensor parallelism on multiple nodes. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-0` be the hostname of the first node and `50000` be an available port.
  ```
@@ -164,22 +168,6 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
  - To enable fp8 quantization, you can add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
  - To enable experimental torch.compile support, you can add `--enable-torch-compile`. It accelerates small models on small batch sizes.

- ### Run Llama 3.1 405B
-
- ```bash
- ## Run 405B (fp8) on a single node
- python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct-FP8 --tp 8
-
- ## Run 405B (fp16) on two nodes
- # replace the `172.16.4.52:20000` with your own first node ip address and port, disable CUDA Graph temporarily
-
- # on the first node
- GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 0 --disable-cuda-graph --mem-frac 0.75
-
- # on the second
- GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 1 --disable-cuda-graph --mem-frac 0.75
- ```
-
  ### Supported Models

  - Llama / Llama 2 / Llama 3 / Llama 3.1
@@ -205,6 +193,22 @@ GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/

  Instructions for supporting a new model are [here](https://github.com/sgl-project/sglang/blob/main/docs/en/model_support.md).

+ ### Run Llama 3.1 405B
+
+ ```bash
+ ## Run 405B (fp8) on a single node
+ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct-FP8 --tp 8
+
+ ## Run 405B (fp16) on two nodes
+ # replace the `172.16.4.52:20000` with your own first node ip address and port, disable CUDA Graph temporarily
+
+ # on the first node
+ GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 0 --disable-cuda-graph --mem-frac 0.75
+
+ # on the second
+ GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 1 --disable-cuda-graph --mem-frac 0.75
+ ```
+
  ### Benchmark Performance

  - Benchmark a single static batch by running the following command without launching a server. The arguments are the same as for `launch_server.py`. Note that this is not a dynamic batching server, so it may run out of memory for a batch size that a real server can handle. A real server truncates the prefill into several batches, while this unit test does not. For accurate large batch testing, consider using `sglang.bench_serving`.
pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

  [project]
  name = "sglang"
- version = "0.2.10"
+ version = "0.2.11"
  description = "SGLang is yet another fast serving framework for large language models and vision language models."
  readme = "README.md"
  requires-python = ">=3.8"
@@ -20,14 +20,16 @@ dependencies = [
  ]

  [project.optional-dependencies]
- srt = ["aiohttp", "fastapi", "hf_transfer", "huggingface_hub", "interegular", "jsonlines",
+ srt = ["aiohttp", "fastapi", "hf_transfer", "huggingface_hub", "interegular",
  "packaging", "pillow", "psutil", "pydantic", "python-multipart",
  "torch", "uvicorn", "uvloop", "zmq",
- "vllm==0.5.3.post1", "outlines>=0.0.44"]
+ "vllm==0.5.4", "outlines>=0.0.44"]
  openai = ["openai>=1.0", "tiktoken"]
  anthropic = ["anthropic>=0.20.0"]
  litellm = ["litellm>=1.0.0"]
+ test = ["jsonlines", "matplotlib", "pandas"]
  all = ["sglang[srt]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
+ dev = ["sglang[all]", "sglang[test]"]

  [project.urls]
  "Homepage" = "https://github.com/sgl-project/sglang"
sglang/__init__.py
@@ -22,6 +22,11 @@ from sglang.api import (
  user_end,
  video,
  )
+ from sglang.lang.choices import (
+ greedy_token_selection,
+ token_length_normalized,
+ unconditional_likelihood_normalized,
+ )

  # SGLang DSL APIs
  __all__ = [
@@ -45,6 +50,9 @@ __all__ = [
  "user_begin",
  "user_end",
  "video",
+ "greedy_token_selection",
+ "token_length_normalized",
+ "unconditional_likelihood_normalized",
  ]

  # Global Configurations
sglang/api.py
@@ -6,6 +6,7 @@ from typing import Callable, List, Optional, Union

  from sglang.global_config import global_config
  from sglang.lang.backend.base_backend import BaseBackend
+ from sglang.lang.choices import ChoicesSamplingMethod, token_length_normalized
  from sglang.lang.ir import (
  SglExpr,
  SglExprList,
@@ -73,12 +74,18 @@ def gen(
  return_text_in_logprobs: Optional[bool] = None,
  dtype: Optional[type] = None,
  choices: Optional[List[str]] = None,
+ choices_method: Optional[ChoicesSamplingMethod] = None,
  regex: Optional[str] = None,
  ):
  """Call the model to generate. See the meaning of the arguments in docs/en/sampling_params.md"""

  if choices:
- return SglSelect(name, choices, 0.0 if temperature is None else temperature)
+ return SglSelect(
+ name,
+ choices,
+ 0.0 if temperature is None else temperature,
+ token_length_normalized if choices_method is None else choices_method,
+ )

  # check regex is valid
  if regex is not None:
@@ -186,9 +193,10 @@ def select(
  name: Optional[str] = None,
  choices: Optional[List[str]] = None,
  temperature: float = 0.0,
+ choices_method: ChoicesSamplingMethod = token_length_normalized,
  ):
  assert choices is not None
- return SglSelect(name, choices, temperature)
+ return SglSelect(name, choices, temperature, choices_method)


  def _role_common(name: str, expr: Optional[SglExpr] = None):
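For orientation (not part of the diff): a minimal sketch of how the new `choices_method` argument added to `gen`/`select` above could be used from the frontend DSL. It assumes a locally running SGLang server; the endpoint URL, prompt, and choice labels are placeholders.

```python
# Hypothetical usage sketch for the choices_method argument (sglang >= 0.2.11).
import sglang as sgl
from sglang.lang.choices import greedy_token_selection


@sgl.function
def sentiment(s, review):
    s += "Review: " + review + "\n"
    s += "Sentiment: " + sgl.gen(
        "label",
        choices=["positive", "negative"],
        # New in 0.2.11; omitting it keeps the default token_length_normalized.
        choices_method=greedy_token_selection,
    )


# Assumes a server was started with `python -m sglang.launch_server ... --port 30000`.
sgl.set_default_backend(sgl.RuntimeEndpoint("http://localhost:30000"))
state = sentiment.run(review="The food was great!")
print(state["label"])
```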
sglang/bench_latency.py
@@ -1,13 +1,21 @@
  """
  Benchmark the latency of a given model. It accepts arguments similar to those of launch_server.py.

- # Usage (latency test) with dummy weights:
+ # Usage (latency test)
+ ## with dummy weights:
  python -m sglang.bench_latency --model-path meta-llama/Meta-Llama-3-8B-Instruct --load-format dummy
+ ## sweep through multiple data points and store (append) the results in a jsonl file:
+ python -m sglang.bench_latency --model-path meta-llama/Meta-Llama-3-8B-Instruct --batch 1 12 14 --input-len 256 512 --output-len 32 256 --result-filename out.jsonl
+ ## do some changes, and store the results under a different run_name:
+ python -m sglang.bench_latency --model-path meta-llama/Meta-Llama-3-8B-Instruct --batch 1 12 14 --input-len 256 512 --output-len 32 256 --result-filename out.jsonl --run-name after
+ ## plot the results in series of lines:
+ python -m sglang.bench_latency --result-filename out.jsonl --graph-sql="select run_name, batch_size, prefill_throughput from results"
+

  # Usage (correctness test):
  python -m sglang.bench_latency --model-path TinyLlama/TinyLlama-1.1B-Chat-v0.4 --correct

- ### Reference output (of the correctness test above, can be gpu dependent):
+ ## Reference output (of the correctness test above, can be gpu dependent):
  prefill logits (first half) tensor([[-10.0312, -9.5000, 0.8936, ..., -4.9414, -3.2402, -3.3633],
  [-10.0312, -9.5000, 0.8936, ..., -4.9414, -3.2402, -3.3633],
  [ -9.1875, -10.2500, 2.7109, ..., -4.3359, -4.0664, -4.1328]],
@@ -28,19 +36,23 @@ I'm going to the park

  import argparse
  import dataclasses
+ import itertools
  import logging
  import multiprocessing
+ import os
+ import sqlite3
  import time
  from typing import Tuple

- import jsonlines
  import numpy as np
+ import pandas as pd
  import torch
  import torch.distributed as dist

  from sglang.srt.hf_transformers_utils import get_tokenizer
- from sglang.srt.managers.schedule_batch import Batch, ForwardMode, Req
+ from sglang.srt.managers.schedule_batch import Req, ScheduleBatch
  from sglang.srt.model_config import ModelConfig
+ from sglang.srt.model_executor.forward_batch_info import ForwardMode
  from sglang.srt.model_executor.model_runner import ModelRunner
  from sglang.srt.sampling_params import SamplingParams
  from sglang.srt.server_args import ServerArgs
@@ -49,26 +61,42 @@ from sglang.srt.utils import suppress_other_loggers

  @dataclasses.dataclass
  class BenchArgs:
+ run_name: str = "before"
  batch_size: Tuple[int] = (1,)
- input_len: int = 1024
- output_len: int = 4
+ input_len: Tuple[int] = (1024,)
+ output_len: Tuple[int] = (4,)
  result_filename: str = ""
  correctness_test: bool = False
  # This is only used for correctness test
  cut_len: int = 4
+ # Plotting args
+ graph_sql: str = (
+ "select run_name, batch_size, prefill_throughput from results where run_name='before'"
+ )
+ graph_filename: str = "out.png"

  @staticmethod
  def add_cli_args(parser: argparse.ArgumentParser):
+ parser.add_argument("--run-name", type=str, default=BenchArgs.run_name)
  parser.add_argument(
  "--batch-size", type=int, nargs="+", default=BenchArgs.batch_size
  )
- parser.add_argument("--input-len", type=int, default=BenchArgs.input_len)
- parser.add_argument("--output-len", type=int, default=BenchArgs.output_len)
+ parser.add_argument(
+ "--input-len", type=int, nargs="+", default=BenchArgs.input_len
+ )
+ parser.add_argument(
+ "--output-len", type=int, nargs="+", default=BenchArgs.output_len
+ )
  parser.add_argument(
  "--result-filename", type=str, default=BenchArgs.result_filename
  )
  parser.add_argument("--correctness-test", action="store_true")
  parser.add_argument("--cut-len", type=int, default=BenchArgs.cut_len)
+ # graphing
+ parser.add_argument("--graph-sql", type=str, default=BenchArgs.graph_sql)
+ parser.add_argument(
+ "--graph-filename", type=str, default=BenchArgs.graph_filename
+ )

  @classmethod
  def from_cli_args(cls, args: argparse.Namespace):
@@ -161,7 +189,7 @@ def prepare_synthetic_inputs_for_latency_test(batch_size, input_len):


  def extend(reqs, model_runner):
- batch = Batch.init_new(
+ batch = ScheduleBatch.init_new(
  reqs=reqs,
  req_to_token_pool=model_runner.req_to_token_pool,
  token_to_kv_pool=model_runner.token_to_kv_pool,
@@ -222,15 +250,21 @@ def correctness_test(

  @torch.inference_mode()
  def latency_test_run_once(
- model_runner, rank_print, reqs, batch_size, input_len, output_len
+ run_name, model_runner, rank_print, reqs, batch_size, input_len, output_len
  ):
+ max_batch_size = model_runner.max_total_num_tokens // (input_len + output_len)
+ if batch_size > max_batch_size:
+ rank_print(
+ f"skipping ({batch_size}, {input_len}, {output_len}) due to max batch size limit"
+ )
+ return

  # Clear the pools.
  model_runner.req_to_token_pool.clear()
  model_runner.token_to_kv_pool.clear()

  measurement_results = {
- "run_name": "before",
+ "run_name": run_name,
  "batch_size": batch_size,
  "input_len": input_len,
  "output_len": output_len,
@@ -291,49 +325,119 @@

  # Load the model
  model_runner, tokenizer = load_model(server_args, tp_rank)
- rank_print(
- f"max_batch_size={model_runner.max_total_num_tokens // (bench_args.input_len + bench_args.output_len)}"
- )

- # To make this PR easier to review, for now, only do the first element in batch_size tuple.
- bench_args.batch_size = bench_args.batch_size[0]
-
- # Prepare inputs
+ # Prepare inputs for warm up
  reqs = prepare_synthetic_inputs_for_latency_test(
- bench_args.batch_size, bench_args.input_len
+ bench_args.batch_size[0], bench_args.input_len[0]
  )

  # Warm up
  latency_test_run_once(
- model_runner, rank_print, reqs, bench_args.batch_size, bench_args.input_len, 4
+ bench_args.run_name,
+ model_runner,
+ rank_print,
+ reqs,
+ bench_args.batch_size[0],
+ bench_args.input_len[0],
+ 4, # shorter decoding to speed up the warmup
  )

- # Run again
+ # Run the sweep
  result_list = []
- result_list.append(
- latency_test_run_once(
- model_runner,
- rank_print,
- reqs,
- bench_args.batch_size,
- bench_args.input_len,
- bench_args.output_len,
+ for bs, il, ol in itertools.product(
+ bench_args.batch_size, bench_args.input_len, bench_args.output_len
+ ):
+ req = prepare_synthetic_inputs_for_latency_test(bs, il)
+ ret = latency_test_run_once(
+ bench_args.run_name, model_runner, rank_print, reqs, bs, il, ol
  )
- )
+ if ret is not None:
+ result_list.append(ret)
+
+ # Write results in jsonlines format on rank 0.
+ if tp_rank == 0 and bench_args.result_filename:
+ import jsonlines

- # Write results in jsonlines format.
- if bench_args.result_filename:
  with jsonlines.open(bench_args.result_filename, "a") as f:
  f.write_all(result_list)


+ def plot_latency_test(
+ server_args,
+ bench_args,
+ tp_rank,
+ ):
+ assert tp_rank == 0
+
+ # read the jsonl file and put in sqlite
+ df = pd.read_json(bench_args.result_filename, lines=True)
+ conn = sqlite3.connect(":memory:")
+ cur = conn.cursor()
+
+ # get the columns and their types
+ column_names = list(df.iloc[0].keys())
+ type_dict = {
+ str: "TEXT",
+ np.int64: "INTEGER",
+ np.float64: "FLOAT",
+ }
+ column_types = [type_dict[type(i)] for i in list(df.iloc[0])]
+
+ # create the table
+ cur.execute(
+ f"""
+ CREATE TABLE IF NOT EXISTS results (
+ {", ".join([f"{name} {type}" for name, type in zip(column_names, column_types)])}
+ )
+ """
+ )
+ conn.commit()
+
+ # write the results to DB
+ df.to_sql("results", conn, if_exists="replace", index=False)
+ conn.commit()
+
+ # read it back using sql
+ df = pd.read_sql_query(bench_args.graph_sql, conn)
+ conn.close()
+
+ # plot it and save to a file
+ import matplotlib.pyplot as plt
+
+ assert (
+ len(df.columns) == 3
+ ), f"The sql should have fetched <series, x, y> columns, not {df.columns}"
+ for label in df[df.columns[0]].unique():
+ q = f"{df.columns[0]}=='{label}'"
+ series = df.query(q)
+ plt.plot(series[df.columns[1]], series[df.columns[2]], label=q, marker="o")
+ plt.xlabel(df.columns[1])
+ plt.ylabel(df.columns[2])
+ plt.legend()
+ plt.savefig(bench_args.graph_filename, dpi=300)
+
+ # if in kitty, just dump it to the terminal
+ if os.environ["TERM"] == "xterm-kitty":
+ os.system(
+ f"kitty icat --use-window-size 1,1,600,600 {bench_args.graph_filename}"
+ )
+
+
  def main(server_args, bench_args):
- print(bench_args)

- if bench_args.correctness_test:
- work_func = correctness_test
+ if server_args.model_path:
+ if bench_args.correctness_test:
+ work_func = correctness_test
+ else:
+ work_func = latency_test
+ elif os.path.isfile(bench_args.result_filename):
+ assert bench_args.graph_filename, "please provide a filename for the graph"
+ work_func = plot_latency_test
  else:
- work_func = latency_test
+ raise ValueError(
+ "Provide --model-path for running the tests or "
+ "provide --result-filename for plotting the results"
+ )

  if server_args.tp_size == 1:
  work_func(server_args, bench_args, 0)
@@ -361,6 +465,11 @@ if __name__ == "__main__":
  parser = argparse.ArgumentParser()
  ServerArgs.add_cli_args(parser)
  BenchArgs.add_cli_args(parser)
+ # For this script, model-path is not required
+ assert (
+ parser._actions[1].option_strings[0] == "--model-path"
+ ), "options changed, this code need to be updated"
+ parser._actions[1].required = False
  args = parser.parse_args()

  server_args = ServerArgs.from_cli_args(args)
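A quick sketch (not from the package) of inspecting the JSONL file that the new sweep appends to, without going through the built-in SQL/plotting path. It assumes a results file named `out.jsonl` produced by the commands in the docstring above, and that `prefill_throughput` is among the recorded metrics (the default `--graph-sql` selects it).

```python
# Hypothetical inspection of bench_latency sweep results (assumes out.jsonl exists).
import pandas as pd

df = pd.read_json("out.jsonl", lines=True)  # one JSON object per line
print(df[["run_name", "batch_size", "input_len", "output_len", "prefill_throughput"]])
```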
sglang/check_env.py
@@ -14,6 +14,7 @@ PACKAGE_LIST = [
  "sglang",
  "flashinfer",
  "triton",
+ "transformers",
  "requests",
  "tqdm",
  "numpy",
@@ -73,10 +74,26 @@ def _get_gpu_info():
  Get information about available GPUs.
  """
  devices = defaultdict(list)
+ capabilities = defaultdict(list)
  for k in range(torch.cuda.device_count()):
  devices[torch.cuda.get_device_name(k)].append(str(k))
+ capability = torch.cuda.get_device_capability(k)
+ capabilities[f"{capability[0]}.{capability[1]}"].append(str(k))

- return {f"GPU {','.join(device_ids)}": name for name, device_ids in devices.items()}
+ gpu_info = {}
+ for name, device_ids in devices.items():
+ gpu_info[f"GPU {','.join(device_ids)}"] = name
+
+ if len(capabilities) == 1:
+ # All GPUs have the same compute capability
+ cap, gpu_ids = list(capabilities.items())[0]
+ gpu_info[f"GPU {','.join(gpu_ids)} Compute Capability"] = cap
+ else:
+ # GPUs have different compute capabilities
+ for cap, gpu_ids in capabilities.items():
+ gpu_info[f"GPU {','.join(gpu_ids)} Compute Capability"] = cap
+
+ return gpu_info


  def _get_cuda_version_info():
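Aside (not part of the diff): the compute-capability grouping added to `_get_gpu_info` above can be reproduced with plain PyTorch. This sketch assumes a machine with at least one visible CUDA device.

```python
# Minimal sketch of grouping GPUs by compute capability, mirroring check_env.
from collections import defaultdict

import torch

capabilities = defaultdict(list)
for k in range(torch.cuda.device_count()):
    major, minor = torch.cuda.get_device_capability(k)
    capabilities[f"{major}.{minor}"].append(str(k))

for cap, gpu_ids in capabilities.items():
    # e.g. "GPU 0,1,2,3 Compute Capability: 8.0" on a homogeneous node
    print(f"GPU {','.join(gpu_ids)} Compute Capability: {cap}")
```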
@@ -118,6 +135,7 @@ def _get_cuda_driver_version():
  """
  Get CUDA driver version.
  """
+ versions = set()
  try:
  output = subprocess.check_output(
  [
@@ -126,7 +144,11 @@
  "--format=csv,noheader,nounits",
  ]
  )
- return {"CUDA Driver Version": output.decode().strip()}
+ versions = set(output.decode().strip().split("\n"))
+ if len(versions) == 1:
+ return {"CUDA Driver Version": versions.pop()}
+ else:
+ return {"CUDA Driver Versions": ", ".join(sorted(versions))}
  except subprocess.SubprocessError:
  return {"CUDA Driver Version": "Not Available"}
sglang/global_config.py
@@ -19,7 +19,6 @@ class GlobalConfig:
  self.init_new_token_ratio = 0.7
  self.base_min_new_token_ratio = 0.1
  self.new_token_ratio_decay = 0.001
- self.new_token_ratio_recovery = 0.05

  # Runtime constants: The threshold (number of tokens) to trigger layer-wise cuda sync.
  # This can improve the speed for large batch sizes during prefill.