sglang 0.2.9.post1__tar.gz → 0.2.11__tar.gz

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
Files changed (109)
  1. {sglang-0.2.9.post1/sglang.egg-info → sglang-0.2.11}/PKG-INFO +37 -26
  2. {sglang-0.2.9.post1 → sglang-0.2.11}/README.md +27 -23
  3. {sglang-0.2.9.post1 → sglang-0.2.11}/pyproject.toml +7 -3
  4. {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/__init__.py +8 -0
  5. {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/api.py +10 -2
  6. sglang-0.2.11/sglang/bench_latency.py +483 -0
  7. {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/check_env.py +25 -2
  8. {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/global_config.py +0 -1
  9. {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/lang/backend/base_backend.py +3 -1
  10. {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/lang/backend/openai.py +8 -3
  11. {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/lang/backend/runtime_endpoint.py +46 -40
  12. sglang-0.2.11/sglang/lang/choices.py +164 -0
  13. {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/lang/interpreter.py +6 -13
  14. {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/lang/ir.py +11 -2
  15. {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/hf_transformers_utils.py +2 -2
  16. {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/layers/extend_attention.py +59 -7
  17. {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/layers/logits_processor.py +1 -1
  18. {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/layers/radix_attention.py +24 -14
  19. {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/layers/token_attention.py +28 -2
  20. {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/managers/io_struct.py +9 -4
  21. {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/managers/schedule_batch.py +98 -323
  22. {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/managers/tokenizer_manager.py +34 -16
  23. {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/managers/tp_worker.py +20 -22
  24. {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/mem_cache/memory_pool.py +74 -38
  25. {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/model_config.py +11 -0
  26. {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/model_executor/cuda_graph_runner.py +3 -3
  27. sglang-0.2.11/sglang/srt/model_executor/forward_batch_info.py +256 -0
  28. {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/model_executor/model_runner.py +51 -26
  29. {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/models/chatglm.py +1 -1
  30. {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/models/commandr.py +1 -1
  31. {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/models/dbrx.py +1 -1
  32. {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/models/deepseek.py +1 -1
  33. {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/models/deepseek_v2.py +199 -17
  34. {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/models/gemma.py +1 -1
  35. {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/models/gemma2.py +1 -1
  36. {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/models/gpt_bigcode.py +1 -1
  37. {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/models/grok.py +1 -1
  38. {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/models/internlm2.py +1 -1
  39. {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/models/llama2.py +1 -1
  40. {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/models/llama_classification.py +1 -1
  41. {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/models/llava.py +1 -2
  42. {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/models/llavavid.py +1 -2
  43. {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/models/minicpm.py +1 -1
  44. {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/models/mixtral.py +1 -1
  45. {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/models/mixtral_quant.py +1 -1
  46. {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/models/qwen.py +1 -1
  47. {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/models/qwen2.py +1 -1
  48. {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/models/qwen2_moe.py +1 -1
  49. {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/models/stablelm.py +1 -1
  50. {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/openai_api/adapter.py +151 -29
  51. {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/openai_api/protocol.py +7 -1
  52. {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/server.py +111 -84
  53. {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/server_args.py +12 -2
  54. {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/utils.py +25 -20
  55. {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/test/run_eval.py +21 -10
  56. sglang-0.2.11/sglang/test/runners.py +237 -0
  57. {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/test/simple_eval_common.py +12 -12
  58. sglang-0.2.11/sglang/test/simple_eval_gpqa.py +92 -0
  59. {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/test/simple_eval_humaneval.py +5 -5
  60. sglang-0.2.11/sglang/test/simple_eval_math.py +72 -0
  61. {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/test/test_utils.py +95 -14
  62. {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/utils.py +15 -37
  63. sglang-0.2.11/sglang/version.py +1 -0
  64. {sglang-0.2.9.post1 → sglang-0.2.11/sglang.egg-info}/PKG-INFO +37 -26
  65. {sglang-0.2.9.post1 → sglang-0.2.11}/sglang.egg-info/SOURCES.txt +5 -0
  66. {sglang-0.2.9.post1 → sglang-0.2.11}/sglang.egg-info/requires.txt +11 -2
  67. sglang-0.2.9.post1/sglang/bench_latency.py +0 -323
  68. sglang-0.2.9.post1/sglang/version.py +0 -1
  69. {sglang-0.2.9.post1 → sglang-0.2.11}/LICENSE +0 -0
  70. {sglang-0.2.9.post1 → sglang-0.2.11}/setup.cfg +0 -0
  71. {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/bench_serving.py +0 -0
  72. {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/lang/__init__.py +0 -0
  73. {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/lang/backend/__init__.py +0 -0
  74. {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/lang/backend/anthropic.py +0 -0
  75. {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/lang/backend/litellm.py +0 -0
  76. {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/lang/backend/vertexai.py +0 -0
  77. {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/lang/chat_template.py +0 -0
  78. {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/lang/compiler.py +0 -0
  79. {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/lang/tracer.py +0 -0
  80. {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/launch_server.py +0 -0
  81. {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/launch_server_llavavid.py +0 -0
  82. {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/constrained/__init__.py +0 -0
  83. {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/constrained/base_tool_cache.py +0 -0
  84. {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/constrained/fsm_cache.py +0 -0
  85. {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/constrained/jump_forward.py +0 -0
  86. {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/conversation.py +0 -0
  87. {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/layers/context_flashattention_nopad.py +0 -0
  88. {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/layers/fused_moe.py +0 -0
  89. {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/layers/linear.py +0 -0
  90. {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/layers/quantization/__init__.py +0 -0
  91. {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/layers/quantization/fp8.py +0 -0
  92. {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/managers/controller_multi.py +0 -0
  93. {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/managers/controller_single.py +0 -0
  94. {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/managers/detokenizer_manager.py +0 -0
  95. {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/managers/policy_scheduler.py +0 -0
  96. {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/mem_cache/base_cache.py +0 -0
  97. {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/mem_cache/chunk_cache.py +0 -0
  98. {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/mem_cache/flush_cache.py +0 -0
  99. {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/mem_cache/radix_cache.py +0 -0
  100. {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/mm_utils.py +0 -0
  101. {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/model_loader/model_loader.py +0 -0
  102. {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/model_loader/utils.py +0 -0
  103. {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/models/mistral.py +0 -0
  104. {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/models/yivl.py +0 -0
  105. {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/srt/sampling_params.py +0 -0
  106. {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/test/simple_eval_mmlu.py +0 -0
  107. {sglang-0.2.9.post1 → sglang-0.2.11}/sglang/test/test_programs.py +0 -0
  108. {sglang-0.2.9.post1 → sglang-0.2.11}/sglang.egg-info/dependency_links.txt +0 -0
  109. {sglang-0.2.9.post1 → sglang-0.2.11}/sglang.egg-info/top_level.txt +0 -0
{sglang-0.2.9.post1/sglang.egg-info → sglang-0.2.11}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: sglang
- Version: 0.2.9.post1
+ Version: 0.2.11
  Summary: SGLang is yet another fast serving framework for large language models and vision language models.
  License: Apache License
  Version 2.0, January 2004
@@ -224,13 +224,13 @@ Requires-Dist: packaging; extra == "srt"
  Requires-Dist: pillow; extra == "srt"
  Requires-Dist: psutil; extra == "srt"
  Requires-Dist: pydantic; extra == "srt"
+ Requires-Dist: python-multipart; extra == "srt"
  Requires-Dist: torch; extra == "srt"
  Requires-Dist: uvicorn; extra == "srt"
  Requires-Dist: uvloop; extra == "srt"
  Requires-Dist: zmq; extra == "srt"
- Requires-Dist: vllm==0.5.3.post1; extra == "srt"
+ Requires-Dist: vllm==0.5.4; extra == "srt"
  Requires-Dist: outlines>=0.0.44; extra == "srt"
- Requires-Dist: python-multipart; extra == "srt"
  Provides-Extra: openai
  Requires-Dist: openai>=1.0; extra == "openai"
  Requires-Dist: tiktoken; extra == "openai"
@@ -238,11 +238,18 @@ Provides-Extra: anthropic
  Requires-Dist: anthropic>=0.20.0; extra == "anthropic"
  Provides-Extra: litellm
  Requires-Dist: litellm>=1.0.0; extra == "litellm"
+ Provides-Extra: test
+ Requires-Dist: jsonlines; extra == "test"
+ Requires-Dist: matplotlib; extra == "test"
+ Requires-Dist: pandas; extra == "test"
  Provides-Extra: all
  Requires-Dist: sglang[srt]; extra == "all"
  Requires-Dist: sglang[openai]; extra == "all"
  Requires-Dist: sglang[anthropic]; extra == "all"
  Requires-Dist: sglang[litellm]; extra == "all"
+ Provides-Extra: dev
+ Requires-Dist: sglang[all]; extra == "dev"
+ Requires-Dist: sglang[test]; extra == "dev"

  <div align="center">
  <img src="https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png" alt="logo" width="400"></img>
@@ -295,20 +302,20 @@ pip install --upgrade pip
  pip install "sglang[all]"

  # Install FlashInfer CUDA kernels
- pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
+ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
  ```

  ### Method 2: From source
  ```
- # Use the stable v0.2.9.post1 branch
- git clone -b v0.2.9.post1 https://github.com/sgl-project/sglang.git
+ # Use the last release branch
+ git clone -b v0.2.11 https://github.com/sgl-project/sglang.git
  cd sglang

  pip install --upgrade pip
  pip install -e "python[all]"

  # Install FlashInfer CUDA kernels
- pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
+ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
  ```

  ### Method 3: Using docker
@@ -382,7 +389,7 @@ response = client.chat.completions.create(
  print(response)
  ```

- It supports streaming, vision, and most features of the Chat/Completions/Models endpoints specified by the [OpenAI API Reference](https://platform.openai.com/docs/api-reference/).
+ It supports streaming, vision, and most features of the Chat/Completions/Models/Batch endpoints specified by the [OpenAI API Reference](https://platform.openai.com/docs/api-reference/).

  ### Additional Server Arguments
  - Add `--tp 2` to enable tensor parallelism. If it indicates `peer access is not supported between these two devices`, add `--enable-p2p-check` option.
@@ -393,10 +400,14 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
  ```
  python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --dp 2 --tp 2
  ```
- - If you see out-of-memory errors during serving, please try to reduce the memory usage of the KV cache pool by setting a smaller value of `--mem-fraction-static`. The default value is `0.9`
+ - If you see out-of-memory errors during serving, please try to reduce the memory usage of the KV cache pool by setting a smaller value of `--mem-fraction-static`. The default value is `0.9`.
  ```
  python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --mem-fraction-static 0.7
  ```
+ - If you see out-of-memory errors during prefill for long prompts on a model that supports long context, consider using chunked prefill.
+ ```
+ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --port 30000 --chunked-prefill-size 8192
+ ```
  - See [hyperparameter_tuning.md](docs/en/hyperparameter_tuning.md) on tuning hyperparameters for better performance.
  - Add `--nnodes 2` to run tensor parallelism on multiple nodes. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-0` be the hostname of the first node and `50000` be an available port.
  ```
@@ -410,22 +421,6 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
  - To enable fp8 quantization, you can add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
  - To enable experimental torch.compile support, you can add `--enable-torch-compile`. It accelerates small models on small batch sizes.

- ### Run Llama 3.1 405B
-
- ```bash
- ## Run 405B (fp8) on a single node
- python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct-FP8 --tp 8
-
- ## Run 405B (fp16) on two nodes
- # replace the `172.16.4.52:20000` with your own first node ip address and port, disable CUDA Graph temporarily
-
- # on the first node
- GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 0 --disable-cuda-graph --mem-frac 0.75
-
- # on the second
- GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 1 --disable-cuda-graph --mem-frac 0.75
- ```
-
  ### Supported Models

  - Llama / Llama 2 / Llama 3 / Llama 3.1
@@ -451,9 +446,25 @@ GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/

  Instructions for supporting a new model are [here](https://github.com/sgl-project/sglang/blob/main/docs/en/model_support.md).

+ ### Run Llama 3.1 405B
+
+ ```bash
+ ## Run 405B (fp8) on a single node
+ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct-FP8 --tp 8
+
+ ## Run 405B (fp16) on two nodes
+ # replace the `172.16.4.52:20000` with your own first node ip address and port, disable CUDA Graph temporarily
+
+ # on the first node
+ GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 0 --disable-cuda-graph --mem-frac 0.75
+
+ # on the second
+ GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 1 --disable-cuda-graph --mem-frac 0.75
+ ```
+
  ### Benchmark Performance

- - Benchmark a single static batch by running the following command without launching a server. The arguments are the same as those for `launch_server.py`. This is not a dynamic batching server, so it may run out of memory for a batch size that can run successfully with a real server. This is because a real server will truncate the prefill into several batches/chunks, while this unit test does not do this.
+ - Benchmark a single static batch by running the following command without launching a server. The arguments are the same as for `launch_server.py`. Note that this is not a dynamic batching server, so it may run out of memory for a batch size that a real server can handle. A real server truncates the prefill into several batches, while this unit test does not. For accurate large batch testing, consider using `sglang.bench_serving`.
  ```
  python -m sglang.bench_latency --model-path meta-llama/Meta-Llama-3-8B-Instruct --batch 32 --input-len 256 --output-len 32
  ```
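
The README content carried in PKG-INFO (and mirrored in README.md below) documents SGLang's OpenAI-compatible server. For orientation, a minimal, hedged sketch of calling its Chat Completions endpoint with the official `openai` client is shown here; the port, `model="default"`, and the prompt are assumptions taken from the surrounding README, not part of this diff.

```python
# Hedged sketch: query a locally running sglang server through its
# OpenAI-compatible Chat Completions endpoint.
# Assumes the server was started with something like:
#   python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000
import openai

client = openai.Client(base_url="http://127.0.0.1:30000/v1", api_key="EMPTY")

response = client.chat.completions.create(
    model="default",  # the name used in sglang's own README examples; an assumption here
    messages=[{"role": "user", "content": "List 3 countries and their capitals."}],
    temperature=0,
    max_tokens=64,
)
print(response.choices[0].message.content)
```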
{sglang-0.2.9.post1 → sglang-0.2.11}/README.md
@@ -49,20 +49,20 @@ pip install --upgrade pip
  pip install "sglang[all]"

  # Install FlashInfer CUDA kernels
- pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
+ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
  ```

  ### Method 2: From source
  ```
- # Use the stable v0.2.9.post1 branch
- git clone -b v0.2.9.post1 https://github.com/sgl-project/sglang.git
+ # Use the last release branch
+ git clone -b v0.2.11 https://github.com/sgl-project/sglang.git
  cd sglang

  pip install --upgrade pip
  pip install -e "python[all]"

  # Install FlashInfer CUDA kernels
- pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
+ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
  ```

  ### Method 3: Using docker
@@ -136,7 +136,7 @@ response = client.chat.completions.create(
  print(response)
  ```

- It supports streaming, vision, and most features of the Chat/Completions/Models endpoints specified by the [OpenAI API Reference](https://platform.openai.com/docs/api-reference/).
+ It supports streaming, vision, and most features of the Chat/Completions/Models/Batch endpoints specified by the [OpenAI API Reference](https://platform.openai.com/docs/api-reference/).

  ### Additional Server Arguments
  - Add `--tp 2` to enable tensor parallelism. If it indicates `peer access is not supported between these two devices`, add `--enable-p2p-check` option.
@@ -147,10 +147,14 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
  ```
  python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --dp 2 --tp 2
  ```
- - If you see out-of-memory errors during serving, please try to reduce the memory usage of the KV cache pool by setting a smaller value of `--mem-fraction-static`. The default value is `0.9`
+ - If you see out-of-memory errors during serving, please try to reduce the memory usage of the KV cache pool by setting a smaller value of `--mem-fraction-static`. The default value is `0.9`.
  ```
  python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --mem-fraction-static 0.7
  ```
+ - If you see out-of-memory errors during prefill for long prompts on a model that supports long context, consider using chunked prefill.
+ ```
+ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --port 30000 --chunked-prefill-size 8192
+ ```
  - See [hyperparameter_tuning.md](docs/en/hyperparameter_tuning.md) on tuning hyperparameters for better performance.
  - Add `--nnodes 2` to run tensor parallelism on multiple nodes. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-0` be the hostname of the first node and `50000` be an available port.
  ```
@@ -164,22 +168,6 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
  - To enable fp8 quantization, you can add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
  - To enable experimental torch.compile support, you can add `--enable-torch-compile`. It accelerates small models on small batch sizes.

- ### Run Llama 3.1 405B
-
- ```bash
- ## Run 405B (fp8) on a single node
- python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct-FP8 --tp 8
-
- ## Run 405B (fp16) on two nodes
- # replace the `172.16.4.52:20000` with your own first node ip address and port, disable CUDA Graph temporarily
-
- # on the first node
- GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 0 --disable-cuda-graph --mem-frac 0.75
-
- # on the second
- GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 1 --disable-cuda-graph --mem-frac 0.75
- ```
-
  ### Supported Models

  - Llama / Llama 2 / Llama 3 / Llama 3.1
@@ -205,9 +193,25 @@ GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/

  Instructions for supporting a new model are [here](https://github.com/sgl-project/sglang/blob/main/docs/en/model_support.md).

+ ### Run Llama 3.1 405B
+
+ ```bash
+ ## Run 405B (fp8) on a single node
+ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct-FP8 --tp 8
+
+ ## Run 405B (fp16) on two nodes
+ # replace the `172.16.4.52:20000` with your own first node ip address and port, disable CUDA Graph temporarily
+
+ # on the first node
+ GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 0 --disable-cuda-graph --mem-frac 0.75
+
+ # on the second
+ GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 1 --disable-cuda-graph --mem-frac 0.75
+ ```
+
  ### Benchmark Performance

- - Benchmark a single static batch by running the following command without launching a server. The arguments are the same as those for `launch_server.py`. This is not a dynamic batching server, so it may run out of memory for a batch size that can run successfully with a real server. This is because a real server will truncate the prefill into several batches/chunks, while this unit test does not do this.
+ - Benchmark a single static batch by running the following command without launching a server. The arguments are the same as for `launch_server.py`. Note that this is not a dynamic batching server, so it may run out of memory for a batch size that a real server can handle. A real server truncates the prefill into several batches, while this unit test does not. For accurate large batch testing, consider using `sglang.bench_serving`.
  ```
  python -m sglang.bench_latency --model-path meta-llama/Meta-Llama-3-8B-Instruct --batch 32 --input-len 256 --output-len 32
  ```
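
The changed README line above now lists the Batch endpoint alongside Chat/Completions/Models. A hedged sketch of driving such a batch job through the standard OpenAI client follows; the file name `requests.jsonl`, the local port, and the exact extent of Batch API support in sglang 0.2.11 are assumptions here, not facts taken from this diff.

```python
# Hedged sketch: submit an OpenAI-style batch of chat completion requests to a
# local sglang server. Assumes requests.jsonl already contains one request
# object per line in the OpenAI batch input format.
import openai

client = openai.Client(base_url="http://127.0.0.1:30000/v1", api_key="EMPTY")

# Upload the JSONL file containing the batched requests.
batch_file = client.files.create(file=open("requests.jsonl", "rb"), purpose="batch")

# Create the batch job against the chat completions endpoint.
batch = client.batches.create(
    input_file_id=batch_file.id,
    endpoint="/v1/chat/completions",
    completion_window="24h",
)

# Later, poll the job and check its status.
batch = client.batches.retrieve(batch.id)
print(batch.status)
```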
{sglang-0.2.9.post1 → sglang-0.2.11}/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

  [project]
  name = "sglang"
- version = "0.2.9.post1"
+ version = "0.2.11"
  description = "SGLang is yet another fast serving framework for large language models and vision language models."
  readme = "README.md"
  requires-python = ">=3.8"
@@ -20,12 +20,16 @@ dependencies = [
  ]

  [project.optional-dependencies]
- srt = ["aiohttp", "fastapi", "hf_transfer", "huggingface_hub", "interegular", "packaging", "pillow",
- "psutil", "pydantic", "torch", "uvicorn", "uvloop", "zmq", "vllm==0.5.3.post1", "outlines>=0.0.44", "python-multipart"]
+ srt = ["aiohttp", "fastapi", "hf_transfer", "huggingface_hub", "interegular",
+ "packaging", "pillow", "psutil", "pydantic", "python-multipart",
+ "torch", "uvicorn", "uvloop", "zmq",
+ "vllm==0.5.4", "outlines>=0.0.44"]
  openai = ["openai>=1.0", "tiktoken"]
  anthropic = ["anthropic>=0.20.0"]
  litellm = ["litellm>=1.0.0"]
+ test = ["jsonlines", "matplotlib", "pandas"]
  all = ["sglang[srt]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
+ dev = ["sglang[all]", "sglang[test]"]

  [project.urls]
  "Homepage" = "https://github.com/sgl-project/sglang"
{sglang-0.2.9.post1 → sglang-0.2.11}/sglang/__init__.py
@@ -22,6 +22,11 @@ from sglang.api import (
  user_end,
  video,
  )
+ from sglang.lang.choices import (
+ greedy_token_selection,
+ token_length_normalized,
+ unconditional_likelihood_normalized,
+ )

  # SGLang DSL APIs
  __all__ = [
@@ -45,6 +50,9 @@ __all__ = [
  "user_begin",
  "user_end",
  "video",
+ "greedy_token_selection",
+ "token_length_normalized",
+ "unconditional_likelihood_normalized",
  ]

  # Global Configurations
{sglang-0.2.9.post1 → sglang-0.2.11}/sglang/api.py
@@ -6,6 +6,7 @@ from typing import Callable, List, Optional, Union

  from sglang.global_config import global_config
  from sglang.lang.backend.base_backend import BaseBackend
+ from sglang.lang.choices import ChoicesSamplingMethod, token_length_normalized
  from sglang.lang.ir import (
  SglExpr,
  SglExprList,
@@ -73,12 +74,18 @@ def gen(
  return_text_in_logprobs: Optional[bool] = None,
  dtype: Optional[type] = None,
  choices: Optional[List[str]] = None,
+ choices_method: Optional[ChoicesSamplingMethod] = None,
  regex: Optional[str] = None,
  ):
  """Call the model to generate. See the meaning of the arguments in docs/en/sampling_params.md"""

  if choices:
- return SglSelect(name, choices, 0.0 if temperature is None else temperature)
+ return SglSelect(
+ name,
+ choices,
+ 0.0 if temperature is None else temperature,
+ token_length_normalized if choices_method is None else choices_method,
+ )

  # check regex is valid
  if regex is not None:
@@ -186,9 +193,10 @@ def select(
  name: Optional[str] = None,
  choices: Optional[List[str]] = None,
  temperature: float = 0.0,
+ choices_method: ChoicesSamplingMethod = token_length_normalized,
  ):
  assert choices is not None
- return SglSelect(name, choices, temperature)
+ return SglSelect(name, choices, temperature, choices_method)


  def _role_common(name: str, expr: Optional[SglExpr] = None):
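
The api.py hunks above thread a new `choices_method` argument through `gen` and `select` (defaulting to `token_length_normalized`), and the `__init__.py` change exports the three sampling methods from `sglang.lang.choices`. A minimal, hedged usage sketch follows; the running endpoint on port 30000 and the example prompt are assumptions, and the behavioral details of each method live in the new `sglang/lang/choices.py`, which this diff does not show.

```python
# Hedged sketch: pick among fixed choices using the new choices_method argument.
import sglang as sgl
from sglang import greedy_token_selection  # exported by the __init__.py change above

@sgl.function
def sentiment(s, review):
    s += "Review: " + review + "\n"
    s += "Sentiment: " + sgl.gen(
        "label",
        choices=["positive", "negative", "neutral"],
        # New in this release; omitting it keeps the previous token_length_normalized behavior.
        choices_method=greedy_token_selection,
    )

# Assumes a server is already running, e.g. via sglang.launch_server on port 30000.
sgl.set_default_backend(sgl.RuntimeEndpoint("http://127.0.0.1:30000"))
state = sentiment.run(review="The battery life is excellent.")
print(state["label"])
```

The same argument is accepted by `sgl.select`, per the `select` hunk above.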