sglang 0.2.10__tar.gz → 0.2.12__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (123)
  1. {sglang-0.2.10/sglang.egg-info → sglang-0.2.12}/PKG-INFO +50 -31
  2. {sglang-0.2.10 → sglang-0.2.12}/README.md +41 -28
  3. {sglang-0.2.10 → sglang-0.2.12}/pyproject.toml +5 -3
  4. {sglang-0.2.10 → sglang-0.2.12}/sglang/__init__.py +8 -0
  5. {sglang-0.2.10 → sglang-0.2.12}/sglang/api.py +10 -2
  6. {sglang-0.2.10 → sglang-0.2.12}/sglang/bench_latency.py +151 -40
  7. {sglang-0.2.10 → sglang-0.2.12}/sglang/bench_serving.py +46 -22
  8. {sglang-0.2.10 → sglang-0.2.12}/sglang/check_env.py +24 -2
  9. {sglang-0.2.10 → sglang-0.2.12}/sglang/global_config.py +0 -1
  10. {sglang-0.2.10 → sglang-0.2.12}/sglang/lang/backend/base_backend.py +3 -1
  11. {sglang-0.2.10 → sglang-0.2.12}/sglang/lang/backend/openai.py +8 -3
  12. {sglang-0.2.10 → sglang-0.2.12}/sglang/lang/backend/runtime_endpoint.py +46 -29
  13. sglang-0.2.12/sglang/lang/choices.py +164 -0
  14. {sglang-0.2.10 → sglang-0.2.12}/sglang/lang/compiler.py +2 -2
  15. {sglang-0.2.10 → sglang-0.2.12}/sglang/lang/interpreter.py +6 -13
  16. {sglang-0.2.10 → sglang-0.2.12}/sglang/lang/ir.py +14 -5
  17. {sglang-0.2.10 → sglang-0.2.12}/sglang/srt/constrained/base_tool_cache.py +1 -1
  18. {sglang-0.2.10 → sglang-0.2.12}/sglang/srt/constrained/fsm_cache.py +12 -2
  19. sglang-0.2.12/sglang/srt/layers/activation.py +33 -0
  20. sglang-0.2.10/sglang/srt/layers/token_attention.py → sglang-0.2.12/sglang/srt/layers/decode_attention.py +9 -5
  21. {sglang-0.2.10 → sglang-0.2.12}/sglang/srt/layers/extend_attention.py +6 -1
  22. sglang-0.2.12/sglang/srt/layers/layernorm.py +65 -0
  23. {sglang-0.2.10 → sglang-0.2.12}/sglang/srt/layers/logits_processor.py +6 -1
  24. sglang-0.2.12/sglang/srt/layers/pooler.py +50 -0
  25. sglang-0.2.10/sglang/srt/layers/context_flashattention_nopad.py → sglang-0.2.12/sglang/srt/layers/prefill_attention.py +5 -0
  26. {sglang-0.2.10 → sglang-0.2.12}/sglang/srt/layers/radix_attention.py +4 -7
  27. {sglang-0.2.10 → sglang-0.2.12}/sglang/srt/managers/detokenizer_manager.py +31 -9
  28. {sglang-0.2.10 → sglang-0.2.12}/sglang/srt/managers/io_struct.py +63 -0
  29. sglang-0.2.12/sglang/srt/managers/policy_scheduler.py +233 -0
  30. {sglang-0.2.10 → sglang-0.2.12}/sglang/srt/managers/schedule_batch.py +174 -380
  31. {sglang-0.2.10 → sglang-0.2.12}/sglang/srt/managers/tokenizer_manager.py +197 -112
  32. {sglang-0.2.10 → sglang-0.2.12}/sglang/srt/managers/tp_worker.py +299 -364
  33. sglang-0.2.10/sglang/srt/mem_cache/base_cache.py → sglang-0.2.12/sglang/srt/mem_cache/base_prefix_cache.py +9 -4
  34. sglang-0.2.12/sglang/srt/mem_cache/chunk_cache.py +83 -0
  35. {sglang-0.2.10 → sglang-0.2.12}/sglang/srt/mem_cache/memory_pool.py +10 -15
  36. {sglang-0.2.10 → sglang-0.2.12}/sglang/srt/mem_cache/radix_cache.py +74 -40
  37. {sglang-0.2.10 → sglang-0.2.12}/sglang/srt/model_executor/cuda_graph_runner.py +27 -12
  38. sglang-0.2.12/sglang/srt/model_executor/forward_batch_info.py +319 -0
  39. {sglang-0.2.10 → sglang-0.2.12}/sglang/srt/model_executor/model_runner.py +30 -47
  40. {sglang-0.2.10 → sglang-0.2.12}/sglang/srt/models/chatglm.py +1 -1
  41. {sglang-0.2.10 → sglang-0.2.12}/sglang/srt/models/commandr.py +1 -1
  42. {sglang-0.2.10 → sglang-0.2.12}/sglang/srt/models/dbrx.py +1 -1
  43. {sglang-0.2.10 → sglang-0.2.12}/sglang/srt/models/deepseek.py +1 -1
  44. {sglang-0.2.10 → sglang-0.2.12}/sglang/srt/models/deepseek_v2.py +1 -1
  45. {sglang-0.2.10 → sglang-0.2.12}/sglang/srt/models/gemma.py +1 -1
  46. {sglang-0.2.10 → sglang-0.2.12}/sglang/srt/models/gemma2.py +1 -2
  47. {sglang-0.2.10 → sglang-0.2.12}/sglang/srt/models/gpt_bigcode.py +1 -1
  48. {sglang-0.2.10 → sglang-0.2.12}/sglang/srt/models/grok.py +1 -1
  49. {sglang-0.2.10 → sglang-0.2.12}/sglang/srt/models/internlm2.py +3 -8
  50. {sglang-0.2.10 → sglang-0.2.12}/sglang/srt/models/llama2.py +5 -5
  51. {sglang-0.2.10 → sglang-0.2.12}/sglang/srt/models/llama_classification.py +1 -1
  52. sglang-0.2.12/sglang/srt/models/llama_embedding.py +88 -0
  53. {sglang-0.2.10 → sglang-0.2.12}/sglang/srt/models/llava.py +1 -2
  54. {sglang-0.2.10 → sglang-0.2.12}/sglang/srt/models/llavavid.py +1 -2
  55. {sglang-0.2.10 → sglang-0.2.12}/sglang/srt/models/minicpm.py +1 -1
  56. {sglang-0.2.10 → sglang-0.2.12}/sglang/srt/models/mixtral.py +1 -1
  57. {sglang-0.2.10 → sglang-0.2.12}/sglang/srt/models/mixtral_quant.py +1 -1
  58. {sglang-0.2.10 → sglang-0.2.12}/sglang/srt/models/qwen.py +1 -1
  59. {sglang-0.2.10 → sglang-0.2.12}/sglang/srt/models/qwen2.py +1 -1
  60. {sglang-0.2.10 → sglang-0.2.12}/sglang/srt/models/qwen2_moe.py +1 -12
  61. {sglang-0.2.10 → sglang-0.2.12}/sglang/srt/models/stablelm.py +1 -1
  62. {sglang-0.2.10 → sglang-0.2.12}/sglang/srt/openai_api/adapter.py +189 -39
  63. {sglang-0.2.10 → sglang-0.2.12}/sglang/srt/openai_api/protocol.py +43 -1
  64. sglang-0.2.12/sglang/srt/sampling/penaltylib/__init__.py +13 -0
  65. sglang-0.2.12/sglang/srt/sampling/penaltylib/orchestrator.py +357 -0
  66. sglang-0.2.12/sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +80 -0
  67. sglang-0.2.12/sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +105 -0
  68. sglang-0.2.12/sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +79 -0
  69. sglang-0.2.12/sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +83 -0
  70. {sglang-0.2.10 → sglang-0.2.12}/sglang/srt/sampling_params.py +31 -4
  71. {sglang-0.2.10 → sglang-0.2.12}/sglang/srt/server.py +93 -21
  72. {sglang-0.2.10 → sglang-0.2.12}/sglang/srt/server_args.py +30 -19
  73. {sglang-0.2.10 → sglang-0.2.12}/sglang/srt/utils.py +31 -13
  74. {sglang-0.2.10 → sglang-0.2.12}/sglang/test/run_eval.py +10 -1
  75. {sglang-0.2.10 → sglang-0.2.12}/sglang/test/runners.py +63 -63
  76. {sglang-0.2.10 → sglang-0.2.12}/sglang/test/simple_eval_humaneval.py +2 -8
  77. sglang-0.2.12/sglang/test/simple_eval_mgsm.py +203 -0
  78. sglang-0.2.12/sglang/test/srt/sampling/penaltylib/utils.py +337 -0
  79. sglang-0.2.12/sglang/test/test_layernorm.py +60 -0
  80. {sglang-0.2.10 → sglang-0.2.12}/sglang/test/test_programs.py +4 -2
  81. {sglang-0.2.10 → sglang-0.2.12}/sglang/test/test_utils.py +21 -3
  82. {sglang-0.2.10 → sglang-0.2.12}/sglang/utils.py +0 -1
  83. sglang-0.2.12/sglang/version.py +1 -0
  84. {sglang-0.2.10 → sglang-0.2.12/sglang.egg-info}/PKG-INFO +50 -31
  85. {sglang-0.2.10 → sglang-0.2.12}/sglang.egg-info/SOURCES.txt +19 -7
  86. {sglang-0.2.10 → sglang-0.2.12}/sglang.egg-info/requires.txt +10 -2
  87. sglang-0.2.10/sglang/srt/layers/linear.py +0 -884
  88. sglang-0.2.10/sglang/srt/layers/quantization/__init__.py +0 -64
  89. sglang-0.2.10/sglang/srt/layers/quantization/fp8.py +0 -677
  90. sglang-0.2.10/sglang/srt/managers/policy_scheduler.py +0 -85
  91. sglang-0.2.10/sglang/srt/mem_cache/chunk_cache.py +0 -60
  92. sglang-0.2.10/sglang/version.py +0 -1
  93. {sglang-0.2.10 → sglang-0.2.12}/LICENSE +0 -0
  94. {sglang-0.2.10 → sglang-0.2.12}/setup.cfg +0 -0
  95. {sglang-0.2.10 → sglang-0.2.12}/sglang/lang/__init__.py +0 -0
  96. {sglang-0.2.10 → sglang-0.2.12}/sglang/lang/backend/__init__.py +0 -0
  97. {sglang-0.2.10 → sglang-0.2.12}/sglang/lang/backend/anthropic.py +0 -0
  98. {sglang-0.2.10 → sglang-0.2.12}/sglang/lang/backend/litellm.py +0 -0
  99. {sglang-0.2.10 → sglang-0.2.12}/sglang/lang/backend/vertexai.py +0 -0
  100. {sglang-0.2.10 → sglang-0.2.12}/sglang/lang/chat_template.py +0 -0
  101. {sglang-0.2.10 → sglang-0.2.12}/sglang/lang/tracer.py +0 -0
  102. {sglang-0.2.10 → sglang-0.2.12}/sglang/launch_server.py +0 -0
  103. {sglang-0.2.10 → sglang-0.2.12}/sglang/launch_server_llavavid.py +0 -0
  104. {sglang-0.2.10 → sglang-0.2.12}/sglang/srt/constrained/__init__.py +0 -0
  105. {sglang-0.2.10 → sglang-0.2.12}/sglang/srt/constrained/jump_forward.py +0 -0
  106. {sglang-0.2.10 → sglang-0.2.12}/sglang/srt/conversation.py +0 -0
  107. {sglang-0.2.10 → sglang-0.2.12}/sglang/srt/hf_transformers_utils.py +0 -0
  108. {sglang-0.2.10 → sglang-0.2.12}/sglang/srt/layers/fused_moe.py +0 -0
  109. {sglang-0.2.10 → sglang-0.2.12}/sglang/srt/managers/controller_multi.py +0 -0
  110. {sglang-0.2.10 → sglang-0.2.12}/sglang/srt/managers/controller_single.py +0 -0
  111. {sglang-0.2.10 → sglang-0.2.12}/sglang/srt/mem_cache/flush_cache.py +0 -0
  112. {sglang-0.2.10 → sglang-0.2.12}/sglang/srt/mm_utils.py +0 -0
  113. {sglang-0.2.10 → sglang-0.2.12}/sglang/srt/model_config.py +0 -0
  114. {sglang-0.2.10 → sglang-0.2.12}/sglang/srt/model_loader/model_loader.py +0 -0
  115. {sglang-0.2.10 → sglang-0.2.12}/sglang/srt/model_loader/utils.py +0 -0
  116. {sglang-0.2.10 → sglang-0.2.12}/sglang/srt/models/mistral.py +0 -0
  117. {sglang-0.2.10 → sglang-0.2.12}/sglang/srt/models/yivl.py +0 -0
  118. {sglang-0.2.10 → sglang-0.2.12}/sglang/test/simple_eval_common.py +0 -0
  119. {sglang-0.2.10 → sglang-0.2.12}/sglang/test/simple_eval_gpqa.py +0 -0
  120. {sglang-0.2.10 → sglang-0.2.12}/sglang/test/simple_eval_math.py +0 -0
  121. {sglang-0.2.10 → sglang-0.2.12}/sglang/test/simple_eval_mmlu.py +0 -0
  122. {sglang-0.2.10 → sglang-0.2.12}/sglang.egg-info/dependency_links.txt +0 -0
  123. {sglang-0.2.10 → sglang-0.2.12}/sglang.egg-info/top_level.txt +0 -0
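Notable additions in the listing above are `sglang/lang/choices.py` and the `sglang/srt/sampling/penaltylib/` penalizers (frequency penalty, presence penalty, repetition penalty, min new tokens), alongside the expanded `sampling_params.py` and OpenAI adapter. The sketch below is an illustrative way to exercise the standard penalty parameters through the OpenAI-compatible server; it assumes a server launched locally on port 30000, and whether each field is honored in 0.2.12 is inferred from the new modules rather than confirmed here.

```python
# Illustrative sketch (not part of the diff): call the OpenAI-compatible
# Chat Completions endpoint with penalty parameters that the new penaltylib
# modules appear to implement. Assumes a local server, e.g.:
#   python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000
import openai

client = openai.Client(base_url="http://127.0.0.1:30000/v1", api_key="EMPTY")

response = client.chat.completions.create(
    model="default",
    messages=[{"role": "user", "content": "List three uses for a paperclip."}],
    temperature=0.7,
    max_tokens=128,
    frequency_penalty=0.5,  # assumed to map to penalizers/frequency_penalty.py
    presence_penalty=0.3,   # assumed to map to penalizers/presence_penalty.py
)
print(response.choices[0].message.content)
```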
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: sglang
- Version: 0.2.10
+ Version: 0.2.12
  Summary: SGLang is yet another fast serving framework for large language models and vision language models.
  License: Apache License
  Version 2.0, January 2004
@@ -220,7 +220,6 @@ Requires-Dist: fastapi; extra == "srt"
  Requires-Dist: hf_transfer; extra == "srt"
  Requires-Dist: huggingface_hub; extra == "srt"
  Requires-Dist: interegular; extra == "srt"
- Requires-Dist: jsonlines; extra == "srt"
  Requires-Dist: packaging; extra == "srt"
  Requires-Dist: pillow; extra == "srt"
  Requires-Dist: psutil; extra == "srt"
@@ -230,7 +229,7 @@ Requires-Dist: torch; extra == "srt"
  Requires-Dist: uvicorn; extra == "srt"
  Requires-Dist: uvloop; extra == "srt"
  Requires-Dist: zmq; extra == "srt"
- Requires-Dist: vllm==0.5.3.post1; extra == "srt"
+ Requires-Dist: vllm==0.5.4; extra == "srt"
  Requires-Dist: outlines>=0.0.44; extra == "srt"
  Provides-Extra: openai
  Requires-Dist: openai>=1.0; extra == "openai"
@@ -239,11 +238,18 @@ Provides-Extra: anthropic
  Requires-Dist: anthropic>=0.20.0; extra == "anthropic"
  Provides-Extra: litellm
  Requires-Dist: litellm>=1.0.0; extra == "litellm"
+ Provides-Extra: test
+ Requires-Dist: jsonlines; extra == "test"
+ Requires-Dist: matplotlib; extra == "test"
+ Requires-Dist: pandas; extra == "test"
  Provides-Extra: all
  Requires-Dist: sglang[srt]; extra == "all"
  Requires-Dist: sglang[openai]; extra == "all"
  Requires-Dist: sglang[anthropic]; extra == "all"
  Requires-Dist: sglang[litellm]; extra == "all"
+ Provides-Extra: dev
+ Requires-Dist: sglang[all]; extra == "dev"
+ Requires-Dist: sglang[test]; extra == "dev"

  <div align="center">
  <img src="https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png" alt="logo" width="400"></img>
@@ -296,20 +302,20 @@ pip install --upgrade pip
  pip install "sglang[all]"

  # Install FlashInfer CUDA kernels
- pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
+ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
  ```

  ### Method 2: From source
  ```
  # Use the last release branch
- git clone -b v0.2.10 https://github.com/sgl-project/sglang.git
+ git clone -b v0.2.12 https://github.com/sgl-project/sglang.git
  cd sglang

  pip install --upgrade pip
  pip install -e "python[all]"

  # Install FlashInfer CUDA kernels
- pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
+ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
  ```

  ### Method 3: Using docker
@@ -383,22 +389,26 @@ response = client.chat.completions.create(
  print(response)
  ```

- It supports streaming, vision, and most features of the Chat/Completions/Models endpoints specified by the [OpenAI API Reference](https://platform.openai.com/docs/api-reference/).
+ It supports streaming, vision, and most features of the Chat/Completions/Models/Batch endpoints specified by the [OpenAI API Reference](https://platform.openai.com/docs/api-reference/).

  ### Additional Server Arguments
- - Add `--tp 2` to enable tensor parallelism. If it indicates `peer access is not supported between these two devices`, add `--enable-p2p-check` option.
+ - Add `--tp 2` to enable multi-GPU tensor parallelism. If it reports the error "peer access is not supported between these two devices", add `--enable-p2p-check` to the server launch command.
  ```
  python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --tp 2
  ```
- - Add `--dp 2` to enable data parallelism. It can also be used together with tp. Data parallelism is better for throughput if there is enough memory.
+ - Add `--dp 2` to enable multi-GPU data parallelism. It can also be used together with tensor parallelism. Data parallelism is better for throughput if there is enough memory.
  ```
  python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --dp 2 --tp 2
  ```
- - If you see out-of-memory errors during serving, please try to reduce the memory usage of the KV cache pool by setting a smaller value of `--mem-fraction-static`. The default value is `0.9`
+ - If you see out-of-memory errors during serving, try to reduce the memory usage of the KV cache pool by setting a smaller value of `--mem-fraction-static`. The default value is `0.9`.
  ```
  python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --mem-fraction-static 0.7
  ```
  - See [hyperparameter_tuning.md](docs/en/hyperparameter_tuning.md) on tuning hyperparameters for better performance.
+ - If you see out-of-memory errors during prefill for long prompts, try to set a smaller chunked prefill size.
+ ```
+ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --chunked-prefill-size 4096
+ ```
  - Add `--nnodes 2` to run tensor parallelism on multiple nodes. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-0` be the hostname of the first node and `50000` be an available port.
  ```
  # Node 0
@@ -408,29 +418,13 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
  python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --tp 4 --nccl-init sgl-dev-0:50000 --nnodes 2 --node-rank 1
  ```
  - If the model does not have a template in the Hugging Face tokenizer, you can specify a [custom chat template](docs/en/custom_chat_template.md).
- - To enable fp8 quantization, you can add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
  - To enable experimental torch.compile support, you can add `--enable-torch-compile`. It accelerates small models on small batch sizes.
-
- ### Run Llama 3.1 405B
-
- ```bash
- ## Run 405B (fp8) on a single node
- python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct-FP8 --tp 8
-
- ## Run 405B (fp16) on two nodes
- # replace the `172.16.4.52:20000` with your own first node ip address and port, disable CUDA Graph temporarily
-
- # on the first node
- GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 0 --disable-cuda-graph --mem-frac 0.75
-
- # on the second
- GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 1 --disable-cuda-graph --mem-frac 0.75
- ```
-
+ - To enable fp8 quantization, you can add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
+

  ### Supported Models

  - Llama / Llama 2 / Llama 3 / Llama 3.1
- - Mistral / Mixtral
+ - Mistral / Mixtral / Mistral NeMo
  - Gemma / Gemma 2
  - Qwen / Qwen 2 / Qwen 2 MoE
  - DeepSeek / DeepSeek 2
@@ -448,10 +442,35 @@ GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/
  - Grok
  - ChatGLM
  - InternLM 2
- - Mistral NeMo

  Instructions for supporting a new model are [here](https://github.com/sgl-project/sglang/blob/main/docs/en/model_support.md).

+ #### Use Models From ModelScope
+ To use a model from [ModelScope](https://www.modelscope.cn), set the environment variable SGLANG_USE_MODELSCOPE.
+ ```
+ export SGLANG_USE_MODELSCOPE=true
+ ```
+ Launch the [Qwen2-7B-Instruct](https://www.modelscope.cn/models/qwen/qwen2-7b-instruct) server:
+ ```
+ SGLANG_USE_MODELSCOPE=true python -m sglang.launch_server --model-path qwen/Qwen2-7B-Instruct --port 30000
+ ```
+
+ #### Run Llama 3.1 405B
+
+ ```bash
+ ## Run 405B (fp8) on a single node
+ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct-FP8 --tp 8
+
+ ## Run 405B (fp16) on two nodes
+ # replace the `172.16.4.52:20000` with your own first node ip address and port, disable CUDA Graph temporarily
+
+ # on the first node
+ GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 0 --disable-cuda-graph --mem-frac 0.75
+
+ # on the second
+ GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 1 --disable-cuda-graph --mem-frac 0.75
+ ```
+
  ### Benchmark Performance

  - Benchmark a single static batch by running the following command without launching a server. The arguments are the same as for `launch_server.py`. Note that this is not a dynamic batching server, so it may run out of memory for a batch size that a real server can handle. A real server truncates the prefill into several batches, while this unit test does not. For accurate large batch testing, consider using `sglang.bench_serving`.
@@ -464,7 +483,7 @@ Instructions for supporting a new model are [here](https://github.com/sgl-projec
  ```

  ## Frontend: Structured Generation Language (SGLang)
- The frontend language can be used with local models or API models.
+ The frontend language can be used with local models or API models. It is an alternative to the OpenAI API. You may find it easier to use for complex prompting workflows.

  ### Quick Start
  The example below shows how to use sglang to answer a multi-turn question.
@@ -49,20 +49,20 @@ pip install --upgrade pip
  pip install "sglang[all]"

  # Install FlashInfer CUDA kernels
- pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
+ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
  ```

  ### Method 2: From source
  ```
  # Use the last release branch
- git clone -b v0.2.10 https://github.com/sgl-project/sglang.git
+ git clone -b v0.2.12 https://github.com/sgl-project/sglang.git
  cd sglang

  pip install --upgrade pip
  pip install -e "python[all]"

  # Install FlashInfer CUDA kernels
- pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
+ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
  ```

  ### Method 3: Using docker
@@ -136,22 +136,26 @@ response = client.chat.completions.create(
  print(response)
  ```

- It supports streaming, vision, and most features of the Chat/Completions/Models endpoints specified by the [OpenAI API Reference](https://platform.openai.com/docs/api-reference/).
+ It supports streaming, vision, and most features of the Chat/Completions/Models/Batch endpoints specified by the [OpenAI API Reference](https://platform.openai.com/docs/api-reference/).

  ### Additional Server Arguments
- - Add `--tp 2` to enable tensor parallelism. If it indicates `peer access is not supported between these two devices`, add `--enable-p2p-check` option.
+ - Add `--tp 2` to enable multi-GPU tensor parallelism. If it reports the error "peer access is not supported between these two devices", add `--enable-p2p-check` to the server launch command.
  ```
  python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --tp 2
  ```
- - Add `--dp 2` to enable data parallelism. It can also be used together with tp. Data parallelism is better for throughput if there is enough memory.
+ - Add `--dp 2` to enable multi-GPU data parallelism. It can also be used together with tensor parallelism. Data parallelism is better for throughput if there is enough memory.
  ```
  python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --dp 2 --tp 2
  ```
- - If you see out-of-memory errors during serving, please try to reduce the memory usage of the KV cache pool by setting a smaller value of `--mem-fraction-static`. The default value is `0.9`
+ - If you see out-of-memory errors during serving, try to reduce the memory usage of the KV cache pool by setting a smaller value of `--mem-fraction-static`. The default value is `0.9`.
  ```
  python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --mem-fraction-static 0.7
  ```
  - See [hyperparameter_tuning.md](docs/en/hyperparameter_tuning.md) on tuning hyperparameters for better performance.
+ - If you see out-of-memory errors during prefill for long prompts, try to set a smaller chunked prefill size.
+ ```
+ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --chunked-prefill-size 4096
+ ```
  - Add `--nnodes 2` to run tensor parallelism on multiple nodes. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-0` be the hostname of the first node and `50000` be an available port.
  ```
  # Node 0
@@ -161,29 +165,13 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
  python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --tp 4 --nccl-init sgl-dev-0:50000 --nnodes 2 --node-rank 1
  ```
  - If the model does not have a template in the Hugging Face tokenizer, you can specify a [custom chat template](docs/en/custom_chat_template.md).
- - To enable fp8 quantization, you can add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
  - To enable experimental torch.compile support, you can add `--enable-torch-compile`. It accelerates small models on small batch sizes.
-
- ### Run Llama 3.1 405B
-
- ```bash
- ## Run 405B (fp8) on a single node
- python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct-FP8 --tp 8
-
- ## Run 405B (fp16) on two nodes
- # replace the `172.16.4.52:20000` with your own first node ip address and port, disable CUDA Graph temporarily
-
- # on the first node
- GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 0 --disable-cuda-graph --mem-frac 0.75
-
- # on the second
- GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 1 --disable-cuda-graph --mem-frac 0.75
- ```
-
+ - To enable fp8 quantization, you can add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
+

  ### Supported Models

  - Llama / Llama 2 / Llama 3 / Llama 3.1
- - Mistral / Mixtral
+ - Mistral / Mixtral / Mistral NeMo
  - Gemma / Gemma 2
  - Qwen / Qwen 2 / Qwen 2 MoE
  - DeepSeek / DeepSeek 2
@@ -201,10 +189,35 @@ GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/
  - Grok
  - ChatGLM
  - InternLM 2
- - Mistral NeMo

  Instructions for supporting a new model are [here](https://github.com/sgl-project/sglang/blob/main/docs/en/model_support.md).

+ #### Use Models From ModelScope
+ To use a model from [ModelScope](https://www.modelscope.cn), set the environment variable SGLANG_USE_MODELSCOPE.
+ ```
+ export SGLANG_USE_MODELSCOPE=true
+ ```
+ Launch the [Qwen2-7B-Instruct](https://www.modelscope.cn/models/qwen/qwen2-7b-instruct) server:
+ ```
+ SGLANG_USE_MODELSCOPE=true python -m sglang.launch_server --model-path qwen/Qwen2-7B-Instruct --port 30000
+ ```
+
+ #### Run Llama 3.1 405B
+
+ ```bash
+ ## Run 405B (fp8) on a single node
+ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct-FP8 --tp 8
+
+ ## Run 405B (fp16) on two nodes
+ # replace the `172.16.4.52:20000` with your own first node ip address and port, disable CUDA Graph temporarily
+
+ # on the first node
+ GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 0 --disable-cuda-graph --mem-frac 0.75
+
+ # on the second
+ GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 1 --disable-cuda-graph --mem-frac 0.75
+ ```
+
  ### Benchmark Performance

  - Benchmark a single static batch by running the following command without launching a server. The arguments are the same as for `launch_server.py`. Note that this is not a dynamic batching server, so it may run out of memory for a batch size that a real server can handle. A real server truncates the prefill into several batches, while this unit test does not. For accurate large batch testing, consider using `sglang.bench_serving`.
@@ -217,7 +230,7 @@ Instructions for supporting a new model are [here](https://github.com/sgl-projec
  ```

  ## Frontend: Structured Generation Language (SGLang)
- The frontend language can be used with local models or API models.
+ The frontend language can be used with local models or API models. It is an alternative to the OpenAI API. You may find it easier to use for complex prompting workflows.

  ### Quick Start
  The example below shows how to use sglang to answer a multi-turn question.
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

  [project]
  name = "sglang"
- version = "0.2.10"
+ version = "0.2.12"
  description = "SGLang is yet another fast serving framework for large language models and vision language models."
  readme = "README.md"
  requires-python = ">=3.8"
@@ -20,14 +20,16 @@ dependencies = [
  ]

  [project.optional-dependencies]
- srt = ["aiohttp", "fastapi", "hf_transfer", "huggingface_hub", "interegular", "jsonlines",
+ srt = ["aiohttp", "fastapi", "hf_transfer", "huggingface_hub", "interegular",
  "packaging", "pillow", "psutil", "pydantic", "python-multipart",
  "torch", "uvicorn", "uvloop", "zmq",
- "vllm==0.5.3.post1", "outlines>=0.0.44"]
+ "vllm==0.5.4", "outlines>=0.0.44"]
  openai = ["openai>=1.0", "tiktoken"]
  anthropic = ["anthropic>=0.20.0"]
  litellm = ["litellm>=1.0.0"]
+ test = ["jsonlines", "matplotlib", "pandas"]
  all = ["sglang[srt]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
+ dev = ["sglang[all]", "sglang[test]"]

  [project.urls]
  "Homepage" = "https://github.com/sgl-project/sglang"
@@ -22,6 +22,11 @@ from sglang.api import (
  user_end,
  video,
  )
+ from sglang.lang.choices import (
+ greedy_token_selection,
+ token_length_normalized,
+ unconditional_likelihood_normalized,
+ )

  # SGLang DSL APIs
  __all__ = [
@@ -45,6 +50,9 @@ __all__ = [
  "user_begin",
  "user_end",
  "video",
+ "greedy_token_selection",
+ "token_length_normalized",
+ "unconditional_likelihood_normalized",
  ]

  # Global Configurations
@@ -6,6 +6,7 @@ from typing import Callable, List, Optional, Union

  from sglang.global_config import global_config
  from sglang.lang.backend.base_backend import BaseBackend
+ from sglang.lang.choices import ChoicesSamplingMethod, token_length_normalized
  from sglang.lang.ir import (
  SglExpr,
  SglExprList,
@@ -73,12 +74,18 @@ def gen(
  return_text_in_logprobs: Optional[bool] = None,
  dtype: Optional[type] = None,
  choices: Optional[List[str]] = None,
+ choices_method: Optional[ChoicesSamplingMethod] = None,
  regex: Optional[str] = None,
  ):
  """Call the model to generate. See the meaning of the arguments in docs/en/sampling_params.md"""

  if choices:
- return SglSelect(name, choices, 0.0 if temperature is None else temperature)
+ return SglSelect(
+ name,
+ choices,
+ 0.0 if temperature is None else temperature,
+ token_length_normalized if choices_method is None else choices_method,
+ )

  # check regex is valid
  if regex is not None:
@@ -186,9 +193,10 @@ def select(
  name: Optional[str] = None,
  choices: Optional[List[str]] = None,
  temperature: float = 0.0,
+ choices_method: ChoicesSamplingMethod = token_length_normalized,
  ):
  assert choices is not None
- return SglSelect(name, choices, temperature)
+ return SglSelect(name, choices, temperature, choices_method)


  def _role_common(name: str, expr: Optional[SglExpr] = None):
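The `choices_method` argument introduced above controls how `select` and `gen(choices=...)` rank candidate continuations, using the samplers exported from `sglang.lang.choices`. A minimal sketch of how it might be used from the frontend DSL follows; the backend URL, model, prompt, and labels are placeholders, and it assumes a local server is already running.

```python
# Minimal sketch (illustrative, not from the diff) of the new choices_method
# argument added in 0.2.12. Assumes a local sglang server at port 30000.
import sglang as sgl
from sglang import greedy_token_selection  # also exported: token_length_normalized (default)
                                           # and unconditional_likelihood_normalized

@sgl.function
def classify(s, text):
    s += "Review: " + text + "\n"
    s += "Sentiment: " + sgl.gen(
        "label",
        choices=["positive", "negative"],
        choices_method=greedy_token_selection,  # pick the choice whose first token is greediest
    )

sgl.set_default_backend(sgl.RuntimeEndpoint("http://localhost:30000"))
state = classify.run(text="The new release is noticeably faster.")
print(state["label"])
```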