sglang 0.2.11__tar.gz → 0.2.12__tar.gz

This diff shows the changes between publicly released versions of the package as they appear in the public registry. It is provided for informational purposes only.
Files changed (124)
  1. {sglang-0.2.11/sglang.egg-info → sglang-0.2.12}/PKG-INFO +23 -14
  2. {sglang-0.2.11 → sglang-0.2.12}/README.md +22 -13
  3. {sglang-0.2.11 → sglang-0.2.12}/pyproject.toml +1 -1
  4. {sglang-0.2.11 → sglang-0.2.12}/sglang/bench_latency.py +6 -4
  5. {sglang-0.2.11 → sglang-0.2.12}/sglang/bench_serving.py +46 -22
  6. {sglang-0.2.11 → sglang-0.2.12}/sglang/lang/compiler.py +2 -2
  7. {sglang-0.2.11 → sglang-0.2.12}/sglang/lang/ir.py +3 -3
  8. {sglang-0.2.11 → sglang-0.2.12}/sglang/srt/constrained/base_tool_cache.py +1 -1
  9. {sglang-0.2.11 → sglang-0.2.12}/sglang/srt/constrained/fsm_cache.py +12 -2
  10. sglang-0.2.12/sglang/srt/layers/activation.py +33 -0
  11. sglang-0.2.11/sglang/srt/layers/token_attention.py → sglang-0.2.12/sglang/srt/layers/decode_attention.py +9 -5
  12. {sglang-0.2.11 → sglang-0.2.12}/sglang/srt/layers/extend_attention.py +6 -1
  13. sglang-0.2.12/sglang/srt/layers/layernorm.py +65 -0
  14. {sglang-0.2.11 → sglang-0.2.12}/sglang/srt/layers/logits_processor.py +5 -0
  15. sglang-0.2.12/sglang/srt/layers/pooler.py +50 -0
  16. sglang-0.2.11/sglang/srt/layers/context_flashattention_nopad.py → sglang-0.2.12/sglang/srt/layers/prefill_attention.py +5 -0
  17. {sglang-0.2.11 → sglang-0.2.12}/sglang/srt/layers/radix_attention.py +2 -2
  18. {sglang-0.2.11 → sglang-0.2.12}/sglang/srt/managers/detokenizer_manager.py +31 -9
  19. {sglang-0.2.11 → sglang-0.2.12}/sglang/srt/managers/io_struct.py +63 -0
  20. sglang-0.2.12/sglang/srt/managers/policy_scheduler.py +233 -0
  21. {sglang-0.2.11 → sglang-0.2.12}/sglang/srt/managers/schedule_batch.py +110 -87
  22. {sglang-0.2.11 → sglang-0.2.12}/sglang/srt/managers/tokenizer_manager.py +193 -111
  23. {sglang-0.2.11 → sglang-0.2.12}/sglang/srt/managers/tp_worker.py +289 -352
  24. sglang-0.2.11/sglang/srt/mem_cache/base_cache.py → sglang-0.2.12/sglang/srt/mem_cache/base_prefix_cache.py +9 -4
  25. sglang-0.2.12/sglang/srt/mem_cache/chunk_cache.py +83 -0
  26. {sglang-0.2.11 → sglang-0.2.12}/sglang/srt/mem_cache/memory_pool.py +2 -2
  27. {sglang-0.2.11 → sglang-0.2.12}/sglang/srt/mem_cache/radix_cache.py +74 -40
  28. {sglang-0.2.11 → sglang-0.2.12}/sglang/srt/model_executor/cuda_graph_runner.py +24 -9
  29. sglang-0.2.12/sglang/srt/model_executor/forward_batch_info.py +319 -0
  30. {sglang-0.2.11 → sglang-0.2.12}/sglang/srt/model_executor/model_runner.py +24 -37
  31. {sglang-0.2.11 → sglang-0.2.12}/sglang/srt/models/gemma2.py +0 -1
  32. {sglang-0.2.11 → sglang-0.2.12}/sglang/srt/models/internlm2.py +2 -7
  33. {sglang-0.2.11 → sglang-0.2.12}/sglang/srt/models/llama2.py +4 -4
  34. sglang-0.2.12/sglang/srt/models/llama_embedding.py +88 -0
  35. {sglang-0.2.11 → sglang-0.2.12}/sglang/srt/models/qwen2_moe.py +0 -11
  36. {sglang-0.2.11 → sglang-0.2.12}/sglang/srt/openai_api/adapter.py +155 -27
  37. {sglang-0.2.11 → sglang-0.2.12}/sglang/srt/openai_api/protocol.py +37 -1
  38. sglang-0.2.12/sglang/srt/sampling/penaltylib/__init__.py +13 -0
  39. sglang-0.2.12/sglang/srt/sampling/penaltylib/orchestrator.py +357 -0
  40. sglang-0.2.12/sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +80 -0
  41. sglang-0.2.12/sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +105 -0
  42. sglang-0.2.12/sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +79 -0
  43. sglang-0.2.12/sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +83 -0
  44. {sglang-0.2.11 → sglang-0.2.12}/sglang/srt/sampling_params.py +31 -4
  45. {sglang-0.2.11 → sglang-0.2.12}/sglang/srt/server.py +69 -15
  46. {sglang-0.2.11 → sglang-0.2.12}/sglang/srt/server_args.py +26 -19
  47. {sglang-0.2.11 → sglang-0.2.12}/sglang/srt/utils.py +31 -13
  48. {sglang-0.2.11 → sglang-0.2.12}/sglang/test/run_eval.py +10 -1
  49. {sglang-0.2.11 → sglang-0.2.12}/sglang/test/runners.py +63 -63
  50. {sglang-0.2.11 → sglang-0.2.12}/sglang/test/simple_eval_humaneval.py +2 -8
  51. sglang-0.2.12/sglang/test/simple_eval_mgsm.py +203 -0
  52. sglang-0.2.12/sglang/test/srt/sampling/penaltylib/utils.py +337 -0
  53. sglang-0.2.12/sglang/test/test_layernorm.py +60 -0
  54. {sglang-0.2.11 → sglang-0.2.12}/sglang/test/test_programs.py +4 -2
  55. {sglang-0.2.11 → sglang-0.2.12}/sglang/test/test_utils.py +20 -2
  56. {sglang-0.2.11 → sglang-0.2.12}/sglang/utils.py +0 -1
  57. sglang-0.2.12/sglang/version.py +1 -0
  58. {sglang-0.2.11 → sglang-0.2.12/sglang.egg-info}/PKG-INFO +23 -14
  59. {sglang-0.2.11 → sglang-0.2.12}/sglang.egg-info/SOURCES.txt +17 -7
  60. sglang-0.2.11/sglang/srt/layers/linear.py +0 -884
  61. sglang-0.2.11/sglang/srt/layers/quantization/__init__.py +0 -64
  62. sglang-0.2.11/sglang/srt/layers/quantization/fp8.py +0 -677
  63. sglang-0.2.11/sglang/srt/managers/policy_scheduler.py +0 -85
  64. sglang-0.2.11/sglang/srt/mem_cache/chunk_cache.py +0 -60
  65. sglang-0.2.11/sglang/srt/model_executor/forward_batch_info.py +0 -256
  66. sglang-0.2.11/sglang/version.py +0 -1
  67. {sglang-0.2.11 → sglang-0.2.12}/LICENSE +0 -0
  68. {sglang-0.2.11 → sglang-0.2.12}/setup.cfg +0 -0
  69. {sglang-0.2.11 → sglang-0.2.12}/sglang/__init__.py +0 -0
  70. {sglang-0.2.11 → sglang-0.2.12}/sglang/api.py +0 -0
  71. {sglang-0.2.11 → sglang-0.2.12}/sglang/check_env.py +0 -0
  72. {sglang-0.2.11 → sglang-0.2.12}/sglang/global_config.py +0 -0
  73. {sglang-0.2.11 → sglang-0.2.12}/sglang/lang/__init__.py +0 -0
  74. {sglang-0.2.11 → sglang-0.2.12}/sglang/lang/backend/__init__.py +0 -0
  75. {sglang-0.2.11 → sglang-0.2.12}/sglang/lang/backend/anthropic.py +0 -0
  76. {sglang-0.2.11 → sglang-0.2.12}/sglang/lang/backend/base_backend.py +0 -0
  77. {sglang-0.2.11 → sglang-0.2.12}/sglang/lang/backend/litellm.py +0 -0
  78. {sglang-0.2.11 → sglang-0.2.12}/sglang/lang/backend/openai.py +0 -0
  79. {sglang-0.2.11 → sglang-0.2.12}/sglang/lang/backend/runtime_endpoint.py +0 -0
  80. {sglang-0.2.11 → sglang-0.2.12}/sglang/lang/backend/vertexai.py +0 -0
  81. {sglang-0.2.11 → sglang-0.2.12}/sglang/lang/chat_template.py +0 -0
  82. {sglang-0.2.11 → sglang-0.2.12}/sglang/lang/choices.py +0 -0
  83. {sglang-0.2.11 → sglang-0.2.12}/sglang/lang/interpreter.py +0 -0
  84. {sglang-0.2.11 → sglang-0.2.12}/sglang/lang/tracer.py +0 -0
  85. {sglang-0.2.11 → sglang-0.2.12}/sglang/launch_server.py +0 -0
  86. {sglang-0.2.11 → sglang-0.2.12}/sglang/launch_server_llavavid.py +0 -0
  87. {sglang-0.2.11 → sglang-0.2.12}/sglang/srt/constrained/__init__.py +0 -0
  88. {sglang-0.2.11 → sglang-0.2.12}/sglang/srt/constrained/jump_forward.py +0 -0
  89. {sglang-0.2.11 → sglang-0.2.12}/sglang/srt/conversation.py +0 -0
  90. {sglang-0.2.11 → sglang-0.2.12}/sglang/srt/hf_transformers_utils.py +0 -0
  91. {sglang-0.2.11 → sglang-0.2.12}/sglang/srt/layers/fused_moe.py +0 -0
  92. {sglang-0.2.11 → sglang-0.2.12}/sglang/srt/managers/controller_multi.py +0 -0
  93. {sglang-0.2.11 → sglang-0.2.12}/sglang/srt/managers/controller_single.py +0 -0
  94. {sglang-0.2.11 → sglang-0.2.12}/sglang/srt/mem_cache/flush_cache.py +0 -0
  95. {sglang-0.2.11 → sglang-0.2.12}/sglang/srt/mm_utils.py +0 -0
  96. {sglang-0.2.11 → sglang-0.2.12}/sglang/srt/model_config.py +0 -0
  97. {sglang-0.2.11 → sglang-0.2.12}/sglang/srt/model_loader/model_loader.py +0 -0
  98. {sglang-0.2.11 → sglang-0.2.12}/sglang/srt/model_loader/utils.py +0 -0
  99. {sglang-0.2.11 → sglang-0.2.12}/sglang/srt/models/chatglm.py +0 -0
  100. {sglang-0.2.11 → sglang-0.2.12}/sglang/srt/models/commandr.py +0 -0
  101. {sglang-0.2.11 → sglang-0.2.12}/sglang/srt/models/dbrx.py +0 -0
  102. {sglang-0.2.11 → sglang-0.2.12}/sglang/srt/models/deepseek.py +0 -0
  103. {sglang-0.2.11 → sglang-0.2.12}/sglang/srt/models/deepseek_v2.py +0 -0
  104. {sglang-0.2.11 → sglang-0.2.12}/sglang/srt/models/gemma.py +0 -0
  105. {sglang-0.2.11 → sglang-0.2.12}/sglang/srt/models/gpt_bigcode.py +0 -0
  106. {sglang-0.2.11 → sglang-0.2.12}/sglang/srt/models/grok.py +0 -0
  107. {sglang-0.2.11 → sglang-0.2.12}/sglang/srt/models/llama_classification.py +0 -0
  108. {sglang-0.2.11 → sglang-0.2.12}/sglang/srt/models/llava.py +0 -0
  109. {sglang-0.2.11 → sglang-0.2.12}/sglang/srt/models/llavavid.py +0 -0
  110. {sglang-0.2.11 → sglang-0.2.12}/sglang/srt/models/minicpm.py +0 -0
  111. {sglang-0.2.11 → sglang-0.2.12}/sglang/srt/models/mistral.py +0 -0
  112. {sglang-0.2.11 → sglang-0.2.12}/sglang/srt/models/mixtral.py +0 -0
  113. {sglang-0.2.11 → sglang-0.2.12}/sglang/srt/models/mixtral_quant.py +0 -0
  114. {sglang-0.2.11 → sglang-0.2.12}/sglang/srt/models/qwen.py +0 -0
  115. {sglang-0.2.11 → sglang-0.2.12}/sglang/srt/models/qwen2.py +0 -0
  116. {sglang-0.2.11 → sglang-0.2.12}/sglang/srt/models/stablelm.py +0 -0
  117. {sglang-0.2.11 → sglang-0.2.12}/sglang/srt/models/yivl.py +0 -0
  118. {sglang-0.2.11 → sglang-0.2.12}/sglang/test/simple_eval_common.py +0 -0
  119. {sglang-0.2.11 → sglang-0.2.12}/sglang/test/simple_eval_gpqa.py +0 -0
  120. {sglang-0.2.11 → sglang-0.2.12}/sglang/test/simple_eval_math.py +0 -0
  121. {sglang-0.2.11 → sglang-0.2.12}/sglang/test/simple_eval_mmlu.py +0 -0
  122. {sglang-0.2.11 → sglang-0.2.12}/sglang.egg-info/dependency_links.txt +0 -0
  123. {sglang-0.2.11 → sglang-0.2.12}/sglang.egg-info/requires.txt +0 -0
  124. {sglang-0.2.11 → sglang-0.2.12}/sglang.egg-info/top_level.txt +0 -0
{sglang-0.2.11/sglang.egg-info → sglang-0.2.12}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: sglang
- Version: 0.2.11
+ Version: 0.2.12
  Summary: SGLang is yet another fast serving framework for large language models and vision language models.
  License: Apache License
  Version 2.0, January 2004
@@ -308,7 +308,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
  ### Method 2: From source
  ```
  # Use the last release branch
- git clone -b v0.2.11 https://github.com/sgl-project/sglang.git
+ git clone -b v0.2.12 https://github.com/sgl-project/sglang.git
  cd sglang

  pip install --upgrade pip
@@ -392,23 +392,23 @@ print(response)
  It supports streaming, vision, and most features of the Chat/Completions/Models/Batch endpoints specified by the [OpenAI API Reference](https://platform.openai.com/docs/api-reference/).

  ### Additional Server Arguments
- - Add `--tp 2` to enable tensor parallelism. If it indicates `peer access is not supported between these two devices`, add `--enable-p2p-check` option.
+ - Add `--tp 2` to enable multi-GPU tensor parallelism. If it reports the error "peer access is not supported between these two devices", add `--enable-p2p-check` to the server launch command.
  ```
  python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --tp 2
  ```
- - Add `--dp 2` to enable data parallelism. It can also be used together with tp. Data parallelism is better for throughput if there is enough memory.
+ - Add `--dp 2` to enable multi-GPU data parallelism. It can also be used together with tensor parallelism. Data parallelism is better for throughput if there is enough memory.
  ```
  python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --dp 2 --tp 2
  ```
- - If you see out-of-memory errors during serving, please try to reduce the memory usage of the KV cache pool by setting a smaller value of `--mem-fraction-static`. The default value is `0.9`.
+ - If you see out-of-memory errors during serving, try to reduce the memory usage of the KV cache pool by setting a smaller value of `--mem-fraction-static`. The default value is `0.9`.
  ```
  python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --mem-fraction-static 0.7
  ```
- - If you see out-of-memory errors during prefill for long prompts on a model that supports long context, consider using chunked prefill.
+ - See [hyperparameter_tuning.md](docs/en/hyperparameter_tuning.md) on tuning hyperparameters for better performance.
+ - If you see out-of-memory errors during prefill for long prompts, try to set a smaller chunked prefill size.
  ```
- python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --port 30000 --chunked-prefill-size 8192
+ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --chunked-prefill-size 4096
  ```
- - See [hyperparameter_tuning.md](docs/en/hyperparameter_tuning.md) on tuning hyperparameters for better performance.
  - Add `--nnodes 2` to run tensor parallelism on multiple nodes. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-0` be the hostname of the first node and `50000` be an available port.
  ```
  # Node 0
@@ -418,13 +418,13 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
  python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --tp 4 --nccl-init sgl-dev-0:50000 --nnodes 2 --node-rank 1
  ```
  - If the model does not have a template in the Hugging Face tokenizer, you can specify a [custom chat template](docs/en/custom_chat_template.md).
- - To enable fp8 quantization, you can add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
  - To enable experimental torch.compile support, you can add `--enable-torch-compile`. It accelerates small models on small batch sizes.
-
+ - To enable fp8 quantization, you can add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
+
  ### Supported Models

  - Llama / Llama 2 / Llama 3 / Llama 3.1
- - Mistral / Mixtral
+ - Mistral / Mixtral / Mistral NeMo
  - Gemma / Gemma 2
  - Qwen / Qwen 2 / Qwen 2 MoE
  - DeepSeek / DeepSeek 2
@@ -442,11 +442,20 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
  - Grok
  - ChatGLM
  - InternLM 2
- - Mistral NeMo

  Instructions for supporting a new model are [here](https://github.com/sgl-project/sglang/blob/main/docs/en/model_support.md).

- ### Run Llama 3.1 405B
+ #### Use Models From ModelScope
+ To use model from [ModelScope](https://www.modelscope.cn), setting environment variable SGLANG_USE_MODELSCOPE.
+ ```
+ export SGLANG_USE_MODELSCOPE=true
+ ```
+ Launch [Qwen2-7B-Instruct](https://www.modelscope.cn/models/qwen/qwen2-7b-instruct) Server
+ ```
+ SGLANG_USE_MODELSCOPE=true python -m sglang.launch_server --model-path qwen/Qwen2-7B-Instruct --port 30000
+ ```
+
+ #### Run Llama 3.1 405B

  ```bash
  ## Run 405B (fp8) on a single node
@@ -474,7 +483,7 @@ GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/
  ```

  ## Frontend: Structured Generation Language (SGLang)
- The frontend language can be used with local models or API models.
+ The frontend language can be used with local models or API models. It is an alternative to the OpenAI API. You may found it easier to use for complex prompting workflow.

  ### Quick Start
  The example below shows how to use sglang to answer a mulit-turn question.
{sglang-0.2.11 → sglang-0.2.12}/README.md
@@ -55,7 +55,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
  ### Method 2: From source
  ```
  # Use the last release branch
- git clone -b v0.2.11 https://github.com/sgl-project/sglang.git
+ git clone -b v0.2.12 https://github.com/sgl-project/sglang.git
  cd sglang

  pip install --upgrade pip
@@ -139,23 +139,23 @@ print(response)
  It supports streaming, vision, and most features of the Chat/Completions/Models/Batch endpoints specified by the [OpenAI API Reference](https://platform.openai.com/docs/api-reference/).

  ### Additional Server Arguments
- - Add `--tp 2` to enable tensor parallelism. If it indicates `peer access is not supported between these two devices`, add `--enable-p2p-check` option.
+ - Add `--tp 2` to enable multi-GPU tensor parallelism. If it reports the error "peer access is not supported between these two devices", add `--enable-p2p-check` to the server launch command.
  ```
  python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --tp 2
  ```
- - Add `--dp 2` to enable data parallelism. It can also be used together with tp. Data parallelism is better for throughput if there is enough memory.
+ - Add `--dp 2` to enable multi-GPU data parallelism. It can also be used together with tensor parallelism. Data parallelism is better for throughput if there is enough memory.
  ```
  python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --dp 2 --tp 2
  ```
- - If you see out-of-memory errors during serving, please try to reduce the memory usage of the KV cache pool by setting a smaller value of `--mem-fraction-static`. The default value is `0.9`.
+ - If you see out-of-memory errors during serving, try to reduce the memory usage of the KV cache pool by setting a smaller value of `--mem-fraction-static`. The default value is `0.9`.
  ```
  python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --mem-fraction-static 0.7
  ```
- - If you see out-of-memory errors during prefill for long prompts on a model that supports long context, consider using chunked prefill.
+ - See [hyperparameter_tuning.md](docs/en/hyperparameter_tuning.md) on tuning hyperparameters for better performance.
+ - If you see out-of-memory errors during prefill for long prompts, try to set a smaller chunked prefill size.
  ```
- python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --port 30000 --chunked-prefill-size 8192
+ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --chunked-prefill-size 4096
  ```
- - See [hyperparameter_tuning.md](docs/en/hyperparameter_tuning.md) on tuning hyperparameters for better performance.
  - Add `--nnodes 2` to run tensor parallelism on multiple nodes. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-0` be the hostname of the first node and `50000` be an available port.
  ```
  # Node 0
@@ -165,13 +165,13 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
  python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --tp 4 --nccl-init sgl-dev-0:50000 --nnodes 2 --node-rank 1
  ```
  - If the model does not have a template in the Hugging Face tokenizer, you can specify a [custom chat template](docs/en/custom_chat_template.md).
- - To enable fp8 quantization, you can add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
  - To enable experimental torch.compile support, you can add `--enable-torch-compile`. It accelerates small models on small batch sizes.
-
+ - To enable fp8 quantization, you can add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
+
  ### Supported Models

  - Llama / Llama 2 / Llama 3 / Llama 3.1
- - Mistral / Mixtral
+ - Mistral / Mixtral / Mistral NeMo
  - Gemma / Gemma 2
  - Qwen / Qwen 2 / Qwen 2 MoE
  - DeepSeek / DeepSeek 2
@@ -189,11 +189,20 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
  - Grok
  - ChatGLM
  - InternLM 2
- - Mistral NeMo

  Instructions for supporting a new model are [here](https://github.com/sgl-project/sglang/blob/main/docs/en/model_support.md).

- ### Run Llama 3.1 405B
+ #### Use Models From ModelScope
+ To use model from [ModelScope](https://www.modelscope.cn), setting environment variable SGLANG_USE_MODELSCOPE.
+ ```
+ export SGLANG_USE_MODELSCOPE=true
+ ```
+ Launch [Qwen2-7B-Instruct](https://www.modelscope.cn/models/qwen/qwen2-7b-instruct) Server
+ ```
+ SGLANG_USE_MODELSCOPE=true python -m sglang.launch_server --model-path qwen/Qwen2-7B-Instruct --port 30000
+ ```
+
+ #### Run Llama 3.1 405B

  ```bash
  ## Run 405B (fp8) on a single node
@@ -221,7 +230,7 @@ GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/
  ```

  ## Frontend: Structured Generation Language (SGLang)
- The frontend language can be used with local models or API models.
+ The frontend language can be used with local models or API models. It is an alternative to the OpenAI API. You may found it easier to use for complex prompting workflow.

  ### Quick Start
  The example below shows how to use sglang to answer a mulit-turn question.
{sglang-0.2.11 → sglang-0.2.12}/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

  [project]
  name = "sglang"
- version = "0.2.11"
+ version = "0.2.12"
  description = "SGLang is yet another fast serving framework for large language models and vision language models."
  readme = "README.md"
  requires-python = ">=3.8"
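The only change to pyproject.toml is the version bump. For reference, a hedged sketch of how an existing installation would typically pick up the new release, assuming the PyPI package and the `[all]` extra used in the README's install instructions:

```bash
# Upgrade an existing PyPI install to the new release (the `[all]` extra is the
# one used in the README's install instructions; adjust to your setup).
pip install --upgrade pip
pip install "sglang[all]==0.2.12"

# FlashInfer kernels are installed separately, as in the README above.
pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
```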
{sglang-0.2.11 → sglang-0.2.12}/sglang/bench_latency.py
@@ -152,7 +152,7 @@ def prepare_inputs_for_correctness_test(bench_args, tokenizer):
  req = Req(rid=i, origin_input_text=prompts[i], origin_input_ids=tmp_input_ids)
  req.prefix_indices = []
  req.sampling_params = sampling_params
- req.input_ids = req.origin_input_ids
+ req.fill_ids = req.origin_input_ids
  reqs.append(req)

  return input_ids, reqs
@@ -163,7 +163,7 @@ def prepare_extend_inputs_for_correctness_test(
  ):
  for i in range(len(reqs)):
  req = reqs[i]
- req.input_ids += input_ids[i][bench_args.cut_len :]
+ req.fill_ids += input_ids[i][bench_args.cut_len :]
  req.prefix_indices = model_runner.req_to_token_pool.req_to_token[
  i, : bench_args.cut_len
  ]
@@ -182,7 +182,7 @@ def prepare_synthetic_inputs_for_latency_test(batch_size, input_len):
  req = Req(rid=i, origin_input_text="", origin_input_ids=list(input_ids[i]))
  req.prefix_indices = []
  req.sampling_params = sampling_params
- req.input_ids = req.origin_input_ids
+ req.fill_ids = req.origin_input_ids
  reqs.append(req)

  return reqs
@@ -238,7 +238,7 @@ def correctness_test(

  # Decode
  output_ids = [input_ids[i] + [next_token_ids[i]] for i in range(len(input_ids))]
- for _ in range(bench_args.output_len):
+ for _ in range(bench_args.output_len[0]):
  next_token_ids, _ = decode(next_token_ids, batch, model_runner)
  for i in range(len(reqs)):
  output_ids[i].append(next_token_ids[i])
@@ -332,6 +332,7 @@ def latency_test(
  )

  # Warm up
+ rank_print("Warmup ...")
  latency_test_run_once(
  bench_args.run_name,
  model_runner,
@@ -341,6 +342,7 @@ def latency_test(
  bench_args.input_len[0],
  4, # shorter decoding to speed up the warmup
  )
+ rank_print("Benchmark ...")

  # Run the sweep
  result_list = []
{sglang-0.2.11 → sglang-0.2.12}/sglang/bench_serving.py
@@ -24,7 +24,7 @@ import warnings
  from argparse import ArgumentParser
  from dataclasses import dataclass, field
  from datetime import datetime
- from typing import AsyncGenerator, List, Optional, Tuple, Union
+ from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple, Union

  import aiohttp
  import numpy as np
@@ -39,6 +39,8 @@ from transformers import (

  AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60)

+ global args
+

  @dataclass
  class RequestFuncInput:
@@ -47,6 +49,7 @@ class RequestFuncInput:
  prompt_len: int
  output_len: int
  model: str
+ extra_request_body: Dict[str, Any]


  @dataclass
@@ -84,6 +87,7 @@ async def async_request_trt_llm(
  "stream": True,
  "min_length": request_func_input.output_len,
  "end_id": 1048576,
+ **request_func_input.extra_request_body,
  }
  if args.disable_ignore_eos:
  del payload["min_length"]
@@ -154,6 +158,7 @@ async def async_request_openai_completions(
  "max_tokens": request_func_input.output_len,
  "stream": not args.disable_stream,
  "ignore_eos": not args.disable_ignore_eos,
+ **request_func_input.extra_request_body,
  }
  headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"}

@@ -192,7 +197,8 @@ async def async_request_openai_completions(
  output.ttft = ttft

  # Decoding phase
- output.itl.append(timestamp - most_recent_timestamp)
+ else:
+ output.itl.append(timestamp - most_recent_timestamp)

  most_recent_timestamp = timestamp
  generated_text += data["choices"][0]["text"]
@@ -542,6 +548,7 @@ async def benchmark(
  request_rate: float,
  disable_tqdm: bool,
  enable_multi: bool,
+ extra_request_body: Dict[str, Any],
  ):
  if backend in ASYNC_REQUEST_FUNCS:
  request_func = ASYNC_REQUEST_FUNCS[backend]
@@ -556,6 +563,7 @@ async def benchmark(
  api_url=api_url,
  prompt_len=test_prompt_len,
  output_len=test_output_len,
+ extra_request_body=extra_request_body,
  )
  test_output = await request_func(request_func_input=test_input)
  if not test_output.success:
@@ -578,6 +586,7 @@ async def benchmark(
  api_url=api_url,
  prompt_len=prompt_len,
  output_len=output_len,
+ extra_request_body=extra_request_body,
  )
  tasks.append(
  asyncio.create_task(
@@ -660,19 +669,20 @@ async def benchmark(
  "backend": args.backend,
  "dataset_name": args.dataset_name,
  "request_rate": request_rate,
- "total_input": metrics.total_input,
- "total_output": metrics.total_output,
- "total_output_retokenized": metrics.total_output_retokenized,
- "mean_e2e_latency": metrics.mean_e2e_latency_ms,
- "median_e2e_latency": metrics.median_e2e_latency_ms,
- "median_ttft": metrics.median_ttft_ms,
- "median_itl": metrics.median_itl_ms,
- "output_token_throughput": metrics.output_throughput,
+ "total_input_tokens": metrics.total_input,
+ "total_output_tokens": metrics.total_output,
+ "total_output_tokens_retokenized": metrics.total_output_retokenized,
+ "mean_e2e_latency_ms": metrics.mean_e2e_latency_ms,
+ "median_e2e_latency_ms": metrics.median_e2e_latency_ms,
+ "median_ttft_ms": metrics.median_ttft_ms,
+ "median_itl_ms": metrics.median_itl_ms,
+ "output_throughput": metrics.output_throughput,
  "sharegpt_output_len": args.sharegpt_output_len,
  "random_input_len": args.random_input_len,
  "random_output_len": args.random_output_len,
  "random_range_ratio": args.random_range_ratio,
- "benchmark_duration": benchmark_duration,
+ "duration": benchmark_duration,
+ "completed": metrics.completed,
  }
  else:
  print(f"Error running benchmark for request rate: {request_rate}")
@@ -742,10 +752,18 @@ def check_chat_template(model_path):
  return False


- def fire(args: argparse.Namespace):
+ def run_benchmark(args_: argparse.Namespace):
+ global args
+ args = args_
+
+ set_ulimit()
  random.seed(args.seed)
  np.random.seed(args.seed)

+ extra_request_body = {}
+ if args.extra_request_body:
+ extra_request_body = json.loads(args.extra_request_body)
+
  if args.port is None:
  args.port = {
  "sglang": 30000,
@@ -838,10 +856,11 @@ def fire(args: argparse.Namespace):
  request_rate=rate,
  disable_tqdm=args.disable_tqdm,
  enable_multi=args.multi,
+ extra_request_body=extra_request_body,
  )
  )
  else:
- asyncio.run(
+ return asyncio.run(
  benchmark(
  backend=backend,
  api_url=api_url,
@@ -851,6 +870,7 @@ def fire(args: argparse.Namespace):
  request_rate=args.request_rate,
  disable_tqdm=args.disable_tqdm,
  enable_multi=args.multi,
+ extra_request_body=extra_request_body,
  )
  )

@@ -949,11 +969,6 @@ if __name__ == "__main__":
  "Otherwise, we use Poisson process to synthesize the request arrival times. Default is 128.0.",
  )
  parser.add_argument("--seed", type=int, default=0, help="Default is 0.")
- parser.add_argument(
- "--disable-tqdm",
- action="store_true",
- help="Specify to disable tqdm progress bar.",
- )
  parser.add_argument(
  "--multi",
  action="store_true",
@@ -966,6 +981,11 @@ if __name__ == "__main__":
  help="Range of request rates in the format start,stop,step. Default is 2,34,2. It also supports a list of request rates, requiring the parameters to not equal three.",
  )
  parser.add_argument("--output-file", type=str, help="Output JSONL file name.")
+ parser.add_argument(
+ "--disable-tqdm",
+ action="store_true",
+ help="Specify to disable tqdm progress bar.",
+ )
  parser.add_argument(
  "--disable-stream",
  action="store_true",
@@ -976,8 +996,12 @@ if __name__ == "__main__":
  action="store_true",
  help="Disable ignoring EOS.",
  )
-
- set_ulimit()
-
+ parser.add_argument(
+ "--extra-request-body",
+ metavar='{"key1": "value1", "key2": "value2"}',
+ type=str,
+ help="Append given JSON object to the request payload. You can use this to specify"
+ "additional generate params like sampling params.",
+ )
  args = parser.parse_args()
- fire(args)
+ run_benchmark(args)
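The benchmark entry point is renamed from `fire` to `run_benchmark` (which now returns the `asyncio.run(...)` result so it can be called from other scripts), `--disable-tqdm` is regrouped with the other `--disable-*` flags, and the new `--extra-request-body` option merges a user-supplied JSON object into every request payload. A hedged example of the new flag from the command line; the server setup and the JSON keys shown are illustrative assumptions, not values mandated by the script:

```bash
# Assumes an SGLang server is already listening on port 30000.
# The JSON keys are examples of extra generate/sampling parameters.
python3 -m sglang.bench_serving \
    --backend sglang \
    --port 30000 \
    --request-rate 8 \
    --extra-request-body '{"temperature": 0.0, "top_p": 1.0}'
```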
{sglang-0.2.11 → sglang-0.2.12}/sglang/lang/compiler.py
@@ -125,7 +125,7 @@ class CompiledFunction:
  def run(
  self,
  *,
- max_new_tokens: int = 16,
+ max_new_tokens: int = 128,
  stop: Union[str, List[str]] = (),
  temperature: float = 1.0,
  top_p: float = 1.0,
@@ -155,7 +155,7 @@ class CompiledFunction:
  self,
  batch_kwargs,
  *,
- max_new_tokens: int = 16,
+ max_new_tokens: int = 128,
  stop: Union[str, List[str]] = (),
  temperature: float = 1.0,
  top_p: float = 1.0,
{sglang-0.2.11 → sglang-0.2.12}/sglang/lang/ir.py
@@ -16,7 +16,7 @@ REGEX_STRING = r"\"[\w\d\s]*\"" # bugs with regex r"\".*\"" in interegular pkg

  @dataclasses.dataclass
  class SglSamplingParams:
- max_new_tokens: int = 16
+ max_new_tokens: int = 128
  stop: Union[str, List[str]] = ()
  temperature: float = 1.0
  top_p: float = 1.0
@@ -140,7 +140,7 @@ class SglFunction:
  def run(
  self,
  *args,
- max_new_tokens: int = 16,
+ max_new_tokens: int = 128,
  stop: Union[str, List[str]] = (),
  temperature: float = 1.0,
  top_p: float = 1.0,
@@ -179,7 +179,7 @@ class SglFunction:
  self,
  batch_kwargs,
  *,
- max_new_tokens: int = 16,
+ max_new_tokens: int = 128,
  stop: Union[str, List[str]] = (),
  temperature: float = 1.0,
  top_p: float = 1.0,
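Together with the matching change in `sglang/lang/compiler.py` above, the frontend's default `max_new_tokens` rises from 16 to 128, so programs that never pass an explicit limit will now generate noticeably longer outputs. A minimal sketch of a program affected by the new default, assuming the standard frontend API and a local server at `http://localhost:30000`:

```python
import sglang as sgl

# Assumes a server launched separately, e.g.:
#   python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000
sgl.set_default_backend(sgl.RuntimeEndpoint("http://localhost:30000"))


@sgl.function
def qa(s, question):
    s += "Q: " + question + "\n"
    # No explicit max_new_tokens: 0.2.12 caps this at 128 tokens (0.2.11 used 16).
    s += "A:" + sgl.gen("answer")


state = qa.run(question="List the planets of the solar system.")
print(state["answer"])
```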
{sglang-0.2.11 → sglang-0.2.12}/sglang/srt/constrained/base_tool_cache.py
@@ -54,7 +54,7 @@ class BaseToolCache:
  return val

  def init_value(self, key):
- raise NotImplementedError
+ raise NotImplementedError()

  def get_cache_hit_rate(self):
  if self.metrics["total"] == 0:
{sglang-0.2.11 → sglang-0.2.12}/sglang/srt/constrained/fsm_cache.py
@@ -20,10 +20,20 @@ from sglang.srt.constrained.base_tool_cache import BaseToolCache


  class FSMCache(BaseToolCache):
- def __init__(self, tokenizer_path, tokenizer_args_dict, enable=True):
+ def __init__(
+ self,
+ tokenizer_path,
+ tokenizer_args_dict,
+ enable=True,
+ skip_tokenizer_init=False,
+ ):
  super().__init__(enable=enable)

- if tokenizer_path.endswith(".json") or tokenizer_path.endswith(".model"):
+ if (
+ skip_tokenizer_init
+ or tokenizer_path.endswith(".json")
+ or tokenizer_path.endswith(".model")
+ ):
  # Do not support TiktokenTokenizer or SentencePieceTokenizer
  return

sglang-0.2.12/sglang/srt/layers/activation.py (new file)
@@ -0,0 +1,33 @@
+ """
+ Copyright 2023-2024 SGLang Team
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+ http://www.apache.org/licenses/LICENSE-2.0
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ """
+
+ """Fused operators for activation layers."""
+
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from flashinfer.activation import silu_and_mul
+ from vllm.model_executor.custom_op import CustomOp
+
+
+ class SiluAndMul(CustomOp):
+ def forward_native(self, x: torch.Tensor) -> torch.Tensor:
+ d = x.shape[-1] // 2
+ return F.silu(x[..., :d]) * x[..., d:]
+
+ def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
+ d = x.shape[-1] // 2
+ output_shape = x.shape[:-1] + (d,)
+ out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
+ silu_and_mul(x, out)
+ return out
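The new module wraps the fused SiLU-and-multiply activation in vLLM's `CustomOp` interface, with a pure-PyTorch reference (`forward_native`) and a FlashInfer-backed CUDA path (`forward_cuda`). A minimal smoke-test sketch, assuming a CUDA device and an installed `flashinfer`, in the spirit of the new `sglang/test/test_layernorm.py`:

```python
import torch

from sglang.srt.layers.activation import SiluAndMul

# The input packs [gate, up] along the last dimension; the op returns
# silu(gate) * up, so the last dimension is halved.
x = torch.randn(4, 2 * 11008, dtype=torch.float16, device="cuda")

act = SiluAndMul()
ref = act.forward_native(x)  # plain PyTorch reference
out = act.forward_cuda(x)    # fused FlashInfer kernel

assert out.shape == (4, 11008)
assert torch.allclose(out, ref, atol=1e-3, rtol=1e-3)
```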
sglang-0.2.11/sglang/srt/layers/token_attention.py → sglang-0.2.12/sglang/srt/layers/decode_attention.py
@@ -13,6 +13,10 @@ See the License for the specific language governing permissions and
  limitations under the License.
  """

+ """
+ Memory-efficient attention for decoding.
+ """
+
  # Adapted from
  # https://github.com/ModelTC/lightllm/blob/f2a54f0912293f683bf1d1695fd12c4098a5bf82/lightllm/models/llama/triton_kernel/token_attention_nopad_att1.py
  # https://github.com/ModelTC/lightllm/blob/f2a54f0912293f683bf1d1695fd12c4098a5bf82/lightllm/models/llama/triton_kernel/token_attention_softmax_and_reducev.py
@@ -194,7 +198,7 @@ def _fwd_kernel_stage2(
  tl.store(out_ptrs, acc)


- def _token_att_m_fwd(
+ def _decode_att_m_fwd(
  q,
  k_buffer,
  att_out,
@@ -254,7 +258,7 @@ def _token_att_m_fwd(
  )


- def _token_softmax_reducev_fwd(
+ def _decode_softmax_reducev_fwd(
  logics,
  v_buffer,
  o,
@@ -292,7 +296,7 @@ def _token_softmax_reducev_fwd(
  )


- def token_attention_fwd(
+ def decode_attention_fwd(
  q,
  k_buffer,
  v_buffer,
@@ -312,7 +316,7 @@ def token_attention_fwd(
  (q.shape[-2], total_num_tokens), dtype=REDUCE_TORCH_TYPE, device="cuda"
  )

- _token_att_m_fwd(
+ _decode_att_m_fwd(
  q,
  k_buffer,
  att_m,
@@ -324,7 +328,7 @@ def token_attention_fwd(
  sm_scale,
  logit_cap,
  )
- _token_softmax_reducev_fwd(
+ _decode_softmax_reducev_fwd(
  att_m,
  v_buffer,
  o,
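Apart from the new module docstring, this is a rename: `token_attention.py` becomes `decode_attention.py`, and the public wrapper `token_attention_fwd` becomes `decode_attention_fwd`, with the private helpers renamed to match. Any code that imported the old names would need an update along these lines (a sketch; only the names shown in the diff are assumed):

```python
# 0.2.11 import path, kept as a comment for comparison:
# from sglang.srt.layers.token_attention import token_attention_fwd

# 0.2.12: same kernel, new module and function names.
from sglang.srt.layers.decode_attention import decode_attention_fwd
```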
{sglang-0.2.11 → sglang-0.2.12}/sglang/srt/layers/extend_attention.py
@@ -13,11 +13,16 @@ See the License for the specific language governing permissions and
  limitations under the License.
  """

+ """
+ Memory-efficient attention for prefill.
+ It supporst page size = 1 and prefill with KV cache (i.e. extend).
+ """
+
  import torch
  import triton
  import triton.language as tl

- from sglang.srt.layers.context_flashattention_nopad import context_attention_fwd
+ from sglang.srt.layers.prefill_attention import context_attention_fwd

  CUDA_CAPABILITY = torch.cuda.get_device_capability()