sglang 0.3.2__tar.gz → 0.3.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sglang-0.3.2/sglang.egg-info → sglang-0.3.3}/PKG-INFO +37 -19
- {sglang-0.3.2 → sglang-0.3.3}/README.md +35 -18
- {sglang-0.3.2 → sglang-0.3.3}/pyproject.toml +2 -2
- {sglang-0.3.2 → sglang-0.3.3}/sglang/__init__.py +2 -0
- {sglang-0.3.2 → sglang-0.3.3}/sglang/api.py +23 -1
- {sglang-0.3.2 → sglang-0.3.3}/sglang/bench_latency.py +46 -25
- {sglang-0.3.2 → sglang-0.3.3}/sglang/bench_serving.py +2 -2
- {sglang-0.3.2 → sglang-0.3.3}/sglang/lang/backend/runtime_endpoint.py +14 -1
- {sglang-0.3.2 → sglang-0.3.3}/sglang/lang/interpreter.py +16 -6
- {sglang-0.3.2 → sglang-0.3.3}/sglang/lang/ir.py +20 -4
- {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/configs/model_config.py +11 -9
- {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/constrained/fsm_cache.py +9 -1
- {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/constrained/jump_forward.py +15 -2
- {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/layers/activation.py +4 -4
- sglang-0.3.3/sglang/srt/layers/attention/__init__.py +49 -0
- sglang-0.3.3/sglang/srt/layers/attention/flashinfer_backend.py +277 -0
- {sglang-0.3.2/sglang/srt/layers → sglang-0.3.3/sglang/srt/layers/attention}/flashinfer_utils.py +82 -80
- sglang-0.3.3/sglang/srt/layers/attention/triton_backend.py +161 -0
- {sglang-0.3.2/sglang/srt/layers/triton_attention → sglang-0.3.3/sglang/srt/layers/attention/triton_ops}/extend_attention.py +3 -1
- {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/layers/layernorm.py +4 -4
- {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/layers/logits_processor.py +19 -15
- {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/layers/pooler.py +3 -3
- {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/layers/quantization/__init__.py +0 -2
- {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/layers/radix_attention.py +6 -4
- {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/layers/sampler.py +6 -4
- {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/layers/torchao_utils.py +18 -0
- {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/lora/lora.py +20 -21
- {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/lora/lora_manager.py +97 -25
- {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/managers/detokenizer_manager.py +31 -18
- sglang-0.3.3/sglang/srt/managers/image_processor.py +187 -0
- {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/managers/io_struct.py +99 -75
- {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/managers/schedule_batch.py +184 -63
- sglang-0.3.2/sglang/srt/managers/policy_scheduler.py → sglang-0.3.3/sglang/srt/managers/schedule_policy.py +31 -21
- sglang-0.3.2/sglang/srt/managers/tp_worker.py → sglang-0.3.3/sglang/srt/managers/scheduler.py +379 -383
- {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/managers/tokenizer_manager.py +120 -248
- sglang-0.3.3/sglang/srt/managers/tp_worker.py +128 -0
- {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/mem_cache/memory_pool.py +34 -52
- {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/model_executor/cuda_graph_runner.py +15 -19
- sglang-0.3.3/sglang/srt/model_executor/forward_batch_info.py +173 -0
- {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/model_executor/model_runner.py +76 -75
- {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/models/baichuan.py +10 -10
- {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/models/chatglm.py +12 -12
- {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/models/commandr.py +10 -10
- {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/models/dbrx.py +12 -12
- {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/models/deepseek.py +10 -10
- {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/models/deepseek_v2.py +14 -15
- {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/models/exaone.py +10 -10
- {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/models/gemma.py +10 -10
- {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/models/gemma2.py +11 -11
- {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/models/gpt_bigcode.py +10 -10
- {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/models/grok.py +10 -10
- {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/models/internlm2.py +10 -10
- {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/models/llama.py +14 -10
- {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/models/llama_classification.py +5 -5
- {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/models/llama_embedding.py +4 -4
- sglang-0.3.3/sglang/srt/models/llama_reward.py +142 -0
- {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/models/llava.py +39 -33
- {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/models/llavavid.py +31 -28
- {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/models/minicpm.py +10 -10
- {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/models/minicpm3.py +14 -15
- {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/models/mixtral.py +10 -10
- {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/models/mixtral_quant.py +10 -10
- {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/models/olmoe.py +10 -10
- {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/models/qwen.py +10 -10
- {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/models/qwen2.py +11 -11
- {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/models/qwen2_moe.py +10 -10
- {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/models/stablelm.py +10 -10
- sglang-0.3.3/sglang/srt/models/torch_native_llama.py +506 -0
- {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/models/xverse.py +10 -10
- {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/models/xverse_moe.py +10 -10
- {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/sampling/sampling_batch_info.py +36 -27
- {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/sampling/sampling_params.py +3 -1
- {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/server.py +170 -119
- {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/server_args.py +54 -27
- {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/utils.py +101 -128
- {sglang-0.3.2 → sglang-0.3.3}/sglang/test/runners.py +71 -26
- {sglang-0.3.2 → sglang-0.3.3}/sglang/test/test_programs.py +38 -5
- {sglang-0.3.2 → sglang-0.3.3}/sglang/test/test_utils.py +18 -9
- sglang-0.3.3/sglang/version.py +1 -0
- {sglang-0.3.2 → sglang-0.3.3/sglang.egg-info}/PKG-INFO +37 -19
- {sglang-0.3.2 → sglang-0.3.3}/sglang.egg-info/SOURCES.txt +12 -8
- {sglang-0.3.2 → sglang-0.3.3}/sglang.egg-info/requires.txt +1 -0
- sglang-0.3.2/sglang/srt/layers/attention_backend.py +0 -474
- sglang-0.3.2/sglang/srt/managers/controller_multi.py +0 -207
- sglang-0.3.2/sglang/srt/managers/controller_single.py +0 -164
- sglang-0.3.2/sglang/srt/model_executor/forward_batch_info.py +0 -174
- sglang-0.3.2/sglang/version.py +0 -1
- {sglang-0.3.2 → sglang-0.3.3}/LICENSE +0 -0
- {sglang-0.3.2 → sglang-0.3.3}/setup.cfg +0 -0
- {sglang-0.3.2 → sglang-0.3.3}/sglang/bench_server_latency.py +0 -0
- {sglang-0.3.2 → sglang-0.3.3}/sglang/check_env.py +0 -0
- {sglang-0.3.2 → sglang-0.3.3}/sglang/global_config.py +0 -0
- {sglang-0.3.2 → sglang-0.3.3}/sglang/lang/__init__.py +0 -0
- {sglang-0.3.2 → sglang-0.3.3}/sglang/lang/backend/__init__.py +0 -0
- {sglang-0.3.2 → sglang-0.3.3}/sglang/lang/backend/anthropic.py +0 -0
- {sglang-0.3.2 → sglang-0.3.3}/sglang/lang/backend/base_backend.py +0 -0
- {sglang-0.3.2 → sglang-0.3.3}/sglang/lang/backend/litellm.py +0 -0
- {sglang-0.3.2 → sglang-0.3.3}/sglang/lang/backend/openai.py +0 -0
- {sglang-0.3.2 → sglang-0.3.3}/sglang/lang/backend/vertexai.py +0 -0
- {sglang-0.3.2 → sglang-0.3.3}/sglang/lang/chat_template.py +0 -0
- {sglang-0.3.2 → sglang-0.3.3}/sglang/lang/choices.py +0 -0
- {sglang-0.3.2 → sglang-0.3.3}/sglang/lang/compiler.py +0 -0
- {sglang-0.3.2 → sglang-0.3.3}/sglang/lang/tracer.py +0 -0
- {sglang-0.3.2 → sglang-0.3.3}/sglang/launch_server.py +0 -0
- {sglang-0.3.2 → sglang-0.3.3}/sglang/launch_server_llavavid.py +0 -0
- {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/configs/__init__.py +0 -0
- {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/configs/exaone.py +0 -0
- {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/constrained/__init__.py +0 -0
- {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/constrained/base_tool_cache.py +0 -0
- {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/conversation.py +0 -0
- {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/hf_transformers_utils.py +0 -0
- {sglang-0.3.2/sglang/srt/layers/triton_attention → sglang-0.3.3/sglang/srt/layers/attention/triton_ops}/decode_attention.py +0 -0
- {sglang-0.3.2/sglang/srt/layers/triton_attention → sglang-0.3.3/sglang/srt/layers/attention/triton_ops}/prefill_attention.py +0 -0
- {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/layers/fused_moe/__init__.py +0 -0
- {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/layers/fused_moe/fused_moe.py +0 -0
- {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/layers/fused_moe/layer.py +0 -0
- {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/layers/fused_moe/patch.py +0 -0
- {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/layers/linear.py +0 -0
- {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/layers/quantization/base_config.py +0 -0
- {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/lora/lora_config.py +0 -0
- {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/mem_cache/base_prefix_cache.py +0 -0
- {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/mem_cache/chunk_cache.py +0 -0
- {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/mem_cache/flush_cache.py +0 -0
- {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/mem_cache/radix_cache.py +0 -0
- {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/mm_utils.py +0 -0
- {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/models/mistral.py +0 -0
- {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/models/yivl.py +0 -0
- {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/openai_api/adapter.py +0 -0
- {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/openai_api/protocol.py +0 -0
- {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/sampling/penaltylib/__init__.py +0 -0
- {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/sampling/penaltylib/orchestrator.py +0 -0
- {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +0 -0
- {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +0 -0
- {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +0 -0
- {sglang-0.3.2 → sglang-0.3.3}/sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +0 -0
- {sglang-0.3.2 → sglang-0.3.3}/sglang/test/few_shot_gsm8k.py +0 -0
- {sglang-0.3.2 → sglang-0.3.3}/sglang/test/run_eval.py +0 -0
- {sglang-0.3.2 → sglang-0.3.3}/sglang/test/simple_eval_common.py +0 -0
- {sglang-0.3.2 → sglang-0.3.3}/sglang/test/simple_eval_gpqa.py +0 -0
- {sglang-0.3.2 → sglang-0.3.3}/sglang/test/simple_eval_humaneval.py +0 -0
- {sglang-0.3.2 → sglang-0.3.3}/sglang/test/simple_eval_math.py +0 -0
- {sglang-0.3.2 → sglang-0.3.3}/sglang/test/simple_eval_mgsm.py +0 -0
- {sglang-0.3.2 → sglang-0.3.3}/sglang/test/simple_eval_mmlu.py +0 -0
- {sglang-0.3.2 → sglang-0.3.3}/sglang/test/srt/sampling/penaltylib/utils.py +0 -0
- {sglang-0.3.2 → sglang-0.3.3}/sglang/test/test_activation.py +0 -0
- {sglang-0.3.2 → sglang-0.3.3}/sglang/test/test_layernorm.py +0 -0
- {sglang-0.3.2 → sglang-0.3.3}/sglang/utils.py +0 -0
- {sglang-0.3.2 → sglang-0.3.3}/sglang.egg-info/dependency_links.txt +0 -0
- {sglang-0.3.2 → sglang-0.3.3}/sglang.egg-info/top_level.txt +0 -0

PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.3.2
+Version: 0.3.3
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
 Version 2.0, January 2004
@@ -233,6 +233,7 @@ Requires-Dist: uvloop; extra == "srt"
 Requires-Dist: zmq; extra == "srt"
 Requires-Dist: vllm==0.5.5; extra == "srt"
 Requires-Dist: outlines>=0.0.44; extra == "srt"
+Requires-Dist: modelscope; extra == "srt"
 Provides-Extra: openai
 Requires-Dist: openai>=1.0; extra == "openai"
 Requires-Dist: tiktoken; extra == "openai"
@@ -269,16 +270,11 @@ Requires-Dist: sglang[test]; extra == "dev"
 
 --------------------------------------------------------------------------------
 
-| [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Paper**](https://arxiv.org/abs/2312.07104) | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2ngly9muu-t37XiH87qvD~6rVBTkTEHw) | [**Join Weekly Development Meeting**](https://calendar.app.google/
+| [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Paper**](https://arxiv.org/abs/2312.07104) | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2ngly9muu-t37XiH87qvD~6rVBTkTEHw) | [**Join Bi-Weekly Development Meeting (Oct. 19)**](https://calendar.app.google/GYW7S8QGoanCuaxW6) |
 
-
-
-
-
-- **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, continuous batching, token attention (paged attention), tensor parallelism, FlashInfer kernels, chunked prefill, and quantization (INT4/FP8/AWQ/GPTQ).
-- **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
-- **Extensive Model Support**: Supports a wide range of generative models (Llama 3, Gemma 2, Mistral, QWen, DeepSeek, LLaVA, etc.) and embedding models (e5-mistral), with easy extensibility for integrating new models.
-- **Active Community**: SGLang is open-source and backed by an active community with industry adoption.
+## Upcoming Events
+- [Oct. 11, 2024] Invited talks at [AMD Advancing AI](https://www.amd.com/en/corporate/events/advancing-ai.html) Developer Day.
+- [Oct. 16, 2024] Online meetup for efficient LLM deployment and serving, co-hosted by SGLang, FlashInfer, and MLC LLM! Fill out the [Google form](https://forms.gle/B3YeedLxmrrhL1NM8) to receive the invite link.
 
 ## News
 - [2024/09] 🔥 SGLang v0.3 Release: 7x Faster DeepSeek MLA, 1.5x Faster torch.compile, Multi-Image/Video LLaVA-OneVision ([blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)).
@@ -294,6 +290,16 @@ The core features include:
 
 </details>
 
+## About
+SGLang is a fast serving framework for large language models and vision language models.
+It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.
+The core features include:
+
+- **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, continuous batching, token attention (paged attention), tensor parallelism, FlashInfer kernels, chunked prefill, and quantization (INT4/FP8/AWQ/GPTQ).
+- **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
+- **Extensive Model Support**: Supports a wide range of generative models (Llama, Gemma, Mistral, QWen, DeepSeek, LLaVA, etc.) and embedding models (e5-mistral), with easy extensibility for integrating new models.
+- **Active Community**: SGLang is open-source and backed by an active community with industry adoption.
+
 ## Contents
 - [Install](#install)
 - [Backend: SGLang Runtime (SRT)](#backend-sglang-runtime-srt)
@@ -318,7 +324,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
 ### Method 2: From source
 ```
 # Use the last release branch
-git clone -b v0.3.2 https://github.com/sgl-project/sglang.git
+git clone -b v0.3.3 https://github.com/sgl-project/sglang.git
 cd sglang
 
 pip install --upgrade pip
@@ -339,7 +345,7 @@ docker run --gpus all \
 --env "HF_TOKEN=<secret>" \
 --ipc=host \
 lmsysorg/sglang:latest \
-python3 -m sglang.launch_server --model-path meta-llama/
+python3 -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --host 0.0.0.0 --port 30000
 ```
 
 ### Method 4: Using docker compose
@@ -379,7 +385,7 @@ resources:
 run: |
 conda deactivate
 python3 -m sglang.launch_server \
---model-path meta-llama/
+--model-path meta-llama/Llama-3.1-8B-Instruct \
 --host 0.0.0.0 \
 --port 30000
 ```
@@ -421,7 +427,8 @@ curl http://localhost:30000/generate \
 }
 }'
 ```
-
+
+Learn more about the argument specification, streaming, and multi-modal support [here](docs/en/sampling_params.md).
 
 ### OpenAI Compatible API
 In addition, the server supports OpenAI-compatible APIs.
@@ -460,7 +467,7 @@ response = client.embeddings.create(
 print(response)
 ```
 
-It supports streaming, vision, and
+It supports streaming, vision, and almost all features of the Chat/Completions/Models/Batch endpoints specified by the [OpenAI API Reference](https://platform.openai.com/docs/api-reference/).
 
 ### Additional Server Arguments
 - To enable multi-GPU tensor parallelism, add `--tp 2`. If it reports the error "peer access is not supported between these two devices", add `--enable-p2p-check` to the server launch command.
@@ -481,10 +488,11 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --chunked-prefill-size 4096
 ```
 - To enable torch.compile acceleration, add `--enable-torch-compile`. It accelerates small models on small batch sizes.
+- To enable torchao quantization, add `--torchao-config int4wo-128`. It supports various quantization strategies.
 - To enable fp8 weight quantization, add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
 - To enable fp8 kv cache quantization, add `--kv-cache-dtype fp8_e5m2`.
 - If the model does not have a chat template in the Hugging Face tokenizer, you can specify a [custom chat template](docs/en/custom_chat_template.md).
-- To run tensor parallelism on multiple nodes, add `--nnodes 2`. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-0` be the hostname of the first node and `50000` be an available port.
+- To run tensor parallelism on multiple nodes, add `--nnodes 2`. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-0` be the hostname of the first node and `50000` be an available port, you can use the following commands. If you meet deadlock, please try to add `--disable-cuda-graph`
 ```
 # Node 0
 python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --tp 4 --nccl-init sgl-dev-0:50000 --nnodes 2 --node-rank 0
@@ -499,9 +507,9 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 - Llama / Llama 2 / Llama 3 / Llama 3.1
 - Mistral / Mixtral / Mistral NeMo
 - Gemma / Gemma 2
-- OLMoE
 - Qwen / Qwen 2 / Qwen 2 MoE
 - DeepSeek / DeepSeek 2
+- OLMoE
 - [LLaVA-OneVision](https://llava-vl.github.io/blog/2024-08-05-llava-onevision/)
 - `python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-7b-ov --port=30000 --chat-template=chatml-llava`
 - `python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-72b-ov --port=30000 --tp-size=8 --chat-template=chatml-llava`
@@ -523,7 +531,6 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 - XVERSE / XVERSE MoE
 - SmolLM
 
-
 **Embedding Models**
 
 - e5-mistral
@@ -544,6 +551,17 @@ Launch [Qwen2-7B-Instruct](https://www.modelscope.cn/models/qwen/qwen2-7b-instru
 ```
 SGLANG_USE_MODELSCOPE=true python -m sglang.launch_server --model-path qwen/Qwen2-7B-Instruct --port 30000
 ```
+
+Or start it by docker.
+```bash
+docker run --gpus all \
+-p 30000:30000 \
+-v ~/.cache/modelscope:/root/.cache/modelscope \
+--env "SGLANG_USE_MODELSCOPE=true" \
+--ipc=host \
+lmsysorg/sglang:latest \
+python3 -m sglang.launch_server --model-path Qwen/Qwen2.5-7B-Instruct --host 0.0.0.0 --port 30000
+```
 
 </details>
 
@@ -582,7 +600,7 @@ GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/
 The frontend language can be used with local models or API models. It is an alternative to the OpenAI API. You may found it easier to use for complex prompting workflow.
 
 ### Quick Start
-The example below shows how to use sglang to answer a
+The example below shows how to use sglang to answer a multi-turn question.
 
 #### Using Local Models
 First, launch a server with
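
The long-description changes above now advertise coverage of the Chat/Completions/Models/Batch endpoints of the OpenAI API. For orientation only, a minimal client call against a locally launched server could look like the sketch below; it is not part of the diff, and the port `30000`, the placeholder model name `"default"`, and the prompt are assumptions taken from the surrounding README commands.

```python
import openai

# Assumes a server already running, e.g.:
#   python3 -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --port 30000
client = openai.Client(base_url="http://127.0.0.1:30000/v1", api_key="None")

response = client.chat.completions.create(
    model="default",  # placeholder model name used in the README examples
    messages=[{"role": "user", "content": "List 3 countries and their capitals."}],
    temperature=0,
    max_tokens=64,
)
print(response.choices[0].message.content)
```
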
README.md
@@ -11,16 +11,11 @@
 
 --------------------------------------------------------------------------------
 
-| [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Paper**](https://arxiv.org/abs/2312.07104) | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2ngly9muu-t37XiH87qvD~6rVBTkTEHw) | [**Join Weekly Development Meeting**](https://calendar.app.google/
+| [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Paper**](https://arxiv.org/abs/2312.07104) | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2ngly9muu-t37XiH87qvD~6rVBTkTEHw) | [**Join Bi-Weekly Development Meeting (Oct. 19)**](https://calendar.app.google/GYW7S8QGoanCuaxW6) |
 
-
-
-
-
-- **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, continuous batching, token attention (paged attention), tensor parallelism, FlashInfer kernels, chunked prefill, and quantization (INT4/FP8/AWQ/GPTQ).
-- **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
-- **Extensive Model Support**: Supports a wide range of generative models (Llama 3, Gemma 2, Mistral, QWen, DeepSeek, LLaVA, etc.) and embedding models (e5-mistral), with easy extensibility for integrating new models.
-- **Active Community**: SGLang is open-source and backed by an active community with industry adoption.
+## Upcoming Events
+- [Oct. 11, 2024] Invited talks at [AMD Advancing AI](https://www.amd.com/en/corporate/events/advancing-ai.html) Developer Day.
+- [Oct. 16, 2024] Online meetup for efficient LLM deployment and serving, co-hosted by SGLang, FlashInfer, and MLC LLM! Fill out the [Google form](https://forms.gle/B3YeedLxmrrhL1NM8) to receive the invite link.
 
 ## News
 - [2024/09] 🔥 SGLang v0.3 Release: 7x Faster DeepSeek MLA, 1.5x Faster torch.compile, Multi-Image/Video LLaVA-OneVision ([blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)).
@@ -36,6 +31,16 @@ The core features include:
 
 </details>
 
+## About
+SGLang is a fast serving framework for large language models and vision language models.
+It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.
+The core features include:
+
+- **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, continuous batching, token attention (paged attention), tensor parallelism, FlashInfer kernels, chunked prefill, and quantization (INT4/FP8/AWQ/GPTQ).
+- **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
+- **Extensive Model Support**: Supports a wide range of generative models (Llama, Gemma, Mistral, QWen, DeepSeek, LLaVA, etc.) and embedding models (e5-mistral), with easy extensibility for integrating new models.
+- **Active Community**: SGLang is open-source and backed by an active community with industry adoption.
+
 ## Contents
 - [Install](#install)
 - [Backend: SGLang Runtime (SRT)](#backend-sglang-runtime-srt)
@@ -60,7 +65,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
 ### Method 2: From source
 ```
 # Use the last release branch
-git clone -b v0.3.2 https://github.com/sgl-project/sglang.git
+git clone -b v0.3.3 https://github.com/sgl-project/sglang.git
 cd sglang
 
 pip install --upgrade pip
@@ -81,7 +86,7 @@ docker run --gpus all \
 --env "HF_TOKEN=<secret>" \
 --ipc=host \
 lmsysorg/sglang:latest \
-python3 -m sglang.launch_server --model-path meta-llama/
+python3 -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --host 0.0.0.0 --port 30000
 ```
 
 ### Method 4: Using docker compose
@@ -121,7 +126,7 @@ resources:
 run: |
 conda deactivate
 python3 -m sglang.launch_server \
---model-path meta-llama/
+--model-path meta-llama/Llama-3.1-8B-Instruct \
 --host 0.0.0.0 \
 --port 30000
 ```
@@ -163,7 +168,8 @@ curl http://localhost:30000/generate \
 }
 }'
 ```
-
+
+Learn more about the argument specification, streaming, and multi-modal support [here](docs/en/sampling_params.md).
 
 ### OpenAI Compatible API
 In addition, the server supports OpenAI-compatible APIs.
@@ -202,7 +208,7 @@ response = client.embeddings.create(
 print(response)
 ```
 
-It supports streaming, vision, and
+It supports streaming, vision, and almost all features of the Chat/Completions/Models/Batch endpoints specified by the [OpenAI API Reference](https://platform.openai.com/docs/api-reference/).
 
 ### Additional Server Arguments
 - To enable multi-GPU tensor parallelism, add `--tp 2`. If it reports the error "peer access is not supported between these two devices", add `--enable-p2p-check` to the server launch command.
@@ -223,10 +229,11 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --chunked-prefill-size 4096
 ```
 - To enable torch.compile acceleration, add `--enable-torch-compile`. It accelerates small models on small batch sizes.
+- To enable torchao quantization, add `--torchao-config int4wo-128`. It supports various quantization strategies.
 - To enable fp8 weight quantization, add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
 - To enable fp8 kv cache quantization, add `--kv-cache-dtype fp8_e5m2`.
 - If the model does not have a chat template in the Hugging Face tokenizer, you can specify a [custom chat template](docs/en/custom_chat_template.md).
-- To run tensor parallelism on multiple nodes, add `--nnodes 2`. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-0` be the hostname of the first node and `50000` be an available port.
+- To run tensor parallelism on multiple nodes, add `--nnodes 2`. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-0` be the hostname of the first node and `50000` be an available port, you can use the following commands. If you meet deadlock, please try to add `--disable-cuda-graph`
 ```
 # Node 0
 python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --tp 4 --nccl-init sgl-dev-0:50000 --nnodes 2 --node-rank 0
@@ -241,9 +248,9 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 - Llama / Llama 2 / Llama 3 / Llama 3.1
 - Mistral / Mixtral / Mistral NeMo
 - Gemma / Gemma 2
-- OLMoE
 - Qwen / Qwen 2 / Qwen 2 MoE
 - DeepSeek / DeepSeek 2
+- OLMoE
 - [LLaVA-OneVision](https://llava-vl.github.io/blog/2024-08-05-llava-onevision/)
 - `python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-7b-ov --port=30000 --chat-template=chatml-llava`
 - `python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-72b-ov --port=30000 --tp-size=8 --chat-template=chatml-llava`
@@ -265,7 +272,6 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 - XVERSE / XVERSE MoE
 - SmolLM
 
-
 **Embedding Models**
 
 - e5-mistral
@@ -286,6 +292,17 @@ Launch [Qwen2-7B-Instruct](https://www.modelscope.cn/models/qwen/qwen2-7b-instru
 ```
 SGLANG_USE_MODELSCOPE=true python -m sglang.launch_server --model-path qwen/Qwen2-7B-Instruct --port 30000
 ```
+
+Or start it by docker.
+```bash
+docker run --gpus all \
+-p 30000:30000 \
+-v ~/.cache/modelscope:/root/.cache/modelscope \
+--env "SGLANG_USE_MODELSCOPE=true" \
+--ipc=host \
+lmsysorg/sglang:latest \
+python3 -m sglang.launch_server --model-path Qwen/Qwen2.5-7B-Instruct --host 0.0.0.0 --port 30000
+```
 
 </details>
 
@@ -324,7 +341,7 @@ GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/
 The frontend language can be used with local models or API models. It is an alternative to the OpenAI API. You may found it easier to use for complex prompting workflow.
 
 ### Quick Start
-The example below shows how to use sglang to answer a
+The example below shows how to use sglang to answer a multi-turn question.
 
 #### Using Local Models
 First, launch a server with
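
The Quick Start section touched in the README hunks refers to a multi-turn-question example whose body falls outside this diff. The sketch below only illustrates that kind of frontend program using the documented sglang primitives (`sgl.function`, `sgl.system`/`sgl.user`/`sgl.assistant`, `sgl.gen`, `RuntimeEndpoint`); the questions, token budgets, and endpoint URL are illustrative assumptions.

```python
import sglang as sgl

@sgl.function
def multi_turn_question(s, question_1, question_2):
    # Chained generation: each sgl.gen() fills a named slot in the running state.
    s += sgl.system("You are a helpful assistant.")
    s += sgl.user(question_1)
    s += sgl.assistant(sgl.gen("answer_1", max_tokens=256))
    s += sgl.user(question_2)
    s += sgl.assistant(sgl.gen("answer_2", max_tokens=256))

# Point the frontend at a locally launched server (see the launch commands above).
sgl.set_default_backend(sgl.RuntimeEndpoint("http://localhost:30000"))

state = multi_turn_question.run(
    question_1="What is the capital of France?",
    question_2="What is its population?",
)
print(state["answer_1"])
print(state["answer_2"])
```
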
pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "sglang"
-version = "0.3.2"
+version = "0.3.3"
 description = "SGLang is yet another fast serving framework for large language models and vision language models."
 readme = "README.md"
 requires-python = ">=3.8"
@@ -23,7 +23,7 @@ dependencies = [
 srt = ["aiohttp", "decord", "fastapi", "hf_transfer", "huggingface_hub", "interegular",
 "packaging", "pillow", "psutil", "pydantic", "python-multipart",
 "torch", "torchao", "uvicorn", "uvloop", "zmq",
-"vllm==0.5.5", "outlines>=0.0.44"]
+"vllm==0.5.5", "outlines>=0.0.44", "modelscope"]
 openai = ["openai>=1.0", "tiktoken"]
 anthropic = ["anthropic>=0.20.0"]
 litellm = ["litellm>=1.0.0"]
sglang/__init__.py
@@ -1,6 +1,7 @@
 # SGL API Components
 
 from sglang.api import (
+Engine,
 Runtime,
 assistant,
 assistant_begin,
@@ -31,6 +32,7 @@ from sglang.lang.choices import (
 # SGLang DSL APIs
 __all__ = [
 "Runtime",
+"Engine",
 "assistant",
 "assistant_begin",
 "assistant_end",
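
Version 0.3.3 starts exporting `Engine` alongside `Runtime` (see the `__init__.py` hunks above and the `Engine` wrapper added to `sglang/api.py` below). The constructor arguments and methods of `Engine` are not part of this excerpt; the sketch below only illustrates the intended server-free usage pattern, and the `generate()` call, its sampling-params dict, and the `"text"` field of the result are assumptions, not taken from this diff.

```python
import sglang as sgl

# Hypothetical offline usage of the newly exported Engine wrapper.
# model_path mirrors the launch flag shown in the README diff; the generate()
# signature and return shape are assumed.
llm = sgl.Engine(model_path="meta-llama/Llama-3.1-8B-Instruct")

prompts = ["The capital of France is", "The future of AI is"]
sampling_params = {"temperature": 0.8, "max_new_tokens": 32}

outputs = llm.generate(prompts, sampling_params)
for prompt, output in zip(prompts, outputs):
    print(prompt, "->", output["text"])
```
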
sglang/api.py
@@ -33,13 +33,23 @@ def function(
 
 
 def Runtime(*args, **kwargs):
-# Avoid importing unnecessary dependency
 os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
+
+# Avoid importing unnecessary dependency
 from sglang.srt.server import Runtime
 
 return Runtime(*args, **kwargs)
 
 
+def Engine(*args, **kwargs):
+os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
+
+# Avoid importing unnecessary dependency
+from sglang.srt.server import Engine
+
+return Engine(*args, **kwargs)
+
+
 def set_default_backend(backend: BaseBackend):
 global_config.default_backend = backend
 
@@ -48,6 +58,10 @@ def flush_cache(backend: Optional[BaseBackend] = None):
 backend = backend or global_config.default_backend
 if backend is None:
 return False
+
+# If backend is Runtime
+if hasattr(backend, "endpoint"):
+backend = backend.endpoint
 return backend.flush_cache()
 
 
@@ -55,12 +69,17 @@ def get_server_args(backend: Optional[BaseBackend] = None):
 backend = backend or global_config.default_backend
 if backend is None:
 return None
+
+# If backend is Runtime
+if hasattr(backend, "endpoint"):
+backend = backend.endpoint
 return backend.get_server_args()
 
 
 def gen(
 name: Optional[str] = None,
 max_tokens: Optional[int] = None,
+min_tokens: Optional[int] = None,
 stop: Optional[Union[str, List[str]]] = None,
 stop_token_ids: Optional[List[int]] = None,
 temperature: Optional[float] = None,
@@ -100,6 +119,7 @@ def gen(
 return SglGen(
 name,
 max_tokens,
+min_tokens,
 stop,
 stop_token_ids,
 temperature,
@@ -139,6 +159,7 @@ def gen_int(
 return SglGen(
 name,
 max_tokens,
+None,
 stop,
 stop_token_ids,
 temperature,
@@ -177,6 +198,7 @@ def gen_string(
 return SglGen(
 name,
 max_tokens,
+None,
 stop,
 stop_token_ids,
 temperature,
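
The `api.py` hunks above add a `min_tokens` parameter to `sgl.gen()` and thread it into `SglGen` (`gen_int`/`gen_string` pass `None`). A minimal illustration of calling the new parameter follows; the prompt text and token budgets are made up, and the exact enforcement of the minimum happens in the runtime, outside this excerpt.

```python
import sglang as sgl

@sgl.function
def short_story(s, topic):
    s += sgl.user(f"Tell a short story about {topic}.")
    # min_tokens is new in 0.3.3: request at least 64 generated tokens before the
    # generation is allowed to stop (server-side semantics not shown in this diff).
    s += sgl.assistant(sgl.gen("story", min_tokens=64, max_tokens=256, temperature=0.7))
```
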
sglang/bench_latency.py
@@ -47,6 +47,7 @@ I'm going to the park
 import argparse
 import dataclasses
 import itertools
+import json
 import logging
 import multiprocessing
 import os
@@ -62,10 +63,11 @@ import torch.distributed as dist
 from sglang.srt.configs.model_config import ModelConfig
 from sglang.srt.hf_transformers_utils import get_tokenizer
 from sglang.srt.managers.schedule_batch import Req, ScheduleBatch
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
 from sglang.srt.model_executor.model_runner import ModelRunner
 from sglang.srt.sampling.sampling_params import SamplingParams
 from sglang.srt.server import _set_envs_and_config
-from sglang.srt.server_args import ServerArgs
+from sglang.srt.server_args import PortArgs, ServerArgs
 from sglang.srt.utils import (
 configure_logger,
 kill_child_process,
@@ -121,7 +123,7 @@ class BenchArgs:
 )
 
 
-def load_model(server_args, tp_rank):
+def load_model(server_args, port_args, tp_rank):
 suppress_other_loggers()
 rank_print = print if tp_rank == 0 else lambda *args, **kwargs: None
 
@@ -129,6 +131,7 @@ def load_model(server_args, tp_rank):
 server_args.model_path,
 server_args.trust_remote_code,
 context_length=server_args.context_length,
+model_override_args=json.loads(server_args.json_model_override_args),
 )
 model_runner = ModelRunner(
 model_config=model_config,
@@ -136,7 +139,7 @@ def load_model(server_args, tp_rank):
 gpu_id=tp_rank,
 tp_rank=tp_rank,
 tp_size=server_args.tp_size,
-nccl_port=
+nccl_port=port_args.nccl_ports[0],
 server_args=server_args,
 )
 rank_print(f"max_total_num_tokens={model_runner.max_total_num_tokens}")
@@ -167,9 +170,13 @@ def prepare_inputs_for_correctness_test(bench_args, tokenizer):
 assert len(input_ids[i]) > bench_args.cut_len
 
 tmp_input_ids = input_ids[i][: bench_args.cut_len]
-req = Req(
+req = Req(
+rid=i,
+origin_input_text=prompts[i],
+origin_input_ids=tmp_input_ids,
+sampling_params=sampling_params,
+)
 req.prefix_indices = []
-req.sampling_params = sampling_params
 req.fill_ids = req.origin_input_ids
 req.extend_input_len = len(req.fill_ids) - len(req.prefix_indices)
 reqs.append(req)
@@ -199,9 +206,13 @@ def prepare_synthetic_inputs_for_latency_test(batch_size, input_len):
 
 reqs = []
 for i in range(len(input_ids)):
-req = Req(
+req = Req(
+rid=i,
+origin_input_text="",
+origin_input_ids=list(input_ids[i]),
+sampling_params=sampling_params,
+)
 req.prefix_indices = []
-req.sampling_params = sampling_params
 req.fill_ids = req.origin_input_ids
 req.extend_input_len = len(req.fill_ids) - len(req.prefix_indices)
 reqs.append(req)
@@ -217,28 +228,33 @@ def extend(reqs, model_runner):
 tree_cache=None,
 )
 batch.prepare_for_extend(model_runner.model_config.vocab_size)
-
-
+model_worker_batch = batch.get_model_worker_batch()
+forward_batch = ForwardBatch.init_new(model_worker_batch, model_runner)
+logits_output = model_runner.forward(forward_batch)
+next_token_ids = model_runner.sample(logits_output, forward_batch).tolist()
 return next_token_ids, logits_output.next_token_logits, batch
 
 
 def decode(input_token_ids, batch, model_runner):
 batch.prepare_for_decode(input_token_ids)
-
-
+model_worker_batch = batch.get_model_worker_batch()
+forward_batch = ForwardBatch.init_new(model_worker_batch, model_runner)
+logits_output = model_runner.forward(forward_batch)
+next_token_ids = model_runner.sample(logits_output, forward_batch).tolist()
 return next_token_ids, logits_output.next_token_logits
 
 
 @torch.inference_mode()
 def correctness_test(
 server_args,
+port_args,
 bench_args,
 tp_rank,
 ):
 rank_print = print if tp_rank == 0 else lambda *args, **kwargs: None
 
 # Load the model
-model_runner, tokenizer = load_model(server_args, tp_rank)
+model_runner, tokenizer = load_model(server_args, port_args, tp_rank)
 
 # Prepare inputs
 input_ids, reqs = prepare_inputs_for_correctness_test(bench_args, tokenizer)
@@ -324,13 +340,16 @@ def latency_test_run_once(
 rank_print(
 f"Decode. latency: {latency:6.5f} s, throughput: {throughput:9.2f} token/s"
 )
-
-
-
-
-
-
-
+
+# record decode timing from 2nd output
+if output_len > 1:
+med_decode_latency = np.median(decode_latencies)
+med_decode_throughput = batch_size / med_decode_latency
+rank_print(
+f"Decode. median latency: {med_decode_latency:6.5f} s, median throughput: {med_decode_throughput:9.2f} token/s"
+)
+measurement_results["median_decode_latency"] = med_decode_latency
+measurement_results["median_decode_throughput"] = med_decode_throughput
 
 throughput = (input_len + output_len) * batch_size / tot_latency
 rank_print(
@@ -343,15 +362,15 @@
 
 def latency_test(
 server_args,
+port_args,
 bench_args,
 tp_rank,
 ):
 configure_logger(server_args, prefix=f" TP{tp_rank}")
-_set_envs_and_config(server_args)
 rank_print = print if tp_rank == 0 else lambda *args, **kwargs: None
 
 # Load the model
-model_runner, tokenizer = load_model(server_args, tp_rank)
+model_runner, tokenizer = load_model(server_args, port_args, tp_rank)
 
 # Prepare inputs for warm up
 reqs = prepare_synthetic_inputs_for_latency_test(
@@ -367,7 +386,7 @@ def latency_test(
 reqs,
 bench_args.batch_size[0],
 bench_args.input_len[0],
-
+8, # shorter decoding to speed up the warmup
 )
 rank_print("Benchmark ...")
 
@@ -453,6 +472,7 @@ def plot_latency_test(
 
 
 def main(server_args, bench_args):
+_set_envs_and_config(server_args)
 
 if server_args.model_path:
 if bench_args.correctness_test:
@@ -468,8 +488,10 @@ def main(server_args, bench_args):
 "provide --result-filename for plotting the results"
 )
 
+port_args = PortArgs.init_new(server_args)
+
 if server_args.tp_size == 1:
-work_func(server_args, bench_args, 0)
+work_func(server_args, port_args, bench_args, 0)
 else:
 workers = []
 for tp_rank in range(server_args.tp_size):
@@ -477,6 +499,7 @@ def main(server_args, bench_args):
 target=work_func,
 args=(
 server_args,
+port_args,
 bench_args,
 tp_rank,
 ),
@@ -503,8 +526,6 @@ if __name__ == "__main__":
 format="%(message)s",
 )
 
-multiprocessing.set_start_method("spawn", force=True)
-
 try:
 main(server_args, bench_args)
 except Exception as e:
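
The `bench_latency.py` hunks above replace the old direct forward call with a ModelWorkerBatch → ForwardBatch pipeline, construct `Req` with keyword arguments, and pass `PortArgs` down to `load_model`. Condensed into one place, the new per-step flow looks roughly like the sketch below; everything outside the lines shown in the diff (the `ScheduleBatch.init_new` keywords and the surrounding setup) is an assumption, not a definitive API description.

```python
from sglang.srt.managers.schedule_batch import ScheduleBatch
from sglang.srt.model_executor.forward_batch_info import ForwardBatch

def run_one_extend(reqs, model_runner):
    # Batch construction as in bench_latency.extend(); only tree_cache=None appears
    # in the diff, the other keywords here are assumed from context.
    batch = ScheduleBatch.init_new(
        reqs,
        req_to_token_pool=model_runner.req_to_token_pool,
        token_to_kv_pool=model_runner.token_to_kv_pool,
        tree_cache=None,
    )
    batch.prepare_for_extend(model_runner.model_config.vocab_size)

    # New in 0.3.3: go through ModelWorkerBatch and ForwardBatch before forward/sample.
    model_worker_batch = batch.get_model_worker_batch()
    forward_batch = ForwardBatch.init_new(model_worker_batch, model_runner)
    logits_output = model_runner.forward(forward_batch)
    next_token_ids = model_runner.sample(logits_output, forward_batch).tolist()
    return next_token_ids, batch
```
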
sglang/bench_serving.py
@@ -845,6 +845,7 @@ def run_benchmark(args_: argparse.Namespace):
 tokenizer = get_tokenizer(tokenizer_id)
 
 if args.dataset_name == "sharegpt":
+assert args.random_input_len is None and args.random_output_len is None
 input_requests = sample_sharegpt_requests(
 dataset_path=args.dataset_path,
 num_requests=args.num_prompts,
@@ -852,6 +853,7 @@ def run_benchmark(args_: argparse.Namespace):
 fixed_output_len=args.sharegpt_output_len,
 )
 elif args.dataset_name == "random":
+assert args.random_input_len is not None and args.random_output_len is not None
 input_requests = sample_random_requests(
 input_len=args.random_input_len,
 output_len=args.random_output_len,
@@ -964,13 +966,11 @@ if __name__ == "__main__":
 parser.add_argument(
 "--random-input-len",
 type=int,
-default=1024,
 help="Number of input tokens per request, used only for random dataset.",
 )
 parser.add_argument(
 "--random-output-len",
 type=int,
-default=128,
 help="Number of output tokens per request, used only for random dataset.",
 )
 parser.add_argument(