sglang 0.3.1.post1.tar.gz → 0.3.1.post2.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sglang-0.3.1.post1/sglang.egg-info → sglang-0.3.1.post2}/PKG-INFO +4 -5
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/README.md +3 -4
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/pyproject.toml +1 -1
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/bench_latency.py +3 -1
- sglang-0.3.1.post2/sglang/bench_server_latency.py +187 -0
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/bench_serving.py +1 -1
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/layers/activation.py +6 -3
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/layers/layernorm.py +10 -7
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/layers/sampler.py +9 -2
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/managers/io_struct.py +3 -0
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/managers/policy_scheduler.py +49 -93
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/managers/schedule_batch.py +1 -1
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/managers/tp_worker.py +11 -6
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/model_executor/cuda_graph_runner.py +15 -14
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/model_executor/model_runner.py +13 -5
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/models/deepseek_v2.py +2 -2
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/models/llama.py +1 -3
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/models/llama_classification.py +2 -3
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/models/minicpm3.py +2 -2
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/models/xverse.py +1 -3
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/models/xverse_moe.py +1 -4
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/server_args.py +17 -21
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/test/few_shot_gsm8k.py +8 -2
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/test/test_utils.py +1 -0
- sglang-0.3.1.post2/sglang/version.py +1 -0
- {sglang-0.3.1.post1 → sglang-0.3.1.post2/sglang.egg-info}/PKG-INFO +4 -5
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang.egg-info/SOURCES.txt +1 -0
- sglang-0.3.1.post1/sglang/version.py +0 -1
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/LICENSE +0 -0
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/setup.cfg +0 -0
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/__init__.py +0 -0
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/api.py +0 -0
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/check_env.py +0 -0
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/global_config.py +0 -0
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/lang/__init__.py +0 -0
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/lang/backend/__init__.py +0 -0
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/lang/backend/anthropic.py +0 -0
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/lang/backend/base_backend.py +0 -0
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/lang/backend/litellm.py +0 -0
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/lang/backend/openai.py +0 -0
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/lang/backend/runtime_endpoint.py +0 -0
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/lang/backend/vertexai.py +0 -0
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/lang/chat_template.py +0 -0
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/lang/choices.py +0 -0
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/lang/compiler.py +0 -0
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/lang/interpreter.py +0 -0
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/lang/ir.py +0 -0
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/lang/tracer.py +0 -0
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/launch_server.py +0 -0
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/launch_server_llavavid.py +0 -0
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/configs/__init__.py +0 -0
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/configs/exaone.py +0 -0
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/configs/model_config.py +0 -0
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/constrained/__init__.py +0 -0
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/constrained/base_tool_cache.py +0 -0
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/constrained/fsm_cache.py +0 -0
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/constrained/jump_forward.py +0 -0
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/conversation.py +0 -0
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/hf_transformers_utils.py +0 -0
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/layers/attention_backend.py +0 -0
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/layers/flashinfer_utils.py +0 -0
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/layers/fused_moe/__init__.py +0 -0
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/layers/fused_moe/fused_moe.py +0 -0
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/layers/fused_moe/layer.py +0 -0
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/layers/logits_processor.py +0 -0
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/layers/pooler.py +0 -0
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/layers/radix_attention.py +0 -0
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/layers/torchao_utils.py +0 -0
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/layers/triton_attention/decode_attention.py +0 -0
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/layers/triton_attention/extend_attention.py +0 -0
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/layers/triton_attention/prefill_attention.py +0 -0
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/lora/lora.py +0 -0
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/lora/lora_config.py +0 -0
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/lora/lora_manager.py +0 -0
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/managers/controller_multi.py +0 -0
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/managers/controller_single.py +0 -0
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/managers/detokenizer_manager.py +0 -0
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/managers/tokenizer_manager.py +0 -0
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/mem_cache/base_prefix_cache.py +0 -0
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/mem_cache/chunk_cache.py +0 -0
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/mem_cache/flush_cache.py +0 -0
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/mem_cache/memory_pool.py +0 -0
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/mem_cache/radix_cache.py +0 -0
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/mm_utils.py +0 -0
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/model_executor/forward_batch_info.py +0 -0
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/models/baichuan.py +0 -0
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/models/chatglm.py +0 -0
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/models/commandr.py +0 -0
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/models/dbrx.py +0 -0
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/models/deepseek.py +0 -0
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/models/exaone.py +0 -0
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/models/gemma.py +0 -0
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/models/gemma2.py +0 -0
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/models/gpt_bigcode.py +0 -0
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/models/grok.py +0 -0
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/models/internlm2.py +0 -0
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/models/llama_embedding.py +0 -0
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/models/llava.py +0 -0
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/models/llavavid.py +0 -0
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/models/minicpm.py +0 -0
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/models/mistral.py +0 -0
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/models/mixtral.py +0 -0
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/models/mixtral_quant.py +0 -0
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/models/olmoe.py +0 -0
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/models/qwen.py +0 -0
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/models/qwen2.py +0 -0
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/models/qwen2_moe.py +0 -0
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/models/stablelm.py +0 -0
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/models/yivl.py +0 -0
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/openai_api/adapter.py +0 -0
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/openai_api/protocol.py +0 -0
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/sampling/penaltylib/__init__.py +0 -0
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/sampling/penaltylib/orchestrator.py +0 -0
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +0 -0
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +0 -0
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +0 -0
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +0 -0
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/sampling/sampling_batch_info.py +0 -0
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/sampling/sampling_params.py +0 -0
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/server.py +0 -0
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/utils.py +0 -0
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/test/run_eval.py +0 -0
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/test/runners.py +0 -0
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/test/simple_eval_common.py +0 -0
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/test/simple_eval_gpqa.py +0 -0
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/test/simple_eval_humaneval.py +0 -0
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/test/simple_eval_math.py +0 -0
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/test/simple_eval_mgsm.py +0 -0
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/test/simple_eval_mmlu.py +0 -0
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/test/srt/sampling/penaltylib/utils.py +0 -0
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/test/test_activation.py +0 -0
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/test/test_layernorm.py +0 -0
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/test/test_programs.py +0 -0
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/utils.py +0 -0
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang.egg-info/dependency_links.txt +0 -0
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang.egg-info/requires.txt +0 -0
- {sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang.egg-info/top_level.txt +0 -0
{sglang-0.3.1.post1/sglang.egg-info → sglang-0.3.1.post2}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.3.1.post1
+Version: 0.3.1.post2
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
 Version 2.0, January 2004
@@ -269,7 +269,7 @@ Requires-Dist: sglang[test]; extra == "dev"
 
 --------------------------------------------------------------------------------
 
-| [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Paper**](https://arxiv.org/abs/2312.07104) | [**Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2ngly9muu-t37XiH87qvD~6rVBTkTEHw) |
+| [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Paper**](https://arxiv.org/abs/2312.07104) | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2ngly9muu-t37XiH87qvD~6rVBTkTEHw) | [**Join Weekly Development Meeting**](https://calendar.app.google/v2Tw3kuHkKYyp8VV7) |
 
 SGLang is a fast serving framework for large language models and vision language models.
 It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.
@@ -278,7 +278,7 @@ The core features include:
 - **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, continuous batching, token attention (paged attention), tensor parallelism, FlashInfer kernels, chunked prefill, and quantization (INT4/FP8/AWQ/GPTQ).
 - **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
 - **Extensive Model Support**: Supports a wide range of generative models (Llama 3, Gemma 2, Mistral, QWen, DeepSeek, LLaVA, etc.) and embedding models (e5-mistral), with easy extensibility for integrating new models.
-- **Active Community**: SGLang is open-source and backed by an active community with industry adoption
+- **Active Community**: SGLang is open-source and backed by an active community with industry adoption.
 
 ## News
 - [2024/09] 🔥 SGLang v0.3 Release: 7x Faster DeepSeek MLA, 1.5x Faster torch.compile, Multi-Image/Video LLaVA-OneVision ([blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)).
@@ -318,7 +318,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
 ### Method 2: From source
 ```
 # Use the last release branch
-git clone -b v0.3.1.post1 https://github.com/sgl-project/sglang.git
+git clone -b v0.3.1.post2 https://github.com/sgl-project/sglang.git
 cd sglang
 
 pip install --upgrade pip
@@ -483,7 +483,6 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 - To enable torch.compile acceleration, add `--enable-torch-compile`. It accelerates small models on small batch sizes.
 - To enable fp8 weight quantization, add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
 - To enable fp8 kv cache quantization, add `--kv-cache-dtype fp8_e5m2`.
-- To enable DeepSeek MLA acceleration, add `--enable-mla`.
 - If the model does not have a chat template in the Hugging Face tokenizer, you can specify a [custom chat template](docs/en/custom_chat_template.md).
 - To run tensor parallelism on multiple nodes, add `--nnodes 2`. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-0` be the hostname of the first node and `50000` be an available port.
 ```
{sglang-0.3.1.post1 → sglang-0.3.1.post2}/README.md

@@ -11,7 +11,7 @@
 
 --------------------------------------------------------------------------------
 
-| [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Paper**](https://arxiv.org/abs/2312.07104) | [**Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2ngly9muu-t37XiH87qvD~6rVBTkTEHw) |
+| [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Paper**](https://arxiv.org/abs/2312.07104) | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2ngly9muu-t37XiH87qvD~6rVBTkTEHw) | [**Join Weekly Development Meeting**](https://calendar.app.google/v2Tw3kuHkKYyp8VV7) |
 
 SGLang is a fast serving framework for large language models and vision language models.
 It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.
@@ -20,7 +20,7 @@ The core features include:
 - **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, continuous batching, token attention (paged attention), tensor parallelism, FlashInfer kernels, chunked prefill, and quantization (INT4/FP8/AWQ/GPTQ).
 - **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
 - **Extensive Model Support**: Supports a wide range of generative models (Llama 3, Gemma 2, Mistral, QWen, DeepSeek, LLaVA, etc.) and embedding models (e5-mistral), with easy extensibility for integrating new models.
-- **Active Community**: SGLang is open-source and backed by an active community with industry adoption
+- **Active Community**: SGLang is open-source and backed by an active community with industry adoption.
 
 ## News
 - [2024/09] 🔥 SGLang v0.3 Release: 7x Faster DeepSeek MLA, 1.5x Faster torch.compile, Multi-Image/Video LLaVA-OneVision ([blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)).
@@ -60,7 +60,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
 ### Method 2: From source
 ```
 # Use the last release branch
-git clone -b v0.3.1.post1 https://github.com/sgl-project/sglang.git
+git clone -b v0.3.1.post2 https://github.com/sgl-project/sglang.git
 cd sglang
 
 pip install --upgrade pip
@@ -225,7 +225,6 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 - To enable torch.compile acceleration, add `--enable-torch-compile`. It accelerates small models on small batch sizes.
 - To enable fp8 weight quantization, add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
 - To enable fp8 kv cache quantization, add `--kv-cache-dtype fp8_e5m2`.
-- To enable DeepSeek MLA acceleration, add `--enable-mla`.
 - If the model does not have a chat template in the Hugging Face tokenizer, you can specify a [custom chat template](docs/en/custom_chat_template.md).
 - To run tensor parallelism on multiple nodes, add `--nnodes 2`. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-0` be the hostname of the first node and `50000` be an available port.
 ```
{sglang-0.3.1.post1 → sglang-0.3.1.post2}/pyproject.toml

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "sglang"
-version = "0.3.1.post1"
+version = "0.3.1.post2"
 description = "SGLang is yet another fast serving framework for large language models and vision language models."
 readme = "README.md"
 requires-python = ">=3.8"
{sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/bench_latency.py

@@ -1,5 +1,7 @@
 """
-Benchmark the latency of a
+Benchmark the latency of running a single static batch.
+This script does not launch a server and uses the low-level APIs.
+It accepts arguments similar to those of launch_server.py.
 
 # Usage (latency test)
 ## with dummy weights:
sglang-0.3.1.post2/sglang/bench_server_latency.py (new file)

@@ -0,0 +1,187 @@
+"""
+Benchmark the latency of serving a single batch with a real server.
+This script launches a server and uses the HTTP interface.
+It accepts arguments similar to those of launch_server.py.
+
+Usage:
+
+python3 -m sglang.bench_server_latency --model meta-llama/Meta-Llama-3.1-8B --batch-size 1 16 64 --input-len 1024 --output-len 8
+"""
+
+import argparse
+import dataclasses
+import itertools
+import json
+import multiprocessing
+import os
+import time
+from typing import Tuple
+
+import numpy as np
+import requests
+
+from sglang.srt.server import launch_server
+from sglang.srt.server_args import ServerArgs
+from sglang.srt.utils import kill_child_process
+
+
+@dataclasses.dataclass
+class BenchArgs:
+    run_name: str = "default"
+    batch_size: Tuple[int] = (1,)
+    input_len: Tuple[int] = (1024,)
+    output_len: Tuple[int] = (16,)
+    result_filename: str = "result.jsonl"
+
+    @staticmethod
+    def add_cli_args(parser: argparse.ArgumentParser):
+        parser.add_argument("--run-name", type=str, default=BenchArgs.run_name)
+        parser.add_argument(
+            "--batch-size", type=int, nargs="+", default=BenchArgs.batch_size
+        )
+        parser.add_argument(
+            "--input-len", type=int, nargs="+", default=BenchArgs.input_len
+        )
+        parser.add_argument(
+            "--output-len", type=int, nargs="+", default=BenchArgs.output_len
+        )
+        parser.add_argument(
+            "--result-filename", type=str, default=BenchArgs.result_filename
+        )
+
+    @classmethod
+    def from_cli_args(cls, args: argparse.Namespace):
+        # use the default value's type to case the args into correct types.
+        attrs = [(attr.name, type(attr.default)) for attr in dataclasses.fields(cls)]
+        return cls(
+            **{attr: attr_type(getattr(args, attr)) for attr, attr_type in attrs}
+        )
+
+
+def launch_server_internal(server_args):
+    try:
+        launch_server(server_args)
+    except Exception as e:
+        raise e
+    finally:
+        kill_child_process(os.getpid(), including_parent=False)
+
+
+def launch_server_process(server_args: ServerArgs):
+    proc = multiprocessing.Process(target=launch_server_internal, args=(server_args,))
+    proc.start()
+    base_url = f"http://{server_args.host}:{server_args.port}"
+    timeout = 600
+
+    start_time = time.time()
+    while time.time() - start_time < timeout:
+        try:
+            headers = {
+                "Content-Type": "application/json; charset=utf-8",
+            }
+            response = requests.get(f"{base_url}/v1/models", headers=headers)
+            if response.status_code == 200:
+                return proc, base_url
+        except requests.RequestException:
+            pass
+        time.sleep(10)
+    raise TimeoutError("Server failed to start within the timeout period.")
+
+
+def run_one_case(
+    url: str,
+    batch_size: int,
+    input_len: int,
+    output_len: int,
+    run_name: str,
+    result_filename: str,
+):
+    input_ids = [
+        [int(x) for x in np.random.randint(0, high=16384, size=(input_len,))]
+        for _ in range(batch_size)
+    ]
+
+    tic = time.time()
+    response = requests.post(
+        url + "/generate",
+        json={
+            "input_ids": input_ids,
+            "sampling_params": {
+                "temperature": 0,
+                "max_new_tokens": output_len,
+                "ignore_eos": True,
+            },
+        },
+    )
+    latency = time.time() - tic
+
+    _ = response.json()
+    output_throughput = batch_size * output_len / latency
+    overall_throughput = batch_size * (input_len + output_len) / latency
+
+    print(f"batch size: {batch_size}")
+    print(f"latency: {latency:.2f} s")
+    print(f"output throughput: {output_throughput:.2f} token/s")
+    print(f"(input + output) throughput: {overall_throughput:.2f} token/s")
+
+    if result_filename:
+        with open(result_filename, "a") as fout:
+            res = {
+                "run_name": run_name,
+                "batch_size": batch_size,
+                "input_len": input_len,
+                "output_len": output_len,
+                "latency": round(latency, 4),
+                "output_throughput": round(output_throughput, 2),
+                "overall_throughput": round(overall_throughput, 2),
+            }
+            fout.write(json.dumps(res) + "\n")
+
+
+def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
+    proc, base_url = launch_server_process(server_args)
+
+    # warmup
+    run_one_case(
+        base_url,
+        batch_size=16,
+        input_len=1024,
+        output_len=16,
+        run_name="",
+        result_filename="",
+    )
+
+    # benchmark
+    try:
+        for bs, il, ol in itertools.product(
+            bench_args.batch_size, bench_args.input_len, bench_args.output_len
+        ):
+            run_one_case(
+                base_url,
+                bs,
+                il,
+                ol,
+                bench_args.run_name,
+                bench_args.result_filename,
+            )
+    finally:
+        kill_child_process(proc.pid)
+
+    print(f"\nResults are saved to {bench_args.result_filename}")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    ServerArgs.add_cli_args(parser)
+    BenchArgs.add_cli_args(parser)
+    # For this script, model-path is not required
+    assert (
+        parser._actions[1].option_strings[0] == "--model-path"
+    ), "options changed, this code need to be updated"
+    parser._actions[1].required = False
+    args = parser.parse_args()
+
+    server_args = ServerArgs.from_cli_args(args)
+    bench_args = BenchArgs.from_cli_args(args)
+
+    run_benchmark(server_args, bench_args)
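One detail of the new script worth calling out: `BenchArgs.from_cli_args` casts each parsed argparse value with the type of the corresponding dataclass default, so values passed via `nargs="+"` come back as tuples. A minimal standalone sketch of the same idiom (the `DemoArgs` class and flag values below are illustrative, not part of the package):

```python
import argparse
import dataclasses
from typing import Tuple


@dataclasses.dataclass
class DemoArgs:
    batch_size: Tuple[int] = (1,)
    input_len: Tuple[int] = (1024,)

    @classmethod
    def from_cli_args(cls, args: argparse.Namespace):
        # Cast each parsed value with the type of the field's default,
        # so nargs="+" lists become tuples again.
        attrs = [(f.name, type(f.default)) for f in dataclasses.fields(cls)]
        return cls(**{name: typ(getattr(args, name)) for name, typ in attrs})


parser = argparse.ArgumentParser()
parser.add_argument("--batch-size", type=int, nargs="+", default=DemoArgs.batch_size)
parser.add_argument("--input-len", type=int, nargs="+", default=DemoArgs.input_len)
demo = DemoArgs.from_cli_args(parser.parse_args(["--batch-size", "1", "16", "64"]))
print(demo)  # DemoArgs(batch_size=(1, 16, 64), input_len=(1024,))
```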
{sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/bench_serving.py

@@ -2,7 +2,7 @@
 # Adapted from https://github.com/vllm-project/vllm/blob/6366efc67b0aedd2c1721c14385370e50b297fb3/benchmarks/benchmark_serving.py
 
 """
-Benchmark online serving.
+Benchmark online serving with dynamic requests.
 
 Usage:
 python3 -m sglang.bench_serving --backend sglang --num-prompt 10
{sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/layers/activation.py

@@ -19,7 +19,12 @@ from typing import Optional
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from flashinfer.activation import gelu_and_mul, gelu_tanh_and_mul, silu_and_mul
+
+from sglang.srt.utils import is_hip
+
+if not is_hip():
+    from flashinfer.activation import gelu_and_mul, gelu_tanh_and_mul, silu_and_mul
+
 from vllm.distributed import (
     divide,
     get_tensor_model_parallel_rank,
@@ -29,8 +34,6 @@ from vllm.model_executor.custom_op import CustomOp
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.utils import set_weight_attrs
 
-from sglang.srt.utils import is_hip
-
 logger = logging.getLogger(__name__)
 
 
{sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/layers/layernorm.py

@@ -20,16 +20,19 @@ from typing import Optional, Tuple, Union
 
 import torch
 import torch.nn as nn
-from flashinfer.norm import (
-    fused_add_rmsnorm,
-    gemma_fused_add_rmsnorm,
-    gemma_rmsnorm,
-    rmsnorm,
-)
-from vllm.model_executor.custom_op import CustomOp
 
 from sglang.srt.utils import is_hip
 
+if not is_hip():
+    from flashinfer.norm import (
+        fused_add_rmsnorm,
+        gemma_fused_add_rmsnorm,
+        gemma_rmsnorm,
+        rmsnorm,
+    )
+
+from vllm.model_executor.custom_op import CustomOp
+
 logger = logging.getLogger(__name__)
 
 
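The activation.py and layernorm.py hunks above apply the same pattern: FlashInfer kernels are imported only when `is_hip()` is false, so ROCm builds can import these modules without FlashInfer installed. Below is a hedged sketch of that guarded-import idiom with a plain-PyTorch fallback; the dispatch function and reference RMSNorm are illustrative rather than SGLang's actual code path, and the `rmsnorm` call signature is an assumption:

```python
import torch


def is_hip() -> bool:
    # Mirrors sglang.srt.utils.is_hip: True on ROCm builds of PyTorch.
    return torch.version.hip is not None


_flashinfer_rmsnorm = None
if not is_hip():
    try:
        from flashinfer.norm import rmsnorm as _flashinfer_rmsnorm
    except ImportError:
        pass  # fall back to the reference implementation below


def rms_norm(x: torch.Tensor, weight: torch.Tensor, eps: float = 1e-6) -> torch.Tensor:
    if _flashinfer_rmsnorm is not None and x.is_cuda:
        return _flashinfer_rmsnorm(x, weight, eps)  # assumed signature
    # Reference fallback in plain PyTorch.
    variance = x.pow(2).mean(dim=-1, keepdim=True)
    return x * torch.rsqrt(variance + eps) * weight
```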
{sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/layers/sampler.py

@@ -31,8 +31,11 @@ class Sampler(nn.Module):
         logits = logits.next_token_logits
 
         # Post process logits
+        logits = logits.contiguous()
         logits.div_(sampling_info.temperatures)
-        probs =
+        probs = torch.softmax(logits, dim=-1)
+        logits = None
+        del logits
 
         if torch.any(torch.isnan(probs)):
             logger.warning("Detected errors during sampling! NaN in the probability.")
@@ -53,7 +56,11 @@ class Sampler(nn.Module):
             )
         else:
             batch_next_token_ids, success = top_k_top_p_sampling_from_probs(
-                probs,
+                probs,
+                uniform_samples,
+                sampling_info.top_ks,
+                sampling_info.top_ps,
+                filter_apply_order="joint",
             )
 
         if not torch.all(success):
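For context on the sampler.py change: the path it touches makes the logits contiguous, scales by temperature, applies a softmax, and then samples with a joint top-k/top-p filter (hence `filter_apply_order="joint"`). The sketch below reproduces that flow in plain PyTorch purely as an illustration; it is not the FlashInfer `top_k_top_p_sampling_from_probs` kernel the server calls, and the tensor shapes are assumptions:

```python
import torch


def sample_joint_top_k_top_p(logits, temperatures, top_ks, top_ps):
    # Temperature scaling and softmax, as in the diff above.
    logits = logits.contiguous()
    logits.div_(temperatures)
    probs = torch.softmax(logits, dim=-1)

    # Joint filter: a token survives only if it passes both top-k and top-p.
    sorted_probs, sorted_idx = torch.sort(probs, dim=-1, descending=True)
    cumsum = torch.cumsum(sorted_probs, dim=-1)
    ranks = torch.arange(probs.shape[-1], device=probs.device).expand_as(probs)
    keep = (ranks < top_ks.unsqueeze(-1)) & (
        (cumsum - sorted_probs) < top_ps.unsqueeze(-1)
    )
    keep[..., 0] = True  # always keep the most likely token

    filtered = sorted_probs * keep
    filtered = filtered / filtered.sum(dim=-1, keepdim=True)
    sampled = torch.multinomial(filtered, num_samples=1)
    return sorted_idx.gather(-1, sampled).squeeze(-1)


if __name__ == "__main__":
    torch.manual_seed(0)
    logits = torch.randn(2, 32000)          # [batch, vocab]; assumed shapes
    temperatures = torch.full((2, 1), 0.7)
    top_ks = torch.tensor([40, 1])
    top_ps = torch.tensor([0.95, 1.0])
    print(sample_joint_top_k_top_p(logits, temperatures, top_ks, top_ps))
```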
{sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/managers/io_struct.py

@@ -133,6 +133,9 @@ class GenerateReqInput:
                 self.image_data = [None] * num
             elif not isinstance(self.image_data, list):
                 self.image_data = [self.image_data] * num
+            elif isinstance(self.image_data, list):
+                # multi-image with n > 1
+                self.image_data = self.image_data * num
 
             if self.sampling_params is None:
                 self.sampling_params = [{}] * num
{sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/managers/policy_scheduler.py

@@ -119,19 +119,32 @@ class PrefillAdder:
         self.running_batch = running_batch
         self.new_token_ratio = new_token_ratio
         self.rem_total_tokens = rem_total_tokens - mixed_with_decode_tokens
-        self.rem_total_tokens_ = self.rem_total_tokens
-        self.total_tokens = rem_total_tokens
         self.rem_input_tokens = rem_input_tokens - mixed_with_decode_tokens
         self.rem_chunk_tokens = rem_chunk_tokens
         if self.rem_chunk_tokens is not None:
             self.rem_chunk_tokens -= mixed_with_decode_tokens
 
+        self.cur_rem_tokens = rem_total_tokens - mixed_with_decode_tokens
+
         self.req_states = None
         self.can_run_list = []
         self.new_inflight_req = None
         self.log_hit_tokens = 0
         self.log_input_tokens = 0
 
+        if running_batch is not None:
+            # Pre-remove the tokens which will be occupied by the running requests
+            self.rem_total_tokens -= sum(
+                [
+                    min(
+                        (r.sampling_params.max_new_tokens - len(r.output_ids)),
+                        CLIP_MAX_NEW_TOKENS,
+                    )
+                    * self.new_token_ratio
+                    for r in running_batch.reqs
+                ]
+            )
+
     def no_remaining_tokens(self):
         return (
             self.rem_total_tokens <= 0
@@ -141,31 +154,14 @@ class PrefillAdder:
             if self.rem_chunk_tokens is not None
             else False
         )
-
-
-    def remove_running_tokens(self, running_batch: ScheduleBatch):
-        self.rem_total_tokens -= sum(
-            [
-                min(
-                    (r.sampling_params.max_new_tokens - len(r.output_ids)),
-                    CLIP_MAX_NEW_TOKENS,
-                )
-                * self.new_token_ratio
-                for r in running_batch.reqs
-            ]
-        )
-        self.rem_total_tokens_ -= sum(
-            [
-                r.sampling_params.max_new_tokens - len(r.output_ids)
-                for r in running_batch.reqs
-            ]
+            or self.cur_rem_tokens <= 0
         )
 
     def _prefill_one_req(
         self, prefix_len: int, extend_input_len: int, max_new_tokens: int
     ):
         self.rem_total_tokens -= extend_input_len + max_new_tokens
-        self.
+        self.cur_rem_tokens -= extend_input_len
         self.rem_input_tokens -= extend_input_len
         if self.rem_chunk_tokens is not None:
             self.rem_chunk_tokens -= extend_input_len
@@ -173,29 +169,7 @@ class PrefillAdder:
         self.log_hit_tokens += prefix_len
         self.log_input_tokens += extend_input_len
 
-    def add_inflight_req_ignore_eos(self, req: Req):
-        truncated = req.extend_input_len > self.rem_chunk_tokens
-        req.extend_input_len = min(req.extend_input_len, self.rem_chunk_tokens)
-        req.fill_ids = req.fill_ids[: len(req.prefix_indices) + req.extend_input_len]
-        self.can_run_list.append(req)
-
-        self._prefill_one_req(
-            0,
-            req.extend_input_len,
-            (
-                min(req.sampling_params.max_new_tokens, CLIP_MAX_NEW_TOKENS)
-                if not truncated
-                else 0
-            ),
-        )
-
-        # Return if chunked prefill not finished
-        return req if truncated else None
-
     def add_inflight_req(self, req: Req):
-        if req.sampling_params.ignore_eos:
-            return self.add_inflight_req_ignore_eos(req)
-
         truncated = req.extend_input_len > self.rem_chunk_tokens
         req.extend_input_len = min(req.extend_input_len, self.rem_chunk_tokens)
         req.fill_ids = req.fill_ids[: len(req.prefix_indices) + req.extend_input_len]
@@ -225,7 +199,7 @@ class PrefillAdder:
         self.rem_total_tokens += delta
 
     def add_one_req_ignore_eos(self, req: Req):
-        def
+        def add_req_state(r, insert_sort=False):
             new_token_ratio = (
                 1.0 if r.sampling_params.ignore_eos else self.new_token_ratio
             )
@@ -235,56 +209,38 @@ class PrefillAdder:
             tokens_occupied = len(r.origin_input_ids) + len(r.output_ids)
 
             if tokens_left > 0:
-
-
-
-
-
-        can_run = False
-        if (
-            req.extend_input_len + req.sampling_params.max_new_tokens
-            <= self.rem_total_tokens
-        ):
-            can_run = True
-
-        if not can_run:
-            if self.req_states is None:
-                self.req_states = []
-                if self.running_batch is not None:
-                    for r in self.running_batch.reqs:
-                        state = get_req_state(r)
-                        if state is not None:
-                            self.req_states.append(state)
-                for r in self.can_run_list:
-                    state = get_req_state(r)
-                    if state is not None:
-                        self.req_states.append(state)
-                state = get_req_state(req)
-                if state is not None:
-                    self.req_states.append(state)
-
-                self.req_states.sort(key=lambda x: x[0])
-            else:
-                state = get_req_state(req)
-                if state is not None:
-                    for i, (tokens_left, tokens_occupied) in enumerate(self.req_states):
-                        if tokens_left >= state[0]:
-                            self.req_states.insert(i, state)
+                if not insert_sort:
+                    self.req_states.append((tokens_left, tokens_occupied))
+                else:
+                    for i in range(len(self.req_states)):
+                        if tokens_left <= self.req_states[i][0]:
                             break
-
-
-
-
-
-
-
-
-
-            )
-
-
-
-
+                    self.req_states.insert(i, (tokens_left, tokens_occupied))
+
+        if self.req_states is None:
+            self.req_states = []
+            add_req_state(req)
+            if self.running_batch is not None:
+                for r in self.running_batch.reqs:
+                    add_req_state(r)
+            for r in self.can_run_list:
+                add_req_state(r)
+            self.req_states.sort(key=lambda x: x[0])
+        else:
+            add_req_state(req, insert_sort=True)
+
+        cur_rem_tokens = self.cur_rem_tokens - len(req.origin_input_ids)
+        tokens_freed = 0
+        for i, (tokens_left, tokens_occupied) in enumerate(self.req_states):
+            decode_steps = (
+                self.req_states[i + 1][0]
+                if i + 1 < len(self.req_states)
+                else tokens_left
+            )
+            bs = len(self.req_states) - i
+            if cur_rem_tokens + tokens_freed - decode_steps * bs <= 0:
+                return False
+            tokens_freed += tokens_occupied
 
         if req.extend_input_len <= self.rem_chunk_tokens:
             self.can_run_list.append(req)
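The rewritten `add_one_req_ignore_eos` above replaces the old `can_run` shortcut with a small simulation: each request is summarized as a `(tokens_left, tokens_occupied)` pair, the pairs are kept sorted by `tokens_left`, and the new request is admitted only if the remaining KV-cache budget stays positive while the batch drains, counting the tokens each finishing request frees. A self-contained sketch of that check, with made-up numbers:

```python
# Standalone sketch of the admission check added to PrefillAdder above.
# req_states holds (tokens_left, tokens_occupied) pairs sorted by tokens_left;
# cur_rem_tokens is the KV-cache budget left after the new prompt is counted.
from typing import List, Tuple


def fits_in_budget(req_states: List[Tuple[int, int]], cur_rem_tokens: int) -> bool:
    tokens_freed = 0
    for i, (tokens_left, tokens_occupied) in enumerate(req_states):
        # Decode until the next-shortest request finishes (or this one does).
        decode_steps = req_states[i + 1][0] if i + 1 < len(req_states) else tokens_left
        bs = len(req_states) - i  # requests still running during these steps
        if cur_rem_tokens + tokens_freed - decode_steps * bs <= 0:
            return False
        tokens_freed += tokens_occupied  # this request's KV cache is released
    return True


# Illustrative numbers only.
print(fits_in_budget([(8, 100), (32, 400), (64, 900)], cur_rem_tokens=300))  # True
print(fits_in_budget([(512, 4000)], cur_rem_tokens=100))                     # False
```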
{sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/managers/schedule_batch.py

@@ -40,7 +40,7 @@ global_server_args_dict = {
     "attention_backend": ServerArgs.attention_backend,
     "sampling_backend": ServerArgs.sampling_backend,
     "triton_attention_reduce_in_fp32": ServerArgs.triton_attention_reduce_in_fp32,
-    "
+    "disable_mla": ServerArgs.disable_mla,
     "torchao_config": ServerArgs.torchao_config,
 }
 
{sglang-0.3.1.post1 → sglang-0.3.1.post2}/sglang/srt/managers/tp_worker.py

@@ -445,9 +445,6 @@ class ModelTpServer:
             num_mixed_running,
         )
 
-        if self.running_batch is not None:
-            adder.remove_running_tokens(self.running_batch)
-
         has_inflight = self.current_inflight_req is not None
         if self.current_inflight_req is not None:
             self.current_inflight_req.init_next_round_input(
@@ -465,9 +462,6 @@ class ModelTpServer:
             )
 
         for req in self.waiting_queue:
-            if adder.no_remaining_tokens():
-                break
-            req.init_next_round_input(None if prefix_computed else self.tree_cache)
             if (
                 self.lora_paths is not None
                 and len(
@@ -478,6 +472,10 @@ class ModelTpServer:
                 > self.max_loras_per_batch
             ):
                 break
+
+            if adder.no_remaining_tokens():
+                break
+            req.init_next_round_input(None if prefix_computed else self.tree_cache)
             res = adder.add_one_req(req)
             if (
                 not res
@@ -507,6 +505,11 @@ class ModelTpServer:
         else:
            tree_cache_hit_rate = 0.0
 
+        num_used = self.max_total_num_tokens - (
+            self.token_to_kv_pool.available_size()
+            + self.tree_cache.evictable_size()
+        )
+
         if num_mixed_running > 0:
             logger.info(
                 f"Prefill batch"
@@ -515,6 +518,7 @@ class ModelTpServer:
                 f"#new-token: {adder.log_input_tokens}, "
                 f"#cached-token: {adder.log_hit_tokens}, "
                 f"cache hit rate: {100.0 * tree_cache_hit_rate:.2f}%, "
+                f"token usage: {num_used / self.max_total_num_tokens:.2f}, "
                 f"#queue-req: {len(self.waiting_queue) - len(can_run_list) + has_inflight}"
             )
         else:
@@ -524,6 +528,7 @@ class ModelTpServer:
                 f"#new-token: {adder.log_input_tokens}, "
                 f"#cached-token: {adder.log_hit_tokens}, "
                 f"cache hit rate: {100.0 * tree_cache_hit_rate:.2f}%, "
+                f"token usage: {num_used / self.max_total_num_tokens:.2f}, "
                 f"#running-req: {running_bs}, "
                 f"#queue-req: {len(self.waiting_queue) - len(can_run_list) + has_inflight}"
             )