sglang 0.3.2__tar.gz → 0.3.3.post1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sglang-0.3.2/sglang.egg-info → sglang-0.3.3.post1}/PKG-INFO +46 -21
- {sglang-0.3.2 → sglang-0.3.3.post1}/README.md +44 -20
- {sglang-0.3.2 → sglang-0.3.3.post1}/pyproject.toml +2 -2
- {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/__init__.py +2 -0
- {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/api.py +23 -1
- {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/bench_latency.py +48 -27
- {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/bench_serving.py +2 -2
- {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/lang/backend/runtime_endpoint.py +14 -1
- {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/lang/interpreter.py +16 -6
- {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/lang/ir.py +20 -4
- {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/configs/model_config.py +11 -9
- {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/constrained/fsm_cache.py +9 -1
- {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/constrained/jump_forward.py +15 -2
- {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/conversation.py +11 -2
- {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/layers/activation.py +4 -4
- sglang-0.3.3.post1/sglang/srt/layers/attention/__init__.py +49 -0
- sglang-0.3.3.post1/sglang/srt/layers/attention/flashinfer_backend.py +277 -0
- {sglang-0.3.2/sglang/srt/layers → sglang-0.3.3.post1/sglang/srt/layers/attention}/flashinfer_utils.py +82 -80
- sglang-0.3.3.post1/sglang/srt/layers/attention/triton_backend.py +161 -0
- {sglang-0.3.2/sglang/srt/layers/triton_attention → sglang-0.3.3.post1/sglang/srt/layers/attention/triton_ops}/extend_attention.py +3 -1
- {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/layers/layernorm.py +4 -4
- {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/layers/logits_processor.py +19 -15
- {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/layers/pooler.py +3 -3
- {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/layers/quantization/__init__.py +0 -2
- {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/layers/radix_attention.py +6 -4
- {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/layers/sampler.py +6 -4
- {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/layers/torchao_utils.py +18 -0
- {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/lora/lora.py +20 -21
- {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/lora/lora_manager.py +97 -25
- sglang-0.3.3.post1/sglang/srt/managers/data_parallel_controller.py +177 -0
- {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/managers/detokenizer_manager.py +31 -18
- sglang-0.3.3.post1/sglang/srt/managers/image_processor.py +187 -0
- {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/managers/io_struct.py +105 -76
- {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/managers/schedule_batch.py +190 -63
- sglang-0.3.2/sglang/srt/managers/policy_scheduler.py → sglang-0.3.3.post1/sglang/srt/managers/schedule_policy.py +31 -21
- sglang-0.3.2/sglang/srt/managers/tp_worker.py → sglang-0.3.3.post1/sglang/srt/managers/scheduler.py +420 -383
- {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/managers/tokenizer_manager.py +129 -248
- sglang-0.3.3.post1/sglang/srt/managers/tp_worker.py +128 -0
- {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/mem_cache/memory_pool.py +34 -52
- {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/model_executor/cuda_graph_runner.py +15 -19
- sglang-0.3.3.post1/sglang/srt/model_executor/forward_batch_info.py +173 -0
- {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/model_executor/model_runner.py +111 -105
- {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/models/baichuan.py +10 -10
- {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/models/chatglm.py +12 -12
- {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/models/commandr.py +10 -10
- {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/models/dbrx.py +12 -12
- {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/models/deepseek.py +10 -10
- {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/models/deepseek_v2.py +14 -15
- {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/models/exaone.py +10 -10
- {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/models/gemma.py +10 -10
- {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/models/gemma2.py +11 -11
- {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/models/gpt_bigcode.py +10 -10
- {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/models/grok.py +10 -10
- {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/models/internlm2.py +10 -10
- {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/models/llama.py +14 -10
- {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/models/llama_classification.py +5 -5
- {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/models/llama_embedding.py +4 -4
- sglang-0.3.3.post1/sglang/srt/models/llama_reward.py +142 -0
- {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/models/llava.py +39 -33
- {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/models/llavavid.py +31 -28
- {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/models/minicpm.py +10 -10
- {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/models/minicpm3.py +14 -15
- {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/models/mixtral.py +10 -10
- {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/models/mixtral_quant.py +10 -10
- {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/models/olmoe.py +10 -10
- {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/models/qwen.py +10 -10
- {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/models/qwen2.py +11 -11
- {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/models/qwen2_moe.py +10 -10
- {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/models/stablelm.py +10 -10
- sglang-0.3.3.post1/sglang/srt/models/torch_native_llama.py +506 -0
- {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/models/xverse.py +10 -10
- {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/models/xverse_moe.py +10 -10
- {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/openai_api/adapter.py +5 -3
- {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/sampling/sampling_batch_info.py +54 -33
- {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/sampling/sampling_params.py +3 -1
- {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/server.py +203 -117
- {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/server_args.py +59 -29
- {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/utils.py +127 -139
- {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/test/runners.py +71 -26
- {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/test/test_programs.py +38 -5
- {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/test/test_utils.py +18 -9
- sglang-0.3.3.post1/sglang/version.py +1 -0
- {sglang-0.3.2 → sglang-0.3.3.post1/sglang.egg-info}/PKG-INFO +46 -21
- {sglang-0.3.2 → sglang-0.3.3.post1}/sglang.egg-info/SOURCES.txt +13 -8
- {sglang-0.3.2 → sglang-0.3.3.post1}/sglang.egg-info/requires.txt +1 -0
- sglang-0.3.2/sglang/srt/layers/attention_backend.py +0 -474
- sglang-0.3.2/sglang/srt/managers/controller_multi.py +0 -207
- sglang-0.3.2/sglang/srt/managers/controller_single.py +0 -164
- sglang-0.3.2/sglang/srt/model_executor/forward_batch_info.py +0 -174
- sglang-0.3.2/sglang/version.py +0 -1
- {sglang-0.3.2 → sglang-0.3.3.post1}/LICENSE +0 -0
- {sglang-0.3.2 → sglang-0.3.3.post1}/setup.cfg +0 -0
- {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/bench_server_latency.py +0 -0
- {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/check_env.py +0 -0
- {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/global_config.py +0 -0
- {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/lang/__init__.py +0 -0
- {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/lang/backend/__init__.py +0 -0
- {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/lang/backend/anthropic.py +0 -0
- {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/lang/backend/base_backend.py +0 -0
- {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/lang/backend/litellm.py +0 -0
- {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/lang/backend/openai.py +0 -0
- {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/lang/backend/vertexai.py +0 -0
- {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/lang/chat_template.py +0 -0
- {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/lang/choices.py +0 -0
- {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/lang/compiler.py +0 -0
- {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/lang/tracer.py +0 -0
- {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/launch_server.py +0 -0
- {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/launch_server_llavavid.py +0 -0
- {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/configs/__init__.py +0 -0
- {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/configs/exaone.py +0 -0
- {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/constrained/__init__.py +0 -0
- {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/constrained/base_tool_cache.py +0 -0
- {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/hf_transformers_utils.py +0 -0
- {sglang-0.3.2/sglang/srt/layers/triton_attention → sglang-0.3.3.post1/sglang/srt/layers/attention/triton_ops}/decode_attention.py +0 -0
- {sglang-0.3.2/sglang/srt/layers/triton_attention → sglang-0.3.3.post1/sglang/srt/layers/attention/triton_ops}/prefill_attention.py +0 -0
- {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/layers/fused_moe/__init__.py +0 -0
- {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/layers/fused_moe/fused_moe.py +0 -0
- {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/layers/fused_moe/layer.py +0 -0
- {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/layers/fused_moe/patch.py +0 -0
- {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/layers/linear.py +0 -0
- {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/layers/quantization/base_config.py +0 -0
- {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/lora/lora_config.py +0 -0
- {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/mem_cache/base_prefix_cache.py +0 -0
- {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/mem_cache/chunk_cache.py +0 -0
- {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/mem_cache/flush_cache.py +0 -0
- {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/mem_cache/radix_cache.py +0 -0
- {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/mm_utils.py +0 -0
- {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/models/mistral.py +0 -0
- {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/models/yivl.py +0 -0
- {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/openai_api/protocol.py +0 -0
- {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/sampling/penaltylib/__init__.py +0 -0
- {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/sampling/penaltylib/orchestrator.py +0 -0
- {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +0 -0
- {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +0 -0
- {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +0 -0
- {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +0 -0
- {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/test/few_shot_gsm8k.py +0 -0
- {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/test/run_eval.py +0 -0
- {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/test/simple_eval_common.py +0 -0
- {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/test/simple_eval_gpqa.py +0 -0
- {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/test/simple_eval_humaneval.py +0 -0
- {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/test/simple_eval_math.py +0 -0
- {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/test/simple_eval_mgsm.py +0 -0
- {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/test/simple_eval_mmlu.py +0 -0
- {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/test/srt/sampling/penaltylib/utils.py +0 -0
- {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/test/test_activation.py +0 -0
- {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/test/test_layernorm.py +0 -0
- {sglang-0.3.2 → sglang-0.3.3.post1}/sglang/utils.py +0 -0
- {sglang-0.3.2 → sglang-0.3.3.post1}/sglang.egg-info/dependency_links.txt +0 -0
- {sglang-0.3.2 → sglang-0.3.3.post1}/sglang.egg-info/top_level.txt +0 -0
--- sglang-0.3.2/sglang.egg-info/PKG-INFO
+++ sglang-0.3.3.post1/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.3.2
+Version: 0.3.3.post1
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
 Version 2.0, January 2004
@@ -233,6 +233,7 @@ Requires-Dist: uvloop; extra == "srt"
 Requires-Dist: zmq; extra == "srt"
 Requires-Dist: vllm==0.5.5; extra == "srt"
 Requires-Dist: outlines>=0.0.44; extra == "srt"
+Requires-Dist: modelscope; extra == "srt"
 Provides-Extra: openai
 Requires-Dist: openai>=1.0; extra == "openai"
 Requires-Dist: tiktoken; extra == "openai"
@@ -256,8 +257,8 @@ Provides-Extra: dev
 Requires-Dist: sglang[all]; extra == "dev"
 Requires-Dist: sglang[test]; extra == "dev"

-<div align="center">
-<img src="https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png" alt="logo" width="400"></img>
+<div align="center" id="sglangtop">
+<img src="https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png" alt="logo" width="400" margin="10px"></img>

 [](https://pypi.org/project/sglang)

@@ -269,16 +270,10 @@ Requires-Dist: sglang[test]; extra == "dev"

 --------------------------------------------------------------------------------

-| [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Paper**](https://arxiv.org/abs/2312.07104) | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2ngly9muu-t37XiH87qvD~6rVBTkTEHw) | [**Join Weekly Development Meeting**](https://calendar.app.google/
+| [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Paper**](https://arxiv.org/abs/2312.07104) | [**Slides**](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_dev_day_v2.pptx) | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2ngly9muu-t37XiH87qvD~6rVBTkTEHw) | [**Join Bi-Weekly Development Meeting (Oct. 19)**](https://calendar.app.google/GYW7S8QGoanCuaxW6) |

-
-
-The core features include:
-
-- **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, continuous batching, token attention (paged attention), tensor parallelism, FlashInfer kernels, chunked prefill, and quantization (INT4/FP8/AWQ/GPTQ).
-- **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
-- **Extensive Model Support**: Supports a wide range of generative models (Llama 3, Gemma 2, Mistral, QWen, DeepSeek, LLaVA, etc.) and embedding models (e5-mistral), with easy extensibility for integrating new models.
-- **Active Community**: SGLang is open-source and backed by an active community with industry adoption.
+## Upcoming Events
+- [Oct. 16, 2024] Online meetup for efficient LLM deployment and serving, co-hosted by SGLang, FlashInfer, and MLC LLM! Fill out the [Google form](https://forms.gle/B3YeedLxmrrhL1NM8) to receive the invite link.

 ## News
 - [2024/09] 🔥 SGLang v0.3 Release: 7x Faster DeepSeek MLA, 1.5x Faster torch.compile, Multi-Image/Video LLaVA-OneVision ([blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)).
@@ -294,6 +289,16 @@ The core features include:

 </details>

+## About
+SGLang is a fast serving framework for large language models and vision language models.
+It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.
+The core features include:
+
+- **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, continuous batching, token attention (paged attention), tensor parallelism, FlashInfer kernels, chunked prefill, and quantization (INT4/FP8/AWQ/GPTQ).
+- **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
+- **Extensive Model Support**: Supports a wide range of generative models (Llama, Gemma, Mistral, QWen, DeepSeek, LLaVA, etc.) and embedding models (e5-mistral), with easy extensibility for integrating new models.
+- **Active Community**: SGLang is open-source and backed by an active community with industry adoption.
+
 ## Contents
 - [Install](#install)
 - [Backend: SGLang Runtime (SRT)](#backend-sglang-runtime-srt)
@@ -318,7 +323,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
 ### Method 2: From source
 ```
 # Use the last release branch
-git clone -b v0.3.2 https://github.com/sgl-project/sglang.git
+git clone -b v0.3.3.post1 https://github.com/sgl-project/sglang.git
 cd sglang

 pip install --upgrade pip
@@ -339,7 +344,7 @@ docker run --gpus all \
 --env "HF_TOKEN=<secret>" \
 --ipc=host \
 lmsysorg/sglang:latest \
-python3 -m sglang.launch_server --model-path meta-llama/
+python3 -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --host 0.0.0.0 --port 30000
 ```

 ### Method 4: Using docker compose
@@ -379,7 +384,7 @@ resources:
 run: |
 conda deactivate
 python3 -m sglang.launch_server \
---model-path meta-llama/
+--model-path meta-llama/Llama-3.1-8B-Instruct \
 --host 0.0.0.0 \
 --port 30000
 ```
@@ -421,7 +426,8 @@ curl http://localhost:30000/generate \
 }
 }'
 ```
-
+
+Learn more about the argument specification, streaming, and multi-modal support [here](docs/en/sampling_params.md).

 ### OpenAI Compatible API
 In addition, the server supports OpenAI-compatible APIs.
@@ -460,7 +466,7 @@ response = client.embeddings.create(
 print(response)
 ```

-It supports streaming, vision, and
+It supports streaming, vision, and almost all features of the Chat/Completions/Models/Batch endpoints specified by the [OpenAI API Reference](https://platform.openai.com/docs/api-reference/).

 ### Additional Server Arguments
 - To enable multi-GPU tensor parallelism, add `--tp 2`. If it reports the error "peer access is not supported between these two devices", add `--enable-p2p-check` to the server launch command.
@@ -481,10 +487,11 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --chunked-prefill-size 4096
 ```
 - To enable torch.compile acceleration, add `--enable-torch-compile`. It accelerates small models on small batch sizes.
+- To enable torchao quantization, add `--torchao-config int4wo-128`. It supports various quantization strategies.
 - To enable fp8 weight quantization, add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
 - To enable fp8 kv cache quantization, add `--kv-cache-dtype fp8_e5m2`.
 - If the model does not have a chat template in the Hugging Face tokenizer, you can specify a [custom chat template](docs/en/custom_chat_template.md).
-- To run tensor parallelism on multiple nodes, add `--nnodes 2`. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-0` be the hostname of the first node and `50000` be an available port.
+- To run tensor parallelism on multiple nodes, add `--nnodes 2`. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-0` be the hostname of the first node and `50000` be an available port, you can use the following commands. If you meet deadlock, please try to add `--disable-cuda-graph`
 ```
 # Node 0
 python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --tp 4 --nccl-init sgl-dev-0:50000 --nnodes 2 --node-rank 0
@@ -499,9 +506,9 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 - Llama / Llama 2 / Llama 3 / Llama 3.1
 - Mistral / Mixtral / Mistral NeMo
 - Gemma / Gemma 2
-- OLMoE
 - Qwen / Qwen 2 / Qwen 2 MoE
 - DeepSeek / DeepSeek 2
+- OLMoE
 - [LLaVA-OneVision](https://llava-vl.github.io/blog/2024-08-05-llava-onevision/)
 - `python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-7b-ov --port=30000 --chat-template=chatml-llava`
 - `python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-72b-ov --port=30000 --tp-size=8 --chat-template=chatml-llava`
@@ -523,7 +530,6 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 - XVERSE / XVERSE MoE
 - SmolLM

-
 **Embedding Models**

 - e5-mistral
@@ -544,6 +550,17 @@ Launch [Qwen2-7B-Instruct](https://www.modelscope.cn/models/qwen/qwen2-7b-instru
 ```
 SGLANG_USE_MODELSCOPE=true python -m sglang.launch_server --model-path qwen/Qwen2-7B-Instruct --port 30000
 ```
+
+Or start it by docker.
+```bash
+docker run --gpus all \
+-p 30000:30000 \
+-v ~/.cache/modelscope:/root/.cache/modelscope \
+--env "SGLANG_USE_MODELSCOPE=true" \
+--ipc=host \
+lmsysorg/sglang:latest \
+python3 -m sglang.launch_server --model-path Qwen/Qwen2.5-7B-Instruct --host 0.0.0.0 --port 30000
+```

 </details>

@@ -582,7 +599,7 @@ GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/
 The frontend language can be used with local models or API models. It is an alternative to the OpenAI API. You may found it easier to use for complex prompting workflow.

 ### Quick Start
-The example below shows how to use sglang to answer a
+The example below shows how to use sglang to answer a multi-turn question.

 #### Using Local Models
 First, launch a server with
@@ -830,3 +847,11 @@ Learn more at this [blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/).
 ## Citation And Acknowledgment
 Please cite our paper, [SGLang: Efficient Execution of Structured Language Model Programs](https://arxiv.org/abs/2312.07104), if you find the project useful.
 We also learned from the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).
+
+
+
+<p align="center">
+<a href="#sglangtop" target="_blank">
+<bold>Back To Top </bold>
+</a>
+</p>
--- sglang-0.3.2/README.md
+++ sglang-0.3.3.post1/README.md

@@ -1,5 +1,5 @@
-<div align="center">
-<img src="https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png" alt="logo" width="400"></img>
+<div align="center" id="sglangtop">
+<img src="https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png" alt="logo" width="400" margin="10px"></img>

 [](https://pypi.org/project/sglang)

@@ -11,16 +11,10 @@

 --------------------------------------------------------------------------------

-| [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Paper**](https://arxiv.org/abs/2312.07104) | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2ngly9muu-t37XiH87qvD~6rVBTkTEHw) | [**Join Weekly Development Meeting**](https://calendar.app.google/
+| [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Paper**](https://arxiv.org/abs/2312.07104) | [**Slides**](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_dev_day_v2.pptx) | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2ngly9muu-t37XiH87qvD~6rVBTkTEHw) | [**Join Bi-Weekly Development Meeting (Oct. 19)**](https://calendar.app.google/GYW7S8QGoanCuaxW6) |

-
-
-The core features include:
-
-- **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, continuous batching, token attention (paged attention), tensor parallelism, FlashInfer kernels, chunked prefill, and quantization (INT4/FP8/AWQ/GPTQ).
-- **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
-- **Extensive Model Support**: Supports a wide range of generative models (Llama 3, Gemma 2, Mistral, QWen, DeepSeek, LLaVA, etc.) and embedding models (e5-mistral), with easy extensibility for integrating new models.
-- **Active Community**: SGLang is open-source and backed by an active community with industry adoption.
+## Upcoming Events
+- [Oct. 16, 2024] Online meetup for efficient LLM deployment and serving, co-hosted by SGLang, FlashInfer, and MLC LLM! Fill out the [Google form](https://forms.gle/B3YeedLxmrrhL1NM8) to receive the invite link.

 ## News
 - [2024/09] 🔥 SGLang v0.3 Release: 7x Faster DeepSeek MLA, 1.5x Faster torch.compile, Multi-Image/Video LLaVA-OneVision ([blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)).
@@ -36,6 +30,16 @@ The core features include:

 </details>

+## About
+SGLang is a fast serving framework for large language models and vision language models.
+It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.
+The core features include:
+
+- **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, continuous batching, token attention (paged attention), tensor parallelism, FlashInfer kernels, chunked prefill, and quantization (INT4/FP8/AWQ/GPTQ).
+- **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
+- **Extensive Model Support**: Supports a wide range of generative models (Llama, Gemma, Mistral, QWen, DeepSeek, LLaVA, etc.) and embedding models (e5-mistral), with easy extensibility for integrating new models.
+- **Active Community**: SGLang is open-source and backed by an active community with industry adoption.
+
 ## Contents
 - [Install](#install)
 - [Backend: SGLang Runtime (SRT)](#backend-sglang-runtime-srt)
@@ -60,7 +64,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
 ### Method 2: From source
 ```
 # Use the last release branch
-git clone -b v0.3.2 https://github.com/sgl-project/sglang.git
+git clone -b v0.3.3.post1 https://github.com/sgl-project/sglang.git
 cd sglang

 pip install --upgrade pip
@@ -81,7 +85,7 @@ docker run --gpus all \
 --env "HF_TOKEN=<secret>" \
 --ipc=host \
 lmsysorg/sglang:latest \
-python3 -m sglang.launch_server --model-path meta-llama/
+python3 -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --host 0.0.0.0 --port 30000
 ```

 ### Method 4: Using docker compose
@@ -121,7 +125,7 @@ resources:
 run: |
 conda deactivate
 python3 -m sglang.launch_server \
---model-path meta-llama/
+--model-path meta-llama/Llama-3.1-8B-Instruct \
 --host 0.0.0.0 \
 --port 30000
 ```
@@ -163,7 +167,8 @@ curl http://localhost:30000/generate \
 }
 }'
 ```
-
+
+Learn more about the argument specification, streaming, and multi-modal support [here](docs/en/sampling_params.md).

 ### OpenAI Compatible API
 In addition, the server supports OpenAI-compatible APIs.
@@ -202,7 +207,7 @@ response = client.embeddings.create(
 print(response)
 ```

-It supports streaming, vision, and
+It supports streaming, vision, and almost all features of the Chat/Completions/Models/Batch endpoints specified by the [OpenAI API Reference](https://platform.openai.com/docs/api-reference/).

 ### Additional Server Arguments
 - To enable multi-GPU tensor parallelism, add `--tp 2`. If it reports the error "peer access is not supported between these two devices", add `--enable-p2p-check` to the server launch command.
@@ -223,10 +228,11 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --chunked-prefill-size 4096
 ```
 - To enable torch.compile acceleration, add `--enable-torch-compile`. It accelerates small models on small batch sizes.
+- To enable torchao quantization, add `--torchao-config int4wo-128`. It supports various quantization strategies.
 - To enable fp8 weight quantization, add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
 - To enable fp8 kv cache quantization, add `--kv-cache-dtype fp8_e5m2`.
 - If the model does not have a chat template in the Hugging Face tokenizer, you can specify a [custom chat template](docs/en/custom_chat_template.md).
-- To run tensor parallelism on multiple nodes, add `--nnodes 2`. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-0` be the hostname of the first node and `50000` be an available port.
+- To run tensor parallelism on multiple nodes, add `--nnodes 2`. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-0` be the hostname of the first node and `50000` be an available port, you can use the following commands. If you meet deadlock, please try to add `--disable-cuda-graph`
 ```
 # Node 0
 python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --tp 4 --nccl-init sgl-dev-0:50000 --nnodes 2 --node-rank 0
@@ -241,9 +247,9 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 - Llama / Llama 2 / Llama 3 / Llama 3.1
 - Mistral / Mixtral / Mistral NeMo
 - Gemma / Gemma 2
-- OLMoE
 - Qwen / Qwen 2 / Qwen 2 MoE
 - DeepSeek / DeepSeek 2
+- OLMoE
 - [LLaVA-OneVision](https://llava-vl.github.io/blog/2024-08-05-llava-onevision/)
 - `python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-7b-ov --port=30000 --chat-template=chatml-llava`
 - `python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-72b-ov --port=30000 --tp-size=8 --chat-template=chatml-llava`
@@ -265,7 +271,6 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 - XVERSE / XVERSE MoE
 - SmolLM

-
 **Embedding Models**

 - e5-mistral
@@ -286,6 +291,17 @@ Launch [Qwen2-7B-Instruct](https://www.modelscope.cn/models/qwen/qwen2-7b-instru
 ```
 SGLANG_USE_MODELSCOPE=true python -m sglang.launch_server --model-path qwen/Qwen2-7B-Instruct --port 30000
 ```
+
+Or start it by docker.
+```bash
+docker run --gpus all \
+-p 30000:30000 \
+-v ~/.cache/modelscope:/root/.cache/modelscope \
+--env "SGLANG_USE_MODELSCOPE=true" \
+--ipc=host \
+lmsysorg/sglang:latest \
+python3 -m sglang.launch_server --model-path Qwen/Qwen2.5-7B-Instruct --host 0.0.0.0 --port 30000
+```

 </details>

@@ -324,7 +340,7 @@ GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/
 The frontend language can be used with local models or API models. It is an alternative to the OpenAI API. You may found it easier to use for complex prompting workflow.

 ### Quick Start
-The example below shows how to use sglang to answer a
+The example below shows how to use sglang to answer a multi-turn question.

 #### Using Local Models
 First, launch a server with
@@ -572,3 +588,11 @@ Learn more at this [blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/).
 ## Citation And Acknowledgment
 Please cite our paper, [SGLang: Efficient Execution of Structured Language Model Programs](https://arxiv.org/abs/2312.07104), if you find the project useful.
 We also learned from the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).
+
+
+
+<p align="center">
+<a href="#sglangtop" target="_blank">
+<bold>Back To Top </bold>
+</a>
+</p>
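The README hunks above extend the claim about the OpenAI-compatible API (Chat/Completions/Models/Batch endpoints). For orientation, a minimal client call against a locally launched server could look like the sketch below; the port, the `api_key` placeholder, and the `model="default"` alias are illustrative assumptions, not part of this diff.

```python
# Minimal sketch: query the OpenAI-compatible endpoint of a local SGLang server.
# Assumptions: the server listens on port 30000 and accepts the "default" model alias.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:30000/v1", api_key="EMPTY")

response = client.chat.completions.create(
    model="default",
    messages=[{"role": "user", "content": "Name three LLM serving optimizations."}],
    temperature=0.2,
    max_tokens=64,
)
print(response.choices[0].message.content)
```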
--- sglang-0.3.2/pyproject.toml
+++ sglang-0.3.3.post1/pyproject.toml

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

 [project]
 name = "sglang"
-version = "0.3.2"
+version = "0.3.3.post1"
 description = "SGLang is yet another fast serving framework for large language models and vision language models."
 readme = "README.md"
 requires-python = ">=3.8"
@@ -23,7 +23,7 @@ dependencies = [
 srt = ["aiohttp", "decord", "fastapi", "hf_transfer", "huggingface_hub", "interegular",
 "packaging", "pillow", "psutil", "pydantic", "python-multipart",
 "torch", "torchao", "uvicorn", "uvloop", "zmq",
-"vllm==0.5.5", "outlines>=0.0.44"]
+"vllm==0.5.5", "outlines>=0.0.44", "modelscope"]
 openai = ["openai>=1.0", "tiktoken"]
 anthropic = ["anthropic>=0.20.0"]
 litellm = ["litellm>=1.0.0"]
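The `srt` extra now pulls in `modelscope`, matching the `SGLANG_USE_MODELSCOPE` workflow shown in the README hunks above. Below is a hedged sketch of the same workflow driven from Python instead of the shell; it assumes `Runtime` forwards `model_path` (and other keyword arguments) to the usual server arguments, and that the environment variable must be set before the server process starts.

```python
import os

# Assumption: SGLANG_USE_MODELSCOPE has to be set before the runtime launches,
# mirroring the README's `SGLANG_USE_MODELSCOPE=true python -m sglang.launch_server ...` example.
os.environ["SGLANG_USE_MODELSCOPE"] = "true"

import sglang as sgl

# "qwen/Qwen2-7B-Instruct" is the ModelScope model id used in the README example.
runtime = sgl.Runtime(model_path="qwen/Qwen2-7B-Instruct")
sgl.set_default_backend(runtime)
# ... run sglang programs against this backend ...
runtime.shutdown()
```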
--- sglang-0.3.2/sglang/__init__.py
+++ sglang-0.3.3.post1/sglang/__init__.py

@@ -1,6 +1,7 @@
 # SGL API Components

 from sglang.api import (
+    Engine,
     Runtime,
     assistant,
     assistant_begin,
@@ -31,6 +32,7 @@ from sglang.lang.choices import (
 # SGLang DSL APIs
 __all__ = [
     "Runtime",
+    "Engine",
     "assistant",
     "assistant_begin",
     "assistant_end",
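`Engine` joins `Runtime` as a top-level export (it wraps `sglang.srt.server.Engine`, as the `sglang/api.py` hunks below show). A minimal offline-generation sketch follows, under the assumption that `Engine` accepts `model_path` the same way `Runtime` does and exposes a `generate(prompt, sampling_params)` method; check `sglang/srt/server.py` in 0.3.3.post1 for the exact signature.

```python
# Sketch only: server-less generation through the newly exported sglang.Engine.
# Assumptions: Engine(model_path=...) mirrors Runtime, generate() takes a prompt
# string plus a sampling-parameter dict, and shutdown() releases the GPU workers.
import sglang as sgl

engine = sgl.Engine(model_path="meta-llama/Llama-3.1-8B-Instruct")
output = engine.generate(
    "The capital of France is",
    {"temperature": 0.0, "max_new_tokens": 16},
)
print(output)
engine.shutdown()
```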
--- sglang-0.3.2/sglang/api.py
+++ sglang-0.3.3.post1/sglang/api.py

@@ -33,13 +33,23 @@ def function(


 def Runtime(*args, **kwargs):
-    # Avoid importing unnecessary dependency
     os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
+
+    # Avoid importing unnecessary dependency
     from sglang.srt.server import Runtime

     return Runtime(*args, **kwargs)


+def Engine(*args, **kwargs):
+    os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
+
+    # Avoid importing unnecessary dependency
+    from sglang.srt.server import Engine
+
+    return Engine(*args, **kwargs)
+
+
 def set_default_backend(backend: BaseBackend):
     global_config.default_backend = backend

@@ -48,6 +58,10 @@ def flush_cache(backend: Optional[BaseBackend] = None):
     backend = backend or global_config.default_backend
     if backend is None:
         return False
+
+    # If backend is Runtime
+    if hasattr(backend, "endpoint"):
+        backend = backend.endpoint
     return backend.flush_cache()


@@ -55,12 +69,17 @@ def get_server_args(backend: Optional[BaseBackend] = None):
     backend = backend or global_config.default_backend
     if backend is None:
         return None
+
+    # If backend is Runtime
+    if hasattr(backend, "endpoint"):
+        backend = backend.endpoint
     return backend.get_server_args()


 def gen(
     name: Optional[str] = None,
     max_tokens: Optional[int] = None,
+    min_tokens: Optional[int] = None,
     stop: Optional[Union[str, List[str]]] = None,
     stop_token_ids: Optional[List[int]] = None,
     temperature: Optional[float] = None,
@@ -100,6 +119,7 @@ def gen(
     return SglGen(
         name,
         max_tokens,
+        min_tokens,
         stop,
         stop_token_ids,
         temperature,
@@ -139,6 +159,7 @@ def gen_int(
     return SglGen(
         name,
         max_tokens,
+        None,
         stop,
         stop_token_ids,
         temperature,
@@ -177,6 +198,7 @@ def gen_string(
     return SglGen(
         name,
         max_tokens,
+        None,
         stop,
         stop_token_ids,
         temperature,
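Besides the `Engine` wrapper and the Runtime-aware `flush_cache`/`get_server_args` helpers, the hunks above thread a new `min_tokens` argument from `sgl.gen` into `SglGen` (it stays `None` for `gen_int` and `gen_string`). A short frontend-language sketch using it; the endpoint URL is a placeholder for a locally launched server.

```python
# Sketch: enforce a minimum generation length with the new min_tokens argument.
import sglang as sgl

sgl.set_default_backend(sgl.RuntimeEndpoint("http://localhost:30000"))

@sgl.function
def explain(s, topic):
    s += "Explain " + topic + " in one short paragraph.\n"
    # min_tokens keeps the answer from stopping too early; max_tokens still caps it.
    s += sgl.gen("answer", min_tokens=32, max_tokens=128, temperature=0.3)

state = explain.run(topic="RadixAttention")
print(state["answer"])
```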
--- sglang-0.3.2/sglang/bench_latency.py
+++ sglang-0.3.3.post1/sglang/bench_latency.py

@@ -47,6 +47,7 @@ I'm going to the park
 import argparse
 import dataclasses
 import itertools
+import json
 import logging
 import multiprocessing
 import os
@@ -62,10 +63,11 @@ import torch.distributed as dist
 from sglang.srt.configs.model_config import ModelConfig
 from sglang.srt.hf_transformers_utils import get_tokenizer
 from sglang.srt.managers.schedule_batch import Req, ScheduleBatch
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
 from sglang.srt.model_executor.model_runner import ModelRunner
 from sglang.srt.sampling.sampling_params import SamplingParams
 from sglang.srt.server import _set_envs_and_config
-from sglang.srt.server_args import ServerArgs
+from sglang.srt.server_args import PortArgs, ServerArgs
 from sglang.srt.utils import (
     configure_logger,
     kill_child_process,
@@ -121,7 +123,7 @@ class BenchArgs:
         )


-def load_model(server_args, tp_rank):
+def load_model(server_args, port_args, tp_rank):
     suppress_other_loggers()
     rank_print = print if tp_rank == 0 else lambda *args, **kwargs: None

@@ -129,6 +131,7 @@ def load_model(server_args, tp_rank):
         server_args.model_path,
         server_args.trust_remote_code,
         context_length=server_args.context_length,
+        model_override_args=json.loads(server_args.json_model_override_args),
     )
     model_runner = ModelRunner(
         model_config=model_config,
@@ -136,7 +139,7 @@ def load_model(server_args, tp_rank):
         gpu_id=tp_rank,
         tp_rank=tp_rank,
         tp_size=server_args.tp_size,
-        nccl_port=
+        nccl_port=port_args.nccl_port,
         server_args=server_args,
     )
     rank_print(f"max_total_num_tokens={model_runner.max_total_num_tokens}")
@@ -167,9 +170,13 @@ def prepare_inputs_for_correctness_test(bench_args, tokenizer):
         assert len(input_ids[i]) > bench_args.cut_len

         tmp_input_ids = input_ids[i][: bench_args.cut_len]
-        req = Req(
+        req = Req(
+            rid=i,
+            origin_input_text=prompts[i],
+            origin_input_ids=tmp_input_ids,
+            sampling_params=sampling_params,
+        )
         req.prefix_indices = []
-        req.sampling_params = sampling_params
         req.fill_ids = req.origin_input_ids
         req.extend_input_len = len(req.fill_ids) - len(req.prefix_indices)
         reqs.append(req)
@@ -199,9 +206,13 @@ def prepare_synthetic_inputs_for_latency_test(batch_size, input_len):

     reqs = []
     for i in range(len(input_ids)):
-        req = Req(
+        req = Req(
+            rid=i,
+            origin_input_text="",
+            origin_input_ids=list(input_ids[i]),
+            sampling_params=sampling_params,
+        )
         req.prefix_indices = []
-        req.sampling_params = sampling_params
         req.fill_ids = req.origin_input_ids
         req.extend_input_len = len(req.fill_ids) - len(req.prefix_indices)
         reqs.append(req)
@@ -209,6 +220,7 @@ def prepare_synthetic_inputs_for_latency_test(batch_size, input_len):
     return reqs


+@torch.inference_mode()
 def extend(reqs, model_runner):
     batch = ScheduleBatch.init_new(
         reqs=reqs,
@@ -217,28 +229,33 @@ def extend(reqs, model_runner):
         tree_cache=None,
     )
     batch.prepare_for_extend(model_runner.model_config.vocab_size)
-
-
+    model_worker_batch = batch.get_model_worker_batch()
+    forward_batch = ForwardBatch.init_new(model_worker_batch, model_runner)
+    logits_output = model_runner.forward(forward_batch)
+    next_token_ids = model_runner.sample(logits_output, forward_batch).tolist()
     return next_token_ids, logits_output.next_token_logits, batch


+@torch.inference_mode()
 def decode(input_token_ids, batch, model_runner):
     batch.prepare_for_decode(input_token_ids)
-
-
+    model_worker_batch = batch.get_model_worker_batch()
+    forward_batch = ForwardBatch.init_new(model_worker_batch, model_runner)
+    logits_output = model_runner.forward(forward_batch)
+    next_token_ids = model_runner.sample(logits_output, forward_batch).tolist()
     return next_token_ids, logits_output.next_token_logits


-@torch.inference_mode()
 def correctness_test(
     server_args,
+    port_args,
     bench_args,
     tp_rank,
 ):
     rank_print = print if tp_rank == 0 else lambda *args, **kwargs: None

     # Load the model
-    model_runner, tokenizer = load_model(server_args, tp_rank)
+    model_runner, tokenizer = load_model(server_args, port_args, tp_rank)

     # Prepare inputs
     input_ids, reqs = prepare_inputs_for_correctness_test(bench_args, tokenizer)
@@ -271,7 +288,6 @@ def correctness_test(
         rank_print(tokenizer.decode(output_ids[i]), "\n")


-@torch.inference_mode()
 def latency_test_run_once(
     run_name, model_runner, rank_print, reqs, batch_size, input_len, output_len
 ):
@@ -324,13 +340,16 @@ def latency_test_run_once(
         rank_print(
             f"Decode. latency: {latency:6.5f} s, throughput: {throughput:9.2f} token/s"
         )
-
-
-
-
-
-
-
+
+    # record decode timing from 2nd output
+    if output_len > 1:
+        med_decode_latency = np.median(decode_latencies)
+        med_decode_throughput = batch_size / med_decode_latency
+        rank_print(
+            f"Decode. median latency: {med_decode_latency:6.5f} s, median throughput: {med_decode_throughput:9.2f} token/s"
+        )
+        measurement_results["median_decode_latency"] = med_decode_latency
+        measurement_results["median_decode_throughput"] = med_decode_throughput

     throughput = (input_len + output_len) * batch_size / tot_latency
     rank_print(
@@ -343,15 +362,15 @@ def latency_test_run_once(

 def latency_test(
     server_args,
+    port_args,
     bench_args,
     tp_rank,
 ):
     configure_logger(server_args, prefix=f" TP{tp_rank}")
-    _set_envs_and_config(server_args)
     rank_print = print if tp_rank == 0 else lambda *args, **kwargs: None

     # Load the model
-    model_runner, tokenizer = load_model(server_args, tp_rank)
+    model_runner, tokenizer = load_model(server_args, port_args, tp_rank)

     # Prepare inputs for warm up
     reqs = prepare_synthetic_inputs_for_latency_test(
@@ -367,7 +386,7 @@ def latency_test(
         reqs,
         bench_args.batch_size[0],
         bench_args.input_len[0],
-
+        8,  # shorter decoding to speed up the warmup
     )
     rank_print("Benchmark ...")

@@ -453,6 +472,7 @@ def plot_latency_test(


 def main(server_args, bench_args):
+    _set_envs_and_config(server_args)

     if server_args.model_path:
         if bench_args.correctness_test:
@@ -468,8 +488,10 @@ def main(server_args, bench_args):
             "provide --result-filename for plotting the results"
         )

+    port_args = PortArgs.init_new(server_args)
+
     if server_args.tp_size == 1:
-        work_func(server_args, bench_args, 0)
+        work_func(server_args, port_args, bench_args, 0)
     else:
         workers = []
         for tp_rank in range(server_args.tp_size):
@@ -477,6 +499,7 @@ def main(server_args, bench_args):
                 target=work_func,
                 args=(
                     server_args,
+                    port_args,
                     bench_args,
                     tp_rank,
                 ),
@@ -503,8 +526,6 @@ if __name__ == "__main__":
         format="%(message)s",
     )

-    multiprocessing.set_start_method("spawn", force=True)
-
     try:
         main(server_args, bench_args)
     except Exception as e:
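In the refactored benchmark, ports are created once in `main()` via `PortArgs.init_new`, `load_model` receives them explicitly, and each prefill/decode step converts the `ScheduleBatch` into a model-worker batch that is wrapped in a `ForwardBatch` before `forward`/`sample`. Condensed into one sequence, the new call path looks roughly like the sketch below; it only uses names visible in the hunks above, the argument values are placeholders, and it naturally requires a GPU plus the model weights to actually run.

```python
# Rough call sequence distilled from the bench_latency.py hunks above (sketch, not a test).
from sglang.bench_latency import (
    decode,
    extend,
    load_model,
    prepare_synthetic_inputs_for_latency_test,
)
from sglang.srt.server_args import PortArgs, ServerArgs

server_args = ServerArgs(model_path="meta-llama/Llama-3.1-8B-Instruct")  # placeholder model
port_args = PortArgs.init_new(server_args)            # ports are now built once, up front
model_runner, tokenizer = load_model(server_args, port_args, tp_rank=0)

reqs = prepare_synthetic_inputs_for_latency_test(batch_size=1, input_len=128)
next_token_ids, logits, batch = extend(reqs, model_runner)              # prefill via ForwardBatch
next_token_ids, logits = decode(next_token_ids, batch, model_runner)    # one decode step
```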