sglang 0.3.3.post1__tar.gz → 0.3.4.post1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sglang-0.3.3.post1/sglang.egg-info → sglang-0.3.4.post1}/PKG-INFO +75 -32
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/README.md +43 -13
- sglang-0.3.4.post1/pyproject.toml +68 -0
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/bench_latency.py +30 -11
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/bench_server_latency.py +21 -10
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/bench_serving.py +101 -7
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/global_config.py +0 -1
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/lang/chat_template.py +17 -0
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/launch_server_llavavid.py +1 -1
- sglang-0.3.4.post1/sglang/srt/configs/__init__.py +8 -0
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/configs/model_config.py +2 -0
- sglang-0.3.4.post1/sglang/srt/configs/qwen2vl.py +133 -0
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/conversation.py +27 -0
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/hf_transformers_utils.py +2 -1
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/layers/attention/__init__.py +38 -5
- sglang-0.3.4.post1/sglang/srt/layers/attention/double_sparsity_backend.py +297 -0
- sglang-0.3.4.post1/sglang/srt/layers/attention/flashinfer_backend.py +666 -0
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/layers/attention/triton_backend.py +26 -8
- sglang-0.3.4.post1/sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +772 -0
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/layers/attention/triton_ops/extend_attention.py +5 -3
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/layers/attention/triton_ops/prefill_attention.py +30 -6
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/layers/linear.py +89 -63
- sglang-0.3.4.post1/sglang/srt/layers/rotary_embedding.py +145 -0
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/layers/sampler.py +6 -2
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/lora/lora.py +3 -1
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/managers/detokenizer_manager.py +31 -10
- sglang-0.3.4.post1/sglang/srt/managers/image_processor.py +360 -0
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/managers/io_struct.py +4 -0
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/managers/schedule_batch.py +319 -82
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/managers/schedule_policy.py +2 -1
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/managers/scheduler.py +233 -158
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/managers/tokenizer_manager.py +15 -5
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/managers/tp_worker.py +30 -5
- sglang-0.3.4.post1/sglang/srt/managers/tp_worker_overlap_thread.py +172 -0
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/mem_cache/chunk_cache.py +8 -4
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/mem_cache/memory_pool.py +123 -11
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/mem_cache/radix_cache.py +19 -10
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/model_executor/cuda_graph_runner.py +63 -12
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/model_executor/forward_batch_info.py +101 -23
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/model_executor/model_runner.py +92 -12
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/models/baichuan.py +2 -3
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/models/chatglm.py +8 -9
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/models/commandr.py +1 -2
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/models/dbrx.py +1 -2
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/models/deepseek.py +4 -5
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/models/deepseek_v2.py +7 -8
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/models/exaone.py +1 -2
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/models/gemma.py +2 -2
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/models/gemma2.py +5 -5
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/models/gpt_bigcode.py +5 -5
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/models/grok.py +1 -2
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/models/internlm2.py +1 -2
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/models/llama.py +1 -2
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/models/llama_classification.py +1 -2
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/models/llama_reward.py +2 -3
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/models/llava.py +4 -8
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/models/llavavid.py +1 -2
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/models/minicpm.py +1 -2
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/models/minicpm3.py +5 -6
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/models/mixtral.py +1 -2
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/models/mixtral_quant.py +1 -2
- sglang-0.3.4.post1/sglang/srt/models/mllama.py +1004 -0
- sglang-0.3.4.post1/sglang/srt/models/olmo.py +352 -0
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/models/olmoe.py +1 -2
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/models/qwen.py +1 -2
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/models/qwen2.py +1 -2
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/models/qwen2_moe.py +4 -5
- sglang-0.3.4.post1/sglang/srt/models/qwen2_vl.py +724 -0
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/models/stablelm.py +1 -2
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/models/torch_native_llama.py +1 -2
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/models/xverse.py +1 -2
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/models/xverse_moe.py +4 -5
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/models/yivl.py +1 -2
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/openai_api/adapter.py +92 -49
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/openai_api/protocol.py +10 -2
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/sampling/penaltylib/orchestrator.py +28 -9
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/sampling/sampling_batch_info.py +103 -59
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/sampling/sampling_params.py +2 -0
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/server.py +116 -17
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/server_args.py +131 -45
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/utils.py +33 -3
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/test/few_shot_gsm8k.py +4 -1
- sglang-0.3.4.post1/sglang/test/few_shot_gsm8k_engine.py +144 -0
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/test/runners.py +20 -1
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/test/srt/sampling/penaltylib/utils.py +16 -12
- sglang-0.3.4.post1/sglang/version.py +1 -0
- {sglang-0.3.3.post1 → sglang-0.3.4.post1/sglang.egg-info}/PKG-INFO +75 -32
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang.egg-info/SOURCES.txt +9 -1
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang.egg-info/requires.txt +20 -3
- sglang-0.3.3.post1/pyproject.toml +0 -42
- sglang-0.3.3.post1/sglang/srt/configs/__init__.py +0 -5
- sglang-0.3.3.post1/sglang/srt/layers/attention/flashinfer_backend.py +0 -277
- sglang-0.3.3.post1/sglang/srt/layers/attention/flashinfer_utils.py +0 -237
- sglang-0.3.3.post1/sglang/srt/managers/image_processor.py +0 -187
- sglang-0.3.3.post1/sglang/version.py +0 -1
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/LICENSE +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/setup.cfg +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/__init__.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/api.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/check_env.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/lang/__init__.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/lang/backend/__init__.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/lang/backend/anthropic.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/lang/backend/base_backend.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/lang/backend/litellm.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/lang/backend/openai.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/lang/backend/runtime_endpoint.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/lang/backend/vertexai.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/lang/choices.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/lang/compiler.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/lang/interpreter.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/lang/ir.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/lang/tracer.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/launch_server.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/configs/exaone.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/constrained/__init__.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/constrained/base_tool_cache.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/constrained/fsm_cache.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/constrained/jump_forward.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/layers/activation.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/layers/attention/triton_ops/decode_attention.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/layers/fused_moe/__init__.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/layers/fused_moe/fused_moe.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/layers/fused_moe/layer.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/layers/fused_moe/patch.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/layers/layernorm.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/layers/logits_processor.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/layers/pooler.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/layers/quantization/__init__.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/layers/quantization/base_config.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/layers/radix_attention.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/layers/torchao_utils.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/lora/lora_config.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/lora/lora_manager.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/managers/data_parallel_controller.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/mem_cache/base_prefix_cache.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/mem_cache/flush_cache.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/mm_utils.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/models/llama_embedding.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/models/mistral.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/sampling/penaltylib/__init__.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/test/run_eval.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/test/simple_eval_common.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/test/simple_eval_gpqa.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/test/simple_eval_humaneval.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/test/simple_eval_math.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/test/simple_eval_mgsm.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/test/simple_eval_mmlu.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/test/test_activation.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/test/test_layernorm.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/test/test_programs.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/test/test_utils.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang/utils.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang.egg-info/dependency_links.txt +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4.post1}/sglang.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: sglang
|
3
|
-
Version: 0.3.3.post1
|
3
|
+
Version: 0.3.4.post1
|
4
4
|
Summary: SGLang is yet another fast serving framework for large language models and vision language models.
|
5
5
|
License: Apache License
|
6
6
|
Version 2.0, January 2004
|
@@ -214,26 +214,31 @@ License-File: LICENSE
|
|
214
214
|
Requires-Dist: requests
|
215
215
|
Requires-Dist: tqdm
|
216
216
|
Requires-Dist: numpy
|
217
|
+
Provides-Extra: runtime-common
|
218
|
+
Requires-Dist: aiohttp; extra == "runtime-common"
|
219
|
+
Requires-Dist: decord; extra == "runtime-common"
|
220
|
+
Requires-Dist: fastapi; extra == "runtime-common"
|
221
|
+
Requires-Dist: hf_transfer; extra == "runtime-common"
|
222
|
+
Requires-Dist: huggingface_hub; extra == "runtime-common"
|
223
|
+
Requires-Dist: interegular; extra == "runtime-common"
|
224
|
+
Requires-Dist: orjson; extra == "runtime-common"
|
225
|
+
Requires-Dist: packaging; extra == "runtime-common"
|
226
|
+
Requires-Dist: pillow; extra == "runtime-common"
|
227
|
+
Requires-Dist: psutil; extra == "runtime-common"
|
228
|
+
Requires-Dist: pydantic; extra == "runtime-common"
|
229
|
+
Requires-Dist: python-multipart; extra == "runtime-common"
|
230
|
+
Requires-Dist: torchao; extra == "runtime-common"
|
231
|
+
Requires-Dist: uvicorn; extra == "runtime-common"
|
232
|
+
Requires-Dist: uvloop; extra == "runtime-common"
|
233
|
+
Requires-Dist: zmq; extra == "runtime-common"
|
234
|
+
Requires-Dist: outlines>=0.0.44; extra == "runtime-common"
|
235
|
+
Requires-Dist: modelscope; extra == "runtime-common"
|
217
236
|
Provides-Extra: srt
|
218
|
-
Requires-Dist:
|
219
|
-
Requires-Dist: decord; extra == "srt"
|
220
|
-
Requires-Dist: fastapi; extra == "srt"
|
221
|
-
Requires-Dist: hf_transfer; extra == "srt"
|
222
|
-
Requires-Dist: huggingface_hub; extra == "srt"
|
223
|
-
Requires-Dist: interegular; extra == "srt"
|
224
|
-
Requires-Dist: packaging; extra == "srt"
|
225
|
-
Requires-Dist: pillow; extra == "srt"
|
226
|
-
Requires-Dist: psutil; extra == "srt"
|
227
|
-
Requires-Dist: pydantic; extra == "srt"
|
228
|
-
Requires-Dist: python-multipart; extra == "srt"
|
237
|
+
Requires-Dist: sglang[runtime_common]; extra == "srt"
|
229
238
|
Requires-Dist: torch; extra == "srt"
|
230
|
-
Requires-Dist:
|
231
|
-
|
232
|
-
Requires-Dist:
|
233
|
-
Requires-Dist: zmq; extra == "srt"
|
234
|
-
Requires-Dist: vllm==0.5.5; extra == "srt"
|
235
|
-
Requires-Dist: outlines>=0.0.44; extra == "srt"
|
236
|
-
Requires-Dist: modelscope; extra == "srt"
|
239
|
+
Requires-Dist: vllm==0.6.3.post1; extra == "srt"
|
240
|
+
Provides-Extra: srt-xpu
|
241
|
+
Requires-Dist: sglang[runtime_common]; extra == "srt-xpu"
|
237
242
|
Provides-Extra: openai
|
238
243
|
Requires-Dist: openai>=1.0; extra == "openai"
|
239
244
|
Requires-Dist: tiktoken; extra == "openai"
|
@@ -253,9 +258,17 @@ Requires-Dist: sglang[srt]; extra == "all"
|
|
253
258
|
Requires-Dist: sglang[openai]; extra == "all"
|
254
259
|
Requires-Dist: sglang[anthropic]; extra == "all"
|
255
260
|
Requires-Dist: sglang[litellm]; extra == "all"
|
261
|
+
Provides-Extra: all-xpu
|
262
|
+
Requires-Dist: sglang[srt_xpu]; extra == "all-xpu"
|
263
|
+
Requires-Dist: sglang[openai]; extra == "all-xpu"
|
264
|
+
Requires-Dist: sglang[anthropic]; extra == "all-xpu"
|
265
|
+
Requires-Dist: sglang[litellm]; extra == "all-xpu"
|
256
266
|
Provides-Extra: dev
|
257
267
|
Requires-Dist: sglang[all]; extra == "dev"
|
258
268
|
Requires-Dist: sglang[test]; extra == "dev"
|
269
|
+
Provides-Extra: dev-xpu
|
270
|
+
Requires-Dist: sglang[all_xpu]; extra == "dev-xpu"
|
271
|
+
Requires-Dist: sglang[test]; extra == "dev-xpu"
|
259
272
|
|
260
273
|
<div align="center" id="sglangtop">
|
261
274
|
<img src="https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png" alt="logo" width="400" margin="10px"></img>
|
@@ -270,19 +283,18 @@ Requires-Dist: sglang[test]; extra == "dev"
|
|
270
283
|
|
271
284
|
--------------------------------------------------------------------------------
|
272
285
|
|
273
|
-
| [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Paper**](https://arxiv.org/abs/2312.07104) | [**Slides**](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_dev_day_v2.
|
274
|
-
|
275
|
-
## Upcoming Events
|
276
|
-
- [Oct. 16, 2024] Online meetup for efficient LLM deployment and serving, co-hosted by SGLang, FlashInfer, and MLC LLM! Fill out the [Google form](https://forms.gle/B3YeedLxmrrhL1NM8) to receive the invite link.
|
286
|
+
| [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Paper**](https://arxiv.org/abs/2312.07104) | [**Slides**](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_dev_day_v2.pdf) | [**Learn More**](https://github.com/sgl-project/sgl-learning-materials) | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2ngly9muu-t37XiH87qvD~6rVBTkTEHw) |
|
287
|
+
[**Join Bi-Weekly Development Meeting**](https://docs.google.com/document/d/1xEow4eIM152xNcRxqZz9VEcOiTQo8-CEuuQ5qTmkt-E/edit?usp=sharing) |
|
277
288
|
|
278
289
|
## News
|
279
|
-
- [2024/
|
280
|
-
- [2024/
|
281
|
-
- [2024/
|
290
|
+
- [2024/10] 🔥 The First SGLang Online Meetup ([slides](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#the-first-sglang-online-meetup)).
|
291
|
+
- [2024/09] SGLang v0.3 Release: 7x Faster DeepSeek MLA, 1.5x Faster torch.compile, Multi-Image/Video LLaVA-OneVision ([blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)).
|
292
|
+
- [2024/07] Faster Llama3 Serving with SGLang Runtime (vs. TensorRT-LLM, vLLM) ([blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/)).
|
282
293
|
|
283
294
|
<details>
|
284
295
|
<summary>More</summary>
|
285
296
|
|
297
|
+
- [2024/02] SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).
|
286
298
|
- [2024/04] SGLang is used by the official **LLaVA-NeXT (video)** release ([blog](https://llava-vl.github.io/blog/2024-04-30-llava-next-video/)).
|
287
299
|
- [2024/01] SGLang provides up to **5x faster inference** with RadixAttention ([blog](https://lmsys.org/blog/2024-01-17-sglang/)).
|
288
300
|
- [2024/01] SGLang powers the serving of the official **LLaVA v1.6** release demo ([usage](https://github.com/haotian-liu/LLaVA?tab=readme-ov-file#demo)).
|
@@ -323,7 +335,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
|
|
323
335
|
### Method 2: From source
|
324
336
|
```
|
325
337
|
# Use the last release branch
|
326
|
-
git clone -b v0.3.3.post1 https://github.com/sgl-project/sglang.git
|
338
|
+
git clone -b v0.3.4.post1 https://github.com/sgl-project/sglang.git
|
327
339
|
cd sglang
|
328
340
|
|
329
341
|
pip install --upgrade pip
|
@@ -500,6 +512,40 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
|
|
500
512
|
python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --tp 4 --nccl-init sgl-dev-0:50000 --nnodes 2 --node-rank 1
|
501
513
|
```
|
502
514
|
|
515
|
+
### Engine Without HTTP Server
|
516
|
+
|
517
|
+
We also provide an inference engine **without an HTTP server**. For example,
|
518
|
+
|
519
|
+
```python
|
520
|
+
import sglang as sgl
|
521
|
+
|
522
|
+
|
523
|
+
def main():
|
524
|
+
prompts = [
|
525
|
+
"Hello, my name is",
|
526
|
+
"The president of the United States is",
|
527
|
+
"The capital of France is",
|
528
|
+
"The future of AI is",
|
529
|
+
]
|
530
|
+
sampling_params = {"temperature": 0.8, "top_p": 0.95}
|
531
|
+
llm = sgl.Engine(model_path="meta-llama/Meta-Llama-3.1-8B-Instruct")
|
532
|
+
|
533
|
+
outputs = llm.generate(prompts, sampling_params)
|
534
|
+
for prompt, output in zip(prompts, outputs):
|
535
|
+
print("===============================")
|
536
|
+
print(f"Prompt: {prompt}\nGenerated text: {output['text']}")
|
537
|
+
|
538
|
+
if __name__ == "__main__":
|
539
|
+
main()
|
540
|
+
```
|
541
|
+
|
542
|
+
This can be used for:
|
543
|
+
|
544
|
+
1. **Offline Batch Inference**
|
545
|
+
2. **Building Custom Servers**
|
546
|
+
|
547
|
+
You can view the full example [here](https://github.com/sgl-project/sglang/tree/main/examples/runtime/engine)
|
548
|
+
|
503
549
|
### Supported Models
|
504
550
|
|
505
551
|
**Generative Models**
|
@@ -529,6 +575,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
|
|
529
575
|
- MiniCPM / MiniCPM 3
|
530
576
|
- XVERSE / XVERSE MoE
|
531
577
|
- SmolLM
|
578
|
+
- GLM-4
|
532
579
|
|
533
580
|
**Embedding Models**
|
534
581
|
|
@@ -836,10 +883,7 @@ def chat_example(s):
|
|
836
883
|
- The `regex` argument in `sgl.gen` is implemented through autoregressive decoding with logit bias masking, according to the constraints set by the regex. It is compatible with `temperature=0` and `temperature != 0`.
|
837
884
|
|
838
885
|
## Benchmark And Performance
|
839
|
-
|
840
|
-

|
841
|
-
|
842
|
-
Learn more at this [blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/).
|
886
|
+
Learn more in our release blogs: [v0.2](https://lmsys.org/blog/2024-07-25-sglang-llama3/), [v0.3](https://lmsys.org/blog/2024-09-04-sglang-v0-3/).
|
843
887
|
|
844
888
|
## Roadmap
|
845
889
|
[Development Roadmap (2024 Q4)](https://github.com/sgl-project/sglang/issues/1487)
|
@@ -849,7 +893,6 @@ Please cite our paper, [SGLang: Efficient Execution of Structured Language Model
|
|
849
893
|
We also learned from the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).
|
850
894
|
|
851
895
|
|
852
|
-
|
853
896
|
<p align="center">
|
854
897
|
<a href="#sglangtop" target="_blank">
|
855
898
|
<bold>Back To Top </bold>
|
@@ -11,19 +11,18 @@
|
|
11
11
|
|
12
12
|
--------------------------------------------------------------------------------
|
13
13
|
|
14
|
-
| [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Paper**](https://arxiv.org/abs/2312.07104) | [**Slides**](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_dev_day_v2.
|
15
|
-
|
16
|
-
## Upcoming Events
|
17
|
-
- [Oct. 16, 2024] Online meetup for efficient LLM deployment and serving, co-hosted by SGLang, FlashInfer, and MLC LLM! Fill out the [Google form](https://forms.gle/B3YeedLxmrrhL1NM8) to receive the invite link.
|
14
|
+
| [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Paper**](https://arxiv.org/abs/2312.07104) | [**Slides**](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_dev_day_v2.pdf) | [**Learn More**](https://github.com/sgl-project/sgl-learning-materials) | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2ngly9muu-t37XiH87qvD~6rVBTkTEHw) |
|
15
|
+
[**Join Bi-Weekly Development Meeting**](https://docs.google.com/document/d/1xEow4eIM152xNcRxqZz9VEcOiTQo8-CEuuQ5qTmkt-E/edit?usp=sharing) |
|
18
16
|
|
19
17
|
## News
|
20
|
-
- [2024/
|
21
|
-
- [2024/
|
22
|
-
- [2024/
|
18
|
+
- [2024/10] 🔥 The First SGLang Online Meetup ([slides](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#the-first-sglang-online-meetup)).
|
19
|
+
- [2024/09] SGLang v0.3 Release: 7x Faster DeepSeek MLA, 1.5x Faster torch.compile, Multi-Image/Video LLaVA-OneVision ([blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)).
|
20
|
+
- [2024/07] Faster Llama3 Serving with SGLang Runtime (vs. TensorRT-LLM, vLLM) ([blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/)).
|
23
21
|
|
24
22
|
<details>
|
25
23
|
<summary>More</summary>
|
26
24
|
|
25
|
+
- [2024/02] SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).
|
27
26
|
- [2024/04] SGLang is used by the official **LLaVA-NeXT (video)** release ([blog](https://llava-vl.github.io/blog/2024-04-30-llava-next-video/)).
|
28
27
|
- [2024/01] SGLang provides up to **5x faster inference** with RadixAttention ([blog](https://lmsys.org/blog/2024-01-17-sglang/)).
|
29
28
|
- [2024/01] SGLang powers the serving of the official **LLaVA v1.6** release demo ([usage](https://github.com/haotian-liu/LLaVA?tab=readme-ov-file#demo)).
|
@@ -64,7 +63,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
|
|
64
63
|
### Method 2: From source
|
65
64
|
```
|
66
65
|
# Use the last release branch
|
67
|
-
git clone -b v0.3.3.post1 https://github.com/sgl-project/sglang.git
|
66
|
+
git clone -b v0.3.4.post1 https://github.com/sgl-project/sglang.git
|
68
67
|
cd sglang
|
69
68
|
|
70
69
|
pip install --upgrade pip
|
@@ -241,6 +240,40 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
|
|
241
240
|
python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --tp 4 --nccl-init sgl-dev-0:50000 --nnodes 2 --node-rank 1
|
242
241
|
```
|
243
242
|
|
243
|
+
### Engine Without HTTP Server
|
244
|
+
|
245
|
+
We also provide an inference engine **without an HTTP server**. For example,
|
246
|
+
|
247
|
+
```python
|
248
|
+
import sglang as sgl
|
249
|
+
|
250
|
+
|
251
|
+
def main():
|
252
|
+
prompts = [
|
253
|
+
"Hello, my name is",
|
254
|
+
"The president of the United States is",
|
255
|
+
"The capital of France is",
|
256
|
+
"The future of AI is",
|
257
|
+
]
|
258
|
+
sampling_params = {"temperature": 0.8, "top_p": 0.95}
|
259
|
+
llm = sgl.Engine(model_path="meta-llama/Meta-Llama-3.1-8B-Instruct")
|
260
|
+
|
261
|
+
outputs = llm.generate(prompts, sampling_params)
|
262
|
+
for prompt, output in zip(prompts, outputs):
|
263
|
+
print("===============================")
|
264
|
+
print(f"Prompt: {prompt}\nGenerated text: {output['text']}")
|
265
|
+
|
266
|
+
if __name__ == "__main__":
|
267
|
+
main()
|
268
|
+
```
|
269
|
+
|
270
|
+
This can be used for:
|
271
|
+
|
272
|
+
1. **Offline Batch Inference**
|
273
|
+
2. **Building Custom Servers**
|
274
|
+
|
275
|
+
You can view the full example [here](https://github.com/sgl-project/sglang/tree/main/examples/runtime/engine)
|
276
|
+
|
244
277
|
### Supported Models
|
245
278
|
|
246
279
|
**Generative Models**
|
@@ -270,6 +303,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
|
|
270
303
|
- MiniCPM / MiniCPM 3
|
271
304
|
- XVERSE / XVERSE MoE
|
272
305
|
- SmolLM
|
306
|
+
- GLM-4
|
273
307
|
|
274
308
|
**Embedding Models**
|
275
309
|
|
@@ -577,10 +611,7 @@ def chat_example(s):
|
|
577
611
|
- The `regex` argument in `sgl.gen` is implemented through autoregressive decoding with logit bias masking, according to the constraints set by the regex. It is compatible with `temperature=0` and `temperature != 0`.
|
578
612
|
|
579
613
|
## Benchmark And Performance
|
580
|
-
|
581
|
-

|
582
|
-
|
583
|
-
Learn more at this [blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/).
|
614
|
+
Learn more in our release blogs: [v0.2](https://lmsys.org/blog/2024-07-25-sglang-llama3/), [v0.3](https://lmsys.org/blog/2024-09-04-sglang-v0-3/).
|
584
615
|
|
585
616
|
## Roadmap
|
586
617
|
[Development Roadmap (2024 Q4)](https://github.com/sgl-project/sglang/issues/1487)
|
@@ -590,7 +621,6 @@ Please cite our paper, [SGLang: Efficient Execution of Structured Language Model
|
|
590
621
|
We also learned from the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).
|
591
622
|
|
592
623
|
|
593
|
-
|
594
624
|
<p align="center">
|
595
625
|
<a href="#sglangtop" target="_blank">
|
596
626
|
<bold>Back To Top </bold>
|
@@ -0,0 +1,68 @@
|
|
1
|
+
[build-system]
|
2
|
+
requires = ["setuptools>=61.0", "wheel"]
|
3
|
+
build-backend = "setuptools.build_meta"
|
4
|
+
|
5
|
+
[project]
|
6
|
+
name = "sglang"
|
7
|
+
version = "0.3.4.post1"
|
8
|
+
description = "SGLang is yet another fast serving framework for large language models and vision language models."
|
9
|
+
readme = "README.md"
|
10
|
+
requires-python = ">=3.8"
|
11
|
+
license = { file = "LICENSE" }
|
12
|
+
classifiers = [
|
13
|
+
"Programming Language :: Python :: 3",
|
14
|
+
"License :: OSI Approved :: Apache Software License",
|
15
|
+
]
|
16
|
+
dependencies = ["requests", "tqdm", "numpy"]
|
17
|
+
|
18
|
+
[project.optional-dependencies]
|
19
|
+
runtime_common = ["aiohttp", "decord", "fastapi", "hf_transfer", "huggingface_hub", "interegular",
|
20
|
+
"orjson", "packaging", "pillow", "psutil", "pydantic", "python-multipart",
|
21
|
+
"torchao", "uvicorn", "uvloop", "zmq",
|
22
|
+
"outlines>=0.0.44", "modelscope"]
|
23
|
+
# xpu is not enabled in public vllm and torch whl,
|
24
|
+
# need to follow https://docs.vllm.ai/en/latest/getting_started/xpu-installation.html to install vllm
|
25
|
+
srt = ["sglang[runtime_common]", "torch", "vllm==0.6.3.post1"]
|
26
|
+
srt_xpu = ["sglang[runtime_common]"]
|
27
|
+
|
28
|
+
openai = ["openai>=1.0", "tiktoken"]
|
29
|
+
anthropic = ["anthropic>=0.20.0"]
|
30
|
+
litellm = ["litellm>=1.0.0"]
|
31
|
+
test = [
|
32
|
+
"jsonlines",
|
33
|
+
"matplotlib",
|
34
|
+
"pandas",
|
35
|
+
"sentence_transformers",
|
36
|
+
"accelerate",
|
37
|
+
"peft",
|
38
|
+
]
|
39
|
+
all = ["sglang[srt]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
|
40
|
+
all_xpu = ["sglang[srt_xpu]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
|
41
|
+
dev = ["sglang[all]", "sglang[test]"]
|
42
|
+
dev_xpu = ["sglang[all_xpu]", "sglang[test]"]
|
43
|
+
|
44
|
+
[project.urls]
|
45
|
+
"Homepage" = "https://github.com/sgl-project/sglang"
|
46
|
+
"Bug Tracker" = "https://github.com/sgl-project/sglang/issues"
|
47
|
+
|
48
|
+
[tool.setuptools.packages.find]
|
49
|
+
exclude = [
|
50
|
+
"assets*",
|
51
|
+
"benchmark*",
|
52
|
+
"docs*",
|
53
|
+
"dist*",
|
54
|
+
"playground*",
|
55
|
+
"scripts*",
|
56
|
+
"tests*",
|
57
|
+
]
|
58
|
+
|
59
|
+
[tool.wheel]
|
60
|
+
exclude = [
|
61
|
+
"assets*",
|
62
|
+
"benchmark*",
|
63
|
+
"docs*",
|
64
|
+
"dist*",
|
65
|
+
"playground*",
|
66
|
+
"scripts*",
|
67
|
+
"tests*",
|
68
|
+
]
|
@@ -227,22 +227,24 @@ def extend(reqs, model_runner):
|
|
227
227
|
req_to_token_pool=model_runner.req_to_token_pool,
|
228
228
|
token_to_kv_pool=model_runner.token_to_kv_pool,
|
229
229
|
tree_cache=None,
|
230
|
+
model_config=model_runner.model_config,
|
230
231
|
)
|
231
|
-
batch.prepare_for_extend(
|
232
|
+
batch.prepare_for_extend()
|
232
233
|
model_worker_batch = batch.get_model_worker_batch()
|
233
234
|
forward_batch = ForwardBatch.init_new(model_worker_batch, model_runner)
|
234
235
|
logits_output = model_runner.forward(forward_batch)
|
235
|
-
next_token_ids = model_runner.sample(logits_output, forward_batch)
|
236
|
+
next_token_ids = model_runner.sample(logits_output, forward_batch)
|
236
237
|
return next_token_ids, logits_output.next_token_logits, batch
|
237
238
|
|
238
239
|
|
239
240
|
@torch.inference_mode()
|
240
241
|
def decode(input_token_ids, batch, model_runner):
|
241
|
-
batch.
|
242
|
+
batch.output_ids = input_token_ids
|
243
|
+
batch.prepare_for_decode()
|
242
244
|
model_worker_batch = batch.get_model_worker_batch()
|
243
245
|
forward_batch = ForwardBatch.init_new(model_worker_batch, model_runner)
|
244
246
|
logits_output = model_runner.forward(forward_batch)
|
245
|
-
next_token_ids = model_runner.sample(logits_output, forward_batch)
|
247
|
+
next_token_ids = model_runner.sample(logits_output, forward_batch)
|
246
248
|
return next_token_ids, logits_output.next_token_logits
|
247
249
|
|
248
250
|
|
@@ -252,6 +254,7 @@ def correctness_test(
|
|
252
254
|
bench_args,
|
253
255
|
tp_rank,
|
254
256
|
):
|
257
|
+
configure_logger(server_args, prefix=f" TP{tp_rank}")
|
255
258
|
rank_print = print if tp_rank == 0 else lambda *args, **kwargs: None
|
256
259
|
|
257
260
|
# Load the model
|
@@ -279,8 +282,9 @@ def correctness_test(
|
|
279
282
|
output_ids = [input_ids[i] + [next_token_ids[i]] for i in range(len(input_ids))]
|
280
283
|
for _ in range(bench_args.output_len[0] - 1):
|
281
284
|
next_token_ids, _ = decode(next_token_ids, batch, model_runner)
|
285
|
+
next_token_ids_list = next_token_ids.tolist()
|
282
286
|
for i in range(len(reqs)):
|
283
|
-
output_ids[i].append(
|
287
|
+
output_ids[i].append(next_token_ids_list[i])
|
284
288
|
|
285
289
|
# Print
|
286
290
|
for i in range(len(reqs)):
|
@@ -288,8 +292,15 @@ def correctness_test(
|
|
288
292
|
rank_print(tokenizer.decode(output_ids[i]), "\n")
|
289
293
|
|
290
294
|
|
295
|
+
def synchronize(device):
|
296
|
+
if device == "cuda":
|
297
|
+
torch.cuda.synchronize()
|
298
|
+
elif device == "xpu":
|
299
|
+
torch.xpu.synchronize()
|
300
|
+
|
301
|
+
|
291
302
|
def latency_test_run_once(
|
292
|
-
run_name, model_runner, rank_print, reqs, batch_size, input_len, output_len
|
303
|
+
run_name, model_runner, rank_print, reqs, batch_size, input_len, output_len, device
|
293
304
|
):
|
294
305
|
max_batch_size = model_runner.max_total_num_tokens // (input_len + output_len)
|
295
306
|
if batch_size > max_batch_size:
|
@@ -312,10 +323,10 @@ def latency_test_run_once(
|
|
312
323
|
tot_latency = 0
|
313
324
|
|
314
325
|
# Prefill
|
315
|
-
|
326
|
+
synchronize(device)
|
316
327
|
tic = time.time()
|
317
328
|
next_token_ids, _, batch = extend(reqs, model_runner)
|
318
|
-
|
329
|
+
synchronize(device)
|
319
330
|
prefill_latency = time.time() - tic
|
320
331
|
tot_latency += prefill_latency
|
321
332
|
throughput = input_len * batch_size / prefill_latency
|
@@ -328,10 +339,10 @@ def latency_test_run_once(
|
|
328
339
|
# Decode
|
329
340
|
decode_latencies = []
|
330
341
|
for i in range(output_len - 1):
|
331
|
-
|
342
|
+
synchronize(device)
|
332
343
|
tic = time.time()
|
333
344
|
next_token_ids, _ = decode(next_token_ids, batch, model_runner)
|
334
|
-
|
345
|
+
synchronize(device)
|
335
346
|
latency = time.time() - tic
|
336
347
|
tot_latency += latency
|
337
348
|
throughput = batch_size / latency
|
@@ -387,6 +398,7 @@ def latency_test(
|
|
387
398
|
bench_args.batch_size[0],
|
388
399
|
bench_args.input_len[0],
|
389
400
|
8, # shorter decoding to speed up the warmup
|
401
|
+
server_args.device,
|
390
402
|
)
|
391
403
|
rank_print("Benchmark ...")
|
392
404
|
|
@@ -397,7 +409,14 @@ def latency_test(
|
|
397
409
|
):
|
398
410
|
reqs = prepare_synthetic_inputs_for_latency_test(bs, il)
|
399
411
|
ret = latency_test_run_once(
|
400
|
-
bench_args.run_name,
|
412
|
+
bench_args.run_name,
|
413
|
+
model_runner,
|
414
|
+
rank_print,
|
415
|
+
reqs,
|
416
|
+
bs,
|
417
|
+
il,
|
418
|
+
ol,
|
419
|
+
server_args.device,
|
401
420
|
)
|
402
421
|
if ret is not None:
|
403
422
|
result_list.append(ret)
|
@@ -6,6 +6,8 @@ It accepts arguments similar to those of launch_server.py.
|
|
6
6
|
Usage:
|
7
7
|
|
8
8
|
python3 -m sglang.bench_server_latency --model meta-llama/Meta-Llama-3.1-8B --batch-size 1 16 64 --input-len 1024 --output-len 8
|
9
|
+
|
10
|
+
python3 -m sglang.bench_server_latency --model None --base-url http://localhost:30000 --batch-size 16 --input-len 1024 --output-len 8
|
9
11
|
"""
|
10
12
|
|
11
13
|
import argparse
|
@@ -32,6 +34,8 @@ class BenchArgs:
|
|
32
34
|
input_len: Tuple[int] = (1024,)
|
33
35
|
output_len: Tuple[int] = (16,)
|
34
36
|
result_filename: str = "result.jsonl"
|
37
|
+
base_url: str = ""
|
38
|
+
skip_warmup: bool = False
|
35
39
|
|
36
40
|
@staticmethod
|
37
41
|
def add_cli_args(parser: argparse.ArgumentParser):
|
@@ -48,6 +52,8 @@ class BenchArgs:
|
|
48
52
|
parser.add_argument(
|
49
53
|
"--result-filename", type=str, default=BenchArgs.result_filename
|
50
54
|
)
|
55
|
+
parser.add_argument("--base-url", type=str, default=BenchArgs.base_url)
|
56
|
+
parser.add_argument("--skip-warmup", action="store_true")
|
51
57
|
|
52
58
|
@classmethod
|
53
59
|
def from_cli_args(cls, args: argparse.Namespace):
|
@@ -139,17 +145,21 @@ def run_one_case(
|
|
139
145
|
|
140
146
|
|
141
147
|
def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
|
142
|
-
|
148
|
+
if bench_args.base_url:
|
149
|
+
proc, base_url = None, bench_args.base_url
|
150
|
+
else:
|
151
|
+
proc, base_url = launch_server_process(server_args)
|
143
152
|
|
144
153
|
# warmup
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
154
|
+
if not bench_args.skip_warmup:
|
155
|
+
run_one_case(
|
156
|
+
base_url,
|
157
|
+
batch_size=16,
|
158
|
+
input_len=1024,
|
159
|
+
output_len=16,
|
160
|
+
run_name="",
|
161
|
+
result_filename="",
|
162
|
+
)
|
153
163
|
|
154
164
|
# benchmark
|
155
165
|
try:
|
@@ -165,7 +175,8 @@ def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
|
|
165
175
|
bench_args.result_filename,
|
166
176
|
)
|
167
177
|
finally:
|
168
|
-
|
178
|
+
if proc:
|
179
|
+
kill_child_process(proc.pid)
|
169
180
|
|
170
181
|
print(f"\nResults are saved to {bench_args.result_filename}")
|
171
182
|
|