sglang 0.3.5.post2__tar.gz → 0.3.6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sglang-0.3.5.post2 → sglang-0.3.6}/PKG-INFO +5 -5
- {sglang-0.3.5.post2 → sglang-0.3.6}/README.md +1 -1
- {sglang-0.3.5.post2 → sglang-0.3.6}/pyproject.toml +9 -6
- sglang-0.3.6/sglang/bench_latency.py +1 -0
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/bench_offline_throughput.py +48 -20
- sglang-0.3.5.post2/sglang/bench_latency.py → sglang-0.3.6/sglang/bench_one_batch.py +19 -98
- sglang-0.3.5.post2/sglang/bench_server_latency.py → sglang-0.3.6/sglang/bench_one_batch_server.py +3 -3
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/bench_serving.py +71 -1
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/check_env.py +3 -6
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/srt/constrained/outlines_backend.py +15 -2
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/srt/constrained/xgrammar_backend.py +22 -14
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/srt/layers/activation.py +3 -0
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/srt/layers/attention/flashinfer_backend.py +93 -48
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/srt/layers/attention/triton_backend.py +9 -7
- sglang-0.3.6/sglang/srt/layers/custom_op_util.py +26 -0
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/srt/layers/fused_moe/fused_moe.py +11 -4
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/srt/layers/layernorm.py +4 -0
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/srt/layers/logits_processor.py +10 -10
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/srt/layers/sampler.py +4 -8
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/srt/layers/torchao_utils.py +2 -0
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/srt/managers/data_parallel_controller.py +74 -9
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/srt/managers/detokenizer_manager.py +1 -0
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/srt/managers/io_struct.py +27 -0
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/srt/managers/schedule_batch.py +104 -38
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/srt/managers/schedule_policy.py +5 -1
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/srt/managers/scheduler.py +204 -54
- sglang-0.3.6/sglang/srt/managers/session_controller.py +62 -0
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/srt/managers/tokenizer_manager.py +38 -0
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/srt/managers/tp_worker.py +12 -1
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/srt/managers/tp_worker_overlap_thread.py +49 -52
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/srt/model_executor/cuda_graph_runner.py +43 -6
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/srt/model_executor/forward_batch_info.py +109 -15
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/srt/model_executor/model_runner.py +99 -43
- sglang-0.3.6/sglang/srt/model_parallel.py +98 -0
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/srt/models/deepseek_v2.py +147 -44
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/srt/models/gemma2.py +9 -8
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/srt/models/llava.py +1 -1
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/srt/models/llavavid.py +1 -1
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/srt/models/olmo.py +3 -3
- sglang-0.3.6/sglang/srt/models/phi3_small.py +447 -0
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/srt/models/qwen2_vl.py +13 -6
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/srt/models/torch_native_llama.py +94 -78
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/srt/openai_api/adapter.py +6 -2
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/srt/openai_api/protocol.py +1 -1
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/srt/sampling/penaltylib/orchestrator.py +49 -79
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +3 -8
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +3 -9
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +3 -8
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +3 -8
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/srt/sampling/sampling_batch_info.py +58 -57
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/srt/server.py +27 -1
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/srt/server_args.py +78 -62
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/srt/utils.py +71 -52
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/test/runners.py +25 -6
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/test/srt/sampling/penaltylib/utils.py +23 -21
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/test/test_utils.py +30 -19
- sglang-0.3.6/sglang/version.py +1 -0
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang.egg-info/PKG-INFO +5 -5
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang.egg-info/SOURCES.txt +6 -1
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang.egg-info/requires.txt +3 -3
- sglang-0.3.5.post2/sglang/version.py +0 -1
- {sglang-0.3.5.post2 → sglang-0.3.6}/LICENSE +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6}/setup.cfg +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/__init__.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/api.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/global_config.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/lang/__init__.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/lang/backend/__init__.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/lang/backend/anthropic.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/lang/backend/base_backend.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/lang/backend/litellm.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/lang/backend/openai.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/lang/backend/runtime_endpoint.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/lang/backend/vertexai.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/lang/chat_template.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/lang/choices.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/lang/compiler.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/lang/interpreter.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/lang/ir.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/lang/tracer.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/launch_server.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/launch_server_llavavid.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/srt/configs/__init__.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/srt/configs/exaone.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/srt/configs/model_config.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/srt/configs/qwen2vl.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/srt/constrained/__init__.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/srt/constrained/base_grammar_backend.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/srt/constrained/outlines_jump_forward.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/srt/conversation.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/srt/hf_transformers_utils.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/srt/layers/attention/__init__.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/srt/layers/attention/double_sparsity_backend.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/srt/layers/attention/triton_ops/decode_attention.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/srt/layers/attention/triton_ops/extend_attention.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/srt/layers/attention/triton_ops/prefill_attention.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/srt/layers/fused_moe/__init__.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/srt/layers/fused_moe/layer.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/srt/layers/fused_moe/patch.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/srt/layers/linear.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/srt/layers/pooler.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/srt/layers/quantization/__init__.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/srt/layers/quantization/base_config.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/srt/layers/radix_attention.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/srt/layers/rotary_embedding.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/srt/layers/vocab_parallel_embedding.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/srt/lora/lora.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/srt/lora/lora_config.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/srt/lora/lora_manager.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/srt/managers/image_processor.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/srt/mem_cache/base_prefix_cache.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/srt/mem_cache/chunk_cache.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/srt/mem_cache/flush_cache.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/srt/mem_cache/memory_pool.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/srt/mem_cache/radix_cache.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/srt/metrics/collector.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/srt/metrics/func_timer.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/srt/mm_utils.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/srt/models/baichuan.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/srt/models/chatglm.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/srt/models/commandr.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/srt/models/dbrx.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/srt/models/deepseek.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/srt/models/exaone.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/srt/models/gemma.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/srt/models/gemma2_reward.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/srt/models/gpt2.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/srt/models/gpt_bigcode.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/srt/models/grok.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/srt/models/internlm2.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/srt/models/internlm2_reward.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/srt/models/llama.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/srt/models/llama_classification.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/srt/models/llama_embedding.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/srt/models/llama_reward.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/srt/models/minicpm.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/srt/models/minicpm3.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/srt/models/mistral.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/srt/models/mixtral.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/srt/models/mixtral_quant.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/srt/models/mllama.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/srt/models/olmoe.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/srt/models/qwen.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/srt/models/qwen2.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/srt/models/qwen2_moe.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/srt/models/stablelm.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/srt/models/xverse.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/srt/models/xverse_moe.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/srt/models/yivl.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/srt/sampling/penaltylib/__init__.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/srt/sampling/sampling_params.py +1 -1
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/test/few_shot_gsm8k.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/test/few_shot_gsm8k_engine.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/test/run_eval.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/test/simple_eval_common.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/test/simple_eval_gpqa.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/test/simple_eval_humaneval.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/test/simple_eval_math.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/test/simple_eval_mgsm.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/test/simple_eval_mmlu.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/test/test_activation.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/test/test_layernorm.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/test/test_programs.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang/utils.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang.egg-info/dependency_links.txt +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6}/sglang.egg-info/top_level.txt +0 -0
{sglang-0.3.5.post2 → sglang-0.3.6}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.3.5.post2
+Version: 0.3.6
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
                            Version 2.0, January 2004
@@ -223,22 +223,22 @@ Requires-Dist: hf_transfer; extra == "runtime-common"
 Requires-Dist: huggingface_hub; extra == "runtime-common"
 Requires-Dist: interegular; extra == "runtime-common"
 Requires-Dist: orjson; extra == "runtime-common"
+Requires-Dist: outlines<0.1.0,>=0.0.44; extra == "runtime-common"
 Requires-Dist: packaging; extra == "runtime-common"
 Requires-Dist: pillow; extra == "runtime-common"
 Requires-Dist: prometheus-client>=0.20.0; extra == "runtime-common"
 Requires-Dist: psutil; extra == "runtime-common"
 Requires-Dist: pydantic; extra == "runtime-common"
 Requires-Dist: python-multipart; extra == "runtime-common"
+Requires-Dist: pyzmq>=25.1.2; extra == "runtime-common"
 Requires-Dist: torchao; extra == "runtime-common"
 Requires-Dist: uvicorn; extra == "runtime-common"
 Requires-Dist: uvloop; extra == "runtime-common"
-Requires-Dist: pyzmq>=25.1.2; extra == "runtime-common"
-Requires-Dist: outlines<0.1.0,>=0.0.44; extra == "runtime-common"
 Requires-Dist: modelscope; extra == "runtime-common"
 Provides-Extra: srt
 Requires-Dist: sglang[runtime_common]; extra == "srt"
 Requires-Dist: torch; extra == "srt"
-Requires-Dist: vllm
+Requires-Dist: vllm>=0.6.3.post1; extra == "srt"
 Provides-Extra: srt-hip
 Requires-Dist: sglang[runtime_common]; extra == "srt-hip"
 Requires-Dist: torch; extra == "srt-hip"
@@ -323,7 +323,7 @@ The core features include:
 
 - **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, continuous batching, token attention (paged attention), tensor parallelism, FlashInfer kernels, chunked prefill, and quantization (INT4/FP8/AWQ/GPTQ).
 - **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
-- **Extensive Model Support**: Supports a wide range of generative models (Llama, Gemma, Mistral, QWen, DeepSeek, LLaVA, etc.), embedding models (e5-mistral, gte) and reward models (Skywork), with easy extensibility for integrating new models.
+- **Extensive Model Support**: Supports a wide range of generative models (Llama, Gemma, Mistral, QWen, DeepSeek, LLaVA, etc.), embedding models (e5-mistral, gte, mcdse) and reward models (Skywork), with easy extensibility for integrating new models.
 - **Active Community**: SGLang is open-source and backed by an active community with industry adoption.
 
 ## Getting Started
{sglang-0.3.5.post2 → sglang-0.3.6}/README.md

@@ -37,7 +37,7 @@ The core features include:
 
 - **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, continuous batching, token attention (paged attention), tensor parallelism, FlashInfer kernels, chunked prefill, and quantization (INT4/FP8/AWQ/GPTQ).
 - **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
-- **Extensive Model Support**: Supports a wide range of generative models (Llama, Gemma, Mistral, QWen, DeepSeek, LLaVA, etc.), embedding models (e5-mistral, gte) and reward models (Skywork), with easy extensibility for integrating new models.
+- **Extensive Model Support**: Supports a wide range of generative models (Llama, Gemma, Mistral, QWen, DeepSeek, LLaVA, etc.), embedding models (e5-mistral, gte, mcdse) and reward models (Skywork), with easy extensibility for integrating new models.
 - **Active Community**: SGLang is open-source and backed by an active community with industry adoption.
 
 ## Getting Started
{sglang-0.3.5.post2 → sglang-0.3.6}/pyproject.toml

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "sglang"
-version = "0.3.5.post2"
+version = "0.3.6"
 description = "SGLang is yet another fast serving framework for large language models and vision language models."
 readme = "README.md"
 requires-python = ">=3.8"
@@ -16,11 +16,14 @@ classifiers = [
 dependencies = ["requests", "tqdm", "numpy", "IPython"]
 
 [project.optional-dependencies]
-runtime_common = ["aiohttp", "decord", "fastapi",
-    "
-    "
-    "
-
+runtime_common = ["aiohttp", "decord", "fastapi",
+    "hf_transfer", "huggingface_hub", "interegular",
+    "orjson", "outlines>=0.0.44,<0.1.0",
+    "packaging", "pillow", "prometheus-client>=0.20.0",
+    "psutil", "pydantic", "python-multipart",
+    "pyzmq>=25.1.2", "torchao", "uvicorn", "uvloop",
+    "modelscope"]
+srt = ["sglang[runtime_common]", "torch", "vllm>=0.6.3.post1"]
 
 # HIP (Heterogeneous-computing Interface for Portability) for AMD
 # => base docker rocm/vllm-dev:20241022, not from public vllm whl
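The `srt` extra now pins vLLM and the runtime dependencies are spelled out one group per line. As a quick, illustrative sanity check (not part of the diff, and assuming sglang 0.3.6 with the `srt` extra is installed), the installed versions can be compared against the new pins:

```python
# Illustrative only; assumes sglang 0.3.6 and its "srt" extra are installed.
from importlib.metadata import version

print("sglang  :", version("sglang"))    # expected: 0.3.6
print("vllm    :", version("vllm"))      # the "srt" extra now requires >= 0.6.3.post1
print("outlines:", version("outlines"))  # runtime_common pins >= 0.0.44, < 0.1.0
```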
sglang-0.3.6/sglang/bench_latency.py

@@ -0,0 +1 @@
+raise ValueError("bench_latency.py has been renamed to bench_one_batch.py")
{sglang-0.3.5.post2 → sglang-0.3.6}/sglang/bench_offline_throughput.py

@@ -1,20 +1,13 @@
 """
-Benchmark the throughput
-This script does not launch a server.
+Benchmark the throughput in the offline mode.
 It accepts server arguments (the same as launch_server.py) and benchmark arguments (the same as bench_serving.py).
 
 # Usage
 ## Sharegpt dataset with default args
-python -m sglang.bench_offline_throughput --model-path meta-llama/Meta-Llama-3.1-8B-Instruct
+python -m sglang.bench_offline_throughput --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --num-prompts 10
 
 ## Random dataset with default args
-python -m sglang.bench_offline_throughput --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --dataset-name random
-
-## Shared prefix dataset with default args
-python -m sglang.bench_offline_throughput --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --dataset-name generated-shared-prefix
-
-## Sharegpt dataset on runtime backend
-python -m sglang.bench_offline_throughput --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --backend runtime
+python -m sglang.bench_offline_throughput --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --dataset-name random --random-input 1024 --random-output 1024
 """
 
 import argparse
@@ -23,7 +16,7 @@ import json
 import logging
 import random
 import time
-from typing import List, Optional, Tuple
+from typing import Dict, List, Optional, Tuple
 
 import numpy as np
 
@@ -55,7 +48,10 @@ class BenchArgs:
     gen_question_len: int = 128
     gen_output_len: int = 256
     disable_ignore_eos: bool = False
+    extra_request_body: Optional[str] = None
     seed: int = 1
+    skip_warmup: bool = False
+    do_not_exit: bool = False
 
     @staticmethod
     def add_cli_args(parser: argparse.ArgumentParser):
@@ -142,7 +138,24 @@ class BenchArgs:
             default=BenchArgs.disable_ignore_eos,
             help="Disable ignore EOS token",
         )
+        parser.add_argument(
+            "--extra-request-body",
+            metavar='{"key1": "value1", "key2": "value2"}',
+            type=str,
+            help="Append given JSON object to the request payload. You can use this to specify"
+            "additional generate params like sampling params.",
+        )
         parser.add_argument("--seed", type=int, default=1, help="The random seed.")
+        parser.add_argument(
+            "--skip-warmup",
+            action="store_true",
+            help="Skip the warmup batches.",
+        )
+        parser.add_argument(
+            "--do-not-exit",
+            action="store_true",
+            help="Do not exit the program. This is useful for nsys profile with --duration and --delay.",
+        )
 
     @classmethod
     def from_cli_args(cls, args: argparse.Namespace):
@@ -155,6 +168,7 @@ def throughput_test_once(
     backend,
     reqs: List[Tuple[str, int, int]],
     ignore_eos: bool,
+    extra_request_body: Dict,
 ):
     measurement_results = {
         "backend": backend_name,
@@ -174,6 +188,7 @@ def throughput_test_once(
             "temperature": 0,
             "max_new_tokens": r[2],
             "ignore_eos": ignore_eos,
+            **extra_request_body,
         }
         for r in reqs
     ]
@@ -227,31 +242,41 @@ def throughput_test(
     random.seed(bench_args.seed)
     np.random.seed(bench_args.seed)
 
+    # Parse args
+    extra_request_body = {}
+    if bench_args.extra_request_body:
+        extra_request_body = json.loads(args.extra_request_body)
+
     # Read dataset
     input_requests = get_dataset(bench_args, tokenizer)
 
     warmup_requests = sample_random_requests(
-        input_len=
-        output_len=
-        num_prompts=
+        input_len=256,
+        output_len=16,
+        num_prompts=16,
         range_ratio=0.8,
         tokenizer=tokenizer,
        dataset_path=bench_args.dataset_path,
     )
 
     # Warm up
-
-
-
-
-
-
+    if not bench_args.skip_warmup:
+        logging.info("\nWarmup...")
+        throughput_test_once(
+            backend_name=bench_args.backend,
+            backend=backend,
+            reqs=warmup_requests,
+            ignore_eos=not bench_args.disable_ignore_eos,
+            extra_request_body=extra_request_body,
+        )
 
+    logging.info("\nBenchmark...")
     result = throughput_test_once(
         backend_name=bench_args.backend,
         backend=backend,
         reqs=input_requests,
         ignore_eos=not bench_args.disable_ignore_eos,
+        extra_request_body=extra_request_body,
     )
 
     if bench_args.result_filename:
@@ -307,3 +332,6 @@ if __name__ == "__main__":
     )
 
     throughput_test(server_args, bench_args)
+
+    while bench_args.do_not_exit:
+        pass
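For context, the new `--extra-request-body` flag is parsed with `json.loads` and spliced into every request's sampling parameters via `**extra_request_body`, as in the hunks above. A minimal sketch of that merge (the JSON keys are arbitrary examples, not defaults from the diff):

```python
# Sketch only: mirrors how --extra-request-body is merged into each request.
import json

extra_request_body = json.loads('{"top_p": 0.9}')  # value passed on the CLI
sampling_params = {
    "temperature": 0,
    "max_new_tokens": 256,
    "ignore_eos": True,
    **extra_request_body,  # extra keys are appended to every request payload
}
print(sampling_params)
```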
sglang-0.3.5.post2/sglang/bench_latency.py → sglang-0.3.6/sglang/bench_one_batch.py
RENAMED

@@ -1,20 +1,17 @@
 """
-Benchmark the latency of running a single static batch.
+Benchmark the latency of running a single static batch without a server.
+
 This script does not launch a server and uses the low-level APIs.
-It accepts arguments
+It accepts server arguments (the same as launch_server.py) and benchmark arguments (e.g., batch size, input lengths).
 
 # Usage (latency test)
 ## with dummy weights:
-python -m sglang.
+python -m sglang.bench_one_batch --model-path meta-llama/Meta-Llama-3-8B-Instruct --load-format dummy
 ## sweep through multiple data points and store (append) the results in a jsonl file:
-python -m sglang.
-## do some changes, and store the results under a different run_name:
-python -m sglang.bench_latency --model-path meta-llama/Meta-Llama-3-8B-Instruct --batch 1 12 14 --input-len 256 512 --output-len 32 256 --result-filename out.jsonl --run-name after
-## plot the results in series of lines:
-python -m sglang.bench_latency --result-filename out.jsonl --graph-sql="select run_name, batch_size, prefill_throughput from results"
+python -m sglang.bench_one_batch --model-path meta-llama/Meta-Llama-3-8B-Instruct --batch 1 12 14 --input-len 256 512 --output-len 32 256 --run-name test_run
 
 # Usage (correctness test):
-python -m sglang.
+python -m sglang.bench_one_batch --model-path TinyLlama/TinyLlama-1.1B-Chat-v0.4 --correct
 
 ## Reference output (of the correctness test above, can be gpu dependent):
 input_ids=[[1, 450, 7483, 310, 3444, 338], [1, 450, 7483, 310, 278, 3303, 13187, 290, 338], [1, 20628, 338, 263, 6575, 1460, 2462, 322, 306, 763]]
@@ -50,13 +47,10 @@ import itertools
 import json
 import logging
 import multiprocessing
-import os
-import sqlite3
 import time
 from typing import Tuple
 
 import numpy as np
-import pandas as pd
 import torch
 import torch.distributed as dist
 
@@ -77,19 +71,14 @@ from sglang.srt.utils import (
 
 @dataclasses.dataclass
 class BenchArgs:
-    run_name: str = "
+    run_name: str = "default"
     batch_size: Tuple[int] = (1,)
     input_len: Tuple[int] = (1024,)
     output_len: Tuple[int] = (16,)
-    result_filename: str = ""
+    result_filename: str = "result.jsonl"
     correctness_test: bool = False
     # This is only used for correctness test
     cut_len: int = 4
-    # Plotting args
-    graph_sql: str = (
-        "select run_name, batch_size, prefill_throughput from results where run_name='before'"
-    )
-    graph_filename: str = "out.png"
 
     @staticmethod
     def add_cli_args(parser: argparse.ArgumentParser):
@@ -108,11 +97,6 @@ class BenchArgs:
         )
         parser.add_argument("--correctness-test", action="store_true")
         parser.add_argument("--cut-len", type=int, default=BenchArgs.cut_len)
-        # graphing
-        parser.add_argument("--graph-sql", type=str, default=BenchArgs.graph_sql)
-        parser.add_argument(
-            "--graph-filename", type=str, default=BenchArgs.graph_filename
-        )
 
     @classmethod
     def from_cli_args(cls, args: argparse.Namespace):
@@ -220,7 +204,7 @@ def prepare_synthetic_inputs_for_latency_test(batch_size, input_len):
     return reqs
 
 
-@torch.
+@torch.no_grad
 def extend(reqs, model_runner):
     batch = ScheduleBatch.init_new(
         reqs=reqs,
@@ -237,7 +221,7 @@ def extend(reqs, model_runner):
     return next_token_ids, logits_output.next_token_logits, batch
 
 
-@torch.
+@torch.no_grad
 def decode(input_token_ids, batch, model_runner):
     batch.output_ids = input_token_ids
     batch.prepare_for_decode()
@@ -254,6 +238,7 @@ def correctness_test(
     bench_args,
     tp_rank,
 ):
+    # Configure the logger
     configure_logger(server_args, prefix=f" TP{tp_rank}")
     rank_print = print if tp_rank == 0 else lambda *args, **kwargs: None
 
@@ -274,7 +259,7 @@ def correctness_test(
         bench_args, input_ids, reqs, model_runner
     )
 
-    # Extend
+    # Extend (prefill w/ KV cache)
     next_token_ids, next_token_logits, batch = extend(reqs, model_runner)
     rank_print(f"prefill logits (final): {next_token_logits} \n")
 
@@ -286,7 +271,7 @@ def correctness_test(
     for i in range(len(reqs)):
         output_ids[i].append(next_token_ids_list[i])
 
-    # Print
+    # Print output texts
     for i in range(len(reqs)):
         rank_print(f"========== Prompt {i} ==========")
         rank_print(tokenizer.decode(output_ids[i]), "\n")
@@ -352,7 +337,7 @@ def latency_test_run_once(
         f"Decode. latency: {latency:6.5f} s, throughput: {throughput:9.2f} token/s"
     )
 
-    #
+    # Record decode timing from 2nd output
     if output_len > 1:
         med_decode_latency = np.median(decode_latencies)
         med_decode_throughput = batch_size / med_decode_latency
@@ -367,7 +352,7 @@ def latency_test_run_once(
         f"Total. latency: {tot_latency:6.3f} s, throughput: {throughput:9.2f} token/s"
     )
     measurement_results["total_latency"] = tot_latency
-    measurement_results["
+    measurement_results["overall_throughput"] = throughput
     return measurement_results
 
 
@@ -377,6 +362,7 @@ def latency_test(
     bench_args,
     tp_rank,
 ):
+    # Configure the logger
     configure_logger(server_args, prefix=f" TP{tp_rank}")
     rank_print = print if tp_rank == 0 else lambda *args, **kwargs: None
 
@@ -423,71 +409,9 @@ def latency_test(
 
     # Write results in jsonlines format on rank 0.
     if tp_rank == 0 and bench_args.result_filename:
-
-
-
-            f.write_all(result_list)
-
-
-def plot_latency_test(
-    server_args,
-    bench_args,
-    tp_rank,
-):
-    assert tp_rank == 0
-
-    # read the jsonl file and put in sqlite
-    df = pd.read_json(bench_args.result_filename, lines=True)
-    conn = sqlite3.connect(":memory:")
-    cur = conn.cursor()
-
-    # get the columns and their types
-    column_names = list(df.iloc[0].keys())
-    type_dict = {
-        str: "TEXT",
-        np.int64: "INTEGER",
-        np.float64: "FLOAT",
-    }
-    column_types = [type_dict[type(i)] for i in list(df.iloc[0])]
-
-    # create the table
-    cur.execute(
-        f"""
-        CREATE TABLE IF NOT EXISTS results (
-            {", ".join([f"{name} {type}" for name, type in zip(column_names, column_types)])}
-        )
-        """
-    )
-    conn.commit()
-
-    # write the results to DB
-    df.to_sql("results", conn, if_exists="replace", index=False)
-    conn.commit()
-
-    # read it back using sql
-    df = pd.read_sql_query(bench_args.graph_sql, conn)
-    conn.close()
-
-    # plot it and save to a file
-    import matplotlib.pyplot as plt
-
-    assert (
-        len(df.columns) == 3
-    ), f"The sql should have fetched <series, x, y> columns, not {df.columns}"
-    for label in df[df.columns[0]].unique():
-        q = f"{df.columns[0]}=='{label}'"
-        series = df.query(q)
-        plt.plot(series[df.columns[1]], series[df.columns[2]], label=q, marker="o")
-    plt.xlabel(df.columns[1])
-    plt.ylabel(df.columns[2])
-    plt.legend()
-    plt.savefig(bench_args.graph_filename, dpi=300)
-
-    # if in kitty, just dump it to the terminal
-    if os.environ["TERM"] == "xterm-kitty":
-        os.system(
-            f"kitty icat --use-window-size 1,1,600,600 {bench_args.graph_filename}"
-        )
+        with open(bench_args.result_filename, "a") as fout:
+            for result in result_list:
+                fout.write(json.dumps(result) + "\n")
 
 
 def main(server_args, bench_args):
@@ -498,9 +422,6 @@ def main(server_args, bench_args):
             work_func = correctness_test
         else:
            work_func = latency_test
-    elif os.path.isfile(bench_args.result_filename):
-        assert bench_args.graph_filename, "please provide a filename for the graph"
-        work_func = plot_latency_test
     else:
         raise ValueError(
             "Provide --model-path for running the tests or "
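Since the sqlite/matplotlib plotting path was dropped from this file, results now live only in the jsonl file written above. A small illustrative reader (not part of the diff, assuming a `result.jsonl` produced by a previous run; `overall_throughput` is the key renamed in this diff):

```python
# Illustrative only: read back results written by bench_one_batch.
import json

with open("result.jsonl") as f:  # default result_filename in the new BenchArgs
    results = [json.loads(line) for line in f]

for r in results:
    print(r.get("run_name"), r.get("batch_size"), r.get("overall_throughput"))
```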
sglang-0.3.5.post2/sglang/bench_server_latency.py → sglang-0.3.6/sglang/bench_one_batch_server.py
RENAMED

@@ -1,10 +1,10 @@
 """
-Benchmark the latency of
+Benchmark the latency of running a single batch with a server.
+
 This script launches a server and uses the HTTP interface.
-It accepts arguments
+It accepts server arguments (the same as launch_server.py) and benchmark arguments (e.g., batch size, input lengths).
 
 Usage:
-
 python3 -m sglang.bench_server_latency --model meta-llama/Meta-Llama-3.1-8B --batch-size 1 16 64 --input-len 1024 --output-len 8
 
 python3 -m sglang.bench_server_latency --model None --base-url http://localhost:30000 --batch-size 16 --input-len 1024 --output-len 8
{sglang-0.3.5.post2 → sglang-0.3.6}/sglang/bench_serving.py

@@ -15,6 +15,7 @@ import argparse
 import asyncio
 import json
 import os
+import pickle
 import random
 import resource
 import sys
@@ -387,6 +388,24 @@ async def async_request_gserver(
     raise NotImplementedError()
 
 
+async def async_request_profile(api_url: str) -> RequestFuncOutput:
+    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
+        output = RequestFuncOutput()
+        try:
+            async with session.post(url=api_url) as response:
+                if response.status == 200:
+                    output.success = True
+                else:
+                    output.error = response.reason or ""
+                    output.success = False
+        except Exception:
+            output.success = False
+            exc_info = sys.exc_info()
+            output.error = "".join(traceback.format_exception(*exc_info))
+
+    return output
+
+
 def get_model(pretrained_model_name_or_path: str) -> str:
     if os.getenv("SGLANG_USE_MODELSCOPE", "False").lower() == "true":
         import huggingface_hub.constants
@@ -682,6 +701,11 @@ def sample_generated_shared_prefix_requests(
     output_len: int,
     tokenizer: PreTrainedTokenizerBase,
 ) -> List[Tuple[str, int, int]]:
+    if args.generated_input_path and os.path.exists(args.generated_input_path):
+        print(f"\nloading generated input data from {args.generated_input_path}")
+        with open(args.generated_input_path, "rb") as f:
+            return pickle.load(f)
+
     """Generate benchmark requests with shared system prompts using random tokens."""
     # Generate system prompts for each group
     system_prompts = []
@@ -695,6 +719,9 @@ def sample_generated_shared_prefix_requests(
         question = gen_prompt(tokenizer, question_len)
         questions.append(question)
 
+    # Shuffle questions
+    random.shuffle(questions)
+
     # Combine system prompts with questions
     input_requests = []
     total_input_tokens = 0
@@ -723,6 +750,11 @@ def sample_generated_shared_prefix_requests(
     print(
         f"Average question length: {sum(len(tokenizer.encode(q)) for q in questions) / len(questions):.1f} tokens\n"
     )
+    if args.generated_input_save_path:
+        print(f"Saving generated input data to {args.generated_input_save_path}")
+        os.makedirs(os.path.dirname(args.generated_input_save_path), exist_ok=True)
+        with open(args.generated_input_save_path, "wb") as f:
+            pickle.dump(input_requests, f)
 
     return input_requests
 
@@ -822,12 +854,14 @@ def calculate_metrics(
 async def benchmark(
     backend: str,
     api_url: str,
+    base_url: str,
     model_id: str,
     tokenizer: PreTrainedTokenizerBase,
     input_requests: List[Tuple[str, int, int]],
     request_rate: float,
     disable_tqdm: bool,
     extra_request_body: Dict[str, Any],
+    profile: bool,
 ):
     if backend in ASYNC_REQUEST_FUNCS:
         request_func = ASYNC_REQUEST_FUNCS[backend]
@@ -855,6 +889,14 @@ async def benchmark(
 
     time.sleep(1.5)
 
+    if profile:
+        print("Starting profiler...")
+        profile_output = await async_request_profile(
+            api_url=base_url + "/start_profile"
+        )
+        if profile_output.success:
+            print("Profiler started")
+
     pbar = None if disable_tqdm else tqdm(total=len(input_requests))
 
     benchmark_start_time = time.perf_counter()
@@ -876,6 +918,12 @@ async def benchmark(
     )
     outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks)
 
+    if profile:
+        print("Stopping profiler...")
+        profile_output = await async_request_profile(api_url=base_url + "/stop_profile")
+        if profile_output.success:
+            print("Profiler stopped")
+
     if pbar is not None:
         pbar.close()
 
@@ -1100,6 +1148,9 @@ def run_benchmark(args_: argparse.Namespace):
         if args.base_url
         else f"http://{args.host}:{args.port}/v1/models/model:predict"
     )
+    base_url = (
+        f"http://{args.host}:{args.port}" if args.base_url is None else args.base_url
+    )
 
     # Get model name
     if args.model is None:
@@ -1145,12 +1196,14 @@ def run_benchmark(args_: argparse.Namespace):
             benchmark(
                 backend=backend,
                 api_url=api_url,
+                base_url=base_url,
                 model_id=model_id,
                 tokenizer=tokenizer,
                 input_requests=input_requests,
                 request_rate=args.request_rate,
                 disable_tqdm=args.disable_tqdm,
                 extra_request_body=extra_request_body,
+                profile=args.profile,
             )
         )
     else:
@@ -1162,12 +1215,14 @@ def run_benchmark(args_: argparse.Namespace):
                 benchmark(
                     backend=backend,
                     api_url=api_url,
+                    base_url=base_url,
                     model_id=model_id,
                     tokenizer=tokenizer,
                     input_requests=input_requests,
                     request_rate=rate,
                     disable_tqdm=args.disable_tqdm,
                     extra_request_body=extra_request_body,
+                    profile=args.profile,
                 )
             )
 
@@ -1331,6 +1386,21 @@ if __name__ == "__main__":
         default=256,
         help="Target length in tokens for outputs in generated-shared-prefix dataset",
     )
-
+    parser.add_argument(
+        "--generated-input-save-path",
+        type=str,
+        help="Path to save generated input data",
+    )
+    parser.add_argument(
+        "--generated-input-path",
+        type=str,
+        help="Path to load previously generated input data",
+    )
+    parser.add_argument(
+        "--profile",
+        action="store_true",
+        help="Use Torch Profiler. The endpoint must be launched with "
+        "SGLANG_TORCH_PROFILER_DIR to enable profiler.",
+    )
     args = parser.parse_args()
     run_benchmark(args)
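The new `--profile` flag drives the server's `/start_profile` and `/stop_profile` endpoints through `async_request_profile` above. A self-contained sketch of the same calls (illustrative only; assumes a local server launched with `SGLANG_TORCH_PROFILER_DIR` set and listening on `http://localhost:30000`):

```python
# Illustrative client for the profiler endpoints used by --profile.
import asyncio

import aiohttp


async def toggle_profiler(base_url: str) -> None:
    async with aiohttp.ClientSession() as session:
        async with session.post(base_url + "/start_profile") as resp:
            print("start_profile:", resp.status)
        # ... send benchmark traffic here ...
        async with session.post(base_url + "/stop_profile") as resp:
            print("stop_profile:", resp.status)


asyncio.run(toggle_profiler("http://localhost:30000"))
```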
{sglang-0.3.5.post2 → sglang-0.3.6}/sglang/check_env.py

@@ -15,24 +15,21 @@ PACKAGE_LIST = [
     "flashinfer",
     "triton",
     "transformers",
-    "
-    "tqdm",
+    "torchao",
     "numpy",
     "aiohttp",
     "fastapi",
     "hf_transfer",
     "huggingface_hub",
     "interegular",
-    "packaging",
-    "PIL",
     "psutil",
     "pydantic",
+    "multipart",
+    "zmq",
     "uvicorn",
     "uvloop",
-    "zmq",
     "vllm",
     "outlines",
-    "multipart",
     "openai",
     "tiktoken",
     "anthropic",