sglang 0.3.5__tar.gz → 0.3.5.post2__tar.gz
This diff compares the contents of two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in their respective public registries.
- {sglang-0.3.5 → sglang-0.3.5.post2}/PKG-INFO +12 -8
- {sglang-0.3.5 → sglang-0.3.5.post2}/README.md +8 -5
- {sglang-0.3.5 → sglang-0.3.5.post2}/pyproject.toml +5 -4
- sglang-0.3.5.post2/sglang/bench_offline_throughput.py +309 -0
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/bench_serving.py +148 -24
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/configs/model_config.py +5 -2
- sglang-0.3.5.post2/sglang/srt/constrained/__init__.py +17 -0
- sglang-0.3.5.post2/sglang/srt/constrained/base_grammar_backend.py +73 -0
- sglang-0.3.5.post2/sglang/srt/constrained/outlines_backend.py +165 -0
- sglang-0.3.5.post2/sglang/srt/constrained/outlines_jump_forward.py +182 -0
- sglang-0.3.5.post2/sglang/srt/constrained/xgrammar_backend.py +150 -0
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/layers/attention/triton_ops/decode_attention.py +7 -0
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/layers/attention/triton_ops/extend_attention.py +6 -0
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/layers/fused_moe/fused_moe.py +23 -7
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/layers/fused_moe/patch.py +4 -2
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/layers/quantization/base_config.py +4 -6
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/layers/vocab_parallel_embedding.py +216 -150
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/managers/detokenizer_manager.py +0 -14
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/managers/io_struct.py +5 -3
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/managers/schedule_batch.py +14 -20
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/managers/scheduler.py +159 -96
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/managers/tokenizer_manager.py +81 -17
- sglang-0.3.5.post2/sglang/srt/metrics/collector.py +211 -0
- sglang-0.3.5.post2/sglang/srt/metrics/func_timer.py +108 -0
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/mm_utils.py +1 -1
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/model_executor/cuda_graph_runner.py +2 -2
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/model_executor/forward_batch_info.py +7 -3
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/model_executor/model_runner.py +6 -2
- sglang-0.3.5.post2/sglang/srt/models/gemma2_reward.py +69 -0
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/models/gpt2.py +31 -37
- sglang-0.3.5.post2/sglang/srt/models/internlm2_reward.py +62 -0
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/models/llama.py +11 -6
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/models/llama_reward.py +5 -26
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/models/olmo.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/models/qwen2_vl.py +5 -7
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/openai_api/adapter.py +11 -4
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/openai_api/protocol.py +29 -26
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/sampling/sampling_batch_info.py +2 -3
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/sampling/sampling_params.py +2 -16
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/server.py +60 -17
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/server_args.py +66 -25
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/utils.py +120 -0
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/test/simple_eval_common.py +1 -1
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/test/simple_eval_humaneval.py +2 -2
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/test/simple_eval_mgsm.py +2 -2
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/test/test_utils.py +21 -7
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/utils.py +1 -0
- sglang-0.3.5.post2/sglang/version.py +1 -0
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang.egg-info/PKG-INFO +12 -8
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang.egg-info/SOURCES.txt +9 -5
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang.egg-info/requires.txt +3 -2
- sglang-0.3.5/sglang/srt/constrained/__init__.py +0 -81
- sglang-0.3.5/sglang/srt/constrained/base_tool_cache.py +0 -65
- sglang-0.3.5/sglang/srt/constrained/bnf_cache.py +0 -61
- sglang-0.3.5/sglang/srt/constrained/fsm_cache.py +0 -95
- sglang-0.3.5/sglang/srt/constrained/grammar.py +0 -190
- sglang-0.3.5/sglang/srt/constrained/jump_forward.py +0 -203
- sglang-0.3.5/sglang/version.py +0 -1
- {sglang-0.3.5 → sglang-0.3.5.post2}/LICENSE +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post2}/setup.cfg +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/__init__.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/api.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/bench_latency.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/bench_server_latency.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/check_env.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/global_config.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/lang/__init__.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/lang/backend/__init__.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/lang/backend/anthropic.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/lang/backend/base_backend.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/lang/backend/litellm.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/lang/backend/openai.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/lang/backend/runtime_endpoint.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/lang/backend/vertexai.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/lang/chat_template.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/lang/choices.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/lang/compiler.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/lang/interpreter.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/lang/ir.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/lang/tracer.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/launch_server.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/launch_server_llavavid.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/configs/__init__.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/configs/exaone.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/configs/qwen2vl.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/conversation.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/hf_transformers_utils.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/layers/activation.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/layers/attention/__init__.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/layers/attention/double_sparsity_backend.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/layers/attention/flashinfer_backend.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/layers/attention/triton_backend.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/layers/attention/triton_ops/prefill_attention.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/layers/fused_moe/__init__.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/layers/fused_moe/layer.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/layers/layernorm.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/layers/linear.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/layers/logits_processor.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/layers/pooler.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/layers/quantization/__init__.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/layers/radix_attention.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/layers/rotary_embedding.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/layers/sampler.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/layers/torchao_utils.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/lora/lora.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/lora/lora_config.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/lora/lora_manager.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/managers/data_parallel_controller.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/managers/image_processor.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/managers/schedule_policy.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/managers/tp_worker.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/managers/tp_worker_overlap_thread.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/mem_cache/base_prefix_cache.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/mem_cache/chunk_cache.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/mem_cache/flush_cache.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/mem_cache/memory_pool.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/mem_cache/radix_cache.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/models/baichuan.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/models/chatglm.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/models/commandr.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/models/dbrx.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/models/deepseek.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/models/deepseek_v2.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/models/exaone.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/models/gemma.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/models/gemma2.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/models/gpt_bigcode.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/models/grok.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/models/internlm2.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/models/llama_classification.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/models/llama_embedding.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/models/llava.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/models/llavavid.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/models/minicpm.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/models/minicpm3.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/models/mistral.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/models/mixtral.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/models/mixtral_quant.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/models/mllama.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/models/olmoe.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/models/qwen.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/models/qwen2.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/models/qwen2_moe.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/models/stablelm.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/models/torch_native_llama.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/models/xverse.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/models/xverse_moe.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/models/yivl.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/sampling/penaltylib/__init__.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/sampling/penaltylib/orchestrator.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/test/few_shot_gsm8k.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/test/few_shot_gsm8k_engine.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/test/run_eval.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/test/runners.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/test/simple_eval_gpqa.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/test/simple_eval_math.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/test/simple_eval_mmlu.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/test/srt/sampling/penaltylib/utils.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/test/test_activation.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/test/test_layernorm.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang/test/test_programs.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang.egg-info/dependency_links.txt +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post2}/sglang.egg-info/top_level.txt +0 -0
--- sglang-0.3.5/PKG-INFO
+++ sglang-0.3.5.post2/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.3.5
+Version: 0.3.5.post2
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
                                 Version 2.0, January 2004
@@ -225,14 +225,15 @@ Requires-Dist: interegular; extra == "runtime-common"
 Requires-Dist: orjson; extra == "runtime-common"
 Requires-Dist: packaging; extra == "runtime-common"
 Requires-Dist: pillow; extra == "runtime-common"
+Requires-Dist: prometheus-client>=0.20.0; extra == "runtime-common"
 Requires-Dist: psutil; extra == "runtime-common"
 Requires-Dist: pydantic; extra == "runtime-common"
 Requires-Dist: python-multipart; extra == "runtime-common"
 Requires-Dist: torchao; extra == "runtime-common"
 Requires-Dist: uvicorn; extra == "runtime-common"
 Requires-Dist: uvloop; extra == "runtime-common"
-Requires-Dist: zmq; extra == "runtime-common"
-Requires-Dist: outlines>=0.0.44; extra == "runtime-common"
+Requires-Dist: pyzmq>=25.1.2; extra == "runtime-common"
+Requires-Dist: outlines<0.1.0,>=0.0.44; extra == "runtime-common"
 Requires-Dist: modelscope; extra == "runtime-common"
 Provides-Extra: srt
 Requires-Dist: sglang[runtime_common]; extra == "srt"
@@ -291,13 +292,14 @@ Requires-Dist: sglang[test]; extra == "dev-xpu"
 [![license](https://img.shields.io/github/license/sgl-project/sglang.svg)](https://github.com/sgl-project/sglang/tree/main/LICENSE)
 [![issue resolution](https://img.shields.io/github/issues-closed-raw/sgl-project/sglang)](https://github.com/sgl-project/sglang/issues)
 [![open issues](https://img.shields.io/github/issues-raw/sgl-project/sglang)](https://github.com/sgl-project/sglang/issues)
+[![](https://img.shields.io/badge/Gurubase-(experimental)-006BFF)](https://gurubase.io/g/sglang)
 
 </div>
 
 --------------------------------------------------------------------------------
 
-| [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Documentation**](https://sgl-project.github.io/) | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-
-[**Join Bi-Weekly Development Meeting**](https://docs.google.com/document/d/1xEow4eIM152xNcRxqZz9VEcOiTQo8-CEuuQ5qTmkt-E/edit?usp=sharing) |
+| [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Documentation**](https://sgl-project.github.io/) | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2tmmp6flg-89dOlJW2TjnBrTRk1I_~GA) |
+[**Join Bi-Weekly Development Meeting**](https://docs.google.com/document/d/1xEow4eIM152xNcRxqZz9VEcOiTQo8-CEuuQ5qTmkt-E/edit?usp=sharing) | [**Slides**](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#slides) |
 
 ## News
 - [2024/10] 🔥 The First SGLang Online Meetup ([slides](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#the-first-sglang-online-meetup)).
@@ -321,11 +323,13 @@ The core features include:
 
 - **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, continuous batching, token attention (paged attention), tensor parallelism, FlashInfer kernels, chunked prefill, and quantization (INT4/FP8/AWQ/GPTQ).
 - **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
-- **Extensive Model Support**: Supports a wide range of generative models (Llama, Gemma, Mistral, QWen, DeepSeek, LLaVA, etc.)
+- **Extensive Model Support**: Supports a wide range of generative models (Llama, Gemma, Mistral, QWen, DeepSeek, LLaVA, etc.), embedding models (e5-mistral, gte) and reward models (Skywork), with easy extensibility for integrating new models.
 - **Active Community**: SGLang is open-source and backed by an active community with industry adoption.
 
-## Install
-See [https://sgl-project.github.io/start/install.html](https://sgl-project.github.io/start/install.html)
+## Getting Started
+Install SGLang: See [https://sgl-project.github.io/start/install.html](https://sgl-project.github.io/start/install.html)
+
+Send requests: See [https://sgl-project.github.io/start/send_request.html](https://sgl-project.github.io/start/send_request.html)
 
 ## Backend: SGLang Runtime (SRT)
 See [https://sgl-project.github.io/backend/backend.html](https://sgl-project.github.io/backend/backend.html)
--- sglang-0.3.5/README.md
+++ sglang-0.3.5.post2/README.md
@@ -6,13 +6,14 @@
 [![license](https://img.shields.io/github/license/sgl-project/sglang.svg)](https://github.com/sgl-project/sglang/tree/main/LICENSE)
 [![issue resolution](https://img.shields.io/github/issues-closed-raw/sgl-project/sglang)](https://github.com/sgl-project/sglang/issues)
 [![open issues](https://img.shields.io/github/issues-raw/sgl-project/sglang)](https://github.com/sgl-project/sglang/issues)
+[![](https://img.shields.io/badge/Gurubase-(experimental)-006BFF)](https://gurubase.io/g/sglang)
 
 </div>
 
 --------------------------------------------------------------------------------
 
-| [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Documentation**](https://sgl-project.github.io/) | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-
-[**Join Bi-Weekly Development Meeting**](https://docs.google.com/document/d/1xEow4eIM152xNcRxqZz9VEcOiTQo8-CEuuQ5qTmkt-E/edit?usp=sharing) |
+| [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Documentation**](https://sgl-project.github.io/) | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2tmmp6flg-89dOlJW2TjnBrTRk1I_~GA) |
+[**Join Bi-Weekly Development Meeting**](https://docs.google.com/document/d/1xEow4eIM152xNcRxqZz9VEcOiTQo8-CEuuQ5qTmkt-E/edit?usp=sharing) | [**Slides**](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#slides) |
 
 ## News
 - [2024/10] 🔥 The First SGLang Online Meetup ([slides](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#the-first-sglang-online-meetup)).
@@ -36,11 +37,13 @@ The core features include:
 
 - **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, continuous batching, token attention (paged attention), tensor parallelism, FlashInfer kernels, chunked prefill, and quantization (INT4/FP8/AWQ/GPTQ).
 - **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
-- **Extensive Model Support**: Supports a wide range of generative models (Llama, Gemma, Mistral, QWen, DeepSeek, LLaVA, etc.)
+- **Extensive Model Support**: Supports a wide range of generative models (Llama, Gemma, Mistral, QWen, DeepSeek, LLaVA, etc.), embedding models (e5-mistral, gte) and reward models (Skywork), with easy extensibility for integrating new models.
 - **Active Community**: SGLang is open-source and backed by an active community with industry adoption.
 
-## Install
-See [https://sgl-project.github.io/start/install.html](https://sgl-project.github.io/start/install.html)
+## Getting Started
+Install SGLang: See [https://sgl-project.github.io/start/install.html](https://sgl-project.github.io/start/install.html)
+
+Send requests: See [https://sgl-project.github.io/start/send_request.html](https://sgl-project.github.io/start/send_request.html)
 
 ## Backend: SGLang Runtime (SRT)
 See [https://sgl-project.github.io/backend/backend.html](https://sgl-project.github.io/backend/backend.html)
--- sglang-0.3.5/pyproject.toml
+++ sglang-0.3.5.post2/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "sglang"
-version = "0.3.5"
+version = "0.3.5.post2"
 description = "SGLang is yet another fast serving framework for large language models and vision language models."
 readme = "README.md"
 requires-python = ">=3.8"
@@ -17,10 +17,11 @@ dependencies = ["requests", "tqdm", "numpy", "IPython"]
 
 [project.optional-dependencies]
 runtime_common = ["aiohttp", "decord", "fastapi", "hf_transfer", "huggingface_hub", "interegular",
-    "orjson", "packaging", "pillow", "psutil", "pydantic", "python-multipart",
-    "torchao", "uvicorn", "uvloop", "zmq",
-    "outlines>=0.0.44", "modelscope"]
+    "orjson", "packaging", "pillow", "prometheus-client>=0.20.0", "psutil", "pydantic", "python-multipart",
+    "torchao", "uvicorn", "uvloop", "pyzmq>=25.1.2",
+    "outlines>=0.0.44,<0.1.0", "modelscope"]
 srt = ["sglang[runtime_common]", "torch", "vllm==0.6.3.post1"]
+
 # HIP (Heterogeneous-computing Interface for Portability) for AMD
 # => base docker rocm/vllm-dev:20241022, not from public vllm whl
 srt_hip = ["sglang[runtime_common]", "torch", "vllm==0.6.3.dev13"]
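The dependency changes above are the packaging heart of this release: prometheus-client>=0.20.0 backs the new sglang/srt/metrics module, the bare zmq entry becomes pyzmq>=25.1.2, and outlines gains an upper bound (presumably to stay on the pre-0.1.0 API that the new outlines_backend.py targets). A minimal sketch for checking a local environment against the tightened pins, using importlib.metadata from the standard library plus packaging (itself already a runtime dependency); the requirement strings are copied from the diff above:

    # Sketch only: verify installed versions against the pins from this release.
    from importlib.metadata import PackageNotFoundError, version

    from packaging.specifiers import SpecifierSet

    PINS = {
        "prometheus-client": ">=0.20.0",
        "pyzmq": ">=25.1.2",
        "outlines": ">=0.0.44,<0.1.0",
    }

    for name, spec in PINS.items():
        try:
            installed = version(name)
        except PackageNotFoundError:
            print(f"{name}: not installed")
            continue
        ok = installed in SpecifierSet(spec)
        print(f"{name} {installed}: {'ok' if ok else 'violates ' + spec}")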
--- /dev/null
+++ sglang-0.3.5.post2/sglang/bench_offline_throughput.py
@@ -0,0 +1,309 @@
+"""
+Benchmark the throughput of using the offline LLM engine.
+This script does not launch a server.
+It accepts server arguments (the same as launch_server.py) and benchmark arguments (the same as bench_serving.py).
+
+# Usage
+## Sharegpt dataset with default args
+python -m sglang.bench_offline_throughput --model-path meta-llama/Meta-Llama-3.1-8B-Instruct
+
+## Random dataset with default args
+python -m sglang.bench_offline_throughput --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --dataset-name random
+
+## Shared prefix dataset with default args
+python -m sglang.bench_offline_throughput --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --dataset-name generated-shared-prefix
+
+## Sharegpt dataset on runtime backend
+python -m sglang.bench_offline_throughput --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --backend runtime
+"""
+
+import argparse
+import dataclasses
+import json
+import logging
+import random
+import time
+from typing import List, Optional, Tuple
+
+import numpy as np
+
+from sglang.api import Engine
+from sglang.bench_serving import (
+    get_dataset,
+    get_tokenizer,
+    sample_random_requests,
+    set_ulimit,
+)
+from sglang.srt.server import Runtime
+from sglang.srt.server_args import ServerArgs
+
+
+@dataclasses.dataclass
+class BenchArgs:
+    backend: str = "engine"
+    result_filename: str = ""
+    dataset_name: str = "sharegpt"
+    dataset_path: str = ""
+    num_prompts: int = 1000
+    sharegpt_output_len: Optional[int] = None
+    random_input_len: int = 1024
+    random_output_len: int = 1024
+    random_range_ratio: float = 0.0
+    gen_num_groups: int = 64
+    gen_prompts_per_group: int = 16
+    gen_system_prompt_len: int = 2048
+    gen_question_len: int = 128
+    gen_output_len: int = 256
+    disable_ignore_eos: bool = False
+    seed: int = 1
+
+    @staticmethod
+    def add_cli_args(parser: argparse.ArgumentParser):
+        parser.add_argument("--backend", type=str, default=BenchArgs.backend)
+        parser.add_argument(
+            "--result-filename", type=str, default=BenchArgs.result_filename
+        )
+        parser.add_argument(
+            "--dataset-name",
+            type=str,
+            default="sharegpt",
+            choices=["sharegpt", "random", "generated-shared-prefix"],
+            help="Name of the dataset to benchmark on.",
+        )
+        parser.add_argument(
+            "--dataset-path", type=str, default="", help="Path to the dataset."
+        )
+        parser.add_argument(
+            "--num-prompts",
+            type=int,
+            default=BenchArgs.num_prompts,
+            help="Number of prompts to process. Default is 1000.",
+        )
+        parser.add_argument(
+            "--sharegpt-output-len",
+            type=int,
+            default=BenchArgs.sharegpt_output_len,
+            help="Output length for each request. Overrides the output length from the ShareGPT dataset.",
+        )
+        parser.add_argument(
+            "--random-input-len",
+            type=int,
+            default=BenchArgs.random_input_len,
+            help="Number of input tokens per request, used only for random dataset.",
+        )
+        parser.add_argument(
+            "--random-output-len",
+            type=int,
+            default=BenchArgs.random_output_len,
+            help="Number of output tokens per request, used only for random dataset.",
+        )
+        parser.add_argument(
+            "--random-range-ratio",
+            type=float,
+            default=BenchArgs.random_range_ratio,
+            help="Range of sampled ratio of input/output length, "
+            "used only for random dataset.",
+        )
+        parser.add_argument(
+            "--gen-num-groups",
+            type=int,
+            default=BenchArgs.gen_num_groups,
+            help="Number of groups with shared prefix, used"
+            "only for generate-shared-prefix",
+        )
+        parser.add_argument(
+            "--gen-prompts-per-group",
+            type=int,
+            default=BenchArgs.gen_prompts_per_group,
+            help="Number of prompts per group of shared prefix, used"
+            "only for generate-shared-prefix",
+        )
+        parser.add_argument(
+            "--gen-system-prompt-len",
+            type=int,
+            default=BenchArgs.gen_system_prompt_len,
+            help="System prompt length, used" "only for generate-shared-prefix",
+        )
+        parser.add_argument(
+            "--gen-question-len",
+            type=int,
+            default=BenchArgs.gen_question_len,
+            help="Question length, used" "only for generate-shared-prefix",
+        )
+        parser.add_argument(
+            "--gen-output-len",
+            type=int,
+            default=BenchArgs.gen_output_len,
+            help="Target length in tokens for outputs in generated-shared-prefix dataset",
+        )
+        parser.add_argument(
+            "--disable-ignore-eos",
+            type=bool,
+            default=BenchArgs.disable_ignore_eos,
+            help="Disable ignore EOS token",
+        )
+        parser.add_argument("--seed", type=int, default=1, help="The random seed.")
+
+    @classmethod
+    def from_cli_args(cls, args: argparse.Namespace):
+        attrs = [attr.name for attr in dataclasses.fields(cls)]
+        return cls(**{attr: getattr(args, attr) for attr in attrs})
+
+
+def throughput_test_once(
+    backend_name: str,
+    backend,
+    reqs: List[Tuple[str, int, int]],
+    ignore_eos: bool,
+):
+    measurement_results = {
+        "backend": backend_name,
+        "successful_requests": len(reqs),
+        "total_latency": -1,
+        "total_input_tokens": sum(r[1] for r in reqs),
+        "total_output_tokens": -1,
+        "request_throughput": -1,
+        "input_throughput": -1,
+        "output_throughput": -1,
+        "total_throughput": -1,
+    }
+
+    prompt = [r[0] for r in reqs]
+    sampling_params = [
+        {
+            "temperature": 0,
+            "max_new_tokens": r[2],
+            "ignore_eos": ignore_eos,
+        }
+        for r in reqs
+    ]
+
+    st = time.perf_counter()
+    gen_out = backend.generate(prompt=prompt, sampling_params=sampling_params)
+    latency = time.perf_counter() - st
+
+    if backend_name == "runtime":
+        gen_out = json.loads(gen_out)
+
+    measurement_results["total_latency"] = latency
+    measurement_results["total_output_tokens"] = sum(
+        o["meta_info"]["completion_tokens"] for o in gen_out
+    )
+    measurement_results["request_throughput"] = (
+        measurement_results["successful_requests"] / latency
+    )
+    measurement_results["input_throughput"] = (
+        measurement_results["total_input_tokens"] / latency
+    )
+    measurement_results["output_throughput"] = (
+        measurement_results["total_output_tokens"] / latency
+    )
+    measurement_results["total_throughput"] = (
+        measurement_results["total_input_tokens"]
+        + measurement_results["total_output_tokens"]
+    ) / latency
+
+    return measurement_results
+
+
+def throughput_test(
+    server_args: ServerArgs,
+    bench_args: BenchArgs,
+):
+    if bench_args.backend == "engine":
+        backend = Engine(**dataclasses.asdict(server_args))
+        if not backend:
+            raise ValueError("Please provide valid engine arguments")
+    elif bench_args.backend == "runtime":
+        backend = Runtime(**dataclasses.asdict(server_args))
+    else:
+        raise ValueError('Please set backend to either "engine" or "runtime"')
+
+    tokenizer_id = server_args.model_path
+    tokenizer = get_tokenizer(tokenizer_id)
+
+    # Set global environmnets
+    set_ulimit()
+    random.seed(bench_args.seed)
+    np.random.seed(bench_args.seed)
+
+    # Read dataset
+    input_requests = get_dataset(bench_args, tokenizer)
+
+    warmup_requests = sample_random_requests(
+        input_len=20,
+        output_len=4,
+        num_prompts=2,
+        range_ratio=0.8,
+        tokenizer=tokenizer,
+        dataset_path=bench_args.dataset_path,
+    )
+
+    # Warm up
+    throughput_test_once(
+        backend_name=bench_args.backend,
+        backend=backend,
+        reqs=warmup_requests,
+        ignore_eos=not bench_args.disable_ignore_eos,
+    )
+
+    result = throughput_test_once(
+        backend_name=bench_args.backend,
+        backend=backend,
+        reqs=input_requests,
+        ignore_eos=not bench_args.disable_ignore_eos,
+    )
+
+    if bench_args.result_filename:
+        with open(bench_args.result_filename, "a") as fout:
+            fout.write(json.dumps(result) + "\n")
+
+    print(
+        "\n{s:{c}^{n}}".format(s=" Offline Throughput Benchmark Result ", n=50, c="=")
+    )
+    print("{:<40} {:<10}".format("Backend:", result["backend"]))
+    print("{:<40} {:<10}".format("Successful requests:", result["successful_requests"]))
+    print("{:<40} {:<10.2f}".format("Benchmark duration (s):", result["total_latency"]))
+    print("{:<40} {:<10}".format("Total input tokens:", result["total_input_tokens"]))
+    print(
+        "{:<40} {:<10}".format("Total generated tokens:", result["total_output_tokens"])
+    )
+    print(
+        "{:<40} {:<10.2f}".format(
+            "Request throughput (req/s):", result["request_throughput"]
+        )
+    )
+    print(
+        "{:<40} {:<10.2f}".format(
+            "Input token throughput (tok/s):", result["input_throughput"]
+        )
+    )
+    print(
+        "{:<40} {:<10.2f}".format(
+            "Output token throughput (tok/s):", result["output_throughput"]
+        )
+    )
+    print(
+        "{:<40} {:<10.2f}".format(
+            "Total token throughput (tok/s):", result["total_throughput"]
+        )
+    )
+    print("=" * 50)
+
+    return result
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    ServerArgs.add_cli_args(parser)
+    BenchArgs.add_cli_args(parser)
+    args = parser.parse_args()
+    server_args = ServerArgs.from_cli_args(args)
+    bench_args = BenchArgs.from_cli_args(args)
+
+    logging.basicConfig(
+        level=getattr(logging, server_args.log_level.upper()),
+        format="%(message)s",
+    )
+
+    throughput_test(server_args, bench_args)
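Besides the CLI entry point shown in the module docstring, the new benchmark is importable. A minimal sketch of driving it from Python, assuming a GPU host and a locally cached copy of the model used in the docstring examples; all field names come from the BenchArgs and ServerArgs definitions above:

    # Sketch only: run the offline throughput benchmark in-process.
    from sglang.bench_offline_throughput import BenchArgs, throughput_test
    from sglang.srt.server_args import ServerArgs

    server_args = ServerArgs(model_path="meta-llama/Meta-Llama-3.1-8B-Instruct")
    bench_args = BenchArgs(
        backend="engine",  # in-process engine; "runtime" spawns a server subprocess
        dataset_name="random",
        num_prompts=64,
        random_input_len=256,
        random_output_len=64,
    )

    result = throughput_test(server_args, bench_args)  # also prints the summary table
    print(result["output_throughput"], "output tok/s")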
--- sglang-0.3.5/sglang/bench_serving.py
+++ sglang-0.3.5.post2/sglang/bench_serving.py
@@ -421,6 +421,37 @@ def get_tokenizer(
     )
 
 
+def get_dataset(args, tokenizer):
+    if args.dataset_name == "sharegpt":
+        input_requests = sample_sharegpt_requests(
+            dataset_path=args.dataset_path,
+            num_requests=args.num_prompts,
+            tokenizer=tokenizer,
+            fixed_output_len=args.sharegpt_output_len,
+        )
+    elif args.dataset_name == "random":
+        input_requests = sample_random_requests(
+            input_len=args.random_input_len,
+            output_len=args.random_output_len,
+            num_prompts=args.num_prompts,
+            range_ratio=args.random_range_ratio,
+            tokenizer=tokenizer,
+            dataset_path=args.dataset_path,
+        )
+    elif args.dataset_name == "generated-shared-prefix":
+        input_requests = sample_generated_shared_prefix_requests(
+            num_groups=args.gen_num_groups,
+            prompts_per_group=args.gen_prompts_per_group,
+            system_prompt_len=args.gen_system_prompt_len,
+            question_len=args.gen_question_len,
+            output_len=args.gen_output_len,
+            tokenizer=tokenizer,
+        )
+    else:
+        raise ValueError(f"Unknown dataset: {args.dataset_name}")
+    return input_requests
+
+
 ASYNC_REQUEST_FUNCS = {
     "sglang": async_request_sglang_generate,
     "sglang-native": async_request_sglang_generate,
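get_dataset dispatches on args.dataset_name and reads only the fields the chosen dataset needs, which is what lets both this script's argparse namespace and bench_offline_throughput.py's BenchArgs dataclass pass through it. A small sketch of the "random" path with a bare namespace (hypothetical values; note that sample_random_requests draws its prompt text from ShareGPT, downloading the file when dataset_path is empty):

    # Sketch only: the fields below are the ones the "random" branch reads.
    from types import SimpleNamespace

    from sglang.bench_serving import get_dataset, get_tokenizer

    args = SimpleNamespace(
        dataset_name="random",
        dataset_path="",  # empty -> the ShareGPT file is downloaded and cached
        num_prompts=8,
        random_input_len=128,
        random_output_len=32,
        random_range_ratio=0.0,
    )
    tokenizer = get_tokenizer("meta-llama/Meta-Llama-3.1-8B-Instruct")
    requests = get_dataset(args, tokenizer)  # list of (prompt, input_len, output_len)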
@@ -443,6 +474,8 @@ class BenchmarkMetrics:
     input_throughput: float
     output_throughput: float
     output_throughput_retokenized: float
+    total_throughput: float
+    total_throughput_retokenized: float
     mean_ttft_ms: float
     median_ttft_ms: float
     std_ttft_ms: float
@@ -590,18 +623,25 @@ def sample_random_requests(
             (data["conversations"][0]["value"], data["conversations"][1]["value"])
             for data in dataset
         ]
-
         # Shuffle the dataset.
         random.shuffle(dataset)
 
         # Filter out sequences that are too long or too short
         input_requests: List[Tuple[str, int, int]] = []
-        for i in range(num_prompts):
+        for data in dataset:
+            i = len(input_requests)
+            if i == num_prompts:
+                break
+
             # Tokenize the prompts and completions.
-            prompt = dataset[i][0]
+            prompt = data[0]
             prompt_token_ids = tokenizer.encode(prompt)
             prompt_len = len(prompt_token_ids)
 
+            # Skip empty prompt
+            if prompt_len == 0:
+                continue
+
             if prompt_len > input_lens[i]:
                 input_ids = prompt_token_ids[: input_lens[i]]
             else:
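The loop rewrite changes the selection semantics: instead of taking exactly num_prompts iterations over the shuffled data, the new code scans until it has collected num_prompts usable requests, and prompts that tokenize to zero tokens no longer occupy a slot. A toy restatement of the pattern (hypothetical names, not the benchmark code itself):

    # Toy restatement: collect until `limit` non-empty prompts are gathered.
    def pick_prompts(pairs, limit, encode):
        picked = []
        for prompt, _completion in pairs:
            if len(picked) == limit:
                break
            token_ids = encode(prompt)
            if len(token_ids) == 0:  # skip empty prompt, keep scanning
                continue
            picked.append((prompt, len(token_ids)))
        return picked

    # The empty first prompt is skipped, yet two usable requests are returned.
    print(pick_prompts([("", ""), ("hi there", "x"), ("how are you", "y")], 2, str.split))
    # -> [('hi there', 2), ('how are you', 3)]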
@@ -627,6 +667,66 @@ def sample_random_requests(
     return input_requests
 
 
+def gen_prompt(tokenizer, token_num):
+    """Generate a random prompt of specified token length using tokenizer vocabulary."""
+    all_available_tokens = list(tokenizer.get_vocab().values())
+    selected_tokens = random.choices(all_available_tokens, k=token_num)
+    return tokenizer.decode(selected_tokens)
+
+
+def sample_generated_shared_prefix_requests(
+    num_groups: int,
+    prompts_per_group: int,
+    system_prompt_len: int,
+    question_len: int,
+    output_len: int,
+    tokenizer: PreTrainedTokenizerBase,
+) -> List[Tuple[str, int, int]]:
+    """Generate benchmark requests with shared system prompts using random tokens."""
+    # Generate system prompts for each group
+    system_prompts = []
+    for _ in range(num_groups):
+        system_prompt = gen_prompt(tokenizer, system_prompt_len)
+        system_prompts.append(system_prompt)
+
+    # Generate questions
+    questions = []
+    for _ in range(num_groups * prompts_per_group):
+        question = gen_prompt(tokenizer, question_len)
+        questions.append(question)
+
+    # Combine system prompts with questions
+    input_requests = []
+    total_input_tokens = 0
+    total_output_tokens = 0
+
+    for group_idx in range(num_groups):
+        system_prompt = system_prompts[group_idx]
+        for prompt_idx in range(prompts_per_group):
+            question = questions[group_idx * prompts_per_group + prompt_idx]
+            full_prompt = f"{system_prompt}\n\n{question}"
+            prompt_len = len(tokenizer.encode(full_prompt))
+
+            input_requests.append((full_prompt, prompt_len, output_len))
+            total_input_tokens += prompt_len
+            total_output_tokens += output_len
+
+    print(f"\nGenerated shared prefix dataset statistics:")
+    print(f"Number of groups: {num_groups}")
+    print(f"Prompts per group: {prompts_per_group}")
+    print(f"Total prompts: {len(input_requests)}")
+    print(f"Total input tokens: {total_input_tokens}")
+    print(f"Total output tokens: {total_output_tokens}")
+    print(
+        f"Average system prompt length: {sum(len(tokenizer.encode(sp)) for sp in system_prompts) / len(system_prompts):.1f} tokens"
+    )
+    print(
+        f"Average question length: {sum(len(tokenizer.encode(q)) for q in questions) / len(questions):.1f} tokens\n"
+    )
+
+    return input_requests
+
+
 async def get_request(
     input_requests: List[Tuple[str, int, int]],
     request_rate: float,
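Taken together with the CLI defaults added at the bottom of this file (64 groups of 16 prompts, 2048-token system prompts, 128-token questions, 256-token outputs), the dataset's scale follows directly from the parameters. The arithmetic below ignores the few tokens contributed by the "\n\n" joiner and tokenizer round-tripping; since every prompt in a group repeats the same system prompt, a prefix-caching runtime such as SGLang's RadixAttention only needs to prefill each 2048-token prefix once per group, which is what this dataset is designed to exercise:

    # Expected scale of generated-shared-prefix under the defaults from this diff.
    num_groups, prompts_per_group = 64, 16
    system_prompt_len, question_len, output_len = 2048, 128, 256

    total_prompts = num_groups * prompts_per_group                     # 1024
    input_tokens = total_prompts * (system_prompt_len + question_len)  # 2,228,224
    output_tokens = total_prompts * output_len                         # 262,144
    unique_prefix_tokens = num_groups * system_prompt_len              # 131,072
    print(total_prompts, input_tokens, output_tokens, unique_prefix_tokens)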
@@ -696,6 +796,9 @@ def calculate_metrics(
         input_throughput=total_input / dur_s,
         output_throughput=sum(output_lens) / dur_s,
         output_throughput_retokenized=sum(retokenized_output_lens) / dur_s,
+        total_throughput=(total_input + sum(output_lens)) / dur_s,
+        total_throughput_retokenized=(total_input + sum(retokenized_output_lens))
+        / dur_s,
         mean_ttft_ms=np.mean(ttfts or 0)
         * 1000,  # ttfts is empty if streaming is not supported by backend
         median_ttft_ms=np.median(ttfts or 0) * 1000,
@@ -813,6 +916,11 @@ async def benchmark(
             "Output token throughput (tok/s):", metrics.output_throughput
         )
     )
+    print(
+        "{:<40} {:<10.2f}".format(
+            "Total token throughput (tok/s):", metrics.total_throughput
+        )
+    )
     print("{s:{c}^{n}}".format(s="End-to-End Latency", n=50, c="-"))
     print(
         "{:<40} {:<10.2f}".format("Mean E2E Latency (ms):", metrics.mean_e2e_latency_ms)
@@ -1030,26 +1138,7 @@ def run_benchmark(args_: argparse.Namespace):
 
     tokenizer = get_tokenizer(tokenizer_id)
 
-    if args.dataset_name == "sharegpt":
-        assert args.random_input_len is None and args.random_output_len is None
-        input_requests = sample_sharegpt_requests(
-            dataset_path=args.dataset_path,
-            num_requests=args.num_prompts,
-            tokenizer=tokenizer,
-            fixed_output_len=args.sharegpt_output_len,
-        )
-    elif args.dataset_name == "random":
-        assert args.random_input_len is not None and args.random_output_len is not None
-        input_requests = sample_random_requests(
-            input_len=args.random_input_len,
-            output_len=args.random_output_len,
-            num_prompts=args.num_prompts,
-            range_ratio=args.random_range_ratio,
-            tokenizer=tokenizer,
-            dataset_path=args.dataset_path,
-        )
-    else:
-        raise ValueError(f"Unknown dataset: {args.dataset_name}")
+    input_requests = get_dataset(args, tokenizer)
 
     if not args.multi:
         return asyncio.run(
@@ -1121,7 +1210,7 @@ if __name__ == "__main__":
         "--dataset-name",
         type=str,
         default="sharegpt",
-        choices=["sharegpt", "random"],
+        choices=["sharegpt", "random", "generated-shared-prefix"],
         help="Name of the dataset to benchmark on.",
     )
     parser.add_argument(
@@ -1152,10 +1241,12 @@ if __name__ == "__main__":
     parser.add_argument(
         "--random-input-len",
         type=int,
+        default=1024,
         help="Number of input tokens per request, used only for random dataset.",
     )
     parser.add_argument(
         "--random-output-len",
+        default=1024,
         type=int,
         help="Number of output tokens per request, used only for random dataset.",
     )
@@ -1208,5 +1299,38 @@ if __name__ == "__main__":
         help="Append given JSON object to the request payload. You can use this to specify"
         "additional generate params like sampling params.",
     )
+
+    group = parser.add_argument_group("generated-shared-prefix dataset arguments")
+    group.add_argument(
+        "--gen-num-groups",
+        type=int,
+        default=64,
+        help="Number of system prompt groups for generated-shared-prefix dataset",
+    )
+    group.add_argument(
+        "--gen-prompts-per-group",
+        type=int,
+        default=16,
+        help="Number of prompts per system prompt group for generated-shared-prefix dataset",
+    )
+    group.add_argument(
+        "--gen-system-prompt-len",
+        type=int,
+        default=2048,
+        help="Target length in tokens for system prompts in generated-shared-prefix dataset",
+    )
+    group.add_argument(
+        "--gen-question-len",
+        type=int,
+        default=128,
+        help="Target length in tokens for questions in generated-shared-prefix dataset",
+    )
+    group.add_argument(
+        "--gen-output-len",
+        type=int,
+        default=256,
+        help="Target length in tokens for outputs in generated-shared-prefix dataset",
+    )
+
     args = parser.parse_args()
     run_benchmark(args)
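With --random-input-len/--random-output-len now defaulting to 1024 (the old asserts in run_benchmark are gone) and generated-shared-prefix wired into --dataset-name, minimal invocations along these lines should work against an already-running server; hypothetical sketches in the style of the usage block in bench_offline_throughput.py:

    python -m sglang.bench_serving --backend sglang --dataset-name random
    python -m sglang.bench_serving --backend sglang --dataset-name generated-shared-prefix --gen-num-groups 8 --gen-prompts-per-group 4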