sglang 0.3.3.post1__tar.gz → 0.3.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sglang-0.3.3.post1/sglang.egg-info → sglang-0.3.4}/PKG-INFO +72 -29
- {sglang-0.3.3.post1 → sglang-0.3.4}/README.md +41 -11
- {sglang-0.3.3.post1 → sglang-0.3.4}/pyproject.toml +12 -5
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/bench_latency.py +28 -10
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/bench_server_latency.py +21 -10
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/bench_serving.py +101 -7
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/global_config.py +0 -1
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/srt/layers/attention/__init__.py +27 -5
- sglang-0.3.4/sglang/srt/layers/attention/double_sparsity_backend.py +281 -0
- sglang-0.3.4/sglang/srt/layers/attention/flashinfer_backend.py +546 -0
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/srt/layers/attention/triton_backend.py +6 -4
- sglang-0.3.4/sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +772 -0
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/srt/layers/attention/triton_ops/extend_attention.py +5 -3
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/srt/layers/attention/triton_ops/prefill_attention.py +4 -2
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/srt/layers/sampler.py +6 -2
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/srt/managers/detokenizer_manager.py +31 -10
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/srt/managers/io_struct.py +4 -0
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/srt/managers/schedule_batch.py +120 -43
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/srt/managers/schedule_policy.py +2 -1
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/srt/managers/scheduler.py +202 -140
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/srt/managers/tokenizer_manager.py +5 -1
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/srt/managers/tp_worker.py +111 -1
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/srt/mem_cache/chunk_cache.py +8 -4
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/srt/mem_cache/memory_pool.py +77 -4
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/srt/mem_cache/radix_cache.py +15 -7
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/srt/model_executor/cuda_graph_runner.py +4 -4
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/srt/model_executor/forward_batch_info.py +16 -21
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/srt/model_executor/model_runner.py +60 -1
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/srt/models/baichuan.py +2 -3
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/srt/models/chatglm.py +5 -6
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/srt/models/commandr.py +1 -2
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/srt/models/dbrx.py +1 -2
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/srt/models/deepseek.py +4 -5
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/srt/models/deepseek_v2.py +5 -6
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/srt/models/exaone.py +1 -2
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/srt/models/gemma.py +2 -2
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/srt/models/gemma2.py +5 -5
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/srt/models/gpt_bigcode.py +5 -5
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/srt/models/grok.py +1 -2
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/srt/models/internlm2.py +1 -2
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/srt/models/llama.py +1 -2
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/srt/models/llama_classification.py +1 -2
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/srt/models/llama_reward.py +2 -3
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/srt/models/llava.py +4 -8
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/srt/models/llavavid.py +1 -2
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/srt/models/minicpm.py +1 -2
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/srt/models/minicpm3.py +5 -6
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/srt/models/mixtral.py +1 -2
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/srt/models/mixtral_quant.py +1 -2
- sglang-0.3.4/sglang/srt/models/olmo.py +352 -0
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/srt/models/olmoe.py +1 -2
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/srt/models/qwen.py +1 -2
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/srt/models/qwen2.py +1 -2
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/srt/models/qwen2_moe.py +4 -5
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/srt/models/stablelm.py +1 -2
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/srt/models/torch_native_llama.py +1 -2
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/srt/models/xverse.py +1 -2
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/srt/models/xverse_moe.py +4 -5
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/srt/models/yivl.py +1 -2
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/srt/openai_api/adapter.py +92 -49
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/srt/openai_api/protocol.py +10 -2
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/srt/sampling/penaltylib/orchestrator.py +28 -9
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/srt/sampling/sampling_batch_info.py +92 -58
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/srt/sampling/sampling_params.py +2 -0
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/srt/server.py +116 -17
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/srt/server_args.py +121 -45
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/srt/utils.py +11 -3
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/test/few_shot_gsm8k.py +4 -1
- sglang-0.3.4/sglang/test/few_shot_gsm8k_engine.py +144 -0
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/test/srt/sampling/penaltylib/utils.py +16 -12
- sglang-0.3.4/sglang/version.py +1 -0
- {sglang-0.3.3.post1 → sglang-0.3.4/sglang.egg-info}/PKG-INFO +72 -29
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang.egg-info/SOURCES.txt +4 -1
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang.egg-info/requires.txt +20 -3
- sglang-0.3.3.post1/sglang/srt/layers/attention/flashinfer_backend.py +0 -277
- sglang-0.3.3.post1/sglang/srt/layers/attention/flashinfer_utils.py +0 -237
- sglang-0.3.3.post1/sglang/version.py +0 -1
- {sglang-0.3.3.post1 → sglang-0.3.4}/LICENSE +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4}/setup.cfg +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/__init__.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/api.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/check_env.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/lang/__init__.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/lang/backend/__init__.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/lang/backend/anthropic.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/lang/backend/base_backend.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/lang/backend/litellm.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/lang/backend/openai.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/lang/backend/runtime_endpoint.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/lang/backend/vertexai.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/lang/chat_template.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/lang/choices.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/lang/compiler.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/lang/interpreter.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/lang/ir.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/lang/tracer.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/launch_server.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/launch_server_llavavid.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/srt/configs/__init__.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/srt/configs/exaone.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/srt/configs/model_config.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/srt/constrained/__init__.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/srt/constrained/base_tool_cache.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/srt/constrained/fsm_cache.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/srt/constrained/jump_forward.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/srt/conversation.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/srt/hf_transformers_utils.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/srt/layers/activation.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/srt/layers/attention/triton_ops/decode_attention.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/srt/layers/fused_moe/__init__.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/srt/layers/fused_moe/fused_moe.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/srt/layers/fused_moe/layer.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/srt/layers/fused_moe/patch.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/srt/layers/layernorm.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/srt/layers/linear.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/srt/layers/logits_processor.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/srt/layers/pooler.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/srt/layers/quantization/__init__.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/srt/layers/quantization/base_config.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/srt/layers/radix_attention.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/srt/layers/torchao_utils.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/srt/lora/lora.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/srt/lora/lora_config.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/srt/lora/lora_manager.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/srt/managers/data_parallel_controller.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/srt/managers/image_processor.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/srt/mem_cache/base_prefix_cache.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/srt/mem_cache/flush_cache.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/srt/mm_utils.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/srt/models/llama_embedding.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/srt/models/mistral.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/srt/sampling/penaltylib/__init__.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/test/run_eval.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/test/runners.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/test/simple_eval_common.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/test/simple_eval_gpqa.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/test/simple_eval_humaneval.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/test/simple_eval_math.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/test/simple_eval_mgsm.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/test/simple_eval_mmlu.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/test/test_activation.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/test/test_layernorm.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/test/test_programs.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/test/test_utils.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang/utils.py +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang.egg-info/dependency_links.txt +0 -0
- {sglang-0.3.3.post1 → sglang-0.3.4}/sglang.egg-info/top_level.txt +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.3.3.post1
+Version: 0.3.4
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
         Version 2.0, January 2004
@@ -214,26 +214,31 @@ License-File: LICENSE
 Requires-Dist: requests
 Requires-Dist: tqdm
 Requires-Dist: numpy
+Provides-Extra: runtime-common
+Requires-Dist: aiohttp; extra == "runtime-common"
+Requires-Dist: decord; extra == "runtime-common"
+Requires-Dist: fastapi; extra == "runtime-common"
+Requires-Dist: hf_transfer; extra == "runtime-common"
+Requires-Dist: huggingface_hub; extra == "runtime-common"
+Requires-Dist: interegular; extra == "runtime-common"
+Requires-Dist: orjson; extra == "runtime-common"
+Requires-Dist: packaging; extra == "runtime-common"
+Requires-Dist: pillow; extra == "runtime-common"
+Requires-Dist: psutil; extra == "runtime-common"
+Requires-Dist: pydantic; extra == "runtime-common"
+Requires-Dist: python-multipart; extra == "runtime-common"
+Requires-Dist: torchao; extra == "runtime-common"
+Requires-Dist: uvicorn; extra == "runtime-common"
+Requires-Dist: uvloop; extra == "runtime-common"
+Requires-Dist: zmq; extra == "runtime-common"
+Requires-Dist: outlines>=0.0.44; extra == "runtime-common"
+Requires-Dist: modelscope; extra == "runtime-common"
 Provides-Extra: srt
-Requires-Dist:
-Requires-Dist: decord; extra == "srt"
-Requires-Dist: fastapi; extra == "srt"
-Requires-Dist: hf_transfer; extra == "srt"
-Requires-Dist: huggingface_hub; extra == "srt"
-Requires-Dist: interegular; extra == "srt"
-Requires-Dist: packaging; extra == "srt"
-Requires-Dist: pillow; extra == "srt"
-Requires-Dist: psutil; extra == "srt"
-Requires-Dist: pydantic; extra == "srt"
-Requires-Dist: python-multipart; extra == "srt"
+Requires-Dist: sglang[runtime_common]; extra == "srt"
 Requires-Dist: torch; extra == "srt"
-Requires-Dist: torchao; extra == "srt"
-Requires-Dist: uvicorn; extra == "srt"
-Requires-Dist: uvloop; extra == "srt"
-Requires-Dist: zmq; extra == "srt"
 Requires-Dist: vllm==0.5.5; extra == "srt"
-
-Requires-Dist:
+Provides-Extra: srt-xpu
+Requires-Dist: sglang[runtime_common]; extra == "srt-xpu"
 Provides-Extra: openai
 Requires-Dist: openai>=1.0; extra == "openai"
 Requires-Dist: tiktoken; extra == "openai"
@@ -253,9 +258,17 @@ Requires-Dist: sglang[srt]; extra == "all"
 Requires-Dist: sglang[openai]; extra == "all"
 Requires-Dist: sglang[anthropic]; extra == "all"
 Requires-Dist: sglang[litellm]; extra == "all"
+Provides-Extra: all-xpu
+Requires-Dist: sglang[srt_xpu]; extra == "all-xpu"
+Requires-Dist: sglang[openai]; extra == "all-xpu"
+Requires-Dist: sglang[anthropic]; extra == "all-xpu"
+Requires-Dist: sglang[litellm]; extra == "all-xpu"
 Provides-Extra: dev
 Requires-Dist: sglang[all]; extra == "dev"
 Requires-Dist: sglang[test]; extra == "dev"
+Provides-Extra: dev-xpu
+Requires-Dist: sglang[all_xpu]; extra == "dev-xpu"
+Requires-Dist: sglang[test]; extra == "dev-xpu"
 
 <div align="center" id="sglangtop">
 <img src="https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png" alt="logo" width="400" margin="10px"></img>
@@ -270,14 +283,13 @@ Requires-Dist: sglang[test]; extra == "dev"
 
 --------------------------------------------------------------------------------
 
-| [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Paper**](https://arxiv.org/abs/2312.07104) | [**Slides**](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_dev_day_v2.
-
-## Upcoming Events
-- [Oct. 16, 2024] Online meetup for efficient LLM deployment and serving, co-hosted by SGLang, FlashInfer, and MLC LLM! Fill out the [Google form](https://forms.gle/B3YeedLxmrrhL1NM8) to receive the invite link.
+| [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Paper**](https://arxiv.org/abs/2312.07104) | [**Slides**](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_dev_day_v2.pdf) | [**Learn More**](https://github.com/sgl-project/sgl-learning-materials) | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2ngly9muu-t37XiH87qvD~6rVBTkTEHw) |
+[**Join Bi-Weekly Development Meeting (Oct. 19)**](https://calendar.app.google/GYW7S8QGoanCuaxW6) |
 
 ## News
-- [2024/
-- [2024/
+- [2024/10] 🔥 The First SGLang Online Meetup ([slides](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#the-first-sglang-online-meetup)).
+- [2024/09] SGLang v0.3 Release: 7x Faster DeepSeek MLA, 1.5x Faster torch.compile, Multi-Image/Video LLaVA-OneVision ([blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)).
+- [2024/07] Faster Llama3 Serving with SGLang Runtime (vs. TensorRT-LLM, vLLM) ([blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/)).
 - [2024/02] SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).
 
 <details>
@@ -323,7 +335,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
 ### Method 2: From source
 ```
 # Use the last release branch
-git clone -b v0.3.
+git clone -b v0.3.4 https://github.com/sgl-project/sglang.git
 cd sglang
 
 pip install --upgrade pip
@@ -500,6 +512,40 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --tp 4 --nccl-init sgl-dev-0:50000 --nnodes 2 --node-rank 1
 ```
 
+### Engine Without HTTP Server
+
+We also provide an inference engine **without a HTTP server**. For example,
+
+```python
+import sglang as sgl
+
+
+def main():
+    prompts = [
+        "Hello, my name is",
+        "The president of the United States is",
+        "The capital of France is",
+        "The future of AI is",
+    ]
+    sampling_params = {"temperature": 0.8, "top_p": 0.95}
+    llm = sgl.Engine(model_path="meta-llama/Meta-Llama-3.1-8B-Instruct")
+
+    outputs = llm.generate(prompts, sampling_params)
+    for prompt, output in zip(prompts, outputs):
+        print("===============================")
+        print(f"Prompt: {prompt}\nGenerated text: {output['text']}")
+
+if __name__ == "__main__":
+    main()
+```
+
+This can be used for:
+
+1. **Offline Batch Inference**
+2. **Building Custom Servers**
+
+You can view the full example [here](https://github.com/sgl-project/sglang/tree/main/examples/runtime/engine)
+
 ### Supported Models
 
 **Generative Models**
@@ -836,10 +882,7 @@ def chat_example(s):
 - The `regex` argument in `sgl.gen` is implemented through autoregressive decoding with logit bias masking, according to the constraints set by the regex. It is compatible with `temperature=0` and `temperature != 0`.
 
 ## Benchmark And Performance
-
-![8b_throughput](
-
-Learn more at this [blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/).
+Learn more in our release blogs: [v0.2](https://lmsys.org/blog/2024-07-25-sglang-llama3/), [v0.3](https://lmsys.org/blog/2024-09-04-sglang-v0-3/).
 
 ## Roadmap
 [Development Roadmap (2024 Q4)](https://github.com/sgl-project/sglang/issues/1487)
README.md
@@ -11,14 +11,13 @@
 
 --------------------------------------------------------------------------------
 
-| [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Paper**](https://arxiv.org/abs/2312.07104) | [**Slides**](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_dev_day_v2.
-
-## Upcoming Events
-- [Oct. 16, 2024] Online meetup for efficient LLM deployment and serving, co-hosted by SGLang, FlashInfer, and MLC LLM! Fill out the [Google form](https://forms.gle/B3YeedLxmrrhL1NM8) to receive the invite link.
+| [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Paper**](https://arxiv.org/abs/2312.07104) | [**Slides**](https://github.com/sgl-project/sgl-learning-materials/blob/main/slides/amd_dev_day_v2.pdf) | [**Learn More**](https://github.com/sgl-project/sgl-learning-materials) | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2ngly9muu-t37XiH87qvD~6rVBTkTEHw) |
+[**Join Bi-Weekly Development Meeting (Oct. 19)**](https://calendar.app.google/GYW7S8QGoanCuaxW6) |
 
 ## News
-- [2024/
-- [2024/
+- [2024/10] 🔥 The First SGLang Online Meetup ([slides](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#the-first-sglang-online-meetup)).
+- [2024/09] SGLang v0.3 Release: 7x Faster DeepSeek MLA, 1.5x Faster torch.compile, Multi-Image/Video LLaVA-OneVision ([blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)).
+- [2024/07] Faster Llama3 Serving with SGLang Runtime (vs. TensorRT-LLM, vLLM) ([blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/)).
 - [2024/02] SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).
 
 <details>
@@ -64,7 +63,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
 ### Method 2: From source
 ```
 # Use the last release branch
-git clone -b v0.3.
+git clone -b v0.3.4 https://github.com/sgl-project/sglang.git
 cd sglang
 
 pip install --upgrade pip
@@ -241,6 +240,40 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --tp 4 --nccl-init sgl-dev-0:50000 --nnodes 2 --node-rank 1
 ```
 
+### Engine Without HTTP Server
+
+We also provide an inference engine **without a HTTP server**. For example,
+
+```python
+import sglang as sgl
+
+
+def main():
+    prompts = [
+        "Hello, my name is",
+        "The president of the United States is",
+        "The capital of France is",
+        "The future of AI is",
+    ]
+    sampling_params = {"temperature": 0.8, "top_p": 0.95}
+    llm = sgl.Engine(model_path="meta-llama/Meta-Llama-3.1-8B-Instruct")
+
+    outputs = llm.generate(prompts, sampling_params)
+    for prompt, output in zip(prompts, outputs):
+        print("===============================")
+        print(f"Prompt: {prompt}\nGenerated text: {output['text']}")
+
+if __name__ == "__main__":
+    main()
+```
+
+This can be used for:
+
+1. **Offline Batch Inference**
+2. **Building Custom Servers**
+
+You can view the full example [here](https://github.com/sgl-project/sglang/tree/main/examples/runtime/engine)
+
 ### Supported Models
 
 **Generative Models**
@@ -577,10 +610,7 @@ def chat_example(s):
 - The `regex` argument in `sgl.gen` is implemented through autoregressive decoding with logit bias masking, according to the constraints set by the regex. It is compatible with `temperature=0` and `temperature != 0`.
 
 ## Benchmark And Performance
-
-![8b_throughput](
-
-Learn more at this [blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/).
+Learn more in our release blogs: [v0.2](https://lmsys.org/blog/2024-07-25-sglang-llama3/), [v0.3](https://lmsys.org/blog/2024-09-04-sglang-v0-3/).
 
 ## Roadmap
 [Development Roadmap (2024 Q4)](https://github.com/sgl-project/sglang/issues/1487)
pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "sglang"
-version = "0.3.3.post1"
+version = "0.3.4"
 description = "SGLang is yet another fast serving framework for large language models and vision language models."
 readme = "README.md"
 requires-python = ">=3.8"
@@ -20,16 +20,23 @@ dependencies = [
 ]
 
 [project.optional-dependencies]
-
-
-
-
+runtime_common = ["aiohttp", "decord", "fastapi", "hf_transfer", "huggingface_hub", "interegular",
+    "orjson", "packaging", "pillow", "psutil", "pydantic", "python-multipart",
+    "torchao", "uvicorn", "uvloop", "zmq",
+    "outlines>=0.0.44", "modelscope"]
+# xpu is not enabled in public vllm and torch whl,
+# need to follow https://docs.vllm.ai/en/latest/getting_started/xpu-installation.htmlinstall vllm
+srt = ["sglang[runtime_common]", "torch", "vllm==0.5.5"]
+srt_xpu = ["sglang[runtime_common]"]
+
 openai = ["openai>=1.0", "tiktoken"]
 anthropic = ["anthropic>=0.20.0"]
 litellm = ["litellm>=1.0.0"]
 test = ["jsonlines", "matplotlib", "pandas", "sentence_transformers", "accelerate", "peft"]
 all = ["sglang[srt]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
+all_xpu = ["sglang[srt_xpu]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
 dev = ["sglang[all]", "sglang[test]"]
+dev_xpu = ["sglang[all_xpu]", "sglang[test]"]
 
 [project.urls]
 "Homepage" = "https://github.com/sgl-project/sglang"
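Note on the regrouped extras above: `srt` now resolves to `sglang[runtime_common]` plus `torch` and `vllm==0.5.5`, and the new `*_xpu` extras reuse `runtime_common` without pinning torch or vllm. As a purely illustrative sketch (not part of the package), the effect can be inspected from the installed metadata with `importlib.metadata`, assuming `sglang==0.3.4` is installed:

```python
# Illustrative sketch: inspect the regrouped extras of an installed sglang 0.3.4.
from importlib.metadata import metadata, requires

md = metadata("sglang")
print(md.get_all("Provides-Extra"))  # expect entries like runtime-common, srt, srt-xpu, all-xpu, dev-xpu

# The "srt" extra is now an indirection through runtime_common plus torch and vllm.
for req in requires("sglang"):
    if 'extra == "srt"' in req:
        print(req)  # e.g. 'sglang[runtime_common]; extra == "srt"'
```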
sglang/bench_latency.py
@@ -232,17 +232,18 @@ def extend(reqs, model_runner):
     model_worker_batch = batch.get_model_worker_batch()
     forward_batch = ForwardBatch.init_new(model_worker_batch, model_runner)
     logits_output = model_runner.forward(forward_batch)
-    next_token_ids = model_runner.sample(logits_output, forward_batch)
+    next_token_ids = model_runner.sample(logits_output, forward_batch)
     return next_token_ids, logits_output.next_token_logits, batch
 
 
 @torch.inference_mode()
 def decode(input_token_ids, batch, model_runner):
-    batch.
+    batch.output_ids = input_token_ids
+    batch.prepare_for_decode()
     model_worker_batch = batch.get_model_worker_batch()
     forward_batch = ForwardBatch.init_new(model_worker_batch, model_runner)
     logits_output = model_runner.forward(forward_batch)
-    next_token_ids = model_runner.sample(logits_output, forward_batch)
+    next_token_ids = model_runner.sample(logits_output, forward_batch)
     return next_token_ids, logits_output.next_token_logits
 
 
@@ -252,6 +253,7 @@ def correctness_test(
     bench_args,
     tp_rank,
 ):
+    configure_logger(server_args, prefix=f" TP{tp_rank}")
     rank_print = print if tp_rank == 0 else lambda *args, **kwargs: None
 
     # Load the model
@@ -279,8 +281,9 @@ def correctness_test(
     output_ids = [input_ids[i] + [next_token_ids[i]] for i in range(len(input_ids))]
     for _ in range(bench_args.output_len[0] - 1):
         next_token_ids, _ = decode(next_token_ids, batch, model_runner)
+        next_token_ids_list = next_token_ids.tolist()
         for i in range(len(reqs)):
-            output_ids[i].append(
+            output_ids[i].append(next_token_ids_list[i])
 
     # Print
     for i in range(len(reqs)):
@@ -288,8 +291,15 @@ def correctness_test(
         rank_print(tokenizer.decode(output_ids[i]), "\n")
 
 
+def synchronize(device):
+    if device == "cuda":
+        torch.cuda.synchronize()
+    elif device == "xpu":
+        torch.xpu.synchronize()
+
+
 def latency_test_run_once(
-    run_name, model_runner, rank_print, reqs, batch_size, input_len, output_len
+    run_name, model_runner, rank_print, reqs, batch_size, input_len, output_len, device
 ):
     max_batch_size = model_runner.max_total_num_tokens // (input_len + output_len)
     if batch_size > max_batch_size:
@@ -312,10 +322,10 @@ def latency_test_run_once(
     tot_latency = 0
 
     # Prefill
-
+    synchronize(device)
     tic = time.time()
     next_token_ids, _, batch = extend(reqs, model_runner)
-
+    synchronize(device)
     prefill_latency = time.time() - tic
     tot_latency += prefill_latency
     throughput = input_len * batch_size / prefill_latency
@@ -328,10 +338,10 @@ def latency_test_run_once(
     # Decode
     decode_latencies = []
     for i in range(output_len - 1):
-
+        synchronize(device)
         tic = time.time()
         next_token_ids, _ = decode(next_token_ids, batch, model_runner)
-
+        synchronize(device)
         latency = time.time() - tic
         tot_latency += latency
         throughput = batch_size / latency
@@ -387,6 +397,7 @@ def latency_test(
         bench_args.batch_size[0],
         bench_args.input_len[0],
         8,  # shorter decoding to speed up the warmup
+        server_args.device,
     )
     rank_print("Benchmark ...")
 
@@ -397,7 +408,14 @@ def latency_test(
     ):
         reqs = prepare_synthetic_inputs_for_latency_test(bs, il)
         ret = latency_test_run_once(
-            bench_args.run_name,
+            bench_args.run_name,
+            model_runner,
+            rank_print,
+            reqs,
+            bs,
+            il,
+            ol,
+            server_args.device,
         )
         if ret is not None:
            result_list.append(ret)
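A note on the `synchronize(device)` helper introduced above: GPU kernels launch asynchronously, so reading the wall clock without synchronizing first would mostly measure launch overhead rather than execution time. A minimal standalone sketch of the same timing pattern, assuming a CUDA device is available (the matrix size is arbitrary):

```python
# Minimal sketch of the timing pattern used in bench_latency.py (assumes CUDA).
import time
import torch

def synchronize(device):
    if device == "cuda":
        torch.cuda.synchronize()
    elif device == "xpu":
        torch.xpu.synchronize()

x = torch.randn(4096, 4096, device="cuda")

synchronize("cuda")
tic = time.time()
y = x @ x            # returns as soon as the kernel is queued
synchronize("cuda")  # wait for the kernel to finish before reading the clock
print(f"matmul latency: {time.time() - tic:.4f} s")
```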
sglang/bench_server_latency.py
@@ -6,6 +6,8 @@ It accepts arguments similar to those of launch_server.py.
 Usage:
 
 python3 -m sglang.bench_server_latency --model meta-llama/Meta-Llama-3.1-8B --batch-size 1 16 64 --input-len 1024 --output-len 8
+
+python3 -m sglang.bench_server_latency --model None --base-url http://localhost:30000 --batch-size 16 --input-len 1024 --output-len 8
 """
 
 import argparse
@@ -32,6 +34,8 @@ class BenchArgs:
     input_len: Tuple[int] = (1024,)
     output_len: Tuple[int] = (16,)
     result_filename: str = "result.jsonl"
+    base_url: str = ""
+    skip_warmup: bool = False
 
     @staticmethod
     def add_cli_args(parser: argparse.ArgumentParser):
@@ -48,6 +52,8 @@ class BenchArgs:
         parser.add_argument(
             "--result-filename", type=str, default=BenchArgs.result_filename
         )
+        parser.add_argument("--base-url", type=str, default=BenchArgs.base_url)
+        parser.add_argument("--skip-warmup", action="store_true")
 
     @classmethod
     def from_cli_args(cls, args: argparse.Namespace):
@@ -139,17 +145,21 @@ def run_one_case(
 
 
 def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
-
+    if bench_args.base_url:
+        proc, base_url = None, bench_args.base_url
+    else:
+        proc, base_url = launch_server_process(server_args)
 
     # warmup
-
-
-
-
-
-
-
-
+    if not bench_args.skip_warmup:
+        run_one_case(
+            base_url,
+            batch_size=16,
+            input_len=1024,
+            output_len=16,
+            run_name="",
+            result_filename="",
+        )
 
     # benchmark
     try:
@@ -165,7 +175,8 @@ def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
             bench_args.result_filename,
         )
     finally:
-
+        if proc:
+            kill_child_process(proc.pid)
 
     print(f"\nResults are saved to {bench_args.result_filename}")
 
sglang/bench_serving.py
@@ -222,6 +222,85 @@ async def async_request_openai_completions(
     return output
 
 
+async def async_request_sglang_generate(
+    request_func_input: RequestFuncInput,
+    pbar: Optional[tqdm] = None,
+) -> RequestFuncOutput:
+    api_url = request_func_input.api_url
+    prompt = request_func_input.prompt
+
+    async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
+        payload = {
+            "text": prompt,
+            "sampling_params": {
+                "temperature": 0.0,
+                "max_new_tokens": request_func_input.output_len,
+                "ignore_eos": not args.disable_ignore_eos,
+            },
+            "stream": not args.disable_stream,
+            **request_func_input.extra_request_body,
+        }
+        headers = {}
+
+        output = RequestFuncOutput()
+        output.prompt_len = request_func_input.prompt_len
+
+        generated_text = ""
+        ttft = 0.0
+        st = time.perf_counter()
+        most_recent_timestamp = st
+        try:
+            async with session.post(
+                url=api_url, json=payload, headers=headers
+            ) as response:
+                if response.status == 200:
+                    async for chunk_bytes in response.content:
+                        chunk_bytes = chunk_bytes.strip()
+                        if not chunk_bytes:
+                            continue
+                        # print(chunk_bytes)
+
+                        chunk = remove_prefix(chunk_bytes.decode("utf-8"), "data: ")
+                        latency = time.perf_counter() - st
+                        if chunk == "[DONE]":
+                            pass
+                        else:
+                            data = json.loads(chunk)
+
+                            # NOTE: Some completion API might have a last
+                            # usage summary response without a token so we
+                            # want to check a token was generated
+                            if data["text"]:
+                                timestamp = time.perf_counter()
+                                # First token
+                                if ttft == 0.0:
+                                    ttft = time.perf_counter() - st
+                                    output.ttft = ttft
+
+                                # Decoding phase
+                                else:
+                                    output.itl.append(timestamp - most_recent_timestamp)
+
+                                most_recent_timestamp = timestamp
+                                generated_text = data["text"]
+
+                    output.generated_text = generated_text
+                    output.success = True
+                    output.latency = latency
+                    output.output_len = request_func_input.output_len
+                else:
+                    output.error = response.reason or ""
+                    output.success = False
+        except Exception:
+            output.success = False
+            exc_info = sys.exc_info()
+            output.error = "".join(traceback.format_exception(*exc_info))
+
+    if pbar:
+        pbar.update(1)
+    return output
+
+
 async def async_request_gserver(
     request_func_input: RequestFuncInput,
     pbar: Optional[tqdm] = None,
@@ -264,7 +343,9 @@ def get_tokenizer(
 
 
 ASYNC_REQUEST_FUNCS = {
-    "sglang":
+    "sglang": async_request_sglang_generate,
+    "sglang-native": async_request_sglang_generate,
+    "sglang-oai": async_request_openai_completions,
     "vllm": async_request_openai_completions,
     "lmdeploy": async_request_openai_completions,
     "trt": async_request_trt_llm,
@@ -387,6 +468,8 @@ def sample_sharegpt_requests(
             continue
         filtered_dataset.append((prompt, prompt_len, output_len))
 
+    print(f"#Input tokens: {np.sum([x[1] for x in filtered_dataset])}")
+    print(f"#Output tokens: {np.sum([x[2] for x in filtered_dataset])}")
     return filtered_dataset
 
 
@@ -587,6 +670,8 @@
     else:
         print("Initial test run completed. Starting main benchmark run...")
 
+    time.sleep(1.5)
+
     pbar = None if disable_tqdm else tqdm(total=len(input_requests))
 
     benchmark_start_time = time.perf_counter()
@@ -782,24 +867,33 @@ def run_benchmark(args_: argparse.Namespace):
     if args.port is None:
         args.port = {
             "sglang": 30000,
+            "sglang-native": 30000,
+            "sglang-oai": 30000,
             "lmdeploy": 23333,
             "vllm": 8000,
             "trt": 8000,
             "gserver": 9988,
         }.get(args.backend, 30000)
 
-    api_url = (
-        f"{args.base_url}/v1/completions"
-        if args.base_url
-        else f"http://{args.host}:{args.port}/v1/completions"
-    )
     model_url = (
        f"{args.base_url}/v1/models"
        if args.base_url
        else f"http://{args.host}:{args.port}/v1/models"
    )
 
-    if args.backend
+    if args.backend in ["sglang", "sglang-native"]:
+        api_url = (
+            f"{args.base_url}/generate"
+            if args.base_url
+            else f"http://{args.host}:{args.port}/generate"
+        )
+    elif args.backend in ["sglang-oai", "vllm", "lmdeploy"]:
+        api_url = (
+            f"{args.base_url}/v1/completions"
+            if args.base_url
+            else f"http://{args.host}:{args.port}/v1/completions"
+        )
+    elif args.backend == "trt":
         api_url = (
             f"{args.base_url}/v2/models/ensemble/generate_stream"
             if args.base_url
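For reference, the payload built by `async_request_sglang_generate` above targets the server's native `/generate` route, which is also where the new `sglang` and `sglang-native` backends point `api_url`. A hedged, non-streaming sketch of the same request shape, assuming a server launched via `python -m sglang.launch_server ...` is listening on the default port 30000 and that the non-streaming response carries a top-level `text` field:

```python
# Illustrative sketch: send the benchmark's request shape synchronously with `requests`.
import requests

payload = {
    "text": "The capital of France is",
    "sampling_params": {
        "temperature": 0.0,
        "max_new_tokens": 32,
        "ignore_eos": True,
    },
    "stream": False,  # the benchmark streams; a plain POST is easier to inspect
}
resp = requests.post("http://localhost:30000/generate", json=payload)
resp.raise_for_status()
print(resp.json()["text"])
```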
sglang/global_config.py
@@ -19,7 +19,6 @@ class GlobalConfig:
         self.new_token_ratio_decay = 0.001
 
         # Runtime constants: others
-        self.num_continue_decode_steps = 10
         self.retract_decode_steps = 20
         self.flashinfer_workspace_size = os.environ.get(
             "FLASHINFER_WORKSPACE_SIZE", 384 * 1024 * 1024