sglang 0.3.5.post2.tar.gz → 0.3.6.post1.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/LICENSE +1 -1
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/PKG-INFO +28 -19
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/README.md +11 -13
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/pyproject.toml +14 -6
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/__init__.py +2 -2
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/api.py +2 -2
- sglang-0.3.6.post1/sglang/bench_latency.py +1 -0
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/bench_offline_throughput.py +48 -20
- sglang-0.3.5.post2/sglang/bench_latency.py → sglang-0.3.6.post1/sglang/bench_one_batch.py +21 -102
- sglang-0.3.5.post2/sglang/bench_server_latency.py → sglang-0.3.6.post1/sglang/bench_one_batch_server.py +3 -3
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/bench_serving.py +125 -6
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/check_env.py +3 -6
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/lang/backend/base_backend.py +1 -1
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/lang/backend/runtime_endpoint.py +2 -2
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/configs/model_config.py +13 -14
- sglang-0.3.6.post1/sglang/srt/constrained/__init__.py +16 -0
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/constrained/base_grammar_backend.py +13 -15
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/constrained/outlines_backend.py +28 -17
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/constrained/outlines_jump_forward.py +13 -15
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/constrained/xgrammar_backend.py +47 -58
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/conversation.py +13 -15
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/hf_transformers_utils.py +13 -15
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/layers/activation.py +16 -13
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/layers/attention/flashinfer_backend.py +106 -54
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/layers/attention/triton_backend.py +9 -7
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/layers/attention/triton_ops/decode_attention.py +51 -55
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/layers/attention/triton_ops/extend_attention.py +16 -16
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/layers/attention/triton_ops/prefill_attention.py +13 -15
- sglang-0.3.6.post1/sglang/srt/layers/custom_op_util.py +25 -0
- sglang-0.3.6.post1/sglang/srt/layers/fused_moe_grok/__init__.py +1 -0
- {sglang-0.3.5.post2/sglang/srt/layers/fused_moe → sglang-0.3.6.post1/sglang/srt/layers/fused_moe_grok}/fused_moe.py +11 -4
- {sglang-0.3.5.post2/sglang/srt/layers/fused_moe → sglang-0.3.6.post1/sglang/srt/layers/fused_moe_grok}/layer.py +4 -9
- sglang-0.3.5.post2/sglang/srt/layers/fused_moe/patch.py → sglang-0.3.6.post1/sglang/srt/layers/fused_moe_patch.py +5 -0
- sglang-0.3.6.post1/sglang/srt/layers/fused_moe_triton/__init__.py +44 -0
- sglang-0.3.6.post1/sglang/srt/layers/fused_moe_triton/fused_moe.py +861 -0
- sglang-0.3.6.post1/sglang/srt/layers/fused_moe_triton/layer.py +633 -0
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/layers/layernorm.py +17 -15
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/layers/logits_processor.py +23 -25
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/layers/quantization/__init__.py +77 -17
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/layers/radix_attention.py +13 -15
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/layers/rotary_embedding.py +13 -13
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/layers/sampler.py +4 -8
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/layers/torchao_utils.py +2 -0
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/lora/lora.py +13 -14
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/lora/lora_config.py +13 -14
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/lora/lora_manager.py +22 -24
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/managers/data_parallel_controller.py +98 -27
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/managers/detokenizer_manager.py +13 -15
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/managers/io_struct.py +63 -21
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/managers/schedule_batch.py +154 -59
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/managers/schedule_policy.py +18 -16
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/managers/scheduler.py +278 -109
- sglang-0.3.6.post1/sglang/srt/managers/session_controller.py +61 -0
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/managers/tokenizer_manager.py +63 -18
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/managers/tp_worker.py +25 -16
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/managers/tp_worker_overlap_thread.py +62 -67
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/metrics/collector.py +13 -15
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/metrics/func_timer.py +13 -15
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/mm_utils.py +13 -14
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/model_executor/cuda_graph_runner.py +63 -25
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/model_executor/forward_batch_info.py +128 -32
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/model_executor/model_runner.py +132 -64
- sglang-0.3.6.post1/sglang/srt/model_parallel.py +98 -0
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/models/chatglm.py +15 -16
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/models/commandr.py +15 -16
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/models/dbrx.py +15 -16
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/models/deepseek.py +15 -15
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/models/deepseek_v2.py +162 -59
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/models/exaone.py +14 -15
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/models/gemma.py +14 -14
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/models/gemma2.py +31 -25
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/models/gemma2_reward.py +13 -14
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/models/gpt_bigcode.py +14 -14
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/models/grok.py +15 -15
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/models/internlm2.py +13 -15
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/models/internlm2_reward.py +13 -14
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/models/llama.py +21 -21
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/models/llama_classification.py +13 -14
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/models/llama_reward.py +13 -14
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/models/llava.py +14 -16
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/models/llavavid.py +14 -16
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/models/minicpm.py +13 -15
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/models/minicpm3.py +13 -15
- sglang-0.3.6.post1/sglang/srt/models/mistral.py +23 -0
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/models/mixtral.py +15 -15
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/models/mixtral_quant.py +14 -14
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/models/olmo.py +22 -20
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/models/olmoe.py +23 -20
- sglang-0.3.6.post1/sglang/srt/models/phi3_small.py +447 -0
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/models/qwen.py +14 -14
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/models/qwen2.py +22 -19
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/models/qwen2_moe.py +17 -18
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/models/qwen2_vl.py +13 -6
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/models/stablelm.py +18 -16
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/models/torch_native_llama.py +107 -93
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/models/xverse.py +13 -14
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/models/xverse_moe.py +15 -16
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/models/yivl.py +13 -15
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/openai_api/adapter.py +19 -17
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/openai_api/protocol.py +14 -16
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/sampling/penaltylib/orchestrator.py +49 -79
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +3 -8
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +3 -9
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +3 -8
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +3 -8
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/sampling/sampling_batch_info.py +61 -57
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/sampling/sampling_params.py +14 -16
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/server.py +86 -35
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/server_args.py +96 -80
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/utils.py +266 -68
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/test/few_shot_gsm8k.py +8 -4
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/test/runners.py +38 -20
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/test/srt/sampling/penaltylib/utils.py +23 -21
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/test/test_utils.py +31 -20
- sglang-0.3.6.post1/sglang/version.py +1 -0
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang.egg-info/PKG-INFO +28 -19
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang.egg-info/SOURCES.txt +13 -5
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang.egg-info/requires.txt +18 -4
- sglang-0.3.5.post2/sglang/srt/constrained/__init__.py +0 -17
- sglang-0.3.5.post2/sglang/srt/layers/fused_moe/__init__.py +0 -1
- sglang-0.3.5.post2/sglang/srt/models/mistral.py +0 -25
- sglang-0.3.5.post2/sglang/version.py +0 -1
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/setup.cfg +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/global_config.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/lang/__init__.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/lang/backend/__init__.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/lang/backend/anthropic.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/lang/backend/litellm.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/lang/backend/openai.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/lang/backend/vertexai.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/lang/chat_template.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/lang/choices.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/lang/compiler.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/lang/interpreter.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/lang/ir.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/lang/tracer.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/launch_server.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/launch_server_llavavid.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/configs/__init__.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/configs/exaone.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/configs/qwen2vl.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/layers/attention/__init__.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/layers/attention/double_sparsity_backend.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/layers/linear.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/layers/pooler.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/layers/quantization/base_config.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/layers/vocab_parallel_embedding.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/managers/image_processor.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/mem_cache/base_prefix_cache.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/mem_cache/chunk_cache.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/mem_cache/flush_cache.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/mem_cache/memory_pool.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/mem_cache/radix_cache.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/models/baichuan.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/models/gpt2.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/models/llama_embedding.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/models/mllama.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/srt/sampling/penaltylib/__init__.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/test/few_shot_gsm8k_engine.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/test/run_eval.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/test/simple_eval_common.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/test/simple_eval_gpqa.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/test/simple_eval_humaneval.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/test/simple_eval_math.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/test/simple_eval_mgsm.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/test/simple_eval_mmlu.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/test/test_activation.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/test/test_layernorm.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/test/test_programs.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/utils.py +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang.egg-info/dependency_links.txt +0 -0
- {sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang.egg-info/top_level.txt +0 -0
{sglang-0.3.5.post2 → sglang-0.3.6.post1}/LICENSE

@@ -186,7 +186,7 @@
       same "printed page" as the copyright notice for easier
       identification within third-party archives.
 
-   Copyright
+   Copyright 2023-2024 SGLang Team
 
    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
{sglang-0.3.5.post2 → sglang-0.3.6.post1}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.3.5.post2
+Version: 0.3.6.post1
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
          Version 2.0, January 2004
@@ -190,7 +190,7 @@ License: Apache License
       same "printed page" as the copyright notice for easier
       identification within third-party archives.
 
-   Copyright
+   Copyright 2023-2024 SGLang Team
 
    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
@@ -222,29 +222,32 @@ Requires-Dist: fastapi; extra == "runtime-common"
 Requires-Dist: hf_transfer; extra == "runtime-common"
 Requires-Dist: huggingface_hub; extra == "runtime-common"
 Requires-Dist: interegular; extra == "runtime-common"
+Requires-Dist: modelscope; extra == "runtime-common"
 Requires-Dist: orjson; extra == "runtime-common"
+Requires-Dist: outlines<0.1.0,>=0.0.44; extra == "runtime-common"
 Requires-Dist: packaging; extra == "runtime-common"
 Requires-Dist: pillow; extra == "runtime-common"
 Requires-Dist: prometheus-client>=0.20.0; extra == "runtime-common"
 Requires-Dist: psutil; extra == "runtime-common"
 Requires-Dist: pydantic; extra == "runtime-common"
 Requires-Dist: python-multipart; extra == "runtime-common"
+Requires-Dist: pyzmq>=25.1.2; extra == "runtime-common"
 Requires-Dist: torchao; extra == "runtime-common"
 Requires-Dist: uvicorn; extra == "runtime-common"
 Requires-Dist: uvloop; extra == "runtime-common"
-Requires-Dist:
-Requires-Dist: outlines<0.1.0,>=0.0.44; extra == "runtime-common"
-Requires-Dist: modelscope; extra == "runtime-common"
+Requires-Dist: xgrammar>=0.1.4; extra == "runtime-common"
 Provides-Extra: srt
 Requires-Dist: sglang[runtime_common]; extra == "srt"
 Requires-Dist: torch; extra == "srt"
-Requires-Dist: vllm
+Requires-Dist: vllm>=0.6.3.post1; extra == "srt"
 Provides-Extra: srt-hip
 Requires-Dist: sglang[runtime_common]; extra == "srt-hip"
 Requires-Dist: torch; extra == "srt-hip"
 Requires-Dist: vllm==0.6.3.dev13; extra == "srt-hip"
 Provides-Extra: srt-xpu
 Requires-Dist: sglang[runtime_common]; extra == "srt-xpu"
+Provides-Extra: srt-hpu
+Requires-Dist: sglang[runtime_common]; extra == "srt-hpu"
 Provides-Extra: openai
 Requires-Dist: openai>=1.0; extra == "openai"
 Requires-Dist: tiktoken; extra == "openai"
@@ -274,6 +277,11 @@ Requires-Dist: sglang[srt_xpu]; extra == "all-xpu"
 Requires-Dist: sglang[openai]; extra == "all-xpu"
 Requires-Dist: sglang[anthropic]; extra == "all-xpu"
 Requires-Dist: sglang[litellm]; extra == "all-xpu"
+Provides-Extra: all-hpu
+Requires-Dist: sglang[srt_hpu]; extra == "all-hpu"
+Requires-Dist: sglang[openai]; extra == "all-hpu"
+Requires-Dist: sglang[anthropic]; extra == "all-hpu"
+Requires-Dist: sglang[litellm]; extra == "all-hpu"
 Provides-Extra: dev
 Requires-Dist: sglang[all]; extra == "dev"
 Requires-Dist: sglang[test]; extra == "dev"
@@ -283,6 +291,9 @@ Requires-Dist: sglang[test]; extra == "dev-hip"
 Provides-Extra: dev-xpu
 Requires-Dist: sglang[all_xpu]; extra == "dev-xpu"
 Requires-Dist: sglang[test]; extra == "dev-xpu"
+Provides-Extra: dev-hpu
+Requires-Dist: sglang[all_hpu]; extra == "dev-hpu"
+Requires-Dist: sglang[test]; extra == "dev-hpu"
 
 <div align="center" id="sglangtop">
 <img src="https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png" alt="logo" width="400" margin="10px"></img>
@@ -321,21 +332,16 @@ SGLang is a fast serving framework for large language models and vision language
 It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.
 The core features include:
 
-- **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, continuous batching, token attention (paged attention), tensor parallelism, FlashInfer kernels, chunked prefill, and quantization (INT4/
+- **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, overhead-free CPU scheduler, continuous batching, token attention (paged attention), tensor parallelism, FlashInfer kernels, chunked prefill, and quantization (FP8/INT4/AWQ/GPTQ).
 - **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
-- **Extensive Model Support**: Supports a wide range of generative models (Llama, Gemma, Mistral, QWen, DeepSeek, LLaVA, etc.), embedding models (e5-mistral, gte) and reward models (Skywork), with easy extensibility for integrating new models.
+- **Extensive Model Support**: Supports a wide range of generative models (Llama, Gemma, Mistral, QWen, DeepSeek, LLaVA, etc.), embedding models (e5-mistral, gte, mcdse) and reward models (Skywork), with easy extensibility for integrating new models.
 - **Active Community**: SGLang is open-source and backed by an active community with industry adoption.
 
 ## Getting Started
-Install SGLang
-
-
-
-## Backend: SGLang Runtime (SRT)
-See [https://sgl-project.github.io/backend/backend.html](https://sgl-project.github.io/backend/backend.html)
-
-## Frontend: Structured Generation Language (SGLang)
-See [https://sgl-project.github.io/frontend/frontend.html](https://sgl-project.github.io/frontend/frontend.html)
+- [Install SGLang](https://sgl-project.github.io/start/install.html)
+- [Send requests](https://sgl-project.github.io/start/send_request.html)
+- [Backend: SGLang Runtime (SRT)](https://sgl-project.github.io/backend/backend.html)
+- [Frontend: Structured Generation Language (SGLang)](https://sgl-project.github.io/frontend/frontend.html)
 
 ## Benchmark And Performance
 Learn more in our release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/), [v0.3 blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)
@@ -343,6 +349,9 @@ Learn more in our release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-s
 ## Roadmap
 [Development Roadmap (2024 Q4)](https://github.com/sgl-project/sglang/issues/1487)
 
-##
+## Adoption and Sponsorship
+The project is supported by (alphabetically): AMD, Baseten, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, NVIDIA, RunPod, Stanford, UC Berkeley, and xAI.
+
+## Acknowledgment and Citation
+We learned from the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).
 Please cite our paper, [SGLang: Efficient Execution of Structured Language Model Programs](https://arxiv.org/abs/2312.07104), if you find the project useful.
-We also learned from the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).
{sglang-0.3.5.post2 → sglang-0.3.6.post1}/README.md

@@ -35,21 +35,16 @@ SGLang is a fast serving framework for large language models and vision language
 It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.
 The core features include:
 
-- **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, continuous batching, token attention (paged attention), tensor parallelism, FlashInfer kernels, chunked prefill, and quantization (INT4/
+- **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, overhead-free CPU scheduler, continuous batching, token attention (paged attention), tensor parallelism, FlashInfer kernels, chunked prefill, and quantization (FP8/INT4/AWQ/GPTQ).
 - **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
-- **Extensive Model Support**: Supports a wide range of generative models (Llama, Gemma, Mistral, QWen, DeepSeek, LLaVA, etc.), embedding models (e5-mistral, gte) and reward models (Skywork), with easy extensibility for integrating new models.
+- **Extensive Model Support**: Supports a wide range of generative models (Llama, Gemma, Mistral, QWen, DeepSeek, LLaVA, etc.), embedding models (e5-mistral, gte, mcdse) and reward models (Skywork), with easy extensibility for integrating new models.
 - **Active Community**: SGLang is open-source and backed by an active community with industry adoption.
 
 ## Getting Started
-Install SGLang
-
-
-
-## Backend: SGLang Runtime (SRT)
-See [https://sgl-project.github.io/backend/backend.html](https://sgl-project.github.io/backend/backend.html)
-
-## Frontend: Structured Generation Language (SGLang)
-See [https://sgl-project.github.io/frontend/frontend.html](https://sgl-project.github.io/frontend/frontend.html)
+- [Install SGLang](https://sgl-project.github.io/start/install.html)
+- [Send requests](https://sgl-project.github.io/start/send_request.html)
+- [Backend: SGLang Runtime (SRT)](https://sgl-project.github.io/backend/backend.html)
+- [Frontend: Structured Generation Language (SGLang)](https://sgl-project.github.io/frontend/frontend.html)
 
 ## Benchmark And Performance
 Learn more in our release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/), [v0.3 blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)
@@ -57,6 +52,9 @@ Learn more in our release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-s
 ## Roadmap
 [Development Roadmap (2024 Q4)](https://github.com/sgl-project/sglang/issues/1487)
 
-##
+## Adoption and Sponsorship
+The project is supported by (alphabetically): AMD, Baseten, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, NVIDIA, RunPod, Stanford, UC Berkeley, and xAI.
+
+## Acknowledgment and Citation
+We learned from the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).
 Please cite our paper, [SGLang: Efficient Execution of Structured Language Model Programs](https://arxiv.org/abs/2312.07104), if you find the project useful.
-We also learned from the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).
{sglang-0.3.5.post2 → sglang-0.3.6.post1}/pyproject.toml

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "sglang"
-version = "0.3.5.post2"
+version = "0.3.6.post1"
 description = "SGLang is yet another fast serving framework for large language models and vision language models."
 readme = "README.md"
 requires-python = ">=3.8"
@@ -16,11 +16,14 @@ classifiers = [
 dependencies = ["requests", "tqdm", "numpy", "IPython"]
 
 [project.optional-dependencies]
-runtime_common = ["aiohttp", "decord", "fastapi",
-    "
-    "
-    "
-
+runtime_common = ["aiohttp", "decord", "fastapi",
+    "hf_transfer", "huggingface_hub", "interegular", "modelscope",
+    "orjson", "outlines>=0.0.44,<0.1.0",
+    "packaging", "pillow", "prometheus-client>=0.20.0",
+    "psutil", "pydantic", "python-multipart",
+    "pyzmq>=25.1.2", "torchao", "uvicorn", "uvloop",
+    "xgrammar>=0.1.4"]
+srt = ["sglang[runtime_common]", "torch", "vllm>=0.6.3.post1"]
 
 # HIP (Heterogeneous-computing Interface for Portability) for AMD
 # => base docker rocm/vllm-dev:20241022, not from public vllm whl
@@ -28,6 +31,9 @@ srt_hip = ["sglang[runtime_common]", "torch", "vllm==0.6.3.dev13"]
 # xpu is not enabled in public vllm and torch whl,
 # need to follow https://docs.vllm.ai/en/latest/getting_started/xpu-installation.html install vllm
 srt_xpu = ["sglang[runtime_common]"]
+# For Intel Gaudi (device: hpu) follow the installation guide
+# https://docs.vllm.ai/en/latest/getting_started/gaudi-installation.html
+srt_hpu = ["sglang[runtime_common]"]
 
 openai = ["openai>=1.0", "tiktoken"]
 anthropic = ["anthropic>=0.20.0"]
@@ -43,9 +49,11 @@ test = [
 all = ["sglang[srt]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
 all_hip = ["sglang[srt_hip]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
 all_xpu = ["sglang[srt_xpu]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
+all_hpu = ["sglang[srt_hpu]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
 dev = ["sglang[all]", "sglang[test]"]
 dev_hip = ["sglang[all_hip]", "sglang[test]"]
 dev_xpu = ["sglang[all_xpu]", "sglang[test]"]
+dev_hpu = ["sglang[all_hpu]", "sglang[test]"]
 
 [project.urls]
 "Homepage" = "https://github.com/sgl-project/sglang"
{sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/__init__.py

@@ -11,7 +11,7 @@ from sglang.api import (
     gen,
     gen_int,
     gen_string,
-
+    get_server_info,
     image,
     select,
     set_default_backend,
@@ -41,7 +41,7 @@ __all__ = [
     "gen",
     "gen_int",
     "gen_string",
-    "
+    "get_server_info",
     "image",
     "select",
     "set_default_backend",
{sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/api.py

@@ -65,7 +65,7 @@ def flush_cache(backend: Optional[BaseBackend] = None):
     return backend.flush_cache()
 
 
-def
+def get_server_info(backend: Optional[BaseBackend] = None):
     backend = backend or global_config.default_backend
     if backend is None:
         return None
@@ -73,7 +73,7 @@ def get_server_args(backend: Optional[BaseBackend] = None):
     # If backend is Runtime
     if hasattr(backend, "endpoint"):
         backend = backend.endpoint
-    return backend.
+    return backend.get_server_info()
 
 
 def gen(
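The frontend helper exported from `sglang/__init__.py` is renamed from `get_server_args` to `get_server_info` in this release. A minimal usage sketch, assuming a locally running SRT server at an illustrative URL (the port and endpoint are assumptions, not part of the diff):

```python
import sglang as sgl

# Assumption: an SGLang server is already running at this address.
backend = sgl.RuntimeEndpoint("http://localhost:30000")
sgl.set_default_backend(backend)

# 0.3.6 name: queries the backend's server information via the endpoint.
info = sgl.get_server_info()
print(info)
```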
sglang-0.3.6.post1/sglang/bench_latency.py

@@ -0,0 +1 @@
+raise ValueError("bench_latency.py has been renamed to bench_one_batch.py")
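Because the new stub raises at module level, any script that still imports or runs the old entry point fails immediately with a pointer to the new name. A small sketch of that behavior:

```python
# The 0.3.6 stub raises ValueError at import time, so stale callers of the
# old module name surface the rename right away.
try:
    import sglang.bench_latency  # old name, now only a redirect stub
except ValueError as err:
    print(err)  # "bench_latency.py has been renamed to bench_one_batch.py"
```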
{sglang-0.3.5.post2 → sglang-0.3.6.post1}/sglang/bench_offline_throughput.py

@@ -1,20 +1,13 @@
 """
-Benchmark the throughput
-This script does not launch a server.
+Benchmark the throughput in the offline mode.
 It accepts server arguments (the same as launch_server.py) and benchmark arguments (the same as bench_serving.py).
 
 # Usage
 ## Sharegpt dataset with default args
-python -m sglang.bench_offline_throughput --model-path meta-llama/Meta-Llama-3.1-8B-Instruct
+python -m sglang.bench_offline_throughput --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --num-prompts 10
 
 ## Random dataset with default args
-python -m sglang.bench_offline_throughput --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --dataset-name random
-
-## Shared prefix dataset with default args
-python -m sglang.bench_offline_throughput --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --dataset-name generated-shared-prefix
-
-## Sharegpt dataset on runtime backend
-python -m sglang.bench_offline_throughput --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --backend runtime
+python -m sglang.bench_offline_throughput --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --dataset-name random --random-input 1024 --random-output 1024
 """
 
 import argparse
@@ -23,7 +16,7 @@ import json
 import logging
 import random
 import time
-from typing import List, Optional, Tuple
+from typing import Dict, List, Optional, Tuple
 
 import numpy as np
 
@@ -55,7 +48,10 @@ class BenchArgs:
     gen_question_len: int = 128
     gen_output_len: int = 256
     disable_ignore_eos: bool = False
+    extra_request_body: Optional[str] = None
     seed: int = 1
+    skip_warmup: bool = False
+    do_not_exit: bool = False
 
     @staticmethod
     def add_cli_args(parser: argparse.ArgumentParser):
@@ -142,7 +138,24 @@ class BenchArgs:
             default=BenchArgs.disable_ignore_eos,
             help="Disable ignore EOS token",
         )
+        parser.add_argument(
+            "--extra-request-body",
+            metavar='{"key1": "value1", "key2": "value2"}',
+            type=str,
+            help="Append given JSON object to the request payload. You can use this to specify"
+            "additional generate params like sampling params.",
+        )
         parser.add_argument("--seed", type=int, default=1, help="The random seed.")
+        parser.add_argument(
+            "--skip-warmup",
+            action="store_true",
+            help="Skip the warmup batches.",
+        )
+        parser.add_argument(
+            "--do-not-exit",
+            action="store_true",
+            help="Do not exit the program. This is useful for nsys profile with --duration and --delay.",
+        )
 
     @classmethod
     def from_cli_args(cls, args: argparse.Namespace):
@@ -155,6 +168,7 @@ def throughput_test_once(
     backend,
     reqs: List[Tuple[str, int, int]],
     ignore_eos: bool,
+    extra_request_body: Dict,
 ):
     measurement_results = {
         "backend": backend_name,
@@ -174,6 +188,7 @@
             "temperature": 0,
             "max_new_tokens": r[2],
             "ignore_eos": ignore_eos,
+            **extra_request_body,
         }
         for r in reqs
     ]
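As the `**extra_request_body` line above shows, the JSON string passed via `--extra-request-body` is parsed once and then unpacked into every per-request sampling-params dict. A minimal sketch of that merge, with illustrative key/value choices that are not taken from the diff:

```python
import json

# Illustrative --extra-request-body value; any extra generate params would do.
extra_request_body = json.loads('{"top_p": 0.9, "frequency_penalty": 0.2}')

# Per-request payload as built in throughput_test_once: dict unpacking lets the
# extra keys extend (or override) the defaults.
sampling_params = {
    "temperature": 0,
    "max_new_tokens": 256,
    "ignore_eos": True,
    **extra_request_body,
}
print(sampling_params)
```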
@@ -227,31 +242,41 @@ def throughput_test(
     random.seed(bench_args.seed)
     np.random.seed(bench_args.seed)
 
+    # Parse args
+    extra_request_body = {}
+    if bench_args.extra_request_body:
+        extra_request_body = json.loads(args.extra_request_body)
+
     # Read dataset
     input_requests = get_dataset(bench_args, tokenizer)
 
     warmup_requests = sample_random_requests(
-        input_len=
-        output_len=
-        num_prompts=
+        input_len=256,
+        output_len=16,
+        num_prompts=16,
         range_ratio=0.8,
         tokenizer=tokenizer,
         dataset_path=bench_args.dataset_path,
     )
 
     # Warm up
-
-
-
-
-
-
+    if not bench_args.skip_warmup:
+        logging.info("\nWarmup...")
+        throughput_test_once(
+            backend_name=bench_args.backend,
+            backend=backend,
+            reqs=warmup_requests,
+            ignore_eos=not bench_args.disable_ignore_eos,
+            extra_request_body=extra_request_body,
+        )
 
+    logging.info("\nBenchmark...")
     result = throughput_test_once(
         backend_name=bench_args.backend,
         backend=backend,
         reqs=input_requests,
         ignore_eos=not bench_args.disable_ignore_eos,
+        extra_request_body=extra_request_body,
     )
 
     if bench_args.result_filename:
@@ -307,3 +332,6 @@ if __name__ == "__main__":
     )
 
     throughput_test(server_args, bench_args)
+
+    while bench_args.do_not_exit:
+        pass
sglang-0.3.5.post2/sglang/bench_latency.py → sglang-0.3.6.post1/sglang/bench_one_batch.py

@@ -1,20 +1,17 @@
 """
-Benchmark the latency of running a single static batch.
+Benchmark the latency of running a single static batch without a server.
+
 This script does not launch a server and uses the low-level APIs.
-It accepts arguments
+It accepts server arguments (the same as launch_server.py) and benchmark arguments (e.g., batch size, input lengths).
 
 # Usage (latency test)
 ## with dummy weights:
-python -m sglang.
+python -m sglang.bench_one_batch --model-path meta-llama/Meta-Llama-3-8B-Instruct --load-format dummy
 ## sweep through multiple data points and store (append) the results in a jsonl file:
-python -m sglang.
-## do some changes, and store the results under a different run_name:
-python -m sglang.bench_latency --model-path meta-llama/Meta-Llama-3-8B-Instruct --batch 1 12 14 --input-len 256 512 --output-len 32 256 --result-filename out.jsonl --run-name after
-## plot the results in series of lines:
-python -m sglang.bench_latency --result-filename out.jsonl --graph-sql="select run_name, batch_size, prefill_throughput from results"
+python -m sglang.bench_one_batch --model-path meta-llama/Meta-Llama-3-8B-Instruct --batch 1 12 14 --input-len 256 512 --output-len 32 256 --run-name test_run
 
 # Usage (correctness test):
-python -m sglang.
+python -m sglang.bench_one_batch --model-path TinyLlama/TinyLlama-1.1B-Chat-v0.4 --correct
 
 ## Reference output (of the correctness test above, can be gpu dependent):
 input_ids=[[1, 450, 7483, 310, 3444, 338], [1, 450, 7483, 310, 278, 3303, 13187, 290, 338], [1, 20628, 338, 263, 6575, 1460, 2462, 322, 306, 763]]
@@ -50,13 +47,10 @@ import itertools
 import json
 import logging
 import multiprocessing
-import os
-import sqlite3
 import time
 from typing import Tuple
 
 import numpy as np
-import pandas as pd
 import torch
 import torch.distributed as dist
 
@@ -77,19 +71,14 @@ from sglang.srt.utils import (
 
 @dataclasses.dataclass
 class BenchArgs:
-    run_name: str = "
+    run_name: str = "default"
     batch_size: Tuple[int] = (1,)
     input_len: Tuple[int] = (1024,)
     output_len: Tuple[int] = (16,)
-    result_filename: str = ""
+    result_filename: str = "result.jsonl"
     correctness_test: bool = False
     # This is only used for correctness test
     cut_len: int = 4
-    # Plotting args
-    graph_sql: str = (
-        "select run_name, batch_size, prefill_throughput from results where run_name='before'"
-    )
-    graph_filename: str = "out.png"
 
     @staticmethod
     def add_cli_args(parser: argparse.ArgumentParser):
@@ -108,11 +97,6 @@ class BenchArgs:
         )
         parser.add_argument("--correctness-test", action="store_true")
         parser.add_argument("--cut-len", type=int, default=BenchArgs.cut_len)
-        # graphing
-        parser.add_argument("--graph-sql", type=str, default=BenchArgs.graph_sql)
-        parser.add_argument(
-            "--graph-filename", type=str, default=BenchArgs.graph_filename
-        )
 
     @classmethod
     def from_cli_args(cls, args: argparse.Namespace):
@@ -220,7 +204,7 @@ def prepare_synthetic_inputs_for_latency_test(batch_size, input_len):
     return reqs
 
 
-@torch.
+@torch.no_grad
 def extend(reqs, model_runner):
     batch = ScheduleBatch.init_new(
         reqs=reqs,
@@ -228,6 +212,7 @@ def extend(reqs, model_runner):
         token_to_kv_pool=model_runner.token_to_kv_pool,
         tree_cache=None,
         model_config=model_runner.model_config,
+        enable_overlap=False,
     )
     batch.prepare_for_extend()
     model_worker_batch = batch.get_model_worker_batch()
@@ -237,7 +222,7 @@
     return next_token_ids, logits_output.next_token_logits, batch
 
 
-@torch.
+@torch.no_grad
 def decode(input_token_ids, batch, model_runner):
     batch.output_ids = input_token_ids
     batch.prepare_for_decode()
@@ -254,6 +239,7 @@ def correctness_test(
     bench_args,
     tp_rank,
 ):
+    # Configure the logger
     configure_logger(server_args, prefix=f" TP{tp_rank}")
     rank_print = print if tp_rank == 0 else lambda *args, **kwargs: None
 
@@ -274,7 +260,7 @@ def correctness_test(
         bench_args, input_ids, reqs, model_runner
     )
 
-    # Extend
+    # Extend (prefill w/ KV cache)
     next_token_ids, next_token_logits, batch = extend(reqs, model_runner)
     rank_print(f"prefill logits (final): {next_token_logits} \n")
 
@@ -286,17 +272,14 @@
     for i in range(len(reqs)):
         output_ids[i].append(next_token_ids_list[i])
 
-    # Print
+    # Print output texts
     for i in range(len(reqs)):
         rank_print(f"========== Prompt {i} ==========")
         rank_print(tokenizer.decode(output_ids[i]), "\n")
 
 
 def synchronize(device):
-
-        torch.cuda.synchronize()
-    elif device == "xpu":
-        torch.xpu.synchronize()
+    torch.get_device_module(device).synchronize()
 
 
 def latency_test_run_once(
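The per-device branching in `synchronize` collapses into a single call: `torch.get_device_module(device)` resolves the device-specific module (such as `torch.cuda` or `torch.xpu`) at runtime. A minimal sketch of the idea, assuming a PyTorch version that ships `torch.get_device_module`:

```python
import torch

def synchronize(device: str) -> None:
    # Resolve the backend module for this device string, then wait for all
    # pending kernels, so one code path covers every accelerator type.
    torch.get_device_module(device).synchronize()

if torch.cuda.is_available():
    synchronize("cuda")  # equivalent to torch.cuda.synchronize()
```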
@@ -352,7 +335,7 @@ def latency_test_run_once(
             f"Decode. latency: {latency:6.5f} s, throughput: {throughput:9.2f} token/s"
         )
 
-    #
+    # Record decode timing from 2nd output
     if output_len > 1:
         med_decode_latency = np.median(decode_latencies)
         med_decode_throughput = batch_size / med_decode_latency
@@ -367,7 +350,7 @@
         f"Total. latency: {tot_latency:6.3f} s, throughput: {throughput:9.2f} token/s"
     )
     measurement_results["total_latency"] = tot_latency
-    measurement_results["
+    measurement_results["overall_throughput"] = throughput
     return measurement_results
 
 
@@ -377,6 +360,7 @@ def latency_test(
     bench_args,
     tp_rank,
 ):
+    # Configure the logger
     configure_logger(server_args, prefix=f" TP{tp_rank}")
     rank_print = print if tp_rank == 0 else lambda *args, **kwargs: None
 
@@ -423,71 +407,9 @@
 
     # Write results in jsonlines format on rank 0.
     if tp_rank == 0 and bench_args.result_filename:
-
-
-
-            f.write_all(result_list)
-
-
-def plot_latency_test(
-    server_args,
-    bench_args,
-    tp_rank,
-):
-    assert tp_rank == 0
-
-    # read the jsonl file and put in sqlite
-    df = pd.read_json(bench_args.result_filename, lines=True)
-    conn = sqlite3.connect(":memory:")
-    cur = conn.cursor()
-
-    # get the columns and their types
-    column_names = list(df.iloc[0].keys())
-    type_dict = {
-        str: "TEXT",
-        np.int64: "INTEGER",
-        np.float64: "FLOAT",
-    }
-    column_types = [type_dict[type(i)] for i in list(df.iloc[0])]
-
-    # create the table
-    cur.execute(
-        f"""
-        CREATE TABLE IF NOT EXISTS results (
-            {", ".join([f"{name} {type}" for name, type in zip(column_names, column_types)])}
-        )
-        """
-    )
-    conn.commit()
-
-    # write the results to DB
-    df.to_sql("results", conn, if_exists="replace", index=False)
-    conn.commit()
-
-    # read it back using sql
-    df = pd.read_sql_query(bench_args.graph_sql, conn)
-    conn.close()
-
-    # plot it and save to a file
-    import matplotlib.pyplot as plt
-
-    assert (
-        len(df.columns) == 3
-    ), f"The sql should have fetched <series, x, y> columns, not {df.columns}"
-    for label in df[df.columns[0]].unique():
-        q = f"{df.columns[0]}=='{label}'"
-        series = df.query(q)
-        plt.plot(series[df.columns[1]], series[df.columns[2]], label=q, marker="o")
-    plt.xlabel(df.columns[1])
-    plt.ylabel(df.columns[2])
-    plt.legend()
-    plt.savefig(bench_args.graph_filename, dpi=300)
-
-    # if in kitty, just dump it to the terminal
-    if os.environ["TERM"] == "xterm-kitty":
-        os.system(
-            f"kitty icat --use-window-size 1,1,600,600 {bench_args.graph_filename}"
-        )
+        with open(bench_args.result_filename, "a") as fout:
+            for result in result_list:
+                fout.write(json.dumps(result) + "\n")
 
 
 def main(server_args, bench_args):
@@ -498,9 +420,6 @@ def main(server_args, bench_args):
             work_func = correctness_test
         else:
            work_func = latency_test
-    elif os.path.isfile(bench_args.result_filename):
-        assert bench_args.graph_filename, "please provide a filename for the graph"
-        work_func = plot_latency_test
    else:
        raise ValueError(
            "Provide --model-path for running the tests or "
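With the pandas/sqlite/matplotlib plotting path removed, the benchmark now only appends one JSON object per run to the results file and leaves post-processing to the caller. A minimal sketch of reading those results back; the file name follows the script's new default (`result.jsonl`), and only the `total_latency` and `overall_throughput` keys are taken from the diff:

```python
import json

# Each line in the results file is one JSON object written by latency_test.
with open("result.jsonl") as fin:
    results = [json.loads(line) for line in fin if line.strip()]

for r in results:
    print(r.get("total_latency"), r.get("overall_throughput"))
```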
sglang-0.3.5.post2/sglang/bench_server_latency.py → sglang-0.3.6.post1/sglang/bench_one_batch_server.py

@@ -1,10 +1,10 @@
 """
-Benchmark the latency of
+Benchmark the latency of running a single batch with a server.
+
 This script launches a server and uses the HTTP interface.
-It accepts arguments
+It accepts server arguments (the same as launch_server.py) and benchmark arguments (e.g., batch size, input lengths).
 
 Usage:
-
 python3 -m sglang.bench_server_latency --model meta-llama/Meta-Llama-3.1-8B --batch-size 1 16 64 --input-len 1024 --output-len 8
 
 python3 -m sglang.bench_server_latency --model None --base-url http://localhost:30000 --batch-size 16 --input-len 1024 --output-len 8