sglang 0.3.6__tar.gz → 0.3.6.post2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sglang-0.3.6 → sglang-0.3.6.post2}/LICENSE +1 -1
- {sglang-0.3.6 → sglang-0.3.6.post2}/PKG-INFO +25 -15
- {sglang-0.3.6 → sglang-0.3.6.post2}/README.md +10 -12
- {sglang-0.3.6 → sglang-0.3.6.post2}/pyproject.toml +9 -4
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/__init__.py +2 -2
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/api.py +2 -2
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/bench_one_batch.py +4 -7
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/bench_one_batch_server.py +2 -2
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/bench_serving.py +75 -26
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/check_env.py +7 -1
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/lang/backend/base_backend.py +1 -1
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/lang/backend/runtime_endpoint.py +2 -2
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/lang/tracer.py +1 -1
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/launch_server.py +0 -3
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/configs/model_config.py +15 -20
- sglang-0.3.6.post2/sglang/srt/constrained/__init__.py +16 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/constrained/base_grammar_backend.py +13 -15
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/constrained/outlines_backend.py +13 -15
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/constrained/outlines_jump_forward.py +13 -15
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/constrained/xgrammar_backend.py +38 -57
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/conversation.py +13 -15
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/hf_transformers_utils.py +13 -15
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/layers/activation.py +13 -13
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/layers/attention/flashinfer_backend.py +14 -7
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/layers/attention/triton_ops/decode_attention.py +51 -55
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/layers/attention/triton_ops/extend_attention.py +16 -16
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/layers/attention/triton_ops/prefill_attention.py +13 -15
- sglang-0.3.6.post2/sglang/srt/layers/custom_op_util.py +25 -0
- sglang-0.3.6.post2/sglang/srt/layers/fused_moe_grok/__init__.py +1 -0
- {sglang-0.3.6/sglang/srt/layers/fused_moe → sglang-0.3.6.post2/sglang/srt/layers/fused_moe_grok}/layer.py +4 -9
- sglang-0.3.6/sglang/srt/layers/fused_moe/patch.py → sglang-0.3.6.post2/sglang/srt/layers/fused_moe_patch.py +5 -0
- sglang-0.3.6.post2/sglang/srt/layers/fused_moe_triton/__init__.py +44 -0
- sglang-0.3.6.post2/sglang/srt/layers/fused_moe_triton/fused_moe.py +861 -0
- sglang-0.3.6.post2/sglang/srt/layers/fused_moe_triton/layer.py +633 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/layers/layernorm.py +13 -15
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/layers/logits_processor.py +13 -15
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/layers/quantization/__init__.py +77 -17
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/layers/radix_attention.py +13 -15
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/layers/rotary_embedding.py +13 -13
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/layers/sampler.py +1 -1
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/lora/lora.py +13 -14
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/lora/lora_config.py +13 -14
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/lora/lora_manager.py +22 -24
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/managers/data_parallel_controller.py +25 -19
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/managers/detokenizer_manager.py +13 -18
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/managers/image_processor.py +6 -9
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/managers/io_struct.py +43 -28
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/managers/schedule_batch.py +92 -27
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/managers/schedule_policy.py +13 -15
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/managers/scheduler.py +94 -72
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/managers/session_controller.py +29 -19
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/managers/tokenizer_manager.py +29 -22
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/managers/tp_worker.py +13 -15
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/managers/tp_worker_overlap_thread.py +13 -15
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/metrics/collector.py +13 -15
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/metrics/func_timer.py +13 -15
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/mm_utils.py +13 -14
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/model_executor/cuda_graph_runner.py +20 -19
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/model_executor/forward_batch_info.py +19 -17
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/model_executor/model_runner.py +42 -30
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/models/chatglm.py +15 -16
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/models/commandr.py +15 -16
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/models/dbrx.py +15 -16
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/models/deepseek.py +15 -15
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/models/deepseek_v2.py +15 -15
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/models/exaone.py +14 -15
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/models/gemma.py +14 -14
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/models/gemma2.py +24 -19
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/models/gemma2_reward.py +13 -14
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/models/gpt_bigcode.py +14 -14
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/models/grok.py +15 -15
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/models/internlm2.py +13 -15
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/models/internlm2_reward.py +13 -14
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/models/llama.py +21 -21
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/models/llama_classification.py +13 -14
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/models/llama_reward.py +13 -14
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/models/llava.py +20 -16
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/models/llavavid.py +13 -15
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/models/minicpm.py +13 -15
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/models/minicpm3.py +13 -15
- sglang-0.3.6.post2/sglang/srt/models/mistral.py +23 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/models/mixtral.py +15 -15
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/models/mixtral_quant.py +14 -14
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/models/olmo.py +21 -19
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/models/olmoe.py +23 -20
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/models/qwen.py +14 -14
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/models/qwen2.py +22 -19
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/models/qwen2_moe.py +17 -18
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/models/stablelm.py +18 -16
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/models/torch_native_llama.py +15 -17
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/models/xverse.py +13 -14
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/models/xverse_moe.py +15 -16
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/models/yivl.py +13 -15
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/openai_api/adapter.py +13 -15
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/openai_api/protocol.py +13 -15
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/sampling/sampling_batch_info.py +4 -1
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/sampling/sampling_params.py +13 -15
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/server.py +60 -34
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/server_args.py +22 -22
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/utils.py +208 -19
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/test/few_shot_gsm8k.py +8 -4
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/test/runners.py +13 -14
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/test/test_utils.py +2 -2
- sglang-0.3.6.post2/sglang/version.py +1 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang.egg-info/PKG-INFO +25 -15
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang.egg-info/SOURCES.txt +7 -4
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang.egg-info/requires.txt +16 -1
- sglang-0.3.6/sglang/srt/constrained/__init__.py +0 -17
- sglang-0.3.6/sglang/srt/layers/custom_op_util.py +0 -26
- sglang-0.3.6/sglang/srt/layers/fused_moe/__init__.py +0 -1
- sglang-0.3.6/sglang/srt/models/mistral.py +0 -25
- sglang-0.3.6/sglang/version.py +0 -1
- {sglang-0.3.6 → sglang-0.3.6.post2}/setup.cfg +0 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/bench_latency.py +0 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/bench_offline_throughput.py +0 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/global_config.py +0 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/lang/__init__.py +0 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/lang/backend/__init__.py +0 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/lang/backend/anthropic.py +0 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/lang/backend/litellm.py +0 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/lang/backend/openai.py +0 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/lang/backend/vertexai.py +0 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/lang/chat_template.py +0 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/lang/choices.py +0 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/lang/compiler.py +0 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/lang/interpreter.py +0 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/lang/ir.py +0 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/launch_server_llavavid.py +0 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/configs/__init__.py +0 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/configs/exaone.py +0 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/configs/qwen2vl.py +0 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/layers/attention/__init__.py +0 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/layers/attention/double_sparsity_backend.py +0 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/layers/attention/triton_backend.py +0 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +0 -0
- {sglang-0.3.6/sglang/srt/layers/fused_moe → sglang-0.3.6.post2/sglang/srt/layers/fused_moe_grok}/fused_moe.py +0 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/layers/linear.py +0 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/layers/pooler.py +0 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/layers/quantization/base_config.py +0 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/layers/torchao_utils.py +0 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/layers/vocab_parallel_embedding.py +0 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/mem_cache/base_prefix_cache.py +0 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/mem_cache/chunk_cache.py +0 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/mem_cache/flush_cache.py +0 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/mem_cache/memory_pool.py +0 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/mem_cache/radix_cache.py +0 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/model_parallel.py +0 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/models/baichuan.py +0 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/models/gpt2.py +0 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/models/llama_embedding.py +0 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/models/mllama.py +0 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/models/phi3_small.py +0 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/models/qwen2_vl.py +0 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/sampling/penaltylib/__init__.py +0 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/sampling/penaltylib/orchestrator.py +0 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +0 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +0 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +0 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +0 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/test/few_shot_gsm8k_engine.py +0 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/test/run_eval.py +0 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/test/simple_eval_common.py +0 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/test/simple_eval_gpqa.py +0 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/test/simple_eval_humaneval.py +0 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/test/simple_eval_math.py +0 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/test/simple_eval_mgsm.py +0 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/test/simple_eval_mmlu.py +0 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/test/srt/sampling/penaltylib/utils.py +0 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/test/test_activation.py +0 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/test/test_layernorm.py +0 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/test/test_programs.py +0 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang/utils.py +0 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang.egg-info/dependency_links.txt +0 -0
- {sglang-0.3.6 → sglang-0.3.6.post2}/sglang.egg-info/top_level.txt +0 -0
{sglang-0.3.6 → sglang-0.3.6.post2}/LICENSE

```diff
@@ -186,7 +186,7 @@
       same "printed page" as the copyright notice for easier
       identification within third-party archives.
 
-   Copyright
+   Copyright 2023-2024 SGLang Team
 
    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
```
{sglang-0.3.6 → sglang-0.3.6.post2}/PKG-INFO

```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.3.6
+Version: 0.3.6.post2
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
         Version 2.0, January 2004
@@ -190,7 +190,7 @@ License: Apache License
        same "printed page" as the copyright notice for easier
        identification within third-party archives.
 
-       Copyright
+       Copyright 2023-2024 SGLang Team
 
        Licensed under the Apache License, Version 2.0 (the "License");
        you may not use this file except in compliance with the License.
@@ -222,6 +222,7 @@ Requires-Dist: fastapi; extra == "runtime-common"
 Requires-Dist: hf_transfer; extra == "runtime-common"
 Requires-Dist: huggingface_hub; extra == "runtime-common"
 Requires-Dist: interegular; extra == "runtime-common"
+Requires-Dist: modelscope; extra == "runtime-common"
 Requires-Dist: orjson; extra == "runtime-common"
 Requires-Dist: outlines<0.1.0,>=0.0.44; extra == "runtime-common"
 Requires-Dist: packaging; extra == "runtime-common"
@@ -234,17 +235,20 @@ Requires-Dist: pyzmq>=25.1.2; extra == "runtime-common"
 Requires-Dist: torchao; extra == "runtime-common"
 Requires-Dist: uvicorn; extra == "runtime-common"
 Requires-Dist: uvloop; extra == "runtime-common"
-Requires-Dist:
+Requires-Dist: xgrammar>=0.1.4; extra == "runtime-common"
 Provides-Extra: srt
 Requires-Dist: sglang[runtime_common]; extra == "srt"
 Requires-Dist: torch; extra == "srt"
 Requires-Dist: vllm>=0.6.3.post1; extra == "srt"
+Requires-Dist: cuda-python; extra == "srt"
 Provides-Extra: srt-hip
 Requires-Dist: sglang[runtime_common]; extra == "srt-hip"
 Requires-Dist: torch; extra == "srt-hip"
 Requires-Dist: vllm==0.6.3.dev13; extra == "srt-hip"
 Provides-Extra: srt-xpu
 Requires-Dist: sglang[runtime_common]; extra == "srt-xpu"
+Provides-Extra: srt-hpu
+Requires-Dist: sglang[runtime_common]; extra == "srt-hpu"
 Provides-Extra: openai
 Requires-Dist: openai>=1.0; extra == "openai"
 Requires-Dist: tiktoken; extra == "openai"
@@ -274,6 +278,11 @@ Requires-Dist: sglang[srt_xpu]; extra == "all-xpu"
 Requires-Dist: sglang[openai]; extra == "all-xpu"
 Requires-Dist: sglang[anthropic]; extra == "all-xpu"
 Requires-Dist: sglang[litellm]; extra == "all-xpu"
+Provides-Extra: all-hpu
+Requires-Dist: sglang[srt_hpu]; extra == "all-hpu"
+Requires-Dist: sglang[openai]; extra == "all-hpu"
+Requires-Dist: sglang[anthropic]; extra == "all-hpu"
+Requires-Dist: sglang[litellm]; extra == "all-hpu"
 Provides-Extra: dev
 Requires-Dist: sglang[all]; extra == "dev"
 Requires-Dist: sglang[test]; extra == "dev"
@@ -283,6 +292,9 @@ Requires-Dist: sglang[test]; extra == "dev-hip"
 Provides-Extra: dev-xpu
 Requires-Dist: sglang[all_xpu]; extra == "dev-xpu"
 Requires-Dist: sglang[test]; extra == "dev-xpu"
+Provides-Extra: dev-hpu
+Requires-Dist: sglang[all_hpu]; extra == "dev-hpu"
+Requires-Dist: sglang[test]; extra == "dev-hpu"
 
 <div align="center" id="sglangtop">
 <img src="https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png" alt="logo" width="400" margin="10px"></img>
@@ -321,21 +333,16 @@ SGLang is a fast serving framework for large language models and vision language
 It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.
 The core features include:
 
-- **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, continuous batching, token attention (paged attention), tensor parallelism, FlashInfer kernels, chunked prefill, and quantization (INT4/
+- **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, overhead-free CPU scheduler, continuous batching, token attention (paged attention), tensor parallelism, FlashInfer kernels, chunked prefill, and quantization (FP8/INT4/AWQ/GPTQ).
 - **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
 - **Extensive Model Support**: Supports a wide range of generative models (Llama, Gemma, Mistral, QWen, DeepSeek, LLaVA, etc.), embedding models (e5-mistral, gte, mcdse) and reward models (Skywork), with easy extensibility for integrating new models.
 - **Active Community**: SGLang is open-source and backed by an active community with industry adoption.
 
 ## Getting Started
-Install SGLang
-
-
-
-## Backend: SGLang Runtime (SRT)
-See [https://sgl-project.github.io/backend/backend.html](https://sgl-project.github.io/backend/backend.html)
-
-## Frontend: Structured Generation Language (SGLang)
-See [https://sgl-project.github.io/frontend/frontend.html](https://sgl-project.github.io/frontend/frontend.html)
+- [Install SGLang](https://sgl-project.github.io/start/install.html)
+- [Send requests](https://sgl-project.github.io/start/send_request.html)
+- [Backend: SGLang Runtime (SRT)](https://sgl-project.github.io/backend/backend.html)
+- [Frontend: Structured Generation Language (SGLang)](https://sgl-project.github.io/frontend/frontend.html)
 
 ## Benchmark And Performance
 Learn more in our release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/), [v0.3 blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)
@@ -343,6 +350,9 @@ Learn more in our release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-s
 ## Roadmap
 [Development Roadmap (2024 Q4)](https://github.com/sgl-project/sglang/issues/1487)
 
-##
+## Adoption and Sponsorship
+The project is supported by (alphabetically): AMD, Baseten, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, NVIDIA, RunPod, Stanford, UC Berkeley, xAI and 01.AI.
+
+## Acknowledgment and Citation
+We learned from the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).
 Please cite our paper, [SGLang: Efficient Execution of Structured Language Model Programs](https://arxiv.org/abs/2312.07104), if you find the project useful.
-We also learned from the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).
```
{sglang-0.3.6 → sglang-0.3.6.post2}/README.md

```diff
@@ -35,21 +35,16 @@ SGLang is a fast serving framework for large language models and vision language
 It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.
 The core features include:
 
-- **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, continuous batching, token attention (paged attention), tensor parallelism, FlashInfer kernels, chunked prefill, and quantization (INT4/
+- **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, overhead-free CPU scheduler, continuous batching, token attention (paged attention), tensor parallelism, FlashInfer kernels, chunked prefill, and quantization (FP8/INT4/AWQ/GPTQ).
 - **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
 - **Extensive Model Support**: Supports a wide range of generative models (Llama, Gemma, Mistral, QWen, DeepSeek, LLaVA, etc.), embedding models (e5-mistral, gte, mcdse) and reward models (Skywork), with easy extensibility for integrating new models.
 - **Active Community**: SGLang is open-source and backed by an active community with industry adoption.
 
 ## Getting Started
-Install SGLang
-
-
-
-## Backend: SGLang Runtime (SRT)
-See [https://sgl-project.github.io/backend/backend.html](https://sgl-project.github.io/backend/backend.html)
-
-## Frontend: Structured Generation Language (SGLang)
-See [https://sgl-project.github.io/frontend/frontend.html](https://sgl-project.github.io/frontend/frontend.html)
+- [Install SGLang](https://sgl-project.github.io/start/install.html)
+- [Send requests](https://sgl-project.github.io/start/send_request.html)
+- [Backend: SGLang Runtime (SRT)](https://sgl-project.github.io/backend/backend.html)
+- [Frontend: Structured Generation Language (SGLang)](https://sgl-project.github.io/frontend/frontend.html)
 
 ## Benchmark And Performance
 Learn more in our release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/), [v0.3 blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)
@@ -57,6 +52,9 @@ Learn more in our release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-s
 ## Roadmap
 [Development Roadmap (2024 Q4)](https://github.com/sgl-project/sglang/issues/1487)
 
-##
+## Adoption and Sponsorship
+The project is supported by (alphabetically): AMD, Baseten, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, NVIDIA, RunPod, Stanford, UC Berkeley, xAI and 01.AI.
+
+## Acknowledgment and Citation
+We learned from the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).
 Please cite our paper, [SGLang: Efficient Execution of Structured Language Model Programs](https://arxiv.org/abs/2312.07104), if you find the project useful.
-We also learned from the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).
```
{sglang-0.3.6 → sglang-0.3.6.post2}/pyproject.toml

```diff
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "sglang"
-version = "0.3.6"
+version = "0.3.6.post2"
 description = "SGLang is yet another fast serving framework for large language models and vision language models."
 readme = "README.md"
 requires-python = ">=3.8"
@@ -17,13 +17,13 @@ dependencies = ["requests", "tqdm", "numpy", "IPython"]
 
 [project.optional-dependencies]
 runtime_common = ["aiohttp", "decord", "fastapi",
-    "hf_transfer", "huggingface_hub", "interegular",
+    "hf_transfer", "huggingface_hub", "interegular", "modelscope",
     "orjson", "outlines>=0.0.44,<0.1.0",
     "packaging", "pillow", "prometheus-client>=0.20.0",
     "psutil", "pydantic", "python-multipart",
     "pyzmq>=25.1.2", "torchao", "uvicorn", "uvloop",
-    "
-srt = ["sglang[runtime_common]", "torch", "vllm>=0.6.3.post1"]
+    "xgrammar>=0.1.4"]
+srt = ["sglang[runtime_common]", "torch", "vllm>=0.6.3.post1", "cuda-python"]
 
 # HIP (Heterogeneous-computing Interface for Portability) for AMD
 # => base docker rocm/vllm-dev:20241022, not from public vllm whl
@@ -31,6 +31,9 @@ srt_hip = ["sglang[runtime_common]", "torch", "vllm==0.6.3.dev13"]
 # xpu is not enabled in public vllm and torch whl,
 # need to follow https://docs.vllm.ai/en/latest/getting_started/xpu-installation.html install vllm
 srt_xpu = ["sglang[runtime_common]"]
+#For Intel Gaudi(device : hpu) follow the installation guide
+#https://docs.vllm.ai/en/latest/getting_started/gaudi-installation.html
+srt_hpu = ["sglang[runtime_common]"]
 
 openai = ["openai>=1.0", "tiktoken"]
 anthropic = ["anthropic>=0.20.0"]
@@ -46,9 +49,11 @@ test = [
 all = ["sglang[srt]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
 all_hip = ["sglang[srt_hip]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
 all_xpu = ["sglang[srt_xpu]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
+all_hpu = ["sglang[srt_hpu]", "sglang[openai]", "sglang[anthropic]", "sglang[litellm]"]
 dev = ["sglang[all]", "sglang[test]"]
 dev_hip = ["sglang[all_hip]", "sglang[test]"]
 dev_xpu = ["sglang[all_xpu]", "sglang[test]"]
+dev_hpu = ["sglang[all_hpu]", "sglang[test]"]
 
 [project.urls]
 "Homepage" = "https://github.com/sgl-project/sglang"
```
{sglang-0.3.6 → sglang-0.3.6.post2}/sglang/__init__.py

```diff
@@ -11,7 +11,7 @@ from sglang.api import (
     gen,
     gen_int,
     gen_string,
-
+    get_server_info,
     image,
     select,
     set_default_backend,
@@ -41,7 +41,7 @@ __all__ = [
     "gen",
     "gen_int",
     "gen_string",
-    "
+    "get_server_info",
     "image",
     "select",
     "set_default_backend",
```
{sglang-0.3.6 → sglang-0.3.6.post2}/sglang/api.py

```diff
@@ -65,7 +65,7 @@ def flush_cache(backend: Optional[BaseBackend] = None):
     return backend.flush_cache()
 
 
-def
+def get_server_info(backend: Optional[BaseBackend] = None):
     backend = backend or global_config.default_backend
     if backend is None:
         return None
@@ -73,7 +73,7 @@ def get_server_args(backend: Optional[BaseBackend] = None):
     # If backend is Runtime
     if hasattr(backend, "endpoint"):
         backend = backend.endpoint
-    return backend.
+    return backend.get_server_info()
 
 
 def gen(
```
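Together with the `__init__.py` hunks above, this renames the frontend helper `get_server_args` to `get_server_info`, matching the endpoint name used by `RuntimeEndpoint` further below. A minimal usage sketch; the local URL and an already-running server are assumptions, not part of the diff:

```python
# Hypothetical usage of the renamed helper against a locally running server
# (e.g. started with `python3 -m sglang.launch_server --model-path <model>`).
import sglang as sgl

sgl.set_default_backend(sgl.RuntimeEndpoint("http://localhost:30000"))

# 0.3.6 exposed this as sgl.get_server_args(); 0.3.6.post2 renames it.
info = sgl.get_server_info()
print(info)
```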
{sglang-0.3.6 → sglang-0.3.6.post2}/sglang/bench_one_batch.py

```diff
@@ -212,6 +212,7 @@ def extend(reqs, model_runner):
         token_to_kv_pool=model_runner.token_to_kv_pool,
         tree_cache=None,
         model_config=model_runner.model_config,
+        enable_overlap=False,
     )
     batch.prepare_for_extend()
     model_worker_batch = batch.get_model_worker_batch()
@@ -278,10 +279,7 @@ def correctness_test(
 
 
 def synchronize(device):
-
-        torch.cuda.synchronize()
-    elif device == "xpu":
-        torch.xpu.synchronize()
+    torch.get_device_module(device).synchronize()
 
 
 def latency_test_run_once(
@@ -468,7 +466,6 @@ if __name__ == "__main__":
 
     try:
         main(server_args, bench_args)
-    except Exception as e:
-        raise e
     finally:
-
+        if server_args.tp_size != 1:
+            kill_child_process()
```
{sglang-0.3.6 → sglang-0.3.6.post2}/sglang/bench_one_batch_server.py

```diff
@@ -5,9 +5,9 @@ This script launches a server and uses the HTTP interface.
 It accepts server arguments (the same as launch_server.py) and benchmark arguments (e.g., batch size, input lengths).
 
 Usage:
-python3 -m sglang.
+python3 -m sglang.bench_one_batch_server --model meta-llama/Meta-Llama-3.1-8B --batch-size 1 16 64 --input-len 1024 --output-len 8
 
-python3 -m sglang.
+python3 -m sglang.bench_one_batch_server --model None --base-url http://localhost:30000 --batch-size 16 --input-len 1024 --output-len 8
 """
 
 import argparse
```
{sglang-0.3.6 → sglang-0.3.6.post2}/sglang/bench_serving.py

```diff
@@ -25,6 +25,7 @@ import warnings
 from argparse import ArgumentParser
 from dataclasses import dataclass, field
 from datetime import datetime
+from pathlib import Path
 from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple, Union
 
 import aiohttp
@@ -407,7 +408,7 @@ async def async_request_profile(api_url: str) -> RequestFuncOutput:
 
 
 def get_model(pretrained_model_name_or_path: str) -> str:
-    if os.getenv("SGLANG_USE_MODELSCOPE", "
+    if os.getenv("SGLANG_USE_MODELSCOPE", "false").lower() == "true":
         import huggingface_hub.constants
         from modelscope import snapshot_download
 
```
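With `modelscope` now a `runtime_common` dependency, `get_model` reads `SGLANG_USE_MODELSCOPE` as a case-insensitive boolean string. A hedged sketch of how the switch might be exercised; the model id below is an illustrative assumption and triggers a real download when the flag is set:

```python
# Illustrative only: opt into ModelScope model resolution before calling get_model.
import os

os.environ["SGLANG_USE_MODELSCOPE"] = "true"  # compared with .lower() == "true"

from sglang.bench_serving import get_model

# Example model id (assumption); with the flag set this resolves to a local
# snapshot path, otherwise get_model presumably returns the name unchanged.
local_path = get_model("qwen/Qwen2-7B-Instruct")
print(local_path)
```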
{sglang-0.3.6 → sglang-0.3.6.post2}/sglang/bench_serving.py (continued)

```diff
@@ -693,6 +694,19 @@ def gen_prompt(tokenizer, token_num):
     return tokenizer.decode(selected_tokens)
 
 
+def get_gen_prefix_cache_path(args, tokenizer):
+    """Create cache directory under ~/.cache/sglang/benchmark"""
+    cache_dir = Path.home() / ".cache" / "sglang" / "benchmark"
+
+    # Create a unique cache filename based on the generation parameters
+    cache_key = (
+        f"gen_prefix_{args.gen_num_groups}_{args.gen_prompts_per_group}_"
+        f"{args.gen_system_prompt_len}_{args.gen_question_len}_{args.gen_output_len}_"
+        f"{tokenizer.__class__.__name__}.pkl"
+    )
+    return cache_dir / cache_key
+
+
 def sample_generated_shared_prefix_requests(
     num_groups: int,
     prompts_per_group: int,
@@ -701,12 +715,17 @@ def sample_generated_shared_prefix_requests(
     output_len: int,
     tokenizer: PreTrainedTokenizerBase,
 ) -> List[Tuple[str, int, int]]:
-
-
-
+    """Generate benchmark requests with shared system prompts using random tokens and caching."""
+    cache_path = get_gen_prefix_cache_path(args, tokenizer)
+
+    # Try to load from cache first
+    if cache_path.exists():
+        print(f"\nLoading cached generated input data from {cache_path}")
+        with open(cache_path, "rb") as f:
             return pickle.load(f)
 
-    "
+    print("\nGenerating new input data...")
+
     # Generate system prompts for each group
     system_prompts = []
     for _ in range(num_groups):
@@ -719,17 +738,16 @@
         question = gen_prompt(tokenizer, question_len)
         questions.append(question)
 
-    # Shuffle questions
-    random.shuffle(questions)
-
     # Combine system prompts with questions
     input_requests = []
     total_input_tokens = 0
     total_output_tokens = 0
 
-    for group_idx in range(num_groups):
+    for group_idx in tqdm(range(num_groups), desc="Generating system prompt"):
         system_prompt = system_prompts[group_idx]
-        for prompt_idx in
+        for prompt_idx in tqdm(
+            range(prompts_per_group), desc="Generating questions", leave=False
+        ):
             question = questions[group_idx * prompts_per_group + prompt_idx]
             full_prompt = f"{system_prompt}\n\n{question}"
             prompt_len = len(tokenizer.encode(full_prompt))
@@ -738,6 +756,10 @@
             total_input_tokens += prompt_len
             total_output_tokens += output_len
 
+    # Shuffle questions
+    random.shuffle(input_requests)
+
+    # Print statistics
     print(f"\nGenerated shared prefix dataset statistics:")
     print(f"Number of groups: {num_groups}")
     print(f"Prompts per group: {prompts_per_group}")
@@ -750,11 +772,12 @@
     print(
         f"Average question length: {sum(len(tokenizer.encode(q)) for q in questions) / len(questions):.1f} tokens\n"
    )
-
-
-
-
-
+
+    # Save to cache
+    cache_path.parent.mkdir(parents=True, exist_ok=True)
+    print(f"Caching generated input data to {cache_path}")
+    with open(cache_path, "wb") as f:
+        pickle.dump(input_requests, f)
 
     return input_requests
 
@@ -859,6 +882,7 @@ async def benchmark(
     tokenizer: PreTrainedTokenizerBase,
     input_requests: List[Tuple[str, int, int]],
     request_rate: float,
+    max_concurrency: Optional[int],
     disable_tqdm: bool,
     extra_request_body: Dict[str, Any],
     profile: bool,
@@ -868,6 +892,15 @@
     else:
         raise ValueError(f"Unknown backend: {backend}")
 
+    # From https://github.com/vllm-project/vllm/pull/9390
+    semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else None
+
+    async def limited_request_func(request_func_input, pbar):
+        if semaphore is None:
+            return await request_func(request_func_input=request_func_input, pbar=pbar)
+        async with semaphore:
+            return await request_func(request_func_input=request_func_input, pbar=pbar)
+
     print("Starting initial single prompt test run...")
     test_prompt, test_prompt_len, test_output_len = input_requests[0]
     test_input = RequestFuncInput(
```
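The new `--max-concurrency` option is implemented as an optional `asyncio.Semaphore` wrapped around each request coroutine, so request arrival still follows `--request-rate` while the number of in-flight requests is capped. A self-contained sketch of the same limiter pattern; the worker coroutine and numbers are illustrative, not taken from the benchmark:

```python
# Minimal, self-contained sketch of the semaphore-based concurrency cap used above.
import asyncio
from typing import List, Optional


async def fake_request(i: int) -> int:
    # Stand-in for an HTTP request; sleeps instead of talking to a server.
    await asyncio.sleep(0.1)
    return i


async def run_all(n: int, max_concurrency: Optional[int]) -> List[int]:
    semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else None

    async def limited(i: int) -> int:
        if semaphore is None:
            return await fake_request(i)
        async with semaphore:  # at most max_concurrency requests in flight
            return await fake_request(i)

    tasks = [asyncio.create_task(limited(i)) for i in range(n)]
    return await asyncio.gather(*tasks)


if __name__ == "__main__":
    print(len(asyncio.run(run_all(100, max_concurrency=8))))
```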
{sglang-0.3.6 → sglang-0.3.6.post2}/sglang/bench_serving.py (continued)

```diff
@@ -913,7 +946,7 @@
         )
         tasks.append(
             asyncio.create_task(
-
+                limited_request_func(request_func_input=request_func_input, pbar=pbar)
             )
         )
     outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks)
@@ -940,6 +973,12 @@
     print("\n{s:{c}^{n}}".format(s=" Serving Benchmark Result ", n=50, c="="))
     print("{:<40} {:<10}".format("Backend:", backend))
     print("{:<40} {:<10}".format("Traffic request rate:", request_rate))
+    print(
+        "{:<40} {:<10}".format(
+            "Max reqeuest concurrency:",
+            max_concurrency if max_concurrency else "not set",
+        )
+    )
     print("{:<40} {:<10}".format("Successful requests:", metrics.completed))
     print("{:<40} {:<10.2f}".format("Benchmark duration (s):", benchmark_duration))
     print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input))
@@ -1003,6 +1042,7 @@
         "backend": args.backend,
         "dataset_name": args.dataset_name,
         "request_rate": request_rate,
+        "max_concurrency": max_concurrency,
         "total_input_tokens": metrics.total_input,
         "total_output_tokens": metrics.total_output,
         "total_output_tokens_retokenized": metrics.total_output_retokenized,
@@ -1090,6 +1130,10 @@ def run_benchmark(args_: argparse.Namespace):
     global args
     args = args_
 
+    # Set default value for max_concurrency if not present
+    if not hasattr(args, "max_concurrency"):
+        args.max_concurrency = None
+
     # Set global environments
     set_ulimit()
     random.seed(args.seed)
@@ -1201,6 +1245,7 @@
             tokenizer=tokenizer,
             input_requests=input_requests,
             request_rate=args.request_rate,
+            max_concurrency=args.max_concurrency,
             disable_tqdm=args.disable_tqdm,
             extra_request_body=extra_request_body,
             profile=args.profile,
@@ -1220,6 +1265,7 @@
             tokenizer=tokenizer,
             input_requests=input_requests,
             request_rate=rate,
+            max_concurrency=args.max_concurrency,
             disable_tqdm=args.disable_tqdm,
             extra_request_body=extra_request_body,
             profile=args.profile,
@@ -1319,6 +1365,19 @@
         help="Number of requests per second. If this is inf, then all the requests are sent at time 0. "
         "Otherwise, we use Poisson process to synthesize the request arrival times. Default is inf.",
     )
+    parser.add_argument(
+        "--max-concurrency",
+        type=int,
+        default=None,
+        help="Maximum number of concurrent requests. This can be used "
+        "to help simulate an environment where a higher level component "
+        "is enforcing a maximum number of concurrent requests. While the "
+        "--request-rate argument controls the rate at which requests are "
+        "initiated, this argument will control how many are actually allowed "
+        "to execute at a time. This means that when used in combination, the "
+        "actual request rate may be lower than specified with --request-rate, "
+        "if the server is not processing requests fast enough to keep up.",
+    )
     parser.add_argument("--seed", type=int, default=1, help="The random seed.")
     parser.add_argument(
         "--multi",
@@ -1386,16 +1445,6 @@
         default=256,
         help="Target length in tokens for outputs in generated-shared-prefix dataset",
     )
-    parser.add_argument(
-        "--generated-input-save-path",
-        type=str,
-        help="Path to save generated input data",
-    )
-    parser.add_argument(
-        "--generated-input-path",
-        type=str,
-        help="Path to load previously generated input data",
-    )
     parser.add_argument(
         "--profile",
         action="store_true",
```
|
|
22
22
|
"hf_transfer",
|
23
23
|
"huggingface_hub",
|
24
24
|
"interegular",
|
25
|
+
"modelscope",
|
26
|
+
"orjson",
|
27
|
+
"outlines",
|
28
|
+
"packaging",
|
25
29
|
"psutil",
|
26
30
|
"pydantic",
|
27
31
|
"multipart",
|
28
32
|
"zmq",
|
33
|
+
"torchao",
|
29
34
|
"uvicorn",
|
30
35
|
"uvloop",
|
31
36
|
"vllm",
|
32
|
-
"
|
37
|
+
"xgrammar",
|
33
38
|
"openai",
|
34
39
|
"tiktoken",
|
35
40
|
"anthropic",
|
36
41
|
"litellm",
|
42
|
+
"decord",
|
37
43
|
]
|
38
44
|
|
39
45
|
|
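`PACKAGE_LIST` gains the new runtime dependencies (`modelscope`, `orjson`, `outlines`, `packaging`, `torchao`, `xgrammar`, `decord`) so their installed versions show up in the environment report. A rough sketch of how such a version report can be produced; this is not the actual `sglang/check_env.py` implementation, which collects considerably more (CUDA, GPU topology, etc.):

```python
# Hedged sketch: print installed versions for a list of packages.
from importlib import metadata

PACKAGE_LIST = ["torch", "modelscope", "orjson", "outlines", "xgrammar", "decord"]

for name in PACKAGE_LIST:
    try:
        print(f"{name}: {metadata.version(name)}")
    except metadata.PackageNotFoundError:
        print(f"{name}: not installed")
```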
{sglang-0.3.6 → sglang-0.3.6.post2}/sglang/lang/backend/runtime_endpoint.py

```diff
@@ -58,9 +58,9 @@ class RuntimeEndpoint(BaseBackend):
         )
         self._assert_success(res)
 
-    def
+    def get_server_info(self):
         res = http_request(
-            self.base_url + "/
+            self.base_url + "/get_server_info",
             api_key=self.api_key,
             verify=self.verify,
         )
```
{sglang-0.3.6 → sglang-0.3.6.post2}/sglang/launch_server.py

```diff
@@ -1,6 +1,5 @@
 """Launch the inference server."""
 
-import os
 import sys
 
 from sglang.srt.server import launch_server
@@ -12,7 +11,5 @@ if __name__ == "__main__":
 
     try:
         launch_server(server_args)
-    except Exception as e:
-        raise e
     finally:
         kill_child_process()
```
{sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/configs/model_config.py

```diff
@@ -1,27 +1,26 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
-"""
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
 
 import json
 import logging
-import os
 from enum import IntEnum, auto
 from typing import List, Optional
 
 from transformers import PretrainedConfig
 
 from sglang.srt.hf_transformers_utils import get_config, get_context_length
+from sglang.srt.utils import get_bool_env_var
 
 logger = logging.getLogger(__name__)
 
```
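`model_config.py` now imports `get_bool_env_var` from `sglang.srt.utils` (part of the +208 lines there) and uses it in the context-length check below. A sketch of what such a helper typically looks like; the actual signature and accepted values in `sglang/srt/utils.py` may differ:

```python
# Hedged sketch of a boolean environment-variable helper; not copied from sglang.
import os


def get_bool_env_var(name: str, default: str = "false") -> bool:
    return os.getenv(name, default).lower() in ("true", "1")


# Mirrors how the hunk below gates the context-length override.
if get_bool_env_var("SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN"):
    print("context_length override allowed")
```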
{sglang-0.3.6 → sglang-0.3.6.post2}/sglang/srt/configs/model_config.py (continued)

```diff
@@ -60,13 +59,9 @@ class ModelConfig:
 
         # Derive context length
         derived_context_len = get_context_length(self.hf_text_config)
-        allow_long_context = os.environ.get(
-            "SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN", None
-        )
-
         if context_length is not None:
             if context_length > derived_context_len:
-                if
+                if get_bool_env_var("SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN"):
                     logger.warning(
                         f"Warning: User-specified context_length ({context_length}) is greater than the derived context_length ({derived_context_len}). "
                         f"This may lead to incorrect model outputs or CUDA errors."
```
|