sglang 0.4.1.post1__py3-none-any.whl → 0.4.1.post3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_offline_throughput.py +1 -0
- sglang/srt/configs/model_config.py +11 -2
- sglang/srt/layers/attention/__init__.py +0 -1
- sglang/srt/layers/attention/flashinfer_backend.py +54 -41
- sglang/srt/layers/logits_processor.py +30 -2
- sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +218 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3072,device_name=NVIDIA_H100_80GB_HBM3.json +218 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=1,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1344,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1344,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=14336,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +218 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=2688,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=2688,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3072,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3072,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3200,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +218 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=6400,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a16.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=int8_w8a16.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=16,N=800,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +130 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=1280,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=2560,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=320,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_A800-SXM4-80GB.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=64,N=640,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json +200 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +138 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json +200 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +200 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-40GB.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_GeForce_RTX_4090,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=NVIDIA_L40S.json +173 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +178 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json +200 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +175 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8.json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +46 -26
- sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=3072,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=36864,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=36864,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H100_80GB_HBM3,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=NVIDIA_H200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/fp8.py +42 -2
- sglang/srt/layers/quantization/fp8_kernel.py +77 -18
- sglang/srt/layers/quantization/fp8_utils.py +8 -2
- sglang/srt/managers/detokenizer_manager.py +2 -0
- sglang/srt/managers/io_struct.py +40 -9
- sglang/srt/managers/schedule_batch.py +22 -15
- sglang/srt/managers/scheduler.py +69 -21
- sglang/srt/managers/session_controller.py +102 -27
- sglang/srt/managers/tokenizer_manager.py +48 -10
- sglang/srt/managers/tp_worker.py +7 -0
- sglang/srt/managers/tp_worker_overlap_thread.py +5 -0
- sglang/srt/model_executor/forward_batch_info.py +42 -3
- sglang/srt/model_executor/model_runner.py +4 -0
- sglang/srt/models/llama.py +11 -0
- sglang/srt/models/llama_eagle.py +132 -0
- sglang/srt/openai_api/adapter.py +60 -2
- sglang/srt/openai_api/protocol.py +48 -0
- sglang/srt/server.py +26 -3
- sglang/srt/server_args.py +24 -30
- sglang/srt/speculative/spec_info.py +19 -0
- sglang/srt/utils.py +62 -0
- sglang/version.py +1 -1
- {sglang-0.4.1.post1.dist-info → sglang-0.4.1.post3.dist-info}/METADATA +3 -3
- sglang-0.4.1.post3.dist-info/RECORD +305 -0
- sglang-0.4.1.post1.dist-info/RECORD +0 -195
- {sglang-0.4.1.post1.dist-info → sglang-0.4.1.post3.dist-info}/LICENSE +0 -0
- {sglang-0.4.1.post1.dist-info → sglang-0.4.1.post3.dist-info}/WHEEL +0 -0
- {sglang-0.4.1.post1.dist-info → sglang-0.4.1.post3.dist-info}/top_level.txt +0 -0
sglang/srt/model_executor/forward_batch_info.py CHANGED
@@ -45,6 +45,7 @@ if TYPE_CHECKING:
     from sglang.srt.mem_cache.memory_pool import BaseTokenToKVPool, ReqToTokenPool
     from sglang.srt.model_executor.model_runner import ModelRunner
     from sglang.srt.sampling.sampling_batch_info import SamplingBatchInfo
+    from sglang.srt.speculative.spec_info import SpecInfo, SpeculativeAlgorithm


 class ForwardMode(IntEnum):
@@ -59,6 +60,11 @@ class ForwardMode(IntEnum):
     # No sequence to forward. For data parallel attention, some workers wil be IDLE if no sequence are allocated.
     IDLE = auto()

+    # Used in speculative decoding: verify a batch in the target model.
+    TARGET_VERIFY = auto()
+    # Used in speculative decoding: extend a batch in the draft model.
+    DRAFT_EXTEND = auto()
+
     # A dummy first batch to start the pipeline for overlap scheduler.
     # It is now used for triggering the sampling_info_done event for the first prefill batch.
     DUMMY_FIRST = auto()
@@ -67,7 +73,12 @@ class ForwardMode(IntEnum):
         return self == ForwardMode.PREFILL

     def is_extend(self):
-        return self == ForwardMode.EXTEND or self == ForwardMode.MIXED
+        return (
+            self == ForwardMode.EXTEND
+            or self == ForwardMode.MIXED
+            or self == ForwardMode.DRAFT_EXTEND
+            or self == self.TARGET_VERIFY
+        )

     def is_decode(self):
         return self == ForwardMode.DECODE
@@ -78,6 +89,15 @@ class ForwardMode(IntEnum):
     def is_idle(self):
         return self == ForwardMode.IDLE

+    def is_target_verify(self):
+        return self == ForwardMode.TARGET_VERIFY
+
+    def is_draft_extend(self):
+        return self == ForwardMode.DRAFT_EXTEND
+
+    def is_cuda_graph(self):
+        return self in (ForwardMode.DECODE, ForwardMode.TARGET_VERIFY)
+
     def is_dummy_first(self):
         return self == ForwardMode.DUMMY_FIRST

@@ -141,14 +161,18 @@ class ForwardBatch:
     token_to_kv_pool: BaseTokenToKVPool = None
     attn_backend: AttentionBackend = None

-    #
-
+    # Speculative decoding
+    spec_info: SpecInfo = None
+    spec_algorithm: SpeculativeAlgorithm = None

     # For DP attention
     global_num_tokens: Optional[List[int]] = None
     gathered_buffer: Optional[torch.Tensor] = None
     can_run_dp_cuda_graph: bool = False

+    # For Qwen2-VL
+    mrope_positions: torch.Tensor = None
+
     def compute_mrope_positions(
         self, model_runner: ModelRunner, batch: ModelWorkerBatch
     ):
@@ -351,3 +375,18 @@ def compute_position_torch(
     extend_start_loc = torch.zeros_like(extend_seq_lens)
     extend_start_loc[1:] = torch.cumsum(extend_seq_lens[:-1], dim=0)
     return positions.to(torch.int64), extend_start_loc
+
+
+class CaptureHiddenMode(IntEnum):
+    NULL = auto()
+    FULL = auto()
+    LAST = auto()
+
+    def need_capture(self):
+        return self != CaptureHiddenMode.NULL
+
+    def is_full(self):
+        return self == CaptureHiddenMode.FULL
+
+    def is_last(self):
+        return self == CaptureHiddenMode.LAST
sglang/srt/model_executor/model_runner.py CHANGED
@@ -429,6 +429,10 @@ class ModelRunner:
         logger.error(error_msg)
         return False, error_msg

+    def update_weights_from_tensor(self, name, tensor: torch.Tensor):
+        self.model.load_weights([(name, tensor)])
+        return True, "Success"  # TODO error handling
+
     def get_weights_by_name(
         self, name: str, truncate_size: int = 100
     ) -> Optional[torch.Tensor]:
sglang/srt/models/llama.py CHANGED
@@ -516,6 +516,17 @@ class LlamaForCausalLM(nn.Module):
         )
         return None

+    def get_embed_and_head(self):
+        return self.model.embed_tokens.weight, self.lm_head.weight
+
+    def set_embed_and_head(self, embed, head):
+        del self.model.embed_tokens.weight
+        del self.lm_head.weight
+        self.model.embed_tokens.weight = embed
+        self.lm_head.weight = head
+        torch.cuda.empty_cache()
+        torch.cuda.synchronize()
+

 class Phi3ForCausalLM(LlamaForCausalLM):
     pass
sglang/srt/models/llama_eagle.py ADDED
@@ -0,0 +1,132 @@
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+# Adapted from
+# https://github.com/SafeAILab/EAGLE/blob/main/eagle/model/cnets.py
+"""Inference-only LLaMA-EAGLE model compatible with HuggingFace weights."""
+
+from typing import Iterable, Optional, Tuple
+
+import torch
+from torch import nn
+from transformers import LlamaConfig
+
+from sglang.srt.layers.logits_processor import LogitsProcessor
+from sglang.srt.layers.quantization.base_config import QuantizationConfig
+from sglang.srt.layers.vocab_parallel_embedding import (
+    ParallelLMHead,
+    VocabParallelEmbedding,
+)
+from sglang.srt.model_executor.forward_batch_info import ForwardBatch
+from sglang.srt.models.llama import LlamaDecoderLayer, LlamaForCausalLM
+
+
+class LlamaDecoderLayer(LlamaDecoderLayer):
+    def __init__(
+        self,
+        config: LlamaConfig,
+        layer_id: int = 0,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
+    ) -> None:
+        super().__init__(config, layer_id, quant_config, prefix)
+
+        # Skip the input_layernorm
+        # https://github.com/SafeAILab/EAGLE/blob/35c78f6cdc19a73e05cf5c330b4c358dad970c6a/eagle/model/cnets.py#L427
+        if layer_id == 0:
+            del self.input_layernorm
+            setattr(self, "input_layernorm", lambda x: x)
+
+
+class LlamaModel(nn.Module):
+    def __init__(
+        self,
+        config: LlamaConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+    ) -> None:
+        super().__init__()
+        self.config = config
+        self.vocab_size = config.vocab_size
+        self.embed_tokens = VocabParallelEmbedding(
+            config.vocab_size,
+            config.hidden_size,
+        )
+        self.layers = nn.ModuleList(
+            [
+                LlamaDecoderLayer(
+                    config, i, quant_config=quant_config, prefix=f"model.layers.{i}"
+                )
+                for i in range(config.num_hidden_layers)
+            ]
+        )
+        self.fc = torch.nn.Linear(config.hidden_size * 2, config.hidden_size)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        forward_batch: ForwardBatch,
+        input_embeds: torch.Tensor = None,
+    ) -> torch.Tensor:
+        if input_embeds is None:
+            hidden_states = self.embed_tokens(input_ids)
+        else:
+            hidden_states = input_embeds
+
+        hidden_states = self.fc(
+            torch.cat((hidden_states, forward_batch.spec_info.hidden_states), dim=-1)
+        )
+
+        residual = None
+        for i in range(len(self.layers)):
+            layer = self.layers[i]
+            hidden_states, residual = layer(
+                positions,
+                hidden_states,
+                forward_batch,
+                residual,
+            )
+        return hidden_states + residual
+
+
+class LlamaForCausalLMEagle(LlamaForCausalLM):
+    def __init__(
+        self,
+        config: LlamaConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        cache_config=None,
+    ) -> None:
+        nn.Module.__init__(self)
+        self.config = config
+        self.quant_config = quant_config
+        self.model = LlamaModel(config, quant_config=quant_config)
+        # Llama 3.2 1B Instruct set tie_word_embeddings to True
+        # Llama 3.1 8B Instruct set tie_word_embeddings to False
+        if self.config.tie_word_embeddings:
+            self.lm_head = self.model.embed_tokens
+        else:
+            self.lm_head = ParallelLMHead(
+                config.vocab_size, config.hidden_size, quant_config=quant_config
+            )
+        self.logits_processor = LogitsProcessor(config)
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        for name, loaded_weight in weights:
+            if "lm_head" not in name:
+                name = "model." + name
+            super().load_weights([(name, loaded_weight)])
+
+
+EntryClass = [LlamaForCausalLMEagle]
sglang/srt/openai_api/adapter.py CHANGED
@@ -65,10 +65,13 @@ from sglang.srt.openai_api.protocol import (
     FileDeleteResponse,
     FileRequest,
     FileResponse,
+    FunctionResponse,
     LogProbs,
+    ToolCall,
     TopLogprob,
     UsageInfo,
 )
+from sglang.srt.utils import TOOLS_TAG_LIST, parse_tool_response
 from sglang.utils import get_exception_traceback

 logger = logging.getLogger(__name__)
@@ -879,6 +882,21 @@ def v1_chat_generate_request(
     # None skips any image processing in GenerateReqInput.
     if not isinstance(request.messages, str):
         # Apply chat template and its stop strings.
+        tools = None
+        if request.tools and request.tool_choice != "none":
+            request.skip_special_tokens = False
+            if request.stream:
+                logger.warning("Streaming is not supported with tools.")
+                request.stream = False
+            if not isinstance(request.tool_choice, str):
+                tools = [
+                    item.function.model_dump()
+                    for item in request.tools
+                    if item.function.name == request.tool_choice.function.name
+                ]
+            else:
+                tools = [item.function.model_dump() for item in request.tools]
+
         if chat_template_name is None:
             openai_compatible_messages = []
             for message in request.messages:
@@ -902,6 +920,7 @@ def v1_chat_generate_request(
                 openai_compatible_messages,
                 tokenize=True,
                 add_generation_prompt=True,
+                tools=tools,
             )
             if assistant_prefix:
                 prompt_ids += tokenizer_manager.tokenizer.encode(assistant_prefix)
@@ -1041,11 +1060,46 @@ def v1_chat_generate_response(request, ret, to_file=False, cache_report=False):

         finish_reason = ret_item["meta_info"]["finish_reason"]

+        tool_calls = None
+        text = ret_item["text"]
+
+        if isinstance(request, list):
+            tool_choice = request[idx].tool_choice
+            tools = request[idx].tools
+        else:
+            tool_choice = request.tool_choice
+            tools = request.tools
+
+        if tool_choice != "none" and any([i in text for i in TOOLS_TAG_LIST]):
+            if finish_reason == "stop":
+                finish_reason = "tool_calls"
+            try:
+                text, call_info_list = parse_tool_response(text, tools)  # noqa
+                tool_calls = [
+                    ToolCall(
+                        id=str(call_info[0]),
+                        function=FunctionResponse(
+                            name=call_info[1], arguments=call_info[2]
+                        ),
+                    )
+                    for call_info in call_info_list
+                ]
+            except Exception as e:
+                logger.error(f"Exception: {e}")
+                return create_error_response(
+                    HTTPStatus.BAD_REQUEST,
+                    "Failed to parse fc related info to json format!",
+                )
+
         if to_file:
             # to make the choice data json serializable
             choice_data = {
                 "index": 0,
-                "message": {
+                "message": {
+                    "role": "assistant",
+                    "content": ret_item["text"] if tool_calls is None else None,
+                    "tool_calls": tool_calls,
+                },
                 "logprobs": choice_logprobs,
                 "finish_reason": (finish_reason["type"] if finish_reason else ""),
                 "matched_stop": (
@@ -1057,7 +1111,11 @@ def v1_chat_generate_response(request, ret, to_file=False, cache_report=False):
         else:
             choice_data = ChatCompletionResponseChoice(
                 index=idx,
-                message=ChatMessage(
+                message=ChatMessage(
+                    role="assistant",
+                    content=ret_item["text"] if tool_calls is None else None,
+                    tool_calls=tool_calls,
+                ),
                 logprobs=choice_logprobs,
                 finish_reason=(finish_reason["type"] if finish_reason else ""),
                 matched_stop=(
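With these changes the OpenAI-compatible chat endpoint accepts tools and tool_choice and, when the model emits a function call, returns it under message.tool_calls with finish_reason set to "tool_calls" (streaming is forced off when tools are present). A hedged client-side example; the model name and tool schema are placeholders:

import requests

payload = {
    "model": "meta-llama/Llama-3.1-8B-Instruct",  # placeholder model name
    "messages": [{"role": "user", "content": "What is the weather in Paris?"}],
    "tools": [
        {
            "type": "function",
            "function": {
                "name": "get_weather",
                "description": "Look up the current weather for a city.",
                "parameters": {
                    "type": "object",
                    "properties": {"city": {"type": "string"}},
                    "required": ["city"],
                },
            },
        }
    ],
    "tool_choice": "auto",
}
resp = requests.post("http://127.0.0.1:30000/v1/chat/completions", json=payload)
choice = resp.json()["choices"][0]
# On a tool call, message["content"] is None and message["tool_calls"] holds
# the parsed name/arguments; choice["finish_reason"] is "tool_calls".
print(choice["message"].get("tool_calls"))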
sglang/srt/openai_api/protocol.py CHANGED
@@ -257,6 +257,34 @@ class ResponseFormat(BaseModel):
     json_schema: Optional[JsonSchemaResponseFormat] = None


+class Function(BaseModel):
+    """Function descriptions."""
+
+    description: Optional[str] = Field(default=None, examples=[None])
+    name: str
+    parameters: Optional[object] = None
+
+
+class Tool(BaseModel):
+    """Function wrapper."""
+
+    type: str = Field(default="function", examples=["function"])
+    function: Function
+
+
+class ToolChoiceFuncName(BaseModel):
+    """The name of tool choice function."""
+
+    name: str
+
+
+class ToolChoice(BaseModel):
+    """The tool choice definition."""
+
+    function: ToolChoiceFuncName
+    type: Literal["function"] = Field(default="function", examples=["function"])
+
+
 class ChatCompletionRequest(BaseModel):
     # Ordered by official OpenAI API documentation
     # https://platform.openai.com/docs/api-reference/chat/create
@@ -277,6 +305,10 @@ class ChatCompletionRequest(BaseModel):
     temperature: float = 0.7
     top_p: float = 1.0
     user: Optional[str] = None
+    tools: Optional[List[Tool]] = Field(default=None, examples=[None])
+    tool_choice: Union[ToolChoice, Literal["auto", "required", "none"]] = Field(
+        default="auto", examples=["none"]
+    )  # noqa

     # Extra parameters for SRT backend only and will be ignored by OpenAI models.
     top_k: int = -1
@@ -292,9 +324,25 @@ class ChatCompletionRequest(BaseModel):
     ebnf: Optional[str] = None


+class FunctionResponse(BaseModel):
+    """Function response."""
+
+    name: str
+    arguments: str
+
+
+class ToolCall(BaseModel):
+    """Tool call response."""
+
+    id: str
+    type: Literal["function"] = "function"
+    function: FunctionResponse
+
+
 class ChatMessage(BaseModel):
     role: Optional[str] = None
     content: Optional[str] = None
+    tool_calls: Optional[List[ToolCall]] = Field(default=None, examples=[None])


 class ChatCompletionResponseChoice(BaseModel):
sglang/srt/server.py CHANGED
@@ -57,6 +57,7 @@ from sglang.srt.managers.io_struct import (
     OpenSessionReqInput,
     UpdateWeightFromDiskReqInput,
     UpdateWeightsFromDistributedReqInput,
+    UpdateWeightsFromTensorReqInput,
 )
 from sglang.srt.managers.scheduler import run_scheduler_process
 from sglang.srt.managers.tokenizer_manager import TokenizerManager
@@ -109,6 +110,7 @@ app.add_middleware(
 tokenizer_manager: TokenizerManager = None
 scheduler_info: Dict = None

+
 ##### Native API endpoints #####

@@ -257,6 +259,10 @@ async def open_session(obj: OpenSessionReqInput, request: Request):
     """Open a session, and return its unique session id."""
     try:
         session_id = await tokenizer_manager.open_session(obj, request)
+        if session_id is None:
+            raise Exception(
+                "Failed to open the session. Check if a session with the same id is still open."
+            )
         return session_id
     except Exception as e:
         return _create_error_response(e)
@@ -484,7 +490,16 @@ def launch_engine(
     # Wait for model to finish loading
     scheduler_infos = []
     for i in range(len(scheduler_pipe_readers)):
-        data = scheduler_pipe_readers[i].recv()
+        try:
+            data = scheduler_pipe_readers[i].recv()
+        except EOFError as e:
+            logger.exception(e)
+            logger.error(
+                f"Rank {i} scheduler is dead. Please check if there are relevant logs."
+            )
+            scheduler_procs[i].join()
+            logger.error(f"Exit code: {scheduler_procs[i].exitcode}")
+            raise

     if data["status"] != "ready":
         raise RuntimeError(
@@ -492,7 +507,7 @@ def launch_engine(
         )
         scheduler_infos.append(data)

-    # Assume all schedulers have same
+    # Assume all schedulers have same scheduler_info
     scheduler_info = scheduler_infos[0]

@@ -857,6 +872,14 @@ class Engine:
             tokenizer_manager.update_weights_from_distributed(obj, None)
         )

+    def update_weights_from_tensor(self, name, tensor):
+        """Update weights from distributed source."""
+        obj = UpdateWeightsFromTensorReqInput(name=name, tensor=tensor)
+        loop = asyncio.get_event_loop()
+        return loop.run_until_complete(
+            tokenizer_manager.update_weights_from_tensor(obj, None)
+        )
+
     def get_weights_by_name(self, name, truncate_size=100):
         """Get weights by parameter name."""
         obj = GetWeightsByNameReqInput(name=name, truncate_size=truncate_size)
@@ -871,7 +894,7 @@ class Runtime:
     using the commond line interface.

     It is mainly used for the frontend language.
-    You should use the Engine class if you want to do normal offline processing.
+    You should use the Engine class above if you want to do normal offline processing.
     """

     def __init__(
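Engine.update_weights_from_tensor is a synchronous wrapper that routes a single named tensor through the tokenizer manager down to ModelRunner.update_weights_from_tensor. A hedged offline sketch; the model path, parameter name, and tensor shape are placeholders, and a GPU-backed engine is assumed:

import torch
import sglang as sgl

engine = sgl.Engine(model_path="meta-llama/Llama-3.1-8B-Instruct")  # placeholder path
new_weight = torch.zeros(4096, 4096, dtype=torch.bfloat16)          # placeholder tensor
# Overwrites the named parameter in place via model.load_weights([(name, tensor)]).
engine.update_weights_from_tensor("model.layers.0.self_attn.o_proj.weight", new_weight)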
sglang/srt/server_args.py CHANGED
@@ -54,8 +54,9 @@ class ServerArgs:
     chat_template: Optional[str] = None
     is_embedding: bool = False
     revision: Optional[str] = None
+    return_token_ids: bool = False

-    # Port
+    # Port for the HTTP server
     host: str = "127.0.0.1"
     port: int = 30000

@@ -68,6 +69,7 @@ class ServerArgs:
     schedule_policy: str = "lpm"
     schedule_conservativeness: float = 1.0
     cpu_offload_gb: int = 0
+    prefill_only_one_req: bool = False

     # Other runtime options
     tp_size: int = 1
@@ -94,6 +96,7 @@ class ServerArgs:
     # Data parallelism
     dp_size: int = 1
     load_balance_method: str = "round_robin"
+
     # Expert parallelism
     ep_size: int = 1

@@ -217,6 +220,13 @@ class ServerArgs:
         )
         self.disable_cuda_graph = True

+        # Expert parallelism
+        if self.enable_ep_moe:
+            self.ep_size = self.tp_size
+            logger.info(
+                f"EP MoE is enabled. The expert parallel size is adjusted to be the same as the tensor parallel size[{self.tp_size}]."
+            )
+
         # Others
         if self.enable_dp_attention:
             self.dp_size = self.tp_size
@@ -229,12 +239,6 @@ class ServerArgs:
                 "Data parallel size is adjusted to be the same as tensor parallel size. "
                 "Overlap scheduler is disabled."
             )
-        # Expert parallelism
-        if self.enable_ep_moe:
-            self.ep_size = self.tp_size
-            logger.info(
-                f"EP MoE is enabled. The expert parallel size is adjusted to be the same as the tensor parallel size[{self.tp_size}]."
-            )

         # GGUF
         if (
@@ -277,6 +281,12 @@ class ServerArgs:
             action="store_true",
             help="If set, skip init tokenizer and pass input_ids in generate request",
         )
+        parser.add_argument(
+            "--return-token-ids",
+            action="store_true",
+            default=ServerArgs.return_token_ids,
+            help="Whether to return token IDs in the output, this may introduce additional overhead.",
+        )
         parser.add_argument(
             "--load-format",
             type=str,
@@ -430,13 +440,18 @@ class ServerArgs:
             default=ServerArgs.schedule_conservativeness,
             help="How conservative the schedule policy is. A larger value means more conservative scheduling. Use a larger value if you see requests being retracted frequently.",
         )
-
         parser.add_argument(
             "--cpu-offload-gb",
             type=int,
             default=ServerArgs.cpu_offload_gb,
            help="How many GBs of RAM to reserve for CPU offloading",
         )
+        parser.add_argument(
+            "--prefill-only-one-req",
+            type=bool,
+            help="If true, we only prefill one request at one prefill batch",
+            default=ServerArgs.prefill_only_one_req,
+        )

         # Other runtime options
         parser.add_argument(
@@ -555,6 +570,7 @@ class ServerArgs:
                 "shortest_queue",
             ],
         )
+
         # Expert parallelism
         parser.add_argument(
             "--expert-parallel-size",
@@ -777,28 +793,6 @@ class ServerArgs:
             help="Delete the model checkpoint after loading the model.",
         )

-        # Deprecated arguments
-        parser.add_argument(
-            "--enable-overlap-schedule",
-            action=DeprecatedAction,
-            help="'--enable-overlap-schedule' is deprecated. It is enabled by default now. Please drop this argument.",
-        )
-        parser.add_argument(
-            "--disable-flashinfer",
-            action=DeprecatedAction,
-            help="'--disable-flashinfer' is deprecated. Please use '--attention-backend triton' instead.",
-        )
-        parser.add_argument(
-            "--disable-flashinfer-sampling",
-            action=DeprecatedAction,
-            help="'--disable-flashinfer-sampling' is deprecated. Please use '--sampling-backend pytroch' instead.",
-        )
-        parser.add_argument(
-            "--disable-disk-cache",
-            action=DeprecatedAction,
-            help="'--disable-disk-cache' is deprecated. Please use '--disable-outlines-disk-cache' instead.",
-        )
-
     @classmethod
     def from_cli_args(cls, args: argparse.Namespace):
         args.tp_size = args.tensor_parallel_size
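Two new server arguments are introduced here: --return-token-ids (off by default; returns token IDs in the output at some extra overhead) and --prefill-only-one-req (schedules at most one request per prefill batch). A minimal sketch of setting them programmatically through ServerArgs; the model path is a placeholder:

from sglang.srt.server_args import ServerArgs

args = ServerArgs(
    model_path="meta-llama/Llama-3.1-8B-Instruct",  # placeholder model path
    return_token_ids=True,       # same as passing --return-token-ids on the CLI
    prefill_only_one_req=True,   # same as --prefill-only-one-req
)
print(args.return_token_ids, args.prefill_only_one_req)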
sglang/srt/speculative/spec_info.py ADDED
@@ -0,0 +1,19 @@
+from enum import IntEnum, auto
+
+
+class SpeculativeAlgorithm(IntEnum):
+    EAGLE = auto()
+
+    def is_eagle(self):
+        return self == SpeculativeAlgorithm.EAGLE
+
+    @staticmethod
+    def from_string(name: str):
+        name_map = {
+            "EAGLE": SpeculativeAlgorithm.EAGLE,
+        }
+        return name_map[name]
+
+
+class SpecInfo:
+    pass