sglang 0.4.3.post1__py3-none-any.whl → 0.4.3.post3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/api.py +1 -1
- sglang/bench_offline_throughput.py +19 -0
- sglang/bench_one_batch.py +2 -2
- sglang/bench_serving.py +123 -79
- sglang/global_config.py +8 -3
- sglang/lang/backend/runtime_endpoint.py +1 -1
- sglang/lang/ir.py +1 -1
- sglang/srt/_custom_ops.py +83 -91
- sglang/srt/configs/load_config.py +4 -1
- sglang/srt/configs/model_config.py +48 -2
- sglang/srt/configs/qwen2_5_vl_config.py +5 -2
- sglang/srt/constrained/base_grammar_backend.py +117 -15
- sglang/srt/constrained/llguidance_backend.py +151 -0
- sglang/srt/constrained/outlines_backend.py +24 -33
- sglang/srt/constrained/xgrammar_backend.py +69 -38
- sglang/srt/distributed/device_communicators/custom_all_reduce.py +225 -80
- sglang/srt/distributed/parallel_state.py +48 -3
- sglang/srt/entrypoints/engine.py +67 -9
- sglang/srt/entrypoints/http_server.py +190 -41
- sglang/srt/entrypoints/verl_engine.py +147 -0
- sglang/srt/function_call_parser.py +0 -1
- sglang/srt/layers/activation.py +11 -0
- sglang/srt/layers/attention/{__init__.py → base_attn_backend.py} +14 -6
- sglang/srt/layers/attention/double_sparsity_backend.py +1 -1
- sglang/srt/layers/attention/flashinfer_backend.py +208 -295
- sglang/srt/layers/attention/flashinfer_mla_backend.py +582 -0
- sglang/srt/layers/attention/torch_native_backend.py +1 -1
- sglang/srt/layers/attention/triton_backend.py +9 -6
- sglang/srt/layers/attention/triton_ops/decode_attention.py +3 -0
- sglang/srt/layers/attention/triton_ops/extend_attention.py +20 -4
- sglang/srt/layers/attention/triton_ops/rocm_mla_decode_rope.py +439 -0
- sglang/srt/layers/attention/utils.py +39 -0
- sglang/srt/layers/attention/vision.py +60 -63
- sglang/srt/layers/dp_attention.py +142 -1
- sglang/srt/layers/layernorm.py +1 -1
- sglang/srt/layers/linear.py +3 -1
- sglang/srt/layers/logits_processor.py +281 -45
- sglang/srt/layers/moe/ep_moe/kernels.py +126 -8
- sglang/srt/layers/moe/ep_moe/layer.py +140 -28
- sglang/srt/layers/moe/fused_moe_native.py +2 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +50 -50
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Instinct_MI300X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Instinct_MI325X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=14336,device_name=AMD_Radeon_Graphics.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Instinct_MI300X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Instinct_MI325X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=1792,device_name=AMD_Radeon_Graphics.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI300X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Instinct_MI325X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=3584,device_name=AMD_Radeon_Graphics.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +16 -16
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +16 -16
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=4096,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +16 -16
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Instinct_MI300X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Instinct_MI325X.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=7168,device_name=AMD_Radeon_Graphics.json +18 -18
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8.json +15 -15
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8.json +15 -15
- sglang/srt/layers/moe/fused_moe_triton/configs/E=8,N=8192,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8.json +15 -15
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +88 -20
- sglang/srt/layers/moe/fused_moe_triton/layer.py +34 -13
- sglang/srt/layers/moe/topk.py +13 -4
- sglang/srt/layers/quantization/__init__.py +111 -7
- sglang/srt/layers/quantization/blockwise_int8.py +409 -0
- sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=1536,K=1536,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
- sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
- sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=2048,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
- sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=2304,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
- sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=24576,K=1536,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=24576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
- sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=256,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
- sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=32768,K=512,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
- sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
- sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=1024,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
- sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=1152,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
- sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=128,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
- sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Instinct_MI325X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=AMD_Radeon_Graphics,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=16384,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
- sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_A100-SXM4-80GB,dtype=int8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/quantization/configs/N=7168,K=18432,device_name=NVIDIA_L20Y,dtype=fp8_w8a8,block_shape=[128, 128].json +26 -0
- sglang/srt/layers/quantization/fp8.py +69 -28
- sglang/srt/layers/quantization/fp8_utils.py +17 -1
- sglang/srt/layers/quantization/gptq.py +416 -0
- sglang/srt/layers/quantization/int8_kernel.py +327 -0
- sglang/srt/layers/quantization/int8_utils.py +73 -0
- sglang/srt/layers/quantization/modelopt_quant.py +18 -1
- sglang/srt/layers/radix_attention.py +1 -0
- sglang/srt/layers/rotary_embedding.py +0 -1
- sglang/srt/layers/sampler.py +76 -31
- sglang/srt/layers/vocab_parallel_embedding.py +14 -13
- sglang/srt/lora/lora.py +17 -1
- sglang/srt/lora/lora_config.py +5 -0
- sglang/srt/lora/lora_manager.py +1 -3
- sglang/srt/managers/cache_controller.py +193 -62
- sglang/srt/managers/configure_logging.py +2 -1
- sglang/srt/managers/data_parallel_controller.py +6 -2
- sglang/srt/managers/detokenizer_manager.py +124 -102
- sglang/srt/managers/image_processor.py +2 -1
- sglang/srt/managers/io_struct.py +143 -6
- sglang/srt/managers/schedule_batch.py +238 -197
- sglang/srt/managers/schedule_policy.py +29 -29
- sglang/srt/managers/scheduler.py +681 -259
- sglang/srt/managers/session_controller.py +6 -2
- sglang/srt/managers/tokenizer_manager.py +224 -68
- sglang/srt/managers/tp_worker.py +15 -4
- sglang/srt/managers/tp_worker_overlap_thread.py +3 -4
- sglang/srt/mem_cache/chunk_cache.py +18 -11
- sglang/srt/mem_cache/hiradix_cache.py +394 -0
- sglang/srt/mem_cache/memory_pool.py +44 -18
- sglang/srt/mem_cache/radix_cache.py +58 -47
- sglang/srt/metrics/collector.py +94 -36
- sglang/srt/model_executor/cuda_graph_runner.py +55 -24
- sglang/srt/model_executor/forward_batch_info.py +49 -16
- sglang/srt/model_executor/model_runner.py +209 -28
- sglang/srt/model_loader/loader.py +3 -3
- sglang/srt/model_loader/weight_utils.py +36 -14
- sglang/srt/models/baichuan.py +31 -6
- sglang/srt/models/chatglm.py +39 -7
- sglang/srt/models/commandr.py +29 -5
- sglang/srt/models/dbrx.py +31 -5
- sglang/srt/models/deepseek.py +43 -6
- sglang/srt/models/deepseek_nextn.py +32 -19
- sglang/srt/models/deepseek_v2.py +265 -29
- sglang/srt/models/exaone.py +19 -9
- sglang/srt/models/gemma.py +22 -8
- sglang/srt/models/gemma2.py +25 -12
- sglang/srt/models/gemma2_reward.py +5 -1
- sglang/srt/models/gpt2.py +28 -13
- sglang/srt/models/gpt_bigcode.py +27 -5
- sglang/srt/models/granite.py +21 -9
- sglang/srt/models/grok.py +21 -4
- sglang/srt/models/internlm2.py +36 -6
- sglang/srt/models/internlm2_reward.py +5 -1
- sglang/srt/models/llama.py +26 -9
- sglang/srt/models/llama_classification.py +5 -1
- sglang/srt/models/llama_eagle.py +17 -4
- sglang/srt/models/llama_embedding.py +5 -1
- sglang/srt/models/llama_reward.py +7 -2
- sglang/srt/models/llava.py +19 -3
- sglang/srt/models/llavavid.py +10 -1
- sglang/srt/models/minicpm.py +26 -2
- sglang/srt/models/minicpm3.py +39 -3
- sglang/srt/models/minicpmv.py +45 -14
- sglang/srt/models/mixtral.py +20 -9
- sglang/srt/models/mixtral_quant.py +50 -8
- sglang/srt/models/mllama.py +57 -11
- sglang/srt/models/olmo.py +34 -6
- sglang/srt/models/olmo2.py +34 -13
- sglang/srt/models/olmoe.py +26 -4
- sglang/srt/models/phi3_small.py +29 -10
- sglang/srt/models/qwen.py +26 -3
- sglang/srt/models/qwen2.py +26 -4
- sglang/srt/models/qwen2_5_vl.py +46 -8
- sglang/srt/models/qwen2_eagle.py +17 -5
- sglang/srt/models/qwen2_moe.py +44 -6
- sglang/srt/models/qwen2_rm.py +78 -0
- sglang/srt/models/qwen2_vl.py +39 -8
- sglang/srt/models/stablelm.py +32 -5
- sglang/srt/models/torch_native_llama.py +5 -2
- sglang/srt/models/xverse.py +21 -9
- sglang/srt/models/xverse_moe.py +45 -7
- sglang/srt/models/yivl.py +2 -1
- sglang/srt/openai_api/adapter.py +109 -24
- sglang/srt/openai_api/protocol.py +17 -1
- sglang/srt/reasoning_parser.py +154 -0
- sglang/srt/sampling/penaltylib/__init__.py +4 -6
- sglang/srt/sampling/penaltylib/frequency_penalty.py +66 -0
- sglang/srt/sampling/penaltylib/{penalizers/min_new_tokens.py → min_new_tokens.py} +15 -23
- sglang/srt/sampling/penaltylib/orchestrator.py +39 -188
- sglang/srt/sampling/penaltylib/presence_penalty.py +66 -0
- sglang/srt/sampling/sampling_batch_info.py +79 -157
- sglang/srt/sampling/sampling_params.py +16 -13
- sglang/srt/server_args.py +136 -52
- sglang/srt/speculative/build_eagle_tree.py +2 -8
- sglang/srt/speculative/eagle_draft_cuda_graph_runner.py +0 -1
- sglang/srt/speculative/eagle_utils.py +92 -58
- sglang/srt/speculative/eagle_worker.py +186 -94
- sglang/srt/speculative/spec_info.py +1 -13
- sglang/srt/utils.py +43 -17
- sglang/srt/warmup.py +47 -0
- sglang/test/few_shot_gsm8k.py +4 -1
- sglang/test/runners.py +389 -126
- sglang/test/send_one.py +88 -0
- sglang/test/test_block_fp8_ep.py +361 -0
- sglang/test/test_programs.py +1 -1
- sglang/test/test_utils.py +138 -84
- sglang/utils.py +50 -60
- sglang/version.py +1 -1
- {sglang-0.4.3.post1.dist-info → sglang-0.4.3.post3.dist-info}/METADATA +21 -15
- {sglang-0.4.3.post1.dist-info → sglang-0.4.3.post3.dist-info}/RECORD +214 -166
- {sglang-0.4.3.post1.dist-info → sglang-0.4.3.post3.dist-info}/WHEEL +1 -1
- sglang/bench_latency.py +0 -1
- sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +0 -75
- sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +0 -74
- sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +0 -85
- sglang/test/srt/sampling/penaltylib/utils.py +0 -344
- {sglang-0.4.3.post1.dist-info → sglang-0.4.3.post3.dist-info}/LICENSE +0 -0
- {sglang-0.4.3.post1.dist-info → sglang-0.4.3.post3.dist-info}/top_level.txt +0 -0
sglang/api.py
CHANGED
@@ -94,7 +94,7 @@ def gen(
     regex: Optional[str] = None,
     json_schema: Optional[str] = None,
 ):
-    """Call the model to generate. See the meaning of the arguments in docs/sampling_params.md"""
+    """Call the model to generate. See the meaning of the arguments in docs/backend/sampling_params.md"""

     if choices:
         return SglSelect(
sglang/bench_offline_throughput.py
CHANGED
@@ -56,6 +56,7 @@ class BenchArgs:
     profile: bool = False
     skip_warmup: bool = False
     do_not_exit: bool = False
+    prompt_suffix: str = ""

     @staticmethod
     def add_cli_args(parser: argparse.ArgumentParser):
@@ -177,6 +178,12 @@ class BenchArgs:
             action="store_true",
             help="Do not exit the program. This is useful for nsys profile with --duration and --delay.",
         )
+        parser.add_argument(
+            "--prompt-suffix",
+            type=str,
+            default="",
+            help="Suffix applied to the end of all user prompts, followed by assistant prompt suffix.",
+        )

     @classmethod
     def from_cli_args(cls, args: argparse.Namespace):
@@ -216,6 +223,10 @@ def throughput_test_once(
     ]

     if profile:
+        assert (
+            "SGLANG_TORCH_PROFILER_DIR" in os.environ
+        ), "Please set SGLANG_TORCH_PROFILER_DIR."
+        os.makedirs(os.environ["SGLANG_TORCH_PROFILER_DIR"], exist_ok=True)
         backend.start_profile()

     st = time.perf_counter()
@@ -229,6 +240,8 @@ def throughput_test_once(
     if backend_name == "runtime":
         gen_out = json.loads(gen_out)

+    server_info = backend.get_server_info()
+
     measurement_results["total_latency"] = latency
     measurement_results["total_output_tokens"] = sum(
         o["meta_info"]["completion_tokens"] for o in gen_out
@@ -246,6 +259,7 @@ def throughput_test_once(
         measurement_results["total_input_tokens"]
         + measurement_results["total_output_tokens"]
     ) / latency
+    measurement_results["last_gen_throughput"] = server_info["last_gen_throughput"]

     return measurement_results

@@ -361,6 +375,11 @@ def throughput_test(
     print(
         "{:<40} {:<10}".format("Total generated tokens:", result["total_output_tokens"])
     )
+    print(
+        "{:<40} {:<10.2f}".format(
+            "Last generation throughput (tok/s):", result["last_gen_throughput"]
+        )
+    )
     print(
         "{:<40} {:<10.2f}".format(
             "Request throughput (req/s):", result["request_throughput"]
sglang/bench_one_batch.py
CHANGED
@@ -230,7 +230,7 @@ def extend(reqs, model_runner):
     batch = ScheduleBatch.init_new(
         reqs=reqs,
         req_to_token_pool=model_runner.req_to_token_pool,
-        token_to_kv_pool=model_runner.token_to_kv_pool,
+        token_to_kv_pool_allocator=model_runner.token_to_kv_pool_allocator,
         tree_cache=None,
         model_config=model_runner.model_config,
         enable_overlap=False,
@@ -326,7 +326,7 @@ def latency_test_run_once(

     # Clear the pools.
     model_runner.req_to_token_pool.clear()
-    model_runner.token_to_kv_pool.clear()
+    model_runner.token_to_kv_pool_allocator.clear()

     measurement_results = {
         "run_name": run_name,
sglang/bench_serving.py
CHANGED
@@ -8,7 +8,6 @@ Usage:
     python3 -m sglang.bench_serving --backend sglang --num-prompt 10

     python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-prompts 3000 --random-input 1024 --random-output 1024 --random-range-ratio 0.5
-    python3 -m sglang.bench_serving --backend sglang --dataset-name random --request-rate-range 1,2,4,8,16,32 --random-input 4096 --random-output 1024 --random-range-ratio 0.125 --multi
 """

 import argparse
@@ -40,6 +39,7 @@ from transformers import (
 )

 AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60)
+ASSISTANT_SUFFIX = "Assistant:"

 global args

@@ -71,7 +71,19 @@ def remove_prefix(text: str, prefix: str) -> str:
     return text[len(prefix) :] if text.startswith(prefix) else text


-# trt llm does not support ignore_eos
+def remove_suffix(text: str, suffix: str) -> str:
+    return text[: -len(suffix)] if text.endswith(suffix) else text
+
+
+def get_auth_headers() -> Dict[str, str]:
+    api_key = os.environ.get("OPENAI_API_KEY")
+    if api_key:
+        return {"Authorization": f"Bearer {api_key}"}
+    else:
+        return {}
+
+
+# trt llm does not support ignore_eos
 # https://github.com/triton-inference-server/tensorrtllm_backend/issues/505
 async def async_request_trt_llm(
     request_func_input: RequestFuncInput,
@@ -165,12 +177,13 @@ async def async_request_openai_completions(
         "ignore_eos": not args.disable_ignore_eos,
         **request_func_input.extra_request_body,
     }
-    headers =
+    headers = get_auth_headers()

     output = RequestFuncOutput()
     output.prompt_len = request_func_input.prompt_len

     generated_text = ""
+    output_len = request_func_input.output_len
     ttft = 0.0
     st = time.perf_counter()
     most_recent_timestamp = st
@@ -207,11 +220,14 @@ async def async_request_openai_completions(

                                 most_recent_timestamp = timestamp
                                 generated_text += data["choices"][0]["text"]
+                                output_len = data.get("usage", {}).get(
+                                    "completion_tokens", output_len
+                                )

                     output.generated_text = generated_text
                     output.success = True
                     output.latency = latency
-                    output.output_len = request_func_input.output_len
+                    output.output_len = output_len
                 else:
                     output.error = response.reason or ""
                     output.success = False
@@ -244,7 +260,7 @@ async def async_request_truss(
         "ignore_eos": not args.disable_ignore_eos,
         **request_func_input.extra_request_body,
     }
-    headers =
+    headers = get_auth_headers()

     output = RequestFuncOutput()
     output.prompt_len = request_func_input.prompt_len
@@ -325,15 +341,17 @@ async def async_request_sglang_generate(
         "logprob_start_len": -1,
         **request_func_input.extra_request_body,
     }
-    headers =
+    headers = get_auth_headers()

     output = RequestFuncOutput()
     output.prompt_len = request_func_input.prompt_len

     generated_text = ""
+    output_len = request_func_input.output_len
     ttft = 0.0
     st = time.perf_counter()
     most_recent_timestamp = st
+    last_output_len = 0
     try:
         async with session.post(
             url=api_url, json=payload, headers=headers
@@ -357,6 +375,9 @@ async def async_request_sglang_generate(
                             # want to check a token was generated
                             if data["text"]:
                                 timestamp = time.perf_counter()
+                                generated_text = data["text"]
+                                output_len = data["meta_info"]["completion_tokens"]
+
                                 # First token
                                 if ttft == 0.0:
                                     ttft = time.perf_counter() - st
@@ -364,15 +385,21 @@ async def async_request_sglang_generate(

                                 # Decoding phase
                                 else:
-                                    output.itl.append(timestamp - most_recent_timestamp)
+                                    num_new_tokens = output_len - last_output_len
+                                    if num_new_tokens == 0:
+                                        continue
+                                    adjust_itl = (
+                                        timestamp - most_recent_timestamp
+                                    ) / num_new_tokens
+                                    output.itl.extend([adjust_itl] * num_new_tokens)

                                 most_recent_timestamp = timestamp
-                                generated_text = data["text"]
+                                last_output_len = output_len

             output.generated_text = generated_text
             output.success = True
             output.latency = latency
-            output.output_len = request_func_input.output_len
+            output.output_len = output_len
         else:
             output.error = response.reason or ""
             output.success = False
@@ -380,6 +407,7 @@ async def async_request_sglang_generate(
         output.success = False
         exc_info = sys.exc_info()
         output.error = "".join(traceback.format_exception(*exc_info))
+        print(f"{output.error=}")

     if pbar:
         pbar.update(1)
@@ -453,6 +481,7 @@ def get_dataset(args, tokenizer):
             tokenizer=tokenizer,
             fixed_output_len=args.sharegpt_output_len,
             context_len=args.sharegpt_context_len,
+            prompt_suffix=args.prompt_suffix,
             apply_chat_template=args.apply_chat_template,
         )
     elif args.dataset_name == "random":
@@ -513,7 +542,9 @@ class BenchmarkMetrics:
     mean_itl_ms: float
     median_itl_ms: float
     std_itl_ms: float
+    p95_itl_ms: float
     p99_itl_ms: float
+    max_itl_ms: float
     mean_e2e_latency_ms: float
     median_e2e_latency_ms: float
     std_e2e_latency_ms: float
@@ -564,6 +595,7 @@ def sample_sharegpt_requests(
     tokenizer: PreTrainedTokenizerBase,
     fixed_output_len: Optional[int] = None,
     context_len: Optional[int] = None,
+    prompt_suffix: Optional[str] = "",
     apply_chat_template=False,
 ) -> List[Tuple[str, int, int]]:
     if fixed_output_len is not None and fixed_output_len < 4:
@@ -576,11 +608,19 @@ def sample_sharegpt_requests(
     # Load the dataset.
     with open(dataset_path) as f:
         dataset = json.load(f)
+
     # Filter out the conversations with less than 2 turns.
-    dataset = [data for data in dataset if len(data["conversations"]) >= 2]
+    dataset = [
+        data
+        for data in dataset
+        if len(data.get("conversations", data.get("conversation", []))) >= 2
+    ]
     # Only keep the first two turns of each conversation.
     dataset = [
-        (data["conversations"][0]["value"], data["conversations"][1]["value"])
+        (
+            data.get("conversations", data.get("conversation", []))[0]["value"],
+            data.get("conversations", data.get("conversation", []))[1]["value"],
+        )
         for data in dataset
     ]

@@ -595,6 +635,12 @@ def sample_sharegpt_requests(

         # Tokenize the prompts and completions.
         prompt = dataset[i][0]
+        if prompt_suffix:
+            prompt = (
+                remove_suffix(prompt, ASSISTANT_SUFFIX)
+                + prompt_suffix
+                + ASSISTANT_SUFFIX
+            )

         if apply_chat_template:
             prompt = tokenizer.apply_chat_template(
@@ -658,10 +704,17 @@ def sample_random_requests(
         with open(dataset_path) as f:
             dataset = json.load(f)
         # Filter out the conversations with less than 2 turns.
-        dataset = [data for data in dataset if len(data["conversations"]) >= 2]
+        dataset = [
+            data
+            for data in dataset
+            if len(data.get("conversations", data.get("conversation", []))) >= 2
+        ]
         # Only keep the first two turns of each conversation.
         dataset = [
-            (data["conversations"][0]["value"], data["conversations"][1]["value"])
+            (
+                data.get("conversations", data.get("conversation", []))[0]["value"],
+                data.get("conversations", data.get("conversation", []))[1]["value"],
+            )
             for data in dataset
         ]
         # Shuffle the dataset.
@@ -887,7 +940,9 @@ def calculate_metrics(
         mean_itl_ms=np.mean(itls or 0) * 1000,
         median_itl_ms=np.median(itls or 0) * 1000,
         std_itl_ms=np.std(itls or 0) * 1000,
+        p95_itl_ms=np.percentile(itls or 0, 95) * 1000,
         p99_itl_ms=np.percentile(itls or 0, 99) * 1000,
+        max_itl_ms=np.max(itls or 0) * 1000,
         mean_e2e_latency_ms=np.mean(e2e_latencies) * 1000,
         median_e2e_latency_ms=np.median(e2e_latencies) * 1000,
         std_e2e_latency_ms=np.std(e2e_latencies) * 1000,
@@ -911,6 +966,7 @@ async def benchmark(
     lora_name: str,
     extra_request_body: Dict[str, Any],
     profile: bool,
+    pd_seperated: bool = False,
 ):
     if backend in ASYNC_REQUEST_FUNCS:
         request_func = ASYNC_REQUEST_FUNCS[backend]
@@ -996,6 +1052,17 @@ async def benchmark(
     if pbar is not None:
         pbar.close()

+    if "sglang" in backend:
+        server_info = requests.get(base_url + "/get_server_info")
+        if pd_seperated:
+            accept_length = server_info.json()["decode"][0].get(
+                "avg_spec_accept_length", None
+            )
+        else:
+            accept_length = server_info.json().get("avg_spec_accept_length", None)
+    else:
+        accept_length = None
+
     # Compute metrics and print results
     benchmark_duration = time.perf_counter() - benchmark_start_time
     metrics, output_lens = calculate_metrics(
@@ -1045,6 +1112,8 @@ async def benchmark(
         )
     )
     print("{:<40} {:<10.2f}".format("Concurrency:", metrics.concurrency))
+    if accept_length:
+        print("{:<40} {:<10.2f}".format("Accept length:", accept_length))
     print("{s:{c}^{n}}".format(s="End-to-End Latency", n=50, c="-"))
     print(
         "{:<40} {:<10.2f}".format("Mean E2E Latency (ms):", metrics.mean_e2e_latency_ms)
@@ -1058,16 +1127,12 @@ async def benchmark(
     print("{:<40} {:<10.2f}".format("Mean TTFT (ms):", metrics.mean_ttft_ms))
     print("{:<40} {:<10.2f}".format("Median TTFT (ms):", metrics.median_ttft_ms))
    print("{:<40} {:<10.2f}".format("P99 TTFT (ms):", metrics.p99_ttft_ms))
-    print(
-        "{s:{c}^{n}}".format(s="Time per Output Token (excl. 1st token)", n=50, c="-")
-    )
-    print("{:<40} {:<10.2f}".format("Mean TPOT (ms):", metrics.mean_tpot_ms))
-    print("{:<40} {:<10.2f}".format("Median TPOT (ms):", metrics.median_tpot_ms))
-    print("{:<40} {:<10.2f}".format("P99 TPOT (ms):", metrics.p99_tpot_ms))
-    print("{s:{c}^{n}}".format(s="Inter-token Latency", n=50, c="-"))
+    print("{s:{c}^{n}}".format(s="Inter-Token Latency", n=50, c="-"))
     print("{:<40} {:<10.2f}".format("Mean ITL (ms):", metrics.mean_itl_ms))
     print("{:<40} {:<10.2f}".format("Median ITL (ms):", metrics.median_itl_ms))
+    print("{:<40} {:<10.2f}".format("P95 ITL (ms):", metrics.p95_itl_ms))
     print("{:<40} {:<10.2f}".format("P99 ITL (ms):", metrics.p99_itl_ms))
+    print("{:<40} {:<10.2f}".format("Max ITL (ms):", metrics.max_itl_ms))
     print("=" * 50)

     if (
@@ -1109,8 +1174,10 @@ async def benchmark(
             "mean_itl_ms": metrics.mean_itl_ms,
             "median_itl_ms": metrics.median_itl_ms,
             "std_itl_ms": metrics.std_itl_ms,
+            "p95_itl_ms": metrics.p95_itl_ms,
             "p99_itl_ms": metrics.p99_itl_ms,
             "concurrency": metrics.concurrency,
+            "accept_length": accept_length,
         }
     else:
         print(f"Error running benchmark for request rate: {request_rate}")
@@ -1143,14 +1210,6 @@ async def benchmark(
     return result


-def parse_request_rate_range(request_rate_range):
-    if len(request_rate_range.split(",")) == 3:
-        start, stop, step = map(int, request_rate_range.split(","))
-        return list(range(start, stop, step))
-    else:
-        return list(map(int, request_rate_range.split(",")))
-
-
 def check_chat_template(model_path):
     try:
         tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
@@ -1160,6 +1219,12 @@ def check_chat_template(model_path):
         return False


+def set_global_args(args_: argparse.Namespace):
+    """Set the global args."""
+    global args
+    args = args_
+
+
 def run_benchmark(args_: argparse.Namespace):
     global args
     args = args_
@@ -1168,6 +1233,8 @@ def run_benchmark(args_: argparse.Namespace):
     if not hasattr(args, "max_concurrency"):
         args.max_concurrency = None

+    print(f"benchmark_args={args}")
+
     # Set global environments
     set_ulimit()
     random.seed(args.seed)
@@ -1238,7 +1305,7 @@ def run_benchmark(args_: argparse.Namespace):
             )
             sys.exit(1)
         try:
-            response = requests.get(model_url)
+            response = requests.get(model_url, headers=get_auth_headers())
             model_list = response.json().get("data", [])
             args.model = model_list[0]["id"] if model_list else None
         except Exception as e:
@@ -1264,49 +1331,26 @@ def run_benchmark(args_: argparse.Namespace):
     backend = args.backend
     model_id = args.model
     tokenizer_id = args.tokenizer if args.tokenizer is not None else args.model
-
     tokenizer = get_tokenizer(tokenizer_id)
-
     input_requests = get_dataset(args, tokenizer)

-    if not args.multi:
-        return asyncio.run(
-            benchmark(
-                backend=backend,
-                api_url=api_url,
-                base_url=base_url,
-                model_id=model_id,
-                tokenizer=tokenizer,
-                input_requests=input_requests,
-                request_rate=args.request_rate,
-                max_concurrency=args.max_concurrency,
-                disable_tqdm=args.disable_tqdm,
-                lora_name=args.lora_name,
-                extra_request_body=extra_request_body,
-                profile=args.profile,
-            )
+    return asyncio.run(
+        benchmark(
+            backend=backend,
+            api_url=api_url,
+            base_url=base_url,
+            model_id=model_id,
+            tokenizer=tokenizer,
+            input_requests=input_requests,
+            request_rate=args.request_rate,
+            max_concurrency=args.max_concurrency,
+            disable_tqdm=args.disable_tqdm,
+            lora_name=args.lora_name,
+            extra_request_body=extra_request_body,
+            profile=args.profile,
+            pd_seperated=args.pd_seperated,
         )
-
-    # Benchmark multiple rps. TODO: use a fixed duration to compute num_prompts
-    request_rates = parse_request_rate_range(args.request_rate_range)
-
-    for rate in request_rates:
-        asyncio.run(
-            benchmark(
-                backend=backend,
-                api_url=api_url,
-                base_url=base_url,
-                model_id=model_id,
-                tokenizer=tokenizer,
-                input_requests=input_requests,
-                request_rate=rate,
-                max_concurrency=args.max_concurrency,
-                disable_tqdm=args.disable_tqdm,
-                lora_name=args.lora_name,
-                extra_request_body=extra_request_body,
-                profile=args.profile,
-            )
-        )
+    )


 def set_ulimit(target_soft_limit=65535):
@@ -1420,17 +1464,6 @@ if __name__ == "__main__":
         "actual request rate may be lower than specified with --request-rate, "
         "if the server is not processing requests fast enough to keep up.",
     )
-    parser.add_argument(
-        "--multi",
-        action="store_true",
-        help="Use request rate range rather than single value.",
-    )
-    parser.add_argument(
-        "--request-rate-range",
-        type=str,
-        default="2,34,2",
-        help="Range of request rates in the format start,stop,step. Default is 2,34,2. It also supports a list of request rates, requiring the parameters to not equal three.",
-    )
     parser.add_argument("--output-file", type=str, help="Output JSONL file name.")
     parser.add_argument(
         "--disable-tqdm",
@@ -1477,6 +1510,17 @@ if __name__ == "__main__":
         default=None,
         help="The name of LoRA adapter",
     )
+    parser.add_argument(
+        "--prompt-suffix",
+        type=str,
+        default="",
+        help="Suffix applied to the end of all user prompts, followed by assistant prompt suffix.",
+    )
+    parser.add_argument(
+        "--pd-seperated",
+        action="store_true",
+        help="Benchmark PD disaggregation server",
+    )

     group = parser.add_argument_group("generated-shared-prefix dataset arguments")
     group.add_argument(
sglang/global_config.py
CHANGED
@@ -4,6 +4,13 @@ import os


 class GlobalConfig:
+    """
+    Store some global constants.
+
+    See also python/sglang/srt/managers/schedule_batch.py::global_server_args_dict, which stores
+    many global runtime arguments as well.
+    """
+
     def __init__(self):
         # Verbosity level
         # 0: do not output anything
@@ -34,11 +41,9 @@ class GlobalConfig:
         self.skip_special_tokens_in_output = True
         self.spaces_between_special_tokens_in_out = True

-        #
+        # Language frontend interpreter optimization configs
         self.enable_precache_with_tracing = True
         self.enable_parallel_encoding = True

-        self.enable_flashinfer_mla = False
-

 global_config = GlobalConfig()
sglang/lang/backend/runtime_endpoint.py
CHANGED
@@ -336,7 +336,7 @@ class Runtime:
     """
    A wrapper for the HTTP server.
     This is used for launching the server in a python program without
-    using the
+    using the command line interface.

     It is mainly used for the frontend language.
     You should use the Engine class if you want to do normal offline processing without the frontend language.
sglang/lang/ir.py
CHANGED
@@ -457,7 +457,7 @@ class SglGen(SglExpr):
         regex: Optional[str] = None,
         json_schema: Optional[str] = None,
     ):
-        """Call the model to generate. See the meaning of the arguments in docs/sampling_params.md"""
+        """Call the model to generate. See the meaning of the arguments in docs/backend/sampling_params.md"""
         super().__init__()
         self.name = name
         self.sampling_params = SglSamplingParams(
|