sglang 0.4.1.post6__py3-none-any.whl → 0.4.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +21 -23
- sglang/api.py +2 -7
- sglang/bench_offline_throughput.py +41 -27
- sglang/bench_one_batch.py +60 -4
- sglang/bench_one_batch_server.py +1 -1
- sglang/bench_serving.py +83 -71
- sglang/lang/backend/runtime_endpoint.py +183 -4
- sglang/lang/chat_template.py +46 -4
- sglang/launch_server.py +1 -1
- sglang/srt/_custom_ops.py +80 -42
- sglang/srt/configs/device_config.py +1 -1
- sglang/srt/configs/load_config.py +1 -0
- sglang/srt/configs/model_config.py +1 -0
- sglang/srt/constrained/base_grammar_backend.py +21 -0
- sglang/srt/constrained/xgrammar_backend.py +8 -4
- sglang/srt/conversation.py +14 -1
- sglang/srt/distributed/__init__.py +3 -3
- sglang/srt/distributed/communication_op.py +2 -1
- sglang/srt/distributed/device_communicators/cuda_wrapper.py +2 -1
- sglang/srt/distributed/device_communicators/custom_all_reduce.py +112 -42
- sglang/srt/distributed/device_communicators/custom_all_reduce_utils.py +2 -2
- sglang/srt/distributed/device_communicators/hpu_communicator.py +2 -1
- sglang/srt/distributed/device_communicators/pynccl.py +80 -1
- sglang/srt/distributed/device_communicators/pynccl_wrapper.py +112 -2
- sglang/srt/distributed/device_communicators/shm_broadcast.py +5 -72
- sglang/srt/distributed/device_communicators/xpu_communicator.py +2 -1
- sglang/srt/distributed/parallel_state.py +1 -1
- sglang/srt/distributed/utils.py +2 -1
- sglang/srt/entrypoints/engine.py +452 -0
- sglang/srt/entrypoints/http_server.py +603 -0
- sglang/srt/function_call_parser.py +494 -0
- sglang/srt/layers/activation.py +8 -8
- sglang/srt/layers/attention/flashinfer_backend.py +10 -9
- sglang/srt/layers/attention/triton_backend.py +4 -6
- sglang/srt/layers/attention/vision.py +204 -0
- sglang/srt/layers/dp_attention.py +71 -0
- sglang/srt/layers/layernorm.py +5 -5
- sglang/srt/layers/linear.py +65 -14
- sglang/srt/layers/logits_processor.py +49 -64
- sglang/srt/layers/moe/ep_moe/layer.py +24 -16
- sglang/srt/layers/moe/fused_moe_native.py +84 -1
- sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +27 -7
- sglang/srt/layers/moe/fused_moe_triton/layer.py +38 -5
- sglang/srt/layers/parameter.py +18 -8
- sglang/srt/layers/quantization/__init__.py +20 -23
- sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
- sglang/srt/layers/quantization/fp8.py +10 -4
- sglang/srt/layers/quantization/modelopt_quant.py +1 -2
- sglang/srt/layers/quantization/w8a8_int8.py +1 -1
- sglang/srt/layers/radix_attention.py +2 -2
- sglang/srt/layers/rotary_embedding.py +1184 -31
- sglang/srt/layers/sampler.py +64 -6
- sglang/srt/layers/torchao_utils.py +12 -6
- sglang/srt/layers/vocab_parallel_embedding.py +2 -2
- sglang/srt/lora/lora.py +1 -9
- sglang/srt/managers/configure_logging.py +3 -0
- sglang/srt/managers/data_parallel_controller.py +79 -72
- sglang/srt/managers/detokenizer_manager.py +24 -6
- sglang/srt/managers/image_processor.py +158 -2
- sglang/srt/managers/io_struct.py +57 -3
- sglang/srt/managers/schedule_batch.py +78 -45
- sglang/srt/managers/schedule_policy.py +26 -12
- sglang/srt/managers/scheduler.py +326 -201
- sglang/srt/managers/session_controller.py +1 -0
- sglang/srt/managers/tokenizer_manager.py +210 -121
- sglang/srt/managers/tp_worker.py +6 -4
- sglang/srt/managers/tp_worker_overlap_thread.py +5 -8
- sglang/srt/managers/utils.py +44 -0
- sglang/srt/mem_cache/memory_pool.py +10 -32
- sglang/srt/metrics/collector.py +15 -6
- sglang/srt/model_executor/cuda_graph_runner.py +26 -30
- sglang/srt/model_executor/forward_batch_info.py +5 -7
- sglang/srt/model_executor/model_runner.py +44 -19
- sglang/srt/model_loader/loader.py +83 -6
- sglang/srt/model_loader/weight_utils.py +145 -6
- sglang/srt/models/baichuan.py +6 -6
- sglang/srt/models/chatglm.py +2 -2
- sglang/srt/models/commandr.py +17 -5
- sglang/srt/models/dbrx.py +13 -5
- sglang/srt/models/deepseek.py +3 -3
- sglang/srt/models/deepseek_v2.py +11 -11
- sglang/srt/models/exaone.py +2 -2
- sglang/srt/models/gemma.py +2 -2
- sglang/srt/models/gemma2.py +15 -25
- sglang/srt/models/gpt2.py +3 -5
- sglang/srt/models/gpt_bigcode.py +1 -1
- sglang/srt/models/granite.py +2 -2
- sglang/srt/models/grok.py +4 -3
- sglang/srt/models/internlm2.py +2 -2
- sglang/srt/models/llama.py +7 -5
- sglang/srt/models/minicpm.py +2 -2
- sglang/srt/models/minicpm3.py +9 -9
- sglang/srt/models/minicpmv.py +1238 -0
- sglang/srt/models/mixtral.py +3 -3
- sglang/srt/models/mixtral_quant.py +3 -3
- sglang/srt/models/mllama.py +2 -2
- sglang/srt/models/olmo.py +3 -3
- sglang/srt/models/olmo2.py +4 -4
- sglang/srt/models/olmoe.py +7 -13
- sglang/srt/models/phi3_small.py +2 -2
- sglang/srt/models/qwen.py +2 -2
- sglang/srt/models/qwen2.py +41 -4
- sglang/srt/models/qwen2_moe.py +3 -3
- sglang/srt/models/qwen2_vl.py +22 -122
- sglang/srt/models/stablelm.py +2 -2
- sglang/srt/models/torch_native_llama.py +20 -7
- sglang/srt/models/xverse.py +6 -6
- sglang/srt/models/xverse_moe.py +6 -6
- sglang/srt/openai_api/adapter.py +139 -37
- sglang/srt/openai_api/protocol.py +7 -4
- sglang/srt/sampling/custom_logit_processor.py +38 -0
- sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +11 -14
- sglang/srt/sampling/sampling_batch_info.py +143 -18
- sglang/srt/sampling/sampling_params.py +3 -1
- sglang/srt/server.py +4 -1090
- sglang/srt/server_args.py +77 -15
- sglang/srt/speculative/eagle_utils.py +37 -15
- sglang/srt/speculative/eagle_worker.py +11 -13
- sglang/srt/utils.py +164 -129
- sglang/test/runners.py +8 -13
- sglang/test/test_programs.py +2 -1
- sglang/test/test_utils.py +83 -22
- sglang/utils.py +12 -2
- sglang/version.py +1 -1
- {sglang-0.4.1.post6.dist-info → sglang-0.4.2.dist-info}/METADATA +21 -10
- {sglang-0.4.1.post6.dist-info → sglang-0.4.2.dist-info}/RECORD +138 -123
- sglang/launch_server_llavavid.py +0 -25
- sglang/srt/constrained/__init__.py +0 -16
- sglang/srt/distributed/device_communicators/__init__.py +0 -0
- {sglang-0.4.1.post6.dist-info → sglang-0.4.2.dist-info}/LICENSE +0 -0
- {sglang-0.4.1.post6.dist-info → sglang-0.4.2.dist-info}/WHEEL +0 -0
- {sglang-0.4.1.post6.dist-info → sglang-0.4.2.dist-info}/top_level.txt +0 -0
sglang/bench_serving.py
CHANGED
```diff
@@ -452,6 +452,8 @@ def get_dataset(args, tokenizer):
             num_requests=args.num_prompts,
             tokenizer=tokenizer,
             fixed_output_len=args.sharegpt_output_len,
+            context_len=args.sharegpt_context_len,
+            apply_chat_template=args.apply_chat_template,
         )
     elif args.dataset_name == "random":
         input_requests = sample_random_requests(
@@ -464,11 +466,11 @@ def get_dataset(args, tokenizer):
         )
     elif args.dataset_name == "generated-shared-prefix":
         input_requests = sample_generated_shared_prefix_requests(
-            num_groups=args.
-            prompts_per_group=args.
-            system_prompt_len=args.
-            question_len=args.
-            output_len=args.
+            num_groups=args.gsp_num_groups,
+            prompts_per_group=args.gsp_prompts_per_group,
+            system_prompt_len=args.gsp_system_prompt_len,
+            question_len=args.gsp_question_len,
+            output_len=args.gsp_output_len,
             tokenizer=tokenizer,
         )
     else:
@@ -516,6 +518,7 @@ class BenchmarkMetrics:
     median_e2e_latency_ms: float
     std_e2e_latency_ms: float
     p99_e2e_latency_ms: float
+    concurrency: float


 SHAREGPT_URL = "https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json"
@@ -560,6 +563,8 @@ def sample_sharegpt_requests(
     num_requests: int,
     tokenizer: PreTrainedTokenizerBase,
     fixed_output_len: Optional[int] = None,
+    context_len: Optional[int] = None,
+    apply_chat_template=False,
 ) -> List[Tuple[str, int, int]]:
     if fixed_output_len is not None and fixed_output_len < 4:
         raise ValueError("output_len too small")
@@ -590,6 +595,15 @@ def sample_sharegpt_requests(

         # Tokenize the prompts and completions.
         prompt = dataset[i][0]
+
+        if apply_chat_template:
+            prompt = tokenizer.apply_chat_template(
+                [{"role": "user", "content": prompt}],
+                add_generation_prompt=True,
+                tokenize=False,
+            )
+            prompt = prompt.replace(tokenizer.bos_token, "")
+
         prompt_token_ids = tokenizer.encode(prompt)
         completion = dataset[i][1]
         completion_token_ids = tokenizer.encode(completion)
@@ -597,14 +611,15 @@ def sample_sharegpt_requests(
         output_len = (
             len(completion_token_ids) if fixed_output_len is None else fixed_output_len
         )
-
+
+        if prompt_len < 2 or output_len < 2:
             # Prune too short sequences.
             continue
-
-
-        ):
+
+        if context_len and prompt_len + output_len > context_len:
             # Prune too long sequences.
             continue
+
         filtered_dataset.append((prompt, prompt_len, output_len))

     print(f"#Input tokens: {np.sum([x[1] for x in filtered_dataset])}")
@@ -706,8 +721,8 @@ def get_gen_prefix_cache_path(args, tokenizer):

     # Create a unique cache filename based on the generation parameters
     cache_key = (
-        f"
-        f"{args.
+        f"gen_shared_prefix_{args.gsp_num_groups}_{args.gsp_prompts_per_group}_"
+        f"{args.gsp_system_prompt_len}_{args.gsp_question_len}_{args.gsp_output_len}_"
         f"{tokenizer.__class__.__name__}.pkl"
     )
     return cache_dir / cache_key
@@ -877,6 +892,7 @@ def calculate_metrics(
         median_e2e_latency_ms=np.median(e2e_latencies) * 1000,
         std_e2e_latency_ms=np.std(e2e_latencies) * 1000,
         p99_e2e_latency_ms=np.percentile(e2e_latencies, 99) * 1000,
+        concurrency=np.sum(e2e_latencies) / dur_s,
     )

     return metrics, output_lens
@@ -1028,6 +1044,7 @@ async def benchmark(
             "Total token throughput (tok/s):", metrics.total_throughput
         )
     )
+    print("{:<40} {:<10.2f}".format("Concurrency:", metrics.concurrency))
     print("{s:{c}^{n}}".format(s="End-to-End Latency", n=50, c="-"))
     print(
         "{:<40} {:<10.2f}".format("Mean E2E Latency (ms):", metrics.mean_e2e_latency_ms)
@@ -1059,13 +1076,24 @@ async def benchmark(
         and metrics.output_throughput is not None
     ):
         result = {
+            # Arguments
             "backend": args.backend,
             "dataset_name": args.dataset_name,
             "request_rate": request_rate,
             "max_concurrency": max_concurrency,
+            "sharegpt_output_len": args.sharegpt_output_len,
+            "random_input_len": args.random_input_len,
+            "random_output_len": args.random_output_len,
+            "random_range_ratio": args.random_range_ratio,
+            # Results
+            "duration": benchmark_duration,
+            "completed": metrics.completed,
             "total_input_tokens": metrics.total_input,
             "total_output_tokens": metrics.total_output,
             "total_output_tokens_retokenized": metrics.total_output_retokenized,
+            "request_throughput": metrics.request_throughput,
+            "input_throughput": metrics.input_throughput,
+            "output_throughput": metrics.output_throughput,
             "mean_e2e_latency_ms": metrics.mean_e2e_latency_ms,
             "median_e2e_latency_ms": metrics.median_e2e_latency_ms,
             "std_e2e_latency_ms": metrics.std_e2e_latency_ms,
@@ -1082,14 +1110,7 @@ async def benchmark(
             "median_itl_ms": metrics.median_itl_ms,
             "std_itl_ms": metrics.std_itl_ms,
             "p99_itl_ms": metrics.p99_itl_ms,
-            "
-            "output_throughput": metrics.output_throughput,
-            "sharegpt_output_len": args.sharegpt_output_len,
-            "random_input_len": args.random_input_len,
-            "random_output_len": args.random_output_len,
-            "random_range_ratio": args.random_range_ratio,
-            "duration": benchmark_duration,
-            "completed": metrics.completed,
+            "concurrency": metrics.concurrency,
         }
     else:
         print(f"Error running benchmark for request rate: {request_rate}")
@@ -1109,36 +1130,16 @@ async def benchmark(
     with open(output_file_name, "a") as file:
         file.write(json.dumps(result) + "\n")

-    result
-
-
-
-
-
-
-
-
-
-            "median_ttft_ms": metrics.median_ttft_ms,
-            "std_ttft_ms": metrics.std_ttft_ms,
-            "p99_ttft_ms": metrics.p99_ttft_ms,
-            "mean_tpot_ms": metrics.mean_tpot_ms,
-            "median_tpot_ms": metrics.median_tpot_ms,
-            "std_tpot_ms": metrics.std_tpot_ms,
-            "p99_tpot_ms": metrics.p99_tpot_ms,
-            "mean_itl_ms": metrics.mean_itl_ms,
-            "median_itl_ms": metrics.median_itl_ms,
-            "std_itl_ms": metrics.std_itl_ms,
-            "p99_itl_ms": metrics.p99_itl_ms,
-            "input_lens": [output.prompt_len for output in outputs],
-            "output_lens": output_lens,
-            "ttfts": [output.ttft for output in outputs],
-            "itls": [output.itl for output in outputs],
-            "generated_texts": [output.generated_text for output in outputs],
-            "errors": [output.error for output in outputs],
-            "mean_e2e_latency_ms": metrics.mean_e2e_latency_ms,
-            "median_e2e_latency_ms": metrics.median_e2e_latency_ms,
-        }
+    result.update(
+        {
+            "input_lens": [output.prompt_len for output in outputs],
+            "output_lens": output_lens,
+            "ttfts": [output.ttft for output in outputs],
+            "itls": [output.itl for output in outputs],
+            "generated_texts": [output.generated_text for output in outputs],
+            "errors": [output.error for output in outputs],
+        }
+    )
     return result


@@ -1374,6 +1375,12 @@ if __name__ == "__main__":
         default=None,
         help="Output length for each request. Overrides the output length from the ShareGPT dataset.",
     )
+    parser.add_argument(
+        "--sharegpt-context-len",
+        type=int,
+        default=None,
+        help="The context length of the model for the ShareGPT dataset. Requests longer than the context length will be dropped.",
+    )
     parser.add_argument(
         "--random-input-len",
         type=int,
@@ -1413,7 +1420,6 @@ if __name__ == "__main__":
         "actual request rate may be lower than specified with --request-rate, "
         "if the server is not processing requests fast enough to keep up.",
     )
-    parser.add_argument("--seed", type=int, default=1, help="The random seed.")
     parser.add_argument(
         "--multi",
         action="store_true",
@@ -1437,14 +1443,15 @@ if __name__ == "__main__":
         help="Disable streaming mode.",
     )
     parser.add_argument(
-        "--
+        "--return-logprob",
         action="store_true",
-        help="
+        help="Return logprob.",
     )
+    parser.add_argument("--seed", type=int, default=1, help="The random seed.")
     parser.add_argument(
-        "--
+        "--disable-ignore-eos",
         action="store_true",
-        help="
+        help="Disable ignoring EOS.",
     )
     parser.add_argument(
         "--extra-request-body",
@@ -1453,49 +1460,54 @@ if __name__ == "__main__":
         help="Append given JSON object to the request payload. You can use this to specify"
         "additional generate params like sampling params.",
     )
+    parser.add_argument(
+        "--apply-chat-template",
+        action="store_true",
+        help="Apply chat template",
+    )
+    parser.add_argument(
+        "--profile",
+        action="store_true",
+        help="Use Torch Profiler. The endpoint must be launched with "
+        "SGLANG_TORCH_PROFILER_DIR to enable profiler.",
+    )
+    parser.add_argument(
+        "--lora-name",
+        type=str,
+        default=None,
+        help="The name of LoRA adapter",
+    )

     group = parser.add_argument_group("generated-shared-prefix dataset arguments")
     group.add_argument(
-        "--
+        "--gsp-num-groups",
         type=int,
         default=64,
         help="Number of system prompt groups for generated-shared-prefix dataset",
     )
     group.add_argument(
-        "--
+        "--gsp-prompts-per-group",
         type=int,
         default=16,
         help="Number of prompts per system prompt group for generated-shared-prefix dataset",
     )
     group.add_argument(
-        "--
+        "--gsp-system-prompt-len",
         type=int,
         default=2048,
         help="Target length in tokens for system prompts in generated-shared-prefix dataset",
     )
     group.add_argument(
-        "--
+        "--gsp-question-len",
         type=int,
         default=128,
         help="Target length in tokens for questions in generated-shared-prefix dataset",
     )
     group.add_argument(
-        "--
+        "--gsp-output-len",
         type=int,
         default=256,
         help="Target length in tokens for outputs in generated-shared-prefix dataset",
     )
-    parser.add_argument(
-        "--profile",
-        action="store_true",
-        help="Use Torch Profiler. The endpoint must be launched with "
-        "SGLANG_TORCH_PROFILER_DIR to enable profiler.",
-    )
-    parser.add_argument(
-        "--lora-name",
-        type=str,
-        default=None,
-        help="The name of LoRA adapter",
-    )
     args = parser.parse_args()
     run_benchmark(args)
```
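The new `concurrency` metric is the time-averaged number of in-flight requests: the sum of per-request end-to-end latencies divided by the benchmark wall-clock duration. A minimal sketch with made-up numbers (not taken from the package) shows the arithmetic behind the field added above:

```python
import numpy as np

# Hypothetical per-request end-to-end latencies (seconds) and run duration.
e2e_latencies = [2.0, 2.5, 3.0, 1.5]
dur_s = 4.0

# Same expression as the new BenchmarkMetrics.concurrency field:
# total busy request-seconds divided by elapsed seconds gives the
# average number of requests in flight during the run.
concurrency = np.sum(e2e_latencies) / dur_s
print(f"{concurrency:.2f}")  # 2.25
```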
sglang/lang/backend/runtime_endpoint.py
CHANGED
```diff
@@ -1,6 +1,11 @@
+import atexit
 import json
+import multiprocessing
 import warnings
-from typing import List, Optional
+from typing import Dict, List, Optional, Union
+
+import aiohttp
+import requests

 from sglang.global_config import global_config
 from sglang.lang.backend.base_backend import BaseBackend
@@ -251,11 +256,12 @@ class RuntimeEndpoint(BaseBackend):
         }
         obj = self._generate_http_request(s, data)

-        normalized_prompt_logprobs = [
-            r["meta_info"]["normalized_prompt_logprob"] for r in obj
-        ]
         input_token_logprobs = [r["meta_info"]["input_token_logprobs"] for r in obj]
         output_token_logprobs = [r["meta_info"]["output_token_logprobs"] for r in obj]
+        normalized_prompt_logprobs = [
+            compute_normalized_prompt_logprobs(r["meta_info"]["input_token_logprobs"])
+            for r in obj
+        ]

         # Remove extra token if no token healing occurred
         for i in range(len(input_token_logprobs)):
```
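The hunk above stops reading a server-provided `normalized_prompt_logprob` and instead recomputes it on the client from `input_token_logprobs`, using the `compute_normalized_prompt_logprobs` helper added in the next hunk. A rough illustration with synthetic values; the tuple layout is an assumption here, and only the first element (the logprob) is used:

```python
def compute_normalized_prompt_logprobs(input_logprobs):
    # Average the defined logprobs; entries whose logprob is None (or 0.0),
    # such as the first prompt token, are skipped by the truthiness check.
    values = [x[0] for x in input_logprobs if x[0]]
    return sum(values) / len(values)

# Hypothetical (logprob, token_id) pairs for a three-token prompt.
fake_input_token_logprobs = [(None, 128000), (-0.5, 791), (-1.5, 6864)]
print(compute_normalized_prompt_logprobs(fake_input_token_logprobs))  # -1.0
```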
```diff
@@ -319,3 +325,176 @@ class RuntimeEndpoint(BaseBackend):
     def _assert_success(self, res):
         if res.status_code != 200:
             raise RuntimeError(res.json())
+
+
+def compute_normalized_prompt_logprobs(input_logprobs):
+    values = [x[0] for x in input_logprobs if x[0]]
+    return sum(values) / len(values)
+
+
+class Runtime:
+    """
+    A wrapper for the HTTP server.
+    This is used for launching the server in a python program without
+    using the commond line interface.
+
+    It is mainly used for the frontend language.
+    You should use the Engine class if you want to do normal offline processing without the frontend language.
+    """
+
+    def __init__(
+        self,
+        log_level: str = "error",
+        *args,
+        **kwargs,
+    ):
+        """See the arguments in server_args.py::ServerArgs"""
+        # We delay the import of any `sglang.srt` components in `sglang.lang`, so users can run
+        # client code without installing SRT server and its dependency if they want.
+        from sglang.srt.entrypoints.http_server import launch_server
+        from sglang.srt.server_args import ServerArgs
+        from sglang.srt.utils import is_port_available
+
+        self.server_args = ServerArgs(*args, log_level=log_level, **kwargs)
+
+        # Pre-allocate ports
+        for port in range(self.server_args.port, 40000):
+            if is_port_available(port):
+                break
+        self.server_args.port = port
+
+        self.url = self.server_args.url()
+        self.generate_url = self.url + "/generate"
+
+        # NOTE: We store pid instead of proc to fix some issues during __delete__
+        self.pid = None
+        pipe_reader, pipe_writer = multiprocessing.Pipe(duplex=False)
+
+        proc = multiprocessing.Process(
+            target=launch_server,
+            args=(self.server_args, pipe_writer),
+        )
+        proc.start()
+        pipe_writer.close()
+        self.pid = proc.pid
+
+        # Before python program terminates, call shutdown implicitly. Therefore, users don't have to explicitly call .shutdown()
+        atexit.register(self.shutdown)
+
+        # TODO: remove this pipe_writer mechanism and use `/health_generate` instead.
+        try:
+            init_state = pipe_reader.recv()
+        except EOFError:
+            init_state = ""
+
+        if init_state != "ready":
+            self.shutdown()
+            raise RuntimeError(
+                "Initialization failed. Please see the error messages above."
+            )
+
+        self.endpoint = RuntimeEndpoint(self.url)
+
+    def shutdown(self):
+        from sglang.srt.utils import kill_process_tree
+
+        if self.pid is not None:
+            kill_process_tree(self.pid)
+            self.pid = None
+
+    def cache_prefix(self, prefix: str):
+        self.endpoint.cache_prefix(prefix)
+
+    def get_tokenizer(self):
+        from sglang.srt.hf_transformers_utils import get_tokenizer
+
+        return get_tokenizer(
+            self.server_args.tokenizer_path,
+            tokenizer_mode=self.server_args.tokenizer_mode,
+            trust_remote_code=self.server_args.trust_remote_code,
+            revision=self.server_args.revision,
+        )
+
+    async def async_generate(
+        self,
+        prompt: str,
+        sampling_params: Optional[Dict] = None,
+    ):
+        if self.server_args.skip_tokenizer_init:
+            json_data = {
+                "input_ids": prompt,
+                "sampling_params": sampling_params,
+                "stream": True,
+            }
+        else:
+            json_data = {
+                "text": prompt,
+                "sampling_params": sampling_params,
+                "stream": True,
+            }
+        pos = 0
+
+        timeout = aiohttp.ClientTimeout(total=3 * 3600)
+        async with aiohttp.ClientSession(timeout=timeout, trust_env=True) as session:
+            async with session.post(self.generate_url, json=json_data) as response:
+                async for chunk, _ in response.content.iter_chunks():
+                    chunk = chunk.decode("utf-8")
+                    if chunk and chunk.startswith("data:"):
+                        if chunk == "data: [DONE]\n\n":
+                            break
+                        data = json.loads(chunk[5:].strip("\n"))
+                        if "text" in data:
+                            cur = data["text"][pos:]
+                            if cur:
+                                yield cur
+                            pos += len(cur)
+                        else:
+                            yield data
+
+    add_request = async_generate
+
+    def generate(
+        self,
+        prompt: Union[str, List[str]],
+        sampling_params: Optional[Dict] = None,
+        return_logprob: Optional[Union[List[bool], bool]] = False,
+        logprob_start_len: Optional[Union[List[int], int]] = None,
+        top_logprobs_num: Optional[Union[List[int], int]] = None,
+        lora_path: Optional[List[Optional[str]]] = None,
+    ):
+        json_data = {
+            "text": prompt,
+            "sampling_params": sampling_params,
+            "return_logprob": return_logprob,
+            "logprob_start_len": logprob_start_len,
+            "top_logprobs_num": top_logprobs_num,
+            "lora_path": lora_path,
+        }
+        assert not isinstance(lora_path, list) or len(lora_path) == len(prompt)
+        response = requests.post(
+            self.url + "/generate",
+            json=json_data,
+        )
+        return json.dumps(response.json())
+
+    def encode(
+        self,
+        prompt: Union[str, List[str], List[Dict], List[List[Dict]]],
+    ):
+        json_data = {"text": prompt}
+        response = requests.post(self.url + "/encode", json=json_data)
+        return json.dumps(response.json())
+
+    async def get_server_info(self):
+        async with aiohttp.ClientSession() as session:
+            async with session.get(f"{self.url}/get_server_info") as response:
+                if response.status == 200:
+                    return await response.json()
+                else:
+                    error_data = await response.json()
+                    raise RuntimeError(
+                        f"Failed to get server info. {error_data['error']['message']}"
+                    )
+
+    def __del__(self):
+        self.shutdown()
```
sglang/lang/chat_template.py
CHANGED
```diff
@@ -88,7 +88,6 @@ register_chat_template(
     )
 )

-
 register_chat_template(
     ChatTemplate(
         name="claude",
@@ -101,7 +100,6 @@ register_chat_template(
     )
 )

-
 register_chat_template(
     ChatTemplate(
         name="chatml",
@@ -116,7 +114,6 @@ register_chat_template(
     )
 )

-
 register_chat_template(
     ChatTemplate(
         name="chatml-llava",
@@ -132,7 +129,6 @@ register_chat_template(
     )
 )

-
 # There is default system prompt for qwen
 # reference: https://modelscope.cn/models/qwen/Qwen2-72B-Instruct/file/view/master?fileName=tokenizer_config.json&status=1
 # The chat template is: "{% for message in messages %}{% if loop.first and messages[0]['role'] != 'system' %}{{ '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
@@ -219,6 +215,21 @@ register_chat_template(
     )
 )

+# https://huggingface.co/openbmb/MiniCPM-V-2_6
+register_chat_template(
+    ChatTemplate(
+        name="minicpmv",
+        default_system_prompt=None,
+        role_prefix_and_suffix={
+            "system": ("", " "),
+            "user": ("user:", " "),
+            "assistant": ("assistant:", "</s>"),
+        },
+        stop_str=("<|im_end|>", "<|endoftext|>"),
+        image_token="(<image>./</image>)",
+    )
+)
+
 # The difference between "llama-3-instruct-llava" and "llama-3-instruct" is that llava uses a different image_token.
 register_chat_template(
     ChatTemplate(
@@ -343,6 +354,37 @@ register_chat_template(
 )


+register_chat_template(
+    ChatTemplate(
+        name="deepseek-v3",
+        default_system_prompt=None,
+        role_prefix_and_suffix={
+            "system": (
+                "",
+                "",
+            ),
+            "user": (
+                "<|User|>",
+                "",
+            ),
+            "assistant": (
+                "<|Assistant|>",
+                "<|end▁of▁sentence|>",
+            ),
+        },
+        stop_str=("<|end▁of▁sentence|>",),
+    )
+)
+
+
+@register_chat_template_matching_function
+def match_deepseek(model_path: str):
+    if (
+        "deepseek-v3" in model_path.lower() or "deepseek-r1" in model_path.lower()
+    ) and "base" not in model_path.lower():
+        return get_chat_template("deepseek-v3")
+
+
 @register_chat_template_matching_function
 def match_dbrx(model_path: str):
     if "dbrx" in model_path.lower() and "instruct" in model_path.lower():
```
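As a rough sketch (not code from the package), the `deepseek-v3` role markers registered above combine into a prompt like this, reusing the `get_chat_template` helper that `match_deepseek` also calls:

```python
from sglang.lang.chat_template import get_chat_template

template = get_chat_template("deepseek-v3")
user_prefix, user_suffix = template.role_prefix_and_suffix["user"]
assistant_prefix, _ = template.role_prefix_and_suffix["assistant"]

# Manually concatenate the markers for a single user turn.
prompt = f"{user_prefix}What is 2 + 2?{user_suffix}{assistant_prefix}"
print(prompt)  # <|User|>What is 2 + 2?<|Assistant|>
```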
sglang/launch_server.py
CHANGED