sglang 0.5.1.post2__py3-none-any.whl → 0.5.1.post3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch_server.py +79 -53
- sglang/bench_serving.py +186 -14
- sglang/profiler.py +0 -1
- sglang/srt/conversation.py +38 -5
- sglang/srt/entrypoints/engine.py +1 -1
- sglang/srt/entrypoints/openai/protocol.py +27 -24
- sglang/srt/entrypoints/openai/serving_chat.py +50 -9
- sglang/srt/entrypoints/openai/serving_completions.py +15 -0
- sglang/srt/function_call/deepseekv31_detector.py +222 -0
- sglang/srt/function_call/function_call_parser.py +2 -0
- sglang/srt/function_call/gpt_oss_detector.py +144 -256
- sglang/srt/harmony_parser.py +588 -0
- sglang/srt/hf_transformers_utils.py +16 -7
- sglang/srt/layers/attention/ascend_backend.py +218 -111
- sglang/srt/layers/attention/flashattention_backend.py +241 -7
- sglang/srt/layers/attention/flashinfer_backend.py +5 -2
- sglang/srt/layers/attention/flashinfer_mla_backend.py +5 -2
- sglang/srt/layers/communicator.py +1 -2
- sglang/srt/layers/moe/cutlass_moe.py +0 -8
- sglang/srt/layers/moe/ep_moe/layer.py +1 -7
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=257,N=64,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- sglang/srt/layers/moe/topk.py +1 -1
- sglang/srt/layers/quantization/deep_gemm_wrapper/compile_utils.py +133 -235
- sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +5 -7
- sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +5 -23
- sglang/srt/layers/quantization/fp8.py +2 -1
- sglang/srt/layers/quantization/fp8_kernel.py +2 -2
- sglang/srt/layers/quantization/fp8_utils.py +2 -2
- sglang/srt/layers/quantization/mxfp4.py +16 -23
- sglang/srt/layers/quantization/mxfp4_tensor.py +3 -1
- sglang/srt/layers/utils.py +0 -14
- sglang/srt/managers/cache_controller.py +223 -156
- sglang/srt/managers/detokenizer_manager.py +5 -0
- sglang/srt/managers/io_struct.py +30 -0
- sglang/srt/managers/scheduler.py +58 -7
- sglang/srt/managers/tokenizer_manager.py +36 -3
- sglang/srt/mem_cache/hicache_storage.py +31 -20
- sglang/srt/mem_cache/hiradix_cache.py +12 -3
- sglang/srt/mem_cache/memory_pool.py +73 -14
- sglang/srt/mem_cache/memory_pool_host.py +3 -2
- sglang/srt/mem_cache/radix_cache.py +1 -0
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +5 -13
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +85 -81
- sglang/srt/model_executor/model_runner.py +1 -1
- sglang/srt/models/deepseek_v2.py +12 -3
- sglang/srt/models/gpt_oss.py +2 -1
- sglang/srt/models/qwen2_5_vl.py +1 -0
- sglang/srt/reasoning_parser.py +56 -300
- sglang/srt/server_args.py +10 -1
- sglang/srt/tokenizer/tiktoken_tokenizer.py +6 -1
- sglang/srt/utils.py +59 -5
- sglang/version.py +1 -1
- {sglang-0.5.1.post2.dist-info → sglang-0.5.1.post3.dist-info}/METADATA +4 -3
- {sglang-0.5.1.post2.dist-info → sglang-0.5.1.post3.dist-info}/RECORD +57 -54
- {sglang-0.5.1.post2.dist-info → sglang-0.5.1.post3.dist-info}/WHEEL +0 -0
- {sglang-0.5.1.post2.dist-info → sglang-0.5.1.post3.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.1.post2.dist-info → sglang-0.5.1.post3.dist-info}/top_level.txt +0 -0
sglang/bench_one_batch_server.py
CHANGED
@@ -18,7 +18,7 @@ import json
 import multiprocessing
 import os
 import time
-from typing import Tuple
+from typing import List, Tuple
 
 import requests
 
@@ -45,6 +45,7 @@ class BenchArgs:
     skip_warmup: bool = False
     show_report: bool = False
     profile: bool = False
+    profile_steps: int = 3
     profile_by_stage: bool = False
 
     @staticmethod
@@ -78,6 +79,9 @@ class BenchArgs:
         parser.add_argument("--skip-warmup", action="store_true")
         parser.add_argument("--show-report", action="store_true")
         parser.add_argument("--profile", action="store_true")
+        parser.add_argument(
+            "--profile-steps", type=int, default=BenchArgs.profile_steps
+        )
         parser.add_argument("--profile-by-stage", action="store_true")
 
     @classmethod
@@ -132,6 +136,7 @@ def run_one_case(
     result_filename: str,
     tokenizer,
     profile: bool = False,
+    profile_steps: int = 3,
     profile_by_stage: bool = False,
 ):
     requests.post(url + "/flush_cache")
@@ -162,7 +167,7 @@ def run_one_case(
     profile_link = None
     if profile:
         profile_link: str = run_profile(
-            url,
+            url, profile_steps, ["CPU", "GPU"], None, None, profile_by_stage
         )
 
     tic = time.perf_counter()
@@ -247,6 +252,71 @@ def run_one_case(
     )
 
 
+def get_report_summary(
+    result: List[Tuple], server_args: ServerArgs, bench_args: BenchArgs
+):
+    import tabulate
+
+    summary = (
+        f"\nInput lens: {bench_args.input_len}. Output lens: {bench_args.output_len}.\n"
+    )
+
+    headers = [
+        "batch size",
+        "latency (s)",
+        "input throughput (tok/s)",
+        "output throughput (tok/s)",
+        "acc length",
+        "ITL (ms)",
+        "input cost ($/1M)",
+        "output cost ($/1M)",
+    ]
+    if bench_args.profile:
+        headers.append("profile")
+    rows = []
+
+    for (
+        batch_size,
+        latency,
+        ttft,
+        input_throughput,
+        output_throughput,
+        _,
+        _,
+        acc_length,
+        trace_link,
+    ) in result:
+        if is_blackwell():
+            hourly_cost_per_gpu = 4  # $4/hour for one B200
+        else:
+            hourly_cost_per_gpu = 2  # $2/hour for one H100
+
+        hourly_cost = hourly_cost_per_gpu * server_args.tp_size
+        input_util = 0.7
+        accept_length = round(acc_length, 2) if acc_length is not None else "n/a"
+        itl = 1 / (output_throughput / batch_size) * 1000
+        input_cost = 1e6 / (input_throughput * input_util) / 3600 * hourly_cost
+        output_cost = 1e6 / output_throughput / 3600 * hourly_cost
+        row = [
+            batch_size,
+            latency,
+            input_throughput,
+            output_throughput,
+            accept_length,
+            itl,
+            input_cost,
+            output_cost,
+        ]
+        if trace_link:
+            row.append(f"[Profile]({trace_link})")
+        rows.append(row)
+
+    summary += tabulate.tabulate(
+        rows, headers=headers, tablefmt="github", floatfmt=".2f"
+    )
+    return summary
+
+
 def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
     if bench_args.base_url:
         proc, base_url = None, bench_args.base_url
@@ -321,6 +391,7 @@ def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
                 result_filename=bench_args.result_filename,
                 tokenizer=tokenizer,
                 profile=bench_args.profile,
+                profile_steps=bench_args.profile_steps,
                 profile_by_stage=bench_args.profile_by_stage,
             )[-1],
         )
@@ -337,63 +408,14 @@ def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
     if not bench_args.show_report:
         return
 
-    summary = (
-        f"\nInput lens: {bench_args.input_len}. Output lens: {bench_args.output_len}.\n"
-    )
-    summary += "| batch size | latency (s) | input throughput (tok/s) | output throughput (tok/s) | acc length | ITL (ms) | input cost ($/1M) | output cost ($/1M) |"
-
-    if bench_args.profile:
-        summary += " profile |"
-
-    summary += "\n"
-    summary += "| ---------- | ----------- | ------------------------- | ------------------------- | ---------- | -------- | ----------------- | ------------------ |"
-
-    if bench_args.profile:
-        summary += "-------------|"
-    summary += "\n"
-
-    for (
-        batch_size,
-        latency,
-        ttft,
-        input_throughput,
-        output_throughput,
-        overall_throughput,
-        last_gen_throughput,
-        acc_length,
-        trace_link,
-    ) in result:
-        if is_blackwell():
-            hourly_cost_per_gpu = 4  # $4/hour for one B200
-        else:
-            hourly_cost_per_gpu = 2  # $2/hour for one H100
-
-        hourly_cost = hourly_cost_per_gpu * server_args.tp_size
-        input_util = 0.7
-        accept_length = round(acc_length, 2) if acc_length is not None else "n/a"
-        line = (
-            f"| {batch_size} | "
-            f"{latency:.2f} | "
-            f"{input_throughput:.2f} | "
-            f"{output_throughput:.2f} | "
-            f"{accept_length} | "
-            f"{1 / (output_throughput/batch_size) * 1000:.2f} | "
-            f"{1e6 / (input_throughput * input_util) / 3600 * hourly_cost:.2f} | "
-            f"{1e6 / output_throughput / 3600 * hourly_cost:.2f} |"
-        )
-        if trace_link:
-            line += f" [Profile]({trace_link}) |"
-        line += "\n"
-        summary += line
-
-    # print metrics table
+    summary = get_report_summary(result, server_args, bench_args)
     print(summary)
 
     if is_in_ci():
         write_github_step_summary(summary)
 
 
-if __name__ == "__main__":
+def main():
     parser = argparse.ArgumentParser()
     ServerArgs.add_cli_args(parser)
     BenchArgs.add_cli_args(parser)
@@ -402,3 +424,7 @@ if __name__ == "__main__":
     bench_args = BenchArgs.from_cli_args(args)
 
     run_benchmark(server_args, bench_args)
+
+
+if __name__ == "__main__":
+    main()
sglang/bench_serving.py
CHANGED
@@ -12,6 +12,8 @@ python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-pro
 
 import argparse
 import asyncio
+import base64
+import io
 import json
 import os
 import pickle
@@ -71,7 +73,7 @@ class RequestFuncInput:
     output_len: int
     model: str
     lora_name: str
-    image_data: str
+    image_data: Optional[List[str]]
     extra_request_body: Dict[str, Any]
 
 
@@ -289,16 +291,19 @@ async def async_request_openai_chat_completions(
     ), "OpenAI Chat Completions API URL must end with 'chat/completions'."
 
     if request_func_input.image_data:
+        # Build multi-image content: a list of image_url entries followed by the text
+        content_items = [
+            {
+                "type": "image_url",
+                "image_url": {"url": img_url},
+            }
+            for img_url in request_func_input.image_data
+        ]
+        content_items.append({"type": "text", "text": request_func_input.prompt})
        messages = [
            {
                "role": "user",
-                "content": [
-                    {
-                        "type": "image_url",
-                        "image_url": {"url": request_func_input.image_data},
-                    },
-                    {"type": "text", "text": request_func_input.prompt},
-                ],
+                "content": content_items,
            },
        ]
    else:
@@ -497,7 +502,7 @@ async def async_request_sglang_generate(
         **request_func_input.extra_request_body,
     }
 
-    # Add image data if available
+    # Add image data if available (list of image urls/base64)
     if request_func_input.image_data:
         payload["image_data"] = request_func_input.image_data
 
@@ -648,7 +653,7 @@ def get_dataset(args, tokenizer):
             prompt_suffix=args.prompt_suffix,
             apply_chat_template=args.apply_chat_template,
         )
-    elif args.dataset_name.startswith("random"):
+    elif args.dataset_name.startswith("random") and args.dataset_name != "random-image":
        input_requests = sample_random_requests(
            input_len=args.random_input_len,
            output_len=args.random_output_len,
@@ -659,6 +664,18 @@ def get_dataset(args, tokenizer):
            random_sample=args.dataset_name == "random",
            return_text=not tokenize_prompt,
        )
+    elif args.dataset_name == "random-image":
+        assert not tokenize_prompt, "random-image does not support --tokenize-prompt"
+        input_requests = sample_random_image_requests(
+            num_requests=args.num_prompts,
+            num_images=args.random_image_num_images,
+            input_len=args.random_input_len,
+            output_len=args.random_output_len,
+            range_ratio=args.random_range_ratio,
+            tokenizer=tokenizer,
+            apply_chat_template=args.apply_chat_template,
+            image_resolution=args.random_image_resolution,
+        )
    elif args.dataset_name == "generated-shared-prefix":
        assert not tokenize_prompt
        input_requests = sample_generated_shared_prefix_requests(
@@ -790,7 +807,7 @@ class DatasetRow:
     prompt: str
     prompt_len: int
     output_len: int
-    image_data: Optional[str] = None
+    image_data: Optional[List[str]] = None
 
 
 def sample_mmmu_requests(
@@ -913,7 +930,7 @@ def sample_mmmu_requests(
                 prompt=prompt,
                 prompt_len=prompt_len,
                 output_len=output_len,
-                image_data=image_data,
+                image_data=[image_data],
             )
         )
 
@@ -1113,6 +1130,132 @@ def sample_random_requests(
     return input_requests
 
 
+def parse_random_image_resolution(image_resolution: str) -> Tuple[int, int]:
+    """Parse image resolution into (width, height).
+
+    Supports presets '1080p', '720p', '360p' and custom 'heightxwidth' format
+    (e.g., '1080x1920' means height=1080, width=1920).
+    """
+    resolution_to_size = {
+        "4k": (3840, 2160),
+        "1080p": (1920, 1080),
+        "720p": (1280, 720),
+        "360p": (640, 360),
+    }
+    if image_resolution in resolution_to_size:
+        return resolution_to_size[image_resolution]
+
+    res = image_resolution.strip().lower()
+    if "x" in res:
+        parts = res.split("x")
+        if len(parts) == 2 and parts[0].isdigit() and parts[1].isdigit():
+            height = int(parts[0])
+            width = int(parts[1])
+            if height > 0 and width > 0:
+                return (width, height)
+
+    raise ValueError(
+        f"Unsupported random-image resolution: {image_resolution}. "
+        "Choose from 4k, 1080p, 720p, 360p, or provide custom 'heightxwidth' (e.g., 1080x1920)."
+    )
+
+
+def sample_random_image_requests(
+    num_requests: int,
+    num_images: int,
+    input_len: int,
+    output_len: int,
+    range_ratio: float,
+    tokenizer: PreTrainedTokenizerBase,
+    apply_chat_template: bool = True,
+    image_resolution: str = "1080p",
+) -> List[DatasetRow]:
+    """Generate requests with random images.
+
+    - Each request includes ``num_images`` random images.
+    - Supported resolutions: 4k (3840x2160), 1080p (1920x1080), 720p (1280x720), 360p (640x360),
+      or custom 'heightxwidth' (e.g., 1080x1920).
+    - Text lengths follow the 'random' dataset sampling rule. ``prompt_len``
+      only counts text tokens and excludes image data.
+    """
+    try:
+        import pybase64
+        from PIL import Image
+    except ImportError as e:
+        raise ImportError(
+            "Please install Pillow to generate random images: pip install pillow"
+        ) from e
+
+    # Parse resolution (supports presets and 'heightxwidth')
+    width, height = parse_random_image_resolution(image_resolution)
+
+    # Check for potentially problematic combinations and warn user
+    if width * height >= 1920 * 1080 and num_images * num_requests >= 100:
+        warnings.warn(
+            f"High resolution ({width}x{height}) with {num_images * num_requests} total images "
+            f"may take a long time. Consider reducing resolution or image count.",
+            UserWarning,
+            stacklevel=2,
+        )
+
+    # Sample text lengths
+    input_lens = np.random.randint(
+        max(int(input_len * range_ratio), 1), input_len + 1, size=num_requests
+    )
+    output_lens = np.random.randint(
+        int(output_len * range_ratio), output_len + 1, size=num_requests
+    )
+
+    def _gen_random_image_data_uri(width: int = width, height: int = height) -> str:
+        arr = (np.random.rand(height, width, 3) * 255).astype(np.uint8)
+        img = Image.fromarray(arr, mode="RGB")
+        buf = io.BytesIO()
+        img.save(buf, format="JPEG", quality=85)
+        encoded = pybase64.b64encode(buf.getvalue()).decode("utf-8")
+        return f"data:image/jpeg;base64,{encoded}"
+
+    dataset: List[DatasetRow] = []
+    for i in range(num_requests):
+        # Generate text prompt
+        text_prompt = gen_prompt(tokenizer, int(input_lens[i]))
+
+        # Generate image list
+        images = [_gen_random_image_data_uri() for _ in range(num_images)]
+
+        prompt_str = text_prompt
+        if apply_chat_template:
+            try:
+                content_items = [
+                    {"type": "image_url", "image_url": {"url": img_url}}
+                    for img_url in images
+                ]
+                content_items.append({"type": "text", "text": text_prompt})
+                prompt_str = tokenizer.apply_chat_template(
+                    [{"role": "user", "content": content_items}],
+                    add_generation_prompt=True,
+                    tokenize=False,
+                )
+            except Exception:
+                # Some tokenizers do not support list content; fall back to a placeholder in the text
+                prompt_str = f"<image>{text_prompt}"
+
+        prompt_token_ids = tokenizer.encode(prompt_str)
+        prompt_token_len = len(prompt_token_ids)
+
+        dataset.append(
+            DatasetRow(
+                prompt=prompt_str,
+                prompt_len=prompt_token_len,
+                output_len=int(output_lens[i]),
+                image_data=images,
+            )
+        )
+
+    print(f"#Input tokens: {np.sum([x.prompt_len for x in dataset])}")
+    print(f"#Output tokens: {np.sum([x.output_len for x in dataset])}")
+    return dataset
+
+
 def gen_prompt(tokenizer, token_num):
     """Generate a random prompt of specified token length using tokenizer vocabulary."""
     all_available_tokens = list(tokenizer.get_vocab().values())
@@ -1579,7 +1722,13 @@ async def benchmark(
         output_file_name = args.output_file
     else:
         now = datetime.now().strftime("%m%d")
-        if args.dataset_name.startswith("random"):
+        if args.dataset_name == "random-image":
+            output_file_name = (
+                f"{args.backend}_{now}_{args.num_prompts}_{args.random_input_len}_"
+                f"{args.random_output_len}_{args.random_image_num_images}imgs_"
+                f"{args.random_image_resolution}.jsonl"
+            )
+        elif args.dataset_name.startswith("random"):
            output_file_name = f"{args.backend}_{now}_{args.num_prompts}_{args.random_input_len}_{args.random_output_len}.jsonl"
        else:
            output_file_name = f"{args.backend}_{now}_{args.num_prompts}_sharegpt.jsonl"
@@ -1819,7 +1968,14 @@ if __name__ == "__main__":
         "--dataset-name",
         type=str,
         default="sharegpt",
-        choices=["sharegpt", "random", "random-ids", "generated-shared-prefix", "mmmu"],
+        choices=[
+            "sharegpt",
+            "random",
+            "random-ids",
+            "generated-shared-prefix",
+            "mmmu",
+            "random-image",
+        ],
         help="Name of the dataset to benchmark on.",
     )
     parser.add_argument(
@@ -1872,6 +2028,22 @@ if __name__ == "__main__":
         help="Range of sampled ratio of input/output length, "
         "used only for random dataset.",
     )
+    # random-image dataset args
+    parser.add_argument(
+        "--random-image-num-images",
+        type=int,
+        default=1,
+        help="Number of images per request (only available with the random-image dataset)",
+    )
+    parser.add_argument(
+        "--random-image-resolution",
+        type=str,
+        default="1080p",
+        help=(
+            "Resolution of random images for random-image dataset. "
+            "Supports presets 4k/1080p/720p/360p or custom 'heightxwidth' (e.g., 1080x1920)."
+        ),
+    )
     parser.add_argument(
         "--request-rate",
         type=float,
sglang/profiler.py
CHANGED
sglang/srt/conversation.py
CHANGED
@@ -26,6 +26,8 @@ Key components:
 # Adapted from
 # https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py
 import dataclasses
+import json
+import os
 import re
 from enum import IntEnum, auto
 from typing import Callable, Dict, List, Optional, Tuple, Union
@@ -959,16 +961,42 @@ register_conv_template(
 )
 
 
+MODEL_TYPE_TO_TEMPLATE = {
+    "internvl_chat": "internvl-2-5",
+    "deepseek_vl_v2": "deepseek-vl2",
+    "multi_modality": "janus-pro",
+    "phi4mm": "phi-4-mm",
+    "minicpmv": "minicpmv",
+    "minicpmo": "minicpmo",
+}
+
+
+def get_model_type(model_path: str) -> Optional[str]:
+    config_path = os.path.join(model_path, "config.json")
+    if not os.path.exists(config_path):
+        return None
+    try:
+        with open(config_path, "r", encoding="utf-8") as f:
+            config = json.load(f)
+        return config.get("model_type")
+    except (IOError, json.JSONDecodeError):
+        return None
+
+
 @register_conv_template_matching_function
 def match_internvl(model_path: str):
     if re.search(r"internvl", model_path, re.IGNORECASE):
         return "internvl-2-5"
+    model_type = get_model_type(model_path)
+    return MODEL_TYPE_TO_TEMPLATE.get(model_type)
 
 
 @register_conv_template_matching_function
 def match_deepseek_janus_pro(model_path: str):
     if re.search(r"janus", model_path, re.IGNORECASE):
         return "janus-pro"
+    model_type = get_model_type(model_path)
+    return MODEL_TYPE_TO_TEMPLATE.get(model_type)
 
 
 @register_conv_template_matching_function
@@ -981,6 +1009,8 @@ def match_vicuna(model_path: str):
 def match_deepseek_vl(model_path: str):
     if re.search(r"deepseek.*vl2", model_path, re.IGNORECASE):
         return "deepseek-vl2"
+    model_type = get_model_type(model_path)
+    return MODEL_TYPE_TO_TEMPLATE.get(model_type)
 
 
 @register_conv_template_matching_function
@@ -994,14 +1024,17 @@ def match_qwen_chat_ml(model_path: str):
 
 
 @register_conv_template_matching_function
-def ...  # old minicpm matcher (body truncated in the source diff view)
+def match_minicpm(model_path: str):
+    match = re.search(r"minicpm-(v|o)", model_path, re.IGNORECASE)
+    if match:
+        return f"minicpm{match.group(1).lower()}"
+    model_type = get_model_type(model_path)
+    return MODEL_TYPE_TO_TEMPLATE.get(model_type)
 
 
 @register_conv_template_matching_function
 def match_phi_4_mm(model_path: str):
     if "phi-4-multimodal" in model_path.lower():
         return "phi-4-mm"
+    model_type = get_model_type(model_path)
+    return MODEL_TYPE_TO_TEMPLATE.get(model_type)
sglang/srt/entrypoints/engine.py
CHANGED
@@ -680,7 +680,7 @@ def _set_envs_and_config(server_args: ServerArgs):
     if _is_cuda and not get_bool_env_var("SGLANG_SKIP_SGL_KERNEL_VERSION_CHECK"):
         assert_pkg_version(
             "sgl-kernel",
-            "0.3.…",
+            "0.3.7",
             "Please reinstall the latest version with `pip install sgl-kernel --force-reinstall`",
         )
 
sglang/srt/entrypoints/openai/protocol.py
CHANGED
@@ -35,6 +35,8 @@ from pydantic import (
 )
 from typing_extensions import Literal
 
+DEFAULT_MODEL_NAME = "default"
+
 
 class ModelCard(BaseModel):
     """Model cards."""
@@ -108,6 +110,23 @@ class JsonSchemaResponseFormat(BaseModel):
     strict: Optional[bool] = False
 
 
+class ResponseFormat(BaseModel):
+    type: Literal["text", "json_object", "json_schema"]
+    json_schema: Optional[JsonSchemaResponseFormat] = None
+
+
+class StructuresResponseFormat(BaseModel):
+    begin: str
+    schema_: Optional[Dict[str, object]] = Field(alias="schema", default=None)
+    end: str
+
+
+class StructuralTagResponseFormat(BaseModel):
+    type: Literal["structural_tag"]
+    structures: List[StructuresResponseFormat]
+    triggers: List[str]
+
+
 class FileRequest(BaseModel):
     # https://platform.openai.com/docs/api-reference/files/create
     file: bytes  # The File object (not file name) to be uploaded
@@ -166,7 +185,7 @@ class BatchResponse(BaseModel):
 class CompletionRequest(BaseModel):
     # Ordered by official OpenAI API documentation
     # https://platform.openai.com/docs/api-reference/completions/create
-    model: str
+    model: str = DEFAULT_MODEL_NAME
     prompt: Union[List[int], List[List[int]], str, List[str]]
     best_of: Optional[int] = None
     echo: bool = False
@@ -200,6 +219,7 @@ class CompletionRequest(BaseModel):
     skip_special_tokens: bool = True
     lora_path: Optional[Union[List[Optional[str]], Optional[str]]] = None
     session_params: Optional[Dict] = None
+    response_format: Optional[Union[ResponseFormat, StructuralTagResponseFormat]] = None
 
     # For PD disaggregation
     bootstrap_host: Optional[Union[List[str], str]] = None
@@ -327,7 +347,7 @@ class ToolCall(BaseModel):
 
 
 class ChatCompletionMessageGenericParam(BaseModel):
-    role: Literal["system", "assistant", "tool"]
+    role: Literal["system", "assistant", "tool", "function"]
     content: Union[str, List[ChatCompletionMessageContentTextPart], None] = Field(
         default=None
     )
@@ -341,9 +361,9 @@ class ChatCompletionMessageGenericParam(BaseModel):
     def _normalize_role(cls, v):
         if isinstance(v, str):
             v_lower = v.lower()
-            if v_lower not in {"system", "assistant", "tool"}:
+            if v_lower not in {"system", "assistant", "tool", "function"}:
                 raise ValueError(
-                    "'role' must be one of 'system', 'assistant', or 'tool' (case-insensitive)."
+                    "'role' must be one of 'system', 'assistant', 'tool', or 'function' (case-insensitive)."
                 )
             return v_lower
         raise ValueError("'role' must be a string")
@@ -359,23 +379,6 @@ ChatCompletionMessageParam = Union[
 ]
 
 
-class ResponseFormat(BaseModel):
-    type: Literal["text", "json_object", "json_schema"]
-    json_schema: Optional[JsonSchemaResponseFormat] = None
-
-
-class StructuresResponseFormat(BaseModel):
-    begin: str
-    schema_: Optional[Dict[str, object]] = Field(alias="schema", default=None)
-    end: str
-
-
-class StructuralTagResponseFormat(BaseModel):
-    type: Literal["structural_tag"]
-    structures: List[StructuresResponseFormat]
-    triggers: List[str]
-
-
 class Function(BaseModel):
     """Function descriptions."""
 
@@ -409,7 +412,7 @@ class ChatCompletionRequest(BaseModel):
     # Ordered by official OpenAI API documentation
     # https://platform.openai.com/docs/api-reference/chat/create
     messages: List[ChatCompletionMessageParam]
-    model: str
+    model: str = DEFAULT_MODEL_NAME
     frequency_penalty: float = 0.0
     logit_bias: Optional[Dict[str, float]] = None
     logprobs: bool = False
@@ -571,7 +574,7 @@ class EmbeddingRequest(BaseModel):
     # Ordered by official OpenAI API documentation
     # https://platform.openai.com/docs/api-reference/embeddings/create
     input: EmbeddingInput
-    model: str
+    model: str = DEFAULT_MODEL_NAME
     encoding_format: str = "float"
     dimensions: Optional[int] = None
     user: Optional[str] = None
@@ -605,7 +608,7 @@ class ScoringRequest(BaseModel):
     )
     apply_softmax: bool = False
     item_first: bool = False
-    model: str
+    model: str = DEFAULT_MODEL_NAME
 
 
 class ScoringResponse(BaseModel):
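
The recurring `model: str = DEFAULT_MODEL_NAME` change means clients may now omit `model` entirely across completions, chat, embeddings, and scoring. A toy pydantic model illustrating the behavior; `MiniCompletionRequest` is a stand-in, not the full `CompletionRequest`:

# "model" falls back to DEFAULT_MODEL_NAME when the client omits it.
from typing import List, Union

from pydantic import BaseModel

DEFAULT_MODEL_NAME = "default"


class MiniCompletionRequest(BaseModel):
    model: str = DEFAULT_MODEL_NAME
    prompt: Union[str, List[str]]


req = MiniCompletionRequest(prompt="Hello")  # no "model" supplied
print(req.model)  # -> "default"

Relatedly, the ResponseFormat classes moved above `CompletionRequest` because the completions endpoint now accepts `response_format` too, not just chat.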
|