sglang 0.5.1.post1__py3-none-any.whl → 0.5.1.post3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch_server.py +79 -53
- sglang/bench_serving.py +186 -14
- sglang/profiler.py +0 -1
- sglang/srt/conversation.py +38 -5
- sglang/srt/disaggregation/decode.py +4 -0
- sglang/srt/disaggregation/prefill.py +4 -0
- sglang/srt/entrypoints/engine.py +2 -2
- sglang/srt/entrypoints/openai/protocol.py +27 -24
- sglang/srt/entrypoints/openai/serving_chat.py +50 -9
- sglang/srt/entrypoints/openai/serving_completions.py +15 -0
- sglang/srt/entrypoints/tool.py +7 -7
- sglang/srt/function_call/deepseekv31_detector.py +222 -0
- sglang/srt/function_call/function_call_parser.py +2 -0
- sglang/srt/function_call/gpt_oss_detector.py +144 -256
- sglang/srt/harmony_parser.py +588 -0
- sglang/srt/hf_transformers_utils.py +16 -7
- sglang/srt/layers/attention/ascend_backend.py +218 -111
- sglang/srt/layers/attention/flashattention_backend.py +241 -7
- sglang/srt/layers/attention/flashinfer_backend.py +5 -2
- sglang/srt/layers/attention/flashinfer_mla_backend.py +76 -91
- sglang/srt/layers/attention/utils.py +15 -94
- sglang/srt/layers/communicator.py +1 -2
- sglang/srt/layers/moe/cutlass_moe.py +0 -15
- sglang/srt/layers/moe/ep_moe/layer.py +1 -7
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
- sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=257,N=64,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
- sglang/srt/layers/moe/topk.py +1 -1
- sglang/srt/layers/quantization/deep_gemm_wrapper/compile_utils.py +133 -235
- sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +5 -7
- sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +5 -23
- sglang/srt/layers/quantization/fp8.py +2 -1
- sglang/srt/layers/quantization/fp8_kernel.py +2 -2
- sglang/srt/layers/quantization/fp8_utils.py +2 -2
- sglang/srt/layers/quantization/modelopt_quant.py +2 -2
- sglang/srt/layers/quantization/mxfp4.py +16 -23
- sglang/srt/layers/quantization/mxfp4_tensor.py +3 -1
- sglang/srt/layers/utils.py +0 -14
- sglang/srt/lora/lora_manager.py +29 -12
- sglang/srt/managers/cache_controller.py +223 -156
- sglang/srt/managers/detokenizer_manager.py +5 -0
- sglang/srt/managers/io_struct.py +30 -0
- sglang/srt/managers/scheduler.py +58 -7
- sglang/srt/managers/scheduler_metrics_mixin.py +15 -0
- sglang/srt/managers/tokenizer_manager.py +36 -3
- sglang/srt/mem_cache/hicache_storage.py +31 -20
- sglang/srt/mem_cache/hiradix_cache.py +12 -3
- sglang/srt/mem_cache/memory_pool.py +73 -14
- sglang/srt/mem_cache/memory_pool_host.py +3 -2
- sglang/srt/mem_cache/radix_cache.py +1 -0
- sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +5 -13
- sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +85 -81
- sglang/srt/metrics/collector.py +5 -5
- sglang/srt/model_executor/cuda_graph_runner.py +2 -2
- sglang/srt/model_executor/model_runner.py +1 -1
- sglang/srt/models/deepseek_v2.py +12 -3
- sglang/srt/models/gpt_oss.py +2 -1
- sglang/srt/models/qwen2_5_vl.py +1 -0
- sglang/srt/offloader.py +115 -0
- sglang/srt/reasoning_parser.py +56 -300
- sglang/srt/server_args.py +10 -5
- sglang/srt/tokenizer/tiktoken_tokenizer.py +6 -1
- sglang/srt/utils.py +59 -12
- sglang/test/test_cutlass_moe.py +33 -28
- sglang/version.py +1 -1
- {sglang-0.5.1.post1.dist-info → sglang-0.5.1.post3.dist-info}/METADATA +6 -5
- {sglang-0.5.1.post1.dist-info → sglang-0.5.1.post3.dist-info}/RECORD +69 -65
- {sglang-0.5.1.post1.dist-info → sglang-0.5.1.post3.dist-info}/WHEEL +0 -0
- {sglang-0.5.1.post1.dist-info → sglang-0.5.1.post3.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.1.post1.dist-info → sglang-0.5.1.post3.dist-info}/top_level.txt +0 -0
sglang/bench_one_batch_server.py
CHANGED
@@ -18,7 +18,7 @@ import json
 import multiprocessing
 import os
 import time
-from typing import Tuple
+from typing import List, Tuple
 
 import requests
 
@@ -45,6 +45,7 @@ class BenchArgs:
     skip_warmup: bool = False
     show_report: bool = False
     profile: bool = False
+    profile_steps: int = 3
     profile_by_stage: bool = False
 
     @staticmethod
@@ -78,6 +79,9 @@ class BenchArgs:
         parser.add_argument("--skip-warmup", action="store_true")
         parser.add_argument("--show-report", action="store_true")
         parser.add_argument("--profile", action="store_true")
+        parser.add_argument(
+            "--profile-steps", type=int, default=BenchArgs.profile_steps
+        )
         parser.add_argument("--profile-by-stage", action="store_true")
 
     @classmethod
@@ -132,6 +136,7 @@ def run_one_case(
     result_filename: str,
     tokenizer,
     profile: bool = False,
+    profile_steps: int = 3,
     profile_by_stage: bool = False,
 ):
     requests.post(url + "/flush_cache")
@@ -162,7 +167,7 @@ def run_one_case(
     profile_link = None
     if profile:
         profile_link: str = run_profile(
-            url,
+            url, profile_steps, ["CPU", "GPU"], None, None, profile_by_stage
         )
 
     tic = time.perf_counter()
@@ -247,6 +252,71 @@ def run_one_case(
     )
 
 
+def get_report_summary(
+    result: List[Tuple], server_args: ServerArgs, bench_args: BenchArgs
+):
+    import tabulate
+
+    summary = (
+        f"\nInput lens: {bench_args.input_len}. Output lens: {bench_args.output_len}.\n"
+    )
+
+    headers = [
+        "batch size",
+        "latency (s)",
+        "input throughput (tok/s)",
+        "output throughput (tok/s)",
+        "acc length",
+        "ITL (ms)",
+        "input cost ($/1M)",
+        "output cost ($/1M)",
+    ]
+    if bench_args.profile:
+        headers.append("profile")
+    rows = []
+
+    for (
+        batch_size,
+        latency,
+        ttft,
+        input_throughput,
+        output_throughput,
+        _,
+        _,
+        acc_length,
+        trace_link,
+    ) in result:
+        if is_blackwell():
+            hourly_cost_per_gpu = 4  # $4/hour for one B200
+        else:
+            hourly_cost_per_gpu = 2  # $2/hour for one H100
+
+        hourly_cost = hourly_cost_per_gpu * server_args.tp_size
+        input_util = 0.7
+        accept_length = round(acc_length, 2) if acc_length is not None else "n/a"
+        itl = 1 / (output_throughput / batch_size) * 1000
+        input_cost = 1e6 / (input_throughput * input_util) / 3600 * hourly_cost
+        output_cost = 1e6 / output_throughput / 3600 * hourly_cost
+        row = [
+            batch_size,
+            latency,
+            input_throughput,
+            output_throughput,
+            accept_length,
+            itl,
+            input_cost,
+            output_cost,
+        ]
+        if trace_link:
+            row.append(f"[Profile]({trace_link})")
+        rows.append(row)
+
+    summary += tabulate.tabulate(
+        rows, headers=headers, tablefmt="github", floatfmt=".2f"
+    )
+    return summary
+
+
 def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
     if bench_args.base_url:
         proc, base_url = None, bench_args.base_url
@@ -321,6 +391,7 @@ def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
                 result_filename=bench_args.result_filename,
                 tokenizer=tokenizer,
                 profile=bench_args.profile,
+                profile_steps=bench_args.profile_steps,
                 profile_by_stage=bench_args.profile_by_stage,
             )[-1],
         )
@@ -337,63 +408,14 @@ def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
     if not bench_args.show_report:
         return
 
-    summary = (
-        f"\nInput lens: {bench_args.input_len}. Output lens: {bench_args.output_len}.\n"
-    )
-    summary += "| batch size | latency (s) | input throughput (tok/s) | output throughput (tok/s) | acc length | ITL (ms) | input cost ($/1M) | output cost ($/1M) |"
-
-    if bench_args.profile:
-        summary += " profile |"
-
-    summary += "\n"
-    summary += "| ---------- | ----------- | ------------------------- | ------------------------- | ---------- | -------- | ----------------- | ------------------ |"
-
-    if bench_args.profile:
-        summary += "-------------|"
-    summary += "\n"
-
-    for (
-        batch_size,
-        latency,
-        ttft,
-        input_throughput,
-        output_throughput,
-        overall_throughput,
-        last_gen_throughput,
-        acc_length,
-        trace_link,
-    ) in result:
-        if is_blackwell():
-            hourly_cost_per_gpu = 4  # $4/hour for one B200
-        else:
-            hourly_cost_per_gpu = 2  # $2/hour for one H100
-
-        hourly_cost = hourly_cost_per_gpu * server_args.tp_size
-        input_util = 0.7
-        accept_length = round(acc_length, 2) if acc_length is not None else "n/a"
-        line = (
-            f"| {batch_size} | "
-            f"{latency:.2f} | "
-            f"{input_throughput:.2f} | "
-            f"{output_throughput:.2f} | "
-            f"{accept_length} | "
-            f"{1 / (output_throughput/batch_size) * 1000:.2f} | "
-            f"{1e6 / (input_throughput * input_util) / 3600 * hourly_cost:.2f} | "
-            f"{1e6 / output_throughput / 3600 * hourly_cost:.2f} |"
-        )
-        if trace_link:
-            line += f" [Profile]({trace_link}) |"
-        line += "\n"
-        summary += line
-
-    # print metrics table
+    summary = get_report_summary(result, server_args, bench_args)
     print(summary)
 
     if is_in_ci():
         write_github_step_summary(summary)
 
 
-if __name__ == "__main__":
+def main():
     parser = argparse.ArgumentParser()
     ServerArgs.add_cli_args(parser)
     BenchArgs.add_cli_args(parser)
@@ -402,3 +424,7 @@ if __name__ == "__main__":
     bench_args = BenchArgs.from_cli_args(args)
 
     run_benchmark(server_args, bench_args)
+
+
+if __name__ == "__main__":
+    main()
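The new `get_report_summary` replaces the hand-built markdown table with `tabulate` and derives ITL and per-million-token costs from the measured throughputs. Below is a small stand-alone sketch of that arithmetic; the throughput numbers and `tp_size` are made up, and only the formulas, GPU rates ($2/h H100, $4/h B200), and the 0.7 input-utilization assumption mirror the diff.

```python
# Toy illustration of the cost math used by the new get_report_summary.
import tabulate

hourly_cost_per_gpu = 2      # $2/hour assumed for one H100 (4 for a B200)
tp_size = 8                  # hypothetical tensor-parallel size
hourly_cost = hourly_cost_per_gpu * tp_size

batch_size = 64
input_throughput = 20_000.0  # tok/s, hypothetical
output_throughput = 4_000.0  # tok/s, hypothetical
input_util = 0.7             # assumed input-side utilization, as in the diff

itl = 1 / (output_throughput / batch_size) * 1000                         # ms per output token per request
input_cost = 1e6 / (input_throughput * input_util) / 3600 * hourly_cost   # $ per 1M input tokens
output_cost = 1e6 / output_throughput / 3600 * hourly_cost                # $ per 1M output tokens

print(
    tabulate.tabulate(
        [[batch_size, input_throughput, output_throughput, itl, input_cost, output_cost]],
        headers=["batch size", "input tok/s", "output tok/s", "ITL (ms)", "input $/1M", "output $/1M"],
        tablefmt="github",
        floatfmt=".2f",
    )
)
```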
sglang/bench_serving.py
CHANGED
@@ -12,6 +12,8 @@ python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-pro
 
 import argparse
 import asyncio
+import base64
+import io
 import json
 import os
 import pickle
@@ -71,7 +73,7 @@ class RequestFuncInput:
     output_len: int
     model: str
     lora_name: str
-    image_data: str
+    image_data: Optional[List[str]]
     extra_request_body: Dict[str, Any]
 
 
@@ -289,16 +291,19 @@ async def async_request_openai_chat_completions(
     ), "OpenAI Chat Completions API URL must end with 'chat/completions'."
 
     if request_func_input.image_data:
+        # Build multi-image content: a list of image_url entries followed by the text
+        content_items = [
+            {
+                "type": "image_url",
+                "image_url": {"url": img_url},
+            }
+            for img_url in request_func_input.image_data
+        ]
+        content_items.append({"type": "text", "text": request_func_input.prompt})
         messages = [
             {
                 "role": "user",
-                "content": [
-                    {
-                        "type": "image_url",
-                        "image_url": {"url": request_func_input.image_data},
-                    },
-                    {"type": "text", "text": request_func_input.prompt},
-                ],
+                "content": content_items,
             },
         ]
     else:
@@ -497,7 +502,7 @@ async def async_request_sglang_generate(
         **request_func_input.extra_request_body,
     }
 
-    # Add image data if available
+    # Add image data if available (list of image urls/base64)
    if request_func_input.image_data:
        payload["image_data"] = request_func_input.image_data
 
@@ -648,7 +653,7 @@ def get_dataset(args, tokenizer):
             prompt_suffix=args.prompt_suffix,
             apply_chat_template=args.apply_chat_template,
         )
-    elif args.dataset_name.startswith("random"):
+    elif args.dataset_name.startswith("random") and args.dataset_name != "random-image":
         input_requests = sample_random_requests(
             input_len=args.random_input_len,
             output_len=args.random_output_len,
@@ -659,6 +664,18 @@ def get_dataset(args, tokenizer):
             random_sample=args.dataset_name == "random",
             return_text=not tokenize_prompt,
         )
+    elif args.dataset_name == "random-image":
+        assert not tokenize_prompt, "random-image does not support --tokenize-prompt"
+        input_requests = sample_random_image_requests(
+            num_requests=args.num_prompts,
+            num_images=args.random_image_num_images,
+            input_len=args.random_input_len,
+            output_len=args.random_output_len,
+            range_ratio=args.random_range_ratio,
+            tokenizer=tokenizer,
+            apply_chat_template=args.apply_chat_template,
+            image_resolution=args.random_image_resolution,
+        )
     elif args.dataset_name == "generated-shared-prefix":
         assert not tokenize_prompt
         input_requests = sample_generated_shared_prefix_requests(
@@ -790,7 +807,7 @@ class DatasetRow:
     prompt: str
     prompt_len: int
     output_len: int
-    image_data: Optional[str] = None
+    image_data: Optional[List[str]] = None
 
 
 def sample_mmmu_requests(
@@ -913,7 +930,7 @@ def sample_mmmu_requests(
                 prompt=prompt,
                 prompt_len=prompt_len,
                 output_len=output_len,
-                image_data=image_data,
+                image_data=[image_data],
             )
         )
 
@@ -1113,6 +1130,132 @@ def sample_random_requests(
     return input_requests
 
 
+def parse_random_image_resolution(image_resolution: str) -> Tuple[int, int]:
+    """Parse image resolution into (width, height).
+
+    Supports presets '1080p', '720p', '360p' and custom 'heightxwidth' format
+    (e.g., '1080x1920' means height=1080, width=1920).
+    """
+    resolution_to_size = {
+        "4k": (3840, 2160),
+        "1080p": (1920, 1080),
+        "720p": (1280, 720),
+        "360p": (640, 360),
+    }
+    if image_resolution in resolution_to_size:
+        return resolution_to_size[image_resolution]
+
+    res = image_resolution.strip().lower()
+    if "x" in res:
+        parts = res.split("x")
+        if len(parts) == 2 and parts[0].isdigit() and parts[1].isdigit():
+            height = int(parts[0])
+            width = int(parts[1])
+            if height > 0 and width > 0:
+                return (width, height)
+
+    raise ValueError(
+        f"Unsupported random-image resolution: {image_resolution}. "
+        "Choose from 4k, 1080p, 720p, 360p, or provide custom 'heightxwidth' (e.g., 1080x1920)."
+    )
+
+
+def sample_random_image_requests(
+    num_requests: int,
+    num_images: int,
+    input_len: int,
+    output_len: int,
+    range_ratio: float,
+    tokenizer: PreTrainedTokenizerBase,
+    apply_chat_template: bool = True,
+    image_resolution: str = "1080p",
+) -> List[DatasetRow]:
+    """Generate requests with random images.
+
+    - Each request includes ``num_images`` random images.
+    - Supported resolutions: 4k (3840x2160), 1080p (1920x1080), 720p (1280x720), 360p (640x360),
+      or custom 'heightxwidth' (e.g., 1080x1920).
+    - Text lengths follow the 'random' dataset sampling rule. ``prompt_len``
+      only counts text tokens and excludes image data.
+    """
+    try:
+        import pybase64
+        from PIL import Image
+    except ImportError as e:
+        raise ImportError(
+            "Please install Pillow to generate random images: pip install pillow"
+        ) from e
+
+    # Parse resolution (supports presets and 'heightxwidth')
+    width, height = parse_random_image_resolution(image_resolution)
+
+    # Check for potentially problematic combinations and warn user
+    if width * height >= 1920 * 1080 and num_images * num_requests >= 100:
+        warnings.warn(
+            f"High resolution ({width}x{height}) with {num_images * num_requests} total images "
+            f"may take a long time. Consider reducing resolution or image count.",
+            UserWarning,
+            stacklevel=2,
+        )
+
+    # Sample text lengths
+    input_lens = np.random.randint(
+        max(int(input_len * range_ratio), 1), input_len + 1, size=num_requests
+    )
+    output_lens = np.random.randint(
+        int(output_len * range_ratio), output_len + 1, size=num_requests
+    )
+
+    def _gen_random_image_data_uri(width: int = width, height: int = height) -> str:
+        arr = (np.random.rand(height, width, 3) * 255).astype(np.uint8)
+        img = Image.fromarray(arr, mode="RGB")
+        buf = io.BytesIO()
+        img.save(buf, format="JPEG", quality=85)
+        encoded = pybase64.b64encode(buf.getvalue()).decode("utf-8")
+        return f"data:image/jpeg;base64,{encoded}"
+
+    dataset: List[DatasetRow] = []
+    for i in range(num_requests):
+        # Generate text prompt
+        text_prompt = gen_prompt(tokenizer, int(input_lens[i]))
+
+        # Generate image list
+        images = [_gen_random_image_data_uri() for _ in range(num_images)]
+
+        prompt_str = text_prompt
+        if apply_chat_template:
+            try:
+                content_items = [
+                    {"type": "image_url", "image_url": {"url": img_url}}
+                    for img_url in images
+                ]
+                content_items.append({"type": "text", "text": text_prompt})
+                prompt_str = tokenizer.apply_chat_template(
+                    [{"role": "user", "content": content_items}],
+                    add_generation_prompt=True,
+                    tokenize=False,
+                )
+            except Exception:
+                # Some tokenizers do not support list content; fall back to a placeholder in the text
+                prompt_str = f"<image>{text_prompt}"
+
+        prompt_token_ids = tokenizer.encode(prompt_str)
+        prompt_token_len = len(prompt_token_ids)
+
+        dataset.append(
+            DatasetRow(
+                prompt=prompt_str,
+                prompt_len=prompt_token_len,
+                output_len=int(output_lens[i]),
+                image_data=images,
+            )
+        )
+
+    print(f"#Input tokens: {np.sum([x.prompt_len for x in dataset])}")
+    print(f"#Output tokens: {np.sum([x.output_len for x in dataset])}")
+    return dataset
+
+
 def gen_prompt(tokenizer, token_num):
     """Generate a random prompt of specified token length using tokenizer vocabulary."""
     all_available_tokens = list(tokenizer.get_vocab().values())
@@ -1579,7 +1722,13 @@ async def benchmark(
         output_file_name = args.output_file
     else:
         now = datetime.now().strftime("%m%d")
-        if args.dataset_name.startswith("random"):
+        if args.dataset_name == "random-image":
+            output_file_name = (
+                f"{args.backend}_{now}_{args.num_prompts}_{args.random_input_len}_"
+                f"{args.random_output_len}_{args.random_image_num_images}imgs_"
+                f"{args.random_image_resolution}.jsonl"
+            )
+        elif args.dataset_name.startswith("random"):
             output_file_name = f"{args.backend}_{now}_{args.num_prompts}_{args.random_input_len}_{args.random_output_len}.jsonl"
         else:
             output_file_name = f"{args.backend}_{now}_{args.num_prompts}_sharegpt.jsonl"
@@ -1819,7 +1968,14 @@ if __name__ == "__main__":
         "--dataset-name",
         type=str,
         default="sharegpt",
-        choices=[
+        choices=[
+            "sharegpt",
+            "random",
+            "random-ids",
+            "generated-shared-prefix",
+            "mmmu",
+            "random-image",
+        ],
         help="Name of the dataset to benchmark on.",
     )
     parser.add_argument(
@@ -1872,6 +2028,22 @@ if __name__ == "__main__":
         help="Range of sampled ratio of input/output length, "
         "used only for random dataset.",
     )
+    # random-image dataset args
+    parser.add_argument(
+        "--random-image-num-images",
+        type=int,
+        default=1,
+        help="Number of images per request (only available with the random-image dataset)",
+    )
+    parser.add_argument(
+        "--random-image-resolution",
+        type=str,
+        default="1080p",
+        help=(
+            "Resolution of random images for random-image dataset. "
+            "Supports presets 4k/1080p/720p/360p or custom 'heightxwidth' (e.g., 1080x1920)."
+        ),
+    )
     parser.add_argument(
         "--request-rate",
         type=float,
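For reference, a minimal sketch of what the new random-image path does per request: JPEG-encode random pixels into a base64 data URI and attach the images as `image_url` items ahead of the text, matching the chat-completions payload shape above. It uses the stdlib `base64` module instead of `pybase64`; the 640x360 resolution, quality setting, and two-image count are arbitrary example values.

```python
# Sketch of the random-image data-URI generation used by sample_random_image_requests.
import base64
import io

import numpy as np
from PIL import Image


def gen_random_image_data_uri(width: int = 640, height: int = 360) -> str:
    arr = (np.random.rand(height, width, 3) * 255).astype(np.uint8)  # random RGB pixels
    img = Image.fromarray(arr, mode="RGB")
    buf = io.BytesIO()
    img.save(buf, format="JPEG", quality=85)
    encoded = base64.b64encode(buf.getvalue()).decode("utf-8")
    return f"data:image/jpeg;base64,{encoded}"


# One request with two images, in the multi-image content layout the benchmark now sends.
images = [gen_random_image_data_uri() for _ in range(2)]
content = [{"type": "image_url", "image_url": {"url": u}} for u in images]
content.append({"type": "text", "text": "Describe the images."})
messages = [{"role": "user", "content": content}]
print(len(messages[0]["content"]))  # 2 images + 1 text item
```

On the command line this corresponds to `--dataset-name random-image --random-image-num-images 2 --random-image-resolution 360p`.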
sglang/profiler.py
CHANGED
sglang/srt/conversation.py
CHANGED
@@ -26,6 +26,8 @@ Key components:
 # Adapted from
 # https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py
 import dataclasses
+import json
+import os
 import re
 from enum import IntEnum, auto
 from typing import Callable, Dict, List, Optional, Tuple, Union
@@ -959,16 +961,42 @@ register_conv_template(
 )
 
 
+MODEL_TYPE_TO_TEMPLATE = {
+    "internvl_chat": "internvl-2-5",
+    "deepseek_vl_v2": "deepseek-vl2",
+    "multi_modality": "janus-pro",
+    "phi4mm": "phi-4-mm",
+    "minicpmv": "minicpmv",
+    "minicpmo": "minicpmo",
+}
+
+
+def get_model_type(model_path: str) -> Optional[str]:
+    config_path = os.path.join(model_path, "config.json")
+    if not os.path.exists(config_path):
+        return None
+    try:
+        with open(config_path, "r", encoding="utf-8") as f:
+            config = json.load(f)
+        return config.get("model_type")
+    except (IOError, json.JSONDecodeError):
+        return None
+
+
 @register_conv_template_matching_function
 def match_internvl(model_path: str):
     if re.search(r"internvl", model_path, re.IGNORECASE):
         return "internvl-2-5"
+    model_type = get_model_type(model_path)
+    return MODEL_TYPE_TO_TEMPLATE.get(model_type)
 
 
 @register_conv_template_matching_function
 def match_deepseek_janus_pro(model_path: str):
     if re.search(r"janus", model_path, re.IGNORECASE):
         return "janus-pro"
+    model_type = get_model_type(model_path)
+    return MODEL_TYPE_TO_TEMPLATE.get(model_type)
 
 
 @register_conv_template_matching_function
@@ -981,6 +1009,8 @@ def match_vicuna(model_path: str):
 def match_deepseek_vl(model_path: str):
     if re.search(r"deepseek.*vl2", model_path, re.IGNORECASE):
         return "deepseek-vl2"
+    model_type = get_model_type(model_path)
+    return MODEL_TYPE_TO_TEMPLATE.get(model_type)
 
 
 @register_conv_template_matching_function
@@ -994,14 +1024,17 @@ def match_qwen_chat_ml(model_path: str):
 
 
 @register_conv_template_matching_function
-def …
-…
-…
-…
-…
+def match_minicpm(model_path: str):
+    match = re.search(r"minicpm-(v|o)", model_path, re.IGNORECASE)
+    if match:
+        return f"minicpm{match.group(1).lower()}"
+    model_type = get_model_type(model_path)
+    return MODEL_TYPE_TO_TEMPLATE.get(model_type)
 
 
 @register_conv_template_matching_function
 def match_phi_4_mm(model_path: str):
     if "phi-4-multimodal" in model_path.lower():
         return "phi-4-mm"
+    model_type = get_model_type(model_path)
+    return MODEL_TYPE_TO_TEMPLATE.get(model_type)
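The effect of the new fallback: a chat template can now be matched from the checkpoint's `config.json` even when the directory name gives no hint. Below is a self-contained sketch using the mapping and helper copied from the diff; the temporary directory and its `model_type` value are made up for illustration.

```python
# Sketch of the config.json fallback added to sglang/srt/conversation.py.
import json
import os
import tempfile

MODEL_TYPE_TO_TEMPLATE = {
    "internvl_chat": "internvl-2-5",
    "deepseek_vl_v2": "deepseek-vl2",
    "multi_modality": "janus-pro",
    "phi4mm": "phi-4-mm",
    "minicpmv": "minicpmv",
    "minicpmo": "minicpmo",
}


def get_model_type(model_path: str):
    config_path = os.path.join(model_path, "config.json")
    if not os.path.exists(config_path):
        return None
    try:
        with open(config_path, "r", encoding="utf-8") as f:
            return json.load(f).get("model_type")
    except (IOError, json.JSONDecodeError):
        return None


# A locally renamed checkpoint whose path no longer contains "minicpm-v"
# is still matched through the model_type recorded in its config.json.
with tempfile.TemporaryDirectory() as model_dir:
    with open(os.path.join(model_dir, "config.json"), "w") as f:
        json.dump({"model_type": "minicpmv"}, f)
    print(MODEL_TYPE_TO_TEMPLATE.get(get_model_type(model_dir)))  # -> "minicpmv"
```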
sglang/srt/disaggregation/decode.py
CHANGED
@@ -334,6 +334,8 @@ class DecodePreallocQueue:
                     error_message,
                     status_code=HTTPStatus.INTERNAL_SERVER_ERROR,
                 )
+                if self.scheduler.enable_metrics:
+                    self.scheduler.metrics_collector.increment_bootstrap_failed_reqs()
             else:
                 raise ValueError(f"Unexpected poll case: {poll}")
 
@@ -595,6 +597,8 @@ class DecodeTransferQueue:
                 # unlock the kv cache or it will have memory leak
                 self.tree_cache.cache_finished_req(decode_req.req)
                 indices_to_remove.add(i)
+                if self.scheduler.enable_metrics:
+                    self.scheduler.metrics_collector.increment_transfer_failed_reqs()
                 continue
             elif poll == KVPoll.Success:
sglang/srt/disaggregation/prefill.py
CHANGED
@@ -238,6 +238,8 @@ class PrefillBootstrapQueue:
                 self.scheduler.stream_output([req], req.return_logprob)
                 indices_to_remove.add(i)
                 failed_reqs.append(req)
+                if self.scheduler.enable_metrics:
+                    self.scheduler.metrics_collector.increment_bootstrap_failed_reqs()
                 continue
 
             # KV.WaitingForInput - init here
@@ -522,6 +524,8 @@ class SchedulerDisaggregationPrefillMixin:
                     req, error_message, status_code=HTTPStatus.INTERNAL_SERVER_ERROR
                 )
                 done_reqs.append(req)
+                if self.enable_metrics:
+                    self.metrics_collector.increment_transfer_failed_reqs()
             else:
                 assert False, f"Unexpected polling state {poll=}"
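Both disaggregation queues now bump failure counters only when metrics are enabled. The real counters are defined in `sglang/srt/metrics/collector.py` (also touched in this release); the sketch below is only a guess at their shape, using `prometheus_client`, to show the call pattern used at the new call sites. The metric names, labels, and class are hypothetical.

```python
# Illustrative stand-in for the new increment_bootstrap_failed_reqs /
# increment_transfer_failed_reqs hooks; not the actual SchedulerMetricsCollector.
from prometheus_client import Counter


class DisaggMetricsSketch:
    def __init__(self, labels: dict):
        self.labels = labels
        self.bootstrap_failed_reqs = Counter(
            "sglang_disagg_bootstrap_failed_reqs_sketch",
            "Requests that failed KV bootstrap (sketch)",
            labelnames=list(labels.keys()),
        )
        self.transfer_failed_reqs = Counter(
            "sglang_disagg_transfer_failed_reqs_sketch",
            "Requests that failed KV transfer (sketch)",
            labelnames=list(labels.keys()),
        )

    def increment_bootstrap_failed_reqs(self) -> None:
        self.bootstrap_failed_reqs.labels(**self.labels).inc()

    def increment_transfer_failed_reqs(self) -> None:
        self.transfer_failed_reqs.labels(**self.labels).inc()


# Call sites mirror the diff: only increment when metrics are enabled.
enable_metrics = True
collector = DisaggMetricsSketch({"model_name": "demo"})
if enable_metrics:
    collector.increment_bootstrap_failed_reqs()
```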
sglang/srt/entrypoints/engine.py
CHANGED
@@ -672,7 +672,7 @@ def _set_envs_and_config(server_args: ServerArgs):
     if server_args.attention_backend == "flashinfer":
         assert_pkg_version(
             "flashinfer_python",
-            "0.2.
+            "0.2.14.post1",
             "Please uninstall the old version and "
             "reinstall the latest version by following the instructions "
             "at https://docs.flashinfer.ai/installation.html.",
@@ -680,7 +680,7 @@ def _set_envs_and_config(server_args: ServerArgs):
     if _is_cuda and not get_bool_env_var("SGLANG_SKIP_SGL_KERNEL_VERSION_CHECK"):
         assert_pkg_version(
             "sgl-kernel",
-            "0.3.
+            "0.3.7",
             "Please reinstall the latest version with `pip install sgl-kernel --force-reinstall`",
         )
 
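The minimum dependency versions move to `flashinfer_python` 0.2.14.post1 and `sgl-kernel` 0.3.7. A quick way to check what is installed locally before upgrading (a stand-in using `importlib.metadata`, not sglang's `assert_pkg_version`):

```python
# Print installed versions of the bumped dependencies next to the new minimums.
from importlib.metadata import PackageNotFoundError, version

for pkg, minimum in [("flashinfer_python", "0.2.14.post1"), ("sgl-kernel", "0.3.7")]:
    try:
        print(f"{pkg}: installed {version(pkg)}, required >= {minimum}")
    except PackageNotFoundError:
        print(f"{pkg}: not installed, required >= {minimum}")
```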