sglang 0.5.3__py3-none-any.whl → 0.5.3.post1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/bench_one_batch.py +0 -2
- sglang/bench_serving.py +224 -127
- sglang/compile_deep_gemm.py +3 -0
- sglang/launch_server.py +0 -14
- sglang/srt/configs/__init__.py +2 -0
- sglang/srt/configs/falcon_h1.py +12 -58
- sglang/srt/configs/mamba_utils.py +117 -0
- sglang/srt/configs/model_config.py +68 -31
- sglang/srt/configs/nemotron_h.py +286 -0
- sglang/srt/configs/qwen3_next.py +11 -43
- sglang/srt/disaggregation/decode.py +7 -18
- sglang/srt/disaggregation/decode_kvcache_offload_manager.py +1 -1
- sglang/srt/disaggregation/nixl/conn.py +55 -23
- sglang/srt/disaggregation/prefill.py +17 -32
- sglang/srt/entrypoints/engine.py +2 -2
- sglang/srt/entrypoints/grpc_request_manager.py +10 -23
- sglang/srt/entrypoints/grpc_server.py +220 -80
- sglang/srt/entrypoints/http_server.py +49 -1
- sglang/srt/entrypoints/openai/protocol.py +159 -31
- sglang/srt/entrypoints/openai/serving_chat.py +13 -71
- sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
- sglang/srt/environ.py +4 -0
- sglang/srt/function_call/function_call_parser.py +8 -6
- sglang/srt/grpc/sglang_scheduler_pb2.py +78 -70
- sglang/srt/grpc/sglang_scheduler_pb2.pyi +64 -6
- sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +88 -0
- sglang/srt/layers/attention/attention_registry.py +31 -22
- sglang/srt/layers/attention/fla/layernorm_gated.py +47 -30
- sglang/srt/layers/attention/flashattention_backend.py +0 -1
- sglang/srt/layers/attention/flashinfer_backend.py +223 -6
- sglang/srt/layers/attention/flashinfer_mla_backend.py +1 -1
- sglang/srt/layers/attention/hybrid_linear_attn_backend.py +165 -59
- sglang/srt/layers/attention/mamba/causal_conv1d.py +1 -1
- sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +9 -4
- sglang/srt/layers/attention/mamba/mamba.py +189 -241
- sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
- sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
- sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +0 -50
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +0 -60
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +0 -111
- sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +0 -11
- sglang/srt/layers/attention/triton_backend.py +1 -1
- sglang/srt/layers/logits_processor.py +136 -6
- sglang/srt/layers/modelopt_utils.py +11 -0
- sglang/srt/layers/moe/cutlass_w4a8_moe.py +18 -21
- sglang/srt/layers/moe/ep_moe/kernels.py +31 -452
- sglang/srt/layers/moe/ep_moe/layer.py +8 -286
- sglang/srt/layers/moe/fused_moe_triton/layer.py +6 -11
- sglang/srt/layers/moe/moe_runner/deep_gemm.py +304 -0
- sglang/srt/layers/moe/moe_runner/runner.py +3 -0
- sglang/srt/layers/moe/utils.py +7 -1
- sglang/srt/layers/quantization/__init__.py +1 -1
- sglang/srt/layers/quantization/fp8.py +84 -18
- sglang/srt/layers/quantization/modelopt_quant.py +1 -1
- sglang/srt/layers/quantization/quark/quark.py +3 -1
- sglang/srt/layers/quantization/w4afp8.py +2 -16
- sglang/srt/lora/lora_manager.py +0 -8
- sglang/srt/managers/overlap_utils.py +18 -16
- sglang/srt/managers/schedule_batch.py +119 -90
- sglang/srt/managers/schedule_policy.py +1 -1
- sglang/srt/managers/scheduler.py +213 -126
- sglang/srt/managers/scheduler_metrics_mixin.py +1 -1
- sglang/srt/managers/scheduler_output_processor_mixin.py +180 -86
- sglang/srt/managers/tokenizer_manager.py +270 -53
- sglang/srt/managers/tp_worker.py +39 -28
- sglang/srt/mem_cache/allocator.py +7 -2
- sglang/srt/mem_cache/chunk_cache.py +1 -1
- sglang/srt/mem_cache/memory_pool.py +162 -68
- sglang/srt/mem_cache/radix_cache.py +8 -3
- sglang/srt/mem_cache/swa_radix_cache.py +70 -14
- sglang/srt/model_executor/cuda_graph_runner.py +1 -1
- sglang/srt/model_executor/forward_batch_info.py +4 -18
- sglang/srt/model_executor/model_runner.py +55 -51
- sglang/srt/model_loader/__init__.py +1 -1
- sglang/srt/model_loader/loader.py +187 -6
- sglang/srt/model_loader/weight_utils.py +3 -0
- sglang/srt/models/falcon_h1.py +11 -9
- sglang/srt/models/gemma3_mm.py +16 -0
- sglang/srt/models/grok.py +5 -13
- sglang/srt/models/mixtral.py +1 -3
- sglang/srt/models/mllama4.py +11 -1
- sglang/srt/models/nemotron_h.py +514 -0
- sglang/srt/models/utils.py +5 -1
- sglang/srt/sampling/sampling_batch_info.py +11 -9
- sglang/srt/server_args.py +100 -33
- sglang/srt/speculative/eagle_worker.py +11 -13
- sglang/srt/speculative/ngram_worker.py +12 -11
- sglang/srt/speculative/spec_utils.py +0 -1
- sglang/srt/two_batch_overlap.py +1 -0
- sglang/srt/utils/common.py +18 -0
- sglang/srt/utils/hf_transformers_utils.py +2 -0
- sglang/test/longbench_v2/__init__.py +1 -0
- sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
- sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
- sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
- sglang/test/run_eval.py +40 -0
- sglang/test/simple_eval_longbench_v2.py +332 -0
- sglang/test/test_cutlass_w4a8_moe.py +9 -19
- sglang/test/test_deterministic.py +18 -2
- sglang/test/test_deterministic_utils.py +81 -0
- sglang/test/test_disaggregation_utils.py +63 -0
- sglang/test/test_utils.py +32 -11
- sglang/version.py +1 -1
- {sglang-0.5.3.dist-info → sglang-0.5.3.post1.dist-info}/METADATA +4 -4
- {sglang-0.5.3.dist-info → sglang-0.5.3.post1.dist-info}/RECORD +109 -98
- sglang/srt/layers/attention/mamba/mamba_utils.py +0 -81
- sglang/srt/managers/tp_worker_overlap_thread.py +0 -311
- sglang/test/test_block_fp8_ep.py +0 -358
- /sglang/srt/speculative/{ngram_utils.py → ngram_info.py} +0 -0
- {sglang-0.5.3.dist-info → sglang-0.5.3.post1.dist-info}/WHEEL +0 -0
- {sglang-0.5.3.dist-info → sglang-0.5.3.post1.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.3.dist-info → sglang-0.5.3.post1.dist-info}/top_level.txt +0 -0
sglang/bench_one_batch.py
CHANGED
@@ -204,7 +204,6 @@ def prepare_inputs_for_correctness_test(bench_args, tokenizer, custom_prompts):
             origin_input_ids=tmp_input_ids,
             sampling_params=sampling_params,
         )
-        req.prefix_indices = []
         req.fill_ids = req.origin_input_ids
         req.extend_input_len = len(req.fill_ids) - len(req.prefix_indices)
         req.logprob_start_len = len(req.origin_input_ids) - 1
@@ -248,7 +247,6 @@ def prepare_synthetic_inputs_for_latency_test(
             origin_input_ids=list(input_ids[i]),
             sampling_params=sampling_params,
         )
-        req.prefix_indices = []
         req.fill_ids = req.origin_input_ids
         req.extend_input_len = len(req.fill_ids) - len(req.prefix_indices)
         req.logprob_start_len = len(req.origin_input_ids) - 1
sglang/bench_serving.py
CHANGED
@@ -35,6 +35,7 @@ import numpy as np
 import requests
 from tqdm.asyncio import tqdm
 from transformers import (
+    AutoProcessor,
     AutoTokenizer,
     PreTrainedTokenizer,
     PreTrainedTokenizerBase,
@@ -209,6 +210,11 @@ async def async_request_openai_completions(
         **request_func_input.extra_request_body,
     }

+    # hack to accommodate different LoRA conventions between SGLang and vLLM.
+    if request_func_input.lora_name:
+        payload["model"] = request_func_input.lora_name
+        payload["lora_path"] = request_func_input.lora_name
+
     if request_func_input.image_data:
         payload.update({"image_data": request_func_input.image_data})

@@ -322,10 +328,17 @@ async def async_request_openai_chat_completions(
         "model": request_func_input.model,
         "messages": messages,
         "temperature": 0.0,
-        "
+        "max_completion_tokens": request_func_input.output_len,
         "stream": not args.disable_stream,
+        "ignore_eos": not args.disable_ignore_eos,
         **request_func_input.extra_request_body,
     }
+
+    # hack to accommodate different LoRA conventions between SGLang and vLLM.
+    if request_func_input.lora_name:
+        payload["model"] = request_func_input.lora_name
+        payload["lora_path"] = request_func_input.lora_name
+
     headers = get_auth_headers()

     output = RequestFuncOutput.init_new(request_func_input)
@@ -648,7 +661,30 @@ def get_tokenizer(
     )


-def get_dataset(args, tokenizer):
+def get_processor(
+    pretrained_model_name_or_path: str,
+) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
+    assert (
+        pretrained_model_name_or_path is not None
+        and pretrained_model_name_or_path != ""
+    )
+    if pretrained_model_name_or_path.endswith(
+        ".json"
+    ) or pretrained_model_name_or_path.endswith(".model"):
+        from sglang.srt.hf_transformers_utils import get_processor
+
+        return get_processor(pretrained_model_name_or_path)
+
+    if pretrained_model_name_or_path is not None and not os.path.exists(
+        pretrained_model_name_or_path
+    ):
+        pretrained_model_name_or_path = get_model(pretrained_model_name_or_path)
+    return AutoProcessor.from_pretrained(
+        pretrained_model_name_or_path, trust_remote_code=True
+    )
+
+
+def get_dataset(args, tokenizer, model_id=None):
     tokenize_prompt = getattr(args, "tokenize_prompt", False)
     if args.dataset_name == "sharegpt":
         assert not tokenize_prompt
@@ -661,7 +697,7 @@ def get_dataset(args, tokenizer):
             prompt_suffix=args.prompt_suffix,
             apply_chat_template=args.apply_chat_template,
         )
-    elif args.dataset_name.startswith("random")
+    elif args.dataset_name.startswith("random"):
         input_requests = sample_random_requests(
             input_len=args.random_input_len,
             output_len=args.random_output_len,
@@ -672,17 +708,18 @@ def get_dataset(args, tokenizer):
             random_sample=args.dataset_name == "random",
             return_text=not tokenize_prompt,
         )
-    elif args.dataset_name == "
-
-        input_requests =
+    elif args.dataset_name == "image":
+        processor = get_processor(model_id)
+        input_requests = sample_image_requests(
             num_requests=args.num_prompts,
-
+            image_count=args.image_count,
             input_len=args.random_input_len,
             output_len=args.random_output_len,
             range_ratio=args.random_range_ratio,
-
-
-
+            processor=processor,
+            image_content=args.image_content,
+            image_format=args.image_format,
+            image_resolution=args.image_resolution,
         )
     elif args.dataset_name == "generated-shared-prefix":
         assert not tokenize_prompt
@@ -696,12 +733,11 @@ def get_dataset(args, tokenizer):
             args=args,
         )
     elif args.dataset_name == "mmmu":
-
+        processor = get_processor(model_id)
         input_requests = sample_mmmu_requests(
             num_requests=args.num_prompts,
-
+            processor=processor,
             fixed_output_len=args.random_output_len,
-            apply_chat_template=args.apply_chat_template,
             random_sample=True,
         )
     elif args.dataset_name == "mooncake":
@@ -746,6 +782,8 @@ ASYNC_REQUEST_FUNCS = {
 class BenchmarkMetrics:
     completed: int
     total_input: int
+    total_input_text: int
+    total_input_vision: int
     total_output: int
     total_output_retokenized: int
     request_throughput: float
@@ -839,9 +877,15 @@ class DatasetRow:
     prompt: str
     prompt_len: int
     output_len: int
+    text_prompt_len: Optional[int] = None
+    vision_prompt_len: Optional[int] = None
     image_data: Optional[List[str]] = None
     timestamp: Optional[float] = None

+    def __post_init__(self):
+        if self.text_prompt_len is None:
+            self.text_prompt_len = self.prompt_len
+        if self.vision_prompt_len is None:
+            self.vision_prompt_len = 0
+

 async def get_mooncake_request_over_time(
     input_requests: List[Dict],
@@ -918,9 +964,8 @@ async def get_mooncake_request_over_time(

 def sample_mmmu_requests(
     num_requests: int,
-
+    processor: AutoProcessor,
     fixed_output_len: Optional[int] = None,
-    apply_chat_template: bool = True,
     random_sample: bool = True,
 ) -> List[DatasetRow]:
     """
@@ -999,54 +1044,12 @@ def sample_mmmu_requests(
                 question = example.get("question")

                 # Construct the prompt
-
-                if apply_chat_template:
-                    try:
-                        is_phi4_multimodal = (
-                            "phi-4-multimodal" in tokenizer.name_or_path.lower()
-                        )
-                        if is_phi4_multimodal:
-                            # <|endoftext10|> is the image token used in the phi-4-multimodal model.
-                            content = prompt.replace("image 1", "<|endoftext10|>")
-                        else:
-                            content = [
-                                {
-                                    "type": "image_url",
-                                    "image_url": {"url": image_data},
-                                },
-                                {"type": "text", "text": prompt},
-                            ]
-                        prompt = tokenizer.apply_chat_template(
-                            [
-                                {
-                                    "role": "user",
-                                    "content": content,
-                                }
-                            ],
-                            add_generation_prompt=True,
-                            tokenize=False,
-                        )
-                    except Exception as e:
-                        # Note (Xinyuan): This is a workaround for an issue where some tokenizers do not support content as a list. (e.g. InternVL)
-                        print(
-                            f"Error applying chat template: {e}, fallback to <image> tag"
-                        )
-                        prompt = f"<image>{prompt}"
-
-                # Calculate token lengths for text only (without image data)
-                prompt_token_ids = tokenizer.encode(prompt)
-                prompt_len = len(prompt_token_ids)
-
+                text_prompt = f"Question: {question}\n\nAnswer: "
                 output_len = fixed_output_len if fixed_output_len is not None else 256
-
-
-                DatasetRow(
-                    prompt=prompt,
-                    prompt_len=prompt_len,
-                    output_len=output_len,
-                    image_data=[image_data],
-                )
+                data_row = create_mm_data_row(
+                    text_prompt, [image], [image_data], output_len, processor
                 )
+                filtered_dataset.append(data_row)

         except Exception as e:
             print(f"Error processing example {i}: {e}")
@@ -1134,7 +1137,11 @@ def sample_sharegpt_requests(
             continue

         filtered_dataset.append(
-            DatasetRow(
+            DatasetRow(
+                prompt=prompt,
+                prompt_len=prompt_len,
+                output_len=output_len,
+            )
         )

     print(f"#Input tokens: {np.sum([x.prompt_len for x in filtered_dataset])}")
@@ -1245,7 +1252,7 @@ def sample_random_requests(
     return input_requests


-def parse_random_image_resolution(image_resolution: str) -> Tuple[int, int]:
+def parse_image_resolution(image_resolution: str) -> Tuple[int, int]:
     """Parse image resolution into (width, height).

     Supports presets '1080p', '720p', '360p' and custom 'heightxwidth' format
@@ -1270,24 +1277,79 @@ def parse_random_image_resolution(image_resolution: str) -> Tuple[int, int]:
         return (width, height)

     raise ValueError(
-        f"Unsupported
+        f"Unsupported image resolution: {image_resolution}. "
         "Choose from 4k, 1080p, 720p, 360p, or provide custom 'heightxwidth' (e.g., 1080x1920)."
     )


-def sample_random_image_requests(
+def create_mm_data_row(text_prompt, images, images_base64, output_len, processor):
+    try:
+        content_items = [
+            {"type": "image_url", "image_url": {"url": img_url}}
+            for img_url in images_base64
+        ]
+        content_items.append({"type": "text", "text": text_prompt})
+        prompt_str = processor.apply_chat_template(
+            [{"role": "user", "content": content_items}],
+            add_generation_prompt=True,
+            tokenize=False,
+        )
+    except Exception:
+        # Some tokenizers do not support list content; fall back to a placeholder in the text
+        prompt_str = f"<image>{text_prompt}"
+
+    # Calculate total tokens (text + vision)
+    prompt_len = processor(
+        text=[prompt_str],
+        images=images,
+        padding=False,
+        return_tensors="pt",
+    )["input_ids"].numel()
+
+    # Calculate text-only tokens
+    try:
+        # Create text-only version of the prompt
+        text_only_prompt = processor.apply_chat_template(
+            [{"role": "user", "content": text_prompt}],
+            add_generation_prompt=True,
+            tokenize=False,
+        )
+        text_prompt_len = processor(
+            text=[text_only_prompt],
+            padding=False,
+            return_tensors="pt",
+        )["input_ids"].numel()
+    except Exception:
+        # Fallback: just tokenize the text prompt directly
+        text_prompt_len = len(processor.tokenizer.encode(text_prompt))
+
+    # Vision tokens = total tokens - text tokens
+    vision_prompt_len = prompt_len - text_prompt_len
+
+    return DatasetRow(
+        prompt=text_prompt,
+        prompt_len=prompt_len,
+        output_len=output_len,
+        text_prompt_len=text_prompt_len,
+        vision_prompt_len=vision_prompt_len,
+        image_data=images_base64,
+    )
+
+
+def sample_image_requests(
     num_requests: int,
-
+    image_count: int,
     input_len: int,
     output_len: int,
     range_ratio: float,
-
-
-
+    processor: AutoProcessor,
+    image_content: str,
+    image_format: str,
+    image_resolution: str,
 ) -> List[DatasetRow]:
-    """Generate requests with
+    """Generate requests with images.

-    - Each request includes ``
+    - Each request includes ``image_count`` images.
     - Supported resolutions: 4k (3840x2160), 1080p (1920x1080), 720p (1280x720), 360p (640x360),
       or custom 'heightxwidth' (e.g., 1080x1920).
     - Text lengths follow the 'random' dataset sampling rule. ``prompt_len``
@@ -1302,12 +1364,12 @@ def sample_random_image_requests(
         ) from e

     # Parse resolution (supports presets and 'heightxwidth')
-    width, height =
+    width, height = parse_image_resolution(image_resolution)

     # Check for potentially problematic combinations and warn user
-    if width * height >= 1920 * 1080 and
+    if width * height >= 1920 * 1080 and image_count * num_requests >= 100:
         warnings.warn(
-            f"High resolution ({width}x{height}) with {
+            f"High resolution ({width}x{height}) with {image_count * num_requests} total images "
             f"may take a long time. Consider reducing resolution or image count.",
             UserWarning,
             stacklevel=2,
@@ -1321,53 +1383,50 @@ def sample_random_image_requests(
         int(output_len * range_ratio), output_len + 1, size=num_requests
     )

-    def _gen_random_image_data_uri(
-
-
+    def _gen_random_image_data_uri(
+        width: int = width, height: int = height
+    ) -> (Image, str, int):
+        if image_content == "blank":
+            # Generate blank white image
+            arr = np.full((height, width, 3), 255, dtype=np.uint8)
+        else:
+            # Generate random colored image
+            arr = (np.random.rand(height, width, 3) * 255).astype(np.uint8)
+        img = Image.fromarray(arr)
         buf = io.BytesIO()
-        img.save(buf, format=
+        img.save(buf, format=image_format, quality=85)
         encoded = pybase64.b64encode(buf.getvalue()).decode("utf-8")
-
+        image_data = f"data:image/{image_format};base64,{encoded}"
+        image_bytes = len(image_data.encode("utf-8"))
+        return img, image_data, image_bytes

     dataset: List[DatasetRow] = []
+    total_image_bytes = 0
     for i in range(num_requests):
         # Generate text prompt
-        text_prompt = gen_prompt(tokenizer, int(input_lens[i]))
+        text_prompt = gen_prompt(processor.tokenizer, int(input_lens[i]))

         # Generate image list
-        images
-
-        prompt_str = text_prompt
-        if apply_chat_template:
-            try:
-                content_items = [
-                    {"type": "image_url", "image_url": {"url": img_url}}
-                    for img_url in images
-                ]
-                content_items.append({"type": "text", "text": text_prompt})
-                prompt_str = tokenizer.apply_chat_template(
-                    [{"role": "user", "content": content_items}],
-                    add_generation_prompt=True,
-                    tokenize=False,
-                )
-            except Exception:
-                # Some tokenizers do not support list content; fall back to a placeholder in the text
-                prompt_str = f"<image>{text_prompt}"
-
-        prompt_token_ids = tokenizer.encode(prompt_str)
-        prompt_token_len = len(prompt_token_ids)
-
-        dataset.append(
-            DatasetRow(
-                prompt=prompt_str,
-                prompt_len=prompt_token_len,
-                output_len=int(output_lens[i]),
-                image_data=images,
-            )
+        images, images_base64, images_bytes = zip(
+            *[_gen_random_image_data_uri() for _ in range(image_count)]
         )
+        total_image_bytes += sum(list(images_bytes))
+
+        data_row = create_mm_data_row(
+            text_prompt,
+            list(images),
+            list(images_base64),
+            int(output_lens[i]),
+            processor,
+        )
+
+        dataset.append(data_row)

     print(f"#Input tokens: {np.sum([x.prompt_len for x in dataset])}")
     print(f"#Output tokens: {np.sum([x.output_len for x in dataset])}")
+    print(
+        f"\nCreated {len(dataset)} {image_content} {image_format} images with average {total_image_bytes//num_requests} bytes per request"
+    )
     return dataset


@@ -1439,7 +1498,9 @@ def sample_generated_shared_prefix_requests(

         input_requests.append(
             DatasetRow(
-                prompt=full_prompt,
+                prompt=full_prompt,
+                prompt_len=prompt_len,
+                output_len=output_len,
             )
         )
         total_input_tokens += prompt_len
@@ -1521,6 +1582,8 @@ def calculate_metrics(
     output_lens: List[int] = []
     retokenized_output_lens: List[int] = []
     total_input = 0
+    total_input_text = 0
+    total_input_vision = 0
     completed = 0
     itls: List[float] = []
     tpots: List[float] = []
@@ -1534,7 +1597,9 @@ def calculate_metrics(
                 tokenizer.encode(outputs[i].generated_text, add_special_tokens=False)
             )
             retokenized_output_lens.append(retokenized_output_len)
-            total_input +=
+            total_input += input_requests[i].prompt_len
+            total_input_text += input_requests[i].text_prompt_len
+            total_input_vision += input_requests[i].vision_prompt_len
             if output_len > 1:
                 tpots.append((outputs[i].latency - outputs[i].ttft) / (output_len - 1))
             itls += outputs[i].itl
@@ -1556,6 +1621,8 @@ def calculate_metrics(
     metrics = BenchmarkMetrics(
         completed=completed,
         total_input=total_input,
+        total_input_text=total_input_text,
+        total_input_vision=total_input_vision,
         total_output=sum(output_lens),
         total_output_retokenized=sum(retokenized_output_lens),
         request_throughput=completed / dur_s,
@@ -1770,9 +1837,15 @@ async def benchmark(
             server_info_json = server_info.json()
             if "decode" in server_info_json:
                 server_info_json = server_info_json["decode"][0]
-
-            "
-
+            if (
+                "internal_states" in server_info_json
+                and server_info_json["internal_states"]
+            ):
+                accept_length = server_info_json["internal_states"][0].get(
+                    "avg_spec_accept_length", None
+                )
+            else:
+                accept_length = None
         else:
             accept_length = None
     else:
@@ -1804,6 +1877,10 @@ async def benchmark(
     print("{:<40} {:<10}".format("Successful requests:", metrics.completed))
     print("{:<40} {:<10.2f}".format("Benchmark duration (s):", benchmark_duration))
     print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input))
+    print("{:<40} {:<10}".format("Total input text tokens:", metrics.total_input_text))
+    print(
+        "{:<40} {:<10}".format("Total input vision tokens:", metrics.total_input_vision)
+    )
     print("{:<40} {:<10}".format("Total generated tokens:", metrics.total_output))
     print(
         "{:<40} {:<10}".format(
@@ -1873,6 +1950,8 @@ async def benchmark(
             "duration": benchmark_duration,
             "completed": metrics.completed,
             "total_input_tokens": metrics.total_input,
+            "total_input_text_tokens": metrics.total_input_text,
+            "total_input_vision_tokens": metrics.total_input_vision,
             "total_output_tokens": metrics.total_output,
             "total_output_tokens_retokenized": metrics.total_output_retokenized,
             "request_throughput": metrics.request_throughput,
@@ -1907,11 +1986,11 @@ async def benchmark(
         output_file_name = args.output_file
     else:
         now = datetime.now().strftime("%m%d")
-        if args.dataset_name == "
+        if args.dataset_name == "image":
            output_file_name = (
                 f"{args.backend}_{now}_{args.num_prompts}_{args.random_input_len}_"
-                f"{args.random_output_len}_{args.
-                f"{args.
+                f"{args.random_output_len}_{args.image_count}imgs_"
+                f"{args.image_resolution}.jsonl"
             )
         elif args.dataset_name.startswith("random"):
             output_file_name = f"{args.backend}_{now}_{args.num_prompts}_{args.random_input_len}_{args.random_output_len}.jsonl"
@@ -2087,6 +2166,12 @@ def run_benchmark(args_: argparse.Namespace):
             "Because when the tokenizer counts the output tokens, if there is gibberish, it might count incorrectly.\n"
         )

+    if args.dataset_name in ["image", "mmmu"]:
+        args.apply_chat_template = True
+        assert (
+            not args.tokenize_prompt
+        ), "`--tokenize-prompt` not compatible with image dataset"
+
     print(f"{args}\n")

     # Read dataset
@@ -2094,7 +2179,7 @@ def run_benchmark(args_: argparse.Namespace):
     model_id = args.model
     tokenizer_id = args.tokenizer if args.tokenizer is not None else args.model
     tokenizer = get_tokenizer(tokenizer_id)
-    input_requests = get_dataset(args, tokenizer)
+    input_requests = get_dataset(args, tokenizer, model_id)

     # compatible with SimpleNamespace
     if not hasattr(args, "flush_cache"):
@@ -2175,7 +2260,7 @@ if __name__ == "__main__":
             "random-ids",
             "generated-shared-prefix",
             "mmmu",
-            "
+            "image",
             "mooncake",
         ],
         help="Name of the dataset to benchmark on.",
@@ -2215,37 +2300,49 @@ if __name__ == "__main__":
         "--random-input-len",
         type=int,
         default=1024,
-        help="Number of input tokens per request, used only for random dataset.",
+        help="Number of input tokens per request, used only for random and image dataset.",
     )
     parser.add_argument(
         "--random-output-len",
         default=1024,
         type=int,
-        help="Number of output tokens per request, used only for random dataset.",
+        help="Number of output tokens per request, used only for random and image dataset.",
     )
     parser.add_argument(
         "--random-range-ratio",
         type=float,
         default=0.0,
         help="Range of sampled ratio of input/output length, "
-        "used only for random dataset.",
+        "used only for random and image dataset.",
     )
-    #
+    # image dataset args
     parser.add_argument(
-        "--
+        "--image-count",
         type=int,
         default=1,
-        help="Number of images per request (only available with the
+        help="Number of images per request (only available with the image dataset)",
     )
     parser.add_argument(
-        "--
+        "--image-resolution",
         type=str,
         default="1080p",
         help=(
-            "Resolution of
+            "Resolution of images for image dataset. "
            "Supports presets 4k/1080p/720p/360p or custom 'heightxwidth' (e.g., 1080x1920)."
         ),
     )
+    parser.add_argument(
+        "--image-format",
+        type=str,
+        default="jpeg",
+        help=("Format of images for image dataset. " "Supports jpeg and png."),
+    )
+    parser.add_argument(
+        "--image-content",
+        type=str,
+        default="random",
+        help=("Content for images for image dataset. " "Supports random and blank."),
+    )
     parser.add_argument(
         "--request-rate",
         type=float,
sglang/compile_deep_gemm.py
CHANGED
@@ -141,6 +141,9 @@ def refine_server_args(server_args: ServerArgs, compile_args: CompileArgs):
     server_args.enable_torch_compile = False
     print(f"Disable CUDA Graph and Torch Compile to save time...")

+    server_args.load_format = "dummy"
+    print(f"Set load format to dummy to save time...")
+
     # Set watchdog timeout to compile_args.timeout because compilation will take a long time
     server_args.watchdog_timeout = compile_args.timeout
     server_args.warmups = "compile-deep-gemm"
sglang/launch_server.py
CHANGED
@@ -7,23 +7,9 @@ from sglang.srt.entrypoints.http_server import launch_server
 from sglang.srt.server_args import prepare_server_args
 from sglang.srt.utils import kill_process_tree

-MOVE_ENVS_WARN = """
-########################################################################
-# For contributors and developers: #
-# Please move environment variable definitions to sglang.srt.environ #
-# using the following pattern: #
-#     SGLANG_XXX = EnvBool(False) #
-# #
-########################################################################
-"""
-
 if __name__ == "__main__":
     server_args = prepare_server_args(sys.argv[1:])

-    from sglang.srt.server_args import print_deprecated_warning
-
-    print_deprecated_warning(MOVE_ENVS_WARN)
-
     try:
         launch_server(server_args)
     finally:
sglang/srt/configs/__init__.py
CHANGED
@@ -9,6 +9,7 @@ from sglang.srt.configs.janus_pro import MultiModalityConfig
 from sglang.srt.configs.kimi_vl import KimiVLConfig
 from sglang.srt.configs.kimi_vl_moonvit import MoonViTConfig
 from sglang.srt.configs.longcat_flash import LongcatFlashConfig
+from sglang.srt.configs.nemotron_h import NemotronHConfig
 from sglang.srt.configs.qwen3_next import Qwen3NextConfig
 from sglang.srt.configs.step3_vl import (
     Step3TextConfig,
@@ -32,4 +33,5 @@ __all__ = [
     "DotsVLMConfig",
     "DotsOCRConfig",
     "FalconH1Config",
+    "NemotronHConfig",
 ]