sglang 0.5.3__py3-none-any.whl → 0.5.3.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (112)
  1. sglang/bench_one_batch.py +0 -2
  2. sglang/bench_serving.py +224 -127
  3. sglang/compile_deep_gemm.py +3 -0
  4. sglang/launch_server.py +0 -14
  5. sglang/srt/configs/__init__.py +2 -0
  6. sglang/srt/configs/falcon_h1.py +12 -58
  7. sglang/srt/configs/mamba_utils.py +117 -0
  8. sglang/srt/configs/model_config.py +68 -31
  9. sglang/srt/configs/nemotron_h.py +286 -0
  10. sglang/srt/configs/qwen3_next.py +11 -43
  11. sglang/srt/disaggregation/decode.py +7 -18
  12. sglang/srt/disaggregation/decode_kvcache_offload_manager.py +1 -1
  13. sglang/srt/disaggregation/nixl/conn.py +55 -23
  14. sglang/srt/disaggregation/prefill.py +17 -32
  15. sglang/srt/entrypoints/engine.py +2 -2
  16. sglang/srt/entrypoints/grpc_request_manager.py +10 -23
  17. sglang/srt/entrypoints/grpc_server.py +220 -80
  18. sglang/srt/entrypoints/http_server.py +49 -1
  19. sglang/srt/entrypoints/openai/protocol.py +159 -31
  20. sglang/srt/entrypoints/openai/serving_chat.py +13 -71
  21. sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
  22. sglang/srt/environ.py +4 -0
  23. sglang/srt/function_call/function_call_parser.py +8 -6
  24. sglang/srt/grpc/sglang_scheduler_pb2.py +78 -70
  25. sglang/srt/grpc/sglang_scheduler_pb2.pyi +64 -6
  26. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +88 -0
  27. sglang/srt/layers/attention/attention_registry.py +31 -22
  28. sglang/srt/layers/attention/fla/layernorm_gated.py +47 -30
  29. sglang/srt/layers/attention/flashattention_backend.py +0 -1
  30. sglang/srt/layers/attention/flashinfer_backend.py +223 -6
  31. sglang/srt/layers/attention/flashinfer_mla_backend.py +1 -1
  32. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +165 -59
  33. sglang/srt/layers/attention/mamba/causal_conv1d.py +1 -1
  34. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +9 -4
  35. sglang/srt/layers/attention/mamba/mamba.py +189 -241
  36. sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
  37. sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
  38. sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +0 -50
  39. sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +0 -60
  40. sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +0 -111
  41. sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +0 -11
  42. sglang/srt/layers/attention/triton_backend.py +1 -1
  43. sglang/srt/layers/logits_processor.py +136 -6
  44. sglang/srt/layers/modelopt_utils.py +11 -0
  45. sglang/srt/layers/moe/cutlass_w4a8_moe.py +18 -21
  46. sglang/srt/layers/moe/ep_moe/kernels.py +31 -452
  47. sglang/srt/layers/moe/ep_moe/layer.py +8 -286
  48. sglang/srt/layers/moe/fused_moe_triton/layer.py +6 -11
  49. sglang/srt/layers/moe/moe_runner/deep_gemm.py +304 -0
  50. sglang/srt/layers/moe/moe_runner/runner.py +3 -0
  51. sglang/srt/layers/moe/utils.py +7 -1
  52. sglang/srt/layers/quantization/__init__.py +1 -1
  53. sglang/srt/layers/quantization/fp8.py +84 -18
  54. sglang/srt/layers/quantization/modelopt_quant.py +1 -1
  55. sglang/srt/layers/quantization/quark/quark.py +3 -1
  56. sglang/srt/layers/quantization/w4afp8.py +2 -16
  57. sglang/srt/lora/lora_manager.py +0 -8
  58. sglang/srt/managers/overlap_utils.py +18 -16
  59. sglang/srt/managers/schedule_batch.py +119 -90
  60. sglang/srt/managers/schedule_policy.py +1 -1
  61. sglang/srt/managers/scheduler.py +213 -126
  62. sglang/srt/managers/scheduler_metrics_mixin.py +1 -1
  63. sglang/srt/managers/scheduler_output_processor_mixin.py +180 -86
  64. sglang/srt/managers/tokenizer_manager.py +270 -53
  65. sglang/srt/managers/tp_worker.py +39 -28
  66. sglang/srt/mem_cache/allocator.py +7 -2
  67. sglang/srt/mem_cache/chunk_cache.py +1 -1
  68. sglang/srt/mem_cache/memory_pool.py +162 -68
  69. sglang/srt/mem_cache/radix_cache.py +8 -3
  70. sglang/srt/mem_cache/swa_radix_cache.py +70 -14
  71. sglang/srt/model_executor/cuda_graph_runner.py +1 -1
  72. sglang/srt/model_executor/forward_batch_info.py +4 -18
  73. sglang/srt/model_executor/model_runner.py +55 -51
  74. sglang/srt/model_loader/__init__.py +1 -1
  75. sglang/srt/model_loader/loader.py +187 -6
  76. sglang/srt/model_loader/weight_utils.py +3 -0
  77. sglang/srt/models/falcon_h1.py +11 -9
  78. sglang/srt/models/gemma3_mm.py +16 -0
  79. sglang/srt/models/grok.py +5 -13
  80. sglang/srt/models/mixtral.py +1 -3
  81. sglang/srt/models/mllama4.py +11 -1
  82. sglang/srt/models/nemotron_h.py +514 -0
  83. sglang/srt/models/utils.py +5 -1
  84. sglang/srt/sampling/sampling_batch_info.py +11 -9
  85. sglang/srt/server_args.py +100 -33
  86. sglang/srt/speculative/eagle_worker.py +11 -13
  87. sglang/srt/speculative/ngram_worker.py +12 -11
  88. sglang/srt/speculative/spec_utils.py +0 -1
  89. sglang/srt/two_batch_overlap.py +1 -0
  90. sglang/srt/utils/common.py +18 -0
  91. sglang/srt/utils/hf_transformers_utils.py +2 -0
  92. sglang/test/longbench_v2/__init__.py +1 -0
  93. sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
  94. sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
  95. sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
  96. sglang/test/run_eval.py +40 -0
  97. sglang/test/simple_eval_longbench_v2.py +332 -0
  98. sglang/test/test_cutlass_w4a8_moe.py +9 -19
  99. sglang/test/test_deterministic.py +18 -2
  100. sglang/test/test_deterministic_utils.py +81 -0
  101. sglang/test/test_disaggregation_utils.py +63 -0
  102. sglang/test/test_utils.py +32 -11
  103. sglang/version.py +1 -1
  104. {sglang-0.5.3.dist-info → sglang-0.5.3.post1.dist-info}/METADATA +4 -4
  105. {sglang-0.5.3.dist-info → sglang-0.5.3.post1.dist-info}/RECORD +109 -98
  106. sglang/srt/layers/attention/mamba/mamba_utils.py +0 -81
  107. sglang/srt/managers/tp_worker_overlap_thread.py +0 -311
  108. sglang/test/test_block_fp8_ep.py +0 -358
  109. /sglang/srt/speculative/{ngram_utils.py → ngram_info.py} +0 -0
  110. {sglang-0.5.3.dist-info → sglang-0.5.3.post1.dist-info}/WHEEL +0 -0
  111. {sglang-0.5.3.dist-info → sglang-0.5.3.post1.dist-info}/licenses/LICENSE +0 -0
  112. {sglang-0.5.3.dist-info → sglang-0.5.3.post1.dist-info}/top_level.txt +0 -0
sglang/bench_one_batch.py CHANGED
@@ -204,7 +204,6 @@ def prepare_inputs_for_correctness_test(bench_args, tokenizer, custom_prompts):
  origin_input_ids=tmp_input_ids,
  sampling_params=sampling_params,
  )
- req.prefix_indices = []
  req.fill_ids = req.origin_input_ids
  req.extend_input_len = len(req.fill_ids) - len(req.prefix_indices)
  req.logprob_start_len = len(req.origin_input_ids) - 1
@@ -248,7 +247,6 @@ def prepare_synthetic_inputs_for_latency_test(
  origin_input_ids=list(input_ids[i]),
  sampling_params=sampling_params,
  )
- req.prefix_indices = []
  req.fill_ids = req.origin_input_ids
  req.extend_input_len = len(req.fill_ids) - len(req.prefix_indices)
  req.logprob_start_len = len(req.origin_input_ids) - 1
sglang/bench_serving.py CHANGED
@@ -35,6 +35,7 @@ import numpy as np
  import requests
  from tqdm.asyncio import tqdm
  from transformers import (
+ AutoProcessor,
  AutoTokenizer,
  PreTrainedTokenizer,
  PreTrainedTokenizerBase,
@@ -209,6 +210,11 @@ async def async_request_openai_completions(
  **request_func_input.extra_request_body,
  }

+ # hack to accommodate different LoRA conventions between SGLang and vLLM.
+ if request_func_input.lora_name:
+ payload["model"] = request_func_input.lora_name
+ payload["lora_path"] = request_func_input.lora_name
+
  if request_func_input.image_data:
  payload.update({"image_data": request_func_input.image_data})

@@ -322,10 +328,17 @@ async def async_request_openai_chat_completions(
  "model": request_func_input.model,
  "messages": messages,
  "temperature": 0.0,
- "max_tokens": request_func_input.output_len,
+ "max_completion_tokens": request_func_input.output_len,
  "stream": not args.disable_stream,
+ "ignore_eos": not args.disable_ignore_eos,
  **request_func_input.extra_request_body,
  }
+
+ # hack to accommodate different LoRA conventions between SGLang and vLLM.
+ if request_func_input.lora_name:
+ payload["model"] = request_func_input.lora_name
+ payload["lora_path"] = request_func_input.lora_name
+
  headers = get_auth_headers()

  output = RequestFuncOutput.init_new(request_func_input)
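For reference, a minimal sketch (not part of the diff) of the chat-completions payload the benchmark now builds when a LoRA adapter name is supplied; the adapter name and message content are placeholders:

payload = {
    "model": "my-adapter",         # vLLM convention: the adapter is addressed as the model name
    "lora_path": "my-adapter",     # SGLang convention: the adapter is selected via lora_path
    "messages": [{"role": "user", "content": "..."}],
    "temperature": 0.0,
    "max_completion_tokens": 128,  # renamed from max_tokens in the hunk above
    "stream": True,
    "ignore_eos": True,            # now forwarded from --disable-ignore-eos
}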
@@ -648,7 +661,30 @@ def get_tokenizer(
  )


- def get_dataset(args, tokenizer):
+ def get_processor(
+ pretrained_model_name_or_path: str,
+ ) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
+ assert (
+ pretrained_model_name_or_path is not None
+ and pretrained_model_name_or_path != ""
+ )
+ if pretrained_model_name_or_path.endswith(
+ ".json"
+ ) or pretrained_model_name_or_path.endswith(".model"):
+ from sglang.srt.hf_transformers_utils import get_processor
+
+ return get_processor(pretrained_model_name_or_path)
+
+ if pretrained_model_name_or_path is not None and not os.path.exists(
+ pretrained_model_name_or_path
+ ):
+ pretrained_model_name_or_path = get_model(pretrained_model_name_or_path)
+ return AutoProcessor.from_pretrained(
+ pretrained_model_name_or_path, trust_remote_code=True
+ )
+
+
+ def get_dataset(args, tokenizer, model_id=None):
  tokenize_prompt = getattr(args, "tokenize_prompt", False)
  if args.dataset_name == "sharegpt":
  assert not tokenize_prompt
@@ -661,7 +697,7 @@ def get_dataset(args, tokenizer):
  prompt_suffix=args.prompt_suffix,
  apply_chat_template=args.apply_chat_template,
  )
- elif args.dataset_name.startswith("random") and args.dataset_name != "random-image":
+ elif args.dataset_name.startswith("random"):
  input_requests = sample_random_requests(
  input_len=args.random_input_len,
  output_len=args.random_output_len,
@@ -672,17 +708,18 @@ def get_dataset(args, tokenizer):
  random_sample=args.dataset_name == "random",
  return_text=not tokenize_prompt,
  )
- elif args.dataset_name == "random-image":
- assert not tokenize_prompt, "random-image does not support --tokenize-prompt"
- input_requests = sample_random_image_requests(
+ elif args.dataset_name == "image":
+ processor = get_processor(model_id)
+ input_requests = sample_image_requests(
  num_requests=args.num_prompts,
- num_images=args.random_image_num_images,
+ image_count=args.image_count,
  input_len=args.random_input_len,
  output_len=args.random_output_len,
  range_ratio=args.random_range_ratio,
- tokenizer=tokenizer,
- apply_chat_template=args.apply_chat_template,
- image_resolution=args.random_image_resolution,
+ processor=processor,
+ image_content=args.image_content,
+ image_format=args.image_format,
+ image_resolution=args.image_resolution,
  )
  elif args.dataset_name == "generated-shared-prefix":
  assert not tokenize_prompt
@@ -696,12 +733,11 @@ def get_dataset(args, tokenizer):
  args=args,
  )
  elif args.dataset_name == "mmmu":
- assert not tokenize_prompt
+ processor = get_processor(model_id)
  input_requests = sample_mmmu_requests(
  num_requests=args.num_prompts,
- tokenizer=tokenizer,
+ processor=processor,
  fixed_output_len=args.random_output_len,
- apply_chat_template=args.apply_chat_template,
  random_sample=True,
  )
  elif args.dataset_name == "mooncake":
@@ -746,6 +782,8 @@ ASYNC_REQUEST_FUNCS = {
  class BenchmarkMetrics:
  completed: int
  total_input: int
+ total_input_text: int
+ total_input_vision: int
  total_output: int
  total_output_retokenized: int
  request_throughput: float
@@ -839,9 +877,17 @@ class DatasetRow:
  prompt: str
  prompt_len: int
  output_len: int
+ text_prompt_len: Optional[int] = None
+ vision_prompt_len: Optional[int] = None
  image_data: Optional[List[str]] = None
  timestamp: Optional[float] = None

+ def __post_init__(self):
+ if self.text_prompt_len is None:
+ self.text_prompt_len = self.prompt_len
+ if self.vision_prompt_len is None:
+ self.vision_prompt_len = 0
+

  async def get_mooncake_request_over_time(
  input_requests: List[Dict],
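For text-only rows the new fields default sensibly; a minimal sketch (not part of the diff) of the behavior added by __post_init__ above, assuming DatasetRow keeps its dataclass constructor and using placeholder values:

row = DatasetRow(prompt="hello world", prompt_len=3, output_len=16)
assert row.text_prompt_len == 3    # falls back to prompt_len when not provided
assert row.vision_prompt_len == 0  # no vision tokens unless set explicitly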
@@ -918,9 +964,8 @@

  def sample_mmmu_requests(
  num_requests: int,
- tokenizer: PreTrainedTokenizerBase,
+ processor: AutoProcessor,
  fixed_output_len: Optional[int] = None,
- apply_chat_template: bool = True,
  random_sample: bool = True,
  ) -> List[DatasetRow]:
  """
@@ -999,54 +1044,12 @@ def sample_mmmu_requests(
  question = example.get("question")

  # Construct the prompt
- prompt = f"Question: {question}\n\nAnswer: "
- if apply_chat_template:
- try:
- is_phi4_multimodal = (
- "phi-4-multimodal" in tokenizer.name_or_path.lower()
- )
- if is_phi4_multimodal:
- # <|endoftext10|> is the image token used in the phi-4-multimodal model.
- content = prompt.replace("image 1", "<|endoftext10|>")
- else:
- content = [
- {
- "type": "image_url",
- "image_url": {"url": image_data},
- },
- {"type": "text", "text": prompt},
- ]
- prompt = tokenizer.apply_chat_template(
- [
- {
- "role": "user",
- "content": content,
- }
- ],
- add_generation_prompt=True,
- tokenize=False,
- )
- except Exception as e:
- # Note (Xinyuan): This is a workaround for an issue where some tokenizers do not support content as a list. (e.g. InternVL)
- print(
- f"Error applying chat template: {e}, fallback to <image> tag"
- )
- prompt = f"<image>{prompt}"
-
- # Calculate token lengths for text only (without image data)
- prompt_token_ids = tokenizer.encode(prompt)
- prompt_len = len(prompt_token_ids)
-
+ text_prompt = f"Question: {question}\n\nAnswer: "
  output_len = fixed_output_len if fixed_output_len is not None else 256
-
- filtered_dataset.append(
- DatasetRow(
- prompt=prompt,
- prompt_len=prompt_len,
- output_len=output_len,
- image_data=[image_data],
- )
+ data_row = create_mm_data_row(
+ text_prompt, [image], [image_data], output_len, processor
  )
+ filtered_dataset.append(data_row)

  except Exception as e:
  print(f"Error processing example {i}: {e}")
@@ -1134,7 +1137,11 @@ def sample_sharegpt_requests(
  continue

  filtered_dataset.append(
- DatasetRow(prompt=prompt, prompt_len=prompt_len, output_len=output_len)
+ DatasetRow(
+ prompt=prompt,
+ prompt_len=prompt_len,
+ output_len=output_len,
+ )
  )

  print(f"#Input tokens: {np.sum([x.prompt_len for x in filtered_dataset])}")
@@ -1245,7 +1252,7 @@ def sample_random_requests(
  return input_requests


- def parse_random_image_resolution(image_resolution: str) -> Tuple[int, int]:
+ def parse_image_resolution(image_resolution: str) -> Tuple[int, int]:
  """Parse image resolution into (width, height).

  Supports presets '1080p', '720p', '360p' and custom 'heightxwidth' format
@@ -1270,24 +1277,79 @@ def parse_random_image_resolution(image_resolution: str) -> Tuple[int, int]:
  return (width, height)

  raise ValueError(
- f"Unsupported random-image resolution: {image_resolution}. "
+ f"Unsupported image resolution: {image_resolution}. "
  "Choose from 4k, 1080p, 720p, 360p, or provide custom 'heightxwidth' (e.g., 1080x1920)."
  )


- def sample_random_image_requests(
+ def create_mm_data_row(text_prompt, images, images_base64, output_len, processor):
+ try:
+ content_items = [
+ {"type": "image_url", "image_url": {"url": img_url}}
+ for img_url in images_base64
+ ]
+ content_items.append({"type": "text", "text": text_prompt})
+ prompt_str = processor.apply_chat_template(
+ [{"role": "user", "content": content_items}],
+ add_generation_prompt=True,
+ tokenize=False,
+ )
+ except Exception:
+ # Some tokenizers do not support list content; fall back to a placeholder in the text
+ prompt_str = f"<image>{text_prompt}"
+
+ # Calculate total tokens (text + vision)
+ prompt_len = processor(
+ text=[prompt_str],
+ images=images,
+ padding=False,
+ return_tensors="pt",
+ )["input_ids"].numel()
+
+ # Calculate text-only tokens
+ try:
+ # Create text-only version of the prompt
+ text_only_prompt = processor.apply_chat_template(
+ [{"role": "user", "content": text_prompt}],
+ add_generation_prompt=True,
+ tokenize=False,
+ )
+ text_prompt_len = processor(
+ text=[text_only_prompt],
+ padding=False,
+ return_tensors="pt",
+ )["input_ids"].numel()
+ except Exception:
+ # Fallback: just tokenize the text prompt directly
+ text_prompt_len = len(processor.tokenizer.encode(text_prompt))
+
+ # Vision tokens = total tokens - text tokens
+ vision_prompt_len = prompt_len - text_prompt_len
+
+ return DatasetRow(
+ prompt=text_prompt,
+ prompt_len=prompt_len,
+ output_len=output_len,
+ text_prompt_len=text_prompt_len,
+ vision_prompt_len=vision_prompt_len,
+ image_data=images_base64,
+ )
+
+
+ def sample_image_requests(
  num_requests: int,
- num_images: int,
+ image_count: int,
  input_len: int,
  output_len: int,
  range_ratio: float,
- tokenizer: PreTrainedTokenizerBase,
- apply_chat_template: bool = True,
- image_resolution: str = "1080p",
+ processor: AutoProcessor,
+ image_content: str,
+ image_format: str,
+ image_resolution: str,
  ) -> List[DatasetRow]:
- """Generate requests with random images.
+ """Generate requests with images.

- - Each request includes ``num_images`` random images.
+ - Each request includes ``image_count`` images.
  - Supported resolutions: 4k (3840x2160), 1080p (1920x1080), 720p (1280x720), 360p (640x360),
  or custom 'heightxwidth' (e.g., 1080x1920).
  - Text lengths follow the 'random' dataset sampling rule. ``prompt_len``
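The text/vision split implemented by create_mm_data_row above can be reproduced standalone; a minimal sketch (not part of the diff), using the standard Hugging Face processor API, a placeholder multimodal checkpoint, and the simpler text-only fallback for the text length:

from PIL import Image
from transformers import AutoProcessor

# Placeholder checkpoint; any multimodal model whose processor expands images into vision tokens works.
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
image = Image.new("RGB", (1280, 720), "white")  # stand-in for a generated benchmark image
text = "Describe the image."

chat = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": text}]}]
prompt_str = processor.apply_chat_template(chat, add_generation_prompt=True, tokenize=False)

# Total prompt tokens, including the expanded image placeholders
total_len = processor(text=[prompt_str], images=[image], return_tensors="pt")["input_ids"].numel()
# Text-only tokens, then vision tokens by subtraction (same idea as vision_prompt_len above)
text_len = len(processor.tokenizer.encode(text))
vision_len = total_len - text_len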
@@ -1302,12 +1364,12 @@ def sample_random_image_requests(
  ) from e

  # Parse resolution (supports presets and 'heightxwidth')
- width, height = parse_random_image_resolution(image_resolution)
+ width, height = parse_image_resolution(image_resolution)

  # Check for potentially problematic combinations and warn user
- if width * height >= 1920 * 1080 and num_images * num_requests >= 100:
+ if width * height >= 1920 * 1080 and image_count * num_requests >= 100:
  warnings.warn(
- f"High resolution ({width}x{height}) with {num_images * num_requests} total images "
+ f"High resolution ({width}x{height}) with {image_count * num_requests} total images "
  f"may take a long time. Consider reducing resolution or image count.",
  UserWarning,
  stacklevel=2,
@@ -1321,53 +1383,50 @@ def sample_random_image_requests(
  int(output_len * range_ratio), output_len + 1, size=num_requests
  )

- def _gen_random_image_data_uri(width: int = width, height: int = height) -> str:
- arr = (np.random.rand(height, width, 3) * 255).astype(np.uint8)
- img = Image.fromarray(arr, mode="RGB")
+ def _gen_random_image_data_uri(
+ width: int = width, height: int = height
+ ) -> (Image, str, int):
+ if image_content == "blank":
+ # Generate blank white image
+ arr = np.full((height, width, 3), 255, dtype=np.uint8)
+ else:
+ # Generate random colored image
+ arr = (np.random.rand(height, width, 3) * 255).astype(np.uint8)
+ img = Image.fromarray(arr)
  buf = io.BytesIO()
- img.save(buf, format="JPEG", quality=85)
+ img.save(buf, format=image_format, quality=85)
  encoded = pybase64.b64encode(buf.getvalue()).decode("utf-8")
- return f"data:image/jpeg;base64,{encoded}"
+ image_data = f"data:image/{image_format};base64,{encoded}"
+ image_bytes = len(image_data.encode("utf-8"))
+ return img, image_data, image_bytes

  dataset: List[DatasetRow] = []
+ total_image_bytes = 0
  for i in range(num_requests):
  # Generate text prompt
- text_prompt = gen_prompt(tokenizer, int(input_lens[i]))
+ text_prompt = gen_prompt(processor.tokenizer, int(input_lens[i]))

  # Generate image list
- images = [_gen_random_image_data_uri() for _ in range(num_images)]
-
- prompt_str = text_prompt
- if apply_chat_template:
- try:
- content_items = [
- {"type": "image_url", "image_url": {"url": img_url}}
- for img_url in images
- ]
- content_items.append({"type": "text", "text": text_prompt})
- prompt_str = tokenizer.apply_chat_template(
- [{"role": "user", "content": content_items}],
- add_generation_prompt=True,
- tokenize=False,
- )
- except Exception:
- # Some tokenizers do not support list content; fall back to a placeholder in the text
- prompt_str = f"<image>{text_prompt}"
-
- prompt_token_ids = tokenizer.encode(prompt_str)
- prompt_token_len = len(prompt_token_ids)
-
- dataset.append(
- DatasetRow(
- prompt=prompt_str,
- prompt_len=prompt_token_len,
- output_len=int(output_lens[i]),
- image_data=images,
- )
+ images, images_base64, images_bytes = zip(
+ *[_gen_random_image_data_uri() for _ in range(image_count)]
  )
+ total_image_bytes += sum(list(images_bytes))
+
+ data_row = create_mm_data_row(
+ text_prompt,
+ list(images),
+ list(images_base64),
+ int(output_lens[i]),
+ processor,
+ )
+
+ dataset.append(data_row)

  print(f"#Input tokens: {np.sum([x.prompt_len for x in dataset])}")
  print(f"#Output tokens: {np.sum([x.output_len for x in dataset])}")
+ print(
+ f"\nCreated {len(dataset)} {image_content} {image_format} images with average {total_image_bytes//num_requests} bytes per request"
+ )
  return dataset

@@ -1439,7 +1498,9 @@ def sample_generated_shared_prefix_requests(

  input_requests.append(
  DatasetRow(
- prompt=full_prompt, prompt_len=prompt_len, output_len=output_len
+ prompt=full_prompt,
+ prompt_len=prompt_len,
+ output_len=output_len,
  )
  )
  total_input_tokens += prompt_len
@@ -1521,6 +1582,8 @@ def calculate_metrics(
  output_lens: List[int] = []
  retokenized_output_lens: List[int] = []
  total_input = 0
+ total_input_text = 0
+ total_input_vision = 0
  completed = 0
  itls: List[float] = []
  tpots: List[float] = []
@@ -1534,7 +1597,9 @@ def calculate_metrics(
  tokenizer.encode(outputs[i].generated_text, add_special_tokens=False)
  )
  retokenized_output_lens.append(retokenized_output_len)
- total_input += outputs[i].prompt_len
+ total_input += input_requests[i].prompt_len
+ total_input_text += input_requests[i].text_prompt_len
+ total_input_vision += input_requests[i].vision_prompt_len
  if output_len > 1:
  tpots.append((outputs[i].latency - outputs[i].ttft) / (output_len - 1))
  itls += outputs[i].itl
@@ -1556,6 +1621,8 @@ def calculate_metrics(
  metrics = BenchmarkMetrics(
  completed=completed,
  total_input=total_input,
+ total_input_text=total_input_text,
+ total_input_vision=total_input_vision,
  total_output=sum(output_lens),
  total_output_retokenized=sum(retokenized_output_lens),
  request_throughput=completed / dur_s,
@@ -1770,9 +1837,15 @@ async def benchmark(
  server_info_json = server_info.json()
  if "decode" in server_info_json:
  server_info_json = server_info_json["decode"][0]
- accept_length = server_info_json["internal_states"][0].get(
- "avg_spec_accept_length", None
- )
+ if (
+ "internal_states" in server_info_json
+ and server_info_json["internal_states"]
+ ):
+ accept_length = server_info_json["internal_states"][0].get(
+ "avg_spec_accept_length", None
+ )
+ else:
+ accept_length = None
  else:
  accept_length = None
  else:
@@ -1804,6 +1877,10 @@ async def benchmark(
  print("{:<40} {:<10}".format("Successful requests:", metrics.completed))
  print("{:<40} {:<10.2f}".format("Benchmark duration (s):", benchmark_duration))
  print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input))
+ print("{:<40} {:<10}".format("Total input text tokens:", metrics.total_input_text))
+ print(
+ "{:<40} {:<10}".format("Total input vision tokens:", metrics.total_input_vision)
+ )
  print("{:<40} {:<10}".format("Total generated tokens:", metrics.total_output))
  print(
  "{:<40} {:<10}".format(
@@ -1873,6 +1950,8 @@ async def benchmark(
  "duration": benchmark_duration,
  "completed": metrics.completed,
  "total_input_tokens": metrics.total_input,
+ "total_input_text_tokens": metrics.total_input_text,
+ "total_input_vision_tokens": metrics.total_input_vision,
  "total_output_tokens": metrics.total_output,
  "total_output_tokens_retokenized": metrics.total_output_retokenized,
  "request_throughput": metrics.request_throughput,
@@ -1907,11 +1986,11 @@ async def benchmark(
  output_file_name = args.output_file
  else:
  now = datetime.now().strftime("%m%d")
- if args.dataset_name == "random-image":
+ if args.dataset_name == "image":
  output_file_name = (
  f"{args.backend}_{now}_{args.num_prompts}_{args.random_input_len}_"
- f"{args.random_output_len}_{args.random_image_num_images}imgs_"
- f"{args.random_image_resolution}.jsonl"
+ f"{args.random_output_len}_{args.image_count}imgs_"
+ f"{args.image_resolution}.jsonl"
  )
  elif args.dataset_name.startswith("random"):
  output_file_name = f"{args.backend}_{now}_{args.num_prompts}_{args.random_input_len}_{args.random_output_len}.jsonl"
@@ -2087,6 +2166,12 @@ def run_benchmark(args_: argparse.Namespace):
  "Because when the tokenizer counts the output tokens, if there is gibberish, it might count incorrectly.\n"
  )

+ if args.dataset_name in ["image", "mmmu"]:
+ args.apply_chat_template = True
+ assert (
+ not args.tokenize_prompt
+ ), "`--tokenize-prompt` not compatible with image dataset"
+
  print(f"{args}\n")

  # Read dataset
@@ -2094,7 +2179,7 @@ def run_benchmark(args_: argparse.Namespace):
  model_id = args.model
  tokenizer_id = args.tokenizer if args.tokenizer is not None else args.model
  tokenizer = get_tokenizer(tokenizer_id)
- input_requests = get_dataset(args, tokenizer)
+ input_requests = get_dataset(args, tokenizer, model_id)

  # compatible with SimpleNamespace
  if not hasattr(args, "flush_cache"):
@@ -2175,7 +2260,7 @@ if __name__ == "__main__":
  "random-ids",
  "generated-shared-prefix",
  "mmmu",
- "random-image",
+ "image",
  "mooncake",
  ],
  help="Name of the dataset to benchmark on.",
@@ -2215,37 +2300,49 @@ if __name__ == "__main__":
  "--random-input-len",
  type=int,
  default=1024,
- help="Number of input tokens per request, used only for random dataset.",
+ help="Number of input tokens per request, used only for random and image dataset.",
  )
  parser.add_argument(
  "--random-output-len",
  default=1024,
  type=int,
- help="Number of output tokens per request, used only for random dataset.",
+ help="Number of output tokens per request, used only for random and image dataset.",
  )
  parser.add_argument(
  "--random-range-ratio",
  type=float,
  default=0.0,
  help="Range of sampled ratio of input/output length, "
- "used only for random dataset.",
+ "used only for random and image dataset.",
  )
- # random-image dataset args
+ # image dataset args
  parser.add_argument(
- "--random-image-num-images",
+ "--image-count",
  type=int,
  default=1,
- help="Number of images per request (only available with the random-image dataset)",
+ help="Number of images per request (only available with the image dataset)",
  )
  parser.add_argument(
- "--random-image-resolution",
+ "--image-resolution",
  type=str,
  default="1080p",
  help=(
- "Resolution of random images for random-image dataset. "
+ "Resolution of images for image dataset. "
  "Supports presets 4k/1080p/720p/360p or custom 'heightxwidth' (e.g., 1080x1920)."
  ),
  )
+ parser.add_argument(
+ "--image-format",
+ type=str,
+ default="jpeg",
+ help=("Format of images for image dataset. " "Supports jpeg and png."),
+ )
+ parser.add_argument(
+ "--image-content",
+ type=str,
+ default="random",
+ help=("Content for images for image dataset. " "Supports random and blank."),
+ )
  parser.add_argument(
  "--request-rate",
  type=float,
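A hypothetical invocation of the renamed image dataset with the new flags from the hunk above (all values are placeholders, not part of the diff); a minimal sketch driving the benchmark module from Python:

import subprocess

# Illustration only: "image" replaces the old "random-image" dataset name.
subprocess.run([
    "python3", "-m", "sglang.bench_serving",
    "--backend", "sglang",
    "--dataset-name", "image",
    "--num-prompts", "32",
    "--image-count", "2",          # images per request
    "--image-resolution", "720p",  # 4k/1080p/720p/360p or custom 'heightxwidth'
    "--image-format", "png",       # jpeg (default) or png
    "--image-content", "blank",    # random (default) or blank
    "--random-input-len", "512",
    "--random-output-len", "128",
], check=True)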
sglang/compile_deep_gemm.py CHANGED
@@ -141,6 +141,9 @@ def refine_server_args(server_args: ServerArgs, compile_args: CompileArgs):
  server_args.enable_torch_compile = False
  print(f"Disable CUDA Graph and Torch Compile to save time...")

+ server_args.load_format = "dummy"
+ print(f"Set load format to dummy to save time...")
+
  # Set watchdog timeout to compile_args.timeout because compilation will take a long time
  server_args.watchdog_timeout = compile_args.timeout
  server_args.warmups = "compile-deep-gemm"
sglang/launch_server.py CHANGED
@@ -7,23 +7,9 @@ from sglang.srt.entrypoints.http_server import launch_server
  from sglang.srt.server_args import prepare_server_args
  from sglang.srt.utils import kill_process_tree

- MOVE_ENVS_WARN = """
- ########################################################################
- # For contributors and developers: #
- # Please move environment variable definitions to sglang.srt.environ #
- # using the following pattern: #
- # SGLANG_XXX = EnvBool(False) #
- # #
- ########################################################################
- """
-
  if __name__ == "__main__":
  server_args = prepare_server_args(sys.argv[1:])

- from sglang.srt.server_args import print_deprecated_warning
-
- print_deprecated_warning(MOVE_ENVS_WARN)
-
  try:
  launch_server(server_args)
  finally:
sglang/srt/configs/__init__.py CHANGED
@@ -9,6 +9,7 @@ from sglang.srt.configs.janus_pro import MultiModalityConfig
  from sglang.srt.configs.kimi_vl import KimiVLConfig
  from sglang.srt.configs.kimi_vl_moonvit import MoonViTConfig
  from sglang.srt.configs.longcat_flash import LongcatFlashConfig
+ from sglang.srt.configs.nemotron_h import NemotronHConfig
  from sglang.srt.configs.qwen3_next import Qwen3NextConfig
  from sglang.srt.configs.step3_vl import (
  Step3TextConfig,
@@ -32,4 +33,5 @@ __all__ = [
  "DotsVLMConfig",
  "DotsOCRConfig",
  "FalconH1Config",
+ "NemotronHConfig",
  ]
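With the two lines added above, the new Nemotron-H config becomes importable from the package namespace; a minimal sketch (not part of the diff):

from sglang.srt.configs import NemotronHConfig

# Assumes NemotronHConfig is default-constructible like the other exported HF-style configs.
config = NemotronHConfig()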