sglang 0.5.1.post1__py3-none-any.whl → 0.5.1.post3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69)
  1. sglang/bench_one_batch_server.py +79 -53
  2. sglang/bench_serving.py +186 -14
  3. sglang/profiler.py +0 -1
  4. sglang/srt/conversation.py +38 -5
  5. sglang/srt/disaggregation/decode.py +4 -0
  6. sglang/srt/disaggregation/prefill.py +4 -0
  7. sglang/srt/entrypoints/engine.py +2 -2
  8. sglang/srt/entrypoints/openai/protocol.py +27 -24
  9. sglang/srt/entrypoints/openai/serving_chat.py +50 -9
  10. sglang/srt/entrypoints/openai/serving_completions.py +15 -0
  11. sglang/srt/entrypoints/tool.py +7 -7
  12. sglang/srt/function_call/deepseekv31_detector.py +222 -0
  13. sglang/srt/function_call/function_call_parser.py +2 -0
  14. sglang/srt/function_call/gpt_oss_detector.py +144 -256
  15. sglang/srt/harmony_parser.py +588 -0
  16. sglang/srt/hf_transformers_utils.py +16 -7
  17. sglang/srt/layers/attention/ascend_backend.py +218 -111
  18. sglang/srt/layers/attention/flashattention_backend.py +241 -7
  19. sglang/srt/layers/attention/flashinfer_backend.py +5 -2
  20. sglang/srt/layers/attention/flashinfer_mla_backend.py +76 -91
  21. sglang/srt/layers/attention/utils.py +15 -94
  22. sglang/srt/layers/communicator.py +1 -2
  23. sglang/srt/layers/moe/cutlass_moe.py +0 -15
  24. sglang/srt/layers/moe/ep_moe/layer.py +1 -7
  25. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=256,N=256,device_name=NVIDIA_B200,dtype=fp8_w8a8,block_shape=[128, 128].json +146 -0
  26. sglang/srt/layers/moe/fused_moe_triton/configs/triton_3_4_0/E=257,N=64,device_name=NVIDIA_A100-SXM4-80GB.json +146 -0
  27. sglang/srt/layers/moe/topk.py +1 -1
  28. sglang/srt/layers/quantization/deep_gemm_wrapper/compile_utils.py +133 -235
  29. sglang/srt/layers/quantization/deep_gemm_wrapper/configurer.py +5 -7
  30. sglang/srt/layers/quantization/deep_gemm_wrapper/entrypoint.py +5 -23
  31. sglang/srt/layers/quantization/fp8.py +2 -1
  32. sglang/srt/layers/quantization/fp8_kernel.py +2 -2
  33. sglang/srt/layers/quantization/fp8_utils.py +2 -2
  34. sglang/srt/layers/quantization/modelopt_quant.py +2 -2
  35. sglang/srt/layers/quantization/mxfp4.py +16 -23
  36. sglang/srt/layers/quantization/mxfp4_tensor.py +3 -1
  37. sglang/srt/layers/utils.py +0 -14
  38. sglang/srt/lora/lora_manager.py +29 -12
  39. sglang/srt/managers/cache_controller.py +223 -156
  40. sglang/srt/managers/detokenizer_manager.py +5 -0
  41. sglang/srt/managers/io_struct.py +30 -0
  42. sglang/srt/managers/scheduler.py +58 -7
  43. sglang/srt/managers/scheduler_metrics_mixin.py +15 -0
  44. sglang/srt/managers/tokenizer_manager.py +36 -3
  45. sglang/srt/mem_cache/hicache_storage.py +31 -20
  46. sglang/srt/mem_cache/hiradix_cache.py +12 -3
  47. sglang/srt/mem_cache/memory_pool.py +73 -14
  48. sglang/srt/mem_cache/memory_pool_host.py +3 -2
  49. sglang/srt/mem_cache/radix_cache.py +1 -0
  50. sglang/srt/mem_cache/storage/hf3fs/storage_hf3fs.py +5 -13
  51. sglang/srt/mem_cache/storage/mooncake_store/mooncake_store.py +85 -81
  52. sglang/srt/metrics/collector.py +5 -5
  53. sglang/srt/model_executor/cuda_graph_runner.py +2 -2
  54. sglang/srt/model_executor/model_runner.py +1 -1
  55. sglang/srt/models/deepseek_v2.py +12 -3
  56. sglang/srt/models/gpt_oss.py +2 -1
  57. sglang/srt/models/qwen2_5_vl.py +1 -0
  58. sglang/srt/offloader.py +115 -0
  59. sglang/srt/reasoning_parser.py +56 -300
  60. sglang/srt/server_args.py +10 -5
  61. sglang/srt/tokenizer/tiktoken_tokenizer.py +6 -1
  62. sglang/srt/utils.py +59 -12
  63. sglang/test/test_cutlass_moe.py +33 -28
  64. sglang/version.py +1 -1
  65. {sglang-0.5.1.post1.dist-info → sglang-0.5.1.post3.dist-info}/METADATA +6 -5
  66. {sglang-0.5.1.post1.dist-info → sglang-0.5.1.post3.dist-info}/RECORD +69 -65
  67. {sglang-0.5.1.post1.dist-info → sglang-0.5.1.post3.dist-info}/WHEEL +0 -0
  68. {sglang-0.5.1.post1.dist-info → sglang-0.5.1.post3.dist-info}/licenses/LICENSE +0 -0
  69. {sglang-0.5.1.post1.dist-info → sglang-0.5.1.post3.dist-info}/top_level.txt +0 -0
sglang/bench_one_batch_server.py CHANGED
@@ -18,7 +18,7 @@ import json
 import multiprocessing
 import os
 import time
-from typing import Tuple
+from typing import List, Tuple
 
 import requests
 
@@ -45,6 +45,7 @@ class BenchArgs:
     skip_warmup: bool = False
     show_report: bool = False
     profile: bool = False
+    profile_steps: int = 3
     profile_by_stage: bool = False
 
     @staticmethod
@@ -78,6 +79,9 @@ class BenchArgs:
         parser.add_argument("--skip-warmup", action="store_true")
         parser.add_argument("--show-report", action="store_true")
         parser.add_argument("--profile", action="store_true")
+        parser.add_argument(
+            "--profile-steps", type=int, default=BenchArgs.profile_steps
+        )
         parser.add_argument("--profile-by-stage", action="store_true")
 
     @classmethod
@@ -132,6 +136,7 @@ def run_one_case(
     result_filename: str,
     tokenizer,
     profile: bool = False,
+    profile_steps: int = 3,
     profile_by_stage: bool = False,
 ):
     requests.post(url + "/flush_cache")
@@ -162,7 +167,7 @@ def run_one_case(
     profile_link = None
     if profile:
         profile_link: str = run_profile(
-            url, 3, ["CPU", "GPU"], None, None, profile_by_stage
+            url, profile_steps, ["CPU", "GPU"], None, None, profile_by_stage
         )
 
     tic = time.perf_counter()
@@ -247,6 +252,71 @@ def run_one_case(
     )
 
 
+def get_report_summary(
+    result: List[Tuple], server_args: ServerArgs, bench_args: BenchArgs
+):
+    import tabulate
+
+    summary = (
+        f"\nInput lens: {bench_args.input_len}. Output lens: {bench_args.output_len}.\n"
+    )
+
+    headers = [
+        "batch size",
+        "latency (s)",
+        "input throughput (tok/s)",
+        "output throughput (tok/s)",
+        "acc length",
+        "ITL (ms)",
+        "input cost ($/1M)",
+        "output cost ($/1M)",
+    ]
+    if bench_args.profile:
+        headers.append("profile")
+    rows = []
+
+    for (
+        batch_size,
+        latency,
+        ttft,
+        input_throughput,
+        output_throughput,
+        _,
+        _,
+        acc_length,
+        trace_link,
+    ) in result:
+        if is_blackwell():
+            hourly_cost_per_gpu = 4  # $4/hour for one B200
+        else:
+            hourly_cost_per_gpu = 2  # $2/hour for one H100
+
+        hourly_cost = hourly_cost_per_gpu * server_args.tp_size
+        input_util = 0.7
+        accept_length = round(acc_length, 2) if acc_length is not None else "n/a"
+        itl = 1 / (output_throughput / batch_size) * 1000
+        input_cost = 1e6 / (input_throughput * input_util) / 3600 * hourly_cost
+        output_cost = 1e6 / output_throughput / 3600 * hourly_cost
+        row = [
+            batch_size,
+            latency,
+            input_throughput,
+            output_throughput,
+            accept_length,
+            itl,
+            input_cost,
+            output_cost,
+        ]
+        if trace_link:
+            row.append(f"[Profile]({trace_link})")
+        rows.append(row)
+
+    summary += tabulate.tabulate(
+        rows, headers=headers, tablefmt="github", floatfmt=".2f"
+    )
+    return summary
+
+
 def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
     if bench_args.base_url:
         proc, base_url = None, bench_args.base_url
@@ -321,6 +391,7 @@ def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
                 result_filename=bench_args.result_filename,
                 tokenizer=tokenizer,
                 profile=bench_args.profile,
+                profile_steps=bench_args.profile_steps,
                 profile_by_stage=bench_args.profile_by_stage,
             )[-1],
         )
@@ -337,63 +408,14 @@ def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
     if not bench_args.show_report:
         return
 
-    summary = (
-        f"\nInput lens: {bench_args.input_len}. Output lens: {bench_args.output_len}.\n"
-    )
-    summary += "| batch size | latency (s) | input throughput (tok/s) | output throughput (tok/s) | acc length | ITL (ms) | input cost ($/1M) | output cost ($/1M) |"
-
-    if bench_args.profile:
-        summary += " profile |"
-
-    summary += "\n"
-    summary += "| ---------- | ----------- | ------------------------- | ------------------------- | ---------- | -------- | ----------------- | ------------------ |"
-
-    if bench_args.profile:
-        summary += "-------------|"
-    summary += "\n"
-
-    for (
-        batch_size,
-        latency,
-        ttft,
-        input_throughput,
-        output_throughput,
-        overall_throughput,
-        last_gen_throughput,
-        acc_length,
-        trace_link,
-    ) in result:
-        if is_blackwell():
-            hourly_cost_per_gpu = 4  # $4/hour for one B200
-        else:
-            hourly_cost_per_gpu = 2  # $2/hour for one H100
-
-        hourly_cost = hourly_cost_per_gpu * server_args.tp_size
-        input_util = 0.7
-        accept_length = round(acc_length, 2) if acc_length is not None else "n/a"
-        line = (
-            f"| {batch_size} | "
-            f"{latency:.2f} | "
-            f"{input_throughput:.2f} | "
-            f"{output_throughput:.2f} | "
-            f"{accept_length} | "
-            f"{1 / (output_throughput/batch_size) * 1000:.2f} | "
-            f"{1e6 / (input_throughput * input_util) / 3600 * hourly_cost:.2f} | "
-            f"{1e6 / output_throughput / 3600 * hourly_cost:.2f} |"
-        )
-        if trace_link:
-            line += f" [Profile]({trace_link}) |"
-        line += "\n"
-        summary += line
-
-    # print metrics table
+    summary = get_report_summary(result, server_args, bench_args)
     print(summary)
 
     if is_in_ci():
         write_github_step_summary(summary)
 
 
-if __name__ == "__main__":
+def main():
     parser = argparse.ArgumentParser()
     ServerArgs.add_cli_args(parser)
     BenchArgs.add_cli_args(parser)
@@ -402,3 +424,7 @@ if __name__ == "__main__":
     bench_args = BenchArgs.from_cli_args(args)
 
     run_benchmark(server_args, bench_args)
+
+
+if __name__ == "__main__":
+    main()
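
For reference, the ITL and per-million-token cost columns produced by the new get_report_summary follow the arithmetic below. This is a minimal standalone sketch with made-up throughput numbers; the $2/$4 hourly GPU rates and the 0.7 input-utilization factor are taken from the code above:

    # Sketch of the report arithmetic in get_report_summary (illustrative inputs).
    batch_size = 8
    input_throughput = 20000.0   # tok/s, hypothetical
    output_throughput = 2000.0   # tok/s, hypothetical
    hourly_cost = 2 * 1          # $2/hour for one H100 times tp_size
    input_util = 0.7

    itl = 1 / (output_throughput / batch_size) * 1000                        # ms per output token per request -> 4.00
    input_cost = 1e6 / (input_throughput * input_util) / 3600 * hourly_cost  # $ per 1M input tokens -> ~0.04
    output_cost = 1e6 / output_throughput / 3600 * hourly_cost               # $ per 1M output tokens -> ~0.28
    print(f"ITL={itl:.2f} ms, input=${input_cost:.2f}/1M, output=${output_cost:.2f}/1M")
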
sglang/bench_serving.py CHANGED
@@ -12,6 +12,8 @@ python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-pro
 
 import argparse
 import asyncio
+import base64
+import io
 import json
 import os
 import pickle
@@ -71,7 +73,7 @@ class RequestFuncInput:
     output_len: int
     model: str
     lora_name: str
-    image_data: str
+    image_data: Optional[List[str]]
     extra_request_body: Dict[str, Any]
 
 
@@ -289,16 +291,19 @@ async def async_request_openai_chat_completions(
     ), "OpenAI Chat Completions API URL must end with 'chat/completions'."
 
     if request_func_input.image_data:
+        # Build multi-image content: a list of image_url entries followed by the text
+        content_items = [
+            {
+                "type": "image_url",
+                "image_url": {"url": img_url},
+            }
+            for img_url in request_func_input.image_data
+        ]
+        content_items.append({"type": "text", "text": request_func_input.prompt})
         messages = [
             {
                 "role": "user",
-                "content": [
-                    {
-                        "type": "image_url",
-                        "image_url": {"url": request_func_input.image_data},
-                    },
-                    {"type": "text", "text": request_func_input.prompt},
-                ],
+                "content": content_items,
             },
         ]
     else:
@@ -497,7 +502,7 @@ async def async_request_sglang_generate(
         **request_func_input.extra_request_body,
     }
 
-    # Add image data if available
+    # Add image data if available (list of image urls/base64)
     if request_func_input.image_data:
         payload["image_data"] = request_func_input.image_data
 
@@ -648,7 +653,7 @@ def get_dataset(args, tokenizer):
            prompt_suffix=args.prompt_suffix,
            apply_chat_template=args.apply_chat_template,
        )
-    elif args.dataset_name.startswith("random"):
+    elif args.dataset_name.startswith("random") and args.dataset_name != "random-image":
        input_requests = sample_random_requests(
            input_len=args.random_input_len,
            output_len=args.random_output_len,
@@ -659,6 +664,18 @@ def get_dataset(args, tokenizer):
            random_sample=args.dataset_name == "random",
            return_text=not tokenize_prompt,
        )
+    elif args.dataset_name == "random-image":
+        assert not tokenize_prompt, "random-image does not support --tokenize-prompt"
+        input_requests = sample_random_image_requests(
+            num_requests=args.num_prompts,
+            num_images=args.random_image_num_images,
+            input_len=args.random_input_len,
+            output_len=args.random_output_len,
+            range_ratio=args.random_range_ratio,
+            tokenizer=tokenizer,
+            apply_chat_template=args.apply_chat_template,
+            image_resolution=args.random_image_resolution,
+        )
    elif args.dataset_name == "generated-shared-prefix":
        assert not tokenize_prompt
        input_requests = sample_generated_shared_prefix_requests(
@@ -790,7 +807,7 @@ class DatasetRow:
     prompt: str
     prompt_len: int
     output_len: int
-    image_data: Optional[str] = None
+    image_data: Optional[List[str]] = None
 
 
 def sample_mmmu_requests(
@@ -913,7 +930,7 @@ def sample_mmmu_requests(
                    prompt=prompt,
                    prompt_len=prompt_len,
                    output_len=output_len,
-                    image_data=image_data,
+                    image_data=[image_data],
                )
            )
 
@@ -1113,6 +1130,132 @@ def sample_random_requests(
     return input_requests
 
 
+def parse_random_image_resolution(image_resolution: str) -> Tuple[int, int]:
+    """Parse image resolution into (width, height).
+
+    Supports presets '1080p', '720p', '360p' and custom 'heightxwidth' format
+    (e.g., '1080x1920' means height=1080, width=1920).
+    """
+    resolution_to_size = {
+        "4k": (3840, 2160),
+        "1080p": (1920, 1080),
+        "720p": (1280, 720),
+        "360p": (640, 360),
+    }
+    if image_resolution in resolution_to_size:
+        return resolution_to_size[image_resolution]
+
+    res = image_resolution.strip().lower()
+    if "x" in res:
+        parts = res.split("x")
+        if len(parts) == 2 and parts[0].isdigit() and parts[1].isdigit():
+            height = int(parts[0])
+            width = int(parts[1])
+            if height > 0 and width > 0:
+                return (width, height)
+
+    raise ValueError(
+        f"Unsupported random-image resolution: {image_resolution}. "
+        "Choose from 4k, 1080p, 720p, 360p, or provide custom 'heightxwidth' (e.g., 1080x1920)."
+    )
+
+
+def sample_random_image_requests(
+    num_requests: int,
+    num_images: int,
+    input_len: int,
+    output_len: int,
+    range_ratio: float,
+    tokenizer: PreTrainedTokenizerBase,
+    apply_chat_template: bool = True,
+    image_resolution: str = "1080p",
+) -> List[DatasetRow]:
+    """Generate requests with random images.
+
+    - Each request includes ``num_images`` random images.
+    - Supported resolutions: 4k (3840x2160), 1080p (1920x1080), 720p (1280x720), 360p (640x360),
+      or custom 'heightxwidth' (e.g., 1080x1920).
+    - Text lengths follow the 'random' dataset sampling rule. ``prompt_len``
+      only counts text tokens and excludes image data.
+    """
+    try:
+        import pybase64
+        from PIL import Image
+    except ImportError as e:
+        raise ImportError(
+            "Please install Pillow to generate random images: pip install pillow"
+        ) from e
+
+    # Parse resolution (supports presets and 'heightxwidth')
+    width, height = parse_random_image_resolution(image_resolution)
+
+    # Check for potentially problematic combinations and warn user
+    if width * height >= 1920 * 1080 and num_images * num_requests >= 100:
+        warnings.warn(
+            f"High resolution ({width}x{height}) with {num_images * num_requests} total images "
+            f"may take a long time. Consider reducing resolution or image count.",
+            UserWarning,
+            stacklevel=2,
+        )
+
+    # Sample text lengths
+    input_lens = np.random.randint(
+        max(int(input_len * range_ratio), 1), input_len + 1, size=num_requests
+    )
+    output_lens = np.random.randint(
+        int(output_len * range_ratio), output_len + 1, size=num_requests
+    )
+
+    def _gen_random_image_data_uri(width: int = width, height: int = height) -> str:
+        arr = (np.random.rand(height, width, 3) * 255).astype(np.uint8)
+        img = Image.fromarray(arr, mode="RGB")
+        buf = io.BytesIO()
+        img.save(buf, format="JPEG", quality=85)
+        encoded = pybase64.b64encode(buf.getvalue()).decode("utf-8")
+        return f"data:image/jpeg;base64,{encoded}"
+
+    dataset: List[DatasetRow] = []
+    for i in range(num_requests):
+        # Generate text prompt
+        text_prompt = gen_prompt(tokenizer, int(input_lens[i]))
+
+        # Generate image list
+        images = [_gen_random_image_data_uri() for _ in range(num_images)]
+
+        prompt_str = text_prompt
+        if apply_chat_template:
+            try:
+                content_items = [
+                    {"type": "image_url", "image_url": {"url": img_url}}
+                    for img_url in images
+                ]
+                content_items.append({"type": "text", "text": text_prompt})
+                prompt_str = tokenizer.apply_chat_template(
+                    [{"role": "user", "content": content_items}],
+                    add_generation_prompt=True,
+                    tokenize=False,
+                )
+            except Exception:
+                # Some tokenizers do not support list content; fall back to a placeholder in the text
+                prompt_str = f"<image>{text_prompt}"
+
+        prompt_token_ids = tokenizer.encode(prompt_str)
+        prompt_token_len = len(prompt_token_ids)
+
+        dataset.append(
+            DatasetRow(
+                prompt=prompt_str,
+                prompt_len=prompt_token_len,
+                output_len=int(output_lens[i]),
+                image_data=images,
+            )
+        )
+
+    print(f"#Input tokens: {np.sum([x.prompt_len for x in dataset])}")
+    print(f"#Output tokens: {np.sum([x.output_len for x in dataset])}")
+    return dataset
+
+
 def gen_prompt(tokenizer, token_num):
     """Generate a random prompt of specified token length using tokenizer vocabulary."""
     all_available_tokens = list(tokenizer.get_vocab().values())
@@ -1579,7 +1722,13 @@ async def benchmark(
         output_file_name = args.output_file
     else:
         now = datetime.now().strftime("%m%d")
-        if args.dataset_name.startswith("random"):
+        if args.dataset_name == "random-image":
+            output_file_name = (
+                f"{args.backend}_{now}_{args.num_prompts}_{args.random_input_len}_"
+                f"{args.random_output_len}_{args.random_image_num_images}imgs_"
+                f"{args.random_image_resolution}.jsonl"
+            )
+        elif args.dataset_name.startswith("random"):
            output_file_name = f"{args.backend}_{now}_{args.num_prompts}_{args.random_input_len}_{args.random_output_len}.jsonl"
        else:
            output_file_name = f"{args.backend}_{now}_{args.num_prompts}_sharegpt.jsonl"
@@ -1819,7 +1968,14 @@ if __name__ == "__main__":
        "--dataset-name",
        type=str,
        default="sharegpt",
-        choices=["sharegpt", "random", "random-ids", "generated-shared-prefix", "mmmu"],
+        choices=[
+            "sharegpt",
+            "random",
+            "random-ids",
+            "generated-shared-prefix",
+            "mmmu",
+            "random-image",
+        ],
        help="Name of the dataset to benchmark on.",
    )
    parser.add_argument(
@@ -1872,6 +2028,22 @@ if __name__ == "__main__":
        help="Range of sampled ratio of input/output length, "
        "used only for random dataset.",
    )
+    # random-image dataset args
+    parser.add_argument(
+        "--random-image-num-images",
+        type=int,
+        default=1,
+        help="Number of images per request (only available with the random-image dataset)",
+    )
+    parser.add_argument(
+        "--random-image-resolution",
+        type=str,
+        default="1080p",
+        help=(
+            "Resolution of random images for random-image dataset. "
+            "Supports presets 4k/1080p/720p/360p or custom 'heightxwidth' (e.g., 1080x1920)."
+        ),
+    )
    parser.add_argument(
        "--request-rate",
        type=float,
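
A quick sanity check of the new random-image resolution handling; this sketch only assumes the functions added above are importable from sglang.bench_serving:

    # Presets and custom 'heightxwidth' strings both resolve to a (width, height) tuple.
    from sglang.bench_serving import parse_random_image_resolution

    assert parse_random_image_resolution("720p") == (1280, 720)
    assert parse_random_image_resolution("1080x1920") == (1920, 1080)  # height=1080, width=1920

The benchmark itself is driven through the new CLI options (--dataset-name random-image, --random-image-num-images, --random-image-resolution) added to the argument parser above.
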
sglang/profiler.py CHANGED
@@ -9,7 +9,6 @@ import argparse
 import json
 import os
 import time
-import urllib.parse
 from argparse import ArgumentParser
 from pathlib import Path
 from typing import List, Optional
sglang/srt/conversation.py CHANGED
@@ -26,6 +26,8 @@ Key components:
 # Adapted from
 # https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py
 import dataclasses
+import json
+import os
 import re
 from enum import IntEnum, auto
 from typing import Callable, Dict, List, Optional, Tuple, Union
@@ -959,16 +961,42 @@ register_conv_template(
 )
 
 
+MODEL_TYPE_TO_TEMPLATE = {
+    "internvl_chat": "internvl-2-5",
+    "deepseek_vl_v2": "deepseek-vl2",
+    "multi_modality": "janus-pro",
+    "phi4mm": "phi-4-mm",
+    "minicpmv": "minicpmv",
+    "minicpmo": "minicpmo",
+}
+
+
+def get_model_type(model_path: str) -> Optional[str]:
+    config_path = os.path.join(model_path, "config.json")
+    if not os.path.exists(config_path):
+        return None
+    try:
+        with open(config_path, "r", encoding="utf-8") as f:
+            config = json.load(f)
+        return config.get("model_type")
+    except (IOError, json.JSONDecodeError):
+        return None
+
+
 @register_conv_template_matching_function
 def match_internvl(model_path: str):
     if re.search(r"internvl", model_path, re.IGNORECASE):
         return "internvl-2-5"
+    model_type = get_model_type(model_path)
+    return MODEL_TYPE_TO_TEMPLATE.get(model_type)
 
 
 @register_conv_template_matching_function
 def match_deepseek_janus_pro(model_path: str):
     if re.search(r"janus", model_path, re.IGNORECASE):
         return "janus-pro"
+    model_type = get_model_type(model_path)
+    return MODEL_TYPE_TO_TEMPLATE.get(model_type)
 
 
 @register_conv_template_matching_function
@@ -981,6 +1009,8 @@ def match_vicuna(model_path: str):
 def match_deepseek_vl(model_path: str):
     if re.search(r"deepseek.*vl2", model_path, re.IGNORECASE):
         return "deepseek-vl2"
+    model_type = get_model_type(model_path)
+    return MODEL_TYPE_TO_TEMPLATE.get(model_type)
 
 
 @register_conv_template_matching_function
@@ -994,14 +1024,17 @@ def match_qwen_chat_ml(model_path: str):
 
 
 @register_conv_template_matching_function
-def match_openbmb_minicpm(model_path: str):
-    if re.search(r"minicpm-v", model_path, re.IGNORECASE):
-        return "minicpmv"
-    elif re.search(r"minicpm-o", model_path, re.IGNORECASE):
-        return "minicpmo"
+def match_minicpm(model_path: str):
+    match = re.search(r"minicpm-(v|o)", model_path, re.IGNORECASE)
+    if match:
+        return f"minicpm{match.group(1).lower()}"
+    model_type = get_model_type(model_path)
+    return MODEL_TYPE_TO_TEMPLATE.get(model_type)
 
 
 @register_conv_template_matching_function
 def match_phi_4_mm(model_path: str):
     if "phi-4-multimodal" in model_path.lower():
         return "phi-4-mm"
+    model_type = get_model_type(model_path)
+    return MODEL_TYPE_TO_TEMPLATE.get(model_type)
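
A sketch of the new fallback path in these matchers: when the path-name regexes miss, the matcher reads model_type from the checkpoint's config.json via get_model_type and looks it up in MODEL_TYPE_TO_TEMPLATE. The local path below is hypothetical:

    from sglang.srt.conversation import MODEL_TYPE_TO_TEMPLATE, get_model_type

    model_type = get_model_type("/models/my-renamed-internvl")  # hypothetical checkpoint directory
    template = MODEL_TYPE_TO_TEMPLATE.get(model_type)  # "internvl-2-5" when config.json has model_type "internvl_chat"
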
sglang/srt/disaggregation/decode.py CHANGED
@@ -334,6 +334,8 @@ class DecodePreallocQueue:
                     error_message,
                     status_code=HTTPStatus.INTERNAL_SERVER_ERROR,
                 )
+                if self.scheduler.enable_metrics:
+                    self.scheduler.metrics_collector.increment_bootstrap_failed_reqs()
             else:
                 raise ValueError(f"Unexpected poll case: {poll}")
 
@@ -595,6 +597,8 @@ class DecodeTransferQueue:
                 # unlock the kv cache or it will have memory leak
                 self.tree_cache.cache_finished_req(decode_req.req)
                 indices_to_remove.add(i)
+                if self.scheduler.enable_metrics:
+                    self.scheduler.metrics_collector.increment_transfer_failed_reqs()
                 continue
             elif poll == KVPoll.Success:
 
sglang/srt/disaggregation/prefill.py CHANGED
@@ -238,6 +238,8 @@ class PrefillBootstrapQueue:
                 self.scheduler.stream_output([req], req.return_logprob)
                 indices_to_remove.add(i)
                 failed_reqs.append(req)
+                if self.scheduler.enable_metrics:
+                    self.scheduler.metrics_collector.increment_bootstrap_failed_reqs()
                 continue
 
             # KV.WaitingForInput - init here
@@ -522,6 +524,8 @@ class SchedulerDisaggregationPrefillMixin:
                     req, error_message, status_code=HTTPStatus.INTERNAL_SERVER_ERROR
                 )
                 done_reqs.append(req)
+                if self.enable_metrics:
+                    self.metrics_collector.increment_transfer_failed_reqs()
             else:
                 assert False, f"Unexpected polling state {poll=}"
 
sglang/srt/entrypoints/engine.py CHANGED
@@ -672,7 +672,7 @@ def _set_envs_and_config(server_args: ServerArgs):
     if server_args.attention_backend == "flashinfer":
         assert_pkg_version(
             "flashinfer_python",
-            "0.2.11.post3",
+            "0.2.14.post1",
             "Please uninstall the old version and "
             "reinstall the latest version by following the instructions "
             "at https://docs.flashinfer.ai/installation.html.",
@@ -680,7 +680,7 @@ def _set_envs_and_config(server_args: ServerArgs):
     if _is_cuda and not get_bool_env_var("SGLANG_SKIP_SGL_KERNEL_VERSION_CHECK"):
         assert_pkg_version(
             "sgl-kernel",
-            "0.3.5",
+            "0.3.7",
             "Please reinstall the latest version with `pip install sgl-kernel --force-reinstall`",
         )
 
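
The bumped minimum versions can also be checked outside the engine; a sketch, assuming assert_pkg_version is importable from sglang.srt.utils (the helper engine.py calls here):

    from sglang.srt.utils import assert_pkg_version  # assumed import path for this helper

    # Raises if the installed versions are older than what 0.5.1.post3 expects on CUDA builds.
    assert_pkg_version("flashinfer_python", "0.2.14.post1", "See https://docs.flashinfer.ai/installation.html.")
    assert_pkg_version("sgl-kernel", "0.3.7", "Reinstall with `pip install sgl-kernel --force-reinstall`.")
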